Commit: Refine vocab and nearby

hailiang-wang committed Mar 3, 2018
1 parent a1af98a commit d4f20e9
Showing 9 changed files with 75 additions and 73 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,9 @@
+# 3.0 - Simpler customization and configuration, at the cost of some extra overhead
+* Dropped the precomputed nearby-words data; nearest neighbors in the vector space are now retrieved with a KDTree
+* Added a dependency on scikit-learn, but reduced the preprocessing required for the word vectors
+* Improved the word-segmentation dictionary; the main dictionary can also be declared via an environment variable
+* Support custom word2vec models, declared via an environment variable
+
# 2.5
* Use words that are close in the vector space to refine the edit-distance computation

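One of the 3.0 changes above is driven by the `SYNONYMS_WORDSEG_DICT` environment variable, which `synonyms/__init__.py` reads once at import time (see the diff further down). A minimal sketch of overriding the main segmentation dictionary, assuming a hypothetical dictionary path:

```
import os

# hypothetical jieba-format dictionary file; replace with your own path
os.environ["SYNONYMS_WORDSEG_DICT"] = "/path/to/my_vocab.txt"

# the variable must be set before this import, because synonyms
# configures the jieba tokenizer at module load time
import synonyms
```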
10 changes: 5 additions & 5 deletions README.md
@@ -24,7 +24,7 @@ Chinese Synonyms for Natural Language Processing and Understanding.
```
pip install -U synonyms
```
-Compatible with both py2 and py3; current stable release [v2.x](https://github.com/huyingxi/Synonyms/releases)
+Compatible with both py2 and py3; current stable release [v3.x](https://github.com/huyingxi/Synonyms/releases)

**Node.js users can use [node-synonyms](https://www.npmjs.com/package/node-synonyms).**

@@ -53,16 +53,16 @@ print("识别: %s" % (synonyms.nearby("识别")))
print("NOT_EXIST: %s" % (synonyms.nearby("NOT_EXIST")))
```

-```synonyms.nearby(WORD)``` returns a list with two items, ```[[nearby_words], [nearby_words_score]]```. ```nearby_words``` are the synonyms of WORD, themselves stored as a list and ordered from nearest to farthest; ```nearby_words_score``` holds, at the **corresponding positions**, the distance scores of the words in ```nearby_words```. Scores fall in the interval (0-1); the closer to 1, the more similar. For example:
+```synonyms.nearby(WORD)``` returns a tuple with two items, ```([nearby_words], [nearby_words_score])```. ```nearby_words``` are the synonyms of WORD, themselves stored as a list and ordered from nearest to farthest; ```nearby_words_score``` holds, at the **corresponding positions**, the distance scores of the words in ```nearby_words```. Scores fall in the interval (0-1); the closer to 1, the more similar. For example:

```
-synonyms.nearby(人脸) = [
+synonyms.nearby(人脸) = (
["图片", "图像", "通过观察", "数字图像", "几何图形", "脸部", "图象", "放大镜", "面孔", "Mii"],
-[0.597284, 0.580373, 0.568486, 0.535674, 0.531835, 0.530095, 0.525344, 0.524009, 0.523101, 0.516046]]
+[0.597284, 0.580373, 0.568486, 0.535674, 0.531835, 0.530095, 0.525344, 0.524009, 0.523101, 0.516046])
```

-In the OOV case it returns ```[[], []]```; current dictionary size: 125,792.
+In the OOV case it returns ```([], [])```; current dictionary size: 125,792.
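Since ```nearby``` now returns a tuple, the two lists can be unpacked directly; a minimal usage sketch:

```
import synonyms

words, scores = synonyms.nearby("人脸")
for w, s in zip(words, scores):
    # s is the distance score of the word at the same position;
    # closer to 1 means more similar
    print("%s %f" % (w, s))
```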

### synonyms#compare
Compare the similarity of two sentences.
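The call signature exercised in demo.py below is ```synonyms.compare(sen1, sen2, seg=True)```, where ```seg``` controls whether the sentences are first segmented with jieba. A minimal sketch with sample sentences:

```
import synonyms

sen1 = "发生历史性变革"
sen2 = "取得历史性成就"
# compare returns a similarity score in [0, 1]
r = synonyms.compare(sen1, sen2, seg=True)
print("%s vs %s: %f" % (sen1, sen2, r))
```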
2 changes: 1 addition & 1 deletion Requirements.txt
@@ -1 +1 @@
-synonyms>=2.5
+synonyms>=2.7
4 changes: 2 additions & 2 deletions demo.py
@@ -36,7 +36,7 @@
import numpy
import unittest

-compare_ = lambda x,y,z: "*"* 30 + "\n%s vs %s: %f" % (x, y, synonyms.compare(x, y, seg=z))
+compare_ = lambda x,y,z: "%s vs %s: %f" % (x, y, synonyms.compare(x, y, seg=z)) + "\n" + "*"* 30 + "\n"

# run testcase: python /Users/hain/ai/Synonyms/demo.py Test.testExample
class Test(unittest.TestCase):
@@ -97,7 +97,7 @@ def test_similarity(self):
        print("%s vs %s" % (sen1, sen2), r)

    def test_nearby(self):
-        synonyms.display("人脸") # synonyms.display calls synonyms.nearby
+        synonyms.display("奥运") # synonyms.display calls synonyms.nearby


def test():
5 changes: 3 additions & 2 deletions setup.py
@@ -13,7 +13,7 @@

setup(
    name='synonyms',
-    version='2.6',
+    version='3.0',
    description='Chinese Synonyms for Natural Language Processing and Understanding',
    long_description=LONGDOC,
    author='Hai Liang Wang, Hu Ying Xi',
@@ -41,7 +41,8 @@
    install_requires=[
        'jieba>=0.39',
        'six>=1.11.0',
-        'numpy>=1.13.1'
+        'numpy>=1.13.1',
+        'scikit-learn==0.19.1'
    ],
    package_data={
        'synonyms': [
90 changes: 28 additions & 62 deletions synonyms/__init__.py
@@ -51,6 +51,8 @@
from synonyms.utils import any2utf8
from synonyms.utils import any2unicode
from synonyms.utils import sigmoid
+from synonyms.utils import cosine
+from sklearn.neighbors import KDTree
import jieba.posseg as _tokenizer
import jieba

@@ -67,54 +69,23 @@
'''
# combine similarity scores
_similarity_smooth = lambda x, y, z: (x * y) + z
-_sim_molecule = lambda x: np.sum(x, axis=0) # numerator
+_flat_sum_array = lambda x: np.sum(x, axis=0) # numerator

'''
tokenizer settings
'''
tokenizer_dict = os.path.join(curdir, 'data', 'vocab.txt')
if "SYNONYMS_WORDSEG_DICT" in ENVIRON:
-    tokenizer_dict = ENVIRON["SYNONYMS_WORDSEG_DICT"]
-if os.path.exists(tokenizer_dict):
-    jieba.set_dictionary(tokenizer_dict)
+    if os.path.exists(ENVIRON["SYNONYMS_WORDSEG_DICT"]):
+        print("info: set wordseg dict with %s" % tokenizer_dict)
+        tokenizer_dict = ENVIRON["SYNONYMS_WORDSEG_DICT"]
+    else: print("warning: can not find dict at [%s]" % tokenizer_dict)

-'''
-nearby
-'''
-def _load_vocab(file_path):
-    '''
-    load vocab dict
-    '''
-    global _vocab
-    if PLT == 2:
-        import io
-        fin = io.TextIOWrapper(
-            io.BufferedReader(
-                gzip.open(file_path)),
-            encoding='utf8',
-            errors='ignore')
-    else:
-        fin = gzip.open(file_path, 'rt', encoding='utf-8', errors="ignore")
-
-    _vocab = json.loads(fin.read())
-
-# build on load
-print(">> Synonyms on loading vocab ...")
-_load_vocab(os.path.join(curdir, "data", "words.nearby.json.gz"))
-
-def nearby(word):
-    '''
-    Nearby word
-    '''
-    try:
-        return _vocab[any2unicode(word)]
-    except KeyError as e:
-        return [[], []]
+print(">> Synonyms load wordseg dict [%s] ... " % tokenizer_dict)
+jieba.set_dictionary(tokenizer_dict)

'''
-similarity
+word embedding
'''
# stopwords
_fin_stopwords_path = os.path.join(curdir, 'data', 'stopwords.txt')
@@ -131,7 +102,7 @@ def _load_stopwords(file_path):
    for w in stopwords:
        _stopwords.add(any2unicode(w).strip())

print(">> Synonyms on loading stopwords ...")
print(">> Synonyms on loading stopwords [%s] ..." % _fin_stopwords_path)
_load_stopwords(_fin_stopwords_path)

def _segment_words(sen):
@@ -158,7 +129,7 @@ def _load_w2v(model_file=_f_model, binary=True):
raise Exception("Model file [%s] does not exist." % model_file)
return KeyedVectors.load_word2vec_format(
model_file, binary=binary, unicode_errors='ignore')
print(">> Synonyms on loading vectors ...")
print(">> Synonyms on loading vectors [%s] ..." % _f_model)
_vectors = _load_w2v(model_file=_f_model)

def _get_wv(sentence):
@@ -195,18 +166,6 @@ def _get_wv(sentence):
            vectors.append(r)
    return vectors

-def _unigram_overlap(sentence1, sentence2):
-    '''
-    compute unigram overlap
-    '''
-    x = set(sentence1.split())
-    y = set(sentence2.split())
-
-    intersection = x & y
-    union = x | y
-
-    return ((float)(len(intersection)) / (float)(len(union)))

def _levenshtein_distance(sentence1, sentence2):
    '''
    Return the Levenshtein distance between two strings.
@@ -262,25 +221,32 @@ def _similarity_distance(s1, s2):
    '''
    compute similarity with distance measurement
    '''
-    a = _sim_molecule(_get_wv(s1))
-    b = _sim_molecule(_get_wv(s2))
-    # https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.norm.html
-    g = 1 / (np.linalg.norm(a - b) + 1)
+    g = cosine(_flat_sum_array(_get_wv(s1)), _flat_sum_array(_get_wv(s2)))
    u = _nearby_levenshtein_distance(s1, s2)
    # print("g: %s, u: %s" % (g, u))
    if u > 0.8:
-        r = _similarity_smooth(g, 1, u)
-    elif u > 0.7:
-        r = _similarity_smooth(g, 1.5, u)
+        r = _similarity_smooth(g, 0.1, u)
    elif u > 0.6:
-        r = _similarity_smooth(g, 2, u)
+        r = _similarity_smooth(g, 0.25, u)
+    elif u > 0.4:
+        r = _similarity_smooth(g, 0.5, u)
    else:
-        r = _similarity_smooth(g, 4, u)
+        r = _similarity_smooth(g, 1, u)

    if r < 0: r = abs(r)
    r = min(r, 1.0)
    return float("%.3f" % r)

+def nearby(word):
+    '''
+    Nearby word
+    '''
+    words, scores = [], []
+    for x in _vectors.neighbours(any2unicode(word)):
+        words.append(x[0])
+        scores.append(x[1])
+    return words, scores

def compare(s1, s2, seg=True):
    '''
    compare similarity
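The new scoring path computes g as the cosine similarity of the two summed sentence vectors and u as the nearby-aware Levenshtein score, then biases g by a weight that shrinks as u grows. A standalone sketch of just the combination step, with hypothetical g and u values (in the library they come from _get_wv and _nearby_levenshtein_distance):

```
_similarity_smooth = lambda x, y, z: (x * y) + z

def combine(g, u):
    # g: cosine similarity of the summed word vectors, in [-1, 1]
    # u: nearby-weighted Levenshtein score, in [0, 1]
    if u > 0.8:
        r = _similarity_smooth(g, 0.1, u)
    elif u > 0.6:
        r = _similarity_smooth(g, 0.25, u)
    elif u > 0.4:
        r = _similarity_smooth(g, 0.5, u)
    else:
        r = _similarity_smooth(g, 1, u)
    r = min(abs(r), 1.0)
    return float("%.3f" % r)

# hypothetical inputs: strong edit-distance agreement, decent vector agreement
print(combine(0.7, 0.85))  # 0.7 * 0.1 + 0.85 = 0.92
```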
Binary file not shown.
6 changes: 6 additions & 0 deletions synonyms/utils.py
@@ -239,6 +239,12 @@ def any2unicode(text, encoding='utf8', errors='strict'):

to_unicode = any2unicode

+# cosine similarity
+# https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.norm.html
+from numpy import dot
+from numpy.linalg import norm
+cosine = lambda a, b: dot(a, b)/(norm(a)*norm(b))

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

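The added helper computes plain cosine similarity; a quick sanity check with two toy vectors (the division by the norms means an all-zero vector is not a valid input):

```
import numpy as np
from numpy import dot
from numpy.linalg import norm

cosine = lambda a, b: dot(a, b) / (norm(a) * norm(b))

a = np.array([1.0, 0.0, 1.0])
b = np.array([1.0, 1.0, 0.0])
print(cosine(a, a))  # 1.0: identical direction
print(cosine(a, b))  # 0.5: dot = 1, norms = sqrt(2) each
```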
25 changes: 24 additions & 1 deletion synonyms/word2vec.py
@@ -33,6 +33,7 @@
    double, array, vstack, fromstring, sqrt, newaxis,\
    ndarray, sum as np_sum, prod, ascontiguousarray,\
    argmax
+from sklearn.neighbors import KDTree

class Vocab(object):
    """
@@ -68,6 +69,7 @@ def __init__(self):
        self.vocab = {}
        self.index2word = []
        self.vector_size = None
+        self.kdt = None

    @property
    def wv(self):
@@ -198,7 +200,12 @@ def add_word(word, weights):
            (result.syn0.shape[0], len(result.vocab)))
        result.syn0 = ascontiguousarray(result.syn0[: len(result.vocab)])
        assert (len(result.vocab), vector_size) == result.syn0.shape

+        '''
+        KDTree
+        Build KDTree with vectors.
+        http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KDTree.html#sklearn.neighbors.KDTree
+        '''
+        result.kdt = KDTree(result.syn0, leaf_size=10, metric = "euclidean")
        print("loaded %s matrix from %s" % (result.syn0.shape, fname))
        return result

@@ -222,6 +229,22 @@ def word_vec(self, word, use_norm=False):
        else:
            raise KeyError("word '%s' not in vocabulary" % word)

+    def neighbours(self, word, size = 10):
+        """
+        Get nearest words with KDTree, ranking by cosine distance
+        """
+        v = self.word_vec(word)
+        [distances], [points] = self.kdt.query(array([v]), k = size, return_distance = True)
+        assert len(distances) == len(points), "distances and points should be in same shape."
+        words, scores = [], {}
+        for (x, y) in zip(points, distances):
+            w = self.index2word[x]
+            s = utils.cosine(v, self.syn0[x])
+            if s < 0: s = abs(s)
+            words.append(w)
+            scores[w] = min(s, 1.0)
+        for x in sorted(words, key=scores.get, reverse=True):
+            yield x, scores[x]

import unittest

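The neighbours method above queries the Euclidean KDTree for candidate indices, then scores each candidate by cosine similarity against the query vector and yields them best-first. A self-contained sketch of the same pattern on toy data (toy matrix and word list, not the library's real vectors):

```
import numpy as np
from sklearn.neighbors import KDTree

cosine = lambda a, b: np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# toy embedding matrix: 5 "words" in 3 dimensions
syn0 = np.random.RandomState(0).rand(5, 3)
index2word = ["w%d" % i for i in range(5)]

kdt = KDTree(syn0, leaf_size=10, metric="euclidean")
v = syn0[0]  # query with the first word's own vector

# k nearest rows by euclidean distance, then re-rank by cosine similarity
[distances], [points] = kdt.query(np.array([v]), k=3, return_distance=True)
scores = {index2word[i]: min(abs(cosine(v, syn0[i])), 1.0) for i in points}
for w in sorted(scores, key=scores.get, reverse=True):
    print(w, scores[w])
```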
