Commit: Refine vocab and nearby

hailiang-wang committed Mar 3, 2018
1 parent a1af98a commit d4f20e9
Showing 9 changed files with 75 additions and 73 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,9 @@
+# 3.0 - Simpler customization and configuration, at the cost of some extra overhead
+* Dropped the precomputed nearby-words data; nearest neighbors in the vector space are now retrieved with a KDTree
+* Added a dependency on scikit-learn, but reduced the preprocessing required for the word vectors
+* Improved the word-segmentation dictionary; the main dictionary can also be declared via an environment variable
+* Support custom word2vec models, declared via an environment variable
+
# 2.5
* Use words that are close in the vector space to refine the edit-distance computation

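One of the 3.0 changes above is driven by the `SYNONYMS_WORDSEG_DICT` environment variable, which `synonyms/__init__.py` reads once at import time (see the diff further down). A minimal sketch of overriding the main segmentation dictionary, assuming a hypothetical dictionary path:

```
import os

# hypothetical jieba-format dictionary file; replace with your own path
os.environ["SYNONYMS_WORDSEG_DICT"] = "/path/to/my_vocab.txt"

# the variable must be set before this import, because synonyms
# configures the jieba tokenizer at module load time
import synonyms
```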
10 changes: 5 additions & 5 deletions README.md
@@ -24,7 +24,7 @@ Chinese Synonyms for Natural Language Processing and Understanding.
```
pip install -U synonyms
```
-Compatible with both py2 and py3; current stable release [v2.x](https://github.com/huyingxi/Synonyms/releases)
+Compatible with both py2 and py3; current stable release [v3.x](https://github.com/huyingxi/Synonyms/releases)

**Node.js users can use [node-synonyms](https://www.npmjs.com/package/node-synonyms).**

@@ -53,16 +53,16 @@ print("识别: %s" % (synonyms.nearby("识别")))
print("NOT_EXIST: %s" % (synonyms.nearby("NOT_EXIST")))
```

-```synonyms.nearby(WORD)``` returns a list with two items, ```[[nearby_words], [nearby_words_score]]```. ```nearby_words``` are the synonyms of WORD, themselves stored as a list and ordered from nearest to farthest; ```nearby_words_score``` holds, at the **corresponding positions**, the distance scores of the words in ```nearby_words```. Scores fall in the interval (0-1); the closer to 1, the more similar. For example:
+```synonyms.nearby(WORD)``` returns a tuple with two items, ```([nearby_words], [nearby_words_score])```. ```nearby_words``` are the synonyms of WORD, themselves stored as a list and ordered from nearest to farthest; ```nearby_words_score``` holds, at the **corresponding positions**, the distance scores of the words in ```nearby_words```. Scores fall in the interval (0-1); the closer to 1, the more similar. For example:

```
-synonyms.nearby(人脸) = [
+synonyms.nearby(人脸) = (
["图片", "图像", "通过观察", "数字图像", "几何图形", "脸部", "图象", "放大镜", "面孔", "Mii"],
-[0.597284, 0.580373, 0.568486, 0.535674, 0.531835, 0.530095, 0.525344, 0.524009, 0.523101, 0.516046]]
+[0.597284, 0.580373, 0.568486, 0.535674, 0.531835, 0.530095, 0.525344, 0.524009, 0.523101, 0.516046])
```

-In the OOV case it returns ```[[], []]```; current dictionary size: 125,792.
+In the OOV case it returns ```([], [])```; current dictionary size: 125,792.
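Since ```nearby``` now returns a tuple, the two lists can be unpacked directly; a minimal usage sketch:

```
import synonyms

words, scores = synonyms.nearby("人脸")
for w, s in zip(words, scores):
    # s is the distance score of the word at the same position;
    # closer to 1 means more similar
    print("%s %f" % (w, s))
```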

### synonyms#compare
Compare the similarity of two sentences.
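The call signature exercised in demo.py below is ```synonyms.compare(sen1, sen2, seg=True)```, where ```seg``` controls whether the sentences are first segmented with jieba. A minimal sketch with sample sentences:

```
import synonyms

sen1 = "发生历史性变革"
sen2 = "取得历史性成就"
# compare returns a similarity score in [0, 1]
r = synonyms.compare(sen1, sen2, seg=True)
print("%s vs %s: %f" % (sen1, sen2, r))
```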
2 changes: 1 addition & 1 deletion Requirements.txt
@@ -1 +1 @@
-synonyms>=2.5
+synonyms>=2.7
4 changes: 2 additions & 2 deletions demo.py
@@ -36,7 +36,7 @@
import numpy
import unittest

-compare_ = lambda x,y,z: "*"* 30 + "\n%s vs %s: %f" % (x, y, synonyms.compare(x, y, seg=z))
+compare_ = lambda x,y,z: "%s vs %s: %f" % (x, y, synonyms.compare(x, y, seg=z)) + "\n" + "*"* 30 + "\n"

# run testcase: python /Users/hain/ai/Synonyms/demo.py Test.testExample
class Test(unittest.TestCase):
@@ -97,7 +97,7 @@ def test_similarity(self):
        print("%s vs %s" % (sen1, sen2), r)

    def test_nearby(self):
-        synonyms.display("人脸") # synonyms.display calls synonyms.nearby
+        synonyms.display("奥运") # synonyms.display calls synonyms.nearby


def test():
5 changes: 3 additions & 2 deletions setup.py
@@ -13,7 +13,7 @@

setup(
    name='synonyms',
-    version='2.6',
+    version='3.0',
    description='Chinese Synonyms for Natural Language Processing and Understanding',
    long_description=LONGDOC,
    author='Hai Liang Wang, Hu Ying Xi',
@@ -41,7 +41,8 @@
    install_requires=[
        'jieba>=0.39',
        'six>=1.11.0',
-        'numpy>=1.13.1'
+        'numpy>=1.13.1',
+        'scikit-learn==0.19.1'
    ],
    package_data={
        'synonyms': [
90 changes: 28 additions & 62 deletions synonyms/__init__.py
@@ -51,6 +51,8 @@
from synonyms.utils import any2utf8
from synonyms.utils import any2unicode
from synonyms.utils import sigmoid
+from synonyms.utils import cosine
+from sklearn.neighbors import KDTree
import jieba.posseg as _tokenizer
import jieba

@@ -67,54 +69,23 @@
'''
# combine similarity scores
_similarity_smooth = lambda x, y, z: (x * y) + z
-_sim_molecule = lambda x: np.sum(x, axis=0) # numerator
+_flat_sum_array = lambda x: np.sum(x, axis=0) # numerator

'''
tokenizer settings
'''
tokenizer_dict = os.path.join(curdir, 'data', 'vocab.txt')
if "SYNONYMS_WORDSEG_DICT" in ENVIRON:
-    tokenizer_dict = ENVIRON["SYNONYMS_WORDSEG_DICT"]
-if os.path.exists(tokenizer_dict):
-    jieba.set_dictionary(tokenizer_dict)
+    if os.path.exists(ENVIRON["SYNONYMS_WORDSEG_DICT"]):
+        print("info: set wordseg dict with %s" % tokenizer_dict)
+        tokenizer_dict = ENVIRON["SYNONYMS_WORDSEG_DICT"]
+    else: print("warning: can not find dict at [%s]" % tokenizer_dict)

-'''
-nearby
-'''
-def _load_vocab(file_path):
-    '''
-    load vocab dict
-    '''
-    global _vocab
-    if PLT == 2:
-        import io
-        fin = io.TextIOWrapper(
-            io.BufferedReader(
-                gzip.open(file_path)),
-            encoding='utf8',
-            errors='ignore')
-    else:
-        fin = gzip.open(file_path, 'rt', encoding='utf-8', errors="ignore")
-
-    _vocab = json.loads(fin.read())
-
-# build on load
-print(">> Synonyms on loading vocab ...")
-_load_vocab(os.path.join(curdir, "data", "words.nearby.json.gz"))
-
-def nearby(word):
-    '''
-    Nearby word
-    '''
-    try:
-        return _vocab[any2unicode(word)]
-    except KeyError as e:
-        return [[], []]
+print(">> Synonyms load wordseg dict [%s] ... " % tokenizer_dict)
+jieba.set_dictionary(tokenizer_dict)

'''
-similarity
+word embedding
'''
# stopwords
_fin_stopwords_path = os.path.join(curdir, 'data', 'stopwords.txt')
@@ -131,7 +102,7 @@ def _load_stopwords(file_path):
    for w in stopwords:
        _stopwords.add(any2unicode(w).strip())

print(">> Synonyms on loading stopwords ...")
print(">> Synonyms on loading stopwords [%s] ..." % _fin_stopwords_path)
_load_stopwords(_fin_stopwords_path)

def _segment_words(sen):
@@ -158,7 +129,7 @@ def _load_w2v(model_file=_f_model, binary=True):
raise Exception("Model file [%s] does not exist." % model_file)
return KeyedVectors.load_word2vec_format(
model_file, binary=binary, unicode_errors='ignore')
print(">> Synonyms on loading vectors ...")
print(">> Synonyms on loading vectors [%s] ..." % _f_model)
_vectors = _load_w2v(model_file=_f_model)

def _get_wv(sentence):
@@ -195,18 +166,6 @@ def _get_wv(sentence):
            vectors.append(r)
    return vectors

-def _unigram_overlap(sentence1, sentence2):
-    '''
-    compute unigram overlap
-    '''
-    x = set(sentence1.split())
-    y = set(sentence2.split())
-
-    intersection = x & y
-    union = x | y
-
-    return ((float)(len(intersection)) / (float)(len(union)))

def _levenshtein_distance(sentence1, sentence2):
    '''
    Return the Levenshtein distance between two strings.
@@ -262,25 +221,32 @@ def _similarity_distance(s1, s2):
    '''
    compute similarity with distance measurement
    '''
-    a = _sim_molecule(_get_wv(s1))
-    b = _sim_molecule(_get_wv(s2))
-    # https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.norm.html
-    g = 1 / (np.linalg.norm(a - b) + 1)
+    g = cosine(_flat_sum_array(_get_wv(s1)), _flat_sum_array(_get_wv(s2)))
    u = _nearby_levenshtein_distance(s1, s2)
    # print("g: %s, u: %s" % (g, u))
    if u > 0.8:
-        r = _similarity_smooth(g, 1, u)
-    elif u > 0.7:
-        r = _similarity_smooth(g, 1.5, u)
+        r = _similarity_smooth(g, 0.1, u)
    elif u > 0.6:
-        r = _similarity_smooth(g, 2, u)
+        r = _similarity_smooth(g, 0.25, u)
+    elif u > 0.4:
+        r = _similarity_smooth(g, 0.5, u)
    else:
-        r = _similarity_smooth(g, 4, u)
+        r = _similarity_smooth(g, 1, u)

    if r < 0: r = abs(r)
    r = min(r, 1.0)
    return float("%.3f" % r)

+def nearby(word):
+    '''
+    Nearby word
+    '''
+    words, scores = [], []
+    for x in _vectors.neighbours(any2unicode(word)):
+        words.append(x[0])
+        scores.append(x[1])
+    return words, scores

def compare(s1, s2, seg=True):
    '''
    compare similarity
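The new scoring path computes g as the cosine similarity of the two summed sentence vectors and u as the nearby-aware Levenshtein score, then biases g by a weight that shrinks as u grows. A standalone sketch of just the combination step, with hypothetical g and u values (in the library they come from _get_wv and _nearby_levenshtein_distance):

```
_similarity_smooth = lambda x, y, z: (x * y) + z

def combine(g, u):
    # g: cosine similarity of the summed word vectors, in [-1, 1]
    # u: nearby-weighted Levenshtein score, in [0, 1]
    if u > 0.8:
        r = _similarity_smooth(g, 0.1, u)
    elif u > 0.6:
        r = _similarity_smooth(g, 0.25, u)
    elif u > 0.4:
        r = _similarity_smooth(g, 0.5, u)
    else:
        r = _similarity_smooth(g, 1, u)
    r = min(abs(r), 1.0)
    return float("%.3f" % r)

# hypothetical inputs: strong edit-distance agreement, decent vector agreement
print(combine(0.7, 0.85))  # 0.7 * 0.1 + 0.85 = 0.92
```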
Binary file not shown.
6 changes: 6 additions & 0 deletions synonyms/utils.py
@@ -239,6 +239,12 @@ def any2unicode(text, encoding='utf8', errors='strict'):

to_unicode = any2unicode

+# cosine similarity
+# https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.norm.html
+from numpy import dot
+from numpy.linalg import norm
+cosine = lambda a, b: dot(a, b)/(norm(a)*norm(b))

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

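The added helper computes plain cosine similarity; a quick sanity check with two toy vectors (the division by the norms means an all-zero vector is not a valid input):

```
import numpy as np
from numpy import dot
from numpy.linalg import norm

cosine = lambda a, b: dot(a, b) / (norm(a) * norm(b))

a = np.array([1.0, 0.0, 1.0])
b = np.array([1.0, 1.0, 0.0])
print(cosine(a, a))  # 1.0: identical direction
print(cosine(a, b))  # 0.5: dot = 1, norms = sqrt(2) each
```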
25 changes: 24 additions & 1 deletion synonyms/word2vec.py
@@ -33,6 +33,7 @@
    double, array, vstack, fromstring, sqrt, newaxis,\
    ndarray, sum as np_sum, prod, ascontiguousarray,\
    argmax
+from sklearn.neighbors import KDTree

class Vocab(object):
    """
@@ -68,6 +69,7 @@ def __init__(self):
        self.vocab = {}
        self.index2word = []
        self.vector_size = None
+        self.kdt = None

    @property
    def wv(self):
@@ -198,7 +200,12 @@ def add_word(word, weights):
            (result.syn0.shape[0], len(result.vocab)))
        result.syn0 = ascontiguousarray(result.syn0[: len(result.vocab)])
        assert (len(result.vocab), vector_size) == result.syn0.shape

+        '''
+        KDTree
+        Build KDTree with vectors.
+        http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KDTree.html#sklearn.neighbors.KDTree
+        '''
+        result.kdt = KDTree(result.syn0, leaf_size=10, metric = "euclidean")
        print("loaded %s matrix from %s" % (result.syn0.shape, fname))
        return result

@@ -222,6 +229,22 @@ def word_vec(self, word, use_norm=False):
        else:
            raise KeyError("word '%s' not in vocabulary" % word)

+    def neighbours(self, word, size = 10):
+        """
+        Get nearest words with KDTree, ranking by cosine distance
+        """
+        v = self.word_vec(word)
+        [distances], [points] = self.kdt.query(array([v]), k = size, return_distance = True)
+        assert len(distances) == len(points), "distances and points should be in same shape."
+        words, scores = [], {}
+        for (x, y) in zip(points, distances):
+            w = self.index2word[x]
+            s = utils.cosine(v, self.syn0[x])
+            if s < 0: s = abs(s)
+            words.append(w)
+            scores[w] = min(s, 1.0)
+        for x in sorted(words, key=scores.get, reverse=True):
+            yield x, scores[x]

import unittest

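The neighbours method above queries the Euclidean KDTree for candidate indices, then scores each candidate by cosine similarity against the query vector and yields them best-first. A self-contained sketch of the same pattern on toy data (toy matrix and word list, not the library's real vectors):

```
import numpy as np
from sklearn.neighbors import KDTree

cosine = lambda a, b: np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# toy embedding matrix: 5 "words" in 3 dimensions
syn0 = np.random.RandomState(0).rand(5, 3)
index2word = ["w%d" % i for i in range(5)]

kdt = KDTree(syn0, leaf_size=10, metric="euclidean")
v = syn0[0]  # query with the first word's own vector

# k nearest rows by euclidean distance, then re-rank by cosine similarity
[distances], [points] = kdt.query(np.array([v]), k=3, return_distance=True)
scores = {index2word[i]: min(abs(cosine(v, syn0[i])), 1.0) for i in points}
for w in sorted(scores, key=scores.get, reverse=True):
    print(w, scores[w])
```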
