Update synonyms.py

Update _nearby_levenshtein_distance so that it is calculated from the nearby words of both the first and the second sentence.
xiaonengqiu · Apr 28, 2018 · c580b3d · c580b3d
1 parent 8c2a15f
commit c580b3d
Showing 1 changed file with 16 additions and 7 deletions.
diff --git a/synonyms/synonyms.py b/synonyms/synonyms.py
@@ -213,17 +213,26 @@ def _nearby_levenshtein_distance(s1, s2):
     s1_len, s2_len = len(s1), len(s2)
     maxlen = max(s1_len, s2_len)
     first, second = (s2, s1) if s1_len == maxlen else (s1, s2)
-    ft = set() # all related words with first sentence 
+    ft_1 = set() # all related words with first sentence 
     for x in first:
-        ft.add(x)
+        ft_1.add(x)
         n, _ = nearby(x)
         for o in n[:5]:
-            ft.add(o)
-    scores = []
-    if len(ft) == 0: return 0.0 # invalid length for first string
+            ft_1.add(o)
+            
+    ft_2 = set() # all related words with second sentence
     for x in second:
-        scores.append(max([_levenshtein_distance(x, y) for y in ft]))
-    s = np.sum(scores) / maxlen
+        ft_2.add(x)
+        n, _ = nearby(x)
+        for o in n[:5]:
+            ft_2.add(0)
+
+    scores = []
+    if len(ft_1) == 0 or len(ft_2) == 0: return 0.0 # invalid length
+    for x in ft_1:
+        for y in ft_2:
+            scores.append([_levenshtein_distance(x, y)])
+    s = np.sum(scores) / (s1_len * s2_len)
     return s
 
 def _similarity_distance(s1, s2, ignore):