Skip to content

Commit

Permalink
Merge branch 'master' of github.com:huyingxi/Synonyms
Browse files: browse the repository at this point in the history
  • Loading branch information
hailiang-wang committed Apr 11, 2018
2 parents ae69e67 + eaf2d16 commit 86e24d5
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 10 deletions.
71 changes: 70 additions & 1 deletion synonyms/data/stopwords.txt
Original file line number | Diff line number | Diff line change
Expand Up @@ -1595,4 +1595,73 @@
非特
非独
高兴
若果
若果
·
~
-
——
=
+
{
}
|
/
*
@
#
%
……
&
`
~
!
@
#
$
%
^
&
(
)
[
]
|
\
;
:
'
"
,
<
.
>
/
?
0
1
2
3
4
5
6
7
8
9
21 changes: 12 additions & 9 deletions synonyms/synonyms.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -134,7 +134,7 @@ def _load_w2v(model_file=_f_model, binary=True):
print(">> Synonyms on loading vectors [%s] ..." % _f_model)
_vectors = _load_w2v(model_file=_f_model)

def _get_wv(sentence):
def _get_wv(sentence, ignore=False):
'''
get word2vec data by sentence
sentence is segmented string.
Expand All @@ -151,10 +151,13 @@ def _get_wv(sentence):
try:
c.append(_vectors.word_vec(y_))
except KeyError as error:
logging.warn("not exist in w2v model: %s" % y_)
# c.append(np.zeros((100,), dtype=float))
random_state = np.random.RandomState(seed=(hash(y_) % (2**32 - 1)))
c.append(random_state.uniform(low=-10.0, high=10.0, size=(100,)))
if ignore:
continue
else:
logging.warning("not exist in w2v model: %s" % y_)
# c.append(np.zeros((100,), dtype=float))
random_state = np.random.RandomState(seed=(hash(y_) % (2**32 - 1)))
c.append(random_state.uniform(low=-10.0, high=10.0, size=(100,)))
for n in syns:
if n is None: continue
try:
Expand Down Expand Up @@ -223,13 +226,13 @@ def _nearby_levenshtein_distance(s1, s2):
s = np.sum(scores) / maxlen
return s

def _similarity_distance(s1, s2):
def _similarity_distance(s1, s2, ignore):
'''
compute similarity with distance measurement
'''
g = 0.0
try:
g_ = cosine(_flat_sum_array(_get_wv(s1)), _flat_sum_array(_get_wv(s2)))
g_ = cosine(_flat_sum_array(_get_wv(s1, ignore)), _flat_sum_array(_get_wv(s2, ignore)))
if is_digit(g_): g = g_
except: pass

Expand Down Expand Up @@ -275,7 +278,7 @@ def nearby(word):
_cache_nearby[w] = (words, scores)
return words, scores

def compare(s1, s2, seg=True):
def compare(s1, s2, seg=True, ignore=False):
'''
compare similarity
s1 : sentence1
Expand All @@ -291,7 +294,7 @@ def compare(s1, s2, seg=True):
s1 = s1.split()
s2 = s2.split()
assert len(s1) > 0 and len(s2) > 0, "The length of s1 and s2 should > 0."
return _similarity_distance(s1, s2)
return _similarity_distance(s1, s2, ignore)

def display(word):
print("'%s'近义词:" % word)
Expand Down

0 comments on commit 86e24d5

Please sign in to comment.