Skip to content

Commit

Permalink
Fix for Possible bug with BPE RegEx #5
Browse files: browse the repository at this point in the history
  • Loading branch information
caleb-artifact committed Apr 21, 2021
1 parent 79387f4 commit 80a424b
Show file tree
Hide file tree
Showing 3 changed files with 4 additions and 4 deletions.
4 changes: 2 additions & 2 deletions Encoder.js
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ function get_pairs(word) {
return pairs
}

const pat = /'s|'t|'re|'ve|'m|'l l|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/gu
const pat = /'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/gu

const decoder = {}
Object.keys(encoder).map(x => { decoder[encoder[x]] = x })
Expand All @@ -87,7 +87,7 @@ const cache = {}
function bpe(token) {
if (token in cache) {
return cache[token]
}
}``

let word = token.split('')

Expand Down
2 changes: 1 addition & 1 deletion encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def __init__(self, encoder, bpe_merges, errors="replace"):
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
self.pat = re.compile(
r"""'s|'t|'re|'ve|'m|'l l|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
)

def bpe(self, token):
Expand Down
2 changes: 1 addition & 1 deletion package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 80a424b

Please sign in to comment.