From 80a424b3e9fa2b56d657b35b6f6e79be1d5f8e1d Mon Sep 17 00:00:00 2001 From: Caleb Bartholomew Date: Wed, 21 Apr 2021 11:12:56 -0600 Subject: [PATCH] Fix for Possible bug with BPE RegEx #5 --- Encoder.js | 4 ++-- encoder.py | 2 +- package-lock.json | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Encoder.js b/Encoder.js index 844eeb4..f875b13 100644 --- a/Encoder.js +++ b/Encoder.js @@ -65,7 +65,7 @@ function get_pairs(word) { return pairs } -const pat = /'s|'t|'re|'ve|'m|'l l|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/gu +const pat = /'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/gu const decoder = {} Object.keys(encoder).map(x => { decoder[encoder[x]] = x }) @@ -87,7 +87,7 @@ const cache = {} function bpe(token) { if (token in cache) { return cache[token] - } + }`` let word = token.split('') diff --git a/encoder.py b/encoder.py index 49d686a..f461b87 100644 --- a/encoder.py +++ b/encoder.py @@ -48,7 +48,7 @@ def __init__(self, encoder, bpe_merges, errors="replace"): self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) self.cache = {} self.pat = re.compile( - r"""'s|'t|'re|'ve|'m|'l l|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" + r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" ) def bpe(self, token): diff --git a/package-lock.json b/package-lock.json index 7398fa6..1f7b54a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,6 +1,6 @@ { "name": "gpt-3-encoder", - "version": "1.1.1", + "version": "1.1.3", "lockfileVersion": 1, "requires": true, "dependencies": {