From b5851c6c40e753606765ac45b85b298e3ae9e00d Mon Sep 17 00:00:00 2001
From: Jong Wook Kim
Date: Wed, 29 Mar 2023 16:12:36 -0400
Subject: [PATCH] Update tokenizer.py (#1163)

---
 whisper/tokenizer.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/whisper/tokenizer.py b/whisper/tokenizer.py
index 236f65e0..4030e15a 100644
--- a/whisper/tokenizer.py
+++ b/whisper/tokenizer.py
@@ -6,7 +6,6 @@
 from typing import Dict, List, Optional, Tuple
 
 import tiktoken
-from tiktoken_ext.openai_public import gpt2
 
 LANGUAGES = {
     "en": "english",
@@ -352,7 +351,7 @@ def get_encoding(name: str = "gpt2"):
     return tiktoken.Encoding(
         name=os.path.basename(vocab_path),
         explicit_n_vocab=n_vocab,
-        pat_str=gpt2()["pat_str"],
+        pat_str=r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
         mergeable_ranks=ranks,
         special_tokens=special_tokens,
     )