From b5851c6c40e753606765ac45b85b298e3ae9e00d Mon Sep 17 00:00:00 2001
From: Jong Wook Kim <jongwook@openai.com>
Date: Wed, 29 Mar 2023 16:12:36 -0400
Subject: [PATCH] Update tokenizer.py: inline GPT-2 pat_str, drop tiktoken_ext import (#1163)

---
 whisper/tokenizer.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/whisper/tokenizer.py b/whisper/tokenizer.py
index 236f65e0..4030e15a 100644
--- a/whisper/tokenizer.py
+++ b/whisper/tokenizer.py
@@ -6,7 +6,6 @@
 from typing import Dict, List, Optional, Tuple
 
 import tiktoken
-from tiktoken_ext.openai_public import gpt2
 
 LANGUAGES = {
     "en": "english",
@@ -352,7 +351,7 @@ def get_encoding(name: str = "gpt2"):
     return tiktoken.Encoding(
         name=os.path.basename(vocab_path),
         explicit_n_vocab=n_vocab,
-        pat_str=gpt2()["pat_str"],
+        pat_str=r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
         mergeable_ranks=ranks,
         special_tokens=special_tokens,
     )