Update tokenizer.py (openai#1163)
jongwook committed Mar 29, 2023
1 parent 6dea21f commit b5851c6
Showing 1 changed file with 1 addition and 2 deletions.
3 changes: 1 addition & 2 deletions whisper/tokenizer.py
@@ -6,7 +6,6 @@
 from typing import Dict, List, Optional, Tuple
 
 import tiktoken
-from tiktoken_ext.openai_public import gpt2
 
 LANGUAGES = {
     "en": "english",
@@ -352,7 +351,7 @@ def get_encoding(name: str = "gpt2"):
     return tiktoken.Encoding(
         name=os.path.basename(vocab_path),
         explicit_n_vocab=n_vocab,
-        pat_str=gpt2()["pat_str"],
+        pat_str=r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
         mergeable_ranks=ranks,
         special_tokens=special_tokens,
     )
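
The change drops the import of tiktoken_ext.openai_public and inlines the GPT-2 pre-tokenization regex that gpt2()["pat_str"] previously supplied, so get_encoding() builds the tiktoken.Encoding from a literal pattern. As a rough illustration of what this pattern does before BPE merging, here is a minimal sketch (not part of the commit) that splits a sample string with the same regex; it uses the third-party regex package, since the standard-library re module does not support the \p{L} and \p{N} Unicode classes:

import regex  # third-party "regex" package; stdlib "re" lacks \p{L}/\p{N}

# The pre-tokenization pattern inlined by this commit.
GPT2_PAT = r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""

# Split text into the chunks that tiktoken then BPE-encodes against the mergeable ranks.
print(regex.findall(GPT2_PAT, "What's this? 2 examples!"))
# ['What', "'s", ' this', '?', ' 2', ' examples', '!']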
