Use tiktoken #1044

Merged (9 commits, Mar 13, 2023)
Changes from 1 commit
bypassing load_tiktoken_bpe to avoid blobfile dep
jongwook committed Mar 13, 2023
commit a0bd014f13593101b95ea30ca7155356267f519b
7 changes: 5 additions & 2 deletions whisper/tokenizer.py
@@ -1,11 +1,11 @@
+import base64
 import os
 import string
 from dataclasses import dataclass, field
 from functools import cached_property, lru_cache
 from typing import Dict, List, Optional, Tuple
 
 import tiktoken
-from tiktoken.load import load_tiktoken_bpe
 from tiktoken_ext.openai_public import gpt2
 
 LANGUAGES = {
@@ -315,7 +315,10 @@ def split_tokens_on_spaces(self, tokens: List[int]):
 @lru_cache(maxsize=None)
 def get_encoding(name: str = "gpt2"):
     vocab_path = os.path.join(os.path.dirname(__file__), "assets", f"{name}.tiktoken")
-    ranks = load_tiktoken_bpe(vocab_path)
+    ranks = {
+        base64.b64decode(token): int(rank)
+        for token, rank in (line.split() for line in open(vocab_path) if line)

Review comment on the open(vocab_path) line: Here, you opened the file and left an unclosed handle. (See the context-manager sketch after the diff.)

+    }
     n_vocab = len(ranks)
     special_tokens = {}
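
The inline review comment above flags that open(vocab_path) leaves the file handle unclosed. A minimal sketch of the same parsing written with a context manager, assuming vocab_path as defined in get_encoding, so the handle is closed deterministically:

    # Same parsing as in the diff above, but the vocab file is closed
    # as soon as the ranks mapping has been built.
    with open(vocab_path) as vocab_file:
        ranks = {
            base64.b64decode(token): int(rank)
            for token, rank in (line.split() for line in vocab_file if line)
        }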

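For context, the ranks mapping built in this diff is the mergeable_ranks input that tiktoken expects. The sketch below is not part of the PR; it only illustrates how such a mapping could be wrapped in a tiktoken.Encoding, assuming the standard GPT-2 split regex and special token ids that continue right after the base vocabulary (as the n_vocab counter in the diff suggests):

    import tiktoken

    # Illustration only: wrap a manually parsed ranks dict in an Encoding.
    # The pat_str is the GPT-2 split regex; the pattern and special tokens
    # actually used by the PR may differ.
    def build_encoding(name: str, ranks: dict, special_tokens: dict) -> tiktoken.Encoding:
        return tiktoken.Encoding(
            name=name,
            explicit_n_vocab=len(ranks) + len(special_tokens),
            pat_str=r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
            mergeable_ranks=ranks,
            special_tokens=special_tokens,
        )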