Skip to content

Commit

Permalink
allow hyphens and single quotes between words
Browse files: browse the repository at this point in the history
  • Loading branch information
jongwook committed Sep 23, 2022
1 parent 15ab548 commit 8cf36f3
Showing 1 changed file with 3 additions and 3 deletions.
6 changes: 3 additions & 3 deletions whisper/tokenizer.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -245,9 +245,7 @@ def non_speech_tokens(self) -> Tuple[int]:
keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
"""

result = set()
symbols = list("'\"#()*+-/:;<=>@[\\]^_`{|}~「」『』")
symbols = list("\"#()*+/:;<=>@[\\]^_`{|}~「」『』")
symbols += "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()

# symbols that may be a single token or multiple tokens depending on the tokenizer.
Expand All @@ -257,6 +255,8 @@ def non_speech_tokens(self) -> Tuple[int]:
miscellaneous = set("♩♪♫♬♭♮♯")
assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous)

# allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
result = {self.tokenizer.encode(" -")[0], self.tokenizer.encode(" '")[0]}
for symbol in symbols + list(miscellaneous):
for tokens in [self.tokenizer.encode(symbol), self.tokenizer.encode(" " + symbol)]:
if len(tokens) == 1 or symbol in miscellaneous:
Expand Down

0 comments on commit 8cf36f3

Please sign in to comment.