
Commit

update
hisashi-ito committed Mar 11, 2023
1 parent b9203d9 commit 705f04c
Showing 1 changed file with 5 additions and 7 deletions.
megatron/tokenizer/tokenizer_ja.py (12 changes: 5 additions & 7 deletions)
@@ -15,6 +15,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# 2023.03.10
 """Megatron tokenizers."""
 
 from abc import ABC
@@ -57,17 +58,14 @@ def build_tokenizer(args):
     elif args.tokenizer_type.lower() == "TiktokenTokenizer".lower():
         assert args.vocab_file is not None
         tokenizer = TiktokenTokenizer(args.vocab_file)
-
-    # 2023.03.11
+    # 2023.03.11 Add PreTrainedTokenizerFast
     elif args.tokenizer_type.lower() == "PreTrainedTokenizerFast".lower():
-        assert args.hf_tokenizer is not None:
-        tokenizer = _PreTrainedTokenizerFast.from_pretrained(args.hf_tokenizer)
-
+        assert args.hf_tokenizer is not None
+        tokenizer = _PreTrainedTokenizerFast(args.hf_tokenizer)
     else:
         raise NotImplementedError(
             "{} tokenizer is not " "implemented.".format(args.tokenizer_type)
         )
 
     # Add vocab size.
     args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, args)
-
@@ -195,7 +193,7 @@ class _PreTrainedTokenizerFast(AbstractTokenizer):
     def __init__(self, hf_tokenizer):
         name = "PreTrainedTokenizerFast"
         super().__init__(name)
-        self.tokenizer = PreTrainedTokenizerFast.from_pretrained(args.hf_tokenizer)
+        self.tokenizer = PreTrainedTokenizerFast.from_pretrained(hf_tokenizer)
         self.eod_id = self.tokenizer.encoder["</s>"]
 
     @property
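For context, here is a minimal sketch, not part of the commit, of how the new PreTrainedTokenizerFast branch in build_tokenizer() might be exercised. Only tokenizer_type and hf_tokenizer appear in the diff; the import path and the remaining fields on args are assumptions about this fork's parsed-args object.

    from types import SimpleNamespace

    # Assumption: the fork exposes build_tokenizer at the path shown in the diff.
    from megatron.tokenizer.tokenizer_ja import build_tokenizer

    args = SimpleNamespace(
        tokenizer_type="PreTrainedTokenizerFast",
        hf_tokenizer="path/to/tokenizer",   # local dir or HF Hub id (hypothetical value)
        rank=0,                             # assumption: rank-0 logging inside build_tokenizer
        make_vocab_size_divisible_by=128,   # assumption: read by _vocab_size_with_padding
        model_parallel_size=1,              # assumption: likewise
    )

    tokenizer = build_tokenizer(args)  # dispatches on args.tokenizer_type.lower()
    print(args.padded_vocab_size)      # padded size attached by build_tokenizer

The constructor fix in the last hunk matters for this path: the wrapper now receives hf_tokenizer as its own argument instead of reaching for a global args.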
