Merge pull request #497 from sjeffry-o/main
adding SentencePiece Tokenizer
StellaAthena committed Feb 2, 2022
2 parents a639d30 + bd26082 commit fec5d35
Showing 3 changed files with 41 additions and 4 deletions.
4 changes: 2 additions & 2 deletions configs/neox_arguments.md
@@ -899,11 +899,11 @@ Tokenizer Arguments



- **tokenizer_type**: typing.Literal['GPT2BPETokenizer', 'HFTokenizer', 'HFGPT2Tokenizer', 'CharLevelTokenizer']
- **tokenizer_type**: typing.Literal['GPT2BPETokenizer', 'HFTokenizer', 'HFGPT2Tokenizer', 'SPMTokenizer', 'CharLevelTokenizer']

Default = GPT2BPETokenizer

Type of tokenizer to use - should be one of ["GPT2BPETokenizer", "HFTokenizer", "HFGPT2Tokenizer", "CharLevelTokenizer"]
Type of tokenizer to use - should be one of ["GPT2BPETokenizer", "HFTokenizer", "HFGPT2Tokenizer", "SPMTokenizer", "CharLevelTokenizer"]



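For illustration, a minimal sketch of how the new option might be selected, written as a Python dict of NeoX argument overrides (the `tokenizer_type` and `vocab_file` names are taken from this PR's diff; the model path is a placeholder, not a file shipped with the PR):

```python
# Hypothetical argument overrides for using the new tokenizer; the .model path
# is a placeholder for a trained SentencePiece model file.
tokenizer_overrides = {
    "tokenizer_type": "SPMTokenizer",
    "vocab_file": "/path/to/spm_tokenizer.model",
}
```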
4 changes: 2 additions & 2 deletions megatron/neox_arguments/neox_args.py
@@ -606,10 +606,10 @@ class NeoXArgsTokenizer(NeoXArgsTemplate):
"""

tokenizer_type: Literal[
"GPT2BPETokenizer", "HFTokenizer", "HFGPT2Tokenizer", "CharLevelTokenizer"
"GPT2BPETokenizer", "HFTokenizer", "HFGPT2Tokenizer", "SPMTokenizer", "CharLevelTokenizer"
] = "GPT2BPETokenizer"
"""
Type of tokenizer to use - should be one of ["GPT2BPETokenizer", "HFTokenizer", "HFGPT2Tokenizer", "CharLevelTokenizer"]
Type of tokenizer to use - should be one of ["GPT2BPETokenizer", "HFTokenizer", "HFGPT2Tokenizer", "SPMTokenizer", "CharLevelTokenizer"]
"""

padded_vocab_size: int = None
37 changes: 37 additions & 0 deletions megatron/tokenizer/tokenizer.py
@@ -24,6 +24,7 @@
from tokenizers import Tokenizer
from transformers import GPT2Tokenizer, GPT2TokenizerFast
import numpy as np
import sentencepiece as spm
from typing import List, Union
from .gpt2_tokenization import GPT2Tokenizer

@@ -38,6 +39,9 @@ def build_tokenizer(args):
assert args.vocab_file is not None
assert args.merge_file is not None
tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
elif args.tokenizer_type.lower() == 'SPMTokenizer'.lower():
assert args.vocab_file is not None
tokenizer = SentencePieceTokenizer(args.vocab_file)
elif args.tokenizer_type.lower() == 'HFTokenizer'.lower():
assert args.vocab_file is not None
tokenizer = HFTokenizer(args.vocab_file)
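Since the new branch requires `args.vocab_file` to point at a trained SentencePiece model, and the `SentencePieceTokenizer` class added in the next hunk resolves its end-of-document id via `piece_to_id('<|endoftext|>')`, the model generally needs to be trained with that symbol in its vocabulary. A minimal, hypothetical training sketch using the `sentencepiece` package (corpus path, model prefix, and vocab size are placeholders):

```python
import sentencepiece as spm

# Sketch only: train a SentencePiece model whose vocabulary contains the
# <|endoftext|> piece that SentencePieceTokenizer looks up for its EOD id.
# All file names and sizes below are placeholders, not values from this PR.
spm.SentencePieceTrainer.train(
    input="train_corpus.txt",               # one sentence per line
    model_prefix="spm_tokenizer",           # writes spm_tokenizer.model / .vocab
    vocab_size=32000,
    model_type="bpe",                       # "unigram" would also work
    user_defined_symbols=["<|endoftext|>"],
)
# Point vocab_file at the resulting spm_tokenizer.model.
```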
@@ -166,6 +170,39 @@ def eod(self):
return self.eod_id


class SentencePieceTokenizer(AbstractTokenizer):
"""Designed to Integrate SP's Tokenizer."""

def __init__(self, vocab_file):
name = 'SPM'
super().__init__(name)

self.tokenizer = spm.SentencePieceProcessor(model_file=vocab_file)
self.eod_id = self.tokenizer.piece_to_id('<|endoftext|>')

@property
def vocab_size(self):
return self.tokenizer.get_piece_size()

@property
def vocab(self):
return {self.tokenizer.id_to_piece(idx):idx for idx in range(self.tokenizer.get_piece_size())}

@property
def inv_vocab(self):
return {idx:self.tokenizer.id_to_piece(idx) for idx in range(self.tokenizer.get_piece_size())}

def tokenize(self, text):
return self.tokenizer.encode(text)

def detokenize(self, token_ids):
return self.tokenizer.decode(token_ids)

@property
def eod(self):
return self.eod_id


class HFTokenizer(AbstractTokenizer):
"""Designed to Integrate HF's Tokenizer library."""

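For reference, a short usage sketch of the class added above, assuming a SentencePiece model trained with `<|endoftext|>` in its vocabulary (the model path and sample text are placeholders):

```python
from megatron.tokenizer.tokenizer import SentencePieceTokenizer

# Assumes spm_tokenizer.model exists and contains an <|endoftext|> piece.
tok = SentencePieceTokenizer("spm_tokenizer.model")

ids = tok.tokenize("Hello, GPT-NeoX!")       # list of token ids
text = tok.detokenize(ids)                   # decoded back to a string
assert len(tok.vocab) == tok.vocab_size      # vocab dict matches reported size
print(tok.eod)                               # id of the <|endoftext|> piece
```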
