Skip to content

Commit

Permalink
修改tokenizer (Modify the tokenizer)
Browse files Browse the repository at this point in the history
  • Loading branch information
moon-hotel committed Oct 1, 2021
1 parent 283b6cb commit 05929ab
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions data_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


def my_tokenizer(s):
-    tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
+    tokenizer = get_tokenizer('basic_english')
return tokenizer(s)


Expand Down Expand Up @@ -99,7 +99,7 @@ def data_process(self, filepath):
raw_iter = iter(open(filepath, encoding="utf8"))
data = []
max_len = 0
-    for raw in raw_iter:
+    for raw in tqdm(raw_iter):
line = raw.rstrip("\n").split('","')
s, l = line[-1][:-1], line[0][1:]
s = clean_str(s)
Expand Down

0 comments on commit 05929ab

Please sign in to comment.