Skip to content

Commit

Permalink
修改tokenizer方法
Browse files (browse the repository at this point in the history)
  • Loading branch information
moon-hotel committed Sep 22, 2021
1 parent a7fc0fd commit 283b6cb
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 17 deletions.
23 changes: 6 additions & 17 deletions data_helper.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,19 @@
from collections import Counter
from torchtext.vocab import Vocab
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
import torch
import re
from tqdm import tqdm


# Translation table that puts a space in front of each punctuation mark
# my_tokenizer detaches from the preceding word.
_PUNCT_SPACING = str.maketrans({",": " ,", ".": " .", "?": " ?", "!": " !"})


def my_tokenizer(s):
    """Split text on whitespace, detaching ``, . ? !`` from the word before.

    A space is inserted in front of every comma, period, question mark and
    exclamation mark, and the result is split on runs of whitespace. A mark
    glued to the end of a word therefore becomes its own token, while a mark
    followed directly by more text stays attached to that text
    (``"a.b"`` -> ``["a", ".b"]``).

    Args:
        s: The text to tokenize.

    Returns:
        A list of token strings; empty when ``s`` is empty or whitespace only.
    """
    # NOTE(review): the original body had two further statements after its
    # first ``return`` (building a spaCy tokenizer with ``get_tokenizer`` and
    # applying it to ``s``). They could never execute and were dropped. If
    # spaCy tokenization is the behavior actually wanted, confirm and swap it
    # in, creating the tokenizer once rather than on every call.
    return s.translate(_PUNCT_SPACING).split()


def clean_str(string):
    """Blank out disallowed characters in ``string`` and lower-case it.

    Every character other than an ASCII letter, a digit, or one of
    ``- ? ! . ,`` is replaced by a single space. The substitution is one for
    one, so runs of disallowed characters become runs of spaces. The result
    is then lower-cased.

    Args:
        string: The raw text to normalize.

    Returns:
        The normalized, lower-cased text, the same length as the input
        for ASCII input.
    """
    # The contraction expansions that used to follow ("that's" -> "that is",
    # "isn't" -> "is not", ...) were removed: each search string contains an
    # apostrophe, and by that point every apostrophe has already been turned
    # into a space, so none of them could ever match. Output is unchanged.
    #
    # Raw string: same pattern value as before, without the invalid-escape
    # warnings that "\-", "\?", "\!", "\." and "\," raise in a normal string.
    return re.sub(r"[^A-Za-z0-9\-\?\!\.\,]", " ", string).lower()


Expand All @@ -42,8 +31,8 @@ def build_vocab(tokenizer, filepath, min_freq, specials=None):
specials = ['<unk>', '<pad>']
counter = Counter()
with open(filepath, encoding='utf8') as f:
for string_ in f:
string_ = string_.strip().split('","')[-1][:-1] # 取标签和新闻描述
for string_ in tqdm(f):
string_ = string_.strip().split('","')[-1][:-1] # 取标签和新闻描述
counter.update(tokenizer(clean_str(string_)))
return Vocab(counter, min_freq=min_freq, specials=specials)

Expand Down
Binary file added en_core_web_sm-3.0.0.tar.gz
Binary file not shown.

0 comments on commit 283b6cb

Please sign in to comment.