Skip to content

Commit

Permalink
changed NER BERT to uncased
Browse files Browse the repository at this point in the history
  • Loading branch information
plkmo committed Sep 19, 2019
1 parent 2e90cb8 commit 123f64f
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 6 deletions.
3 changes: 2 additions & 1 deletion ner/infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,10 @@ def infer(args, from_data=False):
time.sleep(7)
else:
max_len = args.tokens_length - 2
- tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
while True:
sent = input("Type input sentence:\n")
+ sent = sent.lower()
if sent in ["quit", "exit"]:
break

Expand Down
8 changes: 4 additions & 4 deletions ner/preprocessing_funcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def get_NER_data(args, load_extracted=True):
if len(line) == 4:
word, pos, btag, ner = line
if word != '-DOCSTART-':
- sent.append(word); sent_ner.append(re.sub("\n", "", ner))
+ sent.append(word.lower()); sent_ner.append(re.sub("\n", "", ner))
else:
sents.append(sent); ners.append(sent_ner)
sent, sent_ner = [], []
Expand All @@ -95,7 +95,7 @@ def get_NER_data(args, load_extracted=True):
if len(line) == 4:
word, pos, btag, ner = line
if word != '-DOCSTART-':
- sent.append(word); sent_ner.append(re.sub("\n", "", ner))
+ sent.append(word.lower()); sent_ner.append(re.sub("\n", "", ner))
else:
sents.append(sent); ners.append(sent_ner)
sent, sent_ner = [], []
Expand All @@ -111,15 +111,15 @@ def convert_ners_to_ids(ners, vocab):
return [vocab.ner2idx[ner] for ner in ners]


- def ner_preprocess(args, df_train, df_test=None, include_cls=False):
+ def ner_preprocess(args, df_train, df_test=None, include_cls=True):
logger.info("Preprocessing...")
vocab = vocab_mapper(df_train, df_test)
vocab.save()

logger.info("Tokenizing...")
if args.model_no == 0: # BERT
max_len = args.tokens_length
- tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
if include_cls:
df_train['sents_ids'] = df_train.progress_apply(lambda x: tokenizer.convert_tokens_to_ids(["[CLS]"] + x['sents'][:max_len] + ["[SEP]"]),\
axis=1)
Expand Down
2 changes: 1 addition & 1 deletion ner/train_funcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def load_model_and_optimizer(args, cuda=False):

if args.model_no == 0:
logger.info("Loading pre-trained BERT for token classification...")
- net = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=args.num_classes)
+ net = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=args.num_classes)

for p in net.parameters():
if p.dim() > 1:
Expand Down

0 comments on commit 123f64f

Please sign in to comment.