Add files via upload

Schlampig · Nov 13, 2020 · 121ce4f · 121ce4f
1 parent 90ac382
commit 121ce4f
Showing 1 changed file with 156 additions and 0 deletions.
diff --git a/predict.py b/predict.py
@@ -0,0 +1,156 @@
+import os
+import random
+import argparse
+import numpy as np
+from time import time
+import torch
+from torch.utils.data import TensorDataset, DataLoader
+import bert_codes.tokenization as tokenization
+from train import data2fea, get_entity_dict, compare_entity_dict # NOTE： it can trigger the utils.check_args() to delete train/dev log files.
+import ipdb
+
+# Configuration
+##############################################################################################
# Entity type -> the pair of tag ids assigned to that type.
DICT_LABEL = {
    "症状": [1, 2],
    "病史": [3, 4],
    "症状_程度": [5, 6],
    "症状_生理": [7, 8],
    "部位": [9, 10],
    "检查": [11, 12],
    "诊断": [13, 14],
    "治疗": [15, 16],
    "药物": [17, 18],
    "预后": [19, 20],
    "器材": [21, 22],
    "人群": [23, 24],
    "科室": [25, 26],
    "时间": [27, 28],
    "其他": [29, 30],
}
# Reverse lookup: tag id -> entity type (both ids of a pair map to the same type).
DICT_LABEL_REV = {
    tag_id: entity_type
    for entity_type, tag_pair in DICT_LABEL.items()
    for tag_id in (tag_pair[0], tag_pair[1])
}
+
t_config = time()


def _str2bool(value):
    """Convert a command-line flag value into a real boolean.

    ``type=bool`` calls ``bool()`` on the raw string, so every non-empty value
    (including "False" and "0") is parsed as True. This converter accepts the
    usual spellings and rejects anything else. The empty string stays False,
    as it was under ``bool()``.

    Raises:
        argparse.ArgumentTypeError: if the value is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ("true", "t", "yes", "y", "on", "1"):
        return True
    if text in ("false", "f", "no", "n", "off", "0", ""):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got {!r}".format(value))


# set args
parser = argparse.ArgumentParser()
parser.add_argument('--gpu_ids', type=str, default='2, 3')
# training parameter
parser.add_argument('--test_batch_size', type=int, default=256)
parser.add_argument('--float16', type=_str2bool, default=True)  # only sm >= 7.0 (tensorcores)
parser.add_argument('--seed', type=int, default=42)
# data and model dir
parser.add_argument('--test_dir', type=str, default='./datasets/ner_test.json')
parser.add_argument('--feature_test_dir', type=str, default='./datasets/fea_ner_test.json')
parser.add_argument('--vocab_file', type=str, default='./pretrained_models/bert_chinese/vocab.txt')
parser.add_argument('--checkpoint_dir', type=str, default='check_points/base_ner')
parser.add_argument('--predict_file', type=str, default='predict_log.txt')
args = parser.parse_args()
+
# tokenizer initialization
tokenizer = tokenization.BertTokenizer(vocab_file=args.vocab_file, do_lower_case=True)

# set seed
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

# set gpu
# Drop whitespace so a value such as '2, 3' is exported as the plain
# comma-separated list '2,3'.
os.environ["CUDA_VISIBLE_DEVICES"] = "".join(args.gpu_ids.split())
n_gpu = torch.cuda.device_count()
# Fall back to the CPU when no GPU is visible instead of failing later in model.to().
device = torch.device("cuda" if n_gpu > 0 else "cpu")
if n_gpu > 0:
    torch.cuda.manual_seed_all(args.seed)
print("device %s n_gpu %d" % (device, n_gpu))
print("device: {} n_gpu: {} 16-bits training: {}".format(device, n_gpu, args.float16))

# load model
print('***** Loading Model *****')
# NOTE(review): torch.load unpickles the checkpoint, which can execute arbitrary
# code -- only load checkpoints from a trusted source.
# map_location lets a checkpoint saved on other GPU ids load onto the device chosen above.
model = torch.load(os.path.join(args.checkpoint_dir, "best_model.pth"), map_location=device)
model.to(device)

print("Configuration Time: {}".format(time() - t_config))
+
+
+# Predict Batch
+##############################################################################################
def prepare_for_test():
    """Build the test features and a DataLoader over them.

    Reads the module-level ``args`` for the data paths and the batch size.

    Returns:
        tuple: ``(test_features, test_dataloader)``. ``test_features`` is the
        value returned by ``data2fea``; ``test_dataloader`` yields batches of
        ``(input_ids, input_mask, input_segments, input_tags)`` long tensors.
    """
    # get features
    test_features = data2fea(load_path=args.test_dir, save_path=args.feature_test_dir)
    # get test dataloader: one long tensor per feature field, in the order the
    # batches are unpacked by predict()
    field_names = ('input_ids', 'input_mask', 'input_segments', 'input_tags')
    field_tensors = [
        torch.tensor([f[name] for f in test_features], dtype=torch.long)
        for name in field_names
    ]
    test_tensor = TensorDataset(*field_tensors)
    # Do not shuffle at test time: keeping the feature order means the records
    # written to the prediction log follow the order of the test features.
    test_dataloader = DataLoader(test_tensor, batch_size=args.test_batch_size, shuffle=False)
    print("Test-{}".format(len(test_features)))
    return test_features, test_dataloader
+
+
def _format_entity_spans(lst_token, d_entity):
    """Render an entity dict as ``>type: text(start, end); ...``.

    Each value of ``d_entity`` is a list of ``"start_end"`` strings; the two
    integers are used as a slice into ``lst_token`` to recover the entity text.
    Entity types with an empty list are skipped.
    """
    parts = []
    for k, lst_v in d_entity.items():
        if len(lst_v) == 0:
            continue
        parts.append(">" + k + ": ")
        for m in lst_v:
            lst_m = m.split("_")
            idx_s, idx_e = int(lst_m[0]), int(lst_m[1])
            parts.append("".join(lst_token[idx_s:idx_e]) + "(" + lst_m[0] + ", " + lst_m[1] + "); ")
    return "".join(parts)


def print_and_save_batch_dict(input_id, d_true, d_pred):
    """Append one sample's sentence, gold entities and predicted entities to the log.

    Uses the module-level ``tokenizer`` to turn ``input_id`` back into tokens
    and appends to ``args.predict_file``. Despite its name, nothing is printed.

    Args:
        input_id: token-id tensor of one sample (``.cpu().numpy()`` is called on it).
        d_true: entity type -> list of ``"start_end"`` spans for the gold tags.
        d_pred: entity type -> list of ``"start_end"`` spans for the predicted tags.

    Returns:
        None
    """
    lst_token = tokenizer.convert_ids_to_tokens(input_id.cpu().numpy())
    # Drop the word-piece continuation marker but keep one list entry per token,
    # so the span indices still line up with token positions.
    lst_token = [t.replace("##", "") if t.startswith("##") else t for t in lst_token]
    s_token = "".join(lst_token).replace("[PAD]", "").replace("[CLS]", "").replace("[SEP]", "")
    # Format everything before touching the file, so a malformed span cannot
    # leave a dangling "Sent:" line without its True/Pred lines.
    s_true = _format_entity_spans(lst_token, d_true)
    s_pred = _format_entity_spans(lst_token, d_pred)
    # NOTE(review): the file is opened with the platform default encoding, same
    # as the other writers of this log; switch all of them together if changed.
    with open(args.predict_file, 'a') as f:
        f.write("Sent: {} \n".format(s_token))
        f.write("True: {} \n".format(s_true))
        f.write("Pred: {} \n".format(s_pred))
        f.write("\n")
    return None
+
+
def predict():
    """Run the loaded model over the test set and log per-sample and overall scores.

    For every sample the gold and predicted tag sequences are converted to
    entity dicts, compared, and appended to ``args.predict_file``. After each
    batch the mean of that batch's per-sample scores is logged per entity type
    (the original logged only one sample's scores under the "Result of batch"
    heading). Finally the overall F1 is logged and returned.

    Returns:
        float: mean over entity types of the mean per-sample F1. Entity types
        that never received a score are left out instead of turning the result
        into NaN; 0.0 is returned when no score was collected at all.
    """
    print("***** Preprocessing *****")
    _, test_dataloader = prepare_for_test()

    print("***** Predict *****")
    model.eval()
    metric_names = ("precision", "recall", "f1")
    d_res = {k: {m: [] for m in metric_names} for k in DICT_LABEL}
    with torch.no_grad():
        for step, batch in enumerate(test_dataloader):
            input_ids, input_mask, input_segments, input_tags = (t.to(device) for t in batch)
            logits = model(input_ids=input_ids,
                           token_type_ids=input_segments,
                           attention_mask=input_mask)
            logits = logits.detach().cpu().numpy()  # [bs, len, dim]
            pred_batch = np.argmax(logits, axis=-1)  # get predicted labels: [bs,len]
            true_batch = input_tags.detach().cpu().numpy()  # get true labels
            # calculate each sample in the batch
            d_batch = {}  # entity type -> metric -> scores of this batch only
            for i in range(true_batch.shape[0]):
                d_true_now = get_entity_dict(true_batch[i])
                d_pred_now = get_entity_dict(pred_batch[i])
                res_sample = compare_entity_dict(d_true=d_true_now, d_pred=d_pred_now)
                print_and_save_batch_dict(input_id=input_ids[i], d_true=d_true_now, d_pred=d_pred_now)
                for k, v in res_sample.items():
                    d_batch_k = d_batch.setdefault(k, {m: [] for m in metric_names})
                    for m in metric_names:
                        d_res[k][m].append(v[m])
                        d_batch_k[m].append(v[m])
            # Summarise the whole batch; assumes the scores returned by
            # compare_entity_dict are numeric -- TODO confirm.
            res_batch = {
                k: {m: float(np.mean(scores)) for m, scores in d_k.items()}
                for k, d_k in d_batch.items()
            }
            with open(args.predict_file, 'a') as f:
                f.write("Result of batch {} is: \n {} \n".format(step, res_batch))
                f.write(" ---------------------------------------------------- \n")
            print("Result of batch {} is: \n {} \n ".format(step, res_batch))
    # get final scores: skip entity types without any score so an empty list
    # cannot turn the average into NaN
    lst_f1 = [np.mean(v["f1"]) for v in d_res.values() if v["f1"]]
    f1 = float(np.mean(lst_f1)) if lst_f1 else 0.0
    with open(args.predict_file, 'a') as f:
        f.write("F1 of all batches is: {:.4f} \n".format(f1))
        f.write(" ---------------------------------------------------- \n")
    print("F1 of all batches is: {:.4f} \n".format(f1))
    return f1
+
+
+# Main
+##############################################################################################
if __name__ == "__main__":
    # Script entry point: run the evaluation and keep the overall F1 in `res`.
    res = predict()
+