From 2e32c58d711f3c90663df02af6769435ed958f2b Mon Sep 17 00:00:00 2001 From: tauriel Date: Mon, 20 Jan 2020 22:36:16 +0800 Subject: [PATCH] fix bugs --- .gitignore | 2 + ZEN/modeling.py | 12 +- ZEN/ngram_utils.py | 9 +- examples/create_pre_train_data.py | 15 +- examples/run_pre_train.py | 168 +++++++++---- examples/run_token_level_classification.py | 273 +++++++++++++++++---- examples/utils_token_level_task.py | 187 +++++++++++++- finetune-ner.sh | 15 ++ 8 files changed, 568 insertions(+), 113 deletions(-) create mode 100644 .gitignore create mode 100644 finetune-ner.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ee05595 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.idea +__pycache__ diff --git a/ZEN/modeling.py b/ZEN/modeling.py index c3805f9..0835405 100644 --- a/ZEN/modeling.py +++ b/ZEN/modeling.py @@ -266,19 +266,19 @@ def __init__(self, "or the path to a pretrained model config file (str)") @classmethod - def from_dict(cls, json_object): + def from_dict(cls, json_object, ngram_size): """Constructs a `BertConfig` from a Python dictionary of parameters.""" - config = ZenConfig(vocab_size_or_config_json_file=-1, word_vocab_size=104089) + config = ZenConfig(vocab_size_or_config_json_file=-1, word_vocab_size=ngram_size) for key, value in json_object.items(): config.__dict__[key] = value return config @classmethod - def from_json_file(cls, json_file): + def from_json_file(cls, json_file, ngram_size): """Constructs a `BertConfig` from a json file of parameters.""" with open(json_file, "r", encoding='utf-8') as reader: text = reader.read() - return cls.from_dict(json.loads(text)) + return cls.from_dict(json.loads(text), ngram_size) def __repr__(self): return str(self.to_json_string()) @@ -729,6 +729,8 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): kwargs.pop('from_tf', None) multift = kwargs.get("multift", False) kwargs.pop('multift', None) + ngram_size = kwargs.get('ngram_size', 0) + kwargs.pop('ngram_size', None) if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path] @@ -783,7 +785,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): logger.info("loading configuration file {} from cache at {}".format( config_file, resolved_config_file)) # Load config - config = ZenConfig.from_json_file(resolved_config_file) + config = ZenConfig.from_json_file(resolved_config_file, ngram_size) logger.info("Model config {}".format(config)) # Instantiate model. 
model = cls(config, *inputs, **kwargs) diff --git a/ZEN/ngram_utils.py b/ZEN/ngram_utils.py index 73e7eea..7479034 100644 --- a/ZEN/ngram_utils.py +++ b/ZEN/ngram_utils.py @@ -37,16 +37,23 @@ def __init__(self, ngram_freq_path, tokenizer, max_ngram_in_seq=128): self.id_to_ngram_list = ["[pad]"] self.ngram_to_id_dict = {"[pad]": 0} self.ngram_to_freq_dict = {} + self.ngram_size = 0 logger.info("loading ngram frequency file {}".format(ngram_freq_path)) with open(ngram_freq_path, "r", encoding="utf-8") as fin: for i, line in enumerate(fin): - ngram,freq = line.split(",") + line = line.strip() + items = line.split(',') + if len(items) != 2: + continue + ngram,freq = items tokens = tuple(tokenizer.tokenize(ngram)) self.ngram_to_freq_dict[ngram] = freq self.id_to_ngram_list.append(tokens) self.ngram_to_id_dict[tokens] = i + 1 + self.ngram_size = i + 1 + def save(self, ngram_freq_path): with open(ngram_freq_path, "w", encoding="utf-8") as fout: for ngram,freq in self.ngram_to_freq_dict.items(): diff --git a/examples/create_pre_train_data.py b/examples/create_pre_train_data.py index bd6d2c1..3c4b431 100644 --- a/examples/create_pre_train_data.py +++ b/examples/create_pre_train_data.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Create data for pretrain.""" - +import sys from argparse import ArgumentParser from pathlib import Path @@ -34,8 +34,7 @@ def __init__(self, reduce_memory=False): self.temp_dir = TemporaryDirectory() self.working_dir = Path(self.temp_dir.name) self.document_shelf_filepath = self.working_dir / 'shelf.db' - self.document_shelf = shelve.open(str(self.document_shelf_filepath), - flag='n', protocol=-1) + self.document_shelf = shelve.open(str(self.document_shelf_filepath)) self.documents = None else: self.documents = [] @@ -253,9 +252,12 @@ def create_instances_from_document( is_random_next = False for j in range(a_end, len(current_chunk)): tokens_b.extend(current_chunk[j]) + + truncate_seq_pair(tokens_a, tokens_b, max_num_tokens) + tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"] # The segment IDs are 0 for the [CLS] token, the A tokens and the first [SEP] # They are 1 for the B tokens and the final [SEP] @@ -318,6 +320,7 @@ def main(): parser.add_argument("--epochs_to_generate", type=int, default=3, help="Number of epochs of data to pregenerate") + parser.add_argument("--start_epoch", type=int, default=0) parser.add_argument("--max_seq_len", type=int, default=128) parser.add_argument("--short_seq_prob", type=float, default=0.1, help="Probability of making a short sentence as a training example") @@ -333,7 +336,7 @@ def main(): tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) vocab_list = list(tokenizer.vocab.keys()) - ngram_dict = ZenNgramDict(args.bert_model, tokenizer=tokenizer) + ngram_dict = ZenNgramDict(args.bert_model, tokenizer=tokenizer, max_ngram_in_seq=args.max_ngram_in_sequence) with DocumentDatabase(reduce_memory=args.reduce_memory) as docs: with args.train_corpus.open() as f: @@ -346,6 +349,8 @@ def main(): else: tokens = tokenizer.tokenize(line) doc.append(tokens) + # if len(docs) > 1: + # break if doc: docs.add_document(doc) # If the last doc didn't end on a newline, make sure it still gets added if len(docs) <= 1: @@ -356,7 +361,7 @@ def main(): "sections or paragraphs.") args.output_dir.mkdir(exist_ok=True) - for epoch in trange(args.epochs_to_generate, desc="Epoch"): + for epoch in trange(args.start_epoch, 
args.epochs_to_generate, desc="Epoch"): epoch_filename = args.output_dir / f"epoch_{epoch}.json" num_instances = 0 with epoch_filename.open('w') as epoch_file: diff --git a/examples/run_pre_train.py b/examples/run_pre_train.py index afcff25..b39fe1f 100644 --- a/examples/run_pre_train.py +++ b/examples/run_pre_train.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """PyTorch pretrain for ZEN model.""" +import sys from argparse import ArgumentParser from pathlib import Path @@ -30,6 +31,8 @@ from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm +from tensorboardX import SummaryWriter + from ZEN import WEIGHTS_NAME, CONFIG_NAME from ZEN import ZenConfig, ZenForPreTraining from ZEN import BertTokenizer @@ -42,6 +45,16 @@ log_format = '%(asctime)-10s: %(message)s' logging.basicConfig(level=logging.INFO, format=log_format) +NGRAM_DICT_NAME = 'ngram.txt' + +def get_ngram_size(bert_model_path): + """ + Get ngram size. + :param bert_model_path: + :return: + """ + ngram_path = os.path.join(bert_model_path, NGRAM_DICT_NAME) + return int(os.popen(f"wc -l {ngram_path}").read().split()[0]) def convert_example_to_features(example, tokenizer, max_seq_length, max_ngram_in_sequence): tokens = example["tokens"] @@ -56,11 +69,43 @@ def convert_example_to_features(example, tokenizer, max_seq_length, max_ngram_in ngram_lengths = example["ngram_lengths"] ngram_segment_ids = example["ngram_segment_ids"] + # def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens): + # """Truncates a pair of sequences to a maximum sequence length. Lifted from Google's BERT repo.""" + # while True: + # total_length = len(tokens_a) + len(tokens_b) + # if total_length <= max_num_tokens: + # break + # + # trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b + # assert len(trunc_tokens) >= 1 + # + # # We want to sometimes truncate from the front and sometimes from the + # # back to add more randomness and avoid biases. 
+ # if random.random() < 0.5: + # del trunc_tokens[0] + # else: + # trunc_tokens.pop() + # + # + # sep_idx = tokens.index('[SEP]') + # tokens_a = tokens[1:sep_idx] + # tokens_b = tokens[sep_idx:-1] + # truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + # + # tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"] + # # The segment IDs are 0 for the [CLS] token, the A tokens and the first [SEP] + # # They are 1 for the B tokens and the final [SEP] + # + # segment_ids = [0 for _ in range(len(tokens_a) + 2)] + [1 for _ in range(len(tokens_b) + 1)] + assert len(tokens) == len(segment_ids) <= max_seq_length # The preprocessed data should be already truncated + + input_ids = tokenizer.convert_tokens_to_ids(tokens) masked_label_ids = tokenizer.convert_tokens_to_ids(masked_lm_labels) input_array = np.zeros(max_seq_length, dtype=np.int) + input_array[:len(input_ids)] = input_ids mask_array = np.zeros(max_seq_length, dtype=np.bool) @@ -74,6 +119,7 @@ def convert_example_to_features(example, tokenizer, max_seq_length, max_ngram_in # add ngram pads ngram_id_array = np.zeros(max_ngram_in_sequence, dtype=np.int) + ngram_id_array[:len(ngram_ids)] = ngram_ids # record the masked positions @@ -111,11 +157,12 @@ def convert_example_to_features(example, tokenizer, max_seq_length, max_ngram_in class PregeneratedDataset(Dataset): - def __init__(self, training_path, epoch, tokenizer, num_data_epochs, reduce_memory=False, fp16=False): + def __init__(self, training_path, epoch, tokenizer, num_data_epochs, reduce_memory=False, fp16=False, load_exist=False): self.vocab = tokenizer.vocab self.tokenizer = tokenizer self.epoch = epoch - self.data_epoch = epoch % num_data_epochs + # self.data_epoch = epoch % num_data_epochs + self.data_epoch = epoch data_file = training_path / f"epoch_{self.data_epoch}.json" metrics_file = training_path / f"epoch_{self.data_epoch}_metrics.json" assert data_file.is_file() and metrics_file.is_file() @@ -126,39 +173,46 @@ def __init__(self, training_path, epoch, tokenizer, num_data_epochs, reduce_memo self.temp_dir = None self.working_dir = None self.fp16 = fp16 + self.temp_dir = "/tmp" + # TemporaryDirectory() + self.working_dir = Path(self.temp_dir) if reduce_memory: - self.temp_dir = "/tmp" - # TemporaryDirectory() - self.working_dir = Path(self.temp_dir) + + if load_exist: + mode = 'r+' + else: + mode = 'w+' + input_ids = np.memmap(filename=self.working_dir / 'input_ids.memmap', - mode='w+', dtype=np.int32, shape=(num_samples, seq_len)) + mode=mode, dtype=np.int32, shape=(num_samples, seq_len)) input_masks = np.memmap(filename=self.working_dir / 'input_masks.memmap', - shape=(num_samples, seq_len), mode='w+', dtype=np.bool) + shape=(num_samples, seq_len), mode=mode, dtype=np.bool) segment_ids = np.memmap(filename=self.working_dir / 'segment_ids.memmap', - shape=(num_samples, seq_len), mode='w+', dtype=np.bool) + shape=(num_samples, seq_len), mode=mode, dtype=np.bool) lm_label_ids = np.memmap(filename=self.working_dir / 'lm_label_ids.memmap', - shape=(num_samples, seq_len), mode='w+', dtype=np.int32) + shape=(num_samples, seq_len), mode=mode, dtype=np.int32) lm_label_ids[:] = -1 + is_nexts = np.memmap(filename=self.working_dir / 'is_nexts.memmap', - shape=(num_samples,), mode='w+', dtype=np.bool) + shape=(num_samples,), mode=mode, dtype=np.bool) # add ngram level features ngram_ids = np.memmap(filename=self.working_dir / 'ngram_ids.memmap', - mode='w+', dtype=np.int32, shape=(num_samples, max_ngram_in_sequence)) + mode=mode, dtype=np.int32, shape=(num_samples, 
max_ngram_in_sequence)) ngram_masks = np.memmap(filename=self.working_dir / 'ngram_masks.memmap', - mode='w+', dtype=np.bool, shape=(num_samples, max_ngram_in_sequence)) + mode=mode, dtype=np.bool, shape=(num_samples, max_ngram_in_sequence)) ngram_positions = np.memmap(filename=self.working_dir / 'ngram_positions.memmap', - mode='w+', dtype=np.bool, shape=(num_samples, seq_len, max_ngram_in_sequence)) + mode=mode, dtype=np.bool, shape=(num_samples, seq_len, max_ngram_in_sequence)) ngram_starts = np.memmap(filename=self.working_dir / 'ngram_starts.memmap', - mode='w+', dtype=np.int32, shape=(num_samples, max_ngram_in_sequence)) + mode=mode, dtype=np.int32, shape=(num_samples, max_ngram_in_sequence)) ngram_lengths = np.memmap(filename=self.working_dir / 'ngram_lengths.memmap', - mode='w+', dtype=np.int32, shape=(num_samples, max_ngram_in_sequence)) + mode=mode, dtype=np.int32, shape=(num_samples, max_ngram_in_sequence)) ngram_segment_ids = np.memmap(filename=self.working_dir / 'ngram_segment_ids.memmap', - mode='w+', dtype=np.bool, shape=(num_samples, max_ngram_in_sequence)) + mode=mode, dtype=np.bool, shape=(num_samples, max_ngram_in_sequence)) else: input_ids = np.zeros(shape=(num_samples, seq_len), dtype=np.int32) @@ -171,32 +225,38 @@ def __init__(self, training_path, epoch, tokenizer, num_data_epochs, reduce_memo ngram_ids = np.zeros(shape=(num_samples, max_ngram_in_sequence), dtype=np.int32) ngram_masks = np.zeros(shape=(num_samples, max_ngram_in_sequence), dtype=np.bool) - ngram_positions = np.zeros(shape=(num_samples, seq_len, max_ngram_in_sequence), dtype=np.bool) + # ngram_positions = np.zeros(shape=(num_samples, seq_len, max_ngram_in_sequence), dtype=np.bool) + ngram_positions = np.memmap(filename=self.working_dir / 'ngram_positions.memmap', + mode='w+', dtype=np.bool, shape=(num_samples, seq_len, max_ngram_in_sequence)) ngram_starts = np.zeros(shape=(num_samples, max_ngram_in_sequence), dtype=np.int32) ngram_lengths = np.zeros(shape=(num_samples, max_ngram_in_sequence), dtype=np.int32) ngram_segment_ids = np.zeros(shape=(num_samples, max_ngram_in_sequence), dtype=np.bool) logging.info(f"Loading training examples for epoch {epoch}") - with data_file.open() as f: - for i, line in enumerate(tqdm(f, total=num_samples, desc="Training examples")): - line = line.strip() - example = json.loads(line) - features = convert_example_to_features(example, tokenizer, seq_len, max_ngram_in_sequence) - input_ids[i] = features.input_ids - segment_ids[i] = features.segment_ids - input_masks[i] = features.input_mask - lm_label_ids[i] = features.lm_label_ids - is_nexts[i] = features.is_next - # add ngram related ids - ngram_ids[i] = features.ngram_ids - ngram_masks[i] = features.ngram_masks - ngram_positions[i] = features.ngram_positions - ngram_starts[i] = features.ngram_starts - ngram_lengths[i] = features.ngram_lengths - ngram_segment_ids[i] = features.ngram_segment_ids - - assert i == num_samples - 1 # Assert that the sample count metric was true + if not load_exist: + with data_file.open() as f: + for i, line in enumerate(tqdm(f, total=num_samples, desc="Training examples")): + line = line.strip() + example = json.loads(line) + features = convert_example_to_features(example, tokenizer, seq_len, max_ngram_in_sequence) + input_ids[i] = features.input_ids + segment_ids[i] = features.segment_ids + input_masks[i] = features.input_mask + lm_label_ids[i] = features.lm_label_ids + is_nexts[i] = features.is_next + # add ngram related ids + ngram_ids[i] = features.ngram_ids + ngram_masks[i] = 
features.ngram_masks + ngram_positions[i] = features.ngram_positions + ngram_starts[i] = features.ngram_starts + ngram_lengths[i] = features.ngram_lengths + ngram_segment_ids[i] = features.ngram_segment_ids + + assert i == num_samples - 1 # Assert that the sample count metric was true + + + logging.info("Loading complete!") self.num_samples = num_samples self.seq_len = seq_len @@ -246,6 +306,7 @@ def main(): parser.add_argument("--do_lower_case", action="store_true") parser.add_argument("--reduce_memory", action="store_true", help="Store training data as on-disc memmaps to massively reduce memory usage") + parser.add_argument("--load_exist", action='store_true', help="Load np.memmap cache") parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for") parser.add_argument("--local_rank", @@ -355,7 +416,8 @@ def main(): num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() if args.scratch: - config = ZenConfig(21128, 104089) + # the ngram embedding size needs an extra +1 + config = ZenConfig(21128, get_ngram_size(args.bert_model) + 1) model = ZenForPreTraining(config) else: model = ZenForPreTraining.from_pretrained(args.bert_model) @@ -384,7 +446,7 @@ def main(): if args.fp16: try: - from apex.optimizers import FP16_Optimizer + from apex.fp16_utils import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( @@ -393,7 +455,7 @@ def main(): optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, - max_grad_norm=1.0) + ) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: @@ -406,25 +468,35 @@ def main(): warmup=args.warmup_proportion, t_total=num_train_optimization_steps) - global_step = 0 + import shutil + tb_log_dir = os.path.join(args.output_dir, 'tb-log') + if os.path.exists(tb_log_dir): + shutil.rmtree(tb_log_dir) + tb_writer = SummaryWriter(tb_log_dir) + + # adjust the learning rate schedule stage + # global_step = total_train_examples / args.epochs * 4 + start_step = 120000 + global_step = start_step logging.info("***** Running training *****") logging.info(" Num examples = %d", total_train_examples) logging.info(" Batch size = %d", args.train_batch_size) logging.info(" Num steps = %d", num_train_optimization_steps) model.train() - for epoch in range(args.epochs): - + for epoch in range(args.already_trained_epoch+1, args.epochs): + ## read data starting from shard 0 of data_new epoch_dataset = PregeneratedDataset(epoch=epoch, training_path=args.pregenerated_data, tokenizer=tokenizer, num_data_epochs=num_data_epochs, reduce_memory=args.reduce_memory, - fp16=args.fp16) + fp16=args.fp16, load_exist=args.load_exist) if args.local_rank == -1: train_sampler = RandomSampler(epoch_dataset) else: train_sampler = DistributedSampler(epoch_dataset) - train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size) + train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, + batch_size=args.train_batch_size, num_workers=8) tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar: @@ -462,20 +534,24 @@ def main(): if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically - lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) + lr_this_step = args.learning_rate * warmup_linear.get_lr_(global_step / num_train_optimization_steps) for param_group in
optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 + tb_writer.add_scalar('lr', optimizer.param_groups[0]['lr'], global_step - start_step) + tb_writer.add_scalar('loss', loss.item(), global_step - start_step) + + # Save a trained model ts = time.time() st = datetime.datetime.fromtimestamp(ts).strftime('%m%d%H%M%S') saving_path = args.output_dir - saving_path = Path(os.path.join(saving_path, args.save_name + st + "_epoch_" + str(epoch + args.already_trained_epoch))) + saving_path = Path(os.path.join(saving_path, args.save_name + st + "_epoch_" + str(epoch))) if saving_path.is_dir() and list(saving_path.iterdir()): logging.warning(f"Output directory ({ saving_path }) already exists and is not empty!") diff --git a/examples/run_token_level_classification.py b/examples/run_token_level_classification.py index 84386ff..e60e548 100644 --- a/examples/run_token_level_classification.py +++ b/examples/run_token_level_classification.py @@ -13,26 +13,39 @@ # See the License for the specific language governing permissions and # limitations under the License. """Run token level classification task on ZEN model.""" - from __future__ import absolute_import, division, print_function +import sys +sys.path.insert(0, '/home/lim/anaconda3/envs/pytorch/lib/python3.7/site-packages') +import torch +print(f'torch version: {torch.__version__}') + import argparse import json import logging import os import random +import shutil import numpy as np import torch import torch.nn.functional as F from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) + TensorDataset, Dataset) from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm, trange from seqeval.metrics import classification_report, f1_score import datetime +from tensorboardX import SummaryWriter + +# from ignite.engine import Engine, Events, create_supervised_evaluator +# from ignite.metrics import RunningAverage, Accuracy, Precision, Recall, Loss, TopKCategoricalAccuracy +# +# from ignite.contrib.handlers import TensorboardLogger +# from ignite.contrib.handlers.tensorboard_logger import OutputHandler, OptimizerParamsHandler +# from ignite.handlers import ModelCheckpoint, EarlyStopping -from utils_token_level_task import processors, convert_examples_to_features +from utils_token_level_task import processors, convert_examples_to_features, convert_singlel_example_to_feature from ZEN import BertTokenizer, BertAdam, WarmupLinearSchedule from ZEN import ZenForTokenClassification from ZEN import ZenNgramDict @@ -65,10 +78,33 @@ def load_examples(args, tokenizer, ngram_dict, processor, label_list, mode): all_ngram_lengths = torch.tensor([f.ngram_lengths for f in features], dtype=torch.long) all_ngram_seg_ids = torch.tensor([f.ngram_seg_ids for f in features], dtype=torch.long) all_ngram_masks = torch.tensor([f.ngram_masks for f in features], dtype=torch.long) - + logger.info('Dataset is ready.') return TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_ngram_ids,all_ngram_positions, all_ngram_lengths, all_ngram_seg_ids, all_ngram_masks, all_valid_ids, all_lmask_ids) +class BERTDataset(Dataset): + def __init__(self, args, tokenizer, ngram_dict, processor, label_list, examples): + self.args = args + self.tokenizer = tokenizer + self.ngram_dict = ngram_dict + self.processor = processor + self.label_list = label_list + self.examples = examples + # self.mode = mode + + # if mode == "train": + # self.examples = 
processor.get_train_examples(args.data_dir) + # elif mode == "test": + # self.examples = processor.get_test_examples(args.data_dir) + + def __len__(self): + return len(self.examples) + + def __getitem__(self, item): + example = self.examples[item] + feature = convert_singlel_example_to_feature(example, self.label_list, self.args.max_seq_length, self.tokenizer, self.ngram_dict) + return feature + def cws_evaluate_word_PRF(y_pred, y): #dict = {'E': 2, 'S': 3, 'B':0, 'I':1} cor_num = 0 @@ -113,10 +149,12 @@ def save_zen_model(save_zen_model_path, model, tokenizer, ngram_dict, args): def evaluate(args, model, tokenizer, ngram_dict, processor, label_list): num_labels = len(label_list) + 1 - eval_dataset = load_examples(args, tokenizer, ngram_dict, processor, label_list, mode="test") + # eval_dataset = load_examples(args, tokenizer, ngram_dict, processor, label_list, mode="test") + examples = processor.get_test_examples(args.data_dir) + eval_dataset = BERTDataset(args, tokenizer, ngram_dict, processor, label_list, examples) # Run prediction for full data eval_sampler = SequentialSampler(eval_dataset) - eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) + eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, num_workers=8) # Eval! logger.info("***** Running evaluation *****") @@ -127,8 +165,10 @@ def evaluate(args, model, tokenizer, ngram_dict, processor, label_list): y_true = [] y_pred = [] label_map = {i: label for i, label in enumerate(label_list, 1)} + # label_map = {i: label for i, label in enumerate(label_list, 0)} + label_map[0] = '' for batch in tqdm(eval_dataloader, desc="Evaluating"): - batch = tuple(t.to(args.device) for t in batch) + batch = tuple(t.to(args.device) for t in batch.values()) input_ids, input_mask, segment_ids, label_ids, ngram_ids, ngram_positions, \ ngram_lengths, ngram_seg_ids, ngram_masks, valid_ids, l_mask = batch @@ -148,6 +188,14 @@ def evaluate(args, model, tokenizer, ngram_dict, processor, label_list): break y_true.append(label_map[label_ids[i][j]]) y_pred.append(label_map[logits[i][j]]) + # import pickle + # pred_fn = 'labels.pred' + # with open(os.path.join(args.output_dir, pred_fn), 'wb') as f: + # pickle.dump([y_true, y_pred], f) + # + # logger.info("Prediction is done.") + # exit() + if args.task_name == 'cwsmsra' or args.task_name == 'cwspku': #evaluating CWS result = cws_evaluate_word_PRF(y_pred, y_true) @@ -165,19 +213,10 @@ def evaluate(args, model, tokenizer, ngram_dict, processor, label_list): return result def train(args, model, tokenizer, ngram_dict, processor, label_list): - train_dataset = load_examples(args, tokenizer, ngram_dict, processor, label_list, mode="train") - - if args.fp16: - model.half() - if args.local_rank != -1: - try: - from apex.parallel import DistributedDataParallel as DDP - except ImportError: - raise ImportError( - "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") - model = DDP(model) - elif args.n_gpu > 1: - model = torch.nn.DataParallel(model) + # train_dataset = load_examples(args, tokenizer, ngram_dict, processor, label_list, mode="train") + examples = processor.get_train_examples(args.data_dir) + train_dataset = BERTDataset(args, tokenizer, ngram_dict, processor, label_list, examples) + model = model.cuda() num_train_optimization_steps = int( len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs @@ -190,31 +229,64 @@ def train(args, 
model, tokenizer, ngram_dict, processor, label_list): {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] + + optimizer = BertAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) if args.fp16: try: - from apex.optimizers import FP16_Optimizer - from apex.optimizers import FusedAdam + from apex.parallel import DistributedDataParallel as DDP + # from apex.fp16_utils import * + from apex import amp, optimizers + from apex.multi_tensor_apply import multi_tensor_applier except ImportError: - raise ImportError( - "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") - - optimizer = FusedAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - bias_correction=False, - max_grad_norm=1.0) - if args.loss_scale == 0: - optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) - else: - optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.") + warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, t_total=num_train_optimization_steps) - else: - optimizer = BertAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) + model, optimizer = amp.initialize(model, optimizer, opt_level='O1') + + # if args.fp16: + # model.half() + + # Distributed training + if args.local_rank != -1: + # By default, apex.parallel.DistributedDataParallel overlaps communication with + # computation in the backward pass. + # model = DDP(model) + # delay_allreduce delays all communication to the end of the backward pass. 
+ model = DDP(model, delay_allreduce=True) + elif args.n_gpu > 1: + model = torch.nn.DataParallel(model) + + + # if args.fp16: + # try: + # from apex.fp16_utils import FP16_Optimizer + # from apex.optimizers import FusedAdam + # except ImportError: + # raise ImportError( + # "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + # + # optimizer = FusedAdam(optimizer_grouped_parameters, + # lr=args.learning_rate, + # bias_correction=False, + # ) + # if args.loss_scale == 0: + # optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + # else: + # optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) + + # else: + + global_step = 0 + tb_log_dir = os.path.join(args.output_dir, 'tb-log') + if args.local_rank in [-1, 0]: + tb_writer = SummaryWriter(tb_log_dir) + logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", args.train_batch_size) @@ -224,7 +296,7 @@ def train(args, model, tokenizer, ngram_dict, processor, label_list): train_sampler = RandomSampler(train_dataset) else: train_sampler = DistributedSampler(train_dataset) - train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) + train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, num_workers=8) best_f1 = -1 best_epoch = -1 @@ -234,7 +306,7 @@ def train(args, model, tokenizer, ngram_dict, processor, label_list): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): - batch = tuple(t.to(args.device) for t in batch) + batch = tuple(t.to(args.device) for t in batch.values()) input_ids, input_mask, segment_ids, label_ids, ngram_ids, ngram_positions, ngram_lengths, ngram_seg_ids, ngram_masks, valid_ids, l_mask = batch loss = model(input_ids, token_type_ids=None, attention_mask=None, labels=label_ids, valid_ids=valid_ids, attention_mask_label=None, ngram_ids=ngram_ids, ngram_positions=ngram_positions) @@ -244,7 +316,9 @@ def train(args, model, tokenizer, ngram_dict, processor, label_list): loss = loss / args.gradient_accumulation_steps if args.fp16: - optimizer.backward(loss) + # optimizer.backward(loss) + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() else: loss.backward() @@ -256,18 +330,93 @@ def train(args, model, tokenizer, ngram_dict, processor, label_list): # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * \ - warmup_linear(global_step / num_train_optimization_steps, args.warmup_proportion) + warmup_linear.get_lr_(global_step / num_train_optimization_steps) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step + optimizer.step() optimizer.zero_grad() global_step += 1 - if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: - # Save model checkpoint - output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) - if not os.path.exists(output_dir): - os.makedirs(output_dir) + if args.local_rank in [-1, 0]: + tb_writer.add_scalar('lr', optimizer.param_groups[0]['lr'], global_step) + tb_writer.add_scalar('loss', loss.item(), global_step) + # if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: + # # Save model checkpoint + # output_dir = os.path.join(args.output_dir, 
"checkpoint-{}".format(global_step)) + # if not os.path.exists(output_dir): + # os.makedirs(output_dir) + # save_zen_model(output_dir, model, tokenizer, ngram_dict, args) + if epoch_num % 3 == 0: + if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): + result = evaluate(args, model, tokenizer, ngram_dict, processor, label_list) + tb_writer.add_scalar('f1', result['f1'], global_step) + logger.info("\nf1=%s\n" % (str(result["f1"]))) + + if result['f1'] > best_f1: + best_f1 = result['f1'] + # output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) + output_dir = os.path.join(args.output_dir, "checkpoint-best") + + if os.path.exists(output_dir): + shutil.rmtree(output_dir) + os.makedirs(output_dir) save_zen_model(output_dir, model, tokenizer, ngram_dict, args) + logging.info(f'Saving best model, f1 is {best_f1}') + + +def predict(args, model, tokenizer, ngram_dict, processor, label_list): + num_labels = len(label_list) + 1 + # eval_dataset = load_examples(args, tokenizer, ngram_dict, processor, label_list, mode="test") + examples = processor.get_test_examples(args.data_dir) + eval_dataset = BERTDataset(args, tokenizer, ngram_dict, processor, label_list, examples) + # Run prediction for full data + eval_sampler = SequentialSampler(eval_dataset) + eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, num_workers=8) + + # Eval! + logger.info("***** Running prediction *****") + logger.info(" Num examples = %d", len(eval_dataset)) + logger.info(" Batch size = %d", args.eval_batch_size) + + model.eval() + y_true = [] + y_pred = [] + label_map = {i: label for i, label in enumerate(label_list, 1)} + # print(label_map) + # exit() + label_map[0] = '' + for batch in tqdm(eval_dataloader, desc="Evaluating"): + batch = tuple(t.to(args.device) for t in batch.values()) + input_ids, input_mask, segment_ids, label_ids, ngram_ids, ngram_positions, \ + ngram_lengths, ngram_seg_ids, ngram_masks, valid_ids, l_mask = batch + + with torch.no_grad(): + logits = model(input_ids, token_type_ids=None, attention_mask=None, labels=None, valid_ids=valid_ids, + attention_mask_label=None, ngram_ids=ngram_ids, ngram_positions=ngram_positions) + + logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2) + logits = logits.detach().cpu().numpy() + label_ids = label_ids.detach().cpu().numpy() + + # print(label_ids) + # print(logits) + # exit() + + for i, label in enumerate(label_ids): + for j, m in enumerate(label): + if j == 0: + continue + # if label_ids[i][j] == num_labels - 1: + if logits[i][j] == num_labels - 1: + break + y_true.append(label_map[label_ids[i][j]]) + y_pred.append(label_map[logits[i][j]]) + import pickle + pred_fn = 'labels.pred' + with open(os.path.join(args.output_dir, pred_fn), 'wb') as f: + pickle.dump(y_pred, f) + logger.info("Prediction is done.") + def main(): parser = argparse.ArgumentParser() @@ -315,6 +464,9 @@ def main(): parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") + parser.add_argument("--do_predict", + action='store_true', + help="Whether to run predcit on the test set.") parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") @@ -364,6 +516,9 @@ def main(): "Positive power of 2: static loss scaling value.\n") parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.") + parser.add_argument("--finetune_top_only", + action='store_true', + 
help="Whether to finetune classifier only") args = parser.parse_args() @@ -396,8 +551,8 @@ def main(): # Set seed set_seed(args) - if not args.do_train and not args.do_eval: - raise ValueError("At least one of `do_train` or `do_eval` must be True.") + if not args.do_train and not args.do_eval and not args.do_predict: + raise ValueError("At least one of `do_train` or `do_eval` or `do_predict` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: print("Output directory already exists and is not empty.") @@ -410,7 +565,8 @@ def main(): raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() - label_list = processor.get_labels() + label_list = processor.get_labels(args.data_dir) + logger.info(f"label_list: {label_list}") num_labels = len(label_list) + 1 # Prepare model tokenizer @@ -421,7 +577,19 @@ def main(): model = ZenForTokenClassification.from_pretrained(args.bert_model, cache_dir=cache_dir, num_labels=num_labels, - multift=args.multift) + multift=args.multift, + ngram_size=ngram_dict.ngram_size) + + for name, parameters in model.named_parameters(): + # print(name.split('.')) + if args.finetune_top_only: + if name.split('.')[0] != 'classifier': + parameters.requires_grad = False + logger.info(f'{name} : {parameters.size()}, {parameters.requires_grad}') + + # for param in model.parameters(): + # print(param.size(), param.requires_grad) + model.to(args.device) if args.do_train: @@ -429,6 +597,9 @@ def main(): if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): result = evaluate(args, model, tokenizer, ngram_dict, processor, label_list) logger.info("\nf1=%s\n" % (str(result["f1"]))) + if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): + predict(args, model, tokenizer, ngram_dict, processor, label_list) + if __name__ == "__main__": main() \ No newline at end of file diff --git a/examples/utils_token_level_task.py b/examples/utils_token_level_task.py index 57878cf..ef2c174 100644 --- a/examples/utils_token_level_task.py +++ b/examples/utils_token_level_task.py @@ -18,10 +18,13 @@ import logging import os +import tqdm import math from random import shuffle logger = logging.getLogger(__name__) +import torch + class InputExample(object): """A single training/test example for simple sequence classification.""" @@ -247,14 +250,29 @@ def get_test_examples(self, data_dir): return self._create_examples( self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") - def get_labels(self): - return ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "[CLS]", "[SEP]"] + def get_labels(self, data_dir): + # return ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "[CLS]", "[SEP]"] + + # labels = set() + # with open(os.path.join(data_dir, 'train.tsv')) as f: + # for line in f: + # line = line.strip() + # items = line.split('\t') + # if len(items) != 2: + # continue + # _, label = items + # labels.add(label) + # labels = list(labels) + # labels.extend(['[CLS]', '[SEP]']) + # return labels + + return ["O", "B-PER", "M-PER", "E-PER", "[CLS]", "[SEP]"] def _create_examples(self, lines, set_type): examples = [] for i, (sentence, label) in enumerate(lines): guid = "%s-%s" % (set_type, i) - text_a = ' '.join(sentence) + text_a = ''.join(sentence) text_b = None label = label examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) @@ -299,13 +317,22 @@ def convert_examples_to_features(examples, label_list, max_seq_length, 
tokenizer label_map = {label: i for i, label in enumerate(label_list, 1)} features = [] - for (ex_index, example) in enumerate(examples): - textlist = example.text_a.split(' ') + for (ex_index, example) in enumerate(tqdm.tqdm(examples, desc='Converting examples')): + # split(' ') mis-splits spaces inside the sentence, so `` is used as the separator + # textlist = [c for c in example.text_a.split(' ') if c] + textlist = example.text_a.split('') labellist = example.label tokens = [] labels = [] valid = [] label_mask = [] + + # logger.info(example.text_a) + # logger.info(textlist) + # logger.info(f'len(textlist) is {len(textlist)}') + # logger.info(labellist) + # break + for i, word in enumerate(textlist): token = tokenizer.tokenize(word) tokens.extend(token) @@ -417,6 +444,156 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer label_mask=label_mask)) return features +def convert_singlel_example_to_feature(example, label_list, max_seq_length, tokenizer, ngram_dict): + """Loads a data file into a list of `InputBatch`s.""" + + label_map = {label: i for i, label in enumerate(label_list, 1)} + + # features = [] + # for (ex_index, example) in enumerate(tqdm.tqdm(examples, desc='Converting examples')): + + + # split(' ') mis-splits spaces inside the sentence, so `` is used as the separator + # textlist = [c for c in example.text_a.split(' ') if c] + textlist = example.text_a.split('') + labellist = example.label + tokens = [] + labels = [] + valid = [] + label_mask = [] + + # logger.info(example.text_a) + # logger.info(textlist) + # logger.info(f'len(textlist) is {len(textlist)}') + # logger.info(labellist) + # break + + for i, word in enumerate(textlist): + token = tokenizer.tokenize(word) + tokens.extend(token) + label_1 = labellist[i] + for m in range(len(token)): + if m == 0: + labels.append(label_1) + valid.append(1) + label_mask.append(1) + else: + valid.append(0) + if len(tokens) >= max_seq_length - 1: + tokens = tokens[0:(max_seq_length - 2)] + labels = labels[0:(max_seq_length - 2)] + valid = valid[0:(max_seq_length - 2)] + label_mask = label_mask[0:(max_seq_length - 2)] + ntokens = [] + segment_ids = [] + label_ids = [] + ntokens.append("[CLS]") + segment_ids.append(0) + valid.insert(0, 1) + label_mask.insert(0, 1) + label_ids.append(label_map["[CLS]"]) + for i, token in enumerate(tokens): + ntokens.append(token) + segment_ids.append(0) + if len(labels) > i: + label_ids.append(label_map[labels[i]]) + ntokens.append("[SEP]") + segment_ids.append(0) + valid.append(1) + label_mask.append(1) + label_ids.append(label_map["[SEP]"]) + input_ids = tokenizer.convert_tokens_to_ids(ntokens) + input_mask = [1] * len(input_ids) + label_mask = [1] * len(label_ids) + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + label_ids.append(0) + valid.append(1) + label_mask.append(0) + while len(label_ids) < max_seq_length: + label_ids.append(0) + label_mask.append(0) + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + assert len(label_ids) == max_seq_length + assert len(valid) == max_seq_length + assert len(label_mask) == max_seq_length + + # ----------- code for ngram BEGIN----------- + ngram_matches = [] + # Filter the ngram segment from 2 to 7 to check whether there is a ngram + for p in range(2, 8): + for q in range(0, len(tokens) - p + 1): + character_segment = tokens[q:q + p] + # j is the starting position of the ngram + # i is the length of the current ngram + character_segment = tuple(character_segment) + if
character_segment in ngram_dict.ngram_to_id_dict: + ngram_index = ngram_dict.ngram_to_id_dict[character_segment] + ngram_matches.append([ngram_index, q, p, character_segment]) + + shuffle(ngram_matches) + + max_ngram_in_seq_proportion = math.ceil((len(tokens) / max_seq_length) * ngram_dict.max_ngram_in_seq) + if len(ngram_matches) > max_ngram_in_seq_proportion: + ngram_matches = ngram_matches[:max_ngram_in_seq_proportion] + + ngram_ids = [ngram[0] for ngram in ngram_matches] + ngram_positions = [ngram[1] for ngram in ngram_matches] + ngram_lengths = [ngram[2] for ngram in ngram_matches] + ngram_tuples = [ngram[3] for ngram in ngram_matches] + ngram_seg_ids = [0 if position < (len(tokens) + 2) else 1 for position in ngram_positions] + + import numpy as np + ngram_mask_array = np.zeros(ngram_dict.max_ngram_in_seq, dtype=np.bool) + ngram_mask_array[:len(ngram_ids)] = 1 + + # record the masked positions + ngram_positions_matrix = np.zeros(shape=(max_seq_length, ngram_dict.max_ngram_in_seq), dtype=np.int32) + for i in range(len(ngram_ids)): + ngram_positions_matrix[ngram_positions[i]:ngram_positions[i] + ngram_lengths[i], i] = 1.0 + + # Zero-pad up to the max ngram in seq length. + padding = [0] * (ngram_dict.max_ngram_in_seq - len(ngram_ids)) + ngram_ids += padding + ngram_lengths += padding + ngram_seg_ids += padding + + # ----------- code for ngram END----------- + + # return InputFeatures(input_ids=input_ids, + # input_mask=input_mask, + # segment_ids=segment_ids, + # label_id=label_ids, + # ngram_ids=ngram_ids, + # ngram_positions=ngram_positions_matrix, + # ngram_lengths=ngram_lengths, + # ngram_tuples=ngram_tuples, + # ngram_seg_ids=ngram_seg_ids, + # ngram_masks=ngram_mask_array, + # valid_ids=valid, + # label_mask=label_mask) + from collections import OrderedDict + feature = OrderedDict({'input_ids': input_ids, + 'input_mask': input_mask, + 'segment_ids': segment_ids, + 'label_id': label_ids, + 'ngram_ids': ngram_ids, + 'ngram_positions': ngram_positions_matrix, + 'ngram_lengths': ngram_lengths, + # 'ngram_tuples': ngram_tuples, + 'ngram_seg_ids': ngram_seg_ids, + 'ngram_masks': ngram_mask_array, + 'valid_ids': valid, + 'label_mask': label_mask}) + # for k, v in feature.items(): + # print(k, torch.tensor(v).size()) + return {k: torch.tensor(v) for k, v in feature.items()} + + processors = { "conll":ConllProcessor, "peopledaily": PeopledailyProcessor, diff --git a/finetune-ner.sh b/finetune-ner.sh new file mode 100644 index 0000000..c369a5c --- /dev/null +++ b/finetune-ner.sh @@ -0,0 +1,15 @@ +data_dir=$1 +bert_model=$2 +python ./examples/run_token_level_classification.py \ + --task_name msra \ + --do_train \ + --do_eval \ + --do_lower_case \ + --data_dir $data_dir\ + --bert_model $bert_model\ + --max_seq_length 128 \ + --train_batch_size 32 \ + --num_train_epochs 30 \ + --warmup_proportion 0.1 \ + --cache_dir ./tmp \ + --no_cuda
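Usage sketch for the new finetune-ner.sh (the paths below are hypothetical placeholders; the script forwards $1 to --data_dir and $2 to --bert_model): bash finetune-ner.sh ./data/msra_ner ./models/zen_base_chinese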