From 404785ffe6fd7d6ae25662a1e67d3b670d78cf24 Mon Sep 17 00:00:00 2001 From: adolphk-yk <49522434+adolphk-yk@users.noreply.github.com> Date: Mon, 17 Jun 2019 21:15:24 +0800 Subject: [PATCH] Add Sequence labeling in Tutorial and tag scheme convert script (#65) * add sequence labeling in tutorial * add tag scheme convert script * add paper link --- Tutorial.md | 30 ++++++++ Tutorial_zh_CN.md | 31 +++++++++ tools/taggingSchemes_Converter.py | 112 ++++++++++++++++++++++++++++++ 3 files changed, 173 insertions(+) create mode 100644 tools/taggingSchemes_Converter.py diff --git a/Tutorial.md b/Tutorial.md index b926bea..fe8d95d 100644 --- a/Tutorial.md +++ b/Tutorial.md @@ -21,6 +21,7 @@ 4. [Compression for MRC Model](#task-6.4) * [Task 7: Chinese Sentiment Analysis](#task-7) * [Task 8: Chinese Text Matching](#task-8) + * [Task 9: Sequence Labeling](#task-9) * [Advanced Usage](#advanced-usage) * [Extra Feature Support](#extra-feature) * [Learning Rate Decay](#lr-decay) @@ -562,7 +563,36 @@ Here is an example using Chinese data, for text matching task. ``` *Tips: you can try different models by running different JSON config files. The model file and train log file can be found in JOSN config file's outputs/save_base_dir after you finish training.* +### Task 9: Sequence Labeling +Sequence Labeling is an important NLP task, which includes NER, Slot Tagging, Pos Tagging, etc. +- ***Dataset*** + + [CoNLL 2003](https://www.clips.uantwerpen.be/conll2003/) is a popular dataset in Sequence Labeling task. We use CoNLL 2003 English NER data for our experiment and you can refer the data format in [sample data](https://github.com/microsoft/NeuronBlocks/tree/master/dataset/slot_tagging/conll_2003). + +- ***Tagging Scheme*** + + - NeuronBlocks support both BIO and BIOES tag schemes. + - The IOB scheme is not supported, because of its worse performance in most [experiment](https://arxiv.org/pdf/1707.06799.pdf). 
+ - NeuronBlocks provides a [script](./tools/taggingSchemes_Converter.py) that converts the tag scheme among IOB/BIO/BIOES (NOTE: the script only supports tsv file which has data and label in two columns). + +- ***Usages*** + + 1. BiLSTM representation and Softmax output. + ```bash + cd PROJECT_ROOT + python train.py --conf_path=model_zoo/nlp_tasks/slot_tagging/conf_slot_tagging.json + ``` + +- ***Result*** + + 1. BiLSTM representation and Softmax output. + + Model | F1-score + -------- | -------- + [Ma and Hovy(2016)](https://arxiv.org/pdf/1603.01354.pdf)|87.00 + BiLSTM+Softmax(NeuronBlocks)|88.50 + ## Advanced Usage After building a model, the next goal is to train a model with good performance. It depends on a highly expressive model and tricks of the model training. NeuronBlocks provides some tricks of model training. diff --git a/Tutorial_zh_CN.md b/Tutorial_zh_CN.md index b1e29e6..53e6e60 100644 --- a/Tutorial_zh_CN.md +++ b/Tutorial_zh_CN.md @@ -21,6 +21,7 @@ 4. [机器阅读理解模型的模型压缩](#task-6.4) * [任务 7: 中文情感分析](#task-7) * [任务 8:中文文本匹配](#task-8) + * [任务 9:序列标注](#task-9) * [高阶用法](#advanced-usage) * [额外的feature](#extra-feature) * [学习率衰减](#lr-decay) @@ -552,6 +553,36 @@ This task is to train a query-passage regression model to learn from a heavy tea ``` *提示:您可以通过运行不同的JSON配置文件来尝试不同的模型。当训练完成后,模型文件和训练日志文件可以在JSON配置的outputs/save_base_dir目录中找到。* +### 任务 9: 序列标注 +序列标注是一项重要的NLP任务,包括 NER, Slot Tagging, Pos Tagging 等任务。 + +- ***数据集*** + + 在序列标注任务中,[CoNLL 2003](https://www.clips.uantwerpen.be/conll2003/)是一个很常用的数据集。在我们的序列标注任务中,使用 CoNLL 2003 中英文 NER 数据作为实验数据,其中数据格式可以参考我们给出的[抽样数据](https://github.com/microsoft/NeuronBlocks/tree/master/dataset/slot_tagging/conll_2003)。 + +- ***标注策略*** + + - NeuronBlocks 支持 BIO 和 BIOES 标注策略。 + - IOB 标注标注是不被支持的,因为在大多[实验](https://arxiv.org/pdf/1707.06799.pdf)中它具有很差的表现。 + - NeuronBlocks 提供一个在不同标注策略(IOB/BIO/BIOES)中的[转化脚本](./tools/taggingSchemes_Converter.py)(脚本仅支持具有 数据和标签 的两列tsv文件输入)。 + +- ***用法*** + + 1. 
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.

"""Convert NER tagging schemes among IOB/BIO/BIOES.

Usage:
    python taggingSchemes_Converter.py <IOB2BIO|BIO2BIOES|BIOES2BIO|IOB2BIOES> input_file output_file

The input is a two-column tsv file: space-joined tokens in the first
column and space-joined labels in the second, e.g.
    "EU rejects German call\tS-ORG O S-MISC O"
"""

import sys


def BIO2BIOES(input_labels_list):
    """Convert BIO-scheme label sequences to the BIOES scheme.

    Args:
        input_labels_list: list of sentences, each a list of str labels
            ("O", "B-X", "I-X") in BIO scheme.
    Returns:
        A new list of label sequences in BIOES scheme ("O", "B-X", "I-X",
        "E-X", "S-X"); the input is not modified.
    """
    output_labels_list = []
    for labels in input_labels_list:
        new_labels = []
        sent_len = len(labels)
        for idx, label in enumerate(labels):
            if "-" not in label:
                # "O" (or any scheme-less tag) is copied unchanged.
                new_labels.append(label)
                continue
            label_type = label.split('-')[-1]
            # A span ends here when this is the last token or the next
            # label does not continue it with an "I-" tag.
            span_ends = (idx == sent_len - 1) or (not labels[idx + 1].startswith("I-"))
            if label.startswith("B-"):
                new_labels.append(("S-" if span_ends else "B-") + label_type)
            elif label.startswith("I-"):
                new_labels.append(("E-" if span_ends else "I-") + label_type)
            else:
                # Unknown prefix: keep it unchanged instead of silently
                # dropping it (which would break the per-sentence length).
                new_labels.append(label)
        assert len(labels) == len(new_labels)
        output_labels_list.append(new_labels)
    return output_labels_list


def BIOES2BIO(input_labels_list):
    """Convert BIOES-scheme label sequences to the BIO scheme.

    "E-X" becomes "I-X" and "S-X" becomes "B-X"; everything else
    ("O", "B-X", "I-X") is already valid BIO and is copied unchanged.
    Returns a new list; the input is not modified.
    """
    output_labels_list = []
    for labels in input_labels_list:
        new_labels = []
        for label in labels:
            if "-" not in label:
                new_labels.append(label)
            elif label.startswith("E-"):
                new_labels.append("I-" + label.split('-')[-1])
            elif label.startswith("S-"):
                new_labels.append("B-" + label.split('-')[-1])
            else:
                new_labels.append(label)
        assert len(labels) == len(new_labels)
        output_labels_list.append(new_labels)
    return output_labels_list


def IOB2BIO(input_labels_list):
    """Convert IOB-scheme label sequences to the BIO scheme.

    In IOB an "I-X" tag may open a span; BIO requires spans to open with
    "B-X". An "I-X" is rewritten to "B-X" when it is sentence-initial,
    follows "O", or follows a label of a different type.
    Returns a new list; the input is not modified.
    """
    output_labels_list = []
    for labels in input_labels_list:
        new_labels = []
        for idx, label in enumerate(labels):
            if label.startswith("I-"):
                label_type = label.split('-')[-1]
                starts_span = (
                    idx == 0
                    or labels[idx - 1] == "O"
                    or labels[idx - 1].split('-')[-1] != label_type
                )
                new_labels.append(("B-" + label_type) if starts_span else label)
            else:
                new_labels.append(label)
        assert len(labels) == len(new_labels)
        output_labels_list.append(new_labels)
    return output_labels_list


def _read_corpus(input_file_name):
    """Read a two-column tsv corpus file.

    Lines that do not have exactly two tab-separated columns, or whose
    token and label counts differ, are reported and skipped rather than
    aborting the whole conversion.

    Returns:
        (words_list, labels_list): parallel lists of token / label
        sequences, one entry per valid input line.
    """
    words_list, labels_list = [], []
    with open(input_file_name, 'r', encoding='utf-8') as input_file:
        for line in input_file:
            item = line.rstrip().split('\t')
            if len(item) != 2:
                print("Error line (expected 2 tab-separated columns): " + line.rstrip())
                continue
            words, labels = item[0].split(' '), item[1].split(' ')
            if len(words) != len(labels):
                print("Error line: " + line.rstrip())
                continue
            words_list.append(words)
            labels_list.append(labels)
    return words_list, labels_list


def main():
    """Command-line entry point: parse args, convert, write output."""
    usage = ("Usage: python taggingSchemes_Converter.py "
             "IOB2BIO|BIO2BIOES|BIOES2BIO|IOB2BIOES input_file output_file")
    if len(sys.argv) != 4:
        sys.exit(usage)

    # Dispatch table instead of an if/elif chain; the composite
    # IOB -> BIOES conversion goes through BIO as an intermediate step.
    converters = {
        "IOB2BIO": IOB2BIO,
        "BIO2BIOES": BIO2BIOES,
        "BIOES2BIO": BIOES2BIO,
        "IOB2BIOES": lambda labels: BIO2BIOES(IOB2BIO(labels)),
    }
    scheme = sys.argv[1].upper()
    if scheme not in converters:
        # Exit with an error instead of falling through and crashing on an
        # empty result list, as the original script did.
        sys.exit("Argument error: sys.argv[1] should belongs to "
                 "\"IOB2BIO/BIO2BIOES/BIOES2BIO/IOB2BIOES\"")

    input_file_name, output_file_name = sys.argv[2], sys.argv[3]
    words_list, labels_list = _read_corpus(input_file_name)

    print("Convert %s..." % scheme.replace("2", " -> "))
    new_labels_list = converters[scheme](labels_list)

    with open(output_file_name, 'w', encoding='utf-8') as output_file:
        for words, labels in zip(words_list, new_labels_list):
            output_file.write(" ".join(words) + '\t' + " ".join(labels) + '\n')


if __name__ == '__main__':
    main()