Add emotion dataset (huggingface#346)

* Add template for emotions dataset - Download does not work. Needs debugging.
* Rename dataset to emotion
* Fix style
* Fix path to pickled data
* Replace pickled dataset with raw text files
* Fix style
* Add dataset information
* Add dummy data
* Add supervised labels
* Remove redundant configuration
* Add dataset info and dummy data
borgr · Jul 13, 2020 · e2096cf · e2096cf
1 parent 4b95345
commit e2096cf
Show file tree

Hide file tree

Showing 3 changed files with 70 additions and 0 deletions.
diff --git a/datasets/emotion/dataset_infos.json b/datasets/emotion/dataset_infos.json
@@ -0,0 +1 @@
+{"emotion": {"description": "Emotion is a dataset of English Twitter messages with eight basic emotions: anger, anticipation,\ndisgust, fear, joy, sadness, surprise, and trust. For more detailed information please refer to the\npaper.\n", "citation": "@inproceedings{saravia-etal-2018-carer,\n title = \"{CARER}: Contextualized Affect Representations for Emotion Recognition\",\n author = \"Saravia, Elvis and\n Liu, Hsien-Chi Toby and\n Huang, Yen-Hao and\n Wu, Junlin and\n Chen, Yi-Shin\",\n booktitle = \"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing\",\n month = oct # \"-\" # nov,\n year = \"2018\",\n address = \"Brussels, Belgium\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/D18-1404\",\n doi = \"10.18653/v1/D18-1404\",\n pages = \"3687--3697\",\n abstract = \"Emotions are expressed in nuanced ways, which varies by collective or individual experiences, knowledge, and beliefs. Therefore, to understand emotion, as conveyed through text, a robust mechanism capable of capturing and modeling different linguistic nuances and phenomena is needed. We propose a semi-supervised, graph-based algorithm to produce rich structural descriptors which serve as the building blocks for constructing contextualized affect representations from text. The pattern-based representations are further enriched with word embeddings and evaluated through several emotion recognition tasks. 
Our experimental results demonstrate that the proposed method outperforms state-of-the-art techniques on emotion recognition tasks.\",\n}\n", "homepage": "https://github.com/dair-ai/emotion_dataset", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"dtype": "string", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "emotion", "config_name": "emotion", "version": {"version_str": "0.1.0", "description": "First Emotion release", "nlp_version_to_prepare": null, "major": 0, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1754632, "num_examples": 16000, "dataset_name": "emotion"}, "validation": {"name": "validation", "num_bytes": 216248, "num_examples": 2000, "dataset_name": "emotion"}, "test": {"name": "test", "num_bytes": 218768, "num_examples": 2000, "dataset_name": "emotion"}}, "download_checksums": {"https://www.dropbox.com/s/1pzkadrvffbqw6o/train.txt?dl=1": {"num_bytes": 1658616, "checksum": "3ab03d945a6cb783d818ccd06dafd52d2ed8b4f62f0f85a09d7d11870865b190"}, "https://www.dropbox.com/s/2mzialpsgf9k5l3/val.txt?dl=1": {"num_bytes": 204240, "checksum": "34faaa31962fe63cdf5dbf6c132ef8ab166c640254ab991af78f3aea375e79ef"}, "https://www.dropbox.com/s/ikkqxfdbdec3fuj/test.txt?dl=1": {"num_bytes": 206760, "checksum": "60f531690d20127339e7f054edc299a82c627b5ec0dd5d552d53d544e0cfcc17"}}, "download_size": 2069616, "dataset_size": 2189648, "size_in_bytes": 4259264}, "default": {"description": "Emotion is a dataset of English Twitter messages with eight basic emotions: anger, anticipation,\ndisgust, fear, joy, sadness, surprise, and trust. 
For more detailed information please refer to the\npaper.\n", "citation": "@inproceedings{saravia-etal-2018-carer,\n title = \"{CARER}: Contextualized Affect Representations for Emotion Recognition\",\n author = \"Saravia, Elvis and\n Liu, Hsien-Chi Toby and\n Huang, Yen-Hao and\n Wu, Junlin and\n Chen, Yi-Shin\",\n booktitle = \"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing\",\n month = oct # \"-\" # nov,\n year = \"2018\",\n address = \"Brussels, Belgium\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/D18-1404\",\n doi = \"10.18653/v1/D18-1404\",\n pages = \"3687--3697\",\n abstract = \"Emotions are expressed in nuanced ways, which varies by collective or individual experiences, knowledge, and beliefs. Therefore, to understand emotion, as conveyed through text, a robust mechanism capable of capturing and modeling different linguistic nuances and phenomena is needed. We propose a semi-supervised, graph-based algorithm to produce rich structural descriptors which serve as the building blocks for constructing contextualized affect representations from text. The pattern-based representations are further enriched with word embeddings and evaluated through several emotion recognition tasks. 
Our experimental results demonstrate that the proposed method outperforms state-of-the-art techniques on emotion recognition tasks.\",\n}\n", "homepage": "https://github.com/dair-ai/emotion_dataset", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"dtype": "string", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "emotion", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "nlp_version_to_prepare": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1754632, "num_examples": 16000, "dataset_name": "emotion"}, "validation": {"name": "validation", "num_bytes": 216248, "num_examples": 2000, "dataset_name": "emotion"}, "test": {"name": "test", "num_bytes": 218768, "num_examples": 2000, "dataset_name": "emotion"}}, "download_checksums": {"https://www.dropbox.com/s/1pzkadrvffbqw6o/train.txt?dl=1": {"num_bytes": 1658616, "checksum": "3ab03d945a6cb783d818ccd06dafd52d2ed8b4f62f0f85a09d7d11870865b190"}, "https://www.dropbox.com/s/2mzialpsgf9k5l3/val.txt?dl=1": {"num_bytes": 204240, "checksum": "34faaa31962fe63cdf5dbf6c132ef8ab166c640254ab991af78f3aea375e79ef"}, "https://www.dropbox.com/s/ikkqxfdbdec3fuj/test.txt?dl=1": {"num_bytes": 206760, "checksum": "60f531690d20127339e7f054edc299a82c627b5ec0dd5d552d53d544e0cfcc17"}}, "download_size": 2069616, "dataset_size": 2189648, "size_in_bytes": 4259264}}
diff --git a/datasets/emotion/dummy/0.0.0/dummy_data.zip b/datasets/emotion/dummy/0.0.0/dummy_data.zip
diff --git a/datasets/emotion/emotion.py b/datasets/emotion/emotion.py
@@ -0,0 +1,69 @@
+from __future__ import absolute_import, division, print_function
+
+import csv
+import os
+
+import nlp
+
+
+_CITATION = """\
+@inproceedings{saravia-etal-2018-carer,
+ title = "{CARER}: Contextualized Affect Representations for Emotion Recognition",
+ author = "Saravia, Elvis and
+ Liu, Hsien-Chi Toby and
+ Huang, Yen-Hao and
+ Wu, Junlin and
+ Chen, Yi-Shin",
+ booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",
+ month = oct # "-" # nov,
+ year = "2018",
+ address = "Brussels, Belgium",
+ publisher = "Association for Computational Linguistics",
+ url = "https://www.aclweb.org/anthology/D18-1404",
+ doi = "10.18653/v1/D18-1404",
+ pages = "3687--3697",
+ abstract = "Emotions are expressed in nuanced ways, which varies by collective or individual experiences, knowledge, and beliefs. Therefore, to understand emotion, as conveyed through text, a robust mechanism capable of capturing and modeling different linguistic nuances and phenomena is needed. We propose a semi-supervised, graph-based algorithm to produce rich structural descriptors which serve as the building blocks for constructing contextualized affect representations from text. The pattern-based representations are further enriched with word embeddings and evaluated through several emotion recognition tasks. Our experimental results demonstrate that the proposed method outperforms state-of-the-art techniques on emotion recognition tasks.",
+}
+"""  # BibTeX entry handed to nlp.DatasetInfo(citation=...) in Emotion._info
+
+_DESCRIPTION = """\
+Emotion is a dataset of English Twitter messages with eight basic emotions: anger, anticipation,
+disgust, fear, joy, sadness, surprise, and trust. For more detailed information please refer to the
+paper.
+"""  # summary passed to nlp.DatasetInfo(description=...); NOTE(review): confirm the emotion list above matches the label values in the downloaded files
+_URL = "https://github.com/dair-ai/emotion_dataset"  # reported as the dataset homepage in Emotion._info
+# One Dropbox link per split file; dl=1 forces a direct download instead of the in-browser preview
+_TRAIN_DOWNLOAD_URL = "https://www.dropbox.com/s/1pzkadrvffbqw6o/train.txt?dl=1"  # train split
+_VALIDATION_DOWNLOAD_URL = "https://www.dropbox.com/s/2mzialpsgf9k5l3/val.txt?dl=1"  # validation split
+_TEST_DOWNLOAD_URL = "https://www.dropbox.com/s/ikkqxfdbdec3fuj/test.txt?dl=1"  # test split
+
+
+class Emotion(nlp.GeneratorBasedBuilder):
+
+ def _info(self):
+ return nlp.DatasetInfo(
+ description=_DESCRIPTION,
+ features=nlp.Features({"text": nlp.Value("string"), "label": nlp.Value("string")}),
+ supervised_keys=("text", "label"),
+ homepage=_URL,
+ citation=_CITATION,
+ )
+
+ def _split_generators(self, dl_manager):
+ """Returns SplitGenerators."""
+ train_path = dl_manager.download_and_extract(_TRAIN_DOWNLOAD_URL)
+ valid_path = dl_manager.download_and_extract(_VALIDATION_DOWNLOAD_URL)
+ test_path = dl_manager.download_and_extract(_TEST_DOWNLOAD_URL)
+ return [
+ nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"filepath": train_path}),
+ nlp.SplitGenerator(name=nlp.Split.VALIDATION, gen_kwargs={"filepath": valid_path}),
+ nlp.SplitGenerator(name=nlp.Split.TEST, gen_kwargs={"filepath": test_path}),
+ ]
+
+ def _generate_examples(self, filepath):
+ """Generate examples."""
+ with open(filepath) as csv_file:
+ csv_reader = csv.reader(csv_file, delimiter=";")
+ for id_, row in enumerate(csv_reader):
+ text, label = row
+ yield id_, {"text": text, "label": label}