Skip to content

Commit

Permalink
Add emotion dataset (huggingface#346)
Browse files Browse the repository at this point in the history
* Add template for emotions dataset

- Download does not work. Needs debugging.

* Rename dataset to emotion

* Fix style

* Fix path to pickled data

* Replace pickled dataset with raw text files

* Fix style

* Add dataset information

* Add dummy data

* Add supervised labels

* Remove redundant configuration

* Add dataset info and dummy data
  • Loading branch information
lewtun authored Jul 13, 2020
1 parent 4b95345 commit e2096cf
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 0 deletions.
1 change: 1 addition & 0 deletions datasets/emotion/dataset_infos.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"emotion": {"description": "Emotion is a dataset of English Twitter messages with eight basic emotions: anger, anticipation,\ndisgust, fear, joy, sadness, surprise, and trust. For more detailed information please refer to the\npaper.\n", "citation": "@inproceedings{saravia-etal-2018-carer,\n title = \"{CARER}: Contextualized Affect Representations for Emotion Recognition\",\n author = \"Saravia, Elvis and\n Liu, Hsien-Chi Toby and\n Huang, Yen-Hao and\n Wu, Junlin and\n Chen, Yi-Shin\",\n booktitle = \"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing\",\n month = oct # \"-\" # nov,\n year = \"2018\",\n address = \"Brussels, Belgium\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/D18-1404\",\n doi = \"10.18653/v1/D18-1404\",\n pages = \"3687--3697\",\n abstract = \"Emotions are expressed in nuanced ways, which varies by collective or individual experiences, knowledge, and beliefs. Therefore, to understand emotion, as conveyed through text, a robust mechanism capable of capturing and modeling different linguistic nuances and phenomena is needed. We propose a semi-supervised, graph-based algorithm to produce rich structural descriptors which serve as the building blocks for constructing contextualized affect representations from text. The pattern-based representations are further enriched with word embeddings and evaluated through several emotion recognition tasks. 
Our experimental results demonstrate that the proposed method outperforms state-of-the-art techniques on emotion recognition tasks.\",\n}\n", "homepage": "https://github.com/dair-ai/emotion_dataset", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"dtype": "string", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "emotion", "config_name": "emotion", "version": {"version_str": "0.1.0", "description": "First Emotion release", "nlp_version_to_prepare": null, "major": 0, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1754632, "num_examples": 16000, "dataset_name": "emotion"}, "validation": {"name": "validation", "num_bytes": 216248, "num_examples": 2000, "dataset_name": "emotion"}, "test": {"name": "test", "num_bytes": 218768, "num_examples": 2000, "dataset_name": "emotion"}}, "download_checksums": {"https://www.dropbox.com/s/1pzkadrvffbqw6o/train.txt?dl=1": {"num_bytes": 1658616, "checksum": "3ab03d945a6cb783d818ccd06dafd52d2ed8b4f62f0f85a09d7d11870865b190"}, "https://www.dropbox.com/s/2mzialpsgf9k5l3/val.txt?dl=1": {"num_bytes": 204240, "checksum": "34faaa31962fe63cdf5dbf6c132ef8ab166c640254ab991af78f3aea375e79ef"}, "https://www.dropbox.com/s/ikkqxfdbdec3fuj/test.txt?dl=1": {"num_bytes": 206760, "checksum": "60f531690d20127339e7f054edc299a82c627b5ec0dd5d552d53d544e0cfcc17"}}, "download_size": 2069616, "dataset_size": 2189648, "size_in_bytes": 4259264}, "default": {"description": "Emotion is a dataset of English Twitter messages with eight basic emotions: anger, anticipation,\ndisgust, fear, joy, sadness, surprise, and trust. 
For more detailed information please refer to the\npaper.\n", "citation": "@inproceedings{saravia-etal-2018-carer,\n title = \"{CARER}: Contextualized Affect Representations for Emotion Recognition\",\n author = \"Saravia, Elvis and\n Liu, Hsien-Chi Toby and\n Huang, Yen-Hao and\n Wu, Junlin and\n Chen, Yi-Shin\",\n booktitle = \"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing\",\n month = oct # \"-\" # nov,\n year = \"2018\",\n address = \"Brussels, Belgium\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/D18-1404\",\n doi = \"10.18653/v1/D18-1404\",\n pages = \"3687--3697\",\n abstract = \"Emotions are expressed in nuanced ways, which varies by collective or individual experiences, knowledge, and beliefs. Therefore, to understand emotion, as conveyed through text, a robust mechanism capable of capturing and modeling different linguistic nuances and phenomena is needed. We propose a semi-supervised, graph-based algorithm to produce rich structural descriptors which serve as the building blocks for constructing contextualized affect representations from text. The pattern-based representations are further enriched with word embeddings and evaluated through several emotion recognition tasks. 
Our experimental results demonstrate that the proposed method outperforms state-of-the-art techniques on emotion recognition tasks.\",\n}\n", "homepage": "https://github.com/dair-ai/emotion_dataset", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"dtype": "string", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "emotion", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "nlp_version_to_prepare": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1754632, "num_examples": 16000, "dataset_name": "emotion"}, "validation": {"name": "validation", "num_bytes": 216248, "num_examples": 2000, "dataset_name": "emotion"}, "test": {"name": "test", "num_bytes": 218768, "num_examples": 2000, "dataset_name": "emotion"}}, "download_checksums": {"https://www.dropbox.com/s/1pzkadrvffbqw6o/train.txt?dl=1": {"num_bytes": 1658616, "checksum": "3ab03d945a6cb783d818ccd06dafd52d2ed8b4f62f0f85a09d7d11870865b190"}, "https://www.dropbox.com/s/2mzialpsgf9k5l3/val.txt?dl=1": {"num_bytes": 204240, "checksum": "34faaa31962fe63cdf5dbf6c132ef8ab166c640254ab991af78f3aea375e79ef"}, "https://www.dropbox.com/s/ikkqxfdbdec3fuj/test.txt?dl=1": {"num_bytes": 206760, "checksum": "60f531690d20127339e7f054edc299a82c627b5ec0dd5d552d53d544e0cfcc17"}}, "download_size": 2069616, "dataset_size": 2189648, "size_in_bytes": 4259264}}
Binary file added datasets/emotion/dummy/0.0.0/dummy_data.zip
Binary file not shown.
69 changes: 69 additions & 0 deletions datasets/emotion/emotion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from __future__ import absolute_import, division, print_function

import csv
import os

import nlp


# BibTeX entry for the CARER paper (Saravia et al., EMNLP 2018).
# Passed as `citation` to the DatasetInfo built in `Emotion._info`.
_CITATION = """\
@inproceedings{saravia-etal-2018-carer,
title = "{CARER}: Contextualized Affect Representations for Emotion Recognition",
author = "Saravia, Elvis and
Liu, Hsien-Chi Toby and
Huang, Yen-Hao and
Wu, Junlin and
Chen, Yi-Shin",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",
month = oct # "-" # nov,
year = "2018",
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D18-1404",
doi = "10.18653/v1/D18-1404",
pages = "3687--3697",
abstract = "Emotions are expressed in nuanced ways, which varies by collective or individual experiences, knowledge, and beliefs. Therefore, to understand emotion, as conveyed through text, a robust mechanism capable of capturing and modeling different linguistic nuances and phenomena is needed. We propose a semi-supervised, graph-based algorithm to produce rich structural descriptors which serve as the building blocks for constructing contextualized affect representations from text. The pattern-based representations are further enriched with word embeddings and evaluated through several emotion recognition tasks. Our experimental results demonstrate that the proposed method outperforms state-of-the-art techniques on emotion recognition tasks.",
}
"""

# Human-readable summary, passed as `description` to the DatasetInfo.
# NOTE(review): the text below names eight emotions; confirm this matches the
# label values actually present in the downloaded files.
_DESCRIPTION = """\
Emotion is a dataset of English Twitter messages with eight basic emotions: anger, anticipation,
disgust, fear, joy, sadness, surprise, and trust. For more detailed information please refer to the
paper.
"""
# Project page, passed as `homepage` to the DatasetInfo.
_URL = "https://github.com/dair-ai/emotion_dataset"
# One raw text file per split, hosted on Dropbox.
# use dl=1 to force browser to download data instead of displaying it
_TRAIN_DOWNLOAD_URL = "https://www.dropbox.com/s/1pzkadrvffbqw6o/train.txt?dl=1"
_VALIDATION_DOWNLOAD_URL = "https://www.dropbox.com/s/2mzialpsgf9k5l3/val.txt?dl=1"
_TEST_DOWNLOAD_URL = "https://www.dropbox.com/s/ikkqxfdbdec3fuj/test.txt?dl=1"


class Emotion(nlp.GeneratorBasedBuilder):
    """Dataset builder for the Emotion corpus of labelled Twitter messages.

    Each split is downloaded as a separate text file and parsed as
    semicolon-delimited rows of the form ``text;label``.
    """

    def _info(self):
        """Return the dataset metadata.

        Both features (``text`` and ``label``) are exposed as plain strings,
        and ``("text", "label")`` is declared as the supervised (input, target)
        pair.
        """
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features({"text": nlp.Value("string"), "label": nlp.Value("string")}),
            supervised_keys=("text", "label"),
            homepage=_URL,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the three split files and return one SplitGenerator per split.

        Args:
            dl_manager: download manager supplied by the library; its
                ``download_and_extract`` is called once per split URL.

        Returns:
            A list with the TRAIN, VALIDATION and TEST generators, each
            carrying the local path of its file as the ``filepath`` kwarg
            for ``_generate_examples``.
        """
        train_path = dl_manager.download_and_extract(_TRAIN_DOWNLOAD_URL)
        valid_path = dl_manager.download_and_extract(_VALIDATION_DOWNLOAD_URL)
        test_path = dl_manager.download_and_extract(_TEST_DOWNLOAD_URL)
        return [
            nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"filepath": train_path}),
            nlp.SplitGenerator(name=nlp.Split.VALIDATION, gen_kwargs={"filepath": valid_path}),
            nlp.SplitGenerator(name=nlp.Split.TEST, gen_kwargs={"filepath": test_path}),
        ]

    def _generate_examples(self, filepath):
        """Yield ``(id, example)`` pairs parsed from one split file.

        Args:
            filepath: path to a semicolon-delimited text file with two
                fields per row (text, then label).

        Yields:
            Tuples ``(row_index, {"text": ..., "label": ...})`` where
            ``row_index`` is the zero-based index of the row in the file.

        Raises:
            ValueError: if a non-empty row does not have exactly two fields.
        """
        # Decode explicitly as UTF-8: relying on the platform default encoding
        # breaks (or silently mis-decodes) non-ASCII tweets on some systems.
        with open(filepath, encoding="utf-8") as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=";")
            for id_, row in enumerate(csv_reader):
                if not row:
                    # csv.reader yields [] for blank lines (e.g. a trailing
                    # newline at end of file); there is nothing to emit.
                    continue
                if len(row) != 2:
                    raise ValueError(
                        "Expected 2 fields (text;label) in row {} of {}, got {}".format(
                            id_, filepath, len(row)
                        )
                    )
                text, label = row
                yield id_, {"text": text, "label": label}

0 comments on commit e2096cf

Please sign in to comment.