forked from huggingface/datasets
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add emotion dataset (huggingface#346)
* Add template for emotions dataset - Download does not work. Needs debugging. * Rename dataset to emotion * Fix style * Fix path to pickled data * Replace pickled dataset with raw text files * Fix style * Add dataset information * Add dummy data * Add supervised labels * Remove redundant configuration * Add dataset info and dummy data
- Loading branch information
Showing
3 changed files
with
70 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"emotion": {"description": "Emotion is a dataset of English Twitter messages with eight basic emotions: anger, anticipation,\ndisgust, fear, joy, sadness, surprise, and trust. For more detailed information please refer to the\npaper.\n", "citation": "@inproceedings{saravia-etal-2018-carer,\n title = \"{CARER}: Contextualized Affect Representations for Emotion Recognition\",\n author = \"Saravia, Elvis and\n Liu, Hsien-Chi Toby and\n Huang, Yen-Hao and\n Wu, Junlin and\n Chen, Yi-Shin\",\n booktitle = \"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing\",\n month = oct # \"-\" # nov,\n year = \"2018\",\n address = \"Brussels, Belgium\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/D18-1404\",\n doi = \"10.18653/v1/D18-1404\",\n pages = \"3687--3697\",\n abstract = \"Emotions are expressed in nuanced ways, which varies by collective or individual experiences, knowledge, and beliefs. Therefore, to understand emotion, as conveyed through text, a robust mechanism capable of capturing and modeling different linguistic nuances and phenomena is needed. We propose a semi-supervised, graph-based algorithm to produce rich structural descriptors which serve as the building blocks for constructing contextualized affect representations from text. The pattern-based representations are further enriched with word embeddings and evaluated through several emotion recognition tasks. 
Our experimental results demonstrate that the proposed method outperforms state-of-the-art techniques on emotion recognition tasks.\",\n}\n", "homepage": "https://github.com/dair-ai/emotion_dataset", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"dtype": "string", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "emotion", "config_name": "emotion", "version": {"version_str": "0.1.0", "description": "First Emotion release", "nlp_version_to_prepare": null, "major": 0, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1754632, "num_examples": 16000, "dataset_name": "emotion"}, "validation": {"name": "validation", "num_bytes": 216248, "num_examples": 2000, "dataset_name": "emotion"}, "test": {"name": "test", "num_bytes": 218768, "num_examples": 2000, "dataset_name": "emotion"}}, "download_checksums": {"https://www.dropbox.com/s/1pzkadrvffbqw6o/train.txt?dl=1": {"num_bytes": 1658616, "checksum": "3ab03d945a6cb783d818ccd06dafd52d2ed8b4f62f0f85a09d7d11870865b190"}, "https://www.dropbox.com/s/2mzialpsgf9k5l3/val.txt?dl=1": {"num_bytes": 204240, "checksum": "34faaa31962fe63cdf5dbf6c132ef8ab166c640254ab991af78f3aea375e79ef"}, "https://www.dropbox.com/s/ikkqxfdbdec3fuj/test.txt?dl=1": {"num_bytes": 206760, "checksum": "60f531690d20127339e7f054edc299a82c627b5ec0dd5d552d53d544e0cfcc17"}}, "download_size": 2069616, "dataset_size": 2189648, "size_in_bytes": 4259264}, "default": {"description": "Emotion is a dataset of English Twitter messages with eight basic emotions: anger, anticipation,\ndisgust, fear, joy, sadness, surprise, and trust. 
For more detailed information please refer to the\npaper.\n", "citation": "@inproceedings{saravia-etal-2018-carer,\n title = \"{CARER}: Contextualized Affect Representations for Emotion Recognition\",\n author = \"Saravia, Elvis and\n Liu, Hsien-Chi Toby and\n Huang, Yen-Hao and\n Wu, Junlin and\n Chen, Yi-Shin\",\n booktitle = \"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing\",\n month = oct # \"-\" # nov,\n year = \"2018\",\n address = \"Brussels, Belgium\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/D18-1404\",\n doi = \"10.18653/v1/D18-1404\",\n pages = \"3687--3697\",\n abstract = \"Emotions are expressed in nuanced ways, which varies by collective or individual experiences, knowledge, and beliefs. Therefore, to understand emotion, as conveyed through text, a robust mechanism capable of capturing and modeling different linguistic nuances and phenomena is needed. We propose a semi-supervised, graph-based algorithm to produce rich structural descriptors which serve as the building blocks for constructing contextualized affect representations from text. The pattern-based representations are further enriched with word embeddings and evaluated through several emotion recognition tasks. 
Our experimental results demonstrate that the proposed method outperforms state-of-the-art techniques on emotion recognition tasks.\",\n}\n", "homepage": "https://github.com/dair-ai/emotion_dataset", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"dtype": "string", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "emotion", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "nlp_version_to_prepare": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1754632, "num_examples": 16000, "dataset_name": "emotion"}, "validation": {"name": "validation", "num_bytes": 216248, "num_examples": 2000, "dataset_name": "emotion"}, "test": {"name": "test", "num_bytes": 218768, "num_examples": 2000, "dataset_name": "emotion"}}, "download_checksums": {"https://www.dropbox.com/s/1pzkadrvffbqw6o/train.txt?dl=1": {"num_bytes": 1658616, "checksum": "3ab03d945a6cb783d818ccd06dafd52d2ed8b4f62f0f85a09d7d11870865b190"}, "https://www.dropbox.com/s/2mzialpsgf9k5l3/val.txt?dl=1": {"num_bytes": 204240, "checksum": "34faaa31962fe63cdf5dbf6c132ef8ab166c640254ab991af78f3aea375e79ef"}, "https://www.dropbox.com/s/ikkqxfdbdec3fuj/test.txt?dl=1": {"num_bytes": 206760, "checksum": "60f531690d20127339e7f054edc299a82c627b5ec0dd5d552d53d544e0cfcc17"}}, "download_size": 2069616, "dataset_size": 2189648, "size_in_bytes": 4259264}} |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
from __future__ import absolute_import, division, print_function | ||
|
||
import csv | ||
import os | ||
|
||
import nlp | ||
|
||
|
||
# BibTeX entry for the CARER paper that introduced this dataset; handed to
# ``nlp.DatasetInfo(citation=...)`` by the builder's ``_info`` method.
_CITATION = """\
@inproceedings{saravia-etal-2018-carer,
title = "{CARER}: Contextualized Affect Representations for Emotion Recognition",
author = "Saravia, Elvis and
Liu, Hsien-Chi Toby and
Huang, Yen-Hao and
Wu, Junlin and
Chen, Yi-Shin",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",
month = oct # "-" # nov,
year = "2018",
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D18-1404",
doi = "10.18653/v1/D18-1404",
pages = "3687--3697",
abstract = "Emotions are expressed in nuanced ways, which varies by collective or individual experiences, knowledge, and beliefs. Therefore, to understand emotion, as conveyed through text, a robust mechanism capable of capturing and modeling different linguistic nuances and phenomena is needed. We propose a semi-supervised, graph-based algorithm to produce rich structural descriptors which serve as the building blocks for constructing contextualized affect representations from text. The pattern-based representations are further enriched with word embeddings and evaluated through several emotion recognition tasks. Our experimental results demonstrate that the proposed method outperforms state-of-the-art techniques on emotion recognition tasks.",
}
"""
|
||
_DESCRIPTION = """\ | ||
Emotion is a dataset of English Twitter messages with eight basic emotions: anger, anticipation, | ||
disgust, fear, joy, sadness, surprise, and trust. For more detailed information please refer to the | ||
paper. | ||
""" | ||
_URL = "https://github.com/dair-ai/emotion_dataset" | ||
# use dl=1 to force browser to download data instead of displaying it | ||
_TRAIN_DOWNLOAD_URL = "https://www.dropbox.com/s/1pzkadrvffbqw6o/train.txt?dl=1" | ||
_VALIDATION_DOWNLOAD_URL = "https://www.dropbox.com/s/2mzialpsgf9k5l3/val.txt?dl=1" | ||
_TEST_DOWNLOAD_URL = "https://www.dropbox.com/s/ikkqxfdbdec3fuj/test.txt?dl=1" | ||
|
||
|
||
class Emotion(nlp.GeneratorBasedBuilder):
    """Dataset builder for the Emotion corpus of labelled English Twitter messages.

    Every example is a ``{"text": str, "label": str}`` mapping read from a
    semicolon-delimited text file; there is one file per split (train,
    validation, test).
    """

    def _info(self):
        """Return the dataset metadata (description, feature schema, homepage, citation)."""
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            # Both columns are stored as plain strings.
            features=nlp.Features({"text": nlp.Value("string"), "label": nlp.Value("string")}),
            supervised_keys=("text", "label"),
            homepage=_URL,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the three split files and return one SplitGenerator per split.

        Args:
            dl_manager: download manager supplied by the ``nlp`` library; only
                its ``download_and_extract`` method is used here.

        Returns:
            A list with the train, validation and test ``nlp.SplitGenerator``
            objects, each carrying the local ``filepath`` of its data file.
        """
        train_path = dl_manager.download_and_extract(_TRAIN_DOWNLOAD_URL)
        valid_path = dl_manager.download_and_extract(_VALIDATION_DOWNLOAD_URL)
        test_path = dl_manager.download_and_extract(_TEST_DOWNLOAD_URL)
        return [
            nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"filepath": train_path}),
            nlp.SplitGenerator(name=nlp.Split.VALIDATION, gen_kwargs={"filepath": valid_path}),
            nlp.SplitGenerator(name=nlp.Split.TEST, gen_kwargs={"filepath": test_path}),
        ]

    def _generate_examples(self, filepath):
        """Yield ``(id, example)`` pairs from one semicolon-delimited split file.

        Args:
            filepath: path of a text file whose rows have the form ``text;label``.

        Yields:
            Tuples ``(row_index, {"text": text, "label": label})`` where
            ``row_index`` is the zero-based position of the row in the file.

        Raises:
            ValueError: if a non-empty row does not contain exactly two fields.
        """
        # Decode explicitly as UTF-8 instead of the platform default, and pass
        # newline="" as the csv module requires for files given to csv.reader.
        with open(filepath, encoding="utf-8", newline="") as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=";")
            for id_, row in enumerate(csv_reader):
                if not row:
                    # Blank line (e.g. trailing newline at end of file): nothing to emit.
                    continue
                if len(row) != 2:
                    raise ValueError(
                        "Expected 2 fields (text;label) in {} at row {}, got {}: {!r}".format(
                            filepath, id_, len(row), row
                        )
                    )
                text, label = row
                yield id_, {"text": text, "label": label}