Commit
Change default dataset from enron to enwik8 (#833)
* Change default dataset from enron to enwik8

* Update NeoXArgs docs automatically

* Update NeoXArgs docs automatically

---------

Co-authored-by: github-actions <[email protected]>
Quentin-Anthony and github-actions committed Mar 14, 2023
1 parent 3a4af67 commit 2ed00e6
Showing 8 changed files with 23 additions and 23 deletions.
8 changes: 4 additions & 4 deletions README.md
@@ -131,22 +131,22 @@ For a more detailed guide to all the features available and how to configure the

Several preconfigured datasets are available, including most components from [the Pile](https://arxiv.org/abs/2101.00027), as well as the Pile train set itself, for straightforward tokenization using the `prepare_data.py` entry point.

-E.G, to download and tokenize the Enron emails corpus with the GPT2 Tokenizer, saving them to `./data` you can run:
+E.G, to download and tokenize the enwik8 dataset with the GPT2 Tokenizer, saving them to `./data` you can run:

```
python prepare_data.py -d ./data
```

-or with the GPT-NeoX-20B tokenizer (assuming you have it saved at `./20B_checkpoints/20B_tokenizer.json`):
+or a single shard of the pile (`pile_subset`) with the GPT-NeoX-20B tokenizer (assuming you have it saved at `./20B_checkpoints/20B_tokenizer.json`):

```
-python prepare_data.py -d ./data -t HFTokenizer --vocab-file ./20B_checkpoints/20B_tokenizer.json
+python prepare_data.py -d ./data -t HFTokenizer --vocab-file ./20B_checkpoints/20B_tokenizer.json pile_subset
```

The tokenized data will be saved out to two files: `[data-dir]/[dataset-name]/[dataset-name]_text_document.bin` and `[data-dir]/[dataset-name]/[dataset-name]_text_document.idx`. You will need to add the prefix that both these files share to your training configuration file under the `data-path` field. E.G:

```yaml
"data-path": "./data/enron/enron_text_document",
"data-path": "./data/enwik8/enwik8_text_document",
```
## Using Custom Data
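Taken together, the README changes above make the quickstart read as follows (a sketch for context, assuming the repository defaults shown in the diff: the GPT2 tokenizer and a `./data` output directory):

```
# Download and tokenize the new default dataset, enwik8, with the GPT2 tokenizer
python prepare_data.py -d ./data

# Per the README, this writes:
#   ./data/enwik8/enwik8_text_document.bin
#   ./data/enwik8/enwik8_text_document.idx
# and the shared prefix becomes the training config's data path:
#   "data-path": "./data/enwik8/enwik8_text_document",
```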
8 changes: 4 additions & 4 deletions configs/README.md
@@ -203,10 +203,10 @@ Our global batch size configuration follows deepspeed's and can be configured in
"data-impl": "mmap",
"split": "949,50,1",
# Suggested data paths when using GPT-NeoX locally
"data-path": "data/enron/enron_text_document",
#"train-data-path": "data/train/train_text_document",
#"test-data-path": "data/test/test_text_document",
#"valid-data-path": "data/valid/valid_text_document",
"data-path": "data/enwik8/enwik8_text_document",
#"train-data-path": "data/enwik8/enwik8_text_document",
#"test-data-path": "data/enwik8/enwik8_text_document",
#"valid-data-path": "data/enwik8/enwik8_text_document",
"vocab-file": "data/gpt2-vocab.json",
"merge-file": "data/gpt2-merges.txt",
"save": "checkpoints",
8 changes: 4 additions & 4 deletions configs/eleutherai_cluster.yml
@@ -1,9 +1,9 @@
# Data paths and options when using EleutherAI cluster
{
# you may include multiple distinct datasets if desired
"train-data-paths": ["/mnt/ssd-1/data/enron/enron_train_text_document"],
"valid-data-paths": ["/mnt/ssd-1/data/enron/enron_val_text_document"],
"test-data-paths": ["/mnt/ssd-1/data/enron/enron_test_text_document"],
"train-data-paths": ["/mnt/ssd-1/data/enwik8/enwik8_text_document"],
"valid-data-paths": ["/mnt/ssd-1/data/enwik8/enwik8_val_text_document"],
"test-data-paths": ["/mnt/ssd-1/data/enwik8/enwik8_test_text_document"],

# if using multiple datasets, provide weights for them to be sampled with
# "train-data-weights": [1., 2.],
@@ -15,7 +15,7 @@
# "split" determines the relative size of train, val, and test

# "split" 995,4,1
# "data_path": "/mnt/ssd-1/data/enron/enron_train_text_document",
# "data_path": "/mnt/ssd-1/data/enwik8/enwik8_text_document",

"vocab-file": "/mnt/ssd-1/data/gpt2-vocab.json",
"merge-file": "/mnt/ssd-1/data/gpt2-merges.txt",
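As the comments in this file note, the three explicit path lists can be replaced by a single corpus plus a `split`; a minimal sketch of that alternative, assuming the `995,4,1` ratio from the comment above (which carves one tokenized corpus into train, validation, and test portions):

```yaml
# One tokenized corpus, split 99.5% / 0.4% / 0.1% into train / valid / test
"split": "995,4,1",
"data_path": "/mnt/ssd-1/data/enwik8/enwik8_text_document",
```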
8 changes: 4 additions & 4 deletions configs/local_setup.yml
@@ -1,11 +1,11 @@
# Suggested data paths when using GPT-NeoX locally
{
"data-path": "data/enron/enron_text_document",
"data-path": "data/enwik8/enwik8_text_document",

# or for weighted datasets:
# "train-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"],
# "test-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"],
# "valid-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"],
# "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
# "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
# "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
# "train-data-weights": [1., 2.],
# "test-data-weights": [2., 1.],
# "valid-data-weights": [0.5, 0.4],
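For reference, uncommenting the weighted-dataset block above would yield a config like the following sketch; the repeated enwik8 path is only the placeholder used in the diff, and in practice each list entry would point at a distinct tokenized dataset:

```yaml
# Two datasets per split, sampled according to the weights below
"train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
"test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
"valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
"train-data-weights": [1., 2.],  # the second dataset is sampled twice as heavily
"test-data-weights": [2., 1.],
"valid-data-weights": [0.5, 0.4],
```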
2 changes: 1 addition & 1 deletion configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments

- **git_hash**: str

-Default = 6616b35
+Default = 1e65709

current git hash of repository

2 changes: 1 addition & 1 deletion configs/slurm_local.yml
@@ -1,5 +1,5 @@
{
"data-path": "data/enron/enron_text_document",
"data-path": "data/enwik8/enwik8_text_document",
"vocab-file": "data/gpt2-vocab.json",
"merge-file": "data/gpt2-merges.txt",
"save": "checkpoints",
2 changes: 1 addition & 1 deletion prepare_data.py
@@ -30,7 +30,7 @@ def get_args():
    parser.add_argument(
        "dataset",
        nargs="?",
-        default="enron",
+        default="enwik8",
        help="name of dataset to download.",
        choices=DATASET_CHOICES,
    )
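Because the positional `dataset` argument is declared with `nargs="?"` and a default, omitting it now selects enwik8. A sketch of the resulting CLI behavior (assuming `enron` remains in `DATASET_CHOICES`, which this diff does not remove):

```
# These two invocations are now equivalent:
python prepare_data.py -d ./data
python prepare_data.py -d ./data enwik8

# The previous default is still available by naming it explicitly:
python prepare_data.py -d ./data enron
```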
8 changes: 4 additions & 4 deletions tests/test_configs/test_train_base.yml
@@ -97,12 +97,12 @@
"wall_clock_breakdown": true,

# Suggested data paths when using GPT_NeoX locally
"data_path": "data/enron/enron_text_document",
"data_path": "data/enwik8/enwik8_text_document",

# or for weighted datasets:
# "train-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"],
# "test-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"],
# "valid-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"],
# "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
# "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
# "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
# "train-data-weights": [1., 2.],
# "test-data-weights": [2., 1.],
# "valid-data-weights": [0.5, 0.4],
