Merge pull request EleutherAI#57 from EleutherAI/sdtblck-patch-2

Revert GPT2Dataset back to old working state
HanzhouTang · Jan 13, 2021 · f229ab2
2 parents 67231bf + 2ec82d1
commit f229ab2
Show file tree

Hide file tree

Showing 2 changed files with 14 additions and 16 deletions.
diff --git a/configs/gpt3_small.json b/configs/gpt3_small.json
@@ -5,18 +5,18 @@
  "add_padding_token": false
  },
  "dataset": {
- "name": "owt2",
- "train_path": "./data/owt2/train/*",
- "eval_path": "./data/owt2/eval/*",
+ "name": "enron_tfr",
+ "train_path": "./data/enron_tfr/tokenized/*.tfrecords",
+ "eval_path": "./data/enron_tfr/tokenized/*.tfrecords",
  "seed": 1,
  "shuffle_input_filenames": true,
  "pretokenized": true,
  "filetype": "tfrecords",
  "mode": "chunks"
  },
  "train_steps": 572300,
+ "eval_batch_size": 32,
  "learning_rate": 0.0006,
- "generate_every": 500,
  "generate_length": 256,
  "seq_len": 1024,
  "hidden_dim": 768,

diff --git a/gpt_neox/datasets.py b/gpt_neox/datasets.py
@@ -3,7 +3,7 @@
 from .data_utils import get_tokenizer, natural_sort, skip, FixedSizeOrderedDict
 import random
 import glob
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
 import re
 import logging
 from itertools import cycle
@@ -71,19 +71,17 @@ def _get_lens(self):
  lens.append(n_documents)
  self.lens = lens
  self._len = sum(self.lens)
- 
- def _parse_function(self, example_proto):
- features = {
-  "text": tf.io.VarLenFeature(tf.int64)
- }
- parsed_features = tf.io.parse_single_example(example_proto, features)
- return tf.sparse.to_dense(parsed_features["text"], parsed_features["text"].dense_shape[0])
+
+ def _parse_single_example(self, example):
+ data = tf.train.Example.FromString(example)
+ data = torch.tensor(list(data.features.feature["text"].int64_list.value), dtype=torch.long)
+ if self.mode == "chunks":
+  assert data.size(0) == self.seq_len + 1
+ return data
 
  def _process_tfrecord(self, tfrecords_file, resume_idx=None):
- dataset = tf.data.TFRecordDataset([tfrecords_file])
- dataset = dataset.map(self._parse_function, num_parallel_calls=1)
- for example in dataset.as_numpy_iterator():
- yield torch.tensor(example, dtype=torch.long)
+ for idx, example in enumerate(tf.io.tf_record_iterator(tfrecords_file)):
+ yield self._parse_single_example(example)
 
  def _maybe_process_tfrecord(self, file_idx):
  if self.processed_files.get(file_idx) is None: