From 3c8a969724f6d7af980f25d36d9044aa286f9411 Mon Sep 17 00:00:00 2001
From: sdtblck <46172032+sdtblck@users.noreply.github.com>
Date: Wed, 13 Jan 2021 01:19:43 +0100
Subject: [PATCH 1/2] revert dataset fn back to working state

---
 gpt_neox/datasets.py | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/gpt_neox/datasets.py b/gpt_neox/datasets.py
index d64f120c3..4bfb4efa4 100644
--- a/gpt_neox/datasets.py
+++ b/gpt_neox/datasets.py
@@ -3,7 +3,7 @@
 from .data_utils import get_tokenizer, natural_sort, skip, FixedSizeOrderedDict
 import random
 import glob
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
 import re
 import logging
 from itertools import cycle
@@ -71,19 +71,17 @@ def _get_lens(self):
             lens.append(n_documents)
         self.lens = lens
         self._len = sum(self.lens)
-
-    def _parse_function(self, example_proto):
-        features = {
-            "text": tf.io.VarLenFeature(tf.int64)
-        }
-        parsed_features = tf.io.parse_single_example(example_proto, features)
-        return tf.sparse.to_dense(parsed_features["text"], parsed_features["text"].dense_shape[0])
+
+    def _parse_single_example(self, example):
+        data = tf.train.Example.FromString(example)
+        data = torch.tensor(list(data.features.feature["text"].int64_list.value), dtype=torch.long)
+        if self.mode == "chunks":
+            assert data.size(0) == self.seq_len + 1
+        return data
 
     def _process_tfrecord(self, tfrecords_file, resume_idx=None):
-        dataset = tf.data.TFRecordDataset([tfrecords_file])
-        dataset = dataset.map(self._parse_function, num_parallel_calls=1)
-        for example in dataset.as_numpy_iterator():
-            yield torch.tensor(example, dtype=torch.long)
+        for idx, example in enumerate(tf.io.tf_record_iterator(tfrecords_file)):
+            yield self._parse_single_example(example)
 
     def _maybe_process_tfrecord(self, file_idx):
         if self.processed_files.get(file_idx) is None:

From 2ec82d1e4966e7a6163f5e629f41d1f79c990718 Mon Sep 17 00:00:00 2001
From: sdtblck <46172032+sdtblck@users.noreply.github.com>
Date: Wed, 13 Jan 2021 01:20:35 +0100
Subject: [PATCH 2/2] Update gpt3_small.json

---
 configs/gpt3_small.json | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/configs/gpt3_small.json b/configs/gpt3_small.json
index 4806d55e4..ce4dc3f3a 100644
--- a/configs/gpt3_small.json
+++ b/configs/gpt3_small.json
@@ -5,9 +5,9 @@
     "add_padding_token": false
   },
   "dataset": {
-    "name": "owt2",
-    "train_path": "./data/owt2/train/*",
-    "eval_path": "./data/owt2/eval/*",
+    "name": "enron_tfr",
+    "train_path": "./data/enron_tfr/tokenized/*.tfrecords",
+    "eval_path": "./data/enron_tfr/tokenized/*.tfrecords",
     "seed": 1,
     "shuffle_input_filenames": true,
     "pretokenized": true,
@@ -15,8 +15,8 @@
     "mode": "chunks"
   },
   "train_steps": 572300,
+  "eval_batch_size": 32,
   "learning_rate": 0.0006,
-  "generate_every": 500,
   "generate_length": 256,
   "seq_len": 1024,
   "hidden_dim": 768,
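
For reference, a minimal standalone sketch (not part of the patches above) of the TFRecord reading path that PATCH 1/2 restores in gpt_neox/datasets.py. The module-level function names, the example file path, and the commented usage are illustrative assumptions; only the tf.io.tf_record_iterator / tf.train.Example.FromString pattern and the seq_len + 1 check in "chunks" mode come from the diff itself.

# Illustrative sketch only -- mirrors the pattern restored by PATCH 1/2.
# Assumes TensorFlow (compat.v1) and PyTorch are installed; the file path in
# the commented usage below is hypothetical.
import tensorflow.compat.v1 as tf
import torch

SEQ_LEN = 1024  # matches "seq_len" in configs/gpt3_small.json


def parse_single_example(example_bytes, mode="chunks", seq_len=SEQ_LEN):
    # Decode one serialized tf.train.Example and read its "text" int64 list
    # of token IDs into a LongTensor.
    data = tf.train.Example.FromString(example_bytes)
    tokens = torch.tensor(
        list(data.features.feature["text"].int64_list.value), dtype=torch.long
    )
    if mode == "chunks":
        # In "chunks" mode every record holds exactly seq_len + 1 token IDs.
        assert tokens.size(0) == seq_len + 1
    return tokens


def process_tfrecord(tfrecords_file):
    # Iterate over the raw serialized records directly, with no tf.data
    # pipeline, exactly as the reverted _process_tfrecord does.
    for example_bytes in tf.io.tf_record_iterator(tfrecords_file):
        yield parse_single_example(example_bytes)


# Hypothetical usage:
# for sample in process_tfrecord("./data/enron_tfr/tokenized/shard_0.tfrecords"):
#     print(sample.shape)  # torch.Size([1025])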