From 3c8a969724f6d7af980f25d36d9044aa286f9411 Mon Sep 17 00:00:00 2001
From: sdtblck <46172032+sdtblck@users.noreply.github.com>
Date: Wed, 13 Jan 2021 01:19:43 +0100
Subject: [PATCH 1/2] revert dataset fn back to working state

---
 gpt_neox/datasets.py | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/gpt_neox/datasets.py b/gpt_neox/datasets.py
index d64f120c3..4bfb4efa4 100644
--- a/gpt_neox/datasets.py
+++ b/gpt_neox/datasets.py
@@ -3,7 +3,7 @@
 from .data_utils import get_tokenizer, natural_sort, skip, FixedSizeOrderedDict
 import random
 import glob
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
 import re
 import logging
 from itertools import cycle
@@ -71,19 +71,17 @@ def _get_lens(self):
             lens.append(n_documents)
         self.lens = lens
         self._len = sum(self.lens)
-
-    def _parse_function(self, example_proto):
-        features = {
-            "text": tf.io.VarLenFeature(tf.int64)
-        }
-        parsed_features = tf.io.parse_single_example(example_proto, features)
-        return tf.sparse.to_dense(parsed_features["text"], parsed_features["text"].dense_shape[0])
+
+    def _parse_single_example(self, example):
+        data = tf.train.Example.FromString(example)
+        data = torch.tensor(list(data.features.feature["text"].int64_list.value), dtype=torch.long)
+        if self.mode == "chunks":
+            assert data.size(0) == self.seq_len + 1
+        return data
 
     def _process_tfrecord(self, tfrecords_file, resume_idx=None):
-        dataset = tf.data.TFRecordDataset([tfrecords_file])
-        dataset = dataset.map(self._parse_function, num_parallel_calls=1)
-        for example in dataset.as_numpy_iterator():
-            yield torch.tensor(example, dtype=torch.long)
+        for idx, example in enumerate(tf.io.tf_record_iterator(tfrecords_file)):
+            yield self._parse_single_example(example)
 
     def _maybe_process_tfrecord(self, file_idx):
         if self.processed_files.get(file_idx) is None:

From 2ec82d1e4966e7a6163f5e629f41d1f79c990718 Mon Sep 17 00:00:00 2001
From: sdtblck <46172032+sdtblck@users.noreply.github.com>
Date: Wed, 13 Jan 2021 01:20:35 +0100
Subject: [PATCH 2/2] Update gpt3_small.json

---
 configs/gpt3_small.json | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/configs/gpt3_small.json b/configs/gpt3_small.json
index 4806d55e4..ce4dc3f3a 100644
--- a/configs/gpt3_small.json
+++ b/configs/gpt3_small.json
@@ -5,9 +5,9 @@
     "add_padding_token": false
   },
   "dataset": {
-    "name": "owt2",
-    "train_path": "./data/owt2/train/*",
-    "eval_path": "./data/owt2/eval/*",
+    "name": "enron_tfr",
+    "train_path": "./data/enron_tfr/tokenized/*.tfrecords",
+    "eval_path": "./data/enron_tfr/tokenized/*.tfrecords",
     "seed": 1,
     "shuffle_input_filenames": true,
     "pretokenized": true,
@@ -15,8 +15,8 @@
     "mode": "chunks"
   },
   "train_steps": 572300,
+  "eval_batch_size": 32,
   "learning_rate": 0.0006,
-  "generate_every": 500,
   "generate_length": 256,
   "seq_len": 1024,
   "hidden_dim": 768,
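
For reference, a minimal standalone sketch (not part of the patches above) of the TFRecord reading path that PATCH 1/2 restores in gpt_neox/datasets.py. The module-level function names, the example file path, and the commented usage are illustrative assumptions; only the tf.io.tf_record_iterator / tf.train.Example.FromString pattern and the seq_len + 1 check in "chunks" mode come from the diff itself.

# Illustrative sketch only -- mirrors the pattern restored by PATCH 1/2.
# Assumes TensorFlow (compat.v1) and PyTorch are installed; the file path in
# the commented usage below is hypothetical.
import tensorflow.compat.v1 as tf
import torch

SEQ_LEN = 1024  # matches "seq_len" in configs/gpt3_small.json


def parse_single_example(example_bytes, mode="chunks", seq_len=SEQ_LEN):
    # Decode one serialized tf.train.Example and read its "text" int64 list
    # of token IDs into a LongTensor.
    data = tf.train.Example.FromString(example_bytes)
    tokens = torch.tensor(
        list(data.features.feature["text"].int64_list.value), dtype=torch.long
    )
    if mode == "chunks":
        # In "chunks" mode every record holds exactly seq_len + 1 token IDs.
        assert tokens.size(0) == seq_len + 1
    return tokens


def process_tfrecord(tfrecords_file):
    # Iterate over the raw serialized records directly, with no tf.data
    # pipeline, exactly as the reverted _process_tfrecord does.
    for example_bytes in tf.io.tf_record_iterator(tfrecords_file):
        yield parse_single_example(example_bytes)


# Hypothetical usage:
# for sample in process_tfrecord("./data/enron_tfr/tokenized/shard_0.tfrecords"):
#     print(sample.shape)  # torch.Size([1025])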