add dynamic dataset for processing/tokenizing examples lazily #46

Closed
wants to merge 7 commits
minor fixes after testing.
trisongz committed Jan 12, 2021
commit 423d6889746543669d6e0a257d653527ae9e43b2
gpt_neox/datasets.py: 14 changes (8 additions, 6 deletions)
@@ -182,11 +182,11 @@ def setup_files(self, input_files):
         for x, file_path in enumerate(self.files):
             total_lines = self.total_lines_in_file(file_path)
             self.file_idx[x] = {
-                'start': self.total_lines, 'stop': (self.total_lines + total_lines - 1),
-                'file': file_path, 'reader': LineSeekableFile(tf.io.gfile.GFile(file_path, 'r'))
+                'start': self.total_lines, 'stop': (self.total_lines + total_lines),
+                'file': file_path, 'reader': LineSeekableFile(tf.io.gfile.GFile(file_path, 'rb'))
             }
             if self.debug:
-                logging.debug(f'File IDX Start: {self.total_lines} - File IDX End: {self.total_lines + total_lines - 1}')
+                logging.debug(f'File IDX Start: {self.total_lines} - File IDX End: {self.total_lines + total_lines}')
             self.total_lines += total_lines
         if self.debug:
             logging.debug(f'Total Files: {self.total_files}. Total Lines: {self.total_lines}')
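
For context, a minimal sketch of how a lookup over the [start, stop) ranges built above could resolve a global line index to a file once 'stop' is exclusive. The helper name and the linear scan are illustrative assumptions, not this PR's actual get_file_line implementation.

# Illustrative helper only (assumed name): maps a global line index to a
# (file entry, local line number) pair using the half-open [start, stop)
# ranges that setup_files builds, which is why 'stop' drops the "- 1" above.
def find_file_entry(file_idx, idx):
    for entry in file_idx.values():
        if entry['start'] <= idx < entry['stop']:
            return entry, idx - entry['start']
    raise IndexError(f'line index {idx} is out of range')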
@@ -201,22 +201,24 @@ def get_file_line(self, idx):
 
     def parse_json(self, line):
         try:
-            return self.parser.parse(line).as_dict()
+            return self.parser.parse(line).as_dict()[self.target_field]
         except ValueError:
             return line
+        except TypeError:
+            return line
 
     @classmethod
     def total_lines_in_file(cls, file_path):
         return int(subprocess.check_output(['wc', '-l', file_path]).split()[0])
 
     def tokenize_example(self, ex):
-        return self.tokenizer(ex[self.target_field], max_length=self.max_seq_len, padding='max_length', truncation=True, return_tensors='pt')['input_ids']
+        return self.tokenizer(ex, max_length=self.max_seq_len, padding='max_length', truncation=True, return_tensors='pt')['input_ids']
 
     def __getitem__(self, idx):
         if self.debug:
             logging.debug(f'Getting IDX: {idx}')
         ex = self.get_file_line(idx)
-        return self.tokenize_example(self.parse_json(ex.strip()))
+        return self.tokenize_example(self.parse_json(ex.strip()))
 
     def __len__(self):
         return self.total_lines
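
For reference, a standalone sketch of the per-item path that __getitem__ now takes: raw line, then parse_json extracting target_field, then tokenize_example producing fixed-length input_ids. The GPT-2 tokenizer, the json stand-in for the simdjson parser, and the sample line are assumptions for illustration, not part of this PR.

# Standalone illustration of the lazy per-example pipeline (assumed details).
import json
from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 defines no pad token by default

line = '{"text": "one training example stored as a single JSON line"}\n'
text = json.loads(line.strip())['text']  # stands in for parse_json + target_field
input_ids = tokenizer(text, max_length=128, padding='max_length',
                      truncation=True, return_tensors='pt')['input_ids']
print(input_ids.shape)  # torch.Size([1, 128])

Because each example is read and tokenized only when indexed, no corpus-wide preprocessing pass is needed before training.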