Skip to content

Commit

Permalink
Chunk DM Math
Browse files Browse the repository at this point in the history
  • Loading branch information
leogao2 committed Nov 11, 2020
1 parent a27af7c commit 1610e61
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 5 deletions.
8 changes: 4 additions & 4 deletions the_pile/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,20 +206,20 @@ def _download(self):
def documents(self):
    """Yield the DM Math training data as (text, metadata) pairs.

    Ensures the data is downloaded, then reads every file listed under the
    ``train-easy``, ``train-medium`` and ``train-hard`` directories,
    re-chunks the result with ``chunk_at_even_lines`` using a chunk size
    of 8192, and wraps each chunk with ``dummy_meta``.
    """
    self._download()

    # NOTE(review): `concat` presumably flattens the three per-difficulty
    # iterators into a single stream of file contents -- confirm in utils.
    return dummy_meta(chunk_at_even_lines(concat(
        map(
            lambda x: map(fread, ls('components/dm_math/mathematics_dataset-v1.0/train-' + x)),
            ['easy', 'medium', 'hard'])
    ), 8192))

def clean(self):
    """Remove the ``components/dm_math`` path via ``rm_if_exists``."""
    rm_if_exists('components/dm_math')

def size(self):
    """Return the hard-coded size of this dataset.

    NOTE(review): the unit is presumably bytes -- confirm against the
    other datasets in this module.
    """
    return 8_316_165_951

def num_docs(self):
    """Return the hard-coded number of documents in this dataset.

    NOTE(review): 1014997 presumably is the count after the data is
    re-chunked in ``documents`` (the earlier value was 168) -- re-measure
    if the chunking logic or chunk size changes.
    """
    return 1014997


class EnronEmailsDataset(Dataset):
Expand Down
17 changes: 16 additions & 1 deletion the_pile/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,4 +199,19 @@ def parse_size(sizestr):
return size * 1024 * 1024 * 1024 * 1024

def dummy_meta(xs):
    """Pair every item of *xs* with a fresh, empty metadata dict.

    Returns a lazy generator of ``(item, {})`` tuples; each tuple gets its
    own dict object, so mutating one item's metadata never affects another.
    """
    return ((x, {}) for x in xs)

def chunk_at_even_lines(it, chunksize):
    """Split each document into chunks that end on an even number of lines.

    A chunk is emitted once the accumulated line length exceeds *chunksize*
    and the current line index is odd, i.e. only after a whole number of
    two-line pairs, so paired lines are never separated. Whatever is left
    at the end of a document is emitted as a final, possibly shorter, chunk.
    Chunks never span two documents.

    Args:
        it: iterable of documents. A document is either a string (split
            into lines on ``'\\n'``) or an iterable of line strings.
        chunksize: length threshold, in characters of line content; the
            joining newlines are not counted.

    Yields:
        Each chunk as a single string with its lines joined by ``'\\n'``.
    """
    for doc in it:
        if isinstance(doc, str):
            # Iterating a str directly would walk it character by
            # character, so split it into lines explicitly.
            lines = doc.split('\n')
            # A trailing newline leaves an empty last element; drop it so
            # it neither breaks the pairing nor produces an empty chunk.
            if lines[-1] == '':
                lines.pop()
        else:
            lines = doc

        totlen = 0
        res = []
        for i, line in enumerate(lines):
            res.append(line)
            totlen += len(line)

            # Only cut after an odd index so chunks hold whole line pairs.
            if totlen > chunksize and i % 2 == 1:
                yield '\n'.join(res)
                totlen = 0
                res = []
        if res:
            yield '\n'.join(res)

0 comments on commit 1610e61

Please sign in to comment.