Skip to content

Commit

Permalink
Update table generation
Browse files Browse the repository at this point in the history
  • Loading branch information
leogao2 committed Nov 9, 2020
1 parent 2fa0a7d commit 0de48f1
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 10 deletions.
31 changes: 27 additions & 4 deletions the_pile/pile.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,24 +92,47 @@ def take(n, iter):
break
return ret

def _latex_table(values):
    """Render the formatted table rows as a LaTeX table environment.

    ``values`` is the fully formatted row list built by ``mk_table``. Its last
    entry is the totals row, which is typeset in bold below a rule; every
    other entry becomes one ordinary table row.
    """
    def escape(cell):
        # '%' starts a comment in LaTeX, so the percentage cells must be escaped.
        return str(cell).replace('%', r'\%')

    rows = [" " + " & ".join(escape(cell) for cell in row) + r" \\" for row in values[:-1]]

    # The totals row drops its own label (the template below supplies
    # "The Pile") and leaves empty cells blank rather than emitting \textbf{}.
    totalrow = " & ".join(
        r'\textbf{%s}' % escape(cell) if cell else ""
        for cell in values[-1][1:]
    ) + r" \\"

    return r"""
\begin{table*}[t!]
\centering
\begin{tabular}{l r r r r r}
\toprule
\textbf{Component} & \textbf{Raw Size} & \textbf{Weight} & \textbf{Copies} & \textbf{Effective Size} & \textbf{Mean Document Size} \\
\midrule
""" + "\n".join(rows) + r"""
\midrule
\textbf{The Pile} & """ + totalrow + r"""
\bottomrule
\end{tabular}
\caption{Overview of datasets in \textit{The Pile} before deduplication. The Pile is distributed with a predefined up/down-sampling of the different constituent datasets.}
\label{table:pile_overview}
\end{table*}
"""


def mk_table(datasets, train_chars, print_latex=True):
    """Build the dataset overview table and return it as Markdown text.

    Args:
        datasets: A re-iterable collection of ``(dataset, weight)`` pairs (it
            is traversed more than once). Each ``dataset`` must provide
            ``name()``, ``size()`` and ``num_docs()``. Sizes are passed to
            ``humanbytes``, so they are treated as byte counts.
        train_chars: Training budget used for the "Epochs" column, computed as
            ``train_chars / size * relative_weight`` per dataset.
        print_latex: When true (the default), a LaTeX rendering of the same
            table is also printed to stdout.

    Returns:
        Whatever ``MarkdownTableWriter.dumps()`` returns for the table
        (presumably the rendered Markdown string; confirm against the
        table-writer library in use).
    """
    values = []

    total_weight = sum([x[1] * x[0].size() for x in datasets])

    for dataset, weight in datasets:
        size = dataset.size()
        # Share of the weighted corpus contributed by this dataset.
        relative_weight = size * weight / total_weight
        values.append([
            dataset.name(),
            size,
            '{:.2%}'.format(relative_weight),
            '{:.4f}'.format(train_chars / size * relative_weight),
            size * weight,
            humanbytes(size / dataset.num_docs(), 'KiB'),
        ])

    # Largest effective size first. Negating the key (instead of reverse=True)
    # keeps tied rows in their original relative order.
    values.sort(key=lambda x: -x[4])

    # Totals row: raw size is deliberately left blank; the mean document size
    # is computed over all documents of all datasets.
    values.append([
        '**Total**',
        "",
        "",
        "",
        sum([x[4] for x in values]),
        humanbytes(sum([x[1] for x in values]) / sum(x[0].num_docs() for x in datasets), 'KiB'),
    ])

    # Convert the numeric size columns to fixed-unit strings for display. The
    # blank raw size of the totals row stays blank.
    values = [
        [x[0], humanbytes(x[1], 'GiB') if x[1] else "", x[2], x[3], humanbytes(x[4], 'GiB'), x[5]]
        for x in values
    ]

    writer = MarkdownTableWriter()
    writer.table_name = "The Pile™"
    writer.headers = ["Component", "Raw Size", "Weight", "Epochs", "Effective Size", "Mean Document Size"]
    writer.value_matrix = values

    if print_latex:
        print(_latex_table(values))

    return writer.dumps()


Expand Down
12 changes: 6 additions & 6 deletions the_pile/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,23 +147,23 @@ def rm_if_exists(path):


# https://stackoverflow.com/questions/12523586/python-format-size-application-converting-b-to-kb-mb-gb-tb/37423778
def humanbytes(B, units=None):
    """Return the given byte count as a human friendly string.

    Args:
        B: Number of bytes; anything accepted by ``float()``.
        units: ``None`` to pick the unit from the magnitude of ``B``, or one
            of ``"B"``, ``"KiB"``, ``"MiB"``, ``"GiB"``, ``"TiB"`` to force
            that unit regardless of magnitude.

    Returns:
        ``"<value> Byte(s)"`` for the byte unit (value printed as a float),
        otherwise ``"<value> <unit>"`` with the value rounded to two decimals.

    Raises:
        ValueError: If ``units`` is not ``None`` and not a supported unit.
    """
    B = float(B)
    KB = float(1024)
    MB = float(KB ** 2)  # 1,048,576
    GB = float(KB ** 3)  # 1,073,741,824
    TB = float(KB ** 4)  # 1,099,511,627,776

    if units is None:
        # Pick the largest unit that keeps the value at or above 1.
        if B < KB:
            units = "B"
        elif B < MB:
            units = "KiB"
        elif B < GB:
            units = "MiB"
        elif B < TB:
            units = "GiB"
        else:
            units = "TiB"

    if units == "B":
        # Singular only for exactly one byte.
        return '{0} {1}'.format(B, 'Byte' if B == 1 else 'Bytes')

    divisors = {"KiB": KB, "MiB": MB, "GiB": GB, "TiB": TB}
    if units not in divisors:
        # Fail loudly instead of falling through and returning None.
        raise ValueError('unsupported units: {0!r}'.format(units))

    return '{0:.2f} {1}'.format(B / divisors[units], units)


Expand Down

0 comments on commit 0de48f1

Please sign in to comment.