Skip to content

Commit

Permalink
Added minimum_length parameter
Browse files Browse the repository at this point in the history
  • Loading branch information
ConnorJL committed Jul 3, 2019
1 parent b844888 commit 4563475
Showing 1 changed file with 2 additions and 1 deletion.
3 changes: 2 additions & 1 deletion datasets/openwebtext/create_tfrecords.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
files = glob.glob(os.path.join(base_dir, "**/*.txt"))
processes = 64 # Number of encoding processes to run
encoder_path = "gs://openwebtext/stuff/encoder" # Path to encoder files
minimum_size = 25

def _int64_feature(value):
"""Returns an int64_list from a bool / enum / int / uint."""
Expand Down Expand Up @@ -59,7 +60,7 @@ def create_file(args):
d = f.read()
d = ftfy.fix_text(d, normalization='NFKC')
data = np.array(enc.encode(d), np.int32)
if data.shape[0] < 25 or (data == 0).all(): # If text is shorter than 25 tokens, or all tokens are 0, ignore
if data.shape[0] < minimum_size or (data == 0).all(): # If text is shorter than 25 tokens, or all tokens are 0, ignore
continue
hash = fn.split("/")[-1].split(".")[0]
feature = {
Expand Down

0 comments on commit 4563475

Please sign in to comment.