Skip to content

Commit

Permalink
Default read amount
Browse files (browse the repository at this point in the history)
  • Loading branch information
leogao2 committed Nov 11, 2020
1 parent a5e0ad9 commit 7637470
Showing 1 changed file with 8 additions and 3 deletions.
11 changes: 8 additions & 3 deletions the_pile/pile.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -329,7 +329,7 @@ def docs_for_dedupe():
parser.add_argument('--make_lang_analysis', action='store_true', help='make language analysis data')
parser.add_argument('--make_dataset_samples', type=int, help='make dataset sample data')
parser.add_argument('--profile', action='store_true', help='turn on profiler')
- parser.add_argument('--read_amount', type=str, default='1200G', help='the size of the data read from the set')
+ parser.add_argument('--read_amount', type=str, help='the size of the data read from the set')

args = parser.parse_args()
random.seed(42)
Expand All @@ -338,10 +338,15 @@ def docs_for_dedupe():
# add CC
datasets.append((CommonCrawlDataset(), 1.))

- print(mk_table(datasets, parse_size(args.read_amount)))
+ if args.read_amount is None:
+ args.read_amount = sum([ds.size() * epochs for ds, epochs in datasets])
+ else:
+ args.read_amount = parse_size(args.read_amount)
+
+ print(mk_table(datasets, args.read_amount))

if args.using == 'pile_reprod' or args.using == 'pile_reprod_no_cc':
- pile = PileReplication(datasets, parse_size(args.read_amount), profile=args.profile)
+ pile = PileReplication(datasets, args.read_amount, profile=args.profile)
elif args.using == 'cc':
pile = CommonCrawlDataset()
elif args.using == 'pile':
Expand Down

0 comments on commit 7637470

Please sign in to comment.