recent fixes to sharding scripts

EleutherAI · haileyschoelkopf · Jan 4, 2023 · Dec 27, 2022 · Dec 27, 2022 · Dec 28, 2022
commit f3eb842b143f78a2a22504a02994160ce87d24ba
@@ -9,8 +9,8 @@ def shard(
  input_file: str,
  output_dir: str,
 ):
- """Shard a Megatron .bin file into ~ 9 GB chunks"""
- SHARD_SIZE = 10_000_000_000 # bytes ~= 9 GB 
+ """Shard a Megatron .bin file into ~ 4.5 GB chunks"""
+ SHARD_SIZE = 5_000_000_000 # bytes: 5 GB (decimal) ~= 4.66 GiB
 
  # load in memmapped .bin file
  full_idx_map = np.memmap(input_file, mode="r", order="C")
@@ -58,4 +58,4 @@ def shard(
 
  os.makedirs(args.output_dir, exist_ok=True)
 
- shard(args.input_file, args.output_dir)
+ shard(args.input_file, args.output_dir)
@@ -11,11 +11,15 @@ def unshard(
  output_dir: str,
 ):
  """Reconstruct a Megatron .bin file from shards""" 
-
+ SHARD_SIZE = 5_000_000_000
+
  input_dir = os.path.dirname(input_file)
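  base_filename = os.path.basename(input_file)[:-19] # strip the last 19 characters from shard 0's filename: the 18-character "00000-of-xxxxx.bin" suffix plus the one character before it (presumably a separator)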
+
 
- full_idx_map = None
+ open(os.path.join(output_dir, base_filename) + ".bin", "w+").close()
+ full_idx_map = np.memmap(os.path.join(output_dir, base_filename) + ".bin", shape=(num_shards * SHARD_SIZE,), mode="w+", order="C")
+ print(full_idx_map.shape)
 
  # chunk by iterating over file
  print(f"Loading {num_shards} shards from {input_dir}")
@@ -25,11 +29,15 @@ def unshard(
  print(shard_filename)
  shard_memmap = np.memmap(shard_filename, mode="r", order="C")
 
- if not full_idx_map:
- full_idx_map = shard_memmap
- else: 
- np.concatenate([full_idx_map, shard_memmap])
+ #if full_idx_map is None:
+ # full_idx_map = shard_memmap
+ #else:
+ # full_idx_map = np.concatenate([full_idx_map, shard_memmap])
+
+ full_idx_map[i * SHARD_SIZE: (i + 1) * SHARD_SIZE] = shard_memmap
 
+ del shard_memmap
+ print(full_idx_map.shape)
 
  # write full file
  with open(os.path.join(output_dir, base_filename) + ".bin", "wb+") as out_full_file:
@@ -51,7 +59,7 @@ def unshard(
  parser.add_argument(
  "--num_shards",
  type=int,
- help="Provide number of shards (The total seen in shard filenames)"
+ help="Provide number of shards (The total seen in shard filenames + 1)"
  )
  parser.add_argument(
  "--output_dir",
@@ -62,4 +70,4 @@ def unshard(
 
  os.makedirs(args.output_dir, exist_ok=True)
 
- unshard(args.input_file, args.num_shards, args.output_dir)
+ unshard(args.input_file, args.num_shards, args.output_dir)