wip dataprep

RWKV · PicoCreator · Mar 6, 2024 · Mar 6, 2024 · Mar 6, 2024 · Mar 6, 2024
commit 05eb723e61e5735b9e0e2a4147495ac706585854
diff --git a/notebook/major-runs/Eagle-2T-retune/data-prep.ipynb b/notebook/major-runs/Eagle-2T-retune/data-prep.ipynb
@@ -223,14 +223,144 @@
  "!cd \"{TRAINER_DIR}\" && python3 datapack_build.py \"{NOTEBOOK_DIR}/retune-data-build-no-mask.yaml\""
  ]
  },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ ">> Starting datapack build process for: /workspace/picocreator/RWKV-infctx-trainer/notebook/major-runs/Eagle-2T-retune/retune-data-build-no-mask-32k.yaml\n",
+ ">> Preparing dataset - index: 0 - name: lambada-train\n",
+ "Map (num_proc=160): 100%|███████████| 2662/2662 [00:09<00:00, 294.89 examples/s]\n",
+ "Filter (num_proc=160): 100%|████████| 2662/2662 [00:03<00:00, 723.14 examples/s]\n",
+ "Map (num_proc=160): 100%|███████████| 2661/2661 [00:06<00:00, 436.35 examples/s]\n",
+ "Map (num_proc=160): 100%|██████████| 7221/7221 [00:06<00:00, 1196.52 examples/s]\n",
+ "Warning: packing_enable=true, with text rechunking (either auto, or forced) - packing_enable will be treated as false\n",
+ "Saving the dataset (3/3 shards): 100%|█| 7221/7221 [00:08<00:00, 860.36 examples\n",
+ "Saving the dataset (1/1 shards): 100%|████| 1/1 [00:00<00:00, 122.92 examples/s]\n",
+ ">> Preparing dataset - index: 1 - name: enwiki-train\n",
+ "Map (num_proc=160): 100%|███| 1000000/1000000 [00:21<00:00, 46232.78 examples/s]\n",
+ "Filter (num_proc=160): 100%|█| 1000000/1000000 [00:07<00:00, 136515.60 examples/\n",
+ "Map (num_proc=160): 100%|█████| 472276/472276 [00:14<00:00, 33143.01 examples/s]\n",
+ "Map (num_proc=160): 100%|████████| 15456/15456 [00:12<00:00, 1265.51 examples/s]\n",
+ "Warning: packing_enable=true, with text rechunking (either auto, or forced) - packing_enable will be treated as false\n",
+ "Saving the dataset (7/7 shards): 100%|█| 15456/15456 [00:18<00:00, 850.72 exampl\n",
+ "Saving the dataset (1/1 shards): 100%|████| 1/1 [00:00<00:00, 108.06 examples/s]\n",
+ ">> Preparing dataset - index: 2 - name: balanced-copa-choices\n",
+ "Map (num_proc=160): 100%|███████████| 1000/1000 [00:01<00:00, 944.86 examples/s]\n",
+ "Filter (num_proc=160): 100%|████████| 1000/1000 [00:01<00:00, 972.65 examples/s]\n",
+ "Map (num_proc=160): 100%|█████████████| 999/999 [00:02<00:00, 414.00 examples/s]\n",
+ "Map (num_proc=160): 100%|█████████████| 999/999 [00:01<00:00, 540.03 examples/s]\n",
+ "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 4872.92 examples/\n",
+ "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 46.52 examples/s]\n",
+ ">> Preparing dataset - index: 3 - name: balanced-copa-options\n",
+ "Map (num_proc=160): 100%|███████████| 1000/1000 [00:01<00:00, 935.64 examples/s]\n",
+ "Filter (num_proc=160): 100%|████████| 1000/1000 [00:01<00:00, 968.80 examples/s]\n",
+ "Map (num_proc=160): 100%|█████████████| 999/999 [00:02<00:00, 433.41 examples/s]\n",
+ "Map (num_proc=160): 100%|█████████████| 999/999 [00:01<00:00, 528.35 examples/s]\n",
+ "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 5012.50 examples/\n",
+ "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 45.30 examples/s]\n",
+ ">> Preparing dataset - index: 4 - name: MedText-QA\n",
+ "Map (num_proc=160): 100%|██████████| 1412/1412 [00:01<00:00, 1370.04 examples/s]\n",
+ "Filter (num_proc=160): 100%|███████| 1412/1412 [00:00<00:00, 1415.80 examples/s]\n",
+ "Map (num_proc=160): 100%|███████████| 1411/1411 [00:02<00:00, 658.51 examples/s]\n",
+ "Map (num_proc=160): 100%|███████████| 1411/1411 [00:01<00:00, 766.49 examples/s]\n",
+ "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 4222.99 examples/\n",
+ "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 43.36 examples/s]\n",
+ ">> Preparing dataset - index: 5 - name: ALMA-prompt-completion\n",
+ "Map (num_proc=160): 100%|█████| 117404/117404 [00:01<00:00, 84427.68 examples/s]\n",
+ "Filter (num_proc=160): 100%|█| 117404/117404 [00:01<00:00, 103449.36 examples/s]\n",
+ "Map (num_proc=160): 100%|█████| 117403/117403 [00:02<00:00, 48025.94 examples/s]\n",
+ "Map (num_proc=160): 100%|█████| 117403/117403 [00:01<00:00, 61484.38 examples/s]\n",
+ "Saving the dataset (1/1 shards): 100%|█| 411/411 [00:00<00:00, 1180.88 examples/\n",
+ "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 42.24 examples/s]\n",
+ ">> Preparing dataset - index: 6 - name: openbookqa-answer-choice\n",
+ "Map (num_proc=160): 100%|██████████| 4957/4957 [00:01<00:00, 4669.88 examples/s]\n",
+ "Filter (num_proc=160): 100%|███████| 4957/4957 [00:00<00:00, 5115.47 examples/s]\n",
+ "Map (num_proc=160): 100%|██████████| 4956/4956 [00:02<00:00, 2154.53 examples/s]\n",
+ "Map (num_proc=160): 100%|██████████| 4956/4956 [00:01<00:00, 2685.23 examples/s]\n",
+ "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 1163.91 examples/\n",
+ "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 44.53 examples/s]\n",
+ ">> Preparing dataset - index: 7 - name: winogrande-debiased-choices\n",
+ "Map (num_proc=160): 100%|██████████| 9248/9248 [00:01<00:00, 8858.91 examples/s]\n",
+ "Filter (num_proc=160): 100%|███████| 9248/9248 [00:01<00:00, 9138.30 examples/s]\n",
+ "Map (num_proc=160): 100%|██████████| 9247/9247 [00:02<00:00, 3961.32 examples/s]\n",
+ "Map (num_proc=160): 100%|██████████| 9247/9247 [00:01<00:00, 4858.31 examples/s]\n",
+ "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 3421.11 examples/\n",
+ "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 24.57 examples/s]\n",
+ ">> Preparing dataset - index: 8 - name: winogrande-l-choices\n",
+ "Map (num_proc=160): 100%|███████| 10234/10234 [00:01<00:00, 10099.86 examples/s]\n",
+ "Filter (num_proc=160): 100%|████| 10234/10234 [00:00<00:00, 10713.05 examples/s]\n",
+ "Map (num_proc=160): 100%|████████| 10233/10233 [00:02<00:00, 4447.36 examples/s]\n",
+ "Map (num_proc=160): 100%|████████| 10233/10233 [00:01<00:00, 5208.21 examples/s]\n",
+ "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 3172.52 examples/\n",
+ "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 38.39 examples/s]\n",
+ ">> Preparing dataset - index: 9 - name: arc_easy-answer-choice\n",
+ "Map (num_proc=160): 100%|██████████| 2251/2251 [00:01<00:00, 2023.89 examples/s]\n",
+ "Filter (num_proc=160): 100%|███████| 2251/2251 [00:00<00:00, 2316.50 examples/s]\n",
+ "Map (num_proc=160): 100%|███████████| 2250/2250 [00:02<00:00, 997.01 examples/s]\n",
+ "Map (num_proc=160): 100%|██████████| 2250/2250 [00:01<00:00, 1159.56 examples/s]\n",
+ "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 4271.29 examples/\n",
+ "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 45.69 examples/s]\n",
+ ">> Preparing dataset - index: 10 - name: arc_challenge-answer-choice\n",
+ "Map (num_proc=160): 100%|██████████| 1119/1119 [00:01<00:00, 1016.30 examples/s]\n",
+ "Filter (num_proc=160): 100%|███████| 1119/1119 [00:01<00:00, 1080.18 examples/s]\n",
+ "Map (num_proc=160): 100%|███████████| 1118/1118 [00:02<00:00, 458.11 examples/s]\n",
+ "Map (num_proc=160): 100%|███████████| 1118/1118 [00:01<00:00, 602.26 examples/s]\n",
+ "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 4528.08 examples/\n",
+ "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 44.31 examples/s]\n",
+ ">> Preparing dataset - index: 11 - name: piqa-choices\n",
+ "Map (num_proc=160): 100%|███████| 16113/16113 [00:01<00:00, 14850.29 examples/s]\n",
+ "Filter (num_proc=160): 100%|████| 16113/16113 [00:01<00:00, 15600.36 examples/s]\n",
+ "Map (num_proc=160): 100%|████████| 16112/16112 [00:02<00:00, 7100.73 examples/s]\n",
+ "Map (num_proc=160): 100%|████████| 16112/16112 [00:01<00:00, 8771.49 examples/s]\n",
+ "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 884.08 examples/s\n",
+ "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 42.64 examples/s]\n",
+ ">> -----------------------------------\n",
+ ">> Dataset Mixing mode: shuffle\n",
+ ">> Saving dataset to data_path : /datapath/eval-retune/pack-no-mask-32k/\n",
+ "Saving the dataset (10/10 shards): 100%|█| 24528/24528 [00:19<00:00, 1269.30 exa\n",
+ "Saving the dataset (1/1 shards): 100%|███| 12/12 [00:00<00:00, 77.37 examples/s]\n",
+ ">> Dataset saved to data_path\n",
+ ">> -----------------------------------\n",
+ ">> Performing dataset counting\n",
+ ">> -----------------------------------\n",
+ ">> Final dataset count ( train ) : 24,528 samples/chunks/packs\n",
+ ">> Final dataset count ( test ) : 12 samples\n",
+ ">> -----------------------------------\n",
+ "Map (num_proc=160): 100%|█████████| 24528/24528 [00:31<00:00, 767.98 examples/s]\n",
+ "num_proc must be <= 12. Reducing num_proc to 12 for dataset of size 12.\n",
+ "Map (num_proc=12): 100%|█████████████████| 12/12 [00:03<00:00, 3.63 examples/s]\n",
+ ">> -----------------------------------\n",
+ ">> Final 'train' dataset token count ...\n",
+ ">> - Total tokens : 757,250,147\n",
+ ">> - Valid tokens : 749,800,541\n",
+ ">> - Hidden tokens : 7,449,606\n",
+ ">> -----------------------------------\n",
+ ">> Final 'test' dataset token count ...\n",
+ ">> - Total tokens : 66,317\n",
+ ">> - Valid tokens : 65,995\n",
+ ">> - Hidden tokens : 322\n",
+ ">> -----------------------------------\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Lets build the giant datapack\n",
+ "!cd \"{TRAINER_DIR}\" && python3 datapack_build.py \"{NOTEBOOK_DIR}/retune-data-build-no-mask-32k.yaml\""
+ ]
+ },
  {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
  "# Lets build the giant datapack\n",
- "!cd \"{TRAINER_DIR}\" && python3 datapack_build.py \"{NOTEBOOK_DIR}/retune-data-build-no-mask-32k.yaml\""
+ "!cd \"{TRAINER_DIR}\" && python3 datapack_build.py \"{NOTEBOOK_DIR}/retune-data-build-no-mask-no-text.yaml\""
  ]
  }
  ],
@@ -239,18 +369,6 @@
  "display_name": "Python 3 (ipykernel)",
  "language": "python",
  "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.12"
  }
  },
  "nbformat": 4,