Merge pull request #10 from indobenchmark/develop
Add WReTe example & update README
gentaiscool committed Oct 14, 2020
2 parents a91815f + 0682eb0 commit 247832c
Showing 3 changed files with 1,286 additions and 40 deletions.
README.md (34 changes: 20 additions & 14 deletions)
@@ -36,22 +36,28 @@ We provide the access to our large pretraining dataset. In this version, we excl
- 23 GB Indo4B Dataset [[Link]](https://storage.googleapis.com/babert-pretraining/IndoNLU_finals/dataset/preprocessed/dataset_wot_uncased_blanklines.tar.xz)

## IndoBERT and IndoBERT-lite Models
-- 8 IndoBERT Pretrained Language Model [[Link]](https://huggingface.co/indobenchmark)
-  - IndoBERT-base
-    - Phase 1 [[Link]](https://huggingface.co/indobenchmark/indobert-base-p1)
-    - Phase 2 [[Link]](https://huggingface.co/indobenchmark/indobert-base-p2)
-  - IndoBERT-large
-    - Phase 1 [[Link]](https://huggingface.co/indobenchmark/indobert-large-p1)
-    - Phase 2 [[Link]](https://huggingface.co/indobenchmark/indobert-large-p2)
-  - IndoBERT-lite-base
-    - Phase 1 [[Link]](https://huggingface.co/indobenchmark/indobert-lite-base-p1)
-    - Phase 2 [[Link]](https://huggingface.co/indobenchmark/indobert-lite-base-p2)
-  - IndoBERT-lite-large
-    - Phase 1 [[Link]](https://huggingface.co/indobenchmark/indobert-lite-large-p1)
-    - Phase 2 [[Link]](https://huggingface.co/indobenchmark/indobert-lite-large-p2)
+We provide 4 IndoBERT and 4 IndoBERT-lite Pretrained Language Models [[Link]](https://huggingface.co/indobenchmark)
+- IndoBERT-base
+  - Phase 1 [[Link]](https://huggingface.co/indobenchmark/indobert-base-p1)
+  - Phase 2 [[Link]](https://huggingface.co/indobenchmark/indobert-base-p2)
+- IndoBERT-large
+  - Phase 1 [[Link]](https://huggingface.co/indobenchmark/indobert-large-p1)
+  - Phase 2 [[Link]](https://huggingface.co/indobenchmark/indobert-large-p2)
+- IndoBERT-lite-base
+  - Phase 1 [[Link]](https://huggingface.co/indobenchmark/indobert-lite-base-p1)
+  - Phase 2 [[Link]](https://huggingface.co/indobenchmark/indobert-lite-base-p2)
+- IndoBERT-lite-large
+  - Phase 1 [[Link]](https://huggingface.co/indobenchmark/indobert-lite-large-p1)
+  - Phase 2 [[Link]](https://huggingface.co/indobenchmark/indobert-lite-large-p2)
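
As a quick check of the checkpoints listed above, here is a minimal loading sketch using the Hugging Face `transformers` library. It assumes one of the BERT-style checkpoints; the lite variants load analogously with their matching model classes.

```python
# A minimal sketch (not from this repo): load an IndoBERT checkpoint
# listed above from the Hugging Face hub and embed one sentence.
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
model = BertModel.from_pretrained("indobenchmark/indobert-base-p1")

inputs = tokenizer("aku suka membaca buku", return_tensors="pt")
outputs = model(**inputs)
print(outputs[0].shape)  # (1, seq_len, hidden_size); 768 for the base model
```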

## FastText (Indo4B)
-- Uncased 11.9 GB Model File [[Link]](https://storage.googleapis.com/babert-pretraining/IndoNLU_finals/models/fasttext/fasttext.4B.id.300.epoch5.uncased.bin) 3.9 GB Vector File [[Link]](https://storage.googleapis.com/babert-pretraining/IndoNLU_finals/models/fasttext/fasttext.4B.id.300.epoch5.uncased.vec.zip)
+We provide the full uncased FastText model file (11.9 GB) and the corresponding vector file (3.9 GB)
+- FastText model (11.9 GB) [[Link]](https://storage.googleapis.com/babert-pretraining/IndoNLU_finals/models/fasttext/fasttext.4B.id.300.epoch5.uncased.bin)
+- Vector file (3.9 GB) [[Link]](https://storage.googleapis.com/babert-pretraining/IndoNLU_finals/models/fasttext/fasttext.4B.id.300.epoch5.uncased.vec.zip)
+
+We also provide smaller FastText models with a reduced vocabulary for each of the 12 downstream tasks
+- FastText-Indo4B [[Link]](https://storage.googleapis.com/babert-pretraining/IndoNLU_finals/models/fasttext/fasttext-4B-id-uncased.zip)
+- FastText-CC-ID [[Link]](https://storage.googleapis.com/babert-pretraining/IndoNLU_finals/models/fasttext/fasttext-cc-id.zip)
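
A hedged loading sketch for the FastText files, assuming the official `fasttext` Python package and that the `.bin` file linked above has been downloaded locally:

```python
# A sketch under the assumptions above; the filename comes from the link.
import fasttext

ft = fasttext.load_model("fasttext.4B.id.300.epoch5.uncased.bin")
vec = ft.get_word_vector("indonesia")
print(vec.shape)  # (300,), matching the "id.300" in the filename
```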

## Leaderboard
- Community Portal and Public Leaderboard [[Link]](https://www.indobenchmark.com/leaderboard.html)
examples/finetune_smsa.ipynb (96 changes: 70 additions & 26 deletions)
@@ -94,7 +94,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
"Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
}
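
This warning is expected: the pretrained checkpoint carries no classification head, so `BertForSequenceClassification` initializes `classifier.weight` and `classifier.bias` randomly, which is why the model must be fine-tuned before its predictions are meaningful. A hypothetical sketch of the load that triggers it, where `num_labels=3` assumes the SmSA label set (positive/neutral/negative):

```python
# Hypothetical sketch; the exact notebook cell is collapsed in this diff.
from transformers import BertForSequenceClassification, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
model = BertForSequenceClassification.from_pretrained(
    "indobenchmark/indobert-base-p1",
    num_labels=3,  # assumption: positive / neutral / negative
)
```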
@@ -564,12 +564,12 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Text: Dasar anak sialan!! Haram Jadah!! | Label : negative (46.278%)\n"
"Text: Dasar anak sialan!! Kurang ajar!! | Label : negative (48.687%)\n"
]
}
],
"source": [
"text = 'Dasar anak sialan!! Haram Jadah!!'\n",
"text = 'Dasar anak sialan!! Kurang ajar!!'\n",
"subwords = tokenizer.encode(text)\n",
"subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)\n",
"\n",
@@ -592,7 +592,7 @@
"metadata": {},
"outputs": [],
"source": [
"optimizer = optim.Adam(model.parameters(), lr=1e-6)\n",
"optimizer = optim.Adam(model.parameters(), lr=3e-6)\n",
"model = model.cuda()"
]
},
@@ -605,125 +605,162 @@
"name": "stderr",
"output_type": "stream",
"text": [
"(Epoch 1) TRAIN LOSS:0.5083 LR:0.00000100: 100%|██████████| 344/344 [01:34<00:00, 3.63it/s]\n",
"(Epoch 1) TRAIN LOSS:0.3223 LR:0.00000300: 100%|██████████| 344/344 [01:38<00:00, 3.49it/s]\n",
" 0%| | 0/40 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(Epoch 1) TRAIN LOSS:0.5083 ACC:0.81 F1:0.71 REC:0.68 PRE:0.82 LR:0.00000100\n"
"(Epoch 1) TRAIN LOSS:0.3223 ACC:0.88 F1:0.84 REC:0.81 PRE:0.88 LR:0.00000300\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"VALID LOSS:0.2924 ACC:0.89 F1:0.85 REC:0.84 PRE:0.87: 100%|██████████| 40/40 [00:03<00:00, 10.01it/s]\n",
"VALID LOSS:0.1956 ACC:0.93 F1:0.89 REC:0.89 PRE:0.90: 100%|██████████| 40/40 [00:04<00:00, 9.56it/s]\n",
" 0%| | 0/344 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(Epoch 1) VALID LOSS:0.2924 ACC:0.89 F1:0.85 REC:0.84 PRE:0.87\n"
"(Epoch 1) VALID LOSS:0.1956 ACC:0.93 F1:0.89 REC:0.89 PRE:0.90\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"(Epoch 2) TRAIN LOSS:0.2263 LR:0.00000100: 100%|██████████| 344/344 [01:38<00:00, 3.50it/s]\n",
"(Epoch 2) TRAIN LOSS:0.1542 LR:0.00000300: 100%|██████████| 344/344 [01:42<00:00, 3.37it/s]\n",
" 0%| | 0/40 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(Epoch 2) TRAIN LOSS:0.2263 ACC:0.93 F1:0.90 REC:0.89 PRE:0.91 LR:0.00000100\n"
"(Epoch 2) TRAIN LOSS:0.1542 ACC:0.95 F1:0.93 REC:0.93 PRE:0.94 LR:0.00000300\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"VALID LOSS:0.2112 ACC:0.92 F1:0.89 REC:0.88 PRE:0.90: 100%|██████████| 40/40 [00:04<00:00, 9.46it/s]\n",
"VALID LOSS:0.1732 ACC:0.94 F1:0.91 REC:0.91 PRE:0.92: 100%|██████████| 40/40 [00:04<00:00, 9.59it/s]\n",
" 0%| | 0/344 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(Epoch 2) VALID LOSS:0.2112 ACC:0.92 F1:0.89 REC:0.88 PRE:0.90\n"
"(Epoch 2) VALID LOSS:0.1732 ACC:0.94 F1:0.91 REC:0.91 PRE:0.92\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"(Epoch 3) TRAIN LOSS:0.1747 LR:0.00000100: 100%|██████████| 344/344 [01:38<00:00, 3.50it/s]\n",
"(Epoch 3) TRAIN LOSS:0.1167 LR:0.00000300: 100%|██████████| 344/344 [01:42<00:00, 3.36it/s]\n",
" 0%| | 0/40 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(Epoch 3) TRAIN LOSS:0.1747 ACC:0.94 F1:0.92 REC:0.91 PRE:0.93 LR:0.00000100\n"
"(Epoch 3) TRAIN LOSS:0.1167 ACC:0.96 F1:0.95 REC:0.95 PRE:0.95 LR:0.00000300\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"VALID LOSS:0.1852 ACC:0.93 F1:0.90 REC:0.90 PRE:0.90: 100%|██████████| 40/40 [00:04<00:00, 9.35it/s]\n",
"VALID LOSS:0.1707 ACC:0.93 F1:0.90 REC:0.90 PRE:0.91: 100%|██████████| 40/40 [00:04<00:00, 9.52it/s]\n",
" 0%| | 0/344 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(Epoch 3) VALID LOSS:0.1852 ACC:0.93 F1:0.90 REC:0.90 PRE:0.90\n"
"(Epoch 3) VALID LOSS:0.1707 ACC:0.93 F1:0.90 REC:0.90 PRE:0.91\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"(Epoch 4) TRAIN LOSS:0.1498 LR:0.00000100: 100%|██████████| 344/344 [01:37<00:00, 3.51it/s]\n",
"(Epoch 4) TRAIN LOSS:0.0869 LR:0.00000300: 100%|██████████| 344/344 [01:42<00:00, 3.35it/s]\n",
" 0%| | 0/40 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(Epoch 4) TRAIN LOSS:0.1498 ACC:0.95 F1:0.93 REC:0.93 PRE:0.94 LR:0.00000100\n"
"(Epoch 4) TRAIN LOSS:0.0869 ACC:0.97 F1:0.97 REC:0.96 PRE:0.97 LR:0.00000300\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"VALID LOSS:0.1750 ACC:0.94 F1:0.91 REC:0.90 PRE:0.92: 100%|██████████| 40/40 [00:04<00:00, 9.97it/s]\n"
"VALID LOSS:0.1799 ACC:0.94 F1:0.91 REC:0.90 PRE:0.93: 100%|██████████| 40/40 [00:04<00:00, 9.37it/s]\n",
" 0%| | 0/344 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(Epoch 4) VALID LOSS:0.1799 ACC:0.94 F1:0.91 REC:0.90 PRE:0.93\n"
+]
+},
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"(Epoch 5) TRAIN LOSS:0.0624 LR:0.00000300: 100%|██████████| 344/344 [01:42<00:00, 3.35it/s]\n",
+" 0%| | 0/40 [00:00<?, ?it/s]"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"(Epoch 5) TRAIN LOSS:0.0624 ACC:0.98 F1:0.98 REC:0.97 PRE:0.98 LR:0.00000300\n"
+]
+},
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"VALID LOSS:0.2024 ACC:0.94 F1:0.91 REC:0.91 PRE:0.92: 100%|██████████| 40/40 [00:04<00:00, 9.38it/s]"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
"(Epoch 4) VALID LOSS:0.1750 ACC:0.94 F1:0.91 REC:0.90 PRE:0.92\n"
"(Epoch 5) VALID LOSS:0.2024 ACC:0.94 F1:0.91 REC:0.91 PRE:0.92\n"
]
+},
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"\n"
+]
}
],
"source": [
"# Train\n",
"n_epochs = 4\n",
"n_epochs = 5\n",
"for epoch in range(n_epochs):\n",
" model.train()\n",
" torch.set_grad_enabled(True)\n",
@@ -793,7 +830,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 16/16 [00:01<00:00, 11.78it/s]\n"
"100%|██████████| 16/16 [00:01<00:00, 11.51it/s]"
]
},
{
@@ -815,6 +852,13 @@
"\n",
"[500 rows x 2 columns]\n"
]
+},
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"\n"
+]
}
],
"source": [
@@ -853,7 +897,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Text: Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita | Label : positive (98.573%)\n"
"Text: Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita | Label : positive (99.657%)\n"
]
}
],
@@ -877,7 +921,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Text: Budi pergi ke pondok indah mall membeli cakwe | Label : neutral (97.988%)\n"
"Text: Budi pergi ke pondok indah mall membeli cakwe | Label : neutral (99.752%)\n"
]
}
],
@@ -901,12 +901,12 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Text: Dasar anak sialan!! Haram Jadah!! | Label : negative (98.770%)\n"
"Text: Dasar anak sialan!! Kurang ajar!! | Label : negative (99.816%)\n"
]
}
],
"source": [
"text = 'Dasar anak sialan!! Haram Jadah!!'\n",
"text = 'Dasar anak sialan!! Kurang ajar!!'\n",
"subwords = tokenizer.encode(text)\n",
"subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)\n",
"\n",

