Update

TajaKuzman · Feb 23, 2023 · 1ebb333 · 1ebb333
1 parent 1a6331a
commit 1ebb333
Show file tree

Hide file tree

Showing 34 changed files with 55,706 additions and 657 deletions.
diff --git a/.gitignore b/.gitignore
@@ -69,4 +69,7 @@ results/TR/ParlaMint-TR-extracted-sample.csv
 .gitignore
 results/IS/ParlaMint-IS-translated-tokenized.csv
 .gitignore
-results/
+results/
+slobench_slen.en.txt
+slobench_slen.sl/slobench_slen.sl.txt
+Testing-MT-models/SloBENCH
diff --git a/2-choose_MT_model.ipynb b/2-choose_MT_model.ipynb
@@ -2,21 +2,21 @@
  "cells": [
  {
  "cell_type": "code",
- "execution_count": 1,
+ "execution_count": 7,
  "metadata": {},
  "outputs": [
  {
  "name": "stdout",
  "output_type": "stream",
  "text": [
  "env: CUDA_DEVICE_ORDER=PCI_BUS_ID\n",
- "env: CUDA_VISIBLE_DEVICES=1\n"
+ "env: CUDA_VISIBLE_DEVICES=5\n"
  ]
  }
  ],
  "source": [
  "%env CUDA_DEVICE_ORDER=PCI_BUS_ID\n",
- "%env CUDA_VISIBLE_DEVICES=1"
+ "%env CUDA_VISIBLE_DEVICES=5"
  ]
  },
  {
@@ -30,7 +30,7 @@
  "import os\n",
  "\n",
  "# Define the language code, used in the file names\n",
- "lang_code = \"HU\"\n",
+ "lang_code = \"SE\"\n",
  "\n",
  "# Main path\n",
  "main_path = \"/home/tajak/Parlamint-translation\"\n",
@@ -105,7 +105,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 25,
+ "execution_count": 9,
  "metadata": {},
  "outputs": [
  {
@@ -386,7 +386,7 @@
  "\tfrom easynmt import EasyNMT\n",
  "\tfrom IPython.display import display\n",
  "\t\n",
- "\tlang_models_dict = {\"BG\": [\"bg\", \"sla\", \"zls\"], \"HR\": [\"zls\", \"sla\"], \"CZ\": [\"cs\", \"sla\", \"zlw\" ], \"DK\": [\"da\", \"gmq\", \"gem\"], \"NL\": [\"nl\", \"gem\", \"gmw\"], \"FR\": [\"fr\", \"itc\",\"roa\"], \"HU\": [\"hu\", \"fiu\", \"urj\"], \"IS\": [\"is\",\"gmq\", \"gem\"], \"IT\": [\"it\", \"roa\", \"itc\"], \"LV\": [\"lv\",\"bat\"], \"LT\": [\"bat\"], \"PL\": [\"pl\", \"sla\", \"zlw\"], \"SI\": [\"sla\"], \"ES\": [\"es\", \"roa\", \"itc\"], \"TR\": [\"tr\", \"trk\" ], \"AT\": [\"de\", \"gem\", \"gmw\"], \"ES-PV\": [\"eu\", \"mul\"], \"BA\": [\"sla\", \"zls\"], \"ES-CT\": [\"ca\", \"roa\", \"itc\"], \"EE\": [\"et\", \"urj\", \"fiu\"], \"FI\": [\"fi\", \"urj\", \"fiu\"], \"ES-GA\": [\"gl\", \"roa\", \"itc\"], \"GR\": [\"el\",\"grk\"], \"NO\": [\"gem\", \"gmq\"], \"PT\": [\"roa\", \"itc\"], \"RO\":[\"roa\", \"itc\"], \"RS\": [\"zls\", \"sla\"], \"SE\": [\"sv\", \"gmq\", \"gem\"], \"UA\":[\"uk\", \"sla\", \"zle\"], \"BE\": [\"nl\", \"gem\", \"gmw\"]}\n",
+ "\tlang_models_dict = {\"BG\": [\"bg\", \"sla\", \"zls\"], \"HR\": [\"zls\", \"sla\"], \"CZ\": [\"cs\", \"sla\", \"zlw\" ], \"DK\": [\"da\", \"gmq\", \"gem\"], \"NL\": [\"nl\", \"gem\", \"gmw\"], \"FR\": [\"fr\", \"itc\",\"roa\"], \"HU\": [\"hu\", \"fiu\", \"urj\"], \"IS\": [\"is\",\"gmq\", \"gem\"], \"IT\": [\"it\", \"roa\", \"itc\"], \"LV\": [\"lv\",\"bat\"], \"LT\": [\"bat\"], \"PL\": [\"pl\", \"sla\", \"zlw\"], \"SI\": [\"sla\"], \"ES\": [\"es\", \"roa\", \"itc\"], \"TR\": [\"tr\", \"trk\" ], \"AT\": [\"de\", \"gem\", \"gmw\"], \"ES-PV\": [\"eu\", \"mul\"], \"BA\": [\"sla\", \"zls\"], \"ES-CT\": [\"ca\", \"roa\", \"itc\"], \"EE\": [\"et\", \"urj\", \"fiu\"], \"FI\": [\"fi\", \"urj\", \"fiu\"], \"ES-GA\": [\"gl\", \"roa\", \"itc\"], \"GR\": [\"grk\"], \"NO\": [\"gem\", \"gmq\"], \"PT\": [\"roa\", \"itc\"], \"RO\":[\"roa\", \"itc\"], \"RS\": [\"zls\", \"sla\"], \"SE\": [\"sv\", \"gmq\", \"gem\"], \"UA\":[\"uk\", \"sla\", \"zle\"], \"BE\": [\"nl\", \"gem\", \"gmw\"]}\n",
  "\n",
  "\n",
  "\t# Open the file, created in the previous step\n",
@@ -438,6 +438,16 @@
  "\t\t\tnew_translation_list.append(translation)\n",
  "\t\t\n",
  "\t\tdf[\"translation-narrativa\"] = new_translation_list\n",
+ "\t\n",
+ "\t# For Greek, also try another model that does not work with EasyNMT but is available on Hugging Face (HF)\n",
+ "\tif lang_code == \"GR\":\n",
+ "\t\tfrom transformers import pipeline\n",
+ "\t\tnew_translation_list = []\n",
+ "\t\tpipe = pipeline(\"translation\", model=\"Helsinki-NLP/opus-mt-tc-big-el-en\")\n",
+ "\t\tfor sentence in sentence_list:\n",
+ "\t\t\ttranslation = pipe(sentence)\n",
+ "\t\t\tnew_translation_list.append(translation[0][\"translation_text\"])\n",
+ "\t\tdf[\"translation-tc-big-el-en\"] = new_translation_list\n",
  "\n",
  "\t# Save the df\n",
  "\tdf.to_csv(\"/home/tajak/Parlamint-translation/results/{}/ParlaMint-{}-sample-model-comparison.csv\".format(lang_code, lang_code))\n",
@@ -456,39 +466,25 @@
  "name": "stdout",
  "output_type": "stream",
  "text": [
- "Entire corpus has 3606 sentences and 59823 words.\n",
- "Sample files has 30 sentences and 330 words.\n"
+ "Entire corpus has 1795381 sentences and 29006165 words.\n",
+ "Sample files has 30 sentences and 452 words.\n"
  ]
  },
  {
  "name": "stderr",
  "output_type": "stream",
  "text": [
- "Downloading (…)olve/main/source.spm: 100%|██████████| 850k/850k [00:00<00:00, 957kB/s]\n",
- "Downloading (…)olve/main/target.spm: 100%|██████████| 792k/792k [00:00<00:00, 1.18MB/s]\n",
- "Downloading (…)olve/main/vocab.json: 100%|██████████| 1.57M/1.57M [00:00<00:00, 1.75MB/s]\n",
- "Downloading (…)okenizer_config.json: 100%|██████████| 42.0/42.0 [00:00<00:00, 3.92kB/s]\n",
- "Downloading (…)lve/main/config.json: 100%|██████████| 1.38k/1.38k [00:00<00:00, 614kB/s]\n",
- "Downloading (…)\"pytorch_model.bin\";: 100%|██████████| 307M/307M [00:10<00:00, 29.2MB/s] \n",
- "Downloading (…)neration_config.json: 100%|██████████| 293/293 [00:00<00:00, 68.6kB/s]\n",
+ "Downloading (…)olve/main/source.spm: 100%|██████████| 815k/815k [00:00<00:00, 1.08MB/s]\n",
+ "Downloading (…)olve/main/target.spm: 100%|██████████| 790k/790k [00:00<00:00, 1.23MB/s]\n",
+ "Downloading (…)olve/main/vocab.json: 100%|██████████| 1.29M/1.29M [00:00<00:00, 1.71MB/s]\n",
+ "Downloading (…)okenizer_config.json: 100%|██████████| 42.0/42.0 [00:00<00:00, 17.9kB/s]\n",
+ "Downloading (…)lve/main/config.json: 100%|██████████| 1.38k/1.38k [00:00<00:00, 366kB/s]\n",
+ "Downloading (…)\"pytorch_model.bin\";: 100%|██████████| 295M/295M [00:03<00:00, 82.6MB/s] \n",
+ "Downloading (…)neration_config.json: 100%|██████████| 293/293 [00:00<00:00, 91.7kB/s]\n",
  "/home/tajak/Parlamint-translation/parlamint_env/lib/python3.8/site-packages/transformers/generation/utils.py:1273: UserWarning: Neither `max_length` nor `max_new_tokens` has been set, `max_length` will default to 512 (`generation_config.max_length`). Controlling `max_length` via the config is deprecated and `max_length` will be removed from the config in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length of the generation.\n",
  " warnings.warn(\n",
- "Downloading (…)olve/main/source.spm: 100%|██████████| 828k/828k [00:00<00:00, 1.23MB/s]\n",
- "Downloading (…)olve/main/target.spm: 100%|██████████| 793k/793k [00:00<00:00, 1.02MB/s]\n",
- "Downloading (…)olve/main/vocab.json: 100%|██████████| 1.48M/1.48M [00:00<00:00, 1.67MB/s]\n",
- "Downloading (…)okenizer_config.json: 100%|██████████| 44.0/44.0 [00:00<00:00, 15.2kB/s]\n",
- "Downloading (…)lve/main/config.json: 100%|██████████| 1.40k/1.40k [00:00<00:00, 429kB/s]\n",
- "Downloading (…)\"pytorch_model.bin\";: 100%|██████████| 300M/300M [00:10<00:00, 29.4MB/s] \n",
- "Downloading (…)neration_config.json: 100%|██████████| 293/293 [00:00<00:00, 51.0kB/s]\n",
  "/home/tajak/Parlamint-translation/parlamint_env/lib/python3.8/site-packages/transformers/generation/utils.py:1273: UserWarning: Neither `max_length` nor `max_new_tokens` has been set, `max_length` will default to 512 (`generation_config.max_length`). Controlling `max_length` via the config is deprecated and `max_length` will be removed from the config in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length of the generation.\n",
  " warnings.warn(\n",
- "Downloading (…)olve/main/source.spm: 100%|██████████| 828k/828k [00:00<00:00, 1.09MB/s]\n",
- "Downloading (…)olve/main/target.spm: 100%|██████████| 793k/793k [00:00<00:00, 1.03MB/s]\n",
- "Downloading (…)olve/main/vocab.json: 100%|██████████| 1.48M/1.48M [00:00<00:00, 1.93MB/s]\n",
- "Downloading (…)okenizer_config.json: 100%|██████████| 44.0/44.0 [00:00<00:00, 11.0kB/s]\n",
- "Downloading (…)lve/main/config.json: 100%|██████████| 1.40k/1.40k [00:00<00:00, 150kB/s]\n",
- "Downloading (…)\"pytorch_model.bin\";: 100%|██████████| 300M/300M [00:13<00:00, 21.6MB/s] \n",
- "Downloading (…)neration_config.json: 100%|██████████| 293/293 [00:00<00:00, 75.6kB/s]\n",
  "/home/tajak/Parlamint-translation/parlamint_env/lib/python3.8/site-packages/transformers/generation/utils.py:1273: UserWarning: Neither `max_length` nor `max_new_tokens` has been set, `max_length` will default to 512 (`generation_config.max_length`). Controlling `max_length` via the config is deprecated and `max_length` will be removed from the config in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length of the generation.\n",
  " warnings.warn(\n"
  ]
@@ -497,7 +493,7 @@
  "name": "stdout",
  "output_type": "stream",
  "text": [
- "The file is saved as/home/tajak/Parlamint-translation/results/HU/ParlaMint-HU-sample-model-comparison.csv. \n"
+ "The file is saved as/home/tajak/Parlamint-translation/results/SE/ParlaMint-SE-sample-model-comparison.csv. \n"
  ]
  }
  ],
@@ -533,52 +529,52 @@
  " <th></th>\n",
  " <th>file_path</th>\n",
  " <th>text</th>\n",
- " <th>translation-hu</th>\n",
- " <th>translation-fiu</th>\n",
- " <th>translation-urj</th>\n",
+ " <th>translation-sv</th>\n",
+ " <th>translation-gmq</th>\n",
+ " <th>translation-gem</th>\n",
  " </tr>\n",
  " </thead>\n",
  " <tbody>\n",
  " <tr>\n",
  " <th>0</th>\n",
- " <td>ParlaMint-HU.conllu/2017/ParlaMint-HU_2017-02-...</td>\n",
- " <td>Tisztelt Képviselőtársaim!</td>\n",
- " <td>Ladies and gentlemen,</td>\n",
- " <td>Ladies and gentlemen.</td>\n",
- " <td>Ladies and gentlemen, my dear fellow Members,</td>\n",
+ " <td>/home/tajak/Parlamint-translation/Source-data/...</td>\n",
+ " <td>Herr talman!</td>\n",
+ " <td>Mr President, I would like to thank the rappor...</td>\n",
+ " <td>Mr President, I should like to thank the Presi...</td>\n",
+ " <td>Mr President, I should like to thank you very ...</td>\n",
  " </tr>\n",
  " <tr>\n",
  " <th>1</th>\n",
- " <td>ParlaMint-HU.conllu/2017/ParlaMint-HU_2017-02-...</td>\n",
- " <td>Az Országgyűlés tavaszi ülésszakának 1. ülésna...</td>\n",
- " <td>The 1st day of the Spring Session of the Parli...</td>\n",
- " <td>I will open the 1st session of the spring sess...</td>\n",
- " <td>I will open the 1st session of the Parliamenta...</td>\n",
+ " <td>/home/tajak/Parlamint-translation/Source-data/...</td>\n",
+ " <td>I dag är första dagen vi träffas i riksdagens ...</td>\n",
+ " <td>Today is the first day we meet in Parliament a...</td>\n",
+ " <td>Today is the first day we meet in Parliament a...</td>\n",
+ " <td>Today is the first day we meet in Parliament's...</td>\n",
  " </tr>\n",
  " </tbody>\n",
  "</table>\n",
  "</div>"
  ],
  "text/plain": [
  " file_path \\\n",
- "0 ParlaMint-HU.conllu/2017/ParlaMint-HU_2017-02-... \n",
- "1 ParlaMint-HU.conllu/2017/ParlaMint-HU_2017-02-... \n",
+ "0 /home/tajak/Parlamint-translation/Source-data/... \n",
+ "1 /home/tajak/Parlamint-translation/Source-data/... \n",
  "\n",
  " text \\\n",
- "0 Tisztelt Képviselőtársaim! \n",
- "1 Az Országgyűlés tavaszi ülésszakának 1. ülésna... \n",
+ "0  Herr talman! \n",
+ "1 I dag är första dagen vi träffas i riksdagens ... \n",
  "\n",
- " translation-hu \\\n",
- "0  Ladies and gentlemen, \n",
- "1 The 1st day of the Spring Session of the Parli... \n",
+ " translation-sv \\\n",
+ "0 Mr President, I would like to thank the rappor... \n",
+ "1 Today is the first day we meet in Parliament a... \n",
  "\n",
- " translation-fiu \\\n",
- "0  Ladies and gentlemen. \n",
- "1 I will open the 1st session of the spring sess... \n",
+ " translation-gmq \\\n",
+ "0 Mr President, I should like to thank the Presi... \n",
+ "1 Today is the first day we meet in Parliament a... \n",
  "\n",
- " translation-urj \n",
- "0  Ladies and gentlemen, my dear fellow Members, \n",
- "1 I will open the 1st session of the Parliamenta... "
+ " translation-gem \n",
+ "0 Mr President, I should like to thank you very ... \n",
+ "1 Today is the first day we meet in Parliament's... "
  ]
  },
  "execution_count": 12,

diff --git a/5-create-conllu.py b/5-create-conllu.py
@@ -103,9 +103,10 @@ def create_conllu(file, lang_code, main_path, final_dataframe, nlp):
  word_conllu_index = word["id"]
 
  # Check whether the word conllu index (word id) is in the substituted_words_list (it is if it was substituted)
- # If it is, add information on the original translated word
- if substituted_words_list[sentence_index].get(word_conllu_index, None) != None:
- word["misc"]["Translated"] = substituted_words_list[sentence_index][word_conllu_index]
+ # If it is, add information on the original translated word - skip this for the BG, PT, IT and AT corpora
+ if lang_code not in ["BG", "PT", "IT", "AT"]:
+ if substituted_words_list[sentence_index].get(word_conllu_index, None) != None:
+ word["misc"]["Translated"] = substituted_words_list[sentence_index][word_conllu_index]
 
  # Do the same for the forward and backward alignment
  if fwd_align_list[sentence_index].get(word_conllu_index, None) != None: