Skip to content

Commit

Permalink
Update
Browse files Browse the repository at this point in the history
  • Loading branch information
TajaKuzman committed Feb 23, 2023
1 parent 1a6331a commit 1ebb333
Show file tree
Hide file tree
Showing 34 changed files with 55,706 additions and 657 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -69,4 +69,7 @@ results/TR/ParlaMint-TR-extracted-sample.csv
.gitignore
results/IS/ParlaMint-IS-translated-tokenized.csv
.gitignore
results/
results/
slobench_slen.en.txt
slobench_slen.sl/slobench_slen.sl.txt
Testing-MT-models/SloBENCH
108 changes: 52 additions & 56 deletions 2-choose_MT_model.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,21 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"env: CUDA_DEVICE_ORDER=PCI_BUS_ID\n",
"env: CUDA_VISIBLE_DEVICES=1\n"
"env: CUDA_VISIBLE_DEVICES=5\n"
]
}
],
"source": [
"%env CUDA_DEVICE_ORDER=PCI_BUS_ID\n",
"%env CUDA_VISIBLE_DEVICES=1"
"%env CUDA_VISIBLE_DEVICES=5"
]
},
{
Expand All @@ -30,7 +30,7 @@
"import os\n",
"\n",
"# Define the language code, used in the file names\n",
"lang_code = \"HU\"\n",
"lang_code = \"SE\"\n",
"\n",
"# Main path\n",
"main_path = \"/home/tajak/Parlamint-translation\"\n",
Expand Down Expand Up @@ -105,7 +105,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 9,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -386,7 +386,7 @@
"\tfrom easynmt import EasyNMT\n",
"\tfrom IPython.display import display\n",
"\t\n",
"\tlang_models_dict = {\"BG\": [\"bg\", \"sla\", \"zls\"], \"HR\": [\"zls\", \"sla\"], \"CZ\": [\"cs\", \"sla\", \"zlw\" ], \"DK\": [\"da\", \"gmq\", \"gem\"], \"NL\": [\"nl\", \"gem\", \"gmw\"], \"FR\": [\"fr\", \"itc\",\"roa\"], \"HU\": [\"hu\", \"fiu\", \"urj\"], \"IS\": [\"is\",\"gmq\", \"gem\"], \"IT\": [\"it\", \"roa\", \"itc\"], \"LV\": [\"lv\",\"bat\"], \"LT\": [\"bat\"], \"PL\": [\"pl\", \"sla\", \"zlw\"], \"SI\": [\"sla\"], \"ES\": [\"es\", \"roa\", \"itc\"], \"TR\": [\"tr\", \"trk\" ], \"AT\": [\"de\", \"gem\", \"gmw\"], \"ES-PV\": [\"eu\", \"mul\"], \"BA\": [\"sla\", \"zls\"], \"ES-CT\": [\"ca\", \"roa\", \"itc\"], \"EE\": [\"et\", \"urj\", \"fiu\"], \"FI\": [\"fi\", \"urj\", \"fiu\"], \"ES-GA\": [\"gl\", \"roa\", \"itc\"], \"GR\": [\"el\",\"grk\"], \"NO\": [\"gem\", \"gmq\"], \"PT\": [\"roa\", \"itc\"], \"RO\":[\"roa\", \"itc\"], \"RS\": [\"zls\", \"sla\"], \"SE\": [\"sv\", \"gmq\", \"gem\"], \"UA\":[\"uk\", \"sla\", \"zle\"], \"BE\": [\"nl\", \"gem\", \"gmw\"]}\n",
"\tlang_models_dict = {\"BG\": [\"bg\", \"sla\", \"zls\"], \"HR\": [\"zls\", \"sla\"], \"CZ\": [\"cs\", \"sla\", \"zlw\" ], \"DK\": [\"da\", \"gmq\", \"gem\"], \"NL\": [\"nl\", \"gem\", \"gmw\"], \"FR\": [\"fr\", \"itc\",\"roa\"], \"HU\": [\"hu\", \"fiu\", \"urj\"], \"IS\": [\"is\",\"gmq\", \"gem\"], \"IT\": [\"it\", \"roa\", \"itc\"], \"LV\": [\"lv\",\"bat\"], \"LT\": [\"bat\"], \"PL\": [\"pl\", \"sla\", \"zlw\"], \"SI\": [\"sla\"], \"ES\": [\"es\", \"roa\", \"itc\"], \"TR\": [\"tr\", \"trk\" ], \"AT\": [\"de\", \"gem\", \"gmw\"], \"ES-PV\": [\"eu\", \"mul\"], \"BA\": [\"sla\", \"zls\"], \"ES-CT\": [\"ca\", \"roa\", \"itc\"], \"EE\": [\"et\", \"urj\", \"fiu\"], \"FI\": [\"fi\", \"urj\", \"fiu\"], \"ES-GA\": [\"gl\", \"roa\", \"itc\"], \"GR\": [\"grk\"], \"NO\": [\"gem\", \"gmq\"], \"PT\": [\"roa\", \"itc\"], \"RO\":[\"roa\", \"itc\"], \"RS\": [\"zls\", \"sla\"], \"SE\": [\"sv\", \"gmq\", \"gem\"], \"UA\":[\"uk\", \"sla\", \"zle\"], \"BE\": [\"nl\", \"gem\", \"gmw\"]}\n",
"\n",
"\n",
"\t# Open the file, created in the previous step\n",
Expand Down Expand Up @@ -438,6 +438,16 @@
"\t\t\tnew_translation_list.append(translation)\n",
"\t\t\n",
"\t\tdf[\"translation-narrativa\"] = new_translation_list\n",
"\t\n",
"\t# For Greek, let's try another model, that does not work with EasyNMT, but is on HF\n",
"\tif lang_code == \"GR\":\n",
"\t\tfrom transformers import pipeline\n",
"\t\tnew_translation_list = []\n",
"\t\tpipe = pipeline(\"translation\", model=\"Helsinki-NLP/opus-mt-tc-big-el-en\")\n",
"\t\tfor sentence in sentence_list:\n",
"\t\t\ttranslation = pipe(sentence)\n",
"\t\t\tnew_translation_list.append(translation[0][\"translation_text\"])\n",
"\t\tdf[\"translation-tc-big-el-en\"] = new_translation_list\n",
"\n",
"\t# Save the df\n",
"\tdf.to_csv(\"/home/tajak/Parlamint-translation/results/{}/ParlaMint-{}-sample-model-comparison.csv\".format(lang_code, lang_code))\n",
Expand All @@ -456,39 +466,25 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Entire corpus has 3606 sentences and 59823 words.\n",
"Sample files has 30 sentences and 330 words.\n"
"Entire corpus has 1795381 sentences and 29006165 words.\n",
"Sample files has 30 sentences and 452 words.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Downloading (…)olve/main/source.spm: 100%|██████████| 850k/850k [00:00<00:00, 957kB/s]\n",
"Downloading (…)olve/main/target.spm: 100%|██████████| 792k/792k [00:00<00:00, 1.18MB/s]\n",
"Downloading (…)olve/main/vocab.json: 100%|██████████| 1.57M/1.57M [00:00<00:00, 1.75MB/s]\n",
"Downloading (…)okenizer_config.json: 100%|██████████| 42.0/42.0 [00:00<00:00, 3.92kB/s]\n",
"Downloading (…)lve/main/config.json: 100%|██████████| 1.38k/1.38k [00:00<00:00, 614kB/s]\n",
"Downloading (…)\"pytorch_model.bin\";: 100%|██████████| 307M/307M [00:10<00:00, 29.2MB/s] \n",
"Downloading (…)neration_config.json: 100%|██████████| 293/293 [00:00<00:00, 68.6kB/s]\n",
"Downloading (…)olve/main/source.spm: 100%|██████████| 815k/815k [00:00<00:00, 1.08MB/s]\n",
"Downloading (…)olve/main/target.spm: 100%|██████████| 790k/790k [00:00<00:00, 1.23MB/s]\n",
"Downloading (…)olve/main/vocab.json: 100%|██████████| 1.29M/1.29M [00:00<00:00, 1.71MB/s]\n",
"Downloading (…)okenizer_config.json: 100%|██████████| 42.0/42.0 [00:00<00:00, 17.9kB/s]\n",
"Downloading (…)lve/main/config.json: 100%|██████████| 1.38k/1.38k [00:00<00:00, 366kB/s]\n",
"Downloading (…)\"pytorch_model.bin\";: 100%|██████████| 295M/295M [00:03<00:00, 82.6MB/s] \n",
"Downloading (…)neration_config.json: 100%|██████████| 293/293 [00:00<00:00, 91.7kB/s]\n",
"/home/tajak/Parlamint-translation/parlamint_env/lib/python3.8/site-packages/transformers/generation/utils.py:1273: UserWarning: Neither `max_length` nor `max_new_tokens` has been set, `max_length` will default to 512 (`generation_config.max_length`). Controlling `max_length` via the config is deprecated and `max_length` will be removed from the config in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length of the generation.\n",
" warnings.warn(\n",
"Downloading (…)olve/main/source.spm: 100%|██████████| 828k/828k [00:00<00:00, 1.23MB/s]\n",
"Downloading (…)olve/main/target.spm: 100%|██████████| 793k/793k [00:00<00:00, 1.02MB/s]\n",
"Downloading (…)olve/main/vocab.json: 100%|██████████| 1.48M/1.48M [00:00<00:00, 1.67MB/s]\n",
"Downloading (…)okenizer_config.json: 100%|██████████| 44.0/44.0 [00:00<00:00, 15.2kB/s]\n",
"Downloading (…)lve/main/config.json: 100%|██████████| 1.40k/1.40k [00:00<00:00, 429kB/s]\n",
"Downloading (…)\"pytorch_model.bin\";: 100%|██████████| 300M/300M [00:10<00:00, 29.4MB/s] \n",
"Downloading (…)neration_config.json: 100%|██████████| 293/293 [00:00<00:00, 51.0kB/s]\n",
"/home/tajak/Parlamint-translation/parlamint_env/lib/python3.8/site-packages/transformers/generation/utils.py:1273: UserWarning: Neither `max_length` nor `max_new_tokens` has been set, `max_length` will default to 512 (`generation_config.max_length`). Controlling `max_length` via the config is deprecated and `max_length` will be removed from the config in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length of the generation.\n",
" warnings.warn(\n",
"Downloading (…)olve/main/source.spm: 100%|██████████| 828k/828k [00:00<00:00, 1.09MB/s]\n",
"Downloading (…)olve/main/target.spm: 100%|██████████| 793k/793k [00:00<00:00, 1.03MB/s]\n",
"Downloading (…)olve/main/vocab.json: 100%|██████████| 1.48M/1.48M [00:00<00:00, 1.93MB/s]\n",
"Downloading (…)okenizer_config.json: 100%|██████████| 44.0/44.0 [00:00<00:00, 11.0kB/s]\n",
"Downloading (…)lve/main/config.json: 100%|██████████| 1.40k/1.40k [00:00<00:00, 150kB/s]\n",
"Downloading (…)\"pytorch_model.bin\";: 100%|██████████| 300M/300M [00:13<00:00, 21.6MB/s] \n",
"Downloading (…)neration_config.json: 100%|██████████| 293/293 [00:00<00:00, 75.6kB/s]\n",
"/home/tajak/Parlamint-translation/parlamint_env/lib/python3.8/site-packages/transformers/generation/utils.py:1273: UserWarning: Neither `max_length` nor `max_new_tokens` has been set, `max_length` will default to 512 (`generation_config.max_length`). Controlling `max_length` via the config is deprecated and `max_length` will be removed from the config in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length of the generation.\n",
" warnings.warn(\n"
]
Expand All @@ -497,7 +493,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"The file is saved as/home/tajak/Parlamint-translation/results/HU/ParlaMint-HU-sample-model-comparison.csv. \n"
"The file is saved as/home/tajak/Parlamint-translation/results/SE/ParlaMint-SE-sample-model-comparison.csv. \n"
]
}
],
Expand Down Expand Up @@ -533,52 +529,52 @@
" <th></th>\n",
" <th>file_path</th>\n",
" <th>text</th>\n",
" <th>translation-hu</th>\n",
" <th>translation-fiu</th>\n",
" <th>translation-urj</th>\n",
" <th>translation-sv</th>\n",
" <th>translation-gmq</th>\n",
" <th>translation-gem</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ParlaMint-HU.conllu/2017/ParlaMint-HU_2017-02-...</td>\n",
" <td>Tisztelt Képviselőtársaim!</td>\n",
" <td>Ladies and gentlemen,</td>\n",
" <td>Ladies and gentlemen.</td>\n",
" <td>Ladies and gentlemen, my dear fellow Members,</td>\n",
" <td>/home/tajak/Parlamint-translation/Source-data/...</td>\n",
" <td>Herr talman!</td>\n",
" <td>Mr President, I would like to thank the rappor...</td>\n",
" <td>Mr President, I should like to thank the Presi...</td>\n",
" <td>Mr President, I should like to thank you very ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>ParlaMint-HU.conllu/2017/ParlaMint-HU_2017-02-...</td>\n",
" <td>Az Országgyűlés tavaszi ülésszakának 1. ülésna...</td>\n",
" <td>The 1st day of the Spring Session of the Parli...</td>\n",
" <td>I will open the 1st session of the spring sess...</td>\n",
" <td>I will open the 1st session of the Parliamenta...</td>\n",
" <td>/home/tajak/Parlamint-translation/Source-data/...</td>\n",
" <td>I dag är första dagen vi träffas i riksdagens ...</td>\n",
" <td>Today is the first day we meet in Parliament a...</td>\n",
" <td>Today is the first day we meet in Parliament a...</td>\n",
" <td>Today is the first day we meet in Parliament's...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" file_path \\\n",
"0 ParlaMint-HU.conllu/2017/ParlaMint-HU_2017-02-... \n",
"1 ParlaMint-HU.conllu/2017/ParlaMint-HU_2017-02-... \n",
"0 /home/tajak/Parlamint-translation/Source-data/... \n",
"1 /home/tajak/Parlamint-translation/Source-data/... \n",
"\n",
" text \\\n",
"0 Tisztelt Képviselőtársaim! \n",
"1 Az Országgyűlés tavaszi ülésszakának 1. ülésna... \n",
"0 Herr talman! \n",
"1 I dag är första dagen vi träffas i riksdagens ... \n",
"\n",
" translation-hu \\\n",
"0 Ladies and gentlemen, \n",
"1 The 1st day of the Spring Session of the Parli... \n",
" translation-sv \\\n",
"0 Mr President, I would like to thank the rappor... \n",
"1 Today is the first day we meet in Parliament a... \n",
"\n",
" translation-fiu \\\n",
"0 Ladies and gentlemen. \n",
"1 I will open the 1st session of the spring sess... \n",
" translation-gmq \\\n",
"0 Mr President, I should like to thank the Presi... \n",
"1 Today is the first day we meet in Parliament a... \n",
"\n",
" translation-urj \n",
"0 Ladies and gentlemen, my dear fellow Members, \n",
"1 I will open the 1st session of the Parliamenta... "
" translation-gem \n",
"0 Mr President, I should like to thank you very ... \n",
"1 Today is the first day we meet in Parliament's... "
]
},
"execution_count": 12,
Expand Down
7 changes: 4 additions & 3 deletions 5-create-conllu.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,10 @@ def create_conllu(file, lang_code, main_path, final_dataframe, nlp):
word_conllu_index = word["id"]

# Check whether the word conllu index (word id) is in the substituted_words_list (it is if it was substituted)
# If it is, add information on the original translated word
if substituted_words_list[sentence_index].get(word_conllu_index, None) != None:
word["misc"]["Translated"] = substituted_words_list[sentence_index][word_conllu_index]
# If it is, add information on the original translated word - this is skipped for the BG, PT, IT and AT corpora (the language codes excluded in the check below)
if lang_code not in ["BG", "PT", "IT", "AT"]:
if substituted_words_list[sentence_index].get(word_conllu_index, None) != None:
word["misc"]["Translated"] = substituted_words_list[sentence_index][word_conllu_index]

# Do the same for the forward and backward alignment
if fwd_align_list[sentence_index].get(word_conllu_index, None) != None:
Expand Down
Loading

0 comments on commit 1ebb333

Please sign in to comment.