7-note-translation-TEI.ana-GB.py
import argparse
import regex as re
import pandas as pd
from bs4 import BeautifulSoup as bs
import os
from knockknock import discord_sender
from easynmt import EasyNMT
from IPython.display import display
import time
import csv
import numpy as np
# Define the language code, used in the file names
lang_code = "GB"
# Define the path to the Source TEI folder
path = "/home/tajak/Parlamint-translation/Note-translation/Source-data-TEI/ParlaMint-{}.TEI.ana".format(lang_code)
# Define the paths to the final and temporary output files
notes_path = "/home/tajak/Parlamint-translation/Note-translation/Final-data-CSV/ParlaMint-{}.notes.translated.ana.tsv".format(lang_code)
temp_path = "/home/tajak/Parlamint-translation/Note-translation/Final-data-CSV/before-sorting/ParlaMint-{}.notes.translated-before-sorting.ana.csv".format(lang_code)
def extract_tag(tag, df, content):
    if tag in ["note", "head"]:
        # Extract all notes from the file
        note_list = content.find_all(tag)
        note_list_final = []
        for i in note_list:
            current_note = []
            # Use the type attribute if present, otherwise fall back to reason
            if i.has_attr("type"):
                note_type = i.get("type")
            elif i.has_attr("reason"):
                note_type = i.get("reason")
            else:
                note_type = ""
            if i.has_attr("xml:lang"):
                lang = i.get("xml:lang")
            else:
                # If the note does not have a language id, take the language id of its closest parent that has one
                lang = i.find_parent(attrs={"xml:lang": True}).attrs.get("xml:lang")
            current_note.append(note_type)
            current_note.append(i.get_text().strip())
            current_note.append(lang)
            note_list_final.append(current_note)
        new_df = pd.DataFrame({"type": [x[0] for x in note_list_final], "content": [x[1] for x in note_list_final], "xml:lang": [x[2] for x in note_list_final]})
        new_df["tag"] = tag
        # Merge the new df into the previous df
        df = pd.concat([df, new_df])
    else:
        # Extract all other note-like elements from the file; their text is in child <desc> elements
        note_list = content.find_all(tag)
        note_list_final = []
        for i in note_list:
            # Use the type attribute if present, otherwise fall back to reason
            if i.has_attr("type"):
                note_type = i.get("type")
            elif i.has_attr("reason"):
                note_type = i.get("reason")
            else:
                note_type = ""
            desc_list = i.find_all("desc")
            if len(desc_list) == 0:
                print("Error - empty desc_list")
                print(i)
            else:
                for desc in desc_list:
                    current_note = []
                    if desc.has_attr("xml:lang"):
                        lang = desc.get("xml:lang")
                    else:
                        # If the desc does not have a language id, take the language id of the closest parent that has one
                        lang = i.find_parent(attrs={"xml:lang": True}).attrs.get("xml:lang")
                    current_note.append(note_type)
                    current_note.append(desc.get_text().strip())
                    current_note.append(lang)
                    note_list_final.append(current_note)
        new_df = pd.DataFrame({"type": [x[0] for x in note_list_final], "content": [x[1] for x in note_list_final], "xml:lang": [x[2] for x in note_list_final]})
        new_df["tag"] = tag
        # Merge the new df into the previous df
        df = pd.concat([df, new_df])
    return df
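# A minimal usage sketch for extract_tag (illustration only, not part of the pipeline;
# the XML snippet and the variable names below are hypothetical):
# snippet = bs('<div xml:lang="en"><note type="time">14.30</note></div>', "xml")
# demo_df = pd.DataFrame({"tag": [], "type": [], "content": [], "xml:lang": []})
# demo_df = extract_tag("note", demo_df, snippet)
# -> one row: tag="note", type="time", content="14.30", xml:lang="en" (inherited from the parent <div>)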
# Get notified once the code ends
with open("/home/tajak/Parlamint-translation/discord_key.txt", "r") as key_file:
    # Strip the trailing newline so the webhook URL is passed through cleanly
    webhook_url = key_file.read().strip()
@discord_sender(webhook_url=webhook_url)
def translate_notes(path, lang_code, notes_path, temp_path):
    start_time = time.time()
    print("Extraction of the notes and translation started.")
    # Extract a list with paths to the XML files and a list with their names
    parl_list = []
    file_name_list = []
    for dir1 in os.listdir(path):
        full_path = os.path.join(path, dir1)
        if os.path.isdir(full_path):
            current = os.listdir(full_path)
            # Keep only files with parliamentary sessions:
            for file in current:
                if "ParlaMint-{}_".format(lang_code) in file and ".xml" in file:
                    final_path = os.path.join(full_path, file)
                    parl_list.append(final_path)
                    file_name_list.append(file)
    # See how many files we have:
    print("No. of files: {}.".format(len(parl_list)))
    # Create a df with one empty placeholder row (dropped below)
    df = pd.DataFrame({"tag": [""], "type": [""], "content": [""], "xml:lang": [""]})
    # Go through all files in the list and extract the notes from each of them
    for file_path in parl_list:
        with open(file_path, "r") as file:
            # Parse the file with BeautifulSoup
            content = bs(file, "xml")
        # Extract all relevant tags from the file
        for tag in ["note", "gap", "head", "kinesic", "vocal", "incident"]:
            df = extract_tag(tag, df, content)
    # At the end, drop the first (placeholder) row and reset the index
    df = df.reset_index(drop=True)
    df = df.drop([0], axis="index")
    df = df.reset_index(drop=True)
    print("Statistics before dropping duplicates:\n\n\n")
    # Show the results
    print(df.describe(include="all").to_markdown())
    print("\n")
    print(df.head().to_markdown())
    print("\n")
    print("Statistics for tags:\n")
    print(df.tag.value_counts().to_markdown())
    print("\n")
    print(df.groupby("tag").type.value_counts().to_markdown())
    print("Most common notes:\n")
    print(df.content.value_counts()[:20].to_markdown())
    # Remove duplicated rows (exact duplicates - all values in all columns match)
    df = df.drop_duplicates()
    print("Statistics after deduplication:\n")
    # Add information on length
    df["length"] = df["content"].str.split().str.len()
    print("Number of words in the notes: {}\n".format(df["length"].sum()))
    print(df.describe(include="all").to_markdown())
    print("\n")
    print(df.head().to_markdown())
    print("\n")
    print("Statistics for tags:\n")
    print(df.tag.value_counts().to_markdown())
    print("\n")
    print(df.groupby("tag").type.value_counts().to_markdown())
    # Define the MT model
    # model = EasyNMT('opus-mt')
    # Create a list of sentences from the df. We are applying this to the GB corpus,
    # which is already in English, so no translation is needed - we just copy the list.
    sentence_list = df.content.to_list()
    translation_list = sentence_list
    # To translate the list of sentences, provide the source language as it appears in
    # the name of the model - the opus_lang_code:
    # for opus_lang_code in lang_models_dict[lang_code]:
    #     translation_list = model.translate(sentence_list, source_lang="{}".format(opus_lang_code), target_lang='en')
    translation_time = round((time.time() - start_time) / 60, 2)
    print("Extraction completed. The entire extraction process took {} minutes for {} instances.".format(translation_time, len(sentence_list)))
    # Add the translations to the df
    df["translation"] = translation_list
    # Shortening of long MT repetition errors: if the English translation has more than
    # 4x as many words as the original, truncate it to 4x the length of the original (to
    # the nearest word). This is applied only if the translation is longer than 6 words,
    # so that a couple of source-language words that are correctly translated with a
    # longer English sequence are left alone.
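    # A minimal sketch of that shortening rule (an assumption based on the comment
    # above; shorten_translation is a hypothetical helper, and it is not applied here,
    # since for GB the "translations" are identical copies of the English source):
    # def shorten_translation(source, translation, ratio=4, min_words=6):
    #     src_words = source.split()
    #     trg_words = translation.split()
    #     # Truncate only suspiciously long translations
    #     if len(trg_words) > min_words and len(trg_words) > ratio * len(src_words):
    #         trg_words = trg_words[: ratio * len(src_words)]
    #     return " ".join(trg_words)
    # df["translation"] = [shorten_translation(s, t) for s, t in zip(df["content"], df["translation"])]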
    # Add the country code to the df
    df["corpus"] = lang_code
    # Display the df
    print(df.head().to_markdown())
    print("\n\n\n")
    # Save the intermediate df
    df.to_csv(temp_path, sep="\t", index=False)
    # Sort the values
    df = df.sort_values(by=["tag", "type"])
    # Save the final df
    df.to_csv(notes_path, index=False, sep="\t", quoting=csv.QUOTE_NONE, quotechar="", escapechar="\\")
    print("The file is saved as {}".format(notes_path))
    return df

df = translate_notes(path, lang_code, notes_path, temp_path)
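# To read the saved TSV back into pandas (a sketch; it mirrors the parameters used
# for writing above):
# df = pd.read_csv(notes_path, sep="\t", quoting=csv.QUOTE_NONE, escapechar="\\")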