forked from rosettatype/hyperglot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
readers.py
executable file
·686 lines (626 loc) · 23.3 KB
/
readers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
"""
Read language data (codes and characters) from different sources.
When run as a script, convert all data to YAMLs.
"""
import csv
import glob
import logging
import os
import re
import unicodedata
import xml.etree.ElementTree as ET
from collections import OrderedDict
from copy import copy
from custom_yaml import save_yaml
# Keyword looked for in a character's (lowercased) Unicode name, paired
# with the four-letter script tag reported when the keyword is found.
# The insertion order is the order in which the keywords are tried.
SCRIPT_TAGS = OrderedDict((
    ("arabic", "Arab"), ("armenian", "Armn"), ("bengali", "Beng"),
    ("chakma", "Cakm"), ("cherokee", "Cher"), ("cyrillic", "Cyrl"),
    ("devanagari", "Deva"), ("ethiopic", "Ethi"), ("georgian", "Geor"),
    ("greek", "Grek"), ("gujarati", "Gujr"), ("gurmukhi", "Guru"),
    ("hangul", "Hang"), ("hebrew", "Hebr"), ("kannada", "Knda"),
    ("khmer", "Khmr"), ("lao", "Laoo"), ("latin", "Latn"),
    ("malayalam", "Mlym"), ("myanmar", "Mymr"), ("oriya", "Orya"),
    ("sinhala", "Sinh"), ("tamil", "Taml"), ("telugu", "Telu"),
    ("thai", "Thai"), ("tibetan", "Tibt"), ("tifinagh", "Tfng"),
    ("vai", "Vaii"), ("yi", "Yiii"),
))
def guess_script(code, chars):
    """
    Guess the script based on characters’ Unicode description.
    Should be done more rigorously, but for the purpose of
    completing the info from CLDR, it seems sufficient.

    code: language code, compared verbatim against a few known codes
    chars: string of characters (spaces allowed)

    Returns the tag of the first SCRIPT_TAGS keyword found in the
    Unicode name of a letter in ``chars``, or "Zyyy" (undetermined)
    when nothing matches.
    """
    # derive from language
    if code == "ja":  # Japanese
        return "Jpan"
    if code in ("yue", "zh"):  # Yue Chinese, Chinese
        return "Hani"

    # derive from character descriptions
    if chars.strip():
        for c in chars:
            # only letters are considered
            if unicodedata.category(c)[0] != "L":
                continue
            # Some letters have no name in the Unicode database; asking
            # for a default avoids the ValueError and lets us skip them.
            name = unicodedata.name(c, "").lower()
            if not name:
                continue
            for keyword, tag in SCRIPT_TAGS.items():
                if keyword in name:
                    return tag

    # return undetermined script otherwise
    return "Zyyy"
def parse_unicode_set(s):
    """
    Parse UnicodeSet record used in Unicode CLDR.

    Needs more work, but should be fine to use for most scripts
    except Chinese, Japanese, and other that use ranges.
    Processes escape sequences, assumes XML entities have been
    dealt with when reading the XML. Removes enclosing [].

    s: the UnicodeSet text; None or an empty string yields ("", "")

    Returns a tuple of two strings: the unique characters sorted and
    separated by spaces, and the character combinations (originally
    enclosed in {}) separated by spaces. Combinations are not sorted.
    """
    if not s:
        # e.g. the text of an empty XML element
        return "", ""

    def decode_codepoint(match):
        return chr(int(match.group(1), 16))

    # applied in this order; the escaped backslash has to come last
    known_escapes = (
        ("\\\"", "\""),
        ("\\[", "["),
        ("\\]", "]"),
        ("\\{", "{"),
        ("\\}", "}"),
        ("\\&", "&"),
        ("\\:", ":"),
        ("\\-", "-"),
        ("\\\\", "\\"),
    )

    # strip brackets
    s = s.lstrip("[").rstrip("]")

    # unicode codepoint escapes (hex digits only, anything else is kept
    # verbatim instead of failing in int())
    s = re.sub(r"\\U([0-9A-Fa-f]{8})", decode_codepoint, s)
    s = re.sub(r"\\u([0-9A-Fa-f]{4})", decode_codepoint, s)
    s = re.sub(r"\\x([0-9A-Fa-f]{2})", decode_codepoint, s)

    # ranges
    expanded = []
    last = len(s) - 1
    for i, c in enumerate(s):
        # A dash at either end has no two neighbours and cannot be a
        # range operator, so it is kept as a literal character.
        if c == "-" and 0 < i < last:
            prv = s[i - 1]
            nxt = s[i + 1]
            if prv not in " \\" and nxt != " ":
                # The border characters are emitted by their own loop
                # iterations; add only the characters in between.
                expanded.extend(chr(x) for x in range(ord(prv) + 1, ord(nxt)))
                continue
        expanded.append(c)
    s = "".join(expanded)

    # escapes
    for escaped, plain in known_escapes:
        s = s.replace(escaped, plain)

    # braces (get a list of required combinations)
    combinations = re.findall(r"\{[^\}]+\}", s)

    # drop braces and spaces, remove duplicates and sort
    chars = sorted({c for c in s if c not in "{ }"})

    return " ".join(chars), " ".join(combinations)
def to_lc(mix, script="Latn", append=True):
    """
    Reduce a string of glyphs to its lowercase inventory.

    Spaces are ignored. Uppercase characters are replaced by their
    lowercase form (and dropped if that form is already present in
    ``mix``); "İ" and characters without case are kept as they are.
    With ``append`` set, the basic alphabet of the "Latn" or "Cyrl"
    script is added. Returns the sorted characters joined by spaces.
    """
    basic_sets = {
        "Latn": "abcdefghijklmnopqrstuvwxyz",
        "Cyrl": "абвгдежзийклмнопрстуфхцчшщъыьэюя",
    }

    collected = set()
    for char in mix.replace(" ", ""):
        # Idotaccent is uppercase but must survive unchanged
        if char.isupper() and char != "İ":
            lowered = char.lower()
            if lowered not in mix:
                collected.add(lowered)
        else:
            collected.add(char)

    # add basic characters
    if append:
        collected.update(basic_sets.get(script, ""))

    return " ".join(sorted(collected))
def iso_from_cldr(code, iso_639_3):
    """
    Resolve a CLDR code to its ISO 639-3 code.

    Only the part before the first "-" is used. It is returned as is
    when it is a key of ``iso_639_3``; otherwise the first record whose
    "639-2B", "639-2T" or "639-1" value equals it provides the result.
    Returns None when nothing matches.
    """
    primary = code.split("-")[0]
    if primary in iso_639_3:
        return primary

    alternative_fields = ("639-2B", "639-2T", "639-1")
    for iso, record in iso_639_3.items():
        if any(primary == record[field] for field in alternative_fields):
            return iso
    return None
def read_iso_639_3(path):
    """
    Read .tab file for ISO 639-3 and return a dict
    with the ISO 639-3 3-letter language codes as keys.

    path: tab-separated file with a header row providing the columns
          Id, Part2B, Part2T, Part1, Scope, Language_Type, Ref_Name
          and Comment

    Returns an OrderedDict in file order. Each value holds "names"
    (list), "639-2B", "639-2T", "639-1", "scope", "type" and "comment".
    When an Id occurs more than once, the additional Ref_Name values
    are appended to "names" and the other fields of the first row win.
    """
    d = OrderedDict()
    with open(path, "r", encoding="utf-8") as csvinput:
        for row in csv.DictReader(csvinput, delimiter="\t"):
            code = row["Id"]
            if code in d:
                d[code]["names"].append(row["Ref_Name"])
                continue
            lang = OrderedDict()
            lang["names"] = [row["Ref_Name"]]
            # each key is filled from the column of the same name
            lang["639-2B"] = row["Part2B"]
            lang["639-2T"] = row["Part2T"]
            lang["639-1"] = row["Part1"]
            lang["scope"] = row["Scope"]
            lang["type"] = row["Language_Type"]
            lang["comment"] = row["Comment"]
            d[code] = lang
    return d
def read_iso_639_3_retirements(path):
    """
    Collect the retired ISO 639-3 codes from a tab-separated file.

    Returns the values of the "Id" column as a list, in file order.
    """
    with open(path, "r", encoding="utf-8") as tab_file:
        records = csv.DictReader(tab_file, delimiter="\t")
        return [record["Id"] for record in records]
def read_latin_plus(path):
    """
    Read data from Underware’s Latin Plus and account for non-iso-639-3
    languages (no code), languages with retired code, and bad formatting.

    Notes:
    - the lists of required characters assume A–Z as default and distinguish
      between uppercase and lowercase.
    - the language names often differ from those in ISO 639-3.

    path: text file holding the language records, followed by a
          "# Character set:" line and the glyph name/codepoint records

    Returns an OrderedDict keyed by language code; each value has
    "name" and "characters" -> "base".
    Raises ValueError if the "# Character set:" marker does not occur
    exactly once.
    """
    marker = "# Character set:"

    # fix codes
    remap_codes = {
        "occ": "ile",  # Occidental aka Interlingue (retired code)
        "azj": "aze",  # North Azerbaijani -> Azerbaijani
        "gaz": "orm",  # Oromo
        "swa": "swh",  # Swahili (macrolanguage) -> Swahili language
    }

    def get_mapping(txt):
        """
        Get mapping between glyph names and Unicode codepoints.
        """
        mapping = {}
        # line by line, ignore the first (remainder of the marker line)
        for li in txt.split("\n")[1:]:
            stripped = li.strip()
            if stripped == "" or stripped.startswith("#"):
                continue
            recs = li.split("\t")
            if len(recs) < 3:
                # glyph name and codepoint are the 2nd and 3rd field
                logging.warning("Unknown format: %s" % li)
                continue
            gn = recs[1]
            u = recs[2]
            if gn.lower() == "ijacute":
                # avoid PUA used for ijacute glyph; this has to exclude
                # the conversion below or the PUA character comes back
                mapping[gn] = ""
            elif u != "" and len(u) == 4:
                # convert a unicode string to a character
                mapping[gn] = chr(int(u, 16))
        return mapping

    def get_language_info(txt, mapping):
        """
        Read the required glyph names for each language, convert to Unicode
        characters, and return a dict index by correct ISO 639-3 code.
        Ignores:
        Latino sine Flexione (no iso code),
        Slovio (no iso code),
        Folkspraak (no iso code).
        Fixes code for:
        Occidental/Interlingue (occ -> ile)
        """
        records = OrderedDict()
        # line by line
        for li in txt.split("\n"):
            stripped = li.strip()
            if stripped == "" or stripped.startswith("#"):
                continue

            recs = li.split("\t")
            if len(recs) == 3:
                name, code, gns = recs
            elif len(recs) == 2:
                # code and glyph names separated by a space, not a tab
                name = recs[0]
                recs2 = recs[1].split(" ")
                if len(recs2) != 2:
                    logging.warning("Unknown format: %s" % li)
                    continue
                code, gns = recs2
            else:
                logging.warning("Unknown format: %s" % li)
                continue

            if code.strip() == "":
                logging.warning("No ISO 639-2/3 code for: %s" % name)
                continue

            chars = []
            for gn in gns.split(","):
                gn = gn.strip()
                if gn == "":
                    continue
                if gn in mapping:
                    chars.append(mapping[gn])
                else:
                    logging.warning("Glyph '%s' is does not have Unicode "
                                    "codepoint specified." % gn)

            code = remap_codes.get(code, code)
            records[code] = OrderedDict()
            records[code]["name"] = name
            records[code]["characters"] = OrderedDict()
            # add a–z, make all lower case and sort
            records[code]["characters"]["base"] = to_lc("".join(chars))
        return records

    with open(path, "r", encoding="utf-8") as f:
        txt = f.read()

    parts = txt.split(marker)
    if len(parts) != 2:
        raise ValueError("Expected exactly one '%s' line in %s, found %d"
                         % (marker, path, len(parts) - 1))
    chars_txt, mapping_txt = parts
    mapping = get_mapping(mapping_txt)
    return get_language_info(chars_txt, mapping)
def read_cldr(path, iso_639_3):
    """
    Read information about exemplar characters from
    the Unicode CLDR XML files. There is one file for
    some combinations of language + script + region.
    Combines draft records into one entry to keep the
    structure simple, the order is base, auxiliary, numbers,
    punctuation.

    path: directory containing the CLDR *.xml files
    iso_639_3: dict as returned by read_iso_639_3

    Returns an OrderedDict: script tag -> ISO 639-3 code -> language
    record ("characters", "draft", "script", "cldr_codes", "name").
    Files whose code cannot be resolved to an ISO 639-3 code are
    reported via logging.error and skipped.
    """
    typed_sets = ("auxiliary", "numbers", "punctuation")

    def read_cldr_file(path, code):
        """
        Read one CLDR file. Returns (record, script), or (None, None)
        when the file has no exemplar characters.
        """
        tree = ET.parse(path)
        lang = OrderedDict()
        lang["characters"] = OrderedDict()
        for t in ["base", "auxiliary", "numbers", "punctuation", "combinations"]:
            lang["characters"][t] = ""
        characters = lang["characters"]
        all_chars = ""
        draft = []
        for ldml in tree.getroot():
            for ec in ldml.iter("exemplarCharacters"):
                # an empty element has no text at all
                text = ec.text or ""
                if "type" not in ec.attrib:
                    # the untyped element is the base set
                    s, c = parse_unicode_set(text)
                    characters["base"] = s
                    characters["combinations"] += c
                elif ec.attrib["type"] in typed_sets:
                    t = ec.attrib["type"]
                    s, c = parse_unicode_set(text)
                    characters[t] = s
                    if c:
                        key = t + "_combinations"
                        characters[key] = characters.get(key, "") + c
                # other types contribute only to the emptiness check
                all_chars += text
                draft.append(ec.attrib.get("draft", "done"))

        if not all_chars:
            return None, None

        # a four-letter second subtag is taken as the script
        script = None
        codes = code.split("-")
        if len(codes) > 1 and len(codes[1]) == 4:
            script = codes[1]
        lang["draft"] = ", ".join(draft)
        if not script:
            script = guess_script(code, characters["base"])
        lang["script"] = script
        return lang, script

    def merge_characters(old, new, grouped):
        """
        Union of two space-separated character strings. With ``grouped``
        the {…} combinations are kept whole instead of being split
        into single characters.
        """
        if grouped:
            pattern = r"\{[^\}]+\}"
            groups = re.findall(pattern, old) + re.findall(pattern, new)
            return " ".join(OrderedDict.fromkeys(groups))
        merged = set(old.replace(" ", "")) | set(new.replace(" ", ""))
        return " ".join(sorted(merged))

    cldr = OrderedDict()
    main_paths = []
    regi_paths = []
    # sorted, because glob returns the files in arbitrary order
    for file_path in sorted(glob.glob(os.path.join(path, "*.xml"))):
        code, _ = os.path.splitext(os.path.basename(file_path))
        code = code.replace("_", "-")  # to use the proper delimiter
        if "-" not in code:
            main_paths.append((file_path, code))
        else:
            regi_paths.append((file_path, code))

    # process the main paths first
    for file_path, code in main_paths + regi_paths:
        lang, script = read_cldr_file(file_path, code)
        if not lang:
            continue

        # convert the default language "root" to "und"
        if code == "root":
            code = "und"
        codes = code.split("-")

        # resolve the code before it is used to look up the name
        iso = iso_from_cldr(code, iso_639_3)
        if not iso:
            logging.error("Could not find ISO 639-3 code for CLDR code %s"
                          % code)
            continue

        lang["cldr_codes"] = [code]
        lang["name"] = iso_639_3[iso]["names"][0]
        if script not in cldr:
            cldr[script] = OrderedDict()
        by_iso = cldr[script]

        is_main = len(codes) == 1 or (len(codes) == 2 and len(codes[1]) == 4)
        if is_main or iso not in by_iso:
            # store the main variant (or the first variant seen)
            by_iso[iso] = lang
            continue

        # the main variant has been already stored
        # now adding code and characters from the
        # regional variants to it
        stored = by_iso[iso]
        stored["cldr_codes"] += [code]
        for k, v in lang["characters"].items():
            if v.replace(" ", "") == "":
                continue
            if k in stored["characters"]:
                stored["characters"][k] = merge_characters(
                    stored["characters"][k], v, k.endswith("combinations"))
            else:
                stored["characters"][k] = v
    return cldr
def read_rosetta(path, iso_639_3=None):
    """
    Read Rosetta XML file with old language support.
    Override the names used with ISO 639-3 names
    and drop the OpenType language tag (we can add that
    later).

    path: XML file; every child of the root needs an "iso-639-3"
          attribute and may have "script" and "status" attributes
          and "characters" descendants with a "type" attribute
    iso_639_3: dict as returned by read_iso_639_3; codes missing from
          it are named after the code itself

    Returns an OrderedDict: script -> code -> language record.
    """
    if iso_639_3 is None:
        iso_639_3 = {}

    tree = ET.parse(path)
    rstt = OrderedDict()
    for lr in tree.getroot():
        lang = OrderedDict()
        code = lr.attrib["iso-639-3"]
        if code in iso_639_3:
            lang["name"] = iso_639_3[code]["names"][0]
        else:
            logging.warning("No ISO 639-3 record for %s. Using the code as name." % code)
            lang["name"] = code

        if "script" in lr.attrib:
            script = lr.attrib["script"]
        else:
            logging.warning("Script not specified for %s. Setting to 'Latn'." % lang["name"])
            script = "Latn"
        lang["script"] = script
        if script not in rstt:
            rstt[script] = OrderedDict()

        for chars in lr.iter("characters"):
            if "characters" not in lang:
                lang["characters"] = OrderedDict()
            if chars.attrib["type"] == "required":
                # an empty element still gets the default set of the script
                lang["characters"]["base"] = to_lc(chars.text or "", script)
            else:
                # no need for conversion to LC here
                lang["characters"][chars.attrib["type"]] = chars.text

        # NOTE(review): records without a status attribute get no
        # "status" key at all; confirm that this is intended.
        if "status" in lr.attrib:
            if lr.attrib["status"] in ["todo", "beta"]:
                lang["status"] = "todo"
            else:
                lang["status"] = "done"

        rstt[script][code] = lang
    return rstt
def read_alvestrand(path, iso_639_3=None):
    """
    Read Alvestrand’s draft document in txt/plain
    and save “required” characters to “base” and
    “important” characters to “auxiliary”.
    Some guesswork involved in getting the characters
    that did not seem to have name/codepoint at the time
    when he wrote it (see function get_char and get_iso_code).

    path: the draft as a UTF-8 text file
    iso_639_3: dict as returned by read_iso_639_3, used to turn the
          language names and codes of the draft into ISO 639-3 codes

    Returns an OrderedDict with the keys "Latn" and "Cyrl", each
    mapping ISO 639-3 codes to language records.
    """
    if iso_639_3 is None:
        iso_639_3 = {}

    # Records without a codepoint: either guessed, or ignored (None).
    # They are looked up before the line is split, because each of
    # them contains the separator itself.
    # NOTE(review): the separators in these literals and in the split
    # calls below are single spaces here; verify them against the
    # input file in case wider runs of whitespace were intended.
    guessed_chars = {
        "e` -": "è",  # in Norwegian (Nynorsk)
        "E` -": "È",  # in Norwegian (Nynorsk)
        "o` -": "ò",  # in Norwegian (Nynorsk)
        "O` -": "Ò",  # in Norwegian (Nynorsk)
        "KK -": None,  # in Greenlandic (capital Kra)
        "-A -": None,  # in Spanish (capital feminine ordinal)
        "-O -": None,  # in Spanish (capital masculine ordinal)
    }

    def get_char(ln):
        """
        Convert a line record to a unicode-string character.
        Returns None for lines that cannot be parsed.
        """
        if ln in guessed_chars:
            return guessed_chars[ln]
        splt = ln.split(" ")
        if len(splt) != 2:
            # hard to parse lines are ignored (not important lines)
            return None
        # the codepoint is the first four characters of the second field
        code = splt[1].strip()[:4]
        try:
            return chr(int(code, 16))
        except ValueError:
            return None

    def get_iso_code(name, code, iso_639_3):
        """
        Get iso 639-3 code.
        """
        for k, v in iso_639_3.items():
            if name.title() in v["names"]:
                return k
            elif code in v["639-1"]:
                return k
            elif name == "Sami":
                # Northern Sami
                return "sme"
            elif name == "Gaelic":
                # Scottish Gaelic
                return "gla"
            elif name == "Sorbian":
                # Upper Sorbian
                return "hsb"
            elif name == "Cyrillic":
                # the Cyrillic script
                # deleted after use, in the end
                return "cyr"

    i = 1  # chapter counter
    langs = OrderedDict()
    # which part of a chapter the current line belongs to
    required = False
    important = False
    comment = False

    with open(path, "r", encoding="utf-8") as f:
        content_ = [ln.strip() for ln in f.readlines()]

    # records without a known name share a line; split them up
    content = []
    for ln in content_:
        if "name not known" in ln:
            content += ln.split("name not known")
        else:
            content += [ln]

    for ln in content:
        ln = ln.strip()
        if ln.startswith("3.%d" % i):
            if i <= 48:
                # chapter heading: number, code, name
                _, code, name = ln.replace(" ", " ").split(" ")
                code = get_iso_code(name.strip(), code.strip(), iso_639_3)
                langs[code] = OrderedDict()
                langs[code]["characters"] = OrderedDict()
                langs[code]["characters"]["base"] = ""
                langs[code]["characters"]["auxiliary"] = ""
                langs[code]["comment"] = ""
                # NOTE(review): as a float, chapter 3.10 equals 3.1
                langs[code]["chapter"] = float("3.%d" % i)
                required = False
                important = False
                comment = False
                i += 1
            else:
                break
        elif ln.startswith("Required characters"):
            required = True
            important = False
            comment = False
        elif ln.startswith("Important characters"):
            required = False
            important = True
            comment = False
        elif ln.startswith("Comments"):
            required = False
            important = False
            comment = True
        elif ln.startswith("Character sets"):
            required = False
            important = False
            comment = False
        elif ln.startswith("Alvestrand") or \
                ln.startswith("draft") or \
                ln.startswith("This language has no known character set"):
            pass
        elif ln.startswith("Based on script listed as"):
            ln = ln.replace("Based on script listed as", "").strip()
            if ln.lower() == "latin":
                langs[code]["script"] = "Latn"
            elif ln.lower() == "cyrillic":
                langs[code]["script"] = "Cyrl"
        elif ln.startswith("4. Other languages"):
            break
        elif required and ln != "":
            c = get_char(ln)
            if c:
                langs[code]["characters"]["base"] += c
        elif important and ln != "":
            c = get_char(ln)
            if c:
                # reuse the parsed character instead of parsing again
                langs[code]["characters"]["auxiliary"] += c
        elif comment and ln != "":
            langs[code]["comment"] += ln + " "

    # normalize character sets and structure
    avst = OrderedDict()
    avst["Latn"] = OrderedDict()
    avst["Cyrl"] = OrderedDict()
    for code, l in langs.items():
        # NOTE(review): a record without a script line (other than
        # "lat") keeps the script of the previous record.
        if "script" in l:
            script = l["script"]
            if script == "Latn":
                l["characters"]["base"] += langs["lat"]["characters"]["base"]
            if script == "Cyrl":
                l["characters"]["base"] += langs["cyr"]["characters"]["base"]
            del l["script"]
        elif code == "lat":
            # Latin language/script record
            # -> add as language to the Latin script
            script = "Latn"
        elif code == "deu":
            # fix uppercase SS
            # NOTE(review): the search string is empty here, so "SS"
            # is inserted around every character; confirm which
            # character was meant to be replaced.
            l["characters"]["base"] = l["characters"]["base"].replace("", "SS")
        if l["comment"] == "":
            del l["comment"]
        l["characters"]["base"] = to_lc(l["characters"]["base"])
        l["characters"]["auxiliary"] = to_lc(l["characters"]["auxiliary"], append=False)
        # copy language under script
        # ignore Cyrillic script record (cyr)
        if code != "cyr":
            avst[script][code] = l

    # copy Upper Sorbian to Lower Sorbian
    avst["Latn"]["dsb"] = copy(avst["Latn"]["hsb"])
    return avst
if __name__ == '__main__':
    # Convert every source under data/ to YAML, one after the other.
    # The ISO 639-3 table comes first because the later readers need it.
    data_dir = "data"
    iso_dir = os.path.join(data_dir, "iso-639-3")
    other_dir = os.path.join(data_dir, "other")

    # ISO 639-3
    iso_639_3 = read_iso_639_3(
        os.path.join(iso_dir, "iso-639-3_20190408.tab"))
    save_yaml(iso_639_3, os.path.join(data_dir, "iso-639-3.yaml"))

    # ISO 639-3 retirements
    retired_codes = read_iso_639_3_retirements(
        os.path.join(iso_dir, "iso-639-3_Retirements_20190408.tab"))
    save_yaml(retired_codes,
              os.path.join(data_dir, "iso-639-3_retirements.yaml"))

    # CLDR
    cldr_data = read_cldr(
        os.path.join(data_dir, "cldr", "common", "main"), iso_639_3)
    save_yaml(cldr_data, os.path.join(data_dir, "cldr.yaml"))

    # Latin Plus
    latin_plus_data = read_latin_plus(
        os.path.join(other_dir, "latin-plus",
                     "underware-latin-plus-data_1.txt"))
    save_yaml(latin_plus_data, os.path.join(other_dir, "latin-plus.yaml"))

    # Rosetta old XML
    rosetta_data = read_rosetta(
        os.path.join(other_dir, "rosetta",
                     "rosetta-language-support_old.xml"),
        iso_639_3)
    save_yaml(rosetta_data, os.path.join(data_dir, "rosetta_old.yaml"))

    # Alvestrand’s TXT draft
    alvestrand_data = read_alvestrand(
        os.path.join(data_dir, "alvestrand",
                     "draft-alvestrand-lang-char-03.txt"),
        iso_639_3)
    save_yaml(alvestrand_data, os.path.join(data_dir, "alvestrand.yaml"))