Skip to content

Commit

Permalink
Utils: Fix language detection
Browse files Browse the repository at this point in the history
  • Loading branch information
BLKSerene committed Jun 12, 2024
1 parent 9a28747 commit 7089ce2
Show file tree
Hide file tree
Showing 3 changed files with 4 additions and 3 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
### 📌 Bugfixes
- File Area: Fix support for .xlsx files
- Utils: Fix downloading of Stanza models
- Utils: Fix language detection
- Work Area: Fix Dependency Parser - analysis of files whose first token is a punctuation mark

### ❌ Removals
Expand Down
4 changes: 1 addition & 3 deletions tests/tests_utils/test_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,15 +298,13 @@ def test_lingua():
re.search(r'^[^\(\)]+', lang.lower()).group().strip()
for lang in main.settings_global['langs']
}
- langs_exceptions = {'bokmal', 'ganda', 'nynorsk', 'slovene'}
+ langs_exceptions = {'bokmal', 'ganda', 'nynorsk'}
langs_extra = set()

for lang in lingua.Language.all(): # pylint: disable=no-member
if lang.name.lower() not in langs | langs_exceptions:
langs_extra.add(lang.name)

- print(f"\nExtra languages: {', '.join(langs_extra)}\n")
-
assert langs_extra == {'BOSNIAN', 'MAORI', 'SHONA', 'SOMALI', 'SOTHO', 'TSONGA', 'XHOSA'}

def test_detection_lang():
Expand Down
2 changes: 2 additions & 0 deletions wordless/wl_utils/wl_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ def detect_encoding(main, file_path):
lingua.Language.BOSNIAN,
lingua.Language.MAORI,
lingua.Language.SHONA,
+ lingua.Language.SOMALI,
+ lingua.Language.SOTHO,
lingua.Language.TSONGA,
lingua.Language.XHOSA
).build()
Expand Down

0 comments on commit 7089ce2

Please sign in to comment.