Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add infer_lang function (Issue number #3) #79

Draft
wants to merge 6 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
- Change implementation to use pandas apply -Add test for each single…
… ISO code -Change name ret_list to probability - Change name _Language_to_dict to _Language_to_tuple
  • Loading branch information
Tomer Mankita authored and Tomer Mankita committed Jul 17, 2020
commit c284cda835b3ac071c2bb1ee8781e34dadc79e99
1 change: 0 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ install_requires =
numpy>=1.17
scikit-learn>=0.22
spacy>=2.2.2
spacy-langdetect>=0.1.2
langdetect>=1.0.7
tqdm>=4.3
nltk>=3.3
Expand Down
121 changes: 112 additions & 9 deletions tests/test_nlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,21 +70,124 @@ def test_count_sentences_wrong_index(self):
self.assertFalse(counted_sentences_s.index.equals(t_different_index.index))

def test_infer_lang(self):

# No words were found for the languages it, hr and hi that the function detects correctly.
s = pd.Series(
[
"This is English text.",
" Er lebt mit seinen Eltern und seiner Schwester in Berlin.",
" Yo me divierto todos los días en el parque. ",
"Je m'appelle Angélica Summer, j'ai 12 ans et je suis canadienne.",
"Wêreld",
"مرحبا بالعالم",
"български",
"ওহে বিশ্ব",
"català",
"Ahoj světe",
"Helo Byd",
"dansk",
"Deutsch",
"Γειά σου Κόσμε",
"fox",
"Hola Mundo",
"Tere, Maailm",
"فارسی",
"Hei maailma",
"Bonjour le monde",
"હેલો વર્લ્ડ",
"שלום עולם",
"Helló Világ",
"Bahasa",
"こんにちは世界",
"ಹಲೋ ವರ್ಲ್ಡ್",
"안녕하세요 세계",
"lietuvių kalba",
"Sveika pasaule",
"Здраво свету",
"ഹലോ വേൾഡ്",
"मराठी",
"नेपाली",
"Vlaams",
"Norsk",
"ਸਤਿ ਸ੍ਰੀ ਅਕਾਲ ਦੁਨਿਆ",
"Witaj świecie",
"Olá Mundo",
"Română",
"русский",
"Slovenský",
"Pozdravljen, svet",
"Soomaaliga",
"Përshendetje Botë",
"Hej världen",
"Kiswahili",
"வணக்கம் உலகம்",
"హలో ప్రపంచ",
"สวัสดีชาวโลก",
"Wikang Tagalog",
"Selam Dünya",
"Привіт Світ",
"ہیلو دنیا",
"Chào thế giới",
"中文",
"中華民國國歌",
# "धन्यवाद",
# "Lijepa naša domovino",
# "Italiano",
]
)

s_true = pd.Series(
[
("en", "%.5f" % float(1)),
("de", "%.5f" % float(1)),
("es", "%.5f" % float(1)),
("fr", "%.5f" % float(1)),
"af",
"ar",
"bg",
"bn",
"ca",
"cs",
"cy",
"da",
"de",
"el",
"en",
"es",
"et",
"fa",
"fi",
"fr",
"gu",
"he",
"hu",
"id",
"ja",
"kn",
"ko",
"lt",
"lv",
"mk",
"ml",
"mr",
"ne",
"nl",
"no",
"pa",
"pl",
"pt",
"ro",
"ru",
"sk",
"sl",
"so",
"sq",
"sv",
"sw",
"ta",
"te",
"th",
"tl",
"tr",
"uk",
"ur",
"vi",
"zh-cn",
"zh-tw",
# 'hi',
# 'hr',
# 'it'
]
)
s_result = nlp.infer_lang(s)
Expand Down
63 changes: 19 additions & 44 deletions texthero/nlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,72 +135,50 @@ def count_sentences(s: pd.Series) -> pd.Series:
return pd.Series(number_of_sentences, index=s.index)


def _Language_to_dict(lang: Language):
def _Language_to_tuple(lang: Language):
return (str(lang.lang), "%.5f" % float(lang.prob))


def _detect_language_list(spaCy_object):
def _detect_language_probability(s):
"""
Apply the detect_langs function to spacy_object, guarding against LangDetectException.
:param spacy_object
Apply the detect_langs function to the sentence, guarding against LangDetectException.
:param s
"""
try:
detected_language = list(
map(_Language_to_dict, detect_langs(spaCy_object.text))
)
detected_language = list(map(_Language_to_tuple, detect_langs(s)))
return detected_language
except LangDetectException:
return ("UNKNOWN", 0.0)


def _detect_language(spaCy_object):
def _detect_language(s):
"""
Apply the detect_langs function to spacy_object, guarding against LangDetectException.
:param spacy_object
Apply the detect_langs function to the sentence, guarding against LangDetectException.
:param s
"""
try:
detected_language = _Language_to_dict(detect_langs(spaCy_object.text)[0])
detected_language = str(detect_langs(s)[0].lang)
return detected_language
except LangDetectException:
return ("UNKNOWN", 0.0)
return "UNKNOWN"


def _infer_lang_ret_list(s, nlp, infer_languages):
nlp.add_pipe(
LanguageDetector(_detect_language_list), name="language_detector", last=True
)
for doc in nlp.pipe(s.values, batch_size=32):
infer_languages.append(doc._.language)

return pd.Series(infer_languages, index=s.index)


def _infer_lang(s, nlp, infer_languages):
nlp.add_pipe(
LanguageDetector(_detect_language), name="language_detector", last=True
)
for doc in nlp.pipe(s.values, batch_size=32):
infer_languages.append(doc._.language)

return pd.Series(infer_languages, index=s.index)


def infer_lang(s, ret_list=False):
def infer_lang(s, probability=False):
"""
Return languages and their probabilities.

Return a Pandas Series where each row contains a tuple with information regarding the "average" inferred language.
Return a Pandas Series where each row contains the ISO code of the "average" inferred language.

Tuple : (language, probability)
If probability = True then each row contains a list of tuples

If ret_list = True then each row contains a list of tuples
Tuple : (language, probability)

Note: infer_lang is a nondeterministic function.

Parameters
----------
s : Pandas Series
ret_list (optional) : boolean
probability (optional) : boolean

supports 55 languages out of the box (ISO 639-1 codes)
------------------------------------------------------
Expand All @@ -214,15 +192,12 @@ def infer_lang(s, ret_list=False):
>>> import pandas as pd
>>> s = pd.Series("This is an English text!.")
>>> hero.infer_lang(s)
0 (en, 1.00000)
0 en
dtype: object

"""

infer_languages = []
nlp = spacy.load("en_core_web_sm")

if ret_list:
return _infer_lang_ret_list(s, nlp, infer_languages)
if probability:
return s.apply(_detect_language_probability)
else:
return _infer_lang(s, nlp, infer_languages)
return s.apply(_detect_language)