- Change implementation to use pandas apply
- Add a test for each single ISO code
- Rename ret_list to probability
- Rename _Language_to_dict to _Language_to_tuple
jbesomi · tmankita · Jul 14, 2020 · Jul 14, 2020 · Jul 14, 2020 · Jul 14, 2020
commit c284cda835b3ac071c2bb1ee8781e34dadc79e99
diff --git a/setup.cfg b/setup.cfg
@@ -30,7 +30,6 @@ install_requires =
  numpy>=1.17
  scikit-learn>=0.22
  spacy>=2.2.2
- spacy-langdetect>=0.1.2
  langdetect>=1.0.7
  tqdm>=4.3
  nltk>=3.3

diff --git a/tests/test_nlp.py b/tests/test_nlp.py
@@ -70,21 +70,124 @@ def test_count_sentences_wrong_index(self):
  self.assertFalse(counted_sentences_s.index.equals(t_different_index.index))
 
  def test_infer_lang(self):
-
+ # No sample words were found that the function reliably detects for it, hr and hi, so those three languages are commented out below.
  s = pd.Series(
  [
- "This is English text.",
- " Er lebt mit seinen Eltern und seiner Schwester in Berlin.",
- " Yo me divierto todos los días en el parque. ",
- "Je m'appelle Angélica Summer, j'ai 12 ans et je suis canadienne.",
+ "Wêreld",
+ "مرحبا بالعالم",
+ "български",
+ "ওহে বিশ্ব",
+ "català",
+ "Ahoj světe",
+ "Helo Byd",
+ "dansk",
+ "Deutsch",
+ "Γειά σου Κόσμε",
+ "fox",
+ "Hola Mundo",
+ "Tere, Maailm",
+ "فارسی",
+ "Hei maailma",
+ "Bonjour le monde",
+ "હેલો વર્લ્ડ",
+ "שלום עולם",
+ "Helló Világ",
+ "Bahasa",
+ "こんにちは世界",
+ "ಹಲೋ ವರ್ಲ್ಡ್",
+ "안녕하세요 세계",
+ "lietuvių kalba",
+ "Sveika pasaule",
+ "Здраво свету",
+ "ഹലോ വേൾഡ്",
+ "मराठी",
+ "नेपाली",
+ "Vlaams",
+ "Norsk",
+ "ਸਤਿ ਸ੍ਰੀ ਅਕਾਲ ਦੁਨਿਆ",
+ "Witaj świecie",
+ "Olá Mundo",
+ "Română",
+ "русский",
+ "Slovenský",
+ "Pozdravljen, svet",
+ "Soomaaliga",
+ "Përshendetje Botë",
+ "Hej världen",
+ "Kiswahili",
+ "வணக்கம் உலகம்",
+ "హలో ప్రపంచ",
+ "สวัสดีชาวโลก",
+ "Wikang Tagalog",
+ "Selam Dünya",
+ "Привіт Світ",
+ "ہیلو دنیا",
+ "Chào thế giới",
+ "中文",
+ "中華民國國歌",
+ # "धन्यवाद",
+ # "Lijepa naša domovino",
+ # "Italiano",
  ]
  )
+
  s_true = pd.Series(
  [
- ("en", "%.5f" % float(1)),
- ("de", "%.5f" % float(1)),
- ("es", "%.5f" % float(1)),
- ("fr", "%.5f" % float(1)),
+ "af",
+ "ar",
+ "bg",
+ "bn",
+ "ca",
+ "cs",
+ "cy",
+ "da",
+ "de",
+ "el",
+ "en",
+ "es",
+ "et",
+ "fa",
+ "fi",
+ "fr",
+ "gu",
+ "he",
+ "hu",
+ "id",
+ "ja",
+ "kn",
+ "ko",
+ "lt",
+ "lv",
+ "mk",
+ "ml",
+ "mr",
+ "ne",
+ "nl",
+ "no",
+ "pa",
+ "pl",
+ "pt",
+ "ro",
+ "ru",
+ "sk",
+ "sl",
+ "so",
+ "sq",
+ "sv",
+ "sw",
+ "ta",
+ "te",
+ "th",
+ "tl",
+ "tr",
+ "uk",
+ "ur",
+ "vi",
+ "zh-cn",
+ "zh-tw",
+ # 'hi',
+ # 'hr',
+ # 'it'
  ]
  )
  s_result = nlp.infer_lang(s)

diff --git a/texthero/nlp.py b/texthero/nlp.py
@@ -135,72 +135,50 @@ def count_sentences(s: pd.Series) -> pd.Series:
  return pd.Series(number_of_sentences, index=s.index)
 
 
-def _Language_to_dict(lang: Language):
+def _Language_to_tuple(lang: Language):
  return (str(lang.lang), "%.5f" % float(lang.prob))
 
 
-def _detect_language_list(spaCy_object):
+def _detect_language_probability(s):
  """
- gured out appling detect_langs function on spacy_object
- :param spacy_object
+ Apply detect_langs to a sentence and return every detected candidate as a (language, probability) tuple.
+ :param s
  """
  try:
- detected_language = list(
- map(_Language_to_dict, detect_langs(spaCy_object.text))
- )
+ detected_language = list(map(_Language_to_tuple, detect_langs(s)))
  return detected_language
  except LangDetectException:
  return ("UNKNOWN", 0.0)
 
 
-def _detect_language(spaCy_object):
+def _detect_language(s):
  """
- gured out appling detect_langs function on spacy_object
- :param spacy_object
+ Apply detect_langs to a sentence and return the language code of the first detected candidate.
+ :param s
  """
  try:
- detected_language = _Language_to_dict(detect_langs(spaCy_object.text)[0])
+ detected_language = str(detect_langs(s)[0].lang)
  return detected_language
  except LangDetectException:
- return ("UNKNOWN", 0.0)
+ return "UNKNOWN"
 
 
-def _infer_lang_ret_list(s, nlp, infer_languages):
- nlp.add_pipe(
- LanguageDetector(_detect_language_list), name="language_detector", last=True
- )
- for doc in nlp.pipe(s.values, batch_size=32):
- infer_languages.append(doc._.language)
-
- return pd.Series(infer_languages, index=s.index)
-
-
-def _infer_lang(s, nlp, infer_languages):
- nlp.add_pipe(
- LanguageDetector(_detect_language), name="language_detector", last=True
- )
- for doc in nlp.pipe(s.values, batch_size=32):
- infer_languages.append(doc._.language)
-
- return pd.Series(infer_languages, index=s.index)
-
-
-def infer_lang(s, ret_list=False):
+def infer_lang(s, probability=False):
  """
  Return languages and their probabilities.
 
- Return a Pandas Series where each row contains a tuple that has information regarding to the "average" infer language.
+ Return a Pandas Series where each row contains the ISO code of the inferred language.
 
- Tuple : (language, probability)
+ If probability = True then each row contains a list of tuples
 
- If ret_list = True then each row contains a list of tuples
+ Tuple : (language, probability)
 
  Note: infer_lang is nondeterministic function
 
  Parameters
  ----------
  s : Pandas Series
- ret_list (optional) : boolean
+ probability (optional) : boolean
 
  supports 55 languages out of the box (ISO 639-1 codes)
  ------------------------------------------------------
@@ -214,15 +192,12 @@ def infer_lang(s, ret_list=False):
  >>> import pandas as pd
  >>> s = pd.Series("This is an English text!.")
  >>> hero.infer_lang(s)
- 0 (en, 1.00000)
+ 0 en
  dtype: object
 
  """
 
- infer_languages = []
- nlp = spacy.load("en_core_web_sm")
-
- if ret_list:
- return _infer_lang_ret_list(s, nlp, infer_languages)
+ if probability:
+ return s.apply(_detect_language_probability)
  else:
- return _infer_lang(s, nlp, infer_languages)
+ return s.apply(_detect_language)