Skip to content

Commit

Permalink
Improve remove_diacritics function. Fixes #71 (#72)
Browse files Browse the repository at this point in the history
The remove_diacritics function produced transliterated output for e.g.
the Urdu alphabet.

Through the unicodedata package, diacritics are now safely filtered out.

Co-authored-by: Henri Froese <[email protected]>
  • Loading branch information
henrifroese and henrifroese committed Jul 13, 2020
1 parent e36a977 commit 391cec6
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 25 deletions.
4 changes: 2 additions & 2 deletions tests/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,8 @@ def test_remove_punctation(self):
"""

def test_remove_diactitics(self):
s = pd.Series("hèllo")
s_true = pd.Series("hello")
s = pd.Series("Montréal, über, 12.89, Mère, Françoise, noël, 889, اِس, اُس")
s_true = pd.Series("Montreal, uber, 12.89, Mere, Francoise, noel, 889, اس, اس")
self.assertEqual(preprocessing.remove_diacritics(s), s_true)

"""
Expand Down
63 changes: 40 additions & 23 deletions texthero/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
The texthero.preprocessing module allows for efficient pre-processing of text-based Pandas Series and DataFrame.
"""

from gensim.sklearn_api.phrases import PhrasesTransformer
import re
import string
from typing import Optional, Set
import unicodedata

import numpy as np
import pandas as pd
Expand All @@ -21,8 +23,6 @@

warnings.filterwarnings(action="ignore", category=UserWarning, module="gensim")

from gensim.sklearn_api.phrases import PhrasesTransformer


def fillna(input: pd.Series) -> pd.Series:
"""Replace not assigned values with empty spaces."""
Expand All @@ -49,7 +49,7 @@ def replace_digits(input: pd.Series, symbols: str = " ", only_blocks=True) -> pd
Symbols to replace
only_blocks : bool
When set to False, remove any digits.
Returns
-------
Pandas Series
Expand Down Expand Up @@ -151,6 +151,24 @@ def remove_punctuation(input: pd.Series) -> pd.Series:
return replace_punctuation(input, " ")


def _remove_diacritics(text: str) -> str:
"""
Remove diacritics and accents from one string.
Examples
--------
>>> import texthero as hero
>>> import pandas as pd
>>> text = "Montréal, über, 12.89, Mère, Françoise, noël, 889, اِس, اُس"
>>> _remove_diacritics(text)
'Montreal, uber, 12.89, Mere, Francoise, noel, 889, اس, اس'
"""
nfkd_form = unicodedata.normalize("NFKD", text)
# unicodedata.combinding(char) checks if the character is in
# composed form (consisting of several unicode chars combined), i.e. a diacritic
return "".join([char for char in nfkd_form if not unicodedata.combining(char)])


def remove_diacritics(input: pd.Series) -> pd.Series:
"""
Remove all diacritics and accents.
Expand All @@ -161,12 +179,11 @@ def remove_diacritics(input: pd.Series) -> pd.Series:
--------
>>> import texthero as hero
>>> import pandas as pd
>>> s = pd.Series("Noël means Christmas in French")
>>> hero.remove_diacritics(s)
0 Noel means Christmas in French
dtype: object
>>> s = pd.Series("Montréal, über, 12.89, Mère, Françoise, noël, 889, اِس, اُس")
>>> hero.remove_diacritics(s)[0]
'Montreal, uber, 12.89, Mere, Francoise, noel, 889, اس, اس'
"""
return input.apply(unidecode.unidecode)
return input.astype("unicode").apply(_remove_diacritics)


def remove_whitespace(input: pd.Series) -> pd.Series:
Expand Down Expand Up @@ -209,7 +226,7 @@ def _replace_stopwords(text: str, words: Set[str], symbol: str = " ") -> str:
>>> stopwords = ["the", "of"]
>>> _replace_stopwords(s, stopwords, symbol)
'$ book $ $ jungle'
"""

pattern = r"""(?x) # Set flag to allow verbose regexps
Expand Down Expand Up @@ -263,7 +280,7 @@ def remove_stopwords(
Parameters
----------
input : Pandas Series
stopwords : Set[str], Optional
Set of stopword strings to remove. If not passed, NLTK English stopwords are used by default.
Expand Down Expand Up @@ -317,7 +334,7 @@ def stem(input: pd.Series, stem="snowball", language="english") -> pd.Series:
Notes
-----
By default NLTK stemming algorithms lowercase all text.
Examples
--------
Expand Down Expand Up @@ -370,7 +387,7 @@ def clean(s: pd.Series, pipeline=None) -> pd.Series:
"""
Pre-process a text-based Pandas Series.
Default pipeline:
1. :meth:`texthero.preprocessing.fillna`
2. :meth:`texthero.preprocessing.lowercase`
Expand Down Expand Up @@ -559,7 +576,7 @@ def remove_html_tags(s: pd.Series) -> pd.Series:
>>> remove_html_tags(s)
0 Title
dtype: object
"""

pattern = r"""(?x) # Turn on free-spacing
Expand Down Expand Up @@ -600,20 +617,20 @@ def tokenize(s: pd.Series) -> pd.Series:

def tokenize_with_phrases(s: pd.Series, min_count: int = 5, threshold: int = 10):
r"""Tokenize and group up collocations words
Tokenize the given pandas Series and group up bigrams where each token has a term frequency of at least min_count and where the score given by the following formula is not under the threshold.
:math:`\frac{(bigram\_a\_b\_count - min\_count)* len\_vocab }{ (word\_a\_count * word\_b\_count)}`.
Parameters
----------
s : Pandas Series
min_count : Int, optional. Default is 5.
ignore tokens with frequency less than this
threshold : Int, optional. Default is 10.
ignore tokens with a score under that threshold
Examples
--------
>>> import pandas as pd
Expand All @@ -623,12 +640,12 @@ def tokenize_with_phrases(s: pd.Series, min_count: int = 5, threshold: int = 10)
0 [New_York, is, a, beautiful, city]
1 [Look, :, New_York, !]
dtype: object
Reference
--------
`Mikolov, et. al: "Distributed Representations of Words and Phrases and their Compositionality"
<https://arxiv.org/abs/1310.4546>`_
"""

if type(s.iloc[0]) != str:
Expand Down Expand Up @@ -689,7 +706,7 @@ def remove_urls(s: pd.Series) -> pd.Series:

def replace_tags(s: pd.Series, symbol: str) -> pd.Series:
"""Replace all tags from a given Pandas Series with symbol.
A tag is a string formed by @ concatenated with a sequence of characters and digits. Example: @texthero123.
Parameters
Expand All @@ -715,9 +732,9 @@ def replace_tags(s: pd.Series, symbol: str) -> pd.Series:

def remove_tags(s: pd.Series) -> pd.Series:
"""Remove all tags from a given Pandas Series.
A tag is a string formed by @ concatenated with a sequence of characters and digits. Example: @texthero123. Tags are replaced by an empty space ` `.
Examples
--------
>>> import texthero as hero
Expand Down

0 comments on commit 391cec6

Please sign in to comment.