Improve tfidf (#97)
* Improve TFIDF. Closes #76

Docstring now includes formula/explanation.

Normalization disabled.

Representation Series is already being handled (although output
is still like before).

Function representation_series_to_flat_series added.

Co-authored-by: Maximilian Krahn <[email protected]>

* Improve TFIDF. Closes #76

Docstring now includes formula/explanation.

Normalization disabled (the option "norm=None" was "hidden" in the sklearn code, so it turned out to be an easy fix).

Representation Series is already being handled (although output is still like before, using representation_series_to_flat_series).

Function representation_series_to_flat_series added.

Unit tests are changed accordingly; one now checks the explicit calculation using the formula.

Co-authored-by: Maximilian Krahn <[email protected]>

* Implement suggested changes to tfidf.

max_features fixed

lowercase=False removed

docstring improved

tests for different arguments added

* Incorporate remote changes.

Co-authored-by: Maximilian Krahn <[email protected]>
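
The normalization remark above can be checked directly against sklearn. The following is a minimal standalone sketch (not part of this commit) showing that passing norm=None to TfidfVectorizer disables the default l2 scaling and yields the raw tf * idf values:

# Minimal sketch, independent of texthero: TfidfVectorizer's norm parameter
# accepts None even though only "l1"/"l2" are advertised; with norm=None the
# rows are the raw tf * idf values instead of being scaled to unit l2 norm.
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [["Hi", "Bye"], ["Test", "Bye", "Bye"]]  # already tokenized

identity = lambda x: x
normalized = TfidfVectorizer(tokenizer=identity, preprocessor=identity)
raw = TfidfVectorizer(tokenizer=identity, preprocessor=identity, norm=None)

print(normalized.fit_transform(corpus).toarray())  # rows have unit l2 norm
print(raw.fit_transform(corpus).toarray())         # plain tf * idf values

With the default l2 scaling removed, the values in the new docstring example (1.0, 1.4054651081081644, ...) come straight out of the formula.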
henrifroese and mk2510 committed Jul 17, 2020
1 parent a93cc06 commit 1d4d5a0
Showing 2 changed files with 182 additions and 37 deletions.
58 changes: 50 additions & 8 deletions tests/test_representation.py
@@ -1,4 +1,5 @@
import pandas as pd
import numpy as np
from texthero import representation
from texthero import preprocessing

@@ -7,6 +8,7 @@
import doctest
import unittest
import string
import math
import warnings

"""
@@ -63,15 +65,37 @@ def test_term_frequency_not_tokenized_yet(self):
TF-IDF
"""

def test_idf_single_document(self):
s = pd.Series("a")
def test_tfidf_formula(self):
s = pd.Series(["Hi Bye", "Test Bye Bye"])
s = preprocessing.tokenize(s)
s_true = pd.Series([[1]])
s_true = pd.Series(
[
[
1.0 * (math.log(3 / 3) + 1),
1.0 * (math.log(3 / 2) + 1),
0.0 * (math.log(3 / 2) + 1),
],
[
2.0 * (math.log(3 / 3) + 1),
0.0 * (math.log(3 / 2) + 1),
1.0 * (math.log(3 / 2) + 1),
],
]
)
s_true.rename_axis("document", inplace=True)
self.assertEqual(representation.tfidf(s), s_true)

def test_tfidf_single_document(self):
s = pd.Series("a", index=["yo"])
s = preprocessing.tokenize(s)
s_true = pd.Series([[1]], index=["yo"])
s_true.rename_axis("document", inplace=True)
self.assertEqual(representation.tfidf(s), s_true)

def test_idf_not_tokenized_yet(self):
def test_tfidf_not_tokenized_yet(self):
s = pd.Series("a")
s_true = pd.Series([[1]])
s_true.rename_axis("document", inplace=True)

with warnings.catch_warnings(): # avoid print warning
warnings.simplefilter("ignore")
@@ -80,10 +104,28 @@ def test_idf_not_tokenized_yet(self):
with self.assertWarns(DeprecationWarning): # check raise warning
representation.tfidf(s)

def test_idf_single_not_lowercase(self):
tfidf_single_smooth = 0.7071067811865475 # TODO

def test_tfidf_single_not_lowercase(self):
s = pd.Series("ONE one")
s = preprocessing.tokenize(s)
s_true = pd.Series([[tfidf_single_smooth, tfidf_single_smooth]])
s_true = pd.Series([[1.0, 1.0]])
s_true.rename_axis("document", inplace=True)
self.assertEqual(representation.tfidf(s), s_true)

def test_tfidf_max_features(self):
s = pd.Series("one one two")
s = preprocessing.tokenize(s)
s_true = pd.Series([[2.0]])
s_true.rename_axis("document", inplace=True)
self.assertEqual(representation.tfidf(s, max_features=1), s_true)

def test_tfidf_min_df(self):
s = pd.Series([["one"], ["one", "two"]])
s_true = pd.Series([[1.0], [1.0]])
s_true.rename_axis("document", inplace=True)
self.assertEqual(representation.tfidf(s, min_df=2), s_true)

def test_tfidf_max_df(self):
s = pd.Series([["one"], ["one", "two"]])
s_true = pd.Series([[0.0], [1.4054651081081644]])
s_true.rename_axis("document", inplace=True)
self.assertEqual(representation.tfidf(s, max_df=1), s_true)
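
For reference, the values expected by test_tfidf_formula follow directly from the smoothed-idf formula quoted in the new docstring. A minimal hand-calculation sketch (not part of the test suite) for the corpus ["Hi Bye", "Test Bye Bye"]:

import math

n_docs = 2  # corpus: ["Hi Bye", "Test Bye Bye"]

def idf(df):
    # smoothed idf: log((1 + n_docs) / (1 + document frequency)) + 1
    return math.log((1 + n_docs) / (1 + df)) + 1

# Alphabetically sorted vocabulary: Bye (df=2), Hi (df=1), Test (df=1)
doc0 = [1 * idf(2), 1 * idf(1), 0 * idf(1)]  # "Hi Bye"
doc1 = [2 * idf(2), 0 * idf(1), 1 * idf(1)]  # "Test Bye Bye"

print(doc0)  # [1.0, 1.4054651081081644, 0.0]
print(doc1)  # [2.0, 0.0, 1.4054651081081644]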
161 changes: 132 additions & 29 deletions texthero/representation.py
@@ -3,14 +3,16 @@
"""

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, NMF
from sklearn.cluster import KMeans, DBSCAN, MeanShift
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import coo_matrix

from typing import Optional
from typing import Optional, Union, Any

from texthero import preprocessing

@@ -19,6 +21,72 @@

# from texthero import pandas_ as pd_

"""
Helper
"""


def representation_series_to_flat_series(
s: Union[pd.Series, pd.Series.sparse],
index: pd.Index = None,
fill_missing_with: Any = np.nan,
) -> pd.Series:
"""
Transform a Pandas Representation Series to a "normal" (flattened) Pandas Series.
The given Series should have a multiindex with first level being the document
and second level being individual features of that document (e.g. tfidf scores per word).
The flattened Series has one cell per document, with the cell being a list of all
the individual features of that document.
Parameters
----------
s : Sparse Pandas Series or Pandas Series
The multiindexed Pandas Series to flatten.
index : Pandas Index, optional, default to None
The index the flattened Series should have.
fill_missing_with : Any, default to np.nan
Value to fill the NaNs (missing values) with. This _does not_ mean
that existing values that are np.nan are replaced, but rather that
features that are not present in one document but present in others
are filled with fill_missing_with. See example below.
Examples
--------
>>> import texthero as hero
>>> import pandas as pd
>>> import numpy as np
>>> index = pd.MultiIndex.from_tuples([("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")], names=['document', 'word'])
>>> s = pd.Series([3, np.nan, 4], index=index)
>>> s
document word
doc0 Word1 3.0
Word3 NaN
doc1 Word2 4.0
dtype: float64
>>> hero.representation_series_to_flat_series(s, fill_missing_with=0.0)
document
doc0 [3.0, 0.0, nan]
doc1 [0.0, 4.0, 0.0]
dtype: object
"""
s = s.unstack(fill_value=fill_missing_with)

if index is not None:
s = s.reindex(index, fill_value=fill_missing_with)
# Reindexing makes the documents for which no values
# are present in the Sparse Representation Series
# "reappear" correctly.

s = pd.Series(s.values.tolist(), index=s.index)

s.rename_axis("document", inplace=True)

return s


# Warning message for not-tokenized inputs
_not_tokenized_warning_message = (
"It seems like the given Pandas Series s is not tokenized. This function will"
@@ -91,49 +159,64 @@ def term_frequency(
return s


def tfidf(s: pd.Series, max_features=None, min_df=1, return_feature_names=False):
def tfidf(
s: pd.Series, max_features=None, min_df=1, max_df=1.0, return_feature_names=False
) -> pd.Series.sparse:
"""
Represent a text-based Pandas Series using TF-IDF.
*Term Frequency - Inverse Document Frequency (TF-IDF)* is a formula to
calculate the _relative importance_ of the words in a document, taking
into account the words' occurrences in other documents. It consists of two parts:
The *term frequency (tf)* tells us how frequently a term is present in a document,
so tf(document d, term t) = number of times t appears in d.
The *inverse document frequency (idf)* measures how _important_ or _characteristic_
a term is among the whole corpus (i.e. among all documents).
Thus, idf(term t) = log((1 + number of documents) / (1 + number of documents where t is present)) + 1.
Finally, tf-idf(document d, term t) = tf(d, t) * idf(t).
Different from the `sklearn implementation of tfidf <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html>`_,
this function does *not* normalize the output in any way, so the result is
exactly what you get by applying the formula described above.
The input Series should already be tokenized. If not, it will
be tokenized before tfidf is calculated.
If working with big pandas Series, you might want to limit
the number of features through the max_features parameter.
Parameters
----------
s : Pandas Series
max_features : int, optional
Maximum number of features to keep.
min_df : int, optional. Default to 1.
When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold.
return_features_names : Boolean. Default to False.
If True, return a tuple (*tfidf_series*, *features_names*)
s : Pandas Series (tokenized)
max_features : int, optional, default to None.
If not None, only the max_features most frequent tokens are used.
min_df : int, optional, default to 1.
When building the vocabulary, ignore terms that have a document
frequency (number of documents a term appears in) strictly lower than the given threshold.
max_df : int or double, optional, default to 1.0
When building the vocabulary, ignore terms that have a document
frequency (number of documents a term appears in) strictly higher than the given threshold. This argument essentially allows removing corpus-specific stop words. When the argument is a float in [0.0, 1.0], it represents a proportion of documents.
return_feature_names: Boolean, optional, default to False
Whether to return the feature (i.e. word) names with the output.
Examples
--------
>>> import texthero as hero
>>> import pandas as pd
>>> s = pd.Series(["Sentence one", "Sentence two"])
>>> s = hero.tokenize(s)
>>> hero.tfidf(s)
0 [0.5797386715376657, 0.8148024746671689, 0.0]
1 [0.5797386715376657, 0.0, 0.8148024746671689]
dtype: object
To return the *feature_names*:
>>> import texthero as hero
>>> import pandas as pd
>>> s = pd.Series(["Sentence one", "Sentence two"])
>>> s = pd.Series(["Hi Bye", "Test Bye Bye"])
>>> s = hero.tokenize(s)
>>> hero.tfidf(s, return_feature_names=True)
(0 [0.5797386715376657, 0.8148024746671689, 0.0]
1 [0.5797386715376657, 0.0, 0.8148024746671689]
dtype: object, ['Sentence', 'one', 'two'])
(document
0 [1.0, 1.4054651081081644, 0.0]
1 [2.0, 0.0, 1.4054651081081644]
dtype: object, ['Bye', 'Hi', 'Test'])
"""

# TODO. In docstring show formula to compute TF-IDF and also avoid using sk-learn if possible.

# Check if input is tokenized. Else, print warning and tokenize.
if not isinstance(s.iloc[0], list):
warnings.warn(_not_tokenized_warning_message, DeprecationWarning)
@@ -143,15 +226,35 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, return_feature_names=False)
use_idf=True,
max_features=max_features,
min_df=min_df,
max_df=max_df,
tokenizer=lambda x: x,
preprocessor=lambda x: x,
norm=None, # Disable l1/l2 normalization.
)

tfidf_vectors_csr = tfidf.fit_transform(s)

# Result from sklearn is in Compressed Sparse Row format.
# Pandas Sparse Series can only be initialized from Coordinate format.
tfidf_vectors_coo = coo_matrix(tfidf_vectors_csr)
s_out = pd.Series.sparse.from_coo(tfidf_vectors_coo)

# Map word index to word name and keep original index of documents.
feature_names = tfidf.get_feature_names()
s_out.index = s_out.index.map(lambda x: (s.index[x[0]], feature_names[x[1]]))

s_out.rename_axis(["document", "word"], inplace=True)

# NOTE: Currently: still convert to flat series instead of representation series.
# Will change to return representation series directly in Version 2.
s_out = representation_series_to_flat_series(
s_out, fill_missing_with=0.0, index=s.index
)
s = pd.Series(tfidf.fit_transform(s).toarray().tolist(), index=s.index)

if return_feature_names:
return (s, tfidf.get_feature_names())
return s_out, feature_names
else:
return s
return s_out


"""
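
For context, the conversion path the rewritten tfidf takes (sklearn CSR matrix to COO matrix to sparse multi-indexed Series to flat Series) can be sketched standalone as below; the variable names are illustrative only and not part of the library:

import pandas as pd
from scipy.sparse import coo_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

s = pd.Series([["Hi", "Bye"], ["Test", "Bye", "Bye"]])

identity = lambda x: x
vectorizer = TfidfVectorizer(
    use_idf=True, norm=None, tokenizer=identity, preprocessor=identity
)

csr = vectorizer.fit_transform(s)          # Compressed Sparse Row matrix
coo = coo_matrix(csr)                      # Coordinate format
s_sparse = pd.Series.sparse.from_coo(coo)  # MultiIndex of (row, column)

# Replace integer row/column positions with document index and word names.
words = vectorizer.get_feature_names()     # get_feature_names_out() in newer sklearn
s_sparse.index = s_sparse.index.map(lambda x: (s.index[x[0]], words[x[1]]))
s_sparse.rename_axis(["document", "word"], inplace=True)

# Flatten to one list of scores per document, filling missing words with 0.0.
flat = s_sparse.unstack(fill_value=0.0).reindex(s.index, fill_value=0.0)
flat = pd.Series(flat.values.tolist(), index=flat.index).rename_axis("document")
print(flat)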
