Skip to content

Commit

Permalink
Add normalization to representation module. (#134)
Browse files Browse the repository at this point in the history
* Add normalization to representation module.

- Function `normalize` that takes Series and norm (supports l1, l2, max)
- works on both representation series and flat / "normal" series
- after #132 is merged, will add more tests for normalize

Co-authored-by: Maximilian Krahn <[email protected]>

* Remove to_dense()

* Remove support for flat input series

Co-authored-by: Henri Froese <[email protected]>
Co-authored-by: Maximilian Krahn <[email protected]>
  • Loading branch information
3 people committed Aug 5, 2020
1 parent f66f23c commit d21b80e
Showing 1 changed file with 72 additions and 0 deletions.
72 changes: 72 additions & 0 deletions texthero/representation.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from sklearn.decomposition import PCA, NMF
from sklearn.cluster import KMeans, DBSCAN, MeanShift
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize as sklearn_normalize
from scipy.sparse import coo_matrix

from typing import Optional, Union, Any
Expand Down Expand Up @@ -894,3 +895,74 @@ def meanshift(
"""

# TODO.

"""
Normalization.
"""


def normalize(s: pd.Series, norm: str = "l2") -> pd.Series:
    """
    Normalize every cell in a Pandas Series.

    Input has to be a Representation Series, i.e. a Series indexed by a
    two-level MultiIndex. The Series is converted to a sparse matrix,
    the matrix is passed to scikit-learn's ``normalize`` with the
    requested norm, and the result is returned as a sparse Series that
    carries the index of the input.

    Parameters
    ----------
    s: Pandas Series
        Representation Series. A non-sparse Series is converted with
        ``astype("Sparse")`` first; the caller's object is not modified.

    norm: str, default to "l2"
        One of "l1", "l2", or "max". The norm that is used.

    Returns
    -------
    Pandas Series
        Sparse Series holding the normalized values, indexed like `s`.

    Raises
    ------
    TypeError
        If the index of `s` is not a MultiIndex with exactly two levels.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> idx = pd.MultiIndex.from_tuples(
    ...     [(0, "a"), (0, "b"), (1, "c"), (1, "d")], names=("document", "word")
    ... )
    >>> s = pd.Series([1, 2, 3, 4], index=idx)
    >>> hero.normalize(s, norm="max")
    document  word
    0         a       0.50
              b       1.00
    1         c       0.75
              d       1.00
    dtype: Sparse[float64, nan]

    See Also
    --------
    Representation Series link TODO add link to tutorial

    `Norm on Wikipedia <https://en.wikipedia.org/wiki/Norm_(mathematics)>`_

    """

    is_valid_representation = (
        isinstance(s.index, pd.MultiIndex) and s.index.nlevels == 2
    )

    if not is_valid_representation:
        raise TypeError(
            "The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex. The given Pandas Series does not appear to have a two-level MultiIndex."
        )
    # TODO after merging representation: use _check_is_valid_representation instead

    # The ``.sparse`` accessor only exists on sparse-typed Series, so convert
    # dense input first. ``isinstance(..., pd.SparseDtype)`` is the
    # replacement pandas recommends for the deprecated
    # ``pd.api.types.is_sparse``.
    if not isinstance(s.dtype, pd.SparseDtype):
        s = s.astype("Sparse")

    # ``to_coo`` returns (matrix, row labels, column labels); only the matrix
    # is needed here.
    s_coo_matrix = s.sparse.to_coo()[0]

    result = sklearn_normalize(s_coo_matrix, norm=norm)  # Can handle sparse input.

    result_coo = coo_matrix(result)
    s_result = pd.Series.sparse.from_coo(result_coo)

    # Re-attach the caller's index by position.
    # NOTE(review): this presumes ``from_coo`` yields exactly one entry per
    # input cell, in the same order as the input -- confirm for inputs that
    # are not grouped by document or that contain zero-valued cells.
    s_result.index = s.index

    return s_result

0 comments on commit d21b80e

Please sign in to comment.