Add ClusterSeries to Hero Series Types #170

Open · wants to merge 17 commits into master
Changes from 1 commit
implemented correct sparse support
*missing: adapting the tests to the new types


Co-authored-by: Henri Froese <[email protected]>
mk2510 and henrifroese committed Aug 19, 2020
commit 19c52de3f5ae6a1a01e4262dca00ea5177718311
12 changes: 6 additions & 6 deletions tests/test_representation.py
@@ -70,7 +70,7 @@ def _get_multiindex_for_tokenized_output(first_level_name):
[[1, 0, 0, 1, 2], [0, 2, 1, 0, 1]],
index=s_tokenized_output_index,
columns=_get_multiindex_for_tokenized_output("count"),
- ).astype("Sparse"),
+ ).astype("Sparse[int64, 0]"),
],
[
"term_frequency",
@@ -108,7 +108,7 @@ def _get_multiindex_for_tokenized_output(first_level_name):
[2, 1],
index=s_tokenized_output_index,
columns=pd.MultiIndex.from_tuples([("count", "Test")]),
- ).astype("Sparse"),
+ ).astype("Sparse[int64, 0]"),
],
[
"term_frequency",
@@ -123,7 +123,7 @@ def _get_multiindex_for_tokenized_output(first_level_name):
"tfidf",
representation.tfidf,
pd.DataFrame(
- [2.0, 1.0],
+ [2, 1],
index=s_tokenized_output_index,
columns=pd.MultiIndex.from_tuples([("tfidf", "Test")]),
).astype("Sparse"),
@@ -146,20 +146,20 @@ class AbstractRepresentationTest(PandasTestCase):
def test_vectorization_simple(self, name, test_function, correct_output):
s_true = correct_output
result_s = test_function(s_tokenized)
- pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True)
+ pd.testing.assert_frame_equal(s_true, result_s, check_less_precise=True, check_dtype=False)

@parameterized.expand(test_cases_vectorization)
def test_vectorization_noncontinuous_index_kept(
self, name, test_function, correct_output=None
):
result_s = test_function(s_tokenized_with_noncontinuous_index)
- pd.testing.assert_series_equal(s_tokenized_output_index_noncontinous, result_s)
+ pd.testing.assert_frame_equal(s_tokenized_output_index_noncontinous, result_s.index, check_dtype=False)

@parameterized.expand(test_cases_vectorization_min_df)
def test_vectorization_min_df(self, name, test_function, correct_output):
s_true = correct_output
result_s = test_function(s_tokenized, min_df=2)
- pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True)
+ pd.testing.assert_frame_equal(s_true, result_s, check_less_precise=True, check_dtype=False)

@parameterized.expand(test_cases_vectorization)
def test_vectorization_not_tokenized_yet_warning(self, name, test_function, *args):
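Context for the dtype change in this file: a bare `"Sparse"` resolves to pandas' default `SparseDtype`, which stores float64 with a NaN fill value, while `"Sparse[int64, 0]"` pins integer storage with 0 as the fill value, matching what `count()` emits. A minimal standalone sketch of the difference (assumes pandas >= 1.0; not part of the diff):

```python
import pandas as pd

df = pd.DataFrame([[1, 0, 2], [0, 3, 1]])

# A bare "Sparse" falls back to the default SparseDtype: float64, NaN fill.
print(df.astype("Sparse").dtypes.iloc[0])            # Sparse[float64, nan]

# "Sparse[int64, 0]" keeps integer storage and treats 0 as the fill value,
# matching the expected count() output asserted in the tests above.
print(df.astype("Sparse[int64, 0]").dtypes.iloc[0])  # Sparse[int64, 0]
```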
59 changes: 34 additions & 25 deletions texthero/representation.py
@@ -101,9 +101,12 @@ def count(
Sentence one two
0 1 1 0
1 1 0 1
- # FIXME columns pandas doctest
See Also
--------

+ # FIXME columns pandas doctest

Document Term DataFrame: TODO add tutorial link
"""
# TODO. Can be rewritten without sklearn.
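The doctest in this hunk renders a DocumentTermDF: a DataFrame whose MultiIndex columns pair the representation name ("count") with each vocabulary term, holding sparse values. Since the doctest columns are still flagged FIXME, here is a hand-built sketch of the intended shape (illustrative only, not the function's verified output):

```python
import pandas as pd

# Stand-in for count() on ["Sentence one", "Sentence two"]: the first
# column level is the representation name, the second the vocabulary.
cols = pd.MultiIndex.from_tuples(
    [("count", "Sentence"), ("count", "one"), ("count", "two")]
)
df = pd.DataFrame([[1, 1, 0], [1, 0, 1]], columns=cols).astype("Sparse[int64, 0]")

print(df)                 # two rows, columns grouped under the "count" level
print(df.dtypes.iloc[0])  # Sparse[int64, 0]
```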
@@ -375,8 +378,11 @@ def pca(
values = list(s)

return pd.Series(pca.fit_transform(values).tolist(), index=s.index)


+ # FIXME: merge master again


def nmf(
s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None
) -> pd.Series:
@@ -437,11 +443,12 @@ def nmf(
nmf = NMF(n_components=n_components, init="random", random_state=random_state,)

if _check_is_valid_DocumentTermDF(s):
- values = s.sparse.to_coo()
+ s_coo = s.sparse.to_coo()
+ s_for_vectorization = s_coo.astype("float64")
else:
- values = list(s)
+ s_for_vectorization = list(s)

- return pd.Series(nmf.fit_transform(values).tolist(), index=s.index)
+ return pd.Series(nmf.fit_transform(s_for_vectorization).tolist(), index=s.index)
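This conversion is the heart of the commit, and the same three-line pattern recurs in tsne, kmeans, dbscan, and normalize below: a DocumentTermDF (an all-sparse DataFrame) is handed to sklearn as a scipy COO matrix cast to float64, so the data is never densified. A rough standalone sketch of that path (the input frame is an illustrative stand-in for count()/tfidf output):

```python
import pandas as pd
from sklearn.decomposition import NMF

# Stand-in for a DocumentTermDF: every column sparse, integer counts.
df = pd.DataFrame([[1, 0, 2], [0, 3, 1], [1, 1, 0]]).astype("Sparse[int64, 0]")

# DataFrame.sparse.to_coo() requires all columns to be sparse and returns
# a scipy.sparse.coo_matrix; casting to float64 gives sklearn the dtype
# it expects without materializing a dense array.
s_coo = df.sparse.to_coo()
s_for_vectorization = s_coo.astype("float64")

nmf = NMF(n_components=2, init="random", random_state=42)
embedding = nmf.fit_transform(s_for_vectorization)
print(embedding.shape)  # (3, 2)
```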


def tsne(
@@ -535,11 +542,12 @@ def tsne(
)

if _check_is_valid_DocumentTermDF(s):
- values = s.sparse.to_coo()
+ s_coo = s.sparse.to_coo()
+ s_for_vectorization = s_coo.astype("float64")
else:
- values = list(s)
+ s_for_vectorization = list(s)

- return pd.Series(tsne.fit_transform(values).tolist(), index=s.index)
+ return pd.Series(tsne.fit_transform(s_for_vectorization).tolist(), index=s.index)


"""
@@ -624,9 +632,10 @@ def kmeans(
"""

if _check_is_valid_DocumentTermDF(s):
- vectors = s.sparse.to_coo()
+ s_coo = s.sparse.to_coo()
+ s_for_vectorization = s_coo.astype("float64")
else:
- vectors = list(s)
+ s_for_vectorization = list(s)

kmeans = KMeans(
n_clusters=n_clusters,
@@ -635,8 +644,8 @@
random_state=random_state,
copy_x=True,
algorithm=algorithm,
- ).fit(vectors)
- return pd.Series(kmeans.predict(vectors), index=s.index).astype("category")
+ ).fit(s_for_vectorization)
+ return pd.Series(kmeans.predict(s_for_vectorization), index=s.index).astype("category")


def dbscan(
@@ -727,9 +736,10 @@ def dbscan(
"""

if _check_is_valid_DocumentTermDF(s):
- vectors = s.sparse.to_coo()
+ s_coo = s.sparse.to_coo()
+ s_for_vectorization = s_coo.astype("float64")
else:
- vectors = list(s)
+ s_for_vectorization = list(s)

return pd.Series(
DBSCAN(
@@ -739,7 +749,7 @@
metric_params=metric_params,
leaf_size=leaf_size,
n_jobs=n_jobs,
- ).fit_predict(vectors),
+ ).fit_predict(s_for_vectorization),
index=s.index,
).astype("category")
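As with kmeans above, the raw sklearn labels are wrapped in a categorical Series aligned with the input index; for DBSCAN, the label -1 marks noise points. A toy illustration of that return path (data invented for the example):

```python
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN

X = np.array([[0.0, 1.0], [0.1, 0.9], [10.0, 10.0]])

# Mirrors the return statement above: fit_predict, then wrap the labels
# in a categorical Series keyed by the original index.
labels = DBSCAN(eps=0.5, min_samples=2).fit_predict(X)
s_out = pd.Series(labels, index=pd.RangeIndex(len(X))).astype("category")
print(s_out.tolist())  # [0, 0, -1]; -1 is DBSCAN's noise label
```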

@@ -877,17 +887,15 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series:
--------
>>> import texthero as hero
>>> import pandas as pd
- >>> idx = pd.MultiIndex.from_tuples(
- ...     [(0, "a"), (0, "b"), (1, "c"), (1, "d")], names=("document", "word")
- ... )
- >>> s = pd.Series([1, 2, 3, 4], index=idx)
+ >>> col = pd.MultiIndex.from_tuples([(0, "a"), (0, "b"), (1, "c"), (1, "d")])
+ >>> s = pd.DataFrame([[1, 2, 3, 4], [4, 2, 7, 5], [2, 2, 3, 5], [1, 2, 9, 8]], columns=col).astype("Sparse")
>>> hero.normalize(s, norm="max")
- document  word
- 0         a     0.50
-           b     1.00
- 1         c     0.75
-           d     1.00
- dtype: Sparse[float64, nan]
+           0               1
+           a         b     c         d
+ 0  0.250000  0.500000  0.75  1.000000
+ 1  0.571429  0.285714  1.00  0.714286
+ 2  0.400000  0.400000  0.60  1.000000
+ 3  0.111111  0.222222  1.00  0.888889


See Also
@@ -900,7 +908,8 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series:
isDocumentTermDF = _check_is_valid_DocumentTermDF(s)

if isDocumentTermDF:
- s_for_vectorization = s.sparse.to_coo()
+ s_coo = s.sparse.to_coo()
+ s_for_vectorization = s_coo.astype("float64")
else:
s_for_vectorization = list(s)

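As a sanity check on the updated doctest values: assuming hero.normalize delegates row scaling to sklearn's normalize (suggested by the surrounding sklearn usage, not shown in this hunk), the "max" norm divides each row by its largest absolute entry:

```python
import numpy as np
from sklearn.preprocessing import normalize

X = np.array([[1, 2, 3, 4], [4, 2, 7, 5]], dtype="float64")

# Each row divided by its own maximum; matches rows 0 and 1 of the doctest.
print(normalize(X, norm="max"))
# [[0.25       0.5        0.75       1.        ]
#  [0.57142857 0.28571429 1.         0.71428571]]
```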