Commit

…r-pandas into issue-209-static-backup

PokkeFe committed Jul 7, 2021
2 parents e01f888 + 1a002c0 commit 9e518d7
Showing 4 changed files with 124 additions and 15 deletions.
49 changes: 44 additions & 5 deletions text_extensions_for_pandas/array/span.py
@@ -30,7 +30,14 @@
 from memoized_property import memoized_property
 # noinspection PyProtectedMember
 from pandas.api.types import is_bool_dtype
-from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
+from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
+try:
+    from pandas.core.dtypes.generic import ABCIndexClass
+except ImportError:
+    # ABCIndexClass changed to ABCIndex in Pandas 1.3
+    # noinspection PyUnresolvedReferences
+    from pandas.core.dtypes.generic import ABCIndex
+    ABCIndexClass = ABCIndex
 from pandas.core.indexers import check_array_indexer
 
 # Internal imports
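The try/except shim above is the heart of the Pandas 1.3 fix and recurs in three of the four changed files. A minimal, runnable sketch of the same pattern in isolation; the `is_index` helper is illustrative only, not part of the library:

try:
    from pandas.core.dtypes.generic import ABCIndexClass  # Pandas < 1.3
except ImportError:
    # Renamed to ABCIndex in Pandas 1.3; keep one alias for the rest of the module
    from pandas.core.dtypes.generic import ABCIndex as ABCIndexClass

import pandas as pd

def is_index(obj) -> bool:
    # The same isinstance() check works on both sides of the rename
    return isinstance(obj, ABCIndexClass)

print(is_index(pd.Index([1, 2, 3])))   # True
print(is_index(pd.Series([1, 2, 3])))  # False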
@@ -319,6 +326,7 @@ def __from_arrow__(self, extension_array):
         SpanArray.
         """
         from text_extensions_for_pandas.array.arrow_conversion import arrow_to_span
+
         return arrow_to_span(extension_array)
 

@@ -862,15 +870,46 @@ def is_single_document(self) -> bool:
         :return: True if there is at least one span in the array and every span is
             over the same target text.
         """
+        # NOTE: For legacy reasons, this method is currently inconsistent with the method
+        # by the same name in TokenSpanArray. TokenSpanArray.is_single_document() returns
+        # True on an empty array, while SpanArray.is_single_document() returns False.
         if len(self) == 0:
             # If there are zero spans, then there are zero documents.
             return False
         elif self._string_table.num_things == 1:
-            return True
+            # Only one string; make sure that this array has a non-null value
+            for b in self._begins:
+                if b != Span.NULL_OFFSET_VALUE:
+                    return True
+            # All nulls --> zero spans
+            return False
         else:
-            # More than one string in the StringTable and at least one span. Check whether
-            # every span has the same text ID.
-            return not np.any(self._text_ids[0] != self._text_ids)
+            # More than one string in the StringTable and at least one span.
+            return self._is_single_document_slow_path()
+
+    def _is_single_document_slow_path(self) -> bool:
+        # Slow but reliable way to test whether everything in this SpanArray is from
+        # the same document.
+        # Checks whether every span has the same text ID.
+        # Ignores NAs when making this comparison.
+
+        # First we need to find the first text ID that is not NA
+        first_text_id = None
+        for b, t in zip(self._begins, self._text_ids):
+            if b != Span.NULL_OFFSET_VALUE:
+                first_text_id = t
+                break
+        if first_text_id is None:
+            # Special case: All NAs --> Zero documents
+            return False
+        return not np.any(
+            np.logical_and(
+                # Row is not null...
+                np.not_equal(self._begins, Span.NULL_OFFSET_VALUE),
+                # ...and is over a different text than the first row's text ID
+                np.not_equal(self._text_ids, first_text_id),
+            )
+        )
 
     def split_by_document(self) -> List["SpanArray"]:
         """
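A hedged illustration of the slow path above, on toy NumPy arrays: rows whose begin offset equals the null sentinel are ignored, and the remaining rows must all share the first non-null row's text ID. Everything here is local to the sketch; `NULL_OFFSET_VALUE = -1` mirrors the library's `Span.NULL_OFFSET_VALUE`, assumed to be -1.

import numpy as np

NULL_OFFSET_VALUE = -1  # stand-in for Span.NULL_OFFSET_VALUE

def single_document(begins: np.ndarray, text_ids: np.ndarray) -> bool:
    non_null = np.not_equal(begins, NULL_OFFSET_VALUE)
    if not non_null.any():
        return False  # all NAs --> zero documents (SpanArray semantics)
    first_text_id = text_ids[non_null][0]
    # True unless some non-null row is over a different text
    return not np.any(np.logical_and(non_null, np.not_equal(text_ids, first_text_id)))

print(single_document(np.array([0, -1, 5]), np.array([7, 3, 7])))  # True: NA row ignored
print(single_document(np.array([0, 2, 5]), np.array([7, 3, 7])))   # False: two text IDs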
34 changes: 30 additions & 4 deletions text_extensions_for_pandas/array/tensor.py
@@ -29,7 +29,14 @@
 import numpy as np
 import pandas as pd
 from pandas.compat import set_function_name
-from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
+from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
+try:
+    from pandas.core.dtypes.generic import ABCIndexClass
+except ImportError:
+    # ABCIndexClass changed to ABCIndex in Pandas 1.3
+    # noinspection PyUnresolvedReferences
+    from pandas.core.dtypes.generic import ABCIndex
+    ABCIndexClass = ABCIndex
 from pandas.core.indexers import check_array_indexer, validate_indices
 
 """ Begin Patching of ExtensionArrayFormatter """
@@ -342,7 +349,12 @@ def isna(self) -> np.array:
         for information about this method.
         """
         if self._tensor.dtype.type is np.object_:
-            return self._tensor == None
+            # Avoid comparing with __eq__ because the elements of the tensor may do
+            # something funny with that operation.
+            result_list = [
+                self._tensor[i] is None for i in range(len(self))
+            ]
+            return np.array(result_list, dtype=bool)
         elif self._tensor.dtype.type is np.str_:
             return np.all(self._tensor == "", axis=-1)
         else:
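Why the new isna() path avoids `self._tensor == None`: with object dtype, `==` dispatches to each stored element, and an ndarray element answers with an elementwise boolean array instead of a single bool, so the comparison can yield ragged output or raise. A small sketch of the identity-based mask on stand-in data:

import numpy as np

cells = np.empty(3, dtype=object)
cells[0] = np.ones(2)
cells[1] = None
cells[2] = np.zeros(2)

# Identity test per element, as in the new code path; `is None` never
# triggers the elements' __eq__
mask = np.array([cells[i] is None for i in range(len(cells))], dtype=bool)
print(mask)  # [False  True False]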
@@ -475,6 +487,11 @@ def astype(self, dtype, copy=True):
                 return dtype.construct_array_type()._from_sequence(values, copy=False)
             else:
                 return values
+        elif pd.api.types.is_object_dtype(dtype):
+            # Interpret astype(object) as "cast to an array of numpy arrays"
+            values = np.empty(len(self), dtype=object)
+            for i in range(len(self)):
+                values[i] = self._tensor[i]
         else:
             values = self._tensor.astype(dtype, copy=copy)
         return values
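What the new astype(object) branch produces, sketched with plain NumPy: a 1-D object array whose cells are the per-row tensors, rather than one (n, ...) numeric block. `block` below is a stand-in for the backing `_tensor`:

import numpy as np

block = np.arange(6).reshape(3, 2)         # stand-in backing tensor, shape (3, 2)
values = np.empty(len(block), dtype=object)
for i in range(len(block)):
    values[i] = block[i]                   # each cell holds one row's ndarray

print(values.shape)  # (3,)
print(values[0])     # [0 1]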
@@ -516,15 +533,24 @@ def __getitem__(self, item) -> Union["TensorArray", "TensorElement"]:
         See docstring in `ExtensionArray` class in `pandas/core/arrays/base.py`
         for information about this method.
         """
-        # Return scalar if single value is selected, a TensorElement for single array element,
-        # or TensorArray for slice
+        # Return scalar if single value is selected, a TensorElement for single array
+        # element, or TensorArray for slice
         if isinstance(item, int):
             value = self._tensor[item]
             if np.isscalar(value):
                 return value
             else:
                 return TensorElement(value)
         else:
+            # BEGIN workaround for Pandas issue #42430
+            if (pd.__version__ == "1.3.0" and isinstance(item, tuple) and len(item) > 1
+                    and item[0] == Ellipsis):
+                if len(item) > 2:
+                    # Hopefully this case is not possible, but can't be sure
+                    raise ValueError(f"Workaround Pandas issue #42430 not implemented "
+                                     f"for tuple length > 2")
+                item = item[1]
+            # END workaround for issue #42430
             if isinstance(item, TensorArray):
                 item = np.asarray(item)
             item = check_array_indexer(self, item)
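The indexer shape that the workaround above unwraps, shown in isolation: under Pandas 1.3.0 (issue #42430), `__getitem__` could receive an `(Ellipsis, key)` tuple instead of the bare key. `item` below is a stand-in for such an argument:

item = (Ellipsis, [0, 2])  # what Pandas 1.3.0 could pass instead of [0, 2]

if isinstance(item, tuple) and len(item) > 1 and item[0] == Ellipsis:
    if len(item) > 2:
        # Mirrors the guard in the diff: longer tuples are not handled
        raise ValueError("workaround not implemented for tuple length > 2")
    item = item[1]

print(item)  # [0, 2]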
17 changes: 15 additions & 2 deletions text_extensions_for_pandas/array/test_tensor.py
@@ -1015,7 +1015,11 @@ def test_reindex(self, data, na_value):
 
 
 class TestPandasSetitem(base.BaseSetitemTests):
-    pass
+    # Temporarily disabled until Pandas issue #42437 is fixed
+    # See Text Extensions for Pandas issue #221 for a workaround.
+    @pytest.mark.skip(reason="See Pandas issue #42437")
+    def test_setitem_series(self, data, full_indexer):
+        super().test_setitem_series(data, full_indexer)
 
 
 class TestPandasMissing(base.BaseMissingTests):
@@ -1047,15 +1051,24 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
         s = pd.Series(data[1:])  # Avoid zero values for div
         self.check_opname(s, op_name, s.iloc[0], exc=self.series_scalar_exc)
 
+    def test_arith_frame_with_scalar(self, data, all_arithmetic_operators):
+        """ Override to prevent div by zero warning."""
+        # frame & scalar
+        op_name = all_arithmetic_operators
+        df = pd.DataFrame({"A": data[1:]})  # Avoid zero values for div
+        self.check_opname(df, op_name, data[0], exc=self.frame_scalar_exc)
+
     def test_arith_series_with_array(self, data, all_arithmetic_operators):
-        """ Override because creates Series from list of TensorElements as dtype=object."""
+        """ Override because creates Series from list of TensorElements as
+        dtype=object."""
         # ndarray & other series
         op_name = all_arithmetic_operators
         s = pd.Series(data[1:])  # Avoid zero values for div
         self.check_opname(
             s, op_name, pd.Series([s.iloc[0]] * len(s), dtype=TensorDtype()), exc=self.series_array_exc
         )
 
     @pytest.mark.skip(reason="TensorArray does not error on ops")
     def test_error(self, data, all_arithmetic_operators):
         # other specific errors tested in the TensorArray specific tests
39 changes: 35 additions & 4 deletions text_extensions_for_pandas/array/token_span.py
@@ -29,7 +29,15 @@
 from memoized_property import memoized_property
 # noinspection PyProtectedMember
 from pandas.api.types import is_bool_dtype
-from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
+from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
+try:
+    from pandas.core.dtypes.generic import ABCIndexClass
+except ImportError:
+    # ABCIndexClass changed to ABCIndex in Pandas 1.3
+    # noinspection PyUnresolvedReferences
+    from pandas.core.dtypes.generic import ABCIndex
+    ABCIndexClass = ABCIndex
+
 from pandas.core.indexers import check_array_indexer
 
 from text_extensions_for_pandas.array.span import (
@@ -130,9 +138,13 @@ def __init__(self, tokens: Any, begin_token: int, end_token: int):
             )
         if end_token > len(tokens) + 1:
             raise ValueError(
-                f"End token offset of {begin_token} larger than "
+                f"End token offset of {end_token} larger than "
                 f"number of tokens + 1 ({len(tokens)} + 1)"
             )
+        if len(tokens) == 0 and begin_token != TokenSpan.NULL_OFFSET_VALUE:
+            raise ValueError(
+                f"Tried to create a non-null TokenSpan over an empty list of tokens."
+            )
         if TokenSpan.NULL_OFFSET_VALUE == begin_token:
             if TokenSpan.NULL_OFFSET_VALUE != end_token:
                 raise ValueError(
@@ -471,6 +483,7 @@ def __setitem__(self, key: Union[int, np.ndarray, list, slice], value: Any) -> None:
                 ((isinstance(value, Sequence) and isinstance(value[0], TokenSpan)) or
                  isinstance(value, TokenSpanArray))):
             for k, v in zip(key, value):
+                self._tokens[k] = v.tokens
                 self._begin_tokens[k] = v.begin_token
                 self._end_tokens[k] = v.end_token
         else:
@@ -607,7 +620,8 @@ def isna(self) -> np.array:
         See docstring in `ExtensionArray` class in `pandas/core/arrays/base.py`
         for information about this method.
         """
-        return self.nulls_mask
+        # isna() of an ExtensionArray must return a copy that the caller can scribble on.
+        return self.nulls_mask.copy()
 
     def copy(self) -> "TokenSpanArray":
         """
@@ -959,14 +973,31 @@ def is_single_document(self) -> bool:
         :return: True if every span in this array is over the same target text
             or if there are zero spans in this array.
         """
+        # NOTE: For legacy reasons, this method is currently inconsistent with the method
+        # by the same name in SpanArray. TokenSpanArray.is_single_document() returns
+        # True on an empty array, while SpanArray.is_single_document() returns False.
         if len(self) == 0:
             # If there are zero spans, we consider there to be one document with the
             # document text being whatever is the document text for our tokens.
             return True
         else:
-            # More than one tokenization and at least one span. Check whether
-            # every span has the same text.
-            return not np.any(self.target_text[0] != self.target_text)
+            # Find the first text ID that is not NA
+            first_text_id = None
+            for b, t in zip(self._begins, self._text_ids):
+                if b != Span.NULL_OFFSET_VALUE:
+                    first_text_id = t
+                    break
+            if first_text_id is None:
+                # Special case: All NAs --> Zero documents
+                return True
+            return not np.any(np.logical_and(
+                # Row is not null...
+                np.not_equal(self._begins, Span.NULL_OFFSET_VALUE),
+                # ...and is over a different text than the first row's text ID
+                np.not_equal(self._text_ids, first_text_id)))
 
     def split_by_document(self) -> List["SpanArray"]:
         """
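The asymmetry flagged in the NOTE comments, reduced to its observable behavior; stand-in functions for illustration, not the library API:

def span_array_empty_case() -> bool:
    # SpanArray.is_single_document(): zero spans --> zero documents
    return False

def token_span_array_empty_case() -> bool:
    # TokenSpanArray.is_single_document(): an empty array still counts as one
    # document, the one its tokens are over
    return True

assert span_array_empty_case() != token_span_array_empty_case()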
