Commit

…r-pandas into issue-209-static-backup

PokkeFe committed Jul 7, 2021
2 parents e01f888 + 1a002c0 commit 9e518d7
Showing 4 changed files with 124 additions and 15 deletions.
49 changes: 44 additions & 5 deletions text_extensions_for_pandas/array/span.py
@@ -30,7 +30,14 @@
 from memoized_property import memoized_property
 # noinspection PyProtectedMember
 from pandas.api.types import is_bool_dtype
-from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
+from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
+try:
+    from pandas.core.dtypes.generic import ABCIndexClass
+except ImportError:
+    # ABCIndexClass changed to ABCIndex in Pandas 1.3
+    # noinspection PyUnresolvedReferences
+    from pandas.core.dtypes.generic import ABCIndex
+    ABCIndexClass = ABCIndex
 from pandas.core.indexers import check_array_indexer
 
 # Internal imports
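The try/except shim above is the heart of the Pandas 1.3 fix and recurs in three of the four changed files. A minimal, runnable sketch of the same pattern in isolation; the `is_index` helper is illustrative only, not part of the library:

try:
    from pandas.core.dtypes.generic import ABCIndexClass  # Pandas < 1.3
except ImportError:
    # Renamed to ABCIndex in Pandas 1.3; keep one alias for the rest of the module
    from pandas.core.dtypes.generic import ABCIndex as ABCIndexClass

import pandas as pd

def is_index(obj) -> bool:
    # The same isinstance() check works on both sides of the rename
    return isinstance(obj, ABCIndexClass)

print(is_index(pd.Index([1, 2, 3])))   # True
print(is_index(pd.Series([1, 2, 3])))  # False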
@@ -319,6 +326,7 @@ def __from_arrow__(self, extension_array):
         SpanArray.
         """
         from text_extensions_for_pandas.array.arrow_conversion import arrow_to_span
+
         return arrow_to_span(extension_array)
 

@@ -862,15 +870,46 @@ def is_single_document(self) -> bool:
         :return: True if there is at least one span in the array and every span is
             over the same target text.
         """
+        # NOTE: For legacy reasons, this method is currently inconsistent with the method
+        # by the same name in TokenSpanArray. TokenSpanArray.is_single_document() returns
+        # True on an empty array, while SpanArray.is_single_document() returns False.
         if len(self) == 0:
             # If there are zero spans, then there are zero documents.
             return False
         elif self._string_table.num_things == 1:
-            return True
+            # Only one string; make sure that this array has a non-null value
+            for b in self._begins:
+                if b != Span.NULL_OFFSET_VALUE:
+                    return True
+            # All nulls --> zero spans
+            return False
         else:
-            # More than one string in the StringTable and at least one span. Check whether
-            # every span has the same text ID.
-            return not np.any(self._text_ids[0] != self._text_ids)
+            # More than one string in the StringTable and at least one span.
+            return self._is_single_document_slow_path()
+
+    def _is_single_document_slow_path(self) -> bool:
+        # Slow but reliable way to test whether everything in this SpanArray is from
+        # the same document.
+        # Checks whether every span has the same text ID.
+        # Ignores NAs when making this comparison.
+
+        # First we need to find the first text ID that is not NA
+        first_text_id = None
+        for b, t in zip(self._begins, self._text_ids):
+            if b != Span.NULL_OFFSET_VALUE:
+                first_text_id = t
+                break
+        if first_text_id is None:
+            # Special case: All NAs --> Zero documents
+            return False
+        return not np.any(
+            np.logical_and(
+                # Row is not null...
+                np.not_equal(self._begins, Span.NULL_OFFSET_VALUE),
+                # ...and is over a different text than the first row's text ID
+                np.not_equal(self._text_ids, first_text_id),
+            )
+        )
 
     def split_by_document(self) -> List["SpanArray"]:
         """
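A hedged illustration of the slow path above, on toy NumPy arrays: rows whose begin offset equals the null sentinel are ignored, and the remaining rows must all share the first non-null row's text ID. Everything here is local to the sketch; `NULL_OFFSET_VALUE = -1` mirrors the library's `Span.NULL_OFFSET_VALUE`, assumed to be -1.

import numpy as np

NULL_OFFSET_VALUE = -1  # stand-in for Span.NULL_OFFSET_VALUE

def single_document(begins: np.ndarray, text_ids: np.ndarray) -> bool:
    non_null = np.not_equal(begins, NULL_OFFSET_VALUE)
    if not non_null.any():
        return False  # all NAs --> zero documents (SpanArray semantics)
    first_text_id = text_ids[non_null][0]
    # True unless some non-null row is over a different text
    return not np.any(np.logical_and(non_null, np.not_equal(text_ids, first_text_id)))

print(single_document(np.array([0, -1, 5]), np.array([7, 3, 7])))  # True: NA row ignored
print(single_document(np.array([0, 2, 5]), np.array([7, 3, 7])))   # False: two text IDs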
34 changes: 30 additions & 4 deletions text_extensions_for_pandas/array/tensor.py
@@ -29,7 +29,14 @@
 import numpy as np
 import pandas as pd
 from pandas.compat import set_function_name
-from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
+from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
+try:
+    from pandas.core.dtypes.generic import ABCIndexClass
+except ImportError:
+    # ABCIndexClass changed to ABCIndex in Pandas 1.3
+    # noinspection PyUnresolvedReferences
+    from pandas.core.dtypes.generic import ABCIndex
+    ABCIndexClass = ABCIndex
 from pandas.core.indexers import check_array_indexer, validate_indices
 
 """ Begin Patching of ExtensionArrayFormatter """
@@ -342,7 +349,12 @@ def isna(self) -> np.array:
         for information about this method.
         """
         if self._tensor.dtype.type is np.object_:
-            return self._tensor == None
+            # Avoid comparing with __eq__ because the elements of the tensor may do
+            # something funny with that operation.
+            result_list = [
+                self._tensor[i] is None for i in range(len(self))
+            ]
+            return np.array(result_list, dtype=bool)
         elif self._tensor.dtype.type is np.str_:
             return np.all(self._tensor == "", axis=-1)
         else:
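Why the new isna() path avoids `self._tensor == None`: with object dtype, `==` dispatches to each stored element, and an ndarray element answers with an elementwise boolean array instead of a single bool, so the comparison can yield ragged output or raise. A small sketch of the identity-based mask on stand-in data:

import numpy as np

cells = np.empty(3, dtype=object)
cells[0] = np.ones(2)
cells[1] = None
cells[2] = np.zeros(2)

# Identity test per element, as in the new code path; `is None` never
# triggers the elements' __eq__
mask = np.array([cells[i] is None for i in range(len(cells))], dtype=bool)
print(mask)  # [False  True False]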
@@ -475,6 +487,11 @@ def astype(self, dtype, copy=True):
                 return dtype.construct_array_type()._from_sequence(values, copy=False)
             else:
                 return values
+        elif pd.api.types.is_object_dtype(dtype):
+            # Interpret astype(object) as "cast to an array of numpy arrays"
+            values = np.empty(len(self), dtype=object)
+            for i in range(len(self)):
+                values[i] = self._tensor[i]
         else:
             values = self._tensor.astype(dtype, copy=copy)
         return values
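What the new astype(object) branch produces, sketched with plain NumPy: a 1-D object array whose cells are the per-row tensors, rather than one (n, ...) numeric block. `block` below is a stand-in for the backing `_tensor`:

import numpy as np

block = np.arange(6).reshape(3, 2)         # stand-in backing tensor, shape (3, 2)
values = np.empty(len(block), dtype=object)
for i in range(len(block)):
    values[i] = block[i]                   # each cell holds one row's ndarray

print(values.shape)  # (3,)
print(values[0])     # [0 1]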
@@ -516,15 +533,24 @@ def __getitem__(self, item) -> Union["TensorArray", "TensorElement"]:
         See docstring in `ExtensionArray` class in `pandas/core/arrays/base.py`
         for information about this method.
         """
-        # Return scalar if single value is selected, a TensorElement for single array element,
-        # or TensorArray for slice
+        # Return scalar if single value is selected, a TensorElement for single array
+        # element, or TensorArray for slice
         if isinstance(item, int):
             value = self._tensor[item]
             if np.isscalar(value):
                 return value
             else:
                 return TensorElement(value)
         else:
+            # BEGIN workaround for Pandas issue #42430
+            if (pd.__version__ == "1.3.0" and isinstance(item, tuple) and len(item) > 1
+                    and item[0] == Ellipsis):
+                if len(item) > 2:
+                    # Hopefully this case is not possible, but can't be sure
+                    raise ValueError(f"Workaround Pandas issue #42430 not implemented "
+                                     f"for tuple length > 2")
+                item = item[1]
+            # END workaround for issue #42430
             if isinstance(item, TensorArray):
                 item = np.asarray(item)
             item = check_array_indexer(self, item)
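The indexer shape that the workaround above unwraps, shown in isolation: under Pandas 1.3.0 (issue #42430), `__getitem__` could receive an `(Ellipsis, key)` tuple instead of the bare key. `item` below is a stand-in for such an argument:

item = (Ellipsis, [0, 2])  # what Pandas 1.3.0 could pass instead of [0, 2]

if isinstance(item, tuple) and len(item) > 1 and item[0] == Ellipsis:
    if len(item) > 2:
        # Mirrors the guard in the diff: longer tuples are not handled
        raise ValueError("workaround not implemented for tuple length > 2")
    item = item[1]

print(item)  # [0, 2]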
17 changes: 15 additions & 2 deletions text_extensions_for_pandas/array/test_tensor.py
@@ -1015,7 +1015,11 @@ def test_reindex(self, data, na_value):
 
 
 class TestPandasSetitem(base.BaseSetitemTests):
-    pass
+    # Temporarily disabled until Pandas issue #42437 is fixed
+    # See Text Extensions for Pandas issue #221 for a workaround.
+    @pytest.mark.skip(reason="See Pandas issue #42437")
+    def test_setitem_series(self, data, full_indexer):
+        super().test_setitem_series(data, full_indexer)
 
 
 class TestPandasMissing(base.BaseMissingTests):
@@ -1047,15 +1051,24 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
         s = pd.Series(data[1:])  # Avoid zero values for div
         self.check_opname(s, op_name, s.iloc[0], exc=self.series_scalar_exc)
 
+    def test_arith_frame_with_scalar(self, data, all_arithmetic_operators):
+        """ Override to prevent div by zero warning."""
+        # frame & scalar
+        op_name = all_arithmetic_operators
+        df = pd.DataFrame({"A": data[1:]})  # Avoid zero values for div
+        self.check_opname(df, op_name, data[0], exc=self.frame_scalar_exc)
+
     def test_arith_series_with_array(self, data, all_arithmetic_operators):
-        """ Override because creates Series from list of TensorElements as dtype=object."""
+        """ Override because creates Series from list of TensorElements as
+        dtype=object."""
         # ndarray & other series
         op_name = all_arithmetic_operators
         s = pd.Series(data[1:])  # Avoid zero values for div
         self.check_opname(
             s, op_name, pd.Series([s.iloc[0]] * len(s), dtype=TensorDtype()), exc=self.series_array_exc
         )
 
     @pytest.mark.skip(reason="TensorArray does not error on ops")
     def test_error(self, data, all_arithmetic_operators):
         # other specific errors tested in the TensorArray specific tests
39 changes: 35 additions & 4 deletions text_extensions_for_pandas/array/token_span.py
@@ -29,7 +29,15 @@
 from memoized_property import memoized_property
 # noinspection PyProtectedMember
 from pandas.api.types import is_bool_dtype
-from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
+from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
+try:
+    from pandas.core.dtypes.generic import ABCIndexClass
+except ImportError:
+    # ABCIndexClass changed to ABCIndex in Pandas 1.3
+    # noinspection PyUnresolvedReferences
+    from pandas.core.dtypes.generic import ABCIndex
+    ABCIndexClass = ABCIndex
+
 from pandas.core.indexers import check_array_indexer
 
 from text_extensions_for_pandas.array.span import (
@@ -130,9 +138,13 @@ def __init__(self, tokens: Any, begin_token: int, end_token: int):
             )
         if end_token > len(tokens) + 1:
             raise ValueError(
-                f"End token offset of {begin_token} larger than "
+                f"End token offset of {end_token} larger than "
                 f"number of tokens + 1 ({len(tokens)} + 1)"
             )
+        if len(tokens) == 0 and begin_token != TokenSpan.NULL_OFFSET_VALUE:
+            raise ValueError(
+                f"Tried to create a non-null TokenSpan over an empty list of tokens."
+            )
         if TokenSpan.NULL_OFFSET_VALUE == begin_token:
             if TokenSpan.NULL_OFFSET_VALUE != end_token:
                 raise ValueError(
@@ -471,6 +483,7 @@ def __setitem__(self, key: Union[int, np.ndarray, list, slice], value: Any) -> None:
                 ((isinstance(value, Sequence) and isinstance(value[0], TokenSpan)) or
                  isinstance(value, TokenSpanArray))):
             for k, v in zip(key, value):
+                self._tokens[k] = v.tokens
                 self._begin_tokens[k] = v.begin_token
                 self._end_tokens[k] = v.end_token
         else:
@@ -607,7 +620,8 @@ def isna(self) -> np.array:
         See docstring in `ExtensionArray` class in `pandas/core/arrays/base.py`
         for information about this method.
         """
-        return self.nulls_mask
+        # isna() of an ExtensionArray must return a copy that the caller can scribble on.
+        return self.nulls_mask.copy()
 
     def copy(self) -> "TokenSpanArray":
         """
@@ -959,14 +973,31 @@ def is_single_document(self) -> bool:
         :return: True if every span in this array is over the same target text
             or if there are zero spans in this array.
         """
+        # NOTE: For legacy reasons, this method is currently inconsistent with the method
+        # by the same name in SpanArray. TokenSpanArray.is_single_document() returns
+        # True on an empty array, while SpanArray.is_single_document() returns False.
         if len(self) == 0:
             # If there are zero spans, we consider there to be one document with the
             # document text being whatever is the document text for our tokens.
             return True
         else:
-            # More than one tokenization and at least one span. Check whether
-            # every span has the same text.
-            return not np.any(self.target_text[0] != self.target_text)
+            # Find the first text ID that is not NA
+            first_text_id = None
+            for b, t in zip(self._begins, self._text_ids):
+                if b != Span.NULL_OFFSET_VALUE:
+                    first_text_id = t
+                    break
+            if first_text_id is None:
+                # Special case: All NAs --> Zero documents
+                return True
+            return not np.any(np.logical_and(
+                # Row is not null...
+                np.not_equal(self._begins, Span.NULL_OFFSET_VALUE),
+                # ...and is over a different text than the first row's text ID
+                np.not_equal(self._text_ids, first_text_id)))
 
     def split_by_document(self) -> List["SpanArray"]:
         """
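The asymmetry flagged in the NOTE comments, reduced to its observable behavior; stand-in functions for illustration, not the library API:

def span_array_empty_case() -> bool:
    # SpanArray.is_single_document(): zero spans --> zero documents
    return False

def token_span_array_empty_case() -> bool:
    # TokenSpanArray.is_single_document(): an empty array still counts as one
    # document, the one its tokens are over
    return True

assert span_array_empty_case() != token_span_array_empty_case()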
