Fix bug in is_single_document exposed in Pandas 1.3.0 regression suite

CODAIT · Jul 6, 2021 · d804c92 · d804c92
1 parent 1cf1134
commit d804c92
Showing 1 changed file with 36 additions and 4 deletions.
diff --git a/text_extensions_for_pandas/array/span.py b/text_extensions_for_pandas/array/span.py
@@ -326,6 +326,7 @@ def __from_arrow__(self, extension_array):
  SpanArray.
  """
  from text_extensions_for_pandas.array.arrow_conversion import arrow_to_span
+
  return arrow_to_span(extension_array)
 
 
@@ -869,15 +870,46 @@ def is_single_document(self) -> bool:
  :return: True if there is at least one span in the array and every span is over
  the same target text.
  """
+ # NOTE: For legacy reasons, this method is currently inconsistent with the method
+ # by the same name in TokenSpanArray. TokenSpanArray.is_single_document() returns
+ # True on an empty array, while SpanArray.is_single_document() returns false.
  if len(self) == 0:
  # If there are zero spans, then there are zero documents.
  return False
  elif self._string_table.num_things == 1:
- return True
+ # Only one string; make sure that this array has a non-null value
+ for b in self._begins:
+ if b != Span.NULL_OFFSET_VALUE:
+ return True
+ # All nulls --> zero spans
+ return False
  else:
- # More than one string in the StringTable and at least one span. Check whether
- # every span has the same text ID.
- return not np.any(self._text_ids[0] != self._text_ids)
+ # More than one string in the StringTable and at least one span.
+ return self._is_single_document_slow_path()
+
def _is_single_document_slow_path(self) -> bool:
    """
    Slow but reliable way to test whether everything in this SpanArray is
    from the same document.

    Checks whether every non-null span has the same text ID. Null (NA)
    entries, i.e. rows whose begin offset equals ``Span.NULL_OFFSET_VALUE``,
    are ignored when making this comparison.

    :return: True if there is at least one non-null span and all non-null
     spans share a single text ID. False if there are no non-null spans
     (empty or all-NA array) or if two non-null spans have different
     text IDs.
    """
    # Select the text IDs of the rows that are not null, in one vectorized
    # pass instead of a Python-level loop over the elements.
    non_null_text_ids = np.asarray(self._text_ids)[
        np.not_equal(self._begins, Span.NULL_OFFSET_VALUE)
    ]
    if non_null_text_ids.size == 0:
        # Special case: All NAs --> Zero documents
        return False
    # Single document iff no non-null row differs from the first non-null
    # row's text ID.
    return bool(np.all(non_null_text_ids == non_null_text_ids[0]))
 
  def split_by_document(self) -> List["SpanArray"]:
  """