Skip to content

Commit

Permalink
Fix bug in is_single_document exposed in Pandas 1.3.0 regression suite
Browse files Browse the repository at this point in the history
  • Loading branch information
frreiss committed Jul 6, 2021
1 parent 1cf1134 commit d804c92
Showing 1 changed file with 36 additions and 4 deletions.
40 changes: 36 additions & 4 deletions text_extensions_for_pandas/array/span.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,7 @@ def __from_arrow__(self, extension_array):
SpanArray.
"""
from text_extensions_for_pandas.array.arrow_conversion import arrow_to_span

return arrow_to_span(extension_array)


Expand Down Expand Up @@ -869,15 +870,46 @@ def is_single_document(self) -> bool:
:return: True if there is at least one span in the array and every span is over the
same target text.
"""
# NOTE: For legacy reasons, this method is currently inconsistent with the method
# by the same name in TokenSpanArray. TokenSpanArray.is_single_document() returns
# True on an empty array, while SpanArray.is_single_document() returns false.
if len(self) == 0:
# If there are zero spans, then there are zero documents.
return False
elif self._string_table.num_things == 1:
return True
# Only one string; make sure that this array has a non-null value
for b in self._begins:
if b != Span.NULL_OFFSET_VALUE:
return True
# All nulls --> zero spans
return False
else:
# More than one string in the StringTable and at least one span. Check whether
# every span has the same text ID.
return not np.any(self._text_ids[0] != self._text_ids)
# More than one string in the StringTable and at least one span.
return self._is_single_document_slow_path()

def _is_single_document_slow_path(self) -> bool:
    """
    Test whether every non-NA span in this array is over the same target text.

    This is the general-purpose check used when the fast-path shortcuts in
    ``is_single_document`` do not apply. Rows whose begin offset equals
    ``Span.NULL_OFFSET_VALUE`` are treated as NA and ignored.

    :return: True if there is at least one non-NA span and all non-NA spans
     share the same text ID; False if every row is NA (zero documents) or if
     two non-NA rows have different text IDs.
    """
    # NOTE(review): assumes ``_begins`` and ``_text_ids`` are parallel 1-D
    # arrays of equal length; np.asarray is a no-op if they already are ndarrays.
    begins = np.asarray(self._begins)
    text_ids = np.asarray(self._text_ids)

    # Text IDs of the rows that are not NA, computed in a single vectorized
    # pass instead of a Python-level loop over every element.
    live_text_ids = text_ids[begins != Span.NULL_OFFSET_VALUE]

    if live_text_ids.size == 0:
        # Special case: All NAs (or no rows at all) --> Zero documents
        return False

    # Single document iff no non-NA row differs from the first non-NA text ID.
    return bool(np.all(live_text_ids == live_text_ids[0]))

def split_by_document(self) -> List["SpanArray"]:
"""
Expand Down

0 comments on commit d804c92

Please sign in to comment.