Skip to content

Commit

Permalink
Fix bugs in constructor and is_single_document exposed in Pandas 1.3.0 regression suite
Browse files Browse the repository at this point in the history
  • Loading branch information
frreiss committed Jul 6, 2021
1 parent 4fcfbb4 commit 1cf1134
Showing 1 changed file with 26 additions and 3 deletions.
29 changes: 26 additions & 3 deletions text_extensions_for_pandas/array/token_span.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,9 +138,13 @@ def __init__(self, tokens: Any, begin_token: int, end_token: int):
)
if end_token > len(tokens) + 1:
raise ValueError(
f"End token offset of {begin_token} larger than "
f"End token offset of {end_token} larger than "
f"number of tokens + 1 ({len(tokens)} + 1)"
)
if len(tokens) == 0 and begin_token != TokenSpan.NULL_OFFSET_VALUE:
raise ValueError(
f"Tried to create a non-null TokenSpan over an empty list of tokens."
)
if TokenSpan.NULL_OFFSET_VALUE == begin_token:
if TokenSpan.NULL_OFFSET_VALUE != end_token:
raise ValueError(
Expand Down Expand Up @@ -479,6 +483,7 @@ def __setitem__(self, key: Union[int, np.ndarray, list, slice], value: Any) -> N
((isinstance(value, Sequence) and isinstance(value[0], TokenSpan)) or
isinstance(value, TokenSpanArray))):
for k, v in zip(key, value):
self._tokens[k] = v.tokens
self._begin_tokens[k] = v.begin_token
self._end_tokens[k] = v.end_token
else:
Expand Down Expand Up @@ -615,7 +620,8 @@ def isna(self) -> np.array:
See docstring in `ExtensionArray` class in `pandas/core/arrays/base.py`
for information about this method.
"""
return self.nulls_mask
# isna() of an ExtensionArray must return a copy that the caller can scribble on.
return self.nulls_mask.copy()

def copy(self) -> "TokenSpanArray":
"""
Expand Down Expand Up @@ -967,14 +973,31 @@ def is_single_document(self) -> bool:
:return: True if every span in this array is over the same target text
or if there are zero spans in this array.
"""
# NOTE: For legacy reasons, this method is currently inconsistent with the method
# by the same name in SpanArray. TokenSpanArray.is_single_document() returns
# True on an empty array, while SpanArray.is_single_document() returns False.
if len(self) == 0:
# If there are zero spans, we consider there to be one document with the
# document text being whatever is the document text for our tokens.
return True
else:
# More than one tokenization and at least one span. Check whether
# every span has the same text.
return not np.any(self.target_text[0] != self.target_text)

# Find the first text ID that is not NA
first_text_id = None
for b, t in zip(self._begins, self._text_ids):
if b != Span.NULL_OFFSET_VALUE:
first_text_id = t
break
if first_text_id is None:
# Special case: All NAs --> Zero documents
return True
return not np.any(np.logical_and(
# Row is not null...
np.not_equal(self._begins, Span.NULL_OFFSET_VALUE),
# ...and is over a different text than the first row's text ID
np.not_equal(self._text_ids, first_text_id)))

def split_by_document(self) -> List["SpanArray"]:
"""
Expand Down

0 comments on commit 1cf1134

Please sign in to comment.