Skip to content

Commit 9b13e73

Browse files
authored
Fixed _from_sequence to handle nan, added contains to check for null values (#171)
1 parent 156c365 commit 9b13e73

File tree

3 files changed

+44
-14
lines changed

3 files changed

+44
-14
lines changed

text_extensions_for_pandas/array/span.py

Lines changed: 20 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -466,6 +466,17 @@ def __hash__(self):
466466
self._ends.tobytes()))
467467
return self._hash
468468

469+
def __contains__(self, item) -> bool:
470+
"""
471+
Return true if scalar item exists in this SpanArray.
472+
:param item: scalar Span value.
473+
:return: true if item exists in this SpanArray.
474+
"""
475+
if isinstance(item, Span) and \
476+
item.begin == Span.NULL_OFFSET_VALUE:
477+
return Span.NULL_OFFSET_VALUE in self._begins
478+
return super().__contains__(item)
479+
469480
def equals(self, other: "SpanArray"):
470481
"""
471482
:param other: A second `SpanArray`
@@ -541,12 +552,16 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
541552
i = 0
542553
for s in scalars:
543554
if not isinstance(s, Span):
544-
raise ValueError(f"Can only convert a sequence of Span "
545-
f"objects to a SpanArray. Found an "
546-
f"object of type {type(s)}")
547-
if text is None:
555+
# TODO: Temporary fix for np.nan values, pandas-dev GH#38980
556+
if np.isnan(s):
557+
s = Span(text, Span.NULL_OFFSET_VALUE, Span.NULL_OFFSET_VALUE)
558+
else:
559+
raise ValueError(f"Can only convert a sequence of Span "
560+
f"objects to a SpanArray. Found an "
561+
f"object of type {type(s)}")
562+
if text is None and s.begin != Span.NULL_OFFSET_VALUE:
548563
text = s.target_text
549-
if s.target_text != text:
564+
if s.target_text != text and s.begin != Span.NULL_OFFSET_VALUE:
550565
raise ValueError(
551566
f"Mixing different target texts is not currently "
552567
f"supported. Received two different strings:\n"

text_extensions_for_pandas/array/tensor.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -433,7 +433,7 @@ def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None:
433433
def __contains__(self, item) -> bool:
434434
if isinstance(item, TensorElement):
435435
npitem = np.asarray(item)
436-
if npitem.size == 1 and np.isnan(npitem).all():
436+
if npitem.size == 1 and np.isnan(npitem).all():
437437
return self.isna().any()
438438
return super().__contains__(item)
439439

text_extensions_for_pandas/array/token_span.py

Lines changed: 23 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -433,6 +433,17 @@ def __hash__(self):
433433
self._hash = SpanArray.__hash__(self)
434434
return self._hash
435435

436+
def __contains__(self, item) -> bool:
437+
"""
438+
Return true if scalar item exists in this TokenSpanArray.
439+
:param item: scalar TokenSpan value.
440+
:return: true if item exists in this TokenSpanArray.
441+
"""
442+
if isinstance(item, TokenSpan) and \
443+
item.begin == TokenSpan.NULL_OFFSET_VALUE:
444+
return TokenSpan.NULL_OFFSET_VALUE in self._begin_tokens
445+
return super().__contains__(item)
446+
436447
@classmethod
437448
def _concat_same_type(
438449
cls, to_concat: Sequence[pd.api.extensions.ExtensionArray]
@@ -474,7 +485,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
474485
for information about this method.
475486
"""
476487
tokens = None
477-
if isinstance(scalars, Span):
488+
if isinstance(scalars, TokenSpan):
478489
scalars = [scalars]
479490
if isinstance(scalars, TokenSpanArray):
480491
tokens = scalars.tokens
@@ -483,14 +494,18 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
483494
i = 0
484495
for s in scalars:
485496
if not isinstance(s, TokenSpan):
486-
raise ValueError(
487-
f"Can only convert a sequence of TokenSpan "
488-
f"objects to a TokenSpanArray. Found an "
489-
f"object of type {type(s)}"
490-
)
491-
if tokens is None:
497+
# TODO: Temporary fix for np.nan values, pandas-dev GH#38980
498+
if np.isnan(s):
499+
s = TokenSpanDtype().na_value
500+
else:
501+
raise ValueError(
502+
f"Can only convert a sequence of TokenSpan "
503+
f"objects to a TokenSpanArray. Found an "
504+
f"object of type {type(s)}"
505+
)
506+
if tokens is None and not (s.begin == TokenSpan.NULL_OFFSET_VALUE and s.tokens.target_text is None):
492507
tokens = s.tokens
493-
if not s.tokens.equals(tokens):
508+
if not (s.begin == TokenSpan.NULL_OFFSET_VALUE and s.tokens.target_text is None) and not s.tokens.equals(tokens):
494509
raise ValueError(
495510
f"Mixing different token sets is not currently "
496511
f"supported. Received two token sets:\n"

0 commit comments

Comments
 (0)