diff --git a/text_extensions_for_pandas/array/span.py b/text_extensions_for_pandas/array/span.py
index 585c94b9..577393e4 100644
--- a/text_extensions_for_pandas/array/span.py
+++ b/text_extensions_for_pandas/array/span.py
@@ -466,6 +466,17 @@ def __hash__(self):
                                self._ends.tobytes()))
         return self._hash
 
+    def __contains__(self, item) -> bool:
+        """
+        Return true if scalar item exists in this SpanArray.
+        :param item: scalar Span value.
+        :return: true if item exists in this SpanArray.
+        """
+        if isinstance(item, Span) and \
+                item.begin == Span.NULL_OFFSET_VALUE:
+            return Span.NULL_OFFSET_VALUE in self._begins
+        return super().__contains__(item)
+
     def equals(self, other: "SpanArray"):
         """
         :param other: A second `SpanArray`
@@ -541,12 +552,17 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
         i = 0
         for s in scalars:
             if not isinstance(s, Span):
-                raise ValueError(f"Can only convert a sequence of Span "
-                                 f"objects to a SpanArray. Found an "
-                                 f"object of type {type(s)}")
-            if text is None:
+                # TODO: Temporary fix for np.nan values, pandas-dev GH#38980
+                # np.isnan() raises TypeError on non-numeric objects.
+                if isinstance(s, (float, np.floating)) and np.isnan(s):
+                    s = Span(text, Span.NULL_OFFSET_VALUE, Span.NULL_OFFSET_VALUE)
+                else:
+                    raise ValueError(f"Can only convert a sequence of Span "
+                                     f"objects to a SpanArray. Found an "
+                                     f"object of type {type(s)}")
+            if text is None and s.begin != Span.NULL_OFFSET_VALUE:
                 text = s.target_text
-            if s.target_text != text:
+            if s.target_text != text and s.begin != Span.NULL_OFFSET_VALUE:
                 raise ValueError(
                     f"Mixing different target texts is not currently "
                     f"supported. Received two different strings:\n"
diff --git a/text_extensions_for_pandas/array/tensor.py b/text_extensions_for_pandas/array/tensor.py
index 380fbc88..3929e557 100644
--- a/text_extensions_for_pandas/array/tensor.py
+++ b/text_extensions_for_pandas/array/tensor.py
@@ -433,7 +433,7 @@ def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None:
     def __contains__(self, item) -> bool:
         if isinstance(item, TensorElement):
             npitem = np.asarray(item)
-            if npitem.size == 1 and np.isnan(npitem).all(): 
+            if npitem.size == 1 and np.isnan(npitem).all():
                 return self.isna().any()
         return super().__contains__(item)
 
diff --git a/text_extensions_for_pandas/array/token_span.py b/text_extensions_for_pandas/array/token_span.py
index 2ab04474..ef6da686 100644
--- a/text_extensions_for_pandas/array/token_span.py
+++ b/text_extensions_for_pandas/array/token_span.py
@@ -433,6 +433,17 @@ def __hash__(self):
             self._hash = SpanArray.__hash__(self)
         return self._hash
 
+    def __contains__(self, item) -> bool:
+        """
+        Return true if scalar item exists in this TokenSpanArray.
+        :param item: scalar TokenSpan value.
+        :return: true if item exists in this TokenSpanArray.
+        """
+        if isinstance(item, TokenSpan) and \
+                item.begin == TokenSpan.NULL_OFFSET_VALUE:
+            return TokenSpan.NULL_OFFSET_VALUE in self._begin_tokens
+        return super().__contains__(item)
+
     @classmethod
     def _concat_same_type(
         cls, to_concat: Sequence[pd.api.extensions.ExtensionArray]
@@ -474,7 +485,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
         for information about this method.
         """
         tokens = None
-        if isinstance(scalars, Span):
+        if isinstance(scalars, TokenSpan):
             scalars = [scalars]
         if isinstance(scalars, TokenSpanArray):
             tokens = scalars.tokens
@@ -483,14 +494,19 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
         i = 0
         for s in scalars:
             if not isinstance(s, TokenSpan):
-                raise ValueError(
-                    f"Can only convert a sequence of TokenSpan "
-                    f"objects to a TokenSpanArray. Found an "
-                    f"object of type {type(s)}"
-                )
-            if tokens is None:
+                # TODO: Temporary fix for np.nan values, pandas-dev GH#38980
+                # np.isnan() raises TypeError on non-numeric objects.
+                if isinstance(s, (float, np.floating)) and np.isnan(s):
+                    s = TokenSpanDtype().na_value
+                else:
+                    raise ValueError(
+                        f"Can only convert a sequence of TokenSpan "
+                        f"objects to a TokenSpanArray. Found an "
+                        f"object of type {type(s)}"
+                    )
+            if tokens is None and not (s.begin == TokenSpan.NULL_OFFSET_VALUE and s.tokens.target_text is None):
                 tokens = s.tokens
-            if not s.tokens.equals(tokens):
+            if not (s.begin == TokenSpan.NULL_OFFSET_VALUE and s.tokens.target_text is None) and not s.tokens.equals(tokens):
                 raise ValueError(
                     f"Mixing different token sets is not currently "
                     f"supported. Received two token sets:\n"