-
Notifications
You must be signed in to change notification settings - Fork 35
Fixes for use with Pandas 1.2.1 #171
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -433,6 +433,17 @@ def __hash__(self): | |
self._hash = SpanArray.__hash__(self) | ||
return self._hash | ||
|
||
def __contains__(self, item) -> bool: | ||
""" | ||
Return true if scalar item exists in this TokenSpanArray. | ||
:param item: scalar TokenSpan value. | ||
:return: true if item exists in this TokenSpanArray. | ||
""" | ||
if isinstance(item, TokenSpan) and \ | ||
item.begin == TokenSpan.NULL_OFFSET_VALUE: | ||
return TokenSpan.NULL_OFFSET_VALUE in self._begin_tokens | ||
return super().__contains__(item) | ||
|
||
@classmethod | ||
def _concat_same_type( | ||
cls, to_concat: Sequence[pd.api.extensions.ExtensionArray] | ||
|
@@ -474,7 +485,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): | |
for information about this method. | ||
""" | ||
tokens = None | ||
if isinstance(scalars, Span): | ||
if isinstance(scalars, TokenSpan): | ||
scalars = [scalars] | ||
if isinstance(scalars, TokenSpanArray): | ||
tokens = scalars.tokens | ||
|
@@ -483,14 +494,18 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): | |
i = 0 | ||
for s in scalars: | ||
if not isinstance(s, TokenSpan): | ||
raise ValueError( | ||
f"Can only convert a sequence of TokenSpan " | ||
f"objects to a TokenSpanArray. Found an " | ||
f"object of type {type(s)}" | ||
) | ||
if tokens is None: | ||
# TODO: Temporary fix for np.nan values, pandas-dev GH#38980 | ||
if np.isnan(s): | ||
s = TokenSpanDtype().na_value | ||
else: | ||
raise ValueError( | ||
f"Can only convert a sequence of TokenSpan " | ||
f"objects to a TokenSpanArray. Found an " | ||
f"object of type {type(s)}" | ||
) | ||
if tokens is None and not (s.begin == TokenSpan.NULL_OFFSET_VALUE and s.tokens.target_text is None): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This gets a little awkward dealing with a NULL value with `target_text=None`, but should be OK for now. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This case will work better once |
||
tokens = s.tokens | ||
if not s.tokens.equals(tokens): | ||
if not (s.begin == TokenSpan.NULL_OFFSET_VALUE and s.tokens.target_text is None) and not s.tokens.equals(tokens): | ||
raise ValueError( | ||
f"Mixing different token sets is not currently " | ||
f"supported. Received two token sets:\n" | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is needed because otherwise the default is to check all values for equality, and that will fail if there is a NULL value with `Span.target_text=None`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you know how Pandas defines the result of `NA == NA` for other nullable types? Should that expression evaluate to `True`, `False`, or `NA`? It could go either way.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks like the current Pandas policy is "¯_(ツ)_/¯"; see the warning on https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, good question. The test failure related to this was checking `assert na_value in data_missing`, which expects `na_value == na_value` to be `True`. Seems like it would be different for `np.nan`, though.