-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Add date dtype #34441
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add date dtype #34441
Changes from all commits
ac8e285
0ad60de
ae1a498
5c5ee4b
224b59d
5213efe
e000786
a9ac366
4b441f3
4ec5d72
539444e
6db4aea
6f4eb44
0e30fa5
a26a4f7
af37183
69b297f
9aab22d
2f3f579
eb947d7
a6d6bc5
61d07f9
85e71fd
5673cf3
79c9254
e9c8d96
c209de1
f207989
31fa485
73e278b
068e9bc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,184 @@ | ||
from pandas.core.dtypes.base import ExtensionDtype | ||
from pandas.core.arrays.datetimelike import DatelikeOps, DatetimeLikeArrayMixin | ||
from pandas.core.arrays.datetimes import sequence_to_dt64ns | ||
from pandas.core.dtypes.common import ( | ||
is_integer_dtype, | ||
is_datetime64_dtype, | ||
is_object_dtype, | ||
is_string_dtype, | ||
pandas_dtype, | ||
) | ||
from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass | ||
from pandas.core.dtypes.dtypes import DateDtype | ||
from pandas.core.construction import array | ||
from pandas._libs.tslibs import Timestamp | ||
from pandas._libs.tslibs.conversion import DT64NS_DTYPE | ||
from pandas._libs import tslib, lib | ||
from pandas.core.arrays._mixins import _T | ||
|
||
import numpy as np | ||
|
||
# dtype string used to view the backing data as plain integers.
INTEGER_BACKEND = "i8"
# dtype string of the day-resolution array stored on DateArray.
D_DATETIME_DTYPE = "datetime64[D]"
# dtype names listed in the DateArray constructor's error message.
VALID_TYPES = {INTEGER_BACKEND, "datetime64[ns]", D_DATETIME_DTYPE, "object"}
|
||
|
||
def _to_date_values(values, copy=False):
    """
    Convert ``values`` to a day-resolution datetime64 array.

    ``values`` is first passed through ``sequence_to_dt64ns`` (forwarding
    ``copy``); the first element of its 3-tuple result is then cast to
    ``D_DATETIME_DTYPE``.
    """
    dt64ns_values, _second, _third = sequence_to_dt64ns(values, copy=copy)
    day_values = dt64ns_values.astype(D_DATETIME_DTYPE)
    return day_values
|
||
|
||
class DateArray(DatetimeLikeArrayMixin, DatelikeOps): | ||
""" | ||
Pandas ExtensionArray for date (year, month, day only) data. | ||
|
||
Parameters | ||
---------- | ||
values : Series, Index, DateArray, ndarray | ||
The date data. | ||
copy : bool, default False | ||
Whether to copy the underlying array of values. | ||
|
||
Attributes | ||
---------- | ||
None | ||
|
||
Methods | ||
------- | ||
None | ||
""" | ||
|
||
freq = "D" | ||
|
||
def __init__(self, values, copy=False):
    """
    Build a DateArray from existing array data.

    Parameters
    ----------
    values : Series, Index, DateArray, ndarray
        The date data. Series/Index inputs are unwrapped via ``._values``
        and DateArray inputs via ``._data`` before validation.
    copy : bool, default False
        Whether the stored array must not share memory with ``values``.

    Raises
    ------
    ValueError
        If the unwrapped ``values`` is not an ndarray, or if its dtype is
        rejected by ``_is_compatible_dtype``.
    """
    if isinstance(values, (ABCSeries, ABCIndexClass)):
        values = values._values

    if isinstance(values, type(self)):
        values = values._data

    if not isinstance(values, np.ndarray):
        msg = (
            f"Unexpected type '{type(values).__name__}'. 'values' must be "
            "a DateArray, ndarray, or Series or Index containing one of"
            " those."
        )
        raise ValueError(msg)

    if not self._is_compatible_dtype(values.dtype):
        msg = (
            f"The dtype of 'values' is incorrect. Must be one of {VALID_TYPES}."
            f" Got {values.dtype} instead."
        )
        raise ValueError(msg)

    if values.dtype == INTEGER_BACKEND:
        # A view shares memory with the input, so an explicit copy is
        # needed when the caller asked for one.
        values = values.view(D_DATETIME_DTYPE)
        if copy:
            values = values.copy()
    elif values.dtype != D_DATETIME_DTYPE:
        # _to_date_values finishes with ``ndarray.astype``, which allocates
        # a new array by default, so no further copy is required here.
        values = _to_date_values(values, copy=False)
    elif copy:
        values = values.copy()

    self._data = values
|
||
@staticmethod
def _is_compatible_dtype(dtype):
    """
    Report whether ``dtype`` is accepted by the DateArray constructor.

    A dtype is accepted when it is an integer, object or datetime64 dtype,
    or compares equal to ``"datetime64[D]"``.
    """
    if is_integer_dtype(dtype) or is_object_dtype(dtype):
        return True
    return is_datetime64_dtype(dtype) or dtype == "datetime64[D]"
|
||
@classmethod
def _simple_new(cls, values, **kwargs):
    """
    Create an instance around ``values`` without running ``__init__``.

    ``values`` must be an ndarray. An array whose dtype equals
    ``INTEGER_BACKEND`` is viewed as ``D_DATETIME_DTYPE``; any other array
    is stored unchanged. Extra keyword arguments are accepted and ignored.
    """
    assert isinstance(values, np.ndarray)
    backing = (
        values.view(D_DATETIME_DTYPE) if values.dtype == INTEGER_BACKEND else values
    )

    instance = object.__new__(cls)
    instance._data = backing
    return instance
|
||
@classmethod
def _from_sequence(cls, scalars, dtype=None, copy=False):
    """
    Construct a new ExtensionArray from a sequence of scalars.

    Parameters
    ----------
    scalars : Sequence
        Each element will be an instance of the scalar type for this
        array, ``cls.dtype.type``.
    dtype : dtype, optional
        Construct for this particular dtype. This should be a Dtype
        compatible with the ExtensionArray. Currently not consulted.
    copy : bool, default False
        If True, copy the underlying data.

    Returns
    -------
    DateArray
    """
    if (
        isinstance(scalars, np.ndarray)
        and lib.infer_dtype(scalars, skipna=True) == "integer"
    ):
        # ``astype`` allocates a new array by default.
        values = scalars.astype(INTEGER_BACKEND)
    elif is_integer_dtype(scalars):
        # Integer-typed containers do not all expose an ndarray under
        # ``_data`` (a Series does not), which would trip the ndarray
        # assertion in ``_simple_new``; go through NumPy conversion instead.
        values = np.asarray(scalars, dtype=INTEGER_BACKEND)
        if copy:
            values = values.copy()
    else:
        values = _to_date_values(scalars, copy)
    return cls._simple_new(values)
|
||
def _from_backing_data(self: _T, arr: np.ndarray) -> _T:
    """Wrap ``arr`` in a new instance of this array's own class."""
    array_cls = type(self)
    return array_cls(arr)
|
||
@property
def dtype(self) -> ExtensionDtype:
    """Return a freshly constructed ``DateDtype`` instance."""
    date_dtype = DateDtype()
    return date_dtype
|
||
def __iter__(self):
    """Yield each element of the backing ``_data`` array, unboxed."""
    yield from self._data
|
||
@property
def _box_func(self):
    """
    Return the callable used to box a single stored value.

    The callable builds a ``Timestamp`` with ``freq="D"`` and ``tz="utc"``.
    """
    # TODO Implement Datestamp of a similar form in cython
    def _box(x):
        return Timestamp(x, freq="D", tz="utc")

    return _box
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think we want to be timezone naive by default. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, that's why I have a TODO to create a date stamp, but that will need to be implemented in Cython, which I can do; I just wanted to get something working first. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Wouldn't we want datetime.date objects anyway (or Period[D] objects)? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @jbrockmendel Yeah, probably; this is just a placeholder. |
||
|
||
@property
def asi8(self) -> np.ndarray:
    """Return the backing array viewed with the ``INTEGER_BACKEND`` dtype."""
    backing = self._data
    return backing.view(INTEGER_BACKEND)
|
||
@property
def as_datetime_i8(self) -> np.ndarray:
    """
    Return the data cast to ``DT64NS_DTYPE`` and viewed as integers.

    The backing array is first converted with ``astype`` and the result is
    then reinterpreted with the ``INTEGER_BACKEND`` dtype.
    """
    as_dt64ns = self._data.astype(DT64NS_DTYPE)
    return as_dt64ns.view(INTEGER_BACKEND)
|
||
@property
def date(self):
    """
    Return the values boxed via ``tslib.ints_to_pydatetime`` with
    ``box="date"``, using the integers from ``as_datetime_i8`` as input.
    """
    return tslib.ints_to_pydatetime(self.as_datetime_i8, box="date")
|
||
def astype(self, dtype, copy=True):
    """
    Cast this array to ``dtype``.

    Parameters
    ----------
    dtype : str or dtype
        Target dtype; normalised through ``pandas_dtype`` first.
    copy : bool, default True
        For a cast to this array's own dtype, whether to return a copy
        rather than ``self``. Forwarded to the parent implementation for
        dtypes not handled here.

    Returns
    -------
    array-like
        ``self`` or a copy for the same dtype; otherwise the result of the
        datetime64, object or string branch, or of the parent ``astype``.
    """
    dtype = pandas_dtype(dtype)
    if isinstance(dtype, type(self.dtype)):
        result = self.copy() if copy else self
    elif is_datetime64_dtype(dtype):
        result = array(self._data, dtype=DT64NS_DTYPE)
    elif is_object_dtype(dtype):
        # Checked before the string test, matching the original ordering.
        result = self._box_values(self.as_datetime_i8)
    elif is_string_dtype(dtype):
        result = array(self._format_native_types())
    else:
        result = super().astype(dtype, copy)
    return result
|
||
def _format_native_types(self, na_rep="NaT", date_format=None):
    """
    Format the stored dates as strings.

    Parameters
    ----------
    na_rep : str, default "NaT"
        Passed through as the ``na_rep`` argument of the formatter.
    date_format : str, optional
        strftime-style format. When None, ``"%Y-%m-%d"`` is used.

    Returns
    -------
    The result of ``tslib.format_array_from_datetime`` applied to
    ``as_datetime_i8``.
    """
    # Previously the argument was accepted but silently ignored; fall back
    # to the ISO date layout only when the caller supplies nothing.
    fmt = "%Y-%m-%d" if date_format is None else date_format
    return tslib.format_array_from_datetime(
        self.as_datetime_i8, tz="utc", format=fmt, na_rep=na_rep
    )
|
||
def __len__(self):
    """Return the length of the backing ``_data`` array."""
    backing = self._data
    return len(backing)
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,7 @@ | |
""" | ||
|
||
import re | ||
import datetime | ||
from typing import ( | ||
TYPE_CHECKING, | ||
Any, | ||
|
@@ -34,6 +35,7 @@ | |
IntervalArray, | ||
PeriodArray, | ||
DatetimeArray, | ||
DateArray, | ||
) | ||
from pandas import Categorical # noqa: F401 | ||
|
||
|
@@ -1232,3 +1234,88 @@ def __from_arrow__( | |
results.append(iarr) | ||
|
||
return IntervalArray._concat_same_type(results) | ||
|
||
|
||
@register_extension_dtype | ||
class DateDtype(PandasExtensionDtype): | ||
""" | ||
An ExtensionDtype to hold a single date. | ||
The attributes name & type are set when subclasses are created. | ||
""" | ||
|
||
_date_aliases = {"date", "date64"} | ||
_unit = "D" | ||
_numpy_dtype = np.dtype("datetime64[D]") | ||
|
||
def __str__(self):
    """Return the string form of this dtype, ``"date"``."""
    text = "date"
    return text
|
||
@property
def name(self) -> str_type:
    """Return the dtype name, defined as ``str(self)``."""
    as_text = str(self)
    return as_text
|
||
@property
def type(self):
    """Return the scalar class for this dtype, ``datetime.date``."""
    scalar_cls = datetime.date
    return scalar_cls
|
||
@property
def na_value(self):
    """Return the missing-value marker for this dtype, ``NaT``."""
    missing = NaT
    return missing
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This needs to be discussed in detail. We need to decide if we want There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. -1 on changing to NA at this time; we have a very well-established policy of NaT in all datetimelikes. |
||
|
||
def __repr__(self):
    """
    Return a string representation of this dtype.

    ``__repr__`` must return a ``str``; returning the class object itself
    makes ``repr()`` raise ``TypeError``. The representation is the class
    name followed by ``()``.
    """
    return f"{type(self).__name__}()"
|
||
@property
def kind(self):
    """Return the ``kind`` code of the dtype stored in ``_numpy_dtype``."""
    numpy_dtype = self._numpy_dtype
    return numpy_dtype.kind
|
||
@property
def itemsize(self):
    """
    Return the number of bytes in this dtype.

    Reads the size from ``_numpy_dtype``, the attribute this class defines
    (``numpy_dtype`` without the underscore is not defined on it).
    """
    return self._numpy_dtype.itemsize
|
||
@classmethod
def construct_from_string(cls, string: str):
    """
    Build an instance of this dtype from one of its string aliases.

    Parameters
    ----------
    string : str
        Must be a member of ``cls._date_aliases``.

    Returns
    -------
    An instance of ``cls``.

    Raises
    ------
    TypeError
        If ``string`` is not a ``str`` or is not a known alias.
    """
    if isinstance(string, str):
        if string in cls._date_aliases:
            return cls()
        raise TypeError(
            f"Cannot construct a 'DateDtype' from '{string}'.\n\n"
            "Incorrectly formatted string passed to constructor. "
            "Valid formats include only date"
        )

    raise TypeError(
        f"'construct_from_string' expects a string, got {type(string)}"
    )
|
||
@classmethod
def construct_array_type(cls):
    """
    Return the array type associated with this dtype.

    Returns
    -------
    type
        ``DateArray``, imported inside the method body.
    """
    from pandas.core.arrays import DateArray as array_type

    return array_type
|
||
# TODO make from arrow | ||
|
||
@classmethod
def is_dtype(cls, dtype) -> bool:
    """
    Check whether ``dtype`` designates this dtype.

    Non-string inputs are delegated to the parent ``is_dtype``. A string
    matches only if its lower-cased form starts with ``"date"`` and
    ``construct_from_string`` accepts it without raising ``ValueError`` or
    ``TypeError``.
    """
    if not isinstance(dtype, str):
        return super().is_dtype(dtype)

    if not dtype.lower().startswith("date"):
        return False

    try:
        constructed = cls.construct_from_string(dtype)
    except (ValueError, TypeError):
        return False
    return constructed is not None
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why aren't you simply keeping ordinals since epoch? It's performant and much simpler.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am keeping them. As I understand it, the view only changes the outer representation, not the backend. The same thing is done in the datetime array.