Skip to content

Add date dtype #34441

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 31 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
ac8e285
ENH: Add date dtype implementation
zbrookle May 21, 2020
0ad60de
ENH: Date type now functions as expected
zbrookle May 26, 2020
ae1a498
TST: Start adding unit tests for dates
zbrookle May 26, 2020
5c5ee4b
ENH: ints, datetimes and objects can convert to datearray
zbrookle May 26, 2020
224b59d
ENH: Add proper formatting for dates
zbrookle May 26, 2020
5213efe
ENH: Add initialization tests from datetime, int, and object numpy arrays
zbrookle May 26, 2020
e000786
ENH: All conversions and displays for date object now behave properly
zbrookle May 27, 2020
a9ac366
CLN: Remove print statements
zbrookle May 27, 2020
4b441f3
ENH: Can now convert date to object, string, int, and datetime64
zbrookle May 27, 2020
4ec5d72
CLN: Move dtype testing to test_common
zbrookle May 27, 2020
539444e
BUG: Raise exception when given incompatible dtype ndarray
zbrookle May 27, 2020
6db4aea
ENH: Add integer able to convert to date
zbrookle May 28, 2020
6f4eb44
BUG: Fix numpy kind for date dtype
zbrookle May 28, 2020
0e30fa5
ENH: Add conversion from datetime to date
zbrookle May 28, 2020
a26a4f7
BUG: Add copy to from sequence
zbrookle May 28, 2020
af37183
BUG: Fix cast date as date type
zbrookle May 28, 2020
69b297f
ENH: Remove unneeded tests
zbrookle May 28, 2020
9aab22d
CLN: Remove main
zbrookle May 28, 2020
2f3f579
CLN: Fix linting errors
zbrookle May 28, 2020
eb947d7
CLN: Fix mypy errors
zbrookle May 28, 2020
a6d6bc5
CLN: Fix pep8 errors
zbrookle May 28, 2020
61d07f9
CLN: Fix pep8 problems
zbrookle May 28, 2020
85e71fd
Merge branch 'master' into add_date_dtype
zbrookle May 28, 2020
5673cf3
BUG: Fix convert date to string for newest pandas
zbrookle May 28, 2020
79c9254
BUG: Fix convert integer to date for new framework
zbrookle May 28, 2020
e9c8d96
DOC: Remove incorrect warning from docstring
zbrookle May 29, 2020
c209de1
ENH: Add support for int and datetime series converting to date
zbrookle May 29, 2020
f207989
ENH: Override from backing data
zbrookle May 29, 2020
31fa485
ENH: Add support for conversion from date series to object, string, i…
zbrookle May 29, 2020
73e278b
BUG: String conversion was resulting in object numpy array
zbrookle May 29, 2020
068e9bc
ENH: Change DateType type to datetime.date
zbrookle May 29, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pandas/core/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
DatetimeTZDtype,
IntervalDtype,
PeriodDtype,
DateDtype
)
from pandas.core.dtypes.missing import isna, isnull, notna, notnull

Expand Down
2 changes: 2 additions & 0 deletions pandas/core/arrays/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from pandas.core.arrays.sparse import SparseArray
from pandas.core.arrays.string_ import StringArray
from pandas.core.arrays.timedeltas import TimedeltaArray
from pandas.core.arrays.dates import DateArray

__all__ = [
"ExtensionArray",
Expand All @@ -31,4 +32,5 @@
"SparseArray",
"StringArray",
"TimedeltaArray",
"DateArray"
]
184 changes: 184 additions & 0 deletions pandas/core/arrays/dates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.arrays.datetimelike import DatelikeOps, DatetimeLikeArrayMixin
from pandas.core.arrays.datetimes import sequence_to_dt64ns
from pandas.core.dtypes.common import (
is_integer_dtype,
is_datetime64_dtype,
is_object_dtype,
is_string_dtype,
pandas_dtype,
)
from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass
from pandas.core.dtypes.dtypes import DateDtype
from pandas.core.construction import array
from pandas._libs.tslibs import Timestamp
from pandas._libs.tslibs.conversion import DT64NS_DTYPE
from pandas._libs import tslib, lib
from pandas.core.arrays._mixins import _T

import numpy as np

# NumPy dtype string for day-resolution datetimes; DateArray views/casts its data to this.
D_DATETIME_DTYPE = "datetime64[D]"
# NumPy dtype string used when viewing the stored values as 64-bit integers.
INTEGER_BACKEND = "i8"
# Dtype names reported in the DateArray constructor's "incorrect dtype" error message.
VALID_TYPES = {INTEGER_BACKEND, "datetime64[ns]", D_DATETIME_DTYPE, "object"}


def _to_date_values(values, copy=False):
    """
    Convert ``values`` to day-resolution datetime data.

    The input is run through ``sequence_to_dt64ns`` (forwarding ``copy``);
    the first element of its three-element result is then cast to
    ``D_DATETIME_DTYPE``.
    """
    dt64_values, _, _ = sequence_to_dt64ns(values, copy=copy)
    day_values = dt64_values.astype(D_DATETIME_DTYPE)
    return day_values


class DateArray(DatetimeLikeArrayMixin, DatelikeOps):
"""
Pandas ExtensionArray for date (year, month, day only) data.

Parameters
----------
values : Series, Index, DateArray, ndarray
The date data.
copy : bool, default False
Whether to copy the underlying array of values.

Attributes
----------
None

Methods
-------
None
"""

freq = "D"

def __init__(self, values, copy=False):
if isinstance(values, (ABCSeries, ABCIndexClass)):
values = values._values

if isinstance(values, type(self)):
values = values._data

if not isinstance(values, np.ndarray):
msg = (
f"Unexpected type '{type(values).__name__}'. 'values' must be "
"a DateArray ndarray, or Series or Index containing one of"
" those."
)
raise ValueError(msg)

if not self._is_compatible_dtype(values.dtype):
msg = (
f"The dtype of 'values' is incorrect. Must be one of {VALID_TYPES}."
f" Got {values.dtype} instead."
)
raise ValueError(msg)

if values.dtype == INTEGER_BACKEND:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why aren't you simply keeping ordinals since epoch? It's performant and much simpler.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am keeping them; as I understand it, the view just changes the outer representation, not the backend. The same thing is done in the datetime array.

values = values.view(D_DATETIME_DTYPE)
elif values.dtype != "datetime64[D]":
values = _to_date_values(values, copy)

if copy:
values = values.copy()

self._data = values

@staticmethod
def _is_compatible_dtype(dtype):
    """
    Return whether ``dtype`` is accepted by the constructor.

    Integer, object and datetime64 dtypes qualify, as does anything that
    compares equal to ``"datetime64[D]"``.
    """
    dtype_checks = (is_integer_dtype, is_object_dtype, is_datetime64_dtype)
    if any(check(dtype) for check in dtype_checks):
        return True
    return dtype == "datetime64[D]"

@classmethod
def _simple_new(cls, values, **kwargs):
    """
    Build an instance around ``values`` without running ``__init__``.

    ``values`` must be an ndarray.  When its dtype equals ``INTEGER_BACKEND``
    it is viewed as ``D_DATETIME_DTYPE``; otherwise it is stored unchanged.
    Extra keyword arguments are accepted and ignored.
    """
    assert isinstance(values, np.ndarray)
    instance = object.__new__(cls)
    # a view reinterprets the integers in place rather than converting them
    instance._data = (
        values.view(D_DATETIME_DTYPE) if values.dtype == INTEGER_BACKEND else values
    )
    return instance

@classmethod
def _from_sequence(cls, scalars, dtype=None, copy=False):
    """
    Construct a new ExtensionArray from a sequence of scalars.

    Parameters
    ----------
    scalars : Sequence
        Each element will be an instance of the scalar type for this
        array, ``cls.dtype.type``.
    dtype : dtype, optional
        Construct for this particular dtype. This should be a Dtype
        compatible with the ExtensionArray. Not read by this implementation.
    copy : bool, default False
        If True, copy the underlying data. Only forwarded on the
        non-integer conversion path.

    Returns
    -------
    DateArray
    """
    is_int_ndarray = (
        isinstance(scalars, np.ndarray)
        and lib.infer_dtype(scalars, skipna=True) == "integer"
    )
    if is_int_ndarray:
        # ndarray whose inferred content is integer: cast to the int backend
        return cls._simple_new(scalars.astype(INTEGER_BACKEND))
    if is_integer_dtype(scalars):
        # non-ndarray integer-dtype container: use its ``_data`` attribute
        return cls._simple_new(scalars._data)
    return cls._simple_new(_to_date_values(scalars, copy))

def _from_backing_data(self: _T, arr: np.ndarray) -> _T:
    """Wrap ``arr`` in a new instance of the same class as ``self``."""
    array_cls = type(self)
    return array_cls(arr)

@property
def dtype(self) -> ExtensionDtype:
    """Return a freshly constructed ``DateDtype`` for this array."""
    date_dtype = DateDtype()
    return date_dtype

def __iter__(self):
    """Yield each element of the underlying ``_data`` in order."""
    yield from self._data

@property
def _box_func(self):
    """
    Return a callable that boxes one value as a ``Timestamp``.

    The ``Timestamp`` is built with ``freq="D"`` and ``tz="utc"``.
    """
    # TODO Implement Datestamp of a similar form in cython
    def box_as_timestamp(x):
        return Timestamp(x, freq="D", tz="utc")

    return box_as_timestamp
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we want to be timezone naive by default

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes that's why I have a todo to create a date stamp, but that will need to be implemented in cython, which I can do, I just wanted to get something working first.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't we want datetime.date objects anyway (or Period[D] objects)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jbrockmendel Yeah, probably; this is just a placeholder.


@property
def asi8(self) -> np.ndarray:
    """Return ``_data`` viewed with the ``INTEGER_BACKEND`` dtype."""
    int_view = self._data.view(INTEGER_BACKEND)
    return int_view

@property
def as_datetime_i8(self) -> np.ndarray:
    """
    Return ``_data`` cast to ``DT64NS_DTYPE`` and viewed as integers.

    Unlike ``asi8``, the values are converted with ``astype`` before the
    integer view is taken.
    """
    as_dt64ns = self._data.astype(DT64NS_DTYPE)
    return as_dt64ns.view(INTEGER_BACKEND)

@property
def date(self):
    """
    Return the values converted by ``tslib.ints_to_pydatetime``.

    The conversion is fed ``as_datetime_i8`` and called with ``box="date"``.
    """
    return tslib.ints_to_pydatetime(self.as_datetime_i8, box="date")

def astype(self, dtype, copy=True):
    """
    Cast this array to ``dtype``.

    Parameters
    ----------
    dtype : str or dtype
        Target dtype; normalised with ``pandas_dtype``.
    copy : bool, default True
        For a same-dtype cast, return a copy when True and ``self`` when
        False. Also forwarded to the parent implementation on the fallback
        path.

    Returns
    -------
    array-like
        ``self`` or a copy for the same dtype; otherwise the result of the
        datetime64, object, string or parent-class conversion, checked in
        that order.
    """
    dtype = pandas_dtype(dtype)
    if isinstance(dtype, type(self.dtype)):
        return self.copy() if copy else self
    elif is_datetime64_dtype(dtype):
        result = array(self._data, dtype=DT64NS_DTYPE)
    elif is_object_dtype(dtype):
        result = self._box_values(self.as_datetime_i8)
    elif is_string_dtype(dtype):
        result = array(self._format_native_types())
    else:
        result = super().astype(dtype, copy)
    return result

def _format_native_types(self, na_rep="NaT", date_format=None):
    """
    Format the values via ``tslib.format_array_from_datetime``.

    The format is fixed to ``"%Y-%m-%d"`` with ``tz="utc"``; ``na_rep`` is
    forwarded. ``date_format`` is accepted but not read here.
    """
    datetime_i8 = self.as_datetime_i8
    return tslib.format_array_from_datetime(
        datetime_i8, tz="utc", format="%Y-%m-%d", na_rep=na_rep
    )

def __len__(self):
    """Return the number of elements in the underlying ``_data``."""
    size = len(self._data)
    return size
3 changes: 3 additions & 0 deletions pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
is_datetime64_dtype,
is_datetime64_ns_dtype,
is_datetime64tz_dtype,
is_date_dtype,
is_dtype_equal,
is_extension_array_dtype,
is_float_dtype,
Expand Down Expand Up @@ -601,6 +602,8 @@ def astype(self, dtype, copy=True):
return self
elif is_period_dtype(dtype):
return self.to_period(freq=dtype.freq)
elif is_date_dtype(dtype):
return dtype.construct_array_type()._from_sequence(self._data, copy)
return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy)

# -----------------------------------------------------------------
Expand Down
3 changes: 3 additions & 0 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from pandas.core.dtypes.common import (
is_bool_dtype,
is_datetime64_dtype,
is_date_dtype,
is_float,
is_float_dtype,
is_integer,
Expand Down Expand Up @@ -453,6 +454,8 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike:
elif isinstance(dtype, BooleanDtype):
result = self._data.astype("bool", copy=False)
return BooleanArray(result, mask=self._mask, copy=False)
elif is_date_dtype(dtype):
return dtype.construct_array_type()._from_sequence(self._data, copy=False)

# coerce
if is_float_dtype(dtype):
Expand Down
33 changes: 33 additions & 0 deletions pandas/core/dtypes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from pandas.core.dtypes.dtypes import (
CategoricalDtype,
DatetimeTZDtype,
DateDtype,
ExtensionDtype,
IntervalDtype,
PeriodDtype,
Expand Down Expand Up @@ -419,6 +420,38 @@ def is_datetime64tz_dtype(arr_or_dtype) -> bool:
return DatetimeTZDtype.is_dtype(arr_or_dtype)


def is_date_dtype(arr_or_dtype) -> bool:
    """
    Check whether an array-like or dtype is of the date dtype.

    Parameters
    ----------
    arr_or_dtype : array-like or dtype
        The array-like or dtype to check.

    Returns
    -------
    boolean
        Whether or not the array-like or dtype is of the date dtype.

    Examples
    --------
    >>> is_date_dtype(object)
    False
    >>> is_date_dtype(np.datetime64)
    False
    >>> is_date_dtype(pd.DateDtype())
    True
    >>> is_date_dtype([1, 2, 3])
    False
    >>> is_date_dtype(pd.Series([], dtype="date"))
    True
    >>> is_date_dtype('0 days')
    False
    """
    # the check itself is delegated to the dtype class
    return DateDtype.is_dtype(arr_or_dtype)


def is_timedelta64_dtype(arr_or_dtype) -> bool:
"""
Check whether an array-like or dtype is of the timedelta64 dtype.
Expand Down
87 changes: 87 additions & 0 deletions pandas/core/dtypes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""

import re
import datetime
from typing import (
TYPE_CHECKING,
Any,
Expand Down Expand Up @@ -34,6 +35,7 @@
IntervalArray,
PeriodArray,
DatetimeArray,
DateArray,
)
from pandas import Categorical # noqa: F401

Expand Down Expand Up @@ -1232,3 +1234,88 @@ def __from_arrow__(
results.append(iarr)

return IntervalArray._concat_same_type(results)


@register_extension_dtype
class DateDtype(PandasExtensionDtype):
"""
An ExtensionDtype to hold a single date.
The attributes name & type are set when subclasses are created.
"""

_date_aliases = {"date", "date64"}
_unit = "D"
_numpy_dtype = np.dtype("datetime64[D]")

def __str__(self):
    """Return the string form of this dtype, always ``"date"``."""
    dtype_label = "date"
    return dtype_label

@property
def name(self) -> str_type:
    """Return the dtype name, which is ``str(self)``."""
    dtype_name = str(self)
    return dtype_name

@property
def type(self):
    """Return the scalar class for this dtype, ``datetime.date``."""
    scalar_cls = datetime.date
    return scalar_cls

@property
def na_value(self):
    """Return the missing-value marker for this dtype, ``NaT``."""
    missing_marker = NaT
    return missing_marker
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This needs to be discussed in detail. We need to decide if we want NaT or NA semantics. I think I'd prefer NA.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

-1 on changing to NA at this time, we have a very well established policy of NaT in all datetimelikes.


def __repr__(self):
    """
    Return the string representation of this dtype.

    ``__repr__`` must return a ``str``; returning the class object makes
    ``repr()`` raise ``TypeError``. The representation is therefore
    delegated to ``str(self)``.
    """
    return str(self)

@property
def kind(self):
    """Return the ``kind`` attribute of the class-level ``_numpy_dtype``."""
    backing_dtype = self._numpy_dtype
    return backing_dtype.kind

@property
def itemsize(self):
    """ Return the number of bytes in this dtype """
    # The NumPy dtype is stored on the private ``_numpy_dtype`` attribute
    # (the same one ``kind`` reads); there is no public ``numpy_dtype`` here.
    return self._numpy_dtype.itemsize

@classmethod
def construct_from_string(cls, string: str):
    """
    Build an instance of this dtype from one of its string aliases.

    Parameters
    ----------
    string : str
        Must be a member of ``cls._date_aliases``.

    Returns
    -------
    An instance of ``cls``.

    Raises
    ------
    TypeError
        If ``string`` is not a ``str`` or is not a recognised alias.
    """
    if not isinstance(string, str):
        raise TypeError(
            f"'construct_from_string' expects a string, got {type(string)}"
        )

    if string not in cls._date_aliases:
        raise TypeError(
            f"Cannot construct a 'DateDtype' from '{string}'.\n\n"
            "Incorrectly formatted string passed to constructor. "
            "Valid formats include only date"
        )

    return cls()

@classmethod
def construct_array_type(cls):
    """
    Return the array type associated with this dtype.

    Returns
    -------
    type
    """
    # imported inside the method rather than at module import time
    from pandas.core.arrays import DateArray as date_array_cls

    return date_array_cls

# TODO make from arrow

@classmethod
def is_dtype(cls, dtype) -> bool:
    """
    Return whether ``dtype`` denotes this dtype.

    Non-string input is delegated to the parent implementation. A string
    matches only if its lower-cased form starts with ``"date"`` and
    ``construct_from_string`` accepts it without raising ``ValueError`` or
    ``TypeError``.
    """
    if not isinstance(dtype, str):
        return super().is_dtype(dtype)
    if not dtype.lower().startswith("date"):
        return False
    try:
        return cls.construct_from_string(dtype) is not None
    except (ValueError, TypeError):
        return False
Loading