17
17
18
18
import datetime
19
19
import re
20
+ from typing import Union
20
21
21
22
import numpy
22
23
import packaging .version
29
30
import pandas .core .dtypes .generic
30
31
import pandas .core .nanops
31
32
import pyarrow
33
+ import pyarrow .compute
32
34
33
35
from db_dtypes .version import __version__
34
36
from db_dtypes import core
35
37
36
38
37
39
# String names for the date and time dtypes.
date_dtype_name = "dbdate"
time_dtype_name = "dbtime"

# Midnight on 1970-01-01, as a Python datetime and as a numpy datetime64.
# Time values are stored as datetime64 values carrying this date, so
# subtracting _NPEPOCH from them leaves the duration since midnight.
_EPOCH = datetime.datetime(1970, 1, 1)
_NPEPOCH = numpy.datetime64(_EPOCH)

# Release tuple of the installed pandas, for version comparisons.
pandas_release = packaging.version.parse(pandas.__version__).release
41
45
@@ -52,6 +56,33 @@ class TimeDtype(core.BaseDatetimeDtype):
52
56
def construct_array_type(self):
    """Return the array class paired with this dtype (``TimeArray``)."""
    return TimeArray
54
58
59
@staticmethod
def __from_arrow__(
    array: Union[pyarrow.Array, pyarrow.ChunkedArray]
) -> "TimeArray":
    """Build dbtime data from an Arrow array or chunked array.

    See:
    https://pandas.pydata.org/pandas-docs/stable/development/extending.html#compatibility-with-apache-arrow
    """
    # Empty input is answered directly, because combine_chunks (used
    # further down) can't be called on an empty array.
    if len(array) == 0:
        empty = numpy.array([], dtype="datetime64[ns]")
        return TimeArray(empty)

    # A direct cast to timestamp("ns") isn't possible. Bring the data to
    # time64("ns") instead: it shares timestamp("ns")'s memory layout of
    # 64-bit integers counting nanoseconds since the datetime epoch
    # (midnight 1970-01-01).
    as_time_ns = pyarrow.compute.cast(array, pyarrow.time64("ns"))

    # "view" only exists on Array, so merge the chunks of a ChunkedArray.
    if isinstance(as_time_ns, pyarrow.ChunkedArray):
        as_time_ns = as_time_ns.combine_chunks()

    as_timestamp_ns = as_time_ns.view(pyarrow.timestamp("ns"))
    return TimeArray(as_timestamp_ns.to_numpy(zero_copy_only=False))
85
+
55
86
56
87
class TimeArray (core .BaseDatetimeArray ):
57
88
"""
@@ -61,8 +92,6 @@ class TimeArray(core.BaseDatetimeArray):
61
92
# Data are stored as datetime64 values with a date of Jan 1, 1970
62
93
63
94
dtype = TimeDtype ()
64
- _epoch = datetime .datetime (1970 , 1 , 1 )
65
- _npepoch = numpy .datetime64 (_epoch )
66
95
67
96
@classmethod
68
97
def _datetime (
@@ -75,8 +104,21 @@ def _datetime(
75
104
r"(?:\.(?P<fraction>\d*))?)?)?\s*$"
76
105
).match ,
77
106
):
78
- if isinstance (scalar , datetime .time ):
79
- return datetime .datetime .combine (cls ._epoch , scalar )
107
+ # Convert pyarrow values to datetime.time.
108
+ if isinstance (scalar , (pyarrow .Time32Scalar , pyarrow .Time64Scalar )):
109
+ scalar = (
110
+ scalar .cast (pyarrow .time64 ("ns" ))
111
+ .cast (pyarrow .int64 ())
112
+ .cast (pyarrow .timestamp ("ns" ))
113
+ .as_py ()
114
+ )
115
+
116
+ if scalar is None :
117
+ return None
118
+ elif isinstance (scalar , datetime .time ):
119
+ return datetime .datetime .combine (_EPOCH , scalar )
120
+ elif isinstance (scalar , pandas .Timestamp ):
121
+ return scalar .to_datetime64 ()
80
122
elif isinstance (scalar , str ):
81
123
# iso string
82
124
parsed = match_fn (scalar )
@@ -113,7 +155,7 @@ def _box_func(self, x):
113
155
__return_deltas = {"timedelta" , "timedelta64" , "timedelta64[ns]" , "<m8" , "<m8[ns]" }
114
156
115
157
def astype (self , dtype , copy = True ):
116
- deltas = self ._ndarray - self . _npepoch
158
+ deltas = self ._ndarray - _NPEPOCH
117
159
stype = str (dtype )
118
160
if stype in self .__return_deltas :
119
161
return deltas
@@ -122,15 +164,25 @@ def astype(self, dtype, copy=True):
122
164
else :
123
165
return super ().astype (dtype , copy = copy )
124
166
125
- if pandas_release < (1 ,):
167
def __arrow_array__(self, type=None):
    """Export this dbtime data as an Arrow array.

    ``type`` is the Arrow type to cast the result to; when it is None the
    result is time64("ns").

    See:
    https://pandas.pydata.org/pandas-docs/stable/development/extending.html#compatibility-with-apache-arrow
    """
    target_type = pyarrow.time64("ns") if type is None else type

    timestamps = pyarrow.array(self._ndarray, type=pyarrow.timestamp("ns"))

    # "view" only exists on Array, so merge the chunks of a ChunkedArray.
    if isinstance(timestamps, pyarrow.ChunkedArray):
        timestamps = timestamps.combine_chunks()

    # A direct cast to time64("ns") isn't possible, but timestamp("ns") has
    # the same memory layout (64-bit integers counting nanoseconds since
    # midnight 1970-01-01), so reinterpret the buffer and cast from there.
    times = timestamps.view(pyarrow.time64("ns"))
    return pyarrow.compute.cast(times, target_type)
135
187
136
188
@@ -146,6 +198,19 @@ class DateDtype(core.BaseDatetimeDtype):
146
198
def construct_array_type(self):
    """Return the array class paired with this dtype (``DateArray``)."""
    return DateArray
148
200
201
@staticmethod
def __from_arrow__(
    array: Union[pyarrow.Array, pyarrow.ChunkedArray]
) -> "DateArray":
    """Convert to dbdate data from an Arrow array.

    Accepts either a single Array or a ChunkedArray whose values can be
    cast to timestamp("ns"), and returns a DateArray built from the
    resulting numpy values.

    See:
    https://pandas.pydata.org/pandas-docs/stable/development/extending.html#compatibility-with-apache-arrow
    """
    array = pyarrow.compute.cast(array, pyarrow.timestamp("ns"))

    if isinstance(array, pyarrow.ChunkedArray):
        # ChunkedArray.to_numpy has no zero-copy restriction to lift.
        np_array = array.to_numpy()
    else:
        # Array.to_numpy defaults to zero_copy_only=True, which raises as
        # soon as the array contains nulls. Allow the copy so that data
        # with missing dates converts instead of failing.
        np_array = array.to_numpy(zero_copy_only=False)

    return DateArray(np_array)
213
+
149
214
150
215
class DateArray (core .BaseDatetimeArray ):
151
216
"""
@@ -161,7 +226,13 @@ def _datetime(
161
226
scalar ,
162
227
match_fn = re .compile (r"\s*(?P<year>\d+)-(?P<month>\d+)-(?P<day>\d+)\s*$" ).match ,
163
228
):
164
- if isinstance (scalar , datetime .date ):
229
+ # Convert pyarrow values to datetime.date.
230
+ if isinstance (scalar , (pyarrow .Date32Scalar , pyarrow .Date64Scalar )):
231
+ scalar = scalar .as_py ()
232
+
233
+ if scalar is None :
234
+ return None
235
+ elif isinstance (scalar , datetime .date ):
165
236
return datetime .datetime (scalar .year , scalar .month , scalar .day )
166
237
elif isinstance (scalar , str ):
167
238
match = match_fn (scalar )
@@ -197,16 +268,22 @@ def astype(self, dtype, copy=True):
197
268
return super ().astype (dtype , copy = copy )
198
269
199
270
def __arrow_array__(self, type=None):
    """Export this dbdate data as an Arrow array.

    ``type`` is the Arrow type to cast the result to; when it is None the
    result is date32.

    See:
    https://pandas.pydata.org/pandas-docs/stable/development/extending.html#compatibility-with-apache-arrow
    """
    target_type = pyarrow.date32() if type is None else type

    # Load the stored values as timestamp("ns") first, then cast to the
    # requested type.
    timestamps = pyarrow.array(self._ndarray, type=pyarrow.timestamp("ns"))
    return pyarrow.compute.cast(timestamps, target_type)
203
280
204
281
def __add__(self, other):
    """Add ``other`` to this date array.

    A ``pandas.DateOffset`` is added to the object-dtype form of the array.
    A ``TimeArray`` is added as its offset from the epoch date. Anything
    else is delegated to the parent class.
    """
    if isinstance(other, pandas.DateOffset):
        as_objects = self.astype("object")
        return as_objects + other

    if isinstance(other, TimeArray):
        # Times are stored with the epoch date attached; subtracting the
        # epoch leaves the time-of-day duration, which is added to the dates.
        time_of_day = other._ndarray - _NPEPOCH
        return time_of_day + self._ndarray

    return super().__add__(other)
212
289
0 commit comments