Skip to content

Commit 173c438

Browse files
Merge pull request #418 from datapythonista/pandas_udf
Add support for new pandas UDF engine
2 parents 3f2f722 + c0e9602 commit 173c438

File tree

2 files changed

+175
-0
lines changed

2 files changed

+175
-0
lines changed

src/blosc2/proxy.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -691,3 +691,58 @@ def wrapper(*args, **func_kwargs):
691691
return decorator
692692
else:
693693
return decorator(func)
694+
695+
696+
class PandasUdfEngine:
697+
@staticmethod
698+
def _ensure_numpy_data(data):
699+
if not isinstance(data, np.ndarray):
700+
try:
701+
data = data.values
702+
except AttributeError as err:
703+
raise ValueError(
704+
"blosc2.jit received an object of type {data.__name__}, which is not supported. "
705+
"Try casting your Series or DataFrame to a NumPy dtype."
706+
) from err
707+
return data
708+
709+
@classmethod
710+
def map(cls, data, func, args, kwargs, decorator, skip_na):
711+
"""
712+
JIT a NumPy array element-wise. In the case of Blosc2, functions are
713+
expected to be vectorized NumPy operations, so the function is called
714+
with the NumPy array as the function parameter, instead of calling the
715+
function once for each element.
716+
"""
717+
raise NotImplementedError("The Blosc2 engine does not support map. Use apply instead.")
718+
719+
@classmethod
720+
def apply(cls, data, func, args, kwargs, decorator, axis):
721+
"""
722+
JIT a NumPy array by column or row. In the case of Blosc2, functions are
723+
expected to be vectorized NumPy operations, so the function is called
724+
with the NumPy array as the function parameter, instead of calling the
725+
function once for each column or row.
726+
"""
727+
data = cls._ensure_numpy_data(data)
728+
func = decorator(func)
729+
if data.ndim == 1 or axis is None:
730+
# pandas Series.apply or pipe
731+
return func(data, *args, **kwargs)
732+
elif axis in (0, "index"):
733+
# pandas apply(axis=0) column-wise
734+
result = []
735+
for row_idx in range(data.shape[1]):
736+
result.append(func(data[:, row_idx], *args, **kwargs))
737+
return np.vstack(result).transpose()
738+
elif axis in (1, "columns"):
739+
# pandas apply(axis=1) row-wise
740+
result = []
741+
for col_idx in range(data.shape[0]):
742+
result.append(func(data[col_idx, :], *args, **kwargs))
743+
return np.vstack(result)
744+
else:
745+
raise NotImplementedError(f"Unknown axis '{axis}'. Use one of 0, 1 or None.")
746+
747+
748+
jit.__pandas_udf__ = PandasUdfEngine

tests/test_pandas_udf_engine.py

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
#######################################################################
2+
# Copyright (c) 2019-present, Blosc Development Team <[email protected]>
3+
# All rights reserved.
4+
#
5+
# This source code is licensed under a BSD-style license (found in the
6+
# LICENSE file in the root directory of this source tree)
7+
#######################################################################
8+
9+
import numpy as np
10+
import pytest
11+
12+
import blosc2
13+
14+
15+
class TestPandasUDF:
16+
def test_map(self):
17+
def add_one(x):
18+
return x + 1
19+
20+
data = np.array([1, 2])
21+
22+
with pytest.raises(NotImplementedError):
23+
blosc2.jit.__pandas_udf__.map(
24+
data,
25+
add_one,
26+
args=(),
27+
kwargs={},
28+
decorator=blosc2.jit,
29+
skip_na=False,
30+
)
31+
32+
def test_apply_1d(self):
33+
def add_one(x):
34+
return x + 1
35+
36+
data = np.array([1, 2])
37+
38+
result = blosc2.jit.__pandas_udf__.apply(
39+
data,
40+
add_one,
41+
args=(),
42+
kwargs={},
43+
decorator=blosc2.jit,
44+
axis=0,
45+
)
46+
assert result.shape == (2,)
47+
assert result[0] == 2
48+
assert result[1] == 3
49+
50+
def test_apply_1d_with_args(self):
51+
def add_numbers(x, num1, num2):
52+
return x + num1 + num2
53+
54+
data = np.array([1, 2])
55+
56+
result = blosc2.jit.__pandas_udf__.apply(
57+
data,
58+
add_numbers,
59+
args=(10,),
60+
kwargs={"num2": 100},
61+
decorator=blosc2.jit,
62+
axis=0,
63+
)
64+
assert result.shape == (2,)
65+
assert result[0] == 111
66+
assert result[1] == 112
67+
68+
def test_apply_2d(self):
69+
def add_one(x):
70+
assert x.shape == (2, 3)
71+
return x + 1
72+
73+
data = np.array([[1, 2, 3], [4, 5, 6]])
74+
75+
result = blosc2.jit.__pandas_udf__.apply(
76+
data,
77+
add_one,
78+
args=(),
79+
kwargs={},
80+
decorator=blosc2.jit,
81+
axis=None,
82+
)
83+
expected = np.array([[2, 3, 4], [5, 6, 7]])
84+
assert np.array_equal(result, expected)
85+
86+
def test_apply_2d_by_column(self):
87+
def add_one(x):
88+
assert x.shape == (2,)
89+
return x + 1
90+
91+
data = np.array([[1, 2, 3], [4, 5, 6]])
92+
93+
result = blosc2.jit.__pandas_udf__.apply(
94+
data,
95+
add_one,
96+
args=(),
97+
kwargs={},
98+
decorator=blosc2.jit,
99+
axis=0,
100+
)
101+
expected = np.array([[2, 3, 4], [5, 6, 7]])
102+
assert np.array_equal(result, expected)
103+
104+
def test_apply_2d_by_row(self):
105+
def add_one(x):
106+
assert x.shape == (3,)
107+
return x + 1
108+
109+
data = np.array([[1, 2, 3], [4, 5, 6]])
110+
111+
result = blosc2.jit.__pandas_udf__.apply(
112+
data,
113+
add_one,
114+
args=(),
115+
kwargs={},
116+
decorator=blosc2.jit,
117+
axis=1,
118+
)
119+
expected = np.array([[2, 3, 4], [5, 6, 7]])
120+
assert np.array_equal(result, expected)

0 commit comments

Comments
 (0)