Skip to content

Commit e9e8f19

Browse files
Unique and (#9)
1 parent 490fe65 commit e9e8f19

File tree

2 files changed

+39
-20
lines changed

2 files changed

+39
-20
lines changed

cyberpandas/ip_array.py

Lines changed: 12 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -452,28 +452,20 @@ def index_type(self):
452452

453453
def unique(self):
    # type: () -> ExtensionArray
    """Return the distinct values, in order of first appearance.

    ``np.unique`` returns its results sorted, so instead of using the
    sorted values directly we ask for the index of the first occurrence
    of every distinct value, put those indices back in ascending
    (i.e. original) order and take the corresponding elements.

    Returns
    -------
    Whatever ``self._from_ndarray`` builds from the de-duplicated
    ``self.data``; one element per distinct value.
    """
    # https://github.com/pandas-dev/pandas/pull/19869
    _, first_seen = np.unique(self.data, return_index=True)
    # Sorting the first-occurrence positions undoes np.unique's sort.
    data = self.data.take(np.sort(first_seen))
    return self._from_ndarray(data)
456459

457-
def factorize(self, sort=False):
    """Encode the array as integer labels plus its distinct values.

    Parameters
    ----------
    sort : bool, default False
        If True, the uniques are returned in ascending order (the order
        ``np.unique`` produces). If False, they are returned in order of
        first appearance.

    Returns
    -------
    labels : ndarray of integers
        For every element of ``self.data``, the position of its value
        in ``uniques``.
    uniques
        The distinct values, built with ``self._from_ndarray`` for both
        values of ``sort`` so the return type does not depend on it.
    """
    # XXX: Verify this, check for better algo
    # np.unique sorts: ``sorted_uniques`` is ascending, ``first_seen``
    # holds the first position of each distinct value and ``labels``
    # indexes into ``sorted_uniques``.
    sorted_uniques, first_seen, labels = np.unique(self.data,
                                                   return_index=True,
                                                   return_inverse=True)
    if sort:
        # Wrap the raw ndarray so callers get the same container type
        # as in the unsorted branch.
        return labels, self._from_ndarray(sorted_uniques)

    # Unsort, since np.unique sorts
    uniques = self._from_ndarray(self.data.take(np.sort(first_seen)))
    # argsort of the appearance-ordered uniques maps "k-th smallest" to
    # its position in appearance order, which is exactly the relabelling
    # the sorted labels need.
    labels = np.argsort(uniques.data).take(labels)
    return labels, uniques
478470

479471

cyberpandas/test_ip.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,3 +278,30 @@ def test_bytes_roundtrip():
278278

279279
result = ip.IPArray.from_bytes(bytestring)
280280
assert result.equals(arr)
281+
282+
283+
def test_unique():
    """``IPArray.unique`` keeps the container type and matches ``pd.unique``."""
    values = ip.IPArray([3, 3, 1, 2, 3, _U8_MAX + 1])
    distinct = values.unique()
    assert isinstance(distinct, ip.IPArray)

    # Compare as object arrays against pandas' own unique on the same data.
    distinct_objects = distinct.astype(object)
    reference = pd.unique(values.astype(object))
    tm.assert_numpy_array_equal(distinct_objects, reference)
291+
292+
293+
@pytest.mark.parametrize('sort', [
    pytest.param(True, marks=pytest.mark.xfail(reason="Upstream sort_values")),
    False,
])
def test_factorize(sort):
    """``IPArray.factorize`` agrees with ``pd.factorize`` on the object view."""
    values = ip.IPArray([3, 3, 1, 2, 3, _U8_MAX + 1])
    codes, distinct = values.factorize(sort=sort)
    want_codes, want_distinct = pd.factorize(values.astype(object),
                                             sort=sort)

    assert isinstance(distinct, ip.IPArray)

    # Compare the uniques as an object array, like the pandas reference.
    distinct_objects = distinct.astype(object)
    tm.assert_numpy_array_equal(codes, want_codes)
    tm.assert_numpy_array_equal(distinct_objects, want_distinct)

0 commit comments

Comments
 (0)