Skip to content

Commit 6b5d262

Browse files
Faster isin comparisons (#7)
* Faster isin comparisons * Simplify * Updated docstring * Support IPArray comparison * Linting * Py2 compat
1 parent ac4a7f1 commit 6b5d262

File tree

5 files changed

+157
-17
lines changed

5 files changed

+157
-17
lines changed

cyberpandas/ip_array.py

Lines changed: 74 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
delegated_method)
1515
from ._utils import combine, pack, unpack
1616
from .common import _U8_MAX, _IPv4_MAX
17-
from .parser import _to_ipaddress_pyint
17+
from .parser import _to_ipaddress_pyint, _as_ip_object
1818

1919
# -----------------------------------------------------------------------------
2020
# Extension Type
@@ -286,34 +286,94 @@ def packed(self):
286286
return self.data.tobytes()
287287

288288
def isin(self, other):
    """Check whether elements of 'self' are in 'other'.

    Comparison is done elementwise.

    Parameters
    ----------
    other : str, sequence, or IPArray
        A single value (for example a ``str``) is boxed into a
        one-element list. Each element is then parsed as an IP
        address (v4 or v6) or, failing that, as an
        :class:`ipaddress.IPv4Network` or
        :class:`ipaddress.IPv6Network`.
        If an element cannot be parsed as either, a ValueError is
        raised.

        For a sequence of strings, the same conversion is attempted.
        You should not mix networks with addresses.

        Finally, other may be an ``IPArray`` of addresses to compare to.

    Returns
    -------
    contained : ndarray
        A 1-D boolean ndarray with the same length as self.

    Examples
    --------
    Comparison to a single network

    >>> s = IPArray(['192.168.1.1', '255.255.255.255'])
    >>> s.isin('192.168.1.0/24')
    array([ True, False])

    Comparison to many networks

    >>> s.isin(['192.168.1.0/24', '192.168.2.0/24'])
    array([ True, False])

    Comparison to many IP Addresses

    >>> s.isin(['192.168.1.1', '192.168.1.2', '255.255.255.1'])
    array([ True, False])
    """
    # ``collections.Sequence`` was removed in Python 3.10; prefer the
    # ``collections.abc`` location and fall back for Python 2.
    try:
        from collections.abc import Sequence
    except ImportError:  # Python 2
        from collections import Sequence

    # Box scalars (including a bare string, which is itself a Sequence)
    # so the parsing loop below sees one candidate per element.
    box = (isinstance(other, str) or
           not isinstance(other, (IPArray, Sequence)))
    if box:
        other = [other]

    networks = []
    addresses = []

    if isinstance(other, IPArray):
        addresses = other
    else:
        for item in other:
            item = _as_ip_object(item)
            if isinstance(item, (ipaddress.IPv4Network,
                                 ipaddress.IPv6Network)):
                networks.append(item)
            elif isinstance(item, (ipaddress.IPv4Address,
                                   ipaddress.IPv6Address)):
                # Store the plain integer value. Wrapping the address in
                # an IPv6Network fails for IPv4 addresses and would hand
                # network objects to IPArray.
                addresses.append(int(item))

    # Flatten all the addresses
    addresses = IPArray(addresses)  # TODO: think about copy=False

    mask = np.zeros(len(self), dtype='bool')
    for network in networks:
        mask |= self._isin_network(network)

    # TODO: flatten the networks as well instead of looping over them.
    mask |= self._isin_addresses(addresses)
    return mask
316358

359+
def _isin_network(self, other):
    # type: (Union[ipaddress.IPv4Network,ipaddress.IPv6Network]) -> ndarray
    """Return which addresses of self lie inside the network 'other'."""
    # The network spans the closed range from its 'network_address' up to
    # its 'broadcast_address'. Each bound is wrapped in a one-element array
    # of the same type as self so the array comparison operators can be
    # used against every element of self.
    cls = type(self)
    lower = cls([other.network_address])
    upper = cls([other.broadcast_address])

    at_or_above_lower = lower <= self
    at_or_below_upper = self <= upper
    return at_or_above_lower & at_or_below_upper
370+
371+
def _isin_addresses(self, other):
    """Return, elementwise, whether each value of self occurs in other."""
    # Imported lazily, as in the original implementation.
    from pandas.core import algorithms
    # TODO(factorize): replace this
    return algorithms.isin(self, other)
376+
317377
def setitem(self, indexer, value):
318378
"""Set the 'value' inplace.
319379
"""

cyberpandas/parser.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,3 +81,23 @@ def _to_ipaddress_pyint(values):
8181

8282
values2 = [unpack(pack(x)) for x in values]
8383
return np.atleast_1d(np.asarray(values2, dtype=IPType.mybase))
84+
85+
86+
def _as_ip_object(val):
    """Attempt to parse 'val' as any IP object.

    Attempts to parse as these in order:

    - IP Address (v4 or v6)
    - IP Network (v4 or v6)

    Parameters
    ----------
    val : object
        A value accepted by :func:`ipaddress.ip_address` or
        :func:`ipaddress.ip_network`.

    Returns
    -------
    The address object if 'val' parses as an address, otherwise the
    network object.

    Raises
    ------
    ValueError
        If 'val' can be parsed neither as an address nor as a network.
    """
    try:
        return ipaddress.ip_address(val)
    except ValueError:
        # Not an address; fall through and try a network instead.
        pass

    try:
        return ipaddress.ip_network(val)
    except ValueError:
        raise ValueError("Could not parse {} as an address or "
                         "network".format(val))

cyberpandas/test_ip.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,13 +194,50 @@ def test_attributes(prop):
194194
tm.assert_numpy_array_equal(result, expected)
195195

196196

197-
def test_isin():
197+
def test_isin_all4():
    """IPv4 addresses are checked against a single IPv4 network."""
    addrs = ip.IPArray([u'192.168.1.1', u'255.255.255.255'])
    tm.assert_numpy_array_equal(
        addrs.isin([u'192.168.1.0/24']),
        np.array([True, False]),
    )
202202

203203

204+
def test_isin_all6():
    """IPv6 addresses are checked against a single IPv6 network."""
    addrs = ip.IPArray([u'2001:db8::1', u'2001:db9::1'])
    tm.assert_numpy_array_equal(
        addrs.isin([u'2001:db8::0/96']),
        np.array([True, False]),
    )
209+
210+
211+
def test_isin_mix():
    """An array mixing IPv4 and IPv6 is checked against v4/v6 networks."""
    mixed = ip.IPArray([u'192.168.1.1', u'255.255.255.255',
                        u'2001:db8::1', u'2001:db9::1'])

    # (networks passed to isin, expected membership flags)
    cases = [
        ([u'192.168.1.0/24'], [True, False, False, False]),
        ([u'2001:db8::0/96'], [False, False, True, False]),
        ([u'192.168.1.0/24', u'2001:db8::0/96'],
         [True, False, True, False]),
    ]
    for nets, flags in cases:
        tm.assert_numpy_array_equal(mixed.isin(nets), np.array(flags))

    v4_only = ip.IPArray([u'192.168.1.1', u'192.168.1.2',
                          u'255.255.255.255'])
    tm.assert_numpy_array_equal(
        v4_only.isin([u'192.168.1.0/24']),
        np.array([True, True, False]),
    )
232+
233+
234+
def test_isin_iparray():
    """isin accepts another IPArray of addresses to compare against."""
    haystack = ip.IPArray([30, 20])
    values = ip.IPArray([10, 20, 20, 30])
    tm.assert_numpy_array_equal(
        values.isin(haystack),
        np.array([False, True, True, True]),
    )
239+
240+
204241
def test_getitem_scalar():
205242
ser = ip.IPArray([0, 1, 2])
206243
result = ser[1]

cyberpandas/test_parser.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import ipaddress
2+
13
import pytest
24

35
from cyberpandas import parser, IPArray
@@ -31,3 +33,26 @@ def test_to_ipaddress_scalar():
3133
expected = parser.to_ipaddress([1])
3234
assert len(result) == 1
3335
assert all(result == expected)
36+
37+
38+
@pytest.mark.parametrize('val, expected', [
    # addresses: from text, from an integer, and passed through unchanged
    (u'192.168.1.1', ipaddress.IPv4Address(u'192.168.1.1')),
    (100, ipaddress.IPv4Address(100)),
    (ipaddress.IPv4Address(100), ipaddress.IPv4Address(100)),
    (2**64, ipaddress.IPv6Address(2**64)),
    # networks: from text and passed through unchanged
    (u'192.168.0.0/28', ipaddress.IPv4Network(u'192.168.0.0/28')),
    (ipaddress.IPv4Network(u'192.168.0.0/28'),
     ipaddress.IPv4Network(u'192.168.0.0/28')),
    (u'2001:db00::0/24', ipaddress.IPv6Network(u'2001:db00::0/24')),
])
def test_as_ip_object(val, expected):
    """Each input parses to the expected address or network object."""
    assert parser._as_ip_object(val) == expected
51+
52+
53+
@pytest.mark.parametrize("val", [u"129", -1])
def test_as_ip_object_raises(val):
    """Values that are neither an address nor a network raise ValueError."""
    with pytest.raises(ValueError):
        parser._as_ip_object(val)

docs/source/conf.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,5 @@
159159
]
160160

161161

162-
163-
164162
# Example configuration for intersphinx: refer to the Python standard library.
165163
intersphinx_mapping = {'https://docs.python.org/': None}

0 commit comments

Comments
 (0)