Skip to content

Commit c81f4da

Browse files
committed
Add tocsr method and sorted checking
pull out linear_loc to separate function
1 parent 973cac3 commit c81f4da

File tree

3 files changed

+132
-42
lines changed

3 files changed

+132
-42
lines changed

sparse/core.py

Lines changed: 117 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from __future__ import absolute_import, division, print_function
22

3-
from collections import Iterable
3+
from collections import Iterable, defaultdict, deque
44
from functools import reduce
55
import numbers
66
import operator
@@ -9,6 +9,12 @@
99
import scipy.sparse
1010

1111

12+
try: # Windows compatibility
13+
int = long
14+
except NameError:
15+
pass
16+
17+
1218
class COO(object):
1319
""" A Sparse Multidimensional Array
1420
@@ -29,9 +35,9 @@ class COO(object):
2935
--------
3036
>>> x = np.eye(4)
3137
>>> x[2, 3] = 5
32-
>>> s = COO.from_numpy(x)
38+
>>> s = COO(x)
3339
>>> s
34-
<COO: shape=(4, 4), dtype=float64, nnz=5>
40+
<COO: shape=(4, 4), dtype=float64, nnz=5, sorted=True, duplicates=False>
3541
>>> s.data
3642
array([ 1., 1., 1., 5., 1.])
3743
>>> s.coords
@@ -50,9 +56,9 @@ class COO(object):
5056
>>> data = [1, 2, 3, 4, 5]
5157
>>> y = COO(coords, data, shape=(3, 4, 5))
5258
>>> y
53-
<COO: shape=(3, 4, 5), dtype=int64, nnz=5>
59+
<COO: shape=(3, 4, 5), dtype=int64, nnz=5, sorted=False, duplicates=True>
5460
>>> tensordot(s, y, axes=(0, 1))
55-
<COO: shape=(4, 3, 5), dtype=float64, nnz=6>
61+
<COO: shape=(4, 3, 5), dtype=float64, nnz=6, sorted=False, duplicates=False>
5662
5763
Following scipy.sparse conventions you can also pass these as a tuple with
5864
rows and columns
@@ -73,7 +79,7 @@ class COO(object):
7379
7480
>>> d = {(0, 0, 0): 1, (1, 2, 3): 2, (1, 1, 0): 3}
7581
>>> COO(d)
76-
<COO: shape=(2, 3, 4), dtype=int64, nnz=3>
82+
<COO: shape=(2, 3, 4), dtype=int64, nnz=3, sorted=False, duplicates=False>
7783
7884
>>> L = [((0, 0), 1),
7985
... ((1, 1), 2),
@@ -89,17 +95,21 @@ class COO(object):
8995
"""
9096
__array_priority__ = 12
9197

92-
def __init__(self, coords, data=None, shape=None, has_duplicates=True):
98+
def __init__(self, coords, data=None, shape=None, has_duplicates=True,
99+
sorted=False):
100+
self._cache = defaultdict(lambda: deque(maxlen=3))
93101
if data is None:
94102
# {(i, j, k): x, (i, j, k): y, ...}
95103
if isinstance(coords, dict):
96104
coords = list(coords.items())
105+
has_duplicates = False
97106

98107
if isinstance(coords, np.ndarray):
99108
result = COO.from_numpy(coords)
100109
self.coords = result.coords
101110
self.data = result.data
102111
self.has_duplicates = result.has_duplicates
112+
self.sorted = result.sorted
103113
self.shape = result.shape
104114
return
105115

@@ -143,6 +153,7 @@ def __init__(self, coords, data=None, shape=None, has_duplicates=True):
143153
self.coords = self.coords.astype(dtype)
144154
assert not self.shape or len(data) == self.coords.shape[1]
145155
self.has_duplicates = has_duplicates
156+
self.sorted = sorted
146157

147158
@classmethod
148159
def from_numpy(cls, x):
@@ -153,7 +164,8 @@ def from_numpy(cls, x):
153164
else:
154165
coords = []
155166
data = x
156-
return cls(coords, data, shape=x.shape)
167+
return cls(coords, data, shape=x.shape, has_duplicates=False,
168+
sorted=True)
157169

158170
def todense(self):
159171
self = self.sum_duplicates()
@@ -169,7 +181,9 @@ def from_scipy_sparse(cls, x):
169181
coords = np.empty((2, x.nnz), dtype=x.row.dtype)
170182
coords[0, :] = x.row
171183
coords[1, :] = x.col
172-
return COO(coords, x.data, shape=x.shape, has_duplicates=not x.has_canonical_format)
184+
return COO(coords, x.data, shape=x.shape,
185+
has_duplicates=not x.has_canonical_format,
186+
sorted=x.has_canonical_format)
173187

174188
@property
175189
def dtype(self):
@@ -235,11 +249,14 @@ def __getitem__(self, index):
235249
shape = tuple(shape)
236250
data = self.data[mask]
237251

238-
return COO(coords, data, shape=shape, has_duplicates=self.has_duplicates)
252+
return COO(coords, data, shape=shape,
253+
has_duplicates=self.has_duplicates,
254+
sorted=self.sorted)
239255

240256
def __str__(self):
241-
return "<COO: shape=%s, dtype=%s, nnz=%d>" % (self.shape, self.dtype,
242-
self.nnz)
257+
return "<COO: shape=%s, dtype=%s, nnz=%d, sorted=%s, duplicates=%s>" % (
258+
self.shape, self.dtype, self.nnz, self.sorted,
259+
self.has_duplicates)
243260

244261
__repr__ = __str__
245262

@@ -272,6 +289,8 @@ def reduction(self, method, axis=None, keepdims=False, dtype=None):
272289
a = getattr(a, method)(axis=0, **kwargs)
273290
if isinstance(a, scipy.sparse.spmatrix):
274291
a = COO.from_scipy_sparse(a)
292+
a.sorted = self.sorted
293+
a.has_duplicates = False
275294
elif isinstance(a, np.matrix):
276295
a = np.asarray(a)[0]
277296
a = COO.from_numpy(a)
@@ -328,40 +347,73 @@ def T(self):
328347
def dot(self, other):
329348
return dot(self, other)
330349

350+
def linear_loc(self, signed=False):
351+
""" Index location of every piece of data in a flattened array
352+
353+
This is used internally to check for duplicates, re-order, reshape,
354+
etc..
355+
"""
356+
n = reduce(operator.mul, self.shape)
357+
if signed:
358+
n = -n
359+
dtype = np.min_scalar_type(n)
360+
out = np.zeros(self.nnz, dtype=dtype)
361+
tmp = np.zeros(self.nnz, dtype=dtype)
362+
strides = 1
363+
for i, d in enumerate(self.shape[::-1]):
364+
# out += self.coords[-(i + 1), :].astype(dtype) * strides
365+
np.multiply(self.coords[-(i + 1), :], strides, out=tmp, dtype=dtype)
366+
np.add(tmp, out, out=out)
367+
strides *= d
368+
return out
369+
331370
def reshape(self, shape):
332371
if self.shape == shape:
333372
return self
334373
if any(d == -1 for d in shape):
335-
extra = np.uint64(np.prod(self.shape) /
336-
np.prod([d for d in shape if d != -1]))
374+
extra = int(np.prod(self.shape) /
375+
np.prod([d for d in shape if d != -1]))
337376
shape = tuple([d if d != -1 else extra for d in shape])
338377
if self.shape == shape:
339378
return self
340379
# TODO: this np.prod(self.shape) enforces a 2**64 limit to array size
341-
dtype = np.min_scalar_type(np.prod(self.shape))
342-
linear_loc = np.zeros(self.nnz, dtype=dtype)
343-
strides = 1
344-
for i, d in enumerate(self.shape[::-1]):
345-
linear_loc += self.coords[-(i + 1), :].astype(dtype) * strides
346-
strides *= d
380+
linear_loc = self.linear_loc()
347381

348382
coords = np.empty((len(shape), self.nnz), dtype=np.min_scalar_type(max(shape)))
349383
strides = 1
350384
for i, d in enumerate(shape[::-1]):
351385
coords[-(i + 1), :] = (linear_loc // strides) % d
352386
strides *= d
353387

354-
return COO(coords, self.data, shape, has_duplicates=self.has_duplicates)
388+
return COO(coords, self.data, shape,
389+
has_duplicates=self.has_duplicates,
390+
sorted=self.sorted)
355391

356392
def to_scipy_sparse(self):
357393
assert self.ndim == 2
358394
result = scipy.sparse.coo_matrix((self.data,
359395
(self.coords[0],
360396
self.coords[1])),
361397
shape=self.shape)
362-
result.has_canonical_format = not self.has_duplicates
398+
result.has_canonical_format = (not self.has_duplicates and self.sorted)
363399
return result
364400

401+
def _tocsr(self):
402+
assert self.ndim == 2
403+
404+
# Pass 1: sum duplicates
405+
self.sum_duplicates()
406+
407+
# Pass 2: sort indices
408+
self.sort_indices()
409+
row, col = self.coords
410+
411+
# Pass 3: count nonzeros in each row
412+
indptr = np.zeros(self.shape[0] + 1, dtype=np.int64)
413+
np.cumsum(np.bincount(row, minlength=self.shape[0]), out=indptr[1:])
414+
415+
return scipy.sparse.csr_matrix((self.data, col, indptr), shape=self.shape)
416+
365417
def tocsr(self):
366418
try:
367419
return self._csr
@@ -373,10 +425,8 @@ def tocsr(self):
373425
except AttributeError:
374426
pass
375427

376-
coo = self.to_scipy_sparse()
377-
csr = coo.tocsr()
378-
self._csr = csr
379-
return csr
428+
self._csr = self._tocsr()
429+
return self._csr
380430

381431
def tocsc(self):
382432
try:
@@ -388,10 +438,25 @@ def tocsc(self):
388438
return self._csc
389439
except AttributeError:
390440
pass
391-
coo = self.to_scipy_sparse()
392-
csc = coo.tocsc()
393-
self._csc = csc
394-
return csc
441+
442+
self._csc = self.tocsr().tocsc()
443+
return self._csc
444+
445+
def sort_indices(self):
446+
if self.sorted:
447+
return
448+
449+
linear = self.linear_loc(signed=True)
450+
451+
if (np.diff(linear) > 0).all(): # already sorted
452+
self.sorted = True
453+
return self
454+
455+
order = np.argsort(linear)
456+
self.coords = self.coords[:, order]
457+
self.data = self.data[order]
458+
self.sorted = True
459+
return self
395460

396461
def sum_duplicates(self):
397462
# Inspired by scipy/sparse/coo.py::sum_duplicates
@@ -400,15 +465,21 @@ def sum_duplicates(self):
400465
return self
401466
if not np.prod(self.coords.shape):
402467
return self
403-
order = np.lexsort(self.coords)
404-
coords = self.coords[:, order]
405-
data = self.data[order]
406-
unique_mask = (coords[:, 1:] != coords[:, :-1]).any(axis=0)
468+
469+
self.sort_indices()
470+
471+
linear = self.linear_loc()
472+
unique_mask = np.diff(linear) != 0
473+
474+
if unique_mask.sum() == len(unique_mask): # already unique
475+
self.has_duplicates = False
476+
return self
477+
407478
unique_mask = np.append(True, unique_mask)
408479

409-
coords = coords[:, unique_mask]
480+
coords = self.coords[:, unique_mask]
410481
(unique_inds,) = np.nonzero(unique_mask)
411-
data = np.add.reduceat(data, unique_inds, dtype=data.dtype)
482+
data = np.add.reduceat(self.data, unique_inds, dtype=self.data.dtype)
412483

413484
self.data = data
414485
self.coords = coords
@@ -430,7 +501,8 @@ def __radd__(self, other):
430501
return self + other
431502

432503
def __neg__(self):
433-
return COO(self.coords, -self.data, self.shape, self.has_duplicates)
504+
return COO(self.coords, -self.data, self.shape, self.has_duplicates,
505+
self.sorted)
434506

435507
def __sub__(self, other):
436508
return self + (-other)
@@ -462,7 +534,9 @@ def elemwise(self, func, *args, **kwargs):
462534
raise ValueError("Performing this operation would produce "
463535
"a dense result: %s" % str(func))
464536
return COO(self.coords, func(self.data, *args, **kwargs),
465-
shape=self.shape, has_duplicates=self.has_duplicates)
537+
shape=self.shape,
538+
has_duplicates=self.has_duplicates,
539+
sorted=self.sorted)
466540

467541
def elemwise_binary(self, func, other, *args, **kwargs):
468542
assert isinstance(other, COO)
@@ -675,6 +749,7 @@ def tensordot(a, b, axes=2):
675749
res = res.todense()
676750
else:
677751
res = COO.from_scipy_sparse(res) # <--- modified
752+
res.has_duplicates = False
678753
if isinstance(res, np.matrix):
679754
res = np.asarray(res)
680755
return res.reshape(olda + oldb)
@@ -743,7 +818,8 @@ def concatenate(arrays, axis=0):
743818
shape[axis] = dim
744819
has_duplicates = any(x.has_duplicates for x in arrays)
745820

746-
return COO(coords, data, shape=shape, has_duplicates=has_duplicates)
821+
return COO(coords, data, shape=shape, has_duplicates=has_duplicates,
822+
sorted=(axis == 0) and all(a.sorted for a in arrays))
747823

748824

749825
def stack(arrays, axis=0):
@@ -769,4 +845,5 @@ def stack(arrays, axis=0):
769845
coords.insert(axis, new)
770846
coords = np.stack(coords, axis=0)
771847

772-
return COO(coords, data, shape=shape, has_duplicates=has_duplicates)
848+
return COO(coords, data, shape=shape, has_duplicates=has_duplicates,
849+
sorted=(axis == 0) and all(a.sorted for a in arrays))

sparse/tests/test_core.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def test_large_reshape():
7171
col = row % m # np.random.randint(0, m, size=n, dtype=np.uint16)
7272
data = np.ones(n, dtype=np.uint8)
7373

74-
x = COO((data, (row, col)))
74+
x = COO((data, (row, col)), sorted=True, has_duplicates=False)
7575

7676
assert_eq(x, x.reshape(x.shape))
7777

@@ -366,8 +366,8 @@ def test_scalar_exponentiation():
366366

367367
def test_create_with_lists_of_tuples():
368368
L = [((0, 0, 0), 1),
369-
((1, 1, 1), 2),
370369
((1, 2, 1), 1),
370+
((1, 1, 1), 2),
371371
((1, 3, 2), 3)]
372372

373373
s = COO(L)

sparse/utils.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,18 @@
11
import numpy as np
2+
from .core import COO
23

34

45
def assert_eq(x, y):
56
assert x.shape == y.shape
67
assert x.dtype == y.dtype
8+
9+
if isinstance(x, COO):
10+
if x.sorted:
11+
assert is_lexsorted(x)
12+
if isinstance(y, COO):
13+
if y.sorted:
14+
assert is_lexsorted(y)
15+
716
if hasattr(x, 'todense'):
817
xx = x.todense()
918
else:
@@ -13,3 +22,7 @@ def assert_eq(x, y):
1322
else:
1423
yy = y
1524
assert np.allclose(xx, yy)
25+
26+
27+
def is_lexsorted(x):
28+
return not x.shape or (np.diff(x.linear_loc()) > 0).all()

0 commit comments

Comments
 (0)