1
1
from __future__ import absolute_import , division , print_function
2
2
3
- from collections import Iterable
3
+ from collections import Iterable , defaultdict , deque
4
4
from functools import reduce
5
5
import numbers
6
6
import operator
9
9
import scipy .sparse
10
10
11
11
12
+ try : # Windows compatibility
13
+ int = long
14
+ except NameError :
15
+ pass
16
+
17
+
12
18
class COO (object ):
13
19
""" A Sparse Multidimensional Array
14
20
@@ -29,9 +35,9 @@ class COO(object):
29
35
--------
30
36
>>> x = np.eye(4)
31
37
>>> x[2, 3] = 5
32
- >>> s = COO.from_numpy (x)
38
+ >>> s = COO(x)
33
39
>>> s
34
- <COO: shape=(4, 4), dtype=float64, nnz=5>
40
+ <COO: shape=(4, 4), dtype=float64, nnz=5, sorted=True, duplicates=False >
35
41
>>> s.data
36
42
array([ 1., 1., 1., 5., 1.])
37
43
>>> s.coords
@@ -50,9 +56,9 @@ class COO(object):
50
56
>>> data = [1, 2, 3, 4, 5]
51
57
>>> y = COO(coords, data, shape=(3, 4, 5))
52
58
>>> y
53
- <COO: shape=(3, 4, 5), dtype=int64, nnz=5>
59
+ <COO: shape=(3, 4, 5), dtype=int64, nnz=5, sorted=False, duplicates=True >
54
60
>>> tensordot(s, y, axes=(0, 1))
55
- <COO: shape=(4, 3, 5), dtype=float64, nnz=6>
61
+ <COO: shape=(4, 3, 5), dtype=float64, nnz=6, sorted=False, duplicates=False >
56
62
57
63
Following scipy.sparse conventions you can also pass these as a tuple with
58
64
rows and columns
@@ -73,7 +79,7 @@ class COO(object):
73
79
74
80
>>> d = {(0, 0, 0): 1, (1, 2, 3): 2, (1, 1, 0): 3}
75
81
>>> COO(d)
76
- <COO: shape=(2, 3, 4), dtype=int64, nnz=3>
82
+ <COO: shape=(2, 3, 4), dtype=int64, nnz=3, sorted=False, duplicates=False >
77
83
78
84
>>> L = [((0, 0), 1),
79
85
... ((1, 1), 2),
@@ -89,17 +95,21 @@ class COO(object):
89
95
"""
90
96
__array_priority__ = 12
91
97
92
- def __init__ (self , coords , data = None , shape = None , has_duplicates = True ):
98
+ def __init__ (self , coords , data = None , shape = None , has_duplicates = True ,
99
+ sorted = False ):
100
+ self ._cache = defaultdict (lambda : deque (maxlen = 3 ))
93
101
if data is None :
94
102
# {(i, j, k): x, (i, j, k): y, ...}
95
103
if isinstance (coords , dict ):
96
104
coords = list (coords .items ())
105
+ has_duplicates = False
97
106
98
107
if isinstance (coords , np .ndarray ):
99
108
result = COO .from_numpy (coords )
100
109
self .coords = result .coords
101
110
self .data = result .data
102
111
self .has_duplicates = result .has_duplicates
112
+ self .sorted = result .sorted
103
113
self .shape = result .shape
104
114
return
105
115
@@ -143,6 +153,7 @@ def __init__(self, coords, data=None, shape=None, has_duplicates=True):
143
153
self .coords = self .coords .astype (dtype )
144
154
assert not self .shape or len (data ) == self .coords .shape [1 ]
145
155
self .has_duplicates = has_duplicates
156
+ self .sorted = sorted
146
157
147
158
@classmethod
148
159
def from_numpy (cls , x ):
@@ -153,7 +164,8 @@ def from_numpy(cls, x):
153
164
else :
154
165
coords = []
155
166
data = x
156
- return cls (coords , data , shape = x .shape )
167
+ return cls (coords , data , shape = x .shape , has_duplicates = False ,
168
+ sorted = True )
157
169
158
170
def todense (self ):
159
171
self = self .sum_duplicates ()
@@ -169,7 +181,9 @@ def from_scipy_sparse(cls, x):
169
181
coords = np .empty ((2 , x .nnz ), dtype = x .row .dtype )
170
182
coords [0 , :] = x .row
171
183
coords [1 , :] = x .col
172
- return COO (coords , x .data , shape = x .shape , has_duplicates = not x .has_canonical_format )
184
+ return COO (coords , x .data , shape = x .shape ,
185
+ has_duplicates = not x .has_canonical_format ,
186
+ sorted = x .has_canonical_format )
173
187
174
188
@property
175
189
def dtype (self ):
@@ -235,11 +249,14 @@ def __getitem__(self, index):
235
249
shape = tuple (shape )
236
250
data = self .data [mask ]
237
251
238
- return COO (coords , data , shape = shape , has_duplicates = self .has_duplicates )
252
+ return COO (coords , data , shape = shape ,
253
+ has_duplicates = self .has_duplicates ,
254
+ sorted = self .sorted )
239
255
240
256
def __str__ (self ):
241
- return "<COO: shape=%s, dtype=%s, nnz=%d>" % (self .shape , self .dtype ,
242
- self .nnz )
257
+ return "<COO: shape=%s, dtype=%s, nnz=%d, sorted=%s, duplicates=%s>" % (
258
+ self .shape , self .dtype , self .nnz , self .sorted ,
259
+ self .has_duplicates )
243
260
244
261
__repr__ = __str__
245
262
@@ -272,6 +289,8 @@ def reduction(self, method, axis=None, keepdims=False, dtype=None):
272
289
a = getattr (a , method )(axis = 0 , ** kwargs )
273
290
if isinstance (a , scipy .sparse .spmatrix ):
274
291
a = COO .from_scipy_sparse (a )
292
+ a .sorted = self .sorted
293
+ a .has_duplicates = False
275
294
elif isinstance (a , np .matrix ):
276
295
a = np .asarray (a )[0 ]
277
296
a = COO .from_numpy (a )
@@ -328,40 +347,73 @@ def T(self):
328
347
def dot (self , other ):
329
348
return dot (self , other )
330
349
350
+ def linear_loc (self , signed = False ):
351
+ """ Index location of every piece of data in a flattened array
352
+
353
+ This is used internally to check for duplicates, re-order, reshape,
354
+ etc..
355
+ """
356
+ n = reduce (operator .mul , self .shape )
357
+ if signed :
358
+ n = - n
359
+ dtype = np .min_scalar_type (n )
360
+ out = np .zeros (self .nnz , dtype = dtype )
361
+ tmp = np .zeros (self .nnz , dtype = dtype )
362
+ strides = 1
363
+ for i , d in enumerate (self .shape [::- 1 ]):
364
+ # out += self.coords[-(i + 1), :].astype(dtype) * strides
365
+ np .multiply (self .coords [- (i + 1 ), :], strides , out = tmp , dtype = dtype )
366
+ np .add (tmp , out , out = out )
367
+ strides *= d
368
+ return out
369
+
331
370
def reshape (self , shape ):
332
371
if self .shape == shape :
333
372
return self
334
373
if any (d == - 1 for d in shape ):
335
- extra = np . uint64 (np .prod (self .shape ) /
336
- np .prod ([d for d in shape if d != - 1 ]))
374
+ extra = int (np .prod (self .shape ) /
375
+ np .prod ([d for d in shape if d != - 1 ]))
337
376
shape = tuple ([d if d != - 1 else extra for d in shape ])
338
377
if self .shape == shape :
339
378
return self
340
379
# TODO: this np.prod(self.shape) enforces a 2**64 limit to array size
341
- dtype = np .min_scalar_type (np .prod (self .shape ))
342
- linear_loc = np .zeros (self .nnz , dtype = dtype )
343
- strides = 1
344
- for i , d in enumerate (self .shape [::- 1 ]):
345
- linear_loc += self .coords [- (i + 1 ), :].astype (dtype ) * strides
346
- strides *= d
380
+ linear_loc = self .linear_loc ()
347
381
348
382
coords = np .empty ((len (shape ), self .nnz ), dtype = np .min_scalar_type (max (shape )))
349
383
strides = 1
350
384
for i , d in enumerate (shape [::- 1 ]):
351
385
coords [- (i + 1 ), :] = (linear_loc // strides ) % d
352
386
strides *= d
353
387
354
- return COO (coords , self .data , shape , has_duplicates = self .has_duplicates )
388
+ return COO (coords , self .data , shape ,
389
+ has_duplicates = self .has_duplicates ,
390
+ sorted = self .sorted )
355
391
356
392
def to_scipy_sparse (self ):
357
393
assert self .ndim == 2
358
394
result = scipy .sparse .coo_matrix ((self .data ,
359
395
(self .coords [0 ],
360
396
self .coords [1 ])),
361
397
shape = self .shape )
362
- result .has_canonical_format = not self .has_duplicates
398
+ result .has_canonical_format = ( not self .has_duplicates and self . sorted )
363
399
return result
364
400
401
+ def _tocsr (self ):
402
+ assert self .ndim == 2
403
+
404
+ # Pass 1: sum duplicates
405
+ self .sum_duplicates ()
406
+
407
+ # Pass 2: sort indices
408
+ self .sort_indices ()
409
+ row , col = self .coords
410
+
411
+ # Pass 3: count nonzeros in each row
412
+ indptr = np .zeros (self .shape [0 ] + 1 , dtype = np .int64 )
413
+ np .cumsum (np .bincount (row , minlength = self .shape [0 ]), out = indptr [1 :])
414
+
415
+ return scipy .sparse .csr_matrix ((self .data , col , indptr ), shape = self .shape )
416
+
365
417
def tocsr (self ):
366
418
try :
367
419
return self ._csr
@@ -373,10 +425,8 @@ def tocsr(self):
373
425
except AttributeError :
374
426
pass
375
427
376
- coo = self .to_scipy_sparse ()
377
- csr = coo .tocsr ()
378
- self ._csr = csr
379
- return csr
428
+ self ._csr = self ._tocsr ()
429
+ return self ._csr
380
430
381
431
def tocsc (self ):
382
432
try :
@@ -388,10 +438,25 @@ def tocsc(self):
388
438
return self ._csc
389
439
except AttributeError :
390
440
pass
391
- coo = self .to_scipy_sparse ()
392
- csc = coo .tocsc ()
393
- self ._csc = csc
394
- return csc
441
+
442
+ self ._csc = self .tocsr ().tocsc ()
443
+ return self ._csc
444
+
445
+ def sort_indices (self ):
446
+ if self .sorted :
447
+ return
448
+
449
+ linear = self .linear_loc (signed = True )
450
+
451
+ if (np .diff (linear ) > 0 ).all (): # already sorted
452
+ self .sorted = True
453
+ return self
454
+
455
+ order = np .argsort (linear )
456
+ self .coords = self .coords [:, order ]
457
+ self .data = self .data [order ]
458
+ self .sorted = True
459
+ return self
395
460
396
461
def sum_duplicates (self ):
397
462
# Inspired by scipy/sparse/coo.py::sum_duplicates
@@ -400,15 +465,21 @@ def sum_duplicates(self):
400
465
return self
401
466
if not np .prod (self .coords .shape ):
402
467
return self
403
- order = np .lexsort (self .coords )
404
- coords = self .coords [:, order ]
405
- data = self .data [order ]
406
- unique_mask = (coords [:, 1 :] != coords [:, :- 1 ]).any (axis = 0 )
468
+
469
+ self .sort_indices ()
470
+
471
+ linear = self .linear_loc ()
472
+ unique_mask = np .diff (linear ) != 0
473
+
474
+ if unique_mask .sum () == len (unique_mask ): # already unique
475
+ self .has_duplicates = False
476
+ return self
477
+
407
478
unique_mask = np .append (True , unique_mask )
408
479
409
- coords = coords [:, unique_mask ]
480
+ coords = self . coords [:, unique_mask ]
410
481
(unique_inds ,) = np .nonzero (unique_mask )
411
- data = np .add .reduceat (data , unique_inds , dtype = data .dtype )
482
+ data = np .add .reduceat (self . data , unique_inds , dtype = self . data .dtype )
412
483
413
484
self .data = data
414
485
self .coords = coords
@@ -430,7 +501,8 @@ def __radd__(self, other):
430
501
return self + other
431
502
432
503
def __neg__ (self ):
433
- return COO (self .coords , - self .data , self .shape , self .has_duplicates )
504
+ return COO (self .coords , - self .data , self .shape , self .has_duplicates ,
505
+ self .sorted )
434
506
435
507
def __sub__ (self , other ):
436
508
return self + (- other )
@@ -462,7 +534,9 @@ def elemwise(self, func, *args, **kwargs):
462
534
raise ValueError ("Performing this operation would produce "
463
535
"a dense result: %s" % str (func ))
464
536
return COO (self .coords , func (self .data , * args , ** kwargs ),
465
- shape = self .shape , has_duplicates = self .has_duplicates )
537
+ shape = self .shape ,
538
+ has_duplicates = self .has_duplicates ,
539
+ sorted = self .sorted )
466
540
467
541
def elemwise_binary (self , func , other , * args , ** kwargs ):
468
542
assert isinstance (other , COO )
@@ -675,6 +749,7 @@ def tensordot(a, b, axes=2):
675
749
res = res .todense ()
676
750
else :
677
751
res = COO .from_scipy_sparse (res ) # <--- modified
752
+ res .has_duplicates = False
678
753
if isinstance (res , np .matrix ):
679
754
res = np .asarray (res )
680
755
return res .reshape (olda + oldb )
@@ -743,7 +818,8 @@ def concatenate(arrays, axis=0):
743
818
shape [axis ] = dim
744
819
has_duplicates = any (x .has_duplicates for x in arrays )
745
820
746
- return COO (coords , data , shape = shape , has_duplicates = has_duplicates )
821
+ return COO (coords , data , shape = shape , has_duplicates = has_duplicates ,
822
+ sorted = (axis == 0 ) and all (a .sorted for a in arrays ))
747
823
748
824
749
825
def stack (arrays , axis = 0 ):
@@ -769,4 +845,5 @@ def stack(arrays, axis=0):
769
845
coords .insert (axis , new )
770
846
coords = np .stack (coords , axis = 0 )
771
847
772
- return COO (coords , data , shape = shape , has_duplicates = has_duplicates )
848
+ return COO (coords , data , shape = shape , has_duplicates = has_duplicates ,
849
+ sorted = (axis == 0 ) and all (a .sorted for a in arrays ))
0 commit comments