Add tocsr method and sorted checking

mrocklin · mrocklin · commit c81f4da0db02 · 2017-05-04T07:57:04.000-04:00
pull out linear_loc to separate function
diff --git a/sparse/core.py b/sparse/core.py
@@ -1,6 +1,6 @@
 from __future__ import absolute_import, division, print_function
 
-from collections import Iterable
+from collections import Iterable, defaultdict, deque
 from functools import reduce
 import numbers
 import operator
@@ -9,6 +9,12 @@
 import scipy.sparse
 
 
+try:  # Windows compatibility
+    int = long
+except NameError:
+    pass
+
+
 class COO(object):
     """ A Sparse Multidimensional Array
 
@@ -29,9 +35,9 @@ class COO(object):
     --------
     >>> x = np.eye(4)
     >>> x[2, 3] = 5
-    >>> s = COO.from_numpy(x)
+    >>> s = COO(x)
     >>> s
-    <COO: shape=(4, 4), dtype=float64, nnz=5>
+    <COO: shape=(4, 4), dtype=float64, nnz=5, sorted=True, duplicates=False>
     >>> s.data
     array([ 1.,  1.,  1.,  5.,  1.])
     >>> s.coords
@@ -50,9 +56,9 @@ class COO(object):
     >>> data = [1, 2, 3, 4, 5]
     >>> y = COO(coords, data, shape=(3, 4, 5))
     >>> y
-    <COO: shape=(3, 4, 5), dtype=int64, nnz=5>
+    <COO: shape=(3, 4, 5), dtype=int64, nnz=5, sorted=False, duplicates=True>
     >>> tensordot(s, y, axes=(0, 1))
-    <COO: shape=(4, 3, 5), dtype=float64, nnz=6>
+    <COO: shape=(4, 3, 5), dtype=float64, nnz=6, sorted=False, duplicates=False>
 
     Following scipy.sparse conventions you can also pass these as a tuple with
     rows and columns
@@ -73,7 +79,7 @@ class COO(object):
 
     >>> d = {(0, 0, 0): 1, (1, 2, 3): 2, (1, 1, 0): 3}
     >>> COO(d)
-    <COO: shape=(2, 3, 4), dtype=int64, nnz=3>
+    <COO: shape=(2, 3, 4), dtype=int64, nnz=3, sorted=False, duplicates=False>
 
     >>> L = [((0, 0), 1),
     ...      ((1, 1), 2),
@@ -89,17 +95,21 @@ class COO(object):
     """
     __array_priority__ = 12
 
-    def __init__(self, coords, data=None, shape=None, has_duplicates=True):
+    def __init__(self, coords, data=None, shape=None, has_duplicates=True,
+                 sorted=False):
+        self._cache = defaultdict(lambda: deque(maxlen=3))
         if data is None:
             # {(i, j, k): x, (i, j, k): y, ...}
             if isinstance(coords, dict):
                 coords = list(coords.items())
+                has_duplicates = False
 
             if isinstance(coords, np.ndarray):
                 result = COO.from_numpy(coords)
                 self.coords = result.coords
                 self.data = result.data
                 self.has_duplicates = result.has_duplicates
+                self.sorted = result.sorted
                 self.shape = result.shape
                 return
 
@@ -143,6 +153,7 @@ def __init__(self, coords, data=None, shape=None, has_duplicates=True):
         self.coords = self.coords.astype(dtype)
         assert not self.shape or len(data) == self.coords.shape[1]
         self.has_duplicates = has_duplicates
+        self.sorted = sorted
 
     @classmethod
     def from_numpy(cls, x):
@@ -153,7 +164,8 @@ def from_numpy(cls, x):
         else:
             coords = []
             data = x
-        return cls(coords, data, shape=x.shape)
+        return cls(coords, data, shape=x.shape, has_duplicates=False,
+                   sorted=True)
 
     def todense(self):
         self = self.sum_duplicates()
@@ -169,7 +181,9 @@ def from_scipy_sparse(cls, x):
         coords = np.empty((2, x.nnz), dtype=x.row.dtype)
         coords[0, :] = x.row
         coords[1, :] = x.col
-        return COO(coords, x.data, shape=x.shape, has_duplicates=not x.has_canonical_format)
+        return COO(coords, x.data, shape=x.shape,
+                   has_duplicates=not x.has_canonical_format,
+                   sorted=x.has_canonical_format)
 
     @property
     def dtype(self):
@@ -235,11 +249,14 @@ def __getitem__(self, index):
         shape = tuple(shape)
         data = self.data[mask]
 
-        return COO(coords, data, shape=shape, has_duplicates=self.has_duplicates)
+        return COO(coords, data, shape=shape,
+                   has_duplicates=self.has_duplicates,
+                   sorted=self.sorted)
 
     def __str__(self):
-        return "<COO: shape=%s, dtype=%s, nnz=%d>" % (self.shape, self.dtype,
-                self.nnz)
+        return "<COO: shape=%s, dtype=%s, nnz=%d, sorted=%s, duplicates=%s>" % (
+                self.shape, self.dtype, self.nnz, self.sorted,
+                self.has_duplicates)
 
     __repr__ = __str__
 
@@ -272,6 +289,8 @@ def reduction(self, method, axis=None, keepdims=False, dtype=None):
             a = getattr(a, method)(axis=0, **kwargs)
             if isinstance(a, scipy.sparse.spmatrix):
                 a = COO.from_scipy_sparse(a)
+                a.sorted = self.sorted
+                a.has_duplicates = False
             elif isinstance(a, np.matrix):
                 a = np.asarray(a)[0]
                 a = COO.from_numpy(a)
@@ -328,40 +347,73 @@ def T(self):
     def dot(self, other):
         return dot(self, other)
 
+    def linear_loc(self, signed=False):
+        """ Index location of every piece of data in a flattened array
+
+        This is used internally to check for duplicates, re-order, reshape,
+        etc..
+        """
+        n = reduce(operator.mul, self.shape)
+        if signed:
+            n = -n
+        dtype = np.min_scalar_type(n)
+        out = np.zeros(self.nnz, dtype=dtype)
+        tmp = np.zeros(self.nnz, dtype=dtype)
+        strides = 1
+        for i, d in enumerate(self.shape[::-1]):
+            # out += self.coords[-(i + 1), :].astype(dtype) * strides
+            np.multiply(self.coords[-(i + 1), :], strides, out=tmp, dtype=dtype)
+            np.add(tmp, out, out=out)
+            strides *= d
+        return out
+
     def reshape(self, shape):
         if self.shape == shape:
             return self
         if any(d == -1 for d in shape):
-            extra = np.uint64(np.prod(self.shape) /
-                              np.prod([d for d in shape if d != -1]))
+            extra = int(np.prod(self.shape) /
+                        np.prod([d for d in shape if d != -1]))
             shape = tuple([d if d != -1 else extra for d in shape])
         if self.shape == shape:
             return self
         # TODO: this np.prod(self.shape) enforces a 2**64 limit to array size
-        dtype = np.min_scalar_type(np.prod(self.shape))
-        linear_loc = np.zeros(self.nnz, dtype=dtype)
-        strides = 1
-        for i, d in enumerate(self.shape[::-1]):
-            linear_loc += self.coords[-(i + 1), :].astype(dtype) * strides
-            strides *= d
+        linear_loc = self.linear_loc()
 
         coords = np.empty((len(shape), self.nnz), dtype=np.min_scalar_type(max(shape)))
         strides = 1
         for i, d in enumerate(shape[::-1]):
             coords[-(i + 1), :] = (linear_loc // strides) % d
             strides *= d
 
-        return COO(coords, self.data, shape, has_duplicates=self.has_duplicates)
+        return COO(coords, self.data, shape,
+                   has_duplicates=self.has_duplicates,
+                   sorted=self.sorted)
 
     def to_scipy_sparse(self):
         assert self.ndim == 2
         result = scipy.sparse.coo_matrix((self.data,
                                           (self.coords[0],
                                            self.coords[1])),
                                           shape=self.shape)
-        result.has_canonical_format = not self.has_duplicates
+        result.has_canonical_format = (not self.has_duplicates and self.sorted)
         return result
 
+    def _tocsr(self):
+        assert self.ndim == 2
+
+        # Pass 1: sum duplicates
+        self.sum_duplicates()
+
+        # Pass 2: sort indices
+        self.sort_indices()
+        row, col = self.coords
+
+        # Pass 3: count nonzeros in each row
+        indptr = np.zeros(self.shape[0] + 1, dtype=np.int64)
+        np.cumsum(np.bincount(row, minlength=self.shape[0]), out=indptr[1:])
+
+        return scipy.sparse.csr_matrix((self.data, col, indptr), shape=self.shape)
+
     def tocsr(self):
         try:
             return self._csr
@@ -373,10 +425,8 @@ def tocsr(self):
         except AttributeError:
             pass
 
-        coo = self.to_scipy_sparse()
-        csr = coo.tocsr()
-        self._csr = csr
-        return csr
+        self._csr = self._tocsr()
+        return self._csr
 
     def tocsc(self):
         try:
@@ -388,10 +438,25 @@ def tocsc(self):
             return self._csc
         except AttributeError:
             pass
-        coo = self.to_scipy_sparse()
-        csc = coo.tocsc()
-        self._csc = csc
-        return csc
+
+        self._csc = self.tocsr().tocsc()
+        return self._csc
+
+    def sort_indices(self):
+        if self.sorted:
+            return
+
+        linear = self.linear_loc(signed=True)
+
+        if (np.diff(linear) > 0).all():  # already sorted
+            self.sorted = True
+            return self
+
+        order = np.argsort(linear)
+        self.coords = self.coords[:, order]
+        self.data = self.data[order]
+        self.sorted = True
+        return self
 
     def sum_duplicates(self):
         # Inspired by scipy/sparse/coo.py::sum_duplicates
@@ -400,15 +465,21 @@ def sum_duplicates(self):
             return self
         if not np.prod(self.coords.shape):
             return self
-        order = np.lexsort(self.coords)
-        coords = self.coords[:, order]
-        data = self.data[order]
-        unique_mask = (coords[:, 1:] != coords[:, :-1]).any(axis=0)
+
+        self.sort_indices()
+
+        linear = self.linear_loc()
+        unique_mask = np.diff(linear) != 0
+
+        if unique_mask.sum() == len(unique_mask):  # already unique
+            self.has_duplicates = False
+            return self
+
         unique_mask = np.append(True, unique_mask)
 
-        coords = coords[:, unique_mask]
+        coords = self.coords[:, unique_mask]
         (unique_inds,) = np.nonzero(unique_mask)
-        data = np.add.reduceat(data, unique_inds, dtype=data.dtype)
+        data = np.add.reduceat(self.data, unique_inds, dtype=self.data.dtype)
 
         self.data = data
         self.coords = coords
@@ -430,7 +501,8 @@ def __radd__(self, other):
         return self + other
 
     def __neg__(self):
-        return COO(self.coords, -self.data, self.shape, self.has_duplicates)
+        return COO(self.coords, -self.data, self.shape, self.has_duplicates,
+                   self.sorted)
 
     def __sub__(self, other):
         return self + (-other)
@@ -462,7 +534,9 @@ def elemwise(self, func, *args, **kwargs):
             raise ValueError("Performing this operation would produce "
                     "a dense result: %s" % str(func))
         return COO(self.coords, func(self.data, *args, **kwargs),
-                   shape=self.shape, has_duplicates=self.has_duplicates)
+                   shape=self.shape,
+                   has_duplicates=self.has_duplicates,
+                   sorted=self.sorted)
 
     def elemwise_binary(self, func, other, *args, **kwargs):
         assert isinstance(other, COO)
@@ -675,6 +749,7 @@ def tensordot(a, b, axes=2):
             res = res.todense()
         else:
             res = COO.from_scipy_sparse(res)  # <--- modified
+            res.has_duplicates = False
     if isinstance(res, np.matrix):
         res = np.asarray(res)
     return res.reshape(olda + oldb)
@@ -743,7 +818,8 @@ def concatenate(arrays, axis=0):
     shape[axis] = dim
     has_duplicates = any(x.has_duplicates for x in arrays)
 
-    return COO(coords, data, shape=shape, has_duplicates=has_duplicates)
+    return COO(coords, data, shape=shape, has_duplicates=has_duplicates,
+               sorted=(axis == 0) and all(a.sorted for a in arrays))
 
 
 def stack(arrays, axis=0):
@@ -769,4 +845,5 @@ def stack(arrays, axis=0):
     coords.insert(axis, new)
     coords = np.stack(coords, axis=0)
 
-    return COO(coords, data, shape=shape, has_duplicates=has_duplicates)
+    return COO(coords, data, shape=shape, has_duplicates=has_duplicates,
+               sorted=(axis == 0) and all(a.sorted for a in arrays))
diff --git a/sparse/tests/test_core.py b/sparse/tests/test_core.py
@@ -71,7 +71,7 @@ def test_large_reshape():
     col = row % m # np.random.randint(0, m, size=n, dtype=np.uint16)
     data = np.ones(n, dtype=np.uint8)
 
-    x = COO((data, (row, col)))
+    x = COO((data, (row, col)), sorted=True, has_duplicates=False)
 
     assert_eq(x, x.reshape(x.shape))
 
@@ -366,8 +366,8 @@ def test_scalar_exponentiation():
 
 def test_create_with_lists_of_tuples():
     L = [((0, 0, 0), 1),
-         ((1, 1, 1), 2),
          ((1, 2, 1), 1),
+         ((1, 1, 1), 2),
          ((1, 3, 2), 3)]
 
     s = COO(L)
diff --git a/sparse/utils.py b/sparse/utils.py
@@ -1,9 +1,18 @@
 import numpy as np
+from .core import COO
 
 
 def assert_eq(x, y):
     assert x.shape == y.shape
     assert x.dtype == y.dtype
+
+    if isinstance(x, COO):
+        if x.sorted:
+            assert is_lexsorted(x)
+    if isinstance(y, COO):
+        if y.sorted:
+            assert is_lexsorted(y)
+
     if hasattr(x, 'todense'):
         xx = x.todense()
     else:
@@ -13,3 +22,7 @@ def assert_eq(x, y):
     else:
         yy = y
     assert np.allclose(xx, yy)
+
+
+def is_lexsorted(x):
+    return not x.shape or (np.diff(x.linear_loc()) > 0).all()