feat: blobs property (#3198)

Joan Fontanals · web-flow · commit 2939170b71fb · 2021-08-18T09:38:50.000+02:00
diff --git a/jina/types/arrays/document.py b/jina/types/arrays/document.py
@@ -57,6 +57,19 @@ class DocumentArrayGetAttrMixin:
     def __iter__(self):
         ...
 
+    @abstractmethod
+    def __len__(self):
+        """Any implementation needs to implement the `__len__` method"""
+        ...
+
+    @abstractmethod
+    def __getitem__(self, item: int):
+        """Any implementation needs to implement access via integer item
+
+        :param item: the item index to access
+        """
+        ...
+
     def get_attributes(self, *fields: str) -> Union[List, List[List]]:
         """Return all nonempty values of the fields from all docs this array contains
 
@@ -108,6 +121,38 @@ def embeddings(self, emb: np.ndarray):
         """
         ...
 
+    @property
+    def blobs(self) -> np.ndarray:
+        """Return a `np.ndarray` stacking all the `blob` attributes along a new first axis.
+
+        .. warning:: This operation assumes all Documents have `blob` as content and that all blobs have the same shape and dtype.
+                 All dtype and shape values are assumed to be equal to the values of the
+                 first element in the DocumentArray / DocumentArrayMemmap
+
+        .. warning:: This operation currently does not support sparse arrays.
+
+        :return: blobs stacked per row as `np.ndarray`.
+        """
+        x_mat = b''.join(d.proto.blob.dense.buffer for d in self)
+
+        return np.frombuffer(x_mat, dtype=self[0].proto.blob.dense.dtype).reshape(
+            (len(self), *self[0].proto.blob.dense.shape)
+        )
+
+    @blobs.setter
+    def blobs(self, b: np.ndarray):
+        """Set the blobs of the Documents
+
+        :param b: The blobs matrix to set
+        """
+
+        assert len(b) == len(
+            self
+        ), f'the number of rows in the input ({len(b)}), should match the number of Documents ({len(self)})'
+
+        for d, x in zip(self, b):
+            d.blob = x
+
 
 class DocumentArray(
     TraversableSequence,
@@ -246,39 +291,6 @@ def __iadd__(self, other: Iterable['Document']):
             self.append(doc)
         return self
 
-    @property
-    def embeddings(self) -> np.ndarray:
-        """Return a `np.ndarray` stacking all the `embedding` attributes as rows.
-
-        .. warning:: This operation assumes all embeddings have the same shape and dtype.
-                 All dtype and shape values are assumed to be equal to the values of the
-                 first element in the DocumentArray / DocumentArrayMemmap
-
-        .. warning:: This operation currently does not support sparse arrays.
-
-        :return: embeddings stacked per row as `np.ndarray`.
-        """
-        x_mat = b''.join(d.proto.embedding.dense.buffer for d in self)
-
-        return np.frombuffer(x_mat, dtype=self[0].proto.embedding.dense.dtype).reshape(
-            (len(self), self[0].proto.embedding.dense.shape[0])
-        )
-
-    @embeddings.setter
-    def embeddings(self, emb: np.ndarray):
-        """Set the embeddings of the Documents
-
-        :param emb: The embedding matrix to set
-        """
-
-        assert len(emb) == len(self), (
-            'the number of rows in the input ({len(emb)}),'
-            'should match the number of Documents ({len(self)})'
-        )
-
-        for d, x in zip(self, emb):
-            d.embedding = x
-
     def append(self, doc: 'Document'):
         """
         Append :param:`doc` in :class:`DocumentArray`.
@@ -491,3 +503,36 @@ def load_binary(cls, file: Union[str, BinaryIO]) -> 'DocumentArray':
     @staticmethod
     def _flatten(sequence):
         return DocumentArray(list(itertools.chain.from_iterable(sequence)))
+
+    # Properties for fast access of commonly used attributes
+    @property
+    def embeddings(self) -> np.ndarray:
+        """Return a `np.ndarray` stacking all the `embedding` attributes as rows.
+
+        .. warning:: This operation assumes all embeddings have the same shape and dtype.
+                 All dtype and shape values are assumed to be equal to the values of the
+                 first element in the DocumentArray / DocumentArrayMemmap
+
+        .. warning:: This operation currently does not support sparse arrays.
+
+        :return: embeddings stacked per row as `np.ndarray`.
+        """
+        x_mat = b''.join(d.proto.embedding.dense.buffer for d in self)
+
+        return np.frombuffer(x_mat, dtype=self[0].proto.embedding.dense.dtype).reshape(
+            (len(self), self[0].proto.embedding.dense.shape[0])
+        )
+
+    @embeddings.setter
+    def embeddings(self, emb: np.ndarray):
+        """Set the embeddings of the Documents
+
+        :param emb: The embedding matrix to set
+        """
+
+        assert len(emb) == len(
+            self
+        ), f'the number of rows in the input ({len(emb)}), should match the number of Documents ({len(self)})'
+
+        for d, x in zip(self, emb):
+            d.embedding = x
diff --git a/jina/types/arrays/memmap.py b/jina/types/arrays/memmap.py
@@ -31,6 +31,7 @@
 
 class DocumentArrayMemmap(
     TraversableSequence,
+    DocumentArrayGetAttrMixin,
     DocumentArrayNeuralOpsMixin,
     DocumentArraySearchOpsMixin,
     Itr,
diff --git a/tests/unit/types/arrays/test_documentarray.py b/tests/unit/types/arrays/test_documentarray.py
@@ -472,7 +472,7 @@ def test_da_get_embeddings():
     np.testing.assert_almost_equal(da.get_attributes('embedding'), da.embeddings)
 
 
-def test_da_get_embeddings():
+def test_da_get_embeddings_slice():
     da = DocumentArray(random_docs(100))
     np.testing.assert_almost_equal(
         da.get_attributes('embedding')[10:20], da._get_embeddings(slice(10, 20))
@@ -492,7 +492,25 @@ def test_embeddings_setter_da():
 def test_embeddings_getter_da():
     emb = np.random.random((100, 128))
     da = DocumentArray([Document(embedding=x) for x in emb])
+    assert len(da) == 100
     np.testing.assert_almost_equal(da.embeddings, emb)
 
     for x, doc in zip(emb, da):
         np.testing.assert_almost_equal(x, doc.embedding)
+
+
+def test_blobs_getter_da():
+    blobs = np.random.random((100, 10, 10))
+    da = DocumentArray([Document(blob=blob) for blob in blobs])
+    assert len(da) == 100
+    np.testing.assert_almost_equal(da.get_attributes('blob'), da.blobs)
+
+
+def test_blobs_setter_da():
+    blobs = np.random.random((100, 10, 10))
+    da = DocumentArray([Document() for _ in range(100)])
+    da.blobs = blobs
+    np.testing.assert_almost_equal(da.blobs, blobs)
+
+    for x, doc in zip(blobs, da):
+        np.testing.assert_almost_equal(x, doc.blob)
diff --git a/tests/unit/types/arrays/test_memmap.py b/tests/unit/types/arrays/test_memmap.py
@@ -475,7 +475,7 @@ def test_dam_embeddings(tmpdir):
     np.testing.assert_almost_equal(dam.get_attributes('embedding'), dam.embeddings)
 
 
-def test_dam_get_embeddings(tmpdir):
+def test_dam_get_embeddings_slice(tmpdir):
     da = DocumentArrayMemmap(tmpdir)
     da.extend(Document(embedding=np.array([1, 2, 3, 4])) for _ in range(100))
     np.testing.assert_almost_equal(
@@ -498,7 +498,22 @@ def test_embeddings_getter_dam(tmpdir):
     emb = np.random.random((100, 128))
     dam = DocumentArrayMemmap(tmpdir)
     dam.extend([Document(embedding=x) for x in emb])
-
+    assert len(dam) == 100
     np.testing.assert_almost_equal(dam.embeddings, emb)
-    for x, doc in zip(emb, dam):
-        np.testing.assert_almost_equal(x, doc.embedding)
+
+
+def test_blobs_getter_dam(tmpdir):
+    blobs = np.random.random((100, 10, 10))
+    dam = DocumentArrayMemmap(tmpdir)
+    dam.extend([Document(blob=blob) for blob in blobs])
+    assert len(dam) == 100
+    np.testing.assert_almost_equal(dam.get_attributes('blob'), dam.blobs)
+
+
+def test_blobs_setter_dma():
+    blobs = np.random.random((100, 10, 10))
+    da = DocumentArray([Document() for _ in range(100)])
+    da.blobs = blobs
+    np.testing.assert_almost_equal(da.blobs, blobs)
+    for x, doc in zip(blobs, da):
+        np.testing.assert_almost_equal(x, doc.blob)