Skip to content

Commit 2939170

Browse files
author
Joan Fontanals
authored
feat: blobs property (#3198)
1 parent befb7cd commit 2939170

File tree

4 files changed

+117
-38
lines changed

4 files changed

+117
-38
lines changed

jina/types/arrays/document.py

Lines changed: 78 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,19 @@ class DocumentArrayGetAttrMixin:
5757
def __iter__(self):
5858
...
5959

60+
@abstractmethod
61+
def __len__(self):
62+
"""Any implementation needs to implement the `__len__` method"""
63+
...
64+
65+
@abstractmethod
66+
def __getitem__(self, item: int):
67+
"""Any implementation needs to implement access via integer item
68+
69+
:param item: the item index to access
70+
"""
71+
...
72+
6073
def get_attributes(self, *fields: str) -> Union[List, List[List]]:
6174
"""Return all nonempty values of the fields from all docs this array contains
6275
@@ -108,6 +121,38 @@ def embeddings(self, emb: np.ndarray):
108121
"""
109122
...
110123

124+
@property
125+
def blobs(self) -> np.ndarray:
126+
"""Return a `np.ndarray` stacking all the `blob` attributes as rows.
127+
128+
.. warning:: This operation assumes that all Documents have `blob` as content and that all blobs have the same shape and dtype.
129+
All dtype and shape values are assumed to be equal to the values of the
130+
first element in the DocumentArray / DocumentArrayMemmap
131+
132+
.. warning:: This operation currently does not support sparse arrays.
133+
134+
:return: blobs stacked per row as `np.ndarray`.
135+
"""
136+
x_mat = b''.join(d.proto.blob.dense.buffer for d in self)
137+
138+
return np.frombuffer(x_mat, dtype=self[0].proto.blob.dense.dtype).reshape(
139+
(len(self), *self[0].proto.blob.dense.shape)
140+
)
141+
142+
@blobs.setter
143+
def blobs(self, b: np.ndarray):
144+
"""Set the blobs of the Documents
145+
146+
:param b: The blobs matrix to set
147+
"""
148+
149+
assert len(b) == len(
150+
self
151+
), f'the number of rows in the input ({len(b)}), should match the number of Documents ({len(self)})'
152+
153+
for d, x in zip(self, b):
154+
d.blob = x
155+
111156

112157
class DocumentArray(
113158
TraversableSequence,
@@ -246,39 +291,6 @@ def __iadd__(self, other: Iterable['Document']):
246291
self.append(doc)
247292
return self
248293

249-
@property
250-
def embeddings(self) -> np.ndarray:
251-
"""Return a `np.ndarray` stacking all the `embedding` attributes as rows.
252-
253-
.. warning:: This operation assumes all embeddings have the same shape and dtype.
254-
All dtype and shape values are assumed to be equal to the values of the
255-
first element in the DocumentArray / DocumentArrayMemmap
256-
257-
.. warning:: This operation currently does not support sparse arrays.
258-
259-
:return: embeddings stacked per row as `np.ndarray`.
260-
"""
261-
x_mat = b''.join(d.proto.embedding.dense.buffer for d in self)
262-
263-
return np.frombuffer(x_mat, dtype=self[0].proto.embedding.dense.dtype).reshape(
264-
(len(self), self[0].proto.embedding.dense.shape[0])
265-
)
266-
267-
@embeddings.setter
268-
def embeddings(self, emb: np.ndarray):
269-
"""Set the embeddings of the Documents
270-
271-
:param emb: The embedding matrix to set
272-
"""
273-
274-
assert len(emb) == len(self), (
275-
'the number of rows in the input ({len(emb)}),'
276-
'should match the number of Documents ({len(self)})'
277-
)
278-
279-
for d, x in zip(self, emb):
280-
d.embedding = x
281-
282294
def append(self, doc: 'Document'):
283295
"""
284296
Append :param:`doc` in :class:`DocumentArray`.
@@ -491,3 +503,36 @@ def load_binary(cls, file: Union[str, BinaryIO]) -> 'DocumentArray':
491503
@staticmethod
492504
def _flatten(sequence):
493505
return DocumentArray(list(itertools.chain.from_iterable(sequence)))
506+
507+
# Properties for fast access to commonly used attributes
508+
@property
509+
def embeddings(self) -> np.ndarray:
510+
"""Return a `np.ndarray` stacking all the `embedding` attributes as rows.
511+
512+
.. warning:: This operation assumes all embeddings have the same shape and dtype.
513+
All dtype and shape values are assumed to be equal to the values of the
514+
first element in the DocumentArray / DocumentArrayMemmap
515+
516+
.. warning:: This operation currently does not support sparse arrays.
517+
518+
:return: embeddings stacked per row as `np.ndarray`.
519+
"""
520+
x_mat = b''.join(d.proto.embedding.dense.buffer for d in self)
521+
522+
return np.frombuffer(x_mat, dtype=self[0].proto.embedding.dense.dtype).reshape(
523+
(len(self), self[0].proto.embedding.dense.shape[0])
524+
)
525+
526+
@embeddings.setter
527+
def embeddings(self, emb: np.ndarray):
528+
"""Set the embeddings of the Documents
529+
530+
:param emb: The embedding matrix to set
531+
"""
532+
533+
assert len(emb) == len(
534+
self
535+
), f'the number of rows in the input ({len(emb)}), should match the number of Documents ({len(self)})'
536+
537+
for d, x in zip(self, emb):
538+
d.embedding = x

jina/types/arrays/memmap.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131

3232
class DocumentArrayMemmap(
3333
TraversableSequence,
34+
DocumentArrayGetAttrMixin,
3435
DocumentArrayNeuralOpsMixin,
3536
DocumentArraySearchOpsMixin,
3637
Itr,

tests/unit/types/arrays/test_documentarray.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -472,7 +472,7 @@ def test_da_get_embeddings():
472472
np.testing.assert_almost_equal(da.get_attributes('embedding'), da.embeddings)
473473

474474

475-
def test_da_get_embeddings():
475+
def test_da_get_embeddings_slice():
476476
da = DocumentArray(random_docs(100))
477477
np.testing.assert_almost_equal(
478478
da.get_attributes('embedding')[10:20], da._get_embeddings(slice(10, 20))
@@ -492,7 +492,25 @@ def test_embeddings_setter_da():
492492
def test_embeddings_getter_da():
493493
emb = np.random.random((100, 128))
494494
da = DocumentArray([Document(embedding=x) for x in emb])
495+
assert len(da) == 100
495496
np.testing.assert_almost_equal(da.embeddings, emb)
496497

497498
for x, doc in zip(emb, da):
498499
np.testing.assert_almost_equal(x, doc.embedding)
500+
501+
502+
def test_blobs_getter_da():
503+
blobs = np.random.random((100, 10, 10))
504+
da = DocumentArray([Document(blob=blob) for blob in blobs])
505+
assert len(da) == 100
506+
np.testing.assert_almost_equal(da.get_attributes('blob'), da.blobs)
507+
508+
509+
def test_blobs_setter_da():
510+
blobs = np.random.random((100, 10, 10))
511+
da = DocumentArray([Document() for _ in range(100)])
512+
da.blobs = blobs
513+
np.testing.assert_almost_equal(da.blobs, blobs)
514+
515+
for x, doc in zip(blobs, da):
516+
np.testing.assert_almost_equal(x, doc.blob)

tests/unit/types/arrays/test_memmap.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -475,7 +475,7 @@ def test_dam_embeddings(tmpdir):
475475
np.testing.assert_almost_equal(dam.get_attributes('embedding'), dam.embeddings)
476476

477477

478-
def test_dam_get_embeddings(tmpdir):
478+
def test_dam_get_embeddings_slice(tmpdir):
479479
da = DocumentArrayMemmap(tmpdir)
480480
da.extend(Document(embedding=np.array([1, 2, 3, 4])) for _ in range(100))
481481
np.testing.assert_almost_equal(
@@ -498,7 +498,22 @@ def test_embeddings_getter_dam(tmpdir):
498498
emb = np.random.random((100, 128))
499499
dam = DocumentArrayMemmap(tmpdir)
500500
dam.extend([Document(embedding=x) for x in emb])
501-
501+
assert len(dam) == 100
502502
np.testing.assert_almost_equal(dam.embeddings, emb)
503-
for x, doc in zip(emb, dam):
504-
np.testing.assert_almost_equal(x, doc.embedding)
503+
504+
505+
def test_blobs_getter_dam(tmpdir):
506+
blobs = np.random.random((100, 10, 10))
507+
dam = DocumentArrayMemmap(tmpdir)
508+
dam.extend([Document(blob=blob) for blob in blobs])
509+
assert len(dam) == 100
510+
np.testing.assert_almost_equal(dam.get_attributes('blob'), dam.blobs)
511+
512+
513+
def test_blobs_setter_dma():
514+
blobs = np.random.random((100, 10, 10))
515+
da = DocumentArray([Document() for _ in range(100)])
516+
da.blobs = blobs
517+
np.testing.assert_almost_equal(da.blobs, blobs)
518+
for x, doc in zip(blobs, da):
519+
np.testing.assert_almost_equal(x, doc.blob)

0 commit comments

Comments
 (0)