Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions dataset_reader/ann_h5_reader.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
+import itertools
 from typing import Iterator

 import h5py
Expand All @@ -14,17 +15,18 @@ def __init__(self, path, normalize=False):

     def read_queries(self) -> Iterator[Query]:
         data = h5py.File(self.path)
+        distances = data["distances"] if "distances" in data else itertools.repeat(None)

         for vector, expected_result, expected_scores in zip(
-            data["test"], data["neighbors"], data["distances"]
+            data["test"], data["neighbors"], distances
         ):
             if self.normalize:
                 vector /= np.linalg.norm(vector)
             yield Query(
                 vector=vector.tolist(),
                 meta_conditions=None,
                 expected_result=expected_result.tolist(),
-                expected_scores=expected_scores.tolist(),
+                expected_scores=expected_scores.tolist() if expected_scores is not None else None,
             )

     def read_data(self, *args, **kwargs) -> Iterator[Record]:
Expand Down
16 changes: 16 additions & 0 deletions datasets/datasets.json
Original file line number Diff line number Diff line change
Expand Up @@ -1296,5 +1296,21 @@
"path": "random-100-match-kw-small-vocab/random_keywords_1m_vocab_10_no_filters",
"vector_count": 100,
"description": "Synthetic data"
+  },
+  {
+    "name": "cohere-768-1M",
+    "vector_size": 768,
+    "distance": "dot",
+    "type": "h5",
+    "path": "cohere-768-1M/cohere-768-1M.hdf5",
+    "link": "https://dbyiw3u3rf9yr.cloudfront.net/corpora/vectorsearch/cohere-wikipedia-22-12-en-embeddings/documents-1m.hdf5.bz2"
+  },
+  {
+    "name": "cohere-768-10M",
+    "vector_size": 768,
+    "distance": "dot",
+    "type": "h5",
+    "path": "cohere-768-10M/cohere-768-10M.hdf5",
+    "link": "https://dbyiw3u3rf9yr.cloudfront.net/corpora/vectorsearch/cohere-wikipedia-22-12-en-embeddings/documents-10m.hdf5.bz2"
   }
 ]
Loading
Loading