Skip to content

Commit d29d3a5

Browse files
perf: do not store nor serialize the unfiltered docstore
Signed-off-by: thiswillbeyourgithub <[email protected]>
1 parent cf9171d commit d29d3a5

File tree

2 files changed

+18
-15
lines changed

2 files changed

+18
-15
lines changed

wdoc/utils/filters.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
def filter_docstore(
2020
loaded_embeddings: VectorStore,
2121
cli_kwargs: dict,
22-
) -> Tuple[VectorStore, bytes]:
22+
) -> VectorStore:
2323
if "filter_metadata" in cli_kwargs:
2424
filter_meta = create_metadata_filter(
2525
loaded_embeddings=loaded_embeddings,
@@ -59,14 +59,15 @@ def filter_cont(cont: str) -> bool:
5959
logger.warning("Your filter matched all stored documents!")
6060
assert good, "No documents in the vectorstore match the given filter"
6161

62-
# directly remove the filtered documents from the docstore
63-
# but first store the docstore before altering it to allow
64-
# unfiltering in the prompt
65-
start_time = time.time()
66-
unfiltered_docstore_bytes = loaded_embeddings.serialize_to_bytes()
67-
serialize_time = time.time() - start_time
68-
logger.debug(f"Serializing unfiltered docstore took {serialize_time:.3f} seconds")
62+
# commented because it's taking quite long
63+
# # first store the docstore before altering it to allow
64+
# # unfiltering in the prompt
65+
# start_time = time.time()
66+
# unfiltered_docstore_bytes = loaded_embeddings.serialize_to_bytes()
67+
# serialize_time = time.time() - start_time
68+
# logger.debug(f"Serializing unfiltered docstore took {serialize_time:.3f} seconds")
6969

70+
# directly remove the filtered documents from the docstore
7071
start_time = time.time()
7172
status = loaded_embeddings.delete(ids_to_del)
7273
delete_time = time.time() - start_time
@@ -87,7 +88,7 @@ def filter_cont(cont: str) -> bool:
8788
loaded_embeddings.index_to_docstore_id
8889
), "Something went wrong when deleting filtered out documents"
8990

90-
return loaded_embeddings, unfiltered_docstore_bytes
91+
return loaded_embeddings
9192

9293

9394
def create_metadata_filter(

wdoc/wdoc.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -606,6 +606,9 @@ def print_exception(exc_type, exc_value, exc_traceback):
606606
else:
607607
self.loaded_docs = None # will be loaded when embeddings are loaded
608608

609+
# flag to know if we already filtered or not
610+
self._is_vectorstore_filtered = False
611+
609612
if self.__import_mode__:
610613
logger.debug(
611614
"Ready to query or summarize, call your_instance.query_task(your_question)"
@@ -791,19 +794,18 @@ def _query_or_search_task(self, query: str) -> dict:
791794
}
792795

793796
# parse filters as callable for faiss filtering
794-
if not hasattr(self, "unfiltered_docstore_bytes"):
797+
if not self._is_vectorstore_filtered:
795798
if (
796799
"filter_metadata" in self.cli_kwargs
797800
or "filter_content" in self.cli_kwargs
798801
):
799802
from wdoc.utils.filters import filter_docstore
800803

801-
self.loaded_embeddings, self.unfiltered_docstore_bytes = (
802-
filter_docstore(
803-
loaded_embeddings=self.loaded_embeddings,
804-
cli_kwargs=self.cli_kwargs,
805-
)
804+
self.loaded_embeddings = filter_docstore(
805+
loaded_embeddings=self.loaded_embeddings,
806+
cli_kwargs=self.cli_kwargs,
806807
)
808+
self._is_vectorstore_filtered = True
807809

808810
assert query.strip(), "Cannot accept empty query"
809811
assert all(

0 commit comments

Comments
 (0)