-
-
Notifications
You must be signed in to change notification settings - Fork 57
Dedup Backend Initial Implementation #2868
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: feature-dedup
Are you sure you want to change the base?
Changes from all commits
f2c3ba1
4bd5525
183c47b
85c8691
933020d
f5e609a
84c3b53
5b68f54
3de00d4
96fa722
ec7dfc8
61f5d2f
1118d06
651878d
e609e79
1105ee3
25f01cb
016944e
25f09a6
5507a25
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -316,6 +316,16 @@ async def add_crawl_config( | |
|
||
first_seed = seeds[0].url | ||
|
||
# the dedup collection id must also be in auto add collections | ||
if config_in.dedupCollId: | ||
if ( | ||
not config_in.autoAddCollections | ||
or config_in.dedupCollId not in config_in.autoAddCollections | ||
): | ||
raise HTTPException( | ||
status_code=400, detail="dedup_coll_id_not_in_autoadd" | ||
) | ||
|
||
now = dt_now() | ||
crawlconfig = CrawlConfig( | ||
id=uuid4(), | ||
|
@@ -343,6 +353,7 @@ async def add_crawl_config( | |
firstSeed=first_seed, | ||
seedCount=seed_count, | ||
shareable=config_in.shareable, | ||
dedupCollId=config_in.dedupCollId, | ||
) | ||
|
||
if config_in.runNow: | ||
|
@@ -359,6 +370,9 @@ async def add_crawl_config( | |
storage_quota_reached = False | ||
exec_mins_quota_reached = False | ||
|
||
if config_in.dedupCollId: | ||
await self.coll_ops.enable_dedup_index(config_in.dedupCollId) | ||
|
||
if config_in.runNow: | ||
try: | ||
crawl_id = await self.run_now_internal(crawlconfig, org, user) | ||
|
@@ -608,6 +622,20 @@ async def update_crawl_config( | |
!= sorted(update.autoAddCollections) | ||
) | ||
|
||
metadata_changed = metadata_changed or ( | ||
update.dedupCollId is not None | ||
and update.dedupCollId != orig_crawl_config.dedupCollId | ||
) | ||
|
||
if update.dedupCollId: | ||
if ( | ||
not update.autoAddCollections | ||
or update.dedupCollId not in update.autoAddCollections | ||
): | ||
raise HTTPException( | ||
status_code=400, detail="dedup_coll_id_not_in_autoadd" | ||
) | ||
Comment on lines +630 to +637
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This logic needs to account for the possibility that the collection referenced by |
||
|
||
run_now = update.runNow | ||
|
||
if not changed and not metadata_changed and not run_now: | ||
|
@@ -651,6 +679,9 @@ async def update_crawl_config( | |
query["seedCount"] = len(update.config.seeds) | ||
query["seedFileId"] = None | ||
|
||
if update.dedupCollId: | ||
await self.coll_ops.enable_dedup_index(update.dedupCollId) | ||
|
||
# update in db | ||
result = await self.crawl_configs.find_one_and_update( | ||
{"_id": cid, "inactive": {"$ne": True}}, | ||
|
@@ -1116,6 +1147,10 @@ async def remove_collection_from_all_configs( | |
{"$pull": {"autoAddCollections": coll_id}}, | ||
) | ||
|
||
await self.crawl_configs.update_many( | ||
{"oid": org.id, "dedupCollId": coll_id}, {"$set": {"dedupCollId": None}} | ||
) | ||
|
||
async def get_crawl_config_tags(self, org):
    """Return the distinct "tags" values found on crawl configs of this org."""
    # Restrict the distinct query to documents whose "oid" matches the org id.
    org_filter = {"oid": org.id}
    return await self.crawl_configs.distinct("tags", org_filter)
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just a bit more idiomatic