110 changes: 98 additions & 12 deletions backend/btrixcloud/colls.py
@@ -49,6 +49,7 @@
UserFilePreparer,
MIN_UPLOAD_PART_SIZE,
PublicCollOut,
ResourcesOnly,
)
from .utils import (
dt_now,
@@ -57,6 +58,8 @@
get_origin,
)

from .crawlmanager import CrawlManager

if TYPE_CHECKING:
from .orgs import OrgOps
from .storages import StorageOps
@@ -81,8 +84,16 @@ class CollectionOps:
event_webhook_ops: EventWebhookOps
crawl_ops: CrawlOps
page_ops: PageOps
crawl_manager: CrawlManager

def __init__(self, mdb, storage_ops, orgs, event_webhook_ops):
def __init__(
self,
mdb,
orgs: OrgOps,
storage_ops: StorageOps,
crawl_manager: CrawlManager,
event_webhook_ops: EventWebhookOps,
):
self.collections = mdb["collections"]
self.crawls = mdb["crawls"]
self.crawl_configs = mdb["crawl_configs"]
@@ -91,6 +102,7 @@ def __init__(self, mdb, storage_ops, orgs, event_webhook_ops):

self.orgs = orgs
self.storage_ops = storage_ops
self.crawl_manager = crawl_manager
self.event_webhook_ops = event_webhook_ops

def set_crawl_ops(self, ops):
@@ -141,11 +153,15 @@ async def add_collection(self, oid: UUID, coll_in: CollIn):
access=coll_in.access,
defaultThumbnailName=coll_in.defaultThumbnailName,
allowPublicDownload=coll_in.allowPublicDownload,
hasDedupIndex=coll_in.hasDedupIndex,
)
try:
await self.collections.insert_one(coll.to_dict())
org = await self.orgs.get_org_by_id(oid)
await self.clear_org_previous_slugs_matching_slug(slug, org)
# create collection index
if coll.hasDedupIndex:
await self.crawl_manager.create_coll_index(coll)

if crawl_ids:
await self.crawl_ops.add_to_collection(crawl_ids, coll_id, org)
@@ -194,22 +210,33 @@ async def update_collection(
db_update["$push"] = {"previousSlugs": previous_slug}

try:
result = await self.collections.find_one_and_update(
prev_result = await self.collections.find_one_and_update(
{"_id": coll_id, "oid": org.id},
db_update,
return_document=pymongo.ReturnDocument.AFTER,
return_document=pymongo.ReturnDocument.BEFORE,
)
except pymongo.errors.DuplicateKeyError as err:
# pylint: disable=raise-missing-from
field = get_duplicate_key_error_field(err)
raise HTTPException(status_code=400, detail=f"collection_{field}_taken")

if not result:
if not prev_result:
raise HTTPException(status_code=404, detail="collection_not_found")

if slug_update:
await self.clear_org_previous_slugs_matching_slug(slug_update, org)

# if dedup index is true, but was false
if update.hasDedupIndex and not prev_result.get("hasDedupIndex"):
# get latest coll, create index
coll = await self.get_collection(coll_id, org.id)
await self.crawl_manager.create_coll_index(coll)

# if dedup is false, but was true
if update.hasDedupIndex is False and prev_result.get("hasDedupIndex"):
# delete index -- may need extra restrictions
await self.crawl_manager.delete_coll_index(coll_id)

return {"updated": True}

async def clear_org_previous_slugs_matching_slug(
@@ -221,6 +248,16 @@ async def clear_org_previous_slugs_matching_slug(
{"$pull": {"previousSlugs": slug}},
)

async def get_coll_dedup_index(self, coll_id: UUID) -> bool:
"""return true/false if collection has dedup index, or raise"""
result = await self.collections.find_one(
{"_id": coll_id}, projection=["hasDedupIndex"]
)
if not result:
raise HTTPException(status_code=404, detail="collection_not_found")

return result.get("hasDedupIndex") is True

async def add_crawls_to_collection(
self,
coll_id: UUID,
@@ -229,8 +266,6 @@ async def add_crawls_to_collection(
headers: Optional[dict] = None,
) -> CollOut:
"""Add crawls to collection"""
await self.crawl_ops.add_to_collection(crawl_ids, coll_id, org)

modified = dt_now()
result = await self.collections.find_one_and_update(
{"_id": coll_id},
@@ -240,8 +275,11 @@
if not result:
raise HTTPException(status_code=404, detail="collection_not_found")

# do this after checking if collection exists
await self.crawl_ops.add_to_collection(crawl_ids, coll_id, org)

await self.update_collection_counts_and_tags(coll_id)
await self.update_collection_dates(coll_id, org.id)
await self.update_collection_dates(coll_id, org.id, update_index=True)

asyncio.create_task(
self.event_webhook_ops.create_added_to_collection_notification(
@@ -294,6 +332,24 @@ async def get_collection_raw(

return result

async def enable_dedup_index(self, coll_id: UUID):
"""enable dedup index if it doesn't exist yet"""
result = await self.collections.find_one_and_update(
{"_id": coll_id, "hasDedupIndex": {"$ne": True}},
{"$set": {"hasDedupIndex": True}},
return_document=pymongo.ReturnDocument.AFTER,
)

# not changed, nothing to do
if not result:
return False

coll = Collection.from_dict(result)

await self.crawl_manager.create_coll_index(coll)

return True

async def get_collection_raw_by_slug(
self,
coll_slug: str,
@@ -396,6 +452,16 @@ async def get_collection_out(

return CollOut.from_dict(result)

async def get_internal_replay_list(self, coll_id: UUID, oid: UUID) -> ResourcesOnly:
"""get list of internally resolved signed WACZ files"""
org = await self.orgs.get_org_by_id(oid)
resources, _, _ = await self.get_collection_crawl_resources(coll_id, org)

for file_ in resources:
file_.path = self.storage_ops.resolve_internal_access_path(file_.path)

return ResourcesOnly(resources=resources)

async def get_public_collection_out(
self,
coll_id: UUID,
@@ -639,6 +705,9 @@ async def delete_collection(self, coll_id: UUID, org: Organization):
if coll.thumbnail:
await self.delete_thumbnail(coll_id, org)

if coll.hasDedupIndex:
await self.crawl_manager.delete_coll_index(coll.id)

result = await self.collections.delete_one({"_id": coll_id, "oid": org.id})
if result.deleted_count < 1:
raise HTTPException(status_code=404, detail="collection_not_found")
@@ -740,7 +809,9 @@ async def update_collection_counts_and_tags(self, collection_id: UUID):
},
)

async def update_collection_dates(self, coll_id: UUID, oid: UUID):
async def update_collection_dates(
self, coll_id: UUID, oid: UUID, update_index=False
):
"""Update collection earliest and latest dates from page timestamps"""
# pylint: disable=too-many-locals
coll = await self.get_collection(coll_id, oid)
@@ -749,6 +820,10 @@ async def update_collection_dates(self, coll_id: UUID, oid: UUID):
earliest_ts = None
latest_ts = None

# if update_index is set, update the dedup index if the collection has one
if update_index and coll.hasDedupIndex:
await self.crawl_manager.update_coll_index(coll_id)

match_query = {
"oid": coll.oid,
"crawl_id": {"$in": crawl_ids},
@@ -783,13 +858,16 @@

async def update_crawl_collections(self, crawl_id: str, oid: UUID):
"""Update counts, dates, and modified for all collections in crawl"""
# accessing directly to handle both crawls and uploads
crawl = await self.crawls.find_one({"_id": crawl_id})
crawl_coll_ids = crawl.get("collectionIds")
crawl_coll_ids = crawl.get("collectionIds") or []
Review comment (Member):

Suggested change
crawl_coll_ids = crawl.get("collectionIds") or []
crawl_coll_ids = crawl.get("collectionIds", [])

Just a bit more idiomatic

modified = dt_now()

for coll_id in crawl_coll_ids:
await self.update_collection_counts_and_tags(coll_id)
await self.update_collection_dates(coll_id, oid)
await self.update_collection_dates(
coll_id, oid, crawl.get("dedupCollId") != coll_id
)
await self.collections.find_one_and_update(
{"_id": coll_id},
{"$set": {"modified": modified}},
@@ -1000,12 +1078,20 @@ async def calculate_thumbnail_storage(self, oid: UUID) -> int:
# ============================================================================
# pylint: disable=too-many-locals
def init_collections_api(
app, mdb, orgs, storage_ops, event_webhook_ops, user_dep
app,
mdb,
orgs: OrgOps,
storage_ops: StorageOps,
crawl_manager: CrawlManager,
event_webhook_ops: EventWebhookOps,
user_dep,
) -> CollectionOps:
"""init collections api"""
# pylint: disable=invalid-name, unused-argument, too-many-arguments

colls: CollectionOps = CollectionOps(mdb, storage_ops, orgs, event_webhook_ops)
colls: CollectionOps = CollectionOps(
mdb, orgs, storage_ops, crawl_manager, event_webhook_ops
)

org_crawl_dep = orgs.org_crawl_dep
org_viewer_dep = orgs.org_viewer_dep
35 changes: 35 additions & 0 deletions backend/btrixcloud/crawlconfigs.py
@@ -316,6 +316,16 @@ async def add_crawl_config(

first_seed = seeds[0].url

# the dedup collection id must also be in auto add collections
if config_in.dedupCollId:
if (
not config_in.autoAddCollections
or config_in.dedupCollId not in config_in.autoAddCollections
):
raise HTTPException(
status_code=400, detail="dedup_coll_id_not_in_autoadd"
)

now = dt_now()
crawlconfig = CrawlConfig(
id=uuid4(),
@@ -343,6 +353,7 @@ async def add_crawl_config(
firstSeed=first_seed,
seedCount=seed_count,
shareable=config_in.shareable,
dedupCollId=config_in.dedupCollId,
)

if config_in.runNow:
@@ -359,6 +370,9 @@
storage_quota_reached = False
exec_mins_quota_reached = False

if config_in.dedupCollId:
await self.coll_ops.enable_dedup_index(config_in.dedupCollId)

if config_in.runNow:
try:
crawl_id = await self.run_now_internal(crawlconfig, org, user)
@@ -608,6 +622,20 @@ async def update_crawl_config(
!= sorted(update.autoAddCollections)
)

metadata_changed = metadata_changed or (
update.dedupCollId is not None
and update.dedupCollId != orig_crawl_config.dedupCollId
)

if update.dedupCollId:
if (
not update.autoAddCollections
or update.dedupCollId not in update.autoAddCollections
):
raise HTTPException(
status_code=400, detail="dedup_coll_id_not_in_autoadd"
)
Comment on lines +630 to +637

Review comment (Member):

This logic needs to account for the possibility that the collection referenced by dedupCollId may already be an auto-add collection in the original crawl config, prior to the update, which would also be fine so long as it is not being unset in the update. The seed file validation logic just above is a good example to follow, and since this is tricky to get right we should have good tests around dedupCollId (setting it when first adding the crawl config, updating it when the auto-add collection is already set, and updating both together). A rough sketch of the combined check follows below.
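As a non-authoritative illustration of the reviewer's point, one way to combine the update with the original config before validating. The helper name, its standalone form, and the fall-back-to-original behavior are assumptions, not code from this PR:

from fastapi import HTTPException

# Hypothetical sketch: resolve the effective dedupCollId and auto-add list,
# falling back to the original crawl config when the update leaves them unset,
# then require that the dedup collection is among the auto-add collections.
def validate_dedup_coll_id(update, orig_crawl_config) -> None:
    dedup_coll_id = (
        update.dedupCollId
        if update.dedupCollId is not None
        else orig_crawl_config.dedupCollId
    )
    if not dedup_coll_id:
        # no dedup collection requested or previously set -- nothing to check
        return

    auto_add = (
        update.autoAddCollections
        if update.autoAddCollections is not None
        else orig_crawl_config.autoAddCollections
    ) or []

    if dedup_coll_id not in auto_add:
        raise HTTPException(status_code=400, detail="dedup_coll_id_not_in_autoadd")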


run_now = update.runNow

if not changed and not metadata_changed and not run_now:
@@ -651,6 +679,9 @@ async def update_crawl_config(
query["seedCount"] = len(update.config.seeds)
query["seedFileId"] = None

if update.dedupCollId:
await self.coll_ops.enable_dedup_index(update.dedupCollId)

# update in db
result = await self.crawl_configs.find_one_and_update(
{"_id": cid, "inactive": {"$ne": True}},
@@ -1116,6 +1147,10 @@ async def remove_collection_from_all_configs(
{"$pull": {"autoAddCollections": coll_id}},
)

await self.crawl_configs.update_many(
{"oid": org.id, "dedupCollId": coll_id}, {"$set": {"dedupCollId": None}}
)

async def get_crawl_config_tags(self, org):
"""get distinct tags from all crawl configs for this org"""
return await self.crawl_configs.distinct("tags", {"oid": org.id})
31 changes: 30 additions & 1 deletion backend/btrixcloud/crawlmanager.py
@@ -5,13 +5,14 @@

from typing import Optional, Dict, Tuple
from datetime import datetime, timedelta
from uuid import UUID

from fastapi import HTTPException

from .utils import dt_now, date_to_str, scale_from_browser_windows
from .k8sapi import K8sAPI

from .models import StorageRef, CrawlConfig, BgJobType
from .models import StorageRef, CrawlConfig, BgJobType, Collection


# ============================================================================
@@ -293,6 +294,9 @@ async def create_crawl_job(
storage_filename=storage_filename,
profile_filename=profile_filename,
proxy_id=crawlconfig.proxyId or DEFAULT_PROXY_ID,
dedup_coll_id=(
str(crawlconfig.dedupCollId) if crawlconfig.dedupCollId else ""
),
is_single_page=is_single_page,
seed_file_url=seed_file_url,
)
@@ -468,6 +472,31 @@ async def delete_crawl_config_by_id(self, cid: str) -> None:
"""Delete all crawl configs by id"""
await self._delete_crawl_configs(f"btrix.crawlconfig={cid}")

async def create_coll_index(self, collection: Collection):
"""create collection index"""
params = {
"id": str(collection.id),
"oid": str(collection.oid),
"collItemsUpdatedAt": date_to_str(collection.modified or dt_now()),
}
data = self.templates.env.get_template("coll_index.yaml").render(params)

await self.create_from_yaml(data)

return str(collection.id)

async def update_coll_index(self, coll_id: UUID):
"""force collection index to update"""
return await self.patch_custom_object(
f"collindex-{coll_id}",
{"collItemsUpdatedAt": date_to_str(dt_now())},
"collindexes",
)

async def delete_coll_index(self, coll_id: UUID):
"""delete collection index"""
return await self.delete_custom_object(f"collindex-{coll_id}", "collindexes")

# ========================================================================
# Internal Methods
async def _delete_crawl_configs(self, label) -> None:
Expand Down
1 change: 1 addition & 0 deletions backend/btrixcloud/crawls.py
@@ -403,6 +403,7 @@ async def add_new_crawl(
version=2,
firstSeed=crawlconfig.firstSeed,
seedCount=crawlconfig.seedCount,
dedupCollId=crawlconfig.dedupCollId,
)

try: