110 changes: 98 additions & 12 deletions backend/btrixcloud/colls.py
@@ -49,6 +49,7 @@
UserFilePreparer,
MIN_UPLOAD_PART_SIZE,
PublicCollOut,
ResourcesOnly,
)
from .utils import (
dt_now,
@@ -57,6 +58,8 @@
get_origin,
)

from .crawlmanager import CrawlManager

if TYPE_CHECKING:
from .orgs import OrgOps
from .storages import StorageOps
@@ -81,8 +84,16 @@ class CollectionOps:
event_webhook_ops: EventWebhookOps
crawl_ops: CrawlOps
page_ops: PageOps
crawl_manager: CrawlManager

def __init__(self, mdb, storage_ops, orgs, event_webhook_ops):
def __init__(
self,
mdb,
orgs: OrgOps,
storage_ops: StorageOps,
crawl_manager: CrawlManager,
event_webhook_ops: EventWebhookOps,
):
self.collections = mdb["collections"]
self.crawls = mdb["crawls"]
self.crawl_configs = mdb["crawl_configs"]
@@ -91,6 +102,7 @@ def __init__(self, mdb, storage_ops, orgs, event_webhook_ops):

self.orgs = orgs
self.storage_ops = storage_ops
self.crawl_manager = crawl_manager
self.event_webhook_ops = event_webhook_ops

def set_crawl_ops(self, ops):
@@ -141,11 +153,15 @@ async def add_collection(self, oid: UUID, coll_in: CollIn):
access=coll_in.access,
defaultThumbnailName=coll_in.defaultThumbnailName,
allowPublicDownload=coll_in.allowPublicDownload,
hasDedupIndex=coll_in.hasDedupIndex,
)
try:
await self.collections.insert_one(coll.to_dict())
org = await self.orgs.get_org_by_id(oid)
await self.clear_org_previous_slugs_matching_slug(slug, org)
# create collection index
if coll.hasDedupIndex:
await self.crawl_manager.create_coll_index(coll)

if crawl_ids:
await self.crawl_ops.add_to_collection(crawl_ids, coll_id, org)
@@ -194,22 +210,33 @@ async def update_collection(
db_update["$push"] = {"previousSlugs": previous_slug}

try:
result = await self.collections.find_one_and_update(
prev_result = await self.collections.find_one_and_update(
{"_id": coll_id, "oid": org.id},
db_update,
return_document=pymongo.ReturnDocument.AFTER,
return_document=pymongo.ReturnDocument.BEFORE,
)
except pymongo.errors.DuplicateKeyError as err:
# pylint: disable=raise-missing-from
field = get_duplicate_key_error_field(err)
raise HTTPException(status_code=400, detail=f"collection_{field}_taken")

if not result:
if not prev_result:
raise HTTPException(status_code=404, detail="collection_not_found")

if slug_update:
await self.clear_org_previous_slugs_matching_slug(slug_update, org)

# if dedup index is true, but was false
if update.hasDedupIndex and not prev_result.get("hasDedupIndex"):
# get latest coll, create index
coll = await self.get_collection(coll_id, org.id)
await self.crawl_manager.create_coll_index(coll)

# if dedup is false, but was true
if update.hasDedupIndex is False and prev_result.get("hasDedupIndex"):
# delete index -- may need extra restrictions
await self.crawl_manager.delete_coll_index(coll_id)

return {"updated": True}

async def clear_org_previous_slugs_matching_slug(
@@ -221,6 +248,16 @@ async def clear_org_previous_slugs_matching_slug(
{"$pull": {"previousSlugs": slug}},
)

async def get_coll_dedup_index(self, coll_id: UUID) -> bool:
"""return true/false if collection has dedup index, or raise"""
result = await self.collections.find_one(
{"_id": coll_id}, projection=["hasDedupIndex"]
)
if not result:
raise HTTPException(status_code=404, detail="collection_not_found")

return result.get("hasDedupIndex") is True

async def add_crawls_to_collection(
self,
coll_id: UUID,
@@ -229,8 +266,6 @@ async def add_crawls_to_collection(
headers: Optional[dict] = None,
) -> CollOut:
"""Add crawls to collection"""
await self.crawl_ops.add_to_collection(crawl_ids, coll_id, org)

modified = dt_now()
result = await self.collections.find_one_and_update(
{"_id": coll_id},
@@ -240,8 +275,11 @@
if not result:
raise HTTPException(status_code=404, detail="collection_not_found")

# do this after checking if collection exists
await self.crawl_ops.add_to_collection(crawl_ids, coll_id, org)

await self.update_collection_counts_and_tags(coll_id)
await self.update_collection_dates(coll_id, org.id)
await self.update_collection_dates(coll_id, org.id, update_index=True)

asyncio.create_task(
self.event_webhook_ops.create_added_to_collection_notification(
@@ -294,6 +332,24 @@ async def get_collection_raw(

return result

async def enable_dedup_index(self, coll_id: UUID):
"""enable dedup index if it doesn't exist yet"""
result = await self.collections.find_one_and_update(
{"_id": coll_id, "hasDedupIndex": {"$ne": True}},
{"$set": {"hasDedupIndex": True}},
return_document=pymongo.ReturnDocument.AFTER,
)

# not changed, nothing to do
if not result:
return False

coll = Collection.from_dict(result)

await self.crawl_manager.create_coll_index(coll)

return True

async def get_collection_raw_by_slug(
self,
coll_slug: str,
@@ -396,6 +452,16 @@ async def get_collection_out(

return CollOut.from_dict(result)

async def get_internal_replay_list(self, coll_id: UUID, oid: UUID) -> ResourcesOnly:
"""get list of internally resolved signed WACZ files"""
org = await self.orgs.get_org_by_id(oid)
resources, _, _ = await self.get_collection_crawl_resources(coll_id, org)

for file_ in resources:
file_.path = self.storage_ops.resolve_internal_access_path(file_.path)

return ResourcesOnly(resources=resources)

async def get_public_collection_out(
self,
coll_id: UUID,
@@ -639,6 +705,9 @@ async def delete_collection(self, coll_id: UUID, org: Organization):
if coll.thumbnail:
await self.delete_thumbnail(coll_id, org)

if coll.hasDedupIndex:
await self.crawl_manager.delete_coll_index(coll.id)

result = await self.collections.delete_one({"_id": coll_id, "oid": org.id})
if result.deleted_count < 1:
raise HTTPException(status_code=404, detail="collection_not_found")
@@ -740,7 +809,9 @@ async def update_collection_counts_and_tags(self, collection_id: UUID):
},
)

async def update_collection_dates(self, coll_id: UUID, oid: UUID):
async def update_collection_dates(
self, coll_id: UUID, oid: UUID, update_index=False
):
"""Update collection earliest and latest dates from page timestamps"""
# pylint: disable=too-many-locals
coll = await self.get_collection(coll_id, oid)
@@ -749,6 +820,10 @@ async def update_collection_dates(self, coll_id: UUID, oid: UUID):
earliest_ts = None
latest_ts = None

# if update_index is set, update the dedup index if the collection has one
if update_index and coll.hasDedupIndex:
await self.crawl_manager.update_coll_index(coll_id)

match_query = {
"oid": coll.oid,
"crawl_id": {"$in": crawl_ids},
@@ -783,13 +858,16 @@

async def update_crawl_collections(self, crawl_id: str, oid: UUID):
"""Update counts, dates, and modified for all collections in crawl"""
# accessing directly to handle both crawls and uploads
crawl = await self.crawls.find_one({"_id": crawl_id})
crawl_coll_ids = crawl.get("collectionIds")
crawl_coll_ids = crawl.get("collectionIds") or []
Review comment (Member):

Suggested change
crawl_coll_ids = crawl.get("collectionIds") or []
crawl_coll_ids = crawl.get("collectionIds", [])

Just a bit more idiomatic

modified = dt_now()

for coll_id in crawl_coll_ids:
await self.update_collection_counts_and_tags(coll_id)
await self.update_collection_dates(coll_id, oid)
await self.update_collection_dates(
coll_id, oid, crawl.get("dedupCollId") != coll_id
)
await self.collections.find_one_and_update(
{"_id": coll_id},
{"$set": {"modified": modified}},
@@ -1000,12 +1078,20 @@ async def calculate_thumbnail_storage(self, oid: UUID) -> int:
# ============================================================================
# pylint: disable=too-many-locals
def init_collections_api(
app, mdb, orgs, storage_ops, event_webhook_ops, user_dep
app,
mdb,
orgs: OrgOps,
storage_ops: StorageOps,
crawl_manager: CrawlManager,
event_webhook_ops: EventWebhookOps,
user_dep,
) -> CollectionOps:
"""init collections api"""
# pylint: disable=invalid-name, unused-argument, too-many-arguments

colls: CollectionOps = CollectionOps(mdb, storage_ops, orgs, event_webhook_ops)
colls: CollectionOps = CollectionOps(
mdb, orgs, storage_ops, crawl_manager, event_webhook_ops
)

org_crawl_dep = orgs.org_crawl_dep
org_viewer_dep = orgs.org_viewer_dep
35 changes: 35 additions & 0 deletions backend/btrixcloud/crawlconfigs.py
@@ -316,6 +316,16 @@ async def add_crawl_config(

first_seed = seeds[0].url

# the dedup collection id must also be in auto add collections
if config_in.dedupCollId:
if (
not config_in.autoAddCollections
or config_in.dedupCollId not in config_in.autoAddCollections
):
raise HTTPException(
status_code=400, detail="dedup_coll_id_not_in_autoadd"
)

now = dt_now()
crawlconfig = CrawlConfig(
id=uuid4(),
@@ -343,6 +353,7 @@ async def add_crawl_config(
firstSeed=first_seed,
seedCount=seed_count,
shareable=config_in.shareable,
dedupCollId=config_in.dedupCollId,
)

if config_in.runNow:
@@ -359,6 +370,9 @@
storage_quota_reached = False
exec_mins_quota_reached = False

if config_in.dedupCollId:
await self.coll_ops.enable_dedup_index(config_in.dedupCollId)

if config_in.runNow:
try:
crawl_id = await self.run_now_internal(crawlconfig, org, user)
@@ -608,6 +622,20 @@ async def update_crawl_config(
!= sorted(update.autoAddCollections)
)

metadata_changed = metadata_changed or (
update.dedupCollId is not None
and update.dedupCollId != orig_crawl_config.dedupCollId
)

if update.dedupCollId:
if (
not update.autoAddCollections
or update.dedupCollId not in update.autoAddCollections
):
raise HTTPException(
status_code=400, detail="dedup_coll_id_not_in_autoadd"
)
Comment on lines +630 to +637

Review comment (Member):

This logic needs to account for the possibility that the collection referenced by dedupCollId may already be an auto-add collection in the original crawl config, prior to the update, which would also be fine so long as it is not being unset in the update. The seed file validation logic just above is a good example to follow, and since this is tricky to get right we should have good tests around dedupCollId (setting it when first adding the crawl config, updating it when the auto-add collection is already set, and updating both together). A rough sketch of the combined check follows below.
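As a non-authoritative illustration of the reviewer's point, one way to combine the update with the original config before validating. The helper name, its standalone form, and the fall-back-to-original behavior are assumptions, not code from this PR:

from fastapi import HTTPException

# Hypothetical sketch: resolve the effective dedupCollId and auto-add list,
# falling back to the original crawl config when the update leaves them unset,
# then require that the dedup collection is among the auto-add collections.
def validate_dedup_coll_id(update, orig_crawl_config) -> None:
    dedup_coll_id = (
        update.dedupCollId
        if update.dedupCollId is not None
        else orig_crawl_config.dedupCollId
    )
    if not dedup_coll_id:
        # no dedup collection requested or previously set -- nothing to check
        return

    auto_add = (
        update.autoAddCollections
        if update.autoAddCollections is not None
        else orig_crawl_config.autoAddCollections
    ) or []

    if dedup_coll_id not in auto_add:
        raise HTTPException(status_code=400, detail="dedup_coll_id_not_in_autoadd")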


run_now = update.runNow

if not changed and not metadata_changed and not run_now:
@@ -651,6 +679,9 @@ async def update_crawl_config(
query["seedCount"] = len(update.config.seeds)
query["seedFileId"] = None

if update.dedupCollId:
await self.coll_ops.enable_dedup_index(update.dedupCollId)

# update in db
result = await self.crawl_configs.find_one_and_update(
{"_id": cid, "inactive": {"$ne": True}},
@@ -1116,6 +1147,10 @@ async def remove_collection_from_all_configs(
{"$pull": {"autoAddCollections": coll_id}},
)

await self.crawl_configs.update_many(
{"oid": org.id, "dedupCollId": coll_id}, {"$set": {"dedupCollId": None}}
)

async def get_crawl_config_tags(self, org):
"""get distinct tags from all crawl configs for this org"""
return await self.crawl_configs.distinct("tags", {"oid": org.id})
31 changes: 30 additions & 1 deletion backend/btrixcloud/crawlmanager.py
@@ -5,13 +5,14 @@

from typing import Optional, Dict, Tuple
from datetime import datetime, timedelta
from uuid import UUID

from fastapi import HTTPException

from .utils import dt_now, date_to_str, scale_from_browser_windows
from .k8sapi import K8sAPI

from .models import StorageRef, CrawlConfig, BgJobType
from .models import StorageRef, CrawlConfig, BgJobType, Collection


# ============================================================================
@@ -293,6 +294,9 @@ async def create_crawl_job(
storage_filename=storage_filename,
profile_filename=profile_filename,
proxy_id=crawlconfig.proxyId or DEFAULT_PROXY_ID,
dedup_coll_id=(
str(crawlconfig.dedupCollId) if crawlconfig.dedupCollId else ""
),
is_single_page=is_single_page,
seed_file_url=seed_file_url,
)
@@ -468,6 +472,31 @@ async def delete_crawl_config_by_id(self, cid: str) -> None:
"""Delete all crawl configs by id"""
await self._delete_crawl_configs(f"btrix.crawlconfig={cid}")

async def create_coll_index(self, collection: Collection):
"""create collection index"""
params = {
"id": str(collection.id),
"oid": str(collection.oid),
"collItemsUpdatedAt": date_to_str(collection.modified or dt_now()),
}
data = self.templates.env.get_template("coll_index.yaml").render(params)

await self.create_from_yaml(data)

return str(collection.id)

async def update_coll_index(self, coll_id: UUID):
"""force collection index to update"""
return await self.patch_custom_object(
f"collindex-{coll_id}",
{"collItemsUpdatedAt": date_to_str(dt_now())},
"collindexes",
)

async def delete_coll_index(self, coll_id: UUID):
"""delete collection index"""
return await self.delete_custom_object(f"collindex-{coll_id}", "collindexes")

# ========================================================================
# Internal Methods
async def _delete_crawl_configs(self, label) -> None:
Expand Down
1 change: 1 addition & 0 deletions backend/btrixcloud/crawls.py
@@ -403,6 +403,7 @@ async def add_new_crawl(
version=2,
firstSeed=crawlconfig.firstSeed,
seedCount=crawlconfig.seedCount,
dedupCollId=crawlconfig.dedupCollId,
)

try: