From 162c26b62c9242401cff8a78cc7c7fc71ecaae7d Mon Sep 17 00:00:00 2001
From: wpxgit
Date: Sat, 29 Oct 2016 12:38:53 -0500
Subject: [PATCH 01/14] Added Cassandra Backend

---
 .gitignore                                    |   3 +
 docs/README                                   |   3 +
 docs/source/topics/frontera-settings.rst      |  90 +++++
 docs/source/topics/frontier-backends.rst      |  27 ++
 .../contrib/backends/cassandra/__init__.py    | 199 +++++++++++
 .../contrib/backends/cassandra/components.py  | 314 ++++++++++++++++++
 frontera/contrib/backends/cassandra/models.py | 101 ++++++
 .../contrib/backends/cassandra/revisiting.py  | 139 ++++++++
 .../backends/cassandra/test_backend.py        |  23 ++
 frontera/settings/default_settings.py         |  17 +-
 requirements/tests.txt                        |   1 +
 11 files changed, 916 insertions(+), 1 deletion(-)
 create mode 100644 frontera/contrib/backends/cassandra/__init__.py
 create mode 100644 frontera/contrib/backends/cassandra/components.py
 create mode 100644 frontera/contrib/backends/cassandra/models.py
 create mode 100644 frontera/contrib/backends/cassandra/revisiting.py
 create mode 100644 frontera/contrib/backends/cassandra/test_backend.py

diff --git a/.gitignore b/.gitignore
index 287b0b569..e74280063 100644
--- a/.gitignore
+++ b/.gitignore
@@ -52,3 +52,6 @@ docs/_build/
 
 # PyBuilder
 target/
+
+# PyCharm Idea Folder
+.idea/
\ No newline at end of file
diff --git a/docs/README b/docs/README
index 3d9114563..fd04c32f5 100644
--- a/docs/README
+++ b/docs/README
@@ -30,6 +30,9 @@ from this dir::
 
 Documentation will be generated (in HTML format) inside the ``build/html``
 dir.
 
+If you get the error "ImportError: No module named sphinx_rtd_theme", run:
+    sudo pip install sphinx-rtd-theme
+
 View the documentation
 ----------------------
diff --git a/docs/source/topics/frontera-settings.rst b/docs/source/topics/frontera-settings.rst
index 805fc52f7..c6b7585b4 100644
--- a/docs/source/topics/frontera-settings.rst
+++ b/docs/source/topics/frontera-settings.rst
@@ -487,6 +487,96 @@ Default: ``timedelta(days=1)``
 
 Time between document visits, expressed in ``datetime.timedelta`` objects. Changing of this setting will only affect
 documents scheduled after the change. All previously queued documents will be crawled with old periodicity.
 
+.. _cassandra-settings:
+
+Cassandra
+---------
+
+
+.. setting:: CASSANDRABACKEND_DROP_ALL_TABLES
+
+CASSANDRABACKEND_DROP_ALL_TABLES
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Default: ``False``
+
+Set to ``True`` if you need to drop all DB tables on backend instantiation (e.g. on every Scrapy spider run).
+
+.. setting:: CASSANDRABACKEND_CLUSTER_IPS
+
+CASSANDRABACKEND_CLUSTER_IPS
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Default: ``['127.0.0.1']``
+
+IP addresses of the Cassandra cluster nodes. Defaults to localhost. To assign more than one IP, use this syntax: ``['192.168.0.1', '192.168.0.2']``
+
+CASSANDRABACKEND_CLUSTER_PORT
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Default: ``9042``
+
+Port used by the Cassandra cluster nodes.
+
+
+CASSANDRABACKEND_GENERATE_STATS
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Default: ``False``
+
+Set to ``True`` to create an extra table for stats collection, in which counters such as pages crawled, links queued, etc. are accumulated.
+
+
+CASSANDRABACKEND_KEYSPACE
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Default: ``frontera``
+
+The Cassandra keyspace to use.
+
+CASSANDRABACKEND_CREATE_KEYSPACE_IF_NOT_EXISTS
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Default: ``True``
+
+Creates the keyspace if it does not exist. Set to ``False`` if Frontera should not check this on every startup.
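+
+If you set this to ``False``, make sure the keyspace already exists. Otherwise the backend creates it
+with a query equivalent to the following CQL (the ``SimpleStrategy``/``replication_factor: 3`` values
+are hard-coded in this backend, and ``frontera`` is the default keyspace name)::
+
+    CREATE KEYSPACE IF NOT EXISTS "frontera"
+    WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 3};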
+
+
+CASSANDRABACKEND_CRAWL_ID
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Default: ``default``
+
+Sets an ID in each table row identifying the current crawl. If you want to run another crawl from the beginning in the same tables, set a different crawl ID. It is a text field.
+
+
+CASSANDRABACKEND_MODELS
+^^^^^^^^^^^^^^^^^^^^^^^
+
+Default::
+
+    {
+        'MetadataModel': 'frontera.contrib.backends.cassandra.models.MetadataModel',
+        'StateModel': 'frontera.contrib.backends.cassandra.models.StateModel',
+        'QueueModel': 'frontera.contrib.backends.cassandra.models.QueueModel',
+        'CrawlStatsModel': 'frontera.contrib.backends.cassandra.models.CrawlStatsModel'
+    }
+
+This is a mapping of the Cassandra models used by the backends. It is mainly used for customization.
+
+
+Revisiting backend
+------------------
+
+.. setting:: CASSANDRABACKEND_REVISIT_INTERVAL
+
+CASSANDRABACKEND_REVISIT_INTERVAL
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Default: ``timedelta(days=1)``
+
+Time between document visits, expressed in ``datetime.timedelta`` objects. Changing of this setting will only affect
+documents scheduled after the change. All previously queued documents will be crawled with old periodicity.
 
 .. _hbase-settings:
diff --git a/docs/source/topics/frontier-backends.rst b/docs/source/topics/frontier-backends.rst
index 706da8377..dfaddfc88 100644
--- a/docs/source/topics/frontier-backends.rst
+++ b/docs/source/topics/frontier-backends.rst
@@ -270,6 +270,33 @@ there are no documents available for crawling, but there are documents waiting f
 
     Base class for SQLAlchemy :class:`Backend <frontera.core.components.Backend>` implementation of revisiting back-end.
 
+.. _frontier-backends-cassandra:
+
+Cassandra backends
+^^^^^^^^^^^^^^^^^^
+
+This set of :class:`Backend <frontera.core.components.Backend>` objects will use `Cassandra`_ as storage for
+:ref:`basic algorithms <frontier-backends-basic-algorithms>`.
+
+Cassandra is a NoSQL column store with linear scalability and an SQL-like query language.
+
+If you need to use your own `declarative cassandra models`_, you can do it by using the
+:setting:`CASSANDRABACKEND_MODELS` setting.
+
+This setting uses a dictionary where ``key`` represents the name of the model to define and ``value`` the model to use.
+
+For a complete list of all settings used for the Cassandra backends, check the :doc:`settings <frontera-settings>` section.
+
+.. class:: frontera.contrib.backends.cassandra.BASE
+
+    Base class for Cassandra :class:`Backend <frontera.core.components.Backend>` objects.
+    It runs Cassandra in multi-spider, one-worker mode with the FIFO algorithm.
+
+.. class:: frontera.contrib.backends.cassandra.Distributed
+
+    Cassandra :class:`Backend <frontera.core.components.Backend>` implementation of the distributed Backend.
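+
+As noted above, custom models are supplied through the :setting:`CASSANDRABACKEND_MODELS` setting.
+A minimal sketch, where ``myproject.models.MyQueueModel`` is a hypothetical custom queue model
+(not part of this patch) replacing the default one::
+
+    CASSANDRABACKEND_MODELS = {
+        'MetadataModel': 'frontera.contrib.backends.cassandra.models.MetadataModel',
+        'StateModel': 'frontera.contrib.backends.cassandra.models.StateModel',
+        'QueueModel': 'myproject.models.MyQueueModel',
+        'CrawlStatsModel': 'frontera.contrib.backends.cassandra.models.CrawlStatsModel'
+    }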
+ + HBase backend ^^^^^^^^^^^^^ diff --git a/frontera/contrib/backends/cassandra/__init__.py b/frontera/contrib/backends/cassandra/__init__.py new file mode 100644 index 000000000..ff4134f6f --- /dev/null +++ b/frontera/contrib/backends/cassandra/__init__.py @@ -0,0 +1,199 @@ +from __future__ import absolute_import +from cassandra.cluster import Cluster +from cassandra.cqlengine import connection +from cassandra.query import dict_factory +from cassandra.policies import RetryPolicy, ConstantReconnectionPolicy +from cassandra.cqlengine.management import sync_table +from cassandra.cqlengine.management import drop_table +from frontera.core.components import DistributedBackend +from frontera.contrib.backends import CommonBackend +from frontera.contrib.backends.cassandra.components import Metadata, Queue, States +from frontera.utils.misc import load_object +import logging + + +class CassandraBackend(CommonBackend): + def __init__(self, manager): + self.manager = manager + settings = manager.settings + cluster_ips = settings.get('CASSANDRABACKEND_CLUSTER_IPS') + cluster_port = settings.get('CASSANDRABACKEND_CLUSTER_PORT') + drop_all_tables = settings.get('CASSANDRABACKEND_DROP_ALL_TABLES') + keyspace = settings.get('CASSANDRABACKEND_KEYSPACE') + keyspace_create = settings.get('CASSANDRABACKEND_CREATE_KEYSPACE_IF_NOT_EXISTS') + models = settings.get('CASSANDRABACKEND_MODELS') + crawl_id = settings.get('CASSANDRABACKEND_CRAWL_ID') + generate_stats = settings.get('CASSANDRABACKEND_GENERATE_STATS') + + self.models = dict([(name, load_object(klass)) for name, klass in models.items()]) + + self.cluster = Cluster( + contact_points=cluster_ips, + port=cluster_port, + compression=True, + default_retry_policy=RetryPolicy(), + reconnection_policy=ConstantReconnectionPolicy(10, 100) + ) + + self.session = self.cluster.connect() + self.session.row_factory = dict_factory + self.session.encoder.mapping[dict] = self.session.encoder.cql_encode_map_collection + self.crawl_id = crawl_id + self.generate_stats = generate_stats + + if keyspace_create: + query = """CREATE KEYSPACE IF NOT EXISTS \"%s\" + WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 3}""" % (keyspace, ) + self.session.execute(query) + + self.session.set_keyspace(keyspace) + + connection.set_session(self.session) + + if drop_all_tables: + for key, value in self.models.iteritems(): + drop_table(value) + + for key, value in self.models.iteritems(): + if (self.generate_stats is False and key != 'CrawlStatsModel') or self.generate_stats is True: + sync_table(value) + + self._metadata = Metadata(self.session, self.models['MetadataModel'], self.crawl_id, self.generate_stats) + self._states = States(self.session, self.models['StateModel'], + settings.get('STATE_CACHE_SIZE_LIMIT'), self.crawl_id) + self._queue = self._create_queue(settings) + + def frontier_stop(self): + self.states.flush() + self.session.shutdown() + + def _create_queue(self, settings): + return Queue(self.session, self.models['QueueModel'], settings.get('SPIDER_FEED_PARTITIONS'), + self.crawl_id, self.generate_stats) + + @property + def queue(self): + return self._queue + + @property + def metadata(self): + return self._metadata + + @property + def states(self): + return self._states + +BASE = CassandraBackend + + +class Distributed(DistributedBackend): + def __init__(self, manager): + self.manager = manager + settings = manager.settings + cluster_ips = settings.get('CASSANDRABACKEND_CLUSTER_IPS') # Format: ['192.168.0.1', '192.168.0.2'] + cluster_port = 
settings.get('CASSANDRABACKEND_CLUSTER_PORT') + keyspace = settings.get('CASSANDRABACKEND_KEYSPACE') + keyspace_create = settings.get('CASSANDRABACKEND_CREATE_KEYSPACE_IF_NOT_EXISTS') # Default: true + models = settings.get('CASSANDRABACKEND_MODELS') + + self.cluster = Cluster(cluster_ips, cluster_port) + self.models = dict([(name, load_object(klass)) for name, klass in models.items()]) + + self.session = self.cluster.connect() + self.session.row_factory = dict_factory + + if keyspace_create: + query = """CREATE KEYSPACE IF NOT EXISTS \"%s\" + WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 3}""" % (keyspace, ) + self.session.execute(query) + self.session.set_keyspace(keyspace) + connection.set_session(self.session) + + self._metadata = None + self._queue = None + self._states = None + + @classmethod + def strategy_worker(cls, manager): + b = cls(manager) + settings = manager.settings + drop_all_tables = settings.get('CASSANDRABACKEND_DROP_ALL_TABLES') + crawl_id = settings.get('CASSANDRABACKEND_CRAWL_ID') + model = b.models['StateModel'] + + if drop_all_tables: + drop_table(model) + + sync_table(model) + + b._states = States(b.session, model, + settings.get('STATE_CACHE_SIZE_LIMIT'), crawl_id) + return b + + @classmethod + def db_worker(cls, manager): + b = cls(manager) + settings = manager.settings + drop = settings.get('CASSANDRABACKEND_DROP_ALL_TABLES') + crawl_id = settings.get('CASSANDRABACKEND_CRAWL_ID') + generate_stats = settings.get('CASSANDRABACKEND_GENERATE_STATS') + + metadata_m = b.models['MetadataModel'] + queue_m = b.models['QueueModel'] + stats_m = b.models['CrawlStatsModel'] + if drop: + drop_table(metadata_m) + drop_table(queue_m) + drop_table(stats_m) + + sync_table(metadata_m) + sync_table(queue_m) + if generate_stats is True: + sync_table(stats_m) + + b._metadata = Metadata(b.session, metadata_m, crawl_id, generate_stats) + b._queue = Queue(b.session, queue_m, settings.get('SPIDER_FEED_PARTITIONS'), crawl_id, generate_stats) + return b + + @property + def queue(self): + return self._queue + + @property + def metadata(self): + return self._metadata + + @property + def states(self): + return self._states + + def frontier_start(self): + for component in [self.metadata, self.queue, self.states]: + if component: + component.frontier_start() + + def frontier_stop(self): + for component in [self.metadata, self.queue, self.states]: + if component: + component.frontier_stop() + + def add_seeds(self, seeds): + self.metadata.add_seeds(seeds) + + def get_next_requests(self, max_next_requests, **kwargs): + partitions = kwargs.pop('partitions', [0]) # TODO: Collect from all known partitions + batch = [] + for partition_id in partitions: + batch.extend(self.queue.get_next_requests(max_next_requests, partition_id, **kwargs)) + return batch + + def page_crawled(self, response, links): + self.metadata.page_crawled(response, links) + + def request_error(self, request, error): + self.metadata.request_error(request, error) + + def finished(self): + return NotImplementedError + + diff --git a/frontera/contrib/backends/cassandra/components.py b/frontera/contrib/backends/cassandra/components.py new file mode 100644 index 000000000..9584a8629 --- /dev/null +++ b/frontera/contrib/backends/cassandra/components.py @@ -0,0 +1,314 @@ +# -*- coding: utf-8 -*- +import logging +from datetime import datetime +from time import time +from frontera.contrib.backends.partitioners import Crc32NamePartitioner +from frontera.contrib.backends.memory import MemoryStates +from 
frontera.core.components import Metadata as BaseMetadata, Queue as BaseQueue +from frontera.core.models import Request, Response +from frontera.utils.misc import get_crc32, chunks +from frontera.utils.url import parse_domain_from_url_fast +from cassandra.concurrent import execute_concurrent_with_args +from frontera.contrib.backends.cassandra.models import Meta + + +class Metadata(BaseMetadata): + def __init__(self, session, model_cls, crawl_id, generate_stats): + self.session = session + self.model = model_cls + self.table = 'MetadataModel' + self.logger = logging.getLogger("frontera.contrib.backends.cassandra.components.Metadata") + self.crawl_id = crawl_id + self.generate_stats = generate_stats + self.counter_cls = CassandraCount(crawl_id, self.session, generate_stats) + + def frontier_stop(self): + pass + + def add_seeds(self, seeds): + cql_items = [] + for seed in seeds: + query = self.session.prepare( + "INSERT INTO metadata (crawl, fingerprint, url, created_at, meta, headers, cookies, method, depth) " + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)") + meta = Meta(domain=seed.meta['domain'], fingerprint=seed.meta['fingerprint'], + origin_is_frontier=seed.meta['origin_is_frontier'], + scrapy_callback=seed.meta['scrapy_callback'], scrapy_errback=seed.meta['scrapy_errback'], + scrapy_meta=seed.meta['scrapy_meta']) + cql_i = (self.crawl_id, seed.meta['fingerprint'], seed.url, datetime.utcnow(), meta, + seed.headers, seed.cookies, seed.method, 0) + cql_items.append(cql_i) + if len(seeds) > 0: + execute_concurrent_with_args(self.session, query, cql_items, concurrency=400) + self.counter_cls.cass_count({"seed_urls": len(seeds)}) + + def request_error(self, page, error): + query_page = self.session.prepare( + "UPDATE metadata SET error = ? WHERE crawl = ? AND fingerprint = ?") + self.session.execute(query_page, (error, self.crawl_id, page.meta['fingerprint'])) + self.counter_cls.cass_count({"error": 1}) + + def page_crawled(self, response, links): + query_page = self.session.prepare( + "UPDATE metadata SET fetched_at = ?, headers = ?, method = ?, cookies = ?, status_code = ? " + "WHERE crawl= ? AND fingerprint = ?") + self.session.execute_async(query_page, (datetime.utcnow(), response.request.headers, response.request.method, + response.request.cookies, response.status_code, self.crawl_id, + response.meta['fingerprint'])) + depth = 0 + page_res = self.model.objects.filter(crawl=self.crawl_id, fingerprint=response.meta['fingerprint']) + if page_res[0].depth > 0: + depth = page_res[0].depth + + query = self.session.prepare( + "INSERT INTO metadata (crawl, fingerprint, created_at, method, url, depth) VALUES (?, ?, ?, ?, ?, ?)") + cql_items = [] + for link in links: + if response.meta['fingerprint'] != link.meta['fingerprint']: + cql_i = (self.crawl_id, link.meta['fingerprint'], datetime.utcnow(), link.method, link.url, depth+1) + cql_items.append(cql_i) + execute_concurrent_with_args(self.session, query, cql_items, concurrency=400) + self.counter_cls.cass_count({"pages_crawled": 1, "links_found": len(cql_items)}) + + def update_score(self, batch): + query = self.session.prepare("UPDATE metadata SET score = ? WHERE crawl = ? 
AND fingerprint = ?") + cql_items = [] + for fprint, score, request, schedule in batch: + cql_i = (score, self.crawl_id, fprint) + cql_items.append(cql_i) + execute_concurrent_with_args(self.session, query, cql_items, concurrency=400) + self.counter_cls.cass_count({"scored_urls": len(cql_items)}) + + +class States(MemoryStates): + + def __init__(self, session, model_cls, cache_size_limit, crawl_id): + super(States, self).__init__(cache_size_limit) + self.session = session + self.model = model_cls + self.table = 'StateModel' + self.logger = logging.getLogger("frontera.contrib.backends.cassandra.components.States") + self.crawl_id = crawl_id + + def frontier_stop(self): + pass + + def fetch(self, fingerprints): + to_fetch = [f for f in fingerprints if f not in self._cache] + self.logger.debug("cache size %s", len(self._cache)) + self.logger.debug("to fetch %d from %d", (len(to_fetch), len(fingerprints))) + + for chunk in chunks(to_fetch, 128): + for state in self.model.objects.filter(crawl=self.crawl_id, fingerprint__in=chunk): + self._cache[state.fingerprint] = state.state + + def flush(self, force_clear=False): + query = self.session.prepare("INSERT INTO states (crawl, fingerprint, state) VALUES (?, ?, ?)") + cql_items = [] + for fingerprint, state_val in self._cache.iteritems(): + cql_i = (self.crawl_id, fingerprint, state_val) + cql_items.append(cql_i) + execute_concurrent_with_args(self.session, query, cql_items, concurrency=20000) + super(States, self).flush(force_clear) + + +class Queue(BaseQueue): + def __init__(self, session, queue_cls, partitions, crawl_id, generate_stats, ordering='default'): + self.session = session + self.queue_model = queue_cls + self.logger = logging.getLogger("frontera.contrib.backends.cassandra.components.Queue") + self.partitions = [i for i in range(0, partitions)] + self.partitioner = Crc32NamePartitioner(self.partitions) + self.ordering = ordering + self.crawl_id = crawl_id + self.counter_cls = CassandraCount(crawl_id, self.session, generate_stats) + + def frontier_stop(self): + pass + + def _order_by(self): + if self.ordering == 'created': + return "created_at" + return "created_at" + + def get_next_requests(self, max_n_requests, partition_id, **kwargs): + """ + Dequeues new batch of requests for crawling. + + :param max_n_requests: maximum number of requests to return + :param partition_id: partition id + :return: list of :class:`Request ` objects. + """ + results = [] + try: + dequeued_urls = 0 + cql_ditems = [] + d_query = self.session.prepare("DELETE FROM queue WHERE crawl = ? AND fingerprint = ? AND partition_id = ? " + "AND score = ? 
AND created_at = ?") + for item in self.queue_model.objects.filter(crawl=self.crawl_id, partition_id=partition_id).\ + order_by("partition_id", "score", self._order_by()).limit(max_n_requests): + method = 'GET' if not item.method else item.method + + meta_dict2 = dict((name, getattr(item.meta, name)) for name in dir(item.meta) + if not name.startswith('__')) + # TODO: How the result can be an dict not an object -> Objects get error while encodeing for Message Bus + # If I take meta_dict2 direct to Request i get the same error message + + meta_dict = dict() + meta_dict["fingerprint"] = meta_dict2["fingerprint"] + meta_dict["domain"] = meta_dict2["domain"] + meta_dict["origin_is_frontier"] = meta_dict2["origin_is_frontier"] + meta_dict["scrapy_callback"] = meta_dict2["scrapy_callback"] + meta_dict["scrapy_errback"] = meta_dict2["scrapy_errback"] + meta_dict["scrapy_meta"] = meta_dict2["scrapy_meta"] + meta_dict["score"] = meta_dict2["score"] + meta_dict["jid"] = meta_dict2["jid"] + + r = Request(item.url, method=method, meta=meta_dict, headers=item.headers, cookies=item.cookies) + r.meta['fingerprint'] = item.fingerprint + r.meta['score'] = item.score + results.append(r) + + cql_d = (item.crawl, item.fingerprint, item.partition_id, item.score, item.created_at) + cql_ditems.append(cql_d) + dequeued_urls += 1 + + if dequeued_urls > 0: + execute_concurrent_with_args(self.session, d_query, cql_ditems, concurrency=200) + + self.counter_cls.cass_count({"dequeued_urls": dequeued_urls}) + + except Exception, exc: + self.logger.exception(exc) + + return results + + def schedule(self, batch): + query = self.session.prepare("INSERT INTO queue (crawl, fingerprint, score, partition_id, host_crc32, url, " + "created_at, meta, depth, headers, method, cookies) " + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)") + cql_items = [] + for fprint, score, request, schedule in batch: + if schedule: + _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url) + if not hostname: + self.logger.error("Can't get hostname for URL %s, fingerprint %s" % (request.url, fprint)) + partition_id = self.partitions[0] + host_crc32 = 0 + else: + partition_id = self.partitioner.partition(hostname, self.partitions) + host_crc32 = get_crc32(hostname) + created_at = time()*1E+6 + + if "domain" not in request.meta: + request.meta["domain"] = {} + if "origin_is_frontier" not in request.meta: + request.meta["origin_is_frontier"] = '' + if "scrapy_callback" not in request.meta: + request.meta["scrapy_callback"] = None + if "scrapy_errback" not in request.meta: + request.meta["scrapy_errback"] = None + if "scrapy_meta" not in request.meta: + request.meta["scrapy_meta"] = {} + if "score" not in request.meta: + request.meta["score"] = 0 + if "jid" not in request.meta: + request.meta["jid"] = 0 + + meta = Meta(domain=request.meta['domain'], fingerprint=fprint, + origin_is_frontier=request.meta['origin_is_frontier'], + scrapy_callback=request.meta['scrapy_callback'], + scrapy_errback=request.meta['scrapy_errback'], scrapy_meta=request.meta['scrapy_meta']) + + cql_i = (self.crawl_id, fprint, score, partition_id, host_crc32, request.url, created_at, meta, 0, + request.headers, request.method, request.cookies) + cql_items.append(cql_i) + + request.meta['state'] = States.QUEUED + + execute_concurrent_with_args(self.session, query, cql_items, concurrency=400) + self.counter_cls.cass_count({"queued_urls": len(cql_items)}) + + def count(self): + count = self.queue_model.objects.filter(crawl=self.crawl_id).count() + return count + + def 
cass_count(self, counts, generate_stats): + if generate_stats is True: + for row, count in counts.iteritems(): + count_page = self.session.prepare("UPDATE crawlstats SET "+row+" = "+row+" + ? WHERE crawl= ?") + self.session.execute_async(count_page, (count, self.crawl_id)) + + +class BroadCrawlingQueue(Queue): + GET_RETRIES = 3 + + def get_next_requests(self, max_n_requests, partition_id, **kwargs): + """ + Dequeues new batch of requests for crawling. + + Priorities, from highest to lowest: + - max_requests_per_host + - max_n_requests + - min_hosts & min_requests + + :param max_n_requests: + :param partition_id: + :param kwargs: min_requests, min_hosts, max_requests_per_host + :return: list of :class:`Request ` objects. + """ + min_requests = kwargs.pop("min_requests", None) + min_hosts = kwargs.pop("min_hosts", None) + max_requests_per_host = kwargs.pop("max_requests_per_host", None) + assert(max_n_requests > min_requests) + + queue = {} + limit = max_n_requests + tries = 0 + count = 0 + while tries < self.GET_RETRIES: + tries += 1 + limit *= 5.5 if tries > 1 else 1.0 + self.logger.debug("Try %d, limit %d, last attempt: requests %d, hosts %d" % + (tries, limit, count, len(queue.keys()))) + queue.clear() + count = 0 + for item in self.queue_model.objects.filter(crawl=self.crawl_id, partition_id=partition_id).\ + order_by("crawl", "score", self._order_by()).limit(limit): + if item.host_crc32 not in queue: + queue[item.host_crc32] = [] + if max_requests_per_host is not None and len(queue[item.host_crc32]) > max_requests_per_host: + continue + queue[item.host_crc32].append(item) + count += 1 + if count > max_n_requests: + break + if min_hosts is not None and len(queue.keys()) < min_hosts: + continue + if min_requests is not None and count < min_requests: + continue + break + self.logger.debug("Finished: tries %d, hosts %d, requests %d" % (tries, len(queue.keys()), count)) + + results = [] + for items in queue.itervalues(): + for item in items: + method = 'GET' if not item.method else item.method + results.append(Request(item.url, method=method, meta=item.meta, headers=item.headers, + cookies=item.cookies)) + item.delete() + return results + + +class CassandraCount: + + def __init__(self, crawl_id, session, generate_stats): + self.generate_stats = generate_stats + self.session = session + self.crawl_id = crawl_id + + def cass_count(self, counts): + if self.generate_stats is True: + for row, count in counts.iteritems(): + count_page = self.session.prepare("UPDATE crawlstats SET "+row+" = "+row+" + ? 
WHERE crawl= ?") + self.session.execute_async(count_page, (count, self.crawl_id)) \ No newline at end of file diff --git a/frontera/contrib/backends/cassandra/models.py b/frontera/contrib/backends/cassandra/models.py new file mode 100644 index 000000000..9065477d0 --- /dev/null +++ b/frontera/contrib/backends/cassandra/models.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- +import uuid +from cassandra.cqlengine.models import Model +from cassandra.cqlengine.usertype import UserType +from cassandra.cqlengine.columns import Map, Text, Float, Integer, DateTime, UserDefinedType, Counter, Boolean, \ + SmallInt, BigInt + + +class Meta(UserType): + domain = Map(Text(), Text(), required=False) + fingerprint = Text() + origin_is_frontier = Boolean() + scrapy_callback = Text() + scrapy_errback = Text() + scrapy_meta = Map(Text(), Text(), required=False) + score = Float(required=False) + jid = Integer(required=False) + + +class MetadataModel(Model): + __table_name__ = 'metadata' + + crawl = Text(primary_key=True) + fingerprint = Text(primary_key=True) + url = Text(index=True) + depth = Integer() + created_at = DateTime() + fetched_at = DateTime(required=False) + status_code = Integer(required=False) + score = Float(required=False) + error = Text(required=False) + meta = UserDefinedType(Meta) + headers = Map(Text(), Text(), required=False) + cookies = Map(Text(), Text(), required=False) + method = Text(required=False) + + @classmethod + def query(cls, session): + return session.query(cls) + + def __repr__(self): + return '' % (self.url, self.fingerprint) + + +class StateModel(Model): + __table_name__ = 'states' + + crawl = Text(primary_key=True) + fingerprint = Text(primary_key=True) + state = SmallInt(index=True) + + @classmethod + def query(cls, session): + return session.query(cls) + + def __repr__(self): + return '' % (self.fingerprint, self.state) + + +class QueueModel(Model): + __table_name__ = 'queue' + + crawl = Text(primary_key=True) + partition_id = Integer(primary_key=True) + score = Float(primary_key=True) + created_at = BigInt(primary_key=True) + fingerprint = Text(primary_key=True) + url = Text() + host_crc32 = Integer() + meta = UserDefinedType(Meta) + headers = Map(Text(), Text(), required=False) + cookies = Map(Text(), Text(), required=False) + method = Text(required=False) + depth = SmallInt(required=False) + + @classmethod + def query(cls, session): + return session.query(cls) + + def __repr__(self): + return '' % (self.url, self.id) + + +class CrawlStatsModel(Model): + __table_name__ = 'crawlstats' + + crawl = Text(primary_key=True) + pages_crawled = Counter() + links_found = Counter() + errors = Counter() + seed_urls = Counter() + scored_urls = Counter() + queued_urls = Counter() + dequeued_urls = Counter() + + @classmethod + def query(cls, session): + return session.query(cls) + + def __repr__(self): + return '' % (self.url, self.id) diff --git a/frontera/contrib/backends/cassandra/revisiting.py b/frontera/contrib/backends/cassandra/revisiting.py new file mode 100644 index 000000000..51ce212fc --- /dev/null +++ b/frontera/contrib/backends/cassandra/revisiting.py @@ -0,0 +1,139 @@ +# -*- coding: utf-8 -*- +import logging +import json +from datetime import datetime, timedelta +from time import time, sleep + +from frontera import Request +from frontera.contrib.backends.partitioners import Crc32NamePartitioner +from frontera.contrib.backends.cassandra import CassandraBackend +from cassandra.cqlengine import columns +from cassandra.cqlengine.models import Model +from 
frontera.core.components import Queue as BaseQueue, States +from frontera.utils.misc import get_crc32 +from frontera.utils.url import parse_domain_from_url_fast + + +class RevisitingQueueModel(Model): + __table_name__ = 'revisiting_queue' + + crawl_at = columns.DateTime(required=True, default=datetime.now(), index=True) + + +def retry_and_rollback(func): + def func_wrapper(self, *args, **kwargs): + tries = 5 + while True: + try: + return func(self, *args, **kwargs) + except Exception, exc: + self.logger.exception(exc) + sleep(5) + tries -= 1 + if tries > 0: + self.logger.info("Tries left %i" % tries) + continue + else: + raise exc + return func_wrapper + + +class RevisitingQueue(BaseQueue): + def __init__(self, session, queue_cls, partitions): + self.session = session() + self.queue_model = queue_cls + self.logger = logging.getLogger("frontera.contrib.backends.sqlalchemy.revisiting.RevisitingQueue") + self.partitions = [i for i in range(0, partitions)] + self.partitioner = Crc32NamePartitioner(self.partitions) + + def frontier_stop(self): + pass + + def get_next_requests(self, max_n_requests, partition_id, **kwargs): + results = [] + try: + for item in self.queue_model.objects.filter(crawl_at=datetime.utcnow(), partition_id=partition_id).\ + limit(max_n_requests): + method = 'GET' if not item.method else item.method + results.append(Request(item.url, method=method, meta=item.meta, headers=item.headers, + cookies=item.cookies)) + item.delete() + except Exception, exc: + self.logger.exception(exc) + return results + + @retry_and_rollback + def schedule(self, batch): + for fprint, score, request, schedule_at in batch: + if schedule_at: + _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url) + if not hostname: + self.logger.error("Can't get hostname for URL %s, fingerprint %s" % (request.url, fprint)) + partition_id = self.partitions[0] + host_crc32 = 0 + else: + partition_id = self.partitioner.partition(hostname, self.partitions) + host_crc32 = get_crc32(hostname) + created_at = time()*1E+6 + q = self._create_queue(request, fprint, score, partition_id, host_crc32, created_at) + + q.save() + request.meta['state'] = States.QUEUED + + def _create_queue(self, obj, fingerprint, score, partition_id, host_crc32, created_at): + db_queue = self.queue_model() + db_queue.fingerprint = fingerprint + db_queue.score = score + db_queue.partition_id = partition_id + db_queue.host_crc32 = host_crc32 + db_queue.url = obj.url + db_queue.created_at = created_at + + new_dict = {} + for kmeta, vmeta in obj.meta.iteritems(): + if type(vmeta) is dict: + new_dict[kmeta] = json.dumps(vmeta) + else: + new_dict[kmeta] = str(vmeta) + + db_queue.meta = new_dict + db_queue.depth = 0 + + db_queue.headers = obj.headers + db_queue.method = obj.method + db_queue.cookies = obj.cookies + + return db_queue + + @retry_and_rollback + def count(self): + return self.session.query(self.queue_model).count() + + +class Backend(CassandraBackend): + + def _create_queue(self, settings): + self.interval = settings.get("SQLALCHEMYBACKEND_REVISIT_INTERVAL") + assert isinstance(self.interval, timedelta) + return RevisitingQueue(self.session, RevisitingQueueModel, settings.get('SPIDER_FEED_PARTITIONS')) + + def _schedule(self, requests): + batch = [] + queue_incr = 0 + for request in requests: + if request.meta['state'] in [States.NOT_CRAWLED, None]: + schedule_at = datetime.utcnow() + elif request.meta['state'] in [States.CRAWLED, States.ERROR]: + schedule_at = datetime.utcnow() + self.interval + else: # QUEUED + schedule_at = None 
+ batch.append((request.meta['fingerprint'], self._get_score(request), request, schedule_at)) + if schedule_at: + queue_incr += 1 + self.queue.schedule(batch) + self.metadata.update_score(batch) + self.queue_size += queue_incr + + def page_crawled(self, response, links): + super(Backend, self).page_crawled(response, links) + self._schedule([response.request]) diff --git a/frontera/contrib/backends/cassandra/test_backend.py b/frontera/contrib/backends/cassandra/test_backend.py new file mode 100644 index 000000000..1570d4c4c --- /dev/null +++ b/frontera/contrib/backends/cassandra/test_backend.py @@ -0,0 +1,23 @@ +import os + +from psycopg2 import connect +from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT + +from frontera.tests import backends +from frontera.tests.test_revisiting_backend import RevisitingBackendTest + + +#---------------------------------------------------- +# Cassandra base classes +#---------------------------------------------------- +class cassandraFIFO(backends.FIFOBackendTest): + backend_class = 'frontera.contrib.backends.cassandra.FIFO' + + +class cassandraLIFO(backends.LIFOBackendTest): + backend_class = 'frontera.contrib.backends.cassandra.LIFO' + + +class cassandraRevisiting(RevisitingBackendTest): + backend_class = 'frontera.contrib.backends.cassandra.revisiting.Backend' + diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index b049e7bdc..6aecaceb8 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -8,6 +8,21 @@ BC_MIN_HOSTS = 24 BC_MAX_REQUESTS_PER_HOST = 128 CANONICAL_SOLVER = 'frontera.contrib.canonicalsolvers.Basic' +CASSANDRABACKEND_CACHE_SIZE = 10000 +CASSANDRABACKEND_DROP_ALL_TABLES = False +CASSANDRABACKEND_MODELS = { + 'MetadataModel': 'frontera.contrib.backends.cassandra.models.MetadataModel', + 'StateModel': 'frontera.contrib.backends.cassandra.models.StateModel', + 'QueueModel': 'frontera.contrib.backends.cassandra.models.QueueModel', + 'CrawlStatsModel': 'frontera.contrib.backends.cassandra.models.CrawlStatsModel' +} +CASSANDRABACKEND_REVISIT_INTERVAL = timedelta(days=1) +CASSANDRABACKEND_CLUSTER_IPS = ['127.0.0.1'] +CASSANDRABACKEND_CLUSTER_PORT = 9042 +CASSANDRABACKEND_KEYSPACE = 'frontera' +CASSANDRABACKEND_CREATE_KEYSPACE_IF_NOT_EXISTS = True +CASSANDRABACKEND_CRAWL_ID = "default" +CASSANDRABACKEND_GENERATE_STATS = False DELAY_ON_EMPTY = 5.0 DOMAIN_FINGERPRINT_FUNCTION = 'frontera.utils.fingerprint.sha1' @@ -77,4 +92,4 @@ SCORING_LOG_DBW_GROUP = "dbw-scoring-log" SPIDER_FEED_GROUP = "fetchers-spider-feed" -KAFKA_CODEC = None \ No newline at end of file +KAFKA_CODEC = None diff --git a/requirements/tests.txt b/requirements/tests.txt index 0ac170f54..2c4d75a45 100644 --- a/requirements/tests.txt +++ b/requirements/tests.txt @@ -13,3 +13,4 @@ happybase>=1.0.0 mock boto>=2.42.0 -r logging.txt +cassandra-driver From 8c09db92f565f98b15a85d40f6ff12696773ec52 Mon Sep 17 00:00:00 2001 From: voith Date: Sun, 30 Oct 2016 12:38:53 -0500 Subject: [PATCH 02/14] refactored some sqlalchemy code to reuse in cassandra backend --- .gitignore | 3 - docs/README | 3 - frontera/contrib/backends/__init__.py | 73 ++++++++++++++++++- .../contrib/backends/cassandra/components.py | 19 +++-- frontera/contrib/backends/cassandra/models.py | 6 +- .../contrib/backends/cassandra/revisiting.py | 18 +++-- .../backends/cassandra/test_backend.py | 23 ------ .../contrib/backends/sqlalchemy/__init__.py | 70 ++---------------- 8 files changed, 102 insertions(+), 113 deletions(-) delete mode 
100644 frontera/contrib/backends/cassandra/test_backend.py diff --git a/.gitignore b/.gitignore index e74280063..287b0b569 100644 --- a/.gitignore +++ b/.gitignore @@ -52,6 +52,3 @@ docs/_build/ # PyBuilder target/ - -# PyCharm Idea Folder -.idea/ \ No newline at end of file diff --git a/docs/README b/docs/README index fd04c32f5..3d9114563 100644 --- a/docs/README +++ b/docs/README @@ -30,9 +30,6 @@ from this dir:: Documentation will be generated (in HTML format) inside the ``build/html`` dir. -If you get the error "ImportError: No module named sphinx_rtd_theme" run: - sudo pip install sphinx-rtd-theme - View the documentation ---------------------- diff --git a/frontera/contrib/backends/__init__.py b/frontera/contrib/backends/__init__.py index 2dc89a1ee..2eb65bb70 100644 --- a/frontera/contrib/backends/__init__.py +++ b/frontera/contrib/backends/__init__.py @@ -3,7 +3,7 @@ from collections import OrderedDict from frontera import Backend -from frontera.core.components import States +from frontera.core.components import States, Queue as BaseQueue, DistributedBackend class CommonBackend(Backend): @@ -84,3 +84,74 @@ def request_error(self, request, error): def finished(self): return self.queue_size == 0 + + +class CommonStorageBackend(CommonBackend): + + def _create_queue(self, settings): + if not isinstance(self.queue_component, BaseQueue): + raise TypeError('expected queue_component to ' + 'belong to class: %s, got %s instead' % (type(BaseQueue).__name__, + type(self.queue_component).__name__)) + return self.queue_component(self.session_cls, + self.models['QueueModel'], + settings.get('SPIDER_FEED_PARTITIONS')) + + @property + def queue(self): + return self._queue + + @property + def metadata(self): + return self._metadata + + @property + def states(self): + return self._states + + +class CommonDistributedStorageBackend(DistributedBackend): + + @property + def queue(self): + return self._queue + + @property + def metadata(self): + return self._metadata + + @property + def states(self): + return self._states + + def frontier_start(self): + for component in [self.metadata, self.queue, self.states]: + if component: + component.frontier_start() + + def frontier_stop(self): + for component in [self.metadata, self.queue, self.states]: + if component: + component.frontier_stop() + + def add_seeds(self, seeds): + self.metadata.add_seeds(seeds) + + def get_next_requests(self, max_next_requests, **kwargs): + partitions = kwargs.pop('partitions', [0]) # TODO: Collect from all known partitions + batch = [] + for partition_id in partitions: + batch.extend(self.queue.get_next_requests(max_next_requests, partition_id, **kwargs)) + return batch + + def page_crawled(self, response): + self.metadata.page_crawled(response) + + def links_extracted(self, request, links): + self.metadata.links_extracted(request, links) + + def request_error(self, request, error): + self.metadata.request_error(request, error) + + def finished(self): + raise NotImplementedError diff --git a/frontera/contrib/backends/cassandra/components.py b/frontera/contrib/backends/cassandra/components.py index 9584a8629..ec9927436 100644 --- a/frontera/contrib/backends/cassandra/components.py +++ b/frontera/contrib/backends/cassandra/components.py @@ -2,14 +2,17 @@ import logging from datetime import datetime from time import time -from frontera.contrib.backends.partitioners import Crc32NamePartitioner -from frontera.contrib.backends.memory import MemoryStates -from frontera.core.components import Metadata as BaseMetadata, Queue as BaseQueue 
-from frontera.core.models import Request, Response -from frontera.utils.misc import get_crc32, chunks -from frontera.utils.url import parse_domain_from_url_fast + from cassandra.concurrent import execute_concurrent_with_args + from frontera.contrib.backends.cassandra.models import Meta +from frontera.contrib.backends.memory import MemoryStates +from frontera.contrib.backends.partitioners import Crc32NamePartitioner +from frontera.core.components import Metadata as BaseMetadata +from frontera.core.components import Queue as BaseQueue +from frontera.core.models import Request +from frontera.utils.misc import chunks, get_crc32 +from frontera.utils.url import parse_domain_from_url_fast class Metadata(BaseMetadata): @@ -178,7 +181,7 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): self.counter_cls.cass_count({"dequeued_urls": dequeued_urls}) - except Exception, exc: + except Exception as exc: self.logger.exception(exc) return results @@ -311,4 +314,4 @@ def cass_count(self, counts): if self.generate_stats is True: for row, count in counts.iteritems(): count_page = self.session.prepare("UPDATE crawlstats SET "+row+" = "+row+" + ? WHERE crawl= ?") - self.session.execute_async(count_page, (count, self.crawl_id)) \ No newline at end of file + self.session.execute_async(count_page, (count, self.crawl_id)) diff --git a/frontera/contrib/backends/cassandra/models.py b/frontera/contrib/backends/cassandra/models.py index 9065477d0..2c21854e5 100644 --- a/frontera/contrib/backends/cassandra/models.py +++ b/frontera/contrib/backends/cassandra/models.py @@ -1,9 +1,9 @@ # -*- coding: utf-8 -*- -import uuid +from cassandra.cqlengine.columns import (BigInt, Boolean, Counter, DateTime, + Float, Integer, Map, SmallInt, Text, + UserDefinedType) from cassandra.cqlengine.models import Model from cassandra.cqlengine.usertype import UserType -from cassandra.cqlengine.columns import Map, Text, Float, Integer, DateTime, UserDefinedType, Counter, Boolean, \ - SmallInt, BigInt class Meta(UserType): diff --git a/frontera/contrib/backends/cassandra/revisiting.py b/frontera/contrib/backends/cassandra/revisiting.py index 51ce212fc..f6be289d3 100644 --- a/frontera/contrib/backends/cassandra/revisiting.py +++ b/frontera/contrib/backends/cassandra/revisiting.py @@ -1,15 +1,17 @@ # -*- coding: utf-8 -*- -import logging import json +import logging from datetime import datetime, timedelta -from time import time, sleep +from time import sleep, time -from frontera import Request -from frontera.contrib.backends.partitioners import Crc32NamePartitioner -from frontera.contrib.backends.cassandra import CassandraBackend from cassandra.cqlengine import columns from cassandra.cqlengine.models import Model -from frontera.core.components import Queue as BaseQueue, States + +from frontera import Request +from frontera.contrib.backends.cassandra import CassandraBackend +from frontera.contrib.backends.partitioners import Crc32NamePartitioner +from frontera.core.components import Queue as BaseQueue +from frontera.core.components import States from frontera.utils.misc import get_crc32 from frontera.utils.url import parse_domain_from_url_fast @@ -26,7 +28,7 @@ def func_wrapper(self, *args, **kwargs): while True: try: return func(self, *args, **kwargs) - except Exception, exc: + except Exception as exc: self.logger.exception(exc) sleep(5) tries -= 1 @@ -58,7 +60,7 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): results.append(Request(item.url, method=method, meta=item.meta, headers=item.headers, 
cookies=item.cookies)) item.delete() - except Exception, exc: + except Exception as exc: self.logger.exception(exc) return results diff --git a/frontera/contrib/backends/cassandra/test_backend.py b/frontera/contrib/backends/cassandra/test_backend.py deleted file mode 100644 index 1570d4c4c..000000000 --- a/frontera/contrib/backends/cassandra/test_backend.py +++ /dev/null @@ -1,23 +0,0 @@ -import os - -from psycopg2 import connect -from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT - -from frontera.tests import backends -from frontera.tests.test_revisiting_backend import RevisitingBackendTest - - -#---------------------------------------------------- -# Cassandra base classes -#---------------------------------------------------- -class cassandraFIFO(backends.FIFOBackendTest): - backend_class = 'frontera.contrib.backends.cassandra.FIFO' - - -class cassandraLIFO(backends.LIFOBackendTest): - backend_class = 'frontera.contrib.backends.cassandra.LIFO' - - -class cassandraRevisiting(RevisitingBackendTest): - backend_class = 'frontera.contrib.backends.cassandra.revisiting.Backend' - diff --git a/frontera/contrib/backends/sqlalchemy/__init__.py b/frontera/contrib/backends/sqlalchemy/__init__.py index b8e7b8aa1..9ca92bc04 100644 --- a/frontera/contrib/backends/sqlalchemy/__init__.py +++ b/frontera/contrib/backends/sqlalchemy/__init__.py @@ -4,14 +4,16 @@ from sqlalchemy.orm import sessionmaker from sqlalchemy.engine.reflection import Inspector -from frontera.core.components import DistributedBackend -from frontera.contrib.backends import CommonBackend +from frontera.contrib.backends import CommonBackend, CommonStorageBackend, CommonDistributedStorageBackend from frontera.contrib.backends.sqlalchemy.components import Metadata, Queue, States from frontera.contrib.backends.sqlalchemy.models import DeclarativeBase from frontera.utils.misc import load_object -class SQLAlchemyBackend(CommonBackend): +class SQLAlchemyBackend(CommonStorageBackend): + + queue_component = Queue + def __init__(self, manager): self.manager = manager settings = manager.settings @@ -46,21 +48,6 @@ def frontier_stop(self): super(SQLAlchemyBackend, self).frontier_stop() self.engine.dispose() - def _create_queue(self, settings): - return Queue(self.session_cls, self.models['QueueModel'], settings.get('SPIDER_FEED_PARTITIONS')) - - @property - def queue(self): - return self._queue - - @property - def metadata(self): - return self._metadata - - @property - def states(self): - return self._states - class FIFOBackend(SQLAlchemyBackend): component_name = 'SQLAlchemy FIFO Backend' @@ -105,7 +92,7 @@ def _get_score(self, obj): BFS = BFSBackend -class Distributed(DistributedBackend): +class Distributed(CommonDistributedStorageBackend): def __init__(self, manager): self.manager = manager settings = manager.settings @@ -171,48 +158,3 @@ def db_worker(cls, manager): settings.get('SQLALCHEMYBACKEND_CACHE_SIZE')) b._queue = Queue(b.session_cls, queue_m, settings.get('SPIDER_FEED_PARTITIONS')) return b - - @property - def queue(self): - return self._queue - - @property - def metadata(self): - return self._metadata - - @property - def states(self): - return self._states - - def frontier_start(self): - for component in [self.metadata, self.queue, self.states]: - if component: - component.frontier_start() - - def frontier_stop(self): - for component in [self.metadata, self.queue, self.states]: - if component: - component.frontier_stop() - - def add_seeds(self, seeds): - self.metadata.add_seeds(seeds) - - def get_next_requests(self, 
max_next_requests, **kwargs): - partitions = kwargs.pop('partitions', [0]) # TODO: Collect from all known partitions - batch = [] - for partition_id in partitions: - batch.extend(self.queue.get_next_requests(max_next_requests, partition_id, **kwargs)) - return batch - - def page_crawled(self, response): - self.metadata.page_crawled(response) - - def links_extracted(self, request, links): - self.metadata.links_extracted(request, links) - - def request_error(self, request, error): - self.metadata.request_error(request, error) - - def finished(self): - raise NotImplementedError - From 259377298191b643459a3ae9f561bcf17cba4428 Mon Sep 17 00:00:00 2001 From: voith Date: Mon, 31 Oct 2016 12:38:53 -0500 Subject: [PATCH 03/14] added unit tests for cassandra backend --- .travis.yml | 2 + frontera/contrib/backends/__init__.py | 36 ++++ .../contrib/backends/cassandra/__init__.py | 176 ++++++------------ .../contrib/backends/cassandra/components.py | 97 ++++------ frontera/contrib/backends/cassandra/models.py | 99 +++------- .../contrib/backends/sqlalchemy/components.py | 35 +--- frontera/settings/default_settings.py | 12 +- requirements/tests.txt | 2 +- setup.py | 6 +- .../cassandra/test_backend_cassandra.py | 63 +++++++ .../backends/cassandra/wait_for_cluster_up.py | 30 +++ 11 files changed, 258 insertions(+), 300 deletions(-) create mode 100644 tests/contrib/backends/cassandra/test_backend_cassandra.py create mode 100644 tests/contrib/backends/cassandra/wait_for_cluster_up.py diff --git a/.travis.yml b/.travis.yml index 0f0d34f31..3a9f8062c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -47,6 +47,8 @@ before_script: - docker-compose --version - docker-compose --verbose -f tests/kafka/docker-compose.yml up -d - docker ps -a + - docker run --name cassandra -p 127.0.0.1:9042:9042 -d cassandra + - python tests/contrib/backends/cassandra/wait_for_cluster_up.py script: tox diff --git a/frontera/contrib/backends/__init__.py b/frontera/contrib/backends/__init__.py index 2eb65bb70..06e9719fb 100644 --- a/frontera/contrib/backends/__init__.py +++ b/frontera/contrib/backends/__init__.py @@ -1,9 +1,13 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import from collections import OrderedDict +from datetime import datetime from frontera import Backend from frontera.core.components import States, Queue as BaseQueue, DistributedBackend +from frontera.core.models import Request, Response + +from w3lib.util import to_native_str class CommonBackend(Backend): @@ -155,3 +159,35 @@ def request_error(self, request, error): def finished(self): raise NotImplementedError + + +class CreateOrModifyPageMixin(object): + + def _create_page(self, obj): + db_page = self.model() + db_page.fingerprint = to_native_str(obj.meta[b'fingerprint']) + db_page.url = obj.url + db_page.created_at = datetime.utcnow() + db_page.meta = obj.meta + db_page.depth = 0 + + if isinstance(obj, Request): + db_page.headers = obj.headers + db_page.method = to_native_str(obj.method) + db_page.cookies = obj.cookies + elif isinstance(obj, Response): + db_page.headers = obj.request.headers + db_page.method = to_native_str(obj.request.method) + db_page.cookies = obj.request.cookies + db_page.status_code = obj.status_code + return db_page + + def _modify_page(self, obj): + db_page = self.cache[obj.meta[b'fingerprint']] + db_page.fetched_at = datetime.utcnow() + if isinstance(obj, Response): + db_page.headers = obj.request.headers + db_page.method = to_native_str(obj.request.method) + db_page.cookies = obj.request.cookies + db_page.status_code = 
obj.status_code + return db_page diff --git a/frontera/contrib/backends/cassandra/__init__.py b/frontera/contrib/backends/cassandra/__init__.py index ff4134f6f..6fe1990bd 100644 --- a/frontera/contrib/backends/cassandra/__init__.py +++ b/frontera/contrib/backends/cassandra/__init__.py @@ -1,111 +1,91 @@ from __future__ import absolute_import + +import six + from cassandra.cluster import Cluster from cassandra.cqlengine import connection +from cassandra.cqlengine.management import drop_table, sync_table from cassandra.query import dict_factory -from cassandra.policies import RetryPolicy, ConstantReconnectionPolicy -from cassandra.cqlengine.management import sync_table -from cassandra.cqlengine.management import drop_table -from frontera.core.components import DistributedBackend -from frontera.contrib.backends import CommonBackend -from frontera.contrib.backends.cassandra.components import Metadata, Queue, States + +from frontera.contrib.backends import CommonStorageBackend, CommonDistributedStorageBackend +from frontera.contrib.backends.cassandra.components import (Metadata, Queue, + States) from frontera.utils.misc import load_object -import logging -class CassandraBackend(CommonBackend): +class CassandraBackend(CommonStorageBackend): + + queue_component = Queue + def __init__(self, manager): self.manager = manager settings = manager.settings - cluster_ips = settings.get('CASSANDRABACKEND_CLUSTER_IPS') + cluster_hosts = settings.get('CASSANDRABACKEND_CLUSTER_HOSTS') cluster_port = settings.get('CASSANDRABACKEND_CLUSTER_PORT') drop_all_tables = settings.get('CASSANDRABACKEND_DROP_ALL_TABLES') - keyspace = settings.get('CASSANDRABACKEND_KEYSPACE') - keyspace_create = settings.get('CASSANDRABACKEND_CREATE_KEYSPACE_IF_NOT_EXISTS') models = settings.get('CASSANDRABACKEND_MODELS') - crawl_id = settings.get('CASSANDRABACKEND_CRAWL_ID') - generate_stats = settings.get('CASSANDRABACKEND_GENERATE_STATS') + keyspace = settings.get('CASSANDRABACKEND_KEYSPACE') - self.models = dict([(name, load_object(klass)) for name, klass in models.items()]) + self.models = dict([(name, load_object(cls)) for name, cls in six.iteritems(models)]) + cluster_kwargs = { + 'port': cluster_port, + 'compression': True, + 'control_connection_timeout': 240, + } + self.cluster = Cluster(contact_points=cluster_hosts, **cluster_kwargs) - self.cluster = Cluster( - contact_points=cluster_ips, - port=cluster_port, - compression=True, - default_retry_policy=RetryPolicy(), - reconnection_policy=ConstantReconnectionPolicy(10, 100) - ) + self.session = self.cluster.connect(keyspace) + # self.session.row_factory = dict_factory + # self.session.encoder.mapping[dict] = self.session.encoder.cql_encode_map_collection + connection.setup(cluster_hosts, keyspace, **cluster_kwargs) - self.session = self.cluster.connect() - self.session.row_factory = dict_factory - self.session.encoder.mapping[dict] = self.session.encoder.cql_encode_map_collection - self.crawl_id = crawl_id - self.generate_stats = generate_stats - - if keyspace_create: - query = """CREATE KEYSPACE IF NOT EXISTS \"%s\" - WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 3}""" % (keyspace, ) - self.session.execute(query) - - self.session.set_keyspace(keyspace) + tables = self._get_tables() + if drop_all_tables: + for name, table in six.iteritems(self.models): + if table.__table_name__ in tables: + drop_table(table) - connection.set_session(self.session) + for name, table in six.iteritems(self.models): + sync_table(table) - if drop_all_tables: - for key, value 
in self.models.iteritems(): - drop_table(value) + # self._metadata = Metadata(self.session, self.models['MetadataModel']) + # self._states = States(self.session, self.models['StateModel'], settings.get('STATE_CACHE_SIZE_LIMIT')) + # self._queue = self._create_queue(settings) - for key, value in self.models.iteritems(): - if (self.generate_stats is False and key != 'CrawlStatsModel') or self.generate_stats is True: - sync_table(value) + # def _drop_table(self, model): + # self.session.execute('DROP TABLE {0};'.format(model.column_family_name()), timeout=240) - self._metadata = Metadata(self.session, self.models['MetadataModel'], self.crawl_id, self.generate_stats) - self._states = States(self.session, self.models['StateModel'], - settings.get('STATE_CACHE_SIZE_LIMIT'), self.crawl_id) - self._queue = self._create_queue(settings) + def _get_tables(self): + query = self.session.prepare('SELECT table_name FROM system_schema.tables WHERE keyspace_name = ?') + result = self.session.execute(query, (self.session.keyspace,)) + return [row.table_name for row in result.current_rows] def frontier_stop(self): self.states.flush() self.session.shutdown() - def _create_queue(self, settings): - return Queue(self.session, self.models['QueueModel'], settings.get('SPIDER_FEED_PARTITIONS'), - self.crawl_id, self.generate_stats) - - @property - def queue(self): - return self._queue - - @property - def metadata(self): - return self._metadata - - @property - def states(self): - return self._states BASE = CassandraBackend -class Distributed(DistributedBackend): +class Distributed(CommonDistributedStorageBackend): def __init__(self, manager): self.manager = manager settings = manager.settings - cluster_ips = settings.get('CASSANDRABACKEND_CLUSTER_IPS') # Format: ['192.168.0.1', '192.168.0.2'] + cluster_hosts = settings.get('CASSANDRABACKEND_CLUSTER_HOSTS') cluster_port = settings.get('CASSANDRABACKEND_CLUSTER_PORT') keyspace = settings.get('CASSANDRABACKEND_KEYSPACE') - keyspace_create = settings.get('CASSANDRABACKEND_CREATE_KEYSPACE_IF_NOT_EXISTS') # Default: true models = settings.get('CASSANDRABACKEND_MODELS') - - self.cluster = Cluster(cluster_ips, cluster_port) - self.models = dict([(name, load_object(klass)) for name, klass in models.items()]) + cluster_kwargs = { + 'port': cluster_port, + 'compression': True + } + self.cluster = Cluster(cluster_hosts, **cluster_kwargs) + self.models = dict([(name, load_object(cls)) for name, cls in six.iteritems(models)]) self.session = self.cluster.connect() self.session.row_factory = dict_factory - if keyspace_create: - query = """CREATE KEYSPACE IF NOT EXISTS \"%s\" - WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 3}""" % (keyspace, ) - self.session.execute(query) self.session.set_keyspace(keyspace) connection.set_session(self.session) @@ -118,7 +98,6 @@ def strategy_worker(cls, manager): b = cls(manager) settings = manager.settings drop_all_tables = settings.get('CASSANDRABACKEND_DROP_ALL_TABLES') - crawl_id = settings.get('CASSANDRABACKEND_CRAWL_ID') model = b.models['StateModel'] if drop_all_tables: @@ -127,7 +106,7 @@ def strategy_worker(cls, manager): sync_table(model) b._states = States(b.session, model, - settings.get('STATE_CACHE_SIZE_LIMIT'), crawl_id) + settings.get('STATE_CACHE_SIZE_LIMIT')) return b @classmethod @@ -135,65 +114,16 @@ def db_worker(cls, manager): b = cls(manager) settings = manager.settings drop = settings.get('CASSANDRABACKEND_DROP_ALL_TABLES') - crawl_id = settings.get('CASSANDRABACKEND_CRAWL_ID') - generate_stats = 
settings.get('CASSANDRABACKEND_GENERATE_STATS') - metadata_m = b.models['MetadataModel'] queue_m = b.models['QueueModel'] - stats_m = b.models['CrawlStatsModel'] + if drop: drop_table(metadata_m) drop_table(queue_m) - drop_table(stats_m) sync_table(metadata_m) sync_table(queue_m) - if generate_stats is True: - sync_table(stats_m) - b._metadata = Metadata(b.session, metadata_m, crawl_id, generate_stats) - b._queue = Queue(b.session, queue_m, settings.get('SPIDER_FEED_PARTITIONS'), crawl_id, generate_stats) + b._metadata = Metadata(b.session, metadata_m) + b._queue = Queue(b.session, queue_m, settings.get('SPIDER_FEED_PARTITIONS')) return b - - @property - def queue(self): - return self._queue - - @property - def metadata(self): - return self._metadata - - @property - def states(self): - return self._states - - def frontier_start(self): - for component in [self.metadata, self.queue, self.states]: - if component: - component.frontier_start() - - def frontier_stop(self): - for component in [self.metadata, self.queue, self.states]: - if component: - component.frontier_stop() - - def add_seeds(self, seeds): - self.metadata.add_seeds(seeds) - - def get_next_requests(self, max_next_requests, **kwargs): - partitions = kwargs.pop('partitions', [0]) # TODO: Collect from all known partitions - batch = [] - for partition_id in partitions: - batch.extend(self.queue.get_next_requests(max_next_requests, partition_id, **kwargs)) - return batch - - def page_crawled(self, response, links): - self.metadata.page_crawled(response, links) - - def request_error(self, request, error): - self.metadata.request_error(request, error) - - def finished(self): - return NotImplementedError - - diff --git a/frontera/contrib/backends/cassandra/components.py b/frontera/contrib/backends/cassandra/components.py index ec9927436..939c0bd25 100644 --- a/frontera/contrib/backends/cassandra/components.py +++ b/frontera/contrib/backends/cassandra/components.py @@ -1,11 +1,14 @@ # -*- coding: utf-8 -*- import logging +import uuid from datetime import datetime from time import time +from cachetools import LRUCache from cassandra.concurrent import execute_concurrent_with_args +from cassandra.cqlengine.query import BatchQuery -from frontera.contrib.backends.cassandra.models import Meta +from frontera.contrib.backends import CreateOrModifyPageMixin from frontera.contrib.backends.memory import MemoryStates from frontera.contrib.backends.partitioners import Crc32NamePartitioner from frontera.core.components import Metadata as BaseMetadata @@ -15,51 +18,47 @@ from frontera.utils.url import parse_domain_from_url_fast -class Metadata(BaseMetadata): - def __init__(self, session, model_cls, crawl_id, generate_stats): +class Metadata(BaseMetadata, CreateOrModifyPageMixin): + + def __init__(self, session, model_cls, cache_size): self.session = session self.model = model_cls - self.table = 'MetadataModel' + self.cache = LRUCache(cache_size) + self.batch = BatchQuery() self.logger = logging.getLogger("frontera.contrib.backends.cassandra.components.Metadata") - self.crawl_id = crawl_id - self.generate_stats = generate_stats - self.counter_cls = CassandraCount(crawl_id, self.session, generate_stats) def frontier_stop(self): pass def add_seeds(self, seeds): - cql_items = [] for seed in seeds: - query = self.session.prepare( - "INSERT INTO metadata (crawl, fingerprint, url, created_at, meta, headers, cookies, method, depth) " - "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)") - meta = Meta(domain=seed.meta['domain'], fingerprint=seed.meta['fingerprint'], - 
origin_is_frontier=seed.meta['origin_is_frontier'], - scrapy_callback=seed.meta['scrapy_callback'], scrapy_errback=seed.meta['scrapy_errback'], - scrapy_meta=seed.meta['scrapy_meta']) - cql_i = (self.crawl_id, seed.meta['fingerprint'], seed.url, datetime.utcnow(), meta, - seed.headers, seed.cookies, seed.method, 0) - cql_items.append(cql_i) - if len(seeds) > 0: - execute_concurrent_with_args(self.session, query, cql_items, concurrency=400) - self.counter_cls.cass_count({"seed_urls": len(seeds)}) + o = self._create_page(seed) + + # cql_items = [] + # query = self.session.prepare( + # "INSERT INTO metadata (fingerprint, url, created_at, meta, headers, cookies, method, depth) " + # "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)") + # for seed in seeds: + # cql_i = (seed.meta['fingerprint'], seed.url, datetime.utcnow(), seed.meta, + # seed.headers, seed.cookies, seed.method, 0) + # cql_items.append(cql_i) + # if len(seeds) > 0: + # execute_concurrent_with_args(self.session, query, cql_items, concurrency=400) def request_error(self, page, error): query_page = self.session.prepare( - "UPDATE metadata SET error = ? WHERE crawl = ? AND fingerprint = ?") - self.session.execute(query_page, (error, self.crawl_id, page.meta['fingerprint'])) - self.counter_cls.cass_count({"error": 1}) + "UPDATE metadata SET error = ? WHERE fingerprint = ?") + self.session.execute(query_page, (error, page.meta['fingerprint'])) def page_crawled(self, response, links): query_page = self.session.prepare( "UPDATE metadata SET fetched_at = ?, headers = ?, method = ?, cookies = ?, status_code = ? " - "WHERE crawl= ? AND fingerprint = ?") - self.session.execute_async(query_page, (datetime.utcnow(), response.request.headers, response.request.method, - response.request.cookies, response.status_code, self.crawl_id, - response.meta['fingerprint'])) + "WHERE fingerprint = ?") + self.session.execute_async(query_page, (datetime.utcnow(), response.request.headers, + response.request.method, response.request.cookies, + response.status_code, response.meta['fingerprint'])) depth = 0 - page_res = self.model.objects.filter(crawl=self.crawl_id, fingerprint=response.meta['fingerprint']) + page_res = self.model.objects.filter(fingerprint=response.meta['fingerprint']) if page_res[0].depth > 0: depth = page_res[0].depth @@ -89,7 +88,6 @@ def __init__(self, session, model_cls, cache_size_limit, crawl_id): super(States, self).__init__(cache_size_limit) self.session = session self.model = model_cls - self.table = 'StateModel' self.logger = logging.getLogger("frontera.contrib.backends.cassandra.components.States") self.crawl_id = crawl_id @@ -106,10 +104,10 @@ def fetch(self, fingerprints): self._cache[state.fingerprint] = state.state def flush(self, force_clear=False): - query = self.session.prepare("INSERT INTO states (crawl, fingerprint, state) VALUES (?, ?, ?)") + query = self.session.prepare("INSERT INTO states (id, fingerprint, state) VALUES (?, ?, ?)") cql_items = [] for fingerprint, state_val in self._cache.iteritems(): - cql_i = (self.crawl_id, fingerprint, state_val) + cql_i = (uuid.uuid4(), fingerprint, state_val) cql_items.append(cql_i) execute_concurrent_with_args(self.session, query, cql_items, concurrency=20000) super(States, self).flush(force_clear) @@ -123,8 +121,6 @@ def __init__(self, session, queue_cls, partitions, crawl_id, generate_stats, ord self.partitions = [i for i in range(0, partitions)] self.partitioner = Crc32NamePartitioner(self.partitions) self.ordering = ordering - self.crawl_id = crawl_id - self.counter_cls = 
CassandraCount(crawl_id, self.session, generate_stats) def frontier_stop(self): pass @@ -187,7 +183,7 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): return results def schedule(self, batch): - query = self.session.prepare("INSERT INTO queue (crawl, fingerprint, score, partition_id, host_crc32, url, " + query = self.session.prepare("INSERT INTO queue (id, fingerprint, score, partition_id, host_crc32, url, " "created_at, meta, depth, headers, method, cookies) " "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)") cql_items = [] @@ -218,13 +214,8 @@ def schedule(self, batch): if "jid" not in request.meta: request.meta["jid"] = 0 - meta = Meta(domain=request.meta['domain'], fingerprint=fprint, - origin_is_frontier=request.meta['origin_is_frontier'], - scrapy_callback=request.meta['scrapy_callback'], - scrapy_errback=request.meta['scrapy_errback'], scrapy_meta=request.meta['scrapy_meta']) - - cql_i = (self.crawl_id, fprint, score, partition_id, host_crc32, request.url, created_at, meta, 0, - request.headers, request.method, request.cookies) + cql_i = (uuid.uuid4(), fprint, score, partition_id, host_crc32, request.url, created_at, + request.meta, 0, request.headers, request.method, request.cookies) cql_items.append(cql_i) request.meta['state'] = States.QUEUED @@ -233,15 +224,9 @@ def schedule(self, batch): self.counter_cls.cass_count({"queued_urls": len(cql_items)}) def count(self): - count = self.queue_model.objects.filter(crawl=self.crawl_id).count() + count = self.queue_model.objects.filter().count() return count - def cass_count(self, counts, generate_stats): - if generate_stats is True: - for row, count in counts.iteritems(): - count_page = self.session.prepare("UPDATE crawlstats SET "+row+" = "+row+" + ? WHERE crawl= ?") - self.session.execute_async(count_page, (count, self.crawl_id)) - class BroadCrawlingQueue(Queue): GET_RETRIES = 3 @@ -301,17 +286,3 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): cookies=item.cookies)) item.delete() return results - - -class CassandraCount: - - def __init__(self, crawl_id, session, generate_stats): - self.generate_stats = generate_stats - self.session = session - self.crawl_id = crawl_id - - def cass_count(self, counts): - if self.generate_stats is True: - for row, count in counts.iteritems(): - count_page = self.session.prepare("UPDATE crawlstats SET "+row+" = "+row+" + ? 
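+        # schedule() assigns every request to a spider partition by hashing the
+        # hostname, so one host always feeds the same partition; a rough
+        # standalone equivalent (hostname and partition count are illustrative):
+        #
+        #     from frontera.utils.misc import get_crc32
+        #     partitions = list(range(2))            # SPIDER_FEED_PARTITIONS
+        #     host_crc32 = get_crc32('example.com')
+        #     partition_id = partitions[host_crc32 % len(partitions)]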
WHERE crawl= ?") - self.session.execute_async(count_page, (count, self.crawl_id)) diff --git a/frontera/contrib/backends/cassandra/models.py b/frontera/contrib/backends/cassandra/models.py index 2c21854e5..4a73f98d5 100644 --- a/frontera/contrib/backends/cassandra/models.py +++ b/frontera/contrib/backends/cassandra/models.py @@ -1,42 +1,24 @@ # -*- coding: utf-8 -*- -from cassandra.cqlengine.columns import (BigInt, Boolean, Counter, DateTime, - Float, Integer, Map, SmallInt, Text, - UserDefinedType) +from cassandra.cqlengine.columns import (UUID, BigInt, DateTime, Float, + Integer, Map, SmallInt, Text) from cassandra.cqlengine.models import Model -from cassandra.cqlengine.usertype import UserType - - -class Meta(UserType): - domain = Map(Text(), Text(), required=False) - fingerprint = Text() - origin_is_frontier = Boolean() - scrapy_callback = Text() - scrapy_errback = Text() - scrapy_meta = Map(Text(), Text(), required=False) - score = Float(required=False) - jid = Integer(required=False) class MetadataModel(Model): __table_name__ = 'metadata' - crawl = Text(primary_key=True) fingerprint = Text(primary_key=True) - url = Text(index=True) - depth = Integer() - created_at = DateTime() - fetched_at = DateTime(required=False) - status_code = Integer(required=False) - score = Float(required=False) - error = Text(required=False) - meta = UserDefinedType(Meta) - headers = Map(Text(), Text(), required=False) - cookies = Map(Text(), Text(), required=False) - method = Text(required=False) - - @classmethod - def query(cls, session): - return session.query(cls) + url = Text(required=True) + depth = Integer(required=True) + created_at = DateTime(required=True) + fetched_at = DateTime() + status_code = Integer() + score = Float() + error = Text() + meta = Map(Text(), Text()) + headers = Map(Text(), Text()) + cookies = Map(Text(), Text()) + method = Text() def __repr__(self): return '' % (self.url, self.fingerprint) @@ -45,13 +27,8 @@ def __repr__(self): class StateModel(Model): __table_name__ = 'states' - crawl = Text(primary_key=True) fingerprint = Text(primary_key=True) - state = SmallInt(index=True) - - @classmethod - def query(cls, session): - return session.query(cls) + state = SmallInt(required=True) def __repr__(self): return '' % (self.fingerprint, self.state) @@ -60,42 +37,18 @@ def __repr__(self): class QueueModel(Model): __table_name__ = 'queue' - crawl = Text(primary_key=True) - partition_id = Integer(primary_key=True) - score = Float(primary_key=True) - created_at = BigInt(primary_key=True) - fingerprint = Text(primary_key=True) - url = Text() - host_crc32 = Integer() - meta = UserDefinedType(Meta) - headers = Map(Text(), Text(), required=False) - cookies = Map(Text(), Text(), required=False) - method = Text(required=False) - depth = SmallInt(required=False) - - @classmethod - def query(cls, session): - return session.query(cls) + id = UUID(primary_key=True) + partition_id = Integer(required=True) + score = Float(required=True) + url = Text(required=True) + fingerprint = Text(required=True) + host_crc32 = Integer(required=True) + meta = Map(Text(), Text()) + headers = Map(Text(), Text()) + cookies = Map(Text(), Text()) + method = Text() + created_at = BigInt(required=True) + depth = SmallInt() def __repr__(self): return '' % (self.url, self.id) - - -class CrawlStatsModel(Model): - __table_name__ = 'crawlstats' - - crawl = Text(primary_key=True) - pages_crawled = Counter() - links_found = Counter() - errors = Counter() - seed_urls = Counter() - scored_urls = Counter() - queued_urls = 
Counter() - dequeued_urls = Counter() - - @classmethod - def query(cls, session): - return session.query(cls) - - def __repr__(self): - return '' % (self.url, self.id) diff --git a/frontera/contrib/backends/sqlalchemy/components.py b/frontera/contrib/backends/sqlalchemy/components.py index 8661ac576..a3123f4f5 100644 --- a/frontera/contrib/backends/sqlalchemy/components.py +++ b/frontera/contrib/backends/sqlalchemy/components.py @@ -1,15 +1,15 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import import logging -from datetime import datetime from time import time, sleep from cachetools import LRUCache +from frontera.contrib.backends import CreateOrModifyPageMixin from frontera.contrib.backends.partitioners import Crc32NamePartitioner from frontera.contrib.backends.memory import MemoryStates from frontera.contrib.backends.sqlalchemy.models import DeclarativeBase from frontera.core.components import Metadata as BaseMetadata, Queue as BaseQueue -from frontera.core.models import Request, Response +from frontera.core.models import Request from frontera.utils.misc import get_crc32, chunks from frontera.utils.url import parse_domain_from_url_fast import six @@ -36,7 +36,7 @@ def func_wrapper(self, *args, **kwargs): return func_wrapper -class Metadata(BaseMetadata): +class Metadata(BaseMetadata, CreateOrModifyPageMixin): def __init__(self, session_cls, model_cls, cache_size): self.session = session_cls(expire_on_commit=False) # FIXME: Should be explicitly mentioned in docs self.model = model_cls @@ -73,35 +73,6 @@ def links_extracted(self, request, links): self.cache[link.meta[b'fingerprint']] = self.session.merge(self._create_page(link)) self.session.commit() - def _modify_page(self, obj): - db_page = self.cache[obj.meta[b'fingerprint']] - db_page.fetched_at = datetime.utcnow() - if isinstance(obj, Response): - db_page.headers = obj.request.headers - db_page.method = to_native_str(obj.request.method) - db_page.cookies = obj.request.cookies - db_page.status_code = obj.status_code - return db_page - - def _create_page(self, obj): - db_page = self.model() - db_page.fingerprint = to_native_str(obj.meta[b'fingerprint']) - db_page.url = obj.url - db_page.created_at = datetime.utcnow() - db_page.meta = obj.meta - db_page.depth = 0 - - if isinstance(obj, Request): - db_page.headers = obj.headers - db_page.method = to_native_str(obj.method) - db_page.cookies = obj.cookies - elif isinstance(obj, Response): - db_page.headers = obj.request.headers - db_page.method = to_native_str(obj.request.method) - db_page.cookies = obj.request.cookies - db_page.status_code = obj.status_code - return db_page - @retry_and_rollback def update_score(self, batch): for fprint, score, request, schedule in batch: diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index 6aecaceb8..dc06568fa 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -7,22 +7,20 @@ BC_MIN_REQUESTS = 64 BC_MIN_HOSTS = 24 BC_MAX_REQUESTS_PER_HOST = 128 + CANONICAL_SOLVER = 'frontera.contrib.canonicalsolvers.Basic' CASSANDRABACKEND_CACHE_SIZE = 10000 CASSANDRABACKEND_DROP_ALL_TABLES = False CASSANDRABACKEND_MODELS = { 'MetadataModel': 'frontera.contrib.backends.cassandra.models.MetadataModel', 'StateModel': 'frontera.contrib.backends.cassandra.models.StateModel', - 'QueueModel': 'frontera.contrib.backends.cassandra.models.QueueModel', - 'CrawlStatsModel': 'frontera.contrib.backends.cassandra.models.CrawlStatsModel' + 'QueueModel': 
'frontera.contrib.backends.cassandra.models.QueueModel' } CASSANDRABACKEND_REVISIT_INTERVAL = timedelta(days=1) -CASSANDRABACKEND_CLUSTER_IPS = ['127.0.0.1'] +CASSANDRABACKEND_CLUSTER_HOSTS = ['127.0.0.1'] CASSANDRABACKEND_CLUSTER_PORT = 9042 -CASSANDRABACKEND_KEYSPACE = 'frontera' -CASSANDRABACKEND_CREATE_KEYSPACE_IF_NOT_EXISTS = True -CASSANDRABACKEND_CRAWL_ID = "default" -CASSANDRABACKEND_GENERATE_STATS = False +CASSANDRABACKEND_KEYSPACE = 'crawler' + DELAY_ON_EMPTY = 5.0 DOMAIN_FINGERPRINT_FUNCTION = 'frontera.utils.fingerprint.sha1' diff --git a/requirements/tests.txt b/requirements/tests.txt index 2c4d75a45..015988671 100644 --- a/requirements/tests.txt +++ b/requirements/tests.txt @@ -13,4 +13,4 @@ happybase>=1.0.0 mock boto>=2.42.0 -r logging.txt -cassandra-driver +cassandra-driver==3.7.0 diff --git a/setup.py b/setup.py index 87f423e18..d47a9dfb6 100644 --- a/setup.py +++ b/setup.py @@ -71,6 +71,9 @@ ], 'distributed': [ 'Twisted' + ], + 'cassandra': [ + 'cassandra-driver==3.7.0' ] }, tests_require=[ @@ -85,6 +88,7 @@ "mock", "boto>=2.42.0", "colorlog>=2.4.0", - "python-json-logger>=0.1.5" + "python-json-logger>=0.1.5", + "cassandra-driver==3.7.0" ] ) diff --git a/tests/contrib/backends/cassandra/test_backend_cassandra.py b/tests/contrib/backends/cassandra/test_backend_cassandra.py new file mode 100644 index 000000000..0c9b7bf81 --- /dev/null +++ b/tests/contrib/backends/cassandra/test_backend_cassandra.py @@ -0,0 +1,63 @@ +import unittest + +from cassandra.cluster import Cluster +from cassandra.cqlengine import connection +from cassandra.cqlengine.management import drop_table, sync_table + +from frontera.contrib.backends.cassandra import CassandraBackend +from frontera.contrib.backends.cassandra.models import MetadataModel, StateModel, QueueModel +from frontera.settings import Settings + + +class TestCassandraBackend(unittest.TestCase): + + def setUp(self): + settings = Settings() + hosts = ['127.0.0.1'] + port = 9042 + self.manager = type('manager', (object,), {}) + self.manager.settings = settings + self.keyspace = settings.CASSANDRABACKEND_KEYSPACE + cluster = Cluster(hosts, port, control_connection_timeout=240) + self.session = cluster.connect() + self.session.execute("CREATE KEYSPACE IF NOT EXISTS %s WITH " + "replication = {'class':'SimpleStrategy', 'replication_factor' : 3}" % self.keyspace) + connection.setup(hosts, self.keyspace, port=port, control_connection_timeout=240) + self.session.set_keyspace(self.keyspace) + + def tearDown(self): + tables = self._get_tables() + models = [MetadataModel, StateModel, QueueModel] + for model in models: + if model.__table_name__ in tables: + self.session.execute('DROP TABLE {0};'.format(model.column_family_name()), timeout=240) + self.session.shutdown() + + def _get_tables(self): + query = self.session.prepare('SELECT table_name FROM system_schema.tables WHERE keyspace_name = ?') + result = self.session.execute(query, (self.session.keyspace,)) + return [row.table_name for row in result.current_rows] + + def test_tables_created(self): + tables_before = self._get_tables() + self.assertEqual(tables_before, []) + CassandraBackend(self.manager) + tables_after = self._get_tables() + self.assertEqual(set(tables_after), set(['metadata', 'states', 'queue'])) + + def test_tables_droped_and_created(self): + def _get_state_data(): + return StateModel.all() + models = [MetadataModel, StateModel, QueueModel] + for model in models: + sync_table(model) + tables_before = self._get_tables() + self.assertEqual(set(tables_before), set(['metadata', 
'states', 'queue']))
+        StateModel.create(fingerprint='fingerprint', state=200)
+        rows_before = _get_state_data()
+        self.assertEqual(rows_before.count(), 1)
+        self.manager.settings.CASSANDRABACKEND_DROP_ALL_TABLES = True
+        CassandraBackend(self.manager)
+        tables_after = self._get_tables()
+        self.assertEqual(set(tables_after), set(['metadata', 'states', 'queue']))
+        rows_after = _get_state_data()
+        self.assertEqual(rows_after.count(), 0)
diff --git a/tests/contrib/backends/cassandra/wait_for_cluster_up.py b/tests/contrib/backends/cassandra/wait_for_cluster_up.py
new file mode 100644
index 000000000..f63fe4eb2
--- /dev/null
+++ b/tests/contrib/backends/cassandra/wait_for_cluster_up.py
@@ -0,0 +1,30 @@
+from time import time
+
+from cassandra.cluster import Cluster, NoHostAvailable
+
+_cluster_ips = ['127.0.0.1']
+_port = 9042
+_timeout = 600  # 10 minutes
+
+
+def _is_cluster_up():
+    cluster = Cluster(_cluster_ips, _port)
+    try:
+        cluster.connect()
+        return True
+    except NoHostAvailable:
+        return False
+
+
+def _wait_for_cluster_to_start():
+    print('waiting for cassandra cluster to start...')
+    start = time()
+    while not _is_cluster_up():
+        time_taken = time() - start
+        if time_taken > _timeout:
+            raise TimeoutError('Cassandra node could not start within the timeout.')
+    time_taken = time() - start
+    print('cassandra cluster is up! Waited for %s seconds.' % int(time_taken))
+
+if __name__ == '__main__':
+    _wait_for_cluster_to_start()

From 9fa8fd9827b56bdf49cbbadf0bff1f67634df7a3 Mon Sep 17 00:00:00 2001
From: voith
Date: Tue, 1 Nov 2016 12:38:53 -0500
Subject: [PATCH 04/14] added pickle field to serialize dicts in cassandra

---
 .../contrib/backends/cassandra/__init__.py    |  4 +-
 .../contrib/backends/cassandra/components.py  | 73 +++++++---------
 frontera/contrib/backends/cassandra/models.py | 40 +++++++--
 .../cassandra/test_backend_cassandra.py       | 84 ++++++++++++++++++-
 4 files changed, 145 insertions(+), 56 deletions(-)

diff --git a/frontera/contrib/backends/cassandra/__init__.py b/frontera/contrib/backends/cassandra/__init__.py
index 6fe1990bd..c1691e3da 100644
--- a/frontera/contrib/backends/cassandra/__init__.py
+++ b/frontera/contrib/backends/cassandra/__init__.py
@@ -48,7 +48,9 @@ def __init__(self, manager):
         for name, table in six.iteritems(self.models):
             sync_table(table)
 
-        # self._metadata = Metadata(self.session, self.models['MetadataModel'])
+        self._metadata = Metadata(self.session,
+                                  self.models['MetadataModel'],
+                                  settings.get('CASSANDRABACKEND_CACHE_SIZE'))
         # self._states = States(self.session, self.models['StateModel'], settings.get('STATE_CACHE_SIZE_LIMIT'))
         # self._queue = self._create_queue(settings)
 
diff --git a/frontera/contrib/backends/cassandra/components.py b/frontera/contrib/backends/cassandra/components.py
index 939c0bd25..d392b8665 100644
--- a/frontera/contrib/backends/cassandra/components.py
+++ b/frontera/contrib/backends/cassandra/components.py
@@ -1,7 +1,8 @@
 # -*- coding: utf-8 -*-
 import logging
 import uuid
-from datetime import datetime
+import pickle
+import six
 from time import time
 
 from cachetools import LRUCache
@@ -17,6 +18,8 @@
 from frontera.utils.misc import chunks, get_crc32
 from frontera.utils.url import parse_domain_from_url_fast
 
+from w3lib.util import to_native_str, to_bytes
+
 
 class Metadata(BaseMetadata, CreateOrModifyPageMixin):
 
@@ -32,54 +35,38 @@ def frontier_stop(self):
 
     def add_seeds(self, seeds):
         for seed in seeds:
-            o = self._create_page(seed)
-
-        # cql_items = []
-        # query = self.session.prepare(
-        #     "INSERT INTO metadata (fingerprint, url, created_at, meta, headers, cookies, 
method, depth) " - # "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)") - # for seed in seeds: - # cql_i = (seed.meta['fingerprint'], seed.url, datetime.utcnow(), seed.meta, - # seed.headers, seed.cookies, seed.method, 0) - # cql_items.append(cql_i) - # if len(seeds) > 0: - # execute_concurrent_with_args(self.session, query, cql_items, concurrency=400) + page = self._create_page(seed) + page.batch(self.batch).save() + self.cache[to_bytes(page.fingerprint)] = page + self.batch.execute() def request_error(self, page, error): - query_page = self.session.prepare( - "UPDATE metadata SET error = ? WHERE fingerprint = ?") - self.session.execute(query_page, (error, page.meta['fingerprint'])) - - def page_crawled(self, response, links): - query_page = self.session.prepare( - "UPDATE metadata SET fetched_at = ?, headers = ?, method = ?, cookies = ?, status_code = ? " - "WHERE fingerprint = ?") - self.session.execute_async(query_page, (datetime.utcnow(), response.request.headers, - response.request.method, response.request.cookies, - response.status_code, response.meta['fingerprint'])) - depth = 0 - page_res = self.model.objects.filter(fingerprint=response.meta['fingerprint']) - if page_res[0].depth > 0: - depth = page_res[0].depth - - query = self.session.prepare( - "INSERT INTO metadata (crawl, fingerprint, created_at, method, url, depth) VALUES (?, ?, ?, ?, ?, ?)") - cql_items = [] + page = self._modify_page(page) if page.meta[b'fingerprint'] in self.cache else self._create_page(page) + page.error = error + self.cache[to_bytes(page.fingerprint)] = page + page.save() + + def page_crawled(self, response): + page = self._modify_page(response) if response.meta[b'fingerprint'] in self.cache else self._create_page(response) + self.cache[page.fingerprint] = page + page.save() + + def links_extracted(self, request, links): for link in links: - if response.meta['fingerprint'] != link.meta['fingerprint']: - cql_i = (self.crawl_id, link.meta['fingerprint'], datetime.utcnow(), link.method, link.url, depth+1) - cql_items.append(cql_i) - execute_concurrent_with_args(self.session, query, cql_items, concurrency=400) - self.counter_cls.cass_count({"pages_crawled": 1, "links_found": len(cql_items)}) + if link.meta[b'fingerprint'] not in self.cache: + page = self._create_page(link) + self.cache[link.meta[b'fingerprint']] = page + page.batch(self.batch).save() + self.batch.execute() def update_score(self, batch): - query = self.session.prepare("UPDATE metadata SET score = ? WHERE crawl = ? 
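+        # The model/batch calls replacing the prepared statements above follow
+        # the standard cqlengine pattern; a minimal sketch (assumes the backend
+        # has already called connection.setup(); MetadataModel comes from
+        # models.py and the field values are illustrative):
+        #
+        #     from datetime import datetime
+        #     from cassandra.cqlengine.query import BatchQuery
+        #
+        #     b = BatchQuery()
+        #     page = MetadataModel(fingerprint='f1', url='http://example.com',
+        #                          depth=0, created_at=datetime.utcnow())
+        #     page.batch(b).save()    # queued in the batch, nothing sent yet
+        #     b.execute()             # one batched write for every queued save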
AND fingerprint = ?") - cql_items = [] for fprint, score, request, schedule in batch: - cql_i = (score, self.crawl_id, fprint) - cql_items.append(cql_i) - execute_concurrent_with_args(self.session, query, cql_items, concurrency=400) - self.counter_cls.cass_count({"scored_urls": len(cql_items)}) + page = self.cache[fprint] + page.fingerprint = to_native_str(fprint) + page.score = score + self.cache[fprint] = page + page.batch(self.batch).save() + self.batch.execute() class States(MemoryStates): diff --git a/frontera/contrib/backends/cassandra/models.py b/frontera/contrib/backends/cassandra/models.py index 4a73f98d5..91340d1aa 100644 --- a/frontera/contrib/backends/cassandra/models.py +++ b/frontera/contrib/backends/cassandra/models.py @@ -1,9 +1,31 @@ # -*- coding: utf-8 -*- -from cassandra.cqlengine.columns import (UUID, BigInt, DateTime, Float, +import pickle +import six + +from cassandra.cqlengine.columns import (UUID, BigInt, Bytes, DateTime, Float, Integer, Map, SmallInt, Text) from cassandra.cqlengine.models import Model +class Pickle(Bytes): + + def to_database(self, value): + value = self._pickle_object(value) + return super(Pickle, self).to_database(value) + + def to_python(self, value): + value = super(Pickle, self).to_python(value) + return self._unpickle_object(value) + + def _pickle_object(self, obj): + pickled = pickle.dumps(obj) + return pickled.encode('hex') if six.PY2 else pickled + + def _unpickle_object(self, pickled_obj): + obj = pickled_obj.decode('hex') if six.PY2 else pickled_obj + return pickle.loads(obj) + + class MetadataModel(Model): __table_name__ = 'metadata' @@ -15,9 +37,9 @@ class MetadataModel(Model): status_code = Integer() score = Float() error = Text() - meta = Map(Text(), Text()) - headers = Map(Text(), Text()) - cookies = Map(Text(), Text()) + meta = Pickle() + headers = Pickle() + cookies = Pickle() method = Text() def __repr__(self): @@ -31,7 +53,7 @@ class StateModel(Model): state = SmallInt(required=True) def __repr__(self): - return '' % (self.fingerprint, self.state) + return '' % (self.fingerprint, self.state) class QueueModel(Model): @@ -43,12 +65,12 @@ class QueueModel(Model): url = Text(required=True) fingerprint = Text(required=True) host_crc32 = Integer(required=True) - meta = Map(Text(), Text()) - headers = Map(Text(), Text()) - cookies = Map(Text(), Text()) + meta = Pickle() + headers = Pickle() + cookies = Pickle() method = Text() created_at = BigInt(required=True) depth = SmallInt() def __repr__(self): - return '' % (self.url, self.id) + return '' % (self.url, self.id) diff --git a/tests/contrib/backends/cassandra/test_backend_cassandra.py b/tests/contrib/backends/cassandra/test_backend_cassandra.py index 0c9b7bf81..9cd89406f 100644 --- a/tests/contrib/backends/cassandra/test_backend_cassandra.py +++ b/tests/contrib/backends/cassandra/test_backend_cassandra.py @@ -1,4 +1,7 @@ import unittest +import six +import uuid +from datetime import datetime from cassandra.cluster import Cluster from cassandra.cqlengine import connection @@ -9,8 +12,7 @@ from frontera.settings import Settings -class TestCassandraBackend(unittest.TestCase): - +class BaseCassendraTest(unittest.TestCase): def setUp(self): settings = Settings() hosts = ['127.0.0.1'] @@ -30,7 +32,8 @@ def tearDown(self): models = [MetadataModel, StateModel, QueueModel] for model in models: if model.__table_name__ in tables: - self.session.execute('DROP TABLE {0};'.format(model.column_family_name()), timeout=240) + # self.session.execute('DROP TABLE 
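+                # drop_table()/sync_table() from cassandra.cqlengine.management
+                # are the supported schema calls for models and are safe to
+                # repeat; a minimal sketch of the lifecycle these tests rely on:
+                #
+                #     from cassandra.cqlengine.management import sync_table, drop_table
+                #     sync_table(StateModel)                        # CREATE TABLE IF NOT EXISTS
+                #     StateModel.create(fingerprint='f1', state=1)  # insert one row
+                #     drop_table(StateModel)                        # no-op if already gone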
{0};'.format(model.column_family_name()), timeout=240) + drop_table(model) self.session.shutdown() def _get_tables(self): @@ -38,6 +41,80 @@ def _get_tables(self): result = self.session.execute(query, (self.session.keyspace,)) return [row.table_name for row in result.current_rows] + +class TestCassandraBackendModels(BaseCassendraTest): + def test_pickled_fields(self): + sync_table(MetadataModel) + m = MetadataModel(fingerprint='fingerprint', + url='http://example.com', + depth=0, + created_at=datetime.now()) + meta = {b'fingerprint': b'10', + b'scrapy_meta': {'non_binary': 'www.example.com', + 'number': 81, + 'list': ['str', b'bytes', u'unicode']} + } + m.meta = meta + m.save() + stored_meta = m.get(fingerprint='fingerprint').meta + self.assertDictEqual(meta, stored_meta) + + def test_metadata_model(self): + fields = { + 'fingerprint': 'fingerprint', + 'url': 'http://example.com', + 'depth': 0, + 'created_at': datetime.now(), + 'fetched_at': datetime.now(), + 'status_code': 400, + 'score': 0.9, + 'error': 'Bad Request', + 'meta': {'meta': 'meta'}, + 'headers': {'headers': 'headers'}, + 'cookies': {'cookies': 'cookies'}, + 'method': 'GET', + } + self.assert_db_values(MetadataModel, 'fingerprint', fields) + + def test_state_model(self): + fields = { + 'fingerprint': 'fingerprint', + 'state': 1 + } + self.assert_db_values(StateModel, 'fingerprint', fields) + + def test_queue_model(self): + fields = { + 'id': uuid.uuid4(), + 'partition_id': 0, + 'score': 0.8, + 'url': 'http://example.com', + 'fingerprint': 'fingerprint', + 'host_crc32': 1234, + 'meta': {'meta': 'meta'}, + 'headers': {'headers': 'headers'}, + 'cookies': {'cookies': 'cookies'}, + 'method': 'GET', + 'created_at': datetime.now(), + 'depth': 0, + } + self.assert_db_values(QueueModel, 'id', fields) + + def assert_db_values(self, model, primary_key, fields): + sync_table(model) + m = model(**fields) + m.save() + stored_obj = m.get(fingerprint=fields[primary_key]) + for field, original_value in six.iteritems(fields): + stored_value = getattr(stored_obj, field) + if isinstance(original_value, dict): + self.assertDictEqual(stored_value, original_value) + else: + self.assertEqual(stored_value, original_value) + + +class TestCassandraBackend(BaseCassendraTest): + def test_tables_created(self): tables_before = self._get_tables() self.assertEqual(tables_before, []) @@ -48,6 +125,7 @@ def test_tables_created(self): def test_tables_droped_and_created(self): def _get_state_data(): return StateModel.all() + models = [MetadataModel, StateModel, QueueModel] for model in models: sync_table(model) From 9308b280156b901badb469482ad6ff2742bff34d Mon Sep 17 00:00:00 2001 From: voith Date: Wed, 2 Nov 2016 12:38:53 -0500 Subject: [PATCH 05/14] refactored cassandra metadata and added tests for it --- .../contrib/backends/cassandra/__init__.py | 26 ++---- .../contrib/backends/cassandra/components.py | 52 ++++++++---- frontera/contrib/backends/cassandra/models.py | 38 ++++++--- frontera/settings/default_settings.py | 1 + .../cassandra/test_backend_cassandra.py | 82 +++++++++++++------ 5 files changed, 125 insertions(+), 74 deletions(-) diff --git a/frontera/contrib/backends/cassandra/__init__.py b/frontera/contrib/backends/cassandra/__init__.py index c1691e3da..87ad131ab 100644 --- a/frontera/contrib/backends/cassandra/__init__.py +++ b/frontera/contrib/backends/cassandra/__init__.py @@ -1,13 +1,12 @@ from __future__ import absolute_import import six - from cassandra.cluster import Cluster from cassandra.cqlengine import connection from 
cassandra.cqlengine.management import drop_table, sync_table -from cassandra.query import dict_factory -from frontera.contrib.backends import CommonStorageBackend, CommonDistributedStorageBackend +from frontera.contrib.backends import (CommonDistributedStorageBackend, + CommonStorageBackend) from frontera.contrib.backends.cassandra.components import (Metadata, Queue, States) from frontera.utils.misc import load_object @@ -29,20 +28,16 @@ def __init__(self, manager): self.models = dict([(name, load_object(cls)) for name, cls in six.iteritems(models)]) cluster_kwargs = { 'port': cluster_port, - 'compression': True, - 'control_connection_timeout': 240, + 'compression': True } self.cluster = Cluster(contact_points=cluster_hosts, **cluster_kwargs) - self.session = self.cluster.connect(keyspace) - # self.session.row_factory = dict_factory - # self.session.encoder.mapping[dict] = self.session.encoder.cql_encode_map_collection connection.setup(cluster_hosts, keyspace, **cluster_kwargs) + self.session.default_timeout = connection.session.default_timeout = \ + settings.get('CASSANDRABACKEND_REQUEST_TIMEOUT') - tables = self._get_tables() if drop_all_tables: for name, table in six.iteritems(self.models): - if table.__table_name__ in tables: drop_table(table) for name, table in six.iteritems(self.models): @@ -54,14 +49,6 @@ def __init__(self, manager): # self._states = States(self.session, self.models['StateModel'], settings.get('STATE_CACHE_SIZE_LIMIT')) # self._queue = self._create_queue(settings) - # def _drop_table(self, model): - # self.session.execute('DROP TABLE {0};'.format(model.column_family_name()), timeout=240) - - def _get_tables(self): - query = self.session.prepare('SELECT table_name FROM system_schema.tables WHERE keyspace_name = ?') - result = self.session.execute(query, (self.session.keyspace,)) - return [row.table_name for row in result.current_rows] - def frontier_stop(self): self.states.flush() self.session.shutdown() @@ -85,9 +72,6 @@ def __init__(self, manager): self.cluster = Cluster(cluster_hosts, **cluster_kwargs) self.models = dict([(name, load_object(cls)) for name, cls in six.iteritems(models)]) - self.session = self.cluster.connect() - self.session.row_factory = dict_factory - self.session.set_keyspace(keyspace) connection.set_session(self.session) diff --git a/frontera/contrib/backends/cassandra/components.py b/frontera/contrib/backends/cassandra/components.py index d392b8665..81c188c93 100644 --- a/frontera/contrib/backends/cassandra/components.py +++ b/frontera/contrib/backends/cassandra/components.py @@ -1,13 +1,17 @@ # -*- coding: utf-8 -*- import logging -import uuid -import pickle import six +import sys +import traceback +import uuid from time import time from cachetools import LRUCache +from cassandra import (OperationTimedOut, ReadFailure, ReadTimeout, + WriteFailure, WriteTimeout) from cassandra.concurrent import execute_concurrent_with_args from cassandra.cqlengine.query import BatchQuery +from w3lib.util import to_bytes, to_native_str from frontera.contrib.backends import CreateOrModifyPageMixin from frontera.contrib.backends.memory import MemoryStates @@ -18,7 +22,24 @@ from frontera.utils.misc import chunks, get_crc32 from frontera.utils.url import parse_domain_from_url_fast -from w3lib.util import to_native_str, to_bytes + +def _retry(func): + def func_wrapper(self, *args, **kwargs): + tries = 5 + count = 0 + while count < tries: + try: + return func(self, *args, **kwargs) + except (OperationTimedOut, ReadTimeout, ReadFailure, WriteTimeout, WriteFailure) as 
exc:
+                ex_type, ex, tb = sys.exc_info()
+                count += 1
+                self.logger.warn("{0}: {1} Backtrace: {2}".format(ex_type.__name__, ex, traceback.extract_tb(tb)))
+                del tb
+                self.logger.info("Tries left %i" % (tries - count))
+                if count == tries:
+                    raise exc
+
+    return func_wrapper
 
 
 class Metadata(BaseMetadata, CreateOrModifyPageMixin):
 
@@ -36,38 +57,39 @@ def frontier_stop(self):
 
     def add_seeds(self, seeds):
         for seed in seeds:
             page = self._create_page(seed)
-            page.batch(self.batch).save()
-            self.cache[to_bytes(page.fingerprint)] = page
+            self._add_to_batch_and_update_cache(page)
         self.batch.execute()
 
     def request_error(self, page, error):
         page = self._modify_page(page) if page.meta[b'fingerprint'] in self.cache else self._create_page(page)
         page.error = error
-        self.cache[to_bytes(page.fingerprint)] = page
-        page.save()
+        self._add_to_batch_and_update_cache(page)
+        self.batch.execute()
 
     def page_crawled(self, response):
-        page = self._modify_page(response) if response.meta[b'fingerprint'] in self.cache else self._create_page(response)
-        self.cache[page.fingerprint] = page
-        page.save()
+        page = self._modify_page(response) \
+            if response.meta[b'fingerprint'] in self.cache else self._create_page(response)
+        self._add_to_batch_and_update_cache(page)
+        self.batch.execute()
 
     def links_extracted(self, request, links):
         for link in links:
             if link.meta[b'fingerprint'] not in self.cache:
                 page = self._create_page(link)
-                self.cache[link.meta[b'fingerprint']] = page
-                page.batch(self.batch).save()
+                self._add_to_batch_and_update_cache(page)
         self.batch.execute()
 
     def update_score(self, batch):
-        for fprint, score, request, schedule in batch:
+        for fprint, (score, url, schedule) in six.iteritems(batch):
             page = self.cache[fprint]
             page.fingerprint = to_native_str(fprint)
             page.score = score
-            self.cache[fprint] = page
-            page.batch(self.batch).save()
+            self._add_to_batch_and_update_cache(page)
         self.batch.execute()
 
+    def _add_to_batch_and_update_cache(self, page):
+        self.cache[to_bytes(page.fingerprint)] = page.batch(self.batch).save()
+
 
 class States(MemoryStates):
 
diff --git a/frontera/contrib/backends/cassandra/models.py b/frontera/contrib/backends/cassandra/models.py
index 91340d1aa..3ff13c1f2 100644
--- a/frontera/contrib/backends/cassandra/models.py
+++ b/frontera/contrib/backends/cassandra/models.py
@@ -3,19 +3,31 @@
 import six
 
 from cassandra.cqlengine.columns import (UUID, BigInt, Bytes, DateTime, Float,
-                                         Integer, Map, SmallInt, Text)
+                                         Integer, SmallInt, Text)
 from cassandra.cqlengine.models import Model
 
 
-class Pickle(Bytes):
+class PickleDict(Bytes):
+    """
+    PickleDict applies Python's ``pickle.dumps()`` to incoming objects
+    if the value received is a dict, and ``pickle.loads()`` on the way out.
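+
+    An illustrative round-trip (the value is arbitrary; on Python 2 the
+    pickled bytes are additionally hex-encoded)::
+
+        field = PickleDict()
+        blob = field.to_database({'depth': 0})   # pickled before storage
+        field.to_python(blob)                    # -> {'depth': 0}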
+ """ def to_database(self, value): - value = self._pickle_object(value) - return super(Pickle, self).to_database(value) + if value is None: + return + if isinstance(value, dict): + value = self._pickle_object(value) + return super(PickleDict, self).to_database(value) def to_python(self, value): - value = super(Pickle, self).to_python(value) - return self._unpickle_object(value) + value = super(PickleDict, self).to_python(value) + if value is None: + return + try: + return self._unpickle_object(value) + except TypeError: + return value def _pickle_object(self, obj): pickled = pickle.dumps(obj) @@ -37,9 +49,9 @@ class MetadataModel(Model): status_code = Integer() score = Float() error = Text() - meta = Pickle() - headers = Pickle() - cookies = Pickle() + meta = PickleDict() + headers = PickleDict() + cookies = PickleDict() method = Text() def __repr__(self): @@ -60,14 +72,14 @@ class QueueModel(Model): __table_name__ = 'queue' id = UUID(primary_key=True) - partition_id = Integer(required=True) + partition_id = Integer(primary_key=True) score = Float(required=True) url = Text(required=True) fingerprint = Text(required=True) host_crc32 = Integer(required=True) - meta = Pickle() - headers = Pickle() - cookies = Pickle() + meta = PickleDict() + headers = PickleDict() + cookies = PickleDict() method = Text() created_at = BigInt(required=True) depth = SmallInt() diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index dc06568fa..46596bb2e 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -20,6 +20,7 @@ CASSANDRABACKEND_CLUSTER_HOSTS = ['127.0.0.1'] CASSANDRABACKEND_CLUSTER_PORT = 9042 CASSANDRABACKEND_KEYSPACE = 'crawler' +CASSANDRABACKEND_REQUEST_TIMEOUT = 100 DELAY_ON_EMPTY = 5.0 DOMAIN_FINGERPRINT_FUNCTION = 'frontera.utils.fingerprint.sha1' diff --git a/tests/contrib/backends/cassandra/test_backend_cassandra.py b/tests/contrib/backends/cassandra/test_backend_cassandra.py index 9cd89406f..88cca4db1 100644 --- a/tests/contrib/backends/cassandra/test_backend_cassandra.py +++ b/tests/contrib/backends/cassandra/test_backend_cassandra.py @@ -1,18 +1,29 @@ import unittest -import six import uuid from datetime import datetime +from time import time +import six from cassandra.cluster import Cluster from cassandra.cqlengine import connection -from cassandra.cqlengine.management import drop_table, sync_table +from cassandra.cqlengine.management import drop_keyspace, sync_table from frontera.contrib.backends.cassandra import CassandraBackend -from frontera.contrib.backends.cassandra.models import MetadataModel, StateModel, QueueModel +from frontera.contrib.backends.cassandra.models import (MetadataModel, + QueueModel, StateModel) from frontera.settings import Settings +from frontera.core.models import Request, Response + +r1 = Request('https://www.example.com', meta={b'fingerprint': b'10', + b'domain': {b'name': b'www.example.com', b'fingerprint': b'81'}}) +r2 = Request('http://example.com/some/page/', meta={b'fingerprint': b'11', + b'domain': {b'name': b'example.com', b'fingerprint': b'82'}}) +r3 = Request('http://www.scrapy.org', meta={b'fingerprint': b'12', + b'domain': {b'name': b'www.scrapy.org', b'fingerprint': b'83'}}) -class BaseCassendraTest(unittest.TestCase): + +class BaseCassandraTest(unittest.TestCase): def setUp(self): settings = Settings() hosts = ['127.0.0.1'] @@ -20,29 +31,22 @@ def setUp(self): self.manager = type('manager', (object,), {}) self.manager.settings = settings self.keyspace = 
settings.CASSANDRABACKEND_KEYSPACE - cluster = Cluster(hosts, port, control_connection_timeout=240) + cluster = Cluster(hosts, port) self.session = cluster.connect() self.session.execute("CREATE KEYSPACE IF NOT EXISTS %s WITH " - "replication = {'class':'SimpleStrategy', 'replication_factor' : 3}" % self.keyspace) - connection.setup(hosts, self.keyspace, port=port, control_connection_timeout=240) + "replication = {'class':'SimpleStrategy', 'replication_factor' : 1}" % self.keyspace) self.session.set_keyspace(self.keyspace) + timeout = settings.CASSANDRABACKEND_REQUEST_TIMEOUT + connection.setup(hosts, self.keyspace, port=port) + self.session.default_timeout = connection.session.default_timeout = timeout def tearDown(self): - tables = self._get_tables() - models = [MetadataModel, StateModel, QueueModel] - for model in models: - if model.__table_name__ in tables: - # self.session.execute('DROP TABLE {0};'.format(model.column_family_name()), timeout=240) - drop_table(model) + drop_keyspace(self.keyspace) self.session.shutdown() - def _get_tables(self): - query = self.session.prepare('SELECT table_name FROM system_schema.tables WHERE keyspace_name = ?') - result = self.session.execute(query, (self.session.keyspace,)) - return [row.table_name for row in result.current_rows] +class TestCassandraBackendModels(BaseCassandraTest): -class TestCassandraBackendModels(BaseCassendraTest): def test_pickled_fields(self): sync_table(MetadataModel) m = MetadataModel(fingerprint='fingerprint', @@ -74,14 +78,14 @@ def test_metadata_model(self): 'cookies': {'cookies': 'cookies'}, 'method': 'GET', } - self.assert_db_values(MetadataModel, 'fingerprint', fields) + self.assert_db_values(MetadataModel, {'fingerprint': fields['fingerprint']}, fields) def test_state_model(self): fields = { 'fingerprint': 'fingerprint', 'state': 1 } - self.assert_db_values(StateModel, 'fingerprint', fields) + self.assert_db_values(StateModel, {'fingerprint': fields['fingerprint']}, fields) def test_queue_model(self): fields = { @@ -95,25 +99,34 @@ def test_queue_model(self): 'headers': {'headers': 'headers'}, 'cookies': {'cookies': 'cookies'}, 'method': 'GET', - 'created_at': datetime.now(), + 'created_at': int(time()*1E+6), 'depth': 0, } - self.assert_db_values(QueueModel, 'id', fields) + self.assert_db_values(QueueModel, {'id': fields['id']}, fields) - def assert_db_values(self, model, primary_key, fields): + def assert_db_values(self, model, _filter, fields): sync_table(model) m = model(**fields) m.save() - stored_obj = m.get(fingerprint=fields[primary_key]) + stored_obj = m.get(**_filter) for field, original_value in six.iteritems(fields): stored_value = getattr(stored_obj, field) if isinstance(original_value, dict): self.assertDictEqual(stored_value, original_value) + elif isinstance(original_value, datetime): + self.assertEqual(stored_value.ctime(), original_value.ctime()) + elif isinstance(original_value, float): + self.assertAlmostEquals(stored_value, original_value) else: self.assertEqual(stored_value, original_value) -class TestCassandraBackend(BaseCassendraTest): +class TestCassandraBackend(BaseCassandraTest): + + def _get_tables(self): + query = self.session.prepare('SELECT table_name FROM system_schema.tables WHERE keyspace_name = ?') + result = self.session.execute(query, (self.session.keyspace,)) + return [row.table_name for row in result.current_rows] def test_tables_created(self): tables_before = self._get_tables() @@ -139,3 +152,22 @@ def _get_state_data(): self.assertEqual(set(tables_before), set(['metadata', 
'states', 'queue'])) rows_after = _get_state_data() self.assertEqual(rows_after.count(), 0) + + def test_metadata(self): + b = CassandraBackend(self.manager) + metadata = b.metadata + metadata.add_seeds([r1, r2, r3]) + meta_qs = MetadataModel.objects.all() + self.assertEqual(set([r1.url, r2.url, r3.url]), set([m.url for m in meta_qs])) + resp = Response('https://www.example.com', request=r1) + metadata.page_crawled(resp) + stored_response = meta_qs.get(fingerprint='10') + self.assertEqual(stored_response.status_code, 200) + metadata.request_error(r3, 'error') + stored_error = meta_qs.get(fingerprint='12') + self.assertEqual(stored_error.error, 'error') + batch = {r2.meta[b'fingerprint']: [0.8, r2.url, False]} + metadata.update_score(batch) + stored_score = meta_qs.get(fingerprint='11') + self.assertAlmostEquals(stored_score.score, 0.8) + self.assertEqual(meta_qs.count(), 3) From e16bd33c826311592b2d7d300f5c8a099985e7dd Mon Sep 17 00:00:00 2001 From: voith Date: Thu, 3 Nov 2016 12:38:53 -0500 Subject: [PATCH 06/14] fix pickledict bugs in py2 --- .../cassandra/test_backend_cassandra.py | 238 +++++++++--------- 1 file changed, 121 insertions(+), 117 deletions(-) diff --git a/tests/contrib/backends/cassandra/test_backend_cassandra.py b/tests/contrib/backends/cassandra/test_backend_cassandra.py index 88cca4db1..119b4d47b 100644 --- a/tests/contrib/backends/cassandra/test_backend_cassandra.py +++ b/tests/contrib/backends/cassandra/test_backend_cassandra.py @@ -24,6 +24,7 @@ class BaseCassandraTest(unittest.TestCase): + def setUp(self): settings = Settings() hosts = ['127.0.0.1'] @@ -45,80 +46,80 @@ def tearDown(self): self.session.shutdown() -class TestCassandraBackendModels(BaseCassandraTest): - - def test_pickled_fields(self): - sync_table(MetadataModel) - m = MetadataModel(fingerprint='fingerprint', - url='http://example.com', - depth=0, - created_at=datetime.now()) - meta = {b'fingerprint': b'10', - b'scrapy_meta': {'non_binary': 'www.example.com', - 'number': 81, - 'list': ['str', b'bytes', u'unicode']} - } - m.meta = meta - m.save() - stored_meta = m.get(fingerprint='fingerprint').meta - self.assertDictEqual(meta, stored_meta) - - def test_metadata_model(self): - fields = { - 'fingerprint': 'fingerprint', - 'url': 'http://example.com', - 'depth': 0, - 'created_at': datetime.now(), - 'fetched_at': datetime.now(), - 'status_code': 400, - 'score': 0.9, - 'error': 'Bad Request', - 'meta': {'meta': 'meta'}, - 'headers': {'headers': 'headers'}, - 'cookies': {'cookies': 'cookies'}, - 'method': 'GET', - } - self.assert_db_values(MetadataModel, {'fingerprint': fields['fingerprint']}, fields) - - def test_state_model(self): - fields = { - 'fingerprint': 'fingerprint', - 'state': 1 - } - self.assert_db_values(StateModel, {'fingerprint': fields['fingerprint']}, fields) - - def test_queue_model(self): - fields = { - 'id': uuid.uuid4(), - 'partition_id': 0, - 'score': 0.8, - 'url': 'http://example.com', - 'fingerprint': 'fingerprint', - 'host_crc32': 1234, - 'meta': {'meta': 'meta'}, - 'headers': {'headers': 'headers'}, - 'cookies': {'cookies': 'cookies'}, - 'method': 'GET', - 'created_at': int(time()*1E+6), - 'depth': 0, - } - self.assert_db_values(QueueModel, {'id': fields['id']}, fields) - - def assert_db_values(self, model, _filter, fields): - sync_table(model) - m = model(**fields) - m.save() - stored_obj = m.get(**_filter) - for field, original_value in six.iteritems(fields): - stored_value = getattr(stored_obj, field) - if isinstance(original_value, dict): - self.assertDictEqual(stored_value, 
original_value) - elif isinstance(original_value, datetime): - self.assertEqual(stored_value.ctime(), original_value.ctime()) - elif isinstance(original_value, float): - self.assertAlmostEquals(stored_value, original_value) - else: - self.assertEqual(stored_value, original_value) +# class TestCassandraBackendModels(BaseCassandraTest): +# +# def test_pickled_fields(self): +# sync_table(MetadataModel) +# m = MetadataModel(fingerprint='fingerprint', +# url='http://example.com', +# depth=0, +# created_at=datetime.now()) +# meta = {b'fingerprint': b'10', +# b'scrapy_meta': {'non_binary': 'www.example.com', +# 'number': 81, +# 'list': ['str', b'bytes', u'unicode']} +# } +# m.meta = meta +# m.save() +# stored_meta = m.get(fingerprint='fingerprint').meta +# self.assertDictEqual(meta, stored_meta) +# +# def test_metadata_model(self): +# fields = { +# 'fingerprint': 'fingerprint', +# 'url': 'http://example.com', +# 'depth': 0, +# 'created_at': datetime.now(), +# 'fetched_at': datetime.now(), +# 'status_code': 400, +# 'score': 0.9, +# 'error': 'Bad Request', +# 'meta': {'meta': 'meta'}, +# 'headers': {'headers': 'headers'}, +# 'cookies': {'cookies': 'cookies'}, +# 'method': 'GET', +# } +# self.assert_db_values(MetadataModel, {'fingerprint': fields['fingerprint']}, fields) +# +# def test_state_model(self): +# fields = { +# 'fingerprint': 'fingerprint', +# 'state': 1 +# } +# self.assert_db_values(StateModel, {'fingerprint': fields['fingerprint']}, fields) +# +# def test_queue_model(self): +# fields = { +# 'id': uuid.uuid4(), +# 'partition_id': 0, +# 'score': 0.8, +# 'url': 'http://example.com', +# 'fingerprint': 'fingerprint', +# 'host_crc32': 1234, +# 'meta': {'meta': 'meta'}, +# 'headers': {'headers': 'headers'}, +# 'cookies': {'cookies': 'cookies'}, +# 'method': 'GET', +# 'created_at': int(time()*1E+6), +# 'depth': 0, +# } +# self.assert_db_values(QueueModel, {'id': fields['id']}, fields) +# +# def assert_db_values(self, model, _filter, fields): +# sync_table(model) +# m = model(**fields) +# m.save() +# stored_obj = m.get(**_filter) +# for field, original_value in six.iteritems(fields): +# stored_value = getattr(stored_obj, field) +# if isinstance(original_value, dict): +# self.assertDictEqual(stored_value, original_value) +# elif isinstance(original_value, datetime): +# self.assertEqual(stored_value.ctime(), original_value.ctime()) +# elif isinstance(original_value, float): +# self.assertAlmostEquals(stored_value, original_value) +# else: +# self.assertEqual(stored_value, original_value) class TestCassandraBackend(BaseCassandraTest): @@ -128,46 +129,49 @@ def _get_tables(self): result = self.session.execute(query, (self.session.keyspace,)) return [row.table_name for row in result.current_rows] - def test_tables_created(self): - tables_before = self._get_tables() - self.assertEqual(tables_before, []) - CassandraBackend(self.manager) - tables_after = self._get_tables() - self.assertEqual(set(tables_after), set(['metadata', 'states', 'queue'])) - - def test_tables_droped_and_created(self): - def _get_state_data(): - return StateModel.all() - - models = [MetadataModel, StateModel, QueueModel] - for model in models: - sync_table(model) - tables_before = self._get_tables() - self.assertEqual(set(tables_before), set(['metadata', 'states', 'queue'])) - StateModel.create(fingerprint='fingerprint', state=200) - rows_before = _get_state_data() - self.assertEqual(rows_before.count(), 1) - self.manager.settings.CASSANDRABACKEND_DROP_ALL_TABLES = True - CassandraBackend(self.manager) - 
self.assertEqual(set(tables_before), set(['metadata', 'states', 'queue'])) - rows_after = _get_state_data() - self.assertEqual(rows_after.count(), 0) - - def test_metadata(self): - b = CassandraBackend(self.manager) - metadata = b.metadata - metadata.add_seeds([r1, r2, r3]) - meta_qs = MetadataModel.objects.all() - self.assertEqual(set([r1.url, r2.url, r3.url]), set([m.url for m in meta_qs])) - resp = Response('https://www.example.com', request=r1) - metadata.page_crawled(resp) - stored_response = meta_qs.get(fingerprint='10') - self.assertEqual(stored_response.status_code, 200) - metadata.request_error(r3, 'error') - stored_error = meta_qs.get(fingerprint='12') - self.assertEqual(stored_error.error, 'error') - batch = {r2.meta[b'fingerprint']: [0.8, r2.url, False]} - metadata.update_score(batch) - stored_score = meta_qs.get(fingerprint='11') - self.assertAlmostEquals(stored_score.score, 0.8) - self.assertEqual(meta_qs.count(), 3) + # def test_tables_created(self): + # tables_before = self._get_tables() + # self.assertEqual(tables_before, []) + # CassandraBackend(self.manager) + # tables_after = self._get_tables() + # self.assertEqual(set(tables_after), set(['metadata', 'states', 'queue'])) + # + # def test_tables_droped_and_created(self): + # def _get_state_data(): + # return StateModel.all() + # + # models = [MetadataModel, StateModel, QueueModel] + # for model in models: + # sync_table(model) + # tables_before = self._get_tables() + # self.assertEqual(set(tables_before), set(['metadata', 'states', 'queue'])) + # StateModel.create(fingerprint='fingerprint', state=200) + # rows_before = _get_state_data() + # self.assertEqual(rows_before.count(), 1) + # self.manager.settings.CASSANDRABACKEND_DROP_ALL_TABLES = True + # CassandraBackend(self.manager) + # self.assertEqual(set(tables_before), set(['metadata', 'states', 'queue'])) + # rows_after = _get_state_data() + # self.assertEqual(rows_after.count(), 0) + + # def test_metadata(self): + # b = CassandraBackend(self.manager) + # metadata = b.metadata + # metadata.add_seeds([r1, r2, r3]) + # meta_qs = MetadataModel.objects.all() + # self.assertEqual(set([r1.url, r2.url, r3.url]), set([m.url for m in meta_qs])) + # resp = Response('https://www.example.com', request=r1) + # metadata.page_crawled(resp) + # stored_response = meta_qs.get(fingerprint='10') + # self.assertEqual(stored_response.status_code, 200) + # metadata.request_error(r3, 'error') + # stored_error = meta_qs.get(fingerprint='12') + # self.assertEqual(stored_error.error, 'error') + # batch = {r2.meta[b'fingerprint']: [0.8, r2.url, False]} + # metadata.update_score(batch) + # stored_score = meta_qs.get(fingerprint='11') + # self.assertAlmostEquals(stored_score.score, 0.8) + # self.assertEqual(meta_qs.count(), 3) + + def test_state(self): + pass \ No newline at end of file From b7713d16204cfaa14b7d30dfd1f62e9aa76a54d4 Mon Sep 17 00:00:00 2001 From: voith Date: Fri, 4 Nov 2016 12:38:53 -0500 Subject: [PATCH 07/14] refactored cassandra states and added tests for it --- .../contrib/backends/cassandra/__init__.py | 8 +- .../contrib/backends/cassandra/components.py | 23 +- frontera/contrib/backends/cassandra/models.py | 2 + .../cassandra/test_backend_cassandra.py | 263 ++++++++++-------- 4 files changed, 161 insertions(+), 135 deletions(-) diff --git a/frontera/contrib/backends/cassandra/__init__.py b/frontera/contrib/backends/cassandra/__init__.py index 87ad131ab..82f05a818 100644 --- a/frontera/contrib/backends/cassandra/__init__.py +++ b/frontera/contrib/backends/cassandra/__init__.py 
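The hunks below wire the States component back into CassandraBackend and port
its fetch/flush paths to cqlengine. A minimal sketch of the flush pattern the
new States adopts (cache contents are illustrative; StateModel is the model
from models.py):

    from cassandra.cqlengine.query import BatchQuery

    cache = {b'f1': 1, b'f2': 2}                   # fingerprint -> state
    b = BatchQuery()
    for fprint, state in cache.items():
        StateModel(fingerprint=fprint.decode(), state=state).batch(b).save()
    b.execute()                                    # one round-trip for all rows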
@@ -38,15 +38,17 @@ def __init__(self, manager): if drop_all_tables: for name, table in six.iteritems(self.models): - drop_table(table) + drop_table(table) for name, table in six.iteritems(self.models): - sync_table(table) + sync_table(table) self._metadata = Metadata(self.session, self.models['MetadataModel'], settings.get('CASSANDRABACKEND_CACHE_SIZE')) - # self._states = States(self.session, self.models['StateModel'], settings.get('STATE_CACHE_SIZE_LIMIT')) + self._states = States(self.session, + self.models['StateModel'], + settings.get('STATE_CACHE_SIZE_LIMIT')) # self._queue = self._create_queue(settings) def frontier_stop(self): diff --git a/frontera/contrib/backends/cassandra/components.py b/frontera/contrib/backends/cassandra/components.py index 81c188c93..b70b5b908 100644 --- a/frontera/contrib/backends/cassandra/components.py +++ b/frontera/contrib/backends/cassandra/components.py @@ -93,32 +93,31 @@ def _add_to_batch_and_update_cache(self, page): class States(MemoryStates): - def __init__(self, session, model_cls, cache_size_limit, crawl_id): + def __init__(self, session, model_cls, cache_size_limit): super(States, self).__init__(cache_size_limit) self.session = session self.model = model_cls + self.batch = BatchQuery() self.logger = logging.getLogger("frontera.contrib.backends.cassandra.components.States") - self.crawl_id = crawl_id def frontier_stop(self): - pass + self.flush() def fetch(self, fingerprints): - to_fetch = [f for f in fingerprints if f not in self._cache] + to_fetch = [to_native_str(f) for f in fingerprints if f not in self._cache] self.logger.debug("cache size %s", len(self._cache)) self.logger.debug("to fetch %d from %d", (len(to_fetch), len(fingerprints))) for chunk in chunks(to_fetch, 128): - for state in self.model.objects.filter(crawl=self.crawl_id, fingerprint__in=chunk): - self._cache[state.fingerprint] = state.state + for state in self.model.objects.filter(fingerprint__in=chunk): + self._cache[to_bytes(state.fingerprint)] = state.state def flush(self, force_clear=False): - query = self.session.prepare("INSERT INTO states (id, fingerprint, state) VALUES (?, ?, ?)") - cql_items = [] - for fingerprint, state_val in self._cache.iteritems(): - cql_i = (uuid.uuid4(), fingerprint, state_val) - cql_items.append(cql_i) - execute_concurrent_with_args(self.session, query, cql_items, concurrency=20000) + for fingerprint, state_val in six.iteritems(self._cache): + state = self.model(fingerprint=to_native_str(fingerprint), state=state_val) + state.batch(self.batch).save() + self.batch.execute() + self.logger.debug("State cache has been flushed.") super(States, self).flush(force_clear) diff --git a/frontera/contrib/backends/cassandra/models.py b/frontera/contrib/backends/cassandra/models.py index 3ff13c1f2..0ec41d9cd 100644 --- a/frontera/contrib/backends/cassandra/models.py +++ b/frontera/contrib/backends/cassandra/models.py @@ -24,6 +24,8 @@ def to_python(self, value): value = super(PickleDict, self).to_python(value) if value is None: return + if isinstance(value, dict): + return value try: return self._unpickle_object(value) except TypeError: diff --git a/tests/contrib/backends/cassandra/test_backend_cassandra.py b/tests/contrib/backends/cassandra/test_backend_cassandra.py index 119b4d47b..35d9a1c63 100644 --- a/tests/contrib/backends/cassandra/test_backend_cassandra.py +++ b/tests/contrib/backends/cassandra/test_backend_cassandra.py @@ -11,9 +11,9 @@ from frontera.contrib.backends.cassandra import CassandraBackend from frontera.contrib.backends.cassandra.models 
import (MetadataModel, QueueModel, StateModel) -from frontera.settings import Settings from frontera.core.models import Request, Response - +from frontera.core.components import States +from frontera.settings import Settings r1 = Request('https://www.example.com', meta={b'fingerprint': b'10', b'domain': {b'name': b'www.example.com', b'fingerprint': b'81'}}) @@ -21,6 +21,7 @@ b'domain': {b'name': b'example.com', b'fingerprint': b'82'}}) r3 = Request('http://www.scrapy.org', meta={b'fingerprint': b'12', b'domain': {b'name': b'www.scrapy.org', b'fingerprint': b'83'}}) +r4 = r3.copy() class BaseCassandraTest(unittest.TestCase): @@ -46,80 +47,80 @@ def tearDown(self): self.session.shutdown() -# class TestCassandraBackendModels(BaseCassandraTest): -# -# def test_pickled_fields(self): -# sync_table(MetadataModel) -# m = MetadataModel(fingerprint='fingerprint', -# url='http://example.com', -# depth=0, -# created_at=datetime.now()) -# meta = {b'fingerprint': b'10', -# b'scrapy_meta': {'non_binary': 'www.example.com', -# 'number': 81, -# 'list': ['str', b'bytes', u'unicode']} -# } -# m.meta = meta -# m.save() -# stored_meta = m.get(fingerprint='fingerprint').meta -# self.assertDictEqual(meta, stored_meta) -# -# def test_metadata_model(self): -# fields = { -# 'fingerprint': 'fingerprint', -# 'url': 'http://example.com', -# 'depth': 0, -# 'created_at': datetime.now(), -# 'fetched_at': datetime.now(), -# 'status_code': 400, -# 'score': 0.9, -# 'error': 'Bad Request', -# 'meta': {'meta': 'meta'}, -# 'headers': {'headers': 'headers'}, -# 'cookies': {'cookies': 'cookies'}, -# 'method': 'GET', -# } -# self.assert_db_values(MetadataModel, {'fingerprint': fields['fingerprint']}, fields) -# -# def test_state_model(self): -# fields = { -# 'fingerprint': 'fingerprint', -# 'state': 1 -# } -# self.assert_db_values(StateModel, {'fingerprint': fields['fingerprint']}, fields) -# -# def test_queue_model(self): -# fields = { -# 'id': uuid.uuid4(), -# 'partition_id': 0, -# 'score': 0.8, -# 'url': 'http://example.com', -# 'fingerprint': 'fingerprint', -# 'host_crc32': 1234, -# 'meta': {'meta': 'meta'}, -# 'headers': {'headers': 'headers'}, -# 'cookies': {'cookies': 'cookies'}, -# 'method': 'GET', -# 'created_at': int(time()*1E+6), -# 'depth': 0, -# } -# self.assert_db_values(QueueModel, {'id': fields['id']}, fields) -# -# def assert_db_values(self, model, _filter, fields): -# sync_table(model) -# m = model(**fields) -# m.save() -# stored_obj = m.get(**_filter) -# for field, original_value in six.iteritems(fields): -# stored_value = getattr(stored_obj, field) -# if isinstance(original_value, dict): -# self.assertDictEqual(stored_value, original_value) -# elif isinstance(original_value, datetime): -# self.assertEqual(stored_value.ctime(), original_value.ctime()) -# elif isinstance(original_value, float): -# self.assertAlmostEquals(stored_value, original_value) -# else: -# self.assertEqual(stored_value, original_value) +class TestCassandraBackendModels(BaseCassandraTest): + + def test_pickled_fields(self): + sync_table(MetadataModel) + m = MetadataModel(fingerprint='fingerprint', + url='http://example.com', + depth=0, + created_at=datetime.now()) + meta = {b'fingerprint': b'10', + b'scrapy_meta': {'non_binary': 'www.example.com', + 'number': 81, + 'list': ['str', b'bytes', u'unicode']} + } + m.meta = meta + m.save() + stored_meta = m.get(fingerprint='fingerprint').meta + self.assertDictEqual(meta, stored_meta) + + def test_metadata_model(self): + fields = { + 'fingerprint': 'fingerprint', + 'url': 'http://example.com', + 
'depth': 0, + 'created_at': datetime.now(), + 'fetched_at': datetime.now(), + 'status_code': 400, + 'score': 0.9, + 'error': 'Bad Request', + 'meta': {'meta': 'meta'}, + 'headers': {'headers': 'headers'}, + 'cookies': {'cookies': 'cookies'}, + 'method': 'GET', + } + self.assert_db_values(MetadataModel, {'fingerprint': fields['fingerprint']}, fields) + + def test_state_model(self): + fields = { + 'fingerprint': 'fingerprint', + 'state': 1 + } + self.assert_db_values(StateModel, {'fingerprint': fields['fingerprint']}, fields) + + def test_queue_model(self): + fields = { + 'id': uuid.uuid4(), + 'partition_id': 0, + 'score': 0.8, + 'url': 'http://example.com', + 'fingerprint': 'fingerprint', + 'host_crc32': 1234, + 'meta': {'meta': 'meta'}, + 'headers': {'headers': 'headers'}, + 'cookies': {'cookies': 'cookies'}, + 'method': 'GET', + 'created_at': int(time()*1E+6), + 'depth': 0, + } + self.assert_db_values(QueueModel, {'id': fields['id']}, fields) + + def assert_db_values(self, model, _filter, fields): + sync_table(model) + m = model(**fields) + m.save() + stored_obj = m.get(**_filter) + for field, original_value in six.iteritems(fields): + stored_value = getattr(stored_obj, field) + if isinstance(original_value, dict): + self.assertDictEqual(stored_value, original_value) + elif isinstance(original_value, datetime): + self.assertEqual(stored_value.ctime(), original_value.ctime()) + elif isinstance(original_value, float): + self.assertAlmostEquals(stored_value, original_value) + else: + self.assertEqual(stored_value, original_value) class TestCassandraBackend(BaseCassandraTest): @@ -129,49 +130,71 @@ def _get_tables(self): result = self.session.execute(query, (self.session.keyspace,)) return [row.table_name for row in result.current_rows] - # def test_tables_created(self): - # tables_before = self._get_tables() - # self.assertEqual(tables_before, []) - # CassandraBackend(self.manager) - # tables_after = self._get_tables() - # self.assertEqual(set(tables_after), set(['metadata', 'states', 'queue'])) - # - # def test_tables_droped_and_created(self): - # def _get_state_data(): - # return StateModel.all() - # - # models = [MetadataModel, StateModel, QueueModel] - # for model in models: - # sync_table(model) - # tables_before = self._get_tables() - # self.assertEqual(set(tables_before), set(['metadata', 'states', 'queue'])) - # StateModel.create(fingerprint='fingerprint', state=200) - # rows_before = _get_state_data() - # self.assertEqual(rows_before.count(), 1) - # self.manager.settings.CASSANDRABACKEND_DROP_ALL_TABLES = True - # CassandraBackend(self.manager) - # self.assertEqual(set(tables_before), set(['metadata', 'states', 'queue'])) - # rows_after = _get_state_data() - # self.assertEqual(rows_after.count(), 0) - - # def test_metadata(self): - # b = CassandraBackend(self.manager) - # metadata = b.metadata - # metadata.add_seeds([r1, r2, r3]) - # meta_qs = MetadataModel.objects.all() - # self.assertEqual(set([r1.url, r2.url, r3.url]), set([m.url for m in meta_qs])) - # resp = Response('https://www.example.com', request=r1) - # metadata.page_crawled(resp) - # stored_response = meta_qs.get(fingerprint='10') - # self.assertEqual(stored_response.status_code, 200) - # metadata.request_error(r3, 'error') - # stored_error = meta_qs.get(fingerprint='12') - # self.assertEqual(stored_error.error, 'error') - # batch = {r2.meta[b'fingerprint']: [0.8, r2.url, False]} - # metadata.update_score(batch) - # stored_score = meta_qs.get(fingerprint='11') - # self.assertAlmostEquals(stored_score.score, 0.8) - # 
self.assertEqual(meta_qs.count(), 3) + def test_tables_created(self): + tables_before = self._get_tables() + self.assertEqual(tables_before, []) + CassandraBackend(self.manager) + tables_after = self._get_tables() + self.assertEqual(set(tables_after), set(['metadata', 'states', 'queue'])) + + def test_tables_droped_and_created(self): + def _get_state_data(): + return StateModel.all() + + models = [MetadataModel, StateModel, QueueModel] + for model in models: + sync_table(model) + tables_before = self._get_tables() + self.assertEqual(set(tables_before), set(['metadata', 'states', 'queue'])) + StateModel.create(fingerprint='fingerprint', state=200) + rows_before = _get_state_data() + self.assertEqual(rows_before.count(), 1) + self.manager.settings.CASSANDRABACKEND_DROP_ALL_TABLES = True + CassandraBackend(self.manager) + self.assertEqual(set(tables_before), set(['metadata', 'states', 'queue'])) + rows_after = _get_state_data() + self.assertEqual(rows_after.count(), 0) + + def test_metadata(self): + b = CassandraBackend(self.manager) + metadata = b.metadata + metadata.add_seeds([r1, r2, r3]) + meta_qs = MetadataModel.objects.all() + self.assertEqual(set([r1.url, r2.url, r3.url]), set([m.url for m in meta_qs])) + resp = Response('https://www.example.com', request=r1) + metadata.page_crawled(resp) + stored_response = meta_qs.get(fingerprint='10') + self.assertEqual(stored_response.status_code, 200) + metadata.request_error(r3, 'error') + stored_error = meta_qs.get(fingerprint='12') + self.assertEqual(stored_error.error, 'error') + batch = {r2.meta[b'fingerprint']: [0.8, r2.url, False]} + metadata.update_score(batch) + stored_score = meta_qs.get(fingerprint='11') + self.assertAlmostEquals(stored_score.score, 0.8) + self.assertEqual(meta_qs.count(), 3) def test_state(self): - pass \ No newline at end of file + b = CassandraBackend(self.manager) + state = b.states + state.set_states([r1, r2, r3]) + self.assertEqual([r.meta[b'state'] for r in [r1, r2, r3]], [States.NOT_CRAWLED]*3) + state.update_cache([r1, r2, r3]) + self.assertDictEqual(state._cache, {b'10': States.NOT_CRAWLED, + b'11': States.NOT_CRAWLED, + b'12': States.NOT_CRAWLED}) + r1.meta[b'state'] = States.CRAWLED + r2.meta[b'state'] = States.CRAWLED + r3.meta[b'state'] = States.CRAWLED + state.update_cache([r1, r2, r3]) + state.flush(True) + self.assertDictEqual(state._cache, {}) + state.fetch([b'10', b'11', b'12']) + self.assertDictEqual(state._cache, {b'10': States.CRAWLED, + b'11': States.CRAWLED, + b'12': States.CRAWLED}) + r4.meta[b'state'] = States.ERROR + state.set_states([r1, r2, r4]) + self.assertEqual(r4.meta[b'state'], States.CRAWLED) + state.flush(True) + self.assertEqual(state._cache, {}) From 48c203d4a2de3fe223f9683e7c1232efb1fb7cbb Mon Sep 17 00:00:00 2001 From: voith Date: Sat, 5 Nov 2016 12:38:53 -0500 Subject: [PATCH 08/14] added cassandra queue and tests for it --- frontera/contrib/backends/__init__.py | 4 +- .../contrib/backends/cassandra/__init__.py | 5 +- .../contrib/backends/cassandra/components.py | 127 ++++++------------ frontera/contrib/backends/cassandra/models.py | 6 +- .../cassandra/test_backend_cassandra.py | 25 +++- 5 files changed, 72 insertions(+), 95 deletions(-) diff --git a/frontera/contrib/backends/__init__.py b/frontera/contrib/backends/__init__.py index 06e9719fb..da9e34bfe 100644 --- a/frontera/contrib/backends/__init__.py +++ b/frontera/contrib/backends/__init__.py @@ -93,11 +93,11 @@ def finished(self): class CommonStorageBackend(CommonBackend): def _create_queue(self, settings): - if not 
isinstance(self.queue_component, BaseQueue): + if not issubclass(self.queue_component, BaseQueue): raise TypeError('expected queue_component to ' 'belong to class: %s, got %s instead' % (type(BaseQueue).__name__, type(self.queue_component).__name__)) - return self.queue_component(self.session_cls, + return self.queue_component(self.session, self.models['QueueModel'], settings.get('SPIDER_FEED_PARTITIONS')) diff --git a/frontera/contrib/backends/cassandra/__init__.py b/frontera/contrib/backends/cassandra/__init__.py index 82f05a818..75ac2d31d 100644 --- a/frontera/contrib/backends/cassandra/__init__.py +++ b/frontera/contrib/backends/cassandra/__init__.py @@ -49,7 +49,7 @@ def __init__(self, manager): self._states = States(self.session, self.models['StateModel'], settings.get('STATE_CACHE_SIZE_LIMIT')) - # self._queue = self._create_queue(settings) + self._queue = self._create_queue(settings) def frontier_stop(self): self.states.flush() @@ -93,8 +93,7 @@ def strategy_worker(cls, manager): sync_table(model) - b._states = States(b.session, model, - settings.get('STATE_CACHE_SIZE_LIMIT')) + b._states = States(b.session, model, settings.get('STATE_CACHE_SIZE_LIMIT')) return b @classmethod diff --git a/frontera/contrib/backends/cassandra/components.py b/frontera/contrib/backends/cassandra/components.py index b70b5b908..c2adbcedc 100644 --- a/frontera/contrib/backends/cassandra/components.py +++ b/frontera/contrib/backends/cassandra/components.py @@ -9,9 +9,7 @@ from cachetools import LRUCache from cassandra import (OperationTimedOut, ReadFailure, ReadTimeout, WriteFailure, WriteTimeout) -from cassandra.concurrent import execute_concurrent_with_args from cassandra.cqlengine.query import BatchQuery -from w3lib.util import to_bytes, to_native_str from frontera.contrib.backends import CreateOrModifyPageMixin from frontera.contrib.backends.memory import MemoryStates @@ -22,6 +20,8 @@ from frontera.utils.misc import chunks, get_crc32 from frontera.utils.url import parse_domain_from_url_fast +from w3lib.util import to_bytes, to_native_str + def _retry(func): def func_wrapper(self, *args, **kwargs): @@ -122,21 +122,26 @@ def flush(self, force_clear=False): class Queue(BaseQueue): - def __init__(self, session, queue_cls, partitions, crawl_id, generate_stats, ordering='default'): + + def __init__(self, session, queue_cls, partitions, ordering='default'): self.session = session self.queue_model = queue_cls self.logger = logging.getLogger("frontera.contrib.backends.cassandra.components.Queue") self.partitions = [i for i in range(0, partitions)] self.partitioner = Crc32NamePartitioner(self.partitions) self.ordering = ordering + self.batch = BatchQuery() def frontier_stop(self): pass - def _order_by(self): + def _order_by(self, query): if self.ordering == 'created': - return "created_at" - return "created_at" + return query.order_by('created_at') + if self.ordering == 'created_desc': + return query.order_by('-created_at') + return query.order_by('score', 'created_at') # TODO: remove second parameter, + # it's not necessary for proper crawling, but needed for tests def get_next_requests(self, max_n_requests, partition_id, **kwargs): """ @@ -148,53 +153,19 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): """ results = [] try: - dequeued_urls = 0 - cql_ditems = [] - d_query = self.session.prepare("DELETE FROM queue WHERE crawl = ? AND fingerprint = ? AND partition_id = ? " - "AND score = ? 
AND created_at = ?") - for item in self.queue_model.objects.filter(crawl=self.crawl_id, partition_id=partition_id).\ - order_by("partition_id", "score", self._order_by()).limit(max_n_requests): - method = 'GET' if not item.method else item.method - - meta_dict2 = dict((name, getattr(item.meta, name)) for name in dir(item.meta) - if not name.startswith('__')) - # TODO: How the result can be an dict not an object -> Objects get error while encodeing for Message Bus - # If I take meta_dict2 direct to Request i get the same error message - - meta_dict = dict() - meta_dict["fingerprint"] = meta_dict2["fingerprint"] - meta_dict["domain"] = meta_dict2["domain"] - meta_dict["origin_is_frontier"] = meta_dict2["origin_is_frontier"] - meta_dict["scrapy_callback"] = meta_dict2["scrapy_callback"] - meta_dict["scrapy_errback"] = meta_dict2["scrapy_errback"] - meta_dict["scrapy_meta"] = meta_dict2["scrapy_meta"] - meta_dict["score"] = meta_dict2["score"] - meta_dict["jid"] = meta_dict2["jid"] - - r = Request(item.url, method=method, meta=meta_dict, headers=item.headers, cookies=item.cookies) - r.meta['fingerprint'] = item.fingerprint - r.meta['score'] = item.score + for item in self._order_by(self.queue_model.filter(partition_id=partition_id).allow_filtering()).limit(max_n_requests): + method = item.method or b'GET' + r = Request(item.url, method=method, meta=item.meta, headers=item.headers, cookies=item.cookies) + r.meta[b'fingerprint'] = to_bytes(item.fingerprint) + r.meta[b'score'] = item.score results.append(r) - - cql_d = (item.crawl, item.fingerprint, item.partition_id, item.score, item.created_at) - cql_ditems.append(cql_d) - dequeued_urls += 1 - - if dequeued_urls > 0: - execute_concurrent_with_args(self.session, d_query, cql_ditems, concurrency=200) - - self.counter_cls.cass_count({"dequeued_urls": dequeued_urls}) - + item.batch(self.batch).delete() + self.batch.execute() except Exception as exc: self.logger.exception(exc) - return results def schedule(self, batch): - query = self.session.prepare("INSERT INTO queue (id, fingerprint, score, partition_id, host_crc32, url, " - "created_at, meta, depth, headers, method, cookies) " - "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)") - cql_items = [] for fprint, score, request, schedule in batch: if schedule: _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url) @@ -205,35 +176,23 @@ def schedule(self, batch): else: partition_id = self.partitioner.partition(hostname, self.partitions) host_crc32 = get_crc32(hostname) - created_at = time()*1E+6 - - if "domain" not in request.meta: - request.meta["domain"] = {} - if "origin_is_frontier" not in request.meta: - request.meta["origin_is_frontier"] = '' - if "scrapy_callback" not in request.meta: - request.meta["scrapy_callback"] = None - if "scrapy_errback" not in request.meta: - request.meta["scrapy_errback"] = None - if "scrapy_meta" not in request.meta: - request.meta["scrapy_meta"] = {} - if "score" not in request.meta: - request.meta["score"] = 0 - if "jid" not in request.meta: - request.meta["jid"] = 0 - - cql_i = (uuid.uuid4(), fprint, score, partition_id, host_crc32, request.url, created_at, - request.meta, 0, request.headers, request.method, request.cookies) - cql_items.append(cql_i) - - request.meta['state'] = States.QUEUED - - execute_concurrent_with_args(self.session, query, cql_items, concurrency=400) - self.counter_cls.cass_count({"queued_urls": len(cql_items)}) + q = self.queue_model(id=uuid.uuid4(), + fingerprint=to_native_str(fprint), + score=score, + url=request.url, + 
meta=request.meta, + headers=request.headers, + cookies=request.cookies, + method=to_native_str(request.method), + partition_id=partition_id, + host_crc32=host_crc32, + created_at=time() * 1E+6) + q.batch(self.batch).save() + request.meta[b'state'] = States.QUEUED + self.batch.execute() def count(self): - count = self.queue_model.objects.filter().count() - return count + return self.queue_model.all().count() class BroadCrawlingQueue(Queue): @@ -265,12 +224,11 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): while tries < self.GET_RETRIES: tries += 1 limit *= 5.5 if tries > 1 else 1.0 - self.logger.debug("Try %d, limit %d, last attempt: requests %d, hosts %d" % - (tries, limit, count, len(queue.keys()))) + self.logger.debug("Try %d, limit %d, last attempt: requests %d, hosts %d", + tries, limit, count, len(queue.keys())) queue.clear() count = 0 - for item in self.queue_model.objects.filter(crawl=self.crawl_id, partition_id=partition_id).\ - order_by("crawl", "score", self._order_by()).limit(limit): + for item in self._order_by(self.queue_model.filter(partition_id=partition_id)).limit(max_n_requests): if item.host_crc32 not in queue: queue[item.host_crc32] = [] if max_requests_per_host is not None and len(queue[item.host_crc32]) > max_requests_per_host: @@ -284,13 +242,14 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): if min_requests is not None and count < min_requests: continue break - self.logger.debug("Finished: tries %d, hosts %d, requests %d" % (tries, len(queue.keys()), count)) + self.logger.debug("Finished: tries %d, hosts %d, requests %d", tries, len(queue.keys()), count) results = [] - for items in queue.itervalues(): + for items in six.itervalues(queue): for item in items: - method = 'GET' if not item.method else item.method - results.append(Request(item.url, method=method, meta=item.meta, headers=item.headers, - cookies=item.cookies)) - item.delete() + method = item.method or b'GET' + results.append(Request(item.url, method=method, + meta=item.meta, headers=item.headers, cookies=item.cookies)) + item.batch(self.batch).delete() + self.batch.execute() return results diff --git a/frontera/contrib/backends/cassandra/models.py b/frontera/contrib/backends/cassandra/models.py index 0ec41d9cd..306f3b415 100644 --- a/frontera/contrib/backends/cassandra/models.py +++ b/frontera/contrib/backends/cassandra/models.py @@ -73,9 +73,10 @@ def __repr__(self): class QueueModel(Model): __table_name__ = 'queue' - id = UUID(primary_key=True) partition_id = Integer(primary_key=True) - score = Float(required=True) + score = Float(primary_key=True) + created_at = BigInt(primary_key=True) + id = UUID(primary_key=True) url = Text(required=True) fingerprint = Text(required=True) host_crc32 = Integer(required=True) @@ -83,7 +84,6 @@ class QueueModel(Model): headers = PickleDict() cookies = PickleDict() method = Text() - created_at = BigInt(required=True) depth = SmallInt() def __repr__(self): diff --git a/tests/contrib/backends/cassandra/test_backend_cassandra.py b/tests/contrib/backends/cassandra/test_backend_cassandra.py index 35d9a1c63..3bc107fcd 100644 --- a/tests/contrib/backends/cassandra/test_backend_cassandra.py +++ b/tests/contrib/backends/cassandra/test_backend_cassandra.py @@ -33,12 +33,13 @@ def setUp(self): self.manager = type('manager', (object,), {}) self.manager.settings = settings self.keyspace = settings.CASSANDRABACKEND_KEYSPACE + timeout = settings.CASSANDRABACKEND_REQUEST_TIMEOUT cluster = Cluster(hosts, port) self.session = 
cluster.connect()
         self.session.execute("CREATE KEYSPACE IF NOT EXISTS %s WITH "
-                             "replication = {'class':'SimpleStrategy', 'replication_factor' : 1}" % self.keyspace)
+                             "replication = {'class':'SimpleStrategy', 'replication_factor' : 1}" % self.keyspace,
+                             timeout=timeout)
         self.session.set_keyspace(self.keyspace)
-        timeout = settings.CASSANDRABACKEND_REQUEST_TIMEOUT
         connection.setup(hosts, self.keyspace, port=port)
         self.session.default_timeout = connection.session.default_timeout = timeout
@@ -110,7 +111,7 @@ def assert_db_values(self, model, _filter, fields):
         sync_table(model)
         m = model(**fields)
         m.save()
-        stored_obj = m.get(**_filter)
+        stored_obj = m.objects.allow_filtering().get(**_filter)
         for field, original_value in six.iteritems(fields):
             stored_value = getattr(stored_obj, field)
             if isinstance(original_value, dict):
@@ -198,3 +199,21 @@ def test_state(self):
         self.assertEqual(r4.meta[b'state'], States.CRAWLED)
         state.flush(True)
         self.assertEqual(state._cache, {})
+
+    def test_queue(self):
+        self.manager.settings.SPIDER_FEED_PARTITIONS = 2
+        b = CassandraBackend(self.manager)
+        queue = b.queue
+        batch = [('10', 0.5, r1, True), ('11', 0.6, r2, True),
+                 ('12', 0.7, r3, True)]
+        queue.schedule(batch)
+        self.assertEqual(set([r.url for r in queue.get_next_requests(10, 0,
+                                                                     min_requests=3,
+                                                                     min_hosts=1,
+                                                                     max_requests_per_host=10)]),
+                         set([r3.url]))
+        self.assertEqual(set([r.url for r in queue.get_next_requests(10, 1,
+                                                                     min_requests=3,
+                                                                     min_hosts=1,
+                                                                     max_requests_per_host=10)]),
+                         set([r1.url, r2.url]))

From 59c58a4a9f28827def9a8b9114c435e5ee603c42 Mon Sep 17 00:00:00 2001
From: voith
Date: Sun, 6 Nov 2016 12:38:53 -0500
Subject: [PATCH 09/14] Added LIFO, FIFO, DFS, BFS backends and corresponding
 tests for them

---
 frontera/contrib/backends/__init__.py         |  9 ---
 .../contrib/backends/cassandra/__init__.py    | 73 ++++++++++++-----
 .../contrib/backends/cassandra/components.py  | 49 ++++---------
 frontera/contrib/backends/cassandra/models.py | 24 ++++++
 .../contrib/backends/cassandra/revisiting.py  | 22 +-----
 .../contrib/backends/sqlalchemy/__init__.py   |  5 +-
 frontera/settings/default_settings.py         |  3 +-
 setup.py                                      |  3 +-
 .../cassandra/test_backend_cassandra.py       | 58 ++++++++++++---
 9 files changed, 145 insertions(+), 101 deletions(-)

diff --git a/frontera/contrib/backends/__init__.py b/frontera/contrib/backends/__init__.py
index da9e34bfe..142fe8e8d 100644
--- a/frontera/contrib/backends/__init__.py
+++ b/frontera/contrib/backends/__init__.py
@@ -92,15 +92,6 @@ def finished(self):
 
 class CommonStorageBackend(CommonBackend):
 
-    def _create_queue(self, settings):
-        if not issubclass(self.queue_component, BaseQueue):
-            raise TypeError('expected queue_component to '
-                            'belong to class: %s, got %s instead' % (type(BaseQueue).__name__,
-                                                                     type(self.queue_component).__name__))
-        return self.queue_component(self.session,
-                                    self.models['QueueModel'],
-                                    settings.get('SPIDER_FEED_PARTITIONS'))
-
     @property
     def queue(self):
         return self._queue
diff --git a/frontera/contrib/backends/cassandra/__init__.py b/frontera/contrib/backends/cassandra/__init__.py
index 75ac2d31d..0a907a033 100644
--- a/frontera/contrib/backends/cassandra/__init__.py
+++ b/frontera/contrib/backends/cassandra/__init__.py
@@ -14,8 +14,6 @@
 
 class CassandraBackend(CommonStorageBackend):
 
-    queue_component = Queue
-
     def __init__(self, manager):
         self.manager = manager
         settings = manager.settings
@@ -28,35 +26,70 @@ def __init__(self, manager):
         self.models = dict([(name, load_object(cls)) for name, cls in six.iteritems(models)])
 
         cluster_kwargs = {
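            # NOTE (editorial annotation, not part of the original patch):
            # these kwargs are handed both to cassandra.cluster.Cluster and,
            # via connection.setup(), to cqlengine below; the
            # `if not connection.cluster` guard introduced in this hunk keeps
            # repeated backend instantiations (e.g. several spiders in one
            # process) from registering a second default cqlengine connection.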
'port': cluster_port, - 'compression': True + 'compression': True, } - self.cluster = Cluster(contact_points=cluster_hosts, **cluster_kwargs) - self.session = self.cluster.connect(keyspace) - connection.setup(cluster_hosts, keyspace, **cluster_kwargs) - self.session.default_timeout = connection.session.default_timeout = \ - settings.get('CASSANDRABACKEND_REQUEST_TIMEOUT') + if not connection.cluster: + connection.setup(cluster_hosts, keyspace, **cluster_kwargs) + connection.session.default_timeout = settings.get('CASSANDRABACKEND_REQUEST_TIMEOUT') if drop_all_tables: for name, table in six.iteritems(self.models): drop_table(table) - for name, table in six.iteritems(self.models): - sync_table(table) - - self._metadata = Metadata(self.session, - self.models['MetadataModel'], - settings.get('CASSANDRABACKEND_CACHE_SIZE')) - self._states = States(self.session, - self.models['StateModel'], - settings.get('STATE_CACHE_SIZE_LIMIT')) + self._metadata = Metadata(self.models['MetadataModel'], settings.get('CASSANDRABACKEND_CACHE_SIZE')) + self._states = States(self.models['StateModel'], settings.get('STATE_CACHE_SIZE_LIMIT')) self._queue = self._create_queue(settings) def frontier_stop(self): self.states.flush() - self.session.shutdown() + + def _create_queue(self, settings): + return Queue(self.models['QueueModel'], settings.get('SPIDER_FEED_PARTITIONS')) + + +class FIFOBackend(CassandraBackend): + component_name = 'Cassandra FIFO Backend' + + def _create_queue(self, settings): + return Queue(self.models['FifoOrLIfoQueueModel'], + settings.get('SPIDER_FEED_PARTITIONS'), + ordering='created') + + +class LIFOBackend(CassandraBackend): + component_name = 'Cassandra LIFO Backend' + + def _create_queue(self, settings): + return Queue(self.models['FifoOrLIfoQueueModel'], + settings.get('SPIDER_FEED_PARTITIONS'), + ordering='created_desc') + + +class DFSBackend(CassandraBackend): + component_name = 'Cassandra DFS Backend' + + def _create_queue(self, settings): + return Queue(self.models['QueueModel'], settings.get('SPIDER_FEED_PARTITIONS')) + + def _get_score(self, obj): + return -obj.meta[b'depth'] + + +class BFSBackend(CassandraBackend): + component_name = 'Cassandra BFS Backend' + + def _create_queue(self, settings): + return Queue(self.models['QueueModel'], settings.get('SPIDER_FEED_PARTITIONS')) + + def _get_score(self, obj): + return obj.meta[b'depth'] BASE = CassandraBackend +LIFO = LIFOBackend +FIFO = FIFOBackend +DFS = DFSBackend +BFS = BFSBackend class Distributed(CommonDistributedStorageBackend): @@ -111,6 +144,6 @@ def db_worker(cls, manager): sync_table(metadata_m) sync_table(queue_m) - b._metadata = Metadata(b.session, metadata_m) - b._queue = Queue(b.session, queue_m, settings.get('SPIDER_FEED_PARTITIONS')) + b._metadata = Metadata(metadata_m) + b._queue = Queue(queue_m, settings.get('SPIDER_FEED_PARTITIONS')) return b diff --git a/frontera/contrib/backends/cassandra/components.py b/frontera/contrib/backends/cassandra/components.py index c2adbcedc..2c47d3a22 100644 --- a/frontera/contrib/backends/cassandra/components.py +++ b/frontera/contrib/backends/cassandra/components.py @@ -1,15 +1,13 @@ # -*- coding: utf-8 -*- import logging -import six -import sys -import traceback import uuid from time import time +import six from cachetools import LRUCache -from cassandra import (OperationTimedOut, ReadFailure, ReadTimeout, - WriteFailure, WriteTimeout) +from cassandra.cqlengine.management import sync_table from cassandra.cqlengine.query import BatchQuery +from w3lib.util import to_bytes, 
to_native_str from frontera.contrib.backends import CreateOrModifyPageMixin from frontera.contrib.backends.memory import MemoryStates @@ -20,36 +18,15 @@ from frontera.utils.misc import chunks, get_crc32 from frontera.utils.url import parse_domain_from_url_fast -from w3lib.util import to_bytes, to_native_str - - -def _retry(func): - def func_wrapper(self, *args, **kwargs): - tries = 5 - count = 0 - while count < tries: - try: - return func(self, *args, **kwargs) - except (OperationTimedOut, ReadTimeout, ReadFailure, WriteTimeout, WriteFailure) as exc: - ex_type, ex, tb = sys.exc_info() - tries += 1 - self.logger.warn("{0}: {1} Backtrace: {2}".format(ex_type.__name__, ex, traceback.extract_tb(tb))) - del tb - self.logger.info("Tries left %i" % tries - count) - - raise exc - - return func_wrapper - class Metadata(BaseMetadata, CreateOrModifyPageMixin): - def __init__(self, session, model_cls, cache_size): - self.session = session + def __init__(self, model_cls, cache_size): self.model = model_cls self.cache = LRUCache(cache_size) self.batch = BatchQuery() self.logger = logging.getLogger("frontera.contrib.backends.cassandra.components.Metadata") + sync_table(model_cls) def frontier_stop(self): pass @@ -80,7 +57,9 @@ def links_extracted(self, request, links): self.batch.execute() def update_score(self, batch): - for fprint, (score, url, schedule) in six.iteritems(batch): + if isinstance(batch, dict): + batch = [(fprint, score, url, schedule) for fprint, (score, url, schedule) in six.iteritems(batch)] + for fprint, score, url, schedule in batch: page = self.cache[fprint] page.fingerprint = to_native_str(fprint) page.score = score @@ -93,12 +72,12 @@ def _add_to_batch_and_update_cache(self, page): class States(MemoryStates): - def __init__(self, session, model_cls, cache_size_limit): + def __init__(self, model_cls, cache_size_limit): super(States, self).__init__(cache_size_limit) - self.session = session self.model = model_cls self.batch = BatchQuery() self.logger = logging.getLogger("frontera.contrib.backends.cassandra.components.States") + sync_table(model_cls) def frontier_stop(self): self.flush() @@ -106,7 +85,7 @@ def frontier_stop(self): def fetch(self, fingerprints): to_fetch = [to_native_str(f) for f in fingerprints if f not in self._cache] self.logger.debug("cache size %s", len(self._cache)) - self.logger.debug("to fetch %d from %d", (len(to_fetch), len(fingerprints))) + self.logger.debug("to fetch %d from %d", len(to_fetch), len(fingerprints)) for chunk in chunks(to_fetch, 128): for state in self.model.objects.filter(fingerprint__in=chunk): @@ -123,14 +102,14 @@ def flush(self, force_clear=False): class Queue(BaseQueue): - def __init__(self, session, queue_cls, partitions, ordering='default'): - self.session = session + def __init__(self, queue_cls, partitions, ordering='default'): self.queue_model = queue_cls self.logger = logging.getLogger("frontera.contrib.backends.cassandra.components.Queue") self.partitions = [i for i in range(0, partitions)] self.partitioner = Crc32NamePartitioner(self.partitions) self.ordering = ordering self.batch = BatchQuery() + sync_table(queue_cls) def frontier_stop(self): pass @@ -228,7 +207,7 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): tries, limit, count, len(queue.keys())) queue.clear() count = 0 - for item in self._order_by(self.queue_model.filter(partition_id=partition_id)).limit(max_n_requests): + for item in self._order_by(self.queue_model.filter(partition_id=partition_id).allow_filtering()).limit(max_n_requests): if 
item.host_crc32 not in queue: queue[item.host_crc32] = [] if max_requests_per_host is not None and len(queue[item.host_crc32]) > max_requests_per_host: diff --git a/frontera/contrib/backends/cassandra/models.py b/frontera/contrib/backends/cassandra/models.py index 306f3b415..15ed73513 100644 --- a/frontera/contrib/backends/cassandra/models.py +++ b/frontera/contrib/backends/cassandra/models.py @@ -88,3 +88,27 @@ class QueueModel(Model): def __repr__(self): return '' % (self.url, self.id) + + +class FifoOrLIfoQueueModel(Model): + # Separate models are needed as + # order_by is supported on columns + # only in the order, the clustering + # keys were created + + # Also Inheriting model has some runtime issues + # mostly a bug in the driver + # Hence the duplicate code + + partition_id = Integer(primary_key=True) + score = Float(required=True) + created_at = BigInt(primary_key=True) + id = UUID(primary_key=True) + url = Text(required=True) + fingerprint = Text(required=True) + host_crc32 = Integer(required=True) + meta = PickleDict() + headers = PickleDict() + cookies = PickleDict() + method = Text() + depth = SmallInt() diff --git a/frontera/contrib/backends/cassandra/revisiting.py b/frontera/contrib/backends/cassandra/revisiting.py index f6be289d3..f69cb5d9f 100644 --- a/frontera/contrib/backends/cassandra/revisiting.py +++ b/frontera/contrib/backends/cassandra/revisiting.py @@ -2,7 +2,7 @@ import json import logging from datetime import datetime, timedelta -from time import sleep, time +from time import time from cassandra.cqlengine import columns from cassandra.cqlengine.models import Model @@ -22,24 +22,6 @@ class RevisitingQueueModel(Model): crawl_at = columns.DateTime(required=True, default=datetime.now(), index=True) -def retry_and_rollback(func): - def func_wrapper(self, *args, **kwargs): - tries = 5 - while True: - try: - return func(self, *args, **kwargs) - except Exception as exc: - self.logger.exception(exc) - sleep(5) - tries -= 1 - if tries > 0: - self.logger.info("Tries left %i" % tries) - continue - else: - raise exc - return func_wrapper - - class RevisitingQueue(BaseQueue): def __init__(self, session, queue_cls, partitions): self.session = session() @@ -64,7 +46,6 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): self.logger.exception(exc) return results - @retry_and_rollback def schedule(self, batch): for fprint, score, request, schedule_at in batch: if schedule_at: @@ -107,7 +88,6 @@ def _create_queue(self, obj, fingerprint, score, partition_id, host_crc32, creat return db_queue - @retry_and_rollback def count(self): return self.session.query(self.queue_model).count() diff --git a/frontera/contrib/backends/sqlalchemy/__init__.py b/frontera/contrib/backends/sqlalchemy/__init__.py index 9ca92bc04..dcff0ca1d 100644 --- a/frontera/contrib/backends/sqlalchemy/__init__.py +++ b/frontera/contrib/backends/sqlalchemy/__init__.py @@ -12,8 +12,6 @@ class SQLAlchemyBackend(CommonStorageBackend): - queue_component = Queue - def __init__(self, manager): self.manager = manager settings = manager.settings @@ -48,6 +46,9 @@ def frontier_stop(self): super(SQLAlchemyBackend, self).frontier_stop() self.engine.dispose() + def _create_queue(self, settings): + return Queue(self.session, self.models['QueueModel'], settings.get('SPIDER_FEED_PARTITIONS')) + class FIFOBackend(SQLAlchemyBackend): component_name = 'SQLAlchemy FIFO Backend' diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index 46596bb2e..408db4708 100644 --- 
a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -14,7 +14,8 @@ CASSANDRABACKEND_MODELS = { 'MetadataModel': 'frontera.contrib.backends.cassandra.models.MetadataModel', 'StateModel': 'frontera.contrib.backends.cassandra.models.StateModel', - 'QueueModel': 'frontera.contrib.backends.cassandra.models.QueueModel' + 'QueueModel': 'frontera.contrib.backends.cassandra.models.QueueModel', + 'FifoOrLIfoQueueModel': 'frontera.contrib.backends.cassandra.models.FifoOrLIfoQueueModel', } CASSANDRABACKEND_REVISIT_INTERVAL = timedelta(days=1) CASSANDRABACKEND_CLUSTER_HOSTS = ['127.0.0.1'] diff --git a/setup.py b/setup.py index d47a9dfb6..23f5a7698 100644 --- a/setup.py +++ b/setup.py @@ -73,7 +73,8 @@ 'Twisted' ], 'cassandra': [ - 'cassandra-driver==3.7.0' + 'cassandra-driver==3.7.0', + 'cachetools' ] }, tests_require=[ diff --git a/tests/contrib/backends/cassandra/test_backend_cassandra.py b/tests/contrib/backends/cassandra/test_backend_cassandra.py index 3bc107fcd..90f131001 100644 --- a/tests/contrib/backends/cassandra/test_backend_cassandra.py +++ b/tests/contrib/backends/cassandra/test_backend_cassandra.py @@ -6,14 +6,19 @@ import six from cassandra.cluster import Cluster from cassandra.cqlengine import connection -from cassandra.cqlengine.management import drop_keyspace, sync_table +from cassandra.cqlengine.management import (create_keyspace_simple, + drop_keyspace, drop_table, + sync_table) from frontera.contrib.backends.cassandra import CassandraBackend -from frontera.contrib.backends.cassandra.models import (MetadataModel, +from frontera.contrib.backends.cassandra.models import (FifoOrLIfoQueueModel, + MetadataModel, QueueModel, StateModel) -from frontera.core.models import Request, Response from frontera.core.components import States +from frontera.core.models import Request, Response from frontera.settings import Settings +from tests import backends + r1 = Request('https://www.example.com', meta={b'fingerprint': b'10', b'domain': {b'name': b'www.example.com', b'fingerprint': b'81'}}) @@ -24,7 +29,7 @@ r4 = r3.copy() -class BaseCassandraTest(unittest.TestCase): +class BaseCassandraTest(object): def setUp(self): settings = Settings() @@ -36,19 +41,20 @@ def setUp(self): timeout = settings.CASSANDRABACKEND_REQUEST_TIMEOUT cluster = Cluster(hosts, port) self.session = cluster.connect() - self.session.execute("CREATE KEYSPACE IF NOT EXISTS %s WITH " - "replication = {'class':'SimpleStrategy', 'replication_factor' : 1}" % self.keyspace, - timeout=timeout) + if not connection.cluster: + connection.setup(hosts, self.keyspace, port=port) + connection.session.default_timeout = timeout + create_keyspace_simple(self.keyspace, 1) self.session.set_keyspace(self.keyspace) - connection.setup(hosts, self.keyspace, port=port) - self.session.default_timeout = connection.session.default_timeout = timeout + self.session.default_timeout = timeout + connection.session.set_keyspace(self.keyspace) def tearDown(self): drop_keyspace(self.keyspace) self.session.shutdown() -class TestCassandraBackendModels(BaseCassandraTest): +class TestCassandraBackendModels(BaseCassandraTest, unittest.TestCase): def test_pickled_fields(self): sync_table(MetadataModel) @@ -105,7 +111,9 @@ def test_queue_model(self): 'created_at': int(time()*1E+6), 'depth': 0, } - self.assert_db_values(QueueModel, {'id': fields['id']}, fields) + for model in [FifoOrLIfoQueueModel, QueueModel]: + self.assert_db_values(model, {'id': fields['id']}, fields) + drop_table(model) def assert_db_values(self, model, _filter, 
fields): sync_table(model) @@ -124,7 +132,7 @@ def assert_db_values(self, model, _filter, fields): self.assertEqual(stored_value, original_value) -class TestCassandraBackend(BaseCassandraTest): +class TestCassandraBackend(BaseCassandraTest, unittest.TestCase): def _get_tables(self): query = self.session.prepare('SELECT table_name FROM system_schema.tables WHERE keyspace_name = ?') @@ -217,3 +225,29 @@ def test_queue(self): min_hosts=1, max_requests_per_host=10)]), set([r1.url, r2.url])) + + +class BaseCassandraIntegrationTests(object): + obj = BaseCassandraTest() + + def setup_backend(self, method): + self.obj.setUp() + + def teardown_backend(self, method): + self.obj.tearDown() + + +class TestCassandraFIFOBackend(BaseCassandraIntegrationTests, backends.FIFOBackendTest): + backend_class = 'frontera.contrib.backends.cassandra.FIFO' + + +class TestCassandraLIFOBackend(BaseCassandraIntegrationTests, backends.LIFOBackendTest): + backend_class = 'frontera.contrib.backends.cassandra.LIFO' + + +class TestCassandraDFSBackend(BaseCassandraIntegrationTests, backends.DFSBackendTest): + backend_class = 'frontera.contrib.backends.cassandra.DFS' + + +class TestCassandraBFSBackend(BaseCassandraIntegrationTests, backends.BFSBackendTest): + backend_class = 'frontera.contrib.backends.cassandra.BFS' From db9a2acfbcf35c0685112776f8e03687d5c8113f Mon Sep 17 00:00:00 2001 From: voith Date: Mon, 7 Nov 2016 12:38:53 -0500 Subject: [PATCH 10/14] fixed connection issue in tests --- .../contrib/backends/cassandra/__init__.py | 1 + .../cassandra/test_backend_cassandra.py | 21 ++++++++++++------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/frontera/contrib/backends/cassandra/__init__.py b/frontera/contrib/backends/cassandra/__init__.py index 0a907a033..42235fa06 100644 --- a/frontera/contrib/backends/cassandra/__init__.py +++ b/frontera/contrib/backends/cassandra/__init__.py @@ -42,6 +42,7 @@ def __init__(self, manager): def frontier_stop(self): self.states.flush() + connection.unregister_connection('default') def _create_queue(self, settings): return Queue(self.models['QueueModel'], settings.get('SPIDER_FEED_PARTITIONS')) diff --git a/tests/contrib/backends/cassandra/test_backend_cassandra.py b/tests/contrib/backends/cassandra/test_backend_cassandra.py index 90f131001..c8ff68b50 100644 --- a/tests/contrib/backends/cassandra/test_backend_cassandra.py +++ b/tests/contrib/backends/cassandra/test_backend_cassandra.py @@ -33,25 +33,30 @@ class BaseCassandraTest(object): def setUp(self): settings = Settings() - hosts = ['127.0.0.1'] - port = 9042 + self.hosts = ['127.0.0.1'] + self.port = 9042 self.manager = type('manager', (object,), {}) self.manager.settings = settings self.keyspace = settings.CASSANDRABACKEND_KEYSPACE - timeout = settings.CASSANDRABACKEND_REQUEST_TIMEOUT - cluster = Cluster(hosts, port) + self.timeout = settings.CASSANDRABACKEND_REQUEST_TIMEOUT + cluster = Cluster(self.hosts, self.port) self.session = cluster.connect() - if not connection.cluster: - connection.setup(hosts, self.keyspace, port=port) - connection.session.default_timeout = timeout + self._set_global_connection(self.hosts, self.port, self.timeout) create_keyspace_simple(self.keyspace, 1) self.session.set_keyspace(self.keyspace) - self.session.default_timeout = timeout + self.session.default_timeout = self.timeout connection.session.set_keyspace(self.keyspace) def tearDown(self): + self._set_global_connection(self.hosts, self.port, self.timeout) drop_keyspace(self.keyspace) self.session.shutdown() + 
connection.unregister_connection('default') + + def _set_global_connection(self, hosts, port, timeout): + if not connection.cluster: + connection.setup(hosts, self.keyspace, port=port) + connection.session.default_timeout = timeout class TestCassandraBackendModels(BaseCassandraTest, unittest.TestCase): From 93a61ce3f306a67853297c2df71462c3b4f8bb3c Mon Sep 17 00:00:00 2001 From: voith Date: Tue, 8 Nov 2016 12:38:53 -0500 Subject: [PATCH 11/14] Added cassandra revisiting backend and tests for it --- frontera/contrib/backends/__init__.py | 24 ++++ .../contrib/backends/cassandra/__init__.py | 54 +++----- .../contrib/backends/cassandra/components.py | 13 +- frontera/contrib/backends/cassandra/models.py | 47 ++++--- .../contrib/backends/cassandra/revisiting.py | 128 ++++++++---------- .../contrib/backends/sqlalchemy/revisiting.py | 33 +---- frontera/settings/default_settings.py | 1 + frontera/utils/misc.py | 9 +- .../cassandra/test_backend_cassandra.py | 12 +- 9 files changed, 158 insertions(+), 163 deletions(-) diff --git a/frontera/contrib/backends/__init__.py b/frontera/contrib/backends/__init__.py index 142fe8e8d..f52b6aee2 100644 --- a/frontera/contrib/backends/__init__.py +++ b/frontera/contrib/backends/__init__.py @@ -6,6 +6,7 @@ from frontera import Backend from frontera.core.components import States, Queue as BaseQueue, DistributedBackend from frontera.core.models import Request, Response +from frontera.utils.misc import utcnow_timestamp from w3lib.util import to_native_str @@ -182,3 +183,26 @@ def _modify_page(self, obj): db_page.cookies = obj.request.cookies db_page.status_code = obj.status_code return db_page + + +class CommonRevisitingStorageBackendMixin(object): + + def _schedule(self, requests): + batch = [] + for request in requests: + if request.meta[b'state'] in [States.NOT_CRAWLED]: + request.meta[b'crawl_at'] = utcnow_timestamp() + elif request.meta[b'state'] in [States.CRAWLED, States.ERROR]: + request.meta[b'crawl_at'] = utcnow_timestamp() + self.interval + else: + continue # QUEUED + batch.append((request.meta[b'fingerprint'], self._get_score(request), request, True)) + self.queue.schedule(batch) + self.metadata.update_score(batch) + self.queue_size += len(batch) + + def page_crawled(self, response): + super(CommonRevisitingStorageBackendMixin, self).page_crawled(response) + self.states.set_states(response.request) + self._schedule([response.request]) + self.states.update_cache(response.request) diff --git a/frontera/contrib/backends/cassandra/__init__.py b/frontera/contrib/backends/cassandra/__init__.py index 42235fa06..7f59bdf60 100644 --- a/frontera/contrib/backends/cassandra/__init__.py +++ b/frontera/contrib/backends/cassandra/__init__.py @@ -1,14 +1,14 @@ from __future__ import absolute_import import six -from cassandra.cluster import Cluster from cassandra.cqlengine import connection -from cassandra.cqlengine.management import drop_table, sync_table +from cassandra.cqlengine.management import drop_table from frontera.contrib.backends import (CommonDistributedStorageBackend, CommonStorageBackend) -from frontera.contrib.backends.cassandra.components import (Metadata, Queue, - States) +from frontera.contrib.backends.cassandra.components import (Metadata, + BroadCrawlingQueue, + Queue, States) from frontera.utils.misc import load_object @@ -99,17 +99,22 @@ def __init__(self, manager): settings = manager.settings cluster_hosts = settings.get('CASSANDRABACKEND_CLUSTER_HOSTS') cluster_port = settings.get('CASSANDRABACKEND_CLUSTER_PORT') - keyspace = 
settings.get('CASSANDRABACKEND_KEYSPACE') + drop_all_tables = settings.get('CASSANDRABACKEND_DROP_ALL_TABLES') models = settings.get('CASSANDRABACKEND_MODELS') + keyspace = settings.get('CASSANDRABACKEND_KEYSPACE') + + self.models = dict([(name, load_object(cls)) for name, cls in six.iteritems(models)]) cluster_kwargs = { 'port': cluster_port, - 'compression': True + 'compression': True, } - self.cluster = Cluster(cluster_hosts, **cluster_kwargs) - self.models = dict([(name, load_object(cls)) for name, cls in six.iteritems(models)]) + if not connection.cluster: + connection.setup(cluster_hosts, keyspace, **cluster_kwargs) + connection.session.default_timeout = settings.get('CASSANDRABACKEND_REQUEST_TIMEOUT') - self.session.set_keyspace(keyspace) - connection.set_session(self.session) + if drop_all_tables: + for name, table in six.iteritems(self.models): + drop_table(table) self._metadata = None self._queue = None @@ -119,32 +124,17 @@ def __init__(self, manager): def strategy_worker(cls, manager): b = cls(manager) settings = manager.settings - drop_all_tables = settings.get('CASSANDRABACKEND_DROP_ALL_TABLES') - model = b.models['StateModel'] - - if drop_all_tables: - drop_table(model) - - sync_table(model) - - b._states = States(b.session, model, settings.get('STATE_CACHE_SIZE_LIMIT')) + b._states = States(b.models['StateModel'], settings.get('STATE_CACHE_SIZE_LIMIT')) return b @classmethod def db_worker(cls, manager): b = cls(manager) settings = manager.settings - drop = settings.get('CASSANDRABACKEND_DROP_ALL_TABLES') - metadata_m = b.models['MetadataModel'] - queue_m = b.models['QueueModel'] - - if drop: - drop_table(metadata_m) - drop_table(queue_m) - - sync_table(metadata_m) - sync_table(queue_m) - - b._metadata = Metadata(metadata_m) - b._queue = Queue(queue_m, settings.get('SPIDER_FEED_PARTITIONS')) + b._metadata = Metadata(b.models['MetadataModel'], settings.get('CASSANDRABACKEND_CACHE_SIZE')) + b._queue = BroadCrawlingQueue(b.models['QueueModel'], settings.get('SPIDER_FEED_PARTITIONS')) return b + + def frontier_stop(self): + super(Distributed, self).frontier_stop() + connection.unregister_connection('default') diff --git a/frontera/contrib/backends/cassandra/components.py b/frontera/contrib/backends/cassandra/components.py index 2c47d3a22..27f42d4b3 100644 --- a/frontera/contrib/backends/cassandra/components.py +++ b/frontera/contrib/backends/cassandra/components.py @@ -132,7 +132,8 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): """ results = [] try: - for item in self._order_by(self.queue_model.filter(partition_id=partition_id).allow_filtering()).limit(max_n_requests): + for item in self._order_by(self.queue_model.filter(partition_id=partition_id). + allow_filtering()).limit(max_n_requests): method = item.method or b'GET' r = Request(item.url, method=method, meta=item.meta, headers=item.headers, cookies=item.cookies) r.meta[b'fingerprint'] = to_bytes(item.fingerprint) @@ -207,7 +208,8 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): tries, limit, count, len(queue.keys())) queue.clear() count = 0 - for item in self._order_by(self.queue_model.filter(partition_id=partition_id).allow_filtering()).limit(max_n_requests): + for item in self._order_by(self.queue_model.filter(partition_id=partition_id). 
+ allow_filtering()).limit(max_n_requests): if item.host_crc32 not in queue: queue[item.host_crc32] = [] if max_requests_per_host is not None and len(queue[item.host_crc32]) > max_requests_per_host: @@ -227,8 +229,11 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): for items in six.itervalues(queue): for item in items: method = item.method or b'GET' - results.append(Request(item.url, method=method, - meta=item.meta, headers=item.headers, cookies=item.cookies)) + results.append(Request(item.url, + method=method, + meta=item.meta, + headers=item.headers, + cookies=item.cookies)) item.batch(self.batch).delete() self.batch.execute() return results diff --git a/frontera/contrib/backends/cassandra/models.py b/frontera/contrib/backends/cassandra/models.py index 15ed73513..af20fdcbd 100644 --- a/frontera/contrib/backends/cassandra/models.py +++ b/frontera/contrib/backends/cassandra/models.py @@ -70,13 +70,9 @@ def __repr__(self): return '' % (self.fingerprint, self.state) -class QueueModel(Model): - __table_name__ = 'queue' +class BaseQueueModel(Model): + __abstract__ = True - partition_id = Integer(primary_key=True) - score = Float(primary_key=True) - created_at = BigInt(primary_key=True) - id = UUID(primary_key=True) url = Text(required=True) fingerprint = Text(required=True) host_crc32 = Integer(required=True) @@ -90,25 +86,32 @@ def __repr__(self): return '' % (self.url, self.id) -class FifoOrLIfoQueueModel(Model): - # Separate models are needed as - # order_by is supported on columns - # only in the order, the clustering - # keys were created +class QueueModel(BaseQueueModel): + __abstract__ = False + __table_name__ = 'queue' - # Also Inheriting model has some runtime issues - # mostly a bug in the driver - # Hence the duplicate code + partition_id = Integer(primary_key=True) + score = Float(primary_key=True) + created_at = BigInt(primary_key=True) + id = UUID(primary_key=True) + + +class FifoOrLIfoQueueModel(BaseQueueModel): + __abstract__ = False + __table_name__ = 'fifo_lifo_queue' partition_id = Integer(primary_key=True) score = Float(required=True) created_at = BigInt(primary_key=True) id = UUID(primary_key=True) - url = Text(required=True) - fingerprint = Text(required=True) - host_crc32 = Integer(required=True) - meta = PickleDict() - headers = PickleDict() - cookies = PickleDict() - method = Text() - depth = SmallInt() + + +class RevisitingQueueModel(BaseQueueModel): + __abstract__ = False + __table_name__ = 'revisiting_queue' + + partition_id = Integer(primary_key=True) + crawl_at = BigInt(primary_key=True) + id = UUID(primary_key=True) + score = Float(required=True) + created_at = BigInt(required=True) diff --git a/frontera/contrib/backends/cassandra/revisiting.py b/frontera/contrib/backends/cassandra/revisiting.py index f69cb5d9f..ef02d0a73 100644 --- a/frontera/contrib/backends/cassandra/revisiting.py +++ b/frontera/contrib/backends/cassandra/revisiting.py @@ -1,34 +1,32 @@ # -*- coding: utf-8 -*- -import json import logging -from datetime import datetime, timedelta +import uuid +from datetime import timedelta from time import time -from cassandra.cqlengine import columns -from cassandra.cqlengine.models import Model +from cassandra.cqlengine.management import sync_table +from cassandra.cqlengine.query import BatchQuery +from w3lib.util import to_native_str from frontera import Request +from frontera.contrib.backends import CommonRevisitingStorageBackendMixin from frontera.contrib.backends.cassandra import CassandraBackend +from 
frontera.contrib.backends.cassandra.models import RevisitingQueueModel from frontera.contrib.backends.partitioners import Crc32NamePartitioner from frontera.core.components import Queue as BaseQueue from frontera.core.components import States -from frontera.utils.misc import get_crc32 +from frontera.utils.misc import get_crc32, utcnow_timestamp from frontera.utils.url import parse_domain_from_url_fast -class RevisitingQueueModel(Model): - __table_name__ = 'revisiting_queue' - - crawl_at = columns.DateTime(required=True, default=datetime.now(), index=True) - - class RevisitingQueue(BaseQueue): - def __init__(self, session, queue_cls, partitions): - self.session = session() + def __init__(self, queue_cls, partitions): self.queue_model = queue_cls - self.logger = logging.getLogger("frontera.contrib.backends.sqlalchemy.revisiting.RevisitingQueue") + self.logger = logging.getLogger("frontera.contrib.backends.cassandra.revisiting.RevisitingQueue") self.partitions = [i for i in range(0, partitions)] self.partitioner = Crc32NamePartitioner(self.partitions) + self.batch = BatchQuery() + sync_table(queue_cls) def frontier_stop(self): pass @@ -36,19 +34,20 @@ def frontier_stop(self): def get_next_requests(self, max_n_requests, partition_id, **kwargs): results = [] try: - for item in self.queue_model.objects.filter(crawl_at=datetime.utcnow(), partition_id=partition_id).\ - limit(max_n_requests): + for item in self.queue_model.objects.filter(partition_id=partition_id, + crawl_at__lte=utcnow_timestamp()).limit(max_n_requests): method = 'GET' if not item.method else item.method results.append(Request(item.url, method=method, meta=item.meta, headers=item.headers, cookies=item.cookies)) - item.delete() + item.batch(self.batch).delete() + self.batch.execute() except Exception as exc: self.logger.exception(exc) return results def schedule(self, batch): - for fprint, score, request, schedule_at in batch: - if schedule_at: + for fprint, score, request, schedule in batch: + if schedule: _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url) if not hostname: self.logger.error("Can't get hostname for URL %s, fingerprint %s" % (request.url, fprint)) @@ -57,65 +56,46 @@ def schedule(self, batch): else: partition_id = self.partitioner.partition(hostname, self.partitions) host_crc32 = get_crc32(hostname) - created_at = time()*1E+6 - q = self._create_queue(request, fprint, score, partition_id, host_crc32, created_at) - - q.save() - request.meta['state'] = States.QUEUED - - def _create_queue(self, obj, fingerprint, score, partition_id, host_crc32, created_at): - db_queue = self.queue_model() - db_queue.fingerprint = fingerprint - db_queue.score = score - db_queue.partition_id = partition_id - db_queue.host_crc32 = host_crc32 - db_queue.url = obj.url - db_queue.created_at = created_at - - new_dict = {} - for kmeta, vmeta in obj.meta.iteritems(): - if type(vmeta) is dict: - new_dict[kmeta] = json.dumps(vmeta) - else: - new_dict[kmeta] = str(vmeta) - - db_queue.meta = new_dict - db_queue.depth = 0 - - db_queue.headers = obj.headers - db_queue.method = obj.method - db_queue.cookies = obj.cookies - - return db_queue + schedule_at = request.meta[b'crawl_at'] if b'crawl_at' in request.meta else utcnow_timestamp() + q = self.queue_model(id=uuid.uuid4(), + fingerprint=to_native_str(fprint), + score=score, + url=request.url, + meta=request.meta, + headers=request.headers, + cookies=request.cookies, + method=to_native_str(request.method), + partition_id=partition_id, + host_crc32=host_crc32, + created_at=time() * 1E+6, 
+ crawl_at=schedule_at) + q.batch(self.batch).save() + request.meta[b'state'] = States.QUEUED + self.batch.execute() + + def _create_queue_obj(self, fprint, score, request, partition_id, host_crc32, schedule_at): + q = self.queue_model(id=uuid.uuid4(), + fingerprint=to_native_str(fprint), + score=score, + url=request.url, + meta=request.meta, + headers=request.headers, + cookies=request.cookies, + method=to_native_str(request.method), + partition_id=partition_id, + host_crc32=host_crc32, + created_at=time() * 1E+6, + crawl_at=schedule_at) + return q def count(self): - return self.session.query(self.queue_model).count() + return self.queue_model.all().count() -class Backend(CassandraBackend): +class Backend(CommonRevisitingStorageBackendMixin, CassandraBackend): def _create_queue(self, settings): - self.interval = settings.get("SQLALCHEMYBACKEND_REVISIT_INTERVAL") + self.interval = settings.get("CASSANDRABACKEND_REVISIT_INTERVAL") assert isinstance(self.interval, timedelta) - return RevisitingQueue(self.session, RevisitingQueueModel, settings.get('SPIDER_FEED_PARTITIONS')) - - def _schedule(self, requests): - batch = [] - queue_incr = 0 - for request in requests: - if request.meta['state'] in [States.NOT_CRAWLED, None]: - schedule_at = datetime.utcnow() - elif request.meta['state'] in [States.CRAWLED, States.ERROR]: - schedule_at = datetime.utcnow() + self.interval - else: # QUEUED - schedule_at = None - batch.append((request.meta['fingerprint'], self._get_score(request), request, schedule_at)) - if schedule_at: - queue_incr += 1 - self.queue.schedule(batch) - self.metadata.update_score(batch) - self.queue_size += queue_incr - - def page_crawled(self, response, links): - super(Backend, self).page_crawled(response, links) - self._schedule([response.request]) + self.interval = self.interval.total_seconds() + return RevisitingQueue(RevisitingQueueModel, settings.get('SPIDER_FEED_PARTITIONS')) diff --git a/frontera/contrib/backends/sqlalchemy/revisiting.py b/frontera/contrib/backends/sqlalchemy/revisiting.py index b2b574715..ccbe056bf 100644 --- a/frontera/contrib/backends/sqlalchemy/revisiting.py +++ b/frontera/contrib/backends/sqlalchemy/revisiting.py @@ -1,27 +1,22 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import import logging -from datetime import datetime, timedelta +from datetime import timedelta from time import time, sleep -from calendar import timegm from sqlalchemy import Column, BigInteger from frontera import Request +from frontera.contrib.backends import CommonRevisitingStorageBackendMixin from frontera.contrib.backends.partitioners import Crc32NamePartitioner from frontera.contrib.backends.sqlalchemy import SQLAlchemyBackend from frontera.contrib.backends.sqlalchemy.models import QueueModelMixin, DeclarativeBase from frontera.core.components import Queue as BaseQueue, States -from frontera.utils.misc import get_crc32 +from frontera.utils.misc import get_crc32, utcnow_timestamp from frontera.utils.url import parse_domain_from_url_fast from six.moves import range -def utcnow_timestamp(): - d = datetime.utcnow() - return timegm(d.timetuple()) - - class RevisitingQueueModel(QueueModelMixin, DeclarativeBase): __tablename__ = 'revisiting_queue' @@ -103,30 +98,10 @@ def count(self): return self.session.query(self.queue_model).count() -class Backend(SQLAlchemyBackend): +class Backend(CommonRevisitingStorageBackendMixin, SQLAlchemyBackend): def _create_queue(self, settings): self.interval = settings.get("SQLALCHEMYBACKEND_REVISIT_INTERVAL") assert 
isinstance(self.interval, timedelta) self.interval = self.interval.total_seconds() return RevisitingQueue(self.session_cls, RevisitingQueueModel, settings.get('SPIDER_FEED_PARTITIONS')) - - def _schedule(self, requests): - batch = [] - for request in requests: - if request.meta[b'state'] in [States.NOT_CRAWLED]: - request.meta[b'crawl_at'] = utcnow_timestamp() - elif request.meta[b'state'] in [States.CRAWLED, States.ERROR]: - request.meta[b'crawl_at'] = utcnow_timestamp() + self.interval - else: - continue # QUEUED - batch.append((request.meta[b'fingerprint'], self._get_score(request), request, True)) - self.queue.schedule(batch) - self.metadata.update_score(batch) - self.queue_size += len(batch) - - def page_crawled(self, response): - super(Backend, self).page_crawled(response) - self.states.set_states(response.request) - self._schedule([response.request]) - self.states.update_cache(response.request) diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index 408db4708..59220566f 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -22,6 +22,7 @@ CASSANDRABACKEND_CLUSTER_PORT = 9042 CASSANDRABACKEND_KEYSPACE = 'crawler' CASSANDRABACKEND_REQUEST_TIMEOUT = 100 +CASSANDRABACKEND_REVISIT_INTERVAL = timedelta(days=1) DELAY_ON_EMPTY = 5.0 DOMAIN_FINGERPRINT_FUNCTION = 'frontera.utils.fingerprint.sha1' diff --git a/frontera/utils/misc.py b/frontera/utils/misc.py index 15731195f..4de40dd7c 100644 --- a/frontera/utils/misc.py +++ b/frontera/utils/misc.py @@ -1,5 +1,7 @@ from __future__ import absolute_import from importlib import import_module +from calendar import timegm +from datetime import datetime from zlib import crc32 from six.moves import range from w3lib.util import to_bytes @@ -72,4 +74,9 @@ def dict_to_unicode(obj): if isinstance(obj, list): return map(dict_to_unicode, obj) else: - return obj \ No newline at end of file + return obj + + +def utcnow_timestamp(): + d = datetime.utcnow() + return timegm(d.timetuple()) diff --git a/tests/contrib/backends/cassandra/test_backend_cassandra.py b/tests/contrib/backends/cassandra/test_backend_cassandra.py index c8ff68b50..d69a2e0ca 100644 --- a/tests/contrib/backends/cassandra/test_backend_cassandra.py +++ b/tests/contrib/backends/cassandra/test_backend_cassandra.py @@ -1,6 +1,6 @@ import unittest import uuid -from datetime import datetime +from datetime import datetime, timedelta from time import time import six @@ -18,6 +18,7 @@ from frontera.core.models import Request, Response from frontera.settings import Settings from tests import backends +from tests.test_revisiting_backend import RevisitingBackendTest r1 = Request('https://www.example.com', meta={b'fingerprint': b'10', @@ -256,3 +257,12 @@ class TestCassandraDFSBackend(BaseCassandraIntegrationTests, backends.DFSBackend class TestCassandraBFSBackend(BaseCassandraIntegrationTests, backends.BFSBackendTest): backend_class = 'frontera.contrib.backends.cassandra.BFS' + + +class TestCassandraRevisiting(BaseCassandraIntegrationTests, RevisitingBackendTest): + backend_class = 'frontera.contrib.backends.cassandra.revisiting.Backend' + + def get_settings(self): + settings = super(TestCassandraRevisiting, self).get_settings() + settings.CASSANDRABACKEND_REVISIT_INTERVAL = timedelta(seconds=0) + return settings From 3f0bab45af2c7e15384b46355f0ef06833b83d87 Mon Sep 17 00:00:00 2001 From: voith Date: Wed, 9 Nov 2016 12:38:53 -0500 Subject: [PATCH 12/14] added unitests for utcnow_timestamp --- 
frontera/contrib/backends/cassandra/components.py | 8 ++++---- .../backends/cassandra/test_backend_cassandra.py | 13 +++---------- tests/test_utils_misc.py | 15 ++++++++++++++- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/frontera/contrib/backends/cassandra/components.py b/frontera/contrib/backends/cassandra/components.py index 27f42d4b3..389fc4d25 100644 --- a/frontera/contrib/backends/cassandra/components.py +++ b/frontera/contrib/backends/cassandra/components.py @@ -132,8 +132,8 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): """ results = [] try: - for item in self._order_by(self.queue_model.filter(partition_id=partition_id). - allow_filtering()).limit(max_n_requests): + for item in self._order_by(self.queue_model.filter(partition_id=partition_id).allow_filtering()).\ + limit(max_n_requests): method = item.method or b'GET' r = Request(item.url, method=method, meta=item.meta, headers=item.headers, cookies=item.cookies) r.meta[b'fingerprint'] = to_bytes(item.fingerprint) @@ -208,8 +208,8 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): tries, limit, count, len(queue.keys())) queue.clear() count = 0 - for item in self._order_by(self.queue_model.filter(partition_id=partition_id). - allow_filtering()).limit(max_n_requests): + for item in self._order_by(self.queue_model.filter(partition_id=partition_id).allow_filtering()).\ + limit(max_n_requests): if item.host_crc32 not in queue: queue[item.host_crc32] = [] if max_requests_per_host is not None and len(queue[item.host_crc32]) > max_requests_per_host: diff --git a/tests/contrib/backends/cassandra/test_backend_cassandra.py b/tests/contrib/backends/cassandra/test_backend_cassandra.py index d69a2e0ca..97110e8cd 100644 --- a/tests/contrib/backends/cassandra/test_backend_cassandra.py +++ b/tests/contrib/backends/cassandra/test_backend_cassandra.py @@ -4,7 +4,6 @@ from time import time import six -from cassandra.cluster import Cluster from cassandra.cqlengine import connection from cassandra.cqlengine.management import (create_keyspace_simple, drop_keyspace, drop_table, @@ -40,19 +39,13 @@ def setUp(self): self.manager.settings = settings self.keyspace = settings.CASSANDRABACKEND_KEYSPACE self.timeout = settings.CASSANDRABACKEND_REQUEST_TIMEOUT - cluster = Cluster(self.hosts, self.port) - self.session = cluster.connect() self._set_global_connection(self.hosts, self.port, self.timeout) create_keyspace_simple(self.keyspace, 1) - self.session.set_keyspace(self.keyspace) - self.session.default_timeout = self.timeout connection.session.set_keyspace(self.keyspace) def tearDown(self): self._set_global_connection(self.hosts, self.port, self.timeout) drop_keyspace(self.keyspace) - self.session.shutdown() - connection.unregister_connection('default') def _set_global_connection(self, hosts, port, timeout): if not connection.cluster: @@ -141,9 +134,9 @@ def assert_db_values(self, model, _filter, fields): class TestCassandraBackend(BaseCassandraTest, unittest.TestCase): def _get_tables(self): - query = self.session.prepare('SELECT table_name FROM system_schema.tables WHERE keyspace_name = ?') - result = self.session.execute(query, (self.session.keyspace,)) - return [row.table_name for row in result.current_rows] + query = 'SELECT table_name FROM system_schema.tables WHERE keyspace_name = \'{}\''.format(self.keyspace) + result = connection.execute(query) + return [row['table_name'] for row in result.current_rows] def test_tables_created(self): tables_before = self._get_tables() diff --git 
a/tests/test_utils_misc.py b/tests/test_utils_misc.py index af6f6d992..a217fffe9 100644 --- a/tests/test_utils_misc.py +++ b/tests/test_utils_misc.py @@ -1,8 +1,11 @@ from __future__ import absolute_import import hashlib import pytest -from frontera.utils.misc import load_object, get_crc32, chunks, to_signed32 import six +from datetime import datetime + +from frontera.utils.misc import load_object, get_crc32, chunks, to_signed32, utcnow_timestamp +from tests import mock class TestGetCRC32(object): @@ -82,3 +85,13 @@ def test_name_error(self): load_object('tests.mocks.load_objects.non_existent_object') assert str(info.value) == ("Module 'tests.mocks.load_objects' doesn't define" " any object named 'non_existent_object'") + + +class TestUtcNowTimestamp(object): + + def test(self): + udt = datetime(2016, 11, 11, 0, 0, 0) + with mock.patch('frontera.utils.misc.datetime') as mocked_datetime: + mocked_datetime.utcnow = mock.Mock(return_value=udt) + utc_tstamp = utcnow_timestamp() + assert utc_tstamp == 1478822400 From 67ca6fa711e82c48e68d86afc55ad7104afe8dfe Mon Sep 17 00:00:00 2001 From: voith Date: Thu, 10 Nov 2016 12:38:53 -0500 Subject: [PATCH 13/14] updated cassandra docs --- docs/source/topics/frontera-settings.rst | 67 ++++++++++++------------ docs/source/topics/frontier-backends.rst | 60 +++++++++++++-------- frontera/settings/default_settings.py | 9 ++-- 3 files changed, 76 insertions(+), 60 deletions(-) diff --git a/docs/source/topics/frontera-settings.rst b/docs/source/topics/frontera-settings.rst index c6b7585b4..deec37349 100644 --- a/docs/source/topics/frontera-settings.rst +++ b/docs/source/topics/frontera-settings.rst @@ -492,63 +492,56 @@ documents scheduled after the change. All previously queued documents will be cr Cassandra --------- +.. setting:: CASSANDRABACKEND_CACHE_SIZE -.. setting:: CASSANDRABACKEND_DROP_ALL_TABLES +CASSANDRABACKEND_CACHE_SIZE +^^^^^^^^^^^^^^^^^^^^^^^^^^^ -CASSANDRABACKEND_DROP_ALL_TABLES -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Default: ``10000`` -Default: ``False`` +The Cassandra Metadata LRU cache size. The cache stores objects that would otherwise be requested from the DB every +time already-known documents are crawled, so it mainly saves DB throughput. Increase it if you're experiencing too high +a volume of SELECTs to the Metadata table, or decrease it if you need to save memory. -Set to ``True`` if you need to drop of all DB tables on backend instantiation (e.g. every Scrapy spider run). -.. setting:: SQLALCHEMYBACKEND_ENGINE +.. setting:: CASSANDRABACKEND_CLUSTER_HOSTS -CASSANDRABACKEND_CLUSTER_IPS -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +CASSANDRABACKEND_CLUSTER_HOSTS +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Default: ``['127.0.0.1']`` -Set IPs from Cassandra Cluster. Default is localhost. To assign more than one IP use this Syntax: ``['192.168.0.1', '192.168.0.2']`` +The list of contact points to try when connecting for cluster discovery. Not all contact points are required; the +driver discovers the rest. + +.. setting:: CASSANDRABACKEND_CLUSTER_PORT CASSANDRABACKEND_CLUSTER_PORT ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Default: ``9042`` -Set port from Cassandra Cluster / Nodes +The server-side port to open connections to Cassandra. +.. setting:: CASSANDRABACKEND_DROP_ALL_TABLES -CASSANDRABACKEND_GENERATE_STATS -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +CASSANDRABACKEND_DROP_ALL_TABLES +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Default:: ``False`` +Default: ``False`` -Set this to true if you want to create an extra Table for stats collection. In this table there will be pages crawled, links queued etv.
counted up. +Set to ``True`` to drop and create all DB tables on backend instantiation. +.. setting:: CASSANDRABACKEND_KEYSPACE CASSANDRABACKEND_KEYSPACE ^^^^^^^^^^^^^^^^^^^^^^^^^ -Default:: ``frontera`` - -Set cassandra Keyspace -CASSANDRABACKEND_CREATE_KEYSPACE_IF_NOT_EXISTS -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Default:: ``True`` -Creates Keyspace if it not exist. Set to false if you frontera shouldn't check on every startup. - - -CASSANDRABACKEND_CRAWL_ID -^^^^^^^^^^^^^^^^^^^^^^^^^ -Default:: ``default`` -Sets an ID in each table for the actual crawl. If you want to run another crawl from begining in same Table set to another Crawl ID. Its an Text field. +Default: ``crawler`` +Sets the Cassandra keyspace. +.. setting:: CASSANDRABACKEND_MODELS CASSANDRABACKEND_MODELS ^^^^^^^^^^^^^^^^^^^^^^^ @@ -559,11 +552,19 @@ Default:: 'MetadataModel': 'frontera.contrib.backends.cassandra.models.MetadataModel', 'StateModel': 'frontera.contrib.backends.cassandra.models.StateModel', 'QueueModel': 'frontera.contrib.backends.cassandra.models.QueueModel', - 'CrawlStatsModel': 'frontera.contrib.backends.cassandra.models.CrawlStatsModel' + 'FifoOrLIfoQueueModel': 'frontera.contrib.backends.cassandra.models.FifoOrLIfoQueueModel', } -This is mapping with Cassandra models used by backends. It is mainly used for customization. +This is a mapping of Cassandra models used by backends. It is mainly used for customization. + +.. setting:: CASSANDRABACKEND_REQUEST_TIMEOUT + +CASSANDRABACKEND_REQUEST_TIMEOUT +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Default: ``60`` + +Timeout in seconds for every request made by the Cassandra driver to Cassandra. Revisiting backend ------------------ diff --git a/docs/source/topics/frontier-backends.rst b/docs/source/topics/frontier-backends.rst index dfaddfc88..cb6a715a0 100644 --- a/docs/source/topics/frontier-backends.rst +++ b/docs/source/topics/frontier-backends.rst @@ -254,33 +254,15 @@ For a complete list of all settings used for SQLAlchemy backends check the :doc: SQLAlchemy :class:`Backend ` implementation of a random selection algorithm. - -Revisiting backend -^^^^^^^^^^^^^^^^^^ - -Based on custom SQLAlchemy backend, and queue. Crawling starts with seeds. After seeds are crawled, every new -document will be scheduled for immediate crawling. On fetching every new document will be scheduled for recrawling -after fixed interval set by :setting:`SQLALCHEMYBACKEND_REVISIT_INTERVAL`. - -Current implementation of revisiting backend has no prioritization. During long term runs spider could go idle, because -there are no documents available for crawling, but there are documents waiting for their scheduled revisit time. - - -.. class:: frontera.contrib.backends.sqlalchemy.revisiting.Backend - - Base class for SQLAlchemy :class:`Backend ` implementation of revisiting back-end. - .. _frontier-backends-cassandra: Cassandra backends ^^^^^^^^^^^^^^^^^^ -This set of :class:`Backend ` objects will use `Cassandra`_ as storage for +This set of :class:`Backend ` objects will use Cassandra as storage for :ref:`basic algorithms `. -Cassandra is a NoSQL Colum-Store Database with Linear scalability and a SQL-Like Query Language. - -If you need to use your own `declarative cassandra models`_, you can do it by using the +If you need to use your own `cassandra models`_, you can do it by using the :setting:`CASSANDRABACKEND_MODELS` setting. This setting uses a dictionary where ``key`` represents the name of the model to define and ``value`` the model to use.
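As an illustration of customizing this mapping, a project settings module could start from the default mapping and replace a single entry. This is a minimal sketch, not part of the patch series: ``myproject.models.MyQueueModel`` is a hypothetical dotted path, assumed to point at a subclass of the stock ``QueueModel``::

    # settings.py -- minimal sketch; 'myproject.models.MyQueueModel' is a
    # hypothetical dotted path to a subclass of the stock QueueModel.
    from frontera.settings.default_settings import CASSANDRABACKEND_MODELS as DEFAULT_MODELS

    # Keys are the model names the backend looks up; values are dotted paths
    # to the model classes.  Copy the defaults and override only one entry.
    CASSANDRABACKEND_MODELS = dict(DEFAULT_MODELS,
                                   QueueModel='myproject.models.MyQueueModel')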
@@ -290,13 +272,46 @@ For a complete list of all settings used for Cassandra backends check the :doc:` .. class:: frontera.contrib.backends.cassandra.BASE Base class for Cassandra :class:`Backend ` objects. - It runs cassandra in multi-spider one worker mode with the FIFO algorithm. + +.. class:: frontera.contrib.backends.cassandra.FIFO + + Cassandra :class:`Backend ` implementation of `FIFO`_ algorithm. + +.. class:: frontera.contrib.backends.cassandra.LIFO + + Cassandra :class:`Backend ` implementation of `LIFO`_ algorithm. + +.. class:: frontera.contrib.backends.cassandra.BFS + + Cassandra :class:`Backend ` implementation of `BFS`_ algorithm. + +.. class:: frontera.contrib.backends.cassandra.DFS + + Cassandra :class:`Backend ` implementation of `DFS`_ algorithm. .. class:: frontera.contrib.backends.cassandra.Distributed - Cassandra :class:`Backend ` implementation of the distributed Backend. + Cassandra :class:`Backend ` implementation of a distributed backend. + +Revisiting backend +^^^^^^^^^^^^^^^^^^ + +There are two revisiting backends, based on the Cassandra and SQLAlchemy backends and queues. Crawling starts +with seeds. After the seeds are crawled, every new document is scheduled for immediate crawling. Once fetched, every +document is rescheduled for recrawling after a fixed interval set by :setting:`SQLALCHEMYBACKEND_REVISIT_INTERVAL` or +:setting:`CASSANDRABACKEND_REVISIT_INTERVAL`. + +The current implementation of the revisiting backend has no prioritization. During long-term runs the spider could go +idle, because there are no documents available for crawling while documents are waiting for their scheduled revisit time. + + +.. class:: frontera.contrib.backends.sqlalchemy.revisiting.Backend + + Base class for SQLAlchemy :class:`Backend ` implementation of revisiting back-end. +.. class:: frontera.contrib.backends.cassandra.revisiting.Backend + Base class for Cassandra :class:`Backend ` implementation of revisiting back-end. HBase backend ^^^^^^^^^^^^^ @@ -325,3 +340,4 @@ setting. .. _SQLAlchemy: http://www.sqlalchemy.org/ .. _any databases supported by SQLAlchemy: http://docs.sqlalchemy.org/en/latest/dialects/index.html .. _declarative sqlalchemy models: http://docs.sqlalchemy.org/en/latest/orm/extensions/declarative/index.html +..
_cassandra models: https://datastax.github.io/python-driver/cqlengine/models.html diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index 59220566f..9b6b3886a 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -10,18 +10,17 @@ CANONICAL_SOLVER = 'frontera.contrib.canonicalsolvers.Basic' CASSANDRABACKEND_CACHE_SIZE = 10000 +CASSANDRABACKEND_CLUSTER_HOSTS = ['127.0.0.1'] +CASSANDRABACKEND_CLUSTER_PORT = 9042 CASSANDRABACKEND_DROP_ALL_TABLES = False +CASSANDRABACKEND_KEYSPACE = 'crawler' CASSANDRABACKEND_MODELS = { 'MetadataModel': 'frontera.contrib.backends.cassandra.models.MetadataModel', 'StateModel': 'frontera.contrib.backends.cassandra.models.StateModel', 'QueueModel': 'frontera.contrib.backends.cassandra.models.QueueModel', 'FifoOrLIfoQueueModel': 'frontera.contrib.backends.cassandra.models.FifoOrLIfoQueueModel', } -CASSANDRABACKEND_REVISIT_INTERVAL = timedelta(days=1) -CASSANDRABACKEND_CLUSTER_HOSTS = ['127.0.0.1'] -CASSANDRABACKEND_CLUSTER_PORT = 9042 -CASSANDRABACKEND_KEYSPACE = 'crawler' -CASSANDRABACKEND_REQUEST_TIMEOUT = 100 +CASSANDRABACKEND_REQUEST_TIMEOUT = 60 CASSANDRABACKEND_REVISIT_INTERVAL = timedelta(days=1) DELAY_ON_EMPTY = 5.0 From 9c316ea4dbd02a4da689d4ccd5c82c2dcb996147 Mon Sep 17 00:00:00 2001 From: voith Date: Fri, 11 Nov 2016 12:38:53 -0500 Subject: [PATCH 14/14] added tests for cassandra distributed backend --- .../contrib/backends/cassandra/__init__.py | 21 +++--- .../contrib/backends/cassandra/components.py | 2 +- .../contrib/backends/cassandra/revisiting.py | 2 +- .../cassandra/test_backend_cassandra.py | 71 +++++++++++++++---- 4 files changed, 71 insertions(+), 25 deletions(-) diff --git a/frontera/contrib/backends/cassandra/__init__.py b/frontera/contrib/backends/cassandra/__init__.py index 7f59bdf60..ed952bd4a 100644 --- a/frontera/contrib/backends/cassandra/__init__.py +++ b/frontera/contrib/backends/cassandra/__init__.py @@ -99,7 +99,6 @@ def __init__(self, manager): settings = manager.settings cluster_hosts = settings.get('CASSANDRABACKEND_CLUSTER_HOSTS') cluster_port = settings.get('CASSANDRABACKEND_CLUSTER_PORT') - drop_all_tables = settings.get('CASSANDRABACKEND_DROP_ALL_TABLES') models = settings.get('CASSANDRABACKEND_MODELS') keyspace = settings.get('CASSANDRABACKEND_KEYSPACE') @@ -112,10 +111,6 @@ def __init__(self, manager): connection.setup(cluster_hosts, keyspace, **cluster_kwargs) connection.session.default_timeout = settings.get('CASSANDRABACKEND_REQUEST_TIMEOUT') - if drop_all_tables: - for name, table in six.iteritems(self.models): - drop_table(table) - self._metadata = None self._queue = None self._states = None @@ -124,15 +119,25 @@ def __init__(self, manager): def strategy_worker(cls, manager): b = cls(manager) settings = manager.settings - b._states = States(b.models['StateModel'], settings.get('STATE_CACHE_SIZE_LIMIT')) + drop_all_tables = settings.get('CASSANDRABACKEND_DROP_ALL_TABLES') + state_model = b.models['StateModel'] + if drop_all_tables: + drop_table(state_model) + b._states = States(state_model, settings.get('STATE_CACHE_SIZE_LIMIT')) return b @classmethod def db_worker(cls, manager): b = cls(manager) settings = manager.settings - b._metadata = Metadata(b.models['MetadataModel'], settings.get('CASSANDRABACKEND_CACHE_SIZE')) - b._queue = BroadCrawlingQueue(b.models['QueueModel'], settings.get('SPIDER_FEED_PARTITIONS')) + drop_all_tables = settings.get('CASSANDRABACKEND_DROP_ALL_TABLES') + metadata_model = b.models['MetadataModel'] + 
queue_model = b.models['QueueModel'] + if drop_all_tables: + drop_table(metadata_model) + drop_table(queue_model) + b._metadata = Metadata(metadata_model, settings.get('CASSANDRABACKEND_CACHE_SIZE')) + b._queue = BroadCrawlingQueue(queue_model, settings.get('SPIDER_FEED_PARTITIONS')) return b def frontier_stop(self): diff --git a/frontera/contrib/backends/cassandra/components.py b/frontera/contrib/backends/cassandra/components.py index 389fc4d25..d832b2436 100644 --- a/frontera/contrib/backends/cassandra/components.py +++ b/frontera/contrib/backends/cassandra/components.py @@ -172,7 +172,7 @@ def schedule(self, batch): self.batch.execute() def count(self): - return self.queue_model.all().count() + return self.queue_model.objects.count() class BroadCrawlingQueue(Queue): diff --git a/frontera/contrib/backends/cassandra/revisiting.py b/frontera/contrib/backends/cassandra/revisiting.py index ef02d0a73..39f1400c3 100644 --- a/frontera/contrib/backends/cassandra/revisiting.py +++ b/frontera/contrib/backends/cassandra/revisiting.py @@ -89,7 +89,7 @@ def _create_queue_obj(self, fprint, score, request, partition_id, host_crc32, sc return q def count(self): - return self.queue_model.all().count() + return self.queue_model.objects.count() class Backend(CommonRevisitingStorageBackendMixin, CassandraBackend): diff --git a/tests/contrib/backends/cassandra/test_backend_cassandra.py b/tests/contrib/backends/cassandra/test_backend_cassandra.py index 97110e8cd..56e9e03be 100644 --- a/tests/contrib/backends/cassandra/test_backend_cassandra.py +++ b/tests/contrib/backends/cassandra/test_backend_cassandra.py @@ -9,7 +9,7 @@ drop_keyspace, drop_table, sync_table) -from frontera.contrib.backends.cassandra import CassandraBackend +from frontera.contrib.backends.cassandra import CassandraBackend, Distributed from frontera.contrib.backends.cassandra.models import (FifoOrLIfoQueueModel, MetadataModel, QueueModel, StateModel) @@ -29,7 +29,7 @@ r4 = r3.copy() -class BaseCassandraTest(object): +class CassandraConfig(object): def setUp(self): settings = Settings() @@ -53,7 +53,7 @@ def _set_global_connection(self, hosts, port, timeout): connection.session.default_timeout = timeout -class TestCassandraBackendModels(BaseCassandraTest, unittest.TestCase): +class TestCassandraBackendModels(CassandraConfig, unittest.TestCase): def test_pickled_fields(self): sync_table(MetadataModel) @@ -131,7 +131,25 @@ def assert_db_values(self, model, _filter, fields): self.assertEqual(stored_value, original_value) -class TestCassandraBackend(BaseCassandraTest, unittest.TestCase): +class TestCassandraBackend(CassandraConfig, unittest.TestCase): + + def init_backend(self): + self.backend = CassandraBackend(self.manager) + + @property + def metadata(self): + self.init_backend() + return self.backend.metadata + + @property + def states(self): + self.init_backend() + return self.backend.states + + @property + def queue(self): + self.init_backend() + return self.backend.queue def _get_tables(self): query = 'SELECT table_name FROM system_schema.tables WHERE keyspace_name = \'{}\''.format(self.keyspace) @@ -141,7 +159,7 @@ def _get_tables(self): def test_tables_created(self): tables_before = self._get_tables() self.assertEqual(tables_before, []) - CassandraBackend(self.manager) + self.init_backend() tables_after = self._get_tables() self.assertEqual(set(tables_after), set(['metadata', 'states', 'queue'])) @@ -158,14 +176,14 @@ def _get_state_data(): rows_before = _get_state_data() self.assertEqual(rows_before.count(), 1) 
self.manager.settings.CASSANDRABACKEND_DROP_ALL_TABLES = True - CassandraBackend(self.manager) - self.assertEqual(set(tables_before), set(['metadata', 'states', 'queue'])) + self.init_backend() + tables_after = self._get_tables() + self.assertEqual(set(tables_after), set(['metadata', 'states', 'queue'])) rows_after = _get_state_data() self.assertEqual(rows_after.count(), 0) def test_metadata(self): - b = CassandraBackend(self.manager) - metadata = b.metadata + metadata = self.metadata metadata.add_seeds([r1, r2, r3]) meta_qs = MetadataModel.objects.all() self.assertEqual(set([r1.url, r2.url, r3.url]), set([m.url for m in meta_qs])) @@ -183,10 +201,9 @@ def test_metadata(self): self.assertEqual(meta_qs.count(), 3) def test_state(self): - b = CassandraBackend(self.manager) - state = b.states + state = self.states state.set_states([r1, r2, r3]) - self.assertEqual([r.meta[b'state'] for r in [r1, r2, r3]], [States.NOT_CRAWLED]*3) + self.assertEqual([r.meta[b'state'] for r in [r1, r2, r3]], [States.NOT_CRAWLED] * 3) state.update_cache([r1, r2, r3]) self.assertDictEqual(state._cache, {b'10': States.NOT_CRAWLED, b'11': States.NOT_CRAWLED, @@ -209,11 +226,11 @@ def test_state(self): def test_queue(self): self.manager.settings.SPIDER_FEED_PARTITIONS = 2 - b = CassandraBackend(self.manager) - queue = b.queue + queue = self.queue batch = [('10', 0.5, r1, True), ('11', 0.6, r2, True), ('12', 0.7, r3, True)] queue.schedule(batch) + self.assertEqual(queue.count(), 3) self.assertEqual(set([r.url for r in queue.get_next_requests(10, 0, min_requests=3, min_hosts=1, @@ -224,10 +241,34 @@ def test_queue(self): min_hosts=1, max_requests_per_host=10)]), set([r1.url, r2.url])) + self.assertEqual(queue.count(), 0) + + +class TestCassandraDistributedBackend(TestCassandraBackend): + + def init_backend(self): + self.backend = Distributed(self.manager) + self.strategy_worker = self.backend.strategy_worker(self.manager) + self.db_worker = self.backend.db_worker(self.manager) + + @property + def metadata(self): + self.init_backend() + return self.db_worker.metadata + + @property + def states(self): + self.init_backend() + return self.strategy_worker.states + + @property + def queue(self): + self.init_backend() + return self.db_worker.queue class BaseCassandraIntegrationTests(object): - obj = BaseCassandraTest() + obj = CassandraConfig() def setup_backend(self, method): self.obj.setUp()
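Putting the pieces together, a minimal settings sketch for running the Cassandra revisiting backend added in this series could look as follows; the contact points and the six-hour interval are illustrative values, not defaults::

    # settings.py -- illustrative sketch; host IPs and interval are examples.
    from datetime import timedelta

    BACKEND = 'frontera.contrib.backends.cassandra.revisiting.Backend'

    # Contact points used for cluster discovery; the driver finds the rest.
    CASSANDRABACKEND_CLUSTER_HOSTS = ['192.168.0.1', '192.168.0.2']
    CASSANDRABACKEND_CLUSTER_PORT = 9042
    CASSANDRABACKEND_KEYSPACE = 'crawler'

    # Reschedule each fetched document for recrawl six hours after fetching.
    CASSANDRABACKEND_REVISIT_INTERVAL = timedelta(hours=6)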