diff --git a/cassandra/cluster.py b/cassandra/cluster.py index d5f80290a9..582a8c2e6e 100644 --- a/cassandra/cluster.py +++ b/cassandra/cluster.py @@ -2725,6 +2725,98 @@ def execute_async(self, query, parameters=None, trace=False, custom_payload=None future.send_request() return future + def execute_concurrent(self, statements_and_parameters, concurrency=100, raise_on_first_error=True, results_generator=False, execution_profile=EXEC_PROFILE_DEFAULT): + """ + Executes a sequence of (statement, parameters) tuples concurrently. Each + ``parameters`` item must be a sequence or :const:`None`. + + The `concurrency` parameter controls how many statements will be executed + concurrently. When :attr:`.Cluster.protocol_version` is set to 1 or 2, + it is recommended that this be kept below 100 times the number of + core connections per host times the number of connected hosts (see + :meth:`.Cluster.set_core_connections_per_host`). If that amount is exceeded, + the event loop thread may attempt to block on new connection creation, + substantially impacting throughput. If :attr:`~.Cluster.protocol_version` + is 3 or higher, you can safely experiment with higher levels of concurrency. + + If `raise_on_first_error` is left as :const:`True`, execution will stop + after the first failed statement and the corresponding exception will be + raised. + + `results_generator` controls how the results are returned. + + * If :const:`False`, the results are returned only after all requests have completed. + * If :const:`True`, a generator expression is returned. Using a generator results in a constrained + memory footprint when the results set will be large -- results are yielded + as they return instead of materializing the entire list at once. The trade for lower memory + footprint is marginal CPU overhead (more thread coordination and sorting out-of-order results + on-the-fly). 
+ + `execution_profile` argument is the execution profile to use for this + request, it is passed directly to :meth:`Session.execute_async`. + + A sequence of ``ExecutionResult(success, result_or_exc)`` namedtuples is returned + in the same order that the statements were passed in. If ``success`` is :const:`False`, + there was an error executing the statement, and ``result_or_exc`` + will be an :class:`Exception`. If ``success`` is :const:`True`, ``result_or_exc`` + will be the query result. + + Example usage:: + + select_statement = session.prepare("SELECT * FROM users WHERE id=?") + + statements_and_params = [] + for user_id in user_ids: + params = (user_id, ) + statements_and_params.append((select_statement, params)) + + results = session.execute_concurrent(statements_and_params, raise_on_first_error=False) + + for (success, result) in results: + if not success: + handle_error(result) # result will be an Exception + else: + process_user(result[0]) # result will be a list of rows + + Note: in the case that `generators` are used, it is important to ensure the consumers do not + block or attempt further synchronous requests, because no further IO will be processed until + the consumer returns. This may also produce a deadlock in the IO event thread. + """ + from cassandra.concurrent import execute_concurrent + return execute_concurrent(self, statements_and_parameters, concurrency, raise_on_first_error, results_generator, execution_profile) + + def execute_concurrent_with_args(self, statement, parameters, *args, **kwargs): + """ + Like :meth:`~cassandra.concurrent.execute_concurrent()`, but takes a single + statement and a sequence of parameters. Each item in ``parameters`` + should be a sequence or :const:`None`. 
+ + Example usage:: + + statement = session.prepare("INSERT INTO mytable (a, b) VALUES (1, ?)") + parameters = [(x,) for x in range(1000)] + session.execute_concurrent_with_args(statement, parameters, concurrency=50) + """ + from cassandra.concurrent import execute_concurrent_with_args + return execute_concurrent_with_args(self, statement, parameters, *args, **kwargs) + + def execute_concurrent_async(self, statements_and_parameters, concurrency=100, raise_on_first_error=False, execution_profile=EXEC_PROFILE_DEFAULT): + """ + Asynchronously executes a sequence of (statement, parameters) tuples concurrently. + + Args: + statements_and_parameters: Iterable of (statement, parameters) tuples, where each + ``parameters`` item must be a sequence or :const:`None`. + concurrency (int, optional): Number of concurrent operations. Default is 100. + raise_on_first_error (bool, optional): If True, execution stops on the first error. Default is False. + execution_profile (ExecutionProfile, optional): Execution profile to use. Default is EXEC_PROFILE_DEFAULT. + + Returns: + A `Future` object that will be completed when all operations are done. + """ + from cassandra.concurrent import execute_concurrent_async + return execute_concurrent_async(self, statements_and_parameters, concurrency, raise_on_first_error, execution_profile) + def execute_graph(self, query, parameters=None, trace=False, execution_profile=EXEC_PROFILE_GRAPH_DEFAULT, execute_as=None): """ Executes a Gremlin query string or GraphStatement synchronously, diff --git a/cassandra/concurrent.py b/cassandra/concurrent.py index fb8f26e1cc..607c29c332 100644 --- a/cassandra/concurrent.py +++ b/cassandra/concurrent.py @@ -13,15 +13,15 @@ # limitations under the License. 
+import logging from collections import namedtuple +from concurrent.futures import Future from heapq import heappush, heappop from itertools import cycle from threading import Condition -import sys from cassandra.cluster import ResultSet, EXEC_PROFILE_DEFAULT -import logging log = logging.getLogger(__name__) @@ -29,61 +29,7 @@ def execute_concurrent(session, statements_and_parameters, concurrency=100, raise_on_first_error=True, results_generator=False, execution_profile=EXEC_PROFILE_DEFAULT): """ - Executes a sequence of (statement, parameters) tuples concurrently. Each - ``parameters`` item must be a sequence or :const:`None`. - - The `concurrency` parameter controls how many statements will be executed - concurrently. When :attr:`.Cluster.protocol_version` is set to 1 or 2, - it is recommended that this be kept below 100 times the number of - core connections per host times the number of connected hosts (see - :meth:`.Cluster.set_core_connections_per_host`). If that amount is exceeded, - the event loop thread may attempt to block on new connection creation, - substantially impacting throughput. If :attr:`~.Cluster.protocol_version` - is 3 or higher, you can safely experiment with higher levels of concurrency. - - If `raise_on_first_error` is left as :const:`True`, execution will stop - after the first failed statement and the corresponding exception will be - raised. - - `results_generator` controls how the results are returned. - - * If :const:`False`, the results are returned only after all requests have completed. - * If :const:`True`, a generator expression is returned. Using a generator results in a constrained - memory footprint when the results set will be large -- results are yielded - as they return instead of materializing the entire list at once. The trade for lower memory - footprint is marginal CPU overhead (more thread coordination and sorting out-of-order results - on-the-fly). 
- - `execution_profile` argument is the execution profile to use for this - request, it is passed directly to :meth:`Session.execute_async`. - - A sequence of ``ExecutionResult(success, result_or_exc)`` namedtuples is returned - in the same order that the statements were passed in. If ``success`` is :const:`False`, - there was an error executing the statement, and ``result_or_exc`` will be - an :class:`Exception`. If ``success`` is :const:`True`, ``result_or_exc`` - will be the query result. - - Example usage:: - - select_statement = session.prepare("SELECT * FROM users WHERE id=?") - - statements_and_params = [] - for user_id in user_ids: - params = (user_id, ) - statements_and_params.append((select_statement, params)) - - results = execute_concurrent( - session, statements_and_params, raise_on_first_error=False) - - for (success, result) in results: - if not success: - handle_error(result) # result will be an Exception - else: - process_user(result[0]) # result will be a list of rows - - Note: in the case that `generators` are used, it is important to ensure the consumers do not - block or attempt further synchronous requests, because no further IO will be processed until - the consumer returns. This may also produce a deadlock in the IO event thread. + See :meth:`.Session.execute_concurrent`. """ if concurrency <= 0: raise ValueError("concurrency must be greater than 0") @@ -216,14 +162,50 @@ def _results(self): def execute_concurrent_with_args(session, statement, parameters, *args, **kwargs): """ - Like :meth:`~cassandra.concurrent.execute_concurrent()`, but takes a single - statement and a sequence of parameters. Each item in ``parameters`` - should be a sequence or :const:`None`. + See :meth:`.Session.execute_concurrent_with_args`. 
+ """ + return execute_concurrent(session, zip(cycle((statement,)), parameters), *args, **kwargs) - Example usage:: - statement = session.prepare("INSERT INTO mytable (a, b) VALUES (1, ?)") - parameters = [(x,) for x in range(1000)] - execute_concurrent_with_args(session, statement, parameters, concurrency=50) +class ConcurrentExecutorFutureResults(ConcurrentExecutorListResults): + def __init__(self, session, statements_and_params, execution_profile, future): + super().__init__(session, statements_and_params, execution_profile) + self.future = future + + def _put_result(self, result, idx, success): + super()._put_result(result, idx, success) + with self._condition: + if self._current == self._exec_count: + if self._exception and self._fail_fast: + self.future.set_exception(self._exception) + else: + sorted_results = [r[1] for r in sorted(self._results_queue)] + self.future.set_result(sorted_results) + + +def execute_concurrent_async( + session, + statements_and_parameters, + concurrency=100, + raise_on_first_error=False, + execution_profile=EXEC_PROFILE_DEFAULT +): """ - return execute_concurrent(session, zip(cycle((statement,)), parameters), *args, **kwargs) + See :meth:`.Session.execute_concurrent_async`. 
+ """ + # Create a Future object and initialize the custom ConcurrentExecutor with the Future + future = Future() + executor = ConcurrentExecutorFutureResults( + session=session, + statements_and_params=statements_and_parameters, + execution_profile=execution_profile, + future=future + ) + + # Execute concurrently + try: + executor.execute(concurrency=concurrency, fail_fast=raise_on_first_error) + except Exception as e: + future.set_exception(e) + + return future diff --git a/tests/unit/test_concurrent.py b/tests/unit/test_concurrent.py index bdfd08126e..5edb264b3b 100644 --- a/tests/unit/test_concurrent.py +++ b/tests/unit/test_concurrent.py @@ -24,7 +24,7 @@ import platform from cassandra.cluster import Cluster, Session -from cassandra.concurrent import execute_concurrent, execute_concurrent_with_args +from cassandra.concurrent import execute_concurrent, execute_concurrent_with_args, execute_concurrent_async from cassandra.pool import Host from cassandra.policies import SimpleConvictionPolicy from tests.unit.utils import mock_session_pools @@ -239,6 +239,58 @@ def validate_result_ordering(self, results): self.assertLess(last_time_added, current_time_added) last_time_added = current_time_added + def insert_and_validate_list_async(self, reverse, slowdown): + """ + This utility method will execute submit various statements for execution using execute_concurrent_async, + then invoke a separate thread to execute the callback associated with the futures registered + for those statements. The parameters will toggle various timing, and ordering changes. 
+ Finally it will validate that the results were returned in the order they were submitted + :param reverse: Execute the callbacks in the opposite order that they were submitted + :param slowdown: Cause intermittent queries to perform slowly + """ + our_handler = MockResponseResponseFuture(reverse=reverse) + mock_session = Mock() + statements_and_params = zip(cycle(["INSERT INTO test3rf.test (k, v) VALUES (%s, 0)"]), + [(i, ) for i in range(100)]) + mock_session.execute_async.return_value = our_handler + + t = TimedCallableInvoker(our_handler, slowdown=slowdown) + t.start() + try: + future = execute_concurrent_async(mock_session, statements_and_params) + results = future.result() + self.validate_result_ordering(results) + finally: + t.stop() + + def test_results_ordering_async_forward(self): + """ + This tests the ordering of our execute_concurrent_async function + when queries complete in the order they were executed. + """ + self.insert_and_validate_list_async(False, False) + + def test_results_ordering_async_reverse(self): + """ + This tests the ordering of our execute_concurrent_async function + when queries complete in the reverse order they were executed. + """ + self.insert_and_validate_list_async(True, False) + + def test_results_ordering_async_forward_slowdown(self): + """ + This tests the ordering of our execute_concurrent_async function + when queries complete in the order they were executed, with slow queries mixed in. + """ + self.insert_and_validate_list_async(False, True) + + def test_results_ordering_async_reverse_slowdown(self): + """ + This tests the ordering of our execute_concurrent_async function + when queries complete in the reverse order they were executed, with slow queries mixed in. + """ + self.insert_and_validate_list_async(True, True) + @mock_session_pools def test_recursion_limited(self): """