-
Notifications
You must be signed in to change notification settings - Fork 21
poc: connector builder using concurrent cdk #460
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -34,7 +34,6 @@ def __init__( | |
partition_enqueuer: PartitionEnqueuer, | ||
thread_pool_manager: ThreadPoolManager, | ||
logger: logging.Logger, | ||
slice_logger: SliceLogger, | ||
message_repository: MessageRepository, | ||
partition_reader: PartitionReader, | ||
): | ||
|
@@ -44,7 +43,6 @@ def __init__( | |
:param partition_enqueuer: PartitionEnqueuer instance | ||
:param thread_pool_manager: ThreadPoolManager instance | ||
:param logger: Logger instance | ||
:param slice_logger: SliceLogger instance | ||
:param message_repository: MessageRepository instance | ||
:param partition_reader: PartitionReader instance | ||
""" | ||
|
@@ -59,7 +57,6 @@ def __init__( | |
self._stream_instances_to_start_partition_generation = stream_instances_to_read_from | ||
self._streams_currently_generating_partitions: List[str] = [] | ||
self._logger = logger | ||
self._slice_logger = slice_logger | ||
self._message_repository = message_repository | ||
self._partition_reader = partition_reader | ||
self._streams_done: Set[str] = set() | ||
|
@@ -95,11 +92,7 @@ def on_partition(self, partition: Partition) -> None: | |
""" | ||
stream_name = partition.stream_name() | ||
self._streams_to_running_partitions[stream_name].add(partition) | ||
if self._slice_logger.should_log_slice_message(self._logger): | ||
self._message_repository.emit_message( | ||
self._slice_logger.create_slice_log_message(partition.to_slice()) | ||
) | ||
self._thread_pool_manager.submit(self._partition_reader.process_partition, partition) | ||
self._thread_pool_manager.submit(self._partition_reader.process_partition, partition, self._stream_name_to_instance[partition.stream_name()].cursor) | ||
|
||
def on_partition_complete_sentinel( | ||
self, sentinel: PartitionCompleteSentinel | ||
|
@@ -112,26 +105,19 @@ def on_partition_complete_sentinel( | |
""" | ||
partition = sentinel.partition | ||
|
||
try: | ||
if sentinel.is_successful: | ||
stream = self._stream_name_to_instance[partition.stream_name()] | ||
stream.cursor.close_partition(partition) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I moved |
||
except Exception as exception: | ||
self._flag_exception(partition.stream_name(), exception) | ||
yield AirbyteTracedException.from_exception( | ||
exception, stream_descriptor=StreamDescriptor(name=partition.stream_name()) | ||
).as_sanitized_airbyte_message() | ||
finally: | ||
partitions_running = self._streams_to_running_partitions[partition.stream_name()] | ||
if partition in partitions_running: | ||
partitions_running.remove(partition) | ||
# If all partitions were generated and this was the last one, the stream is done | ||
if ( | ||
partition.stream_name() not in self._streams_currently_generating_partitions | ||
and len(partitions_running) == 0 | ||
): | ||
yield from self._on_stream_is_done(partition.stream_name()) | ||
yield from self._message_repository.consume_queue() | ||
if sentinel.is_successful: | ||
stream = self._stream_name_to_instance[partition.stream_name()] | ||
|
||
partitions_running = self._streams_to_running_partitions[partition.stream_name()] | ||
if partition in partitions_running: | ||
partitions_running.remove(partition) | ||
# If all partitions were generated and this was the last one, the stream is done | ||
if ( | ||
partition.stream_name() not in self._streams_currently_generating_partitions | ||
and len(partitions_running) == 0 | ||
): | ||
yield from self._on_stream_is_done(partition.stream_name()) | ||
yield from self._message_repository.consume_queue() | ||
|
||
def on_record(self, record: Record) -> Iterable[AirbyteMessage]: | ||
""" | ||
|
@@ -160,7 +146,6 @@ def on_record(self, record: Record) -> Iterable[AirbyteMessage]: | |
stream.as_airbyte_stream(), AirbyteStreamStatus.RUNNING | ||
) | ||
self._record_counter[stream.name] += 1 | ||
stream.cursor.observe(record) | ||
yield message | ||
yield from self._message_repository.consume_queue() | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,7 +4,7 @@ | |
import concurrent | ||
import logging | ||
from queue import Queue | ||
from typing import Iterable, Iterator, List | ||
from typing import Iterable, Iterator, List, Optional | ||
|
||
from airbyte_cdk.models import AirbyteMessage | ||
from airbyte_cdk.sources.concurrent_source.concurrent_read_processor import ConcurrentReadProcessor | ||
|
@@ -16,7 +16,7 @@ | |
from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository | ||
from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream | ||
from airbyte_cdk.sources.streams.concurrent.partition_enqueuer import PartitionEnqueuer | ||
from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionReader | ||
from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionLogger, PartitionReader | ||
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition | ||
from airbyte_cdk.sources.streams.concurrent.partitions.types import ( | ||
PartitionCompleteSentinel, | ||
|
@@ -44,6 +44,7 @@ def create( | |
slice_logger: SliceLogger, | ||
message_repository: MessageRepository, | ||
timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS, | ||
queue: Optional[Queue[QueueItem]] = None | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Since the MessageRepository also needs access to the queue, the queue has to be passed in here instead of being created internally. |
||
) -> "ConcurrentSource": | ||
is_single_threaded = initial_number_of_partitions_to_generate == 1 and num_workers == 1 | ||
too_many_generator = ( | ||
|
@@ -65,6 +66,7 @@ def create( | |
message_repository, | ||
initial_number_of_partitions_to_generate, | ||
timeout_seconds, | ||
queue, | ||
) | ||
|
||
def __init__( | ||
|
@@ -75,6 +77,7 @@ def __init__( | |
message_repository: MessageRepository = InMemoryMessageRepository(), | ||
initial_number_partitions_to_generate: int = 1, | ||
timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS, | ||
queue: Optional[Queue[QueueItem]] = None, | ||
) -> None: | ||
""" | ||
:param threadpool: The threadpool to submit tasks to | ||
|
@@ -90,6 +93,7 @@ def __init__( | |
self._message_repository = message_repository | ||
self._initial_number_partitions_to_generate = initial_number_partitions_to_generate | ||
self._timeout_seconds = timeout_seconds | ||
self._queue = queue if queue else Queue(maxsize=10_000) | ||
|
||
def read( | ||
self, | ||
|
@@ -101,23 +105,21 @@ def read( | |
# threads generating partitions that than are max number of workers. If it weren't the case, we could have threads only generating | ||
# partitions which would fill the queue. This number is arbitrarily set to 10_000 but will probably need to be changed given more | ||
# information and might even need to be configurable depending on the source | ||
queue: Queue[QueueItem] = Queue(maxsize=10_000) | ||
concurrent_stream_processor = ConcurrentReadProcessor( | ||
streams, | ||
PartitionEnqueuer(queue, self._threadpool), | ||
PartitionEnqueuer(self._queue, self._threadpool), | ||
self._threadpool, | ||
self._logger, | ||
self._slice_logger, | ||
self._message_repository, | ||
PartitionReader(queue), | ||
PartitionReader(self._queue, PartitionLogger(self._slice_logger, self._logger, self._message_repository)), | ||
) | ||
|
||
# Enqueue initial partition generation tasks | ||
yield from self._submit_initial_partition_generators(concurrent_stream_processor) | ||
|
||
# Read from the queue until all partitions were generated and read | ||
yield from self._consume_from_queue( | ||
queue, | ||
self._queue, | ||
concurrent_stream_processor, | ||
) | ||
self._threadpool.check_for_errors_and_shutdown() | ||
|
@@ -161,5 +163,7 @@ def _handle_item( | |
yield from concurrent_stream_processor.on_partition_complete_sentinel(queue_item) | ||
elif isinstance(queue_item, Record): | ||
yield from concurrent_stream_processor.on_record(queue_item) | ||
elif isinstance(queue_item, AirbyteMessage): | ||
yield queue_item | ||
else: | ||
raise ValueError(f"Unknown queue item type: {type(queue_item)}") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I had to move this to avoid circular dependencies. I assume this was caused by
`concurrent_declarative_source.py` having to know about `TestLimits`,
while `connector_builder_handler.py` has to know about `concurrent_declarative_source`.