Skip to content

Commit a48323f

Browse files
committed
Merge branch 'main' into issue-10550/streams-without-partition-routers-nor-cursor-to-concurrent
2 parents 56be6d6 + 72117aa commit a48323f

File tree

10 files changed

+239
-173
lines changed

10 files changed

+239
-173
lines changed

.github/workflows/connector-tests.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,8 @@ jobs:
7272
cdk_extra: n/a
7373
- connector: source-shopify
7474
cdk_extra: n/a
75+
- connector: source-chargebee
76+
cdk_extra: n/a
7577
# Currently not passing CI (unrelated)
7678
# - connector: source-zendesk-support
7779
# cdk_extra: n/a

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,5 @@ dist
1212
.mypy_cache
1313
.venv
1414
.pytest_cache
15+
.idea
1516
**/__pycache__

airbyte_cdk/sources/declarative/manifest_declarative_source.py

Lines changed: 33 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@
55
import json
66
import logging
77
import pkgutil
8-
import re
98
from copy import deepcopy
109
from importlib import metadata
11-
from typing import Any, Dict, Iterator, List, Mapping, Optional, Tuple
10+
from typing import Any, Dict, Iterator, List, Mapping, Optional
11+
from packaging.version import Version, InvalidVersion
1212

1313
import yaml
1414
from airbyte_cdk.models import (
@@ -245,45 +245,54 @@ def _validate_source(self) -> None:
245245
"Validation against json schema defined in declarative_component_schema.yaml schema failed"
246246
) from e
247247

248-
cdk_version = metadata.version("airbyte_cdk")
249-
cdk_major, cdk_minor, cdk_patch = self._get_version_parts(cdk_version, "airbyte-cdk")
250-
manifest_version = self._source_config.get("version")
251-
if manifest_version is None:
248+
cdk_version_str = metadata.version("airbyte_cdk")
249+
cdk_version = self._parse_version(cdk_version_str, "airbyte-cdk")
250+
manifest_version_str = self._source_config.get("version")
251+
if manifest_version_str is None:
252252
raise RuntimeError(
253253
"Manifest version is not defined in the manifest. This is unexpected since it should be a required field. Please contact support."
254254
)
255-
manifest_major, manifest_minor, manifest_patch = self._get_version_parts(
256-
manifest_version, "manifest"
257-
)
255+
manifest_version = self._parse_version(manifest_version_str, "manifest")
258256

259-
if cdk_version.startswith("0.0.0"):
257+
if (cdk_version.major, cdk_version.minor, cdk_version.micro) == (0, 0, 0):
260258
# Skipping version compatibility check on unreleased dev branch
261259
pass
262-
elif cdk_major < manifest_major or (
263-
cdk_major == manifest_major and cdk_minor < manifest_minor
260+
elif (cdk_version.major, cdk_version.minor) < (
261+
manifest_version.major,
262+
manifest_version.minor,
264263
):
265264
raise ValidationError(
266-
f"The manifest version {manifest_version} is greater than the airbyte-cdk package version ({cdk_version}). Your "
265+
f"The manifest version {manifest_version!s} is greater than the airbyte-cdk package version ({cdk_version!s}). Your "
267266
f"manifest may contain features that are not in the current CDK version."
268267
)
269-
elif manifest_major == 0 and manifest_minor < 29:
268+
elif (manifest_version.major, manifest_version.minor) < (0, 29):
270269
raise ValidationError(
271270
f"The low-code framework was promoted to Beta in airbyte-cdk version 0.29.0 and contains many breaking changes to the "
272-
f"language. The manifest version {manifest_version} is incompatible with the airbyte-cdk package version "
273-
f"{cdk_version} which contains these breaking changes."
271+
f"language. The manifest version {manifest_version!s} is incompatible with the airbyte-cdk package version "
272+
f"{cdk_version!s} which contains these breaking changes."
274273
)
275274

276275
@staticmethod
277-
def _get_version_parts(version: str, version_type: str) -> Tuple[int, int, int]:
278-
"""
279-
Takes a semantic version represented as a string and splits it into a tuple of its major, minor, and patch versions.
276+
def _parse_version(
277+
version: str,
278+
version_type: str,
279+
) -> Version:
280+
"""Parse a version string into a `packaging.version.Version` object.
281+
282+
Any pre-release segment stays on the returned object; the compatibility check in `_validate_source` compares only the release components (major, minor, micro).
283+
284+
Returns:
285+
Version: the parsed version object
280286
"""
281-
version_parts = re.split(r"\.", version)
282-
if len(version_parts) != 3 or not all([part.isdigit() for part in version_parts]):
287+
try:
288+
parsed_version = Version(version)
289+
except InvalidVersion as ex:
283290
raise ValidationError(
284-
f"The {version_type} version {version} specified is not a valid version format (ex. 1.2.3)"
285-
)
286-
return tuple(int(part) for part in version_parts) # type: ignore # We already verified there were 3 parts and they are all digits
291+
f"The {version_type} version '{version}' is not a valid version format."
292+
) from ex
293+
else:
294+
# No exception
295+
return parsed_version
287296

288297
def _stream_configs(self, manifest: Mapping[str, Any]) -> List[Dict[str, Any]]:
289298
# This has a warning flag for static, but after we finish part 4 we'll replace manifest with self._source_config

airbyte_cdk/sources/file_based/file_types/unstructured_parser.py

Lines changed: 69 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -29,16 +29,25 @@
2929
from airbyte_cdk.utils import is_cloud_environment
3030
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
3131
from unstructured.file_utils.filetype import (
32+
EXT_TO_FILETYPE,
3233
FILETYPE_TO_MIMETYPE,
3334
STR_TO_FILETYPE,
3435
FileType,
3536
detect_filetype,
3637
)
38+
import nltk
3739

3840
unstructured_partition_pdf = None
3941
unstructured_partition_docx = None
4042
unstructured_partition_pptx = None
4143

44+
try:
45+
nltk.data.find("tokenizers/punkt.zip")
46+
nltk.data.find("tokenizers/punkt_tab.zip")
47+
except LookupError:
48+
nltk.download("punkt")
49+
nltk.download("punkt_tab")
50+
4251

4352
def optional_decode(contents: Union[str, bytes]) -> str:
4453
if isinstance(contents, bytes):
@@ -108,9 +117,11 @@ async def infer_schema(
108117
format = _extract_format(config)
109118
with stream_reader.open_file(file, self.file_read_mode, None, logger) as file_handle:
110119
filetype = self._get_filetype(file_handle, file)
111-
112120
if filetype not in self._supported_file_types() and not format.skip_unprocessable_files:
113-
raise self._create_parse_error(file, self._get_file_type_error_message(filetype))
121+
raise self._create_parse_error(
122+
file,
123+
self._get_file_type_error_message(filetype),
124+
)
114125

115126
return {
116127
"content": {
@@ -159,6 +170,10 @@ def parse_records(
159170
logger.warn(f"File {file.uri} cannot be parsed. Skipping it.")
160171
else:
161172
raise e
173+
except Exception as e:
174+
exception_str = str(e)
175+
logger.error(f"File {file.uri} caused an error during parsing: {exception_str}.")
176+
raise e
162177

163178
def _read_file(
164179
self,
@@ -176,20 +191,32 @@ def _read_file(
176191
# check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
177192
raise Exception("unstructured library is not available")
178193

179-
filetype = self._get_filetype(file_handle, remote_file)
194+
filetype: FileType | None = self._get_filetype(file_handle, remote_file)
180195

181-
if filetype == FileType.MD or filetype == FileType.TXT:
196+
if filetype is None or filetype not in self._supported_file_types():
197+
raise self._create_parse_error(
198+
remote_file,
199+
self._get_file_type_error_message(filetype),
200+
)
201+
if filetype in {FileType.MD, FileType.TXT}:
182202
file_content: bytes = file_handle.read()
183203
decoded_content: str = optional_decode(file_content)
184204
return decoded_content
185-
if filetype not in self._supported_file_types():
186-
raise self._create_parse_error(remote_file, self._get_file_type_error_message(filetype))
187205
if format.processing.mode == "local":
188-
return self._read_file_locally(file_handle, filetype, format.strategy, remote_file)
206+
return self._read_file_locally(
207+
file_handle,
208+
filetype,
209+
format.strategy,
210+
remote_file,
211+
)
189212
elif format.processing.mode == "api":
190213
try:
191214
result: str = self._read_file_remotely_with_retries(
192-
file_handle, format.processing, filetype, format.strategy, remote_file
215+
file_handle,
216+
format.processing,
217+
filetype,
218+
format.strategy,
219+
remote_file,
193220
)
194221
except Exception as e:
195222
# If a parser error happens while processing the file remotely, this means the file is corrupted. This case is handled by the parse_records method, so just rethrow.
@@ -336,7 +363,11 @@ def _read_file_locally(
336363

337364
return self._render_markdown([element.to_dict() for element in elements])
338365

339-
def _create_parse_error(self, remote_file: RemoteFile, message: str) -> RecordParseError:
366+
def _create_parse_error(
367+
self,
368+
remote_file: RemoteFile,
369+
message: str,
370+
) -> RecordParseError:
340371
return RecordParseError(
341372
FileBasedSourceError.ERROR_PARSING_RECORD, filename=remote_file.uri, message=message
342373
)
@@ -360,32 +391,51 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT
360391
# detect_filetype is either using the file name or file content
361392
# if possible, try to leverage the file name to detect the file type
362393
# if the file name is not available, use the file content
363-
file_type = detect_filetype(
364-
filename=remote_file.uri,
365-
)
366-
if file_type is not None and not file_type == FileType.UNK:
394+
file_type: FileType | None = None
395+
try:
396+
file_type = detect_filetype(
397+
filename=remote_file.uri,
398+
)
399+
except Exception:
400+
# Path doesn't exist locally. Try something else...
401+
pass
402+
403+
if file_type and file_type != FileType.UNK:
367404
return file_type
368405

369406
type_based_on_content = detect_filetype(file=file)
407+
file.seek(0) # detect_filetype reads the file content, so we need to reset the stream position
370408

371-
# detect_filetype is reading to read the file content
372-
file.seek(0)
409+
if type_based_on_content and type_based_on_content != FileType.UNK:
410+
return type_based_on_content
373411

374-
return type_based_on_content
412+
extension = "." + remote_file.uri.split(".")[-1].lower()
413+
if extension in EXT_TO_FILETYPE:
414+
return EXT_TO_FILETYPE[extension]
415+
416+
return None
375417

376418
def _supported_file_types(self) -> List[Any]:
377419
return [FileType.MD, FileType.PDF, FileType.DOCX, FileType.PPTX, FileType.TXT]
378420

379-
def _get_file_type_error_message(self, file_type: FileType) -> str:
421+
def _get_file_type_error_message(
422+
self,
423+
file_type: FileType | None,
424+
) -> str:
380425
supported_file_types = ", ".join([str(type) for type in self._supported_file_types()])
381-
return f"File type {file_type} is not supported. Supported file types are {supported_file_types}"
426+
return f"File type {file_type or 'None'!s} is not supported. Supported file types are {supported_file_types}"
382427

383428
def _render_markdown(self, elements: List[Any]) -> str:
384429
return "\n\n".join((self._convert_to_markdown(el) for el in elements))
385430

386431
def _convert_to_markdown(self, el: Dict[str, Any]) -> str:
387432
if dpath.get(el, "type") == "Title":
388-
heading_str = "#" * (dpath.get(el, "metadata/category_depth", default=1) or 1)
433+
category_depth = dpath.get(el, "metadata/category_depth", default=1) or 1
434+
if not isinstance(category_depth, int):
435+
category_depth = (
436+
int(category_depth) if isinstance(category_depth, (str, float)) else 1
437+
)
438+
heading_str = "#" * category_depth
389439
return f"{heading_str} {dpath.get(el, 'text')}"
390440
elif dpath.get(el, "type") == "ListItem":
391441
return f"- {dpath.get(el, 'text')}"

0 commit comments

Comments
 (0)