Skip to content

Commit 4592368

Browse files
dependabot[bot], aaronsteers, aldogonzalez8
authored
chore(deps): bump nltk from 3.8.1 to 3.9.1 (#43)
Co-authored-by: Aaron Steers <[email protected]> Co-authored-by: Aldo Gonzalez <[email protected]>
1 parent e27cb81 commit 4592368

File tree

7 files changed

+181
-147
lines changed

7 files changed

+181
-147
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,4 +12,5 @@ dist
1212
.mypy_cache
1313
.venv
1414
.pytest_cache
15+
.idea
1516
**/__pycache__

airbyte_cdk/sources/file_based/file_types/unstructured_parser.py

Lines changed: 69 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -29,16 +29,25 @@
2929
from airbyte_cdk.utils import is_cloud_environment
3030
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
3131
from unstructured.file_utils.filetype import (
32+
EXT_TO_FILETYPE,
3233
FILETYPE_TO_MIMETYPE,
3334
STR_TO_FILETYPE,
3435
FileType,
3536
detect_filetype,
3637
)
38+
import nltk
3739

3840
unstructured_partition_pdf = None
3941
unstructured_partition_docx = None
4042
unstructured_partition_pptx = None
4143

44+
try:
45+
nltk.data.find("tokenizers/punkt.zip")
46+
nltk.data.find("tokenizers/punkt_tab.zip")
47+
except LookupError:
48+
nltk.download("punkt")
49+
nltk.download("punkt_tab")
50+
4251

4352
def optional_decode(contents: Union[str, bytes]) -> str:
4453
if isinstance(contents, bytes):
@@ -108,9 +117,11 @@ async def infer_schema(
108117
format = _extract_format(config)
109118
with stream_reader.open_file(file, self.file_read_mode, None, logger) as file_handle:
110119
filetype = self._get_filetype(file_handle, file)
111-
112120
if filetype not in self._supported_file_types() and not format.skip_unprocessable_files:
113-
raise self._create_parse_error(file, self._get_file_type_error_message(filetype))
121+
raise self._create_parse_error(
122+
file,
123+
self._get_file_type_error_message(filetype),
124+
)
114125

115126
return {
116127
"content": {
@@ -159,6 +170,10 @@ def parse_records(
159170
logger.warn(f"File {file.uri} cannot be parsed. Skipping it.")
160171
else:
161172
raise e
173+
except Exception as e:
174+
exception_str = str(e)
175+
logger.error(f"File {file.uri} caused an error during parsing: {exception_str}.")
176+
raise e
162177

163178
def _read_file(
164179
self,
@@ -176,20 +191,32 @@ def _read_file(
176191
# check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
177192
raise Exception("unstructured library is not available")
178193

179-
filetype = self._get_filetype(file_handle, remote_file)
194+
filetype: FileType | None = self._get_filetype(file_handle, remote_file)
180195

181-
if filetype == FileType.MD or filetype == FileType.TXT:
196+
if filetype is None or filetype not in self._supported_file_types():
197+
raise self._create_parse_error(
198+
remote_file,
199+
self._get_file_type_error_message(filetype),
200+
)
201+
if filetype in {FileType.MD, FileType.TXT}:
182202
file_content: bytes = file_handle.read()
183203
decoded_content: str = optional_decode(file_content)
184204
return decoded_content
185-
if filetype not in self._supported_file_types():
186-
raise self._create_parse_error(remote_file, self._get_file_type_error_message(filetype))
187205
if format.processing.mode == "local":
188-
return self._read_file_locally(file_handle, filetype, format.strategy, remote_file)
206+
return self._read_file_locally(
207+
file_handle,
208+
filetype,
209+
format.strategy,
210+
remote_file,
211+
)
189212
elif format.processing.mode == "api":
190213
try:
191214
result: str = self._read_file_remotely_with_retries(
192-
file_handle, format.processing, filetype, format.strategy, remote_file
215+
file_handle,
216+
format.processing,
217+
filetype,
218+
format.strategy,
219+
remote_file,
193220
)
194221
except Exception as e:
195222
# If a parser error happens during remote processing of the file, this means the file is corrupted. This case is handled by the parse_records method, so just rethrow.
@@ -336,7 +363,11 @@ def _read_file_locally(
336363

337364
return self._render_markdown([element.to_dict() for element in elements])
338365

339-
def _create_parse_error(self, remote_file: RemoteFile, message: str) -> RecordParseError:
366+
def _create_parse_error(
367+
self,
368+
remote_file: RemoteFile,
369+
message: str,
370+
) -> RecordParseError:
340371
return RecordParseError(
341372
FileBasedSourceError.ERROR_PARSING_RECORD, filename=remote_file.uri, message=message
342373
)
@@ -360,32 +391,51 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT
360391
# detect_filetype is either using the file name or file content
361392
# if possible, try to leverage the file name to detect the file type
362393
# if the file name is not available, use the file content
363-
file_type = detect_filetype(
364-
filename=remote_file.uri,
365-
)
366-
if file_type is not None and not file_type == FileType.UNK:
394+
file_type: FileType | None = None
395+
try:
396+
file_type = detect_filetype(
397+
filename=remote_file.uri,
398+
)
399+
except Exception:
400+
# Path doesn't exist locally. Try something else...
401+
pass
402+
403+
if file_type and file_type != FileType.UNK:
367404
return file_type
368405

369406
type_based_on_content = detect_filetype(file=file)
407+
file.seek(0) # detect_filetype reads the file content, so we need to reset the stream position
370408

371-
# detect_filetype is reading to read the file content
372-
file.seek(0)
409+
if type_based_on_content and type_based_on_content != FileType.UNK:
410+
return type_based_on_content
373411

374-
return type_based_on_content
412+
extension = "." + remote_file.uri.split(".")[-1].lower()
413+
if extension in EXT_TO_FILETYPE:
414+
return EXT_TO_FILETYPE[extension]
415+
416+
return None
375417

376418
def _supported_file_types(self) -> List[Any]:
377419
return [FileType.MD, FileType.PDF, FileType.DOCX, FileType.PPTX, FileType.TXT]
378420

379-
def _get_file_type_error_message(self, file_type: FileType) -> str:
421+
def _get_file_type_error_message(
422+
self,
423+
file_type: FileType | None,
424+
) -> str:
380425
supported_file_types = ", ".join([str(type) for type in self._supported_file_types()])
381-
return f"File type {file_type} is not supported. Supported file types are {supported_file_types}"
426+
return f"File type {file_type or 'None'!s} is not supported. Supported file types are {supported_file_types}"
382427

383428
def _render_markdown(self, elements: List[Any]) -> str:
384429
return "\n\n".join((self._convert_to_markdown(el) for el in elements))
385430

386431
def _convert_to_markdown(self, el: Dict[str, Any]) -> str:
387432
if dpath.get(el, "type") == "Title":
388-
heading_str = "#" * (dpath.get(el, "metadata/category_depth", default=1) or 1)
433+
category_depth = dpath.get(el, "metadata/category_depth", default=1) or 1
434+
if not isinstance(category_depth, int):
435+
category_depth = (
436+
int(category_depth) if isinstance(category_depth, (str, float)) else 1
437+
)
438+
heading_str = "#" * category_depth
389439
return f"{heading_str} {dpath.get(el, 'text')}"
390440
elif dpath.get(el, "type") == "ListItem":
391441
return f"- {dpath.get(el, 'text')}"

0 commit comments

Comments
 (0)