2929from airbyte_cdk .utils import is_cloud_environment
3030from airbyte_cdk .utils .traced_exception import AirbyteTracedException
3131from unstructured .file_utils .filetype import (
32+ EXT_TO_FILETYPE ,
3233 FILETYPE_TO_MIMETYPE ,
3334 STR_TO_FILETYPE ,
3435 FileType ,
3536 detect_filetype ,
3637)
38+ import nltk
3739
# Module-level placeholders for the partition functions of the optional
# "unstructured" library.
# NOTE(review): presumably rebound to the real partition callables by an import
# helper elsewhere in this file (not visible here) -- confirm.
unstructured_partition_pdf = None
unstructured_partition_docx = None
unstructured_partition_pptx = None

# Make sure the NLTK tokenizer data is available at import time.
# Each resource is checked on its own so that a single missing package only
# triggers a download of that package, instead of re-requesting both.
for _nltk_tokenizer in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_nltk_tokenizer}.zip")
    except LookupError:
        # nltk.data.find raises LookupError when the resource is not installed.
        nltk.download(_nltk_tokenizer)
50+
4251
4352def optional_decode (contents : Union [str , bytes ]) -> str :
4453 if isinstance (contents , bytes ):
@@ -108,9 +117,11 @@ async def infer_schema(
108117 format = _extract_format (config )
109118 with stream_reader .open_file (file , self .file_read_mode , None , logger ) as file_handle :
110119 filetype = self ._get_filetype (file_handle , file )
111-
112120 if filetype not in self ._supported_file_types () and not format .skip_unprocessable_files :
113- raise self ._create_parse_error (file , self ._get_file_type_error_message (filetype ))
121+ raise self ._create_parse_error (
122+ file ,
123+ self ._get_file_type_error_message (filetype ),
124+ )
114125
115126 return {
116127 "content" : {
@@ -159,6 +170,10 @@ def parse_records(
159170 logger .warn (f"File { file .uri } cannot be parsed. Skipping it." )
160171 else :
161172 raise e
173+ except Exception as e :
174+ exception_str = str (e )
175+ logger .error (f"File { file .uri } caused an error during parsing: { exception_str } ." )
176+ raise e
162177
163178 def _read_file (
164179 self ,
@@ -176,20 +191,32 @@ def _read_file(
176191 # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
177192 raise Exception ("unstructured library is not available" )
178193
179- filetype = self ._get_filetype (file_handle , remote_file )
194+ filetype : FileType | None = self ._get_filetype (file_handle , remote_file )
180195
181- if filetype == FileType .MD or filetype == FileType .TXT :
196+ if filetype is None or filetype not in self ._supported_file_types ():
197+ raise self ._create_parse_error (
198+ remote_file ,
199+ self ._get_file_type_error_message (filetype ),
200+ )
201+ if filetype in {FileType .MD , FileType .TXT }:
182202 file_content : bytes = file_handle .read ()
183203 decoded_content : str = optional_decode (file_content )
184204 return decoded_content
185- if filetype not in self ._supported_file_types ():
186- raise self ._create_parse_error (remote_file , self ._get_file_type_error_message (filetype ))
187205 if format .processing .mode == "local" :
188- return self ._read_file_locally (file_handle , filetype , format .strategy , remote_file )
206+ return self ._read_file_locally (
207+ file_handle ,
208+ filetype ,
209+ format .strategy ,
210+ remote_file ,
211+ )
189212 elif format .processing .mode == "api" :
190213 try :
191214 result : str = self ._read_file_remotely_with_retries (
192- file_handle , format .processing , filetype , format .strategy , remote_file
215+ file_handle ,
216+ format .processing ,
217+ filetype ,
218+ format .strategy ,
219+ remote_file ,
193220 )
194221 except Exception as e :
195222 # If a parser error happens during remotely processing the file, this means the file is corrupted. This case is handled by the parse_records method, so just rethrow.
@@ -336,7 +363,11 @@ def _read_file_locally(
336363
337364 return self ._render_markdown ([element .to_dict () for element in elements ])
338365
339- def _create_parse_error (self , remote_file : RemoteFile , message : str ) -> RecordParseError :
366+ def _create_parse_error (
367+ self ,
368+ remote_file : RemoteFile ,
369+ message : str ,
370+ ) -> RecordParseError :
340371 return RecordParseError (
341372 FileBasedSourceError .ERROR_PARSING_RECORD , filename = remote_file .uri , message = message
342373 )
@@ -360,32 +391,51 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT
360391 # detect_filetype is either using the file name or file content
361392 # if possible, try to leverage the file name to detect the file type
362393 # if the file name is not available, use the file content
363- file_type = detect_filetype (
364- filename = remote_file .uri ,
365- )
366- if file_type is not None and not file_type == FileType .UNK :
394+ file_type : FileType | None = None
395+ try :
396+ file_type = detect_filetype (
397+ filename = remote_file .uri ,
398+ )
399+ except Exception :
400+ # Path doesn't exist locally. Try something else...
401+ pass
402+
403+ if file_type and file_type != FileType .UNK :
367404 return file_type
368405
369406 type_based_on_content = detect_filetype (file = file )
407+ file .seek (0 ) # detect_filetype is reading to read the file content, so we need to reset
370408
371- # detect_filetype is reading to read the file content
372- file . seek ( 0 )
409+ if type_based_on_content and type_based_on_content != FileType . UNK :
410+ return type_based_on_content
373411
374- return type_based_on_content
412+ extension = "." + remote_file .uri .split ("." )[- 1 ].lower ()
413+ if extension in EXT_TO_FILETYPE :
414+ return EXT_TO_FILETYPE [extension ]
415+
416+ return None
375417
376418 def _supported_file_types (self ) -> List [Any ]:
377419 return [FileType .MD , FileType .PDF , FileType .DOCX , FileType .PPTX , FileType .TXT ]
378420
379- def _get_file_type_error_message (self , file_type : FileType ) -> str :
421+ def _get_file_type_error_message (
422+ self ,
423+ file_type : FileType | None ,
424+ ) -> str :
380425 supported_file_types = ", " .join ([str (type ) for type in self ._supported_file_types ()])
381- return f"File type { file_type } is not supported. Supported file types are { supported_file_types } "
426+ return f"File type { file_type or 'None' !s } is not supported. Supported file types are { supported_file_types } "
382427
383428 def _render_markdown (self , elements : List [Any ]) -> str :
384429 return "\n \n " .join ((self ._convert_to_markdown (el ) for el in elements ))
385430
386431 def _convert_to_markdown (self , el : Dict [str , Any ]) -> str :
387432 if dpath .get (el , "type" ) == "Title" :
388- heading_str = "#" * (dpath .get (el , "metadata/category_depth" , default = 1 ) or 1 )
433+ category_depth = dpath .get (el , "metadata/category_depth" , default = 1 ) or 1
434+ if not isinstance (category_depth , int ):
435+ category_depth = (
436+ int (category_depth ) if isinstance (category_depth , (str , float )) else 1
437+ )
438+ heading_str = "#" * category_depth
389439 return f"{ heading_str } { dpath .get (el , 'text' )} "
390440 elif dpath .get (el , "type" ) == "ListItem" :
391441 return f"- { dpath .get (el , 'text' )} "
0 commit comments