Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
1262f6f
remove redundant init of coordinate_system
Coniferish Dec 6, 2023
a19be27
extract links logic from _process_pdfminer_pages
Coniferish Dec 6, 2023
fb5d8bb
extract updating coordinates out from _process_pdfminer_pages and upd…
Coniferish Dec 6, 2023
b764285
minor refactor edits
Coniferish Dec 7, 2023
2e5a621
extract combining list elements out from _process_pdfminer_pages and …
Coniferish Dec 7, 2023
a419d98
create check_pdfminer_generates_pages() and add to partition routes
Coniferish Dec 11, 2023
0d07c5a
WIP
Coniferish Dec 11, 2023
0525d71
Merge branch 'main' into jj/2212-pdfminer-bug
christinestraub Dec 11, 2023
b75ad79
feat: use `pdf_text_extractable` instead of `check_pdfminer_generates…
christinestraub Dec 11, 2023
684fa0b
chore: pass changelog check
christinestraub Dec 11, 2023
ace6058
feat: revert changes for the "fast" strategy workflow
christinestraub Dec 11, 2023
b484475
test: fix unit test errors
christinestraub Dec 11, 2023
d5bdcb5
Merge branch 'main' into jj/2212-pdfminer-bug
christinestraub Dec 11, 2023
7b2b432
test: fix lint error
christinestraub Dec 11, 2023
ba64cc1
chore: update changelog
christinestraub Dec 11, 2023
151cc3d
Merge branch 'main' into jj/2212-pdfminer-bug
christinestraub Dec 11, 2023
7949906
Merge branch 'main' into jj/2212-pdfminer-bug
Coniferish Dec 12, 2023
324bbfd
feat: remove exception handler to raise an exception if PDFMiner fail…
christinestraub Dec 12, 2023
87e331d
test: fix unit test error
christinestraub Dec 12, 2023
92a10dd
Merge branch 'main' into jj/2212-pdfminer-bug
christinestraub Dec 13, 2023
e8a679b
chore: update changelog & version
christinestraub Dec 13, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.11.4-dev8
## 0.11.4-dev9

### Enhancements

Expand All @@ -16,6 +16,8 @@

### Fixes

* **Fix pdf `hi_res` partitioning failure when pdfminer fails.** Implemented logic to fall back to "inferred_layout + OCR" when pdfminer fails in the `hi_res` strategy.

## 0.11.3

### Enhancements
Expand Down
1 change: 0 additions & 1 deletion test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1055,7 +1055,6 @@ def test_partition_pdf_with_bad_color_profile():
[
("invalid-pdf-structure-pdfminer-entire-doc.pdf", "Repairing the PDF document ..."),
("invalid-pdf-structure-pdfminer-one-page.pdf", "Repairing the PDF page 2 ..."),
("failure-after-repair.pdf", "PDFMiner failed to process PDF page 26 after repairing it."),
],
)
def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_log, caplog):
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.11.4-dev8" # pragma: no cover
__version__ = "0.11.4-dev9" # pragma: no cover
2 changes: 1 addition & 1 deletion unstructured/partition/pdf_image/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def supplement_page_layout_with_ocr(
)
elif ocr_mode == OCRMode.INDIVIDUAL_BLOCKS.value:
for element in page_layout.elements:
if element.text == "":
if not element.text:
padding = env_config.IMAGE_CROP_PAD
padded_element = pad_element_bboxes(element, padding=padding)
cropped_image = image.crop(
Expand Down
38 changes: 24 additions & 14 deletions unstructured/partition/pdf_image/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,7 @@ def _partition_pdf_or_image_local(
ocr_mode: str = OCRMode.FULL_PAGE.value,
model_name: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
pdf_text_extractable: bool = False,
extract_images_in_pdf: bool = False,
extract_element_types: Optional[List[str]] = None,
image_output_dir_path: Optional[str] = None,
Expand Down Expand Up @@ -281,12 +282,14 @@ def _partition_pdf_or_image_local(
pdf_image_dpi=pdf_image_dpi,
)

# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
merged_document_layout = process_file_with_pdfminer(
inferred_document_layout,
filename,
is_image,
)
if pdf_text_extractable is True:
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
merged_document_layout = process_file_with_pdfminer(
inferred_document_layout,
filename,
)
else:
merged_document_layout = inferred_document_layout

if model_name.startswith("chipper"):
# NOTE(alan): We shouldn't do OCR with chipper
Expand All @@ -310,13 +313,14 @@ def _partition_pdf_or_image_local(
)
if hasattr(file, "seek"):
file.seek(0)

# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
merged_document_layout = process_data_with_pdfminer(
inferred_document_layout,
file,
is_image,
)
if pdf_text_extractable is True:
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
merged_document_layout = process_data_with_pdfminer(
inferred_document_layout,
file,
)
else:
merged_document_layout = inferred_document_layout

if model_name.startswith("chipper"):
# NOTE(alan): We shouldn't do OCR with chipper
Expand All @@ -339,6 +343,11 @@ def _partition_pdf_or_image_local(
kwargs["sort_mode"] = SORT_MODE_DONT

final_document_layout = clean_pdfminer_inner_elements(final_document_layout)

for page in final_document_layout.pages:
for el in page.elements:
el.text = el.text or ""

elements = document_to_element_list(
final_document_layout,
sortable=True,
Expand Down Expand Up @@ -452,7 +461,7 @@ def partition_pdf_or_image(
isinstance(el, Text) and el.text.strip() for el in extracted_elements
)
except Exception as e:
logger.error(e, exc_info=True)
logger.error(e)
logger.warning("PDF text extraction failed, skip text extraction...")

strategy = determine_pdf_or_image_strategy(
Expand All @@ -476,6 +485,7 @@ def partition_pdf_or_image(
include_page_breaks=include_page_breaks,
languages=languages,
metadata_last_modified=metadata_last_modified or last_modification_date,
pdf_text_extractable=pdf_text_extractable,
extract_images_in_pdf=extract_images_in_pdf,
extract_element_types=extract_element_types,
image_output_dir_path=image_output_dir_path,
Expand Down
9 changes: 1 addition & 8 deletions unstructured/partition/pdf_image/pdfminer_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,28 +27,21 @@
def process_file_with_pdfminer(
inferred_document_layout: "DocumentLayout",
filename: str = "",
is_image: bool = False,
) -> "DocumentLayout":
with open_filename(filename, "rb") as fp:
fp = cast(BinaryIO, fp)
inferred_document_layout = process_data_with_pdfminer(
inferred_document_layout=inferred_document_layout,
file=fp,
is_image=is_image,
)
return inferred_document_layout


def process_data_with_pdfminer(
inferred_document_layout: "DocumentLayout",
file: Optional[Union[bytes, BinaryIO]] = None,
is_image: bool = False,
) -> "DocumentLayout":
if is_image:
for page in inferred_document_layout.pages:
for el in page.elements:
el.text = el.text or ""
return inferred_document_layout
"""Process document data using PDFMiner to extract layout information."""

extracted_layouts = get_regions_by_pdfminer(file)

Expand Down
10 changes: 2 additions & 8 deletions unstructured/partition/pdf_image/pdfminer_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,14 +104,8 @@ def open_pdfminer_pages_generator(
with pikepdf.Pdf.open(error_page_data) as pdf:
pdf.save(tmp.name)
page = next(PDFPage.get_pages(open(tmp.name, "rb"))) # noqa: SIM115
try:
interpreter.process_page(page)
page_layout = device.get_result()
except Exception:
logger.warning(
f"PDFMiner failed to process PDF page {i+1} after repairing it."
)
break
interpreter.process_page(page)
page_layout = device.get_result()
i += 1
yield page, page_layout
except PSSyntaxError:
Expand Down