From eb8edfded7dd80bdfb8a68a93d96d860b8d01f1c Mon Sep 17 00:00:00 2001 From: "Jorj X. McKie" Date: Tue, 25 Nov 2025 08:57:24 -0400 Subject: [PATCH] Version 0.2.4 --- CHANGES.md | 10 ++ pdf4llm/setup.py | 2 +- pymupdf4llm/pymupdf4llm/__init__.py | 9 ++ pymupdf4llm/pymupdf4llm/helpers/check_ocr.py | 97 +--------------- .../pymupdf4llm/helpers/document_layout.py | 28 +++-- pymupdf4llm/pymupdf4llm/helpers/utils.py | 106 ++++++++++++++++++ pymupdf4llm/pymupdf4llm/versions_file.py | 2 +- pymupdf4llm/setup.py | 2 +- 8 files changed, 148 insertions(+), 108 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index f837638f..b2eeb030 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,15 @@ # Change Log +## Changes in version 0.2.4 + +### Fixes: + +* [335](https://github.com/pymupdf/RAG/issues/335) - KeyError "has_ocr_text" + +### Other Changes: + + +------ ## Changes in version 0.2.3 ### Fixes: diff --git a/pdf4llm/setup.py b/pdf4llm/setup.py index 0b4bbfe9..b3c962d6 100644 --- a/pdf4llm/setup.py +++ b/pdf4llm/setup.py @@ -6,7 +6,7 @@ with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f: readme = f.read() -version = "0.2.3" # must always equal the pymupdf4llm version +version = "0.2.4" # must always equal the pymupdf4llm version classifiers = [ "Development Status :: 5 - Production/Stable", diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py index 91a45744..a0ca993b 100644 --- a/pymupdf4llm/pymupdf4llm/__init__.py +++ b/pymupdf4llm/pymupdf4llm/__init__.py @@ -37,6 +37,7 @@ def parse_document( embed_images=False, show_progress=False, force_text=True, + use_ocr=True, ): return document_layout.parse_document( doc, @@ -50,6 +51,7 @@ def parse_document( embed_images=embed_images, show_progress=show_progress, force_text=force_text, + use_ocr=use_ocr, ) def to_markdown( @@ -72,6 +74,7 @@ def to_markdown( page_height=None, ignore_code=False, show_progress=False, + use_ocr=True, # unsupported options for pymupdf layout: **kwargs, ): @@ -89,6 +92,7 @@ def to_markdown( embed_images=embed_images, show_progress=show_progress, force_text=force_text, + use_ocr=use_ocr, ) return parsed_doc.to_markdown( header=header, @@ -99,6 +103,7 @@ def to_markdown( show_progress=show_progress, page_separators=page_separators, page_chunks=page_chunks, + use_ocr=use_ocr, ) def to_json( @@ -112,6 +117,7 @@ def to_json( embed_images=False, show_progress=False, force_text=True, + use_ocr=True, # unsupported options for pymupdf layout: **kwargs, ): @@ -125,6 +131,7 @@ def to_json( write_images=write_images, show_progress=show_progress, force_text=force_text, + use_ocr=use_ocr, ) return parsed_doc.to_json() @@ -138,6 +145,7 @@ def to_text( show_progress=False, force_text=True, ocr_dpi=400, + use_ocr=True, # unsupported options for pymupdf layout: **kwargs, ): @@ -149,6 +157,7 @@ def to_text( write_images=False, show_progress=show_progress, force_text=force_text, + use_ocr=use_ocr, ) return parsed_doc.to_text( header=header, diff --git a/pymupdf4llm/pymupdf4llm/helpers/check_ocr.py b/pymupdf4llm/pymupdf4llm/helpers/check_ocr.py index ab11ea03..aa1087a2 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/check_ocr.py +++ b/pymupdf4llm/pymupdf4llm/helpers/check_ocr.py @@ -1,7 +1,7 @@ import cv2 import numpy as np import pymupdf # PyMuPDF -from pymupdf4llm.helpers.utils import WHITE_CHARS +from pymupdf4llm.helpers.utils import WHITE_CHARS, analyze_page FLAGS = ( 0 @@ -108,11 +108,6 @@ """ -def is_white(text): - """Identify white text.""" - return WHITE_CHARS.issuperset(text) - - def get_span_ocr(page, bbox, dpi=300): """Return OCR'd span text using Tesseract. @@ -197,96 +192,6 @@ def get_page_image(page, dpi=150, covered=None): return gray, matrix, pix -def analyze_page(page, blocks=None) -> dict: - """Analyze the page for the OCR decision. - - Args: - blocks: output of page.get_text("dict") if already available - Returns: - A dict with analysis results. The area-related float values are - computed as fractions of the total covered area. - - "covered": pymupdf.Rect, page area covered by content - "img_joins": float, fraction of area of the joined images - "img_area": float, fraction of sum of image area sizes - "txt_joins": float, fraction of area of the joined text spans - "txt_area": float, fraction of sum of text span bbox area sizes - "vec_joins": float, fraction of area of the joined vector characters - "vec_area": float, fraction of sum of vector character area sizes - "chars_total": int, count of visible characters - "chars_bad": int, count of Replacement Unicode characters - "ocr_spans": int, count of text spans with 'GlyphLessFont' - - """ - chars_total = 0 - chars_bad = 0 - if blocks is None: - blocks = page.get_text( - "dict", - flags=FLAGS, - clip=pymupdf.INFINITE_RECT(), - )["blocks"] - img_rect = pymupdf.EMPTY_RECT() - txt_rect = +img_rect - vec_rect = +img_rect - img_area = 0 - txt_area = 0 - vec_area = 0 - ocr_spans = 0 - for b in blocks: - bbox = page.rect & b["bbox"] - area = bbox.width * bbox.height - if not area: - continue - if b["type"] == 1: # Image block - img_rect |= bbox - img_area += area - elif b["type"] == 0: # Text block - for l in b["lines"]: - for s in l["spans"]: - if is_white(s["text"]): - continue - sr = page.rect & s["bbox"] - if sr.is_empty or sr.is_infinite: - continue - if s["font"] == "GlyphLessFont": - ocr_spans += 1 - elif s["alpha"] == 0: - continue # skip invisible text - chars_total += len(s["text"].strip()) - chars_bad += len([c for c in s["text"] if c == chr(0xFFFD)]) - txt_rect |= sr - txt_area += sr.width * sr.height - elif ( - 1 - and b["type"] == 3 # vector block - and b["stroked"] # has been stroked - and bbox.width <= 20 # width limit for typical characters - and bbox.height <= 20 # height limit for typical characters - and not b["isrect"] # contains curves - ): - # potential character-like vector block - vec_rect |= bbox - vec_area += area - - # the rectangle on page covered by some content - covered = img_rect | txt_rect | vec_rect - cover_area = abs(covered) - analysis = { - "covered": covered, - "img_joins": (abs(img_rect) / cover_area) if cover_area else 0, - "img_area": img_area / cover_area if cover_area else 0, - "txt_joins": (abs(txt_rect) / cover_area) if cover_area else 0, - "txt_area": txt_area / cover_area if cover_area else 0, - "vec_area": vec_area / cover_area if cover_area else 0, - "vec_joins": (abs(vec_rect) / cover_area) if cover_area else 0, - "chars_total": chars_total, - "chars_bad": chars_bad, - "ocr_spans": ocr_spans, - } - return analysis - - def should_ocr_page( page, dpi=150, diff --git a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py index b9ee48f9..9b872d12 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py +++ b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py @@ -18,7 +18,11 @@ from pymupdf4llm.helpers.progress import ProgressBar try: import cv2 - from pymupdf4llm.helpers import check_ocr + + if hasattr(cv2, "Canny"): + from pymupdf4llm.helpers import check_ocr + else: + cv2 = None except ImportError: cv2 = None @@ -777,6 +781,7 @@ def parse_document( embed_images=False, write_images=False, force_text=False, + use_ocr=True, ) -> ParsedDocument: if isinstance(doc, pymupdf.Document): mydoc = doc @@ -803,14 +808,17 @@ def parse_document( raise ValueError("Cannot both embed and write images.") document.embed_images = embed_images document.write_images = write_images - try: - reason = "OpenCV not installed" - assert cv2 is not None - reason = "Tesseract language data not found" - assert pymupdf.get_tessdata() - document.use_ocr = True - except Exception as e: - print(f"{reason}. OCR disabled.", file=INFO_MESSAGES) + if use_ocr: + try: + reason = "OpenCV not installed" + assert cv2 is not None + reason = "Tesseract language data not found" + assert pymupdf.get_tessdata() + document.use_ocr = True + except Exception as e: + print(f"OCR disabled: {reason}.") + document.use_ocr = False + else: document.use_ocr = False if pages is None: page_filter = range(mydoc.page_count) @@ -848,6 +856,8 @@ def parse_document( ) else: decision = {"should_ocr": False} + page_analysis = utils.analyze_page(page, blocks) + decision["has_ocr_text"] = page_analysis["ocr_spans"] > 0 if decision["has_ocr_text"]: # prevent MD styling if already OCR'd page_full_ocred = True diff --git a/pymupdf4llm/pymupdf4llm/helpers/utils.py b/pymupdf4llm/pymupdf4llm/helpers/utils.py index 5986c187..03f9cdf8 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/utils.py +++ b/pymupdf4llm/pymupdf4llm/helpers/utils.py @@ -46,6 +46,112 @@ + list(map(chr, range(0x25A0, 0x2600))) ) +FLAGS = ( + 0 + | pymupdf.TEXT_COLLECT_STYLES + | pymupdf.TEXT_COLLECT_VECTORS + | pymupdf.TEXT_PRESERVE_IMAGES + | pymupdf.TEXT_ACCURATE_BBOXES + # | pymupdf.TEXT_MEDIABOX_CLIP +) + +REPLACEMENT_CHARACTER = chr(0xFFFD) + + +def is_white(text): + """Identify white text.""" + return WHITE_CHARS.issuperset(text) + + +def analyze_page(page, blocks=None) -> dict: + """Analyze the page for the OCR decision. + + Args: + blocks: output of page.get_text("dict") if already available + Returns: + A dict with analysis results. The area-related float values are + computed as fractions of the total covered area. + + "covered": pymupdf.Rect, page area covered by content + "img_joins": float, fraction of area of the joined images + "img_area": float, fraction of sum of image area sizes + "txt_joins": float, fraction of area of the joined text spans + "txt_area": float, fraction of sum of text span bbox area sizes + "vec_joins": float, fraction of area of the joined vector characters + "vec_area": float, fraction of sum of vector character area sizes + "chars_total": int, count of visible characters + "chars_bad": int, count of Replacement Unicode characters + "ocr_spans": int, count of text spans with 'GlyphLessFont' + + """ + chars_total = 0 + chars_bad = 0 + if blocks is None: + blocks = page.get_text( + "dict", + flags=FLAGS, + clip=pymupdf.INFINITE_RECT(), + )["blocks"] + img_rect = pymupdf.EMPTY_RECT() + txt_rect = +img_rect + vec_rect = +img_rect + img_area = 0 + txt_area = 0 + vec_area = 0 + ocr_spans = 0 + for b in blocks: + bbox = page.rect & b["bbox"] + area = bbox.width * bbox.height + if not area: + continue + if b["type"] == 1: # Image block + img_rect |= bbox + img_area += area + elif b["type"] == 0: # Text block + for l in b["lines"]: + for s in l["spans"]: + if is_white(s["text"]): + continue + sr = page.rect & s["bbox"] + if sr.is_empty or sr.is_infinite: + continue + if s["font"] == "GlyphLessFont": + ocr_spans += 1 + elif s["alpha"] == 0: + continue # skip invisible text + chars_total += len(s["text"].strip()) + chars_bad += len([c for c in s["text"] if c == chr(0xFFFD)]) + txt_rect |= sr + txt_area += sr.width * sr.height + elif ( + 1 + and b["type"] == 3 # vector block + and b["stroked"] # has been stroked + and bbox.width <= 20 # width limit for typical characters + and bbox.height <= 20 # height limit for typical characters + and not b["isrect"] # contains curves + ): + # potential character-like vector block + vec_rect |= bbox + vec_area += area + + # the rectangle on page covered by some content + covered = img_rect | txt_rect | vec_rect + cover_area = abs(covered) + analysis = { + "covered": covered, + "img_joins": (abs(img_rect) / cover_area) if cover_area else 0, + "img_area": img_area / cover_area if cover_area else 0, + "txt_joins": (abs(txt_rect) / cover_area) if cover_area else 0, + "txt_area": txt_area / cover_area if cover_area else 0, + "vec_area": vec_area / cover_area if cover_area else 0, + "vec_joins": (abs(vec_rect) / cover_area) if cover_area else 0, + "chars_total": chars_total, + "chars_bad": chars_bad, + "ocr_spans": ocr_spans, + } + return analysis + def table_cleaner(page, blocks, tbbox): """Clean the table bbox 'tbbox'. diff --git a/pymupdf4llm/pymupdf4llm/versions_file.py b/pymupdf4llm/pymupdf4llm/versions_file.py index 2a913aab..a81d3226 100644 --- a/pymupdf4llm/pymupdf4llm/versions_file.py +++ b/pymupdf4llm/pymupdf4llm/versions_file.py @@ -1,3 +1,3 @@ # Generated file - do not edit. MINIMUM_PYMUPDF_VERSION = (1, 26, 6) -VERSION = '0.2.3' +VERSION = '0.2.4' diff --git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py index 5c225995..817afef2 100644 --- a/pymupdf4llm/setup.py +++ b/pymupdf4llm/setup.py @@ -14,7 +14,7 @@ "Topic :: Utilities", ] -version = "0.2.3" +version = "0.2.4" requires = ["pymupdf>=1.26.6", "tabulate"] text = requires[0].split("=")[1]