Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
# Change Log

## Changes in version 0.2.4

### Fixes:

* [335](https://github.com/pymupdf/RAG/issues/335) - KeyError "has_ocr_text"

### Other Changes:


------
## Changes in version 0.2.3

### Fixes:
Expand Down
2 changes: 1 addition & 1 deletion pdf4llm/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f:
readme = f.read()

version = "0.2.3" # must always equal the pymupdf4llm version
version = "0.2.4" # must always equal the pymupdf4llm version

classifiers = [
"Development Status :: 5 - Production/Stable",
Expand Down
9 changes: 9 additions & 0 deletions pymupdf4llm/pymupdf4llm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def parse_document(
embed_images=False,
show_progress=False,
force_text=True,
use_ocr=True,
):
return document_layout.parse_document(
doc,
Expand All @@ -50,6 +51,7 @@ def parse_document(
embed_images=embed_images,
show_progress=show_progress,
force_text=force_text,
use_ocr=use_ocr,
)

def to_markdown(
Expand All @@ -72,6 +74,7 @@ def to_markdown(
page_height=None,
ignore_code=False,
show_progress=False,
use_ocr=True,
# unsupported options for pymupdf layout:
**kwargs,
):
Expand All @@ -89,6 +92,7 @@ def to_markdown(
embed_images=embed_images,
show_progress=show_progress,
force_text=force_text,
use_ocr=use_ocr,
)
return parsed_doc.to_markdown(
header=header,
Expand All @@ -99,6 +103,7 @@ def to_markdown(
show_progress=show_progress,
page_separators=page_separators,
page_chunks=page_chunks,
use_ocr=use_ocr,
)

def to_json(
Expand All @@ -112,6 +117,7 @@ def to_json(
embed_images=False,
show_progress=False,
force_text=True,
use_ocr=True,
# unsupported options for pymupdf layout:
**kwargs,
):
Expand All @@ -125,6 +131,7 @@ def to_json(
write_images=write_images,
show_progress=show_progress,
force_text=force_text,
use_ocr=use_ocr,
)
return parsed_doc.to_json()

Expand All @@ -138,6 +145,7 @@ def to_text(
show_progress=False,
force_text=True,
ocr_dpi=400,
use_ocr=True,
# unsupported options for pymupdf layout:
**kwargs,
):
Expand All @@ -149,6 +157,7 @@ def to_text(
write_images=False,
show_progress=show_progress,
force_text=force_text,
use_ocr=use_ocr,
)
return parsed_doc.to_text(
header=header,
Expand Down
97 changes: 1 addition & 96 deletions pymupdf4llm/pymupdf4llm/helpers/check_ocr.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import cv2
import numpy as np
import pymupdf # PyMuPDF
from pymupdf4llm.helpers.utils import WHITE_CHARS
from pymupdf4llm.helpers.utils import WHITE_CHARS, analyze_page

FLAGS = (
0
Expand Down Expand Up @@ -108,11 +108,6 @@
"""


def is_white(text):
    """Return True if 'text' consists only of characters in WHITE_CHARS.

    An empty 'text' is reported as white.
    """
    return set(text).issubset(WHITE_CHARS)


def get_span_ocr(page, bbox, dpi=300):
"""Return OCR'd span text using Tesseract.

Expand Down Expand Up @@ -197,96 +192,6 @@ def get_page_image(page, dpi=150, covered=None):
return gray, matrix, pix


def analyze_page(page, blocks=None) -> dict:
    """Collect the page statistics used for the OCR decision.

    Args:
        page: the page to inspect; every bbox is clipped to "page.rect".
        blocks: the "blocks" list of page.get_text("dict") if the caller
            already has it. Extracted here (using FLAGS) when None.

    Returns:
        A dict with analysis results. The area-related float values are
        fractions of the total covered area (0 if nothing is covered).

        "covered": pymupdf.Rect, page area covered by content
        "img_joins": float, fraction of area of the joined images
        "img_area": float, fraction of sum of image area sizes
        "txt_joins": float, fraction of area of the joined text spans
        "txt_area": float, fraction of sum of text span bbox area sizes
        "vec_joins": float, fraction of area of the joined vector characters
        "vec_area": float, fraction of sum of vector character area sizes
        "chars_total": int, count of characters in the counted spans
        "chars_bad": int, count of Replacement Unicode characters
        "ocr_spans": int, count of text spans with 'GlyphLessFont'
    """
    if blocks is None:
        page_dict = page.get_text(
            "dict",
            flags=FLAGS,
            clip=pymupdf.INFINITE_RECT(),
        )
        blocks = page_dict["blocks"]

    # Joined rectangles start out empty and grow with every accepted item.
    img_rect = pymupdf.EMPTY_RECT()
    txt_rect = pymupdf.EMPTY_RECT()
    vec_rect = pymupdf.EMPTY_RECT()
    img_area = txt_area = vec_area = 0
    chars_total = chars_bad = ocr_spans = 0

    for block in blocks:
        block_rect = page.rect & block["bbox"]
        block_area = block_rect.width * block_rect.height
        if not block_area:
            continue  # nothing of this block lies on the page

        block_type = block["type"]
        if block_type == 0:  # text block
            for line in block["lines"]:
                for span in line["spans"]:
                    text = span["text"]
                    if is_white(text):
                        continue
                    span_rect = page.rect & span["bbox"]
                    if span_rect.is_empty or span_rect.is_infinite:
                        continue
                    is_ocr_span = span["font"] == "GlyphLessFont"
                    if not is_ocr_span and span["alpha"] == 0:
                        continue  # skip invisible text
                    if is_ocr_span:
                        ocr_spans += 1
                    chars_total += len(text.strip())
                    chars_bad += text.count(chr(0xFFFD))
                    txt_rect |= span_rect
                    txt_area += span_rect.width * span_rect.height
        elif block_type == 1:  # image block
            img_rect |= block_rect
            img_area += block_area
        elif (
            block_type == 3  # vector block
            and block["stroked"]  # has been stroked
            and block_rect.width <= 20  # width limit for typical characters
            and block_rect.height <= 20  # height limit for typical characters
            and not block["isrect"]  # contains curves
        ):
            # potential character-like vector block
            vec_rect |= block_rect
            vec_area += block_area

    # the rectangle on page covered by some content
    covered = img_rect | txt_rect | vec_rect
    cover_area = abs(covered)

    def share(value):
        """Express 'value' as a fraction of the covered area."""
        return value / cover_area if cover_area else 0

    return {
        "covered": covered,
        "img_joins": share(abs(img_rect)),
        "img_area": share(img_area),
        "txt_joins": share(abs(txt_rect)),
        "txt_area": share(txt_area),
        "vec_area": share(vec_area),
        "vec_joins": share(abs(vec_rect)),
        "chars_total": chars_total,
        "chars_bad": chars_bad,
        "ocr_spans": ocr_spans,
    }


def should_ocr_page(
page,
dpi=150,
Expand Down
28 changes: 19 additions & 9 deletions pymupdf4llm/pymupdf4llm/helpers/document_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,11 @@
from pymupdf4llm.helpers.progress import ProgressBar
try:
import cv2
from pymupdf4llm.helpers import check_ocr

if hasattr(cv2, "Canny"):
from pymupdf4llm.helpers import check_ocr
else:
cv2 = None
except ImportError:
cv2 = None

Expand Down Expand Up @@ -777,6 +781,7 @@ def parse_document(
embed_images=False,
write_images=False,
force_text=False,
use_ocr=True,
) -> ParsedDocument:
if isinstance(doc, pymupdf.Document):
mydoc = doc
Expand All @@ -803,14 +808,17 @@ def parse_document(
raise ValueError("Cannot both embed and write images.")
document.embed_images = embed_images
document.write_images = write_images
try:
reason = "OpenCV not installed"
assert cv2 is not None
reason = "Tesseract language data not found"
assert pymupdf.get_tessdata()
document.use_ocr = True
except Exception as e:
print(f"{reason}. OCR disabled.", file=INFO_MESSAGES)
if use_ocr:
try:
reason = "OpenCV not installed"
assert cv2 is not None
reason = "Tesseract language data not found"
assert pymupdf.get_tessdata()
document.use_ocr = True
except Exception as e:
print(f"OCR disabled: {reason}.")
document.use_ocr = False
else:
document.use_ocr = False
if pages is None:
page_filter = range(mydoc.page_count)
Expand Down Expand Up @@ -848,6 +856,8 @@ def parse_document(
)
else:
decision = {"should_ocr": False}
page_analysis = utils.analyze_page(page, blocks)
decision["has_ocr_text"] = page_analysis["ocr_spans"] > 0

if decision["has_ocr_text"]: # prevent MD styling if already OCR'd
page_full_ocred = True
Expand Down
106 changes: 106 additions & 0 deletions pymupdf4llm/pymupdf4llm/helpers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,112 @@
+ list(map(chr, range(0x25A0, 0x2600)))
)

# Text extraction flags that analyze_page() passes to page.get_text("dict")
# when the caller does not hand in already extracted blocks.
FLAGS = (
0
| pymupdf.TEXT_COLLECT_STYLES
| pymupdf.TEXT_COLLECT_VECTORS
| pymupdf.TEXT_PRESERVE_IMAGES
| pymupdf.TEXT_ACCURATE_BBOXES
# | pymupdf.TEXT_MEDIABOX_CLIP
)

# U+FFFD, the Unicode replacement character; analyze_page() reports the
# number of these characters as "chars_bad".
REPLACEMENT_CHARACTER = chr(0xFFFD)


def is_white(text):
    """Tell whether 'text' is made up of WHITE_CHARS members only.

    Returns True for an empty 'text'.
    """
    return all(char in WHITE_CHARS for char in text)


def analyze_page(page, blocks=None) -> dict:
    """Analyze the page for the OCR decision.

    Args:
        page: the page to analyze. All block and span rectangles are
            intersected with "page.rect" before being measured.
        blocks: the "blocks" list of page.get_text("dict") if already
            available. When None, it is extracted here using FLAGS.

    Returns:
        A dict with analysis results. The area-related float values are
        computed as fractions of the total covered area; they are 0 when
        the covered area is empty.

        "covered": pymupdf.Rect, page area covered by content
        "img_joins": float, fraction of area of the joined images
        "img_area": float, fraction of sum of image area sizes
        "txt_joins": float, fraction of area of the joined text spans
        "txt_area": float, fraction of sum of text span bbox area sizes
        "vec_joins": float, fraction of area of the joined vector characters
        "vec_area": float, fraction of sum of vector character area sizes
        "chars_total": int, count of characters in the counted text spans
        "chars_bad": int, count of Replacement Unicode characters
        "ocr_spans": int, count of text spans with 'GlyphLessFont'

    """
    chars_total = 0
    chars_bad = 0
    if blocks is None:
        blocks = page.get_text(
            "dict",
            flags=FLAGS,
            clip=pymupdf.INFINITE_RECT(),
        )["blocks"]

    # The page rectangle does not change while we iterate: fetch it once
    # instead of once per block and once per span.
    page_rect = page.rect

    img_rect = pymupdf.EMPTY_RECT()
    txt_rect = +img_rect  # unary plus makes an independent copy
    vec_rect = +img_rect
    img_area = 0
    txt_area = 0
    vec_area = 0
    ocr_spans = 0
    for b in blocks:
        bbox = page_rect & b["bbox"]
        area = bbox.width * bbox.height
        if not area:  # block has no extent on the page
            continue
        if b["type"] == 1:  # Image block
            img_rect |= bbox
            img_area += area
        elif b["type"] == 0:  # Text block
            for line in b["lines"]:
                for s in line["spans"]:
                    text = s["text"]
                    if is_white(text):
                        continue
                    sr = page_rect & s["bbox"]
                    if sr.is_empty or sr.is_infinite:
                        continue
                    if s["font"] == "GlyphLessFont":
                        # counted as OCR text, whatever its opacity is
                        ocr_spans += 1
                    elif s["alpha"] == 0:
                        continue  # skip invisible text
                    chars_total += len(text.strip())
                    # Use the module constant and str.count() rather than
                    # building a throwaway list for every span.
                    chars_bad += text.count(REPLACEMENT_CHARACTER)
                    txt_rect |= sr
                    txt_area += sr.width * sr.height
        elif (
            b["type"] == 3  # vector block
            and b["stroked"]  # has been stroked
            and bbox.width <= 20  # width limit for typical characters
            and bbox.height <= 20  # height limit for typical characters
            and not b["isrect"]  # contains curves
        ):
            # potential character-like vector block
            vec_rect |= bbox
            vec_area += area

    # the rectangle on page covered by some content
    covered = img_rect | txt_rect | vec_rect
    cover_area = abs(covered)
    analysis = {
        "covered": covered,
        "img_joins": (abs(img_rect) / cover_area) if cover_area else 0,
        "img_area": img_area / cover_area if cover_area else 0,
        "txt_joins": (abs(txt_rect) / cover_area) if cover_area else 0,
        "txt_area": txt_area / cover_area if cover_area else 0,
        "vec_area": vec_area / cover_area if cover_area else 0,
        "vec_joins": (abs(vec_rect) / cover_area) if cover_area else 0,
        "chars_total": chars_total,
        "chars_bad": chars_bad,
        "ocr_spans": ocr_spans,
    }
    return analysis


def table_cleaner(page, blocks, tbbox):
"""Clean the table bbox 'tbbox'.
Expand Down
2 changes: 1 addition & 1 deletion pymupdf4llm/pymupdf4llm/versions_file.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Generated file - do not edit.
MINIMUM_PYMUPDF_VERSION = (1, 26, 6)
VERSION = '0.2.3'
VERSION = '0.2.4'
2 changes: 1 addition & 1 deletion pymupdf4llm/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
"Topic :: Utilities",
]

version = "0.2.3"
version = "0.2.4"
requires = ["pymupdf>=1.26.6", "tabulate"]

text = requires[0].split("=")[1]
Expand Down