diff --git a/CHANGES.md b/CHANGES.md index b2eeb030..3c1d3d04 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,19 @@ # Change Log +## Changes in version 0.2.5 + +### Fixes: + +* [341](https://github.com/pymupdf/RAG/issues/341) - Broken markdown parsing for new line directly followed by 'o'... + +### Other Changes: + +* New parameter `table_format` in method `to_text()` (PyMuPDF-Layout only). This allows selecting the appearance of tables in plain text outputs. The possible values are defined in the list `tabulate.tabulate_formats`. Default is "grid". +* Installing PyMuPDF4LLM now supports including all optional dependencies in the `pip` command: `pip install --upgrade pymupdf4llm[ocr,layout]`. This will install pymupdf4llm, pymupdf, and pymupdf-layout. The "ocr" parameter - when needed - installs opencv-python for automatic OCR support in PyMuPDF-Layout mode. Combine this with parameters `--upgrade`, `--force-reinstall` or `--no-cache-dir` as necessary. +* Major rework of the heuristics that determine whether a page should be OCR'd. + +------ + ## Changes in version 0.2.4 ### Fixes: @@ -10,6 +24,7 @@ ------ + ## Changes in version 0.2.3 ### Fixes: diff --git a/pymupdf4llm/README.md b/pymupdf4llm/README.md index 00f8c719..4c838849 100644 --- a/pymupdf4llm/README.md +++ b/pymupdf4llm/README.md @@ -1,4 +1,4 @@ -# Using PyMuPDF as Data Feeder in LLM / RAG Applications +# Using PyMuPDF as a Data Feeder in LLM / RAG Applications This package converts the pages of a PDF to text in Markdown format using [PyMuPDF](https://pypi.org/project/PyMuPDF/). @@ -8,8 +8,15 @@ Header lines are identified via the font size and appropriately prefixed with on Bold, italic, mono-spaced text and code blocks are detected and formatted accordingly. Similar applies to ordered and unordered lists. -By default, all document pages are processed. If desired, a subset of pages can be specified by providing a list of 0-based page numbers. +By default, all document pages are processed. 
If desired, a subset of pages can be specified by providing a sequence of 0-based page numbers. +----- + +[PyMuPDF-Layout](https://pypi.org/project/pymupdf-layout/) is an optional extension of PyMuPDF. It offers AI-based improved page layout analysis, for instance entailing a much higher table recognition. + +Since version 0.2.0, pymupdf4llm fully supports pymupdf-layout. As part of this, output as plain text or a JSON string is also possible. In addition, every page is automatically OCR'd (based on a number of criteria) provided package [opencv-python](https://pypi.org/project/opencv-python/) is installed and Tesseract is available on the platform. + +Layout mode is activated with a simple modification of the import statements - for details, please see below. # Installation @@ -17,13 +24,42 @@ By default, all document pages are processed. If desired, a subset of pages can $ pip install -U pymupdf4llm ``` -> This command will automatically install [PyMuPDF](https://github.com/pymupdf/PyMuPDF) if required. +> This command will automatically install or upgrade [PyMuPDF](https://github.com/pymupdf/PyMuPDF) as required. + +To install all Python packages for full support of the layout feature and automatic OCR, you can use the following command version: + +```bash +$ pip install -U pymupdf4llm[ocr,layout] +``` + +This will install opencv-python and pymupdf-layout in addition to pymupdf4llm and pymupdf. + +# Execution +## Legacy Mode +For **_standard (legacy) markdown extraction_**, use the following simple script + +```python +import pymupdf4llm + +md_text = pymupdf4llm.to_markdown("input.pdf") + +# now work with the markdown text, e.g. store as a UTF8-encoded file +import pathlib +pathlib.Path("output.md").write_bytes(md_text.encode()) +``` + +Instead of the filename string as above, one can also provide a PyMuPDF `Document`. -Then in your script do: +By default, all pages in the PDF will be processed. 
If desired, the parameter `pages=` can be used to provide a sequence of zero-based page numbers to consider. + +## Layout Mode +To **_activate layout mode_**, use the following ```python +import pymupdf.layout # activate PyMuPDF-Layout in pymupdf import pymupdf4llm +# The remainder of the script is unchanged md_text = pymupdf4llm.to_markdown("input.pdf") # now work with the markdown text, e.g. store as a UTF8-encoded file @@ -31,19 +67,46 @@ import pathlib pathlib.Path("output.md").write_bytes(md_text.encode()) ``` -Instead of the filename string as above, one can also provide a PyMuPDF `Document`. By default, all pages in the PDF will be processed. If desired, the parameter `pages=[...]` can be used to provide a list of zero-based page numbers to consider. +Here are the JSON and plain text output versions. + +### JSON + +```python +import pymupdf.layout # activate PyMuPDF-Layout in pymupdf +import pymupdf4llm + +json_text = pymupdf4llm.to_json("input.pdf") + +# now work with the JSON text, e.g. store as a UTF8-encoded file +import pathlib +pathlib.Path("output.json").write_text(json_text) +``` + +### Plain Text + +```python +import pymupdf.layout # activate PyMuPDF-Layout in pymupdf +import pymupdf4llm + +plain_text = pymupdf4llm.to_text("input.pdf") + +# now work with the plain text, e.g. store as a UTF8-encoded file +import pathlib +pathlib.Path("output.txt").write_bytes(plain_text.encode()) +``` + **Feature Overview:** * Support for pages with **_multiple text columns_**. * Support for **_image and vector graphics extraction_**: - 1. Specify `pymupdf4llm.to_markdown("input.pdf", write_images=True)`. Default is `False`. - 2. Each image or vector graphic on the page will be extracted and stored as an image named `"input.pdf-pno-index.extension"` in a folder of your choice. The image `extension` can be chosen to represent a PyMuPDF-supported image format (for instance "png" or "jpg"), `pno` is the 0-based page number and `index` is some sequence number. 
- 3. The image files will have width and height equal to the values on the page. The desired resolution can be chosen via parameter `dpi` (default: `dpi=150`). - 4. Any text contained in the images or graphics will be extracted and **also become visible as part of the generated image**. This behavior can be changed via `force_text=False` (text only apears as part of the image). + 1. Specify either `write_images=True` or `embed_images=True`. Default is `False`. + 2. Images and vector graphics on the page will be stored as images named `"input.pdf-pno-index.extension"` in a folder of your choice or be embedded in the markdown text as base64-encoded strings. The image `extension` can be chosen to represent a PyMuPDF-supported image format (for instance "png" or "jpg"), `pno` is the 0-based page number and `index` is some sequence number. + 3. The image files will have width and height equal to the values on the page. The desired resolution can be chosen via parameter `dpi` (default: `dpi=150`). So this is not an actual **_extraction_** but rather rendering of the respective page area. + 4. Any standard text written in image areas will become a visible part of the generated image and otherwise be ignored. This behavior can be changed via `force_text=True` which causes the text to also become part of the output. -* Support for **page chunks**: Instead of returning one large string for the whole document, a list of dictionaries can be generated: one for each page. Specify `data = pymupdf4llm.to_markdown("input.pdf", page_chunks=True)`. Then, for instance the first item, `data[0]` will contain a dictionary for the first page with the text and some metadata. +* Support for **page chunks**: Instead of returning one large string for the whole document, a list of dictionaries can be generated: one for each page. Specify `data = pymupdf4llm.to_markdown("input.pdf", page_chunks=True)`. 
Then, for instance, the first item `data[0]` will contain a dictionary for the first page with its text and some metadata. * As a first example for directly supporting LLM / RAG consumers, this version can output **LlamaIndex documents**: @@ -57,6 +120,7 @@ Instead of the filename string as above, one can also provide a PyMuPDF `Documen # Every list item contains metadata and the markdown text of 1 page. ``` - * A LlamaIndex document essentially corresponds to Python dictionary, where the markdown text of the page is one of the dictionary values. For instance the text of the first page is the the value of `data[0].to_dict().["text"]`. + * A LlamaIndex document essentially corresponds to a Python dictionary, where the markdown text of the page is one of the dictionary values. For instance, the text of the first page is the value of `data[0].to_dict()["text"]`. * For details, please consult LlamaIndex documentation. - * Upon creation of the `LlamaMarkdownReader` all necessary LlamaIndex-related imports are executed. Required related package installations must have been done independently and will not be checked during pymupdf4llm installation. \ No newline at end of file + * Upon creation of the `LlamaMarkdownReader` all necessary LlamaIndex-related imports are executed. Required related package installations must have been done independently and will not be checked during pymupdf4llm installation. 
+ \ No newline at end of file diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py index a0ca993b..06d0221c 100644 --- a/pymupdf4llm/pymupdf4llm/__init__.py +++ b/pymupdf4llm/pymupdf4llm/__init__.py @@ -146,6 +146,7 @@ def to_text( force_text=True, ocr_dpi=400, use_ocr=True, + table_format="grid", # unsupported options for pymupdf layout: **kwargs, ): @@ -164,6 +165,7 @@ def to_text( footer=footer, ignore_code=ignore_code, show_progress=show_progress, + table_format=table_format, ) diff --git a/pymupdf4llm/pymupdf4llm/helpers/check_ocr.py b/pymupdf4llm/pymupdf4llm/helpers/check_ocr.py index aa1087a2..450d5b62 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/check_ocr.py +++ b/pymupdf4llm/pymupdf4llm/helpers/check_ocr.py @@ -107,8 +107,48 @@ -------------------------------------------------------------------------- """ +""" +Functions detecting general photos versus text-heavy images. +""" + + +def entropy_check(img_gray, threshold=4.5): + """Compute Shannon entropy of grayscale image.""" + hist = cv2.calcHist([img_gray], [0], None, [256], [0, 256]) + hist = hist.ravel() / hist.sum() + hist = hist[hist > 0] + entropy = -np.sum(hist * np.log2(hist)) + return entropy < threshold, entropy + + +def fft_check(img_gray, threshold=0.15): + """Check ratio of high-frequency energy in FFT spectrum.""" + # Downsample for speed + small = cv2.resize(img_gray, (128, 128)) + f = np.fft.fft2(small) + fshift = np.fft.fftshift(f) + magnitude = np.abs(fshift) + h, w = magnitude.shape + center = magnitude[h // 4 : 3 * h // 4, w // 4 : 3 * w // 4] + ratio = center.sum() / magnitude.sum() + return ratio < threshold, ratio -def get_span_ocr(page, bbox, dpi=300): + +def components_check(img_gray, min_components=50): + """Count connected components after thresholding.""" + _, bw = cv2.threshold(img_gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) + num_labels, _ = cv2.connectedComponents(bw) + return num_labels < min_components, num_labels + + +def 
edge_density_check(img_gray, threshold=0.01): + """Compute edge density using Canny.""" + edges = cv2.Canny(img_gray, 100, 200) + density = edges.sum() / 255.0 / edges.size + return density < threshold, density + + +def get_span_ocr(page, bbox, dpi=400): """Return OCR'd span text using Tesseract. Args: @@ -127,7 +167,7 @@ def get_span_ocr(page, bbox, dpi=300): return text -def repair_blocks(input_blocks, page): +def repair_blocks(input_blocks, page, dpi=400): """Repair text blocks with missing glyphs using OCR. TODO: Support non-linear block structure. @@ -148,7 +188,7 @@ def repair_blocks(input_blocks, page): if not REPLACEMENT_CHARACTER in span_text: continue span_text_len = len(span_text) - new_text = get_span_ocr(page, span["bbox"])[:span_text_len] + new_text = get_span_ocr(page, span["bbox"], dpi=dpi)[:span_text_len] if "chars" in span: # rebuild chars array new_chars = [] @@ -177,25 +217,48 @@ def get_page_image(page, dpi=150, covered=None): if covered is None: covered = page.rect covered = covered.irect - pix = page.get_pixmap(dpi=dpi) - matrix = pymupdf.Rect(pix.irect).torect(page.rect) - - # make a sub-pixmap of the covered area - pix_covered = pymupdf.Pixmap(pymupdf.csRGB, covered) - pix_covered.copy(pix, covered) # copy over covered area + # make a gray pixmap of the covered area + pix_covered = page.get_pixmap(colorspace=pymupdf.csGRAY, clip=covered) # convert to numpy array - img = np.frombuffer(pix_covered.samples, dtype=np.uint8).reshape( + gray = np.frombuffer(pix_covered.samples, dtype=np.uint8).reshape( pix_covered.height, pix_covered.width, pix_covered.n ) - # cv2 needs the gray image version of this - gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) - return gray, matrix, pix + photo_entropy, entropy_val = entropy_check(gray) + photo_fft, fft_val = fft_check(gray) + photo_components, comp_val = components_check(gray) + photo_edges, edge_val = edge_density_check(gray) + + # print(f"Entropy: {entropy_val:.3f} → {photo_entropy}") + # print(f"FFT ratio: 
{fft_val:.3f} → {photo_fft}") + # print(f"Components: {comp_val} → {photo_components}") + # print(f"Edge density: {edge_val:.6f} → {photo_edges}") + + # Weighted decision logic + score = 0 + if photo_components: + score += 2 + if photo_edges: + score += 2 + if photo_entropy: + score += 1 + if photo_fft: + score += 1 + # print(f"{score=}") + if score >= 3: + pix = None + matrix = pymupdf.Identity + photo = True + else: + pix = page.get_pixmap(dpi=dpi) + matrix = pymupdf.Rect(pix.irect).torect(page.rect) + photo = False + + return matrix, pix, photo def should_ocr_page( page, dpi=150, - edge_thresh=0.02, vector_thresh=0.9, image_coverage_thresh=0.9, text_readability_thresh=0.9, @@ -207,7 +270,6 @@ def should_ocr_page( Parameters: page: PyMuPDF page object dpi: DPI used for rasterization - edge_thresh: minimum edge density to suggest text presence vector_thresh: minimum number of vector paths to suggest glyph simulation image_coverage_thresh: fraction of page area covered by images to trigger OCR text_readability_thresh: fraction of readable characters to skip OCR @@ -225,7 +287,6 @@ def should_ocr_page( "has_vector_chars": False, "transform": pymupdf.Identity, "pixmap": None, - "edge_density": 0.0, } page_rect = page.rect page_area = abs(page_rect) # size of the full page @@ -279,21 +340,16 @@ def should_ocr_page( assert decision["should_ocr"] is True if not decision["has_text"]: - # Rasterize and analyze edge density - img, matrix, pix = get_page_image(page, dpi=dpi, covered=analysis["covered"]) + # Rasterize and check for photo versus text-heaviness + matrix, pix, photo = get_page_image(page, dpi=dpi, covered=analysis["covered"]) - # Analyze edge density - edges = cv2.Canny(img, 100, 200) - decision["edge_density"] = float(np.sum(edges > 0) / edges.size) - if decision["edge_density"] <= edge_thresh: + if photo: # this seems to be a non-text picture page decision["should_ocr"] = False + decision["pixmap"] = None else: decision["should_ocr"] = True 
decision["transform"] = matrix decision["pixmap"] = pix - if decision["should_ocr"]: - decision["transform"] = matrix - decision["pixmap"] = pix return decision diff --git a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py index 9b872d12..2a278332 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py +++ b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py @@ -38,7 +38,7 @@ | pymupdf.TEXT_COLLECT_VECTORS | pymupdf.TEXT_PRESERVE_IMAGES | pymupdf.TEXT_ACCURATE_BBOXES - # | pymupdf.TEXT_MEDIABOX_CLIP + | pymupdf.TEXT_MEDIABOX_CLIP ) BULLETS = tuple(utils.BULLETS) @@ -387,6 +387,9 @@ def list_item_to_md(textlines, level): This post-layout heuristics helps cover cases where more than one list item is contained in a single bbox. """ + + if not textlines: + return "" indent = " " * (level - 1) # indentation based on level line = textlines[0] x0 = line["bbox"][0] # left of first line @@ -395,7 +398,7 @@ def list_item_to_md(textlines, level): span0_text = span0["text"].strip() starter = "- " - if span0_text.startswith(BULLETS): + if utils.startswith_bullet(span0_text): span0_text = span0_text[1:].strip() line["spans"][0]["text"] = span0_text elif span0_text.endswith(".") and span0_text[:-1].isdigit(): @@ -714,10 +717,14 @@ def to_text( footer: bool = True, ignore_code: bool = False, show_progress: bool = False, + table_format: str = "grid", ) -> str: """ Serialize ParsedDocument to plain text. Optionally omit page headers or footers. 
""" + if table_format not in tabulate.tabulate_formats: + print(f"Warning: invalid table format '{table_format}', using 'grid'.") + table_format = "grid" # Flatten all text boxes into plain text output = "" if show_progress and len(self.pages) > 5: @@ -752,7 +759,7 @@ def to_text( continue if btype == "table": output += ( - tabulate.tabulate(box.table["extract"], tablefmt="grid") + tabulate.tabulate(box.table["extract"], tablefmt=table_format) + "\n\n" ) continue @@ -816,7 +823,7 @@ def parse_document( assert pymupdf.get_tessdata() document.use_ocr = True except Exception as e: - print(f"OCR disabled: {reason}.") + print(f"OCR disabled because {reason}.") document.use_ocr = False else: document.use_ocr = False @@ -842,7 +849,7 @@ def parse_document( page_filter = ProgressBar(page_filter) for pno in page_filter: page = mydoc.load_page(pno) - textpage = page.get_textpage(flags=FLAGS) + textpage = page.get_textpage(flags=FLAGS, clip=pymupdf.INFINITE_RECT()) blocks = textpage.extractDICT()["blocks"] page_full_ocred = False page_text_ocred = False @@ -851,7 +858,6 @@ def parse_document( decision = check_ocr.should_ocr_page( page, dpi=ocr_dpi, - edge_thresh=0.015, blocks=blocks, ) else: @@ -884,7 +890,7 @@ def parse_document( page.show_pdf_page(page.rect, ocr_pdf, 0) ocr_pdf.close() # discard temporary OCR PDF del ocr_pdf - textpage = page.get_textpage(flags=FLAGS) + textpage = page.get_textpage(flags=FLAGS, clip=pymupdf.INFINITE_RECT()) blocks = textpage.extractDICT()["blocks"] page_full_ocred = True else: diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py index 6b860800..aa5b2cc5 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py +++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py @@ -45,7 +45,7 @@ from pymupdf import mupdf from pymupdf4llm.helpers.get_text_lines import get_raw_lines, is_white from pymupdf4llm.helpers.multi_column import column_boxes -from pymupdf4llm.helpers.utils import BULLETS +from 
pymupdf4llm.helpers.utils import BULLETS, REPLACEMENT_CHARACTER, startswith_bullet try: from tqdm import tqdm as ProgressBar @@ -54,9 +54,6 @@ pymupdf.TOOLS.unset_quad_corrections(True) -# Characters assumed as bullets when starting a line. -bullet = tuple(BULLETS | {"- ", "* ", "> "}) - GRAPHICS_TEXT = "\n![](%s)\n" @@ -301,11 +298,15 @@ def is_significant(box, paths): def to_json(*args, **kwargs): - raise NotImplementedError("Function 'to_json' is only available in layout mode") + raise NotImplementedError( + "Function 'to_json' is only available in PyMuPDF-Layout mode" + ) def to_text(*args, **kwargs): - raise NotImplementedError("Function 'to_text' is only available in layout mode") + raise NotImplementedError( + "Function 'to_text' is only available in PyMuPDF-Layout mode" + ) def to_markdown( @@ -313,8 +314,6 @@ def to_markdown( *, pages=None, hdr_info=None, - header=None, - footer=None, write_images=False, embed_images=False, ignore_images=False, @@ -339,6 +338,7 @@ def to_markdown( show_progress=False, use_glyphs=False, ignore_alpha=False, + **kwargs, ) -> str: """Process the document and return the text of the selected pages. @@ -366,12 +366,11 @@ def to_markdown( ignore_alpha: (bool, True) ignore text with alpha = 0 (transparent). 
""" + if kwargs.keys(): + print(f"Warning - arguments ignored in legacy mode: {set(kwargs.keys())}.") + if write_images is False and embed_images is False and force_text is False: - raise ValueError("Image and text on images cannot both be suppressed.") - if header is not None: - raise NotImplementedError("Page header handling only works in layout mode") - if footer is not None: - raise NotImplementedError("Page footer handling only works in layout mode") + raise ValueError("Images and text on images cannot both be suppressed.") if embed_images is True: write_images = False image_path = "" @@ -682,7 +681,7 @@ def write_text( prev_lrect and lrect.y1 - prev_lrect.y1 > lrect.height * 1.5 or span0["text"].startswith("[") - or span0["text"].startswith(bullet) + or startswith_bullet(span0["text"]) or span0["flags"] & 1 # superscript? ): out_string += "\n" @@ -721,7 +720,7 @@ def write_text( text = f"{hdr_string}{prefix}{ltext}{suffix} " else: text = f"{hdr_string}{prefix}{s['text'].strip()}{suffix} " - if text.startswith(bullet): + if startswith_bullet(text): text = "- " + text[1:] text = text.replace(" ", " ") dist = span0["bbox"][0] - clip.x0 @@ -1169,7 +1168,7 @@ def get_page_output( while parms.md_string.startswith("\n"): parms.md_string = parms.md_string[1:] - parms.md_string = parms.md_string.replace(chr(0), chr(0xFFFD)) + parms.md_string = parms.md_string.replace(chr(0), REPLACEMENT_CHARACTER) if EXTRACT_WORDS is True: # output words in sequence compliant with Markdown text @@ -1213,12 +1212,18 @@ def get_page_output( # omit clipped text, collect styles, use accurate bounding boxes textflags = ( 0 - | mupdf.FZ_STEXT_CLIP - | mupdf.FZ_STEXT_ACCURATE_BBOXES - # | mupdf.FZ_STEXT_IGNORE_ACTUALTEXT - | 32768 # mupdf.FZ_STEXT_COLLECT_STYLES + | pymupdf.TEXT_MEDIABOX_CLIP + # | pymupdf.TEXT_ACCURATE_BBOXES + | pymupdf.TEXT_COLLECT_STYLES + ) + pymupdf.table.FLAGS = ( + 0 + | pymupdf.TEXTFLAGS_TEXT + | pymupdf.TEXT_COLLECT_STYLES + # | pymupdf.TEXT_ACCURATE_BBOXES + | 
pymupdf.TEXT_MEDIABOX_CLIP ) - # optionally replace 0xFFFD by glyph number + # optionally replace REPLACEMENT_CHARACTER by glyph number if use_glyphs: textflags |= mupdf.FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE diff --git a/pymupdf4llm/pymupdf4llm/helpers/utils.py b/pymupdf4llm/pymupdf4llm/helpers/utils.py index 03f9cdf8..f6e261a8 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/utils.py +++ b/pymupdf4llm/pymupdf4llm/helpers/utils.py @@ -21,9 +21,13 @@ ] ) -BULLETS = set( - [ +REPLACEMENT_CHARACTER = chr(0xFFFD) + +BULLETS = tuple( + { + chr(0x2A), chr(0x2D), + chr(0x3E), chr(0x6F), chr(0xB6), chr(0xB7), @@ -41,9 +45,9 @@ chr(0x2219), chr(0xF0A7), chr(0xF0B7), - chr(0xFFFD), - ] - + list(map(chr, range(0x25A0, 0x2600))) + REPLACEMENT_CHARACTER, + } + | set(map(chr, range(0x25A0, 0x2600))) ) FLAGS = ( @@ -52,10 +56,20 @@ | pymupdf.TEXT_COLLECT_VECTORS | pymupdf.TEXT_PRESERVE_IMAGES | pymupdf.TEXT_ACCURATE_BBOXES - # | pymupdf.TEXT_MEDIABOX_CLIP + | pymupdf.TEXT_MEDIABOX_CLIP ) -REPLACEMENT_CHARACTER = chr(0xFFFD) + +def startswith_bullet(text): + if not text: + return False + if not text.startswith(BULLETS): + return False + if len(text) == 1: + return True + if text[1] == " ": + return True + return False def is_white(text): @@ -81,7 +95,7 @@ def analyze_page(page, blocks=None) -> dict: "vec_area": float, fraction of sum of vector character area sizes "chars_total": int, count of visible characters "chars_bad": int, count of Replacement Unicode characters - "ocr_spans": int, count of text spans with 'GlyphLessFont' + "ocr_spans": int, count: text spans with ignored text (render mode 3) """ chars_total = 0 @@ -100,9 +114,12 @@ def analyze_page(page, blocks=None) -> dict: vec_area = 0 ocr_spans = 0 for b in blocks: + # Intersect each block bbox with the page rectangle. + # Note that this has no effect on text because of the clipping flags, + # which causes that we will not see ANY clipped text. 
bbox = page.rect & b["bbox"] area = bbox.width * bbox.height - if not area: + if not area: # skip any empty block continue if b["type"] == 1: # Image block img_rect |= bbox @@ -115,12 +132,18 @@ def analyze_page(page, blocks=None) -> dict: sr = page.rect & s["bbox"] if sr.is_empty or sr.is_infinite: continue - if s["font"] == "GlyphLessFont": + if ( + 0 + or s["font"] == "GlyphLessFont" + or (s["char_flags"] & 8 == 0 and s["char_flags"] & 16 == 0) + ): ocr_spans += 1 elif s["alpha"] == 0: continue # skip invisible text chars_total += len(s["text"].strip()) - chars_bad += len([c for c in s["text"] if c == chr(0xFFFD)]) + chars_bad += len( + [c for c in s["text"] if c == REPLACEMENT_CHARACTER] + ) txt_rect |= sr txt_area += sr.width * sr.height elif ( @@ -841,7 +864,7 @@ def outside_cell(bbox, cell): if not span_text.strip(): text += " " else: - text += prefix + span_text + suffix + text += prefix + span_text.rstrip() + suffix text = ( text.replace("$
", "$ ") .replace(" $
", "$ ") diff --git a/pymupdf4llm/pymupdf4llm/versions_file.py b/pymupdf4llm/pymupdf4llm/versions_file.py index a81d3226..f68a2df4 100644 --- a/pymupdf4llm/pymupdf4llm/versions_file.py +++ b/pymupdf4llm/pymupdf4llm/versions_file.py @@ -1,3 +1,3 @@ # Generated file - do not edit. MINIMUM_PYMUPDF_VERSION = (1, 26, 6) -VERSION = '0.2.4' +VERSION = '0.2.5' diff --git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py index 817afef2..79c08042 100644 --- a/pymupdf4llm/setup.py +++ b/pymupdf4llm/setup.py @@ -1,10 +1,7 @@ -import os import setuptools from pathlib import Path -setup_py_cwd = os.path.dirname(__file__) -with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f: - readme = f.read() +readme = Path("README.md").read_bytes().decode() classifiers = [ "Development Status :: 5 - Production/Stable", @@ -14,12 +11,16 @@ "Topic :: Utilities", ] -version = "0.2.4" -requires = ["pymupdf>=1.26.6", "tabulate"] +version = "0.2.5" +pymupdf_version = "1.26.6" +pymupdf_version_tuple = tuple(int(x) for x in pymupdf_version.split(".")) +requires = [f"pymupdf>={pymupdf_version}", "tabulate"] +extras_require = { + "ocr": ["opencv-python"], + "layout": [f"pymupdf-layout>={pymupdf_version}"], +} -text = requires[0].split("=")[1] -text = tuple(map(int, text.split("."))) -text = f"# Generated file - do not edit.\nMINIMUM_PYMUPDF_VERSION = {text}\nVERSION = '{version}'\n" +text = f"# Generated file - do not edit.\nMINIMUM_PYMUPDF_VERSION = {pymupdf_version_tuple}\nVERSION = '{version}'\n" Path("pymupdf4llm/versions_file.py").write_text(text) setuptools.setup( @@ -32,6 +33,7 @@ long_description=readme, long_description_content_type="text/markdown", install_requires=requires, + extras_require=extras_require, python_requires=">=3.10", license="Dual Licensed - GNU AFFERO GPL 3.0 or Artifex Commercial License", url="https://github.com/pymupdf/RAG",