From 5ba23e3a500d47bf1914a58a3cc3f5f70906bd65 Mon Sep 17 00:00:00 2001 From: "Jorj X. McKie" Date: Thu, 11 Jul 2024 11:44:12 -0400 Subject: [PATCH] Changes for v0.0.9 See changes.rst --- docs/src/changes.rst | 18 +++- pymupdf4llm/pymupdf4llm/__init__.py | 2 +- .../pymupdf4llm/helpers/get_text_lines.py | 91 ++++++++++++------- .../pymupdf4llm/helpers/pymupdf_rag.py | 29 ++++-- pymupdf4llm/setup.py | 2 +- 5 files changed, 101 insertions(+), 41 deletions(-) diff --git a/docs/src/changes.rst b/docs/src/changes.rst index f3616f1b..b8436151 100644 --- a/docs/src/changes.rst +++ b/docs/src/changes.rst @@ -4,6 +4,22 @@ Change Log =========================================================================== +Changes in version 0.0.9 +-------------------------- + +Fixes: +~~~~~~~ + +* `71 `_ "Unexpected results in pymupdf4llm but pymupdf works" +* `68 `_ "Issue with text extraction near footer of page" + + +Improvements: +~~~~~~~~~~~~~~ +* Improved identification of scattered text span particles. This should address most issues with out-of-sequence situations. +* We now correctly process rotated pages (see issue #68). + + Changes in version 0.0.8 -------------------------- @@ -24,7 +40,7 @@ Fixes: Improvements: ~~~~~~~~~~~~~~~~ -* Improved the algorithm dealing with vector graphics. Vector graphics are now more reliably classified as irrelevant when they are simple background for text (quite often the case for code snippets). +* Improved the algorithm dealing with vector graphics. Vector graphics are now more reliably classified as irrelevant: We now detect when "strokes" only exist in the neighborhood of the graphics boundary box border itself. This is quite often the case for code snippets. 
Changes in version 0.0.6 diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py index 2b3b7d3f..fd721dca 100644 --- a/pymupdf4llm/pymupdf4llm/__init__.py +++ b/pymupdf4llm/pymupdf4llm/__init__.py @@ -1,6 +1,6 @@ from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown -__version__ = "0.0.8" +__version__ = "0.0.9" version = __version__ version_tuple = tuple(map(int, version.split("."))) diff --git a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py index 08f96926..6b972113 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py +++ b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py @@ -28,33 +28,63 @@ def is_white(text): def get_raw_lines(textpage, clip=None, tolerance=3): - """Extract the text spans from a TextPage in a natural reading sequence. + """Extract the text spans from a TextPage in natural reading sequence. All spans roughly on the same line are joined to generate an improved line. This copes with MuPDF's algorithm that generates new lines also for spans - whose horizontal distance is larger than some hreshold. + whose horizontal distance is larger than some threshold. Result is a sorted list of line objects that consist of the recomputed line - rectangle and a sorted list of spans in that line. + boundary box and the sorted list of spans in that line. - This result can then be easily converted e.g. to plain or markdown text. + This result can then easily be converted e.g. to plain or markdown text. Args: textpage: (mandatory) TextPage object - clip: (Rect) specifies a sub-rectangle of the textpage rect (which also - may be based on some part of the original page). + clip: (Rect) specifies a sub-rectangle of the textpage rect (which in + turn may be based on a sub-rectangle of the full page). tolerance: (float) put spans on the same line if their top or bottom - coordinate differ by no mor than this value. + coordinate differ by no more than this value. 
Returns: - A sorted list of items (rect, [spans]), each representing a line. The - spans are sorted left to right, Span dictionaries have been changed - in that "bbox" is a Rect object and "line" is an integer representing - the line number of the span. This allows to detect where MuPDF has - generated line breaks to indicate large inter-span distances. + A sorted list of items (rect, [spans]), each representing one line. The + spans are sorted left to right, Span dictionaries have been changed: + - "bbox" has been converted to a Rect object + - "line" (new) the line number in TextPage.extractDICT + - "block" (new) the block number in TextPage.extractDICT + This allows to detect where MuPDF has generated line breaks to indicate + large inter-span distances. """ y_delta = tolerance # allowable vertical coordinate deviation - if clip == None: # use TextPage if not provided + + def sanitize_spans(line): + """Sort and join the spans in a re-synthesized line. + + The PDF may contain "broken" text with words cut into pieces. + This function joins spans representing the particles and sorts them + left to right. + + Arg: + A list of spans - as derived from TextPage.extractDICT() + Returns: + A list of sorted, and potentially cleaned-up spans + """ + line.sort(key=lambda s: s["bbox"].x0) # sort left to right + for i in range(len(line) - 1, 0, -1): # iterate back to front + s0 = line[i - 1] + s1 = line[i] + # "delta" depends on the font size. Spans will be joined if + # no more than 10% of the font size separates them. 
+ delta = s1["size"] * 0.1 + if s0["bbox"].x1 + delta < s1["bbox"].x0: + continue # all good: no joining needed + s0["bbox"] |= s1["bbox"] # join boundary boxes + s0["text"] += s1["text"] # join the text + del line[i] # delete the joined-in span + line[i - 1] = s0 # update the span + return line + + if clip is None: # use TextPage if not provided clip = textpage.rect # extract text blocks - if bbox is not empty blocks = [ @@ -63,40 +93,38 @@ def get_raw_lines(textpage, clip=None, tolerance=3): if b["type"] == 0 and not fitz.Rect(b["bbox"]).is_empty ] spans = [] # all spans in TextPage here - for bno, b in enumerate(blocks): - for lno, line in enumerate(b["lines"]): - lbbox = fitz.Rect(line["bbox"]) - for sno, s in enumerate(line["spans"]): - sbbox = fitz.Rect(s["bbox"]) # turn to a Rect + for bno, b in enumerate(blocks): # the numbered blocks + for lno, line in enumerate(b["lines"]): # the numbered lines + for sno, s in enumerate(line["spans"]): # the numbered spans + sbbox = fitz.Rect(s["bbox"]) # span bbox as a Rect mpoint = (sbbox.tl + sbbox.br) / 2 # middle point if mpoint not in clip: continue if is_white(s["text"]): # ignore white text continue - if s["flags"] & 1 == 1: # if a superscript, modify + if s["flags"] & 1 == 1: # if a superscript, modify bbox + # with that of the preceding or following span i = 1 if sno == 0 else sno - 1 neighbor = line["spans"][i] sbbox.y1 = neighbor["bbox"][3] s["text"] = f"[{s['text']}]" s["bbox"] = sbbox # update with the Rect version - # include line identifier to facilitate separator insertion + # include line/block numbers to facilitate separator insertion s["line"] = lno s["block"] = bno spans.append(s) - if not spans: # we may have no text at all + if not spans: # no text at all return [] - spans.sort( - key=lambda s: s["bbox"].y1 - ) # sort spans by assending bottom coord + spans.sort(key=lambda s: s["bbox"].y1) # sort spans by bottom coord nlines = [] # final result - line = [spans[0]] # collects spans with fitting 
vertical coordinate + line = [spans[0]] # collects spans with fitting vertical coordinates lrect = spans[0]["bbox"] # rectangle joined from span rectangles - for s in spans[1:]: - sbbox = s["bbox"] - sbbox0 = line[-1]["bbox"] + for s in spans[1:]: # walk through the spans + sbbox = s["bbox"] # this bbox + sbbox0 = line[-1]["bbox"] # previous bbox # if any of top or bottom coordinates are close enough, join... if ( abs(sbbox.y1 - sbbox0.y1) <= y_delta @@ -107,7 +135,7 @@ def get_raw_lines(textpage, clip=None, tolerance=3): continue # end of current line, sort its spans from left to right - line.sort(key=lambda s: s["bbox"].x0) + line = sanitize_spans(line) # append line rect and its spans to final output nlines.append([lrect, line]) @@ -116,7 +144,7 @@ def get_raw_lines(textpage, clip=None, tolerance=3): lrect = sbbox # initialize its rectangle # need to append last line in the same way - line.sort(key=lambda s: s["bbox"].x0) + line = sanitize_spans(line) nlines.append([lrect, line]) return nlines @@ -143,6 +171,7 @@ def get_text_lines( Returns: String of plain text in reading sequence. 
""" + textflags = fitz.TEXT_MEDIABOX_CLIP page.remove_rotation() prect = page.rect if not clip else fitz.Rect(clip) # area to consider @@ -151,7 +180,7 @@ def get_text_lines( # make a TextPage if required if textpage is None: if ocr is False: - tp = page.get_textpage(clip=prect, flags=fitz.TEXTFLAGS_TEXT) + tp = page.get_textpage(clip=prect, flags=textflags) else: tp = page.get_textpage_ocr(dpi=300, full=True) else: diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py index 9a317861..78c8712a 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py +++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py @@ -187,7 +187,9 @@ def to_markdown( if len(margins) == 2: margins = (0, margins[0], 0, margins[1]) if len(margins) != 4: - raise ValueError("margins must be a float or a sequence of 2 or 4 floats") + raise ValueError( + "margins must be one, two or four floats" + ) elif not all([hasattr(m, "__float__") for m in margins]): raise ValueError("margin values must be floats") @@ -195,8 +197,12 @@ def to_markdown( # document and use font sizes as header level indicators. 
if callable(hdr_info): get_header_id = hdr_info - elif hasattr(hdr_info, "get_header_id") and callable(hdr_info.get_header_id): + elif hasattr(hdr_info, "get_header_id") and callable( + hdr_info.get_header_id + ): get_header_id = hdr_info.get_header_id + elif hdr_info is False: + get_header_id = lambda s, page=None: "" else: hdr_info = IdentifyHeaders(doc) get_header_id = hdr_info.get_header_id @@ -378,7 +384,9 @@ def write_text( if ltext: text = f"{hdr_string}{prefix}{ltext}{suffix} " else: - text = f"{hdr_string}{prefix}{s['text'].strip()}{suffix} " + text = ( + f"{hdr_string}{prefix}{s['text'].strip()}{suffix} " + ) if text.startswith(bullet): text = "- " + text[1:] @@ -391,7 +399,9 @@ def write_text( code = False return ( - out_string.replace(" \n", "\n").replace(" ", " ").replace("\n\n\n", "\n\n") + out_string.replace(" \n", "\n") + .replace(" ", " ") + .replace("\n\n\n", "\n\n") ) def is_in_rects(rect, rect_list): @@ -474,6 +484,7 @@ def get_page_output(doc, pno, margins, textflags): graphics information. 
""" page = doc[pno] + page.remove_rotation() # make sure we work on rotation=0 md_string = "" if GRAPHICS_LIMIT is not None: test_paths = page.get_cdrawings() @@ -491,7 +502,9 @@ def get_page_output(doc, pno, margins, textflags): # make a TextPage for all later extractions textpage = page.get_textpage(flags=textflags, clip=clip) - img_info = [img for img in page.get_image_info() if img["bbox"] in clip] + img_info = [ + img for img in page.get_image_info() if img["bbox"] in clip + ] images = img_info[:] tables = [] graphics = [] @@ -560,7 +573,9 @@ def get_page_output(doc, pno, margins, textflags): if include is True: # this box is a significant vector graphic vg_clusters.append(bbox) - actual_paths = [p for p in paths if is_in_rects(p["rect"], vg_clusters)] + actual_paths = [ + p for p in paths if is_in_rects(p["rect"], vg_clusters) + ] vg_clusters0 = [ r @@ -620,7 +635,7 @@ def get_page_output(doc, pno, margins, textflags): # read the Table of Contents toc = doc.get_toc() - textflags = fitz.TEXT_MEDIABOX_CLIP + textflags = fitz.TEXT_MEDIABOX_CLIP | fitz.TEXT_CID_FOR_UNKNOWN_UNICODE for pno in pages: page_output, images, tables, graphics = get_page_output( doc, pno, margins, textflags diff --git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py index 1c752c4d..0f27b2db 100644 --- a/pymupdf4llm/setup.py +++ b/pymupdf4llm/setup.py @@ -17,7 +17,7 @@ setuptools.setup( name="pymupdf4llm", - version="0.0.8", + version="0.0.9", author="Artifex", author_email="support@artifex.com", description="PyMuPDF Utilities for LLM/RAG",