Skip to content

Commit

Permalink
Merge pull request #72 from pymupdf/version-0.0.9
Browse files Browse the repository at this point in the history
Changes for v0.0.9
  • Loading branch information
JorjMcKie authored Jul 11, 2024
2 parents 56eba1f + 5ba23e3 commit c136f93
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 41 deletions.
18 changes: 17 additions & 1 deletion docs/src/changes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,22 @@
Change Log
===========================================================================

Changes in version 0.0.9
--------------------------

Fixes:
~~~~~~~

* `71 <https://github.com/pymupdf/RAG/issues/71>`_ "Unexpected results in pymupdf4llm but pymupdf works"
* `68 <https://github.com/pymupdf/RAG/issues/68>`_ "Issue with text extraction near footer of page"


Improvements:
~~~~~~~~~~~~~~
* Improved identification of scattered text span particles. This should address most issues with out-of-sequence situations.
* We now correctly process rotated pages (see issue #68).


Changes in version 0.0.8
--------------------------

Expand All @@ -24,7 +40,7 @@ Fixes:
Improvements:
~~~~~~~~~~~~~~~~

* Improved the algorithm dealing with vector graphics. Vector graphics are now more reliably classified as irrelevant when they are simple background for text (quite often the case for code snippets).
* Improved the algorithm dealing with vector graphics. Vector graphics are now more reliably classified as irrelevant: We now detect when "strokes" only exist in the neighborhood of the graphics boundary box border itself. This is quite often the case for code snippets.


Changes in version 0.0.6
Expand Down
2 changes: 1 addition & 1 deletion pymupdf4llm/pymupdf4llm/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown

__version__ = "0.0.8"
__version__ = "0.0.9"
version = __version__
version_tuple = tuple(map(int, version.split(".")))

Expand Down
91 changes: 60 additions & 31 deletions pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,33 +28,63 @@ def is_white(text):


def get_raw_lines(textpage, clip=None, tolerance=3):
"""Extract the text spans from a TextPage in a natural reading sequence.
"""Extract the text spans from a TextPage in natural reading sequence.
All spans roughly on the same line are joined to generate an improved line.
This copes with MuPDF's algorithm that generates new lines also for spans
whose horizontal distance is larger than some hreshold.
whose horizontal distance is larger than some threshold.
Result is a sorted list of line objects that consist of the recomputed line
rectangle and a sorted list of spans in that line.
boundary box and the sorted list of spans in that line.
This result can then be easily converted e.g. to plain or markdown text.
This result can then easily be converted e.g. to plain or markdown text.
Args:
textpage: (mandatory) TextPage object
clip: (Rect) specifies a sub-rectangle of the textpage rect (which also
may be based on some part of the original page).
clip: (Rect) specifies a sub-rectangle of the textpage rect (which in
turn may be based on a sub-rectangle of the full page).
tolerance: (float) put spans on the same line if their top or bottom
coordinate differ by no mor than this value.
coordinate differ by no more than this value.
Returns:
A sorted list of items (rect, [spans]), each representing a line. The
spans are sorted left to right, Span dictionaries have been changed
in that "bbox" is a Rect object and "line" is an integer representing
the line number of the span. This allows to detect where MuPDF has
generated line breaks to indicate large inter-span distances.
A sorted list of items (rect, [spans]), each representing one line. The
spans are sorted left to right, Span dictionaries have been changed:
- "bbox" has been converted to a Rect object
- "line" (new) the line number in TextPage.extractDICT
- "block" (new) the block number in TextPage.extractDICT
This makes it possible to detect where MuPDF has generated line breaks to indicate
large inter-span distances.
"""
y_delta = tolerance # allowable vertical coordinate deviation
if clip == None: # use TextPage if not provided

def sanitize_spans(line):
    """Sort the spans of a re-synthesized line and join adjacent fragments.

    The PDF may contain "broken" text with words cut into pieces. This
    function sorts the spans left to right and then joins a span into its
    left neighbor whenever the horizontal gap between the two is no more
    than 10% of the right span's font size (touching or overlapping spans
    are therefore joined as well).

    Args:
        line: A list of span dictionaries. Each one must provide "bbox"
            (an object with x0 / x1 attributes that supports the "|"
            union operator), "size" and "text". The list and the span
            dictionaries are modified in place.

    Returns:
        The same list object, sorted left to right, with every joined-in
        span removed.
    """
    line.sort(key=lambda s: s["bbox"].x0)  # sort left to right
    # Walk back to front: deleting index i never shifts the items that are
    # still to be visited, and chains of fragments collapse into the
    # leftmost span.
    for i in range(len(line) - 1, 0, -1):
        left = line[i - 1]
        right = line[i]
        # The tolerated gap depends on the font size of the right span.
        max_gap = right["size"] * 0.1
        if left["bbox"].x1 + max_gap < right["bbox"].x0:
            continue  # gap is wide enough: no joining needed
        # NOTE(review): only geometry is compared here; every other key of
        # the right span is discarded by the join - confirm this is intended.
        left["bbox"] |= right["bbox"]  # join boundary boxes
        left["text"] += right["text"]  # join the text
        del line[i]  # drop the joined-in span ("left" is already line[i - 1])
    return line

if clip is None: # use TextPage if not provided
clip = textpage.rect
# extract text blocks - if bbox is not empty
blocks = [
Expand All @@ -63,40 +93,38 @@ def get_raw_lines(textpage, clip=None, tolerance=3):
if b["type"] == 0 and not fitz.Rect(b["bbox"]).is_empty
]
spans = [] # all spans in TextPage here
for bno, b in enumerate(blocks):
for lno, line in enumerate(b["lines"]):
lbbox = fitz.Rect(line["bbox"])
for sno, s in enumerate(line["spans"]):
sbbox = fitz.Rect(s["bbox"]) # turn to a Rect
for bno, b in enumerate(blocks): # the numbered blocks
for lno, line in enumerate(b["lines"]): # the numbered lines
for sno, s in enumerate(line["spans"]): # the numbered spans
sbbox = fitz.Rect(s["bbox"]) # span bbox as a Rect
mpoint = (sbbox.tl + sbbox.br) / 2 # middle point
if mpoint not in clip:
continue
if is_white(s["text"]): # ignore white text
continue
if s["flags"] & 1 == 1: # if a superscript, modify
if s["flags"] & 1 == 1: # if a superscript, modify bbox
# with that of the preceding or following span
i = 1 if sno == 0 else sno - 1
neighbor = line["spans"][i]
sbbox.y1 = neighbor["bbox"][3]
s["text"] = f"[{s['text']}]"
s["bbox"] = sbbox # update with the Rect version
# include line identifier to facilitate separator insertion
# include line/block numbers to facilitate separator insertion
s["line"] = lno
s["block"] = bno
spans.append(s)

if not spans: # we may have no text at all
if not spans: # no text at all
return []

spans.sort(
key=lambda s: s["bbox"].y1
) # sort spans by assending bottom coord
spans.sort(key=lambda s: s["bbox"].y1) # sort spans by bottom coord
nlines = [] # final result
line = [spans[0]] # collects spans with fitting vertical coordinate
line = [spans[0]] # collects spans with fitting vertical coordinates
lrect = spans[0]["bbox"] # rectangle joined from span rectangles

for s in spans[1:]:
sbbox = s["bbox"]
sbbox0 = line[-1]["bbox"]
for s in spans[1:]: # walk through the spans
sbbox = s["bbox"] # this bbox
sbbox0 = line[-1]["bbox"] # previous bbox
# if any of top or bottom coordinates are close enough, join...
if (
abs(sbbox.y1 - sbbox0.y1) <= y_delta
Expand All @@ -107,7 +135,7 @@ def get_raw_lines(textpage, clip=None, tolerance=3):
continue

# end of current line, sort its spans from left to right
line.sort(key=lambda s: s["bbox"].x0)
line = sanitize_spans(line)

# append line rect and its spans to final output
nlines.append([lrect, line])
Expand All @@ -116,7 +144,7 @@ def get_raw_lines(textpage, clip=None, tolerance=3):
lrect = sbbox # initialize its rectangle

# need to append last line in the same way
line.sort(key=lambda s: s["bbox"].x0)
line = sanitize_spans(line)
nlines.append([lrect, line])

return nlines
Expand All @@ -143,6 +171,7 @@ def get_text_lines(
Returns:
String of plain text in reading sequence.
"""
textflags = fitz.TEXT_MEDIABOX_CLIP
page.remove_rotation()
prect = page.rect if not clip else fitz.Rect(clip) # area to consider

Expand All @@ -151,7 +180,7 @@ def get_text_lines(
# make a TextPage if required
if textpage is None:
if ocr is False:
tp = page.get_textpage(clip=prect, flags=fitz.TEXTFLAGS_TEXT)
tp = page.get_textpage(clip=prect, flags=textflags)
else:
tp = page.get_textpage_ocr(dpi=300, full=True)
else:
Expand Down
29 changes: 22 additions & 7 deletions pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,16 +187,22 @@ def to_markdown(
if len(margins) == 2:
margins = (0, margins[0], 0, margins[1])
if len(margins) != 4:
raise ValueError("margins must be a float or a sequence of 2 or 4 floats")
raise ValueError(
"margins must be one, two or four floats"
)
elif not all([hasattr(m, "__float__") for m in margins]):
raise ValueError("margin values must be floats")

# If "hdr_info" is not an object having method "get_header_id", scan the
# document and use font sizes as header level indicators.
if callable(hdr_info):
get_header_id = hdr_info
elif hasattr(hdr_info, "get_header_id") and callable(hdr_info.get_header_id):
elif hasattr(hdr_info, "get_header_id") and callable(
hdr_info.get_header_id
):
get_header_id = hdr_info.get_header_id
elif hdr_info is False:
get_header_id = lambda s, page=None: ""
else:
hdr_info = IdentifyHeaders(doc)
get_header_id = hdr_info.get_header_id
Expand Down Expand Up @@ -378,7 +384,9 @@ def write_text(
if ltext:
text = f"{hdr_string}{prefix}{ltext}{suffix} "
else:
text = f"{hdr_string}{prefix}{s['text'].strip()}{suffix} "
text = (
f"{hdr_string}{prefix}{s['text'].strip()}{suffix} "
)

if text.startswith(bullet):
text = "- " + text[1:]
Expand All @@ -391,7 +399,9 @@ def write_text(
code = False

return (
out_string.replace(" \n", "\n").replace(" ", " ").replace("\n\n\n", "\n\n")
out_string.replace(" \n", "\n")
.replace(" ", " ")
.replace("\n\n\n", "\n\n")
)

def is_in_rects(rect, rect_list):
Expand Down Expand Up @@ -474,6 +484,7 @@ def get_page_output(doc, pno, margins, textflags):
graphics information.
"""
page = doc[pno]
page.remove_rotation() # make sure we work on rotation=0
md_string = ""
if GRAPHICS_LIMIT is not None:
test_paths = page.get_cdrawings()
Expand All @@ -491,7 +502,9 @@ def get_page_output(doc, pno, margins, textflags):
# make a TextPage for all later extractions
textpage = page.get_textpage(flags=textflags, clip=clip)

img_info = [img for img in page.get_image_info() if img["bbox"] in clip]
img_info = [
img for img in page.get_image_info() if img["bbox"] in clip
]
images = img_info[:]
tables = []
graphics = []
Expand Down Expand Up @@ -560,7 +573,9 @@ def get_page_output(doc, pno, margins, textflags):
if include is True: # this box is a significant vector graphic
vg_clusters.append(bbox)

actual_paths = [p for p in paths if is_in_rects(p["rect"], vg_clusters)]
actual_paths = [
p for p in paths if is_in_rects(p["rect"], vg_clusters)
]

vg_clusters0 = [
r
Expand Down Expand Up @@ -620,7 +635,7 @@ def get_page_output(doc, pno, margins, textflags):

# read the Table of Contents
toc = doc.get_toc()
textflags = fitz.TEXT_MEDIABOX_CLIP
textflags = fitz.TEXT_MEDIABOX_CLIP | fitz.TEXT_CID_FOR_UNKNOWN_UNICODE
for pno in pages:
page_output, images, tables, graphics = get_page_output(
doc, pno, margins, textflags
Expand Down
2 changes: 1 addition & 1 deletion pymupdf4llm/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

setuptools.setup(
name="pymupdf4llm",
version="0.0.8",
version="0.0.9",
author="Artifex",
author_email="support@artifex.com",
description="PyMuPDF Utilities for LLM/RAG",
Expand Down

0 comments on commit c136f93

Please sign in to comment.