Skip to content

Commit 165b67c

Browse files
authored
Merge branch 'pymupdf:main' into main
2 parents 57f8ed7 + 166b007 commit 165b67c

File tree

10 files changed

+113
-33
lines changed

10 files changed

+113
-33
lines changed

changes.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ Change Log
22
==========
33

44

5-
**Changes in version 1.26.3**
5+
**Changes in version 1.26.3 (2025-07-02)**
66

77
* Use MuPDF-1.26.3.
88

docs/document.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1480,7 +1480,7 @@ For details on **embedded files** refer to Appendix 3.
14801480

14811481
.. method:: delete_page(pno=-1)
14821482

1483-
PDF only: Delete a page given by its 0-based number in `-∞ < pno < page_count - 1`.
1483+
PDF only: Delete a page given by its 0-based number in `-∞ < pno < page_count`.
14841484

14851485
* Changed in v1.18.14: support Python's `del` statement.
14861486

docs/page.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ In a nutshell, this is what you can do with PyMuPDF:
6262
:meth:`Page.annot_xrefs` PDF only: a list of annotation (and widget) xrefs
6363
:meth:`Page.annots` return a generator over the annots on the page
6464
:meth:`Page.apply_redactions` PDF only: process the redactions of the page
65+
:meth:`Page.clip_to_rect` PDF only: remove page content outside a rectangle
6566
:meth:`Page.bound` rectangle of the page
6667
:meth:`Page.cluster_drawings` PDF only: bounding boxes of vector graphics
6768
:meth:`Page.delete_annot` PDF only: delete an annotation
@@ -1961,6 +1962,16 @@ In a nutshell, this is what you can do with PyMuPDF:
19611962

19621963
These changes are **permanent** and cannot be reverted.
19631964

1965+
.. method:: clip_to_rect(rect)
1966+
1967+
PDF only: Permanently remove page content outside the given rectangle. This is similar to :meth:`Page.set_cropbox`, but the page's rectangle will not be changed; only the content outside the rectangle will be removed.
1968+
1969+
:arg rect_like rect: The rectangle to clip to. Must be finite and its intersection with the page must not be empty.
1970+
1971+
The method works best for text: every character that has no intersection with the rectangle will be removed (the decision is made per single character). For vector graphics, the method will remove all paths that have no intersection with the rectangle. For images, the method will remove all images that have no intersection with the rectangle. Vectors and images **having** an intersection with the rectangle will be kept in their entirety.
1972+
1973+
The method has roughly the same effect as applying four redactions that together cover the area outside the rectangle.
1974+
19641975
.. method:: remove_rotation()
19651976

19661977
PDF only: Set page rotation to 0 while maintaining appearance and page content.

docs/version.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1+
.. include:: header.rst
2+
13
----
24

3-
This documentation covers **PyMuPDF v1.26.3**.
5+
This documentation covers PyMuPDF |version|.
46

57
The major and minor versions of |PyMuPDF| and |MuPDF| will always be the same. Only the third qualifier (patch level) may deviate from that of |MuPDF|.
68

src/__init__.py

Lines changed: 18 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -3846,28 +3846,7 @@ def del_xml_metadata(self):
38463846
def delete_page(self, pno: int =-1):
38473847
""" Delete one page from a PDF.
38483848
"""
3849-
if not self.is_pdf:
3850-
raise ValueError("is no PDF")
3851-
if self.is_closed:
3852-
raise ValueError("document closed")
3853-
3854-
page_count = self.page_count
3855-
while pno < 0:
3856-
pno += page_count
3857-
3858-
if pno >= page_count:
3859-
raise ValueError("bad page number(s)")
3860-
3861-
# remove TOC bookmarks pointing to deleted page
3862-
toc = self.get_toc()
3863-
ol_xrefs = self.get_outline_xrefs()
3864-
for i, item in enumerate(toc):
3865-
if item[2] == pno + 1:
3866-
self._remove_toc_item(ol_xrefs[i])
3867-
3868-
self._remove_links_to(frozenset((pno,)))
3869-
self._delete_page(pno)
3870-
self._reset_page_refs()
3849+
return self.delete_pages(pno)
38713850

38723851
def delete_pages(self, *args, **kw):
38733852
"""Delete pages from a PDF.
@@ -3877,6 +3856,7 @@ def delete_pages(self, *args, **kw):
38773856
specify the first/last page to delete.
38783857
Or a list/tuple/range object, which can contain arbitrary
38793858
page numbers.
3859+
Or a single integer page number.
38803860
"""
38813861
if not self.is_pdf:
38823862
raise ValueError("is no PDF")
@@ -3909,12 +3889,13 @@ def delete_pages(self, *args, **kw):
39093889
if not f <= t < page_count:
39103890
raise ValueError("bad page number(s)")
39113891
numbers = tuple(range(f, t + 1))
3892+
elif isinstance(args[0], int):
3893+
pno = args[0]
3894+
while pno < 0:
3895+
pno += page_count
3896+
numbers = (pno,)
39123897
else:
3913-
r = args[0]
3914-
if type(r) is int:
3915-
numbers = (r,)
3916-
else:
3917-
numbers = tuple(r)
3898+
numbers = tuple(args[0])
39183899

39193900
numbers = list(map(int, set(numbers))) # ensure unique integers
39203901
if numbers == []:
@@ -8725,6 +8706,16 @@ def recolor(self, components=1):
87258706
ropts = mupdf.PdfRecolorOptions(ropt)
87268707
mupdf.pdf_recolor_page(pdfdoc, self.number, ropts)
87278708

8709+
def clip_to_rect(self, rect):
8710+
"""Clip away page content outside the rectangle."""
8711+
clip = Rect(rect)
8712+
if clip.is_infinite or (clip & self.rect).is_empty:
8713+
raise ValueError("rect must not be infinite or empty")
8714+
clip *= self.transformation_matrix
8715+
pdfpage = _as_pdf_page(self)
8716+
pclip = JM_rect_from_py(clip)
8717+
mupdf.pdf_clip_page(pdfpage, pclip)
8718+
87288719
@property
87298720
def artbox(self):
87308721
"""The ArtBox"""

src/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4345,7 +4345,7 @@ def remove_hidden(cont_lines):
43454345
found_redacts = False
43464346
for annot in page.annots():
43474347
if annot.type[0] == mupdf.PDF_ANNOT_FILE_ATTACHMENT and attached_files:
4348-
annot.update_file(buffer=b" ") # set file content to empty
4348+
annot.update_file(buffer_=b" ") # set file content to empty
43494349
if reset_responses:
43504350
annot.delete_responses()
43514351
if annot.type[0] == pymupdf.PDF_ANNOT_REDACT: # pylint: disable=no-member

tests/resources/test-4503.pdf

49.5 KB
Binary file not shown.

tests/test_4503.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
"""
2+
Test for issue #4503 in pymupdf:
3+
Correct recognition of strikeout and underline styles in text spans.
4+
"""
5+
6+
import os
7+
import pymupdf
8+
from pymupdf import mupdf
9+
10+
STRIKEOUT = mupdf.FZ_STEXT_STRIKEOUT
11+
UNDERLINE = mupdf.FZ_STEXT_UNDERLINE
12+
13+
14+
def test_4503():
15+
"""
16+
Check that the text span with the specified text has the correct styling:
17+
strikeout, but no underline.
18+
Previously, the text was broken into multiple spans with span breaks at
19+
every space, and some parts were not detected as strikeout at all.
20+
"""
21+
scriptdir = os.path.dirname(os.path.abspath(__file__))
22+
text = "the right to request the state to review and, if appropriate,"
23+
filename = os.path.join(scriptdir, "resources", "test-4503.pdf")
24+
doc = pymupdf.open(filename)
25+
page = doc[0]
26+
flags = pymupdf.TEXT_ACCURATE_BBOXES | pymupdf.TEXT_COLLECT_STYLES
27+
spans = [
28+
s
29+
for b in page.get_text("dict", flags=flags)["blocks"]
30+
for l in b["lines"]
31+
for s in l["spans"]
32+
if s["text"] == text
33+
]
34+
assert spans, "No spans found with the specified text"
35+
span = spans[0]
36+
37+
assert span["char_flags"] & STRIKEOUT
38+
assert not span["char_flags"] & UNDERLINE

tests/test_clip_page.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
"""
2+
Test Page method clip_to_rect.
3+
"""
4+
5+
import os
6+
import pymupdf
7+
8+
9+
def test_clip():
10+
"""
11+
Clip a Page to a rectangle and confirm that no text has survived
12+
that is completely outside the rectangle.
13+
"""
14+
scriptdir = os.path.dirname(os.path.abspath(__file__))
15+
rect = pymupdf.Rect(200, 200, 400, 500)
16+
filename = os.path.join(scriptdir, "resources", "v110-changes.pdf")
17+
doc = pymupdf.open(filename)
18+
page = doc[0]
19+
page.clip_to_rect(rect) # clip the page to the rectangle
20+
# capture font warning message of MuPDF
21+
assert pymupdf.TOOLS.mupdf_warnings() == "bogus font ascent/descent values (0 / 0)"
22+
# extract all text characters and assert that each one
23+
# has a non-empty intersection with the rectangle.
24+
chars = [
25+
c
26+
for b in page.get_text("rawdict")["blocks"]
27+
for l in b["lines"]
28+
for s in l["spans"]
29+
for c in s["chars"]
30+
]
31+
for char in chars:
32+
bbox = pymupdf.Rect(char["bbox"])
33+
if bbox.is_empty:
34+
continue
35+
assert bbox.intersects(
36+
rect
37+
), f"Character '{char['c']}' at {bbox} is outside of {rect}."

tests/test_textextract.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -374,8 +374,9 @@ def get_all_page_from_pdf(document, last_page=None):
374374

375375
assert texts1 == texts0
376376

377-
wt = pymupdf.TOOLS.mupdf_warnings()
378-
assert wt == 'Actualtext with no position. Text may be lost or mispositioned.\n... repeated 434 times...'
377+
if pymupdf.mupdf_version_tuple < (1, 27):
378+
wt = pymupdf.TOOLS.mupdf_warnings()
379+
assert wt == 'Actualtext with no position. Text may be lost or mispositioned.\n... repeated 434 times...'
379380

380381
def test_3650():
381382
path = os.path.normpath(f'{__file__}/../../tests/resources/test_3650.pdf')

0 commit comments

Comments
 (0)