Merge branch 'pymupdf:main' into main

knotapun · web-flow · commit 165b67c38930 · 2025-07-10T13:54:26.000-04:00
diff --git a/changes.txt b/changes.txt
@@ -2,7 +2,7 @@ Change Log
 ==========
 
 
-**Changes in version 1.26.3**
+**Changes in version 1.26.3 (2025-07-02)**
 
 * Use MuPDF-1.26.3.
 
diff --git a/docs/document.rst b/docs/document.rst
@@ -1480,7 +1480,7 @@ For details on **embedded files** refer to Appendix 3.
 
   .. method:: delete_page(pno=-1)
 
-    PDF only: Delete a page given by its 0-based number in `-∞ < pno < page_count - 1`.
+    PDF only: Delete a page given by its 0-based number in `-∞ < pno < page_count`.
 
     * Changed in v1.18.14: support Python's `del` statement.
 
diff --git a/docs/page.rst b/docs/page.rst
@@ -62,6 +62,7 @@ In a nutshell, this is what you can do with PyMuPDF:
 :meth:`Page.annot_xrefs`           PDF only: a list of annotation (and widget) xrefs
 :meth:`Page.annots`                return a generator over the annots on the page
 :meth:`Page.apply_redactions`      PDF only: process the redactions of the page
+:meth:`Page.clip_to_rect`          PDF only: remove page content outside a rectangle
 :meth:`Page.bound`                 rectangle of the page
 :meth:`Page.cluster_drawings`      PDF only: bounding boxes of vector graphics
 :meth:`Page.delete_annot`          PDF only: delete an annotation
@@ -1961,6 +1962,16 @@ In a nutshell, this is what you can do with PyMuPDF:
 
       These changes are **permanent** and cannot be reverted.
 
+   .. method:: clip_to_rect(rect)
+
+      PDF only: Permanently remove page content outside the given rectangle. This is similar to :meth:`Page.set_cropbox`, but the page's rectangle will not be changed; only the content outside the rectangle will be removed.
+
+      :arg rect_like rect: The rectangle to clip to. Must be finite and its intersection with the page must not be empty.
+
+      The method works best for text: every character that has no intersection with the rectangle will be removed (the decision is made for each single character). For vector graphics, the method will remove all paths that have no intersection with the rectangle. For images, the method will remove all images that have no intersection with the rectangle. Vectors and images **having** an intersection with the rectangle will be kept in their entirety.
+
+      The method roughly has the same effect as if four redactions had been applied that cover the rectangle's outside.
+
    .. method:: remove_rotation()
 
       PDF only: Set page rotation to 0 while maintaining appearance and page content.
diff --git a/docs/version.rst b/docs/version.rst
@@ -1,6 +1,8 @@
+.. include:: header.rst
+
 ----
 
-This documentation covers **PyMuPDF v1.26.3**.
+This documentation covers PyMuPDF |version|.
 
 The major and minor versions of |PyMuPDF| and |MuPDF| will always be the same. Only the third qualifier (patch level) may deviate from that of |MuPDF|.
 
diff --git a/src/__init__.py b/src/__init__.py
@@ -3846,28 +3846,7 @@ def del_xml_metadata(self):
     def delete_page(self, pno: int =-1):
         """ Delete one page from a PDF.
         """
-        if not self.is_pdf:
-            raise ValueError("is no PDF")
-        if self.is_closed:
-            raise ValueError("document closed")
-
-        page_count = self.page_count
-        while pno < 0:
-            pno += page_count
-
-        if pno >= page_count:
-            raise ValueError("bad page number(s)")
-
-        # remove TOC bookmarks pointing to deleted page
-        toc = self.get_toc()
-        ol_xrefs = self.get_outline_xrefs()
-        for i, item in enumerate(toc):
-            if item[2] == pno + 1:
-                self._remove_toc_item(ol_xrefs[i])
-
-        self._remove_links_to(frozenset((pno,)))
-        self._delete_page(pno)
-        self._reset_page_refs()
+        return self.delete_pages(pno)
 
     def delete_pages(self, *args, **kw):
         """Delete pages from a PDF.
@@ -3877,6 +3856,7 @@ def delete_pages(self, *args, **kw):
             specify the first/last page to delete.
             Or a list/tuple/range object, which can contain arbitrary
             page numbers.
+            Or a single integer page number.
         """
         if not self.is_pdf:
             raise ValueError("is no PDF")
@@ -3909,12 +3889,13 @@ def delete_pages(self, *args, **kw):
                 if not f <= t < page_count:
                     raise ValueError("bad page number(s)")
                 numbers = tuple(range(f, t + 1))
+            elif isinstance(args[0], int):
+                pno = args[0]
+                while pno < 0:
+                    pno += page_count
+                numbers = (pno,)
             else:
-                r = args[0]
-                if type(r) is int:
-                    numbers = (r,)
-                else:
-                    numbers = tuple(r)
+                numbers = tuple(args[0])
 
         numbers = list(map(int, set(numbers)))  # ensure unique integers
         if numbers == []:
@@ -8725,6 +8706,16 @@ def recolor(self, components=1):
         ropts = mupdf.PdfRecolorOptions(ropt)
         mupdf.pdf_recolor_page(pdfdoc, self.number, ropts)
 
+    def clip_to_rect(self, rect):
+        """Clip away page content outside the rectangle."""
+        clip = Rect(rect)
+        if clip.is_infinite or (clip & self.rect).is_empty:
+            raise ValueError("rect must not be infinite or empty")
+        clip *= self.transformation_matrix
+        pdfpage = _as_pdf_page(self)
+        pclip = JM_rect_from_py(clip)
+        mupdf.pdf_clip_page(pdfpage, pclip)
+
     @property
     def artbox(self):
         """The ArtBox"""
diff --git a/src/utils.py b/src/utils.py
@@ -4345,7 +4345,7 @@ def remove_hidden(cont_lines):
         found_redacts = False
         for annot in page.annots():
             if annot.type[0] == mupdf.PDF_ANNOT_FILE_ATTACHMENT and attached_files:
-                annot.update_file(buffer=b" ")  # set file content to empty
+                annot.update_file(buffer_=b" ")  # set file content to empty
             if reset_responses:
                 annot.delete_responses()
             if annot.type[0] == pymupdf.PDF_ANNOT_REDACT:  # pylint: disable=no-member
diff --git a/tests/resources/test-4503.pdf b/tests/resources/test-4503.pdf
diff --git a/tests/test_4503.py b/tests/test_4503.py
@@ -0,0 +1,38 @@
+"""
+Test for issue #4503 in pymupdf:
+Correct recognition of strikeout and underline styles in text spans.
+"""
+
+import os
+import pymupdf
+from pymupdf import mupdf
+
+STRIKEOUT = mupdf.FZ_STEXT_STRIKEOUT
+UNDERLINE = mupdf.FZ_STEXT_UNDERLINE
+
+
+def test_4503():
+    """
+    Check that the text span with the specified text has the correct styling:
+    strikeout, but no underline.
+    Previously, the text was broken into multiple spans with span breaks at
+    every space, and some parts were not detected as strikeout at all.
+    """
+    scriptdir = os.path.dirname(os.path.abspath(__file__))
+    text = "the right to request the state to review and, if appropriate,"
+    filename = os.path.join(scriptdir, "resources", "test-4503.pdf")
+    doc = pymupdf.open(filename)
+    page = doc[0]
+    flags = pymupdf.TEXT_ACCURATE_BBOXES | pymupdf.TEXT_COLLECT_STYLES
+    spans = [
+        s
+        for b in page.get_text("dict", flags=flags)["blocks"]
+        for l in b["lines"]
+        for s in l["spans"]
+        if s["text"] == text
+    ]
+    assert spans, "No spans found with the specified text"
+    span = spans[0]
+
+    assert span["char_flags"] & STRIKEOUT
+    assert not span["char_flags"] & UNDERLINE
diff --git a/tests/test_clip_page.py b/tests/test_clip_page.py
@@ -0,0 +1,37 @@
+"""
+Test Page method clip_to_rect.
+"""
+
+import os
+import pymupdf
+
+
+def test_clip():
+    """
+    Clip a Page to a rectangle and confirm that no text has survived
+    that is completely outside the rectangle.
+    """
+    scriptdir = os.path.dirname(os.path.abspath(__file__))
+    rect = pymupdf.Rect(200, 200, 400, 500)
+    filename = os.path.join(scriptdir, "resources", "v110-changes.pdf")
+    doc = pymupdf.open(filename)
+    page = doc[0]
+    page.clip_to_rect(rect)  # clip the page to the rectangle
+    # capture font warning message of MuPDF
+    assert pymupdf.TOOLS.mupdf_warnings() == "bogus font ascent/descent values (0 / 0)"
+    # extract all text characters and assert that each one
+    # has a non-empty intersection with the rectangle.
+    chars = [
+        c
+        for b in page.get_text("rawdict")["blocks"]
+        for l in b["lines"]
+        for s in l["spans"]
+        for c in s["chars"]
+    ]
+    for char in chars:
+        bbox = pymupdf.Rect(char["bbox"])
+        if bbox.is_empty:
+            continue
+        assert bbox.intersects(
+            rect
+        ), f"Character '{char['c']}' at {bbox} is outside of {rect}."
diff --git a/tests/test_textextract.py b/tests/test_textextract.py
@@ -374,8 +374,9 @@ def get_all_page_from_pdf(document, last_page=None):
     
     assert texts1 == texts0
 
-    wt = pymupdf.TOOLS.mupdf_warnings()
-    assert wt == 'Actualtext with no position. Text may be lost or mispositioned.\n... repeated 434 times...'
+    if pymupdf.mupdf_version_tuple < (1, 27):
+        wt = pymupdf.TOOLS.mupdf_warnings()
+        assert wt == 'Actualtext with no position. Text may be lost or mispositioned.\n... repeated 434 times...'
 
 def test_3650():
     path = os.path.normpath(f'{__file__}/../../tests/resources/test_3650.pdf')