From a1b94ccd989c06397bc471c8b246fa0de15e0823 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Fri, 18 Jul 2025 17:29:53 -0400 Subject: [PATCH 01/12] feat: switch from pdfminer to paves This allows us to also remove PDF repair and monkey patching. --- CHANGELOG.md | 1 + requirements/extra-pdf-image.in | 3 +- requirements/extra-pdf-image.txt | 114 ++++++++---------- .../pdf_image/test_pdfminer_processing.py | 2 +- .../pdf_image/test_pdfminer_utils.py | 2 +- unstructured/partition/pdf.py | 71 +++-------- .../pdf_image/pdfminer_processing.py | 23 ++-- .../partition/pdf_image/pdfminer_utils.py | 79 +++--------- unstructured/patches/pdfminer.py | 76 ------------ 9 files changed, 104 insertions(+), 267 deletions(-) delete mode 100644 unstructured/patches/pdfminer.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 7d2d8950c4..416911c440 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## 0.18.11-dev0 ### Enhancements +- **Switch from pdfminer.six to PAVÉS** Increases robustness of PDF extraction and uses multiple CPUs when possible. No more need to patch pdfminer or repair pdfs with pikepdf. ### Features diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index b0caffbb95..df4d00ee17 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -4,8 +4,7 @@ onnx>=1.17.0 onnxruntime>=1.19.0 pdf2image -pdfminer.six -pikepdf +paves pi_heif pypdf google-cloud-vision diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 4d2ef23532..ee12284790 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -1,10 +1,6 @@ -# -# This file is autogenerated by pip-compile with Python 3.10 -# by the following command: -# -# pip-compile ./extra-pdf-image.in -# -accelerate==1.8.1 +# This file was autogenerated by uv via the following command: +# uv pip compile --no-strip-extras extra-pdf-image.in +accelerate==1.9.0 # via unstructured-inference antlr4-python3-runtime==4.9.3 # via omegaconf @@ -12,15 +8,15 @@ cachetools==5.5.2 # via google-auth certifi==2025.7.9 # via - # -c requirements/base.txt + # -c base.txt # requests cffi==1.17.1 # via - # -c requirements/base.txt + # -c base.txt # cryptography charset-normalizer==3.4.2 # via - # -c requirements/base.txt + # -c base.txt # pdfminer-six # requests coloredlogs==15.0.1 @@ -29,14 +25,12 @@ contourpy==1.3.2 # via matplotlib cryptography==45.0.5 # via - # -c requirements/base.txt + # -c base.txt # pdfminer-six cycler==0.12.1 # via matplotlib -deprecated==1.2.18 - # via pikepdf effdet==0.4.1 - # via -r ./extra-pdf-image.in + # via -r extra-pdf-image.in filelock==3.18.0 # via # huggingface-hub @@ -44,9 +38,9 @@ filelock==3.18.0 # transformers flatbuffers==25.2.10 # via onnxruntime -fonttools==4.58.5 +fonttools==4.59.0 # via matplotlib -fsspec==2025.5.1 +fsspec==2025.7.0 # via # huggingface-hub # torch @@ -57,21 +51,20 @@ google-auth==2.40.3 # google-api-core # google-cloud-vision google-cloud-vision==3.10.2 - # via -r ./extra-pdf-image.in + # via -r extra-pdf-image.in googleapis-common-protos==1.70.0 # via # google-api-core # grpcio-status grpcio==1.73.1 # via - # -c requirements/deps/constraints.txt # google-api-core # grpcio-status grpcio-status==1.73.1 # via google-api-core hf-xet==1.1.5 # via huggingface-hub -huggingface-hub==0.33.2 +huggingface-hub==0.33.4 # via # accelerate # timm @@ -82,27 +75,23 @@ humanfriendly==10.0 # via coloredlogs idna==3.10 # via - # -c requirements/base.txt + # -c base.txt # requests jinja2==3.1.6 # via 
torch kiwisolver==1.4.8 # via matplotlib -lxml==6.0.0 - # via - # -c requirements/base.txt - # pikepdf markupsafe==3.0.2 # via jinja2 matplotlib==3.10.3 # via unstructured-inference mpmath==1.3.0 # via sympy -networkx==3.4.2 +networkx==3.5 # via torch numpy==2.2.6 # via - # -c requirements/base.txt + # -c base.txt # accelerate # contourpy # matplotlib @@ -119,52 +108,52 @@ omegaconf==2.3.0 # via effdet onnx==1.18.0 # via - # -r ./extra-pdf-image.in + # -r extra-pdf-image.in # unstructured-inference -onnxruntime==1.22.0 +onnxruntime==1.22.1 # via - # -r ./extra-pdf-image.in + # -r extra-pdf-image.in # unstructured-inference opencv-python==4.12.0.88 # via unstructured-inference packaging==25.0 # via - # -c requirements/base.txt + # -c base.txt # accelerate # huggingface-hub # matplotlib # onnxruntime - # pikepdf # transformers # unstructured-pytesseract pandas==2.3.1 # via unstructured-inference +paves==0.6.1 + # via -r extra-pdf-image.in pdf2image==1.17.0 - # via -r ./extra-pdf-image.in + # via -r extra-pdf-image.in pdfminer-six==20250327 # via - # -c requirements/deps/constraints.txt - # -r ./extra-pdf-image.in + # -c ./deps/constraints.txt # unstructured-inference pi-heif==1.0.0 - # via -r ./extra-pdf-image.in -pikepdf==9.9.0 - # via -r ./extra-pdf-image.in + # via -r extra-pdf-image.in pillow==11.3.0 # via # matplotlib + # paves # pdf2image # pi-heif - # pikepdf # torchvision # unstructured-pytesseract +playa-pdf==0.6.1 + # via paves proto-plus==1.26.1 # via # google-api-core # google-cloud-vision protobuf==6.31.1 # via - # -c requirements/deps/constraints.txt + # -c ./deps/constraints.txt # google-api-core # google-cloud-vision # googleapis-common-protos @@ -174,7 +163,7 @@ protobuf==6.31.1 # proto-plus psutil==7.0.0 # via - # -c requirements/base.txt + # -c base.txt # accelerate pyasn1==0.6.1 # via @@ -186,19 +175,19 @@ pycocotools==2.0.10 # via effdet pycparser==2.22 # via - # -c requirements/base.txt + # -c base.txt # cffi pyparsing==3.2.3 # via matplotlib pypdf==5.7.0 # via - # -c requirements/base.txt - # -r ./extra-pdf-image.in + # -c base.txt + # -r extra-pdf-image.in pypdfium2==4.30.1 # via unstructured-inference python-dateutil==2.9.0.post0 # via - # -c requirements/base.txt + # -c base.txt # matplotlib # pandas python-multipart==0.0.20 @@ -214,15 +203,15 @@ pyyaml==6.0.2 # transformers rapidfuzz==3.13.0 # via - # -c requirements/base.txt + # -c base.txt # unstructured-inference regex==2024.11.6 # via - # -c requirements/base.txt + # -c base.txt # transformers requests==2.32.4 # via - # -c requirements/base.txt + # -c base.txt # google-api-core # huggingface-hub # transformers @@ -233,23 +222,25 @@ safetensors==0.5.3 # accelerate # timm # transformers -scipy==1.15.3 +scipy==1.16.0 # via unstructured-inference +setuptools==80.9.0 + # via triton six==1.17.0 # via - # -c requirements/base.txt + # -c base.txt # python-dateutil sympy==1.14.0 # via # onnxruntime # torch -timm==1.0.16 +timm==1.0.17 # via # effdet # unstructured-inference tokenizers==0.21.2 # via - # -c requirements/deps/constraints.txt + # -c ./deps/constraints.txt # transformers torch==2.7.1 # via @@ -264,30 +255,27 @@ torchvision==0.22.1 # timm tqdm==4.67.1 # via - # -c requirements/base.txt + # -c base.txt # huggingface-hub # transformers -transformers==4.53.1 +transformers==4.53.2 # via unstructured-inference +triton==3.3.1 + # via torch typing-extensions==4.14.1 # via - # -c requirements/base.txt + # -c base.txt # huggingface-hub # onnx - # pypdf # torch tzdata==2025.2 # via pandas unstructured-inference==1.0.5 - 
# via -r ./extra-pdf-image.in + # via -r extra-pdf-image.in unstructured-pytesseract==0.3.15 - # via -r ./extra-pdf-image.in + # via -r extra-pdf-image.in urllib3==2.5.0 # via - # -c requirements/base.txt - # -c requirements/deps/constraints.txt + # -c ./deps/constraints.txt + # -c base.txt # requests -wrapt==1.17.2 - # via - # -c requirements/base.txt - # deprecated diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py index 309ea1336f..3f4a7c4f07 100644 --- a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py +++ b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py @@ -2,7 +2,7 @@ import numpy as np import pytest -from pdfminer.layout import LAParams +from paves.miner import LAParams from PIL import Image from unstructured_inference.constants import Source as InferenceSource from unstructured_inference.inference.elements import ( diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_utils.py b/test_unstructured/partition/pdf_image/test_pdfminer_utils.py index 075a4e151e..2effe7eb75 100644 --- a/test_unstructured/partition/pdf_image/test_pdfminer_utils.py +++ b/test_unstructured/partition/pdf_image/test_pdfminer_utils.py @@ -1,6 +1,6 @@ from unittest.mock import MagicMock -from pdfminer.layout import LTContainer, LTTextLine +from paves.miner import LTContainer, LTTextLine from unstructured.partition.pdf_image.pdfminer_utils import extract_text_objects diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 0efe69ed03..8860c4c881 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -10,9 +10,7 @@ from typing import IO, TYPE_CHECKING, Any, Optional, cast import numpy as np -import wrapt -from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox -from pdfminer.utils import open_filename +from paves.miner import LTContainer, LTImage, LTItem, LTTextBox, resolve1 from pi_heif import register_heif_opener from PIL import Image as PILImage from pypdf import PdfReader @@ -93,19 +91,12 @@ PartitionStrategy, ) from unstructured.partition.utils.sorting import coord_has_valid_points, sort_page_elements -from unstructured.patches.pdfminer import patch_psparser from unstructured.utils import first, requires_dependencies if TYPE_CHECKING: pass -# Correct a bug that was introduced by a previous patch to -# pdfminer.six, causing needless and unsuccessful repairing of PDFs -# which were not actually broken. 
-patch_psparser() - - RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL) @@ -439,38 +430,23 @@ def _partition_pdf_with_pdfminer( """ exactly_one(filename=filename, file=file) - if filename: - with open_filename(filename, "rb") as fp: - fp = cast(IO[bytes], fp) - elements = _process_pdfminer_pages( - fp=fp, - filename=filename, - languages=languages, - metadata_last_modified=metadata_last_modified, - starting_page_number=starting_page_number, - password=password, - pdfminer_config=pdfminer_config, - **kwargs, - ) - - elif file: - elements = _process_pdfminer_pages( - fp=file, - filename=filename, - languages=languages, - metadata_last_modified=metadata_last_modified, - starting_page_number=starting_page_number, - password=password, - pdfminer_config=pdfminer_config, - **kwargs, - ) + elements = _process_pdfminer_pages( + fp=file, + filename=filename, + languages=languages, + metadata_last_modified=metadata_last_modified, + starting_page_number=starting_page_number, + password=password, + pdfminer_config=pdfminer_config, + **kwargs, + ) return elements -@requires_dependencies("pdfminer") +@requires_dependencies("paves") def _process_pdfminer_pages( - fp: IO[bytes], + fp: Optional[IO[bytes]], filename: str, metadata_last_modified: Optional[str], languages: Optional[list[str]] = None, @@ -485,7 +461,8 @@ def _process_pdfminer_pages( elements = [] for page_number, (page, page_layout) in enumerate( - open_pdfminer_pages_generator(fp, password=password, pdfminer_config=pdfminer_config), + open_pdfminer_pages_generator(fp, filename, password=password, + pdfminer_config=pdfminer_config), start=starting_page_number, ): width, height = page_layout.width, page_layout.height @@ -497,8 +474,9 @@ def _process_pdfminer_pages( width=width, height=height, ) - if page.annots: - annotation_list = get_uris(page.annots, height, coordinate_system, page_number) + annots = resolve1(page.attrs.get("Annots")) + if annots: + annotation_list = get_uris(annots, height, coordinate_system, page_number) for obj in page_layout: x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height) @@ -1030,19 +1008,6 @@ def _extract_text(item: LTItem) -> str: return "\n" -# Some pages with a ICC color space do not follow the pdf spec -# They throw an error when we call interpreter.process_page -# Since we don't need color info, we can just drop it in the pdfminer code -# See #2059 -@wrapt.patch_function_wrapper("pdfminer.pdfinterp", "PDFPageInterpreter.init_resources") -def pdfminer_interpreter_init_resources(wrapped, instance, args, kwargs): - resources = args[0] - if "ColorSpace" in resources: - del resources["ColorSpace"] - - return wrapped(resources) - - def _combine_list_elements( elements: list[Element], coordinate_system: PixelSpace | PointSpace ) -> list[Element]: diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index 8941d5022b..ceedbccafd 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -4,9 +4,7 @@ from typing import TYPE_CHECKING, Any, BinaryIO, Iterable, List, Optional, Union, cast import numpy as np -from pdfminer.layout import LTChar, LTTextBox -from pdfminer.pdftypes import PDFObjRef -from pdfminer.utils import open_filename +from paves.miner import LTChar, LTTextBox, PDFObjRef, resolve1 from unstructured_inference.config import inference_config from unstructured_inference.constants import FULL_PAGE_REGION_THRESHOLD from 
unstructured_inference.inference.elements import Rectangle @@ -43,12 +41,11 @@ def process_file_with_pdfminer( password: Optional[str] = None, pdfminer_config: Optional[PDFMinerConfig] = None, ) -> tuple[List[List["TextRegion"]], List[List]]: - with open_filename(filename, "rb") as fp: - fp = cast(BinaryIO, fp) - extracted_layout, layouts_links = process_data_with_pdfminer( - file=fp, dpi=dpi, password=password, pdfminer_config=pdfminer_config - ) - return extracted_layout, layouts_links + + extracted_layout, layouts_links = process_data_with_pdfminer( + file=None, filename=filename, dpi=dpi, password=password, pdfminer_config=pdfminer_config + ) + return extracted_layout, layouts_links def _validate_bbox(bbox: list[int | float]) -> bool: @@ -434,6 +431,7 @@ def process_page_layout_from_pdfminer( @requires_dependencies("unstructured_inference") def process_data_with_pdfminer( file: Optional[Union[bytes, BinaryIO]] = None, + filename: Optional[str] = None, dpi: int = 200, password: Optional[str] = None, pdfminer_config: Optional[PDFMinerConfig] = None, @@ -448,7 +446,7 @@ def process_data_with_pdfminer( # Coefficient to rescale bounding box to be compatible with images coef = dpi / 72 for page_number, (page, page_layout) in enumerate( - open_pdfminer_pages_generator(file, password=password, pdfminer_config=pdfminer_config) + open_pdfminer_pages_generator(file, filename, password=password, pdfminer_config=pdfminer_config) ): width, height = page_layout.width, page_layout.height @@ -457,8 +455,9 @@ def process_data_with_pdfminer( width=width, height=height, ) - if page.annots: - annotation_list = get_uris(page.annots, height, coordinate_system, page_number) + annots = resolve1(page.attrs.get("Annots")) + if annots: + annotation_list = get_uris(annots, height, coordinate_system, page_number) layout, urls_metadata = process_page_layout_from_pdfminer( annotation_list, page_layout, height, page_number, coef diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py index 3993f41ae0..8e0bbe7d7d 100644 --- a/unstructured/partition/pdf_image/pdfminer_utils.py +++ b/unstructured/partition/pdf_image/pdfminer_utils.py @@ -1,15 +1,10 @@ import os -import tempfile from typing import BinaryIO, List, Optional, Tuple -from pdfminer.converter import PDFPageAggregator -from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem, LTTextLine -from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager -from pdfminer.pdfpage import PDFPage -from pdfminer.psexceptions import PSSyntaxError +import playa +from paves.miner import LAParams, LTContainer, LTImage, LTItem, LTTextLine, extract_page from pydantic import BaseModel -from unstructured.logger import logger from unstructured.utils import requires_dependencies @@ -20,18 +15,6 @@ class PDFMinerConfig(BaseModel): char_margin: Optional[float] = None -def init_pdfminer(pdfminer_config: Optional[PDFMinerConfig] = None): - rsrcmgr = PDFResourceManager() - - laparams_kwargs = pdfminer_config.model_dump(exclude_none=True) if pdfminer_config else {} - laparams = LAParams(**laparams_kwargs) - - device = PDFPageAggregator(rsrcmgr, laparams=laparams) - interpreter = PDFPageInterpreter(rsrcmgr, device) - - return device, interpreter - - def extract_image_objects(parent_object: LTItem) -> List[LTImage]: """Recursively extracts image objects from a given parent object in a PDF document.""" objects = [] @@ -81,47 +64,25 @@ def rect_to_bbox( return (x1, y1, x2, y2) -@requires_dependencies(["pikepdf", 
"pypdf"]) +@requires_dependencies("paves") def open_pdfminer_pages_generator( - fp: BinaryIO, password: Optional[str] = None, pdfminer_config: Optional[PDFMinerConfig] = None + fp: Optional[BinaryIO] = None, filename: Optional[str] = None, + password: Optional[str] = None, pdfminer_config: Optional[PDFMinerConfig] = None ): """Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs.""" + laparams_kwargs = pdfminer_config.model_dump(exclude_none=True) if pdfminer_config else {} + laparams = LAParams(**laparams_kwargs) - import pikepdf - - from unstructured.partition.pdf_image.pypdf_utils import get_page_data - - device, interpreter = init_pdfminer(pdfminer_config=pdfminer_config) - with tempfile.TemporaryDirectory() as tmp_dir_path: - tmp_file_path = os.path.join(tmp_dir_path, "tmp_file") - try: - pages = PDFPage.get_pages(fp, password=password or "") - # Detect invalid dictionary construct for entire PDF - for i, page in enumerate(pages): - try: - # Detect invalid dictionary construct for one page - interpreter.process_page(page) - page_layout = device.get_result() - except PSSyntaxError: - logger.info("Detected invalid dictionary construct for PDFminer") - logger.info(f"Repairing the PDF page {i + 1} ...") - # find the error page from binary data fp - error_page_data = get_page_data(fp, page_number=i) - # repair the error page with pikepdf - with pikepdf.Pdf.open(error_page_data) as pdf: - pdf.save(tmp_file_path) - page = next(PDFPage.get_pages(open(tmp_file_path, "rb"))) # noqa: SIM115 - interpreter.process_page(page) - page_layout = device.get_result() - yield page, page_layout - except PSSyntaxError: - logger.info("Detected invalid dictionary construct for PDFminer") - logger.info("Repairing the PDF document ...") - # repair the entire doc with pikepdf - with pikepdf.Pdf.open(fp) as pdf: - pdf.save(tmp_file_path) - pages = PDFPage.get_pages(open(tmp_file_path, "rb")) # noqa: SIM115 - for page in pages: - interpreter.process_page(page) - page_layout = device.get_result() - yield page, page_layout + if fp is None: + from functools import partial + + assert filename + with playa.open(filename, space="page", password=password, + max_workers=min(1, os.cpu_count() // 2)) as doc: + yield from zip(doc.pages, + doc.pages.map(partial(extract_page, laparams=laparams))) + else: + doc = playa.Document(fp, space="page", password=password) + for page in doc.pages: + page_layout = extract_page(page, laparams) + yield page, page_layout diff --git a/unstructured/patches/pdfminer.py b/unstructured/patches/pdfminer.py deleted file mode 100644 index cc0c7dab21..0000000000 --- a/unstructured/patches/pdfminer.py +++ /dev/null @@ -1,76 +0,0 @@ -import functools -from typing import Tuple, Union - -import pdfminer -from pdfminer.psparser import ( - END_KEYWORD, - KWD, - PSEOF, - PSBaseParser, - PSBaseParserToken, - PSKeyword, - log, -) - -factory_seek = PSBaseParser.seek - - -@functools.wraps(PSBaseParser.seek) -def seek(self: PSBaseParser, pos: int) -> None: - factory_seek(self, pos) - self.eof = False - - -@functools.wraps(PSBaseParser._parse_keyword) -def _parse_keyword(self, s: bytes, i: int) -> int: - m = END_KEYWORD.search(s, i) - if m: - j = m.start(0) - self._curtoken += s[i:j] - else: - self._curtoken += s[i:] - return len(s) - if self._curtoken == b"true": - token: Union[bool, PSKeyword] = True - elif self._curtoken == b"false": - token = False - else: - token = KWD(self._curtoken) - self._add_token(token) - self._parse1 = self._parse_main - return j - - 
-@functools.wraps(PSBaseParser.nexttoken) -def nexttoken(self) -> Tuple[int, PSBaseParserToken]: - if self.eof: - # It's not really unexpected, come on now... - raise PSEOF("Unexpected EOF") - while not self._tokens: - try: - self.fillbuf() - self.charpos = self._parse1(self.buf, self.charpos) - except PSEOF: - # If we hit EOF in the middle of a token, try to parse - # it by tacking on whitespace, and delay raising PSEOF - # until next time around - self.charpos = self._parse1(b"\n", 0) - self.eof = True - # Oh, so there wasn't actually a token there? OK. - if not self._tokens: - raise - token = self._tokens.pop(0) - log.debug("nexttoken: %r", token) - return token - - -def patch_psparser(): - """Monkey-patch certain versions of pdfminer.six to avoid dropping - tokens at EOF (before 20231228) and splitting tokens at buffer - boundaries (20231228 and 20240706). - """ - # Presuming the bug will be fixed in the next release - if pdfminer.__version__ <= "20240706": - PSBaseParser.seek = seek - PSBaseParser._parse_keyword = _parse_keyword - PSBaseParser.nexttoken = nexttoken From ac2b2e78f6edf3c7e5e3343042814fa961d9ad43 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Sat, 19 Jul 2025 17:22:56 -0400 Subject: [PATCH 02/12] fix: manually hack deps since who knows how they get generated --- requirements/extra-pdf-image.txt | 87 ++++++++++++++++---------------- 1 file changed, 44 insertions(+), 43 deletions(-) diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index ee12284790..3fd0178b6f 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -1,6 +1,10 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile --no-strip-extras extra-pdf-image.in -accelerate==1.9.0 +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile ./extra-pdf-image.in +# +accelerate==1.8.1 # via unstructured-inference antlr4-python3-runtime==4.9.3 # via omegaconf @@ -8,15 +12,15 @@ cachetools==5.5.2 # via google-auth certifi==2025.7.9 # via - # -c base.txt + # -c requirements/base.txt # requests cffi==1.17.1 # via - # -c base.txt + # -c requirements/base.txt # cryptography charset-normalizer==3.4.2 # via - # -c base.txt + # -c requirements/base.txt # pdfminer-six # requests coloredlogs==15.0.1 @@ -25,12 +29,12 @@ contourpy==1.3.2 # via matplotlib cryptography==45.0.5 # via - # -c base.txt + # -c requirements/base.txt # pdfminer-six cycler==0.12.1 # via matplotlib effdet==0.4.1 - # via -r extra-pdf-image.in + # via -r ./extra-pdf-image.in filelock==3.18.0 # via # huggingface-hub @@ -38,9 +42,9 @@ filelock==3.18.0 # transformers flatbuffers==25.2.10 # via onnxruntime -fonttools==4.59.0 +fonttools==4.58.5 # via matplotlib -fsspec==2025.7.0 +fsspec==2025.5.1 # via # huggingface-hub # torch @@ -51,20 +55,21 @@ google-auth==2.40.3 # google-api-core # google-cloud-vision google-cloud-vision==3.10.2 - # via -r extra-pdf-image.in + # via -r ./extra-pdf-image.in googleapis-common-protos==1.70.0 # via # google-api-core # grpcio-status grpcio==1.73.1 # via + # -c requirements/deps/constraints.txt # google-api-core # grpcio-status grpcio-status==1.73.1 # via google-api-core hf-xet==1.1.5 # via huggingface-hub -huggingface-hub==0.33.4 +huggingface-hub==0.33.2 # via # accelerate # timm @@ -75,7 +80,7 @@ humanfriendly==10.0 # via coloredlogs idna==3.10 # via - # -c base.txt + # -c requirements/base.txt # requests jinja2==3.1.6 # via torch @@ -87,11 +92,11 @@ matplotlib==3.10.3 # via 
unstructured-inference mpmath==1.3.0 # via sympy -networkx==3.5 +networkx==3.4.2 # via torch numpy==2.2.6 # via - # -c base.txt + # -c requirements/base.txt # accelerate # contourpy # matplotlib @@ -108,17 +113,17 @@ omegaconf==2.3.0 # via effdet onnx==1.18.0 # via - # -r extra-pdf-image.in + # -r ./extra-pdf-image.in # unstructured-inference -onnxruntime==1.22.1 +onnxruntime==1.22.0 # via - # -r extra-pdf-image.in + # -r ./extra-pdf-image.in # unstructured-inference opencv-python==4.12.0.88 # via unstructured-inference packaging==25.0 # via - # -c base.txt + # -c requirements/base.txt # accelerate # huggingface-hub # matplotlib @@ -153,7 +158,7 @@ proto-plus==1.26.1 # google-cloud-vision protobuf==6.31.1 # via - # -c ./deps/constraints.txt + # -c requirements/deps/constraints.txt # google-api-core # google-cloud-vision # googleapis-common-protos @@ -163,7 +168,7 @@ protobuf==6.31.1 # proto-plus psutil==7.0.0 # via - # -c base.txt + # -c requirements/base.txt # accelerate pyasn1==0.6.1 # via @@ -175,19 +180,19 @@ pycocotools==2.0.10 # via effdet pycparser==2.22 # via - # -c base.txt + # -c requirements/base.txt # cffi pyparsing==3.2.3 # via matplotlib pypdf==5.7.0 # via - # -c base.txt - # -r extra-pdf-image.in + # -c requirements/base.txt + # -r ./extra-pdf-image.in pypdfium2==4.30.1 # via unstructured-inference python-dateutil==2.9.0.post0 # via - # -c base.txt + # -c requirements/base.txt # matplotlib # pandas python-multipart==0.0.20 @@ -203,15 +208,15 @@ pyyaml==6.0.2 # transformers rapidfuzz==3.13.0 # via - # -c base.txt + # -c requirements/base.txt # unstructured-inference regex==2024.11.6 # via - # -c base.txt + # -c requirements/base.txt # transformers requests==2.32.4 # via - # -c base.txt + # -c requirements/base.txt # google-api-core # huggingface-hub # transformers @@ -222,25 +227,23 @@ safetensors==0.5.3 # accelerate # timm # transformers -scipy==1.16.0 +scipy==1.15.3 # via unstructured-inference -setuptools==80.9.0 - # via triton six==1.17.0 # via - # -c base.txt + # -c requirements/base.txt # python-dateutil sympy==1.14.0 # via # onnxruntime # torch -timm==1.0.17 +timm==1.0.16 # via # effdet # unstructured-inference tokenizers==0.21.2 # via - # -c ./deps/constraints.txt + # -c requirements/deps/constraints.txt # transformers torch==2.7.1 # via @@ -255,27 +258,25 @@ torchvision==0.22.1 # timm tqdm==4.67.1 # via - # -c base.txt + # -c requirements/base.txt # huggingface-hub # transformers -transformers==4.53.2 +transformers==4.53.1 # via unstructured-inference -triton==3.3.1 - # via torch typing-extensions==4.14.1 # via - # -c base.txt + # -c requirements/base.txt # huggingface-hub # onnx # torch tzdata==2025.2 # via pandas unstructured-inference==1.0.5 - # via -r extra-pdf-image.in + # via -r ./extra-pdf-image.in unstructured-pytesseract==0.3.15 - # via -r extra-pdf-image.in + # via -r ./extra-pdf-image.in urllib3==2.5.0 # via - # -c ./deps/constraints.txt - # -c base.txt + # -c requirements/base.txt + # -c requirements/deps/constraints.txt # requests From 6cd328da49e12671797919938efc03c603c0245a Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Sat, 19 Jul 2025 17:37:57 -0400 Subject: [PATCH 03/12] chore: black and ruff --- unstructured/partition/pdf.py | 5 +++-- .../partition/pdf_image/pdfminer_processing.py | 6 ++++-- unstructured/partition/pdf_image/pdfminer_utils.py | 14 ++++++++------ 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 8860c4c881..a57a6eba27 100644 --- 
a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@@ -461,8 +461,9 @@ def _process_pdfminer_pages(
 
     elements = []
     for page_number, (page, page_layout) in enumerate(
-        open_pdfminer_pages_generator(fp, filename, password=password,
-                                      pdfminer_config=pdfminer_config),
+        open_pdfminer_pages_generator(
+            fp, filename, password=password, pdfminer_config=pdfminer_config
+        ),
         start=starting_page_number,
     ):
         width, height = page_layout.width, page_layout.height
diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py
index ceedbccafd..7c56e1e83a 100644
--- a/unstructured/partition/pdf_image/pdfminer_processing.py
+++ b/unstructured/partition/pdf_image/pdfminer_processing.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import os
-from typing import TYPE_CHECKING, Any, BinaryIO, Iterable, List, Optional, Union, cast
+from typing import TYPE_CHECKING, Any, BinaryIO, Iterable, List, Optional, Union
 
 import numpy as np
 from paves.miner import LTChar, LTTextBox, PDFObjRef, resolve1
@@ -446,7 +446,9 @@ def process_data_with_pdfminer(
     # Coefficient to rescale bounding box to be compatible with images
     coef = dpi / 72
     for page_number, (page, page_layout) in enumerate(
-        open_pdfminer_pages_generator(file, filename, password=password, pdfminer_config=pdfminer_config)
+        open_pdfminer_pages_generator(
+            file, filename, password=password, pdfminer_config=pdfminer_config
+        )
     ):
         width, height = page_layout.width, page_layout.height
 
diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py
index 8e0bbe7d7d..7c270cb693 100644
--- a/unstructured/partition/pdf_image/pdfminer_utils.py
+++ b/unstructured/partition/pdf_image/pdfminer_utils.py
@@ -66,8 +66,10 @@ def rect_to_bbox(
 
 @requires_dependencies("paves")
 def open_pdfminer_pages_generator(
-    fp: Optional[BinaryIO] = None, filename: Optional[str] = None,
-    password: Optional[str] = None, pdfminer_config: Optional[PDFMinerConfig] = None
+    fp: Optional[BinaryIO] = None,
+    filename: Optional[str] = None,
+    password: Optional[str] = None,
+    pdfminer_config: Optional[PDFMinerConfig] = None,
 ):
     """Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs."""
     laparams_kwargs = pdfminer_config.model_dump(exclude_none=True) if pdfminer_config else {}
@@ -77,10 +79,11 @@ def open_pdfminer_pages_generator(
         from functools import partial
 
         assert filename
-        with playa.open(filename, space="page", password=password,
-                        max_workers=max(1, os.cpu_count() // 2)) as doc:
-            yield from zip(doc.pages,
-                           doc.pages.map(partial(extract_page, laparams=laparams)))
+        # use half the available CPUs for parallel extraction, but at least one
+        with playa.open(
+            filename, space="page", password=password, max_workers=max(1, os.cpu_count() // 2)
+        ) as doc:
+            yield from zip(doc.pages, doc.pages.map(partial(extract_page, laparams=laparams)))
     else:
         doc = playa.Document(fp, space="page", password=password)
         for page in doc.pages:

From a5f00e532dc956e4a837d76326ba5f9147698464 Mon Sep 17 00:00:00 2001
From: David Huggins-Daines
Date: Sun, 20 Jul 2025 09:51:27 -0400
Subject: [PATCH 04/12] fix(tests): repair no longer necessary

---
 test_unstructured/partition/pdf_image/test_pdf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
index 919ac89619..a50471de75 100644
--- a/test_unstructured/partition/pdf_image/test_pdf.py
+++ b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -1224,14
+1224,14 @@ def test_partition_pdf_with_fast_finds_headers_footers(): @pytest.mark.parametrize( ("filename", "expected_log"), [ - # This one is *actually* an invalid PDF document + # This one is *actually* an invalid PDF document, but we no longer need to repair it ("invalid-pdf-structure-pdfminer-entire-doc.pdf", "Repairing the PDF document ..."), ], ) def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_log, caplog): caplog.set_level(logging.INFO) assert pdf.extractable_elements(filename=example_doc_path(f"pdf/{filename}")) - assert expected_log in caplog.text + assert expected_log not in caplog.text @pytest.mark.parametrize( From 8ec45e02150d7c86413755a5ba215d1b298d307f Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Mon, 21 Jul 2025 09:56:03 -0400 Subject: [PATCH 05/12] fix: avoid importing pypdf just to count pages! --- unstructured/partition/pdf.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index a57a6eba27..23f85b583c 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -13,7 +13,7 @@ from paves.miner import LTContainer, LTImage, LTItem, LTTextBox, resolve1 from pi_heif import register_heif_opener from PIL import Image as PILImage -from pypdf import PdfReader +import playa from unstructured_inference.inference.layout import DocumentLayout from unstructured_inference.inference.layoutelement import LayoutElement @@ -539,10 +539,11 @@ def _get_pdf_page_number( file: Optional[bytes | IO[bytes]] = None, ) -> int: if file: - number_of_pages = PdfReader(file).get_num_pages() + number_of_pages = len(playa.Document(file).pages) file.seek(0) elif filename: - number_of_pages = PdfReader(filename).get_num_pages() + with playa.open(filename) as pdf: + number_of_pages = len(pdf.pages) else: raise ValueError("Either 'file' or 'filename' must be provided.") return number_of_pages From a489d295a17ea5b90f85c965c53be1af1a8ab385 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Mon, 21 Jul 2025 09:56:16 -0400 Subject: [PATCH 06/12] fix: playa needs "" as default password not None --- unstructured/partition/pdf_image/pdfminer_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py index 7c270cb693..eaee1a5baa 100644 --- a/unstructured/partition/pdf_image/pdfminer_utils.py +++ b/unstructured/partition/pdf_image/pdfminer_utils.py @@ -74,6 +74,8 @@ def open_pdfminer_pages_generator( """Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs.""" laparams_kwargs = pdfminer_config.model_dump(exclude_none=True) if pdfminer_config else {} laparams = LAParams(**laparams_kwargs) + if password is None: + password = "" # playa's default if fp is None: from functools import partial From 318a954a0cd63172f04c5248014c6c8be9143a94 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Mon, 21 Jul 2025 11:06:27 -0400 Subject: [PATCH 07/12] fix: require playa-pdf 0.6.2 for colormap issue --- requirements/extra-pdf-image.in | 1 + requirements/extra-pdf-image.txt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index df4d00ee17..b4a11ab4a0 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -5,6 +5,7 @@ onnx>=1.17.0 onnxruntime>=1.19.0 pdf2image paves +playa-pdf>=0.6.2 pi_heif pypdf google-cloud-vision diff 
--git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 3fd0178b6f..6518deb748 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -150,7 +150,7 @@ pillow==11.3.0 # pi-heif # torchvision # unstructured-pytesseract -playa-pdf==0.6.1 +playa-pdf==0.6.2 # via paves proto-plus==1.26.1 # via From 2f87d893c52f607b0b12f6b42e4f437c454f422f Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Mon, 21 Jul 2025 11:21:45 -0400 Subject: [PATCH 08/12] fix: isort --- unstructured/partition/pdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 23f85b583c..6849b435af 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -10,10 +10,10 @@ from typing import IO, TYPE_CHECKING, Any, Optional, cast import numpy as np +import playa from paves.miner import LTContainer, LTImage, LTItem, LTTextBox, resolve1 from pi_heif import register_heif_opener from PIL import Image as PILImage -import playa from unstructured_inference.inference.layout import DocumentLayout from unstructured_inference.inference.layoutelement import LayoutElement From e79845f1d2ab96cedc27db7a87f8e4a890278828 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Mon, 21 Jul 2025 11:44:24 -0400 Subject: [PATCH 09/12] fix(tests): playa/paves do not output (cid:N) droppings --- .../multi-column-2p.pdf.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json index 829b9b7a7e..c6e503a945 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json @@ -111,8 +111,8 @@ }, { "type": "CompositeElement", - "element_id": "e6dee1abec28f8ff365ab6275b3e5f0e", - "text": "2 Background\n\nThe problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as “Who first voiced Meg on Family Guy?” or “Where was the 8th Dalai Lama born?”, a system is required to answer it using a large corpus of diversified topics. More specifically, we assume\n\nthe extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). 
As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k (cid:28) |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question.\n\ne", + "element_id": "c2959a06eb5a6864c4f0c7d38e21b2e9", + "text": "2 Background\n\nThe problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as “Who first voiced Meg on Family Guy?” or “Where was the 8th Dalai Lama born?”, a system is required to answer it using a large corpus of diversified topics. More specifically, we assume\n\nthe extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k |C|. 
For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question.\n\ne", "metadata": { "data_source": { "record_locator": { @@ -153,4 +153,4 @@ "page_number": 2 } } -] \ No newline at end of file +] From afb1288250c3f19b02ff830db3b8b1315205225a Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Mon, 21 Jul 2025 11:53:24 -0400 Subject: [PATCH 10/12] fix(tests): update indices since (cid:N) no longer occurs --- test_unstructured/partition/test_msg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_unstructured/partition/test_msg.py b/test_unstructured/partition/test_msg.py index 94b12d5578..d2c0a1ce5f 100644 --- a/test_unstructured/partition/test_msg.py +++ b/test_unstructured/partition/test_msg.py @@ -125,8 +125,8 @@ def test_partition_msg_can_process_attachments(): assert all(e.metadata.filename == "fake-email-multiple-attachments.msg" for e in elements[:5]) assert all(e.metadata.filename == "unstructured_logo.png" for e in elements[5:7]) - assert all(e.metadata.filename == "dense_doc.pdf" for e in elements[7:343]) - assert all(e.metadata.filename == "Engineering Onboarding.pptx" for e in elements[343:]) + assert all(e.metadata.filename == "dense_doc.pdf" for e in elements[7:341]) + assert all(e.metadata.filename == "Engineering Onboarding.pptx" for e in elements[341:]) assert [e.text for e in elements[:5]] == [ "Here are those documents.", "--", From ea36f1075837c0cbe3fb35b6109cbc6f867613db Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Mon, 21 Jul 2025 12:02:55 -0400 Subject: [PATCH 11/12] fix(tests): update markdown and html fixtures --- .../multi-column-2p.pdf.html | 4 ++-- .../multi-column-2p.pdf.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test_unstructured_ingest/expected-structured-output-html/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.html b/test_unstructured_ingest/expected-structured-output-html/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.html index c780303b30..81f092abee 100644 --- a/test_unstructured_ingest/expected-structured-output-html/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.html +++ b/test_unstructured_ingest/expected-structured-output-html/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.html @@ -58,12 +58,12 @@
In this paper, we address the question: can we train a better dense embedding model using only pairs of questions and passages (or answers), with- out additional pretraining? By leveraging the now standard BERT pretrained model (Devlin et al., 2019) and a dual-encoder architecture (Bromley et al., 1994), we focus on developing the right training scheme using a relatively small number of question and passage pairs. Through a series of careful ablation studies, our final solution is surprisingly simple: the embedding is optimized for maximizing inner products of the question and relevant passage vectors, with an objective compar- ing all pairs of questions and passages in a batch. Our Dense Passage Retriever (DPR) is exception- ally strong. It not only outperforms BM25 by a large margin (65.2% vs. 42.9% in Top-5 accuracy), but also results in a substantial improvement on the end-to-end QA accuracy compared to ORQA (41.5% vs. 33.3%) in the open Natural Questions setting (Lee et al., 2019; Kwiatkowski et al., 2019). Our contributions are twofold. First, we demon- strate that with the proper training setup, sim- ply fine-tuning the question and passage encoders on existing question-passage pairs is sufficient to greatly outperform BM25. Our empirical results also suggest that additional pretraining may not be needed. Second, we verify that, in the context of open-domain question answering, a higher retrieval precision indeed translates to a higher end-to-end QA accuracy. By applying a modern reader model to the top retrieved passages, we achieve compara- ble or better results on multiple QA datasets in the open-retrieval setting, compared to several, much complicated systems.
-
+
2 Background The problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as “Who first voiced Meg on Family Guy?” or “Where was the 8th Dalai Lama born?”, a system is required to answer it using a large corpus of diversified topics. More specifically, we assume -the extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k (cid:28) |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question. +the extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question. e
diff --git a/test_unstructured_ingest/expected-structured-output-markdown/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.md b/test_unstructured_ingest/expected-structured-output-markdown/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.md index 8e74e6d944..c64ab3495e 100644 --- a/test_unstructured_ingest/expected-structured-output-markdown/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.md +++ b/test_unstructured_ingest/expected-structured-output-markdown/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.md @@ -43,7 +43,7 @@ In this paper, we address the question: can we train a better dense embedding mo The problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as “Who first voiced Meg on Family Guy?” or “Where was the 8th Dalai Lama born?”, a system is required to answer it using a large corpus of diversified topics. More specifically, we assume -the extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k (cid:28) |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question. +the extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k |C|. 
For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question. e 3 Dense Passage Retriever (DPR) From e99973482b6d202d0ec617cd05b6850e05eb949e Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Mon, 21 Jul 2025 12:08:07 -0400 Subject: [PATCH 12/12] fix(tests): fix missing or not missing newline for silly diff --- .../multi-column-2p.pdf.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json index c6e503a945..40c36de858 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json @@ -153,4 +153,4 @@ "page_number": 2 } } -] +] \ No newline at end of file