From a1b94ccd989c06397bc471c8b246fa0de15e0823 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Fri, 18 Jul 2025 17:29:53 -0400 Subject: [PATCH 01/12] feat: switch from pdfminer to paves This allows us to also remove PDF repair and monkey patching. --- CHANGELOG.md | 1 + requirements/extra-pdf-image.in | 3 +- requirements/extra-pdf-image.txt | 114 ++++++++---------- .../pdf_image/test_pdfminer_processing.py | 2 +- .../pdf_image/test_pdfminer_utils.py | 2 +- unstructured/partition/pdf.py | 71 +++-------- .../pdf_image/pdfminer_processing.py | 23 ++-- .../partition/pdf_image/pdfminer_utils.py | 79 +++--------- unstructured/patches/pdfminer.py | 76 ------------ 9 files changed, 104 insertions(+), 267 deletions(-) delete mode 100644 unstructured/patches/pdfminer.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 7d2d8950c4..416911c440 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## 0.18.11-dev0 ### Enhancements +- **Switch from pdfminer.six to PAVÉS** Increases robustness of PDF extraction and uses multiple CPUs when possible. No more need to patch pdfminer or repair pdfs with pikepdf. ### Features diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index b0caffbb95..df4d00ee17 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -4,8 +4,7 @@ onnx>=1.17.0 onnxruntime>=1.19.0 pdf2image -pdfminer.six -pikepdf +paves pi_heif pypdf google-cloud-vision diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 4d2ef23532..ee12284790 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -1,10 +1,6 @@ -# -# This file is autogenerated by pip-compile with Python 3.10 -# by the following command: -# -# pip-compile ./extra-pdf-image.in -# -accelerate==1.8.1 +# This file was autogenerated by uv via the following command: +# uv pip compile --no-strip-extras extra-pdf-image.in +accelerate==1.9.0 # via unstructured-inference antlr4-python3-runtime==4.9.3 # via omegaconf @@ -12,15 +8,15 @@ cachetools==5.5.2 # via google-auth certifi==2025.7.9 # via - # -c requirements/base.txt + # -c base.txt # requests cffi==1.17.1 # via - # -c requirements/base.txt + # -c base.txt # cryptography charset-normalizer==3.4.2 # via - # -c requirements/base.txt + # -c base.txt # pdfminer-six # requests coloredlogs==15.0.1 @@ -29,14 +25,12 @@ contourpy==1.3.2 # via matplotlib cryptography==45.0.5 # via - # -c requirements/base.txt + # -c base.txt # pdfminer-six cycler==0.12.1 # via matplotlib -deprecated==1.2.18 - # via pikepdf effdet==0.4.1 - # via -r ./extra-pdf-image.in + # via -r extra-pdf-image.in filelock==3.18.0 # via # huggingface-hub @@ -44,9 +38,9 @@ filelock==3.18.0 # transformers flatbuffers==25.2.10 # via onnxruntime -fonttools==4.58.5 +fonttools==4.59.0 # via matplotlib -fsspec==2025.5.1 +fsspec==2025.7.0 # via # huggingface-hub # torch @@ -57,21 +51,20 @@ google-auth==2.40.3 # google-api-core # google-cloud-vision google-cloud-vision==3.10.2 - # via -r ./extra-pdf-image.in + # via -r extra-pdf-image.in googleapis-common-protos==1.70.0 # via # google-api-core # grpcio-status grpcio==1.73.1 # via - # -c requirements/deps/constraints.txt # google-api-core # grpcio-status grpcio-status==1.73.1 # via google-api-core hf-xet==1.1.5 # via huggingface-hub -huggingface-hub==0.33.2 +huggingface-hub==0.33.4 # via # accelerate # timm @@ -82,27 +75,23 @@ humanfriendly==10.0 # via coloredlogs idna==3.10 # via - # -c requirements/base.txt + # -c base.txt # requests jinja2==3.1.6 # via 
torch kiwisolver==1.4.8 # via matplotlib -lxml==6.0.0 - # via - # -c requirements/base.txt - # pikepdf markupsafe==3.0.2 # via jinja2 matplotlib==3.10.3 # via unstructured-inference mpmath==1.3.0 # via sympy -networkx==3.4.2 +networkx==3.5 # via torch numpy==2.2.6 # via - # -c requirements/base.txt + # -c base.txt # accelerate # contourpy # matplotlib @@ -119,52 +108,52 @@ omegaconf==2.3.0 # via effdet onnx==1.18.0 # via - # -r ./extra-pdf-image.in + # -r extra-pdf-image.in # unstructured-inference -onnxruntime==1.22.0 +onnxruntime==1.22.1 # via - # -r ./extra-pdf-image.in + # -r extra-pdf-image.in # unstructured-inference opencv-python==4.12.0.88 # via unstructured-inference packaging==25.0 # via - # -c requirements/base.txt + # -c base.txt # accelerate # huggingface-hub # matplotlib # onnxruntime - # pikepdf # transformers # unstructured-pytesseract pandas==2.3.1 # via unstructured-inference +paves==0.6.1 + # via -r extra-pdf-image.in pdf2image==1.17.0 - # via -r ./extra-pdf-image.in + # via -r extra-pdf-image.in pdfminer-six==20250327 # via - # -c requirements/deps/constraints.txt - # -r ./extra-pdf-image.in + # -c ./deps/constraints.txt # unstructured-inference pi-heif==1.0.0 - # via -r ./extra-pdf-image.in -pikepdf==9.9.0 - # via -r ./extra-pdf-image.in + # via -r extra-pdf-image.in pillow==11.3.0 # via # matplotlib + # paves # pdf2image # pi-heif - # pikepdf # torchvision # unstructured-pytesseract +playa-pdf==0.6.1 + # via paves proto-plus==1.26.1 # via # google-api-core # google-cloud-vision protobuf==6.31.1 # via - # -c requirements/deps/constraints.txt + # -c ./deps/constraints.txt # google-api-core # google-cloud-vision # googleapis-common-protos @@ -174,7 +163,7 @@ protobuf==6.31.1 # proto-plus psutil==7.0.0 # via - # -c requirements/base.txt + # -c base.txt # accelerate pyasn1==0.6.1 # via @@ -186,19 +175,19 @@ pycocotools==2.0.10 # via effdet pycparser==2.22 # via - # -c requirements/base.txt + # -c base.txt # cffi pyparsing==3.2.3 # via matplotlib pypdf==5.7.0 # via - # -c requirements/base.txt - # -r ./extra-pdf-image.in + # -c base.txt + # -r extra-pdf-image.in pypdfium2==4.30.1 # via unstructured-inference python-dateutil==2.9.0.post0 # via - # -c requirements/base.txt + # -c base.txt # matplotlib # pandas python-multipart==0.0.20 @@ -214,15 +203,15 @@ pyyaml==6.0.2 # transformers rapidfuzz==3.13.0 # via - # -c requirements/base.txt + # -c base.txt # unstructured-inference regex==2024.11.6 # via - # -c requirements/base.txt + # -c base.txt # transformers requests==2.32.4 # via - # -c requirements/base.txt + # -c base.txt # google-api-core # huggingface-hub # transformers @@ -233,23 +222,25 @@ safetensors==0.5.3 # accelerate # timm # transformers -scipy==1.15.3 +scipy==1.16.0 # via unstructured-inference +setuptools==80.9.0 + # via triton six==1.17.0 # via - # -c requirements/base.txt + # -c base.txt # python-dateutil sympy==1.14.0 # via # onnxruntime # torch -timm==1.0.16 +timm==1.0.17 # via # effdet # unstructured-inference tokenizers==0.21.2 # via - # -c requirements/deps/constraints.txt + # -c ./deps/constraints.txt # transformers torch==2.7.1 # via @@ -264,30 +255,27 @@ torchvision==0.22.1 # timm tqdm==4.67.1 # via - # -c requirements/base.txt + # -c base.txt # huggingface-hub # transformers -transformers==4.53.1 +transformers==4.53.2 # via unstructured-inference +triton==3.3.1 + # via torch typing-extensions==4.14.1 # via - # -c requirements/base.txt + # -c base.txt # huggingface-hub # onnx - # pypdf # torch tzdata==2025.2 # via pandas unstructured-inference==1.0.5 - 
# via -r ./extra-pdf-image.in + # via -r extra-pdf-image.in unstructured-pytesseract==0.3.15 - # via -r ./extra-pdf-image.in + # via -r extra-pdf-image.in urllib3==2.5.0 # via - # -c requirements/base.txt - # -c requirements/deps/constraints.txt + # -c ./deps/constraints.txt + # -c base.txt # requests -wrapt==1.17.2 - # via - # -c requirements/base.txt - # deprecated diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py index 309ea1336f..3f4a7c4f07 100644 --- a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py +++ b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py @@ -2,7 +2,7 @@ import numpy as np import pytest -from pdfminer.layout import LAParams +from paves.miner import LAParams from PIL import Image from unstructured_inference.constants import Source as InferenceSource from unstructured_inference.inference.elements import ( diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_utils.py b/test_unstructured/partition/pdf_image/test_pdfminer_utils.py index 075a4e151e..2effe7eb75 100644 --- a/test_unstructured/partition/pdf_image/test_pdfminer_utils.py +++ b/test_unstructured/partition/pdf_image/test_pdfminer_utils.py @@ -1,6 +1,6 @@ from unittest.mock import MagicMock -from pdfminer.layout import LTContainer, LTTextLine +from paves.miner import LTContainer, LTTextLine from unstructured.partition.pdf_image.pdfminer_utils import extract_text_objects diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 0efe69ed03..8860c4c881 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -10,9 +10,7 @@ from typing import IO, TYPE_CHECKING, Any, Optional, cast import numpy as np -import wrapt -from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox -from pdfminer.utils import open_filename +from paves.miner import LTContainer, LTImage, LTItem, LTTextBox, resolve1 from pi_heif import register_heif_opener from PIL import Image as PILImage from pypdf import PdfReader @@ -93,19 +91,12 @@ PartitionStrategy, ) from unstructured.partition.utils.sorting import coord_has_valid_points, sort_page_elements -from unstructured.patches.pdfminer import patch_psparser from unstructured.utils import first, requires_dependencies if TYPE_CHECKING: pass -# Correct a bug that was introduced by a previous patch to -# pdfminer.six, causing needless and unsuccessful repairing of PDFs -# which were not actually broken. 
-patch_psparser() - - RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL) @@ -439,38 +430,23 @@ def _partition_pdf_with_pdfminer( """ exactly_one(filename=filename, file=file) - if filename: - with open_filename(filename, "rb") as fp: - fp = cast(IO[bytes], fp) - elements = _process_pdfminer_pages( - fp=fp, - filename=filename, - languages=languages, - metadata_last_modified=metadata_last_modified, - starting_page_number=starting_page_number, - password=password, - pdfminer_config=pdfminer_config, - **kwargs, - ) - - elif file: - elements = _process_pdfminer_pages( - fp=file, - filename=filename, - languages=languages, - metadata_last_modified=metadata_last_modified, - starting_page_number=starting_page_number, - password=password, - pdfminer_config=pdfminer_config, - **kwargs, - ) + elements = _process_pdfminer_pages( + fp=file, + filename=filename, + languages=languages, + metadata_last_modified=metadata_last_modified, + starting_page_number=starting_page_number, + password=password, + pdfminer_config=pdfminer_config, + **kwargs, + ) return elements -@requires_dependencies("pdfminer") +@requires_dependencies("paves") def _process_pdfminer_pages( - fp: IO[bytes], + fp: Optional[IO[bytes]], filename: str, metadata_last_modified: Optional[str], languages: Optional[list[str]] = None, @@ -485,7 +461,8 @@ def _process_pdfminer_pages( elements = [] for page_number, (page, page_layout) in enumerate( - open_pdfminer_pages_generator(fp, password=password, pdfminer_config=pdfminer_config), + open_pdfminer_pages_generator(fp, filename, password=password, + pdfminer_config=pdfminer_config), start=starting_page_number, ): width, height = page_layout.width, page_layout.height @@ -497,8 +474,9 @@ def _process_pdfminer_pages( width=width, height=height, ) - if page.annots: - annotation_list = get_uris(page.annots, height, coordinate_system, page_number) + annots = resolve1(page.attrs.get("Annots")) + if annots: + annotation_list = get_uris(annots, height, coordinate_system, page_number) for obj in page_layout: x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height) @@ -1030,19 +1008,6 @@ def _extract_text(item: LTItem) -> str: return "\n" -# Some pages with a ICC color space do not follow the pdf spec -# They throw an error when we call interpreter.process_page -# Since we don't need color info, we can just drop it in the pdfminer code -# See #2059 -@wrapt.patch_function_wrapper("pdfminer.pdfinterp", "PDFPageInterpreter.init_resources") -def pdfminer_interpreter_init_resources(wrapped, instance, args, kwargs): - resources = args[0] - if "ColorSpace" in resources: - del resources["ColorSpace"] - - return wrapped(resources) - - def _combine_list_elements( elements: list[Element], coordinate_system: PixelSpace | PointSpace ) -> list[Element]: diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index 8941d5022b..ceedbccafd 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -4,9 +4,7 @@ from typing import TYPE_CHECKING, Any, BinaryIO, Iterable, List, Optional, Union, cast import numpy as np -from pdfminer.layout import LTChar, LTTextBox -from pdfminer.pdftypes import PDFObjRef -from pdfminer.utils import open_filename +from paves.miner import LTChar, LTTextBox, PDFObjRef, resolve1 from unstructured_inference.config import inference_config from unstructured_inference.constants import FULL_PAGE_REGION_THRESHOLD from 
unstructured_inference.inference.elements import Rectangle @@ -43,12 +41,11 @@ def process_file_with_pdfminer( password: Optional[str] = None, pdfminer_config: Optional[PDFMinerConfig] = None, ) -> tuple[List[List["TextRegion"]], List[List]]: - with open_filename(filename, "rb") as fp: - fp = cast(BinaryIO, fp) - extracted_layout, layouts_links = process_data_with_pdfminer( - file=fp, dpi=dpi, password=password, pdfminer_config=pdfminer_config - ) - return extracted_layout, layouts_links + + extracted_layout, layouts_links = process_data_with_pdfminer( + file=None, filename=filename, dpi=dpi, password=password, pdfminer_config=pdfminer_config + ) + return extracted_layout, layouts_links def _validate_bbox(bbox: list[int | float]) -> bool: @@ -434,6 +431,7 @@ def process_page_layout_from_pdfminer( @requires_dependencies("unstructured_inference") def process_data_with_pdfminer( file: Optional[Union[bytes, BinaryIO]] = None, + filename: Optional[str] = None, dpi: int = 200, password: Optional[str] = None, pdfminer_config: Optional[PDFMinerConfig] = None, @@ -448,7 +446,7 @@ def process_data_with_pdfminer( # Coefficient to rescale bounding box to be compatible with images coef = dpi / 72 for page_number, (page, page_layout) in enumerate( - open_pdfminer_pages_generator(file, password=password, pdfminer_config=pdfminer_config) + open_pdfminer_pages_generator(file, filename, password=password, pdfminer_config=pdfminer_config) ): width, height = page_layout.width, page_layout.height @@ -457,8 +455,9 @@ def process_data_with_pdfminer( width=width, height=height, ) - if page.annots: - annotation_list = get_uris(page.annots, height, coordinate_system, page_number) + annots = resolve1(page.attrs.get("Annots")) + if annots: + annotation_list = get_uris(annots, height, coordinate_system, page_number) layout, urls_metadata = process_page_layout_from_pdfminer( annotation_list, page_layout, height, page_number, coef diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py index 3993f41ae0..8e0bbe7d7d 100644 --- a/unstructured/partition/pdf_image/pdfminer_utils.py +++ b/unstructured/partition/pdf_image/pdfminer_utils.py @@ -1,15 +1,10 @@ import os -import tempfile from typing import BinaryIO, List, Optional, Tuple -from pdfminer.converter import PDFPageAggregator -from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem, LTTextLine -from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager -from pdfminer.pdfpage import PDFPage -from pdfminer.psexceptions import PSSyntaxError +import playa +from paves.miner import LAParams, LTContainer, LTImage, LTItem, LTTextLine, extract_page from pydantic import BaseModel -from unstructured.logger import logger from unstructured.utils import requires_dependencies @@ -20,18 +15,6 @@ class PDFMinerConfig(BaseModel): char_margin: Optional[float] = None -def init_pdfminer(pdfminer_config: Optional[PDFMinerConfig] = None): - rsrcmgr = PDFResourceManager() - - laparams_kwargs = pdfminer_config.model_dump(exclude_none=True) if pdfminer_config else {} - laparams = LAParams(**laparams_kwargs) - - device = PDFPageAggregator(rsrcmgr, laparams=laparams) - interpreter = PDFPageInterpreter(rsrcmgr, device) - - return device, interpreter - - def extract_image_objects(parent_object: LTItem) -> List[LTImage]: """Recursively extracts image objects from a given parent object in a PDF document.""" objects = [] @@ -81,47 +64,25 @@ def rect_to_bbox( return (x1, y1, x2, y2) -@requires_dependencies(["pikepdf", 
"pypdf"]) +@requires_dependencies("paves") def open_pdfminer_pages_generator( - fp: BinaryIO, password: Optional[str] = None, pdfminer_config: Optional[PDFMinerConfig] = None + fp: Optional[BinaryIO] = None, filename: Optional[str] = None, + password: Optional[str] = None, pdfminer_config: Optional[PDFMinerConfig] = None ): """Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs.""" + laparams_kwargs = pdfminer_config.model_dump(exclude_none=True) if pdfminer_config else {} + laparams = LAParams(**laparams_kwargs) - import pikepdf - - from unstructured.partition.pdf_image.pypdf_utils import get_page_data - - device, interpreter = init_pdfminer(pdfminer_config=pdfminer_config) - with tempfile.TemporaryDirectory() as tmp_dir_path: - tmp_file_path = os.path.join(tmp_dir_path, "tmp_file") - try: - pages = PDFPage.get_pages(fp, password=password or "") - # Detect invalid dictionary construct for entire PDF - for i, page in enumerate(pages): - try: - # Detect invalid dictionary construct for one page - interpreter.process_page(page) - page_layout = device.get_result() - except PSSyntaxError: - logger.info("Detected invalid dictionary construct for PDFminer") - logger.info(f"Repairing the PDF page {i + 1} ...") - # find the error page from binary data fp - error_page_data = get_page_data(fp, page_number=i) - # repair the error page with pikepdf - with pikepdf.Pdf.open(error_page_data) as pdf: - pdf.save(tmp_file_path) - page = next(PDFPage.get_pages(open(tmp_file_path, "rb"))) # noqa: SIM115 - interpreter.process_page(page) - page_layout = device.get_result() - yield page, page_layout - except PSSyntaxError: - logger.info("Detected invalid dictionary construct for PDFminer") - logger.info("Repairing the PDF document ...") - # repair the entire doc with pikepdf - with pikepdf.Pdf.open(fp) as pdf: - pdf.save(tmp_file_path) - pages = PDFPage.get_pages(open(tmp_file_path, "rb")) # noqa: SIM115 - for page in pages: - interpreter.process_page(page) - page_layout = device.get_result() - yield page, page_layout + if fp is None: + from functools import partial + + assert filename + with playa.open(filename, space="page", password=password, + max_workers=min(1, os.cpu_count() // 2)) as doc: + yield from zip(doc.pages, + doc.pages.map(partial(extract_page, laparams=laparams))) + else: + doc = playa.Document(fp, space="page", password=password) + for page in doc.pages: + page_layout = extract_page(page, laparams) + yield page, page_layout diff --git a/unstructured/patches/pdfminer.py b/unstructured/patches/pdfminer.py deleted file mode 100644 index cc0c7dab21..0000000000 --- a/unstructured/patches/pdfminer.py +++ /dev/null @@ -1,76 +0,0 @@ -import functools -from typing import Tuple, Union - -import pdfminer -from pdfminer.psparser import ( - END_KEYWORD, - KWD, - PSEOF, - PSBaseParser, - PSBaseParserToken, - PSKeyword, - log, -) - -factory_seek = PSBaseParser.seek - - -@functools.wraps(PSBaseParser.seek) -def seek(self: PSBaseParser, pos: int) -> None: - factory_seek(self, pos) - self.eof = False - - -@functools.wraps(PSBaseParser._parse_keyword) -def _parse_keyword(self, s: bytes, i: int) -> int: - m = END_KEYWORD.search(s, i) - if m: - j = m.start(0) - self._curtoken += s[i:j] - else: - self._curtoken += s[i:] - return len(s) - if self._curtoken == b"true": - token: Union[bool, PSKeyword] = True - elif self._curtoken == b"false": - token = False - else: - token = KWD(self._curtoken) - self._add_token(token) - self._parse1 = self._parse_main - return j - - 
-@functools.wraps(PSBaseParser.nexttoken) -def nexttoken(self) -> Tuple[int, PSBaseParserToken]: - if self.eof: - # It's not really unexpected, come on now... - raise PSEOF("Unexpected EOF") - while not self._tokens: - try: - self.fillbuf() - self.charpos = self._parse1(self.buf, self.charpos) - except PSEOF: - # If we hit EOF in the middle of a token, try to parse - # it by tacking on whitespace, and delay raising PSEOF - # until next time around - self.charpos = self._parse1(b"\n", 0) - self.eof = True - # Oh, so there wasn't actually a token there? OK. - if not self._tokens: - raise - token = self._tokens.pop(0) - log.debug("nexttoken: %r", token) - return token - - -def patch_psparser(): - """Monkey-patch certain versions of pdfminer.six to avoid dropping - tokens at EOF (before 20231228) and splitting tokens at buffer - boundaries (20231228 and 20240706). - """ - # Presuming the bug will be fixed in the next release - if pdfminer.__version__ <= "20240706": - PSBaseParser.seek = seek - PSBaseParser._parse_keyword = _parse_keyword - PSBaseParser.nexttoken = nexttoken From ac2b2e78f6edf3c7e5e3343042814fa961d9ad43 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Sat, 19 Jul 2025 17:22:56 -0400 Subject: [PATCH 02/12] fix: manually hack deps since who knows how they get generated --- requirements/extra-pdf-image.txt | 87 ++++++++++++++++---------------- 1 file changed, 44 insertions(+), 43 deletions(-) diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index ee12284790..3fd0178b6f 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -1,6 +1,10 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile --no-strip-extras extra-pdf-image.in -accelerate==1.9.0 +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile ./extra-pdf-image.in +# +accelerate==1.8.1 # via unstructured-inference antlr4-python3-runtime==4.9.3 # via omegaconf @@ -8,15 +12,15 @@ cachetools==5.5.2 # via google-auth certifi==2025.7.9 # via - # -c base.txt + # -c requirements/base.txt # requests cffi==1.17.1 # via - # -c base.txt + # -c requirements/base.txt # cryptography charset-normalizer==3.4.2 # via - # -c base.txt + # -c requirements/base.txt # pdfminer-six # requests coloredlogs==15.0.1 @@ -25,12 +29,12 @@ contourpy==1.3.2 # via matplotlib cryptography==45.0.5 # via - # -c base.txt + # -c requirements/base.txt # pdfminer-six cycler==0.12.1 # via matplotlib effdet==0.4.1 - # via -r extra-pdf-image.in + # via -r ./extra-pdf-image.in filelock==3.18.0 # via # huggingface-hub @@ -38,9 +42,9 @@ filelock==3.18.0 # transformers flatbuffers==25.2.10 # via onnxruntime -fonttools==4.59.0 +fonttools==4.58.5 # via matplotlib -fsspec==2025.7.0 +fsspec==2025.5.1 # via # huggingface-hub # torch @@ -51,20 +55,21 @@ google-auth==2.40.3 # google-api-core # google-cloud-vision google-cloud-vision==3.10.2 - # via -r extra-pdf-image.in + # via -r ./extra-pdf-image.in googleapis-common-protos==1.70.0 # via # google-api-core # grpcio-status grpcio==1.73.1 # via + # -c requirements/deps/constraints.txt # google-api-core # grpcio-status grpcio-status==1.73.1 # via google-api-core hf-xet==1.1.5 # via huggingface-hub -huggingface-hub==0.33.4 +huggingface-hub==0.33.2 # via # accelerate # timm @@ -75,7 +80,7 @@ humanfriendly==10.0 # via coloredlogs idna==3.10 # via - # -c base.txt + # -c requirements/base.txt # requests jinja2==3.1.6 # via torch @@ -87,11 +92,11 @@ matplotlib==3.10.3 # via 
unstructured-inference mpmath==1.3.0 # via sympy -networkx==3.5 +networkx==3.4.2 # via torch numpy==2.2.6 # via - # -c base.txt + # -c requirements/base.txt # accelerate # contourpy # matplotlib @@ -108,17 +113,17 @@ omegaconf==2.3.0 # via effdet onnx==1.18.0 # via - # -r extra-pdf-image.in + # -r ./extra-pdf-image.in # unstructured-inference -onnxruntime==1.22.1 +onnxruntime==1.22.0 # via - # -r extra-pdf-image.in + # -r ./extra-pdf-image.in # unstructured-inference opencv-python==4.12.0.88 # via unstructured-inference packaging==25.0 # via - # -c base.txt + # -c requirements/base.txt # accelerate # huggingface-hub # matplotlib @@ -153,7 +158,7 @@ proto-plus==1.26.1 # google-cloud-vision protobuf==6.31.1 # via - # -c ./deps/constraints.txt + # -c requirements/deps/constraints.txt # google-api-core # google-cloud-vision # googleapis-common-protos @@ -163,7 +168,7 @@ protobuf==6.31.1 # proto-plus psutil==7.0.0 # via - # -c base.txt + # -c requirements/base.txt # accelerate pyasn1==0.6.1 # via @@ -175,19 +180,19 @@ pycocotools==2.0.10 # via effdet pycparser==2.22 # via - # -c base.txt + # -c requirements/base.txt # cffi pyparsing==3.2.3 # via matplotlib pypdf==5.7.0 # via - # -c base.txt - # -r extra-pdf-image.in + # -c requirements/base.txt + # -r ./extra-pdf-image.in pypdfium2==4.30.1 # via unstructured-inference python-dateutil==2.9.0.post0 # via - # -c base.txt + # -c requirements/base.txt # matplotlib # pandas python-multipart==0.0.20 @@ -203,15 +208,15 @@ pyyaml==6.0.2 # transformers rapidfuzz==3.13.0 # via - # -c base.txt + # -c requirements/base.txt # unstructured-inference regex==2024.11.6 # via - # -c base.txt + # -c requirements/base.txt # transformers requests==2.32.4 # via - # -c base.txt + # -c requirements/base.txt # google-api-core # huggingface-hub # transformers @@ -222,25 +227,23 @@ safetensors==0.5.3 # accelerate # timm # transformers -scipy==1.16.0 +scipy==1.15.3 # via unstructured-inference -setuptools==80.9.0 - # via triton six==1.17.0 # via - # -c base.txt + # -c requirements/base.txt # python-dateutil sympy==1.14.0 # via # onnxruntime # torch -timm==1.0.17 +timm==1.0.16 # via # effdet # unstructured-inference tokenizers==0.21.2 # via - # -c ./deps/constraints.txt + # -c requirements/deps/constraints.txt # transformers torch==2.7.1 # via @@ -255,27 +258,25 @@ torchvision==0.22.1 # timm tqdm==4.67.1 # via - # -c base.txt + # -c requirements/base.txt # huggingface-hub # transformers -transformers==4.53.2 +transformers==4.53.1 # via unstructured-inference -triton==3.3.1 - # via torch typing-extensions==4.14.1 # via - # -c base.txt + # -c requirements/base.txt # huggingface-hub # onnx # torch tzdata==2025.2 # via pandas unstructured-inference==1.0.5 - # via -r extra-pdf-image.in + # via -r ./extra-pdf-image.in unstructured-pytesseract==0.3.15 - # via -r extra-pdf-image.in + # via -r ./extra-pdf-image.in urllib3==2.5.0 # via - # -c ./deps/constraints.txt - # -c base.txt + # -c requirements/base.txt + # -c requirements/deps/constraints.txt # requests From 6cd328da49e12671797919938efc03c603c0245a Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Sat, 19 Jul 2025 17:37:57 -0400 Subject: [PATCH 03/12] chore: black and ruff --- unstructured/partition/pdf.py | 5 +++-- .../partition/pdf_image/pdfminer_processing.py | 6 ++++-- unstructured/partition/pdf_image/pdfminer_utils.py | 14 ++++++++------ 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 8860c4c881..a57a6eba27 100644 --- 
a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@@ -461,8 +461,9 @@ def _process_pdfminer_pages(
 
     elements = []
     for page_number, (page, page_layout) in enumerate(
-        open_pdfminer_pages_generator(fp, filename, password=password,
-                                      pdfminer_config=pdfminer_config),
+        open_pdfminer_pages_generator(
+            fp, filename, password=password, pdfminer_config=pdfminer_config
+        ),
         start=starting_page_number,
     ):
         width, height = page_layout.width, page_layout.height
diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py
index ceedbccafd..7c56e1e83a 100644
--- a/unstructured/partition/pdf_image/pdfminer_processing.py
+++ b/unstructured/partition/pdf_image/pdfminer_processing.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import os
-from typing import TYPE_CHECKING, Any, BinaryIO, Iterable, List, Optional, Union, cast
+from typing import TYPE_CHECKING, Any, BinaryIO, Iterable, List, Optional, Union
 
 import numpy as np
 from paves.miner import LTChar, LTTextBox, PDFObjRef, resolve1
@@ -446,7 +446,9 @@ def process_data_with_pdfminer(
     # Coefficient to rescale bounding box to be compatible with images
     coef = dpi / 72
     for page_number, (page, page_layout) in enumerate(
-        open_pdfminer_pages_generator(file, filename, password=password, pdfminer_config=pdfminer_config)
+        open_pdfminer_pages_generator(
+            file, filename, password=password, pdfminer_config=pdfminer_config
+        )
     ):
         width, height = page_layout.width, page_layout.height
 
diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py
index 8e0bbe7d7d..7c270cb693 100644
--- a/unstructured/partition/pdf_image/pdfminer_utils.py
+++ b/unstructured/partition/pdf_image/pdfminer_utils.py
@@ -66,8 +66,10 @@ def rect_to_bbox(
 
 @requires_dependencies("paves")
 def open_pdfminer_pages_generator(
-    fp: Optional[BinaryIO] = None, filename: Optional[str] = None,
-    password: Optional[str] = None, pdfminer_config: Optional[PDFMinerConfig] = None
+    fp: Optional[BinaryIO] = None,
+    filename: Optional[str] = None,
+    password: Optional[str] = None,
+    pdfminer_config: Optional[PDFMinerConfig] = None,
 ):
     """Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs."""
     laparams_kwargs = pdfminer_config.model_dump(exclude_none=True) if pdfminer_config else {}
@@ -77,10 +79,11 @@ def open_pdfminer_pages_generator(
         from functools import partial
 
         assert filename
-        with playa.open(filename, space="page", password=password,
-                        max_workers=max(1, os.cpu_count() // 2)) as doc:
-            yield from zip(doc.pages,
-                           doc.pages.map(partial(extract_page, laparams=laparams)))
+        # use half the available CPUs for parallel extraction, but at least one
+        with playa.open(
+            filename, space="page", password=password, max_workers=max(1, os.cpu_count() // 2)
+        ) as doc:
+            yield from zip(doc.pages, doc.pages.map(partial(extract_page, laparams=laparams)))
     else:
         doc = playa.Document(fp, space="page", password=password)
         for page in doc.pages:

From a5f00e532dc956e4a837d76326ba5f9147698464 Mon Sep 17 00:00:00 2001
From: David Huggins-Daines
Date: Sun, 20 Jul 2025 09:51:27 -0400
Subject: [PATCH 04/12] fix(tests): repair no longer necessary

---
 test_unstructured/partition/pdf_image/test_pdf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
index 919ac89619..a50471de75 100644
--- a/test_unstructured/partition/pdf_image/test_pdf.py
+++ b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -1224,14
+1224,14 @@ def test_partition_pdf_with_fast_finds_headers_footers(): @pytest.mark.parametrize( ("filename", "expected_log"), [ - # This one is *actually* an invalid PDF document + # This one is *actually* an invalid PDF document, but we no longer need to repair it ("invalid-pdf-structure-pdfminer-entire-doc.pdf", "Repairing the PDF document ..."), ], ) def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_log, caplog): caplog.set_level(logging.INFO) assert pdf.extractable_elements(filename=example_doc_path(f"pdf/{filename}")) - assert expected_log in caplog.text + assert expected_log not in caplog.text @pytest.mark.parametrize( From 8ec45e02150d7c86413755a5ba215d1b298d307f Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Mon, 21 Jul 2025 09:56:03 -0400 Subject: [PATCH 05/12] fix: avoid importing pypdf just to count pages! --- unstructured/partition/pdf.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index a57a6eba27..23f85b583c 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -13,7 +13,7 @@ from paves.miner import LTContainer, LTImage, LTItem, LTTextBox, resolve1 from pi_heif import register_heif_opener from PIL import Image as PILImage -from pypdf import PdfReader +import playa from unstructured_inference.inference.layout import DocumentLayout from unstructured_inference.inference.layoutelement import LayoutElement @@ -539,10 +539,11 @@ def _get_pdf_page_number( file: Optional[bytes | IO[bytes]] = None, ) -> int: if file: - number_of_pages = PdfReader(file).get_num_pages() + number_of_pages = len(playa.Document(file).pages) file.seek(0) elif filename: - number_of_pages = PdfReader(filename).get_num_pages() + with playa.open(filename) as pdf: + number_of_pages = len(pdf.pages) else: raise ValueError("Either 'file' or 'filename' must be provided.") return number_of_pages From a489d295a17ea5b90f85c965c53be1af1a8ab385 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Mon, 21 Jul 2025 09:56:16 -0400 Subject: [PATCH 06/12] fix: playa needs "" as default password not None --- unstructured/partition/pdf_image/pdfminer_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py index 7c270cb693..eaee1a5baa 100644 --- a/unstructured/partition/pdf_image/pdfminer_utils.py +++ b/unstructured/partition/pdf_image/pdfminer_utils.py @@ -74,6 +74,8 @@ def open_pdfminer_pages_generator( """Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs.""" laparams_kwargs = pdfminer_config.model_dump(exclude_none=True) if pdfminer_config else {} laparams = LAParams(**laparams_kwargs) + if password is None: + password = "" # playa's default if fp is None: from functools import partial From 318a954a0cd63172f04c5248014c6c8be9143a94 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Mon, 21 Jul 2025 11:06:27 -0400 Subject: [PATCH 07/12] fix: require playa-pdf 0.6.2 for colormap issue --- requirements/extra-pdf-image.in | 1 + requirements/extra-pdf-image.txt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index df4d00ee17..b4a11ab4a0 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -5,6 +5,7 @@ onnx>=1.17.0 onnxruntime>=1.19.0 pdf2image paves +playa-pdf>=0.6.2 pi_heif pypdf google-cloud-vision diff 
--git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 3fd0178b6f..6518deb748 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -150,7 +150,7 @@ pillow==11.3.0 # pi-heif # torchvision # unstructured-pytesseract -playa-pdf==0.6.1 +playa-pdf==0.6.2 # via paves proto-plus==1.26.1 # via From 2f87d893c52f607b0b12f6b42e4f437c454f422f Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Mon, 21 Jul 2025 11:21:45 -0400 Subject: [PATCH 08/12] fix: isort --- unstructured/partition/pdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 23f85b583c..6849b435af 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -10,10 +10,10 @@ from typing import IO, TYPE_CHECKING, Any, Optional, cast import numpy as np +import playa from paves.miner import LTContainer, LTImage, LTItem, LTTextBox, resolve1 from pi_heif import register_heif_opener from PIL import Image as PILImage -import playa from unstructured_inference.inference.layout import DocumentLayout from unstructured_inference.inference.layoutelement import LayoutElement From e79845f1d2ab96cedc27db7a87f8e4a890278828 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Mon, 21 Jul 2025 11:44:24 -0400 Subject: [PATCH 09/12] fix(tests): playa/paves do not output (cid:N) droppings --- .../multi-column-2p.pdf.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json index 829b9b7a7e..c6e503a945 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json @@ -111,8 +111,8 @@ }, { "type": "CompositeElement", - "element_id": "e6dee1abec28f8ff365ab6275b3e5f0e", - "text": "2 Background\n\nThe problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as “Who first voiced Meg on Family Guy?” or “Where was the 8th Dalai Lama born?”, a system is required to answer it using a large corpus of diversified topics. More specifically, we assume\n\nthe extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). 
As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k (cid:28) |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question.\n\ne", + "element_id": "c2959a06eb5a6864c4f0c7d38e21b2e9", + "text": "2 Background\n\nThe problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as “Who first voiced Meg on Family Guy?” or “Where was the 8th Dalai Lama born?”, a system is required to answer it using a large corpus of diversified topics. More specifically, we assume\n\nthe extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k |C|. 
For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question.\n\ne", "metadata": { "data_source": { "record_locator": { @@ -153,4 +153,4 @@ "page_number": 2 } } -] \ No newline at end of file +] From afb1288250c3f19b02ff830db3b8b1315205225a Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Mon, 21 Jul 2025 11:53:24 -0400 Subject: [PATCH 10/12] fix(tests): update indices since (cid:N) no longer occurs --- test_unstructured/partition/test_msg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_unstructured/partition/test_msg.py b/test_unstructured/partition/test_msg.py index 94b12d5578..d2c0a1ce5f 100644 --- a/test_unstructured/partition/test_msg.py +++ b/test_unstructured/partition/test_msg.py @@ -125,8 +125,8 @@ def test_partition_msg_can_process_attachments(): assert all(e.metadata.filename == "fake-email-multiple-attachments.msg" for e in elements[:5]) assert all(e.metadata.filename == "unstructured_logo.png" for e in elements[5:7]) - assert all(e.metadata.filename == "dense_doc.pdf" for e in elements[7:343]) - assert all(e.metadata.filename == "Engineering Onboarding.pptx" for e in elements[343:]) + assert all(e.metadata.filename == "dense_doc.pdf" for e in elements[7:341]) + assert all(e.metadata.filename == "Engineering Onboarding.pptx" for e in elements[341:]) assert [e.text for e in elements[:5]] == [ "Here are those documents.", "--", From ea36f1075837c0cbe3fb35b6109cbc6f867613db Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Mon, 21 Jul 2025 12:02:55 -0400 Subject: [PATCH 11/12] fix(tests): update markdown and html fixtures --- .../multi-column-2p.pdf.html | 4 ++-- .../multi-column-2p.pdf.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test_unstructured_ingest/expected-structured-output-html/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.html b/test_unstructured_ingest/expected-structured-output-html/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.html index c780303b30..81f092abee 100644 --- a/test_unstructured_ingest/expected-structured-output-html/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.html +++ b/test_unstructured_ingest/expected-structured-output-html/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.html @@ -58,12 +58,12 @@
In this paper, we address the question: can we train a better dense embedding model using only pairs of questions and passages (or answers), with- out additional pretraining? By leveraging the now standard BERT pretrained model (Devlin et al., 2019) and a dual-encoder architecture (Bromley et al., 1994), we focus on developing the right training scheme using a relatively small number of question and passage pairs. Through a series of careful ablation studies, our final solution is surprisingly simple: the embedding is optimized for maximizing inner products of the question and relevant passage vectors, with an objective compar- ing all pairs of questions and passages in a batch. Our Dense Passage Retriever (DPR) is exception- ally strong. It not only outperforms BM25 by a large margin (65.2% vs. 42.9% in Top-5 accuracy), but also results in a substantial improvement on the end-to-end QA accuracy compared to ORQA (41.5% vs. 33.3%) in the open Natural Questions setting (Lee et al., 2019; Kwiatkowski et al., 2019). Our contributions are twofold. First, we demon- strate that with the proper training setup, sim- ply fine-tuning the question and passage encoders on existing question-passage pairs is sufficient to greatly outperform BM25. Our empirical results also suggest that additional pretraining may not be needed. Second, we verify that, in the context of open-domain question answering, a higher retrieval precision indeed translates to a higher end-to-end QA accuracy. By applying a modern reader model to the top retrieved passages, we achieve compara- ble or better results on multiple QA datasets in the open-retrieval setting, compared to several, much complicated systems.
-
+
2 Background The problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as “Who first voiced Meg on Family Guy?” or “Where was the 8th Dalai Lama born?”, a system is required to answer it using a large corpus of diversified topics. More specifically, we assume -the extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k (cid:28) |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question. +the extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question. e
diff --git a/test_unstructured_ingest/expected-structured-output-markdown/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.md b/test_unstructured_ingest/expected-structured-output-markdown/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.md index 8e74e6d944..c64ab3495e 100644 --- a/test_unstructured_ingest/expected-structured-output-markdown/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.md +++ b/test_unstructured_ingest/expected-structured-output-markdown/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.md @@ -43,7 +43,7 @@ In this paper, we address the question: can we train a better dense embedding mo The problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as “Who first voiced Meg on Family Guy?” or “Where was the 8th Dalai Lama born?”, a system is required to answer it using a large corpus of diversified topics. More specifically, we assume -the extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k (cid:28) |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question. +the extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k |C|. 
For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question. e 3 Dense Passage Retriever (DPR) From e99973482b6d202d0ec617cd05b6850e05eb949e Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Mon, 21 Jul 2025 12:08:07 -0400 Subject: [PATCH 12/12] fix(tests): fix missing or not missing newline for silly diff --- .../multi-column-2p.pdf.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json index c6e503a945..40c36de858 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json @@ -153,4 +153,4 @@ "page_number": 2 } } -] +] \ No newline at end of file