Skip to content

Commit cf0eecd

Browse files
authored
Merge pull request #333 from pymupdf/v0.2.3
Version 0.2.3
2 parents 4965299 + 3440b5a commit cf0eecd

File tree

10 files changed

+593
-279
lines changed

10 files changed

+593
-279
lines changed

CHANGES.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,22 @@
11
# Change Log
22

3+
## Changes in version 0.2.3
4+
5+
### Fixes:
6+
7+
* [332](https://github.com/pymupdf/RAG/issues/332) - TypeError("to_markdown() got an unexpected keyword argument 'header'")
8+
9+
### Other Changes:
10+
11+
* Output (backend) methods now accept a new parameter `ocr_dpi=400` which sets the OCR resolution for full-page OCR.
12+
* The OCR detection heuristics are more fine-grained and now detect more OCR situations.
13+
* Resolved multiple performance issues, specifically for PDFs with an overwhelming number of images and extremely large `StructTreeRoot` objects.
14+
* The legacy code now reflects the layout-specific API changes and raises `NotImplementedError` exceptions when layout-only features are used.
15+
* Informational messages during document parsing are now written to stdout collectively at the end of the phase. This applies specifically to the announcements of page OCR decisions.
16+
* Added support for the `page_separators` parameter, as in legacy mode.
17+
18+
------
19+
320
## Changes in version 0.2.1
421

522
### Fixes:

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ The Python package on PyPI [pymupdf4llm](https://pypi.org/project/pymupdf4llm/)
3030
$ pip install -U pymupdf4llm
3131
```
3232

33-
> This command will automatically install [PyMuPDF](https://github.com/pymupdf/PyMuPDF) if required.
33+
> This command will automatically install or upgrade [PyMuPDF](https://github.com/pymupdf/PyMuPDF) if required.
3434
3535
Then in your script do
3636

pdf4llm/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f:
77
readme = f.read()
88

9-
version = "0.2.2"
9+
version = "0.2.3" # must always equal the pymupdf4llm version
1010

1111
classifiers = [
1212
"Development Status :: 5 - Production/Stable",

pymupdf4llm/pymupdf4llm/__init__.py

Lines changed: 37 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -12,32 +12,42 @@
1212
version_tuple = tuple(map(int, version.split(".")))
1313

1414
if pymupdf._get_layout is None:
15-
from .helpers.pymupdf_rag import IdentifyHeaders, TocHeaders, to_markdown
15+
from .helpers.pymupdf_rag import (
16+
IdentifyHeaders,
17+
TocHeaders,
18+
to_markdown,
19+
to_json,
20+
to_text,
21+
)
1622

1723
pymupdf._warn_layout_once() # recommend pymupdf_layout
1824

1925
else:
20-
from .helpers import document_layout as DL
26+
from .helpers import document_layout
2127

2228
def parse_document(
2329
doc,
2430
filename="",
2531
image_dpi=150,
2632
image_format="png",
2733
image_path="",
34+
ocr_dpi=400,
2835
pages=None,
29-
output_images=True,
36+
write_images=False,
37+
embed_images=False,
3038
show_progress=False,
3139
force_text=True,
3240
):
33-
return DL.parse_document(
41+
return document_layout.parse_document(
3442
doc,
3543
filename=filename,
3644
image_dpi=image_dpi,
3745
image_format=image_format,
3846
image_path=image_path,
3947
pages=pages,
40-
output_images=output_images,
48+
ocr_dpi=ocr_dpi,
49+
write_images=write_images,
50+
embed_images=embed_images,
4151
show_progress=show_progress,
4252
force_text=force_text,
4353
)
@@ -48,40 +58,35 @@ def to_markdown(
4858
header=True,
4959
footer=True,
5060
pages=None,
51-
hdr_info=None,
5261
write_images=False,
5362
embed_images=False,
54-
ignore_images=False,
55-
ignore_graphics=False,
56-
detect_bg_color=True,
5763
image_path="",
5864
image_format="png",
59-
image_size_limit=0.05,
6065
filename="",
6166
force_text=True,
6267
page_chunks=False,
6368
page_separators=False,
64-
margins=0,
6569
dpi=150,
70+
ocr_dpi=400,
6671
page_width=612,
6772
page_height=None,
68-
table_strategy="lines_strict",
69-
graphics_limit=None,
70-
fontsize_limit=3,
7173
ignore_code=False,
72-
extract_words=False,
7374
show_progress=False,
74-
use_glyphs=False,
75-
ignore_alpha=False,
75+
# unsupported options for pymupdf layout:
76+
**kwargs,
7677
):
78+
if write_images and embed_images:
79+
raise ValueError("Cannot both write_images and embed_images")
7780
parsed_doc = parse_document(
7881
doc,
7982
filename=filename,
8083
image_dpi=dpi,
8184
image_format=image_format,
8285
image_path=image_path,
8386
pages=pages,
84-
output_images=embed_images or write_images,
87+
ocr_dpi=ocr_dpi,
88+
write_images=write_images,
89+
embed_images=embed_images,
8590
show_progress=show_progress,
8691
force_text=force_text,
8792
)
@@ -92,27 +97,32 @@ def to_markdown(
9297
embed_images=embed_images,
9398
ignore_code=ignore_code,
9499
show_progress=show_progress,
100+
page_separators=page_separators,
101+
page_chunks=page_chunks,
95102
)
96103

97104
def to_json(
98105
doc,
99-
header=True,
100-
footer=True,
101106
image_dpi=150,
102107
image_format="png",
103108
image_path="",
104109
pages=None,
105-
output_images=False,
110+
ocr_dpi=400,
111+
write_images=False,
112+
embed_images=False,
106113
show_progress=False,
107114
force_text=True,
115+
# unsupported options for pymupdf layout:
116+
**kwargs,
108117
):
109118
parsed_doc = parse_document(
110119
doc,
111120
image_dpi=image_dpi,
112121
image_format=image_format,
113122
image_path=image_path,
114123
pages=pages,
115-
output_images=output_images,
124+
embed_images=embed_images,
125+
write_images=write_images,
116126
show_progress=show_progress,
117127
force_text=force_text,
118128
)
@@ -127,15 +137,16 @@ def to_text(
127137
ignore_code=False,
128138
show_progress=False,
129139
force_text=True,
140+
ocr_dpi=400,
141+
# unsupported options for pymupdf layout:
142+
**kwargs,
130143
):
131144
parsed_doc = parse_document(
132145
doc,
133146
filename=filename,
134-
image_dpi=150,
135-
image_format="png",
136-
image_path="",
137147
pages=pages,
138-
output_images=False,
148+
embed_images=False,
149+
write_images=False,
139150
show_progress=show_progress,
140151
force_text=force_text,
141152
)

0 commit comments

Comments
 (0)