pymupdf
diff --git a/‎CHANGES.md‎
Lines changed: 17 additions & 0 deletions b/‎CHANGES.md‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 1 addition & 1 deletion b/‎README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎pdf4llm/setup.py‎
Lines changed: 1 addition & 1 deletion b/‎pdf4llm/setup.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎pymupdf4llm/pymupdf4llm/__init__.py‎
Lines changed: 37 additions & 26 deletions b/‎pymupdf4llm/pymupdf4llm/__init__.py‎
Lines changed: 37 additions & 26 deletions
@@ -1,5 +1,22 @@
 # Change Log
 
+## Changes in version 0.2.3
+
+### Fixes:
+
+* [332](https://github.com/pymupdf/RAG/issues/332) - TypeError("to_markdown() got an unexpected keyword argument 'header'")
+
+### Other Changes:
+
+* Output (backend) methods now accept a new parameter `ocr_dpi=400` which sets the OCR resolution for full-page OCR.
+* The OCR detection heuristics are more fine-grained and now detect more OCR situations.
+* Resolved multiple performance issues, specifically cases with overwhelmingly many images or extremely large `StructTreeRoot` objects in PDFs.
+* Reflected layout-specific API changes in the legacy code, which will now raise `NotImplementedError` exceptions when layout-only features are used.
+* Informational messages during document parsing are now written to stdout collectively at the end of the phase. This applies specifically to the announcements of page OCR decisions.
+* Support the `page_separators` parameter as in legacy mode.
+
+------
+
 ## Changes in version 0.2.1
 
 ### Fixes:
 
@@ -30,7 +30,7 @@ The Python package on PyPI [pymupdf4llm](https://pypi.org/project/pymupdf4llm/)
 $ pip install -U pymupdf4llm
 ```
 
-> This command will automatically install [PyMuPDF](https://github.com/pymupdf/PyMuPDF) if required.
+> This command will automatically install or upgrade [PyMuPDF](https://github.com/pymupdf/PyMuPDF) if required.
 
 Then in your script do
 
 
@@ -6,7 +6,7 @@
 with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f:
     readme = f.read()
 
-version = "0.2.2"
+version = "0.2.3"  # must always equal the pymupdf4llm version
 
 classifiers = [
     "Development Status :: 5 - Production/Stable",
 
@@ -12,32 +12,42 @@
 version_tuple = tuple(map(int, version.split(".")))
 
 if pymupdf._get_layout is None:
-    from .helpers.pymupdf_rag import IdentifyHeaders, TocHeaders, to_markdown
+    from .helpers.pymupdf_rag import (
+        IdentifyHeaders,
+        TocHeaders,
+        to_markdown,
+        to_json,
+        to_text,
+    )
 
     pymupdf._warn_layout_once()  # recommend pymupdf_layout
 
 else:
-    from .helpers import document_layout as DL
+    from .helpers import document_layout
 
     def parse_document(
         doc,
         filename="",
         image_dpi=150,
         image_format="png",
         image_path="",
+        ocr_dpi=400,
         pages=None,
-        output_images=True,
+        write_images=False,
+        embed_images=False,
         show_progress=False,
         force_text=True,
     ):
-        return DL.parse_document(
+        return document_layout.parse_document(
             doc,
             filename=filename,
             image_dpi=image_dpi,
             image_format=image_format,
             image_path=image_path,
             pages=pages,
-            output_images=output_images,
+            ocr_dpi=ocr_dpi,
+            write_images=write_images,
+            embed_images=embed_images,
             show_progress=show_progress,
             force_text=force_text,
         )
@@ -48,40 +58,35 @@ def to_markdown(
         header=True,
         footer=True,
         pages=None,
-        hdr_info=None,
         write_images=False,
         embed_images=False,
-        ignore_images=False,
-        ignore_graphics=False,
-        detect_bg_color=True,
         image_path="",
         image_format="png",
-        image_size_limit=0.05,
         filename="",
         force_text=True,
         page_chunks=False,
         page_separators=False,
-        margins=0,
         dpi=150,
+        ocr_dpi=400,
         page_width=612,
         page_height=None,
-        table_strategy="lines_strict",
-        graphics_limit=None,
-        fontsize_limit=3,
         ignore_code=False,
-        extract_words=False,
         show_progress=False,
-        use_glyphs=False,
-        ignore_alpha=False,
+        # unsupported options for pymupdf layout:
+        **kwargs,
     ):
+        if write_images and embed_images:
+            raise ValueError("Cannot both write_images and embed_images")
         parsed_doc = parse_document(
             doc,
             filename=filename,
             image_dpi=dpi,
             image_format=image_format,
             image_path=image_path,
             pages=pages,
-            output_images=embed_images or write_images,
+            ocr_dpi=ocr_dpi,
+            write_images=write_images,
+            embed_images=embed_images,
             show_progress=show_progress,
             force_text=force_text,
         )
@@ -92,27 +97,32 @@ def to_markdown(
             embed_images=embed_images,
             ignore_code=ignore_code,
             show_progress=show_progress,
+            page_separators=page_separators,
+            page_chunks=page_chunks,
         )
 
     def to_json(
         doc,
-        header=True,
-        footer=True,
         image_dpi=150,
         image_format="png",
         image_path="",
         pages=None,
-        output_images=False,
+        ocr_dpi=400,
+        write_images=False,
+        embed_images=False,
         show_progress=False,
         force_text=True,
+        # unsupported options for pymupdf layout:
+        **kwargs,
     ):
         parsed_doc = parse_document(
             doc,
             image_dpi=image_dpi,
             image_format=image_format,
             image_path=image_path,
             pages=pages,
-            output_images=output_images,
+            embed_images=embed_images,
+            write_images=write_images,
             show_progress=show_progress,
             force_text=force_text,
         )
@@ -127,15 +137,16 @@ def to_text(
         ignore_code=False,
         show_progress=False,
         force_text=True,
+        ocr_dpi=400,
+        # unsupported options for pymupdf layout:
+        **kwargs,
     ):
         parsed_doc = parse_document(
             doc,
             filename=filename,
-            image_dpi=150,
-            image_format="png",
-            image_path="",
             pages=pages,
-            output_images=False,
+            embed_images=False,
+            write_images=False,
             show_progress=show_progress,
             force_text=force_text,
         )