⚡️ Speed up function _merge_extracted_into_inferred_when_almost_the_same by 24% in PR #4112 (feat/track-text-source) (#4114)

codeflash-ai[bot] · web-flow · commit 00a2e6aa2bb4 · 2025-11-06T14:55:23.000-06:00
## ⚡️ This pull request contains optimizations for PR #4112 If you approve this dependent PR, these changes will be merged into the original PR branch `feat/track-text-source`. >This PR will be automatically closed if the original PR is merged. ---- #### 📄 24% (0.24x) speedup for ***`_merge_extracted_into_inferred_when_almost_the_same` in `unstructured/partition/pdf_image/pdfminer_processing.py`*** ⏱️ Runtime : **`40.6 milliseconds`** **→** **`32.6 milliseconds`** (best of `18` runs) #### 📝 Explanation and details The optimized code achieves a **24% speedup** through two key optimizations: **1. Improved `_minimum_containing_coords` function:** - **What**: Replaced `np.vstack` with separate array creation followed by `np.column_stack` - **Why**: The original code created list comprehensions multiple times within `np.vstack`, causing redundant temporary arrays and inefficient memory access patterns. The optimized version pre-computes each coordinate array once, then combines them efficiently - **Impact**: Reduces function time from 1.88ms to 1.41ms (25% faster). Line profiler shows the costly list comprehensions in the original (lines with 27%, 14%, 13%, 12% of time) are replaced with more efficient array operations **2. Optimized comparison in `boxes_iou` function:** - **What**: Changed `(inter_area / denom) > threshold` to `inter_area > (threshold * denom)` - **Why**: Avoids expensive division operations by algebraically rearranging the inequality. Division is significantly slower than multiplication in NumPy, especially for large arrays - **Impact**: Reduces the final comparison from 19% to 5.8% of function time, while the intermediate denominator calculation takes 11.8% **3. Minor optimization in boolean mask creation:** - **What**: Replaced `boxes_almost_same.sum(axis=1).astype(bool)` with `np.any(boxes_almost_same, axis=1)` - **Why**: `np.any` short-circuits on the first True value and is semantically clearer, though the performance gain is minimal **Test case analysis shows the optimizations are particularly effective for:** - Large-scale scenarios (1000+ elements): 17-75% speedup depending on match patterns - Cases with no matches benefit most (74.6% faster) due to avoiding expensive division operations - All test cases show consistent 6-17% improvements, indicating robust optimization across different workloads The optimizations maintain identical functionality while reducing computational overhead through better NumPy usage patterns and mathematical rearrangement. ✅ **Correctness verification report:** | Test | Status | | --------------------------- | ----------------- | | ⏪ Replay Tests | 🔘 **None Found** | | ⚙️ Existing Unit Tests | 🔘 **None Found** | | 🔎 Concolic Coverage Tests | 🔘 **None Found** | | 🌀 Generated Regression Tests | ✅ **18 Passed** | |📊 Tests Coverage | 100.0% | <details> <summary>🌀 Generated Regression Tests and Runtime</summary> ```python import numpy as np # imports import pytest from unstructured.partition.pdf_image.pdfminer_processing import \ _merge_extracted_into_inferred_when_almost_the_same # --- Minimal class stubs and helpers to support the function under test --- class DummyLayoutElements: """ Minimal implementation of LayoutElements to support testing. - element_coords: np.ndarray of shape (N, 4) for bounding boxes. - texts: np.ndarray of shape (N,) for text strings. - is_extracted_array: np.ndarray of shape (N,) for boolean flags. """ def __init__(self, element_coords, texts=None, is_extracted_array=None): self.element_coords = np.array(element_coords, dtype=np.float32) self.texts = np.array(texts if texts is not None else [''] * len(element_coords), dtype=object) self.is_extracted_array = np.array(is_extracted_array if is_extracted_array is not None else [False] * len(element_coords), dtype=bool) def __len__(self): return len(self.element_coords) def slice(self, mask): # mask can be a boolean array or integer indices if isinstance(mask, (np.ndarray, list)): if isinstance(mask[0], bool): idx = np.where(mask)[0] else: idx = np.array(mask) else: idx = np.array([mask]) return DummyLayoutElements( self.element_coords[idx], self.texts[idx], self.is_extracted_array[idx] ) from unstructured.partition.pdf_image.pdfminer_processing import \ _merge_extracted_into_inferred_when_almost_the_same # --- Unit Tests --- # ----------- BASIC TEST CASES ----------- def test_no_inferred_elements_returns_false_mask(): # No inferred elements: all extracted should not be merged extracted = DummyLayoutElements([[0, 0, 1, 1], [1, 1, 2, 2]], texts=["a", "b"]) inferred = DummyLayoutElements([]) codeflash_output = _merge_extracted_into_inferred_when_almost_the_same(extracted, inferred, 0.9); mask = codeflash_output # 3.50μs -> 3.30μs (6.10% faster) def test_no_extracted_elements_returns_empty_mask(): # No extracted elements: should return empty mask extracted = DummyLayoutElements([]) inferred = DummyLayoutElements([[0, 0, 1, 1]]) codeflash_output = _merge_extracted_into_inferred_when_almost_the_same(extracted, inferred, 0.9); mask = codeflash_output # 2.30μs -> 2.31μs (0.475% slower) #------------------------------------------------ import numpy as np # imports import pytest from unstructured.partition.pdf_image.pdfminer_processing import \ _merge_extracted_into_inferred_when_almost_the_same # Minimal stubs for TextRegions and LayoutElements to enable testing class TextRegions: def __init__(self, coords, texts=None, is_extracted_array=None): self.x1 = coords[:, 0] self.y1 = coords[:, 1] self.x2 = coords[:, 2] self.y2 = coords[:, 3] self.texts = np.array(texts) if texts is not None else np.array([""] * len(coords)) self.is_extracted_array = np.array(is_extracted_array) if is_extracted_array is not None else np.zeros(len(coords), dtype=bool) self.element_coords = coords def __len__(self): return len(self.element_coords) def slice(self, mask): # mask can be bool array or indices if isinstance(mask, (np.ndarray, list)): if isinstance(mask, np.ndarray) and mask.dtype == bool: idx = np.where(mask)[0] else: idx = mask else: idx = [mask] coords = self.element_coords[idx] texts = self.texts[idx] is_extracted_array = self.is_extracted_array[idx] return TextRegions(coords, texts, is_extracted_array) class LayoutElements(TextRegions): pass from unstructured.partition.pdf_image.pdfminer_processing import \ _merge_extracted_into_inferred_when_almost_the_same # =========================== # Unit Tests # =========================== # ----------- BASIC TEST CASES ----------- def test_basic_exact_match(): # One extracted, one inferred, same box coords = np.array([[0, 0, 10, 10]]) extracted = LayoutElements(coords, texts=["extracted"], is_extracted_array=[True]) inferred = LayoutElements(coords, texts=["inferred"], is_extracted_array=[False]) codeflash_output = _merge_extracted_into_inferred_when_almost_the_same(extracted, inferred, 0.99); mask = codeflash_output # 207μs -> 192μs (7.74% faster) def test_basic_no_match(): # Boxes do not overlap extracted = LayoutElements(np.array([[0, 0, 10, 10]]), texts=["extracted"], is_extracted_array=[True]) inferred = LayoutElements(np.array([[20, 20, 30, 30]]), texts=["inferred"], is_extracted_array=[False]) codeflash_output = _merge_extracted_into_inferred_when_almost_the_same(extracted, inferred, 0.99); mask = codeflash_output # 163μs -> 151μs (7.85% faster) def test_basic_partial_overlap_below_threshold(): # Overlap, but below threshold extracted = LayoutElements(np.array([[0, 0, 10, 10]]), texts=["extracted"], is_extracted_array=[True]) inferred = LayoutElements(np.array([[5, 5, 15, 15]]), texts=["inferred"], is_extracted_array=[False]) codeflash_output = _merge_extracted_into_inferred_when_almost_the_same(extracted, inferred, 0.99); mask = codeflash_output # 158μs -> 148μs (6.53% faster) def test_basic_partial_overlap_above_threshold(): # Overlap, above threshold extracted = LayoutElements(np.array([[0, 0, 10, 10]]), texts=["extracted"], is_extracted_array=[True]) inferred = LayoutElements(np.array([[0, 0, 10, 10.1]]), texts=["inferred"], is_extracted_array=[False]) codeflash_output = _merge_extracted_into_inferred_when_almost_the_same(extracted, inferred, 0.99); mask = codeflash_output # 191μs -> 176μs (8.22% faster) def test_basic_multiple_elements_some_match(): # Multiple extracted/inferred, some matches extracted = LayoutElements( np.array([[0, 0, 10, 10], [20, 20, 30, 30]]), texts=["extracted1", "extracted2"], is_extracted_array=[True, True] ) inferred = LayoutElements( np.array([[0, 0, 10, 10], [100, 100, 110, 110]]), texts=["inferred1", "inferred2"], is_extracted_array=[False, False] ) codeflash_output = _merge_extracted_into_inferred_when_almost_the_same(extracted, inferred, 0.99); mask = codeflash_output # 172μs -> 162μs (5.98% faster) # ----------- EDGE TEST CASES ----------- def test_edge_empty_extracted(): # No extracted elements extracted = LayoutElements(np.zeros((0, 4)), texts=[], is_extracted_array=[]) inferred = LayoutElements(np.array([[0,0,1,1]]), texts=["foo"], is_extracted_array=[False]) codeflash_output = _merge_extracted_into_inferred_when_almost_the_same(extracted, inferred, 0.99); mask = codeflash_output # 2.08μs -> 2.06μs (0.969% faster) def test_edge_empty_inferred(): # No inferred elements extracted = LayoutElements(np.array([[0,0,1,1]]), texts=["foo"], is_extracted_array=[True]) inferred = LayoutElements(np.zeros((0, 4)), texts=[], is_extracted_array=[]) codeflash_output = _merge_extracted_into_inferred_when_almost_the_same(extracted, inferred, 0.99); mask = codeflash_output # 2.71μs -> 2.48μs (9.29% faster) def test_edge_all_elements_match(): # All extracted match inferred coords = np.array([[0,0,10,10], [20,20,30,30]]) extracted = LayoutElements(coords, texts=["A", "B"], is_extracted_array=[True, True]) inferred = LayoutElements(coords, texts=["X", "Y"], is_extracted_array=[False, False]) codeflash_output = _merge_extracted_into_inferred_when_almost_the_same(extracted, inferred, 0.99); mask = codeflash_output # 174μs -> 162μs (7.69% faster) def test_edge_threshold_zero(): # Threshold zero means all overlap counts extracted = LayoutElements(np.array([[0,0,10,10]]), texts=["foo"], is_extracted_array=[True]) inferred = LayoutElements(np.array([[5,5,15,15]]), texts=["bar"], is_extracted_array=[False]) codeflash_output = _merge_extracted_into_inferred_when_almost_the_same(extracted, inferred, 0.0); mask = codeflash_output # 159μs -> 150μs (5.94% faster) def test_edge_threshold_one(): # Threshold one means only perfect overlap counts extracted = LayoutElements(np.array([[0,0,10,10]]), texts=["foo"], is_extracted_array=[True]) inferred = LayoutElements(np.array([[0,0,10,10]]), texts=["bar"], is_extracted_array=[False]) codeflash_output = _merge_extracted_into_inferred_when_almost_the_same(extracted, inferred, 1.0); mask = codeflash_output # 155μs -> 145μs (7.01% faster) def test_edge_multiple_matches_first_match_wins(): # Extracted overlaps with multiple inferred, but only first match is updated extracted = LayoutElements(np.array([[0,0,10,10]]), texts=["foo"], is_extracted_array=[True]) inferred = LayoutElements( np.array([[0,0,10,10], [0,0,10,10]]), texts=["bar1", "bar2"], is_extracted_array=[False, False] ) codeflash_output = _merge_extracted_into_inferred_when_almost_the_same(extracted, inferred, 0.99); mask = codeflash_output # 168μs -> 156μs (7.25% faster) def test_edge_coords_are_updated_to_minimum_containing(): # Bounding boxes are updated to minimum containing box extracted = LayoutElements(np.array([[1,2,9,10]]), texts=["foo"], is_extracted_array=[True]) inferred = LayoutElements(np.array([[0,0,10,10]]), texts=["bar"], is_extracted_array=[False]) codeflash_output = _merge_extracted_into_inferred_when_almost_the_same(extracted, inferred, 0.99); mask = codeflash_output # 156μs -> 144μs (8.56% faster) # The new coords should be the minimum containing both expected = np.array([0,0,10,10]) # ----------- LARGE SCALE TEST CASES ----------- def test_large_scale_many_elements(): # 500 extracted, 500 inferred, all match N = 500 coords = np.stack([np.arange(N), np.arange(N), np.arange(N)+10, np.arange(N)+10], axis=1) extracted = LayoutElements(coords, texts=[f"ex{i}" for i in range(N)], is_extracted_array=[True]*N) inferred = LayoutElements(coords.copy(), texts=[f"in{i}" for i in range(N)], is_extracted_array=[False]*N) codeflash_output = _merge_extracted_into_inferred_when_almost_the_same(extracted, inferred, 0.99); mask = codeflash_output # 2.90ms -> 2.79ms (3.78% faster) def test_large_scale_some_elements_match(): # 1000 extracted, 500 inferred, only first 500 match N = 1000 M = 500 coords_extracted = np.stack([np.arange(N), np.arange(N), np.arange(N)+10, np.arange(N)+10], axis=1) coords_inferred = coords_extracted[:M] extracted = LayoutElements(coords_extracted, texts=[f"ex{i}" for i in range(N)], is_extracted_array=[True]*N) inferred = LayoutElements(coords_inferred.copy(), texts=[f"in{i}" for i in range(M)], is_extracted_array=[False]*M) codeflash_output = _merge_extracted_into_inferred_when_almost_the_same(extracted, inferred, 0.99); mask = codeflash_output # 6.49ms -> 5.56ms (16.6% faster) # First 500 should be merged, rest not expected_mask = np.zeros(N, dtype=bool) expected_mask[:M] = True def test_large_scale_no_elements_match(): # 1000 extracted, 500 inferred, none match N = 1000 M = 500 coords_extracted = np.stack([np.arange(N), np.arange(N), np.arange(N)+10, np.arange(N)+10], axis=1) coords_inferred = coords_extracted[:M] + 10000 # Far away extracted = LayoutElements(coords_extracted, texts=[f"ex{i}" for i in range(N)], is_extracted_array=[True]*N) inferred = LayoutElements(coords_inferred, texts=[f"in{i}" for i in range(M)], is_extracted_array=[False]*M) codeflash_output = _merge_extracted_into_inferred_when_almost_the_same(extracted, inferred, 0.99); mask = codeflash_output # 8.91ms -> 5.11ms (74.6% faster) def test_large_scale_performance(): # Test that the function runs efficiently for 1000 elements N = 1000 coords = np.stack([np.arange(N), np.arange(N), np.arange(N)+10, np.arange(N)+10], axis=1) extracted = LayoutElements(coords, texts=[f"ex{i}" for i in range(N)], is_extracted_array=[True]*N) inferred = LayoutElements(coords.copy(), texts=[f"in{i}" for i in range(N)], is_extracted_array=[False]*N) import time start = time.time() codeflash_output = _merge_extracted_into_inferred_when_almost_the_same(extracted, inferred, 0.99); mask = codeflash_output # 20.6ms -> 17.6ms (17.1% faster) elapsed = time.time() - start # codeflash_output is used to check that the output of the original code is the same as that of the optimized code. ``` </details> To edit these changes `git checkout codeflash/optimize-pr4112-2025-11-05T21.03.01` and push. [![Codeflash](https://img.shields.io/badge/Optimized%20with-Codeflash-yellow?style=flat&color=%23ffc428&logo=data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iNDgwIiBoZWlnaHQ9ImF1dG8iIHZpZXdCb3g9IjAgMCA0ODAgMjgwIiBmaWxsPSJub25lIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPgo8cGF0aCBmaWxsLXJ1bGU9ImV2ZW5vZGQiIGNsaXAtcnVsZT0iZXZlbm9kZCIgZD0iTTI4Ni43IDAuMzc4NDE4SDIwMS43NTFMNTAuOTAxIDE0OC45MTFIMTM1Ljg1MUwwLjk2MDkzOCAyODEuOTk5SDk1LjQzNTJMMjgyLjMyNCA4OS45NjE2SDE5Ni4zNDVMMjg2LjcgMC4zNzg0MThaIiBmaWxsPSIjRkZDMDQzIi8+CjxwYXRoIGZpbGwtcnVsZT0iZXZlbm9kZCIgY2xpcC1ydWxlPSJldmVub2RkIiBkPSJNMzExLjYwNyAwLjM3ODkwNkwyNTguNTc4IDU0Ljk1MjZIMzc5LjU2N0w0MzIuMzM5IDAuMzc4OTA2SDMxMS42MDdaIiBmaWxsPSIjMEIwQTBBIi8+CjxwYXRoIGZpbGwtcnVsZT0iZXZlbm9kZCIgY2xpcC1ydWxlPSJldmVub2RkIiBkPSJNMzA5LjU0NyA4OS45NjAxTDI1Ni41MTggMTQ0LjI3NkgzNzcuNTA2TDQzMC4wMjEgODkuNzAyNkgzMDkuNTQ3Vjg5Ljk2MDFaIiBmaWxsPSIjMEIwQTBBIi8+CjxwYXRoIGZpbGwtcnVsZT0iZXZlbm9kZCIgY2xpcC1ydWxlPSJldmVub2RkIiBkPSJNMjQyLjg3MyAxNjQuNjZMMTg5Ljg0NCAyMTkuMjM0SDMxMC44MzNMMzYzLjM0NyAxNjQuNjZIMjQyLjg3M1oiIGZpbGw9IiMwQjBBMEEiLz4KPC9zdmc+Cg==)](https://codeflash.ai) ![Static Badge](https://img.shields.io/badge/🎯_Optimization_Quality-high-green)  --- > [!NOTE] > Speeds up layout merging by optimizing bounding-box aggregation, boolean mask creation, and IOU comparison to avoid divisions. > > - **Performance optimizations in `unstructured/partition/pdf_image/pdfminer_processing.py`**: > - `/_minimum_containing_coords`: > - Precomputes `x1/y1/x2/y2` arrays and uses `np.column_stack` to build output; removes extra transpose. > - `/_merge_extracted_into_inferred_when_almost_the_same`: > - Replaces `sum(...).astype(bool)` with `np.any(..., axis=1)` for match mask. > - `/boxes_iou`: > - Computes denominator once and replaces division `(x/y) > t` with `x > t*y` to avoid divisions. > > <sup>Written by [Cursor Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit 8a0335f. This will update automatically on new commits. Configure [here](https://cursor.com/dashboard?tab=bugbot).</sup>  Co-authored-by: codeflash-ai[bot] <148906541+codeflash-ai[bot]@users.noreply.github.com>
diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py
@@ -57,14 +57,20 @@ def _validate_bbox(bbox: list[int | float]) -> bool:
 
 def _minimum_containing_coords(*regions: TextRegions) -> np.ndarray:
     # TODO: refactor to just use np array as input
-    return np.vstack(
+    # Optimization: Use np.stack and np.column_stack to build output in a single step
+    x1s = np.array([region.x1 for region in regions])
+    y1s = np.array([region.y1 for region in regions])
+    x2s = np.array([region.x2 for region in regions])
+    y2s = np.array([region.y2 for region in regions])
+    # Use np.min/max reduction rather than create matrix then operate. Transpose last for shape (N, 4)
+    return np.column_stack(
         (
-            np.min([region.x1 for region in regions], axis=0),
-            np.min([region.y1 for region in regions], axis=0),
-            np.max([region.x2 for region in regions], axis=0),
-            np.max([region.y2 for region in regions], axis=0),
+            np.min(x1s, axis=0),
+            np.min(y1s, axis=0),
+            np.max(x2s, axis=0),
+            np.max(y2s, axis=0),
         )
-    ).T
+    )
 
 
 def _inferred_is_elementtype(
@@ -120,7 +126,7 @@ def _merge_extracted_into_inferred_when_almost_the_same(
         inferred_layout.element_coords,
         threshold=same_region_threshold,
     )
-    extracted_almost_the_same_as_inferred = boxes_almost_same.sum(axis=1).astype(bool)
+    extracted_almost_the_same_as_inferred = np.any(boxes_almost_same, axis=1)
     # NOTE: if a row is full of False the argmax returns first index; we use the mask above to
     # distinguish those (they would be False in the mask)
     first_match = np.argmax(boxes_almost_same, axis=1)
@@ -584,7 +590,9 @@ def boxes_iou(
     inter_area, boxa_area, boxb_area = areas_of_boxes_and_intersection_area(
         coords1, coords2, round_to=round_to
     )
-    return (inter_area / np.maximum(EPSILON_AREA, boxa_area + boxb_area.T - inter_area)) > threshold
+    denom = np.maximum(EPSILON_AREA, boxa_area + boxb_area.T - inter_area)
+    # Instead of (x/y) > t, use x > t*y for memory & speed with same result
+    return inter_area > (threshold * denom)
 
 
 @requires_dependencies("unstructured_inference")