Skip to content

Commit abcc4f3

Browse files
committed
Add merge logic for is_extracted
1 parent 1a78d06 commit abcc4f3

File tree

1 file changed

+3
-2
lines changed

1 file changed

+3
-2
lines changed

unstructured/partition/pdf_image/pdfminer_processing.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
from pdfminer.pdftypes import PDFObjRef
99
from pdfminer.utils import open_filename
1010
from unstructured_inference.config import inference_config
11-
from unstructured_inference.constants import FULL_PAGE_REGION_THRESHOLD
11+
from unstructured_inference.constants import FULL_PAGE_REGION_THRESHOLD, IsExtracted
1212
from unstructured_inference.inference.elements import Rectangle
1313

1414
from unstructured.documents.coordinates import PixelSpace, PointSpace
@@ -647,13 +647,14 @@ def merge_inferred_with_extracted_layout(
647647
merged_layout = sort_text_regions(merged_layout, SORT_MODE_BASIC)
648648
# so that we can modify the text without worrying about hitting length limit
649649
merged_layout.texts = merged_layout.texts.astype(object)
650-
650+
merged_layout.is_extracted_array = merged_layout.is_extracted_array.astype(object)
651651
for i, text in enumerate(merged_layout.texts):
652652
if text is None:
653653
text = aggregate_embedded_text_by_block(
654654
target_region=merged_layout.slice([i]),
655655
source_regions=extracted_page_layout,
656656
)
657+
merged_layout.is_extracted_array[i] = IsExtracted.TRUE
657658
merged_layout.texts[i] = remove_control_characters(text)
658659

659660
inferred_page.elements_array = merged_layout

0 commit comments

Comments
 (0)