Skip to content

Commit ae8f1a1

Browse files
committed
merge array elements while retaining extracted status
1 parent 7e159c4 commit ae8f1a1

File tree

1 file changed

+2
-0
lines changed

1 file changed

+2
-0
lines changed

unstructured/partition/pdf_image/pdfminer_processing.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -128,6 +128,7 @@ def _merge_extracted_into_inferred_when_almost_the_same(
128 128
extracted_to_remove = extracted_layout.slice(extracted_almost_the_same_as_inferred)
129 129
# copy here in case we change the extracted layout later
130 130
inferred_layout.texts[inferred_indices_to_update] = extracted_to_remove.texts.copy()
131+
inferred_layout.is_extracted_array[inferred_indices_to_update] = extracted_to_remove.is_extracted_array.copy()
131 132
# use coords that can bound BOTH the inferred and extracted region as final bounding box coords
132 133
inferred_layout.element_coords[inferred_indices_to_update] = _minimum_containing_coords(
133 134
inferred_layout.slice(inferred_indices_to_update),
@@ -426,6 +427,7 @@ def process_page_layout_from_pdfminer(
426 427
element_class_ids=np.array(element_class),
427 428
element_class_id_map={0: ElementType.UNCATEGORIZED_TEXT, 1: ElementType.IMAGE},
428 429
sources=np.array([Source.PDFMINER] * len(element_class)),
430+
is_extracted_array=np.array([IsExtracted.TRUE] * len(element_class)),
429 431
),
430 432
urls_metadata,
431 433
)

0 commit comments

Comments
 (0)