Skip to content

Commit 1a78d06

Browse files
committed
support is_extracted metadata for elements
1 parent 130c867 commit 1a78d06

File tree

2 files changed

+8
-0
lines changed

2 files changed

+8
-0
lines changed

unstructured/documents/elements.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,7 @@ class ElementMetadata:
206206

207207
# -- used for Table elements to capture rows/col structure --
208208
text_as_html: Optional[str]
209+
is_extracted: Optional[str]
209210
table_as_cells: Optional[dict[str, str | int]]
210211
url: Optional[str]
211212

@@ -498,6 +499,7 @@ def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]:
498499
"image_base64": cls.DROP,
499500
"image_mime_type": cls.DROP,
500501
"is_continuation": cls.DROP, # -- not expected, added by chunking, not before --
502+
"is_extracted": cls.DROP,
501503
"languages": cls.LIST_UNIQUE,
502504
"last_modified": cls.FIRST,
503505
"link_texts": cls.LIST_CONCATENATE,

unstructured/partition/common/common.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import numbers
44
import subprocess
5+
from enum import Enum
56
from io import BufferedReader, BytesIO, TextIOWrapper
67
from tempfile import SpooledTemporaryFile
78
from time import sleep
@@ -58,12 +59,17 @@ def normalize_layout_element(
5859
prob = layout_dict.get("prob")
5960
aux_origin = layout_dict.get("source", None)
6061
origin = None
62+
if isinstance(layout_dict.get("is_extracted"), Enum):
63+
is_extracted = layout_dict["is_extracted"].value
64+
else:
65+
is_extracted = None
6166
if aux_origin:
6267
origin = aux_origin.value
6368
if prob and isinstance(prob, (int, str, float, numbers.Number)):
6469
class_prob_metadata = ElementMetadata(detection_class_prob=float(prob)) # type: ignore
6570
else:
6671
class_prob_metadata = ElementMetadata()
72+
class_prob_metadata.is_extracted = is_extracted
6773
common_kwargs = {
6874
"coordinates": coordinates,
6975
"coordinate_system": coordinate_system,

0 commit comments

Comments
 (0)