
Commit 471303b

✨ add data schema override to v2
1 parent 212a490 commit 471303b

4 files changed: +87 −42 lines changed

mindee/input/inference_parameters.py

Lines changed: 34 additions & 1 deletion
@@ -1,9 +1,41 @@
+import json
 from dataclasses import dataclass
-from typing import List, Optional
+from typing import List, Optional, Union
 
 from mindee.input.polling_options import PollingOptions
 
 
+class DataSchema:
+    """Modify the Data Schema."""
+
+    _override: Optional[dict] = None
+
+    def __init__(self, override: Optional[dict] = None):
+        self._override = override
+
+    @property
+    def override(self):
+        """Override the data schema."""
+        return self._override
+
+    @override.setter
+    def override(self, value: Optional[Union[str, dict]]) -> None:
+        if value is None:
+            _override = None
+        elif isinstance(value, str):
+            _override = json.loads(value)
+        elif isinstance(value, dict):
+            _override = value
+        else:
+            raise TypeError("Invalid type for data schema override")
+        if _override is not None and _override == {}:
+            raise ValueError("Empty override provided")
+        self._override = _override
+
+    def __str__(self) -> str:
+        return json.dumps({"override": self.override})
+
+
 @dataclass
 class InferenceParameters:
     """Inference parameters to set when sending a file."""
@@ -31,3 +63,4 @@ class InferenceParameters:
     """Whether to close the file after parsing."""
     text_context: Optional[str] = None
     """Additional text context used by the model during inference. Not recommended, for specific use only."""
+    data_schema: Optional[DataSchema] = None
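For reference, a short sketch of how the new DataSchema class behaves, based only on the code added above; the field name "line_items" is an arbitrary placeholder, not something defined by this commit:

from mindee.input.inference_parameters import DataSchema

# Construct directly from a dict; __init__ stores the override as-is.
schema = DataSchema(override={"line_items": {}})  # placeholder field name
print(str(schema))  # {"override": {"line_items": {}}}

# The `override` setter also accepts a JSON string and parses it.
schema.override = '{"line_items": {}}'
assert schema.override == {"line_items": {}}

# Invalid values are rejected by the setter:
#   schema.override = 42  -> TypeError("Invalid type for data schema override")
#   schema.override = {}  -> ValueError("Empty override provided")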

mindee/mindee_http/mindee_api_v2.py

Lines changed: 3 additions & 1 deletion
@@ -94,8 +94,10 @@ def req_post_inference_enqueue(
         data["webhook_ids"] = params.webhook_ids
     if params.alias and len(params.alias):
         data["alias"] = params.alias
-    if params.text_context and (params.text_context):
+    if params.text_context and len(params.text_context):
         data["text_context"] = params.text_context
+    if params.data_schema is not None:
+        data["data_schema"] = str(params.data_schema)
 
     if isinstance(input_source, LocalInputSource):
         files = {"file": input_source.read_contents(params.close_file)}

tests/v2/test_client.py

Lines changed: 6 additions & 1 deletion
@@ -6,6 +6,7 @@
 from mindee.error.mindee_error import MindeeApiV2Error, MindeeError
 from mindee.error.mindee_http_error_v2 import MindeeHTTPErrorV2
 from mindee.input import LocalInputSource, PathInput
+from mindee.input.inference_parameters import DataSchema
 from mindee.mindee_http.base_settings import USER_AGENT
 from mindee.parsing.v2.inference import Inference
 from mindee.parsing.v2.job import Job
@@ -130,7 +131,11 @@ def test_enqueue_and_parse_path_with_env_token(custom_base_url_client):
     with pytest.raises(MindeeHTTPErrorV2):
         custom_base_url_client.enqueue_and_get_inference(
             input_doc,
-            InferenceParameters("dummy-model", text_context="ignore this message"),
+            InferenceParameters(
+                "dummy-model",
+                text_context="ignore this message",
+                data_schema=DataSchema(override={"test_field": {}}),
+            ),
         )

tests/v2/test_client_integration.py

Lines changed: 44 additions & 39 deletions
@@ -5,6 +5,7 @@
 
 from mindee import ClientV2, InferenceParameters, PathInput, UrlInputSource
 from mindee.error.mindee_http_error_v2 import MindeeHTTPErrorV2
+from mindee.input.inference_parameters import DataSchema
 from mindee.parsing.v2.inference_response import InferenceResponse
 from tests.utils import FILE_TYPES_DIR, V2_PRODUCT_DATA_DIR
 
@@ -25,6 +26,20 @@ def v2_client() -> ClientV2:
     return ClientV2(api_key)
 
 
+def _basic_assert_success(response: InferenceResponse, page_count: int) -> None:
+    assert response is not None
+    assert response.inference is not None
+
+    assert response.inference.file is not None
+    assert response.inference.file.page_count == page_count
+
+    assert response.inference.model is not None
+    assert response.inference.model.id == findoc_model_id
+
+    assert response.inference.result is not None
+    assert response.inference.active_options is not None
+
+
 @pytest.mark.integration
 @pytest.mark.v2
 def test_parse_file_empty_multiple_pages_must_succeed(
@@ -49,24 +64,15 @@ def test_parse_file_empty_multiple_pages_must_succeed(
     response: InferenceResponse = v2_client.enqueue_and_get_inference(
         input_source, params
     )
-    assert response is not None
-    assert response.inference is not None
+    _basic_assert_success(response=response, page_count=2)
 
-    assert response.inference.file is not None
     assert response.inference.file.name == "multipage_cut-2.pdf"
-    assert response.inference.file.page_count == 2
-
-    assert response.inference.model is not None
-    assert response.inference.model.id == findoc_model_id
 
-    assert response.inference.active_options is not None
     assert response.inference.active_options.rag is False
     assert response.inference.active_options.raw_text is True
     assert response.inference.active_options.polygon is False
     assert response.inference.active_options.confidence is False
 
-    assert response.inference.result is not None
-
     assert response.inference.result.raw_text is not None
     assert len(response.inference.result.raw_text.pages) == 2
 
@@ -93,24 +99,15 @@ def test_parse_file_empty_single_page_options_must_succeed(
     response: InferenceResponse = v2_client.enqueue_and_get_inference(
         input_source, params
     )
-    assert response is not None
-    assert response.inference is not None
-
-    assert response.inference.model is not None
-    assert response.inference.model.id == findoc_model_id
+    _basic_assert_success(response=response, page_count=1)
 
-    assert response.inference.file is not None
     assert response.inference.file.name == "blank_1.pdf"
-    assert response.inference.file.page_count == 1
 
-    assert response.inference.active_options is not None
     assert response.inference.active_options.rag is True
     assert response.inference.active_options.raw_text is True
     assert response.inference.active_options.polygon is True
     assert response.inference.active_options.confidence is True
 
-    assert response.inference.result is not None
-
 
 @pytest.mark.integration
 @pytest.mark.v2
@@ -137,26 +134,17 @@ def test_parse_file_filled_single_page_must_succeed(
     response: InferenceResponse = v2_client.enqueue_and_get_inference(
         input_source, params
     )
+    _basic_assert_success(response=response, page_count=1)
 
-    assert response is not None
-    assert response.inference is not None
-
-    assert response.inference.file is not None
     assert response.inference.file.name == "default_sample.jpg"
-    assert response.inference.file.page_count == 1
 
-    assert response.inference.model is not None
-    assert response.inference.model.id == findoc_model_id
-
-    assert response.inference.active_options is not None
     assert response.inference.active_options.rag is False
     assert response.inference.active_options.raw_text is False
     assert response.inference.active_options.polygon is False
     assert response.inference.active_options.confidence is False
 
     assert response.inference.result.raw_text is None
 
-    assert response.inference.result is not None
     supplier_name = response.inference.result.fields["supplier_name"]
     assert supplier_name is not None
     assert supplier_name.value == "John Smith"
@@ -266,15 +254,32 @@ def test_blank_url_input_source_must_succeed(
     response: InferenceResponse = v2_client.enqueue_and_get_inference(
         input_source, params
     )
-    assert response is not None
-    assert response.inference is not None
+    _basic_assert_success(response=response, page_count=1)
 
-    assert response.inference.file is not None
-    assert response.inference.file.page_count == 1
 
-    assert response.inference.model is not None
-    assert response.inference.model.id == findoc_model_id
-
-    assert response.inference.result is not None
+@pytest.mark.integration
+@pytest.mark.v2
+def test_data_schema_must_succeed(
+    v2_client: ClientV2,
+    findoc_model_id: str,
+) -> None:
+    """
+    Load a blank PDF from an HTTPS URL and make sure the inference call completes without raising any errors.
+    """
+    input_path: Path = FILE_TYPES_DIR / "pdf" / "blank_1.pdf"
 
-    assert response.inference.active_options is not None
+    input_source = PathInput(input_path)
+    params = InferenceParameters(
+        model_id=findoc_model_id,
+        rag=False,
+        raw_text=False,
+        polygon=False,
+        confidence=False,
+        webhook_ids=[],
+        data_schema=DataSchema(override={"test_field": {}}),
+        alias="py_integration_data_schema_override",
+    )
+    response: InferenceResponse = v2_client.enqueue_and_get_inference(
+        input_source, params
+    )
+    _basic_assert_success(response=response, page_count=1)
