Commit c96a738

✨ add data schema override to v2
1 parent 212a490 commit c96a738

4 files changed: +100 −42 lines


mindee/input/inference_parameters.py

Lines changed: 34 additions & 1 deletion
@@ -1,9 +1,41 @@
+import json
 from dataclasses import dataclass
-from typing import List, Optional
+from typing import List, Optional, Union

 from mindee.input.polling_options import PollingOptions


+class DataSchema:
+    """Modify the Data Schema."""
+
+    _override: Optional[list] = None
+
+    def __init__(self, override: Optional[list] = None):
+        self._override = override
+
+    @property
+    def override(self):
+        """Override the data schema."""
+        return self._override
+
+    @override.setter
+    def override(self, value: Optional[Union[str, list]]) -> None:
+        if value is None:
+            _override = None
+        elif isinstance(value, str):
+            _override = json.loads(value)
+        elif isinstance(value, list):
+            _override = value
+        else:
+            raise TypeError("Invalid type for data schema override")
+        if _override is not None and _override == {}:
+            raise ValueError("Empty override provided")
+        self._override = _override
+
+    def __str__(self) -> str:
+        return json.dumps({"override": self.override})
+
+
 @dataclass
 class InferenceParameters:
     """Inference parameters to set when sending a file."""
@@ -31,3 +63,4 @@ class InferenceParameters:
     """Whether to close the file after parsing."""
     text_context: Optional[str] = None
     """Additional text context used by the model during inference. Not recommended, for specific use only."""
+    data_schema: Optional[DataSchema] = None
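
For readers skimming the diff, a minimal usage sketch of the new DataSchema class, exercising the property setter and the JSON serialization added above (the model id and field names are illustrative placeholders; the behaviour mirrors the committed code):

from mindee.input.inference_parameters import DataSchema, InferenceParameters

# The setter accepts either a JSON string (parsed with json.loads) or a list.
schema = DataSchema()
schema.override = '[{"name": "total_amount", "type": "number"}]'
schema.override = [{"name": "total_amount", "type": "number"}]

# Any other type is rejected by the setter.
try:
    schema.override = 42
except TypeError as err:
    print(err)  # Invalid type for data schema override

# __str__ wraps the override in the JSON fragment sent to the API.
print(str(schema))  # {"override": [{"name": "total_amount", "type": "number"}]}

# Attach the schema to the inference parameters ("my-model-id" is a placeholder).
params = InferenceParameters(model_id="my-model-id", data_schema=schema)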

mindee/mindee_http/mindee_api_v2.py

Lines changed: 3 additions & 1 deletion
@@ -94,8 +94,10 @@ def req_post_inference_enqueue(
             data["webhook_ids"] = params.webhook_ids
         if params.alias and len(params.alias):
             data["alias"] = params.alias
-        if params.text_context and (params.text_context):
+        if params.text_context and len(params.text_context):
             data["text_context"] = params.text_context
+        if params.data_schema is not None:
+            data["data_schema"] = str(params.data_schema)

         if isinstance(input_source, LocalInputSource):
             files = {"file": input_source.read_contents(params.close_file)}
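
To make the wire format concrete, here is a rough sketch of the form field this change adds to the enqueue request, reusing the serialization shown in the first file (the model id, alias, and override entry are illustrative placeholders):

from mindee.input.inference_parameters import DataSchema, InferenceParameters

params = InferenceParameters(
    model_id="my-model-id",  # placeholder
    alias="my-alias",        # placeholder
    data_schema=DataSchema(override=[{"name": "test", "type": "string"}]),
)

data = {}
if params.alias and len(params.alias):
    data["alias"] = params.alias
if params.data_schema is not None:
    data["data_schema"] = str(params.data_schema)

# data["data_schema"] now holds the JSON string
# '{"override": [{"name": "test", "type": "string"}]}'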

tests/v2/test_client.py

Lines changed: 6 additions & 1 deletion
@@ -6,6 +6,7 @@
 from mindee.error.mindee_error import MindeeApiV2Error, MindeeError
 from mindee.error.mindee_http_error_v2 import MindeeHTTPErrorV2
 from mindee.input import LocalInputSource, PathInput
+from mindee.input.inference_parameters import DataSchema
 from mindee.mindee_http.base_settings import USER_AGENT
 from mindee.parsing.v2.inference import Inference
 from mindee.parsing.v2.job import Job
@@ -130,7 +131,11 @@ def test_enqueue_and_parse_path_with_env_token(custom_base_url_client):
     with pytest.raises(MindeeHTTPErrorV2):
         custom_base_url_client.enqueue_and_get_inference(
             input_doc,
-            InferenceParameters("dummy-model", text_context="ignore this message"),
+            InferenceParameters(
+                "dummy-model",
+                text_context="ignore this message",
+                data_schema=DataSchema(override={"test_field": {}}),
+            ),
         )

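One detail worth noting when reading the updated test: DataSchema(override=...) assigns the private attribute directly in __init__, so the dict literal used here bypasses the str/list check that the property setter enforces. A quick sketch of the difference (values are illustrative):

from mindee.input.inference_parameters import DataSchema

# Constructor path: the value is stored as-is, without a type check.
ds = DataSchema(override={"test_field": {}})
print(ds.override)  # {'test_field': {}}

# Setter path: only None, a JSON string, or a list are accepted.
ds = DataSchema()
try:
    ds.override = {"test_field": {}}
except TypeError as err:
    print(err)  # Invalid type for data schema override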

tests/v2/test_client_integration.py

Lines changed: 57 additions & 39 deletions
@@ -5,6 +5,7 @@

 from mindee import ClientV2, InferenceParameters, PathInput, UrlInputSource
 from mindee.error.mindee_http_error_v2 import MindeeHTTPErrorV2
+from mindee.input.inference_parameters import DataSchema
 from mindee.parsing.v2.inference_response import InferenceResponse
 from tests.utils import FILE_TYPES_DIR, V2_PRODUCT_DATA_DIR

@@ -25,6 +26,22 @@ def v2_client() -> ClientV2:
     return ClientV2(api_key)


+def _basic_assert_success(
+    response: InferenceResponse, page_count: int, model_id: str
+) -> None:
+    assert response is not None
+    assert response.inference is not None
+
+    assert response.inference.file is not None
+    assert response.inference.file.page_count == page_count
+
+    assert response.inference.model is not None
+    assert response.inference.model.id == model_id
+
+    assert response.inference.result is not None
+    assert response.inference.active_options is not None
+
+
 @pytest.mark.integration
 @pytest.mark.v2
 def test_parse_file_empty_multiple_pages_must_succeed(
@@ -49,24 +66,15 @@ def test_parse_file_empty_multiple_pages_must_succeed(
     response: InferenceResponse = v2_client.enqueue_and_get_inference(
         input_source, params
     )
-    assert response is not None
-    assert response.inference is not None
+    _basic_assert_success(response=response, page_count=2, model_id=findoc_model_id)

-    assert response.inference.file is not None
     assert response.inference.file.name == "multipage_cut-2.pdf"
-    assert response.inference.file.page_count == 2
-
-    assert response.inference.model is not None
-    assert response.inference.model.id == findoc_model_id

-    assert response.inference.active_options is not None
     assert response.inference.active_options.rag is False
     assert response.inference.active_options.raw_text is True
     assert response.inference.active_options.polygon is False
     assert response.inference.active_options.confidence is False

-    assert response.inference.result is not None
-
     assert response.inference.result.raw_text is not None
     assert len(response.inference.result.raw_text.pages) == 2

@@ -93,24 +101,15 @@ def test_parse_file_empty_single_page_options_must_succeed(
     response: InferenceResponse = v2_client.enqueue_and_get_inference(
         input_source, params
     )
-    assert response is not None
-    assert response.inference is not None
-
-    assert response.inference.model is not None
-    assert response.inference.model.id == findoc_model_id
+    _basic_assert_success(response=response, page_count=1, model_id=findoc_model_id)

-    assert response.inference.file is not None
     assert response.inference.file.name == "blank_1.pdf"
-    assert response.inference.file.page_count == 1

-    assert response.inference.active_options is not None
     assert response.inference.active_options.rag is True
     assert response.inference.active_options.raw_text is True
     assert response.inference.active_options.polygon is True
     assert response.inference.active_options.confidence is True

-    assert response.inference.result is not None
-

 @pytest.mark.integration
 @pytest.mark.v2
@@ -137,26 +136,17 @@ def test_parse_file_filled_single_page_must_succeed(
     response: InferenceResponse = v2_client.enqueue_and_get_inference(
         input_source, params
     )
+    _basic_assert_success(response=response, page_count=1, model_id=findoc_model_id)

-    assert response is not None
-    assert response.inference is not None
-
-    assert response.inference.file is not None
     assert response.inference.file.name == "default_sample.jpg"
-    assert response.inference.file.page_count == 1

-    assert response.inference.model is not None
-    assert response.inference.model.id == findoc_model_id
-
-    assert response.inference.active_options is not None
     assert response.inference.active_options.rag is False
     assert response.inference.active_options.raw_text is False
     assert response.inference.active_options.polygon is False
     assert response.inference.active_options.confidence is False

     assert response.inference.result.raw_text is None

-    assert response.inference.result is not None
     supplier_name = response.inference.result.fields["supplier_name"]
     assert supplier_name is not None
     assert supplier_name.value == "John Smith"
@@ -266,15 +256,43 @@ def test_blank_url_input_source_must_succeed(
     response: InferenceResponse = v2_client.enqueue_and_get_inference(
         input_source, params
     )
-    assert response is not None
-    assert response.inference is not None
-
-    assert response.inference.file is not None
-    assert response.inference.file.page_count == 1
+    _basic_assert_success(response=response, page_count=1, model_id=findoc_model_id)

-    assert response.inference.model is not None
-    assert response.inference.model.id == findoc_model_id

-    assert response.inference.result is not None
+@pytest.mark.integration
+@pytest.mark.v2
+def test_data_schema_must_succeed(
+    v2_client: ClientV2,
+    findoc_model_id: str,
+) -> None:
+    """
+    Load a blank PDF from a local path and make sure the inference call with a data schema override completes without raising any errors.
+    """
+    input_path: Path = FILE_TYPES_DIR / "pdf" / "blank_1.pdf"

-    assert response.inference.active_options is not None
+    input_source = PathInput(input_path)
+    params = InferenceParameters(
+        model_id=findoc_model_id,
+        rag=False,
+        raw_text=False,
+        polygon=False,
+        confidence=False,
+        webhook_ids=[],
+        data_schema=DataSchema(
+            override=[
+                {
+                    "name": "test",
+                    "title": "Test",
+                    "is_array": False,
+                    "type": "string",
+                    "description": "A test field",
+                }
+            ]
+        ),
+        alias="py_integration_data_schema_override",
+    )
+    response: InferenceResponse = v2_client.enqueue_and_get_inference(
+        input_source, params
+    )
+    _basic_assert_success(response=response, page_count=1, model_id=findoc_model_id)
+    assert response.inference.result.fields["test"] is not None
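
As a follow-up to the integration test, the field created by the override can be read back from the result like any model-defined field; a short sketch (attribute names follow the assertions already used in this file):

test_field = response.inference.result.fields["test"]
assert test_field is not None
print(test_field.value)  # may be None for a blank document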
