Skip to content

Commit b092d45

Browse files
authored
Remove unsupported chipper model (#3728)
The chipper model is no longer supported.
1 parent 1eceac2 commit b092d45

File tree

9 files changed

+88
-202
lines changed

9 files changed

+88
-202
lines changed

.github/workflows/ci.yml

Lines changed: 0 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -153,41 +153,6 @@ jobs:
153153
make test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
154154
make check-coverage
155155
156-
test_chipper:
157-
strategy:
158-
matrix:
159-
python-version: ["3.10"]
160-
runs-on: ubuntu-latest
161-
env:
162-
UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}
163-
NLTK_DATA: ${{ github.workspace }}/nltk_data
164-
needs: [setup, lint]
165-
steps:
166-
- uses: actions/checkout@v4
167-
- name: Set up Python ${{ matrix.python-version }}
168-
uses: actions/setup-python@v5
169-
with:
170-
python-version: ${{ matrix.python-version }}
171-
- name: Setup virtual environment
172-
uses: ./.github/actions/base-cache
173-
with:
174-
python-version: ${{ matrix.python-version }}
175-
- name: Test
176-
env:
177-
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
178-
PYTHON: python${{ matrix.python-version }}
179-
NLTK_DATA: ${{ github.workspace }}/nltk_data
180-
run: |
181-
source .venv/bin/activate
182-
sudo apt-get update
183-
sudo apt-get install -y poppler-utils
184-
make install-pandoc install-test
185-
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
186-
sudo apt-get update
187-
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
188-
tesseract --version
189-
make test-chipper CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
190-
191156
test_unit_no_extras:
192157
strategy:
193158
matrix:

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1-
## 0.16.1-dev0
1+
## 0.16.1-dev1
22

33
### Enhancements
44

55
### Features
66

77
### Fixes
88

9+
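* **Remove unsupported chipper model.** The chipper model is no longer supported, so its dedicated CI job, `chipper` pytest marker, test module, and chipper-specific branches in the PDF partitioning and PDFMiner merge code have been removed.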
910
* **Rewrite of `partition.email` module and tests.** Use modern Python stdlib `email` module interface to parse email messages and attachments. This change shortens and simplifies the code, and makes it more robust and maintainable. Several historical problems were remedied in the process.
1011

1112
## 0.16.0

Makefile

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -138,12 +138,7 @@ export UNSTRUCTURED_INCLUDE_DEBUG_METADATA ?= false
138138
.PHONY: test
139139
test:
140140
PYTHONPATH=. CI=$(CI) \
141-
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} -m "not chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
142-
143-
.PHONY: test-chipper
144-
test-chipper:
145-
PYTHONPATH=. CI=$(CI) \
146-
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} -m "chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
141+
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
147142

148143
.PHONY: test-unstructured-api-unit
149144
test-unstructured-api-unit:
@@ -309,7 +304,7 @@ docker-test:
309304
$(DOCKER_IMAGE) \
310305
bash -c "CI=$(CI) \
311306
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
312-
pytest -m 'not chipper' $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
307+
pytest $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
313308

314309
.PHONY: docker-smoke-test
315310
docker-smoke-test:

setup.cfg

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,6 @@ filterwarnings =
1515
ignore::DeprecationWarning
1616
python_classes = Test Describe
1717
python_functions = test_ it_ they_ but_ and_
18-
markers =
19-
chipper: mark a test as running chipper, which tends to be slow and compute-heavy.
2018
testpaths =
2119
test_unstructured
2220
test_unstructured_ingest

test_unstructured/partition/pdf_image/test_chipper.py

Lines changed: 0 additions & 43 deletions
This file was deleted.

test_unstructured/partition/pdf_image/test_pdf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,7 @@ def test_partition_pdf_with_model_name_env_var(
218218
assert mock_process.call_args[1]["model_name"] == "checkbox"
219219

220220

221-
@pytest.mark.parametrize("model_name", ["checkbox", "yolox", "chipper"])
221+
@pytest.mark.parametrize("model_name", ["checkbox", "yolox"])
222222
def test_partition_pdf_with_model_name(
223223
monkeypatch,
224224
model_name,

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.16.1-dev0" # pragma: no cover
1+
__version__ = "0.16.1-dev1" # pragma: no cover

unstructured/partition/pdf.py

Lines changed: 81 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -566,12 +566,7 @@ def _partition_pdf_or_image_local(
566566

567567
hi_res_model_name = hi_res_model_name or model_name or default_hi_res_model()
568568
if pdf_image_dpi is None:
569-
pdf_image_dpi = 300 if hi_res_model_name.startswith("chipper") else 200
570-
if (pdf_image_dpi < 300) and (hi_res_model_name.startswith("chipper")):
571-
logger.warning(
572-
"The Chipper model performs better when images are rendered with DPI >= 300 "
573-
f"(currently {pdf_image_dpi}).",
574-
)
569+
pdf_image_dpi = 200
575570

576571
od_model_layout_dumper: Optional[ObjectDetectionLayoutDumper] = None
577572
extracted_layout_dumper: Optional[ExtractedLayoutDumper] = None
@@ -588,53 +583,48 @@ def _partition_pdf_or_image_local(
588583
pdf_image_dpi=pdf_image_dpi,
589584
)
590585

591-
if hi_res_model_name.startswith("chipper"):
592-
# NOTE(alan): We shouldn't do OCR with chipper
593-
# NOTE(antonio): We shouldn't do PDFMiner with chipper
594-
final_document_layout = inferred_document_layout
595-
else:
596-
extracted_layout = (
597-
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
598-
if pdf_text_extractable
599-
else []
600-
)
586+
extracted_layout = (
587+
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
588+
if pdf_text_extractable
589+
else []
590+
)
601591

602-
if analysis:
603-
if not analyzed_image_output_dir_path:
604-
if env_config.GLOBAL_WORKING_DIR_ENABLED:
605-
analyzed_image_output_dir_path = str(
606-
Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
607-
)
608-
else:
609-
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
610-
os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
611-
if not skip_analysis_dump:
612-
od_model_layout_dumper = ObjectDetectionLayoutDumper(
613-
layout=inferred_document_layout,
614-
model_name=hi_res_model_name,
615-
)
616-
extracted_layout_dumper = ExtractedLayoutDumper(
617-
layout=extracted_layout,
592+
if analysis:
593+
if not analyzed_image_output_dir_path:
594+
if env_config.GLOBAL_WORKING_DIR_ENABLED:
595+
analyzed_image_output_dir_path = str(
596+
Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
618597
)
619-
ocr_layout_dumper = OCRLayoutDumper()
620-
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
621-
merged_document_layout = merge_inferred_with_extracted_layout(
622-
inferred_document_layout=inferred_document_layout,
623-
extracted_layout=extracted_layout,
624-
hi_res_model_name=hi_res_model_name,
625-
)
598+
else:
599+
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
600+
os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
601+
if not skip_analysis_dump:
602+
od_model_layout_dumper = ObjectDetectionLayoutDumper(
603+
layout=inferred_document_layout,
604+
model_name=hi_res_model_name,
605+
)
606+
extracted_layout_dumper = ExtractedLayoutDumper(
607+
layout=extracted_layout,
608+
)
609+
ocr_layout_dumper = OCRLayoutDumper()
610+
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
611+
merged_document_layout = merge_inferred_with_extracted_layout(
612+
inferred_document_layout=inferred_document_layout,
613+
extracted_layout=extracted_layout,
614+
hi_res_model_name=hi_res_model_name,
615+
)
626616

627-
final_document_layout = process_file_with_ocr(
628-
filename,
629-
merged_document_layout,
630-
extracted_layout=extracted_layout,
631-
is_image=is_image,
632-
infer_table_structure=infer_table_structure,
633-
ocr_languages=ocr_languages,
634-
ocr_mode=ocr_mode,
635-
pdf_image_dpi=pdf_image_dpi,
636-
ocr_layout_dumper=ocr_layout_dumper,
637-
)
617+
final_document_layout = process_file_with_ocr(
618+
filename,
619+
merged_document_layout,
620+
extracted_layout=extracted_layout,
621+
is_image=is_image,
622+
infer_table_structure=infer_table_structure,
623+
ocr_languages=ocr_languages,
624+
ocr_mode=ocr_mode,
625+
pdf_image_dpi=pdf_image_dpi,
626+
ocr_layout_dumper=ocr_layout_dumper,
627+
)
638628
else:
639629
inferred_document_layout = process_data_with_model(
640630
file,
@@ -643,62 +633,51 @@ def _partition_pdf_or_image_local(
643633
pdf_image_dpi=pdf_image_dpi,
644634
)
645635

646-
if hi_res_model_name.startswith("chipper"):
647-
# NOTE(alan): We shouldn't do OCR with chipper
648-
# NOTE(antonio): We shouldn't do PDFMiner with chipper
649-
final_document_layout = inferred_document_layout
650-
else:
651-
if hasattr(file, "seek"):
652-
file.seek(0)
636+
if hasattr(file, "seek"):
637+
file.seek(0)
653638

654-
extracted_layout = (
655-
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi)
656-
if pdf_text_extractable
657-
else []
658-
)
639+
extracted_layout = (
640+
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) if pdf_text_extractable else []
641+
)
659642

660-
if analysis:
661-
if not analyzed_image_output_dir_path:
662-
if env_config.GLOBAL_WORKING_DIR_ENABLED:
663-
analyzed_image_output_dir_path = str(
664-
Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
665-
)
666-
else:
667-
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
668-
if not skip_analysis_dump:
669-
od_model_layout_dumper = ObjectDetectionLayoutDumper(
670-
layout=inferred_document_layout,
671-
model_name=hi_res_model_name,
643+
if analysis:
644+
if not analyzed_image_output_dir_path:
645+
if env_config.GLOBAL_WORKING_DIR_ENABLED:
646+
analyzed_image_output_dir_path = str(
647+
Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
672648
)
673-
extracted_layout_dumper = ExtractedLayoutDumper(
674-
layout=extracted_layout,
675-
)
676-
ocr_layout_dumper = OCRLayoutDumper()
677-
678-
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
679-
merged_document_layout = merge_inferred_with_extracted_layout(
680-
inferred_document_layout=inferred_document_layout,
681-
extracted_layout=extracted_layout,
682-
hi_res_model_name=hi_res_model_name,
683-
)
649+
else:
650+
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
651+
if not skip_analysis_dump:
652+
od_model_layout_dumper = ObjectDetectionLayoutDumper(
653+
layout=inferred_document_layout,
654+
model_name=hi_res_model_name,
655+
)
656+
extracted_layout_dumper = ExtractedLayoutDumper(
657+
layout=extracted_layout,
658+
)
659+
ocr_layout_dumper = OCRLayoutDumper()
684660

685-
if hasattr(file, "seek"):
686-
file.seek(0)
687-
final_document_layout = process_data_with_ocr(
688-
file,
689-
merged_document_layout,
690-
extracted_layout=extracted_layout,
691-
is_image=is_image,
692-
infer_table_structure=infer_table_structure,
693-
ocr_languages=ocr_languages,
694-
ocr_mode=ocr_mode,
695-
pdf_image_dpi=pdf_image_dpi,
696-
ocr_layout_dumper=ocr_layout_dumper,
697-
)
661+
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
662+
merged_document_layout = merge_inferred_with_extracted_layout(
663+
inferred_document_layout=inferred_document_layout,
664+
extracted_layout=extracted_layout,
665+
hi_res_model_name=hi_res_model_name,
666+
)
698667

699-
# NOTE(alan): starting with v2, chipper sorts the elements itself.
700-
if hi_res_model_name.startswith("chipper") and hi_res_model_name != "chipperv1":
701-
kwargs["sort_mode"] = SORT_MODE_DONT
668+
if hasattr(file, "seek"):
669+
file.seek(0)
670+
final_document_layout = process_data_with_ocr(
671+
file,
672+
merged_document_layout,
673+
extracted_layout=extracted_layout,
674+
is_image=is_image,
675+
infer_table_structure=infer_table_structure,
676+
ocr_languages=ocr_languages,
677+
ocr_mode=ocr_mode,
678+
pdf_image_dpi=pdf_image_dpi,
679+
ocr_layout_dumper=ocr_layout_dumper,
680+
)
702681

703682
final_document_layout = clean_pdfminer_inner_elements(final_document_layout)
704683

@@ -766,9 +745,7 @@ def _partition_pdf_or_image_local(
766745
" ",
767746
el.text or "",
768747
).strip()
769-
# NOTE(alan): with chipper there are parent elements with no text we don't want to
770-
# filter those out and leave the children orphaned.
771-
if el.text or isinstance(el, PageBreak) or hi_res_model_name.startswith("chipper"):
748+
if el.text or isinstance(el, PageBreak):
772749
out_elements.append(cast(Element, el))
773750

774751
if extract_forms:

unstructured/partition/pdf_image/pdfminer_processing.py

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -176,10 +176,6 @@ def merge_inferred_with_extracted_layout(
176176
)
177177
from unstructured_inference.models.detectron2onnx import UnstructuredDetectronONNXModel
178178

179-
# If the model is a chipper model, we don't want to order the
180-
# elements, as they are already ordered
181-
order_elements = not hi_res_model_name.startswith("chipper")
182-
183179
inferred_pages = inferred_document_layout.pages
184180
for i, (inferred_page, extracted_page_layout) in enumerate(
185181
zip(inferred_pages, extracted_layout)
@@ -206,10 +202,7 @@ def merge_inferred_with_extracted_layout(
206202
**threshold_kwargs,
207203
)
208204

209-
if order_elements:
210-
merged_layout = sort_text_regions(
211-
cast(List["TextRegion"], merged_layout), SORT_MODE_BASIC
212-
)
205+
merged_layout = sort_text_regions(cast(List["TextRegion"], merged_layout), SORT_MODE_BASIC)
213206

214207
elements = []
215208
for layout_el in merged_layout:

0 commit comments

Comments
 (0)