@@ -566,12 +566,7 @@ def _partition_pdf_or_image_local(
566566
567567 hi_res_model_name = hi_res_model_name or model_name or default_hi_res_model ()
568568 if pdf_image_dpi is None :
569- pdf_image_dpi = 300 if hi_res_model_name .startswith ("chipper" ) else 200
570- if (pdf_image_dpi < 300 ) and (hi_res_model_name .startswith ("chipper" )):
571- logger .warning (
572- "The Chipper model performs better when images are rendered with DPI >= 300 "
573- f"(currently { pdf_image_dpi } )." ,
574- )
569+ pdf_image_dpi = 200
575570
576571 od_model_layout_dumper : Optional [ObjectDetectionLayoutDumper ] = None
577572 extracted_layout_dumper : Optional [ExtractedLayoutDumper ] = None
@@ -588,53 +583,48 @@ def _partition_pdf_or_image_local(
588583 pdf_image_dpi = pdf_image_dpi ,
589584 )
590585
591- if hi_res_model_name .startswith ("chipper" ):
592- # NOTE(alan): We shouldn't do OCR with chipper
593- # NOTE(antonio): We shouldn't do PDFMiner with chipper
594- final_document_layout = inferred_document_layout
595- else :
596- extracted_layout = (
597- process_file_with_pdfminer (filename = filename , dpi = pdf_image_dpi )
598- if pdf_text_extractable
599- else []
600- )
586+ extracted_layout = (
587+ process_file_with_pdfminer (filename = filename , dpi = pdf_image_dpi )
588+ if pdf_text_extractable
589+ else []
590+ )
601591
602- if analysis :
603- if not analyzed_image_output_dir_path :
604- if env_config .GLOBAL_WORKING_DIR_ENABLED :
605- analyzed_image_output_dir_path = str (
606- Path (env_config .GLOBAL_WORKING_PROCESS_DIR ) / "annotated"
607- )
608- else :
609- analyzed_image_output_dir_path = str (Path .cwd () / "annotated" )
610- os .makedirs (analyzed_image_output_dir_path , exist_ok = True )
611- if not skip_analysis_dump :
612- od_model_layout_dumper = ObjectDetectionLayoutDumper (
613- layout = inferred_document_layout ,
614- model_name = hi_res_model_name ,
615- )
616- extracted_layout_dumper = ExtractedLayoutDumper (
617- layout = extracted_layout ,
592+ if analysis :
593+ if not analyzed_image_output_dir_path :
594+ if env_config .GLOBAL_WORKING_DIR_ENABLED :
595+ analyzed_image_output_dir_path = str (
596+ Path (env_config .GLOBAL_WORKING_PROCESS_DIR ) / "annotated"
618597 )
619- ocr_layout_dumper = OCRLayoutDumper ()
620- # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
621- merged_document_layout = merge_inferred_with_extracted_layout (
622- inferred_document_layout = inferred_document_layout ,
623- extracted_layout = extracted_layout ,
624- hi_res_model_name = hi_res_model_name ,
625- )
598+ else :
599+ analyzed_image_output_dir_path = str (Path .cwd () / "annotated" )
600+ os .makedirs (analyzed_image_output_dir_path , exist_ok = True )
601+ if not skip_analysis_dump :
602+ od_model_layout_dumper = ObjectDetectionLayoutDumper (
603+ layout = inferred_document_layout ,
604+ model_name = hi_res_model_name ,
605+ )
606+ extracted_layout_dumper = ExtractedLayoutDumper (
607+ layout = extracted_layout ,
608+ )
609+ ocr_layout_dumper = OCRLayoutDumper ()
610+ # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
611+ merged_document_layout = merge_inferred_with_extracted_layout (
612+ inferred_document_layout = inferred_document_layout ,
613+ extracted_layout = extracted_layout ,
614+ hi_res_model_name = hi_res_model_name ,
615+ )
626616
627- final_document_layout = process_file_with_ocr (
628- filename ,
629- merged_document_layout ,
630- extracted_layout = extracted_layout ,
631- is_image = is_image ,
632- infer_table_structure = infer_table_structure ,
633- ocr_languages = ocr_languages ,
634- ocr_mode = ocr_mode ,
635- pdf_image_dpi = pdf_image_dpi ,
636- ocr_layout_dumper = ocr_layout_dumper ,
637- )
617+ final_document_layout = process_file_with_ocr (
618+ filename ,
619+ merged_document_layout ,
620+ extracted_layout = extracted_layout ,
621+ is_image = is_image ,
622+ infer_table_structure = infer_table_structure ,
623+ ocr_languages = ocr_languages ,
624+ ocr_mode = ocr_mode ,
625+ pdf_image_dpi = pdf_image_dpi ,
626+ ocr_layout_dumper = ocr_layout_dumper ,
627+ )
638628 else :
639629 inferred_document_layout = process_data_with_model (
640630 file ,
@@ -643,62 +633,51 @@ def _partition_pdf_or_image_local(
643633 pdf_image_dpi = pdf_image_dpi ,
644634 )
645635
646- if hi_res_model_name .startswith ("chipper" ):
647- # NOTE(alan): We shouldn't do OCR with chipper
648- # NOTE(antonio): We shouldn't do PDFMiner with chipper
649- final_document_layout = inferred_document_layout
650- else :
651- if hasattr (file , "seek" ):
652- file .seek (0 )
636+ if hasattr (file , "seek" ):
637+ file .seek (0 )
653638
654- extracted_layout = (
655- process_data_with_pdfminer (file = file , dpi = pdf_image_dpi )
656- if pdf_text_extractable
657- else []
658- )
639+ extracted_layout = (
640+ process_data_with_pdfminer (file = file , dpi = pdf_image_dpi ) if pdf_text_extractable else []
641+ )
659642
660- if analysis :
661- if not analyzed_image_output_dir_path :
662- if env_config .GLOBAL_WORKING_DIR_ENABLED :
663- analyzed_image_output_dir_path = str (
664- Path (env_config .GLOBAL_WORKING_PROCESS_DIR ) / "annotated"
665- )
666- else :
667- analyzed_image_output_dir_path = str (Path .cwd () / "annotated" )
668- if not skip_analysis_dump :
669- od_model_layout_dumper = ObjectDetectionLayoutDumper (
670- layout = inferred_document_layout ,
671- model_name = hi_res_model_name ,
643+ if analysis :
644+ if not analyzed_image_output_dir_path :
645+ if env_config .GLOBAL_WORKING_DIR_ENABLED :
646+ analyzed_image_output_dir_path = str (
647+ Path (env_config .GLOBAL_WORKING_PROCESS_DIR ) / "annotated"
672648 )
673- extracted_layout_dumper = ExtractedLayoutDumper (
674- layout = extracted_layout ,
675- )
676- ocr_layout_dumper = OCRLayoutDumper ()
677-
678- # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
679- merged_document_layout = merge_inferred_with_extracted_layout (
680- inferred_document_layout = inferred_document_layout ,
681- extracted_layout = extracted_layout ,
682- hi_res_model_name = hi_res_model_name ,
683- )
649+ else :
650+ analyzed_image_output_dir_path = str ( Path . cwd () / "annotated" )
651+ if not skip_analysis_dump :
652+ od_model_layout_dumper = ObjectDetectionLayoutDumper (
653+ layout = inferred_document_layout ,
654+ model_name = hi_res_model_name ,
655+ )
656+ extracted_layout_dumper = ExtractedLayoutDumper (
657+ layout = extracted_layout ,
658+ )
659+ ocr_layout_dumper = OCRLayoutDumper ( )
684660
685- if hasattr (file , "seek" ):
686- file .seek (0 )
687- final_document_layout = process_data_with_ocr (
688- file ,
689- merged_document_layout ,
690- extracted_layout = extracted_layout ,
691- is_image = is_image ,
692- infer_table_structure = infer_table_structure ,
693- ocr_languages = ocr_languages ,
694- ocr_mode = ocr_mode ,
695- pdf_image_dpi = pdf_image_dpi ,
696- ocr_layout_dumper = ocr_layout_dumper ,
697- )
661+ # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
662+ merged_document_layout = merge_inferred_with_extracted_layout (
663+ inferred_document_layout = inferred_document_layout ,
664+ extracted_layout = extracted_layout ,
665+ hi_res_model_name = hi_res_model_name ,
666+ )
698667
699- # NOTE(alan): starting with v2, chipper sorts the elements itself.
700- if hi_res_model_name .startswith ("chipper" ) and hi_res_model_name != "chipperv1" :
701- kwargs ["sort_mode" ] = SORT_MODE_DONT
668+ if hasattr (file , "seek" ):
669+ file .seek (0 )
670+ final_document_layout = process_data_with_ocr (
671+ file ,
672+ merged_document_layout ,
673+ extracted_layout = extracted_layout ,
674+ is_image = is_image ,
675+ infer_table_structure = infer_table_structure ,
676+ ocr_languages = ocr_languages ,
677+ ocr_mode = ocr_mode ,
678+ pdf_image_dpi = pdf_image_dpi ,
679+ ocr_layout_dumper = ocr_layout_dumper ,
680+ )
702681
703682 final_document_layout = clean_pdfminer_inner_elements (final_document_layout )
704683
@@ -766,9 +745,7 @@ def _partition_pdf_or_image_local(
766745 " " ,
767746 el .text or "" ,
768747 ).strip ()
769- # NOTE(alan): with chipper there are parent elements with no text we don't want to
770- # filter those out and leave the children orphaned.
771- if el .text or isinstance (el , PageBreak ) or hi_res_model_name .startswith ("chipper" ):
748+ if el .text or isinstance (el , PageBreak ):
772749 out_elements .append (cast (Element , el ))
773750
774751 if extract_forms :
0 commit comments