diff --git a/PDF Image Extractor/README.md b/PDF Image Extractor/README.md new file mode 100644 index 0000000..0cc3da2 --- /dev/null +++ b/PDF Image Extractor/README.md @@ -0,0 +1,47 @@ +# PDF Image Extractor + +Recursively extracts all images from every PDF file in a directory tree, saving the images in a subfolder named `PDF` in the input root directory by default. Each PDF file is organized into its own folder, containing all images extracted from that document. + +## Requirements + +- Python 3.8+ +- [PyMuPDF](https://pymupdf.readthedocs.io) (`pip install pymupdf`) + +## Usage + + +1. Install dependencies: + + ``` + pip install pymupdf + ``` + + +2. Run the script: + + ``` + python pdf_image_extractor.py [--dir ] [--out ] [--dedup] + ``` + +- `--dir `: Root directory to search for PDFs (default: script directory) +- `--out `: Output directory for images (default: `/PDF`) +- `--dedup`: Enable deduplication of images per PDF (default: off) + +### Examples + +Extract all images (including duplicates) from PDFs in `./my_pdfs`: + +``` +python pdf_image_extractor.py --dir ./my_pdfs +``` + +Extract only unique images per PDF, saving to a custom output folder: + +``` +python pdf_image_extractor.py --dir ./my_pdfs --out ./images --dedup +``` + +## Output Structure + +- For each PDF, a folder named after the PDF (without extension) is created in the output directory, mirroring the original structure. +- Images are saved as `page_img.`. diff --git a/PDF Image Extractor/pdf_image_extractor.py b/PDF Image Extractor/pdf_image_extractor.py new file mode 100644 index 0000000..1a88fe3 --- /dev/null +++ b/PDF Image Extractor/pdf_image_extractor.py @@ -0,0 +1,92 @@ +import os +import argparse +import fitz # PyMuPDF + + +def extract_images(pdf_path, output_root, dedup): + try: + doc = fitz.open(pdf_path) + except Exception as e: + print(f"Failed to open {pdf_path}: {e}") + return 0 + + # Create output folder structure mirroring input + rel_path = os.path.relpath( + pdf_path, start=os.path.commonpath([pdf_path, output_root]) + ) + pdf_name = os.path.splitext(os.path.basename(pdf_path))[0] + output_folder = os.path.join(output_root, os.path.dirname(rel_path), pdf_name) + img_count = 0 + + seen = set() + for page_num in range(len(doc)): + page = doc[page_num] + images = page.get_images(full=True) + for img_index, img in enumerate(images): + xref = img[0] + if dedup and xref in seen: + continue + if dedup: + seen.add(xref) + + base_image = doc.extract_image(xref) + image_bytes = base_image["image"] + image_ext = base_image.get("ext", "png") + if img_count == 0 and not os.path.exists(output_folder): + os.makedirs(output_folder, exist_ok=True) + img_count += 1 + img_filename = f"page{page_num + 1}_img{img_index + 1}.{image_ext}" + img_path = os.path.join(output_folder, img_filename) + with open(img_path, "wb") as img_file: + img_file.write(image_bytes) + doc.close() + print(f"Extracted {img_count} images from {pdf_path} to {output_folder}") + return img_count + + +def main(): + parser = argparse.ArgumentParser( + description="Recursively extract images from all PDFs in a directory tree." + ) + parser.add_argument( + "--dir", + type=str, + default=os.path.dirname(os.path.abspath(__file__)), + help="Root directory to search for PDF files (default: script directory)", + ) + parser.add_argument( + "--out", + type=str, + default=None, + help="Output directory for extracted images (default: /PDF)", + ) + parser.add_argument( + "--dedup", + action="store_true", + help="Enable deduplication of images per PDF (default: off, extract all images including duplicates).", + ) + args = parser.parse_args() + + pdf_dir = os.path.abspath(args.dir) + # Default output is /PDF + output_root = ( + os.path.abspath(args.out) if args.out else os.path.join(pdf_dir, "PDF") + ) + + total_images = 0 + pdf_files = [] + for root, _, files in os.walk(pdf_dir): + for f in files: + if f.lower().endswith(".pdf"): + pdf_files.append(os.path.join(root, f)) + + for pdf_path in pdf_files: + total_images += extract_images(pdf_path, output_root, args.dedup) + + print( + f"---\nDone extracting images from all PDFs.\nTotal images extracted: {total_images}" + ) + + +if __name__ == "__main__": + main()