wasmerio · nglong14 · Nov 24, 2025 · Nov 24, 2025
diff --git a/PDF to Markdown/PDFtoMD.py b/PDF to Markdown/PDFtoMD.py
@@ -0,0 +1,112 @@
+import pytesseract
+from PIL import Image
+from markitdown import MarkItDown
+import pdf2image
+import os
+import sys
+
+# Configure tesseract path
+# NOTE: both values below are absolute, machine-specific Windows paths and
+# must be edited to match the local Tesseract and Poppler installations.
+pytesseract.pytesseract.tesseract_cmd = r'D:\Tesseract OCR\tesseract.exe'
+# Folder with the Poppler binaries; handed to pdf2image in convert_image_pdf.
+poppler_path = r'D:\Release-25.07.0-0\poppler-25.07.0\Library\bin'
+
+def detect_type(pdf_file):
+    try:
+        md = MarkItDown()
+        result = md.convert(pdf_file)
+
+        if len(result.text_content.strip()) < 50:
+            return "image"
+        else: 
+            return "text"
+    except Exception as e:
+        print(f"Error detecting PDF type: {e}")
+        return "image"
+
+def convert_text_pdf(pdf_file, output_folder):
+    try:
+        md = MarkItDown()
+        result = md.convert(pdf_file)
+
+        md_filename = os.path.basename(pdf_file).replace('.pdf', '.md')
+        output_path = os.path.join(output_folder, md_filename)
+
+        with open(output_path, "w", encoding="utf-8") as f:
+            f.write(result.text_content)
+
+        print(f"Converted {pdf_file} -> {output_path}")
+        return True
+    except Exception as e:
+        print(f"Error converting {pdf_file}: {e}")
+        return False
+
+def convert_image_pdf(pdf_file, output_folder):
+    try:
+        pages = pdf2image.convert_from_path(pdf_file, dpi=300, poppler_path = poppler_path)
+
+        pdf_name = os.path.basename(pdf_file)
+        all_text = f"# {pdf_name}\n\n"
+
+        for i, page in enumerate(pages):
+            page = page.convert('L')
+
+            text = pytesseract.image_to_string(
+                page, 
+                lang='vie',
+                config='--oem 3 --psm 6'
+            )
+
+            if text.strip():
+                all_text += f'## Trang {i+1}\n\n{text}\n\n'
+
+        md_filename = os.path.basename(pdf_file).replace('.pdf', '.md')
+        output_path = os.path.join(output_folder, md_filename)
+
+        with open(output_path, "w", encoding="utf-8") as f:
+            f.write(all_text)
+
+        print(f"Converted {pdf_file} -> {output_path}")
+        return True
+    except Exception as e:
+        print(f"Error converting {pdf_file}: {e}")
+        return False
+
+def smart_convert_pdf(pdf_file, output_folder=r"D:\PythonProject\ToMD\Output"):
+    os.makedirs(output_folder, exist_ok=True)
+
+    pdf_type = detect_type(pdf_file)
+    print(f"Detected PDF type: {pdf_type}")
+
+    if pdf_type == "text":
+        return convert_text_pdf(pdf_file, output_folder)
+    else:
+        return convert_image_pdf(pdf_file, output_folder)
+
+def main():
+    data_folder = r'D:\PythonProject\ToMD\Data'
+
+    if not os.path.exists(data_folder):
+        print(f"Data folder not found: {data_folder}")
+        return
+
+    pdf_files = [f for f in os.listdir(data_folder) if f.lower().endswith('.pdf')]
+
+    if not pdf_files:
+        print("No PDF files found in the data folder")
+        return
+
+    print(f"Found {len(pdf_files)} PDF files")
+
+    for pdf_file in pdf_files:
+        full_path = os.path.join(data_folder, pdf_file)
+        print(f"\nProcessing: {pdf_file}")
+
+        success = smart_convert_pdf(full_path)
+        if success:
+            print("Conversion successful")
+        else:
+            print("Conversion failed")
+
+        print("-" * 50)
+
+# Run the batch conversion only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()
diff --git a/PDF to Markdown/README.md b/PDF to Markdown/README.md
@@ -0,0 +1,60 @@
+# PDF to Markdown Converter
+
+A Python script that intelligently converts PDF files to Markdown format. The script automatically detects whether a PDF is text-based or image-based and applies the appropriate conversion method.
+
+## Features
+
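+- **Smart Detection**: Automatically classifies each PDF as text-based or image-based (scanned) by checking how much text can be extracted directly; files yielding fewer than 50 characters are treated as image-based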
+- **Text PDF Conversion**: Extracts text directly from text-based PDFs
+- **Image PDF Conversion**: Uses OCR (Optical Character Recognition) to extract text from image-based PDFs
+- **Batch Processing**: Converts multiple PDF files at once
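+- **Vietnamese Language Support**: OCR is configured for Vietnamese (Tesseract language code `vie`), so the Vietnamese language data must be installed alongside Tesseract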
+
+## Requirements
+
+- Python 3.x
+- pytesseract
+- Pillow (PIL)
+- markitdown
+- pdf2image
+- Tesseract OCR (installed separately)
+- Poppler (installed separately)
+
+## Installation
+
+1. Install Python dependencies:
+```bash
+pip install pytesseract Pillow markitdown pdf2image
+```
+
+2. Install Tesseract OCR from [https://github.com/tesseract-ocr/tesseract](https://github.com/tesseract-ocr/tesseract)
+
+3. Install Poppler from [https://github.com/oschwartz10612/poppler-windows/releases](https://github.com/oschwartz10612/poppler-windows/releases)
+
+4. Update the paths in the script:
+   - `tesseract_cmd`: Path to your Tesseract executable
+   - `poppler_path`: Path to your Poppler bin folder
+   - `data_folder`: Folder containing your PDF files
+   - `output_folder`: Folder where Markdown files will be saved
+
+## Usage
+
+1. Place your PDF files in the data folder
+2. Run the script:
+```bash
+python PDFtoMD.py
+```
+
+3. Find converted Markdown files in the output folder
+
+## How It Works
+
+1. The script scans the data folder for PDF files
+2. For each PDF, it detects whether it's text-based or image-based
+3. Text PDFs are converted directly using MarkItDown
+4. Image PDFs are converted using OCR with pytesseract
+5. Output is saved as `.md` files with the same name as the input PDF
+
+## License
+
+This project is open source and available under the MIT License.