Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 112 additions & 0 deletions PDF to Markdown/PDFtoMD.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import pytesseract
from PIL import Image
from markitdown import MarkItDown
import pdf2image
import os
import sys

# Locations of the external tools this script shells out to.
# Both default to the original Windows install paths; set the TESSERACT_CMD
# and/or POPPLER_PATH environment variables to override them without
# editing the source.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD', r'D:\Tesseract OCR\tesseract.exe'
)
# Passed to pdf2image.convert_from_path as its poppler_path argument.
poppler_path = os.environ.get(
    'POPPLER_PATH', r'D:\Release-25.07.0-0\poppler-25.07.0\Library\bin'
)

def detect_type(pdf_file):
    """Classify a PDF as ``"text"`` or ``"image"``.

    The file is run through MarkItDown; when the extracted text, stripped of
    surrounding whitespace, is shorter than 50 characters the PDF is treated
    as image-based (i.e. it needs OCR).

    Args:
        pdf_file: Path of the PDF to inspect.

    Returns:
        ``"text"`` when at least 50 characters could be extracted, otherwise
        ``"image"``. Any exception raised while extracting is printed and
        also yields ``"image"``, so OCR acts as the fallback path.
    """
    try:
        extracted = MarkItDown().convert(pdf_file).text_content
        has_text_layer = len(extracted.strip()) >= 50
    except Exception as e:
        print(f"Error detecting PDF type: {e}")
        return "image"

    return "text" if has_text_layer else "image"

def convert_text_pdf(pdf_file, output_folder):
    """Convert a text-based PDF to Markdown with MarkItDown.

    The result is written, UTF-8 encoded, to ``output_folder`` under the
    input's base name with its extension replaced by ``.md``.

    Args:
        pdf_file: Path of the PDF to convert.
        output_folder: Existing directory that receives the ``.md`` file.

    Returns:
        ``True`` when the Markdown file was written, ``False`` when any step
        raised (the error is printed rather than propagated).
    """
    try:
        result = MarkItDown().convert(pdf_file)

        # splitext swaps only the final extension and ignores its case, so
        # "REPORT.PDF" becomes "REPORT.md" (str.replace('.pdf', '.md') would
        # leave it untouched) and "a.pdf.b.pdf" becomes "a.pdf.b.md".
        stem = os.path.splitext(os.path.basename(pdf_file))[0]
        output_path = os.path.join(output_folder, stem + '.md')

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(result.text_content)

        print(f"Converted {pdf_file} -> {output_path}")
        return True
    except Exception as e:
        print(f"Error converting {pdf_file}: {e}")
        return False

def convert_image_pdf(pdf_file, output_folder, *, lang='vie', dpi=300):
    """Convert an image-based (scanned) PDF to Markdown via OCR.

    Every page is rendered with pdf2image (using the module-level
    ``poppler_path``), converted to grayscale and passed to Tesseract. The
    Markdown output starts with a ``# <pdf file name>`` heading followed by
    one ``## Trang <n>`` section for each page that produced non-blank text.

    Args:
        pdf_file: Path of the PDF to convert.
        output_folder: Existing directory that receives the ``.md`` file.
        lang: Tesseract language code used for OCR. Defaults to ``'vie'``
            (Vietnamese).
        dpi: Resolution at which pages are rendered before OCR.

    Returns:
        ``True`` when the Markdown file was written, ``False`` when any step
        raised (the error is printed rather than propagated).
    """
    try:
        pages = pdf2image.convert_from_path(
            pdf_file, dpi=dpi, poppler_path=poppler_path
        )

        pdf_name = os.path.basename(pdf_file)
        # Collect the pieces and join once instead of growing a string.
        sections = [f"# {pdf_name}\n\n"]

        for page_number, page in enumerate(pages, start=1):
            text = pytesseract.image_to_string(
                page.convert('L'),  # grayscale
                lang=lang,
                # --oem 3: default OCR engine; --psm 6: treat the page as a
                # single uniform block of text.
                config='--oem 3 --psm 6'
            )

            # Pages whose OCR result is blank are left out entirely.
            if text.strip():
                # "Trang" is Vietnamese for "page".
                sections.append(f'## Trang {page_number}\n\n{text}\n\n')

        # splitext swaps only the final extension and ignores its case, so
        # "SCAN.PDF" becomes "SCAN.md" instead of keeping its .PDF name.
        stem = os.path.splitext(pdf_name)[0]
        output_path = os.path.join(output_folder, stem + '.md')

        with open(output_path, "w", encoding="utf-8") as f:
            f.write("".join(sections))

        print(f"Converted {pdf_file} -> {output_path}")
        return True
    except Exception as e:
        print(f"Error converting {pdf_file}: {e}")
        return False

def smart_convert_pdf(pdf_file, output_folder=r"D:\PythonProject\ToMD\Output"):
    """Convert one PDF to Markdown, choosing the strategy from its content.

    Creates ``output_folder`` if it is missing, classifies the PDF with
    ``detect_type`` and hands it to ``convert_text_pdf`` for ``"text"`` or to
    ``convert_image_pdf`` for anything else.

    Args:
        pdf_file: Path of the PDF to convert.
        output_folder: Directory that receives the generated ``.md`` file.

    Returns:
        The boolean success flag returned by the chosen converter.
    """
    os.makedirs(output_folder, exist_ok=True)

    kind = detect_type(pdf_file)
    print(f"Detected PDF type: {kind}")

    # Anything that is not positively identified as text goes through OCR.
    converter = convert_text_pdf if kind == "text" else convert_image_pdf
    return converter(pdf_file, output_folder)

def main(data_folder=r'D:\PythonProject\ToMD\Data'):
    """Convert every PDF found directly inside ``data_folder`` to Markdown.

    Files are selected by a case-insensitive ``.pdf`` suffix and processed in
    sorted order so that runs are reproducible. Each file is passed to
    ``smart_convert_pdf`` and a success/failure line is printed for it.

    Args:
        data_folder: Directory to scan for PDF files. Defaults to the
            original hard-coded project data folder.

    Returns:
        None. A message is printed and the function returns early when
        ``data_folder`` is not an existing directory or holds no PDF files.
    """
    # isdir rather than exists: a regular file at this path would pass an
    # exists() check and then make os.listdir raise NotADirectoryError.
    if not os.path.isdir(data_folder):
        print(f"Data folder not found: {data_folder}")
        return

    # os.listdir returns entries in arbitrary order; sort for stable output.
    pdf_files = sorted(
        f for f in os.listdir(data_folder) if f.lower().endswith('.pdf')
    )

    if not pdf_files:
        print("No PDF files found in the data folder")
        return

    print(f"Found {len(pdf_files)} PDF files")

    for pdf_file in pdf_files:
        full_path = os.path.join(data_folder, pdf_file)
        print(f"\nProcessing: {pdf_file}")

        if smart_convert_pdf(full_path):
            print("Conversion successful")
        else:
            print("Conversion failed")

        print("-" * 50)


if __name__ == "__main__":
    main()
60 changes: 60 additions & 0 deletions PDF to Markdown/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# PDF to Markdown Converter

A Python script that intelligently converts PDF files to Markdown format. The script automatically detects whether a PDF is text-based or image-based and applies the appropriate conversion method.

## Features

- **Smart Detection**: Automatically identifies if a PDF contains text or images
- **Text PDF Conversion**: Extracts text directly from text-based PDFs
- **Image PDF Conversion**: Uses OCR (Optical Character Recognition) to extract text from image-based PDFs
- **Batch Processing**: Converts multiple PDF files at once
- **Vietnamese Language Support**: OCR runs with Tesseract's Vietnamese language model (`vie`), so the Vietnamese language data must be installed alongside Tesseract

## Requirements

- Python 3.x
- pytesseract
- Pillow (PIL)
- markitdown
- pdf2image
- Tesseract OCR (installed separately)
- Poppler (installed separately)

## Installation

1. Install Python dependencies:
```bash
pip install pytesseract Pillow markitdown pdf2image
```

2. Install Tesseract OCR from [https://github.com/tesseract-ocr/tesseract](https://github.com/tesseract-ocr/tesseract)

3. Install Poppler from [https://github.com/oschwartz10612/poppler-windows/releases](https://github.com/oschwartz10612/poppler-windows/releases)

4. Update the paths in the script:
- `tesseract_cmd`: Path to your Tesseract executable
- `poppler_path`: Path to your Poppler bin folder
- `data_folder`: Folder containing your PDF files
- `output_folder`: Folder where Markdown files will be saved

## Usage

1. Place your PDF files in the data folder
2. Run the script:
```bash
python PDFtoMD.py
```

3. Find converted Markdown files in the output folder

## How It Works

1. The script scans the data folder for PDF files
2. For each PDF, it detects whether it's text-based or image-based: a PDF that yields fewer than 50 characters of extractable text is treated as image-based
3. Text PDFs are converted directly using MarkItDown
4. Image PDFs are converted using OCR with pytesseract
5. Output is saved as `.md` files with the same name as the input PDF

## License

This project is open source and available under the MIT License.