diff --git a/plugins/model/pdf-mineru/.dockerignore b/plugins/model/pdf-mineru/.dockerignore new file mode 100644 index 000000000000..6afc89bdc0d5 --- /dev/null +++ b/plugins/model/pdf-mineru/.dockerignore @@ -0,0 +1,21 @@ +__pycache__ +.pyc +.pyo +.pyd +.Python +env +venv +.venv +pip-log.txt +pip-delete-this-directory.txt +.tox +.coverage +.coverage. +.cache +nosetests.xml +coverage.xml +.cover +.log +.git +.mypy_cache +.pytest_cache \ No newline at end of file diff --git a/plugins/model/pdf-mineru/.env b/plugins/model/pdf-mineru/.env new file mode 100644 index 000000000000..565fb8be2889 --- /dev/null +++ b/plugins/model/pdf-mineru/.env @@ -0,0 +1 @@ +MINERU_TOKEN=官网申请的API 密钥 \ No newline at end of file diff --git a/plugins/model/pdf-mineru/Dockerfile b/plugins/model/pdf-mineru/Dockerfile new file mode 100644 index 000000000000..1b0bff5b2d69 --- /dev/null +++ b/plugins/model/pdf-mineru/Dockerfile @@ -0,0 +1,21 @@ +# ---- 基础镜像 ---- +FROM python:3.12-slim + +# ---- 工作目录 ---- +WORKDIR /app + +# ---- 复制代码 ---- +COPY mineru_saas_api.py . +COPY requirements.txt . + +# ---- 安装依赖 ---- +RUN pip install --no-cache-dir -r requirements.txt + +# ---- 环境变量(运行时注入)---- +ENV MINERU_TOKEN="YOUR_TOKEN_WILL_BE_INJECTED" + +# ---- 暴露端口 ---- +EXPOSE 1234 + +# ---- 启动命令 ---- +CMD ["uvicorn", "mineru_saas_api:app", "--host", "0.0.0.0", "--port", "1234"] diff --git a/plugins/model/pdf-mineru/README.md b/plugins/model/pdf-mineru/README.md index ea5ed334e0e6..4e62b764956e 100644 --- a/plugins/model/pdf-mineru/README.md +++ b/plugins/model/pdf-mineru/README.md @@ -1,85 +1,194 @@ -# Readme +# **MinerU SaaS Wrapper For Fastgpt 详细部署文档** +**—— 为 FastGPT 提供稳定、高效、开箱即用的纯白嫖文档解析服务,转接服务用grok写的,文档也是,有不明白出问题了,`docker logs -f mineru-saas-wrapper` 查看日志,问他~** -# 项目介绍 --- -本项目参照官方插件**pdf-marker,**基于MinertU实现了一个高效的 **PDF 转 Markdown 接口服务**,通过高性能的接口设计,快速将 PDF 文档转换为 Markdown 格式文本。 -- **简洁性:**项目无需修改代码,仅需调整文件路径即可使用,简单易用 -- **易用性:**通过提供简洁的 API,开发者只需发送 HTTP 请求即可完成 PDF 转换 -- **灵活性:**支持本地部署,便于快速上手和灵活集成 +> **适用人群**:FastGPT 开发者、后端工程师、DevOps、AI 应用集成者 +> **目标**:在 **5 分钟内**完成从零到生产可用的 MinerU saas服务api的文档解析服务部署 -# 配置推荐 +--- + +## 一、项目概述 -配置及速率请参照[MinerU项目](https://github.com/opendatalab/MinerU/blob/master/README_zh-CN.md)官方介绍。 +| 项目 | 说明 | +|------|------| +| **名称** | MinerU SaaS Wrapper for FastGPT | +| **框架** | FastAPI + Uvicorn | +| **核心功能** | 接收文件 → 调用 MinerU 官方 SaaS API → 轮询结果 → 返回内嵌图片的 Markdown → fasgpt读取解析内容转为知识库 | +| **部署方式** | Docker(推荐) / docker-compose | +| **接口路径** | `POST /v2/parse/file` | -# 本地开发 +--- -## 基本流程 +## 二、前置条件 +| **MinerU Token** | 在 [https://mineru.net](https://mineru.net) 注册并获取 SaaS Token | -1、安装基本环境,主要参照官方文档[使用CPU及GPU](https://github.com/opendatalab/MinerU/blob/master/README_zh-CN.md#%E4%BD%BF%E7%94%A8GPU)运行MinerU的方式进行。具体如下,首先使用anaconda安装基础运行环境 +> **获取 Token 步骤**: +> 1. 登录 MinerU 官网 +> 2. 进入 **控制台 → API 密钥** +> 3. 创建新密钥(建议命名 `fastgpt-wrapper`) +> 4. 
复制完整 Token(以 `eyJ...` 开头)

---

## 三、目录结构说明

```bash
mineru-saas-wrapper/
├── .dockerignore
├── Dockerfile
├── docker-compose.yml
├── mineru_saas_api.py     # 主服务逻辑
├── requirements.txt       # 依赖包
├── .env                   # (可选)环境变量文件
└── README.md
```

---

## 四、部署:使用 `docker-compose`(推荐)

### 步骤 1:创建项目目录

```bash
mkdir mineru-saas-wrapper
cd mineru-saas-wrapper
```

将本插件目录(`plugins/model/pdf-mineru`)下的 `Dockerfile`、`docker-compose.yml`、`mineru_saas_api.py`、`requirements.txt` 复制到该目录;也可以直接克隆 FastGPT 仓库后,在该插件目录内操作。

### 步骤 2:创建 `.env` 文件(推荐,防止 Token 泄露)

```bash
touch .env
```

编辑 `.env`:

```env
MINERU_TOKEN=官网申请的 API 密钥
POLL_INTERVAL=3
POLL_TIMEOUT=600
PORT=1234
```

### 步骤 3:修改 `docker-compose.yml`

```yaml
services:
  mineru-saas-wrapper:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: mineru-saas-wrapper
    restart: unless-stopped
    ports:
      - "1234:1234"
    env_file:
      - .env   # 改为读取 .env 文件
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:1234/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"
```

### 步骤 4:启动服务

```bash
docker-compose up -d --build
# 新版 Docker 也可使用:docker compose up -d --build
```

### 步骤 5:验证服务状态

```bash
# 查看容器状态
docker ps | grep mineru-saas-wrapper

# 查看健康检查
curl http://localhost:1234/health
# 预期输出:{"status":"healthy"}
```

---

## 五、接口测试

### 1. 使用 `curl` 测试

```bash
curl -X POST "http://localhost:1234/v2/parse/file" \
  -F "file=@./sample.pdf" | jq
```

### 2. 预期成功响应

```json
{
  "success": true,
  "message": "",
  "markdown": "# 标题\n\n![](data:image/png;base64,iVBORw0KGgoAAA...) ...",
  "pages": 8
}
```
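### 3. 使用 Python 调用(可选)

下面是一个最小的 Python 调用示例(基于服务自身也在用的 httpx 库),仅作参考示意:其中的 `sample.pdf`、服务地址和超时时间均为假设值,请按实际环境替换。

```python
# 最小调用示例:向 wrapper 上传一个 PDF,打印返回的页数与 Markdown 长度
import httpx

def parse_pdf(path: str, base_url: str = "http://localhost:1234") -> dict:
    with open(path, "rb") as f:
        files = {"file": (path, f, "application/pdf")}
        # 服务端会轮询 MinerU,耗时可能较长,这里给一个略大于 POLL_TIMEOUT 的超时
        resp = httpx.post(f"{base_url}/v2/parse/file", files=files, timeout=660)
    resp.raise_for_status()
    return resp.json()

if __name__ == "__main__":
    result = parse_pdf("sample.pdf")
    print(result["pages"], "pages,", len(result["markdown"]), "chars of markdown")
```
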
### 查看详细日志

```bash
docker logs -f mineru-saas-wrapper
```

关键日志关键词:

- `Got upload url` → 上传成功
- `Polling ... -> done` → 解析完成
- `Parse finished, X pages` → 成功返回

---

## 六、FastGPT 集成指南

### 1. 在 FastGPT 中配置「文档解析」节点

| 字段 | 值 |
|------|-----|
| **解析服务地址** | `http://your-server-ip:1234/v2/parse/file` |
| **请求方式** | POST |
| **文件字段名** | `file` |
| **响应字段映射** | `markdown` → 内容,`pages` → 页数 |

### 2. FastGPT 示例配置(JSON)

```json
// 已使用 json5 进行解析,会自动去掉注释,无需手动去除
{
  "feConfigs": {
    "lafEnv": "https://laf.dev", // laf 环境。https://laf.run(杭州阿里云),或者私有化的 laf 环境。如果使用 Laf openapi 功能,需要最新版的 laf。
    "mcpServerProxyEndpoint": "" // mcp server 代理地址,例如:http://localhost:3005
  },
  "systemEnv": {
    "datasetParseMaxProcess": 10, // 知识库文件解析最大线程数量
    "vectorMaxProcess": 10, // 向量处理线程数量
    "qaMaxProcess": 10, // 问答拆分线程数量
    "vlmMaxProcess": 10, // 图片理解模型最大处理进程
    "tokenWorkers": 30, // Token 计算线程保持数,会持续占用内存,不能设置太大。
    "hnswEfSearch": 100, // 向量搜索参数,仅对 PG 和 OB 生效。越大,搜索越精确,但是速度越慢。设置为 100,有 99%+ 精度。
    "hnswMaxScanTuples": 100000, // 向量搜索最大扫描数据量,仅对 PG 生效。
    "customPdfParse": {
      "url": "http://your-server-ip:1234/v2/parse/file", // 自定义 PDF 解析服务地址
      "key": "", // 自定义 PDF 解析服务密钥
      "doc2xKey": "", // doc2x 服务密钥
      "price": 0 // PDF 解析服务价格
    }
  }
}
```

---

**部署完成!**
现在你的 FastGPT 已接入 **MinerU 文档解析能力**,支持 PDF 与图片解析,输出内嵌图片的 Markdown。

> 如有问题,欢迎提交 Issue 或查看日志排查。祝你解析愉快!
\ No newline at end of file
diff --git a/plugins/model/pdf-mineru/docker-compose.yml b/plugins/model/pdf-mineru/docker-compose.yml
new file mode 100644
index 000000000000..cd5e47762072
--- /dev/null
+++ b/plugins/model/pdf-mineru/docker-compose.yml
@@ -0,0 +1,32 @@
+services:
+  mineru-saas-wrapper:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    container_name: mineru-saas-wrapper
+    restart: unless-stopped
+    ports:
+      - "1234:1234"
+    environment:
+      # 你的 MinerU SaaS API Token(必须),请勿将真实 Token 提交到仓库
+      - MINERU_TOKEN=请替换为你在 mineru.net 申请的 Token
+
+      # 可选:自定义轮询间隔(秒)
+      - POLL_INTERVAL=3
+
+      # 可选:最大等待时间(秒)
+      - POLL_TIMEOUT=600
+
+      # 可选:自定义 MinerU API 地址(默认为官方地址)
+      # - MINERU_BASE=https://mineru.net
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:1234/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 10s
+    logging:
+      driver: "json-file"
+      options:
+        max-size: "10m"
+        max-file: "3"
diff --git a/plugins/model/pdf-mineru/main.py b/plugins/model/pdf-mineru/main.py
deleted file mode 100644
index 27dfcc206020..000000000000
--- a/plugins/model/pdf-mineru/main.py
+++ /dev/null
@@ -1,282 +0,0 @@
-import json
-import os
-from base64 import b64encode
-from glob import glob
-from io import StringIO
-from typing import Tuple, Union
-
-import uvicorn
-from fastapi import FastAPI, UploadFile, File
-from fastapi.responses import JSONResponse
-from loguru import logger
-from tempfile import TemporaryDirectory
-from pathlib import Path
-import fitz  # PyMuPDF
-import asyncio
-from concurrent.futures import ProcessPoolExecutor
-import torch
-import multiprocessing as mp
-from contextlib import asynccontextmanager
-import time
-
-import magic_pdf.model as model_config
-from magic_pdf.config.enums import SupportedPdfParseMethod
-from magic_pdf.data.data_reader_writer import DataWriter, FileBasedDataWriter
-from magic_pdf.data.dataset import PymuDocDataset
-from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-from magic_pdf.operators.models import InferenceResult
-from magic_pdf.operators.pipes import PipeResult
-
-model_config.__use_inside_model__ = True
-
-app = FastAPI()
-
-process_variables = {}
-my_pool = None
-
-class MemoryDataWriter(DataWriter):
-    def 
__init__(self): - self.buffer = StringIO() - - def write(self, path: str, data: bytes) -> None: - if isinstance(data, str): - self.buffer.write(data) - else: - self.buffer.write(data.decode("utf-8")) - - def write_string(self, path: str, data: str) -> None: - self.buffer.write(data) - - def get_value(self) -> str: - return self.buffer.getvalue() # 修复:使用 getvalue() 而不是 get_value() - - def close(self): - self.buffer.close() - -def worker_init(counter, lock): - num_gpus = torch.cuda.device_count() - processes_per_gpu = int(os.environ.get('PROCESSES_PER_GPU', 1)) - with lock: - worker_id = counter.value - counter.value += 1 - if num_gpus == 0: - device = 'cpu' - else: - device_id = worker_id // processes_per_gpu - if device_id >= num_gpus: - raise ValueError(f"Worker ID {worker_id} exceeds available GPUs ({num_gpus}).") - device = f'cuda:{device_id}' - config = { - "parse_method": "auto", - "ADDITIONAL_KEY": "VALUE" - } - converter = init_converter(config, device_id) - pid = os.getpid() - process_variables[pid] = converter - print(f"Worker {worker_id}: Models loaded successfully on {device}!") - -def init_converter(config, device_id): - os.environ["CUDA_VISIBLE_DEVICES"] = str(device_id) - return config - -def img_to_base64(img_path: str) -> str: - with open(img_path, "rb") as img_file: - return b64encode(img_file.read()).decode('utf-8') - -def embed_images_as_base64(md_content: str, image_dir: str) -> str: - lines = md_content.split('\n') - new_lines = [] - for line in lines: - if line.startswith("![") and "](" in line and ")" in line: - start_idx = line.index("](") + 2 - end_idx = line.index(")", start_idx) - img_rel_path = line[start_idx:end_idx] - img_name = os.path.basename(img_rel_path) - img_path = os.path.join(image_dir, img_name) - logger.info(f"Checking image: {img_path}") - if os.path.exists(img_path): - img_base64 = img_to_base64(img_path) - new_line = f"![](data:image/png;base64,{img_base64})" - new_lines.append(new_line) - else: - logger.warning(f"Image not found: {img_path}") - new_lines.append(line) - else: - new_lines.append(line) - return '\n'.join(new_lines) - -def process_pdf(pdf_path, output_dir): - try: - pid = os.getpid() - config = process_variables.get(pid, "No variable") - parse_method = config["parse_method"] - - with open(str(pdf_path), "rb") as f: - pdf_bytes = f.read() - - output_path = Path(output_dir) / f"{Path(pdf_path).stem}_output" - os.makedirs(str(output_path), exist_ok=True) - image_dir = os.path.join(str(output_path), "images") - os.makedirs(image_dir, exist_ok=True) - image_writer = FileBasedDataWriter(str(output_path)) - - # 处理 PDF - infer_result, pipe_result = process_pdf_content(pdf_bytes, parse_method, image_writer) - - md_content_writer = MemoryDataWriter() - pipe_result.dump_md(md_content_writer, "", "images") - md_content = md_content_writer.get_value() - md_content_writer.close() - - # 获取保存的图片路径 - image_paths = glob(os.path.join(image_dir, "*.jpg")) - logger.info(f"Saved images by magic_pdf: {image_paths}") - - # 如果 magic_pdf 未保存足够图片,使用 fitz 提取 - if not image_paths or len(image_paths) < 3: # 假设至少 3 张图片 - logger.warning("Insufficient images saved by magic_pdf, falling back to fitz extraction") - image_map = {} - original_names = [] - # 收集 Markdown 中的所有图片文件名 - for line in md_content.split('\n'): - if line.startswith("![") and "](" in line and ")" in line: - start_idx = line.index("](") + 2 - end_idx = line.index(")", start_idx) - img_rel_path = line[start_idx:end_idx] - original_names.append(os.path.basename(img_rel_path)) - - # 提取图片并映射 - with 
fitz.open(pdf_path) as doc: - img_counter = 0 - for page_num, page in enumerate(doc): - for img_index, img in enumerate(page.get_images(full=True)): - xref = img[0] - base = doc.extract_image(xref) - if img_counter < len(original_names): - img_name = original_names[img_counter] # 使用 Markdown 中的原始文件名 - else: - img_name = f"page_{page_num}_img_{img_index}.jpg" - img_path = os.path.join(image_dir, img_name) - with open(img_path, "wb") as f: - f.write(base["image"]) - if img_counter < len(original_names): - image_map[original_names[img_counter]] = img_name - img_counter += 1 - - image_paths = glob(os.path.join(image_dir, "*.jpg")) - logger.info(f"Images extracted by fitz: {image_paths}") - - # 更新 Markdown(仅在必要时替换) - for original_name, new_name in image_map.items(): - if original_name != new_name: - md_content = md_content.replace(f"images/{original_name}", f"images/{new_name}") - - return { - "status": "success", - "text": md_content, - "output_path": str(output_path), - "images": image_paths - } - except Exception as e: - logger.error(f"Error processing PDF: {str(e)}") - return { - "status": "error", - "message": str(e), - "file": str(pdf_path) - } - -def process_pdf_content(pdf_bytes, parse_method, image_writer): - ds = PymuDocDataset(pdf_bytes) - infer_result: InferenceResult = None - pipe_result: PipeResult = None - - if parse_method == "ocr": - infer_result = ds.apply(doc_analyze, ocr=True) - pipe_result = infer_result.pipe_ocr_mode(image_writer) - elif parse_method == "txt": - infer_result = ds.apply(doc_analyze, ocr=False) - pipe_result = infer_result.pipe_txt_mode(image_writer) - else: # auto - if ds.classify() == SupportedPdfParseMethod.OCR: - infer_result = ds.apply(doc_analyze, ocr=True) - pipe_result = infer_result.pipe_ocr_mode(image_writer) - else: - infer_result = ds.apply(doc_analyze, ocr=False) - pipe_result = infer_result.pipe_txt_mode(image_writer) - - return infer_result, pipe_result - -@asynccontextmanager -async def lifespan(app: FastAPI): - try: - mp.set_start_method('spawn') - except RuntimeError: - raise RuntimeError("Set start method to spawn twice. This may be a temporary issue with the script. 
Please try running it again.") - global my_pool - manager = mp.Manager() - worker_counter = manager.Value('i', 0) - worker_lock = manager.Lock() - gpu_count = torch.cuda.device_count() - my_pool = ProcessPoolExecutor(max_workers=gpu_count * int(os.environ.get('PROCESSES_PER_GPU', 1)), - initializer=worker_init, initargs=(worker_counter, worker_lock)) - yield - if my_pool: - my_pool.shutdown(wait=True) - print("Application shutdown, cleaning up...") - -app.router.lifespan_context = lifespan - -@app.post("/v2/parse/file") -async def process_pdfs(file: UploadFile = File(...)): - s_time = time.time() - with TemporaryDirectory() as temp_dir: - temp_path = Path(temp_dir) / file.filename - with open(str(temp_path), "wb") as buffer: - buffer.write(await file.read()) - - # 验证 PDF 文件 - try: - with fitz.open(str(temp_path)) as pdf_document: - total_pages = pdf_document.page_count - except fitz.fitz.FileDataError: - return JSONResponse(content={"success": False, "message": "", "error": "Invalid PDF file"}, status_code=400) - except Exception as e: - logger.error(f"Error opening PDF: {str(e)}") - return JSONResponse(content={"success": False, "message": "", "error": f"Internal server error: {str(e)}"}, status_code=500) - - try: - loop = asyncio.get_running_loop() - results = await loop.run_in_executor( - my_pool, - process_pdf, - str(temp_path), - str(temp_dir) - ) - - if results.get("status") == "error": - return JSONResponse(content={ - "success": False, - "message": "", - "error": results.get("message") - }, status_code=500) - - # 嵌入 Base64 - image_dir = os.path.join(results.get("output_path"), "images") - md_content_with_base64 = embed_images_as_base64(results.get("text"), image_dir) - - return { - "success": True, - "message": "", - "markdown": md_content_with_base64, - "pages": total_pages - } - except Exception as e: - logger.error(f"Error in process_pdfs: {str(e)}") - return JSONResponse(content={ - "success": False, - "message": "", - "error": f"Internal server error: {str(e)}" - }, status_code=500) - -if __name__ == "__main__": - uvicorn.run(app, host="0.0.0.0", port=7231) diff --git a/plugins/model/pdf-mineru/mineru_saas_api.py b/plugins/model/pdf-mineru/mineru_saas_api.py new file mode 100644 index 000000000000..2d46f2114a7b --- /dev/null +++ b/plugins/model/pdf-mineru/mineru_saas_api.py @@ -0,0 +1,203 @@ +# -*- coding: utf-8 -*- +import os +import io +import time +import zipfile +import base64 +import tempfile +from pathlib import Path +from typing import List + +import httpx +import uvicorn +from fastapi import FastAPI, File, UploadFile, HTTPException +from fastapi.responses import JSONResponse +from loguru import logger + +# -------------------------------------------------------------- +# 配置(全部走环境变量,Docker 里通过 -e 注入) +# -------------------------------------------------------------- +MINERU_TOKEN = os.getenv("MINERU_TOKEN") # 必须 +MINERU_BASE = os.getenv("MINERU_BASE", "https://mineru.net") +POLL_INTERVAL = int(os.getenv("POLL_INTERVAL", "3")) # 秒 +POLL_TIMEOUT = int(os.getenv("POLL_TIMEOUT", "600")) # 秒 +# -------------------------------------------------------------- + +app = FastAPI(title="MinerU SaaS Wrapper", version="1.0.0") + +# ---------- 工具 ---------- +def img_to_base64(img_bytes: bytes) -> str: + return base64.b64encode(img_bytes).decode("utf-8") + +def embed_images(md: str, img_dir: Path) -> str: + """把 markdown 中 ![xxx](relative_path) 替换为 data-uri""" + lines = md.splitlines() + out: List[str] = [] + for line in lines: + if line.startswith("![") and "](" in line and ")" in 
line:
+            start = line.index("](") + 2
+            end = line.index(")", start)
+            rel = line[start:end]
+            img_path = img_dir / rel
+            if img_path.is_file():
+                b64 = img_to_base64(img_path.read_bytes())
+                # 根据扩展名推断 MIME 类型,避免把 jpg 一律标成 png
+                mime = "image/jpeg" if img_path.suffix.lower() in (".jpg", ".jpeg") else "image/png"
+                new_line = f"![](data:{mime};base64,{b64})"
+                out.append(new_line)
+                continue
+        out.append(line)
+    return "\n".join(out)
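+
+# 说明(示意):embed_images 会把形如
+#   ![](images/xxx.jpg)
+# 的整行替换为内联图片(该行图片之外的文字会被丢弃),例如:
+#   ![](data:image/jpeg;base64,/9j/4AAQSk...)
+# 图片相对路径以 Markdown 文件所在目录(img_dir)为基准解析。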
+
+# ---------- SaaS 调用 ----------
+import asyncio
+
+# 说明:MinerU SaaS 的 URL 模式需要公网可访问的文件地址,
+# 因此这里统一走"批量文件上传"接口,一次只传一个文件。
+async def _upload_and_create(file_bytes: bytes, filename: str) -> str:
+    """
+    1. 调用 /api/v4/file-urls/batch 获取上传 URL(一次一个文件)
+    2. PUT 上传文件
+    3. 系统自动提交解析任务,返回 batch_id
+    4. 轮询 /api/v4/extract-results/batch/{batch_id} 取结果
+    """
+    # 使用 async with,确保任何异常路径下连接都会被关闭
+    async with httpx.AsyncClient(timeout=60.0) as client:
+        # ---- 1. 申请上传 URL ----
+        batch_url = f"{MINERU_BASE}/api/v4/file-urls/batch"
+        headers = {"Authorization": f"Bearer {MINERU_TOKEN}", "Content-Type": "application/json"}
+        batch_payload = {
+            "files": [{"name": filename}],
+            "model_version": "vlm"
+        }
+        r = await client.post(batch_url, headers=headers, json=batch_payload)
+        r.raise_for_status()
+        batch_resp = r.json()
+        if batch_resp.get("code") != 0:
+            raise HTTPException(status_code=500, detail=f"MinerU batch create fail: {batch_resp.get('msg')}")
+        batch_id = batch_resp["data"]["batch_id"]
+        upload_url = batch_resp["data"]["file_urls"][0]
+        logger.info(f"Got upload url for {filename}, batch_id={batch_id}")
+
+        # ---- 2. 上传文件 ----
+        put_r = await client.put(upload_url, content=file_bytes)
+        put_r.raise_for_status()
+        logger.info(f"File uploaded, status={put_r.status_code}")
+
+        # ---- 3. 轮询结果 ----
+        result_url = f"{MINERU_BASE}/api/v4/extract-results/batch/{batch_id}"
+        start = time.time()
+        while True:
+            if time.time() - start > POLL_TIMEOUT:
+                raise HTTPException(status_code=504, detail="MinerU SaaS timeout")
+            poll = await client.get(result_url, headers=headers)
+            poll.raise_for_status()
+            data = poll.json()
+            if data.get("code") != 0:
+                raise HTTPException(status_code=500, detail=data.get("msg"))
+
+            # 一次只提交了一个文件,取第一个结果即可
+            task = data["data"]["extract_result"][0]
+            state = task["state"]
+            logger.debug(f"Polling {batch_id} -> {state}")
+
+            if state == "done":
+                return task["full_zip_url"]
+            if state == "failed":
+                raise HTTPException(status_code=500, detail=task.get("err_msg", "MinerU parse failed"))
+            # pending / running / converting / waiting-file 等状态继续等待
+            await asyncio.sleep(POLL_INTERVAL)
+
+# ---------- 主入口 ----------
+@app.post("/v2/parse/file")
+async def parse_file(file: UploadFile = File(...)):
+    """
+    FastGPT 调用的统一入口
+    """
+    if not MINERU_TOKEN:
+        raise HTTPException(status_code=500, detail="MINERU_TOKEN not set")
+
+    allowed = {".pdf", ".png", ".jpeg", ".jpg"}
+    ext = Path(file.filename).suffix.lower()
+    if ext not in allowed:
+        raise HTTPException(status_code=400,
+                            detail=f"Unsupported file type {ext}. Allowed: {allowed}")
+
+    file_bytes = await file.read()
+    if not file_bytes:
+        raise HTTPException(status_code=400, detail="Empty file")
+
+    filename = Path(file.filename).name
+    start = time.time()
+
+    try:
+        # 1. 上传 + 提交任务 → 得到 zip_url
+        zip_url = await _upload_and_create(file_bytes, filename)
+
+        # 2. 下载结果 zip(结果包可能较大,显式给出较长超时)
+        async with httpx.AsyncClient(timeout=120.0) as client:
+            resp = await client.get(zip_url)
+            resp.raise_for_status()
+            zip_bytes = resp.content
+
+        # 3. 解压到临时目录
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            with zipfile.ZipFile(io.BytesIO(zip_bytes)) as z:
+                z.extractall(tmp_path)
+
+            # 4. 找 markdown(默认是和文件名同名的 .md)
+            md_files = list(tmp_path.rglob("*.md"))
+            if not md_files:
+                raise HTTPException(status_code=500, detail="No markdown in result zip")
+            md_path = md_files[0]
+            markdown = md_path.read_text(encoding="utf-8")
+
+            # 5. 嵌入图片(图片在同一级目录或子目录)
+            img_dir = md_path.parent
+            markdown_with_images = embed_images(markdown, img_dir)
+
+            # 6. 计算页数(zip 中通常有 page_*.png;没有时返回 0)
+            page_imgs = list(tmp_path.rglob("page_*.png")) + list(tmp_path.rglob("page_*.jpg"))
+            pages = len(page_imgs)
+
+            logger.info(f"Parse finished, {pages} pages, {time.time()-start:.1f}s")
+            return JSONResponse({
+                "success": True,
+                "message": "",
+                "markdown": markdown_with_images,
+                "pages": pages
+            })
+
+    except HTTPException:
+        # 保留主动抛出的状态码(400/500/504),不再统一包装成 500
+        raise
+    except Exception as e:
+        logger.exception(f"Parse error for {filename}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+# ---------- 健康检查 ----------
+@app.get("/health")
+async def health():
+    return {"status": "healthy"}
+
+# --------------------------------------------------------------
+if __name__ == "__main__":
+    port = int(os.getenv("PORT", "1234"))
+    host = os.getenv("HOST", "0.0.0.0")
+    logger.info(f"Starting MinerU SaaS wrapper on {host}:{port}")
+    uvicorn.run("mineru_saas_api:app", host=host, port=port, reload=False)
diff --git a/plugins/model/pdf-mineru/requirements.txt b/plugins/model/pdf-mineru/requirements.txt
new file mode 100644
index 000000000000..efe99f3d5abf
--- /dev/null
+++ b/plugins/model/pdf-mineru/requirements.txt
@@ -0,0 +1,5 @@
+fastapi>=0.104.0
+uvicorn[standard]>=0.24.0
+httpx>=0.27.0
+loguru>=0.7.2
+python-multipart>=0.0.6