# Copyright (c) Opendatalab. All rights reserved.
import base64
import html
import os
import re
import sys
import time
import zipfile
from pathlib import Path

import click
import gradio as gr
from gradio_pdf import PDF
from loguru import logger

log_level = os.getenv("MINERU_LOG_LEVEL", "INFO").upper()
logger.remove()  # drop the default handler
logger.add(sys.stderr, level=log_level)  # install a handler at the configured level

from mineru.cli.common import prepare_env, read_fn, aio_do_parse, pdf_suffixes, image_suffixes
from mineru.utils.cli_parser import arg_parse
from mineru.utils.engine_utils import get_vlm_engine
from mineru.utils.hash_utils import str_sha256
""" # 使用 iframe 封装,彻底解决渲染失效问题 iframe_srcdoc = full_html.replace('"', '"') iframe_code = f""" """ return iframe_code # ────────────── 新增:根据上一级标题自动补全下一级标题 ────────────── def auto_promote_paragraphs_to_subheading(text): lines = text.splitlines() result = [] in_section = False empty_count = 0 for line in lines: stripped = line.strip() if stripped.startswith('# '): result.append(line) in_section = True empty_count = 0 continue if stripped.startswith('#'): result.append(line) in_section = False empty_count = 0 continue if not stripped: result.append(line) empty_count += 1 if empty_count >= 2: in_section = False continue # 跳过图片、列表、代码等特殊行 if ( stripped.startswith('![') or stripped.startswith('>') or stripped.startswith('```') or re.match(r'^[-*+] ', stripped) or re.match(r'^\d+\. ', stripped) ): result.append(line) empty_count = 0 continue empty_count = 0 if in_section: result.append('## ' + stripped) else: result.append(line) return '\n'.join(result) async def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, formula_enable, table_enable, language, backend, url): os.makedirs(output_dir, exist_ok=True) try: file_name = f'{safe_stem(Path(doc_path).stem)}_{time.strftime("%y%m%d_%H%M%S")}' pdf_data = read_fn(doc_path) if backend.startswith("vlm"): parse_method = "vlm" else: parse_method = 'ocr' if is_ocr else 'auto' if backend.startswith("hybrid"): env_name = f"hybrid_{parse_method}" else: env_name = parse_method local_image_dir, local_md_dir = prepare_env(output_dir, file_name, env_name) await aio_do_parse( output_dir=output_dir, pdf_file_names=[file_name], pdf_bytes_list=[pdf_data], p_lang_list=[language], parse_method=parse_method, end_page_id=end_page_id, formula_enable=formula_enable, table_enable=table_enable, backend=backend, server_url=url, ) return local_md_dir, file_name except Exception as e: logger.exception(e) return None def compress_directory_to_zip(directory_path, output_zip_path): try: with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as 
zipf: for root, dirs, files in os.walk(directory_path): for file in files: file_path = os.path.join(root, file) arcname = os.path.relpath(file_path, directory_path) zipf.write(file_path, arcname) return 0 except Exception as e: logger.exception(e) return -1 def image_to_base64(image_path): with open(image_path, 'rb') as image_file: return base64.b64encode(image_file.read()).decode('utf-8') def replace_image_with_base64(markdown_text, image_dir_path): pattern = r'\!\[(?:[^\]]*)\]\(([^)]+)\)' def replace(match): relative_path = match.group(1) if relative_path.endswith('.jpg'): full_path = os.path.join(image_dir_path, relative_path) base64_image = image_to_base64(full_path) return f'' return match.group(0) return re.sub(pattern, replace, markdown_text) async def to_markdown(file_path, end_pages=10, is_ocr=False, formula_enable=True, table_enable=True, language="ch", backend="pipeline", url=None): # 如果language包含(),则提取括号前的内容作为实际语言 if '(' in language and ')' in language: language = language.split('(')[0].strip() file_path = to_pdf(file_path) # 打印请求参数日志 logger.info(f"parse_pdf 请求参数: file_path={file_path}, output_dir='./output', end_page_id={end_pages - 1}, " f"is_ocr={is_ocr}, formula_enable={formula_enable}, table_enable={table_enable}, " f"language='{language}', backend='{backend}', url={url}") # 获取识别的md文件以及压缩包文件路径 local_md_dir, file_name = await parse_pdf(file_path, './output', end_pages - 1, is_ocr, formula_enable, table_enable, language, backend, url) archive_zip_path = os.path.join('./output', str_sha256(local_md_dir) + '.zip') zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path) if zip_archive_success == 0: logger.info('Compression successful') else: logger.error('Compression failed') md_path = os.path.join(local_md_dir, file_name + '.md') with open(md_path, 'r', encoding='utf-8') as f: txt_content = f.read() # ────────────── 自动补全:根据 # 标题补全后续段落为 ## ────────────── txt_content = auto_promote_paragraphs_to_subheading(txt_content) # 
──────────────────────────────────────────────────────────────── md_content = replace_image_with_base64(txt_content, local_md_dir) # 生成思维导图HTML - 使用新的实现 mind_map_html = md_to_markmap_html(txt_content) # 返回转换后的PDF路径 new_pdf_path = os.path.join(local_md_dir, file_name + '_layout.pdf') return md_content, txt_content, archive_zip_path, new_pdf_path, mind_map_html latex_delimiters_type_all = [ {'left': '$$', 'right': '$$', 'display': True}, {'left': '$', 'right': '$', 'display': False}, {'left': '\\(', 'right': '\\)', 'display': False}, {'left': '\\[', 'right': '\\]', 'display': True}, ] latex_delimiters_type_a = [ {'left': '$$', 'right': '$$', 'display': True}, {'left': '$', 'right': '$', 'display': False}, ] latex_delimiters_type_b = [ {'left': '\\(', 'right': '\\)', 'display': False}, {'left': '\\[', 'right': '\\]', 'display': True}, ] other_lang = ['ch (Chinese, English, Chinese Traditional)', 'en (English)', 'korean', 'japan'] all_lang = [*other_lang] def safe_stem(file_path): stem = Path(file_path).stem return re.sub(r'[^\w.]', '_', stem) def to_pdf(file_path): if file_path is None: return None pdf_bytes = read_fn(file_path) unique_filename = f'{safe_stem(file_path)}.pdf' tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename) with open(tmp_file_path, 'wb') as tmp_pdf_file: tmp_pdf_file.write(pdf_bytes) return tmp_file_path @click.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=True)) @click.pass_context @click.option( '--enable-example', 'example_enable', type=bool, help="Enable example files for input." 
"The example files to be input need to be placed in the `example` folder within the directory where the command is currently executed.", default=True, ) @click.option( '--enable-http-client', 'http_client_enable', type=bool, help="Enable http-client backend to link openai-compatible servers.", default=False, ) @click.option( '--enable-api', 'api_enable', type=bool, help="Enable gradio API for serving the application.", default=True, ) @click.option( '--max-convert-pages', 'max_convert_pages', type=int, help="Set the maximum number of pages to convert from PDF to Markdown.", default=1000, ) @click.option( '--server-name', 'server_name', type=str, help="Set the server name for the Gradio app.", default=None, ) @click.option( '--server-port', 'server_port', type=int, help="Set the server port for the Gradio app.", default=None, ) @click.option( '--latex-delimiters-type', 'latex_delimiters_type', type=click.Choice(['a', 'b', 'all']), help="Set the type of LaTeX delimiters to use in Markdown rendering:" "'a' for type '$', 'b' for type '()[]', 'all' for both types.", default='all', ) def main(ctx, example_enable, http_client_enable, api_enable, max_convert_pages, server_name, server_port, latex_delimiters_type, **kwargs ): # 检测系统语言环境,默认为中文 import locale import os def detect_language(): # 检查环境变量 lang = os.getenv('LANG', '') if 'zh' in lang.lower() or 'chinese' in lang.lower(): return 'zh' # 检查系统默认locale try: default_locale = locale.getdefaultlocale()[0] if default_locale and 'zh' in default_locale.lower(): return 'zh' except: pass # 默认返回中文 return 'zh' detected_lang = detect_language() # 创建 i18n 实例,支持中英文,默认为中文 i18n = gr.I18n( en={ "upload_file": "Please upload a PDF or image", "max_pages": "Max convert pages", "backend": "Backend", "server_url": "Server URL", "server_url_info": "OpenAI-compatible server URL for http-client backend.", "recognition_options": "Recognition Options", "table_enable": "Enable table recognition", "table_info": "If disabled, tables will be shown as 
images.", "formula_label_vlm": "Enable display formula recognition", "formula_label_pipeline": "Enable formula recognition", "formula_label_hybrid": "Enable inline formula recognition", "formula_info_vlm": "If disabled, display formulas will be shown as images.", "formula_info_pipeline": "If disabled, display formulas will be shown as images, and inline formulas will not be detected or parsed.", "formula_info_hybrid": "If disabled, inline formulas will not be detected or parsed.", "ocr_language": "OCR Language", "ocr_language_info": "Select the OCR language for image-based PDFs and images.", "force_ocr": "Force enable OCR", "force_ocr_info": "Enable only if the result is extremely poor. Requires correct OCR language.", "convert": "Convert", "clear": "Clear", "pdf_preview": "PDF preview", "examples": "Examples:", "convert_result": "Convert result", "md_rendering": "Markdown rendering", "md_text": "Markdown text", "mind_map": "Mind Map", # 新增 "backend_info_vlm": "High-precision parsing via VLM, supports Chinese and English documents only.", "backend_info_pipeline": "Traditional Multi-model pipeline parsing, supports multiple languages, hallucination-free.", "backend_info_hybrid": "High-precision hybrid parsing, supports multiple languages.", "backend_info_default": "Select the backend engine for document parsing.", }, zh={ "upload_file": "请上传 PDF 或图片", "max_pages": "最大转换页数", "backend": "解析后端", "server_url": "服务器地址", "server_url_info": "http-client 后端的 OpenAI 兼容服务器地址。", "recognition_options": "识别选项", "table_enable": "启用表格识别", "table_info": "禁用后,表格将显示为图片。", "formula_label_vlm": "启用行间公式识别", "formula_label_pipeline": "启用公式识别", "formula_label_hybrid": "启用行内公式识别", "formula_info_vlm": "禁用后,行间公式将显示为图片。", "formula_info_pipeline": "禁用后,行间公式将显示为图片,行内公式将不会被检测或解析。", "formula_info_hybrid": "禁用后,行内公式将不会被检测或解析。", "ocr_language": "OCR 语言", "ocr_language_info": "为扫描版 PDF 和图片选择 OCR 语言。", "force_ocr": "强制启用 OCR", "force_ocr_info": "仅在识别效果极差时启用,需选择正确的 OCR 语言。", "convert": "转换", "clear": 
"清除", "pdf_preview": "PDF 预览", "examples": "示例:", "convert_result": "转换结果", "md_rendering": "Markdown 渲染", "md_text": "Markdown 文本", "mind_map": "思维导图", # 新增 "backend_info_vlm": "多模态大模型高精度解析,仅支持中英文文档。", "backend_info_pipeline": "传统多模型管道解析,支持多语言,无幻觉。", "backend_info_hybrid": "高精度混合解析,支持多语言。", "backend_info_default": "选择文档解析的后端引擎。", }, ) # 根据后端类型获取公式识别标签(闭包函数以支持 i18n) def get_formula_label(backend_choice): if backend_choice.startswith("vlm"): return i18n("formula_label_vlm") elif backend_choice == "pipeline": return i18n("formula_label_pipeline") elif backend_choice.startswith("hybrid"): return i18n("formula_label_hybrid") else: return i18n("formula_label_pipeline") def get_formula_info(backend_choice): if backend_choice.startswith("vlm"): return i18n("formula_info_vlm") elif backend_choice == "pipeline": return i18n("formula_info_pipeline") elif backend_choice.startswith("hybrid"): return i18n("formula_info_hybrid") else: return "" def get_backend_info(backend_choice): if backend_choice.startswith("vlm"): return i18n("backend_info_vlm") elif backend_choice == "pipeline": return i18n("backend_info_pipeline") elif backend_choice.startswith("hybrid"): return i18n("backend_info_hybrid") else: return i18n("backend_info_default") # 更新界面函数 def update_interface(backend_choice): formula_label_update = gr.update(label=get_formula_label(backend_choice), info=get_formula_info(backend_choice)) backend_info_update = gr.update(info=get_backend_info(backend_choice)) if "http-client" in backend_choice: client_options_update = gr.update(visible=True) else: client_options_update = gr.update(visible=False) if "vlm" in backend_choice: ocr_options_update = gr.update(visible=False) else: ocr_options_update = gr.update(visible=True) return client_options_update, ocr_options_update, formula_label_update, backend_info_update kwargs.update(arg_parse(ctx)) if latex_delimiters_type == 'a': latex_delimiters = latex_delimiters_type_a elif latex_delimiters_type == 'b': latex_delimiters = 
latex_delimiters_type_b elif latex_delimiters_type == 'all': latex_delimiters = latex_delimiters_type_all else: raise ValueError(f"Invalid latex delimiters type: {latex_delimiters_type}.") vlm_engine = get_vlm_engine("auto", is_async=True) if vlm_engine in ["transformers", "mlx-engine"]: http_client_enable = True else: try: logger.info(f"Start init {vlm_engine}...") from mineru.backend.vlm.vlm_analyze import ModelSingleton model_singleton = ModelSingleton() predictor = model_singleton.get_model( vlm_engine, None, None, **kwargs ) logger.info(f"{vlm_engine} init successfully.") except Exception as e: logger.exception(e) suffixes = [f".{suffix}" for suffix in pdf_suffixes + image_suffixes] with gr.Blocks(title="多模态思维导图助手", fill_height=True) as demo: # gr.HTML(header) gr.HTML("