Spaces:
Running
Running
| import os | |
| import uuid | |
| import shutil | |
| import re | |
| import base64 | |
| import gradio as gr | |
| from PIL import Image | |
| from tools.infer_doc import OpenDoc | |
| from tools.utils.logging import get_logger | |
# Shared logger for every message emitted by this Gradio front-end.
logger = get_logger(name='opendoc_gradio')

# Process-wide OpenDoc instance; it stays None until get_pipeline() builds it.
pipeline: OpenDoc | None = None
def get_pipeline(gpu_id: int) -> OpenDoc:
    """Return the shared OpenDoc pipeline, creating it on the first call.

    The instance is stored in the module-level ``pipeline`` variable. Once it
    exists it is returned as-is, so ``gpu_id`` only has an effect on the call
    that actually constructs the pipeline.

    Args:
        gpu_id: GPU device id forwarded to ``OpenDoc(gpuId=...)``. Negative
            values are reported as "CPU" in the log line (the original note
            documents -1 as the CPU setting).

    Returns:
        OpenDoc: the initialized pipeline instance.
    """
    global pipeline

    # Fast path: the pipeline was already built by an earlier call.
    if pipeline is not None:
        return pipeline

    device_label = f'GPU {gpu_id}' if gpu_id >= 0 else 'CPU'
    logger.info(f"Initializing OpenDoc pipeline on {device_label}...")
    pipeline = OpenDoc(gpuId=gpu_id)
    return pipeline
# Build the pipeline eagerly at import time; process_image() reads the
# resulting module-level `current_pipeline`.
# NOTE(review): device id 0 is labelled "GPU 0" by get_pipeline(), while the
# page header says the online demo runs on CPU -- confirm the intended device.
try:
    current_pipeline = get_pipeline(0)
except Exception as e:
    # Record why start-up failed, then re-raise with the original traceback
    # (a bare `raise` does not add an extra frame the way `raise e` does).
    logger.error(f'Failed to initialize OpenDoc pipeline: {e}')
    raise
# Matches an HTML <img ...> tag and captures the value of its src="..." attribute.
_IMG_TAG_RE = re.compile(r'<img[^>]*src="([^"]+)"[^>]*>')

# MIME type per lower-case file extension, used when building data URLs.
# Extensions that are not listed fall back to PNG.
_MIME_BY_EXT = {
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.gif': 'image/gif',
    '.webp': 'image/webp',
}


def _first_entry(directory: str, accept) -> str | None:
    """Return the path of the first os.listdir() entry for which accept(name) is true."""
    for name in os.listdir(directory):
        if accept(name):
            return os.path.join(directory, name)
    return None


def _read_text(path: str | None) -> str:
    """Return the UTF-8 content of *path*, or '' when no file was found."""
    if path is None:
        return ''
    with open(path, 'r', encoding='utf-8') as fh:
        return fh.read()


def _inline_local_images(markdown: str, root_dir: str) -> str:
    """Replace <img src="..."> paths that live under *root_dir* with base64 data URLs."""
    root = os.path.realpath(root_dir)

    def _to_data_url(match):
        src = match.group(1)
        candidate = os.path.realpath(os.path.join(root, src))
        # The markdown is derived from recognised document text, i.e. untrusted
        # input. Only files that resolve to a location inside the request
        # directory may be embedded; absolute paths and "../" escapes are left
        # untouched instead of leaking arbitrary server files.
        try:
            inside_root = os.path.commonpath([root, candidate]) == root
        except ValueError:  # e.g. paths on different drives
            inside_root = False
        if not inside_root or not os.path.isfile(candidate):
            return match.group(0)
        try:
            with open(candidate, 'rb') as img_file:
                img_data = base64.b64encode(img_file.read()).decode('utf-8')
        except OSError as e:
            logger.warning(f'Failed to convert image {src} to base64: {e}')
            return match.group(0)
        mime_type = _MIME_BY_EXT.get(os.path.splitext(src)[1].lower(), 'image/png')
        return match.group(0).replace(f'src="{src}"', f'src="data:{mime_type};base64,{img_data}"')

    return _IMG_TAG_RE.sub(_to_data_url, markdown)


def process_image(image_path: str | None) -> tuple[Image.Image | None, str, str, str | None, str, str]:
    """Run the OpenDoc pipeline on one image and gather every artifact for the UI.

    The image is re-saved into a fresh ``gradio_outputs/<name>_<random>``
    directory, the pipeline results (visualization, Markdown, JSON) are written
    next to it, and the whole directory is zipped for download.

    Args:
        image_path: Path of the uploaded image. ``None`` or an empty string
            means that nothing was uploaded.

    Returns:
        A 6-tuple in the order expected by the click handler:
        (layout visualization image or None,
         Markdown with local images inlined as base64,
         JSON result text,
         path of the ZIP archive or None,
         Markdown exactly as written by the pipeline,
         Markdown with base64 images again, for the raw-source tab).
        When nothing was uploaded every slot is empty. When prediction fails
        the error is logged and reported as text in the second slot.
    """
    if not image_path:
        return None, '', '', None, '', ''

    base_name, file_ext = os.path.splitext(os.path.basename(image_path))
    file_ext = file_ext or '.jpg'

    output_base_dir = 'gradio_outputs'
    # A short random suffix keeps repeated uploads of the same filename apart.
    folder_name = f'{base_name}_{str(uuid.uuid4())[:8]}'
    tmp_dir = os.path.join(output_base_dir, folder_name)
    os.makedirs(tmp_dir, exist_ok=True)

    try:
        # Re-save the upload inside the request directory; the context manager
        # releases the source file handle as soon as the copy is written.
        tmp_img_path = os.path.join(tmp_dir, f'{base_name}{file_ext}')
        with Image.open(image_path) as image:
            image.save(tmp_img_path)

        output = list(
            current_pipeline.predict(tmp_img_path,
                                     use_doc_orientation_classify=False,
                                     use_doc_unwarping=False))
        if not output:
            return None, 'No results found.', '', None, '', ''
        res = output[0]

        res.save_to_img(tmp_dir)
        res.save_to_markdown(tmp_dir, pretty=True)
        res.save_to_json(tmp_dir)

        # Layout visualization: hand out an in-memory copy so that no file
        # handle stays open after this function returns.
        vis_img = None
        vis_img_path = _first_entry(tmp_dir, lambda name: 'layout_order_res' in name)
        if vis_img_path is not None:
            with Image.open(vis_img_path) as vis_file:
                vis_img = vis_file.copy()

        markdown_content = _read_text(_first_entry(tmp_dir, lambda name: name.endswith('.md')))
        # Relative image paths cannot be resolved by the browser, so embed them.
        if markdown_content:
            markdown_content_show = _inline_local_images(markdown_content, tmp_dir)
        else:
            markdown_content_show = markdown_content

        json_content = _read_text(_first_entry(tmp_dir, lambda name: name.endswith('.json')))

        # make_archive() appends ".zip" itself. Build the base name directly
        # rather than stripping ".zip" from a path, which would also remove a
        # ".zip" that is part of the uploaded file's name.
        archive_base = os.path.join(output_base_dir, folder_name)
        shutil.make_archive(archive_base, 'zip', tmp_dir)
        zip_path = f'{archive_base}.zip'

        return vis_img, markdown_content_show, json_content, zip_path, markdown_content, markdown_content_show
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing the app.
        logger.error(f'Prediction error: {str(e)}')
        return None, f'Error during prediction: {str(e)}', '', None, '', ''
# Stylesheet passed to gr.Blocks(css=...). Colours, borders and shadows are
# taken from CSS custom properties (var(--...)) rather than hard-coded values.
custom_css = """
body, .gradio-container {
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", "Noto Sans", Helvetica, Arial, sans-serif;
}
.app-header {
text-align: center;
max-width: 1200px;
margin: 20px auto !important;
padding: 20px;
}
.app-header h1 {
font-size: 2.5em;
font-weight: 700;
margin-bottom: 10px;
}
.app-header p {
font-size: 1.1em;
opacity: 0.7;
line-height: 1.6;
}
.quick-links {
text-align: center;
padding: 12px 0;
border: 1px solid var(--border-color-primary);
border-radius: 12px;
margin: 16px auto;
max-width: 1200px;
background: var(--background-fill-secondary);
}
.quick-links a {
margin: 0 16px;
font-size: 15px;
font-weight: 600;
color: var(--link-text-color);
text-decoration: none;
transition: all 0.3s ease;
}
.quick-links a:hover {
opacity: 0.8;
text-decoration: underline;
}
.upload-section {
border: 2px dashed var(--border-color-primary);
border-radius: 12px;
padding: 20px;
background: var(--background-fill-secondary);
transition: all 0.3s ease;
}
.upload-section:hover {
border-color: var(--color-accent);
background: var(--background-fill-primary);
}
#vis_output {
min-height: 400px;
border-radius: 12px;
overflow: hidden;
}
#md_preview {
max-height: 600px;
min-height: 200px;
overflow: auto;
padding: 20px;
background: var(--background-fill-primary);
border-radius: 12px;
box-shadow: var(--shadow-drop);
}
#md_preview img {
display: block;
margin: 16px auto;
max-width: 100%;
height: auto;
border-radius: 8px;
}
.notice {
margin: 20px auto;
max-width: 1200px;
padding: 16px 20px;
border-left: 4px solid var(--color-accent);
border-radius: 8px;
background: var(--background-fill-secondary);
font-size: 14px;
line-height: 1.8;
}
.notice strong {
font-weight: 700;
color: var(--color-accent);
}
.notice ul {
margin-top: 8px;
padding-left: 20px;
}
.notice li {
margin: 8px 0;
}
.gradio-button-primary {
font-weight: 600 !important;
transition: all 0.3s ease !important;
}
.gradio-button-primary:hover {
transform: translateY(-2px);
box-shadow: var(--shadow-drop-lg) !important;
}
"""
# Delimiter pairs given to the Markdown preview (latex_delimiters=...) so that
# formulas are rendered. Each row is (left, right, display); "$$" is listed
# before "$" exactly as in the original table.
LATEX_DELIMS = [
    {"left": left, "right": right, "display": display}
    for left, right, display in (
        ("$$", "$$", True),
        ("$", "$", False),
        ("\\(", "\\)", False),
        ("\\[", "\\]", True),
    )
]
# Gradio user interface
def create_demo() -> gr.Blocks:
    """Assemble the Gradio Blocks application for the OpenDoc-0.1B demo.

    The page consists of a header with project links, an upload column with
    the analyze button and ZIP download, a tabbed result column, and a feature
    notice. Clicking the button runs ``process_image`` and fills all six
    result widgets.

    Returns:
        gr.Blocks: the configured application (not launched yet).
    """
    # NOTE(review): the source had lost its indentation, so the widget nesting
    # below is reconstructed from statement order -- confirm against the layout.

    # Static page fragments, kept apart from the layout code for readability.
    header_html = """
<div class="app-header">
<h1>🚀 OpenDoc-0.1B</h1>
<p>Ultra-Lightweight Document Parsing System with 0.1B Parameters (built by <a href="https://github.com/Topdu/OpenOCR">OCR Team</a>, <a href="https://fvl.fudan.edu.cn">FVL Lab</a>)</p>
<p style="font-size: 0.95em; color: #888;">
Powered by <a href="https://www.paddleocr.ai/latest/version3.x/module_usage/layout_analysis.html" target="_blank">PP-DocLayoutV2</a> for layout analysis and <a href="https://arxiv.org/pdf/2512.21095" target="_blank">UniRec-0.1B</a> for unified recognition of text, formulas, and tables
</p>
<strong>⚡ Deployment Notice:</strong>
<p style="font-size: 0.95em; color: #888;">The current online demo uses CPU inference, which may be unstable and slow. For better performance, you can deploy locally with GPU acceleration refer to the <a href="https://github.com/Topdu/OpenOCR/blob/main/docs/opendoc.md" target="_blank">[Local GPU Deployment]</a>. We are also working on deploying OpenDoc-0.1B using inference acceleration frameworks. Stay tuned!</p>
</div>
<div class="quick-links">
<a href="https://github.com/Topdu/OpenOCR" target="_blank">📖 GitHub</a>
<a href="https://arxiv.org/pdf/2512.21095" target="_blank">📄 Paper</a>
<a href="https://huggingface.co/topdu/unirec-0.1b" target="_blank">🤗 Model</a>
</div>
"""
    tips_markdown = """
### 💡 Tips
- Supports Chinese and English documents
- Best for reports, papers, magazines, and complex layouts
- Handles text, formulas, tables, and images
"""
    notice_html = """
<div class="notice">
<strong>✨ Key Features:</strong>
<ul>
<li><strong>Ultra-lightweight:</strong> Only 0.1B parameters, fast inference speed</li>
<li><strong>High accuracy:</strong> Achieves 90.57% on OmniDocBench (v1.5)</li>
<li><strong>Unified recognition:</strong> Handles text, formulas, and tables in one model</li>
<li><strong>Rich output:</strong> Provides Markdown, JSON, and visualization results</li>
</ul>
</div>
"""

    with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title='OpenDoc-0.1B Demo') as app:
        gr.HTML(header_html)

        with gr.Row():
            # Left column: upload, tips, action button and download slot.
            with gr.Column(scale=5, elem_classes=["upload-section"]):
                image_input = gr.Image(type='filepath', label='📤 Upload Document Image', height=400)
                gr.Markdown(tips_markdown)
                analyze_btn = gr.Button('🔍 Analyze Document', variant='primary', size='lg')
                zip_download = gr.File(label='📥 Download All Results (ZIP)', visible=True)

            # Right column: one tab per result representation.
            with gr.Column(scale=7):
                with gr.Tabs():
                    with gr.Tab('📝 Markdown Preview'):
                        md_preview = gr.Markdown(
                            'Please upload an image and click "Analyze Document" to see results.',
                            latex_delimiters=LATEX_DELIMS,
                            elem_id='md_preview',
                        )
                    with gr.Tab('📊 Layout Visualization'):
                        layout_view = gr.Image(type='pil', label='Layout Analysis Results', elem_id='vis_output')
                    with gr.Tab('📄 Raw Markdown'):
                        md_source = gr.Code(label='Markdown Source', language='markdown', lines=20)
                    with gr.Tab('📄 Raw Markdown with Base64 Images'):
                        md_source_base64 = gr.Code(label='Markdown Source', language='markdown', lines=20)
                    with gr.Tab('🗂️ JSON Result'):
                        json_view = gr.Code(label='Structured Data', language='json')

        gr.HTML(notice_html)

        # The output order must match the 6-tuple returned by process_image.
        result_widgets = [
            layout_view,
            md_preview,
            json_view,
            zip_download,
            md_source,
            md_source_base64,
        ]
        analyze_btn.click(fn=process_image, inputs=[image_input], outputs=result_widgets)

    return app
if __name__ == '__main__':
    # Script entry point: build the UI, enable a request queue capped at
    # 20 waiting jobs, and serve without a public share link.
    demo = create_demo()
    demo.queue(max_size=20)
    demo.launch(share=False)