"""Gradio demo application for the OpenDoc document-parsing pipeline."""

import base64
import os
import re
import shutil
import uuid
from collections.abc import Callable

import gradio as gr
from PIL import Image

from tools.infer_doc import OpenDoc
from tools.utils.logging import get_logger

logger = get_logger(name='opendoc_gradio')

# Lazily created, process-wide pipeline instance (see get_pipeline).
pipeline: OpenDoc | None = None


def get_pipeline(gpu_id: int) -> OpenDoc:
    """Return the shared OpenDoc pipeline, creating it on first use.

    The instance is cached in the module-level ``pipeline`` variable, so
    ``gpu_id`` only has an effect on the call that actually creates it;
    later calls return the cached instance regardless of the argument.

    Args:
        gpu_id: GPU device id passed to ``OpenDoc(gpuId=...)``. Negative
            values are logged as CPU.

    Returns:
        The initialized OpenDoc instance.
    """
    global pipeline
    if pipeline is None:
        logger.info(
            f"Initializing OpenDoc pipeline on {'GPU ' + str(gpu_id) if gpu_id >= 0 else 'CPU'}..."
        )
        pipeline = OpenDoc(gpuId=gpu_id)
    return pipeline


# Initialize the pipeline eagerly at import time so that start-up failures
# surface immediately instead of on the first request.
current_pipeline = get_pipeline(0)

# Matches an HTML <img> tag and captures the value of its src attribute.
_IMG_TAG_RE = re.compile(r'<img[^>]*src="([^"]+)"[^>]*>')


def _find_first(directory: str, predicate: Callable[[str], bool]) -> str | None:
    """Return the path of the first entry in *directory* whose name satisfies *predicate*."""
    for name in os.listdir(directory):
        if predicate(name):
            return os.path.join(directory, name)
    return None


def _read_text(path: str) -> str:
    """Return the full UTF-8 text content of *path*."""
    with open(path, 'r', encoding='utf-8') as file:
        return file.read()


def _inline_images_as_base64(markdown: str, base_dir: str) -> str:
    """Replace the src of <img> tags in *markdown* with base64 data URLs.

    Each src is resolved relative to *base_dir*. Tags are left untouched when
    the file does not exist, cannot be read, or resolves to a location
    outside *base_dir* (the Markdown is derived from uploaded content, so a
    path must not be able to pull arbitrary files into the page).
    """
    root = os.path.realpath(base_dir)

    def replace_img_with_base64(match: re.Match) -> str:
        img_path = match.group(1)
        full_img_path = os.path.realpath(os.path.join(base_dir, img_path))
        if not full_img_path.startswith(root + os.sep):
            return match.group(0)
        if not os.path.isfile(full_img_path):
            return match.group(0)
        try:
            with open(full_img_path, 'rb') as img_file:
                img_data = base64.b64encode(img_file.read()).decode('utf-8')
        except OSError as e:
            logger.warning(f'Failed to convert image {img_path} to base64: {e}')
            return match.group(0)
        # Determine image format: anything that is not JPEG is served as PNG.
        ext = os.path.splitext(full_img_path)[1].lower()
        mime_type = 'image/jpeg' if ext in ['.jpg', '.jpeg'] else 'image/png'
        # Only the src attribute is rewritten; the rest of the tag is kept.
        return match.group(0).replace(f'src="{img_path}"', f'src="data:{mime_type};base64,{img_data}"')

    return _IMG_TAG_RE.sub(replace_img_with_base64, markdown)


def process_image(image_path: str | None) -> tuple[Image.Image | None, str, str, str | None, str, str]:
    """Run the OpenDoc pipeline on an image and collect its outputs.

    The image is copied into a fresh folder under ``gradio_outputs``, the
    pipeline results (visualization, Markdown, JSON) are saved there, and the
    folder is zipped for download.

    Args:
        image_path: Path of the image file, or None when no image was given.

    Returns:
        A 6-tuple of (visualization image, Markdown with base64-embedded
        images, JSON content, ZIP file path, original Markdown, Markdown with
        base64-embedded images). When there is no input every field is empty;
        when prediction yields nothing or fails, the second field carries a
        message and the others are empty.
    """
    if image_path is None:
        return None, '', '', None, '', ''

    # Get original image name
    base_name, file_ext = os.path.splitext(os.path.basename(image_path))
    file_ext = file_ext or '.jpg'

    # Create a directory with image name for this request
    output_base_dir = 'gradio_outputs'
    os.makedirs(output_base_dir, exist_ok=True)

    # Random suffix avoids conflicts if the same filename is uploaded multiple times
    unique_suffix = uuid.uuid4().hex[:8]
    folder_name = f"{base_name}_{unique_suffix}"
    tmp_dir = os.path.join(output_base_dir, folder_name)
    os.makedirs(tmp_dir, exist_ok=True)

    try:
        # Copy and rename the input image
        tmp_img_path = os.path.join(tmp_dir, f'{base_name}{file_ext}')
        with Image.open(image_path) as image:
            image.save(tmp_img_path)

        # Predict
        output = list(
            current_pipeline.predict(tmp_img_path,
                                     use_doc_orientation_classify=False,
                                     use_doc_unwarping=False))
        if not output:
            return None, 'No results found.', '', None, '', ''

        res = output[0]

        # Save results
        res.save_to_img(tmp_dir)
        res.save_to_markdown(tmp_dir, pretty=True)
        res.save_to_json(tmp_dir)

        # Find the saved files
        vis_img = None
        vis_img_path = _find_first(tmp_dir, lambda name: 'layout_order_res' in name)
        if vis_img_path is not None:
            vis_img = Image.open(vis_img_path)
            # Read the pixel data now so the underlying file handle is released.
            vis_img.load()

        md_file_path = _find_first(tmp_dir, lambda name: name.endswith('.md'))
        markdown_content = _read_text(md_file_path) if md_file_path else ''

        # Convert relative image paths to base64 for proper display in Gradio
        if markdown_content:
            markdown_content_show = _inline_images_as_base64(markdown_content, tmp_dir)
        else:
            markdown_content_show = markdown_content

        json_file_path = _find_first(tmp_dir, lambda name: name.endswith('.json'))
        json_content = _read_text(json_file_path) if json_file_path else ''

        # Prepare all files in tmp_dir for download by creating a zip archive.
        # make_archive appends the '.zip' extension to the base name itself.
        archive_base = os.path.join(output_base_dir, folder_name)
        shutil.make_archive(archive_base, 'zip', tmp_dir)
        zip_path = f'{archive_base}.zip'

        return vis_img, markdown_content_show, json_content, zip_path, markdown_content, markdown_content_show

    except Exception as e:
        # UI boundary: report the failure in the Markdown pane instead of crashing.
        logger.error(f'Prediction error: {e}')
        return None, f'Error during prediction: {e}', '', None, '', ''


# Custom CSS with adaptive colors
custom_css = """
body, .gradio-container { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", "Noto Sans", Helvetica, Arial, sans-serif; }
.app-header { text-align: center; max-width: 1200px; margin: 20px auto !important; padding: 20px; }
.app-header h1 { font-size: 2.5em; font-weight: 700; margin-bottom: 10px; }
.app-header p { font-size: 1.1em; opacity: 0.7; line-height: 1.6; }
.quick-links { text-align: center; padding: 12px 0; border: 1px solid var(--border-color-primary); border-radius: 12px; margin: 16px auto; max-width: 1200px; background: var(--background-fill-secondary); }
.quick-links a { margin: 0 16px; font-size: 15px; font-weight: 600; color: var(--link-text-color); text-decoration: none; transition: all 0.3s ease; }
.quick-links a:hover { opacity: 0.8; text-decoration: underline; }
.upload-section { border: 2px dashed var(--border-color-primary); border-radius: 12px; padding: 20px; background: var(--background-fill-secondary); transition: all 0.3s ease; }
.upload-section:hover { border-color: var(--color-accent); background: var(--background-fill-primary); }
#vis_output { min-height: 400px; border-radius: 12px; overflow: hidden; }
#md_preview { max-height: 600px; min-height: 200px; overflow: auto; padding: 20px; background: var(--background-fill-primary); border-radius: 12px; box-shadow: var(--shadow-drop); }
#md_preview img { display: block; margin: 16px auto; max-width: 100%; height: auto; border-radius: 8px; }
.notice { margin: 20px auto; max-width: 1200px; padding: 16px 20px; border-left: 4px solid var(--color-accent); border-radius: 8px; background: var(--background-fill-secondary); font-size: 14px; line-height: 1.8; }
.notice strong { font-weight: 700; color: var(--color-accent); }
.notice ul { margin-top: 8px; padding-left: 20px; }
.notice li { margin: 8px 0; }
.gradio-button-primary { font-weight: 600 !important; transition: all 0.3s ease !important; }
.gradio-button-primary:hover { transform: translateY(-2px); box-shadow: var(--shadow-drop-lg) !important; }
"""

# LaTeX delimiters for formula rendering
LATEX_DELIMS = [
    {"left": "$$", "right": "$$", "display": True},
    {"left": "$", "right": "$", "display": False},
    {"left": "\\(", "right": "\\)", "display": False},
    {"left": "\\[", "right": "\\]", "display": True},
]

# NOTE(review): the HTML snippets below contain plain text only, while
# custom_css styles .app-header, .quick-links and .notice -- confirm that the
# wrapping markup was not lost from these strings.
_HEADER_HTML = """

🚀 OpenDoc-0.1B

Ultra-Lightweight Document Parsing System with 0.1B Parameters (built by OCR Team, FVL Lab)

Powered by PP-DocLayoutV2 for layout analysis and UniRec-0.1B for unified recognition of text, formulas, and tables

⚡ Deployment Notice:

The current online demo uses CPU inference, which may be unstable and slow. For better performance, you can deploy locally with GPU acceleration refer to the [Local GPU Deployment]. We are also working on deploying OpenDoc-0.1B using inference acceleration frameworks. Stay tuned!

"""

_TIPS_MARKDOWN = """
### 💡 Tips
- Supports Chinese and English documents
- Best for reports, papers, magazines, and complex layouts
- Handles text, formulas, tables, and images
"""

_FEATURES_HTML = """
✨ Key Features:
"""


# Define the Gradio Interface
def create_demo() -> gr.Blocks:
    """Build the Gradio demo interface.

    The layout has an upload column (image input, tips, analyze button, ZIP
    download) and a results column with one tab per output of
    ``process_image``. Clicking the button runs ``process_image`` on the
    uploaded file.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """
    with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title='OpenDoc-0.1B Demo') as demo:
        # Header
        gr.HTML(_HEADER_HTML)

        with gr.Row():
            with gr.Column(scale=5, elem_classes=["upload-section"]):
                input_img = gr.Image(type='filepath', label='📤 Upload Document Image', height=400)
                gr.Markdown(_TIPS_MARKDOWN)
                btn = gr.Button('🔍 Analyze Document', variant='primary', size='lg')
                download_output = gr.File(label='📥 Download All Results (ZIP)', visible=True)

            with gr.Column(scale=7):
                with gr.Tabs():
                    with gr.Tab('📝 Markdown Preview'):
                        output_md = gr.Markdown(
                            'Please upload an image and click "Analyze Document" to see results.',
                            latex_delimiters=LATEX_DELIMS,
                            elem_id='md_preview'
                        )
                    with gr.Tab('📊 Layout Visualization'):
                        output_vis = gr.Image(type='pil', label='Layout Analysis Results', elem_id='vis_output')
                    with gr.Tab('📄 Raw Markdown'):
                        output_md_raw = gr.Code(
                            label='Markdown Source',
                            language='markdown',
                            lines=20
                        )
                    with gr.Tab('📄 Raw Markdown with Base64 Images'):
                        output_md_raw_with_base64 = gr.Code(
                            label='Markdown Source',
                            language='markdown',
                            lines=20
                        )
                    with gr.Tab('🗂️ JSON Result'):
                        output_json = gr.Code(label='Structured Data', language='json')

        # Feature notice
        gr.HTML(_FEATURES_HTML)

        # Output order must match the tuple returned by process_image.
        btn.click(
            fn=process_image,
            inputs=[input_img],
            outputs=[output_vis, output_md, output_json, download_output, output_md_raw, output_md_raw_with_base64]
        )

    return demo


if __name__ == '__main__':
    demo = create_demo()
    demo.queue(max_size=20).launch(
        share=False
    )