services:
  mineru-vllm-server:
    image: mineru-vllm:latest
    container_name: mineru-vllm-server
    restart: always
    profiles: ["vllm-server"]
    ports:
      - 30000:30000
    environment:
      MINERU_MODEL_SOURCE: local
    entrypoint: mineru-vllm-server
    command:
      --host 0.0.0.0
      --port 30000
      # --data-parallel-size 2  # When using multiple GPUs, increase throughput via vllm's multi-GPU parallel mode
      # --gpu-memory-utilization 0.5  # On a single GPU with insufficient VRAM, reduce the KV cache size with this parameter; if VRAM issues persist, lower it further to 0.4 or below
    ulimits:
      memlock: -1
      stack: 67108864
    ipc: host
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:30000/health || exit 1"]
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["0"]
              capabilities: [gpu]
  mineru-api:
    image: mineru-vllm:latest
    container_name: mineru-api
    restart: always
    profiles: ["api"]
    ports:
      - 17777:8000
    environment:
      MINERU_MODEL_SOURCE: local
    entrypoint: mineru-api
    command:
      --host 0.0.0.0
      --port 8000
      # parameters for the vllm engine
      # --data-parallel-size 2  # When using multiple GPUs, increase throughput via vllm's multi-GPU parallel mode
      # --gpu-memory-utilization 0.5  # On a single GPU with insufficient VRAM, reduce the KV cache size with this parameter; if VRAM issues persist, lower it further to 0.4 or below
    ulimits:
      memlock: -1
      stack: 67108864
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["0"]
              capabilities: [gpu]
  mineru-gradio:
    image: mineru-vllm:latest
    container_name: mineru-gradio
    restart: always
    profiles: ["gradio"]
    ports:
      - 7860:7860
    environment:
      MINERU_MODEL_SOURCE: local
    entrypoint: mineru-gradio
    command:
      --server-name 0.0.0.0
      --server-port 7860
      --enable-vllm-engine true  # Enable the vllm engine for Gradio
      # --enable-api false  # Set to false if you want to disable the API
      # --max-convert-pages 20  # Set a specific number to limit how many pages are converted
      # parameters for the vllm engine
      # --data-parallel-size 2  # When using multiple GPUs, increase throughput via vllm's multi-GPU parallel mode
      # --gpu-memory-utilization 0.5  # On a single GPU with insufficient VRAM, reduce the KV cache size with this parameter; if VRAM issues persist, lower it further to 0.4 or below
    ulimits:
      memlock: -1
      stack: 67108864
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["0"]
              capabilities: [gpu]
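Each service above sits behind a compose profile, so it is started by name. A minimal usage sketch for the three profiles, assuming the file is saved as `compose.yaml` in the working directory; `demo.pdf` and `output/` are placeholder paths, the `mineru` client flags follow MinerU's documented `vlm-http-client` backend usage, and the `/file_parse` route is an assumption used to illustrate the call shape (confirm the actual route at the FastAPI `/docs` page):

```bash
# vLLM server: start it, check health, then point a MinerU client at it
docker compose --profile vllm-server up -d
curl http://localhost:30000/health
mineru -p demo.pdf -o output -b vlm-http-client -u http://localhost:30000

# Web API: host port 17777 maps to container port 8000
docker compose --profile api up -d
# Explore routes at http://localhost:17777/docs, then e.g. (route assumed):
curl -X POST http://localhost:17777/file_parse -F "files=@demo.pdf"

# Gradio UI: reachable at http://localhost:7860 after startup
docker compose --profile gradio up -d
```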
Because the vllm inference acceleration framework pre-allocates GPU memory, you may not be able to run multiple vllm services simultaneously on the same machine. Therefore, make sure any other services that might be using GPU memory have been stopped before starting the vlm-vllm-server service or using the vlm-vllm-engine backend.
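A quick way to confirm the GPU is actually free before launching, and to release VRAM held by a previously started profile (profile names as defined in the compose file above):

```bash
# Confirm no other process is holding GPU memory:
nvidia-smi

# Stop an earlier MinerU profile that may still hold VRAM, e.g.:
docker compose --profile vllm-server down
```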