vllm-qwen3.5-nvfp4-5090/docker-compose.yml at main · Li-Lee/vllm-qwen3.5-nvfp4-5090 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# docker-compose.yml - Qwen3.5-35B-A3B-NVFP4 on RTX 5090
# Using vLLM v0.17+ with native NVFP4 quantization support (no patch needed)
services:
  # Single vLLM OpenAI-compatible server container serving the NVFP4 checkpoint
  # named in `command` below.
  vllm:
    # NOTE(review): `nightly` is a moving tag, so two pulls may yield different
    # vLLM builds; pin a dated tag or digest if reproducibility matters.
    image: vllm/vllm-openai:nightly
    container_name: qwen3.5-35b-a3b-nvfp4
    restart: unless-stopped
    # Host networking: the server's port (8000, see `command`) is opened
    # directly on the host, so no `ports:` mapping is declared.
    # NOTE(review): combined with --host=0.0.0.0 and no API key in `command`,
    # the endpoint is reachable from every host interface; confirm intended.
    network_mode: host
    # Share the host IPC namespace (and therefore its shared memory).
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            # Reserve every NVIDIA GPU on the host for this container.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    volumes:
      # Host Hugging Face cache, so downloaded weights survive container
      # restarts. `:?` makes Compose abort with this message when HF_CACHE is
      # unset or empty, instead of producing a bind mount with an empty source.
      - "${HF_CACHE:?HF_CACHE must be set to a host directory for the Hugging Face cache}:/root/.cache/huggingface"
      # Read-only chat template, passed to the server via --chat-template.
      - ./chat_template.jinja:/chat_template.jinja:ro
    environment:
      # Optional token; `:-` keeps the value empty when HF_TOKEN is unset
      # without Compose printing an "unset variable" warning.
      - HF_TOKEN=${HF_TOKEN:-}
      # The next three settings presumably steer the FP4/FP8 kernels onto the
      # Marlin backend rather than FlashInfer; verify against the vLLM
      # environment-variable documentation for the image in use.
      - VLLM_USE_FLASHINFER_MOE_FP4=0
      - VLLM_TEST_FORCE_FP8_MARLIN=1
      - VLLM_NVFP4_GEMM_BACKEND=marlin
      - PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
      # Python warning filter (action::category:module): silence UserWarning
      # attributed to the named vLLM module.
      - PYTHONWARNINGS=ignore::UserWarning:vllm.model_executor.layers.fla.ops
    # Arguments handed to the image's entrypoint: the model ID first, then the
    # server flags.
    command:
      - "Kbenkhaled/Qwen3.5-35B-A3B-NVFP4"
      - "--host=0.0.0.0"
      - "--port=8000"
      - "--max-model-len=262144"
      - "--gpu-memory-utilization=0.95"
      - "--max-num-seqs=4"
      - "--max-num-batched-tokens=4096"
      - "--kv-cache-dtype=fp8"
      - "--reasoning-parser=qwen3"
      - "--enable-auto-tool-choice"
      - "--enable-prefix-caching"
      - "--tool-call-parser=qwen3_coder"
      # Flag and value are separate list items; the path matches the
      # read-only mount declared under `volumes`.
      - "--chat-template"
      - "/chat_template.jinja"
    healthcheck:
      # Probe the server on the port set by --port above; `curl -f` exits
      # non-zero on an HTTP error status.
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      # Failed probes during the first 10 minutes do not count toward
      # `retries`, leaving time for the server to start.
      start_period: 600s