---
# vLLM — high-performance LLM inference with an OpenAI-compatible API.
#
# The unit file, model directory and tier config are always written. The image
# pull and the service enable/start are skipped when no NVIDIA GPU is detected.
#
# Variables (set defaults in defaults/main.yml, override per-tier in the playbook):
#   vllm_model            HuggingFace model ID to load on start
#   vllm_tensor_parallel  Number of GPUs for tensor parallelism (1 for Starter/Basic)
#   vllm_gpu_memory_util  Fraction of VRAM to reserve for vLLM (0.0–1.0)
#   vllm_max_model_len    Maximum context length in tokens
#   vllm_quantization     Quantization method: "" (none) | "awq" | "gptq" | "fp8"

# Probe for a GPU. The pipeline's exit status is that of `head`, so a missing
# nvidia-smi shows up as empty stdout rather than a non-zero rc; the conditions
# below therefore test both.
- name: Check for NVIDIA GPU
  shell: nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1
  register: gpu_check
  ignore_errors: true
  changed_when: false
  # Read-only probe: run it in --check mode too, otherwise gpu_check.stdout is
  # undefined for every later `when:` clause.
  check_mode: false

- name: Skip vLLM if no GPU detected
  debug:
    msg: >
      No GPU detected — skipping vLLM image pull.
      Run manually when GPU is available:
      docker pull vllm/vllm-openai:latest
  when: gpu_check.stdout == "" or gpu_check.rc != 0

# `retries`/`delay` need an `until` condition to take effect on ansible-core
# releases before 2.16, so the success condition is stated explicitly.
- name: Pull vLLM Docker image
  command: docker pull vllm/vllm-openai:latest
  register: vllm_pull
  until: vllm_pull is succeeded
  retries: 3
  delay: 15
  # docker prints "Status: Downloaded newer image ..." only when layers changed.
  changed_when: "'Downloaded newer image' in vllm_pull.stdout"
  when: gpu_check.stdout != "" and gpu_check.rc == 0

# Renders to an empty string when vllm_quantization is "", null or undefined,
# otherwise to "--quantization <method>".
- name: Build vLLM quantization flag
  set_fact:
    vllm_quant_flag: >-
      {{ ('--quantization ' ~ vllm_quantization)
      if (vllm_quantization | default('', true))
      else '' }}

# ExecStartPre lines are prefixed with "-" so a missing container from a
# previous run does not fail the start.
- name: Create vLLM systemd service
  copy:
    dest: /etc/systemd/system/vllm.service
    mode: "0644"
    content: |
      [Unit]
      Description=vLLM OpenAI-Compatible Inference Server ({{ vllm_model }})
      After=docker.service
      Requires=docker.service

      [Service]
      Restart=always
      RestartSec=10
      ExecStartPre=-/usr/bin/docker stop vllm
      ExecStartPre=-/usr/bin/docker rm vllm
      ExecStart=/usr/bin/docker run \
        --name vllm \
        --gpus all \
        --ipc=host \
        -p 8000:8000 \
        -v /opt/cezen/models:/root/.cache/huggingface \
        -e HF_HOME=/root/.cache/huggingface \
        vllm/vllm-openai:latest \
        --model {{ vllm_model }} \
        --gpu-memory-utilization {{ vllm_gpu_memory_util }} \
        --max-model-len {{ vllm_max_model_len }} \
        --tensor-parallel-size {{ vllm_tensor_parallel }} {{ vllm_quant_flag }}
      ExecStop=/usr/bin/docker stop vllm
      TimeoutStartSec=300

      [Install]
      WantedBy=multi-user.target

# NOTE(review): the container mounts /opt/cezen/models, not the hf_cache
# subdirectory created here — confirm which path is intended.
- name: Create vLLM model directory
  file:
    path: /opt/cezen/models/hf_cache
    state: directory
    owner: cezen
    group: cezen
    mode: "0755"

- name: Write vLLM tier config file (for portal reference)
  copy:
    dest: /opt/cezen/vllm-config.json
    owner: cezen
    group: cezen
    mode: "0644"
    content: |
      {
        "model": "{{ vllm_model }}",
        "tensor_parallel_size": {{ vllm_tensor_parallel }},
        "gpu_memory_utilization": {{ vllm_gpu_memory_util }},
        "max_model_len": {{ vllm_max_model_len }},
        "quantization": "{{ vllm_quantization | default('', true) }}"
      }

# daemon_reload picks up the unit file written above; state: started makes the
# task actually start the service, as its name says, instead of only enabling it.
- name: Enable and start vLLM service
  systemd:
    name: vllm
    enabled: true
    state: started
    daemon_reload: true
  when: gpu_check.stdout != "" and gpu_check.rc == 0