---
# vLLM — high-performance LLM inference with OpenAI-compatible API
# The image pull and the service task are skipped automatically if no GPU is
# detected; the systemd unit and the config file are still written.
# Variables (set defaults in defaults/main.yml, override per-tier in the playbook):
#   vllm_model            HuggingFace model ID to load on start
#   vllm_tensor_parallel  Number of GPUs for tensor parallelism (1 for Starter/Basic)
#   vllm_gpu_memory_util  Fraction of VRAM to reserve for vLLM (0.0–1.0)
#   vllm_max_model_len    Maximum context length in tokens
#   vllm_quantization     Quantization method: "" (none) | "awq" | "gptq" | "fp8"

# Probe for an NVIDIA GPU. The registered result (gpu_check.stdout and
# gpu_check.rc) is what the later tasks in this file test in their `when:`.
- name: Check for NVIDIA GPU
  # `shell` is needed here for the stderr redirect and the pipe. Because of the
  # pipe, the exit status is that of `head`, so an empty stdout is the primary
  # "no GPU" signal.
  shell: nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1
  register: gpu_check
  # A missing or failing nvidia-smi is an expected outcome on GPU-less hosts:
  # never mark the probe as failed instead of failing and then ignoring it.
  failed_when: false
  # Read-only query, so it must never be reported as a change.
  changed_when: false

# Informational message only: shown when the GPU probe produced no output or
# returned a non-zero exit status, i.e. exactly when the pull task is skipped.
- name: Skip vLLM if no GPU detected
  when: not (gpu_check.stdout != "" and gpu_check.rc == 0)
  debug:
    # Folded scalar: the two source lines are joined with a single space.
    msg: >
      No GPU detected — skipping vLLM image pull. Run manually
      when GPU is available: docker pull vllm/vllm-openai:latest

# Pull the vLLM image, retrying transient registry/network failures.
# Runs only when the GPU probe returned a non-empty name and exit status 0.
- name: Pull vLLM Docker image
  # No shell features are used, so `command` is sufficient.
  command: docker pull vllm/vllm-openai:latest
  register: vllm_image_pull
  # `retries`/`delay` only take effect together with `until` on ansible-core
  # releases before 2.16; state the success condition explicitly.
  until: vllm_image_pull is succeeded
  retries: 3
  delay: 15
  # docker prints "Status: Downloaded newer image ..." only when something was
  # actually fetched; an up-to-date image is reported as unchanged.
  changed_when: "'Downloaded newer image' in vllm_image_pull.stdout"
  when: gpu_check.stdout != "" and gpu_check.rc == 0

# Derive the optional CLI fragment consumed by the systemd unit template:
#   vllm_quantization = "awq"            -> "--quantization awq"
#   vllm_quantization = "" / null / unset -> ""
- name: Build vLLM quantization flag
  set_fact:
    # default('', true) maps undefined and null (e.g. a bare `vllm_quantization:`)
    # to the empty string, and `~` concatenates without raising on non-strings.
    vllm_quant_flag: >-
      {{ ('--quantization ' ~ vllm_quantization)
      if (vllm_quantization | default('', true)) else '' }}

# Write the systemd unit that runs vLLM as a Docker container named "vllm".
# The unit text is rendered from vllm_model, vllm_gpu_memory_util,
# vllm_max_model_len, vllm_tensor_parallel and vllm_quant_flag.
#
# Notes on the unit itself:
#   - The "-" prefix on the ExecStartPre lines tells systemd to ignore a
#     failure of those commands (e.g. no previous container to stop/remove).
#   - A trailing backslash continues ExecStart onto the next line; when
#     vllm_quant_flag is empty the final continuation line is blank.
#
# NOTE(review): this task has no GPU condition, so the unit file is written
# even when the GPU probe found nothing.
# NOTE(review): nothing in this task restarts the service when the unit text
# changes — confirm whether a handler elsewhere covers that.
- name: Create vLLM systemd service
  copy:
    dest: /etc/systemd/system/vllm.service
    mode: "0644"
    # Everything under `content: |` is written verbatim to the unit file, so
    # YAML comments must stay outside of it.
    content: |
      [Unit]
      Description=vLLM OpenAI-Compatible Inference Server ({{ vllm_model }})
      After=docker.service
      Requires=docker.service

      [Service]
      Restart=always
      RestartSec=10
      ExecStartPre=-/usr/bin/docker stop vllm
      ExecStartPre=-/usr/bin/docker rm vllm
      ExecStart=/usr/bin/docker run \
        --name vllm \
        --gpus all \
        --ipc=host \
        -p 8000:8000 \
        -v /opt/cezen/models:/root/.cache/huggingface \
        -e HF_HOME=/root/.cache/huggingface \
        vllm/vllm-openai:latest \
        --model {{ vllm_model }} \
        --gpu-memory-utilization {{ vllm_gpu_memory_util }} \
        --max-model-len {{ vllm_max_model_len }} \
        --tensor-parallel-size {{ vllm_tensor_parallel }} \
        {{ vllm_quant_flag }}
      ExecStop=/usr/bin/docker stop vllm
      TimeoutStartSec=300

      [Install]
      WantedBy=multi-user.target

# Make sure the model cache directory exists and belongs to cezen:cezen.
# NOTE(review): the vllm.service unit written by this file bind-mounts the
# parent directory (/opt/cezen/models), not hf_cache — confirm which path the
# cache is expected to live under.
- name: Create vLLM model directory
  file:
    state: directory
    path: /opt/cezen/models/hf_cache
    mode: "0755"
    owner: cezen
    group: cezen

# Record the effective vLLM settings as JSON for the portal to read.
# The document is built as a data structure and serialised with to_nice_json
# rather than interpolated into hand-written JSON text, so quotes or
# backslashes in a value, or an empty variable, cannot produce an invalid file.
# Key order and whitespace may differ from a hand-written layout; the parsed
# JSON document has the same keys and value types.
- name: Write vLLM tier config file (for portal reference)
  copy:
    dest: /opt/cezen/vllm-config.json
    owner: cezen
    group: cezen
    mode: "0644"
    # The int/float casts keep the numeric fields as JSON numbers even when the
    # variables arrive as strings (e.g. via --extra-vars).
    content: |
      {{ {
        'model': vllm_model | string,
        'tensor_parallel_size': vllm_tensor_parallel | int,
        'gpu_memory_utilization': vllm_gpu_memory_util | float,
        'max_model_len': vllm_max_model_len | int,
        'quantization': vllm_quantization | default('', true) | string
      } | to_nice_json }}

# Reload systemd so it sees the unit file, enable the service at boot and
# start it now. Runs only when the GPU probe returned a non-empty name and
# exit status 0.
- name: Enable and start vLLM service
  systemd:
    name: vllm
    enabled: true
    # Without an explicit state the module only enables the unit; the task
    # name promises that the service is started as well.
    state: started
    # Pick up the freshly written /etc/systemd/system/vllm.service.
    daemon_reload: true
  when: gpu_check.stdout != "" and gpu_check.rc == 0