aipackage/ansible/roles/vllm/tasks/main.yml

97 lines
3.0 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

---
# vLLM — high-performance LLM inference with OpenAI-compatible API
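# The image pull and service enablement are skipped automatically if no GPU is
# present; the unit file, model directory and config file are still written.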
# Variables (set defaults in defaults/main.yml, override per-tier in the playbook):
# vllm_model HuggingFace model ID to load on start
# vllm_tensor_parallel Number of GPUs for tensor parallelism (1 for Starter/Basic)
# vllm_gpu_memory_util Fraction of VRAM to reserve for vLLM (0.0-1.0)
# vllm_max_model_len Maximum context length in tokens
# vllm_quantization Quantization method: "" (none) | "awq" | "gptq" | "fp8"
# Probe for an NVIDIA GPU by asking nvidia-smi for the first GPU name.
# The result is registered as gpu_check; later tasks gate on its stdout and rc.
# This is a read-only query, so it never reports "changed", and a failure is
# ignored so the rest of the tasks still run.
- name: Check for NVIDIA GPU
  shell: nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1
  changed_when: false
  ignore_errors: true
  register: gpu_check
# Informational only: tell the operator why the image pull is being skipped.
# Runs exactly when the GPU probe did NOT yield a name with a zero exit status.
- name: Skip vLLM if no GPU detected
  when: not (gpu_check.stdout != "" and gpu_check.rc == 0)
  debug:
    msg: >
      No GPU detected — skipping vLLM image pull.
      Run manually when GPU is available: docker pull vllm/vllm-openai:latest
# Pull the vLLM image, but only when the GPU probe found a device.
# `retries`/`delay` only take effect on older Ansible releases when an `until`
# condition is present, so the result is registered and the pull is retried
# until it succeeds. `command` is used because no shell features are needed.
- name: Pull vLLM Docker image
  command: docker pull vllm/vllm-openai:latest
  register: vllm_image_pull
  until: vllm_image_pull is succeeded
  retries: 3
  delay: 15
  # Report "changed" only when docker actually fetched a newer image.
  changed_when: "'Downloaded newer image' in vllm_image_pull.stdout"
  when: gpu_check.stdout != "" and gpu_check.rc == 0
# Derive the optional CLI argument used by the service unit: an empty string
# when vllm_quantization is "", otherwise "--quantization <method>".
- name: Build vLLM quantization flag
  set_fact:
    vllm_quant_flag: >-
      {{ '' if vllm_quantization == '' else '--quantization ' + vllm_quantization }}
# Install the systemd unit that runs vLLM as a Docker container named "vllm".
# The unit text is rendered from the vllm_* variables and the vllm_quant_flag
# fact. Any stale container is stopped and removed before each start; the
# leading "-" on those ExecStartPre lines lets them fail without failing the unit.
- name: Create vLLM systemd service
  copy:
    mode: '0644'
    dest: /etc/systemd/system/vllm.service
    content: |
      [Unit]
      Description=vLLM OpenAI-Compatible Inference Server ({{ vllm_model }})
      After=docker.service
      Requires=docker.service
      [Service]
      Restart=always
      RestartSec=10
      ExecStartPre=-/usr/bin/docker stop vllm
      ExecStartPre=-/usr/bin/docker rm vllm
      ExecStart=/usr/bin/docker run \
      --name vllm \
      --gpus all \
      --ipc=host \
      -p 8000:8000 \
      -v /opt/cezen/models:/root/.cache/huggingface \
      -e HF_HOME=/root/.cache/huggingface \
      vllm/vllm-openai:latest \
      --model {{ vllm_model }} \
      --gpu-memory-utilization {{ vllm_gpu_memory_util }} \
      --max-model-len {{ vllm_max_model_len }} \
      --tensor-parallel-size {{ vllm_tensor_parallel }} \
      {{ vllm_quant_flag }}
      ExecStop=/usr/bin/docker stop vllm
      TimeoutStartSec=300
      [Install]
      WantedBy=multi-user.target
# Ensure the on-host cache directory exists, owned by the cezen user and group.
- name: Create vLLM model directory
  file:
    state: directory
    path: /opt/cezen/models/hf_cache
    mode: '0755'
    owner: cezen
    group: cezen
# Write the effective vLLM settings as JSON for the portal to read.
# String values go through `to_json` so that quotes, backslashes or other
# special characters in a model ID or quantization name are escaped and the
# file always stays valid JSON. Numeric settings are emitted as-is.
- name: Write vLLM tier config file (for portal reference)
  copy:
    dest: /opt/cezen/vllm-config.json
    owner: cezen
    group: cezen
    mode: "0644"
    content: |
      {
        "model": {{ vllm_model | to_json }},
        "tensor_parallel_size": {{ vllm_tensor_parallel }},
        "gpu_memory_utilization": {{ vllm_gpu_memory_util }},
        "max_model_len": {{ vllm_max_model_len }},
        "quantization": {{ vllm_quantization | to_json }}
      }
# Enable the vLLM unit at boot and start it now, only on hosts where the GPU
# probe succeeded. daemon_reload makes systemd re-read unit files first.
# `state: started` is required for the "start" half of this task: without it
# the service is only enabled and would not run until the next boot.
- name: Enable and start vLLM service
  systemd:
    name: vllm
    enabled: true
    state: started
    daemon_reload: true
  when: gpu_check.stdout != "" and gpu_check.rc == 0