aipackage/ansible/roles/vllm/tasks/main.yml
2026-06-23 14:28:40 +05:30

60 lines
1.7 KiB
YAML

---
# vLLM — high-performance LLM inference with OpenAI-compatible API
# Only the image pull is skipped automatically when no GPU is detected; the
# systemd unit task in this file carries no GPU condition.
# Probe for an NVIDIA GPU by asking nvidia-smi for the name of the first GPU.
# The result is registered as gpu_check and consulted by the tasks that follow.
# The command is a pipeline, so under the default shell (no pipefail) the
# recorded rc is the exit status of `head`; an empty stdout is therefore the
# practical "no GPU / no driver" signal.
- name: Check for NVIDIA GPU
  ansible.builtin.shell:
    cmd: nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1
  register: gpu_check
  changed_when: false  # read-only probe; never report a change
  ignore_errors: true  # a failing probe must not abort the play
# Tell the operator that the image pull is being skipped and how to do it by
# hand later. The condition is the exact negation of the one on the pull task:
# it fires when the probe printed nothing or returned a non-zero status.
- name: Skip vLLM if no GPU detected
  ansible.builtin.debug:
    msg: "No GPU detected — skipping vLLM image pull. Run manually when GPU is available: docker pull vllm/vllm-openai:latest"
  when: not (gpu_check.stdout != "" and gpu_check.rc == 0)
# Pull the vLLM image, only on hosts where the GPU probe printed a GPU name
# and returned rc 0.
#
# - `until` is set explicitly: on ansible-core releases before 2.16, `retries`
#   and `delay` are not applied unless an `until` condition is present, so the
#   pull would otherwise be attempted only once.
# - `command` is used instead of `shell` because the command line needs no
#   shell features (no pipes, redirects, or expansion).
# - `changed_when` reports a change only when Docker actually downloaded a
#   newer image, based on the "Status: Downloaded newer image ..." line that
#   `docker pull` prints. `default('')` keeps the expression valid if an
#   attempt fails before producing any stdout.
- name: Pull vLLM Docker image
  ansible.builtin.command:
    cmd: docker pull vllm/vllm-openai:latest
  register: vllm_pull
  retries: 3
  delay: 15
  until: vllm_pull is succeeded
  changed_when: "'Downloaded newer image' in (vllm_pull.stdout | default(''))"
  when: gpu_check.stdout != "" and gpu_check.rc == 0
# Install the systemd unit that runs vLLM as a Docker container named "vllm".
# What the unit does, as written below:
#   - orders itself after docker.service and ollama.service, and requires
#     docker.service;
#   - stops and removes any leftover "vllm" container before starting (the
#     leading "-" on ExecStartPre tells systemd to ignore a failure there);
#   - publishes port 8000 and bind-mounts /opt/cezen/models as the Hugging Face
#     cache directory inside the container;
#   - restarts the service 5 seconds after any exit.
# This task carries no GPU condition, so the unit file is written on every host.
- name: Create vLLM systemd service
  ansible.builtin.copy:
    dest: /etc/systemd/system/vllm.service
    mode: "0644"
    content: |
      [Unit]
      Description=vLLM OpenAI-Compatible Inference Server
      After=docker.service ollama.service
      Requires=docker.service
      [Service]
      Restart=always
      RestartSec=5
      ExecStartPre=-/usr/bin/docker stop vllm
      ExecStartPre=-/usr/bin/docker rm vllm
      ExecStart=/usr/bin/docker run \
      --name vllm \
      --gpus all \
      --ipc=host \
      -p 8000:8000 \
      -v /opt/cezen/models:/root/.cache/huggingface \
      -e HF_HOME=/root/.cache/huggingface \
      vllm/vllm-openai:latest \
      --model meta-llama/Meta-Llama-3.1-8B-Instruct \
      --gpu-memory-utilization 0.7 \
      --max-model-len 8192 \
      --tensor-parallel-size 1
      ExecStop=/usr/bin/docker stop vllm
      [Install]
      WantedBy=multi-user.target
# Ensure the directory /opt/cezen/models/hf_cache exists, owned by cezen:cezen.
# NOTE(review): the vllm.service unit written by this file bind-mounts
# /opt/cezen/models (the parent of this path) as the container's Hugging Face
# cache — confirm that hf_cache is the intended subdirectory.
# NOTE(review): no `mode` is visible in this portion of the file — confirm
# whether one follows or whether the default is intended.
- name: Create vLLM model directory
file:
path: /opt/cezen/models/hf_cache
state: directory
owner: cezen
group: cezen