60 lines
1.7 KiB
YAML
60 lines
1.7 KiB
YAML
---
|
|
# vLLM — high-performance LLM inference with OpenAI-compatible API
|
|
# The Docker image pull is skipped automatically if no GPU is detected.
|
|
|
|
# Probe for an NVIDIA GPU and record the first GPU name in gpu_check.stdout.
#
# pipefail makes gpu_check.rc reflect nvidia-smi's own exit status. Without
# it the pipeline reports the status of its last command, so a missing or
# failing nvidia-smi would still produce rc == 0 and the rc checks in the
# later tasks could never fire.
#
# sed reads its whole input (unlike head), so nvidia-smi cannot be killed by
# SIGPIPE on multi-GPU hosts now that pipefail is on; the captured stdout is
# still only the first line.
- name: Check for NVIDIA GPU
  shell: |
    set -o pipefail
    nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | sed -n 1p
  args:
    executable: /bin/bash  # pipefail is not supported by every /bin/sh
  register: gpu_check
  # A host without a GPU is an expected outcome, not an error: never mark the
  # probe as failed and let later tasks decide from gpu_check.rc / stdout.
  failed_when: false
  # Read-only probe; it never changes the host.
  changed_when: false
|
|
|
|
# Informational only: tell the operator why the image pull is being skipped
# and how to fetch the image by hand later. Runs exactly when the GPU probe
# did not yield a usable result (empty output or non-zero exit status).
- name: Skip vLLM if no GPU detected
  when: not (gpu_check.stdout != "" and gpu_check.rc == 0)
  debug:
    msg: "No GPU detected — skipping vLLM image pull. Run manually when GPU is available: docker pull vllm/vllm-openai:latest"
|
|
|
|
# Pull the vLLM image, but only on hosts where the GPU probe succeeded.
#
# `retries`/`delay` only take effect together with `until` on ansible-core
# releases before 2.16, so the result is registered and the task is retried
# until the pull succeeds. `command` is used instead of `shell` because no
# shell features are needed.
- name: Pull vLLM Docker image
  command: docker pull vllm/vllm-openai:latest
  register: vllm_pull
  until: vllm_pull is succeeded
  retries: 3
  delay: 15
  # docker prints "Status: Downloaded newer image for ..." only when layers
  # were actually fetched; an up-to-date image is reported as unchanged.
  changed_when: "'Downloaded newer image' in (vllm_pull.stdout | default(''))"
  when: gpu_check.stdout != "" and gpu_check.rc == 0
|
|
|
|
# Install the systemd unit that runs vLLM as a Docker container named "vllm".
# The unit removes any stale container before starting, publishes port 8000,
# and is restarted by systemd whenever it exits.
#
# NOTE(review): the bind mount exposes /opt/cezen/models itself as the
# Hugging Face cache, while another task in this file creates
# /opt/cezen/models/hf_cache — confirm which directory is meant to hold it.
- name: Create vLLM systemd service
  copy:
    dest: /etc/systemd/system/vllm.service
    mode: "0644"
    content: |
      [Unit]
      Description=vLLM OpenAI-Compatible Inference Server
      After=docker.service ollama.service
      Requires=docker.service

      [Service]
      Restart=always
      RestartSec=5
      ExecStartPre=-/usr/bin/docker stop vllm
      ExecStartPre=-/usr/bin/docker rm vllm
      ExecStart=/usr/bin/docker run \
        --name vllm \
        --gpus all \
        --ipc=host \
        -p 8000:8000 \
        -v /opt/cezen/models:/root/.cache/huggingface \
        -e HF_HOME=/root/.cache/huggingface \
        vllm/vllm-openai:latest \
        --model meta-llama/Meta-Llama-3.1-8B-Instruct \
        --gpu-memory-utilization 0.7 \
        --max-model-len 8192 \
        --tensor-parallel-size 1
      ExecStop=/usr/bin/docker stop vllm

      [Install]
      WantedBy=multi-user.target
|
|
|
|
- name: Create vLLM model directory
|
|
file:
|
|
path: /opt/cezen/models/hf_cache
|
|
state: directory
|
|
owner: cezen
|
|
group: cezen
|