---
# vLLM — high-performance LLM inference with OpenAI-compatible API.
# The image pull is skipped automatically if no GPU is present; the systemd
# unit and the model directory are created unconditionally.

# Probe for a GPU. The output is captured first so that nvidia-smi's own exit
# status is what gets registered: piping straight into `head` would report
# head's status (always 0) and make the rc checks below meaningless.
# stdout is still trimmed to the first GPU name only.
- name: Check for NVIDIA GPU
  shell: |
    gpu_names="$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null)" || exit $?
    printf '%s\n' "$gpu_names" | head -n 1
  register: gpu_check
  # A missing or failing nvidia-smi simply means "no GPU"; it is not an error.
  failed_when: false
  changed_when: false

- name: Skip vLLM if no GPU detected
  debug:
    msg: >-
      No GPU detected — skipping vLLM image pull.
      Run manually when GPU is available:
      docker pull vllm/vllm-openai:latest
  when: gpu_check.rc != 0 or gpu_check.stdout == ""

# `until` is required for `retries`/`delay` to take effect on Ansible releases
# that do not retry by default. The task reports "changed" only when Docker
# actually downloaded a newer image.
- name: Pull vLLM Docker image
  command: docker pull vllm/vllm-openai:latest
  register: vllm_pull
  until: vllm_pull is succeeded
  retries: 3
  delay: 15
  changed_when: "'Downloaded newer image' in (vllm_pull.stdout | default(''))"
  when:
    - gpu_check.rc == 0
    - gpu_check.stdout != ""

# Installs the unit file only. The leading "-" on the ExecStartPre lines tells
# systemd to ignore their failure, so a missing container does not block start.
# NOTE(review): no daemon-reload / enable / start happens in this file —
# presumably handled elsewhere; confirm.
- name: Create vLLM systemd service
  copy:
    dest: /etc/systemd/system/vllm.service
    content: |
      [Unit]
      Description=vLLM OpenAI-Compatible Inference Server
      After=docker.service ollama.service
      Requires=docker.service

      [Service]
      Restart=always
      RestartSec=5
      ExecStartPre=-/usr/bin/docker stop vllm
      ExecStartPre=-/usr/bin/docker rm vllm
      ExecStart=/usr/bin/docker run \
        --name vllm \
        --gpus all \
        --ipc=host \
        -p 8000:8000 \
        -v /opt/cezen/models:/root/.cache/huggingface \
        -e HF_HOME=/root/.cache/huggingface \
        vllm/vllm-openai:latest \
        --model meta-llama/Meta-Llama-3.1-8B-Instruct \
        --gpu-memory-utilization 0.7 \
        --max-model-len 8192 \
        --tensor-parallel-size 1
      ExecStop=/usr/bin/docker stop vllm

      [Install]
      WantedBy=multi-user.target
    mode: "0644"

# NOTE(review): the service above bind-mounts /opt/cezen/models, not this
# hf_cache subdirectory — confirm the subdirectory is intentional.
- name: Create vLLM model directory
  file:
    path: /opt/cezen/models/hf_cache
    state: directory
    owner: cezen
    group: cezen