---
# vLLM — high-performance LLM inference with an OpenAI-compatible API.
# Entry tier: runs as a Docker container (easier to manage than pip install).
#
# This task list:
#   1. pulls the vllm/vllm-openai image,
#   2. installs a systemd unit that runs the container,
#   3. enables (but does not start) the unit,
#   4. creates a model cache directory on the host.

# `until` is required for `retries`/`delay` to take effect on older Ansible
# releases; without it the task would run only once. `command` with `argv`
# is used because no shell features are needed.
# NOTE(review): the `latest` tag is not reproducible — consider pinning a
# specific image version.
- name: Pull vLLM Docker image
  command:
    argv:
      - docker
      - pull
      - vllm/vllm-openai:latest
  register: vllm_image_pull
  until: vllm_image_pull is succeeded
  retries: 3
  delay: 15
  # Report "changed" only when Docker actually fetched a newer image.
  changed_when: "'Downloaded newer image' in (vllm_image_pull.stdout | default(''))"

# The unit removes any leftover container named `vllm` before each start
# (the leading `-` on ExecStartPre tells systemd to ignore failures, e.g.
# when no such container exists), then runs the server in the foreground so
# systemd supervises the `docker run` process directly.
# The host directory /opt/cezen/models is mounted as the Hugging Face cache
# inside the container, and the API is published on host port 8000.
# NOTE(review): `--gpus all` presumably requires the NVIDIA container
# runtime on the host — confirm it is installed by another role.
- name: Create vLLM systemd service
  copy:
    dest: /etc/systemd/system/vllm.service
    content: |
      [Unit]
      Description=vLLM OpenAI-Compatible Inference Server
      After=docker.service ollama.service
      Requires=docker.service

      [Service]
      Restart=always
      RestartSec=5
      ExecStartPre=-/usr/bin/docker stop vllm
      ExecStartPre=-/usr/bin/docker rm vllm
      ExecStart=/usr/bin/docker run \
        --name vllm \
        --gpus all \
        --ipc=host \
        -p 8000:8000 \
        -v /opt/cezen/models:/root/.cache/huggingface \
        -e HF_HOME=/root/.cache/huggingface \
        vllm/vllm-openai:latest \
        --model meta-llama/Meta-Llama-3.1-8B-Instruct \
        --gpu-memory-utilization 0.7 \
        --max-model-len 8192 \
        --tensor-parallel-size 1
      ExecStop=/usr/bin/docker stop vllm

      [Install]
      WantedBy=multi-user.target
    mode: "0644"

# No `state:` is given, so the unit is not started by this task;
# `daemon_reload` makes systemd pick up the unit file written above.
# Because the unit is enabled, it will be started at the next boot.
- name: Enable vLLM (but don't start yet — model selection needed first)
  systemd:
    name: vllm
    enabled: true
    daemon_reload: true

# Note: the vLLM service is enabled but not started by this task list.
# Start it manually after choosing a model:
#   sudo systemctl start vllm
# Or change the --model flag in /etc/systemd/system/vllm.service first
# (then run `sudo systemctl daemon-reload` before starting).

# Lives under /opt/cezen/models, the host path the container mounts as its
# Hugging Face cache. An explicit mode keeps permissions independent of the
# umask in effect when the play runs.
- name: Create vLLM model directory
  file:
    path: /opt/cezen/models/hf_cache
    state: directory
    owner: cezen
    group: cezen
    mode: "0755"