---
# Ollama — local LLM serving (main inference engine for Entry tier)
#
# Task flow:
#   1. Install the Ollama binary if it is not already present.
#   2. Write our own systemd unit and create the model store directory.
#   3. (Re)start the service and wait for the API port.
#   4. Pick the model list for the requested tier and pull it.
#   5. Start the Open WebUI container that talks to Ollama.
#
# Variables read by this file:
#   tier             - one of starter/basic/entry/pro/max (default: basic)
#   skip_model_pull  - when true, the model pull task is skipped

- name: Check if Ollama is already installed
  stat:
    path: /usr/local/bin/ollama
  register: ollama_binary

# pipefail makes a failed download fail the task instead of feeding an empty
# script to sh (which would exit 0). register/until make retries effective on
# ansible-core releases that ignore `retries` without `until`.
- name: Install Ollama
  shell: |
    set -o pipefail
    curl -fsSL https://ollama.ai/install.sh | sh
  args:
    executable: /bin/bash
  when: not ollama_binary.stat.exists
  register: ollama_install
  until: ollama_install is succeeded
  retries: 3
  delay: 10

# The unit listens on 0.0.0.0 so the Open WebUI container can reach the API
# through the Docker bridge address.
# NOTE(review): this also exposes the API on every host interface — confirm
# that a firewall restricts port 11434.
- name: Create Ollama systemd service with GPU support
  copy:
    dest: /etc/systemd/system/ollama.service
    content: |
      [Unit]
      Description=Ollama Service
      After=network-online.target

      [Service]
      ExecStart=/usr/local/bin/ollama serve
      User=cezen
      Group=cezen
      Restart=always
      RestartSec=3
      Environment="PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
      Environment="OLLAMA_HOST=0.0.0.0:11434"
      Environment="OLLAMA_MODELS=/opt/cezen/models/ollama"
      Environment="CUDA_VISIBLE_DEVICES=0"

      [Install]
      WantedBy=multi-user.target
    mode: "0644"
  register: ollama_unit

# Must exist and be owned by the service user before the service starts,
# because the unit points OLLAMA_MODELS at this path.
- name: Create Ollama models directory
  file:
    path: /opt/cezen/models/ollama
    state: directory
    owner: cezen
    group: cezen
    mode: "0755"

# `state: started` does not restart a service that is already running, so a
# new or changed unit file would otherwise stay unused (the upstream install
# script may already have started its own service). Restart when the unit
# file changed; otherwise just make sure the service is up.
- name: Enable and start Ollama
  systemd:
    name: ollama
    enabled: true
    state: "{{ 'restarted' if ollama_unit is changed else 'started' }}"
    daemon_reload: true

- name: Wait for Ollama API to be ready
  wait_for:
    host: localhost
    port: 11434
    timeout: 60

# Maps the `tier` variable to a list of model tags. An undefined tier is
# treated as 'basic'; a tier name that is not in the map falls back to the
# three-model list passed as the second argument to .get().
- name: Select tier model set
  set_fact:
    ollama_models: >-
      {{
        {
          'starter': ['phi3:mini', 'nomic-embed-text'],
          'basic': ['llama3.1:8b', 'mistral:7b', 'nomic-embed-text', 'codellama:13b'],
          'entry': ['llama3.1:8b', 'mistral:7b', 'nomic-embed-text', 'codellama:13b'],
          'pro': [
            'llama3.1:8b', 'mistral:7b', 'nomic-embed-text', 'codellama:13b',
            'llama3.1:70b', 'mixtral:8x7b', 'deepseek-coder-v2:16b'
          ],
          'max': [
            'llama3.1:8b', 'mistral:7b', 'nomic-embed-text', 'codellama:13b',
            'llama3.1:70b', 'mixtral:8x7b', 'deepseek-coder-v2:16b',
            'llama3.1:405b', 'mixtral:8x22b'
          ]
        }.get(tier | default('basic'), ['llama3.1:8b', 'mistral:7b', 'nomic-embed-text'])
      }}

# Each model is pulled as its own loop item; register/until retry a failed
# pull per item. argv passes the model tag as a single argument.
# NOTE(review): become_user only takes effect when `become` is enabled at
# play/role level — confirm against the calling playbook.
- name: Pull tier Ollama models
  become_user: cezen
  command:
    argv:
      - ollama
      - pull
      - "{{ item }}"
  loop: "{{ ollama_models }}"
  environment:
    OLLAMA_HOST: "http://localhost:11434"
  register: ollama_pull
  until: ollama_pull is succeeded
  retries: 3
  delay: 15
  # NOTE: Pro/Max models are very large. Skip with --skip-model-pull for
  # bandwidth-constrained installs, then run models/pull-models.sh later.
  when: not (skip_model_pull | default(false))

# Open WebUI (chat interface on top of Ollama)
# Starts the existing `open-webui` container if one exists, otherwise creates
# it. The {{ "{{" }} / {{ "}}" }} sequences are Jinja escapes that render the
# literal Go-template braces docker expects in --format.
# NOTE(review): 172.17.0.1 is assumed to be the Docker bridge gateway on the
# target host — confirm. Errors from this task are ignored, so a failed start
# does not stop the play.
- name: Start Open WebUI via Docker CLI
  shell: |
    if docker ps -a --format '{{ "{{" }}.Names{{ "}}" }}' | grep -qx open-webui; then
      docker start open-webui
    else
      docker run -d \
        --name open-webui \
        --restart always \
        -p 3001:8080 \
        --add-host=host-gateway:172.17.0.1 \
        -v open-webui:/app/backend/data \
        -e OLLAMA_BASE_URL=http://host-gateway:11434 \
        ghcr.io/open-webui/open-webui:main
    fi
  args:
    executable: /bin/bash
  register: webui_result
  changed_when: webui_result.rc == 0
  failed_when: webui_result.rc != 0 and 'already in use' not in (webui_result.stderr | default(''))
  ignore_errors: true