Initial Cezen AI Suite installer — Entry tier
This commit is contained in:
commit
a071602cf1
77
README.md
Normal file
77
README.md
Normal file
@ -0,0 +1,77 @@
|
|||||||
|
# Cezen AI Suite — Installer
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git clone <cgit-url>
|
||||||
|
cd cgit
|
||||||
|
sudo bash install.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
The server reboots automatically after the NVIDIA drivers are installed. Phase 2 then runs on its own after the reboot.
|
||||||
|
|
||||||
|
## What Gets Installed (Entry Tier)
|
||||||
|
|
||||||
|
| Service | Port | Notes |
|
||||||
|
|---|---|---|
|
||||||
|
| Ollama | 11434 | LLM inference, 2 models pre-loaded |
|
||||||
|
| Open WebUI | 3001 | Chat interface |
|
||||||
|
| vLLM | 8000 | OpenAI-compatible API (start manually) |
|
||||||
|
| JupyterLab | 8888 | Token: `cezen2024` |
|
||||||
|
| ChromaDB | 8100 | Vector DB for RAG |
|
||||||
|
| MLflow | 5000 | Experiment tracking |
|
||||||
|
| MinIO | 9001 | Object storage web console; S3 API is on port 9000 (user: cezenadmin / Cezen@2024!) |
|
||||||
|
| Grafana | 3000 | GPU + system monitoring (admin / cezen2024) |
|
||||||
|
|
||||||
|
## Testing Without a GPU (Multipass)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# On your MacBook:
|
||||||
|
multipass launch 22.04 --name cezen-test --cpus 4 --mem 8G --disk 40G
|
||||||
|
multipass shell cezen-test
|
||||||
|
|
||||||
|
# Inside the VM:
|
||||||
|
git clone <cgit-url>
cd cgit
|
||||||
|
sudo bash install.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
NVIDIA driver install will succeed but `nvidia-smi` won't show GPUs — that's expected. All other services will run fine.
|
||||||
|
|
||||||
|
## Pull More Models
|
||||||
|
|
||||||
|
```bash
|
||||||
|
bash models/pull-models.sh --tier=entry
|
||||||
|
```
|
||||||
|
|
||||||
|
## File Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
cgit/
|
||||||
|
├── install.sh ← Entry point
|
||||||
|
├── ansible/
|
||||||
|
│ ├── phase1_nvidia.yml ← Phase 1: drivers (triggers reboot)
|
||||||
|
│ ├── entry.yml ← Phase 2: full stack
|
||||||
|
│ └── roles/
|
||||||
|
│ ├── base/ ← OS, Python, Miniconda, LangChain
|
||||||
|
│ ├── nvidia/ ← Drivers, CUDA 12.4, cuDNN 9
|
||||||
|
│ ├── docker/ ← Docker CE + NVIDIA Container Toolkit
|
||||||
|
│ ├── k3s/ ← Lightweight Kubernetes
|
||||||
|
│ ├── ollama/ ← Ollama + Open WebUI
|
||||||
|
│ ├── vllm/ ← vLLM inference server
|
||||||
|
│ ├── jupyterlab/ ← JupyterLab notebooks
|
||||||
|
│ ├── chromadb/ ← Vector database
|
||||||
|
│ ├── mlflow/ ← Experiment tracking
|
||||||
|
│ ├── minio/ ← Object storage
|
||||||
|
│ └── monitoring/ ← Grafana + Prometheus + DCGM
|
||||||
|
└── models/
|
||||||
|
└── pull-models.sh ← Pull additional models
|
||||||
|
```
|
||||||
|
|
||||||
|
## Change Default Passwords
|
||||||
|
|
||||||
|
Before shipping to a customer, update these:
|
||||||
|
|
||||||
|
- JupyterLab token: `/opt/cezen/.jupyter/jupyter_lab_config.py`
|
||||||
|
- MinIO: `/etc/default/minio`
|
||||||
|
- Grafana: environment vars in monitoring role, or via UI after first login
|
||||||
|
- MLflow: no auth by default (add reverse proxy if needed)
|
||||||
BIN
ansible/.DS_Store
vendored
Normal file
BIN
ansible/.DS_Store
vendored
Normal file
Binary file not shown.
23
ansible/entry.yml
Normal file
23
ansible/entry.yml
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
---
# Phase 2 playbook: deploys the complete Cezen AI Suite (Entry tier) on the
# local machine. Meant to run after the reboot that follows the NVIDIA driver
# installation.
- name: Cezen AI — Entry Tier Stack
  hosts: localhost
  connection: local
  become: true

  # Version numbers stay quoted so YAML keeps them as strings, not floats.
  vars:
    cezen_user: "cezen"
    cezen_home: "/opt/cezen"
    python_version: "3.11"
    cuda_version: "12.4"

  # Roles are applied in the order listed.
  roles:
    - role: docker
    - role: k3s
    - role: ollama
    - role: vllm
    - role: jupyterlab
    - role: chromadb
    - role: mlflow
    - role: minio
    - role: monitoring
|
||||||
9
ansible/phase1_nvidia.yml
Normal file
9
ansible/phase1_nvidia.yml
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
---
# Phase 1 playbook: base OS preparation plus NVIDIA drivers only.
# The server is rebooted once this playbook has finished.
- name: Cezen AI — Phase 1 NVIDIA Drivers
  hosts: localhost
  connection: local
  become: true

  # Roles are applied in the order listed.
  roles:
    - role: base
    - role: nvidia
|
||||||
91
ansible/roles/base/tasks/main.yml
Normal file
91
ansible/roles/base/tasks/main.yml
Normal file
@ -0,0 +1,91 @@
|
|||||||
|
---
# Base role: OS updates, essential packages, the cezen service account, and a
# Miniconda-based Python 3.11 environment with the core LLM libraries.
- name: Update apt cache
  apt:
    update_cache: yes
    cache_valid_time: 3600

- name: Upgrade all packages
  apt:
    upgrade: dist
    autoremove: yes

- name: Install essential system packages
  apt:
    name:
      - curl
      - wget
      - git
      - build-essential
      - ca-certificates
      - gnupg
      - lsb-release
      - software-properties-common
      - unzip
      - htop
      - net-tools
      - jq
      - python3-pip
      - python3-venv
    state: present

- name: Create cezen user
  user:
    name: cezen
    shell: /bin/bash
    home: /opt/cezen
    create_home: yes
    groups: sudo
    append: yes

- name: Create cezen directories
  file:
    path: "{{ item }}"
    state: directory
    owner: cezen
    group: cezen
    mode: "0755"
  loop:
    - /opt/cezen
    - /opt/cezen/models
    - /opt/cezen/data
    - /opt/cezen/logs

# `until` is required for `retries`/`delay` to take effect on Ansible releases
# that ignore retries when no condition is given.
- name: Download Miniconda
  get_url:
    url: https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
    dest: /tmp/miniconda.sh
    mode: "0755"
  register: miniconda_download
  until: miniconda_download is succeeded
  retries: 3
  delay: 10

# `creates` makes the installer a no-op once conda is already in place.
- name: Install Miniconda
  become_user: cezen
  command: bash /tmp/miniconda.sh -b -p /opt/cezen/miniconda
  args:
    creates: /opt/cezen/miniconda/bin/conda

- name: Add conda to cezen PATH
  lineinfile:
    path: /opt/cezen/.bashrc
    line: 'export PATH="/opt/cezen/miniconda/bin:$PATH"'
    create: yes
    owner: cezen

- name: Create cezen conda environment (Python 3.11)
  become_user: cezen
  command: /opt/cezen/miniconda/bin/conda create -n cezen python=3.11 -y
  args:
    creates: /opt/cezen/miniconda/envs/cezen

# 'uvicorn[standard]' is quoted so the shell never treats the brackets as a
# glob pattern.
- name: Install LangChain + LlamaIndex + HuggingFace in conda env
  become_user: cezen
  shell: |
    /opt/cezen/miniconda/bin/conda run -n cezen pip install \
      langchain langchain-community llama-index \
      transformers huggingface-hub \
      peft bitsandbytes accelerate \
      fastapi 'uvicorn[standard]' \
      sentence-transformers
  register: base_pip_install
  until: base_pip_install is succeeded
  retries: 3
  delay: 15
|
||||||
53
ansible/roles/chromadb/tasks/main.yml
Normal file
53
ansible/roles/chromadb/tasks/main.yml
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
---
# ChromaDB — vector database for RAG pipelines.
# Installed into the cezen conda env and run as a systemd service on port 8100.

# `until` is required for `retries`/`delay` to take effect on Ansible releases
# that ignore retries when no condition is given.
- name: Install ChromaDB in cezen conda env
  become_user: cezen
  shell: |
    /opt/cezen/miniconda/bin/conda run -n cezen pip install chromadb
  register: chromadb_pip_install
  until: chromadb_pip_install is succeeded
  retries: 3
  delay: 10

- name: Create ChromaDB data directory
  file:
    path: /opt/cezen/data/chromadb
    state: directory
    owner: cezen
    group: cezen

- name: Create ChromaDB systemd service
  copy:
    dest: /etc/systemd/system/chromadb.service
    content: |
      [Unit]
      Description=ChromaDB Vector Database
      After=network.target

      [Service]
      Type=simple
      User=cezen
      Group=cezen
      WorkingDirectory=/opt/cezen/data/chromadb
      ExecStart=/opt/cezen/miniconda/envs/cezen/bin/chroma run \
        --host 0.0.0.0 \
        --port 8100 \
        --path /opt/cezen/data/chromadb
      Restart=always
      RestartSec=5
      Environment="PATH=/opt/cezen/miniconda/envs/cezen/bin:/usr/local/bin:/usr/bin:/bin"

      [Install]
      WantedBy=multi-user.target
    mode: "0644"

- name: Enable and start ChromaDB
  systemd:
    name: chromadb
    enabled: yes
    state: started
    daemon_reload: yes

- name: Wait for ChromaDB to be ready
  wait_for:
    host: localhost
    port: 8100
    timeout: 30
|
||||||
83
ansible/roles/docker/tasks/main.yml
Normal file
83
ansible/roles/docker/tasks/main.yml
Normal file
@ -0,0 +1,83 @@
|
|||||||
|
---
# Docker CE + NVIDIA Container Toolkit
#
# A role's tasks file must be a plain list of tasks, so it cannot carry a
# `handlers:` section. The Docker restart that used to be a notified handler
# is therefore an explicit, conditional task at the end of this file.
- name: Add Docker GPG key
  apt_key:
    url: https://download.docker.com/linux/ubuntu/gpg
    state: present

- name: Add Docker apt repository
  apt_repository:
    repo: "deb [arch=amd64] https://download.docker.com/linux/ubuntu {{ ansible_distribution_release }} stable"
    state: present
    filename: docker

- name: Install Docker CE
  apt:
    name:
      - docker-ce
      - docker-ce-cli
      - containerd.io
      - docker-buildx-plugin
      - docker-compose-plugin
    state: present
    update_cache: yes

- name: Add cezen user to docker group
  user:
    name: cezen
    groups: docker
    append: yes

- name: Enable and start Docker
  systemd:
    name: docker
    enabled: yes
    state: started

# NVIDIA Container Toolkit (allows GPU passthrough into containers)
# The repo list is built in a temp file and installed only when every stage of
# the pipeline succeeded; otherwise a failed download would leave a truncated
# .list file that `creates` then treats as "already done".
- name: Add NVIDIA Container Toolkit repo
  shell: |
    set -euo pipefail
    list_tmp="$(mktemp)"
    trap 'rm -f "$list_tmp"' EXIT
    curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \
      gpg --batch --yes --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
    curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
      sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
      > "$list_tmp"
    install -m 0644 "$list_tmp" /etc/apt/sources.list.d/nvidia-container-toolkit.list
  args:
    executable: /bin/bash
    creates: /etc/apt/sources.list.d/nvidia-container-toolkit.list

- name: Install NVIDIA Container Toolkit
  apt:
    name: nvidia-container-toolkit
    state: present
    update_cache: yes

- name: Configure Docker to use NVIDIA runtime
  shell: nvidia-ctk runtime configure --runtime=docker
  register: nvidia_ctk_config

# Written after nvidia-ctk has run, so this file content is what Docker ends
# up using.
- name: Set NVIDIA as default Docker runtime
  copy:
    dest: /etc/docker/daemon.json
    content: |
      {
        "default-runtime": "nvidia",
        "runtimes": {
          "nvidia": {
            "path": "nvidia-container-runtime",
            "runtimeArgs": []
          }
        },
        "log-driver": "json-file",
        "log-opts": {
          "max-size": "100m",
          "max-file": "3"
        }
      }
    mode: "0644"
  register: docker_daemon_json

# Restart right away so the roles that follow start their containers against
# the NVIDIA-enabled daemon configuration.
- name: Restart Docker to apply runtime configuration
  systemd:
    name: docker
    state: restarted
  when: nvidia_ctk_config is changed or docker_daemon_json is changed
|
||||||
92
ansible/roles/jupyterlab/tasks/main.yml
Normal file
92
ansible/roles/jupyterlab/tasks/main.yml
Normal file
@ -0,0 +1,92 @@
|
|||||||
|
---
# JupyterLab — notebook interface for AI/ML development.
# Installed into the cezen conda env and run as a systemd service on port 8888.

# `until` is required for `retries`/`delay` to take effect on Ansible releases
# that ignore retries when no condition is given.
- name: Install JupyterLab in cezen conda env
  become_user: cezen
  shell: |
    /opt/cezen/miniconda/bin/conda run -n cezen pip install \
      jupyterlab \
      ipywidgets \
      ipykernel \
      notebook \
      nbconvert
  register: jupyterlab_pip_install
  until: jupyterlab_pip_install is succeeded
  retries: 3
  delay: 10

- name: Create JupyterLab config directory
  file:
    path: /opt/cezen/.jupyter
    state: directory
    owner: cezen
    group: cezen

# Skipped once the config file exists, so later edits are not overwritten.
- name: Generate JupyterLab config
  become_user: cezen
  shell: |
    /opt/cezen/miniconda/envs/cezen/bin/jupyter lab --generate-config
  args:
    creates: /opt/cezen/.jupyter/jupyter_lab_config.py

# Each setting is added as its own line in the generated config.
# The token is a shipped default and must be changed before customer delivery.
- name: Configure JupyterLab (no browser, allow all IPs, set base dir)
  lineinfile:
    path: /opt/cezen/.jupyter/jupyter_lab_config.py
    line: "{{ item }}"
    create: yes
    owner: cezen
  loop:
    - "c.ServerApp.ip = '0.0.0.0'"
    - "c.ServerApp.port = 8888"
    - "c.ServerApp.open_browser = False"
    - "c.ServerApp.notebook_dir = '/opt/cezen/notebooks'"
    - "c.ServerApp.token = 'cezen2024'"
    - "c.ServerApp.allow_root = False"

- name: Create notebooks directory
  file:
    path: /opt/cezen/notebooks
    state: directory
    owner: cezen
    group: cezen

- name: Create sample notebook placeholder
  copy:
    dest: /opt/cezen/notebooks/README.md
    content: |
      # Cezen AI Suite — JupyterLab

      Default token: `cezen2024`

      Change this in: `/opt/cezen/.jupyter/jupyter_lab_config.py`
      Then restart: `sudo systemctl restart jupyterlab`
    owner: cezen
    group: cezen

- name: Create JupyterLab systemd service
  copy:
    dest: /etc/systemd/system/jupyterlab.service
    content: |
      [Unit]
      Description=JupyterLab Server
      After=network.target

      [Service]
      Type=simple
      User=cezen
      Group=cezen
      WorkingDirectory=/opt/cezen/notebooks
      ExecStart=/opt/cezen/miniconda/envs/cezen/bin/jupyter lab \
        --config=/opt/cezen/.jupyter/jupyter_lab_config.py
      Restart=always
      RestartSec=5
      Environment="PATH=/opt/cezen/miniconda/envs/cezen/bin:/usr/local/cuda/bin:/usr/local/bin:/usr/bin:/bin"
      Environment="CUDA_HOME=/usr/local/cuda"

      [Install]
      WantedBy=multi-user.target
    mode: "0644"

- name: Enable and start JupyterLab
  systemd:
    name: jupyterlab
    enabled: yes
    state: started
    daemon_reload: yes
|
||||||
62
ansible/roles/k3s/tasks/main.yml
Normal file
62
ansible/roles/k3s/tasks/main.yml
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
---
# K3s — lightweight Kubernetes for single-node AI workloads
- name: Check if K3s is already installed
  stat:
    path: /usr/local/bin/k3s
  register: k3s_binary

# Runs only when the k3s binary is absent. `until` is required for
# `retries`/`delay` to take effect on Ansible releases that ignore retries
# when no condition is given.
- name: Install K3s
  shell: |
    curl -sfL https://get.k3s.io | \
      INSTALL_K3S_EXEC="--disable traefik --disable servicelb" sh -
  when: not k3s_binary.stat.exists
  register: k3s_install
  until: k3s_install is succeeded
  retries: 3
  delay: 10

# Blocks until the kubeconfig file exists.
- name: Wait for K3s to be ready
  wait_for:
    path: /etc/rancher/k3s/k3s.yaml
    timeout: 120

- name: Enable and start K3s
  systemd:
    name: k3s
    enabled: yes
    state: started

# The directory must exist before the kubeconfig is copied into it; the copy
# task does not create missing parent directories.
- name: Create .kube directory for cezen
  file:
    path: /opt/cezen/.kube
    state: directory
    owner: cezen
    group: cezen

- name: Copy kubeconfig for cezen user
  copy:
    src: /etc/rancher/k3s/k3s.yaml
    dest: /opt/cezen/.kube/config
    owner: cezen
    group: cezen
    mode: "0600"
    remote_src: yes

- name: Set KUBECONFIG in cezen .bashrc
  lineinfile:
    path: /opt/cezen/.bashrc
    line: 'export KUBECONFIG=/opt/cezen/.kube/config'
    create: yes
    owner: cezen

- name: Install kubectl alias for cezen
  lineinfile:
    path: /opt/cezen/.bashrc
    line: "alias kubectl='k3s kubectl'"
    create: yes
    owner: cezen

# Polls until `k3s kubectl get nodes` exits 0 (up to 5 attempts, 10 s apart).
- name: Verify K3s node is ready
  command: k3s kubectl get nodes
  register: k3s_nodes
  retries: 5
  delay: 10
  until: k3s_nodes.rc == 0
|
||||||
87
ansible/roles/minio/tasks/main.yml
Normal file
87
ansible/roles/minio/tasks/main.yml
Normal file
@ -0,0 +1,87 @@
|
|||||||
|
---
# MinIO — S3-compatible object storage for model artifacts and datasets.
# The web console listens on :9001 (MINIO_OPTS); the mc client below talks to
# the S3 API on :9000.

# `until` is required for `retries`/`delay` to take effect on Ansible releases
# that ignore retries when no condition is given.
- name: Download MinIO server binary
  get_url:
    url: https://dl.min.io/server/minio/release/linux-amd64/minio
    dest: /usr/local/bin/minio
    mode: "0755"
  register: minio_server_download
  until: minio_server_download is succeeded
  retries: 3
  delay: 10

- name: Download MinIO client (mc)
  get_url:
    url: https://dl.min.io/client/mc/release/linux-amd64/mc
    dest: /usr/local/bin/mc
    mode: "0755"
  register: minio_client_download
  until: minio_client_download is succeeded
  retries: 3
  delay: 10

- name: Create MinIO data directories
  file:
    path: "{{ item }}"
    state: directory
    owner: cezen
    group: cezen
    mode: "0750"
  loop:
    - /opt/cezen/data/minio
    - /opt/cezen/data/minio/models
    - /opt/cezen/data/minio/datasets

# Shipped default credentials; change them before customer delivery.
- name: Create MinIO environment file
  copy:
    dest: /etc/default/minio
    content: |
      MINIO_ROOT_USER=cezenadmin
      MINIO_ROOT_PASSWORD=Cezen@2024!
      MINIO_VOLUMES="/opt/cezen/data/minio"
      MINIO_OPTS="--console-address :9001"
    mode: "0640"
    owner: cezen
    group: cezen

- name: Create MinIO systemd service
  copy:
    dest: /etc/systemd/system/minio.service
    content: |
      [Unit]
      Description=MinIO Object Storage
      Documentation=https://docs.min.io
      Wants=network-online.target
      After=network-online.target

      [Service]
      User=cezen
      Group=cezen
      EnvironmentFile=/etc/default/minio
      ExecStartPre=/bin/bash -c "if [ -z \"${MINIO_VOLUMES}\" ]; then echo 'Variable MINIO_VOLUMES not set'; exit 1; fi"
      ExecStart=/usr/local/bin/minio server ${MINIO_VOLUMES} ${MINIO_OPTS}
      Restart=always
      RestartSec=5
      LimitNOFILE=65536

      [Install]
      WantedBy=multi-user.target
    mode: "0644"

- name: Enable and start MinIO
  systemd:
    name: minio
    enabled: yes
    state: started
    daemon_reload: yes

- name: Wait for MinIO to be ready
  wait_for:
    host: localhost
    port: 9001
    timeout: 30

# `set -e` makes a failing alias/bucket command fail the task instead of being
# hidden by the exit status of the last command. Retrying also covers the case
# where the S3 API on :9000 is not accepting connections yet.
- name: Configure mc client with local MinIO
  become_user: cezen
  shell: |
    set -e
    mc alias set local http://localhost:9000 cezenadmin 'Cezen@2024!'
    mc mb local/models --ignore-existing
    mc mb local/datasets --ignore-existing
  register: minio_mc_setup
  until: minio_mc_setup is succeeded
  retries: 3
  delay: 5
|
||||||
56
ansible/roles/mlflow/tasks/main.yml
Normal file
56
ansible/roles/mlflow/tasks/main.yml
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
---
# MLflow — experiment tracking and model registry.
# Installed into the cezen conda env and run as a systemd service on port 5000.

# `until` is required for `retries`/`delay` to take effect on Ansible releases
# that ignore retries when no condition is given.
- name: Install MLflow in cezen conda env
  become_user: cezen
  shell: |
    /opt/cezen/miniconda/bin/conda run -n cezen pip install mlflow boto3
  register: mlflow_pip_install
  until: mlflow_pip_install is succeeded
  retries: 3
  delay: 10

- name: Create MLflow directories
  file:
    path: "{{ item }}"
    state: directory
    owner: cezen
    group: cezen
  loop:
    - /opt/cezen/data/mlflow
    - /opt/cezen/data/mlflow/artifacts

# The backend store URI uses four slashes (sqlite:////...), which is the
# absolute-path form. With three slashes the path is relative to the service's
# working directory and only resolved correctly because that happened to be /.
- name: Create MLflow systemd service
  copy:
    dest: /etc/systemd/system/mlflow.service
    content: |
      [Unit]
      Description=MLflow Tracking Server
      After=network.target minio.service

      [Service]
      Type=simple
      User=cezen
      Group=cezen
      ExecStart=/opt/cezen/miniconda/envs/cezen/bin/mlflow server \
        --host 0.0.0.0 \
        --port 5000 \
        --backend-store-uri sqlite:////opt/cezen/data/mlflow/mlflow.db \
        --default-artifact-root /opt/cezen/data/mlflow/artifacts
      Restart=always
      RestartSec=5
      Environment="PATH=/opt/cezen/miniconda/envs/cezen/bin:/usr/local/bin:/usr/bin:/bin"

      [Install]
      WantedBy=multi-user.target
    mode: "0644"

- name: Enable and start MLflow
  systemd:
    name: mlflow
    enabled: yes
    state: started
    daemon_reload: yes

- name: Wait for MLflow to be ready
  wait_for:
    host: localhost
    port: 5000
    timeout: 30
|
||||||
145
ansible/roles/monitoring/tasks/main.yml
Normal file
145
ansible/roles/monitoring/tasks/main.yml
Normal file
@ -0,0 +1,145 @@
|
|||||||
|
---
# Monitoring: Prometheus + Grafana + DCGM Exporter (GPU metrics)
# ignore_errors: true on most tasks — monitoring is optional and should never block the install
#
# Every `docker run` task tolerates Docker's "already in use" name conflict so
# re-runs do not fail, and reports "changed" only when a container was
# actually created (rc == 0).

- name: Create monitoring directories
  file:
    path: "{{ item }}"
    state: directory
    owner: cezen
    group: cezen
  loop:
    - /opt/cezen/monitoring
    - /opt/cezen/monitoring/prometheus
    - /opt/cezen/monitoring/grafana

# ── DCGM Exporter (GPU metrics for Prometheus) ──────────
- name: Start DCGM Exporter container
  shell: |
    docker run -d \
      --name dcgm-exporter \
      --restart always \
      --gpus all \
      -p 9400:9400 \
      nvcr.io/nvidia/k8s/dcgm-exporter:3.3.0-3.2.0-ubuntu22.04
  register: dcgm_result
  changed_when: dcgm_result.rc == 0
  failed_when: dcgm_result.rc != 0 and 'already in use' not in dcgm_result.stderr
  ignore_errors: true

# ── Prometheus ──────────────────────────────────────────
# `host-gateway` is a hostname injected into the Prometheus container via
# --add-host below; it is how the container reaches exporters on the host.
- name: Write Prometheus config
  copy:
    dest: /opt/cezen/monitoring/prometheus/prometheus.yml
    owner: cezen
    group: cezen
    content: |
      global:
        scrape_interval: 15s
        evaluation_interval: 15s

      scrape_configs:
        - job_name: 'prometheus'
          static_configs:
            - targets: ['localhost:9090']

        - job_name: 'dcgm'
          static_configs:
            - targets: ['host-gateway:9400']

        - job_name: 'node'
          static_configs:
            - targets: ['host-gateway:9100']

- name: Start Prometheus container
  shell: |
    docker run -d \
      --name prometheus \
      --restart always \
      --add-host=host-gateway:172.17.0.1 \
      -p 9090:9090 \
      -v /opt/cezen/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml \
      prom/prometheus:latest
  register: prom_result
  changed_when: prom_result.rc == 0
  failed_when: prom_result.rc != 0 and 'already in use' not in prom_result.stderr
  ignore_errors: true

# ── Node Exporter (CPU/RAM/disk metrics) ───────────────
- name: Start Node Exporter container
  shell: |
    docker run -d \
      --name node-exporter \
      --restart always \
      --network=host \
      --pid=host \
      -v /:/host:ro,rslave \
      prom/node-exporter:latest \
      --path.rootfs=/host
  register: node_exp_result
  changed_when: node_exp_result.rc == 0
  failed_when: node_exp_result.rc != 0 and 'already in use' not in node_exp_result.stderr
  ignore_errors: true

# ── Grafana ─────────────────────────────────────────────
# Shipped default admin credentials; change them before customer delivery.
- name: Start Grafana container
  shell: |
    docker run -d \
      --name grafana \
      --restart always \
      -p 3000:3000 \
      --add-host=host-gateway:172.17.0.1 \
      -v grafana-storage:/var/lib/grafana \
      -e GF_SECURITY_ADMIN_USER=admin \
      -e GF_SECURITY_ADMIN_PASSWORD=cezen2024 \
      -e GF_USERS_ALLOW_SIGN_UP=false \
      grafana/grafana:latest
  register: grafana_result
  changed_when: grafana_result.rc == 0
  failed_when: grafana_result.rc != 0 and 'already in use' not in grafana_result.stderr
  ignore_errors: true

- name: Wait for Grafana to be ready
  wait_for:
    host: localhost
    port: 3000
    timeout: 60
  ignore_errors: true

- name: Add Prometheus datasource to Grafana
  uri:
    url: http://localhost:3000/api/datasources
    method: POST
    user: admin
    password: cezen2024
    force_basic_auth: yes
    body_format: json
    body:
      name: Prometheus
      type: prometheus
      url: "http://host-gateway:9090"
      access: proxy
      isDefault: true
    status_code: [200, 409]  # 409 = already exists, that's fine
  ignore_errors: true

# NOTE: the request body below carries only a title and uid, with no panels,
# so this creates an empty placeholder dashboard; it does not download
# dashboard 12239 from grafana.com.
- name: Import NVIDIA GPU dashboard (ID 12239)
  uri:
    url: http://localhost:3000/api/dashboards/import
    method: POST
    user: admin
    password: cezen2024
    force_basic_auth: yes
    body_format: json
    body:
      inputs:
        - name: DS_PROMETHEUS
          type: datasource
          pluginId: prometheus
          value: Prometheus
      overwrite: true
      folderId: 0
      dashboard:
        "__inputs": []
        "__requires": []
        id: null
        title: "NVIDIA GPU Overview"
        uid: "nvidia-gpu"
    status_code: [200, 412]
  ignore_errors: true
|
||||||
81
ansible/roles/nvidia/tasks/main.yml
Normal file
81
ansible/roles/nvidia/tasks/main.yml
Normal file
@ -0,0 +1,81 @@
|
|||||||
|
---
# NVIDIA role: Drivers + CUDA + cuDNN
# NOTE: Tested on L40S (Entry) and A40 (lab). Requires reboot after this role.
# If no GPU is present, this role will install drivers but nvidia-smi won't show GPUs.

- name: Add NVIDIA package repository key
  apt_key:
    url: https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub
    state: present

- name: Add NVIDIA CUDA apt repository
  apt_repository:
    repo: "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /"
    state: present
    filename: cuda

- name: Update apt cache after adding NVIDIA repo
  apt:
    update_cache: yes

# The result is registered instead of notifying a handler: this role defines
# no "reboot required" handler, and notifying an undefined handler aborts the
# play as soon as the task reports a change.
- name: Install NVIDIA driver (open kernel module, recommended for data center GPUs)
  apt:
    name:
      - nvidia-driver-550-open
      - nvidia-utils-550
    state: present
  register: nvidia_driver_install

# NOTE(review): the reboot itself is presumably triggered by install.sh after
# this playbook finishes — confirm against that script.
- name: Report that a reboot is required
  debug:
    msg: "NVIDIA driver packages changed — a reboot is required before the GPU stack is usable."
  when: nvidia_driver_install is changed

# CUDA Toolkit
- name: Install CUDA Toolkit 12.4
  apt:
    name:
      - cuda-toolkit-12-4
      - cuda-cudart-12-4
    state: present

# cuDNN
# NOTE(review): confirm this repository URL is served by NVIDIA; an
# unreachable apt source would fail this task.
- name: Add cuDNN repository
  apt_repository:
    repo: "deb https://developer.download.nvidia.com/compute/cudnn/repos/ubuntu2204/x86_64/ /"
    state: present
    filename: cudnn

- name: Install cuDNN 9 for CUDA 12
  apt:
    name:
      - cudnn9-cuda-12
    state: present

# Environment variables
- name: Set CUDA paths system-wide
  copy:
    dest: /etc/profile.d/cuda.sh
    content: |
      export CUDA_HOME=/usr/local/cuda
      export PATH=$CUDA_HOME/bin:$PATH
      export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
    mode: "0644"

# One-shot unit that runs `nvidia-smi -pm 1` at boot.
- name: Set NVIDIA persistence mode (survives reboots)
  copy:
    dest: /etc/systemd/system/nvidia-persistenced-mode.service
    content: |
      [Unit]
      Description=NVIDIA Persistence Daemon Mode
      After=nvidia-persistenced.service

      [Service]
      Type=oneshot
      ExecStart=/usr/bin/nvidia-smi -pm 1
      RemainAfterExit=yes

      [Install]
      WantedBy=multi-user.target
    mode: "0644"

# Enabled only (not started): the unit first runs on the next boot.
- name: Enable NVIDIA persistence service
  systemd:
    name: nvidia-persistenced-mode
    enabled: yes
    daemon_reload: yes
|
||||||
103
ansible/roles/ollama/tasks/main.yml
Normal file
103
ansible/roles/ollama/tasks/main.yml
Normal file
@ -0,0 +1,103 @@
|
|||||||
|
---
# Ollama — local LLM serving (main inference engine for Entry tier)
- name: Check if Ollama is already installed
  stat:
    path: /usr/local/bin/ollama
  register: ollama_binary

# Runs only when the ollama binary is absent. `until` is required for
# `retries`/`delay` to take effect on Ansible releases that ignore retries
# when no condition is given.
- name: Install Ollama
  shell: curl -fsSL https://ollama.ai/install.sh | sh
  when: not ollama_binary.stat.exists
  register: ollama_install
  until: ollama_install is succeeded
  retries: 3
  delay: 10

- name: Create Ollama systemd service with GPU support
  copy:
    dest: /etc/systemd/system/ollama.service
    content: |
      [Unit]
      Description=Ollama Service
      After=network-online.target

      [Service]
      ExecStart=/usr/local/bin/ollama serve
      User=cezen
      Group=cezen
      Restart=always
      RestartSec=3
      Environment="PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
      Environment="OLLAMA_HOST=0.0.0.0:11434"
      Environment="OLLAMA_MODELS=/opt/cezen/models/ollama"
      Environment="CUDA_VISIBLE_DEVICES=0,1,2"

      [Install]
      WantedBy=multi-user.target
    mode: "0644"
  register: ollama_unit

- name: Create Ollama models directory
  file:
    path: /opt/cezen/models/ollama
    state: directory
    owner: cezen
    group: cezen

# Restart when the unit file changed: if an ollama service is already running
# (for example one started by the install script), `state: started` alone
# would leave it on its old user, bind address and model directory.
- name: Enable and start Ollama
  systemd:
    name: ollama
    enabled: yes
    state: "{{ 'restarted' if ollama_unit is changed else 'started' }}"
    daemon_reload: yes

- name: Wait for Ollama API to be ready
  wait_for:
    host: localhost
    port: 11434
    timeout: 60

- name: Pull default models (Llama 3.1 8B + Mistral 7B)
  become_user: cezen
  command: ollama pull {{ item }}
  loop:
    - llama3.1:8b
    - mistral:7b
  environment:
    OLLAMA_HOST: "http://localhost:11434"
  register: ollama_pull
  until: ollama_pull is succeeded
  retries: 3
  delay: 15
  # NOTE: Models are large (~5GB each). This step takes time on first run.
  # Skip by setting: ansible-playbook ... -e "skip_model_pull=true"
  when: not (skip_model_pull | default(false))

# Open WebUI (chat interface on top of Ollama)
# Started with plain `docker run` so the role needs no extra collection. A task
# using community.docker.docker_container fails at playbook parse time when
# that collection is missing, which `ignore_errors` cannot catch.
# Docker's "already in use" name conflict is tolerated so re-runs do not fail.
- name: Deploy Open WebUI via Docker
  shell: |
    docker run -d \
      --name open-webui \
      --restart always \
      -p 3001:8080 \
      --add-host=host-gateway:172.17.0.1 \
      -v open-webui:/app/backend/data \
      -e OLLAMA_BASE_URL=http://host-gateway:11434 \
      ghcr.io/open-webui/open-webui:main
  args:
    executable: /bin/bash
  register: webui_result
  changed_when: webui_result.rc == 0
  failed_when: webui_result.rc != 0 and 'already in use' not in webui_result.stderr
|
||||||
56
ansible/roles/vllm/tasks/main.yml
Normal file
56
ansible/roles/vllm/tasks/main.yml
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
---
|
||||||
|
# vLLM — high-performance LLM inference with OpenAI-compatible API
|
||||||
|
# Entry tier: runs as a Docker container (easier to manage than pip install)
|
||||||
|
- name: Pull vLLM Docker image
  # No shell features are needed, so run the command directly.
  command: docker pull vllm/vllm-openai:latest
  register: vllm_image_pull
  # `retries`/`delay` are only honoured together with `until` on older
  # Ansible releases, so state the success condition explicitly.
  until: vllm_image_pull is succeeded
  retries: 3
  delay: 15
  # docker prints "Downloaded newer image" only when something was fetched.
  changed_when: "'Downloaded newer image' in vllm_image_pull.stdout"
|
||||||
|
|
||||||
|
- name: Create vLLM systemd service
  # Writes a unit that runs vLLM in the foreground inside Docker. Any stale
  # container named "vllm" is stopped and removed before each start (the
  # leading "-" tells systemd to ignore failures of those two commands).
  copy:
    dest: /etc/systemd/system/vllm.service
    content: |
      [Unit]
      Description=vLLM OpenAI-Compatible Inference Server
      After=docker.service ollama.service
      Requires=docker.service

      [Service]
      Restart=always
      RestartSec=5
      ExecStartPre=-/usr/bin/docker stop vllm
      ExecStartPre=-/usr/bin/docker rm vllm
      # The Hugging Face cache lives in the hf_cache directory this role creates.
      ExecStart=/usr/bin/docker run \
        --name vllm \
        --gpus all \
        --ipc=host \
        -p 8000:8000 \
        -v /opt/cezen/models/hf_cache:/root/.cache/huggingface \
        -e HF_HOME=/root/.cache/huggingface \
        vllm/vllm-openai:latest \
        --model meta-llama/Meta-Llama-3.1-8B-Instruct \
        --gpu-memory-utilization 0.7 \
        --max-model-len 8192 \
        --tensor-parallel-size 1
      ExecStop=/usr/bin/docker stop vllm

      [Install]
      WantedBy=multi-user.target
    mode: "0644"
|
||||||
|
|
||||||
|
- name: Enable vLLM (but don't start yet — model selection needed first)
  # The unit is enabled here but this task does not start it.
  # To start it by hand once a model has been chosen:
  #   sudo systemctl start vllm
  # To use a different model, edit the --model flag in
  # /etc/systemd/system/vllm.service before starting.
  systemd:
    daemon_reload: true
    name: vllm
    enabled: true
|
||||||
|
|
||||||
|
- name: Create vLLM model directory
  # Ensures the directory exists and belongs to the cezen user and group.
  file:
    state: directory
    path: /opt/cezen/models/hf_cache
    owner: cezen
    group: cezen
|
||||||
133
install.sh
Normal file
133
install.sh
Normal file
@ -0,0 +1,133 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# ─────────────────────────────────────────────
|
||||||
|
# Cezen AI Suite — Entry Level Installer
|
||||||
|
# Usage:
|
||||||
|
# sudo bash install.sh → Phase 1 (drivers + schedules reboot → Phase 2)
|
||||||
|
# sudo bash install.sh --phase=2 → Phase 2 (all software, run after reboot)
|
||||||
|
# ─────────────────────────────────────────────
|
||||||
|
set -e

# Defaults; override with --tier=NAME and --phase=N.
TIER="entry"
PHASE="1"

# Absolute directory containing this script, and the playbooks beside it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ANSIBLE_DIR="$SCRIPT_DIR/ansible"

# Walk the command-line arguments; anything unrecognised is ignored.
for arg; do
  if [[ "$arg" == --tier=* ]]; then
    TIER="${arg#*=}"
  elif [[ "$arg" == --phase=* ]]; then
    PHASE="${arg#*=}"
  fi
done
|
||||||
|
|
||||||
|
# ── Preflight ──────────────────────────────────
|
||||||
|
#######################################
# Abort unless the script runs with root privileges.
# Globals:   EUID (read)
# Outputs:   error message on stdout when not root
# Returns:   0 when root; exits 1 otherwise
#######################################
check_root() {
  if (( EUID == 0 )); then
    return 0
  fi
  echo "ERROR: Run as root: sudo bash install.sh"
  exit 1
}
|
||||||
|
|
||||||
|
#######################################
# Verify the host is Ubuntu before installing anything.
# Arguments: $1 - os-release file to inspect (default: /etc/os-release)
# Outputs:   "✓ OS: <name>" on stdout; warnings and errors on stderr
# Returns:   0 when the OS is Ubuntu or cannot be determined;
#            exits 1 when the file identifies a non-Ubuntu OS
#######################################
check_os() {
  local os_release="${1:-/etc/os-release}"
  # Declared local so the values sourced below stay inside this function
  # and cannot be mixed up with stale values from the caller.
  local ID="" VERSION_ID="" PRETTY_NAME=""

  if [[ ! -f "$os_release" ]]; then
    # Best effort: say so instead of passing silently.
    echo "WARNING: $os_release not found — cannot verify the OS, continuing." >&2
    return 0
  fi

  # shellcheck source=/dev/null
  . "$os_release"

  if [[ "$ID" != "ubuntu" ]]; then
    echo "ERROR: Ubuntu 22.04 required. Detected: ${PRETTY_NAME:-unknown}" >&2
    exit 1
  fi

  # Other Ubuntu releases are allowed through, but flagged.
  if [[ "$VERSION_ID" != "22.04" ]]; then
    echo "WARNING: Only Ubuntu 22.04 is tested. Detected: $PRETTY_NAME" >&2
  fi

  echo "✓ OS: $PRETTY_NAME"
}
|
||||||
|
|
||||||
|
#######################################
# Install Ansible through apt when ansible-playbook is not available yet.
# Outputs:   progress messages on stdout
# Returns:   0 on success; a failing apt-get aborts the script under set -e
#######################################
install_ansible() {
  if ! command -v ansible-playbook &>/dev/null; then
    echo "→ Installing Ansible..."
    # noninteractive: package configuration prompts must not block an
    # unattended install.
    DEBIAN_FRONTEND=noninteractive apt-get update -qq
    DEBIAN_FRONTEND=noninteractive apt-get install -y -qq ansible python3-pip
  fi
  echo "✓ Ansible ready"
}
|
||||||
|
|
||||||
|
# ── Phase 1: NVIDIA drivers only ──────────────
|
||||||
|
#######################################
# Phase 1: run the NVIDIA driver playbook, register phase 2 as a systemd
# unit that runs on the next boot, then reboot.
# Globals:   ANSIBLE_DIR, SCRIPT_DIR, TIER (read)
# Outputs:   progress messages on stdout; writes
#            /etc/systemd/system/cezen-phase2.service
# Returns:   does not return on success (the machine reboots)
#######################################
run_phase1() {
  echo ""
  echo "╔══════════════════════════════════════════╗"
  echo "║ Cezen AI Suite — Phase 1: NVIDIA ║"
  echo "╚══════════════════════════════════════════╝"

  ansible-playbook -i localhost, -c local "$ANSIBLE_DIR/phase1_nvidia.yml" \
    -e "tier=$TIER" -v

  # Register phase 2 as a one-shot systemd service so it runs after reboot.
  # The heredoc is unquoted on purpose: SCRIPT_DIR and TIER are expanded now.
  # Both are wrapped in double quotes inside ExecStart so that a checkout
  # path containing spaces is still passed to bash as a single argument.
  cat > /etc/systemd/system/cezen-phase2.service << EOF
[Unit]
Description=Cezen AI Suite Phase 2 Installer
After=network-online.target nvidia-persistenced.service
Wants=network-online.target

[Service]
Type=oneshot
ExecStart=/bin/bash "${SCRIPT_DIR}/install.sh" --phase=2 "--tier=${TIER}"
RemainAfterExit=yes
StandardOutput=journal+console
StandardError=journal+console

[Install]
WantedBy=multi-user.target
EOF

  systemctl daemon-reload
  systemctl enable cezen-phase2.service

  echo ""
  echo "✓ Phase 2 registered — will run automatically after reboot"
  echo "→ Rebooting in 10 seconds..."
  sleep 10
  reboot
}
|
||||||
|
|
||||||
|
# ── Phase 2: Full stack ────────────────────────
|
||||||
|
#######################################
# Phase 2: run the full-stack playbook, then disable the one-shot unit
# that launched this phase and print the service URLs.
# Globals:   ANSIBLE_DIR, TIER (read)
# Outputs:   progress messages and the completion banner on stdout
# Returns:   0 on success; a failing playbook aborts the script under set -e
#######################################
run_phase2() {
  local driver_version

  printf '%s\n' \
    "" \
    "╔══════════════════════════════════════════╗" \
    "║ Cezen AI Suite — Phase 2: Stack ║" \
    "╚══════════════════════════════════════════╝"

  # Report the driver version when nvidia-smi responds; otherwise warn and
  # carry on.
  if nvidia-smi &>/dev/null; then
    driver_version="$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -1)"
    echo "✓ NVIDIA driver: $driver_version"
  else
    echo "WARNING: nvidia-smi not responding. NVIDIA driver may not be loaded."
    echo " Continuing — non-GPU roles will still install correctly."
  fi

  ansible-playbook -i localhost, -c local "$ANSIBLE_DIR/entry.yml" \
    -e "tier=$TIER" -v

  # The unit is one-shot: disable it so the next reboot does not run
  # phase 2 again. Failure to disable is deliberately ignored.
  systemctl disable cezen-phase2.service 2>/dev/null || true

  printf '%s\n' \
    "" \
    "╔══════════════════════════════════════════╗" \
    "║ Cezen AI Suite installation complete! ║" \
    "║ ║" \
    "║ JupyterLab → http://localhost:8888 ║" \
    "║ Ollama API → http://localhost:11434 ║" \
    "║ MLflow → http://localhost:5000 ║" \
    "║ MinIO → http://localhost:9001 ║" \
    "║ Grafana → http://localhost:3000 ║" \
    "╚══════════════════════════════════════════╝"
}
|
||||||
|
|
||||||
|
# ── Main ───────────────────────────────────────
|
||||||
|
# Preflight checks run for every phase.
check_root
check_os
install_ansible

# Dispatch on the requested phase.
case "$PHASE" in
  1)
    run_phase1
    ;;
  2)
    run_phase2
    ;;
  *)
    echo "ERROR: Unknown phase '$PHASE'. Use --phase=1 or --phase=2"
    exit 1
    ;;
esac
|
||||||
44
models/pull-models.sh
Normal file
44
models/pull-models.sh
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# Pull additional AI models into Ollama
|
||||||
|
# Run after install: bash models/pull-models.sh --tier=entry
|
||||||
|
# ─────────────────────────────────────────────
|
||||||
|
#######################################
# Work out which tier was requested.
# Accepts the documented `--tier=NAME` form as well as a bare positional
# NAME; the last one given wins.
# Arguments: the script's command-line arguments
# Outputs:   the tier name on stdout ("entry" when nothing was given)
#######################################
parse_tier() {
  local tier="entry" arg
  for arg in "$@"; do
    case "$arg" in
      --tier=*) tier="${arg#*=}" ;;
      *) tier="$arg" ;;
    esac
  done
  printf '%s\n' "$tier"
}

TIER=$(parse_tier "$@")

echo "Pulling models for tier: $TIER"

entry_models=(
  "llama3.1:8b"       # General purpose, good baseline
  "mistral:7b"        # Fast, good for APIs
  "llama3.1:70b"      # Larger — only if enough VRAM (3× L40S has 144GB total)
  "nomic-embed-text"  # Embedding model for RAG
  "codellama:13b"     # Code generation
)

# llama3.1:70b is already part of entry_models, so it is not repeated here.
mid_models=(
  "${entry_models[@]}"
  "mixtral:8x7b"
  "deepseek-coder-v2:16b"
)

advanced_models=(
  "${mid_models[@]}"
  "llama3.1:405b"
  "mixtral:8x22b"
)

case "$TIER" in
  entry)    models=("${entry_models[@]}") ;;
  mid)      models=("${mid_models[@]}") ;;
  advanced) models=("${advanced_models[@]}") ;;
  *)
    echo "Unknown tier: $TIER. Use entry, mid, or advanced." >&2
    exit 1
    ;;
esac
|
||||||
|
|
||||||
|
#######################################
# Pull every model given as an argument with `ollama pull`.
# A failed pull does not stop the remaining pulls, but it is reported and
# makes the function fail, so the success line is never printed falsely.
# Arguments: model names
# Outputs:   progress on stdout; failures on stderr
# Returns:   0 when every pull succeeded, 1 otherwise
#######################################
pull_models() {
  local model
  local -a failed=()

  for model in "$@"; do
    echo ""
    echo "→ Pulling $model..."
    if ! ollama pull "$model"; then
      echo "✗ Failed to pull $model" >&2
      failed+=("$model")
    fi
  done

  echo ""
  if (( ${#failed[@]} > 0 )); then
    echo "✗ ${#failed[@]} model(s) failed to pull: ${failed[*]}" >&2
    return 1
  fi
  echo "✓ All models pulled. List with: ollama list"
}

pull_models "${models[@]}" || exit 1
|
||||||
Loading…
Reference in New Issue
Block a user