From a071602cf1eb552115f2b9366ec3a194c7ba61b9 Mon Sep 17 00:00:00 2001 From: Jino Jose Date: Tue, 23 Jun 2026 13:09:03 +0530 Subject: [PATCH] =?UTF-8?q?Initial=20Cezen=20AI=20Suite=20installer=20?= =?UTF-8?q?=E2=80=94=20Entry=20tier?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .DS_Store | Bin 0 -> 6148 bytes README.md | 77 +++++++++++++ ansible/.DS_Store | Bin 0 -> 6148 bytes ansible/entry.yml | 23 ++++ ansible/phase1_nvidia.yml | 9 ++ ansible/roles/base/tasks/main.yml | 91 +++++++++++++++ ansible/roles/chromadb/tasks/main.yml | 53 +++++++++ ansible/roles/docker/tasks/main.yml | 83 ++++++++++++++ ansible/roles/jupyterlab/tasks/main.yml | 92 +++++++++++++++ ansible/roles/k3s/tasks/main.yml | 62 ++++++++++ ansible/roles/minio/tasks/main.yml | 87 ++++++++++++++ ansible/roles/mlflow/tasks/main.yml | 56 +++++++++ ansible/roles/monitoring/tasks/main.yml | 145 ++++++++++++++++++++++++ ansible/roles/nvidia/tasks/main.yml | 81 +++++++++++++ ansible/roles/ollama/tasks/main.yml | 103 +++++++++++++++++ ansible/roles/vllm/tasks/main.yml | 56 +++++++++ install.sh | 133 ++++++++++++++++++++++ models/pull-models.sh | 44 +++++++ 18 files changed, 1195 insertions(+) create mode 100644 .DS_Store create mode 100644 README.md create mode 100644 ansible/.DS_Store create mode 100644 ansible/entry.yml create mode 100644 ansible/phase1_nvidia.yml create mode 100644 ansible/roles/base/tasks/main.yml create mode 100644 ansible/roles/chromadb/tasks/main.yml create mode 100644 ansible/roles/docker/tasks/main.yml create mode 100644 ansible/roles/jupyterlab/tasks/main.yml create mode 100644 ansible/roles/k3s/tasks/main.yml create mode 100644 ansible/roles/minio/tasks/main.yml create mode 100644 ansible/roles/mlflow/tasks/main.yml create mode 100644 ansible/roles/monitoring/tasks/main.yml create mode 100644 ansible/roles/nvidia/tasks/main.yml create mode 100644 ansible/roles/ollama/tasks/main.yml create mode 100644 
ansible/roles/vllm/tasks/main.yml create mode 100644 install.sh create mode 100644 models/pull-models.sh diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..179d250f549ec554b6c207217d87d6f098ed052b GIT binary patch literal 6148 zcmeHKJ5B>Z41IN??z)hR@;F-5*kizzhPbF(3xSz!wAleP~q2u5eC_ zPX|qm0K^sDAza5SL2T|Ic7<~yJrv7SVy0S+7?$buw~Ffu=fq5h#m%f!H(O087Pr&i zA{^Ew%8CIoaLT}CZa3cl@2Nk`|EEdXi2*V2uNbh^?qRp(ld88a9_PKbQ6H%u^Fejt pI#Ot&9TTG+bK~v!Hi9y)`I_fl;hY%h^hY{TKLgH-L=5~j1D^t~8&3cL literal 0 HcmV?d00001 diff --git a/README.md b/README.md new file mode 100644 index 0000000..64d72fd --- /dev/null +++ b/README.md @@ -0,0 +1,77 @@ +# Cezen AI Suite — Installer + +## Quick Start + +```bash +git clone +cd cgit +sudo bash install.sh +``` + +Server reboots automatically after NVIDIA drivers install. Phase 2 runs on its own after reboot. + +## What Gets Installed (Entry Tier) + +| Service | Port | Notes | +|---|---|---| +| Ollama | 11434 | LLM inference, 2 models pre-loaded | +| Open WebUI | 3001 | Chat interface | +| vLLM | 8000 | OpenAI-compatible API (start manually) | +| JupyterLab | 8888 | Token: `cezen2024` | +| ChromaDB | 8100 | Vector DB for RAG | +| MLflow | 5000 | Experiment tracking | +| MinIO | 9001 | Object storage (user: cezenadmin / Cezen@2024!) | +| Grafana | 3000 | GPU + system monitoring (admin / cezen2024) | + +## Testing Without a GPU (Multipass) + +```bash +# On your MacBook: +multipass launch 22.04 --name cezen-test --cpus 4 --mem 8G --disk 40G +multipass shell cezen-test + +# Inside the VM: +git clone +sudo bash install.sh +``` + +NVIDIA driver install will succeed but `nvidia-smi` won't show GPUs — that's expected. All other services will run fine. 
+ +## Pull More Models + +```bash +bash models/pull-models.sh --tier=entry +``` + +## File Structure + +``` +cgit/ +├── install.sh ← Entry point +├── ansible/ +│ ├── phase1_nvidia.yml ← Phase 1: drivers (triggers reboot) +│ ├── entry.yml ← Phase 2: full stack +│ └── roles/ +│ ├── base/ ← OS, Python, Miniconda, LangChain +│ ├── nvidia/ ← Drivers, CUDA 12.4, cuDNN 9 +│ ├── docker/ ← Docker CE + NVIDIA Container Toolkit +│ ├── k3s/ ← Lightweight Kubernetes +│ ├── ollama/ ← Ollama + Open WebUI +│ ├── vllm/ ← vLLM inference server +│ ├── jupyterlab/ ← JupyterLab notebooks +│ ├── chromadb/ ← Vector database +│ ├── mlflow/ ← Experiment tracking +│ ├── minio/ ← Object storage +│ └── monitoring/ ← Grafana + Prometheus + DCGM +└── models/ + └── pull-models.sh ← Pull additional models +``` + +## Change Default Passwords + +Before shipping to a customer, update these: + +- JupyterLab token: `/opt/cezen/.jupyter/jupyter_lab_config.py` +- MinIO: `/etc/default/minio` +- Grafana: environment vars in monitoring role, or via UI after first login +- MLflow: no auth by default (add reverse proxy if needed) diff --git a/ansible/.DS_Store b/ansible/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..a565689de842ae81156e4c0d241fcd5c1365fdf0 GIT binary patch literal 6148 zcmeH~I|>3p42BaQAlO)1PU8W*!6149FCad;g$2caj_#iaf~&QN{DI`3$t22t#m+`V zbaOwiMHV75gPY3A!oU>!sa&O(PA+mg-%j1J?>ny=Wv$h~@f)w_c}yVz5+DH*AORBi zAp&-9!)EhPMiL+a5_l4@_d|i3*3=g2uMPws0iYA4-LUpq0$MBqt*I>(8JI>ZG+Nci z5X*Zzv}9dPZK2UFn!|_YKdVhKFpYN6f(=Zo3j+y|z<|Iw_6xiJckoa1|Dc6i5+H#; zBcPM*e!Ia#<=y)Bc$Pn8*47OU^>T!lj{t1!Dqh0fa9(Tyt*I>(85lnV90LOhe3ifh Dr(hGD literal 0 HcmV?d00001 diff --git a/ansible/entry.yml b/ansible/entry.yml new file mode 100644 index 0000000..3f50fa6 --- /dev/null +++ b/ansible/entry.yml @@ -0,0 +1,23 @@ +--- +# Phase 2: Full Cezen AI Suite — Entry Tier +# Runs after NVIDIA driver reboot +- name: Cezen AI — Entry Tier Stack + hosts: localhost + connection: local + become: true + vars: + 
cezen_user: "cezen" + cezen_home: "/opt/cezen" + python_version: "3.11" + cuda_version: "12.4" + + roles: + - docker + - k3s + - ollama + - vllm + - jupyterlab + - chromadb + - mlflow + - minio + - monitoring diff --git a/ansible/phase1_nvidia.yml b/ansible/phase1_nvidia.yml new file mode 100644 index 0000000..955102c --- /dev/null +++ b/ansible/phase1_nvidia.yml @@ -0,0 +1,9 @@ +--- +# Phase 1: NVIDIA drivers only. Server reboots after this. +- name: Cezen AI — Phase 1 NVIDIA Drivers + hosts: localhost + connection: local + become: true + roles: + - base + - nvidia diff --git a/ansible/roles/base/tasks/main.yml b/ansible/roles/base/tasks/main.yml new file mode 100644 index 0000000..0a67eb5 --- /dev/null +++ b/ansible/roles/base/tasks/main.yml @@ -0,0 +1,91 @@ +--- +# Base role: OS updates, essential packages, Python/Miniconda +- name: Update apt cache + apt: + update_cache: yes + cache_valid_time: 3600 + +- name: Upgrade all packages + apt: + upgrade: dist + autoremove: yes + +- name: Install essential system packages + apt: + name: + - curl + - wget + - git + - build-essential + - ca-certificates + - gnupg + - lsb-release + - software-properties-common + - unzip + - htop + - net-tools + - jq + - python3-pip + - python3-venv + state: present + +- name: Create cezen user + user: + name: cezen + shell: /bin/bash + home: /opt/cezen + create_home: yes + groups: sudo + append: yes + +- name: Create cezen directories + file: + path: "{{ item }}" + state: directory + owner: cezen + group: cezen + mode: "0755" + loop: + - /opt/cezen + - /opt/cezen/models + - /opt/cezen/data + - /opt/cezen/logs + +- name: Download Miniconda + get_url: + url: https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh + dest: /tmp/miniconda.sh + mode: "0755" + retries: 3 + delay: 10 + +- name: Install Miniconda + become_user: cezen + command: bash /tmp/miniconda.sh -b -p /opt/cezen/miniconda + args: + creates: /opt/cezen/miniconda/bin/conda + +- name: Add conda to cezen PATH + 
lineinfile: + path: /opt/cezen/.bashrc + line: 'export PATH="/opt/cezen/miniconda/bin:$PATH"' + create: yes + owner: cezen + +- name: Create cezen conda environment (Python 3.11) + become_user: cezen + command: /opt/cezen/miniconda/bin/conda create -n cezen python=3.11 -y + args: + creates: /opt/cezen/miniconda/envs/cezen + +- name: Install LangChain + LlamaIndex + HuggingFace in conda env + become_user: cezen + shell: | + /opt/cezen/miniconda/bin/conda run -n cezen pip install \ + langchain langchain-community llama-index \ + transformers huggingface-hub \ + peft bitsandbytes accelerate \ + fastapi uvicorn[standard] \ + sentence-transformers + retries: 3 + delay: 15 diff --git a/ansible/roles/chromadb/tasks/main.yml b/ansible/roles/chromadb/tasks/main.yml new file mode 100644 index 0000000..dfa46e9 --- /dev/null +++ b/ansible/roles/chromadb/tasks/main.yml @@ -0,0 +1,53 @@ +--- +# ChromaDB — vector database for RAG pipelines +- name: Install ChromaDB in cezen conda env + become_user: cezen + shell: | + /opt/cezen/miniconda/bin/conda run -n cezen pip install chromadb + retries: 3 + delay: 10 + +- name: Create ChromaDB data directory + file: + path: /opt/cezen/data/chromadb + state: directory + owner: cezen + group: cezen + +- name: Create ChromaDB systemd service + copy: + dest: /etc/systemd/system/chromadb.service + content: | + [Unit] + Description=ChromaDB Vector Database + After=network.target + + [Service] + Type=simple + User=cezen + Group=cezen + WorkingDirectory=/opt/cezen/data/chromadb + ExecStart=/opt/cezen/miniconda/envs/cezen/bin/chroma run \ + --host 0.0.0.0 \ + --port 8100 \ + --path /opt/cezen/data/chromadb + Restart=always + RestartSec=5 + Environment="PATH=/opt/cezen/miniconda/envs/cezen/bin:/usr/local/bin:/usr/bin:/bin" + + [Install] + WantedBy=multi-user.target + mode: "0644" + +- name: Enable and start ChromaDB + systemd: + name: chromadb + enabled: yes + state: started + daemon_reload: yes + +- name: Wait for ChromaDB to be ready + wait_for: + 
host: localhost + port: 8100 + timeout: 30 diff --git a/ansible/roles/docker/tasks/main.yml b/ansible/roles/docker/tasks/main.yml new file mode 100644 index 0000000..77769ce --- /dev/null +++ b/ansible/roles/docker/tasks/main.yml @@ -0,0 +1,83 @@ +--- +# Docker CE + NVIDIA Container Toolkit +- name: Add Docker GPG key + apt_key: + url: https://download.docker.com/linux/ubuntu/gpg + state: present + +- name: Add Docker apt repository + apt_repository: + repo: "deb [arch=amd64] https://download.docker.com/linux/ubuntu {{ ansible_distribution_release }} stable" + state: present + filename: docker + +- name: Install Docker CE + apt: + name: + - docker-ce + - docker-ce-cli + - containerd.io + - docker-buildx-plugin + - docker-compose-plugin + state: present + update_cache: yes + +- name: Add cezen user to docker group + user: + name: cezen + groups: docker + append: yes + +- name: Enable and start Docker + systemd: + name: docker + enabled: yes + state: started + +# NVIDIA Container Toolkit (allows GPU passthrough into containers) +- name: Add NVIDIA Container Toolkit repo + shell: | + curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \ + gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg + curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ + sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ + tee /etc/apt/sources.list.d/nvidia-container-toolkit.list + args: + creates: /etc/apt/sources.list.d/nvidia-container-toolkit.list + +- name: Install NVIDIA Container Toolkit + apt: + name: nvidia-container-toolkit + state: present + update_cache: yes + +- name: Configure Docker to use NVIDIA runtime + shell: nvidia-ctk runtime configure --runtime=docker + # Docker is restarted by the last task of this role (a tasks file cannot declare handlers) + +- name: Set NVIDIA as default Docker runtime + copy: + dest: /etc/docker/daemon.json + content: | + { + "default-runtime": "nvidia", + "runtimes": { + 
"nvidia": { + "path": "nvidia-container-runtime", + "runtimeArgs": [] + } + }, + "log-driver": "json-file", + "log-opts": { + "max-size": "100m", + "max-file": "3" + } + } + mode: "0644" + register: docker_daemon_config + +- name: Restart Docker to apply the NVIDIA runtime configuration + systemd: + name: docker + state: restarted + when: docker_daemon_config is changed diff --git a/ansible/roles/jupyterlab/tasks/main.yml b/ansible/roles/jupyterlab/tasks/main.yml new file mode 100644 index 0000000..06663d5 --- /dev/null +++ b/ansible/roles/jupyterlab/tasks/main.yml @@ -0,0 +1,92 @@ +--- +# JupyterLab — notebook interface for AI/ML development +- name: Install JupyterLab in cezen conda env + become_user: cezen + shell: | + /opt/cezen/miniconda/bin/conda run -n cezen pip install \ + jupyterlab \ + ipywidgets \ + ipykernel \ + notebook \ + nbconvert + retries: 3 + delay: 10 + +- name: Create JupyterLab config directory + file: + path: /opt/cezen/.jupyter + state: directory + owner: cezen + group: cezen + +- name: Generate JupyterLab config + become_user: cezen + shell: | + /opt/cezen/miniconda/envs/cezen/bin/jupyter lab --generate-config + args: + creates: /opt/cezen/.jupyter/jupyter_lab_config.py + +- name: Configure JupyterLab (no browser, allow all IPs, set base dir) + lineinfile: + path: /opt/cezen/.jupyter/jupyter_lab_config.py + line: "{{ item }}" + create: yes + owner: cezen + loop: + - "c.ServerApp.ip = '0.0.0.0'" + - "c.ServerApp.port = 8888" + - "c.ServerApp.open_browser = False" + - "c.ServerApp.notebook_dir = '/opt/cezen/notebooks'" + - "c.ServerApp.token = 'cezen2024'" + - "c.ServerApp.allow_root = False" + +- name: Create notebooks directory + file: + path: /opt/cezen/notebooks + state: directory + owner: cezen + group: cezen + +- name: Create sample notebook placeholder + copy: + dest: /opt/cezen/notebooks/README.md + content: | + # Cezen AI Suite — JupyterLab + + Default token: `cezen2024` + + Change this in: `/opt/cezen/.jupyter/jupyter_lab_config.py` + Then restart: `sudo systemctl restart jupyterlab` + owner: cezen + group: 
cezen + +- name: Create JupyterLab systemd service + copy: + dest: /etc/systemd/system/jupyterlab.service + content: | + [Unit] + Description=JupyterLab Server + After=network.target + + [Service] + Type=simple + User=cezen + Group=cezen + WorkingDirectory=/opt/cezen/notebooks + ExecStart=/opt/cezen/miniconda/envs/cezen/bin/jupyter lab \ + --config=/opt/cezen/.jupyter/jupyter_lab_config.py + Restart=always + RestartSec=5 + Environment="PATH=/opt/cezen/miniconda/envs/cezen/bin:/usr/local/cuda/bin:/usr/local/bin:/usr/bin:/bin" + Environment="CUDA_HOME=/usr/local/cuda" + + [Install] + WantedBy=multi-user.target + mode: "0644" + +- name: Enable and start JupyterLab + systemd: + name: jupyterlab + enabled: yes + state: started + daemon_reload: yes diff --git a/ansible/roles/k3s/tasks/main.yml b/ansible/roles/k3s/tasks/main.yml new file mode 100644 index 0000000..abf090b --- /dev/null +++ b/ansible/roles/k3s/tasks/main.yml @@ -0,0 +1,62 @@ +--- +# K3s — lightweight Kubernetes for single-node AI workloads +- name: Check if K3s is already installed + stat: + path: /usr/local/bin/k3s + register: k3s_binary + +- name: Install K3s + shell: | + curl -sfL https://get.k3s.io | \ + INSTALL_K3S_EXEC="--disable traefik --disable servicelb" sh - + when: not k3s_binary.stat.exists + retries: 3 + delay: 10 + +- name: Wait for K3s to be ready + wait_for: + path: /etc/rancher/k3s/k3s.yaml + timeout: 120 + +- name: Enable and start K3s + systemd: + name: k3s + enabled: yes + state: started + +- name: Create .kube directory for cezen + file: + path: /opt/cezen/.kube + state: directory + owner: cezen + group: cezen + +- name: Copy kubeconfig for cezen user + copy: + src: /etc/rancher/k3s/k3s.yaml + dest: /opt/cezen/.kube/config + owner: cezen + group: cezen + mode: "0600" + remote_src: yes + +- name: Set KUBECONFIG in cezen .bashrc + lineinfile: + path: /opt/cezen/.bashrc + line: 'export KUBECONFIG=/opt/cezen/.kube/config' + create: yes + owner: cezen + +- name: Install kubectl alias for 
cezen + lineinfile: + path: /opt/cezen/.bashrc + line: "alias kubectl='k3s kubectl'" + create: yes + owner: cezen + +- name: Verify K3s node is ready + command: k3s kubectl get nodes + register: k3s_nodes + retries: 5 + delay: 10 + until: k3s_nodes.rc == 0 diff --git a/ansible/roles/minio/tasks/main.yml b/ansible/roles/minio/tasks/main.yml new file mode 100644 index 0000000..39ca62a --- /dev/null +++ b/ansible/roles/minio/tasks/main.yml @@ -0,0 +1,87 @@ +--- +# MinIO — S3-compatible object storage for model artifacts and datasets +- name: Download MinIO server binary + get_url: + url: https://dl.min.io/server/minio/release/linux-amd64/minio + dest: /usr/local/bin/minio + mode: "0755" + retries: 3 + delay: 10 + +- name: Download MinIO client (mc) + get_url: + url: https://dl.min.io/client/mc/release/linux-amd64/mc + dest: /usr/local/bin/mc + mode: "0755" + retries: 3 + delay: 10 + +- name: Create MinIO data directories + file: + path: "{{ item }}" + state: directory + owner: cezen + group: cezen + mode: "0750" + loop: + - /opt/cezen/data/minio + - /opt/cezen/data/minio/models + - /opt/cezen/data/minio/datasets + +- name: Create MinIO environment file + copy: + dest: /etc/default/minio + content: | + MINIO_ROOT_USER=cezenadmin + MINIO_ROOT_PASSWORD=Cezen@2024! 
+ MINIO_VOLUMES="/opt/cezen/data/minio" + MINIO_OPTS="--console-address :9001" + mode: "0640" + owner: cezen + group: cezen + +- name: Create MinIO systemd service + copy: + dest: /etc/systemd/system/minio.service + content: | + [Unit] + Description=MinIO Object Storage + Documentation=https://docs.min.io + Wants=network-online.target + After=network-online.target + + [Service] + User=cezen + Group=cezen + EnvironmentFile=/etc/default/minio + ExecStartPre=/bin/bash -c "if [ -z \"${MINIO_VOLUMES}\" ]; then echo 'Variable MINIO_VOLUMES not set'; exit 1; fi" + ExecStart=/usr/local/bin/minio server ${MINIO_VOLUMES} ${MINIO_OPTS} + Restart=always + RestartSec=5 + LimitNOFILE=65536 + + [Install] + WantedBy=multi-user.target + mode: "0644" + +- name: Enable and start MinIO + systemd: + name: minio + enabled: yes + state: started + daemon_reload: yes + +- name: Wait for MinIO to be ready + wait_for: + host: localhost + port: 9001 + timeout: 30 + +- name: Configure mc client with local MinIO + become_user: cezen + shell: | + mc alias set local http://localhost:9000 cezenadmin 'Cezen@2024!' 
+ mc mb local/models --ignore-existing + mc mb local/datasets --ignore-existing + retries: 3 + delay: 5 diff --git a/ansible/roles/mlflow/tasks/main.yml b/ansible/roles/mlflow/tasks/main.yml new file mode 100644 index 0000000..b5659fd --- /dev/null +++ b/ansible/roles/mlflow/tasks/main.yml @@ -0,0 +1,56 @@ +--- +# MLflow — experiment tracking and model registry +- name: Install MLflow in cezen conda env + become_user: cezen + shell: | + /opt/cezen/miniconda/bin/conda run -n cezen pip install mlflow boto3 + retries: 3 + delay: 10 + +- name: Create MLflow directories + file: + path: "{{ item }}" + state: directory + owner: cezen + group: cezen + loop: + - /opt/cezen/data/mlflow + - /opt/cezen/data/mlflow/artifacts + +- name: Create MLflow systemd service + copy: + dest: /etc/systemd/system/mlflow.service + content: | + [Unit] + Description=MLflow Tracking Server + After=network.target minio.service + + [Service] + Type=simple + User=cezen + Group=cezen + ExecStart=/opt/cezen/miniconda/envs/cezen/bin/mlflow server \ + --host 0.0.0.0 \ + --port 5000 \ + --backend-store-uri sqlite:////opt/cezen/data/mlflow/mlflow.db \ + --default-artifact-root /opt/cezen/data/mlflow/artifacts + Restart=always + RestartSec=5 + Environment="PATH=/opt/cezen/miniconda/envs/cezen/bin:/usr/local/bin:/usr/bin:/bin" + + [Install] + WantedBy=multi-user.target + mode: "0644" + +- name: Enable and start MLflow + systemd: + name: mlflow + enabled: yes + state: started + daemon_reload: yes + +- name: Wait for MLflow to be ready + wait_for: + host: localhost + port: 5000 + timeout: 30 diff --git a/ansible/roles/monitoring/tasks/main.yml b/ansible/roles/monitoring/tasks/main.yml new file mode 100644 index 0000000..15c7e6a --- /dev/null +++ b/ansible/roles/monitoring/tasks/main.yml @@ -0,0 +1,145 @@ +--- +# Monitoring: Prometheus + Grafana + DCGM Exporter (GPU metrics) +# ignore_errors: true on most tasks — monitoring is optional and should never block the install + +- name: Create monitoring 
directories + file: + path: "{{ item }}" + state: directory + owner: cezen + group: cezen + loop: + - /opt/cezen/monitoring + - /opt/cezen/monitoring/prometheus + - /opt/cezen/monitoring/grafana + +# ── DCGM Exporter (GPU metrics for Prometheus) ────────── +- name: Start DCGM Exporter container + shell: | + docker run -d \ + --name dcgm-exporter \ + --restart always \ + --gpus all \ + -p 9400:9400 \ + nvcr.io/nvidia/k8s/dcgm-exporter:3.3.0-3.2.0-ubuntu22.04 + register: dcgm_result + failed_when: dcgm_result.rc != 0 and 'already in use' not in dcgm_result.stderr + ignore_errors: true + +# ── Prometheus ────────────────────────────────────────── +- name: Write Prometheus config + copy: + dest: /opt/cezen/monitoring/prometheus/prometheus.yml + owner: cezen + group: cezen + content: | + global: + scrape_interval: 15s + evaluation_interval: 15s + + scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + - job_name: 'dcgm' + static_configs: + - targets: ['host-gateway:9400'] + + - job_name: 'node' + static_configs: + - targets: ['host-gateway:9100'] + +- name: Start Prometheus container + shell: | + docker run -d \ + --name prometheus \ + --restart always \ + --add-host=host-gateway:172.17.0.1 \ + -p 9090:9090 \ + -v /opt/cezen/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml \ + prom/prometheus:latest + register: prom_result + failed_when: prom_result.rc != 0 and 'already in use' not in prom_result.stderr + ignore_errors: true + +# ── Node Exporter (CPU/RAM/disk metrics) ─────────────── +- name: Start Node Exporter container + shell: | + docker run -d \ + --name node-exporter \ + --restart always \ + --network=host \ + --pid=host \ + -v /:/host:ro,rslave \ + prom/node-exporter:latest \ + --path.rootfs=/host + register: node_exp_result + failed_when: node_exp_result.rc != 0 and 'already in use' not in node_exp_result.stderr + ignore_errors: true + +# ── Grafana ───────────────────────────────────────────── +- 
name: Start Grafana container + shell: | + docker run -d \ + --name grafana \ + --restart always \ + -p 3000:3000 \ + --add-host=host-gateway:172.17.0.1 \ + -v grafana-storage:/var/lib/grafana \ + -e GF_SECURITY_ADMIN_USER=admin \ + -e GF_SECURITY_ADMIN_PASSWORD=cezen2024 \ + -e GF_USERS_ALLOW_SIGN_UP=false \ + grafana/grafana:latest + register: grafana_result + failed_when: grafana_result.rc != 0 and 'already in use' not in grafana_result.stderr + ignore_errors: true + +- name: Wait for Grafana to be ready + wait_for: + host: localhost + port: 3000 + timeout: 60 + ignore_errors: true + +- name: Add Prometheus datasource to Grafana + uri: + url: http://localhost:3000/api/datasources + method: POST + user: admin + password: cezen2024 + force_basic_auth: yes + body_format: json + body: + name: Prometheus + type: prometheus + url: "http://host-gateway:9090" + access: proxy + isDefault: true + status_code: [200, 409] # 409 = already exists, that's fine + ignore_errors: true + +- name: Import NVIDIA GPU dashboard (ID 12239) + uri: + url: http://localhost:3000/api/dashboards/import + method: POST + user: admin + password: cezen2024 + force_basic_auth: yes + body_format: json + body: + inputs: + - name: DS_PROMETHEUS + type: datasource + pluginId: prometheus + value: Prometheus + overwrite: true + folderId: 0 + dashboard: + "__inputs": [] + "__requires": [] + id: null + title: "NVIDIA GPU Overview" + uid: "nvidia-gpu" + status_code: [200, 412] + ignore_errors: true diff --git a/ansible/roles/nvidia/tasks/main.yml b/ansible/roles/nvidia/tasks/main.yml new file mode 100644 index 0000000..834038a --- /dev/null +++ b/ansible/roles/nvidia/tasks/main.yml @@ -0,0 +1,81 @@ +--- +# NVIDIA role: Drivers + CUDA + cuDNN +# NOTE: Tested on L40S (Entry) and A40 (lab). Requires reboot after this role. +# If no GPU is present, this role will install drivers but nvidia-smi won't show GPUs. 
+ +- name: Add NVIDIA package repository key + apt_key: + url: https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub + state: present + +- name: Add NVIDIA CUDA apt repository + apt_repository: + repo: "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" + state: present + filename: cuda + +- name: Update apt cache after adding NVIDIA repo + apt: + update_cache: yes + +- name: Install NVIDIA driver (open kernel module, recommended for data center GPUs) + apt: + name: + - nvidia-driver-550-open + - nvidia-utils-550 + state: present + # No handler is notified here: install.sh reboots the host after the Phase 1 playbook. + +# CUDA Toolkit +- name: Install CUDA Toolkit 12.4 + apt: + name: + - cuda-toolkit-12-4 + - cuda-cudart-12-4 + state: present + +# cuDNN +- name: Add cuDNN repository + apt_repository: + repo: "deb https://developer.download.nvidia.com/compute/cudnn/repos/ubuntu2204/x86_64/ /" + state: present + filename: cudnn + +- name: Install cuDNN 9 for CUDA 12 + apt: + name: + - cudnn9-cuda-12 + state: present + +# Environment variables +- name: Set CUDA paths system-wide + copy: + dest: /etc/profile.d/cuda.sh + content: | + export CUDA_HOME=/usr/local/cuda + export PATH=$CUDA_HOME/bin:$PATH + export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH + mode: "0644" + +- name: Set NVIDIA persistence mode (survives reboots) + copy: + dest: /etc/systemd/system/nvidia-persistenced-mode.service + content: | + [Unit] + Description=NVIDIA Persistence Daemon Mode + After=nvidia-persistenced.service + + [Service] + Type=oneshot + ExecStart=/usr/bin/nvidia-smi -pm 1 + RemainAfterExit=yes + + [Install] + WantedBy=multi-user.target + mode: "0644" + +- name: Enable NVIDIA persistence service + systemd: + name: nvidia-persistenced-mode + enabled: yes + daemon_reload: yes diff --git a/ansible/roles/ollama/tasks/main.yml b/ansible/roles/ollama/tasks/main.yml new file mode 100644 index 0000000..8b982b9 --- /dev/null +++ b/ansible/roles/ollama/tasks/main.yml @@ -0,0 +1,103 @@ 
+--- +# Ollama — local LLM serving (main inference engine for Entry tier) +- name: Check if Ollama is already installed + stat: + path: /usr/local/bin/ollama + register: ollama_binary + +- name: Install Ollama + shell: curl -fsSL https://ollama.ai/install.sh | sh + when: not ollama_binary.stat.exists + retries: 3 + delay: 10 + +- name: Create Ollama systemd service with GPU support + copy: + dest: /etc/systemd/system/ollama.service + content: | + [Unit] + Description=Ollama Service + After=network-online.target + + [Service] + ExecStart=/usr/local/bin/ollama serve + User=cezen + Group=cezen + Restart=always + RestartSec=3 + Environment="PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" + Environment="OLLAMA_HOST=0.0.0.0:11434" + Environment="OLLAMA_MODELS=/opt/cezen/models/ollama" + Environment="CUDA_VISIBLE_DEVICES=0,1,2" + + [Install] + WantedBy=multi-user.target + mode: "0644" + +- name: Create Ollama models directory + file: + path: /opt/cezen/models/ollama + state: directory + owner: cezen + group: cezen + +- name: Enable and start Ollama + systemd: + name: ollama + enabled: yes + state: started + daemon_reload: yes + +- name: Wait for Ollama API to be ready + wait_for: + host: localhost + port: 11434 + timeout: 60 + +- name: Pull default models (Llama 3.1 8B + Mistral 7B) + become_user: cezen + command: ollama pull {{ item }} + loop: + - llama3.1:8b + - mistral:7b + environment: + OLLAMA_HOST: "http://localhost:11434" + retries: 3 + delay: 15 + # NOTE: Models are large (~5GB each). This step takes time on first run. + # Skip by setting: ansible-playbook ... 
-e "skip_model_pull=true" + when: not (skip_model_pull | default(false)) + +# Open WebUI (chat interface on top of Ollama) +- name: Deploy Open WebUI via Docker + community.docker.docker_container: + name: open-webui + image: ghcr.io/open-webui/open-webui:main + state: started + restart_policy: always + ports: + - "3001:8080" + volumes: + - open-webui:/app/backend/data + env: + OLLAMA_BASE_URL: "http://host-gateway:11434" + etc_hosts: + host-gateway: "172.17.0.1" + # Note: Requires docker community collection. Install with: + # ansible-galaxy collection install community.docker + ignore_errors: true # Falls back gracefully if docker collection not available + +- name: Alternative Open WebUI start (if community.docker not available) + shell: | + docker run -d \ + --name open-webui \ + --restart always \ + -p 3001:8080 \ + --add-host=host-gateway:172.17.0.1 \ + -v open-webui:/app/backend/data \ + -e OLLAMA_BASE_URL=http://host-gateway:11434 \ + ghcr.io/open-webui/open-webui:main + args: + executable: /bin/bash + register: webui_result + failed_when: webui_result.rc != 0 and 'already in use' not in webui_result.stderr diff --git a/ansible/roles/vllm/tasks/main.yml b/ansible/roles/vllm/tasks/main.yml new file mode 100644 index 0000000..2e04550 --- /dev/null +++ b/ansible/roles/vllm/tasks/main.yml @@ -0,0 +1,56 @@ +--- +# vLLM — high-performance LLM inference with OpenAI-compatible API +# Entry tier: runs as a Docker container (easier to manage than pip install) +- name: Pull vLLM Docker image + shell: docker pull vllm/vllm-openai:latest + retries: 3 + delay: 15 + +- name: Create vLLM systemd service + copy: + dest: /etc/systemd/system/vllm.service + content: | + [Unit] + Description=vLLM OpenAI-Compatible Inference Server + After=docker.service ollama.service + Requires=docker.service + + [Service] + Restart=always + RestartSec=5 + ExecStartPre=-/usr/bin/docker stop vllm + ExecStartPre=-/usr/bin/docker rm vllm + ExecStart=/usr/bin/docker run \ + --name vllm \ + --gpus 
all \ + --ipc=host \ + -p 8000:8000 \ + -v /opt/cezen/models:/root/.cache/huggingface \ + -e HF_HOME=/root/.cache/huggingface \ + vllm/vllm-openai:latest \ + --model meta-llama/Meta-Llama-3.1-8B-Instruct \ + --gpu-memory-utilization 0.7 \ + --max-model-len 8192 \ + --tensor-parallel-size 1 + ExecStop=/usr/bin/docker stop vllm + + [Install] + WantedBy=multi-user.target + mode: "0644" + +- name: Enable vLLM (but don't start yet — model selection needed first) + systemd: + name: vllm + enabled: yes + daemon_reload: yes + # Note: vLLM service is enabled but not started by default. + # Start manually after choosing a model: + # sudo systemctl start vllm + # Or change the --model flag in /etc/systemd/system/vllm.service first. + +- name: Create vLLM model directory + file: + path: /opt/cezen/models/hf_cache + state: directory + owner: cezen + group: cezen diff --git a/install.sh b/install.sh new file mode 100644 index 0000000..6f8f17c --- /dev/null +++ b/install.sh @@ -0,0 +1,133 @@ +#!/usr/bin/env bash +# ───────────────────────────────────────────── +# Cezen AI Suite — Entry Level Installer +# Usage: +# sudo bash install.sh → Phase 1 (drivers + schedules reboot → Phase 2) +# sudo bash install.sh --phase=2 → Phase 2 (all software, run after reboot) +# ───────────────────────────────────────────── +set -e + +TIER="entry" +PHASE="1" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ANSIBLE_DIR="$SCRIPT_DIR/ansible" + +for arg in "$@"; do + case $arg in + --tier=*) TIER="${arg#*=}" ;; + --phase=*) PHASE="${arg#*=}" ;; + esac +done + +# ── Preflight ────────────────────────────────── +check_root() { + if [ "$EUID" -ne 0 ]; then + echo "ERROR: Run as root: sudo bash install.sh" + exit 1 + fi +} + +check_os() { + if [ -f /etc/os-release ]; then + . /etc/os-release + if [[ "$ID" != "ubuntu" || "$VERSION_ID" != "22.04" ]]; then + echo "ERROR: Ubuntu 22.04 required. Detected: $PRETTY_NAME" + exit 1 + fi + echo "✓ OS: $PRETTY_NAME" + fi +} + +install_ansible() { + if ! 
command -v ansible-playbook &>/dev/null; then + echo "→ Installing Ansible..." + apt-get update -qq + apt-get install -y -qq ansible python3-pip + fi + echo "✓ Ansible ready" +} + +# ── Phase 1: NVIDIA drivers only ────────────── +run_phase1() { + echo "" + echo "╔══════════════════════════════════════════╗" + echo "║ Cezen AI Suite — Phase 1: NVIDIA ║" + echo "╚══════════════════════════════════════════╝" + + ansible-playbook -i localhost, -c local "$ANSIBLE_DIR/phase1_nvidia.yml" \ + -e "tier=$TIER" -v + + # Register phase 2 as a one-shot systemd service so it runs after reboot + cat > /etc/systemd/system/cezen-phase2.service << EOF +[Unit] +Description=Cezen AI Suite Phase 2 Installer +After=network-online.target nvidia-persistenced.service +Wants=network-online.target + +[Service] +Type=oneshot +ExecStart=/bin/bash ${SCRIPT_DIR}/install.sh --phase=2 --tier=${TIER} +RemainAfterExit=yes +StandardOutput=journal+console +StandardError=journal+console + +[Install] +WantedBy=multi-user.target +EOF + + systemctl daemon-reload + systemctl enable cezen-phase2.service + + echo "" + echo "✓ Phase 2 registered — will run automatically after reboot" + echo "→ Rebooting in 10 seconds..." + sleep 10 + reboot +} + +# ── Phase 2: Full stack ──────────────────────── +run_phase2() { + echo "" + echo "╔══════════════════════════════════════════╗" + echo "║ Cezen AI Suite — Phase 2: Stack ║" + echo "╚══════════════════════════════════════════╝" + + # Verify NVIDIA driver loaded + if ! nvidia-smi &>/dev/null; then + echo "WARNING: nvidia-smi not responding. NVIDIA driver may not be loaded." + echo " Continuing — non-GPU roles will still install correctly." 
+ else + echo "✓ NVIDIA driver: $(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -1)" + fi + + ansible-playbook -i localhost, -c local "$ANSIBLE_DIR/entry.yml" \ + -e "tier=$TIER" -v + + # Disable one-shot service so it doesn't run again on next reboot + systemctl disable cezen-phase2.service 2>/dev/null || true + + echo "" + echo "╔══════════════════════════════════════════╗" + echo "║ Cezen AI Suite installation complete! ║" + echo "║ ║" + echo "║ JupyterLab → http://localhost:8888 ║" + echo "║ Ollama API → http://localhost:11434 ║" + echo "║ MLflow → http://localhost:5000 ║" + echo "║ MinIO → http://localhost:9001 ║" + echo "║ Grafana → http://localhost:3000 ║" + echo "╚══════════════════════════════════════════╝" +} + +# ── Main ─────────────────────────────────────── +check_root +check_os +install_ansible + +if [ "$PHASE" = "1" ]; then + run_phase1 +elif [ "$PHASE" = "2" ]; then + run_phase2 +else + echo "ERROR: Unknown phase '$PHASE'. Use --phase=1 or --phase=2" + exit 1 +fi diff --git a/models/pull-models.sh b/models/pull-models.sh new file mode 100644 index 0000000..7840888 --- /dev/null +++ b/models/pull-models.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash +# Pull additional AI models into Ollama +# Run after install: bash models/pull-models.sh --tier=entry +# ───────────────────────────────────────────── +TIER=${1:-entry}; TIER=${TIER#--tier=} # accept both "entry" and the documented "--tier=entry" + +echo "Pulling models for tier: $TIER" + +entry_models=( + "llama3.1:8b" # General purpose, good baseline + "mistral:7b" # Fast, good for APIs + "llama3.1:70b" # Larger — only if enough VRAM (3× L40S has 144GB total) + "nomic-embed-text" # Embedding model for RAG + "codellama:13b" # Code generation +) + +mid_models=( + "${entry_models[@]}" + "llama3.1:70b" + "mixtral:8x7b" + "deepseek-coder-v2:16b" +) + +advanced_models=( + "${mid_models[@]}" + "llama3.1:405b" + "mixtral:8x22b" +) + +case $TIER in + entry) models=("${entry_models[@]}") ;; + mid) models=("${mid_models[@]}") ;; + advanced) models=("${advanced_models[@]}") 
;; + *) echo "Unknown tier: $TIER. Use entry, mid, or advanced."; exit 1 ;; +esac + +for model in "${models[@]}"; do + echo "" + echo "→ Pulling $model..." + ollama pull "$model" || { echo "ERROR: failed to pull $model" >&2; exit 1; } +done + +echo "" +echo "✓ All models pulled. List with: ollama list"