Improve ISO installer non-GPU and bundled setup flow

This commit is contained in:
Jino Jose 2026-06-24 17:03:59 +05:30
parent 82cd52a409
commit ec296b3d42
7 changed files with 148 additions and 25 deletions

View File

@ -11,6 +11,7 @@
python_version: "3.11"
cuda_version: "12.4"
skip_roles: "" # comma-separated list of role names to skip (set by install.sh)
gpu_available: false
roles:
- role: base

View File

@ -34,6 +34,21 @@
enabled: yes
state: started
- name: Configure Docker standard runtime for non-GPU installs
copy:
dest: /etc/docker/daemon.json
content: |
{
"log-driver": "json-file",
"log-opts": {
"max-size": "100m",
"max-file": "3"
}
}
mode: "0644"
notify: restart docker
when: not (gpu_available | default(false) | bool)
# NVIDIA Container Toolkit (allows GPU passthrough into containers)
- name: Add NVIDIA Container Toolkit repo
shell: |
@ -44,16 +59,19 @@
tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
args:
creates: /etc/apt/sources.list.d/nvidia-container-toolkit.list
when: gpu_available | default(false) | bool
- name: Install NVIDIA Container Toolkit
apt:
name: nvidia-container-toolkit
state: present
update_cache: yes
when: gpu_available | default(false) | bool
- name: Configure Docker to use NVIDIA runtime
shell: nvidia-ctk runtime configure --runtime=docker
notify: restart docker
when: gpu_available | default(false) | bool
- name: Set NVIDIA as default Docker runtime
copy:
@ -75,3 +93,4 @@
}
mode: "0644"
notify: restart docker
when: gpu_available | default(false) | bool

View File

@ -13,6 +13,7 @@
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PACKAGE_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
WORK_DIR="/tmp/cezen-iso-work"
ORIGINAL_ISO="/tmp/ubuntu-22.04.5-live-server-amd64.iso"
OUTPUT_ISO="$SCRIPT_DIR/cezen-ai-ubuntu2204.iso"
@ -26,7 +27,7 @@ echo ""
# ── Install build tools ────────────────────────
echo "→ Installing build tools..."
apt-get update -qq
apt-get install -y -qq xorriso wget isolinux
apt-get install -y -qq xorriso wget isolinux rsync
echo "✓ Tools ready"
# ── Download Ubuntu ISO ────────────────────────
@ -55,6 +56,16 @@ cp "$SCRIPT_DIR/user-data" "$WORK_DIR/nocloud/user-data"
cp "$SCRIPT_DIR/meta-data" "$WORK_DIR/nocloud/meta-data"
echo "✓ user-data and meta-data injected"
# Keep the installer payload on the ISO so first boot does not depend on a
# private Git server being reachable before the setup UI can start.
echo "→ Bundling Cezen AI installer payload..."
mkdir -p "$WORK_DIR/cezen-aipackage"
rsync -a --delete \
--exclude 'autoinstall/cezen-ai-ubuntu2204.iso' \
--exclude '*.iso' \
"$PACKAGE_DIR/" "$WORK_DIR/cezen-aipackage/"
echo "✓ Installer payload bundled"
# ── Patch GRUB ────────────────────────────────
echo "→ Patching GRUB config..."
GRUB_CFG="$WORK_DIR/boot/grub/grub.cfg"

View File

@ -169,18 +169,22 @@ echo ""
# Write selected tools to a config file so install.sh can read it
mkdir -p /opt/cezen
cat > /opt/cezen/install.conf << EOF
TIER=${TIER}
SKIP_ROLES=""
EOF
# Determine which roles to skip based on tool selection
for role in ollama jupyterlab chromadb vllm mlflow minio monitoring k3s; do
if ! echo "$TOOLS" | grep -q "$role"; then
sed -i "s/SKIP_ROLES=\"\"/SKIP_ROLES=\"${role}\"/" /opt/cezen/install.conf
if [ -n "$SKIP_ROLES" ]; then
SKIP_ROLES="${SKIP_ROLES},${role}"
else
SKIP_ROLES="${role}"
fi
fi
done
cat > /opt/cezen/install.conf << EOF
TIER=${TIER}
SKIP_ROLES=${SKIP_ROLES}
EOF
# Mark as configured so this wizard doesn't run again
touch /opt/cezen/.setup-done

View File

@ -60,8 +60,10 @@ autoinstall:
- echo "cezen ALL=(ALL) NOPASSWD:ALL" > /target/etc/sudoers.d/cezen
- chmod 440 /target/etc/sudoers.d/cezen
# Clone the Cezen AI installer
- git clone https://cgit.cezentech.com/jinojose/aipackage.git /target/opt/aipackage || true
# Install the Cezen AI payload from the ISO first. Fall back to Git only
# when installing from older media that does not contain /cdrom/cezen-aipackage.
- mkdir -p /target/opt/aipackage
- cp -a /cdrom/cezen-aipackage/. /target/opt/aipackage/ || git clone https://cgit.cezentech.com/jinojose/aipackage.git /target/opt/aipackage
# Deploy the web setup server
- mkdir -p /target/opt/cezen

View File

@ -36,7 +36,30 @@ def get_interfaces():
except:
return ["eth0"]
def has_nvidia_gpu(pci_root="/sys/bus/pci/devices"):
    """Detect NVIDIA PCI devices before the driver or nvidia-smi exists.

    Every entry under ``pci_root`` is treated as one PCI device directory
    that may hold a ``vendor`` file with the vendor ID in hex. The entries
    are listed directly rather than traversed with ``os.walk``: in sysfs the
    per-device entries are symlinks, and ``os.walk`` does not descend into
    symlinked directories by default, so a walk never reaches the ``vendor``
    files and would always report "no GPU".

    Args:
        pci_root: Directory containing one entry per PCI device. Defaults to
            the sysfs PCI device directory.

    Returns:
        True if any device reports vendor ID 0x10de (NVIDIA), otherwise
        False. Also False when ``pci_root`` cannot be listed.
    """
    try:
        devices = os.listdir(pci_root)
    except OSError:
        # No PCI tree to inspect: treat as "no GPU" (best effort).
        return False
    for device in devices:
        try:
            with open(os.path.join(pci_root, device, "vendor")) as f:
                if f.read().strip().lower() == "0x10de":
                    return True
        except (OSError, ValueError):
            # Entry without a readable vendor file; keep scanning the rest.
            continue
    return False
def validate_static_network(ip, prefix, gateway, dns):
    """Validate static network settings and return the prefix as a string.

    Args:
        ip: Host address; must parse as an IP address.
        prefix: CIDR prefix length, anything ``int()`` accepts.
        gateway: Gateway address; must parse as an IP address.
        dns: DNS server address; must parse as an IP address.

    Returns:
        The prefix length converted to ``int`` and back to ``str``.

    Raises:
        ValueError: If any address fails to parse, the prefix is not an
            integer, or the prefix is outside 1..32.
    """
    # Checked in the same order as the arguments are listed: ip, gateway, dns.
    for address in (ip, gateway, dns):
        ipaddress.ip_address(address)
    prefix_length = int(prefix)
    if not 1 <= prefix_length <= 32:
        raise ValueError("CIDR prefix must be between 1 and 32")
    return str(prefix_length)
def apply_static_ip(iface, ip, prefix, gateway, dns):
prefix = validate_static_network(ip, prefix, gateway, dns)
config = f"""network:
version: 2
ethernets:
@ -69,9 +92,10 @@ def run_install(tier, skip_tools):
open(SETUP_DONE_FILE, "w").close()
env = os.environ.copy()
# Phase 1: installs NVIDIA drivers, registers cezen-phase2 systemd service,
# then reboots. Phase 2 (full stack) runs automatically after reboot.
cmd = ["bash", f"{AIPACKAGE_DIR}/install.sh", "--phase=1", f"--tier={tier}"]
# Fresh NVIDIA servers do not have nvidia-smi yet, so detect the PCI
# device and run phase 1 to install drivers before the AI stack.
phase = "1" if has_nvidia_gpu() else "2"
cmd = ["bash", f"{AIPACKAGE_DIR}/install.sh", f"--phase={phase}", f"--tier={tier}"]
with open(INSTALL_LOG, "w") as log:
proc = subprocess.Popen(cmd, stdout=log, stderr=log, env=env)
proc.wait()
@ -525,11 +549,18 @@ function streamLog() {
}
};
// A successful connection clears the failure count, so only consecutive
// failed reconnects count toward the reboot decision below.
es.addEventListener("open", () => { streamLog.reconnectAttempts = 0; });
es.onerror = () => {
  es.close();
  if (installDone) return;
  // The counter is stored on the streamLog function itself. Each reconnect
  // calls streamLog() again, so a `let` declared inside it would restart at 0
  // on every attempt and the limit below could never be reached.
  const attempts = (streamLog.reconnectAttempts || 0) + 1;
  streamLog.reconnectAttempts = attempts;
  lbl.textContent = `Connection lost reconnecting... (${attempts})`;
  if (attempts >= 5) {
    // After 5 failed reconnects assume it's a real reboot
    showRebootNotice();
  } else {
    // Try reconnecting after a delay
    setTimeout(() => { if (!installDone) streamLog(); }, 4000);
  }
};
}
@ -653,14 +684,46 @@ class Handler(BaseHTTPRequestHandler):
self.end_headers()
# ─── Main ─────────────────────────────────────────────────
def show_console_banner(ip):
"""Write the setup URL banner to /dev/tty1 so it appears on the physical console.

The same banner is also printed to stdout, and /etc/issue is overwritten
with a short line carrying the setup URLs. Failures while writing
/dev/tty1 or /etc/issue are ignored.

Args:
ip: Address interpolated into the displayed http:// URLs.
"""
# {ip:<42} left-aligns the address in a 42-character field; the \033[...m
# sequences are ANSI colour escapes embedded in the text.
banner = f"""
\033[1;36m
CEZEN AI SUITE SERVER SETUP
Open a browser on any computer on this network:
\033[1;33m http://{ip:<42}\033[1;36m
\033[1;33m http://cezenai.local\033[1;36m
Complete setup from your browser no keyboard
input needed here.
\033[0m
"""
# Write to tty1 (physical console) and stdout (journalctl)
print(banner)
try:
with open("/dev/tty1", "w") as tty:
tty.write(banner)
except Exception:
pass # tty1 may not be accessible in all environments
# Also update /etc/issue so the URL appears above the login prompt
# Mode "w" truncates the file, so any previous /etc/issue content is replaced.
try:
with open("/etc/issue", "w") as f:
# The doubled backslashes put a literal "\n" and "\l" into the file
# (presumably issue-file escapes expanded by getty - TODO confirm).
f.write(f"Ubuntu 22.04.5 LTS \\n \\l\n\n")
f.write(f" \033[1;36mCezen AI Suite Setup:\033[0m http://{ip} | http://cezenai.local\n\n")
except Exception:
# Best effort: a failure to write /etc/issue is deliberately ignored.
pass
if __name__ == "__main__":
# Ensure log file exists
open(INSTALL_LOG, "a").close()
ip = get_ip()
print(f"\n{'='*50}")
print(f" Cezen AI Suite — Setup Server")
print(f" Open in browser: http://{ip}")
print(f" Or: http://cezenai.local")
print(f"{'='*50}\n")
show_console_banner(ip)
server = HTTPServer(("0.0.0.0", 80), Handler)
server.serve_forever()

View File

@ -52,6 +52,20 @@ install_ansible() {
echo "✓ Ansible ready"
}
# Report whether any PCI device listed in sysfs carries the NVIDIA vendor ID.
# Returns 0 as soon as a vendor file reads 0x10de (compared case-insensitively),
# and 1 when no vendor file matches.
has_nvidia_pci_gpu() {
    for vendor_file in /sys/bus/pci/devices/*/vendor; do
        # Skip anything that is not a regular file (e.g. the literal glob
        # pattern when nothing matched).
        if [ ! -f "$vendor_file" ]; then
            continue
        fi
        case "$(tr '[:upper:]' '[:lower:]' < "$vendor_file")" in
            0x10de) return 0 ;;
        esac
    done
    return 1
}
# Succeed only when nvidia-smi is on PATH and itself exits successfully.
# All output from both checks is discarded.
has_working_nvidia_driver() {
    if ! command -v nvidia-smi >/dev/null 2>&1; then
        return 1
    fi
    nvidia-smi >/dev/null 2>&1
}
# ── Phase 1: NVIDIA drivers only ──────────────
run_phase1() {
echo ""
@ -59,6 +73,13 @@ run_phase1() {
echo "║ Cezen AI Suite — Phase 1: NVIDIA ║"
echo "╚══════════════════════════════════════════╝"
if ! has_nvidia_pci_gpu; then
echo "No NVIDIA GPU found. Continuing with CPU/non-GPU installation path."
PHASE="2"
run_phase2
return
fi
ANSIBLE_STDOUT_CALLBACK=yaml \
ansible-playbook -i localhost, -c local "$ANSIBLE_DIR/phase1_nvidia.yml" \
-e "tier=$TIER"
@ -72,7 +93,7 @@ Wants=network-online.target
[Service]
Type=oneshot
ExecStart=/bin/bash ${SCRIPT_DIR}/install.sh --phase=2 --tier=${TIER}
ExecStart=/bin/bash -lc 'set -o pipefail; /bin/bash ${SCRIPT_DIR}/install.sh --phase=2 --tier=${TIER} 2>&1 | tee -a /var/log/cezen-install.log'
RemainAfterExit=yes
StandardOutput=journal+console
StandardError=journal+console
@ -98,17 +119,19 @@ run_phase2() {
echo "║ Cezen AI Suite — Phase 2: Stack ║"
echo "╚══════════════════════════════════════════╝"
# Verify NVIDIA driver loaded
if ! nvidia-smi &>/dev/null; then
echo "WARNING: nvidia-smi not responding. NVIDIA driver may not be loaded."
echo " Continuing — non-GPU roles will still install correctly."
GPU_AVAILABLE=false
if ! has_working_nvidia_driver; then
echo "No working NVIDIA GPU/driver found. Continuing with CPU/non-GPU installation path."
echo "GPU-only features such as NVIDIA Docker runtime, DCGM metrics, and vLLM serving will be skipped or left inactive."
else
GPU_AVAILABLE=true
echo "✓ NVIDIA driver: $(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -1)"
fi
# Build skip_roles extra var (comma-separated list, empty string = skip nothing)
EXTRA_VARS="tier=$TIER skip_roles=\"$SKIP_ROLES\""
EXTRA_VARS="tier=$TIER skip_roles=\"$SKIP_ROLES\" gpu_available=$GPU_AVAILABLE"
echo "→ Tier: $TIER | Skip: ${SKIP_ROLES:-none}"
echo "→ GPU available: $GPU_AVAILABLE"
ANSIBLE_STDOUT_CALLBACK=yaml \
ansible-playbook -i localhost, -c local "$ANSIBLE_DIR/entry.yml" \