From ec296b3d4201c1963a9937baadf0fbccd2126bae Mon Sep 17 00:00:00 2001 From: Jino Jose Date: Wed, 24 Jun 2026 17:03:59 +0530 Subject: [PATCH] Improve ISO installer non-GPU and bundled setup flow --- ansible/entry.yml | 1 + ansible/roles/docker/tasks/main.yml | 19 +++++++ autoinstall/build-iso.sh | 13 ++++- autoinstall/firstboot-setup.sh | 16 +++--- autoinstall/user-data | 6 ++- autoinstall/websetup/server.py | 83 +++++++++++++++++++++++++---- install.sh | 35 +++++++++--- 7 files changed, 148 insertions(+), 25 deletions(-) diff --git a/ansible/entry.yml b/ansible/entry.yml index 5fbad77..4a47c93 100644 --- a/ansible/entry.yml +++ b/ansible/entry.yml @@ -11,6 +11,7 @@ python_version: "3.11" cuda_version: "12.4" skip_roles: "" # comma-separated list of role names to skip (set by install.sh) + gpu_available: false roles: - role: base diff --git a/ansible/roles/docker/tasks/main.yml b/ansible/roles/docker/tasks/main.yml index d1dfc7c..d64eb20 100644 --- a/ansible/roles/docker/tasks/main.yml +++ b/ansible/roles/docker/tasks/main.yml @@ -34,6 +34,21 @@ enabled: yes state: started +- name: Configure Docker standard runtime for non-GPU installs + copy: + dest: /etc/docker/daemon.json + content: | + { + "log-driver": "json-file", + "log-opts": { + "max-size": "100m", + "max-file": "3" + } + } + mode: "0644" + notify: restart docker + when: not (gpu_available | default(false) | bool) + # NVIDIA Container Toolkit (allows GPU passthrough into containers) - name: Add NVIDIA Container Toolkit repo shell: | @@ -44,16 +59,19 @@ tee /etc/apt/sources.list.d/nvidia-container-toolkit.list args: creates: /etc/apt/sources.list.d/nvidia-container-toolkit.list + when: gpu_available | default(false) | bool - name: Install NVIDIA Container Toolkit apt: name: nvidia-container-toolkit state: present update_cache: yes + when: gpu_available | default(false) | bool - name: Configure Docker to use NVIDIA runtime shell: nvidia-ctk runtime configure --runtime=docker notify: restart docker + when: 
gpu_available | default(false) | bool - name: Set NVIDIA as default Docker runtime copy: @@ -75,3 +93,4 @@ } mode: "0644" notify: restart docker + when: gpu_available | default(false) | bool diff --git a/autoinstall/build-iso.sh b/autoinstall/build-iso.sh index 942c3e8..0bc6e5a 100644 --- a/autoinstall/build-iso.sh +++ b/autoinstall/build-iso.sh @@ -13,6 +13,7 @@ set -e SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PACKAGE_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" WORK_DIR="/tmp/cezen-iso-work" ORIGINAL_ISO="/tmp/ubuntu-22.04.5-live-server-amd64.iso" OUTPUT_ISO="$SCRIPT_DIR/cezen-ai-ubuntu2204.iso" @@ -26,7 +27,7 @@ echo "" # ── Install build tools ──────────────────────── echo "→ Installing build tools..." apt-get update -qq -apt-get install -y -qq xorriso wget isolinux +apt-get install -y -qq xorriso wget isolinux rsync echo "✓ Tools ready" # ── Download Ubuntu ISO ──────────────────────── @@ -55,6 +56,16 @@ cp "$SCRIPT_DIR/user-data" "$WORK_DIR/nocloud/user-data" cp "$SCRIPT_DIR/meta-data" "$WORK_DIR/nocloud/meta-data" echo "✓ user-data and meta-data injected" +# Keep the installer payload on the ISO so first boot does not depend on a +# private Git server being reachable before the setup UI can start. +echo "→ Bundling Cezen AI installer payload..." +mkdir -p "$WORK_DIR/cezen-aipackage" +rsync -a --delete \ + --exclude 'autoinstall/cezen-ai-ubuntu2204.iso' \ + --exclude '*.iso' \ + "$PACKAGE_DIR/" "$WORK_DIR/cezen-aipackage/" +echo "✓ Installer payload bundled" + # ── Patch GRUB ──────────────────────────────── echo "→ Patching GRUB config..." 
GRUB_CFG="$WORK_DIR/boot/grub/grub.cfg" diff --git a/autoinstall/firstboot-setup.sh b/autoinstall/firstboot-setup.sh index a520749..ca1895d 100644 --- a/autoinstall/firstboot-setup.sh +++ b/autoinstall/firstboot-setup.sh @@ -169,18 +169,22 @@ echo "" # Write selected tools to a config file so install.sh can read it mkdir -p /opt/cezen -cat > /opt/cezen/install.conf << EOF -TIER=${TIER} SKIP_ROLES="" -EOF - -# Determine which roles to skip based on tool selection for role in ollama jupyterlab chromadb vllm mlflow minio monitoring k3s; do if ! echo "$TOOLS" | grep -q "$role"; then - sed -i "s/SKIP_ROLES=\"\"/SKIP_ROLES=\"${role}\"/" /opt/cezen/install.conf + if [ -n "$SKIP_ROLES" ]; then + SKIP_ROLES="${SKIP_ROLES},${role}" + else + SKIP_ROLES="${role}" + fi fi done +cat > /opt/cezen/install.conf << EOF +TIER=${TIER} +SKIP_ROLES=${SKIP_ROLES} +EOF + # Mark as configured so this wizard doesn't run again touch /opt/cezen/.setup-done diff --git a/autoinstall/user-data b/autoinstall/user-data index 4dac443..7cbd011 100644 --- a/autoinstall/user-data +++ b/autoinstall/user-data @@ -60,8 +60,10 @@ autoinstall: - echo "cezen ALL=(ALL) NOPASSWD:ALL" > /target/etc/sudoers.d/cezen - chmod 440 /target/etc/sudoers.d/cezen - # Clone the Cezen AI installer - - git clone https://cgit.cezentech.com/jinojose/aipackage.git /target/opt/aipackage || true + # Install the Cezen AI payload from the ISO first. Fall back to Git only + # when building from older media that does not contain /cdrom/cezen-aipackage. + - mkdir -p /target/opt/aipackage + - cp -a /cdrom/cezen-aipackage/. 
/target/opt/aipackage/ || git clone https://cgit.cezentech.com/jinojose/aipackage.git /target/opt/aipackage # Deploy the web setup server - mkdir -p /target/opt/cezen diff --git a/autoinstall/websetup/server.py b/autoinstall/websetup/server.py index c038000..b35558a 100644 --- a/autoinstall/websetup/server.py +++ b/autoinstall/websetup/server.py @@ -36,7 +36,30 @@ def get_interfaces(): except: return ["eth0"] +def has_nvidia_gpu(): + """Detect NVIDIA PCI devices before the driver or nvidia-smi exists.""" + # sysfs lists each PCI device as a symlink, which os.walk() never descends into. + pci_root = "/sys/bus/pci/devices" + try: + for dev in os.listdir(pci_root): + with open(os.path.join(pci_root, dev, "vendor")) as f: + if f.read().strip().lower() == "0x10de": + return True + except OSError: + pass + return False + +def validate_static_network(ip, prefix, gateway, dns): + ipaddress.ip_address(ip) + ipaddress.ip_address(gateway) + ipaddress.ip_address(dns) + prefix_int = int(prefix) + if prefix_int < 1 or prefix_int > 32: + raise ValueError("CIDR prefix must be between 1 and 32") + return str(prefix_int) + def apply_static_ip(iface, ip, prefix, gateway, dns): + prefix = validate_static_network(ip, prefix, gateway, dns) config = f"""network: version: 2 ethernets: @@ -69,9 +92,10 @@ def run_install(tier, skip_tools): open(SETUP_DONE_FILE, "w").close() env = os.environ.copy() - # Phase 1: installs NVIDIA drivers, registers cezen-phase2 systemd service, - # then reboots. Phase 2 (full stack) runs automatically after reboot. - cmd = ["bash", f"{AIPACKAGE_DIR}/install.sh", "--phase=1", f"--tier={tier}"] + # Fresh NVIDIA servers do not have nvidia-smi yet, so detect the PCI + # device and run phase 1 to install drivers before the AI stack. 
+ phase = "1" if has_nvidia_gpu() else "2" + cmd = ["bash", f"{AIPACKAGE_DIR}/install.sh", f"--phase={phase}", f"--tier={tier}"] with open(INSTALL_LOG, "w") as log: proc = subprocess.Popen(cmd, stdout=log, stderr=log, env=env) proc.wait() @@ -525,11 +549,18 @@ function streamLog() { } }; + streamLog.attempts = streamLog.attempts || 0; /* kept on the function: a local would reset to 0 on every reconnect call */ es.onerror = () => { es.close(); - if (!installDone) { - // Connection lost — most likely the server rebooted + if (installDone) return; + const reconnectAttempts = ++streamLog.attempts; + lbl.textContent = `Connection lost — reconnecting... (${reconnectAttempts})`; + if (reconnectAttempts >= 5) { + // After 5 failed reconnects assume it's a real reboot showRebootNotice(); + } else { + // Try reconnecting after a delay + setTimeout(() => { if (!installDone) streamLog(); }, 4000); } }; } @@ -653,14 +684,46 @@ class Handler(BaseHTTPRequestHandler): self.end_headers() # ─── Main ───────────────────────────────────────────────── +def show_console_banner(ip): + """Write the setup URL banner to /dev/tty1 so it appears on the physical console.""" + banner = f""" + +\033[1;36m╔══════════════════════════════════════════════════════╗ +║ ║ +║ CEZEN AI SUITE — SERVER SETUP ║ +║ ║ +║ Open a browser on any computer on this network: ║ +║ ║ +║ \033[1;33m➜ http://{ip:<42}\033[1;36m║ +║ \033[1;33m➜ http://cezenai.local\033[1;36m ║ +║ ║ +║ Complete setup from your browser — no keyboard ║ +║ input needed here. 
║ +║ ║ +╚══════════════════════════════════════════════════════╝\033[0m + +""" + # Write to tty1 (physical console) and stdout (journalctl) + print(banner) + try: + with open("/dev/tty1", "w") as tty: + tty.write(banner) + except Exception: + pass # tty1 may not be accessible in all environments + + # Also update /etc/issue so the URL appears above the login prompt + try: + with open("/etc/issue", "w") as f: + f.write(f"Ubuntu 22.04.5 LTS \\n \\l\n\n") + f.write(f" \033[1;36mCezen AI Suite Setup:\033[0m http://{ip} | http://cezenai.local\n\n") + except Exception: + pass + + if __name__ == "__main__": # Ensure log file exists open(INSTALL_LOG, "a").close() ip = get_ip() - print(f"\n{'='*50}") - print(f" Cezen AI Suite — Setup Server") - print(f" Open in browser: http://{ip}") - print(f" Or: http://cezenai.local") - print(f"{'='*50}\n") + show_console_banner(ip) server = HTTPServer(("0.0.0.0", 80), Handler) server.serve_forever() diff --git a/install.sh b/install.sh index 148028d..622a4f8 100644 --- a/install.sh +++ b/install.sh @@ -52,6 +52,20 @@ install_ansible() { echo "✓ Ansible ready" } +has_nvidia_pci_gpu() { + for vendor_file in /sys/bus/pci/devices/*/vendor; do + [ -f "$vendor_file" ] || continue + if [ "$(tr '[:upper:]' '[:lower:]' < "$vendor_file")" = "0x10de" ]; then + return 0 + fi + done + return 1 +} + +has_working_nvidia_driver() { + command -v nvidia-smi &>/dev/null && nvidia-smi &>/dev/null +} + # ── Phase 1: NVIDIA drivers only ────────────── run_phase1() { echo "" @@ -59,6 +73,13 @@ run_phase1() { echo "║ Cezen AI Suite — Phase 1: NVIDIA ║" echo "╚══════════════════════════════════════════╝" + if ! has_nvidia_pci_gpu; then + echo "No NVIDIA GPU found. Continuing with CPU/non-GPU installation path." 
+ PHASE="2" + run_phase2 + return + fi + ANSIBLE_STDOUT_CALLBACK=yaml \ ansible-playbook -i localhost, -c local "$ANSIBLE_DIR/phase1_nvidia.yml" \ -e "tier=$TIER" @@ -72,7 +93,7 @@ Wants=network-online.target [Service] Type=oneshot -ExecStart=/bin/bash ${SCRIPT_DIR}/install.sh --phase=2 --tier=${TIER} +ExecStart=/bin/bash -lc 'set -o pipefail; /bin/bash ${SCRIPT_DIR}/install.sh --phase=2 --tier=${TIER} 2>&1 | tee -a /var/log/cezen-install.log' RemainAfterExit=yes StandardOutput=journal+console StandardError=journal+console @@ -98,17 +119,19 @@ run_phase2() { echo "║ Cezen AI Suite — Phase 2: Stack ║" echo "╚══════════════════════════════════════════╝" - # Verify NVIDIA driver loaded - if ! nvidia-smi &>/dev/null; then - echo "WARNING: nvidia-smi not responding. NVIDIA driver may not be loaded." - echo " Continuing — non-GPU roles will still install correctly." + GPU_AVAILABLE=false + if ! has_working_nvidia_driver; then + echo "No working NVIDIA GPU/driver found. Continuing with CPU/non-GPU installation path." + echo "GPU-only features such as NVIDIA Docker runtime, DCGM metrics, and vLLM serving will be skipped or left inactive." else + GPU_AVAILABLE=true echo "✓ NVIDIA driver: $(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -1)" fi # Build skip_roles extra var (comma-separated list, empty string = skip nothing) - EXTRA_VARS="tier=$TIER skip_roles=\"$SKIP_ROLES\"" + EXTRA_VARS="tier=$TIER skip_roles=\"$SKIP_ROLES\" gpu_available=$GPU_AVAILABLE" echo "→ Tier: $TIER | Skip: ${SKIP_ROLES:-none}" + echo "→ GPU available: $GPU_AVAILABLE" ANSIBLE_STDOUT_CALLBACK=yaml \ ansible-playbook -i localhost, -c local "$ANSIBLE_DIR/entry.yml" \