---
# Monitoring: Prometheus + Grafana + DCGM Exporter (GPU metrics)
#
# Most tasks set `ignore_errors: true` — monitoring is optional and should
# never block the install.
#
# Optional variables read by this file:
#   gpu_available           truthy to deploy the GPU-specific pieces
#                           (treated as false when undefined)
#   grafana_admin_password  Grafana admin password; falls back to the
#                           historical default 'cezen2024' when undefined.
#                           Override it in real deployments.
#
# The `docker run` tasks are made re-runnable by tolerating Docker's
# "already in use" container-name conflict, and only report "changed" when a
# container was actually created (rc == 0).

- name: Create monitoring directories
  file:
    path: "{{ item }}"
    state: directory
    owner: cezen
    group: cezen
    mode: "0755"
  loop:
    - /opt/cezen/monitoring
    - /opt/cezen/monitoring/prometheus
    - /opt/cezen/monitoring/grafana

# ── DCGM Exporter (GPU metrics for Prometheus) ──────────
- name: Start DCGM Exporter container
  shell: |
    docker run -d \
      --name dcgm-exporter \
      --restart always \
      --gpus all \
      -p 9400:9400 \
      nvcr.io/nvidia/k8s/dcgm-exporter:3.3.0-3.2.0-ubuntu22.04
  register: dcgm_result
  # A name conflict means the container already exists — not a failure.
  failed_when: >-
    dcgm_result.rc != 0
    and 'already in use' not in dcgm_result.stderr
  changed_when: dcgm_result.rc == 0
  ignore_errors: true
  when: gpu_available | default(false) | bool

# ── Prometheus ──────────────────────────────────────────
- name: Write Prometheus config
  copy:
    dest: /opt/cezen/monitoring/prometheus/prometheus.yml
    owner: cezen
    group: cezen
    # World-readable so the process inside the container can read the
    # bind-mounted file regardless of the UID it runs as.
    mode: "0644"
    content: |
      global:
        scrape_interval: 15s
        evaluation_interval: 15s

      scrape_configs:
        - job_name: 'prometheus'
          static_configs:
            - targets: ['localhost:9090']

        - job_name: 'dcgm'
          static_configs:
            - targets: ['host-gateway:9400']

        - job_name: 'node'
          static_configs:
            - targets: ['host-gateway:9100']

- name: Start Prometheus container
  # `host-gateway` is the hostname used by the scrape targets above; it is
  # pinned to 172.17.0.1 here.
  # NOTE(review): presumably the default docker0 bridge address — confirm on
  # hosts that configure a custom bridge IP.
  shell: |
    docker run -d \
      --name prometheus \
      --restart always \
      --add-host=host-gateway:172.17.0.1 \
      -p 9090:9090 \
      -v /opt/cezen/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml \
      prom/prometheus:latest
  register: prom_result
  failed_when: >-
    prom_result.rc != 0
    and 'already in use' not in prom_result.stderr
  changed_when: prom_result.rc == 0
  ignore_errors: true

# ── Node Exporter (CPU/RAM/disk metrics) ───────────────
- name: Start Node Exporter container
  # Runs in the host network and PID namespaces with the host root mounted
  # read-only at /host, which is passed to the exporter as --path.rootfs.
  shell: |
    docker run -d \
      --name node-exporter \
      --restart always \
      --network=host \
      --pid=host \
      -v /:/host:ro,rslave \
      prom/node-exporter:latest \
      --path.rootfs=/host
  register: node_exp_result
  failed_when: >-
    node_exp_result.rc != 0
    and 'already in use' not in node_exp_result.stderr
  changed_when: node_exp_result.rc == 0
  ignore_errors: true

# ── Grafana ─────────────────────────────────────────────
- name: Start Grafana container
  # The password is shell-quoted because it is interpolated into a shell
  # command line; with the default value the rendered command is unchanged.
  shell: |
    docker run -d \
      --name grafana \
      --restart always \
      -p 3000:3000 \
      --add-host=host-gateway:172.17.0.1 \
      -v grafana-storage:/var/lib/grafana \
      -e GF_SECURITY_ADMIN_USER=admin \
      -e GF_SECURITY_ADMIN_PASSWORD={{ grafana_admin_password | default('cezen2024') | quote }} \
      -e GF_USERS_ALLOW_SIGN_UP=false \
      grafana/grafana:latest
  register: grafana_result
  failed_when: >-
    grafana_result.rc != 0
    and 'already in use' not in grafana_result.stderr
  changed_when: grafana_result.rc == 0
  ignore_errors: true
  # Keep the admin password out of task output and logs. Remove temporarily
  # when debugging a failing Grafana start.
  no_log: true

- name: Wait for Grafana to be ready
  wait_for:
    host: localhost
    port: 3000
    timeout: 120
  register: grafana_wait
  ignore_errors: true

- name: Add Prometheus datasource to Grafana
  uri:
    url: http://localhost:3000/api/datasources
    method: POST
    user: admin
    password: "{{ grafana_admin_password | default('cezen2024') }}"
    force_basic_auth: true
    body_format: json
    body:
      name: Prometheus
      type: prometheus
      url: "http://host-gateway:9090"
      access: proxy
      isDefault: true
    status_code: [200, 409]  # 409 = already exists, that's fine
  ignore_errors: true
  when: not (grafana_wait is failed)

- name: Import NVIDIA GPU dashboard (ID 12239)
  # NOTE(review): the payload below defines only a title and uid, with no
  # panels, and nothing here downloads dashboard 12239 — so this appears to
  # create an empty placeholder dashboard. Confirm whether the real
  # dashboard JSON should be supplied.
  uri:
    url: http://localhost:3000/api/dashboards/import
    method: POST
    user: admin
    password: "{{ grafana_admin_password | default('cezen2024') }}"
    force_basic_auth: true
    body_format: json
    body:
      inputs:
        - name: DS_PROMETHEUS
          type: datasource
          pluginId: prometheus
          value: Prometheus
      overwrite: true
      folderId: 0
      dashboard:
        "__inputs": []
        "__requires": []
        id: null
        title: "NVIDIA GPU Overview"
        uid: "nvidia-gpu"
    status_code: [200, 412]
  ignore_errors: true
  when:
    - not (grafana_wait is failed)
    - gpu_available | default(false) | bool