# aipackage/ansible/roles/monitoring/tasks/main.yml

---
# Monitoring: Prometheus + Grafana + DCGM Exporter (GPU metrics)
# ignore_errors: true on most tasks — monitoring is optional and should never block the install

# Create the on-disk layout used by the monitoring stack. Every path is owned
# by cezen:cezen. The mode is set explicitly so the resulting permissions do
# not depend on the umask of the user running the play.
- name: Create monitoring directories
  file:
    path: "{{ item }}"
    state: directory
    owner: cezen
    group: cezen
    mode: "0755"
  loop:
    - /opt/cezen/monitoring
    - /opt/cezen/monitoring/prometheus
    - /opt/cezen/monitoring/grafana

# ── DCGM Exporter (GPU metrics for Prometheus) ──────────
# Launch the DCGM exporter with access to all GPUs and publish it on host
# port 9400. Runs only when gpu_available is truthy (defaults to false).
#
# Result handling:
#   - rc == 0                               -> container was created ("changed")
#   - rc != 0, stderr has 'already in use'  -> tolerated, reported as "ok"
#   - any other non-zero rc                 -> failure, ignored so that
#                                              monitoring never blocks the install
# No shell features are needed, so `command` is used instead of `shell`.
- name: Start DCGM Exporter container
  command:
    cmd: >-
      docker run -d
      --name dcgm-exporter
      --restart always
      --gpus all
      -p 9400:9400
      nvcr.io/nvidia/k8s/dcgm-exporter:3.3.0-3.2.0-ubuntu22.04
  register: dcgm_result
  changed_when: dcgm_result.rc == 0
  failed_when: dcgm_result.rc != 0 and 'already in use' not in dcgm_result.stderr
  ignore_errors: true
  when: gpu_available | default(false) | bool

# ── Prometheus ──────────────────────────────────────────
# Render the Prometheus configuration that the Prometheus container
# bind-mounts. Three scrape jobs are defined: Prometheus itself, the DCGM
# exporter on port 9400 and the node exporter on port 9100. The latter two are
# addressed through the 'host-gateway' host name.
#
# The mode is explicit (world-readable) so the file stays readable from inside
# the container even when the play runs under a restrictive umask.
# NOTE(review): assumes the container process does not run as the cezen
# user - confirm against the image in use.
- name: Write Prometheus config
  copy:
    dest: /opt/cezen/monitoring/prometheus/prometheus.yml
    owner: cezen
    group: cezen
    mode: "0644"
    content: |
      global:
        scrape_interval: 15s
        evaluation_interval: 15s

      scrape_configs:
        - job_name: 'prometheus'
          static_configs:
            - targets: ['localhost:9090']

        - job_name: 'dcgm'
          static_configs:
            - targets: ['host-gateway:9400']

        - job_name: 'node'
          static_configs:
            - targets: ['host-gateway:9100']

# Launch Prometheus on host port 9090 with the generated config mounted
# read-only (the container only needs to read it).
#
# 'host-gateway' is mapped to 172.17.0.1 so the scrape targets in the config
# resolve from inside the container.
# NOTE(review): 172.17.0.1 presumably is the default docker0 bridge address;
# verify on hosts with a customised Docker network. The image tag is 'latest'
# and therefore not reproducible - consider pinning a version.
#
# rc == 0 is reported as "changed"; a non-zero rc whose stderr contains
# 'already in use' is tolerated and reported as "ok"; any other failure is
# ignored so that monitoring never blocks the install.
- name: Start Prometheus container
  command:
    cmd: >-
      docker run -d
      --name prometheus
      --restart always
      --add-host=host-gateway:172.17.0.1
      -p 9090:9090
      -v /opt/cezen/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      prom/prometheus:latest
  register: prom_result
  changed_when: prom_result.rc == 0
  failed_when: prom_result.rc != 0 and 'already in use' not in prom_result.stderr
  ignore_errors: true

# ── Node Exporter (CPU/RAM/disk metrics) ───────────────
# Launch the node exporter in the host's network and PID namespaces with the
# host root filesystem mounted read-only at /host (passed to the exporter via
# --path.rootfs).
# NOTE(review): the image tag is 'latest' - consider pinning a version.
#
# rc == 0 is reported as "changed"; a non-zero rc whose stderr contains
# 'already in use' is tolerated and reported as "ok"; any other failure is
# ignored so that monitoring never blocks the install.
- name: Start Node Exporter container
  command:
    cmd: >-
      docker run -d
      --name node-exporter
      --restart always
      --network=host
      --pid=host
      -v /:/host:ro,rslave
      prom/node-exporter:latest
      --path.rootfs=/host
  register: node_exp_result
  changed_when: node_exp_result.rc == 0
  failed_when: node_exp_result.rc != 0 and 'already in use' not in node_exp_result.stderr
  ignore_errors: true

# ── Grafana ─────────────────────────────────────────────
# Launch Grafana on host port 3000 with its data in the 'grafana-storage'
# named volume and self sign-up disabled.
#
# Admin credentials come from the optional variables grafana_admin_user and
# grafana_admin_password; the defaults equal the previously hard-coded values,
# so existing deployments behave the same. Any task that authenticates against
# the Grafana API must use the same values. Override grafana_admin_password
# (e.g. from a vault) for anything but a throw-away install.
#
# The credentials are handed to `docker run` through the task environment and
# forwarded with value-less `-e NAME` flags, which keeps the password out of
# the command line.
#
# rc == 0 is reported as "changed"; a non-zero rc whose stderr contains
# 'already in use' is tolerated and reported as "ok"; any other failure is
# ignored so that monitoring never blocks the install.
- name: Start Grafana container
  command:
    cmd: >-
      docker run -d
      --name grafana
      --restart always
      -p 3000:3000
      --add-host=host-gateway:172.17.0.1
      -v grafana-storage:/var/lib/grafana
      -e GF_SECURITY_ADMIN_USER
      -e GF_SECURITY_ADMIN_PASSWORD
      -e GF_USERS_ALLOW_SIGN_UP=false
      grafana/grafana:latest
  environment:
    GF_SECURITY_ADMIN_USER: "{{ grafana_admin_user | default('admin') }}"
    GF_SECURITY_ADMIN_PASSWORD: "{{ grafana_admin_password | default('cezen2024') }}"
  register: grafana_result
  changed_when: grafana_result.rc == 0
  failed_when: grafana_result.rc != 0 and 'already in use' not in grafana_result.stderr
  ignore_errors: true

# Block until Grafana answers HTTP requests, not merely until port 3000
# accepts TCP connections: a published Docker port can accept connections
# before the application inside the container is serving.
#
# Polls the health endpoint every 5 s, up to 24 times (about the same
# 120 s budget as before). The outcome is registered as grafana_wait and a
# failure is ignored, so later tasks can test `grafana_wait is failed` and
# skip themselves instead of blocking the install.
- name: Wait for Grafana to be ready
  uri:
    url: http://localhost:3000/api/health
    method: GET
    status_code: 200
    timeout: 5
  register: grafana_wait
  until: grafana_wait is succeeded
  retries: 24
  delay: 5
  ignore_errors: true

# Register Prometheus as Grafana's default datasource through the HTTP API.
# Grafana proxies the queries (access: proxy) to http://host-gateway:9090.
#
# Credentials come from the optional variables grafana_admin_user and
# grafana_admin_password; the defaults equal the previously hard-coded values.
# They must match the credentials the Grafana container was started with.
#
# Skipped when the readiness check failed; any error is ignored so that
# monitoring never blocks the install.
- name: Add Prometheus datasource to Grafana
  uri:
    url: http://localhost:3000/api/datasources
    method: POST
    user: "{{ grafana_admin_user | default('admin') }}"
    password: "{{ grafana_admin_password | default('cezen2024') }}"
    force_basic_auth: true
    body_format: json
    body:
      name: Prometheus
      type: prometheus
      url: "http://host-gateway:9090"
      access: proxy
      isDefault: true
    status_code: [200, 409]  # 409 = already exists, that's fine
  ignore_errors: true
  when: not (grafana_wait is failed)

# Fetch the dashboard definition for grafana.com dashboard 12239. Without this
# step the import below would only create an empty dashboard (title and uid,
# no panels). Needs outbound HTTPS; a failure is ignored and simply causes the
# import to be skipped, so monitoring never blocks the install.
- name: Download NVIDIA GPU dashboard (ID 12239) from grafana.com
  uri:
    url: https://grafana.com/api/dashboards/12239/revisions/latest/download
    method: GET
    return_content: true
    status_code: 200
  register: gpu_dashboard
  ignore_errors: true
  when:
    - not (grafana_wait is failed)
    - gpu_available | default(false) | bool

# Import the downloaded dashboard into Grafana's root folder (folderId 0),
# binding its DS_PROMETHEUS input to the datasource named 'Prometheus'.
# The uid is forced to 'nvidia-gpu' and overwrite is enabled, so re-runs
# update the same dashboard instead of creating duplicates.
#
# Credentials come from the optional variables grafana_admin_user and
# grafana_admin_password; the defaults equal the previously hard-coded values.
#
# Runs only when Grafana is reachable, a GPU is present and the download
# above produced JSON. Any error is ignored.
- name: Import NVIDIA GPU dashboard (ID 12239)
  uri:
    url: http://localhost:3000/api/dashboards/import
    method: POST
    user: "{{ grafana_admin_user | default('admin') }}"
    password: "{{ grafana_admin_password | default('cezen2024') }}"
    force_basic_auth: true
    body_format: json
    body:
      inputs:
        - name: DS_PROMETHEUS
          type: datasource
          pluginId: prometheus
          value: Prometheus
      overwrite: true
      folderId: 0
      dashboard: "{{ gpu_dashboard.json | combine({'id': none, 'uid': 'nvidia-gpu'}) }}"
    status_code: [200, 412]
  ignore_errors: true
  when:
    - not (grafana_wait is failed)
    - gpu_available | default(false) | bool
    - gpu_dashboard.json is defined