aipackage/ansible/roles/monitoring/tasks/main.yml

152 lines
4.3 KiB
YAML

---
# Monitoring: Prometheus + Grafana + DCGM Exporter (GPU metrics)
# ignore_errors: true on most tasks — monitoring is optional and should never block the install
# Create the directory tree used by the monitoring stack, owned by cezen.
# The mode is set explicitly so the resulting permissions do not depend on
# the umask of the account Ansible connects as.
- name: Create monitoring directories
  file:
    path: "{{ item }}"
    state: directory
    owner: cezen
    group: cezen
    mode: "0755"
  loop:
    - /opt/cezen/monitoring
    - /opt/cezen/monitoring/prometheus
    - /opt/cezen/monitoring/grafana
# ── DCGM Exporter (GPU metrics for Prometheus) ──────────
# Launch the DCGM exporter container, publishing port 9400 on the host.
# Runs only when gpu_available is truthy (treated as false when undefined).
# argv is used instead of a shell line because no shell features are needed.
- name: Start DCGM Exporter container
  command:
    argv:
      - docker
      - run
      - -d
      - --name
      - dcgm-exporter
      - --restart
      - always
      - --gpus
      - all
      - -p
      - "9400:9400"
      - nvcr.io/nvidia/k8s/dcgm-exporter:3.3.0-3.2.0-ubuntu22.04
  register: dcgm_result
  # Report "changed" only when a container was actually created; a re-run
  # that hits the name conflict below leaves the host untouched.
  changed_when: dcgm_result.rc == 0
  # A non-zero exit whose stderr mentions 'already in use' (container name
  # conflict on re-run) is not treated as a failure.
  failed_when: dcgm_result.rc != 0 and 'already in use' not in dcgm_result.stderr
  # Monitoring is optional: never abort the play because of it.
  ignore_errors: true
  when: gpu_available | default(false) | bool
# ── Prometheus ──────────────────────────────────────────
# Write prometheus.yml with three scrape jobs: Prometheus itself, the DCGM
# exporter (port 9400) and the node exporter (port 9100). The 'host-gateway'
# hostname used in the targets has to be resolvable inside the Prometheus
# container (the container start task supplies it via --add-host).
# The file mode is explicit so readability does not depend on the remote umask.
- name: Write Prometheus config
  copy:
    dest: /opt/cezen/monitoring/prometheus/prometheus.yml
    owner: cezen
    group: cezen
    mode: "0644"
    content: |
      global:
        scrape_interval: 15s
        evaluation_interval: 15s
      scrape_configs:
        - job_name: 'prometheus'
          static_configs:
            - targets: ['localhost:9090']
        - job_name: 'dcgm'
          static_configs:
            - targets: ['host-gateway:9400']
        - job_name: 'node'
          static_configs:
            - targets: ['host-gateway:9100']
# Launch Prometheus, publishing port 9090 and bind-mounting the generated
# prometheus.yml into the container.
# The image defaults to prom/prometheus:latest; set prometheus_image to pin
# a specific release.
- name: Start Prometheus container
  command:
    argv:
      - docker
      - run
      - -d
      - --name
      - prometheus
      - --restart
      - always
      # Maps the hostname 'host-gateway' to 172.17.0.1 inside the container.
      # NOTE(review): presumably the default docker0 bridge address — confirm
      # for hosts with a customized bridge network.
      - "--add-host=host-gateway:172.17.0.1"
      - -p
      - "9090:9090"
      - -v
      # Mounted read-only: the container only needs to read its config.
      - "/opt/cezen/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro"
      - "{{ prometheus_image | default('prom/prometheus:latest') }}"
  register: prom_result
  # Report "changed" only when a container was actually created.
  changed_when: prom_result.rc == 0
  # A name conflict on re-run ('already in use' in stderr) is not a failure.
  failed_when: prom_result.rc != 0 and 'already in use' not in prom_result.stderr
  # Monitoring is optional: never abort the play because of it.
  ignore_errors: true
# ── Node Exporter (CPU/RAM/disk metrics) ───────────────
# Launch the node exporter on the host's network and PID namespaces, with the
# host root filesystem mounted read-only at /host and passed to the exporter
# through --path.rootfs.
# The image defaults to prom/node-exporter:latest; set node_exporter_image to
# pin a specific release.
- name: Start Node Exporter container
  command:
    argv:
      - docker
      - run
      - -d
      - --name
      - node-exporter
      - --restart
      - always
      - --network=host
      - --pid=host
      - -v
      - "/:/host:ro,rslave"
      - "{{ node_exporter_image | default('prom/node-exporter:latest') }}"
      # Everything after the image name is an argument to the exporter itself.
      - --path.rootfs=/host
  register: node_exp_result
  # Report "changed" only when a container was actually created.
  changed_when: node_exp_result.rc == 0
  # A name conflict on re-run ('already in use' in stderr) is not a failure.
  failed_when: node_exp_result.rc != 0 and 'already in use' not in node_exp_result.stderr
  # Monitoring is optional: never abort the play because of it.
  ignore_errors: true
# ── Grafana ─────────────────────────────────────────────
# Launch Grafana, publishing port 3000 and persisting /var/lib/grafana in the
# named volume 'grafana-storage'.
# Admin credentials come from grafana_admin_user / grafana_admin_password and
# fall back to the previous hard-coded values when those are unset; override
# them (for example from Ansible Vault) rather than relying on the defaults.
# Arguments are passed as argv so a password containing shell metacharacters
# is handed to docker verbatim.
- name: Start Grafana container
  command:
    argv:
      - docker
      - run
      - -d
      - --name
      - grafana
      - --restart
      - always
      - -p
      - "3000:3000"
      # Maps the hostname 'host-gateway' to 172.17.0.1 inside the container.
      - "--add-host=host-gateway:172.17.0.1"
      - -v
      - "grafana-storage:/var/lib/grafana"
      - -e
      - "GF_SECURITY_ADMIN_USER={{ grafana_admin_user | default('admin') }}"
      - -e
      - "GF_SECURITY_ADMIN_PASSWORD={{ grafana_admin_password | default('cezen2024') }}"
      - -e
      - "GF_USERS_ALLOW_SIGN_UP=false"
      - "{{ grafana_image | default('grafana/grafana:latest') }}"
  register: grafana_result
  # Report "changed" only when a container was actually created.
  changed_when: grafana_result.rc == 0
  # A name conflict on re-run ('already in use' in stderr) is not a failure.
  failed_when: grafana_result.rc != 0 and 'already in use' not in grafana_result.stderr
  # Monitoring is optional: never abort the play because of it.
  ignore_errors: true
# Poll Grafana's health endpoint until it answers HTTP 200.
# An HTTP check is used instead of a bare TCP port probe because a published
# Docker port can accept connections before the application inside the
# container is serving requests.
# 24 retries x 5 s delay keeps roughly the same 120 s budget as before.
# The result is registered as grafana_wait so later tasks can skip themselves
# when Grafana never came up.
- name: Wait for Grafana to be ready
  uri:
    url: http://localhost:3000/api/health
    status_code: 200
  register: grafana_wait
  until: grafana_wait is succeeded
  retries: 24
  delay: 5
  # Monitoring is optional: never abort the play because of it.
  ignore_errors: true
# Register Prometheus as Grafana's default datasource through the HTTP API.
# Credentials come from grafana_admin_user / grafana_admin_password and fall
# back to the previous hard-coded values when those are unset.
# Skipped when the preceding readiness check (grafana_wait) failed.
- name: Add Prometheus datasource to Grafana
  uri:
    url: http://localhost:3000/api/datasources
    method: POST
    user: "{{ grafana_admin_user | default('admin') }}"
    password: "{{ grafana_admin_password | default('cezen2024') }}"
    force_basic_auth: true
    body_format: json
    body:
      name: Prometheus
      type: prometheus
      # Resolved from inside the Grafana container, hence 'host-gateway'
      # rather than localhost.
      url: "http://host-gateway:9090"
      access: proxy
      isDefault: true
    status_code: [200, 409]  # 409 = already exists, that's fine
  # Monitoring is optional: never abort the play because of it.
  ignore_errors: true
  when: grafana_wait is not failed
# Create the "NVIDIA GPU Overview" dashboard (uid nvidia-gpu) through
# Grafana's dashboard import API, binding DS_PROMETHEUS to the datasource
# named "Prometheus".
# Credentials come from grafana_admin_user / grafana_admin_password and fall
# back to the previous hard-coded values when those are unset.
# Runs only when Grafana became ready and gpu_available is truthy.
# NOTE(review): the dashboard payload below carries only a title and uid (no
# panels), so this creates an empty dashboard rather than the contents of
# grafana.com dashboard 12239 — confirm whether the dashboard JSON should be
# shipped with the role or fetched before import.
- name: Import NVIDIA GPU dashboard (ID 12239)
  uri:
    url: http://localhost:3000/api/dashboards/import
    method: POST
    user: "{{ grafana_admin_user | default('admin') }}"
    password: "{{ grafana_admin_password | default('cezen2024') }}"
    force_basic_auth: true
    body_format: json
    body:
      inputs:
        - name: DS_PROMETHEUS
          type: datasource
          pluginId: prometheus
          value: Prometheus
      overwrite: true
      folderId: 0
      dashboard:
        "__inputs": []
        "__requires": []
        id: null
        title: "NVIDIA GPU Overview"
        uid: "nvidia-gpu"
    # 412 is accepted alongside 200, as in the original task.
    status_code: [200, 412]
  # Monitoring is optional: never abort the play because of it.
  ignore_errors: true
  when:
    - grafana_wait is not failed
    - gpu_available | default(false) | bool