---
# Monitoring: Prometheus + Grafana + DCGM Exporter (GPU metrics) + Node Exporter (host metrics)
# ignore_errors: true is set on most tasks — monitoring is optional and must never block the install.
|
|
|
|
# Create the directory tree that holds the monitoring stack's configuration.
# Every directory is owned by the cezen user and group.
- name: Create monitoring directories
  file:
    path: "{{ item }}"
    state: directory
    owner: cezen
    group: cezen
    # Explicit mode so the resulting permissions do not depend on the remote umask.
    mode: "0755"
  loop:
    - /opt/cezen/monitoring
    - /opt/cezen/monitoring/prometheus
    - /opt/cezen/monitoring/grafana
|
|
|
|
# ── DCGM Exporter (GPU metrics for Prometheus) ──────────
|
|
# Launch the DCGM exporter with access to all GPUs and publish container
# port 9400 on the host (the port the Prometheus 'dcgm' job scrapes).
# `docker run` exits non-zero when a container with this name already exists;
# failed_when tolerates that case so re-running the play is harmless, and
# ignore_errors keeps any other failure from blocking the install.
- name: Start DCGM Exporter container
  shell: |
    docker run -d \
      --name dcgm-exporter \
      --restart always \
      --gpus all \
      -p 9400:9400 \
      nvcr.io/nvidia/k8s/dcgm-exporter:3.3.0-3.2.0-ubuntu22.04
  register: dcgm_result
  # Report "changed" only when a container was actually created.
  changed_when: dcgm_result.rc == 0
  failed_when: dcgm_result.rc != 0 and 'already in use' not in dcgm_result.stderr
  ignore_errors: true
|
|
|
|
# ── Prometheus ──────────────────────────────────────────
|
|
# Render the Prometheus configuration file onto the host.
# Three scrape jobs are defined: Prometheus itself (localhost:9090), the DCGM
# exporter (host-gateway:9400) and the node exporter (host-gateway:9100).
# 'host-gateway' is a hostname that must be resolvable from wherever
# Prometheus runs.
- name: Write Prometheus config
  copy:
    dest: /opt/cezen/monitoring/prometheus/prometheus.yml
    owner: cezen
    group: cezen
    # Explicit mode: the file is meant to be read by the Prometheus process,
    # which may not run as the cezen user, so keep it world-readable instead
    # of relying on the remote umask.
    mode: "0644"
    content: |
      global:
        scrape_interval: 15s
        evaluation_interval: 15s

      scrape_configs:
        - job_name: 'prometheus'
          static_configs:
            - targets: ['localhost:9090']

        - job_name: 'dcgm'
          static_configs:
            - targets: ['host-gateway:9400']

        - job_name: 'node'
          static_configs:
            - targets: ['host-gateway:9100']
|
|
|
|
# Launch Prometheus with the host-side config file mounted into the container
# and the web/API port 9090 published on the host.
# --add-host maps the name 'host-gateway' to 172.17.0.1 inside the container
# (presumably the default docker0 bridge address; verify on hosts that use a
# custom bridge network).
# NOTE(review): the image is tracked at :latest, so the deployed version can
# change between runs; consider pinning a tag.
# An "already in use" name conflict is tolerated so re-running the play is
# harmless; ignore_errors keeps any other failure from blocking the install.
- name: Start Prometheus container
  shell: |
    docker run -d \
      --name prometheus \
      --restart always \
      --add-host=host-gateway:172.17.0.1 \
      -p 9090:9090 \
      -v /opt/cezen/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro \
      prom/prometheus:latest
  register: prom_result
  # Report "changed" only when a container was actually created.
  changed_when: prom_result.rc == 0
  failed_when: prom_result.rc != 0 and 'already in use' not in prom_result.stderr
  ignore_errors: true
|
|
|
|
# ── Node Exporter (CPU/RAM/disk metrics) ───────────────
|
|
# Launch the Prometheus node exporter for host-level metrics.
# The container shares the host network and PID namespaces and sees the host
# root filesystem read-only at /host, which --path.rootfs points the exporter at.
# An "already in use" name conflict is tolerated so re-running the play is
# harmless; ignore_errors keeps any other failure from blocking the install.
- name: Start Node Exporter container
  shell: |
    docker run -d \
      --name node-exporter \
      --restart always \
      --network=host \
      --pid=host \
      -v /:/host:ro,rslave \
      prom/node-exporter:latest \
      --path.rootfs=/host
  register: node_exp_result
  # Report "changed" only when a container was actually created.
  changed_when: node_exp_result.rc == 0
  failed_when: node_exp_result.rc != 0 and 'already in use' not in node_exp_result.stderr
  ignore_errors: true
|
|
|
|
# ── Grafana ─────────────────────────────────────────────
|
|
# Launch Grafana with its data kept in the named Docker volume
# 'grafana-storage' and the UI/API published on host port 3000.
# --add-host maps 'host-gateway' to 172.17.0.1 inside the container
# (presumably the default docker0 bridge address; verify on custom networks).
# The admin password comes from the optional variable grafana_admin_password
# and falls back to the previous hard-coded value when the variable is unset,
# so existing deployments behave exactly as before.
# NOTE(review): Grafana may apply GF_SECURITY_ADMIN_PASSWORD only when its
# database is first initialised; confirm before relying on this to rotate the
# password of an existing 'grafana-storage' volume.
# An "already in use" name conflict is tolerated so re-running the play is
# harmless; ignore_errors keeps any other failure from blocking the install.
- name: Start Grafana container
  shell: |
    docker run -d \
      --name grafana \
      --restart always \
      -p 3000:3000 \
      --add-host=host-gateway:172.17.0.1 \
      -v grafana-storage:/var/lib/grafana \
      -e GF_SECURITY_ADMIN_USER=admin \
      -e GF_SECURITY_ADMIN_PASSWORD={{ grafana_admin_password | default('cezen2024') | quote }} \
      -e GF_USERS_ALLOW_SIGN_UP=false \
      grafana/grafana:latest
  register: grafana_result
  # Report "changed" only when a container was actually created.
  changed_when: grafana_result.rc == 0
  failed_when: grafana_result.rc != 0 and 'already in use' not in grafana_result.stderr
  ignore_errors: true
|
|
|
|
# Block until Grafana's HTTP API answers, for at most about 60 seconds
# (12 attempts, 5 seconds apart).
# A plain TCP check on port 3000 is not sufficient: with a published port,
# Docker's port forwarding can accept connections before Grafana inside the
# container is serving requests. Polling the health endpoint confirms the API
# itself is up.
# ignore_errors keeps a slow or failed Grafana start from blocking the install.
- name: Wait for Grafana to be ready
  uri:
    url: http://localhost:3000/api/health
    method: GET
    status_code: 200
  register: grafana_health
  # 'status' may be missing or -1 while the connection is refused or reset.
  until: (grafana_health.status | default(0)) == 200
  retries: 12
  delay: 5
  ignore_errors: true
|
|
|
|
# Register Prometheus as Grafana's default datasource through the HTTP API.
# Grafana reaches Prometheus server-side (access: proxy) at host-gateway:9090.
# The admin password comes from the optional variable grafana_admin_password
# and falls back to the previous hard-coded value when the variable is unset.
# ignore_errors keeps a failure here from blocking the install.
- name: Add Prometheus datasource to Grafana
  uri:
    url: http://localhost:3000/api/datasources
    method: POST
    user: admin
    password: "{{ grafana_admin_password | default('cezen2024') }}"
    force_basic_auth: true
    body_format: json
    body:
      name: Prometheus
      type: prometheus
      url: "http://host-gateway:9090"
      access: proxy
      isDefault: true
    status_code: [200, 409]  # 409 = already exists, that's fine
  ignore_errors: true
|
|
|
|
# Post a dashboard titled "NVIDIA GPU Overview" (uid nvidia-gpu) to Grafana's
# import endpoint, binding the DS_PROMETHEUS input to the datasource named
# "Prometheus" and overwriting any existing dashboard.
# NOTE(review): despite the task name, the payload below defines no panels and
# does not download dashboard 12239 from grafana.com; as written it appears to
# create an empty dashboard. Confirm whether the real dashboard JSON should be
# fetched and embedded here.
# The admin password comes from the optional variable grafana_admin_password
# and falls back to the previous hard-coded value when the variable is unset.
# Status 412 is accepted alongside 200 (presumably a precondition conflict with
# an existing dashboard; verify against the Grafana API docs).
# ignore_errors keeps a failure here from blocking the install.
- name: Import NVIDIA GPU dashboard (ID 12239)
  uri:
    url: http://localhost:3000/api/dashboards/import
    method: POST
    user: admin
    password: "{{ grafana_admin_password | default('cezen2024') }}"
    force_basic_auth: true
    body_format: json
    body:
      inputs:
        - name: DS_PROMETHEUS
          type: datasource
          pluginId: prometheus
          value: Prometheus
      overwrite: true
      folderId: 0
      dashboard:
        "__inputs": []
        "__requires": []
        id: null
        title: "NVIDIA GPU Overview"
        uid: "nvidia-gpu"
    status_code: [200, 412]
  ignore_errors: true
|