# File: aipackage/ansible/roles/nvidia/tasks/main.yml
# (file-viewer metadata: 82 lines, 2.0 KiB, YAML)

---
# NVIDIA role: Drivers + CUDA + cuDNN
# NOTE: Tested on L40S (Entry) and A40 (lab). Requires reboot after this role.
# If no GPU is present, this role will install drivers but nvidia-smi won't show GPUs.
# Import NVIDIA's repository signing key so apt can verify packages pulled
# from the NVIDIA repositories added later in this file.
# NOTE(review): the apt_key module is deprecated upstream; consider moving to
# a dedicated keyring + `signed-by` for all NVIDIA repos in one change.
- name: Add NVIDIA package repository key
  apt_key:
    state: present
    url: 'https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub'
# Register the CUDA network repository (Ubuntu 22.04, x86_64) under its own
# sources-list file name ("cuda") instead of the distribution default.
- name: Add NVIDIA CUDA apt repository
  apt_repository:
    filename: cuda
    state: present
    repo: 'deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /'
# Refresh the package index so the install tasks can resolve NVIDIA packages.
# NOTE(review): apt_repository refreshes the cache by default when it changes
# a source, so this explicit refresh may be redundant — confirm before removing.
- name: Update apt cache after adding NVIDIA repo
  apt:
    update_cache: true
# Install the 550-series driver (open kernel module flavour) together with
# the matching userspace utilities. A change here notifies the
# "reboot required" handler, which is not defined in this file.
- name: Install NVIDIA driver (open kernel module, recommended for data center GPUs)
  apt:
    state: present
    name:
      - nvidia-driver-550-open
      - nvidia-utils-550
  notify: reboot required
# CUDA Toolkit
# Version-pinned toolkit and CUDA runtime packages (12.4).
- name: Install CUDA Toolkit 12.4
  apt:
    state: present
    name:
      - cuda-toolkit-12-4
      - cuda-cudart-12-4
# cuDNN
# Register the cuDNN repository under its own sources-list file name ("cudnn").
# No separate key task precedes it in this file.
# NOTE(review): presumably verified with the NVIDIA key imported earlier — confirm.
- name: Add cuDNN repository
  apt_repository:
    filename: cudnn
    state: present
    repo: 'deb https://developer.download.nvidia.com/compute/cudnn/repos/ubuntu2204/x86_64/ /'
# Install the cuDNN 9 package built against CUDA 12.
- name: Install cuDNN 9 for CUDA 12
  apt:
    state: present
    name: cudnn9-cuda-12
# Environment variables
# Drop a profile.d snippet that exposes the CUDA install to every login shell.
#
# LD_LIBRARY_PATH is extended with the `${VAR:+:$VAR}` form so that, when the
# variable is unset or empty, no trailing ':' is produced. An empty entry in
# LD_LIBRARY_PATH makes the dynamic loader search the current working
# directory, which is both surprising and a security risk.
# Both assignments are quoted so POSIX sh does not word-split the values.
- name: Set CUDA paths system-wide
  copy:
    dest: /etc/profile.d/cuda.sh
    mode: "0644"
    content: |
      export CUDA_HOME=/usr/local/cuda
      export PATH="$CUDA_HOME/bin:$PATH"
      export LD_LIBRARY_PATH="$CUDA_HOME/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
# Write a oneshot systemd unit that runs `nvidia-smi -pm 1`.
# The unit is ordered after nvidia-persistenced.service and is wanted by
# multi-user.target, so once enabled it runs again on each boot.
- name: Set NVIDIA persistence mode (survives reboots)
  copy:
    dest: /etc/systemd/system/nvidia-persistenced-mode.service
    mode: "0644"
    content: |
      [Unit]
      Description=NVIDIA Persistence Daemon Mode
      After=nvidia-persistenced.service
      [Service]
      Type=oneshot
      ExecStart=/usr/bin/nvidia-smi -pm 1
      RemainAfterExit=yes
      [Install]
      WantedBy=multi-user.target
# Reload systemd's unit definitions and enable the
# nvidia-persistenced-mode unit at boot. No `state` is set, so the unit is
# not started by this task.
- name: Enable NVIDIA persistence service
  systemd:
    daemon_reload: true
    name: nvidia-persistenced-mode
    enabled: true