* Added tool to get a detailed picture of GPU utilization on a SLURM cluster.

Tested on the Wahab cluster.
Wirawan Purwanto
2023-04-06 16:25:04 -04:00
parent 3aa1688f8e
commit dfb9db6a60

slurm/check-gpu-utilization.sh  (Executable file, 41 lines added)

@@ -0,0 +1,41 @@
#!/bin/bash
#
# check-gpu-utilization.sh
# Given a SLURM cluster, list the GPU nodes that are currently in use
# (state "mix" or "alloc"), show their GPU utilization via nvidia-smi,
# and print the specifications of the running GPU jobs.
#
# Created: 2023-04-06

set -eu
# Must be a valid regex
GPU_PARTITIONS='^gpu$'
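# (for example, GPU_PARTITIONS='^gpu' would also match hypothetical
#  partition names such as "gpu-a100" or "gpu-debug")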
# list GPU nodes being utilized (partially/fully)
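# (assumes the default "sinfo -N" column order: NODELIST NODES PARTITION STATE,
#  so $3 is the partition and $4 is the node state; note that sinfo marks the
#  default partition with a trailing "*", which a strict '^gpu$' would not match)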
LIST_GPU_NODES=( $(sinfo -N | awk '($3 ~ /'"$GPU_PARTITIONS"'/) && ($4 ~ /^(mix|alloc)$/) { print $1 }') )
echo "$0"
date
# list all the jobs:
echo "=== LISTING OF ALL GPU JOBS ==="
LIST_GPU_JOBS=$(squeue | awk '$2 ~ /'"$GPU_PARTITIONS"'/ { print }')
echo "$LIST_GPU_JOBS"
echo
echo "=== LISTING OF GPU UTILIZATIONS PER NODE ==="
for Node in "${LIST_GPU_NODES[@]}"; do
echo " :: node: $Node"
ssh "$Node" nvidia-smi
echo
done
echo
echo "=== LISTING OF GPU JOB SPECIFICATIONS ==="
for Job in $(echo "${LIST_GPU_JOBS}" | awk '{ if ($1 != "JOBID") { print($1) } }'); do
#echo " :: Job: $Job"
scontrol show job "$Job"
#echo
done
#squeue | awk '$2 ~ /'"$GPU_PARTITIONS"'/ { print }'
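
A minimal way to try the script from a cluster login node (a sketch only: the path and log-file name below are assumptions, the GPU partition is assumed to be literally named "gpu" so adjust GPU_PARTITIONS otherwise, and passwordless ssh to the compute nodes is required):

    # run from a SLURM login/submit node
    chmod +x slurm/check-gpu-utilization.sh
    ./slurm/check-gpu-utilization.sh | tee gpu-utilization-$(date +%Y%m%d-%H%M).log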