Generate a Monthly GPU Usage Report on Slurm HPC Clusters
#!/usr/bin/env python3
"""
gpu_monthly_usage_slurm.py - Generate a Monthly GPU Usage Report on Slurm HPC Clusters
Description:
------------
This script calculates GPU usage for a specified month on a Slurm-managed
cluster by aggregating GPU-hours for each user. It can read sacct output
directly (and optionally save it) or load pre-saved sacct data from a file.
Only the portion of each job that falls within the specified month is counted
(e.g., for jobs that start before the month begins or end after it does).
The script can also detect the total number of GPUs in a given Slurm partition
(using sinfo) and computes the fraction of GPU-hours consumed relative to total
available GPU-hours in that month.
Usage Examples:
---------------
1) Report usage for January 2025 using the positional `YYYY MM` arguments:
   $ ./gpu_monthly_usage_slurm.py 2025 1
2) Directly query `sacct` for February 2025, saving the `sacct` output to
`output.csv`:
$ ./gpu_monthly_usage_slurm.py -w output.csv 2025 2
3) Read previously saved `sacct` results from `output.csv`:
$ ./gpu_monthly_usage_slurm.py -r output.csv 2025 2
4) Focus on the 'gpu-H100' partition with `-p` or `--partition`:
$ ./gpu_monthly_usage_slurm.py -p gpu-H100 2025 2
Output example:
```
GPU Usage Report for 2025-02
Total GPUs available: 8
Days in month: 28
Total available GPU-hours in 2025-02: 5376.00
Per-User GPU-Hour Usage:
user1 : 4599.73 (99.76%) GPU-hours
user2 : 6.44 (0.14%) GPU-hours
user3 : 4.51 (0.10%) GPU-hours
Overall GPU-Hours Used: 4610.69
Percentage of total: 85.76%
```
Author:
-------
Xabier de Zuazo <[email protected]> (2025-03)
License:
--------
Apache License 2.0
https://www.apache.org/licenses/LICENSE-2.0.txt
"""
import os
import sys
import logging
import argparse
import calendar
import datetime
import subprocess


def run_command_compat(cmd):
"""
Runs a shell command and returns the subprocess.CompletedProcess object.
On Python 3.7+:
Uses capture_output=True and text=True
On older Python:
Uses stdout=PIPE, stderr=PIPE, universal_newlines=True
"""
if sys.version_info >= (3, 7):
# Python 3.7+ supports capture_output and text
result = subprocess.run(
cmd,
capture_output=True,
text=True,
check=True
)
else:
# Fallback for older Python (<3.7)
result = subprocess.run(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
universal_newlines=True,
check=True
)
return result
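# Note: on Python 3.7+, capture_output=True is shorthand for passing
# stdout=subprocess.PIPE and stderr=subprocess.PIPE, and text=True is an
# alias for universal_newlines=True, so both branches behave identically.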


def parse_elapsed_to_hours(elapsed_str):
"""
Parse Slurm's Elapsed format (d-hh:mm:ss or hh:mm:ss) into total hours (float).
Examples:
2-01:15:30 => 2 days, 1 hour, 15 min, 30 sec
03:45:10 => 0 days, 3 hours, 45 min, 10 sec
"""
if "-" in elapsed_str:
days_part, hms_part = elapsed_str.split("-")
days = int(days_part)
else:
days = 0
hms_part = elapsed_str
h, m, s = hms_part.split(":")
hours = int(h) + int(m) / 60 + int(s) / 3600
hours += days * 24
return hours
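# Worked example (illustrative): parse_elapsed_to_hours("2-01:15:30")
# returns 2 * 24 + 1 + 15 / 60 + 30 / 3600 ~= 49.26 hours.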


def parse_gpu_alloc(alloc_tres_str):
"""
Extract the GPU count from the AllocTRES field, e.g.:
'billing=28,cpu=28,gres/gpu=1,node=1'
The GPU count is after 'gres/gpu='.
Returns an integer number of GPUs if found, else 0.
"""
# Example: "billing=28,cpu=28,gres/gpu=1,node=1"
# We look for 'gres/gpu=' substring
gpu_count = 0
entries = alloc_tres_str.split(",")
for entry in entries:
if entry.startswith("gres/gpu="):
# entry is something like 'gres/gpu=1'
# take everything after '='
gpu_str = entry.split("=")[1]
gpu_count = int(gpu_str)
break
return gpu_count
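# Worked example (illustrative):
#   parse_gpu_alloc("billing=28,cpu=28,gres/gpu=4,node=1") returns 4.
# Note: typed TRES entries such as "gres/gpu:a100=4" (seen on some cluster
# configurations) do not match the "gres/gpu=" prefix above and are skipped.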


def get_partition_gpus(partition=None):
"""
Returns the total number of GPUs in all nodes of the given Slurm partition.
Example:
total_gpus = get_partition_gpus("gpu-H100")
print("Partition gpu-H100 has", total_gpus, "GPUs")
Notes:
- If the partition has multiple nodes, we sum the GPUs across them.
- If a node has multiple GPU types in its GRES, we also sum them all.
"""
    sinfo_cmd = [
        "sinfo",
        "-N",
        "-o", "%N|%G",
        "--noheader",  # "-h" is the short form of --noheader, so one suffices
    ]
if partition is not None:
sinfo_cmd += ["--partition", partition]
logging.debug("Command run: %s", " ".join(sinfo_cmd))
try:
result = run_command_compat(sinfo_cmd)
except subprocess.CalledProcessError as e:
# If sinfo fails, raise or handle as needed
raise RuntimeError(f"sinfo failed:\n{e.stderr}")
total_gpus = 0
lines = result.stdout.strip().split("\n")
for line in lines:
# Each line looks like: "node01|gpu:8" or "node01|gpu:h100:8"
if not line.strip():
continue
node_name, gres_str = line.split("|", 1)
# gres_str can have multiple comma-separated items, e.g. "gpu:8,shmem:32G"
# or "gpu:a100:8,gpu:v100:2" on some clusters.
for gres_item in gres_str.split(","):
gres_item = gres_item.strip()
if not gres_item:
continue
# The last chunk might look like "gpu:a100:8(S:0-1)"
# Remove everything after an open parenthesis, if present
gres_item = gres_item.split("(")[0]
# Typical GPU entries look like "gpu:8" or "gpu:a100:8".
# We'll split by ':' and the last field is usually the integer count.
parts = gres_item.split(":")
if parts[0] == "gpu":
# The last piece in 'gpu:a100:8' is '8'
last_part = parts[-1]
# Now parse the remaining text as an integer
try:
count = int(last_part)
total_gpus += count
except ValueError:
# If parsing fails, ignore or handle gracefully
pass
return total_gpus
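# Worked example (illustrative): given two sinfo lines such as
#   node01|gpu:a100:8(S:0-1),shmem:32G
#   node02|gpu:a100:8(S:0-1),shmem:32G
# the parser keeps only the "gpu:..." GRES items, strips the "(S:0-1)" suffix,
# and sums the trailing counts, returning 16.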


def parse_args():
"""Parse command line arguments.
Returns
-------
namespace
The namespace populated with the command line argument values.
"""
parser = argparse.ArgumentParser(
description="Generates GPU usage monthly report for Slurm."
)
parser.add_argument(
"year",
type=int,
help="Year in YYYY format.",
)
parser.add_argument(
"month",
type=int,
help="Month in MM format.",
)
parser.add_argument(
"--partition",
"-p",
default=None,
help="Name of the partition.",
)
parser.add_argument(
"--n_gpus", "--gpus",
"-g",
type=int,
default=None,
help="Number of GPUs in the cluster.",
)
parser.add_argument(
"--write",
"-w",
default=None,
help="File to save the `sacct` command output.",
)
parser.add_argument(
"--read",
"-r",
default=None,
help="File to read the `sacct` command output from.",
)
levels = ("DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL")
parser.add_argument("--log-level", "-l", default="INFO", choices=levels)
args = parser.parse_args()
    return args


def main():
"""Start the program."""
args = parse_args()
logging.basicConfig(level=args.log_level)
year = args.year
month = args.month
partition = args.partition
n_gpus = args.n_gpus
# Compute the first and last day of that month
start_date = datetime.datetime(year, month, 1, 0, 0, 0)
days_in_month = calendar.monthrange(year, month)[1]
end_date = datetime.datetime(year, month, days_in_month, 23, 59, 59)
# Construct time strings for sacct
start_str = start_date.strftime("%Y-%m-%dT%H:%M:%S")
end_str = end_date.strftime("%Y-%m-%dT%H:%M:%S")
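    # For example, 2025-02 yields start_str = "2025-02-01T00:00:00" and
    # end_str = "2025-02-28T23:59:59".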
    if args.read is not None:
        with open(args.read, "r", encoding="utf-8") as fhandle:
            lines = [line.rstrip("\n") for line in fhandle]
else:
# We prepare the sacct command
# --parsable2 gives us a '|' delimiter
        # --format selects User,JobID,JobName,Start,End,Elapsed,AllocTRES
sacct_cmd = [
"sacct",
"-X",
"--parsable2",
"--starttime",
start_str,
"--endtime",
end_str,
"--format=User,JobID,JobName,Start,End,Elapsed,AllocTRES",
"--allusers",
]
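        # Illustrative output line (values assumed), one '|'-delimited row per job:
        # user1|123456|train_model|2025-02-03T09:00:00|2025-02-03T21:00:00|12:00:00|billing=28,cpu=28,gres/gpu=4,node=1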
if partition is not None:
sacct_cmd += ["--partition", partition]
logging.debug("Command run: %s", " ".join(sacct_cmd))
# Call sacct and capture output
try:
result = run_command_compat(sacct_cmd)
except subprocess.CalledProcessError as e:
print("Error running sacct:\n", e.stderr)
sys.exit(1)
lines = result.stdout.strip().split("\n")
# Save the output file
if args.write is not None:
output_dir = os.path.dirname(args.write)
if output_dir:
os.makedirs(output_dir, exist_ok=True)
with open(args.write, "w", encoding="utf-8") as fhandle:
fhandle.write("\n".join(lines) + "\n")
if n_gpus is None:
n_gpus = get_partition_gpus(partition)
    # The columns are: User | JobID | JobName | Start | End | Elapsed | AllocTRES
    # The first line is the header; subsequent lines contain the data.
# Track GPU-hour usage per user
usage_by_user = {}
for line in lines[1:]: # skip header
parts = line.split("|")
        if len(parts) < 7:
continue
user = parts[0]
# jobid = parts[1]
# jobname = parts[2]
start_time_str = parts[3]
end_time_str = parts[4]
elapsed_str = parts[5]
alloc_tres_str = parts[6]
nones = ["None", "Unknown"]
if start_time_str in nones or end_time_str in nones:
            # Jobs without a usable Start/End timestamp (e.g. still running,
            # or cancelled before starting) fall back to the Elapsed field,
            # which is counted in full rather than clipped to the month.
            # Convert elapsed to hours
hours = parse_elapsed_to_hours(elapsed_str)
# Find how many GPUs used
gpu_count = parse_gpu_alloc(alloc_tres_str)
# Compute total GPU-hours for this job
gpu_hours = hours * gpu_count
if user not in usage_by_user:
usage_by_user[user] = 0.0
usage_by_user[user] += gpu_hours
else:
job_start = datetime.datetime.strptime(
start_time_str, "%Y-%m-%dT%H:%M:%S"
)
job_end = datetime.datetime.strptime(
end_time_str, "%Y-%m-%dT%H:%M:%S"
)
# Compute the overlap with the query month range
actual_start = max(job_start, start_date)
actual_end = min(job_end, end_date)
if actual_end <= actual_start:
# No overlap in the month
continue
# Overlap in hours
overlap_hours = (actual_end - actual_start).total_seconds() / 3600.0
# Parse GPU count as before
gpu_count = parse_gpu_alloc(alloc_tres_str)
# Then partial GPU-hours for just the overlapping portion
partial_gpu_hours = overlap_hours * gpu_count
# Accumulate
usage_by_user[user] = usage_by_user.get(user, 0.0) + partial_gpu_hours
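            # Worked example (illustrative): a job on 2 GPUs running from
            # 2025-01-30T12:00:00 to 2025-02-02T12:00:00 is clipped to the
            # 2025-02 window, giving 36 overlap hours * 2 GPUs = 72 GPU-hours.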
# Now compute total available GPU hours
    # (n_gpus is either supplied via --n_gpus or detected from sinfo above.)
total_gpu_hours_month = days_in_month * 24.0 * n_gpus
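    # For example, with 8 GPUs in February 2025: 28 days * 24 h * 8 GPUs
    # = 5376.00 available GPU-hours, as in the sample output above.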
# Calculate total hours
overall_used_gpu_hours = 0.0
for user, gpu_hours in sorted(
usage_by_user.items(), key=lambda x: x[1], reverse=True
):
overall_used_gpu_hours += gpu_hours
# Summarize
print(f"GPU Usage Report for {year}-{month:02d}")
print(f" Total GPUs in node: {n_gpus}")
print(f" Days in month: {days_in_month}")
print((
f" Total available GPU-hours in {year}-{month:02d}: "
f"{total_gpu_hours_month:.2f}"
))
print("")
# Print usage by user
print("Per-User GPU-Hour Usage:")
for user, gpu_hours in sorted(
usage_by_user.items(), key=lambda x: x[1], reverse=True
):
        # Guard against division by zero when no GPU-hours were recorded
        gpu_hours_pct = (
            gpu_hours / overall_used_gpu_hours * 100.0
            if overall_used_gpu_hours > 0 else 0.0
        )
print(
f" {user:20s}: {gpu_hours:10.2f} ({gpu_hours_pct:.2f}%) GPU-hours"
)
# Overall usage fraction
usage_pct = 0.0
if total_gpu_hours_month > 0:
usage_pct = overall_used_gpu_hours / total_gpu_hours_month * 100.0
print("")
print(f"Overall GPU-Hours Used: {overall_used_gpu_hours:.2f}")
print(f"Percentage of total: {usage_pct:.2f}%")
if __name__ == "__main__":
main()