Generate a Monthly GPU Usage Report on Slurm HPC Clusters
#!/usr/bin/env python3
"""
gpu_monthly_usage_slurm.py - Generate a Monthly GPU Usage Report on Slurm HPC Clusters

Description:
------------
This script calculates GPU usage for a specified month on a Slurm-managed
cluster by aggregating GPU-hours for each user. It can read sacct output
directly (and optionally save it) or load pre-saved sacct data from a file.
The script counts only the usage that falls within the specified month
(e.g., for jobs that start before the month or end after it).

The script can also detect the total number of GPUs in a given Slurm partition
(using sinfo) and computes the fraction of GPU-hours consumed relative to the
total available GPU-hours in that month.

Usage Examples:
---------------
1) Report one month's usage, passing the month in `YYYY` `MM` format:
    $ ./gpu_monthly_usage_slurm.py 2025 1
2) Directly query `sacct` for February 2025, saving the `sacct` output to
   `output.csv`:
    $ ./gpu_monthly_usage_slurm.py -w output.csv 2025 2
3) Read previously saved `sacct` results from `output.csv`:
    $ ./gpu_monthly_usage_slurm.py -r output.csv 2025 2
4) Focus on the 'gpu-H100' partition with `-p` or `--partition`:
    $ ./gpu_monthly_usage_slurm.py -p gpu-H100 2025 2

Output example:
```
GPU Usage Report for 2025-02
  Total GPUs in node: 8
  Days in month: 28
  Total available GPU-hours in 2025-02: 5376.00

Per-User GPU-Hour Usage:
  user1               :    4599.73 (99.76%) GPU-hours
  user2               :       6.44 (0.14%) GPU-hours
  user3               :       4.51 (0.10%) GPU-hours

Overall GPU-Hours Used: 4610.69
Percentage of total: 85.76%
```

Author:
-------
Xabier de Zuazo <[email protected]> (2025-03)

License:
--------
Apache License 2.0
https://www.apache.org/licenses/LICENSE-2.0.txt
"""

import os
import sys
import logging
import argparse
import calendar
import datetime
import subprocess


def run_command_compat(cmd):
    """
    Runs a shell command and returns the subprocess.CompletedProcess object.

    On Python 3.7+:
        Uses capture_output=True and text=True
    On older Python:
        Uses stdout=PIPE, stderr=PIPE, universal_newlines=True
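
    Example (illustrative sketch; any argv-style command list works):

        result = run_command_compat(["sinfo", "--version"])
        print(result.stdout)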
""" | |
if sys.version_info >= (3, 7): | |
# Python 3.7+ supports capture_output and text | |
result = subprocess.run( | |
cmd, | |
capture_output=True, | |
text=True, | |
check=True | |
) | |
else: | |
# Fallback for older Python (<3.7) | |
result = subprocess.run( | |
cmd, | |
stdout=subprocess.PIPE, | |
stderr=subprocess.PIPE, | |
universal_newlines=True, | |
check=True | |
) | |
return result | |


def parse_elapsed_to_hours(elapsed_str):
    """
    Parse Slurm's Elapsed format (d-hh:mm:ss or hh:mm:ss) into total hours (float).

    Examples:
        2-01:15:30 => 2 days, 1 hour, 15 min, 30 sec
        03:45:10   => 0 days, 3 hours, 45 min, 10 sec
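
    Doctests (illustrative values, chosen so the floats are exact):

    >>> parse_elapsed_to_hours("01:30:00")
    1.5
    >>> parse_elapsed_to_hours("1-00:00:00")
    24.0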
""" | |
if "-" in elapsed_str: | |
days_part, hms_part = elapsed_str.split("-") | |
days = int(days_part) | |
else: | |
days = 0 | |
hms_part = elapsed_str | |
h, m, s = hms_part.split(":") | |
hours = int(h) + int(m) / 60 + int(s) / 3600 | |
hours += days * 24 | |
return hours | |


def parse_gpu_alloc(alloc_tres_str):
    """
    Extract the GPU count from the AllocTRES field, e.g.:
        'billing=28,cpu=28,gres/gpu=1,node=1'
    The GPU count is the value after 'gres/gpu='.

    Returns an integer number of GPUs if found, else 0.
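
    Doctests (illustrative):

    >>> parse_gpu_alloc("billing=28,cpu=28,gres/gpu=2,node=1")
    2
    >>> parse_gpu_alloc("cpu=4,mem=16G,node=1")
    0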
""" | |
# Example: "billing=28,cpu=28,gres/gpu=1,node=1" | |
# We look for 'gres/gpu=' substring | |
gpu_count = 0 | |
entries = alloc_tres_str.split(",") | |
for entry in entries: | |
if entry.startswith("gres/gpu="): | |
# entry is something like 'gres/gpu=1' | |
# take everything after '=' | |
gpu_str = entry.split("=")[1] | |
gpu_count = int(gpu_str) | |
break | |
return gpu_count | |


def get_partition_gpus(partition=None):
    """
    Returns the total number of GPUs in all nodes of the given Slurm partition.

    Example:
        total_gpus = get_partition_gpus("gpu-H100")
        print("Partition gpu-H100 has", total_gpus, "GPUs")

    Notes:
    - If the partition has multiple nodes, we sum the GPUs across them.
    - If a node has multiple GPU types in its GRES, we also sum them all.
    """
    sinfo_cmd = [
        "sinfo",
        "-N",
        "-o", "%N|%G",
        "--noheader",
    ]
    if partition is not None:
        sinfo_cmd += ["--partition", partition]
    logging.debug("Command run: %s", " ".join(sinfo_cmd))
    try:
        result = run_command_compat(sinfo_cmd)
    except subprocess.CalledProcessError as e:
        # If sinfo fails, raise or handle as needed
        raise RuntimeError(f"sinfo failed:\n{e.stderr}")

    total_gpus = 0
    lines = result.stdout.strip().split("\n")
    for line in lines:
        # Each line looks like: "node01|gpu:8" or "node01|gpu:h100:8"
        if not line.strip():
            continue
        node_name, gres_str = line.split("|", 1)
        # gres_str can have multiple comma-separated items, e.g. "gpu:8,shmem:32G"
        # or "gpu:a100:8,gpu:v100:2" on some clusters.
        for gres_item in gres_str.split(","):
            gres_item = gres_item.strip()
            if not gres_item:
                continue
            # The last chunk might look like "gpu:a100:8(S:0-1)";
            # remove everything after an open parenthesis, if present.
            gres_item = gres_item.split("(")[0]
            # Typical GPU entries look like "gpu:8" or "gpu:a100:8".
            # Split by ':'; the last field is usually the integer count.
            parts = gres_item.split(":")
            if parts[0] == "gpu":
                # The last piece in 'gpu:a100:8' is '8'
                try:
                    total_gpus += int(parts[-1])
                except ValueError:
                    # If parsing fails, ignore the entry
                    pass
    return total_gpus


def parse_args():
    """Parse command line arguments.

    Returns
    -------
    namespace
        The namespace populated with the command line argument values.
    """
    parser = argparse.ArgumentParser(
        description="Generate a monthly GPU usage report for Slurm."
    )
    parser.add_argument(
        "year",
        type=int,
        help="Year in YYYY format.",
    )
    parser.add_argument(
        "month",
        type=int,
        help="Month in MM format.",
    )
    parser.add_argument(
        "--partition",
        "-p",
        default=None,
        help="Name of the partition.",
    )
    parser.add_argument(
        "--n_gpus", "--gpus",
        "-g",
        type=int,
        default=None,
        help="Total number of GPUs (skips the sinfo-based detection).",
    )
    parser.add_argument(
        "--write",
        "-w",
        default=None,
        help="File to save the `sacct` command output to.",
    )
    parser.add_argument(
        "--read",
        "-r",
        default=None,
        help="File to read the `sacct` command output from.",
    )
    levels = ("DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL")
    parser.add_argument("--log-level", "-l", default="INFO", choices=levels)
    return parser.parse_args()


def main():
    """Start the program."""
    args = parse_args()
    logging.basicConfig(level=args.log_level)
    year = args.year
    month = args.month
    partition = args.partition
    n_gpus = args.n_gpus

    # Compute the first and last day of that month
    start_date = datetime.datetime(year, month, 1, 0, 0, 0)
    days_in_month = calendar.monthrange(year, month)[1]
    end_date = datetime.datetime(year, month, days_in_month, 23, 59, 59)

    # Construct time strings for sacct
    start_str = start_date.strftime("%Y-%m-%dT%H:%M:%S")
    end_str = end_date.strftime("%Y-%m-%dT%H:%M:%S")

    if args.read is not None:
        with open(args.read, "r", encoding="utf-8") as fhandle:
            lines = fhandle.read().splitlines()
    else:
        # Prepare the sacct command:
        #   --parsable2 gives us a '|' delimiter
        #   --format lists User,JobID,JobName,Start,End,Elapsed,AllocTRES
        sacct_cmd = [
            "sacct",
            "-X",
            "--parsable2",
            "--starttime",
            start_str,
            "--endtime",
            end_str,
            "--format=User,JobID,JobName,Start,End,Elapsed,AllocTRES",
            "--allusers",
        ]
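        # A typical data line then looks like this (field values are
        # hypothetical):
        #   user1|12345|train|2025-02-01T10:00:00|2025-02-01T12:00:00|02:00:00|billing=28,cpu=28,gres/gpu=1,node=1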
        if partition is not None:
            sacct_cmd += ["--partition", partition]
        logging.debug("Command run: %s", " ".join(sacct_cmd))
        # Call sacct and capture its output
        try:
            result = run_command_compat(sacct_cmd)
        except subprocess.CalledProcessError as e:
            print("Error running sacct:\n", e.stderr)
            sys.exit(1)
        lines = result.stdout.strip().split("\n")

    # Save the sacct output to a file, if requested
    if args.write is not None:
        output_dir = os.path.dirname(args.write)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(args.write, "w", encoding="utf-8") as fhandle:
            fhandle.write("\n".join(lines) + "\n")

    if n_gpus is None:
        n_gpus = get_partition_gpus(partition)

    # The columns are: User | JobID | JobName | Start | End | Elapsed | AllocTRES;
    # the first line is the header and the subsequent lines contain the data.
    # Track GPU-hour usage per user
    usage_by_user = {}
    for line in lines[1:]:  # skip header
        parts = line.split("|")
        if len(parts) < 7:
            continue
        user = parts[0]
        # jobid = parts[1]
        # jobname = parts[2]
        start_time_str = parts[3]
        end_time_str = parts[4]
        elapsed_str = parts[5]
        alloc_tres_str = parts[6]
        nones = ["None", "Unknown"]
        if start_time_str in nones or end_time_str in nones:
            # Some jobs might not have a start/end time (e.g. they never
            # ran), so fall back to the Elapsed field.
            hours = parse_elapsed_to_hours(elapsed_str)
            # Find how many GPUs were used
            gpu_count = parse_gpu_alloc(alloc_tres_str)
            # Compute total GPU-hours for this job
            gpu_hours = hours * gpu_count
            usage_by_user[user] = usage_by_user.get(user, 0.0) + gpu_hours
        else:
            job_start = datetime.datetime.strptime(
                start_time_str, "%Y-%m-%dT%H:%M:%S"
            )
            job_end = datetime.datetime.strptime(
                end_time_str, "%Y-%m-%dT%H:%M:%S"
            )
            # Compute the overlap with the queried month range
            actual_start = max(job_start, start_date)
            actual_end = min(job_end, end_date)
            if actual_end <= actual_start:
                # No overlap with the month
                continue
            # Overlap in hours
            overlap_hours = (actual_end - actual_start).total_seconds() / 3600.0
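            # For example, a job running from 2025-01-31T22:00:00 to
            # 2025-02-01T02:00:00 contributes only the 2 hours that fall
            # inside February.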
            # Parse the GPU count as before
            gpu_count = parse_gpu_alloc(alloc_tres_str)
            # Partial GPU-hours for just the overlapping portion
            partial_gpu_hours = overlap_hours * gpu_count
            # Accumulate
            usage_by_user[user] = usage_by_user.get(user, 0.0) + partial_gpu_hours

    # Total available GPU-hours in the month, given the detected or
    # user-supplied number of GPUs
    total_gpu_hours_month = days_in_month * 24.0 * n_gpus
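    # Worked example (matches the sample output in the module docstring):
    # 28 days * 24 hours * 8 GPUs = 5376.00 available GPU-hours in 2025-02.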

    # Total GPU-hours used across all users
    overall_used_gpu_hours = sum(usage_by_user.values())

    # Summarize
    print(f"GPU Usage Report for {year}-{month:02d}")
    print(f"  Total GPUs in node: {n_gpus}")
    print(f"  Days in month: {days_in_month}")
    print((
        f"  Total available GPU-hours in {year}-{month:02d}: "
        f"{total_gpu_hours_month:.2f}"
    ))
    print("")

    # Print usage by user, highest first
    print("Per-User GPU-Hour Usage:")
    for user, gpu_hours in sorted(
        usage_by_user.items(), key=lambda x: x[1], reverse=True
    ):
        gpu_hours_pct = gpu_hours / overall_used_gpu_hours * 100.0
        print(
            f"  {user:20s}: {gpu_hours:10.2f} ({gpu_hours_pct:.2f}%) GPU-hours"
        )

    # Overall usage fraction
    usage_pct = 0.0
    if total_gpu_hours_month > 0:
        usage_pct = overall_used_gpu_hours / total_gpu_hours_month * 100.0
    print("")
    print(f"Overall GPU-Hours Used: {overall_used_gpu_hours:.2f}")
    print(f"Percentage of total: {usage_pct:.2f}%")


if __name__ == "__main__":
    main()
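
# Optional: the doctest examples in the docstrings above can be run with
#   python3 -m doctest gpu_monthly_usage_slurm.py -v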