#!/bin/bash
#SBATCH --job-name=pytorch-nccl-test
#SBATCH --partition=
#SBATCH --account=
#SBATCH --qos=
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:H100:4
#SBATCH --time=0:05:00
#SBATCH --output=%x-%j.out
#SBATCH --exclude=
# SLURM PyTorch NCCL Multi-Node Test Script:
# A SLURM batch script that tests PyTorch's NCCL functionality across multiple GPU nodes.
# The script sets up a distributed PyTorch environment using torchrun and runs a comprehensive
# test that verifies NCCL initialization, inter-process communication barriers, and proper cleanup.
# Includes diagnostic output for troubleshooting multi-node GPU communication issues in HPC environments.
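#
# Fill in --partition/--account/--qos (and --exclude, if needed) above for your cluster,
# then submit with sbatch; the file name here is just an example:
#   sbatch pytorch-nccl-test.sbatch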
GPUS_PER_NODE=4
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
MASTER_PORT=6000
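# MASTER_ADDR is the first node in the allocation; MASTER_PORT only needs to be a port
# that is free on that node (6000 is an arbitrary choice).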
# Print job information
echo "=== SLURM Job Information ==="
echo "SLURM_JOB_NAME: ${SLURM_JOB_NAME}"
echo "SLURM_JOB_ID: ${SLURM_JOB_ID}"
echo "SLURM_JOB_PARTITION: ${SLURM_JOB_PARTITION}"
echo "SLURM_JOB_ACCOUNT: ${SLURM_JOB_ACCOUNT}"
echo "SLURM_JOB_NUM_NODES: ${SLURM_JOB_NUM_NODES}"
echo "SLURM_JOB_NODELIST: ${SLURM_JOB_NODELIST}"
echo "SLURM_NODEID: ${SLURM_NODEID}"
echo "============================="
# activate environment
eval "$(pixi shell-hook -e cuda-sglang)"
export LAUNCHER="torchrun \
--nproc_per_node $GPUS_PER_NODE \
--nnodes $SLURM_JOB_NUM_NODES \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT \
"
export SCRIPT=pytorch-nccl-test.py
cat << EOT > $SCRIPT
import torch.distributed as dist
import torch
import socket
import os
import fcntl
import time
SUCCESS = 0

def printflock(*msgs):
    """Print serialized across processes via an exclusive flock on this file."""
    with open(__file__, "r") as fh:
        fcntl.flock(fh, fcntl.LOCK_EX)
        try:
            print(*msgs)
        finally:
            fcntl.flock(fh, fcntl.LOCK_UN)

local_rank = int(os.environ["LOCAL_RANK"])
header = f"{socket.gethostname()}-{local_rank}"

if local_rank == 0:
    printflock(f"{header}: torch.__version__: {torch.__version__}")
    printflock(f"{header}: torch.version.cuda: {torch.version.cuda}")
    printflock(f"{header}: torch.cuda.is_available(): {torch.cuda.is_available()}")
    printflock(f"{header}: torch.cuda.nccl.version(): {torch.cuda.nccl.version()}")

printflock(f'{header}: running dist.init_process_group("nccl", device_id=torch.device(f"cuda:{local_rank}")) ...')
torch.cuda.set_device(local_rank)
dist.init_process_group("nccl", device_id=torch.device(f"cuda:{local_rank}"))
printflock(f'{header}: dist.init_process_group("nccl", device_id=torch.device(f"cuda:{local_rank}")) SUCCESS')

try:
    printflock(f"{header}: Trying dist.barrier()")
    dist.barrier()
    printflock(f"{header}: NCCL {torch.cuda.nccl.version()} OK")
    SUCCESS = 1
except Exception as e:
    printflock(f"{header}: NCCL {torch.cuda.nccl.version()} ERROR: {e}")
    raise
finally:
    # Properly destroy the process group to avoid resource leaks
    if dist.is_initialized():
        printflock(f"{header}: Destroying process group...")
        dist.destroy_process_group()
        printflock(f"{header}: Process group destroyed successfully")
    time.sleep(1)

printflock(f"{header}: NCCL TEST SUCCESS: {bool(SUCCESS)}")
EOT
export NCCL_DEBUG=INFO  # print NCCL version, transport and topology info to the job log
# Optional troubleshooting knobs, uncomment as needed:
# export NCCL_SOCKET_IFNAME=ib0   # pin NCCL's bootstrap/socket traffic to a specific network interface
# export NCCL_NET_GDR_LEVEL=2     # limit the GPU<->NIC topology distance at which GPUDirect RDMA is used
# export NCCL_NET_GDR_READ=0      # disable GPUDirect RDMA on the send (read) path
# export NCCL_P2P_DISABLE=1       # disable direct GPU peer-to-peer transport (NVLink/PCIe)
# export NCCL_IB_DISABLE=1        # disable the InfiniBand/RoCE transport and fall back to TCP sockets
echo "============================="
echo "Software versions:"
srun --jobid $SLURM_JOBID bash -c 'echo "$(hostname): nvidia-smi: $(nvidia-smi)"'
srun --jobid $SLURM_JOBID bash -c 'echo "$(hostname): nvidia driver version: $(nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits)"'
srun --jobid $SLURM_JOBID bash -c 'echo "$(hostname): nvcc version: $(nvcc --version)"'
srun --jobid $SLURM_JOBID bash -c 'echo "$(hostname): ibstat: $(ibstat)"'
srun --jobid $SLURM_JOBID bash -c 'echo "$(hostname): ibdev2netdev: $(ibdev2netdev)"'
srun --jobid $SLURM_JOBID bash -c 'echo "$(hostname): ofed_info -s: $(ofed_info -s)"'
srun --jobid $SLURM_JOBID bash -c 'echo "$(hostname): lsmod | grep nvidia_peermem: $(lsmod | grep nvidia_peermem)"'
echo "============================="
echo "NCCL ENV VARS:"
echo "NCCL_DEBUG: $NCCL_DEBUG"
echo "NCCL_SOCKET_IFNAME: $NCCL_SOCKET_IFNAME"
echo "NCCL_NET_GDR_LEVEL: $NCCL_NET_GDR_LEVEL"
echo "NCCL_P2P_DISABLE: $NCCL_P2P_DISABLE"
echo "NCCL_IB_DISABLE: $NCCL_IB_DISABLE"
echo "============================="
echo "Running NCCL test:"
echo $LAUNCHER --node_rank $SLURM_NODEID $SCRIPT
srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_NODEID $SCRIPT'
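# On success, each of the 8 ranks (2 nodes x 4 GPUs) should end with a line like
# "<hostname>-<local_rank>: NCCL TEST SUCCESS: True" in the %x-%j.out log.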