#!/bin/bash
#SBATCH --job-name=pytorch-nccl-test
#SBATCH --partition=
#SBATCH --account=
#SBATCH --qos=
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:H100:4
#SBATCH --time=0:05:00
#SBATCH --output=%x-%j.out
#SBATCH --exclude=
# SLURM PyTorch NCCL Multi-Node Test Script:
# A SLURM batch script that tests PyTorch's NCCL functionality across multiple GPU nodes.
# The script sets up a distributed PyTorch environment using torchrun and runs a comprehensive
# test that verifies NCCL initialization, inter-process communication barriers, and proper cleanup.
# Includes diagnostic output for troubleshooting multi-node GPU communication issues in HPC environments.
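# Usage sketch (an assumption: the file is saved as pytorch-nccl-test.slurm; fill in or
# remove the empty --partition/--account/--qos/--exclude fields above before submitting):
#   sbatch pytorch-nccl-test.slurm
#   tail -f pytorch-nccl-test-<jobid>.out    # output file follows the %x-%j.out pattern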
GPUS_PER_NODE=4
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
MASTER_PORT=6000
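# Optional sanity check (a minimal sketch, not part of the original test): confirm the
# rendezvous address resolves from the batch host before torchrun tries to use it.
getent hosts "$MASTER_ADDR" || echo "WARNING: cannot resolve MASTER_ADDR=$MASTER_ADDR"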
# Print job information
echo "=== SLURM Job Information ==="
echo "SLURM_JOB_NAME: ${SLURM_JOB_NAME}"
echo "SLURM_JOB_ID: ${SLURM_JOB_ID}"
echo "SLURM_JOB_PARTITION: ${SLURM_JOB_PARTITION}"
echo "SLURM_JOB_ACCOUNT: ${SLURM_JOB_ACCOUNT}"
echo "SLURM_JOB_NUM_NODES: ${SLURM_JOB_NUM_NODES}"
echo "SLURM_JOB_NODELIST: ${SLURM_JOB_NODELIST}"
echo "SLURM_NODEID: ${SLURM_NODEID}"
echo "============================="
# Activate the pixi environment that provides the CUDA-enabled PyTorch used below
eval "$(pixi shell-hook -e cuda-sglang)"
export LAUNCHER="torchrun \
    --nproc_per_node $GPUS_PER_NODE \
    --nnodes $SLURM_JOB_NUM_NODES \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT \
    "
export SCRIPT=pytorch-nccl-test.py
cat << EOT > $SCRIPT
import torch.distributed as dist
import torch
import socket
import os
import fcntl
import time
SUCCESS = 0

def printflock(*msgs):
    """Print under an exclusive file lock so output from concurrent ranks does not interleave."""
    with open(__file__, "r") as fh:
        fcntl.flock(fh, fcntl.LOCK_EX)
        try:
            print(*msgs)
        finally:
            fcntl.flock(fh, fcntl.LOCK_UN)
local_rank = int(os.environ["LOCAL_RANK"])
header = f"{socket.gethostname()}-{local_rank}"

if local_rank == 0:
    printflock(f"{header}: torch.__version__: {torch.__version__}")
    printflock(f"{header}: torch.version.cuda: {torch.version.cuda}")
    printflock(f"{header}: torch.cuda.is_available(): {torch.cuda.is_available()}")
    printflock(f"{header}: torch.cuda.nccl.version(): {torch.cuda.nccl.version()}")

printflock(f'{header}: running dist.init_process_group("nccl", device_id=torch.device(f"cuda:{local_rank}")) ...')
torch.cuda.set_device(local_rank)
dist.init_process_group("nccl", device_id=torch.device(f"cuda:{local_rank}"))
printflock(f'{header}: dist.init_process_group("nccl", device_id=torch.device(f"cuda:{local_rank}")) SUCCESS')
try:
    printflock(f"{header}: Trying dist.barrier()")
    dist.barrier()
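    # Optional extra check (an illustrative addition, not part of the original test):
    # a one-element all_reduce exercises an actual NCCL collective beyond the barrier.
    t = torch.ones(1, device=torch.device(f"cuda:{local_rank}"))
    dist.all_reduce(t)
    printflock(f"{header}: all_reduce sum: {t.item()} (expected {dist.get_world_size()}.0)")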
printflock(f"{header}: NCCL {torch.cuda.nccl.version()} OK") | |
SUCCESS=1 | |
except Exception as e: | |
printflock(f"{header}: NCCL {torch.cuda.nccl.version()} ERROR: {e}") | |
raise | |
finally: | |
# Properly destroy the process group to avoid resource leaks | |
if dist.is_initialized(): | |
printflock(f"{header}: Destroying process group...") | |
dist.destroy_process_group() | |
printflock(f"{header}: Process group destroyed successfully") | |
time.sleep(1) | |
printflock(f"{header}: NCCL TEST SUCCESS: {bool(SUCCESS)}") | |
EOT | |
export NCCL_DEBUG=INFO            # verbose NCCL logging for troubleshooting
# Optional knobs to narrow down communication problems (uncomment as needed):
# export NCCL_SOCKET_IFNAME=ib0   # pin NCCL to a specific network interface
# export NCCL_NET_GDR_LEVEL=2     # control when GPUDirect RDMA is used
# export NCCL_NET_GDR_READ=0      # disable GPUDirect RDMA for reads
# export NCCL_P2P_DISABLE=1       # disable GPU peer-to-peer transport
# export NCCL_IB_DISABLE=1        # disable the InfiniBand transport (fall back to sockets)
echo "=============================" | |
echo "Software versions:" | |
srun --jobid $SLURM_JOBID bash -c 'echo "$(hostname): nvidia-smi: $(nvidia-smi)"' | |
srun --jobid $SLURM_JOBID bash -c 'echo "$(hostname): nvidia driver version: $(nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits)"' | |
srun --jobid $SLURM_JOBID bash -c 'echo "$(hostname): nvcc version: $(nvcc --version)"' | |
srun --jobid $SLURM_JOBID bash -c 'echo "$(hostname): ibstat: $(ibstat)"' | |
srun --jobid $SLURM_JOBID bash -c 'echo "$(hostname): ibdev2netdev: $(ibdev2netdev)"' | |
srun --jobid $SLURM_JOBID bash -c 'echo "$(hostname): ofed_info -s: $(ofed_info -s)"' | |
srun --jobid $SLURM_JOBID bash -c 'echo "$(hostname): lsmod | grep nvidia_peermem: $(lsmod | grep nvidia_peermem)"' | |
echo "=============================" | |
echo "NCCL ENV VARS:" | |
echo "NCCL_DEBUG: $NCCL_DEBUG" | |
echo "NCCL_SOCKET_IFNAME: $NCCL_SOCKET_IFNAME" | |
echo "NCCL_NET_GDR_LEVEL: $NCCL_NET_GDR_LEVEL" | |
echo "NCCL_P2P_DISABLE: $NCCL_P2P_DISABLE" | |
echo "NCCL_IB_DISABLE: $NCCL_IB_DISABLE" | |
echo "=============================" | |
echo "Running NCCL test:" | |
echo $LAUNCHER --node_rank $SLURM_PROCID $SCRIPT | |
srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_NODEID $SCRIPT' |
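# Expected result sketch: with 2 nodes x 4 GPUs, each of the 8 ranks prints a final
# "<hostname>-<local_rank>: NCCL TEST SUCCESS: True" line, so the job log can be checked with
#   grep -c "NCCL TEST SUCCESS: True" pytorch-nccl-test-<jobid>.out    # expect 8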