#!/bin/bash
#SBATCH --job-name="elm"
#SBATCH --partition=gpu
#SBATCH --mem-per-cpu=16GB      # Amount of CPU memory per core
#SBATCH --nodes=4
#SBATCH --ntasks-per-node=8     # 8 tasks per node, one per GPU
#SBATCH --cpus-per-task=6       # Number of cores per task
#SBATCH --hint=nomultithread    # Request physical cores, not logical ones
#SBATCH --gres=gpu:8            # Number of GPUs per node
#SBATCH --output=%x_%j.out      # Path pattern for Slurm stdout
#SBATCH --error=%x_%j.out       # Path pattern for Slurm stderr
#SBATCH --exclusive             # Turn off node sharing
#SBATCH --comment=elm
module load openmpi
module load cuda/11.4

# Build a DeepSpeed-style hostfile: one "<host> slots=<gpus>" line per allocated node.
mkdir -p /fsx/home-$(whoami)/hostfiles
hostfile=/fsx/home-$(whoami)/hostfiles/hosts_$SLURM_JOBID
rm -f "$hostfile"  # for consecutive calls to this script in interactive jobs
for i in $(scontrol show hostnames "$SLURM_JOB_NODELIST")
do
  echo "$i slots=8" >> "$hostfile"
done
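# The resulting hostfile has one line per node, e.g. (hypothetical hostnames):
#   gpu-st-p4d-24xlarge-1 slots=8
#   gpu-st-p4d-24xlarge-2 slots=8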
export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=12802
export COUNT_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l)
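# MASTER_ADDR/MASTER_PORT form the rendezvous endpoint used by torch.distributed and
# the DeepSpeed launcher; HOSTNAMES and COUNT_NODE are convenience exports, and
# COUNT_NODE feeds --num_nodes in the launch command below.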
# NCCL / EFA / Open MPI settings for AWS multi-node training.
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib
export NCCL_PROTO=simple
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/aws-ofi-nccl/lib
export PATH=$PATH:/opt/amazon/efa/bin:/opt/amazon/openmpi/bin
export FI_EFA_FORK_SAFE=1
export FI_LOG_LEVEL=1
export FI_EFA_USE_DEVICE_RDMA=1  # use for p4d instances
export NCCL_DEBUG=info
export OMPI_MCA_mtl_base_verbose=1
export FI_EFA_ENABLE_SHM_TRANSFER=0
export FI_PROVIDER=efa
export FI_EFA_TX_MIN_CREDITS=64
export NCCL_TREE_THRESHOLD=0
export OMPI_MCA_pml="^cm"
export OMPI_MCA_btl="tcp,self"
export OMPI_MCA_btl_tcp_if_exclude="lo,docker1"
export OMPI_MCA_plm_rsh_no_tree_spawn=1
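# Optional sanity check (not part of the original script; assumes an interactive shell
# on an allocated node): verify libfabric can see the EFA provider before training.
#   fi_info -p efa    # should print one or more "provider: efa" blocks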
export TORCH_EXTENSIONS_DIR=extensions
export XDG_CACHE_HOME=hf_cache
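# Note: both paths above are relative, so they resolve against the directory the job
# was submitted from (Slurm's default working directory for batch scripts).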
#source /fsx/home-honglu/miniconda3/bin/activate
#source /fsx/codeSeCodegen/codeSeEnv/bin/activate
#conda activate training
#apt-get install libopenmpi-dev
pip install -r requirements.txt
pip install mpi4py
conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia -y
#apt-get install -y pdsh
# Launch across all allocated nodes (COUNT_NODE matches --nodes above). run_clm.py is
# the HF causal-LM example, so --deepspeed takes the config path directly.
deepspeed --num_nodes $COUNT_NODE --num_gpus 8 --hostfile "$hostfile" \
  --master_addr $MASTER_ADDR --master_port $MASTER_PORT \
  run_clm.py \
  --model_name_or_path=codegen-350 --per_device_train_batch_size=2 \
  --num_train_epochs 1 --preprocessing_num_workers 25 --save_strategy=epoch \
  --output_dir=finetune_codegen_350_full --report_to "wandb" --dataset_name Full \
  --tokenizer_name codegen-350 --block_size 2048 --gradient_accumulation_steps 2 \
  --do_train --logging_strategy=epoch --fp16 --overwrite_output_dir \
  --adam_beta1=0.9 --adam_beta2=0.95 --weight_decay=2e-02 --learning_rate=1e-05 \
  --warmup_steps=895 --per_device_eval_batch_size=1 --cache_dir="hf_cache" \
  --gradient_checkpointing=True --deepspeed config_multinode.json
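# config_multinode.json is not included in this gist. A minimal sketch, with
# illustrative values only, chosen to match the fp16 and batch flags above:
#   {
#     "train_micro_batch_size_per_gpu": 2,
#     "gradient_accumulation_steps": 2,
#     "fp16": { "enabled": true }
#   }
# train_micro_batch_size_per_gpu must agree with --per_device_train_batch_size.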