#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the GNU General Public License version 3.
#
# Slurm batch script: runs ./finetuning.py (FSDP + LoRA PEFT flags) through
# torchrun on every allocated node, using the first node of the allocation
# as the c10d rendezvous host. Submit it with sbatch.
#
# The interpreter line must be the very first line of the file, and all
# #SBATCH directives must appear before the first executable command.

# NOTE(review): the job name says "8nodes" but only 2 nodes are requested
# below -- confirm which one is intended.
#SBATCH --job-name=Nano-2d-trainer-20b-8nodes
#SBATCH --ntasks=2
#SBATCH --nodes=2
#SBATCH --gpus-per-task=4
#SBATCH --partition=train

set -euo pipefail

# Fail early with a clear message when run outside a Slurm allocation.
: "${SLURM_JOB_NODELIST:?must be run inside a Slurm job (submit with sbatch)}"

# torchrun worker processes started on each node; keep this in sync with
# --gpus-per-task above.
readonly procs_per_node=4
# TCP port of the rendezvous endpoint on the head node.
readonly rdzv_port=29500

# Expand the compact Slurm node list into one hostname per array element.
mapfile -t nodes < <(scontrol show hostnames "$SLURM_JOB_NODELIST")
if (( ${#nodes[@]} == 0 )); then
  echo "error: no hostnames resolved from SLURM_JOB_NODELIST" >&2
  exit 1
fi

num_nodes=${#nodes[@]}
head_node=${nodes[0]}

# Ask the head node itself for its address.
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
# hostname may print several space-separated addresses; keep the first so the
# rendezvous endpoint stays a single host:port pair.
head_node_ip=${head_node_ip%% *}
if [[ -z "$head_node_ip" ]]; then
  echo "error: could not determine the IP address of $head_node" >&2
  exit 1
fi

# Enable for A100
export FI_PROVIDER="efa"

echo "Node IP: $head_node_ip"
export LOGLEVEL=INFO

# debugging flags (optional)
export NCCL_DEBUG=WARN
# NOTE(review): "WARN" does not look like an NCCL subsystem name (the NCCL docs
# list values such as INIT, COLL, NET, ALL) -- kept as-is, confirm the intent.
export NCCL_DEBUG_SUBSYS=WARN
export PYTHONFAULTHANDLER=1

# Prepend the EFA and local library directories. The ${VAR:+...} form avoids a
# trailing ':' (which the dynamic loader treats as the current directory) when
# LD_LIBRARY_PATH is unset or empty.
export LD_LIBRARY_PATH="/opt/amazon/efa/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export LD_LIBRARY_PATH="/usr/local/lib/:$LD_LIBRARY_PATH"
export CUDA_LAUNCH_BLOCKING=0

# on your cluster you might need these:
# set the network interface
export NCCL_SOCKET_IFNAME="ens"
export FI_EFA_USE_DEVICE_RDMA=1

# $RANDOM is expanded once, here in the batch shell, so every srun task
# receives the same rendezvous id. --nnodes is passed explicitly because
# torchrun otherwise assumes a single node.
srun torchrun \
  --nnodes "$num_nodes" \
  --nproc_per_node "$procs_per_node" \
  --rdzv_id "$RANDOM" \
  --rdzv_backend c10d \
  --rdzv_endpoint "$head_node_ip:$rdzv_port" \
  ./finetuning.py --enable_fsdp --use_peft --peft_method lora