#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the GNU General Public License version 3.

#SBATCH --job-name=Nano-2d-trainer-20b-8nodes
#SBATCH --ntasks=2
#SBATCH --nodes=2
#SBATCH --gpus-per-task=4
#SBATCH --partition=train

# Resolve the allocated node list and use the first node as the rendezvous host.
nodes_array=( $(scontrol show hostnames $SLURM_JOB_NODELIST) )
head_node=${nodes_array[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)

# Enable for A100
export FI_PROVIDER="efa"

echo "Node IP: $head_node_ip"
export LOGLEVEL=INFO

# debugging flags (optional)
export NCCL_DEBUG=WARN
export NCCL_DEBUG_SUBSYS=WARN
export PYTHONFAULTHANDLER=1

export LD_LIBRARY_PATH=/opt/amazon/efa/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH
export CUDA_LAUNCH_BLOCKING=0

# on your cluster you might need these:
# set the network interface
export NCCL_SOCKET_IFNAME="ens"
export FI_EFA_USE_DEVICE_RDMA=1

# srun starts one task per node (--ntasks=2 across --nodes=2); each torchrun spawns
# 4 workers, and all nodes rendezvous via the c10d backend at the head node on port 29500.
srun torchrun --nproc_per_node 4 --rdzv_id $RANDOM --rdzv_backend c10d --rdzv_endpoint $head_node_ip:29500 examples/finetuning.py --enable_fsdp --use_peft --peft_method lora
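
# A minimal submission sketch (assumptions not in the original script: the file is
# saved as multi_node.slurm and submitted from the repository root so that
# examples/finetuning.py resolves):
#
#   sbatch multi_node.slurm
#
# sbatch writes stdout/stderr to slurm-<jobid>.out by default, so a running job
# can be followed with:
#
#   tail -f slurm-<jobid>.out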