@@ -37,19 +37,15 @@ Every 300 seconds this information will be saved to a file named using the
 Slurm array job and task IDs as discussed in [the Slurm
 section](#parametrising-job-arrays)
 
-This job is sent to the background and stopped after the `$command` has run.
+This job is sent to the background and stopped after the `$COMMAND` has run.
 
 ```bash
-...
-
 nvidia-smi dmon -o TD -s puct -d 300 > "dmon-${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}".txt &
-gpu_watch_pid=$!
-
-$command
+GPU_WATCH_PID=$!
 
-kill $gpu_watch_pid
+$COMMAND
 
-...
+kill $GPU_WATCH_PID
 ```
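+
+To get a quick summary from the resulting log after the job finishes, the
+utilisation columns can be averaged with a one-liner along these lines (a
+sketch: the exact column index depends on the metrics selected with `-s`, so
+check the `#` header lines of the file first):
+
+```bash
+# Average the SM utilisation column of the dmon log, skipping '#' header lines
+# (treating SM utilisation as column 7 is an assumption for this invocation)
+awk '!/^#/ { sum += $7; n++ } END { if (n) print sum / n }' "dmon-${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}.txt"
+```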
 
 ## Slurm
@@ -105,7 +101,7 @@ program.
 
 ```bash
 date --iso-8601=seconds --utc
-$command
+$COMMAND
 date --iso-8601=seconds --utc
 ```
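+
+If a single elapsed-time figure is more convenient than two timestamps, the
+wall time can also be captured in seconds (a sketch, not part of the original
+recipe):
+
+```bash
+START=$(date +%s)  # seconds since the epoch, recorded before the run
+$COMMAND
+echo "Runtime: $(( $(date +%s) - START )) s"  # elapsed wall time in seconds
+```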
 
@@ -213,7 +209,7 @@ INPUT_DIR=$(basename $INPUT_DATA)
 OUTPUT_DIR=/path/to/output/dir
 
 # Create a directory on scratch disk for this job
-JOB_SCRATCH_PATH= $HOST_SCRATCH_PATH/${SLURM_JOB_NAME}_${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}
+JOB_SCRATCH_PATH=$HOST_SCRATCH_PATH/${SLURM_JOB_NAME}_${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}
 mkdir -p $JOB_SCRATCH_PATH
 
 # Copy input data to scratch directory
@@ -225,7 +221,7 @@ mkdir -p $JOB_SCRATCH_PATH/output
 # Run the application
 singularity run --bind $JOB_SCRATCH_PATH:/scratch_mount --nv my_container.sif --input /scratch_mount/$INPUT_DIR --output /scratch_mount/output/
 
-# Copy output
+# Copy output from scratch
 cp -r $JOB_SCRATCH_PATH/output $OUTPUT_DIR/output_${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}
 
 # Clean up
@@ -239,3 +235,89 @@ namespace so there is no possibility of file or directory names clashing
 between different jobs.
 
 ### Template
+
+Collecting the above tips, here is a template batch script that can be adapted
+to run these (or other) calculations on clusters with the Slurm scheduler.
+
+```bash
+#!/bin/bash
+
+##########
+# Slurm parameters
+##########
+
+# set the number of nodes
+#SBATCH --nodes=...
+
+# set max wallclock time
+#SBATCH --time=...
+
+# set name of job
+#SBATCH --job-name=...
+
+# set number of GPUs
+#SBATCH --gres=gpu:...
+
+##########
+# Job parameters
+##########
+
+# Path to scratch disk on host
+HOST_SCRATCH_PATH=...
+
+# Path to input data on host
+INPUT_DATA=...
+
+# Get name of input data directory
+INPUT_DIR=$(basename $INPUT_DATA)
+
+# Path to place output data on host
+OUTPUT_DIR=...
+
+##########
+# Prepare data and directories in scratch space
+##########
+
+# Create a directory on scratch disk for this job
+JOB_SCRATCH_PATH=$HOST_SCRATCH_PATH/${SLURM_JOB_NAME}_${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}
+mkdir -p $JOB_SCRATCH_PATH
+
+# Copy input data to scratch directory
+cp -r $INPUT_DATA $JOB_SCRATCH_PATH
+
+# Make output data directory
+mkdir -p $JOB_SCRATCH_PATH/output
+
+# Define command to run, now that JOB_SCRATCH_PATH is set
+COMMAND="singularity exec --nv --bind $JOB_SCRATCH_PATH:/scratch_mount ..."
+
+##########
+# Monitor and run the job
+##########
+
+# load modules (will be system dependent, may not be necessary)
+module purge
+module load singularity
+
+# Monitor GPU usage
+nvidia-smi dmon -o TD -s puct -d 300 > "dmon-${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}".txt &
+GPU_WATCH_PID=$!
+
+# Run command
+date --iso-8601=seconds --utc
+$COMMAND
+date --iso-8601=seconds --utc
+
+##########
+# Post job clean up
+##########
+
+# Stop nvidia-smi dmon process
+kill $GPU_WATCH_PID
+
+# Copy output from scratch
+cp -r $JOB_SCRATCH_PATH/output $OUTPUT_DIR/output_${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}
+
+# Clean up
+rm -rf $JOB_SCRATCH_PATH
+```
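+
+As a usage sketch (the array range and script name are illustrative, not
+prescribed by the template), the script would be submitted as a job array
+with:
+
+```bash
+# Submit ten array tasks, indexed 0-9, so the SLURM_ARRAY_* variables are set
+sbatch --array=0-9 template.sh
+```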