Преглед на файлове

Merge pull request #2 from j0hnL/master

adding k8s and slurm submission examples
John Lockman преди 5 години
родител
ревизия
85887da242
променени са 2 файла, в които са добавени 77 реда и са изтрити 0 реда
  1. 54 0
      examples/k8s-TensorFlow-resnet50-multinode-MPIOperator.yaml
  2. 23 0
      examples/slurm-TensorFlow-resnet50-multinode-MPI.batch

+ 54 - 0
examples/k8s-TensorFlow-resnet50-multinode-MPIOperator.yaml

@@ -0,0 +1,54 @@
+# Multi-node ResNet-50 training benchmark, submitted as a Kubeflow MPIJob and
+# run in the NVIDIA NGC container nvcr.io/nvidia/tensorflow:19.06-py3.
+#
+# Hardware: 2 C4140 compute nodes
+#  - 8 V100 GPUs
+#  - ConnectX-5
+#  - IPoIB EDR Infiniband in Ethernet mode
+apiVersion: kubeflow.org/v1alpha1
+kind: MPIJob
+metadata:
+  name: tensorflow-benchmarks-resnet50
+spec:
+  replicas: 2
+  template:
+    spec:
+      containers:
+        - image: nvcr.io/nvidia/tensorflow:19.06-py3
+          name: tensorflow-benchmarks
+          volumeMounts:
+            - mountPath: /foo  # backed by host /work (work-volume below)
+              name: work-volume
+            - mountPath: /data  # backed by host /dev/shm (mem-volume below)
+              name: mem-volume
+          resources:
+            limits:
+              nvidia.com/gpu: 4  # per replica: 2 replicas x 4 = 8 GPUs
+          command:
+            - mpirun
+            - --allow-run-as-root
+            - --map-by
+            - numa
+            - python
+            - /foo/tensorflow/benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py
+            - --batch_size=512
+            - --model=resnet50
+            - --variable_update=horovod
+            - --optimizer=momentum
+            - --nodistortions
+            - --gradient_repacking=8
+            - --weight_decay=1e-4
+            - --use_fp16=true
+            - --data_dir=/data/tensorflow/
+            - --data_name=imagenet
+      volumes:
+        - name: work-volume
+          hostPath:
+            # host directory that must contain tensorflow/benchmarks; seen as /foo
+            path: /work
+            type: Directory
+        - name: mem-volume
+          hostPath:
+            # host /dev/shm; seen as /data, so --data_dir reads /dev/shm/tensorflow/
+            path: /dev/shm
+            type: Directory

+ 23 - 0
examples/slurm-TensorFlow-resnet50-multinode-MPI.batch

@@ -0,0 +1,23 @@
+#!/bin/bash
+# Slurm batch job: multi-node TensorFlow ResNet-50 benchmark launched with mpirun.
+#SBATCH -n 2
+#SBATCH -N 2
+#SBATCH -J TF-resnet50
+#SBATCH -o %J-tf-resnet50.txt
+#SBATCH -t 00:30:00
+
+# Each continued line must end in a bare backslash; a trailing space after it ends the command.
+mpirun  \
+      --map-by numa  \
+      python  \
+      /foo/tensorflow/benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py  \
+      --batch_size=512  \
+      --model=resnet50  \
+      --variable_update=horovod  \
+      --optimizer=momentum  \
+      --nodistortions  \
+      --gradient_repacking=8  \
+      --weight_decay=1e-4  \
+      --use_fp16=true  \
+      --data_dir=/data/tensorflow/  \
+      --data_name=imagenet