---
# Multi-node ResNet-50 training benchmark (tf_cnn_benchmarks with Horovod),
# run inside the NVIDIA NGC container nvcr.io/nvidia/tensorflow:19.06-py3.
#
# Reference test bed:
#   2 C4140 compute nodes
#     - 8 V100 GPUs
#     - ConnectX-5
#     - IPoIB EDR InfiniBand in Ethernet mode
#
# The job below requests 2 replicas with a limit of 4 GPUs each.
apiVersion: kubeflow.org/v1alpha1
kind: MPIJob
metadata:
  name: tensorflow-benchmarks-resnet50
spec:
  replicas: 2
  template:
    spec:
      containers:
        - image: nvcr.io/nvidia/tensorflow:19.06-py3
          name: tensorflow-benchmarks
          volumeMounts:
            # Backed by host /work (see volumes); the benchmark script in
            # `command` is loaded from this mount.
            - mountPath: /foo
              name: work-volume
            # Backed by host /dev/shm (see volumes); --data_dir in `command`
            # points into this mount.
            - mountPath: /data
              name: mem-volume
          resources:
            limits:
              nvidia.com/gpu: 4
          command:
            # MPI launcher and its options.
            - mpirun
            - --allow-run-as-root
            - --map-by
            - numa
            # Benchmark script (resolved through the /foo mount) and its flags.
            - python
            - /foo/tensorflow/benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py
            - --batch_size=512
            - --model=resnet50
            - --variable_update=horovod
            - --optimizer=momentum
            - --nodistortions
            - --gradient_repacking=8
            - --weight_decay=1e-4
            - --use_fp16=true
            # Resolves to /dev/shm/tensorflow/ on the host via the /data mount.
            - --data_dir=/data/tensorflow/
            - --data_name=imagenet
      volumes:
        - name: work-volume
          hostPath:
            # Directory locally mounted on the host. `type: Directory` means
            # the path must already exist on the node.
            path: /work
            type: Directory
        - name: mem-volume
          hostPath:
            # /dev/shm directory on the host; must already exist on the node.
            path: /dev/shm
            type: Directory