|
@@ -1,54 +0,0 @@
|
|
|
-# Run multi-node training benchmark w/ Nvidia NGC Container: nvcr.io/nvidia/tensorflow:19.06-py3
|
|
|
-#
|
|
|
-# 2 C4140 compute nodes
|
|
|
-# - 8 V100 GPUs
|
|
|
-# - ConnectX-5
|
|
|
-# - IPoIB EDR Infiniband in Ethernet mode
|
|
|
-#
|
|
|
-apiVersion: kubeflow.org/v1alpha1
|
|
|
-kind: MPIJob
|
|
|
-metadata:
|
|
|
- name: tensorflow-benchmarks-resnet50
|
|
|
-spec:
|
|
|
- replicas: 2
|
|
|
- template:
|
|
|
- spec:
|
|
|
- containers:
|
|
|
- - image: nvcr.io/nvidia/tensorflow:19.06-py3
|
|
|
- name: tensorflow-benchmarks
|
|
|
- volumeMounts:
|
|
|
- - mountPath: /foo
|
|
|
- name: work-volume
|
|
|
- - mountPath: /data
|
|
|
- name: mem-volume
|
|
|
- resources:
|
|
|
- limits:
|
|
|
- nvidia.com/gpu: 4
|
|
|
- command:
|
|
|
- - mpirun
|
|
|
- - --allow-run-as-root
|
|
|
- - --map-by
|
|
|
- - numa
|
|
|
- - python
|
|
|
- - /foo/tensorflow/benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py
|
|
|
- - --batch_size=512
|
|
|
- - --model=resnet50
|
|
|
- - --variable_update=horovod
|
|
|
- - --optimizer=momentum
|
|
|
- - --nodistortions
|
|
|
- - --gradient_repacking=8
|
|
|
- - --weight_decay=1e-4
|
|
|
- - --use_fp16=true
|
|
|
- - --data_dir=/data/tensorflow/
|
|
|
- - --data_name=imagenet
|
|
|
- volumes:
|
|
|
- - name: work-volume
|
|
|
- hostPath:
|
|
|
- # directory locally mounted on host
|
|
|
- path: /work
|
|
|
- type: Directory
|
|
|
- - name: mem-volume
|
|
|
- hostPath:
|
|
|
- # dev shm directory on host
|
|
|
- path: /dev/shm
|
|
|
- type: Directory
|