---
# Run multi-node training benchmark w/ Nvidia NGC Container:
#   nvcr.io/nvidia/tensorflow:19.06-py3
#
# 2 C4140 compute nodes
#   - 8 V100 GPUs
#   - ConnectX-5
#   - IPoIB EDR Infiniband in Ethernet mode
#
# Layout of this manifest:
#   - spec.replicas (2) x nvidia.com/gpu limit (4) = 8 GPUs requested,
#     which matches the GPU count listed above.
#   - The benchmark script is read from /foo, backed by host directory /work.
#   - The dataset is read from /data, backed by host directory /dev/shm.
apiVersion: kubeflow.org/v1alpha1
kind: MPIJob
metadata:
  name: tensorflow-benchmarks-resnet50
spec:
  replicas: 2
  template:
    spec:
      containers:
        - image: nvcr.io/nvidia/tensorflow:19.06-py3
          name: tensorflow-benchmarks
          volumeMounts:
            # Holds the tf_cnn_benchmarks checkout referenced in `command`.
            - mountPath: /foo
              name: work-volume
            # Holds the dataset referenced by --data_dir in `command`.
            - mountPath: /data
              name: mem-volume
          resources:
            limits:
              # Per-container GPU limit.
              nvidia.com/gpu: 4
          # Each list item is passed as one argv entry; items starting with
          # `--` are plain YAML strings, so no quoting is required.
          command:
            - mpirun
            - --allow-run-as-root
            - --map-by
            - numa
            - python
            - /foo/tensorflow/benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py
            - --batch_size=512
            - --model=resnet50
            - --variable_update=horovod
            - --optimizer=momentum
            - --nodistortions
            - --gradient_repacking=8
            - --weight_decay=1e-4
            - --use_fp16=true
            - --data_dir=/data/tensorflow/
            - --data_name=imagenet
      volumes:
        - name: work-volume
          hostPath:
            # directory locally mounted on host
            path: /work
            type: Directory
        - name: mem-volume
          hostPath:
            # dev shm directory on host
            path: /dev/shm
            type: Directory