---
# MPIJob that runs the tf_cnn_benchmarks script (ResNet-50, Horovod variable
# update, fp16) through mpirun on the NVIDIA TensorFlow 19.06 image.
#
# Layout: one Launcher pod that invokes mpirun, and one Worker pod with a
# limit of 4 GPUs. Both pods mount the host directory /work at /local_mount;
# the benchmark script is read from that mount, so the path must exist on the
# node(s) the pods are scheduled to (hostPath type is "Directory").
apiVersion: kubeflow.org/v1alpha2
kind: MPIJob
metadata:
  name: tensorflow-benchmarks
spec:
  # NOTE(review): slotsPerWorker (4) x Worker replicas (1) equals the "-np 4"
  # passed to mpirun below and the Worker GPU limit; keep these in sync when
  # scaling the job.
  slotsPerWorker: 4
  cleanPodPolicy: Running
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          containers:
            - image: nvcr.io/nvidia/tensorflow:19.06-py3
              imagePullPolicy: IfNotPresent
              name: tensorflow-benchmarks
              volumeMounts:
                - mountPath: /local_mount
                  name: work-volume
              command:
                - mpirun
                - --allow-run-as-root
                # Total number of MPI processes to start.
                - -np
                - "4"
                - -bind-to
                - none
                # Mapping policy: "numa" is active; "slot" is kept below as
                # a commented-out alternative.
                - -map-by
                # - slot
                - numa
                # Environment variables handed to mpirun via -x.
                - -x
                - NCCL_DEBUG=INFO
                - -x
                - LD_LIBRARY_PATH
                # Benchmark entry point, read from the hostPath mount.
                - python
                - /local_mount/tensorflow/benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py
                - --batch_size=512
                - --model=resnet50
                - --variable_update=horovod
                - --optimizer=momentum
                - --nodistortions
                - --gradient_repacking=8
                - --weight_decay=1e-4
                - --use_fp16=true
          volumes:
            - name: work-volume
              hostPath:
                # directory locally mounted on host
                path: /work
                type: Directory
    Worker:
      replicas: 1
      template:
        spec:
          containers:
            - image: nvcr.io/nvidia/tensorflow:19.06-py3
              imagePullPolicy: IfNotPresent
              name: tensorflow-benchmarks
              resources:
                limits:
                  nvidia.com/gpu: 4
              volumeMounts:
                - mountPath: /local_mount
                  name: work-volume
          volumes:
            - name: work-volume
              hostPath:
                # directory locally mounted on host
                path: /work
                type: Directory