# Run a multi-node training benchmark with the NVIDIA NGC container
# nvcr.io/nvidia/tensorflow:19.06-py3
#
# 2 C4140 compute nodes
#  - 8 V100 GPUs in total (4 per node)
#  - ConnectX-5
#  - IPoIB EDR InfiniBand in Ethernet mode
#
---
# Kubeflow MPIJob that uses mpirun to launch the tf_cnn_benchmarks
# ResNet-50 benchmark with Horovod variable updates.
apiVersion: kubeflow.org/v1alpha1
kind: MPIJob
metadata:
  name: tensorflow-benchmarks-resnet50
spec:
  # 2 replicas x 4 GPUs per container = 8 GPUs.
  replicas: 2
  template:
    spec:
      containers:
        - name: tensorflow-benchmarks
          image: nvcr.io/nvidia/tensorflow:19.06-py3
          resources:
            limits:
              nvidia.com/gpu: 4
          volumeMounts:
            # Host /work; the benchmark script is read from here.
            - name: work-volume
              mountPath: /foo
            # Host /dev/shm; the dataset is read from here (--data_dir).
            - name: mem-volume
              mountPath: /data
          command:
            # MPI launcher and its options.
            - mpirun
            - --allow-run-as-root
            - --map-by
            - numa
            # Benchmark script, located on the work-volume mount.
            - python
            - /foo/tensorflow/benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py
            # Benchmark options.
            - --batch_size=512
            - --model=resnet50
            - --variable_update=horovod
            - --optimizer=momentum
            - --nodistortions
            - --gradient_repacking=8
            - --weight_decay=1e-4
            - --use_fp16=true
            # Dataset location, on the mem-volume mount.
            - --data_dir=/data/tensorflow/
            - --data_name=imagenet
      volumes:
        - name: work-volume
          hostPath:
            # Directory mounted locally on the host.
            path: /work
            type: Directory
        - name: mem-volume
          hostPath:
            # Shared-memory (/dev/shm) directory on the host.
            path: /dev/shm
            type: Directory