# MPIJob that runs the TensorFlow CNN benchmarks (tf_cnn_benchmarks.py)
# through mpirun: one Launcher pod plus the Worker pods defined below.
apiVersion: kubeflow.org/v1alpha2
kind: MPIJob
metadata:
  name: tensorflow-benchmarks
spec:
  # Slots per worker; matches the `-np "4"` passed to mpirun in the Launcher
  # and the 4 GPUs requested by each Worker.
  slotsPerWorker: 4
  # NOTE(review): presumably selects which pods the operator removes once the
  # job ends - confirm against the MPI Operator documentation.
  cleanPodPolicy: Running
  # One entry per replica role (Launcher, Worker).
  mpiReplicaSpecs:
    # Launcher: a single pod whose container command is the mpirun invocation
    # that starts the benchmark processes.
    Launcher:
      replicas: 1
      template:
        spec:
          containers:
            - image: nvcr.io/nvidia/tensorflow:19.06-py3
              imagePullPolicy: IfNotPresent
              name: tensorflow-benchmarks
              volumeMounts:
                # Exposes the host's /work directory (see `volumes` below)
                # inside the container at /local_mount.
                - mountPath: /local_mount
                  name: work-volume
              command:
                - mpirun
                - --allow-run-as-root
                # Process count; quoted so the argv entry stays a string.
                # Equals slotsPerWorker (4) x Worker replicas (1).
                - -np
                - "4"
                - -bind-to
                - none
                # Active mapping policy is `numa`; the alternative `slot`
                # value was previously tried and is intentionally disabled.
                - -map-by
                - numa
                # Each `-x` forwards one environment variable to the
                # launched processes.
                - -x
                - NCCL_DEBUG=INFO
                - -x
                - LD_LIBRARY_PATH
                - python
                # The benchmark script is read from the mounted volume, i.e.
                # it must exist under /work/tensorflow/... on the host.
                - /local_mount/tensorflow/benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py
                - --batch_size=512
                - --model=resnet50
                - --variable_update=horovod
                - --optimizer=momentum
                - --nodistortions
                - --gradient_repacking=8
                - --weight_decay=1e-4
                - --use_fp16=true
          volumes:
            - name: work-volume
              hostPath:
                # directory locally mounted on host
                path: /work
                type: Directory
    # Worker: pods that provide the GPUs for the benchmark. Each one uses the
    # same image and the same /work host mount as the Launcher.
    Worker:
      replicas: 1
      template:
        spec:
          volumes:
            # Host directory shared into the container; it is mounted at
            # /local_mount by the container's volumeMounts entry.
            - name: work-volume
              hostPath:
                path: /work
                type: Directory
          containers:
            - name: tensorflow-benchmarks
              image: nvcr.io/nvidia/tensorflow:19.06-py3
              imagePullPolicy: IfNotPresent
              volumeMounts:
                - name: work-volume
                  mountPath: /local_mount
              resources:
                limits:
                  # GPUs per worker pod; same value as spec.slotsPerWorker.
                  nvidia.com/gpu: 4