---
# MPIJob that runs the tf_cnn_benchmarks ResNet-50 benchmark with Horovod.
# One Launcher pod invokes mpirun; one Worker pod requests 4 GPUs.
# Both pods mount the host directory /work at /local_mount, and the benchmark
# script is read from that mount.
apiVersion: kubeflow.org/v1alpha2
kind: MPIJob
metadata:
  name: tensorflow-benchmarks
spec:
  # Matches the Worker's nvidia.com/gpu limit (4) and the mpirun "-np" value
  # below. NOTE(review): presumably -np must equal
  # Worker replicas x slotsPerWorker - confirm before scaling either one.
  slotsPerWorker: 4
  cleanPodPolicy: Running
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          containers:
            - image: nvcr.io/nvidia/tensorflow:19.06-py3
              imagePullPolicy: IfNotPresent
              name: tensorflow-benchmarks
              volumeMounts:
                - mountPath: /local_mount
                  name: work-volume
              command:
                - mpirun
                - --allow-run-as-root
                - -np
                # Quoted so the process count stays a string argument.
                - "4"
                - -bind-to
                - none
                - -map-by
                # Disabled alternative value for -map-by, kept for reference:
                # - slot
                - numa
                - -x
                - NCCL_DEBUG=INFO
                - -x
                - LD_LIBRARY_PATH
                - python
                - /local_mount/tensorflow/benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py
                - --batch_size=512
                - --model=resnet50
                - --variable_update=horovod
                - --optimizer=momentum
                - --nodistortions
                - --gradient_repacking=8
                - --weight_decay=1e-4
                - --use_fp16=true
          volumes:
            - name: work-volume
              hostPath:
                # directory locally mounted on host
                path: /work
                type: Directory
    Worker:
      replicas: 1
      template:
        spec:
          containers:
            - image: nvcr.io/nvidia/tensorflow:19.06-py3
              imagePullPolicy: IfNotPresent
              name: tensorflow-benchmarks
              resources:
                limits:
                  nvidia.com/gpu: 4
              volumeMounts:
                - mountPath: /local_mount
                  name: work-volume
          volumes:
            - name: work-volume
              hostPath:
                # directory locally mounted on host
                path: /work
                type: Directory