---
# Run a multi-node training benchmark with the NVIDIA NGC container
# nvcr.io/nvidia/tensorflow:19.06-py3.
#
# Hardware this manifest was written for:
#   2 C4140 compute nodes
#     - 8 V100 GPUs
#     - ConnectX-5
#     - IPoIB EDR InfiniBand in Ethernet mode
#
# The job requests 2 replicas with 4 GPUs each (see `replicas` and the
# `nvidia.com/gpu` limit below).
apiVersion: kubeflow.org/v1alpha1
kind: MPIJob
metadata:
  name: tensorflow-benchmarks-resnet50
spec:
  # Number of worker replicas; each one gets the per-container GPU limit below.
  replicas: 2
  template:
    spec:
      containers:
        - image: nvcr.io/nvidia/tensorflow:19.06-py3
          name: tensorflow-benchmarks
          volumeMounts:
            # Host /work appears as /foo; the benchmark script is read from it.
            - mountPath: /foo
              name: work-volume
            # Host /dev/shm appears as /data; --data_dir points into it.
            - mountPath: /data
              name: mem-volume
          resources:
            limits:
              nvidia.com/gpu: 4
          # One argv element per list item; launched via mpirun.
          command:
            - mpirun
            - --allow-run-as-root
            - --map-by
            - numa
            - python
            - /foo/tensorflow/benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py
            - --batch_size=512
            - --model=resnet50
            - --variable_update=horovod
            - --optimizer=momentum
            - --nodistortions
            - --gradient_repacking=8
            - --weight_decay=1e-4
            - --use_fp16=true
            - --data_dir=/data/tensorflow/
            - --data_name=imagenet
      volumes:
        - name: work-volume
          hostPath:
            # directory locally mounted on host
            path: /work
            type: Directory
        - name: mem-volume
          hostPath:
            # dev shm directory on host
            path: /dev/shm
            type: Directory