# k8s-TensorFlow-resnet50-multinode-MPIOperator.yaml
#
# Run multi-node training benchmark w/ Nvidia NGC Container: nvcr.io/nvidia/tensorflow:19.06-py3
#
# 2 C4140 compute nodes
# - 8 V100 GPUs
# - ConnectX-5
# - IPoIB EDR Infiniband in Ethernet mode
#
---
apiVersion: kubeflow.org/v1alpha1
kind: MPIJob
metadata:
  name: tensorflow-benchmarks-resnet50
spec:
  # One worker replica per compute node; 4 GPUs requested per replica below.
  replicas: 2
  template:
    spec:
      containers:
      - image: nvcr.io/nvidia/tensorflow:19.06-py3
        name: tensorflow-benchmarks
        volumeMounts:
        - mountPath: /foo
          name: work-volume
        - mountPath: /data
          name: mem-volume
        resources:
          limits:
            nvidia.com/gpu: 4
        # mpirun launches tf_cnn_benchmarks with Horovod across the replicas.
        command:
        - mpirun
        - --allow-run-as-root
        - --map-by
        - numa
        - python
        - /foo/tensorflow/benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py
        - --batch_size=512
        - --model=resnet50
        - --variable_update=horovod
        - --optimizer=momentum
        - --nodistortions
        - --gradient_repacking=8
        - --weight_decay=1e-4
        - --use_fp16=true
        - --data_dir=/data/tensorflow/
        - --data_name=imagenet
      volumes:
      - name: work-volume
        hostPath:
          # directory locally mounted on host
          path: /work
          type: Directory
      - name: mem-volume
        hostPath:
          # dev shm directory on host
          path: /dev/shm
          type: Directory