# k8s-tensorflow-nvidia-ngc-resnet50-multinode-mpioperator.yaml
#
# Kubeflow MPIJob that runs the tf_cnn_benchmarks ResNet-50 benchmark with
# Horovod (--variable_update=horovod) inside the NVIDIA NGC TensorFlow image.
# The Launcher pod executes mpirun; the Worker pod(s) hold the GPUs.
#
# The benchmark script is read from the host directory /work, which both pod
# templates mount at /local_mount, so the tensorflow/benchmarks checkout must
# be present under /work on every node that can schedule these pods.
---
apiVersion: kubeflow.org/v1alpha2
kind: MPIJob
metadata:
  name: tensorflow-benchmarks
spec:
  # Same value as the Worker's nvidia.com/gpu limit below
  # (presumably one MPI slot per GPU -- confirm before changing either).
  slotsPerWorker: 4
  cleanPodPolicy: Running
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          containers:
            - image: nvcr.io/nvidia/tensorflow:19.06-py3
              imagePullPolicy: IfNotPresent
              name: tensorflow-benchmarks
              volumeMounts:
                - mountPath: /local_mount
                  name: work-volume
              command:
                - mpirun
                - --allow-run-as-root
                # NOTE(review): 4 == Worker replicas (1) x slotsPerWorker (4);
                # these look like they must be updated together when scaling.
                - -np
                - "4"
                - -bind-to
                - none
                # Processes are mapped by NUMA node; the previous choice is
                # kept below for reference.
                - -map-by
                # - slot
                - numa
                # Each -x names an environment variable passed through mpirun
                # to the launched processes.
                - -x
                - NCCL_DEBUG=INFO
                - -x
                - LD_LIBRARY_PATH
                - python
                # Resolved through the work-volume mount (/work on the host).
                - /local_mount/tensorflow/benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py
                - --batch_size=512
                - --model=resnet50
                - --variable_update=horovod
                - --optimizer=momentum
                - --nodistortions
                - --gradient_repacking=8
                - --weight_decay=1e-4
                - --use_fp16=true
          volumes:
            - name: work-volume
              hostPath:
                # directory locally mounted on host
                path: /work
                type: Directory
    Worker:
      replicas: 1
      template:
        spec:
          containers:
            - image: nvcr.io/nvidia/tensorflow:19.06-py3
              imagePullPolicy: IfNotPresent
              name: tensorflow-benchmarks
              resources:
                limits:
                  nvidia.com/gpu: 4
              volumeMounts:
                - mountPath: /local_mount
                  name: work-volume
          volumes:
            - name: work-volume
              hostPath:
                # directory locally mounted on host
                path: /work
                type: Directory