|
@@ -0,0 +1,69 @@
|
|
|
|
+apiVersion: kubeflow.org/v1alpha2
|
|
|
|
+kind: MPIJob
|
|
|
|
+metadata:
|
|
|
|
+ name: tensorflow-benchmarks
|
|
|
|
+spec:
|
|
|
|
+ slotsPerWorker: 4
|
|
|
|
+ cleanPodPolicy: Running
|
|
|
|
+ mpiReplicaSpecs:
|
|
|
|
+ Launcher:
|
|
|
|
+ replicas: 1
|
|
|
|
+ template:
|
|
|
|
+ spec:
|
|
|
|
+ containers:
|
|
|
|
+ - image: nvcr.io/nvidia/tensorflow:19.06-py3
|
|
|
|
+ imagePullPolicy: IfNotPresent
|
|
|
|
+ name: tensorflow-benchmarks
|
|
|
|
+ volumeMounts:
|
|
|
|
+ - mountPath: /local_mount
|
|
|
|
+ name: work-volume
|
|
|
|
+ command:
|
|
|
|
+ - mpirun
|
|
|
|
+ - --allow-run-as-root
|
|
|
|
+ - -np
|
|
|
|
+ - "4"
|
|
|
|
+ - -bind-to
|
|
|
|
+ - none
|
|
|
|
+ - -map-by
|
|
|
|
+ #- slot
|
|
|
|
+ - numa
|
|
|
|
+ - -x
|
|
|
|
+ - NCCL_DEBUG=INFO
|
|
|
|
+ - -x
|
|
|
|
+ - LD_LIBRARY_PATH
|
|
|
|
+ - python
|
|
|
|
+ - /local_mount/tensorflow/benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py
|
|
|
|
+ - --batch_size=512
|
|
|
|
+ - --model=resnet50
|
|
|
|
+ - --variable_update=horovod
|
|
|
|
+ - --optimizer=momentum
|
|
|
|
+ - --nodistortions
|
|
|
|
+ - --gradient_repacking=8
|
|
|
|
+ - --weight_decay=1e-4
|
|
|
|
+ - --use_fp16=true
|
|
|
|
+ volumes:
|
|
|
|
+ - name: work-volume
|
|
|
|
+ hostPath:
|
|
|
|
+ # directory locally mounted on host
|
|
|
|
+ path: /work
|
|
|
|
+ type: Directory
|
|
|
|
+ Worker:
|
|
|
|
+ replicas: 1
|
|
|
|
+ template:
|
|
|
|
+ spec:
|
|
|
|
+ containers:
|
|
|
|
+ - image: nvcr.io/nvidia/tensorflow:19.06-py3
|
|
|
|
+ imagePullPolicy: IfNotPresent
|
|
|
|
+ name: tensorflow-benchmarks
|
|
|
|
+ resources:
|
|
|
|
+ limits:
|
|
|
|
+ nvidia.com/gpu: 4
|
|
|
|
+ volumeMounts:
|
|
|
|
+ - mountPath: /local_mount
|
|
|
|
+ name: work-volume
|
|
|
|
+ volumes:
|
|
|
|
+ - name: work-volume
|
|
|
|
+ hostPath:
|
|
|
|
+ # directory locally mounted on host
|
|
|
|
+ path: /work
|
|
|
|
+ type: Directory
|