Prechádzať zdrojové kódy

adding updated TF example using NGC and latest MPI Operator
resolves issue #77

Signed-off-by: John Lockman <jlockman3@gmail.com>

John Lockman 4 rokov pred
rodič
commit
de429f2459

+ 69 - 0
examples/k8s-TensorFlow-NvidiaNGC-esnet50-multinode-MPIOperator.yaml

@@ -0,0 +1,69 @@
+apiVersion: kubeflow.org/v1alpha2  # MPIJob resource of the Kubeflow MPI Operator
+kind: MPIJob
+metadata:
+  name: tensorflow-benchmarks
+spec:
+  slotsPerWorker: 4  # one slot per GPU requested by each worker below
+  cleanPodPolicy: Running
+  mpiReplicaSpecs:
+    Launcher:  # runs mpirun; requests no GPUs itself
+      replicas: 1
+      template:
+        spec:
+          containers:
+            - image: nvcr.io/nvidia/tensorflow:19.06-py3
+              imagePullPolicy: IfNotPresent
+              name: tensorflow-benchmarks
+              volumeMounts:
+                - mountPath: /local_mount
+                  name: work-volume
+              command:
+                - mpirun
+                - --allow-run-as-root
+                - -np
+                - "4"  # keep equal to slotsPerWorker x Worker replicas (4 x 1)
+                - -bind-to
+                - none
+                - -map-by
+                # map-by policy in use is "numa"; "slot" is the disabled alternative
+                - numa
+                - -x
+                - NCCL_DEBUG=INFO
+                - -x
+                - LD_LIBRARY_PATH
+                - python
+                - /local_mount/tensorflow/benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py
+                - --batch_size=512
+                - --model=resnet50
+                - --variable_update=horovod
+                - --optimizer=momentum
+                - --nodistortions
+                - --gradient_repacking=8
+                - --weight_decay=1e-4
+                - --use_fp16=true
+          volumes:
+            - name: work-volume
+              hostPath:
+                # host directory; must hold tensorflow/benchmarks (see script path)
+                path: /work
+                type: Directory
+    Worker:
+      replicas: 1
+      template:
+        spec:
+          containers:
+            - image: nvcr.io/nvidia/tensorflow:19.06-py3
+              imagePullPolicy: IfNotPresent
+              name: tensorflow-benchmarks
+              resources:
+                limits:
+                  nvidia.com/gpu: 4  # matches slotsPerWorker above
+              volumeMounts:
+                - mountPath: /local_mount
+                  name: work-volume
+          volumes:
+            - name: work-volume
+              hostPath:
+                # same host directory and mount point as the Launcher uses
+                path: /work
+                type: Directory