hai 3 anos · a90e3b4220
--- a/examples/k8s-TensorFlow-resnet50-multinode-MPIOperator.yaml
+++ b/examples/k8s-TensorFlow-resnet50-multinode-MPIOperator.yaml
@@ -1,54 +0,0 @@
 
				-# Run multi-node training benchmark w/ Nvidia NGC Container: nvcr.io/nvidia/tensorflow:19.06-py3
			
 
				-#
			
 
				-# 2 C4140 compute nodes
			
 
				-#  - 8 V100 GPUs
			
 
				-#  - ConnectX-5
			
 
				-#  - IPoIB EDR Infiniband in Ethernet mode
			
 
				-#
			
 
				-apiVersion: kubeflow.org/v1alpha1
			
 
				-kind: MPIJob
			
 
				-metadata:
			
 
				-  name: tensorflow-benchmarks-resnet50
			
 
				-spec:
			
 
				-  replicas: 2
			
 
				-  template:
			
 
				-    spec:
			
 
				-      containers:
			
 
				-      - image: nvcr.io/nvidia/tensorflow:19.06-py3
			
 
				-        name: tensorflow-benchmarks
			
 
				-        volumeMounts:
			
 
				-          - mountPath: /foo
			
 
				-            name: work-volume
			
 
				-          - mountPath: /data
			
 
				-            name: mem-volume
			
 
				-        resources:
			
 
				-          limits:
			
 
				-            nvidia.com/gpu: 4
			
 
				-        command:
			
 
				-          - mpirun
			
 
				-          - --allow-run-as-root
			
 
				-          - --map-by
			
 
				-          - numa
			
 
				-          - python
			
 
				-          - /foo/tensorflow/benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py
			
 
				-          - --batch_size=512
			
 
				-          - --model=resnet50
			
 
				-          - --variable_update=horovod
			
 
				-          - --optimizer=momentum
			
 
				-          - --nodistortions
			
 
				-          - --gradient_repacking=8
			
 
				-          - --weight_decay=1e-4
			
 
				-          - --use_fp16=true
			
 
				-          - --data_dir=/data/tensorflow/
			
 
				-          - --data_name=imagenet
			
 
				-      volumes:
			
 
				-      - name: work-volume
			
 
				-        hostPath:
			
 
				-          # directory locally mounted on host
			
 
				-          path: /work
			
 
				-          type: Directory
			
 
				-      - name: mem-volume
			
 
				-        hostPath:
			
 
				-          # dev shm directory on host
			
 
				-          path: /dev/shm
			
 
				-          type: Directory
			
--- a/roles/k8s_start_services/tasks/deploy_k8s_services.yml
+++ b/roles/k8s_start_services/tasks/deploy_k8s_services.yml
@@ -96,11 +96,13 @@
 
				 - name: Helm - add Nvidia k8s-device-plugin (nvdp) repo
			
 
				   command: "helm repo add nvdp '{{ nvidia_k8s_device_plugin_repo_url }}'"
			
 
				   changed_when: true
			
 
				+  when: ansible_local.inventory.nvidia_gpu > 0
			
 
				   tags: init
			
 
				 
			
 
				 - name: Helm - add Nvidia GPU discovery (nvgfd) repo
			
 
				   command: "helm repo add nvgfd '{{ nvidia_gpu_discovery_repo_url }}'"
			
 
				   changed_when: true
			
 
				+  when: ansible_local.inventory.nvidia_gpu > 0
			
 
				   tags: init
			
 
				 
			
 
				 - name: Helm - update repo
			
@@ -189,13 +191,17 @@
 
				 - name: Install nvidia-device-plugin
			
 
				   command: "helm install --version='{{ nvidia_device_plugin_version }}' --generate-name --set migStrategy='{{ mig_strategy }}' nvdp/nvidia-device-plugin"
			
 
				   changed_when: true
			
 
				-  when: "'nvidia-device-plugin' not in k8s_pods.stdout"
			
 
				+  when:
			
 
				+    - "'nvidia-device-plugin' not in k8s_pods.stdout"
			
 
				+    - ansible_local.inventory.nvidia_gpu > 0
			
 
				   tags: init
			
 
				 
			
 
				 - name: Install GPU Feature Discovery
			
 
				   command: "helm install --version='{{ gpu_feature_discovery_version }}' --generate-name --set migStrategy='{{ mig_strategy }}' nvgfd/gpu-feature-discovery"
			
 
				   changed_when: true
			
 
				-  when: "'node-feature-discovery' not in k8s_pods.stdout"
			
 
				+  when:
			
 
				+    - "'node-feature-discovery' not in k8s_pods.stdout"
			
 
				+    - ansible_local.inventory.nvidia_gpu > 0
			
 
				   tags: init
			
 
				 
			
 
				 - name: Deploy Xilinx Device plugin
			
--- a/roles/k8s_start_services/vars/main.yml
+++ b/roles/k8s_start_services/vars/main.yml
@@ -91,7 +91,7 @@ prometheus_path_on_host: /var/lib/prometheus-2.23.0.linux-amd64/
 
				 
			
 
				 spark_operator_repo: https://googlecloudplatform.github.io/spark-on-k8s-operator
			
 
				 
			
 
				-operator_image_tag: latest
			
 
				+operator_image_tag: v1beta2-1.3.3-3.1.1
			
 
				 
			
 
				 volcano_scheduling_yaml_url: https://raw.githubusercontent.com/volcano-sh/volcano/v1.3.0/installer/volcano-development.yaml