Explorar o código

Merge branch 'devel' into workflow

John Lockman hai 3 anos
pai
achega
a90e3b4220

+ 0 - 54
examples/k8s-TensorFlow-resnet50-multinode-MPIOperator.yaml

@@ -1,54 +0,0 @@
-# Run multi-node training benchmark w/ Nvidia NGC Container: nvcr.io/nvidia/tensorflow:19.06-py3
-#
-# 2 C4140 compute nodes
-#  - 8 V100 GPUs
-#  - ConnectX-5
-#  - IPoIB EDR Infiniband in Ethernet mode
-#
-apiVersion: kubeflow.org/v1alpha1
-kind: MPIJob
-metadata:
-  name: tensorflow-benchmarks-resnet50
-spec:
-  replicas: 2
-  template:
-    spec:
-      containers:
-      - image: nvcr.io/nvidia/tensorflow:19.06-py3
-        name: tensorflow-benchmarks
-        volumeMounts:
-          - mountPath: /foo
-            name: work-volume
-          - mountPath: /data
-            name: mem-volume
-        resources:
-          limits:
-            nvidia.com/gpu: 4
-        command:
-          - mpirun
-          - --allow-run-as-root
-          - --map-by
-          - numa
-          - python
-          - /foo/tensorflow/benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py
-          - --batch_size=512
-          - --model=resnet50
-          - --variable_update=horovod
-          - --optimizer=momentum
-          - --nodistortions
-          - --gradient_repacking=8
-          - --weight_decay=1e-4
-          - --use_fp16=true
-          - --data_dir=/data/tensorflow/
-          - --data_name=imagenet
-      volumes:
-      - name: work-volume
-        hostPath:
-          # directory locally mounted on host
-          path: /work
-          type: Directory
-      - name: mem-volume
-        hostPath:
-          # dev shm directory on host
-          path: /dev/shm
-          type: Directory

+ 8 - 2
roles/k8s_start_services/tasks/deploy_k8s_services.yml

@@ -96,11 +96,13 @@
 - name: Helm - add Nvidia k8s-device-plugin (nvdp) repo
   command: "helm repo add nvdp '{{ nvidia_k8s_device_plugin_repo_url }}'"
   changed_when: true
+  when: ansible_local.inventory.nvidia_gpu > 0
   tags: init
 
 - name: Helm - add Nvidia GPU discovery (nvgfd) repo
   command: "helm repo add nvgfd '{{ nvidia_gpu_discovery_repo_url }}'"
   changed_when: true
+  when: ansible_local.inventory.nvidia_gpu > 0
   tags: init
 
 - name: Helm - update repo
@@ -189,13 +191,17 @@
 - name: Install nvidia-device-plugin
   command: "helm install --version='{{ nvidia_device_plugin_version }}' --generate-name --set migStrategy='{{ mig_strategy }}' nvdp/nvidia-device-plugin"
   changed_when: true
-  when: "'nvidia-device-plugin' not in k8s_pods.stdout"
+  when:
+    - "'nvidia-device-plugin' not in k8s_pods.stdout"
+    - ansible_local.inventory.nvidia_gpu > 0
   tags: init
 
 - name: Install GPU Feature Discovery
   command: "helm install --version='{{ gpu_feature_discovery_version }}' --generate-name --set migStrategy='{{ mig_strategy }}' nvgfd/gpu-feature-discovery"
   changed_when: true
-  when: "'node-feature-discovery' not in k8s_pods.stdout"
+  when:
+    - "'node-feature-discovery' not in k8s_pods.stdout"
+    - ansible_local.inventory.nvidia_gpu > 0
   tags: init
 
 - name: Deploy Xilinx Device plugin

+ 1 - 1
roles/k8s_start_services/vars/main.yml

@@ -91,7 +91,7 @@ prometheus_path_on_host: /var/lib/prometheus-2.23.0.linux-amd64/
 
 spark_operator_repo: https://googlecloudplatform.github.io/spark-on-k8s-operator
 
-operator_image_tag: latest
+operator_image_tag: v1beta2-1.3.3-3.1.1
 
 volcano_scheduling_yaml_url: https://raw.githubusercontent.com/volcano-sh/volcano/v1.3.0/installer/volcano-development.yaml