Jelajahi Sumber

Merge pull request #158 from j0hnL/issue-29

update nvidia-device-plugin
John Lockman 4 tahun lalu
induk
melakukan
d768ccedd7

+ 3 - 0
kubernetes/host_inventory_file

@@ -18,3 +18,6 @@ all:
       vars:
         single_node: false
         manager_ip: 10.0.0.100
+        nfs_server: 10.0.0.100
+        nfs_path: /work
+        MIG_STRATEGY: none

+ 0 - 3
kubernetes/roles/common/files/nvidia

@@ -1,3 +0,0 @@
-#!/bin/sh
-PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" exec nvidia-container-runtime-hook "$@"
-

+ 9 - 46
kubernetes/roles/common/tasks/main.yml

@@ -31,6 +31,14 @@
     state: present
   tags: install
 
+# Repo definition for the docker-ce package listed in the package-install task later in this file.
+# Tagged like the neighbouring tasks so a `--tags install` run does not skip the repo setup.
+- name: Add Docker Community Edition Repo
+  get_url:
+    url: https://download.docker.com/linux/centos/docker-ce.repo
+    dest: /etc/yum.repos.d/docker-ce.repo
+  tags: install
+
 - name: update sysctl to handle incorrectly routed traffic when iptables is bypassed
   copy: src=k8s.conf dest=/etc/sysctl.d/ owner=root group=root mode=644
   tags: install
@@ -43,11 +51,6 @@
   yum: name=epel-release state=present
   tags: install
 
-#likely need to add a reboot hook in here
-#- name: update kernel and all other system packages
-  #yum: name=* state=latest
-  #tags: install
-
 - name: disable swap
   command: /sbin/swapoff -a
   tags: install
@@ -64,7 +67,7 @@
       - gcc
       - nfs-utils
       - python-pip
-      - docker
+      - docker-ce
       - bash-completion
       - kubelet-1.16.7
       - kubeadm-1.16.7
@@ -77,7 +80,6 @@
   command: yum versionlock kubelet-1.16.7 kubectl-1.16.7 kubeadm-1.16.7
   tags: install
 
-
 - name: install InfiniBand Support
   package:
     name: "@Infiniband Support"
@@ -87,17 +89,6 @@
   command: /bin/pip install --upgrade pip
   tags: install
 
-#- name: Enable DevicePlugins for all GPU nodes (nvidia-container-runtime-hook)
-  #copy: src=nvidia dest=/usr/libexec/oci/hooks.d/ owner=root group=root mode=755
-  #tags: install
-
-- name: Add KUBE_EXTRA_ARGS to enable GPUs
-  lineinfile:
-    path: /usr/lib/systemd/system/kubelet.service.d/10-kubeadm.conf
-    line: 'Environment="KUBELET_EXTRA_ARGS=--feature-gates=DevicePlugins=true"'
-    insertbefore: 'KUBELET_KUBECONFIG_ARGS='
-  tags: install
-
 - name: Start and Enable docker service
   service:
     name: docker
@@ -111,31 +102,3 @@
     state: restarted
     enabled: yes
   tags: install
-
-- name: Start and rpcbind service
-  service:
-    name: rpcbind
-    state: restarted
-    enabled: yes
-  tags: install
-
-- name: Start and nfs-server service
-  service:
-    name: nfs-server
-    state: restarted
-    enabled: yes
-  tags: install
-
-- name: Start and nfs-lock service
-  service:
-    name: nfs-lock
-    #state: restarted
-    enabled: yes
-  tags: install
-
-- name: Start and nfs-idmap service
-  service:
-    name: nfs-idmap
-    state: restarted
-    enabled: yes
-  tags: install

+ 9 - 0
kubernetes/roles/computeGPU/files/daemon.json

@@ -0,0 +1,9 @@
+{
+  "runtimes": {
+    "nvidia": {
+      "path": "nvidia-container-runtime",
+      "runtimeArgs": []
+    }
+  },
+  "default-runtime": "nvidia"
+}

+ 12 - 29
kubernetes/roles/computeGPU/tasks/main.yml

@@ -13,20 +13,6 @@
 #  limitations under the License.
 
 ---
-- name: install Nvidia driver
-  package:
-    name:
-      - kmod-nvidia
-      #- nvidia-x11-drv
-    state: present
-  tags: install
-
-#- name: add Nvidia container runtime support
-  #get_url:
-    #url: https://nvidia.github.io/nvidia-docker/centos7/nvidia-docker.repo
-    #dest: /etc/yum.repos.d/nvidia-docker.repo
-  #tags: install
-
 - name: add Nvidia container runtime support
   get_url:
     url: https://nvidia.github.io/nvidia-container-runtime/centos7/nvidia-container-runtime.repo
@@ -39,29 +25,26 @@
     regexp: 'repo_gpgcheck=1'
     replace: 'repo_gpgcheck=0'
     backup: yes
-  tags: testing
+  tags: install
 
 - name: install Nvidia-container-runtime-hook
   package:
     name:
-      #- nvidia-detect
-      #- kmod-nvidia-410.73-1.el7_5.elrepo
+      - kmod-nvidia
       - nvidia-container-runtime-hook
+      - nvidia-docker2
     state: present
   tags: install
 
-
-# This needs to be done on GPU nodes
-#- name: Enable DevicePlugins for all GPU nodes (nvidia-container-runtime-hook)
-  #copy: src=nvidia dest=/usr/libexec/oci/hooks.d/ owner=root group=root mode=755
-  #tags: install
-
-#- name: Add KUBE_EXTRA_ARGS to enable Plugins (GPU support)  --III alreday done in common
-  #lineinfile:
-    #path: /etc/systemd/system/kubelet.service.d/10-kubeadm.conf
-    #line: 'Environment="KUBELET_EXTRA_ARGS=--feature-gates=DevicePlugins=true"'
-    #insertbefore: 'KUBELET_KUBECONFIG_ARGS='
-  #tags: install
+# The role's daemon.json registers the nvidia runtime and selects it as Docker's default runtime.
+- name: Set nvidia as default runtime
+  copy:
+    src: daemon.json
+    dest: /etc/docker/
+    owner: root
+    group: root
+    mode: "0644"
+  tags: install
 
 - name: Restart and Enable docker service
   service:

+ 20 - 1
kubernetes/roles/startservices/tasks/main.yml

@@ -49,12 +49,21 @@
   shell: helm repo add stable https://charts.helm.sh/stable
   tags: init
 
+- name: Helm - Add Nvidia k8s-device-plugin (nvdp) Repo
+  shell: helm repo add nvdp https://nvidia.github.io/k8s-device-plugin
+  tags: init
+
+- name: Helm - Add Nvidia GPU Discovery (nvgfd) Repo
+  shell: helm repo add nvgfd https://nvidia.github.io/gpu-feature-discovery
+  tags: init
+
 - name: Helm - Update Repo
   shell: helm repo update
   tags: init
 
+# The quote filter shell-escapes the inventory-supplied values before they reach the command line.
 - name: Start NFS Client Provisioner
-  shell: helm install stable/nfs-client-provisioner --set nfs.server=10.0.0.1 --set nfs.path=/work --generate-name
+  shell: helm install stable/nfs-client-provisioner --set nfs.server={{ nfs_server | quote }} --set nfs.path={{ nfs_path | quote }} --generate-name
   tags: init
 
 - name: Set NFS-Client Provisioner as DEFAULT StorageClass
@@ -68,3 +77,13 @@
 - name: Install MPI Operator
   shell: kubectl create -f https://raw.githubusercontent.com/kubeflow/mpi-operator/master/deploy/v1alpha2/mpi-operator.yaml
   tags: init
+
+# MIG_STRATEGY is shell-escaped and passed through as the chart's migStrategy value.
+- name: Install nvidia-device-plugin
+  shell: helm install --version=0.7.0 --generate-name --set migStrategy={{ MIG_STRATEGY | quote }} nvdp/nvidia-device-plugin
+  tags: init
+
+# Receives the same shell-escaped MIG_STRATEGY value via --set migStrategy.
+- name: Install GPU Feature Discovery
+  shell: helm install --version=0.2.0 --generate-name --set migStrategy={{ MIG_STRATEGY | quote }} nvgfd/gpu-feature-discovery
+  tags: init