
Issue #154: Update Slurm Playbook

Signed-off-by: VishnupriyaKrish <Vishnupriya_Krishnar@dellteam.com>
Lucas A. Wilson, 4 years ago · commit 60416a20db
42 changed files with 1503 additions and 565 deletions
  1. + 3 - 4 kubernetes/jupyterhub.yml
  2. + 3 - 5 kubernetes/kubeflow.yml
  3. + 22 - 17 kubernetes/kubernetes.yml
  4. + 44 - 42 kubernetes/roles/common/tasks/main.yml
  5. + 40 - 5 kubernetes/roles/common/vars/main.yml
  6. + 0 - 21 kubernetes/roles/computeGPU/handlers/main.yml
  7. + 0 - 10 kubernetes/roles/computeGPU/vars/main.yml
  8. + 0 - 0 kubernetes/roles/compute_gpu/files/daemon.json
  9. + 0 - 0 kubernetes/roles/compute_gpu/files/k8s.conf
  10. + 0 - 0 kubernetes/roles/compute_gpu/files/kubernetes.repo
  11. + 19 - 13 kubernetes/roles/computeGPU/tasks/main.yml
  12. + 26 - 0 kubernetes/roles/compute_gpu/vars/main.yml
  13. + 77 - 0 kubernetes/roles/firewalld/tasks/main.yml
  14. + 43 - 0 kubernetes/roles/firewalld/vars/main.yml
  15. + 45 - 9 kubernetes/roles/jupyterhub/tasks/main.yml
  16. + 26 - 0 kubernetes/roles/jupyterhub/vars/main.yml
  17. + 50 - 39 kubernetes/roles/kubeflow/tasks/main.yml
  18. + 56 - 0 kubernetes/roles/kubeflow/vars/main.yml
  19. + 12 - 25 kubernetes/roles/manager/tasks/main.yml
  20. + 24 - 0 kubernetes/roles/manager/vars/main.yml
  21. + 78 - 41 kubernetes/roles/startmanager/tasks/main.yml
  22. + 52 - 0 kubernetes/roles/startmanager/vars/main.yml
  23. + 60 - 26 kubernetes/roles/startservices/tasks/main.yml
  24. + 46 - 0 kubernetes/roles/startservices/vars/main.yml
  25. + 5 - 26 kubernetes/roles/startworkers/tasks/main.yml
  26. + 3 - 7 slurm/roles/start-slurm-workers/tasks/main.yml
  27. + 28 - 14 slurm/roles/common/handlers/main.yml
  28. + 17 - 2 slurm/roles/common/tasks/main.yml
  29. + 27 - 13 slurm/roles/common/tasks/ntp.yml
  30. + 26 - 1 slurm/roles/common/vars/main.yml
  31. + 0 - 104 slurm/roles/slurm-common/tasks/main.yml
  32. + 0 - 118 slurm/roles/slurm-manager/tasks/main.yml
  33. + 0 - 0 slurm/roles/slurm_common/files/munge.key
  34. + 14 - 14 slurm/roles/slurm-common/files/slurm.conf
  35. + 164 - 0 slurm/roles/slurm_common/tasks/main.yml
  36. + 42 - 0 slurm/roles/slurm_common/vars/main.yml
  37. + 38 - 0 slurm/roles/slurm_manager/files/slurmdbd.conf
  38. + 174 - 0 slurm/roles/slurm_manager/tasks/main.yml
  39. + 62 - 0 slurm/roles/slurm_manager/vars/main.yml
  40. + 64 - 0 slurm/roles/slurm_start_services/tasks/main.yml
  41. + 97 - 0 slurm/roles/start_slurm_workers/tasks/main.yml
  42. + 16 - 9 slurm/slurm.yml

+ 3 - 4
kubernetes/jupyterhub.yml

@@ -13,9 +13,8 @@
 # limitations under the License.
 ---
 
-#Playbook for installing JupyterHub v1.1.0 in Omnia
-# Start K8s worker servers
-- hosts: manager
+- name: Installing JupyterHub
+  hosts: manager
   gather_facts: false
   roles:
-    - jupyterhub
+    - jupyterhub

+ 3 - 5
kubernetes/kubeflow.yml

@@ -11,12 +11,10 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
-
 ---
-#Playbook for installing Kubeflow v1.0 on Omnia
 
-# Start K8s worker servers
-- hosts: manager
+- name: Installing Kubeflow
+  hosts: manager
   gather_facts: false
   roles:
-    - kubeflow
+    - kubeflow

+ 22 - 17
kubernetes/kubernetes.yml

@@ -11,45 +11,50 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
-
 ---
 #Playbook for kubernetes cluster
 
-#collect info from everything
-- hosts: all
+- name: Gather facts from all the nodes
+  hosts: all
 
-# Apply Common Installation and Config
-- hosts: cluster
+- name: Apply common installation and config
+  hosts: manager, compute
   gather_facts: false
   roles:
     - common
 
-# Apply GPU Node Config
-- hosts: gpus
+- name: Apply GPU node config
+  hosts: compute
   gather_facts: false
   roles:
-    - computeGPU
+    - compute_gpu
 
-# Apply Manager Config
-- hosts: manager
+- name: Apply manager config
+  hosts: manager
   gather_facts: false
   roles:
     - manager
 
-# Start K8s on manager server
-- hosts: manager
+- name: Apply firewalld config on manager and compute nodes
+  hosts: manager, compute
+  gather_facts: false
+  roles:
+    - firewalld
+
+- name: Start K8s on manager server
+  hosts: manager
   gather_facts: false
   roles:
     - startmanager
 
-# Start K8s worker servers
-- hosts: compute,gpus
+- name: Start K8s worker servers on compute nodes
+  hosts: compute
   gather_facts: false
   roles:
     - startworkers
 
-# Start K8s worker servers
-- hosts: manager
+- name: Start K8s worker servers on manager nodes
+  hosts: manager
   gather_facts: false
   roles:
-    - startservices
+    - startservices
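
The reworked playbook targets the manager and compute groups directly; the old cluster and gpus groups are gone. A minimal inventory sketch that matches the new group layout (hostnames here are placeholders, not part of the commit):

# inventory.yml (hypothetical hosts, real group names from the playbook above)
all:
  children:
    manager:
      hosts:
        node001:
    compute:
      hosts:
        node002:
        node003: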

+ 44 - 42
kubernetes/roles/common/tasks/main.yml

@@ -11,91 +11,93 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
-
 ---
 
-- name: add kubernetes repo
-  copy: src=kubernetes.repo dest=/etc/yum.repos.d/ owner=root group=root mode=644
+- name: Add kubernetes repo
+  copy:
+    src: kubernetes.repo
+    dest: "{{ k8s_repo_dest }}"
+    owner: root
+    group: root
+    mode: "{{ k8s_repo_file_mode }}"
   tags: install
 
-# add ElRepo GPG Key
-- name: add ElRepo GPG Key
+- name: Add elrepo GPG key
   rpm_key:
     state: present
-    key: https://www.elrepo.org/RPM-GPG-KEY-elrepo.org
+    key: "{{ elrepo_gpg_key_url }}"
   tags: install
 
-- name: add ElRepo (Nvidia kmod drivers)
-  yum:
-    name: http://www.elrepo.org/elrepo-release-7.0-3.el7.elrepo.noarch.rpm
+- name: Add elrepo (nvidia kmod drivers)
+  package:
+    name: "{{ elrepo_rpm_url }}"
     state: present
   tags: install
 
-- name: Add Docker Community Edition Repo
+- name: Add docker community edition repository
   get_url:
-    url: https://download.docker.com/linux/centos/docker-ce.repo
-    dest: /etc/yum.repos.d/docker-ce.repo
-
-- name: update sysctl to handle incorrectly routed traffic when iptables is bypassed
-  copy: src=k8s.conf dest=/etc/sysctl.d/ owner=root group=root mode=644
+    url: "{{ docker_repo_url }}"
+    dest: "{{ docker_repo_dest }}"
   tags: install
 
-- name: update sysctl
-  command: /sbin/sysctl --system
+- name: Update sysctl to handle incorrectly routed traffic when iptables is bypassed
+  copy:
+    src: k8s.conf
+    dest: "{{ k8s_conf_dest }}"
+    owner: root
+    group: root
+    mode: "{{ k8s_conf_file_mode }}"
   tags: install
 
-- name: Install EPEL Repository
-  yum: name=epel-release state=present
+- name: Update sysctl
+  command: /sbin/sysctl --system
+  changed_when: true
   tags: install
 
-- name: disable swap
+- name: Disable swap
   command: /sbin/swapoff -a
+  changed_when: true
   tags: install
 
-- name: Disable SELinux
+- name: Disable selinux
   selinux:
     state: disabled
   tags: install
 
-- name: install common packages
+- name: Install common packages
   package:
-    name:
-      - yum-plugin-versionlock
-      - gcc
-      - nfs-utils
-      - python-pip
-      - docker-ce
-      - bash-completion
-      - kubelet-1.16.7
-      - kubeadm-1.16.7
-      - kubectl-1.16.7
-      - nvidia-detect
+    name: "{{ item }}"
     state: present
+  with_items:
+    - "{{ common_packages }}"
+    - "{{ k8s_packages }}"
   tags: install
 
-- name: versionlock kubernetes
-  command: yum versionlock kubelet-1.16.7 kubectl-1.16.7 kubeadm-1.16.7
+- name: Versionlock kubernetes
+  command: "dnf versionlock '{{ item }}'"
+  args:
+    warn: false
+  with_items:
+    - "{{ k8s_packages }}"
+  changed_when: true
   tags: install
 
-- name: install InfiniBand Support
+- name: Install infiniBand support
   package:
     name: "@Infiniband Support"
     state: present
-
-- name: upgrade pip
-  command: /bin/pip install --upgrade pip
   tags: install
 
-- name: Start and Enable docker service
+- name: Start and enable docker service
   service:
     name: docker
     state: restarted
     enabled: yes
   tags: install
 
-- name: Start and Enable Kubernetes - kubelet
+- name: Start and enable kubernetes - kubelet
   service:
     name: kubelet
     state: restarted
     enabled: yes
-  tags: install
+  tags: install
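
A note on the package loop above: with_items flattens the two list variables into one stream of names, so the package module runs once per package. A functionally equivalent sketch that concatenates the lists and installs everything in a single transaction:

# Sketch: same packages as the with_items form, resolved in one dnf transaction
- name: Install common packages
  package:
    name: "{{ common_packages + k8s_packages }}"
    state: present
  tags: install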

+ 40 - 5
kubernetes/roles/common/vars/main.yml

@@ -1,10 +1,45 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
 ---
 
 common_packages:
   - epel-release
-  - python-pip
-  - docker
+  - yum-plugin-versionlock
+  - gcc
+  - nfs-utils
+  - python3-pip
+  - docker-ce
   - bash-completion
-  - kubelet 
-  - kubeadm
-  - kubectl
+  - nvidia-detect
+
+k8s_packages:
+  - kubelet-1.16.7
+  - kubeadm-1.16.7
+  - kubectl-1.16.7
+
+k8s_repo_dest: /etc/yum.repos.d/
+
+elrepo_gpg_key_url: https://www.elrepo.org/RPM-GPG-KEY-elrepo.org
+
+elrepo_rpm_url: https://www.elrepo.org/elrepo-release-8.el8.elrepo.noarch.rpm
+
+docker_repo_url: https://download.docker.com/linux/centos/docker-ce.repo
+
+docker_repo_dest: /etc/yum.repos.d/docker-ce.repo
+
+k8s_conf_dest: /etc/sysctl.d/
+
+k8s_repo_file_mode: 0644
+
+k8s_conf_file_mode: 0644

+ 0 - 21
kubernetes/roles/computeGPU/handlers/main.yml

@@ -1,21 +0,0 @@
----
-
-#- name: Enable docker service
-  #service:
-    #name: docker
-    #enabled: yes
-#
-- name: Start and Enable docker service
-  service:
-    name: docker
-    state: restarted
-    enabled: yes
-  #tags: install
-
-- name: Start and Enable Kubernetes - kubelet
-  service:
-    name: kubelet
-    state: started
-    enabled: yes
-  #tags: install
-

+ 0 - 10
kubernetes/roles/computeGPU/vars/main.yml

@@ -1,10 +0,0 @@
----
-
-common_packages:
-  - epel-release
-  - python-pip
-  - docker
-  - bash-completion
-  - kubelet 
-  - kubeadm
-  - kubectl

kubernetes/roles/computeGPU/files/daemon.json → kubernetes/roles/compute_gpu/files/daemon.json


kubernetes/roles/computeGPU/files/k8s.conf → kubernetes/roles/compute_gpu/files/k8s.conf


kubernetes/roles/computeGPU/files/kubernetes.repo → kubernetes/roles/compute_gpu/files/kubernetes.repo


+ 19 - 13
kubernetes/roles/computeGPU/tasks/main.yml

@@ -11,36 +11,42 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
-
 ---
-- name: add nvidia-docker2 Repo
+
+- name: Add nvidia-docker2 Repo
   get_url:
-    url: https://nvidia.github.io/nvidia-docker/centos7/nvidia-docker.repo
-    dest: /etc/yum.repos.d/nvidia-docker.repo
+    url: "{{ nvidia_docker_repo_url }}"
+    dest: "{{ nvidia_docker_repo_dest }}"
   tags: install, testing
 
-- name: install Nvidia driver and nvidia-docker2
+- name: Install nvidia driver and nvidia-docker2
   package:
-    name:
-      - kmod-nvidia
-      - nvidia-docker2
+    name: "{{ item }}"
     state: present
+  with_items:
+    - "{{ nvidia_packages }}"
   tags: install
 
-- name: Set nvidia as default runtime 
-  copy: src=daemon.json dest=/etc/docker/ owner=root group=root mode=644
+- name: Set nvidia as default runtime
+  copy:
+    src: daemon.json
+    dest: "{{ daemon_file_dest }}"
+    owner: root
+    group: root
+    mode: "{{ daemon_file_mode }}"
   tags: install
 
-- name: Restart and Enable docker service
+- name: Restart and enable docker service
   service:
     name: docker
     state: restarted
     enabled: yes
+    daemon_reload: yes
   tags: install
 
-- name: Restart and Enable Kubernetes - kubelet
+- name: Restart and enable kubernetes - kubelet
   service:
     name: kubelet
     state: restarted
     enabled: yes
-  tags: install
+  tags: install

+ 26 - 0
kubernetes/roles/compute_gpu/vars/main.yml

@@ -0,0 +1,26 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+nvidia_docker_repo_url: https://nvidia.github.io/nvidia-docker/centos7/nvidia-docker.repo
+
+nvidia_docker_repo_dest: /etc/yum.repos.d/nvidia-docker.repo
+
+nvidia_packages:
+  - kmod-nvidia
+  - nvidia-docker2
+
+daemon_file_dest: /etc/docker/
+
+daemon_file_mode: 0644

+ 77 - 0
kubernetes/roles/firewalld/tasks/main.yml

@@ -0,0 +1,77 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Install firewalld
+  package:
+    name: firewalld
+    state: present
+  tags: firewalld
+
+- name: Start and enable firewalld
+  service:
+    name: firewalld
+    state: started
+    enabled: yes
+  tags: firewalld
+
+- name: Configure firewalld on master nodes
+  firewalld:
+    port: "{{ item }}/tcp"
+    permanent: yes
+    state: enabled
+  with_items: '{{ k8s_master_ports }}'
+  when: "'manager' in group_names"
+  tags: firewalld
+
+- name: Configure firewalld on compute nodes
+  firewalld:
+    port: "{{ item }}/tcp"
+    permanent: yes
+    state: enabled
+  with_items: '{{ k8s_worker_ports }}'
+  when: "'compute' in group_names"
+  tags: firewalld
+
+- name: Open flannel ports on the firewall
+  firewalld:
+    port: "{{ item }}/udp"
+    permanent: yes
+    state: enabled
+  with_items: "{{ flannel_udp_ports }}"
+  when: k8s_cni == "flannel"
+  tags: firewalld
+
+- name: Open calico UDP ports on the firewall
+  firewalld:
+    port: "{{ item }}/udp"
+    permanent: yes
+    state: enabled
+  with_items: "{{ calico_udp_ports }}"
+  when: k8s_cni == "calico"
+  tags: firewalld
+
+- name: Open calico TCP ports on the firewall
+  firewalld:
+    port: "{{ item }}/tcp"
+    permanent: yes
+    state: enabled
+  with_items: "{{ calico_tcp_ports }}"
+  when: k8s_cni == "calico"
+  tags: firewalld
+
+- name: Reload firewalld
+  command: firewall-cmd --reload
+  changed_when: true
+  tags: firewalld
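
Since every rule above is added with permanent: yes, nothing reaches the running firewall until the closing firewall-cmd --reload. An alternative sketch using the firewalld module's immediate parameter, which writes each rule to both the permanent and runtime configuration and makes the explicit reload step redundant:

# Sketch: immediate: yes applies each rule as it is added, so no reload task is needed
- name: Configure firewalld on master nodes
  firewalld:
    port: "{{ item }}/tcp"
    permanent: yes
    immediate: yes
    state: enabled
  with_items: "{{ k8s_master_ports }}"
  when: "'manager' in group_names"
  tags: firewalld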

+ 43 - 0
kubernetes/roles/firewalld/vars/main.yml

@@ -0,0 +1,43 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+# Kubernetes SDN network
+k8s_cni: calico
+
+# Master nodes firewall ports
+k8s_master_ports:
+  - 6443
+  - 2379-2380
+  - 10250
+  - 10251
+  - 10252
+
+# Worker nodes firewall ports
+k8s_worker_ports:
+  - 10250
+  - 30000-32767
+
+# Calico CNI firewall ports
+calico_udp_ports:
+  - 4789
+calico_tcp_ports:
+  - 179
+  - 5473
+
+# Flannel CNI firewall ports
+flannel_udp_ports:
+  - 8285
+  - 8472

+ 45 - 9
kubernetes/roles/jupyterhub/tasks/main.yml

@@ -11,16 +11,52 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
-
 ---
-- name: Helm - Add JupyterHub Repo
-  shell: helm repo add jupyterhub https://jupyterhub.github.io/helm-chart/
 
-- name: Helm - Update Repo
-  shell: helm repo update
+- name: Helm - add JupyterHub repo
+  command: "helm repo add jupyterhub '{{ jupyterhub_helm_chart_repo }}'"
+  changed_when: true
+
+- name: Helm - update repo
+  command: helm repo update
+  changed_when: true
+
+- name: Copy JupyterHub custom config file
+  copy:
+   src: jupyter_config.yaml
+   dest: "{{ jupyter_config_file_dest }}"
+   owner: root
+   group: root
+   mode: "{{ jupyter_config_file_mode }}"
+
+- name: JupyterHub deploy
+  block:
+    - name: JupyterHub deploy
+      command: >
+        helm upgrade --cleanup-on-fail \
+        --install {{ jupyterhub_namespace }} jupyterhub/jupyterhub \
+        --namespace {{ jupyterhub_namespace }} \
+        --create-namespace \
+        --version {{ helm_chart_version }} \
+        --values {{ jupyter_config_file_dest }} \
+        --timeout {{ timeout_min_sec }}
+      register: deployment_output
+
+  rescue:
+    - name: JupyterHub deployment error
+      debug:
+        msg: "Previous JupyterHub deployment is in progress"
+      when: "'another operation (install/upgrade/rollback) is in progress' in deployment_output.stderr"
 
-- name: JupyterHub Custom Config (files)
-  copy: src=jupyter_config.yaml dest=/root/k8s/jupyter_config.yaml owner=root group=root mode=0655
+    - name: Delete existing release
+      command: helm delete '{{ jupyterhub_namespace }}'
 
-- name: jupyterHub deploy
-  shell: helm install jupyterhub/jupyterhub  --namespace default --version 0.9.0 --values /root/k8s/jupyter_config.yaml --generate-name --wait --timeout 60m
+    - name: JupyterHub deploy
+      command: >
+        helm upgrade --cleanup-on-fail \
+        --install {{ jupyterhub_namespace }} jupyterhub/jupyterhub \
+        --namespace {{ jupyterhub_namespace }} \
+        --create-namespace \
+        --version {{ helm_chart_version }} \
+        --values {{ jupyter_config_file_dest }} \
+        --timeout {{ timeout_min_sec }}
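
The block/rescue pair retries the deploy once after deleting a stuck release, and the rescue branch only handles the "another operation in progress" case, so other failures still surface. A hypothetical follow-up check (not part of the commit) that asserts the release actually reached the deployed state:

# Hypothetical check: fail the play unless helm reports the release as deployed
- name: Verify JupyterHub release status
  command: "helm status {{ jupyterhub_namespace }} --namespace {{ jupyterhub_namespace }}"
  register: helm_release_status
  changed_when: false
  failed_when: "'STATUS: deployed' not in helm_release_status.stdout"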

+ 26 - 0
kubernetes/roles/jupyterhub/vars/main.yml

@@ -0,0 +1,26 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+jupyterhub_helm_chart_repo: https://jupyterhub.github.io/helm-chart/
+
+jupyter_config_file_dest: /root/k8s/jupyter_config.yaml
+
+jupyter_config_file_mode: 0655
+
+helm_chart_version: 0.9.0
+
+timeout_min_sec: 60m
+
+jupyterhub_namespace: default

+ 50 - 39
kubernetes/roles/kubeflow/tasks/main.yml

@@ -11,112 +11,123 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
-
 ---
 
-#Configure build and deploy kubeflow v1.0
-
-- name: Download kfctl v1.0.2 release from the Kubeflow releases page.
+- name: Download kfctl release from the Kubeflow releases page
   unarchive:
-    src: https://github.com/kubeflow/kfctl/releases/download/v1.0.2/kfctl_v1.0.2-0-ga476281_linux.tar.gz
-    dest: /usr/bin/
+    src: "{{ kfctl_download_url }}"
+    dest: "{{ kfctl_download_dest_path }}"
+    mode: "{{ kfctl_download_file_mode }}"
     remote_src: yes
 
-- name: Delete Omnia Kubeflow Directory if exists
+- name: Delete omnia kubeflow directory if exists
   file:
-    path: /root/k8s/omnia-kubeflow
+    path: "{{ omnia_kubeflow_dir_path }}"
     state: absent
 
-- name: Create Kubeflow Directory
+- name: Create omnia kubeflow directory
   file:
-    path: /root/k8s/omnia-kubeflow
+    path: "{{ omnia_kubeflow_dir_path }}"
     state: directory
+    mode: "{{ omnia_kubeflow_dir_mode }}"
     recurse: yes
 
-- name: Build Kubeflow Configuration
-  shell:
-    cmd: /usr/bin/kfctl build -V -f https://raw.githubusercontent.com/kubeflow/manifests/v1.0-branch/kfdef/kfctl_k8s_istio.v1.0.2.yaml
-    chdir: /root/k8s/omnia-kubeflow
+- name: Build kubeflow configuration
+  command:
+    cmd: /usr/bin/kfctl build -V -f "{{ kubeflow_config_yaml_url }}"
+    chdir: "{{ omnia_kubeflow_dir_path }}"
+  changed_when: true
 
-- name: Modify Cpu Limit for istio-ingressgateway-service-account
+- name: Modify CPU limit for istio-ingressgateway-service-account
   replace:
-    path: /root/k8s/omnia-kubeflow/kustomize/istio-install/base/istio-noauth.yaml
+    path: "{{ istio_noauth_yaml_file_path }}"
     after: 'serviceAccountName: istio-ingressgateway-service-account'
     before: '---'
     regexp: 'cpu: 100m'
     replace: 'cpu: 2'
 
-- name: Modify Mem Limit for istio-ingressgateway-service-account
+- name: Modify memory limit for istio-ingressgateway-service-account
   replace:
-    path: /root/k8s/omnia-kubeflow/kustomize/istio-install/base/istio-noauth.yaml
+    path: "{{ istio_noauth_yaml_file_path }}"
     after: 'serviceAccountName: istio-ingressgateway-service-account'
     before: '---'
     regexp: 'memory: 128Mi'
     replace: 'memory: 512Mi'
 
-- name: Modify Cpu Request for istio-ingressgateway-service-account
+- name: Modify CPU request for istio-ingressgateway-service-account
   replace:
-    path: /root/k8s/omnia-kubeflow/kustomize/istio-install/base/istio-noauth.yaml
+    path: "{{ istio_noauth_yaml_file_path }}"
     after: 'serviceAccountName: istio-ingressgateway-service-account'
     before: '---'
     regexp: 'cpu: 10m'
     replace: 'cpu: 1'
 
-- name: Modify Mem Request for istio-ingressgateway-service-account
+- name: Modify memory request for istio-ingressgateway-service-account
   replace:
-    path: /root/k8s/omnia-kubeflow/kustomize/istio-install/base/istio-noauth.yaml
+    path: "{{ istio_noauth_yaml_file_path }}"
     after: 'serviceAccountName: istio-ingressgateway-service-account'
     before: '---'
     regexp: 'memory: 40Mi'
     replace: 'memory: 256Mi'
 
-
-- name: Modify Cpu Limit for kfserving-gateway
+- name: Modify CPU limit for kfserving-gateway
   replace:
-    path: /root/k8s/omnia-kubeflow/kustomize/kfserving-gateway/base/deployment.yaml
+    path: "{{ kfserving_gateway_yaml_file_path }}"
     after: 'serviceAccountName: istio-ingressgateway-service-account'
     before: 'env:'
     regexp: 'cpu: 100m'
     replace: 'cpu: 2'
 
-- name: Modify Mem Limit for kfserving-gateway
+- name: Modify memory limit for kfserving-gateway
   replace:
-    path: /root/k8s/omnia-kubeflow/kustomize/kfserving-gateway/base/deployment.yaml
+    path: "{{ kfserving_gateway_yaml_file_path }}"
     after: 'serviceAccountName: istio-ingressgateway-service-account'
     before: 'env:'
     regexp: 'memory: 128Mi'
     replace: 'memory: 512Mi'
 
-- name: Modify Cpu Request for kfserving-gateway
+- name: Modify CPU request for kfserving-gateway
   replace:
-    path: /root/k8s/omnia-kubeflow/kustomize/kfserving-gateway/base/deployment.yaml
+    path: "{{ kfserving_gateway_yaml_file_path }}"
     after: 'serviceAccountName: istio-ingressgateway-service-account'
     before: 'env:'
     regexp: 'cpu: 10m'
     replace: 'cpu: 1'
 
-- name: Modify Mem Request for kfserving-gateway
+- name: Modify memory request for kfserving-gateway
   replace:
-    path: /root/k8s/omnia-kubeflow/kustomize/kfserving-gateway/base/deployment.yaml
+    path: "{{ kfserving_gateway_yaml_file_path }}"
     after: 'serviceAccountName: istio-ingressgateway-service-account'
     before: 'env:'
     regexp: 'memory: 40Mi'
     replace: 'memory: 256Mi'
 
-
-- name: Change Argo base service from NodePort to LoadBalancer
+- name: Change argo base service from NodePort to LoadBalancer
   replace:
-    path: /root/k8s/omnia-kubeflow/kustomize/argo/base/service.yaml
+    path: "{{ argo_yaml_file_path }}"
     regexp: 'NodePort'
     replace: 'LoadBalancer'
 
 - name: Change istio-install base istio-noauth service from NodePort to LoadBalancer
   replace:
-    path: /root/k8s/omnia-kubeflow/kustomize/istio-install/base/istio-noauth.yaml
+    path: "{{ istio_noauth_yaml_file_path }}"
     regexp: 'NodePort'
     replace: 'LoadBalancer'
 
-- name: Apply Kubeflow Configuration
-  shell:
-    cmd: /usr/bin/kfctl apply -V -f /root/k8s/omnia-kubeflow/kfctl_k8s_istio.v1.0.2.yaml
-    chdir: /root/k8s/omnia-kubeflow
+- name: Remove cert-manager application block
+  replace:
+    path: "{{ kubeflow_config_file }}"
+    regexp: "{{ cert_manager_block }}"
+    replace: "\n"
+
+- name: Remove seldon-core-operator application block
+  replace:
+    path: "{{ kubeflow_config_file }}"
+    regexp: "{{ seldon_core_operator_block }}"
+    replace: "\n"
+
+- name: Apply kubeflow configuration
+  command:
+    cmd: "/usr/bin/kfctl apply -V -f '{{ kubeflow_config_file }}'"
+    chdir: "{{ omnia_kubeflow_dir_path }}"
+  changed_when: true
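
The eight istio-ingressgateway and kfserving-gateway edits above repeat the same replace call with different regexp/replace pairs. A compact sketch of the istio half expressed as a loop, using the same values as the tasks above:

# Sketch: the four istio-ingressgateway resource edits as one looped task
- name: Raise istio-ingressgateway resource limits and requests
  replace:
    path: "{{ istio_noauth_yaml_file_path }}"
    after: 'serviceAccountName: istio-ingressgateway-service-account'
    before: '---'
    regexp: "{{ item.find }}"
    replace: "{{ item.set }}"
  loop:
    - { find: 'cpu: 100m', set: 'cpu: 2' }
    - { find: 'memory: 128Mi', set: 'memory: 512Mi' }
    - { find: 'cpu: 10m', set: 'cpu: 1' }
    - { find: 'memory: 40Mi', set: 'memory: 256Mi' }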

+ 56 - 0
kubernetes/roles/kubeflow/vars/main.yml

@@ -0,0 +1,56 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+kfctl_download_url: https://github.com/kubeflow/kfctl/releases/download/v1.0.2/kfctl_v1.0.2-0-ga476281_linux.tar.gz
+
+kfctl_download_dest_path: /usr/bin/
+
+kfctl_download_file_mode: 0755
+
+omnia_kubeflow_dir_path: /root/k8s/omnia-kubeflow
+
+omnia_kubeflow_dir_mode: 0755
+
+kubeflow_config_yaml_url: https://raw.githubusercontent.com/kubeflow/manifests/v1.0-branch/kfdef/kfctl_k8s_istio.v1.0.2.yaml
+
+istio_noauth_yaml_file_path: "{{ omnia_kubeflow_dir_path }}/kustomize/istio-install/base/istio-noauth.yaml"
+
+kfserving_gateway_yaml_file_path: "{{ omnia_kubeflow_dir_path }}/kustomize/kfserving-gateway/base/deployment.yaml"
+
+argo_yaml_file_path: "{{ omnia_kubeflow_dir_path }}/kustomize/argo/base/service.yaml"
+
+kubeflow_config_file: "{{ omnia_kubeflow_dir_path }}/kfctl_k8s_istio.v1.0.2.yaml"
+
+cert_manager_block: >
+    - kustomizeConfig:
+          overlays:
+          - self-signed
+          - application
+          parameters:
+          - name: namespace
+            value: cert-manager
+          repoRef:
+            name: manifests
+            path: cert-manager/cert-manager
+        name: cert-manager
+
+seldon_core_operator_block: >
+    - kustomizeConfig:
+          overlays:
+          - application
+          repoRef:
+            name: manifests
+            path: seldon/seldon-core-operator
+        name: seldon-core-operator

+ 12 - 25
kubernetes/roles/manager/tasks/main.yml

@@ -11,35 +11,22 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
-
 ---
-#- name: Firewall Rule K8s:6443/tcp
-  #command: firewall-cmd  --zone=internal --add-port=6443/tcp --permanent
-  #tags: manager
-#
-#- name: Firewall Rule K8s:10250/tcp
-  #command: firewall-cmd  --zone=internal --add-port=10250/tcp --permanent
-  #tags: manager
-##
-#- name: Firewall Reload
-  #command: firewall-cmd  --reload
-  #tags: manager
-#
-- name: Create /root/bin (if it doesn't exist)
+
+- name: Create directory for helm installer file
   file:
-    path: /root/bin
+    path: "{{ helm_installer_file_directory }}"
     state: directory
-    mode: 0755
+    mode: "{{ helm_installer_file_directory_mode }}"
 
-- name: Get Helm Installer
+- name: Get helm installer
   get_url:
-    url: https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3
-    dest: /root/bin/get_helm.sh
-    mode: 700
-  tags: manager
-
-- name: Install Helm
-  command: /root/bin/get_helm.sh
+    url: "{{ helm_installer_url }}"
+    dest: "{{ helm_installer_file_dest }}"
+    mode: "{{ helm_installer_file_mode }}"
   tags: manager
 
-# install and start up OpenSM -  III
+- name: Install helm
+  command: "/bin/bash {{ helm_installer_file_dest }}"
+  changed_when: true
+  tags: manager

+ 24 - 0
kubernetes/roles/manager/vars/main.yml

@@ -0,0 +1,24 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+helm_installer_file_directory: /root/bin
+
+helm_installer_file_directory_mode: 0755
+
+helm_installer_url: https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3
+
+helm_installer_file_dest: /root/bin/get_helm.sh
+
+helm_installer_file_mode: 0700

+ 78 - 41
kubernetes/roles/startmanager/tasks/main.yml

@@ -11,39 +11,64 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
-
 ---
-- name: Turn Swap OFF (if not already disabled)
+
+- name: Disable swap (if not already disabled)
   command: /usr/sbin/swapoff -a
+  changed_when: true
   tags: init
 
+- name: Start and enable docker service
+  systemd:
+    name: docker
+    state: started
+    enabled: yes
+    daemon_reload: yes
+  tags: docker
+
 - name: Initialize kubeadm
-  command: /bin/kubeadm init --pod-network-cidr=10.244.0.0/16 --apiserver-advertise-address={{ manager_ip }}
-  #command: /bin/kubeadm init
+  command: "/bin/kubeadm init --pod-network-cidr='{{ pod_network_cidr_ip }}' --apiserver-advertise-address='{{ manager_ip }}'"
+  changed_when: true
   register: init_output
   tags: init
 
-- name: Setup Directory for Kubernetes environment for root
-  file: path=/root/.kube state=directory
+- name: Setup directory for Kubernetes environment for root
+  file:
+    path: "{{ k8s_root_directory }}"
+    state: directory
+    mode: "{{ k8s_root_directory_mode }}"
   tags: init
 
-- name: Copy Kubernetes Config for root #do this for other users too?
+- name: Copy Kubernetes config for root
   copy:
-    src: /etc/kubernetes/admin.conf
-    dest: /root/.kube/config
+    src: "{{ k8s_config_src }}"
+    dest: "{{ k8s_config_dest }}"
     owner: root
     group: root
-    mode: 0644
+    mode: "{{ k8s_config_file_mode }}"
     remote_src: yes
   tags: init
 
+- name: Update the kubernetes config file permissions
+  shell: "chown $(id -u):$(id -g) '{{ k8s_config_dest }}'"
+  args:
+    warn: false
+  changed_when: true
+  tags: init
+
 - name: Cluster token
-  shell: kubeadm token list | cut -d ' ' -f1 | sed -n '2p'
+  shell: >
+    set -o pipefail && \
+      kubeadm token list | cut -d ' ' -f1 | sed -n '2p'
+  changed_when: false
   register: K8S_TOKEN
   tags: init
 
 - name: CA Hash
-  shell: openssl x509 -pubkey -in /etc/kubernetes/pki/ca.crt | openssl rsa -pubin -outform der 2>/dev/null | openssl dgst -sha256 -hex | sed 's/^.* //'
+  shell: >
+    set -o pipefail && \
+      openssl x509 -pubkey -in {{ k8s_cert_path }} | openssl rsa -pubin -outform der 2>/dev/null | openssl dgst -sha256 -hex | sed 's/^.* //'
+  changed_when: false
   register: K8S_MANAGER_CA_HASH
   tags: init
 
@@ -55,65 +80,77 @@
     ip:     "{{ manager_ip }}"
   tags: init
 
-- name:
+- name: Print k8s token
   debug:
     msg: "[Manager] K8S_TOKEN_HOLDER K8S token is {{ hostvars['K8S_TOKEN_HOLDER']['token'] }}"
+    verbosity: 2
   tags: init
 
-- name:
+- name: Print k8s hash
   debug:
     msg: "[Manager] K8S_TOKEN_HOLDER K8S Hash is  {{ hostvars['K8S_TOKEN_HOLDER']['hash'] }}"
+    verbosity: 2
   tags: init
 
-- name:
+- name: Print k8s manager_ip
   debug:
     msg: "[Manager] K8S_MANAGER_IP is  {{ manager_ip }}"
+    verbosity: 2
   tags: init
 
 - name: Setup Calico SDN network
-  shell: kubectl apply -f https://docs.projectcalico.org/manifests/calico.yaml
+  command: "kubectl apply -f '{{ calico_yml_url }}'"
+  when: k8s_cni == "calico"
   tags: init
 
-#- name: Setup Flannel SDN network
-  #shell: kubectl apply -f https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml
-  #tags: init
-
+- name: Setup Flannel SDN network
+  command: "kubectl apply -f '{{ flannel_yml_url }}'"
+  when: k8s_cni == "flannel"
+  tags: init
 
 - name: Create yaml repo for setup
   file:
-    path: /root/k8s
-    owner: root
-    group: root
-    mode: 0755
+    path: "{{ yaml_repo_dir_path }}"
     state: directory
+    mode: "{{ yaml_repo_dir_mode }}"
   tags: init
 
-- name: Create Service Account (K8S Dashboard) Files
-  copy: src=create_admin_user.yaml dest=/root/k8s/create_admin_user.yaml owner=root group=root mode=655
+- name: Create service account (K8s dashboard) files
+  copy:
+    src: create_admin_user.yaml
+    dest: "{{ k8s_service_account_file_dest }}"
+    owner: root
+    group: root
+    mode: "{{ k8s_service_account_file_mode }}"
   tags: init
 
-- name: Create Service Account (K8S Dashboard) - Create
-  shell: kubectl create -f /root/k8s/create_admin_user.yaml
+- name: Create service account (K8s dashboard)
+  command: "kubectl create -f '{{ k8s_service_account_file_dest }}'"
+  changed_when: true
   tags: init
 
-- name: Create ClusterRoleBinding (K8S Dashboard) Files
-  copy: src=create_clusterRoleBinding.yaml dest=/root/k8s/create_clusterRoleBinding.yaml owner=root group=root mode=655
+- name: Create clusterRoleBinding (K8s dashboard) files
+  copy:
+    src: create_clusterRoleBinding.yaml
+    dest: "{{ k8s_clusterRoleBinding_file_dest }}"
+    owner: root
+    group: root
+    mode: "{{ k8s_clusterRoleBinding_file_mode }}"
   tags: init
 
-- name: Create ClusterRoleBinding (K8S Dashboard) - Apply
-  shell: kubectl create -f /root/k8s/create_clusterRoleBinding.yaml
+- name: Create clusterRoleBinding (K8s dashboard)
+  command: "kubectl create -f '{{ k8s_clusterRoleBinding_file_dest }}'"
+  changed_when: true
   tags: init
 
-- name: Dump Bearer Token for K8S Dashboard Login
-  shell: kubectl -n kube-system describe secret $(kubectl -n kube-system get secret | grep admin-user | awk '{print $1}') > /root/k8s/token
+- name: Dump bearer token for K8s dashboard login
+  shell: >
+    set -o pipefail && \
+      kubectl -n kube-system describe secret $(kubectl -n kube-system get secret | grep admin-user | awk '{print $1}') > /root/k8s/token
+  changed_when: true
   tags: init
 
 - name: Edge / Workstation Install allows pods to schedule on manager
-  shell: kubectl taint nodes --all node-role.kubernetes.io/master-
+  command: kubectl taint nodes --all node-role.kubernetes.io/master-
   when: single_node
-  tags: init
-
-
-# If more debug information is needed during init uncomment the following 2 lines
-#- debug: var=init_output.stdout_lines
-  #tags: init
+  tags: init
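
The hunk at @@ -55,65 +80,77 @@ shows only the tail of the task that publishes the join credentials (the ip: line and its tag). Consistent with the K8S_TOKEN and K8S_MANAGER_CA_HASH registrations above and the k8s_dummy_hostname variable in the new vars file, the full task would look roughly like this sketch:

# Sketch: in-memory dummy host that carries join data into the worker play
- name: Store join token, CA hash, and manager IP on a dummy host
  add_host:
    name: "{{ k8s_dummy_hostname }}"
    token: "{{ K8S_TOKEN.stdout }}"
    hash: "{{ K8S_MANAGER_CA_HASH.stdout }}"
    ip: "{{ manager_ip }}"
  tags: init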

+ 52 - 0
kubernetes/roles/startmanager/vars/main.yml

@@ -0,0 +1,52 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+single_node: false
+
+manager_ip: "{{ ansible_host }}"
+
+k8s_cni: calico
+
+pod_network_cidr_ip: 10.244.0.0/16
+
+k8s_root_directory: /root/.kube
+
+k8s_root_directory_mode: 0755
+
+k8s_config_src: /etc/kubernetes/admin.conf
+
+k8s_config_dest: /root/.kube/config
+
+k8s_config_file_mode: 0644
+
+k8s_cert_path: /etc/kubernetes/pki/ca.crt
+
+k8s_dummy_hostname: K8S_TOKEN_HOLDER
+
+yaml_repo_dir_path: /root/k8s
+
+yaml_repo_dir_mode: 0755
+
+k8s_service_account_file_dest: /root/k8s/create_admin_user.yaml
+
+k8s_service_account_file_mode: 0655
+
+k8s_clusterRoleBinding_file_dest: /root/k8s/create_clusterRoleBinding.yaml
+
+k8s_clusterRoleBinding_file_mode: 0655
+
+calico_yml_url: https://docs.projectcalico.org/manifests/calico.yaml
+
+flannel_yml_url: https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml

+ 60 - 26
kubernetes/roles/startservices/tasks/main.yml

@@ -11,77 +11,111 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
-
 ---
+
 - name: Wait for CoreDNS to restart
-  shell: kubectl rollout status deployment/coredns -n kube-system
+  command: kubectl rollout status deployment/coredns -n kube-system
+  changed_when: false
+  ignore_errors: True
   tags: init
 
 - name: Deploy MetalLB
-  shell: kubectl apply -f https://raw.githubusercontent.com/google/metallb/v0.8.1/manifests/metallb.yaml
+  command: "kubectl apply -f '{{ metallb_yaml_url }}'"
+  changed_when: true
   tags: init
 
 - name: Create MetalLB Setup Config Files
-  copy: src=metal-config.yaml dest=/root/k8s/metal-config.yaml owner=root group=root mode=655
+  copy:
+    src: metal-config.yaml
+    dest: "{{ metallb_config_file_dest }}"
+    owner: root
+    group: root
+    mode: "{{ metallb_config_file_mode }}"
   tags: init
 
 - name: Create MetalLB Setup Deployment Files
-  copy: src=metallb.yaml dest=/root/k8s/metallb.yaml owner=root group=root mode=655
+  copy:
+    src: metallb.yaml
+    dest: "{{ metallb_deployment_file_dest }}"
+    owner: root
+    group: root
+    mode: "{{ metallb_deployment_file_mode }}"
   tags: init
 
 - name: Deploy MetalLB
-  shell: kubectl apply -f /root/k8s/metallb.yaml
+  command: "kubectl apply -f '{{ metallb_deployment_file_dest }}'"
+  changed_when: true
   tags: init
 
 - name: Create default setup for MetalLB
-  shell: kubectl apply -f /root/k8s/metal-config.yaml
+  command: "kubectl apply -f '{{ metallb_config_file_dest }}'"
+  changed_when: true
   tags: init
 
-- name: Start K8S Dashboard
-  shell: kubectl create -f https://raw.githubusercontent.com/kubernetes/dashboard/v2.0.0/aio/deploy/recommended.yaml
+- name: Start k8s dashboard
+  command: "kubectl create -f '{{ k8s_dashboard_yaml_url }}'"
+  changed_when: true
+  register: result
   tags: init
 
-- name: Helm - Add Stable Repo
-  shell: helm repo add stable https://charts.helm.sh/stable
+- name: Helm - add stable repo
+  command: "helm repo add stable '{{ helm_stable_repo_url }}'"
+  changed_when: true
   tags: init
 
-- name: Helm - Add Nvidia k8s-device-plugin (nvdp) Repo
-  shell: helm repo add nvdp https://nvidia.github.io/k8s-device-plugin
+- name: Helm - add Nvidia k8s-device-plugin (nvdp) repo
+  command: "helm repo add nvdp '{{ nvidia_k8s_device_plugin_repo_url }}'"
+  changed_when: true
   tags: init
 
-- name: Helm - Add Nvidia GPU Discovery (nvgfd) Repo
-  shell: helm repo add nvgfd https://nvidia.github.io/gpu-feature-discovery
+- name: Helm - add Nvidia GPU discovery (nvgfd) repo
+  command: "helm repo add nvgfd '{{ nvidia_gpu_discovery_repo_url }}'"
+  changed_when: true
   tags: init
 
-- name: Helm - Update Repo
-  shell: helm repo update
+- name: Helm - update repo
+  command: helm repo update
+  changed_when: true
   tags: init
 
 - name: Start NFS Client Provisioner
-  shell: helm install stable/nfs-client-provisioner --set nfs.server={{ nfs_server }}  --set nfs.path={{ nfs_path }} --generate-name
+  command: "helm install stable/nfs-client-provisioner --set nfs.server='{{ nfs_server }}' --set nfs.path='{{ nfs_path }}' --generate-name"
+  changed_when: true
+  register: result
   tags: init
 
 - name: Set NFS-Client Provisioner as DEFAULT StorageClass
-  shell: "kubectl patch storageclasses.storage.k8s.io nfs-client -p '{\"metadata\": {\"annotations\":{\"storageclass.kubernetes.io/is-default-class\":\"true\"}}}'"
+  shell: >
+    kubectl patch storageclasses.storage.k8s.io nfs-client \
+    -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'
+  changed_when: true
   tags: init
 
 - name: Prometheus deployment
-  shell: helm install stable/prometheus --set alertmanager.persistentVolume.storageClass=nfs-client,server.persistentVolume.storageClass=nfs-client,server.service.type=LoadBalancer --generate-name
+  command: >
+    helm install stable/prometheus \
+    --set alertmanager.persistentVolume.storageClass=nfs-client,server.persistentVolume.storageClass=nfs-client,server.service.type=LoadBalancer \
+    --generate-name
+  changed_when: true
   tags: init
 
 - name: Install MPI Operator
-  shell: kubectl create -f https://raw.githubusercontent.com/kubeflow/mpi-operator/master/deploy/v1alpha2/mpi-operator.yaml
+  command: "kubectl create -f '{{ mpi_operator_yaml_url }}'"
+  changed_when: true
   tags: init
 
 - name: Install nvidia-device-plugin
-  shell: helm install --version=0.7.0  --generate-name  --set migStrategy={{ MIG_STRATEGY }}  nvdp/nvidia-device-plugin 
+  command: "helm install --version='{{ nvidia_device_plugin_version }}' --generate-name --set migStrategy='{{ mig_strategy }}' nvdp/nvidia-device-plugin"
+  changed_when: true
   tags: init
 
 - name: Install GPU Feature Discovery
-  shell: helm install  --version=0.2.0  --generate-name  --set migStrategy={{ MIG_STRATEGY }}  nvgfd/gpu-feature-discovery
+  command: "helm install --version='{{ gpu_feature_discovery_version }}' --generate-name --set migStrategy='{{ mig_strategy }}' nvgfd/gpu-feature-discovery"
+  changed_when: true
   tags: init
 
-- name: Deploy Xilinx Device Plugin
-  shell: kubectl create -f https://raw.githubusercontent.com/Xilinx/FPGA_as_a_Service/master/k8s-fpga-device-plugin/fpga-device-plugin.yml
+- name: Deploy Xilinx Device plugin
+  command: "kubectl create -f '{{ fpga_device_plugin_yaml_url }}'"
+  changed_when: true
   register: fpga_enable
-  tags: init
+  tags: init
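
The storageclass patch above marks nfs-client as the cluster default. A hypothetical follow-up (not in the commit) that asserts the annotation took effect:

# Hypothetical check: the default-class annotation must read "true"
- name: Verify nfs-client is the default StorageClass
  command: >
    kubectl get storageclass nfs-client
    -o jsonpath='{.metadata.annotations.storageclass\.kubernetes\.io/is-default-class}'
  register: sc_default
  changed_when: false
  failed_when: sc_default.stdout != "true"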

+ 46 - 0
kubernetes/roles/startservices/vars/main.yml

@@ -0,0 +1,46 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+metallb_config_file_dest: /root/k8s/metal-config.yaml
+
+metallb_config_file_mode: 0655
+
+metallb_deployment_file_dest: /root/k8s/metallb.yaml
+
+metallb_deployment_file_mode: 0655
+
+metallb_yaml_url: https://raw.githubusercontent.com/google/metallb/v0.8.1/manifests/metallb.yaml
+
+k8s_dashboard_yaml_url: https://raw.githubusercontent.com/kubernetes/dashboard/v2.0.0-beta6/aio/deploy/recommended.yaml
+
+helm_stable_repo_url: https://charts.helm.sh/stable
+
+nfs_server: "{{ ansible_host }}"
+
+nfs_path: /work
+
+mpi_operator_yaml_url: https://raw.githubusercontent.com/kubeflow/mpi-operator/master/deploy/v1alpha2/mpi-operator.yaml
+
+nvidia_k8s_device_plugin_repo_url: https://nvidia.github.io/k8s-device-plugin
+
+nvidia_gpu_discovery_repo_url: https://nvidia.github.io/gpu-feature-discovery
+
+nvidia_device_plugin_version: 0.7.0
+
+mig_strategy: none
+
+gpu_feature_discovery_version: 0.2.0
+
+fpga_device_plugin_yaml_url: https://raw.githubusercontent.com/Xilinx/FPGA_as_a_Service/master/k8s-fpga-device-plugin/fpga-device-plugin.yml

+ 5 - 26
kubernetes/roles/startworkers/tasks/main.yml

@@ -11,38 +11,17 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
-
 ---
 
-- name: Turn Swap OFF (if not already disabled)
+- name: Disable swap (if not already disabled)
   command: /usr/sbin/swapoff -a
+  changed_when: true
   tags: init
 
-#- name:
-  #debug:
-    #msg: "[Worker] K8S_TOKEN_HOLDER K8S token is {{ hostvars['K8S_TOKEN_HOLDER']['token'] }}"
-  #tags: init
-
-#- name:
-  #debug:
-    #msg: "[Worker] K8S_TOKEN_HOLDER K8S Hash is  {{ hostvars['K8S_TOKEN_HOLDER']['hash'] }}"
-  #tags: init
-
-#- name:
-  #debug:
-    #msg: "[Worker] K8S_MANGER_IP is  {{ hostvars['K8S_TOKEN_HOLDER']['ip'] }}"
-  #tags: init
-
-- name: "Kubeadmn join"
+- name: Execute kubeadm join command
   shell: >
     kubeadm join --token={{ hostvars['K8S_TOKEN_HOLDER']['token'] }}
     --discovery-token-ca-cert-hash sha256:{{ hostvars['K8S_TOKEN_HOLDER']['hash'] }}
-    {{ hostvars['K8S_TOKEN_HOLDER']['ip'] }}:6443
+    {{ hostvars['K8S_TOKEN_HOLDER']['ip'] }}:{{ apiserver_bind_port }}
   when: not single_node
-  tags: init
-
-
-#- name: Join Computes to pool
-#   command: "{{ kubeJoinCommand }}"
-# tags: init
-
+  tags: init

+ 3 - 7
slurm/roles/start-slurm-workers/tasks/main.yml

@@ -12,11 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 ---
-- name: Install SLURM RPMs on compute
-  yum:
-    name: "{{ item }}"
-    #name: "{{ query('fileglob', ['/home/rpms/slurm*20*.rpm']) }}" <-- how it should work to avoid loop
-  with_fileglob:
-    - /home/rpms/slurm*20*.rpm
-  tags: install
 
+single_node: false
+
+apiserver_bind_port: 6443

+ 28 - 14
slurm/roles/common/handlers/main.yml

@@ -1,26 +1,40 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
 ---
-- name: restart ntpd
+
+- name: Restart ntpd
   service:
-    name=ntpd
-    state=restarted
-    enabled=yes
+    name: ntpd
+    state: restarted
+    enabled: yes
 
-- name: restart chrony
+- name: Restart chrony
   service:
-    name=chronyd
-    state=restarted
-    enabled=yes
+    name: chronyd
+    state: restarted
+    enabled: yes
 
-- name: sync ntp clocks
+- name: Sync ntp clocks
   command: ntpdc -np
   register: ntp_clock
   until:  ntp_clock.stdout.find('*') > -1
-  retries: 10
-  delay: 60
+  retries: "{{ retry_count_one }}"
+  delay: "{{ delay_count_one }}"
 
-- name: sync chrony sources
+- name: Sync chrony sources
   command: chronyc sources
   register: chrony_src
   until:  chrony_src.stdout.find('^*') > -1
-  retries: 6
-  delay: 10
+  retries: "{{ retry_count }}"
+  delay: "{{ delay_count }}"

+ 17 - 2
slurm/roles/common/tasks/main.yml

@@ -1,2 +1,17 @@
-- name: deploy time ntp/chrony
-  include_tasks: ntp.yml
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Deploy time ntp/chrony
+  include_tasks: ntp.yml

+ 27 - 13
slurm/roles/common/tasks/ntp.yml

@@ -1,42 +1,56 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
 ---
 
-  - name: deploy ntp servers
+  - name: Deploy ntp servers
     block:
-      - name: deploy ntpd
+      - name: Deploy ntpd
         package:
           name: ntp
           state: present
-      - name: deploy ntpdate
+      - name: Deploy ntpdate
         package:
           name: ntpdate
           state: present
-      - name: update ntp servers
+      - name: Update ntp servers
         template:
           src: ntp.conf.j2
-          dest: /etc/ntp.conf
+          dest: "{{ ntp_path }}"
           owner: root
           group: root
-          mode: u=rw,g=r,o=r
+          mode: "{{ ntp_mode }}"
           backup: yes
         notify:
           - restart ntpd
           - sync ntp clocks
-    when:  ( ansible_distribution == "CentOS" or   ansible_distribution == "RedHat" ) and ansible_distribution_major_version | int < 8
-  - name:   deploy chrony server
+    when:  ( ansible_distribution == "CentOS" or   ansible_distribution == "RedHat" ) and ansible_distribution_major_version  < os_higher_version
+
+  - name: Deploy chrony server
     block:
-      - name: deploy chrony
+      - name: Deploy chrony
         package:
             name: chrony
             state: present
-      - name: update ntp servers
+      - name: Update ntp servers
         template:
           src: chrony.conf.j2
-          dest: /etc/chrony.conf
+          dest: "{{ chrony_path }}"
           owner: root
           group: root
-          mode: u=rw,g=r,o=r
+          mode: "{{ ntp_mode }}"
           backup: yes
         notify:
           - restart chrony
           - sync chrony sources
-    when:  ( ansible_distribution == "CentOS" or   ansible_distribution == "RedHat" ) and ansible_distribution_major_version | int > 7
+    when:  ( ansible_distribution == "CentOS" or   ansible_distribution == "RedHat" ) and ansible_distribution_major_version  > os_version
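
Two cautions on this rewrite. First, the old conditionals cast the fact with | int before comparing; the new lines compare ansible_distribution_major_version (a string fact) against the quoted os_version/os_higher_version values, which orders single-digit strings correctly but breaks for two-digit releases. A sketch restoring the explicit cast:

# Sketch: compare major versions numerically rather than lexicographically
when: >
  ( ansible_distribution == "CentOS" or ansible_distribution == "RedHat" )
  and ansible_distribution_major_version | int > os_version | int

Second, the handlers in slurm/roles/common/handlers/main.yml were renamed to "Restart ntpd", "Restart chrony", and "Sync ntp clocks", but the notify lists here still use the old lowercase names; Ansible matches handler names exactly, so these notifications will not find their handlers until the names agree.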

+ 26 - 1
slurm/roles/common/vars/main.yml

@@ -1,6 +1,31 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+chrony_path: "/etc/chrony.conf"
+ntp_path: "/etc/ntp.conf"
+ntp_mode: "0644"
+os_higher_version: "8"
+os_version: "7"
+retry_count_one: "10"
+delay_count_one: "60"
+retry_count: "6"
+delay_count: "10"
+
 ntp_servers: 
   - 0.centos.pool.ntp.org
   - 1.centos.pool.ntp.org
   - 2.centos.pool.ntp.org
 chrony_servers:
-  - 2.centos.pool.ntp.org 
+  - 2.centos.pool.ntp.org

+ 0 - 104
slurm/roles/slurm-common/tasks/main.yml

@@ -1,104 +0,0 @@
-#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
----
-
-- name: install packages for slurm
-  package:
-    name:
-      - munge
-      - mariadb
-      - mariadb-devel
-      - python3
-    state: present
-  tags: install
-
-- name: create munge key
-  command: /usr/sbin/create-munge-key -f
-  tags: install
-
-- name: Copy munge key
-  copy:
-    src: munge.key
-    dest: /etc/munge
-    owner: munge
-    group: munge
-    mode: 0400
-  tags: install
-
-- name: Copy example Slurm Configuration - slurm.conf
-  copy:
-    src: slurm.conf
-    dest: /etc/slurm/
-    mode: 0644
-  tags: install
-
-
-- name: create SLURM Group
-  group:
-    name: slurm
-    state: present
-  tags: install
-
-- name: Add the user 'slurm' with uid 6001 and a primary group of 'slurm'
-  user:
-    name: slurm
-    comment: Slurm User Account
-    uid: 6001
-    group: slurm
-  tags: install
-
-- name: create SLURM log directory
-  file:
-    path: /var/log/slurm
-    state: directory
-    owner: slurm
-    group: slurm
-    mode: 0755
-    recurse: yes
-  tags: install
-
-- name: give slurm user permission to spool
-  file:
-    path: /var/spool/slurm
-    owner: slurm
-    group: slurm
-    state: directory
-    mode: 0755
-    recurse: yes
-
-- name: give slurm user permission to slurmctld
-  file:
-    path: /var/run/slurmctld.pid
-    owner: slurm
-    group: slurm
-    mode: 0755
-    state: touch
-
-- name: give slurm user permission to slurmd
-  file:
-    path: /var/run/slurmd.pid
-    owner: slurm
-    group: slurm
-    mode: 0755
-    state: touch
-
-- name: start munge service
-  service:
-    name: munge
-    state: restarted
-    enabled: yes
-  tags: install
-
-
-

+ 0 - 118
slurm/roles/slurm-manager/tasks/main.yml

@@ -1,118 +0,0 @@
-#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
----
-- name: create download folder
-  file:
-    path: /root/Downloads
-    state: directory
-    mode: '0755'
-- name: Download Slurm source
-  get_url:
-    url: "{{ slurm_url }}"
-    dest: /root/Downloads/
-    checksum: "{{ slurm_md5 }}"
-    validate_certs: no    
-  tags: install
-
-- name: Build SLURM RPMs
-  command: rpmbuild -ta /root/Downloads/slurm-20.02.0.tar.bz2
-  tags: install
-
-- name: Copy RPMs to NFS share
-  copy:
-    src: "{{ item }}"
-    dest: /home/rpms/
-    mode: '0755'
-  with_fileglob:
-    - /root/rpmbuild/RPMS/x86_64/slurm*20*.rpm
-  tags: install
-
-- name: Install SLURM RPMs on Manager
-  yum:
-    name: "{{ item }}"
-    #name: "{{ query('fileglob', ['/home/rpms/slurm*20*.rpm']) }}" <-- how it should work to avoid loop
-  with_fileglob:
-    - /home/rpms/slurm*20*.rpm
-  tags: install
-
-- name: Firewall Rule slurm allow 6817/tcp
-  command: firewall-cmd  --zone=internal --add-port=6817/tcp --permanent
-  tags: install
-
-- name: Firewall Rule slurm allow 6818/tcp
-  command: firewall-cmd  --zone=internal --add-port=6818/tcp --permanent
-  tags: install
-
-- name: Firewall Rule slurm allow 6819/tcp
-  command: firewall-cmd  --zone=internal --add-port=6819/tcp --permanent
-  tags: install
-
-- name: Firewall Rule slurm allow all incoming traffic on internal network
-  command: firewall-cmd --permanent --zone=internal --add-rich-rule='rule family="ipv4" source address="192.168.1.0/24" accept'
-  tags: install
-
-- name: Firewall Reload
-  command: firewall-cmd  --reload
-  tags: install
-
-
-- name: Start MariaDB
-  service:
-    name: mariadb
-    state: restarted
-    enabled: yes
-  tags: install
-
-- name: Grant Permissions for SLURM DB
-  command: mysql -u root -e "GRANT ALL ON slurm_acct_db.* TO 'slurm'@'localhost' identified by 'password' with grant option;"
-  tags: install
-
-- name: Create slurmdbd.conf file
-  copy:
-    src: /etc/slurm/slurmdbd.conf.example
-    dest: /etc/slurm/slurmdbd.conf
-    mode: 0600
-  tags: install
-
-- name: Populate Accounting Database
-  command: slurmdbd
-  tags: install
-
-- name: Create Slurm Cluster
-  command: sacctmgr -i add cluster {{ inventory_hostname }}
-  tags: install
-
-- name: Create Default Slurm Group
-  command: sacctmgr -i add account defaultgroup Cluster={{inventory_hostname}} Description="Default Account" Organization="Default Org"
-  tags: install
-
-- name: Add root to the Default Account
-  command: sacctmgr -i add user root DefaultAccount=defaultgroup
-  tags: install
-
-- name: Start slurmctld on Manager
-  service:
-    name: slurmctld
-    state: restarted
-    enabled: yes
-  tags: install
-
-- name: Enable Slurmdbd on Manager
-  service:
-    name: slurmdbd
-    state: restarted
-    enabled: yes
-  tags: install
-

slurm/roles/slurm-common/files/munge.key → slurm/roles/slurm_common/files/munge.key


+ 14 - 14
slurm/roles/slurm-common/files/slurm.conf

@@ -8,25 +8,25 @@
 #
 # See the slurm.conf man page for more information.
 #
-ClusterName=friday
-ControlMachine=friday
-ControlAddr=10.0.0.1
+ClusterName=
+ControlMachine=
+#ControlAddr=
 #BackupController=
 #BackupAddr=
 #
-SlurmUser=slurm
+SlurmUser=
 #SlurmdUser=root
-SlurmctldPort=6817
-SlurmdPort=6818
+SlurmctldPort=
+SlurmdPort=
 AuthType=auth/munge
 #JobCredentialPrivateKey=
 #JobCredentialPublicCertificate=
-StateSaveLocation=/var/spool/slurm/ctld
-SlurmdSpoolDir=/var/spool/slurm/
+#StateSaveLocation=/var/spool/
+SlurmdSpoolDir=
 SwitchType=switch/none
 MpiDefault=none
-SlurmctldPidFile=/var/run/slurmctld.pid
-SlurmdPidFile=/var/run/slurmd.pid
+SlurmctldPidFile=
+SlurmdPidFile=
 ProctrackType=proctrack/pgid
 #PluginDir=
 #FirstJobId=
@@ -72,9 +72,9 @@ PriorityMaxAge=14-0
 #
 # LOGGING
 SlurmctldDebug=3
-SlurmctldLogFile=/var/log/slurm/slurmctld.log
+SlurmctldLogFile=
 SlurmdDebug=1
-SlurmdLogFile=/var/log/slurm/slurmd.log
+SlurmdLogFile=
 JobCompType=jobcomp/none
 #JobCompLoc=
 #
@@ -91,7 +91,7 @@ AccountingStorageType=accounting_storage/slurmdbd
 # COMPUTE NODES
 #NodeName=linux[1-32] Procs=1 State=UNKNOWN
 #NodeName=DEFAULT Sockets=2 CoresPerSocket=20 State=UNKNOWN
-NodeName=compute000 Sockets=2 CoresPerSocket=8
-NodeName=compute[002-005] CoresPerSocket=20
+NodeName= Sockets= CoresPerSocket=
+#NodeName=compute[002-005] CoresPerSocket=20
 PartitionName=normal Nodes=ALL Default=YES MaxTime=INFINITE State=UP
 #PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP

+ 164 - 0
slurm/roles/slurm_common/tasks/main.yml

@@ -0,0 +1,164 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Install epel repository
+  package:
+    name: epel-release
+    state: present
+  tags: install
+
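+# munge-devel is only published in the CentOS 8 PowerTools repository, which
+# is why the task below enables that repo explicitly.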
+- name: Munge installation
+  package:
+    name: munge-devel
+    enablerepo: PowerTools
+    state: present
+
+- name: Install packages for slurm
+  package:
+    name: "{{ item }}"
+    state: present
+  with_items:
+    - "{{ common_packages }}"
+  tags: install
+
+- name: Create munge key
+  command: "{{ munge_cmd }}"
+  changed_when: true
+
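+# Every node must share the same munge key, so the per-node key generated
+# above is overwritten with the common munge.key shipped in this role's files.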
+- name: Copy munge key
+  copy:
+    src: munge.key
+    dest: "{{ munge_dest }}"
+    owner: munge
+    group: munge
+    mode: "{{ munge_mode }}"
+  tags: install
+
+- name: Slurm configuration - slurm.conf
+  copy:
+    src: slurm.conf
+    dest: "{{ slurm_dest }}"
+    mode: "{{ slurm_mode }}"
+  tags: install
+
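+# The bundled slurm.conf ships with its values blanked out (ClusterName=,
+# SlurmUser=, SlurmctldPort=, ...); the lineinfile tasks below fill them in
+# from vars/main.yml.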
+- name: Add cluster name
+  lineinfile:
+    path: "{{ slurm_confpth }}"
+    regexp: "ClusterName="
+    line: "ClusterName={{ cluster_name }}"
+
+- name: Add slurm user name
+  lineinfile:
+    path: "{{ slurm_confpth }}"
+    regexp: "SlurmUser="
+    line: "SlurmUser={{ slurm_user }}"
+
+- name: Add slurmctld port no
+  lineinfile:
+    path: "{{ slurm_confpth }}"
+    regexp: "SlurmctldPort="
+    line: "SlurmctldPort={{ slurmctld_port }}"
+
+- name: Add slurmd port no
+  lineinfile:
+    path: "{{ slurm_confpth }}"
+    regexp: "SlurmdPort="
+    line: "SlurmdPort={{ slurmd_port }}"
+
+- name: Add spool path
+  lineinfile:
+    path: "{{ slurm_confpth }}"
+    regexp: "SlurmdSpoolDir="
+    line: "SlurmdSpoolDir={{ spool_pth }}"
+
+- name: Add slurmctld pid file path
+  lineinfile:
+    path: "{{ slurm_confpth }}"
+    regexp: "SlurmctldPidFile="
+    line: "SlurmctldPidFile={{ slurmctld_pid }}"
+
+- name: Add slurmd pid file path
+  lineinfile:
+    path: "{{ slurm_confpth }}"
+    regexp: "SlurmdPidFile="
+    line: "SlurmdPidFile={{ slurmd_pid }}"
+
+- name: Add slurmctld log file path
+  lineinfile:
+    path: "{{ slurm_confpth }}"
+    regexp: "SlurmctldLogFile="
+    line: "SlurmctldLogFile={{ slurmctld_log }}"
+
+- name: Add slurmd log file path
+  lineinfile:
+    path: "{{ slurm_confpth }}"
+    regexp: "SlurmdLogFile="
+    line: "SlurmdLogFile={{ slurmd_log }}"
+
+- name: Create slurm group
+  group:
+    name: slurm
+    state: present
+  tags: install
+
+- name: Add the user 'slurm' with uid 6001 and a primary group of 'slurm'
+  user:
+    name: slurm
+    comment: Slurm User Account
+    uid: "{{ slurm_uid }}"
+    group: slurm
+  tags: install
+
+- name: Create slurm log directory
+  file:
+    path: "{{ slurm_logpth }}"
+    state: directory
+    owner: slurm
+    group: slurm
+    mode: "{{ gen_mode }}"
+    recurse: yes
+  tags: install
+
+- name: Give slurm user permission to spool
+  file:
+    path: "{{ spool_pth }}"
+    owner: slurm
+    group: slurm
+    state: directory
+    mode: "{{ gen_mode }}"
+    recurse: yes
+
+- name: Give slurm user permission to slurmctld
+  file:
+    path: "{{ slurmctld_pid }}"
+    owner: slurm
+    group: slurm
+    mode: "{{ gen_mode }}"
+    state: touch
+
+- name: Give slurm user permission to slurmd
+  file:
+    path: "{{ slurmd_pid }}"
+    owner: slurm
+    group: slurm
+    mode: "{{ gen_mode }}"
+    state: touch
+
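+# munge must be running before any slurm daemon starts, since slurm
+# authenticates all of its RPCs through it (AuthType=auth/munge).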
+- name: Start munge service
+  service:
+    name: munge
+    state: restarted
+    enabled: yes
+  tags: install

+ 42 - 0
slurm/roles/slurm_common/vars/main.yml

@@ -0,0 +1,42 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+epel_url: https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm
+
+common_packages:
+   - munge
+   - munge-libs
+   - mariadb-server
+   - mariadb-devel
+   - python3
+
+munge_dest: "/etc/munge/"
+munge_cmd: "/usr/sbin/create-munge-key -f"
+munge_mode: "0400"
+slurm_mode: "0644"
+slurm_dest: "/etc/slurm/"
+slurm_confpth: "/etc/slurm/slurm.conf"
+slurm_user: "slurm"
+slurmctld_port: "6817"
+slurmd_port: "6818"
+slurm_uid: "6001"
+slurm_logpth: "/var/log/slurm/"
+gen_mode: "0755"
+spool_pth: "/var/spool/slurm/"
+slurmctld_pid: "/var/run/slurmctld.pid"
+slurmd_pid: "/var/run/slurmd.pid"
+cluster_name: "manager,compute"
+slurmctld_log: "/var/log/slurm/slurmctld.log"
+slurmd_log: "/var/log/slurm/slurmd.log"

+ 38 - 0
slurm/roles/slurm_manager/files/slurmdbd.conf

@@ -0,0 +1,38 @@
+#
+# Example slurmdbd.conf file.
+#
+# See the slurmdbd.conf man page for more information.
+#
+# Archive info
+#ArchiveJobs=yes
+#ArchiveDir="/tmp"
+#ArchiveSteps=yes
+#ArchiveScript=
+#JobPurge=12
+#StepPurge=1
+#
+# Authentication info
+AuthType=auth/munge
+#AuthInfo=/var/run/munge/munge.socket.2
+#
+# slurmDBD info
+DbdAddr=
+DbdHost=
+#DbdPort=7031
+SlurmUser=
+#MessageTimeout=300
+DebugLevel=verbose
+#DefaultQOS=normal,standby
+LogFile=
+PidFile=
+#PluginDir=/usr/lib/slurm
+#PrivateData=accounts,users,usage,jobs
+#TrackWCKey=yes
+#
+# Database info
+StorageType=accounting_storage/mysql
+#StorageHost=
+#StoragePort=
+#StoragePass=
+#StorageUser=
+#StorageLoc=

+ 174 - 0
slurm/roles/slurm_manager/tasks/main.yml

@@ -0,0 +1,174 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Install packages for slurm
+  package:
+    name: "{{ item }}"
+    state: present
+  with_items:
+    - "{{ slurm_packages }}"
+  tags: install
+
+- name: Install development tools
+  package:
+    name: "{{ item }}"
+    enablerepo: PowerTools
+    state: present
+  with_items:
+    - "{{ dev_tools }}"
+  tags: install
+
+- name: Create temporary download folder for slurm
+  file:
+    path: "{{ tmp_path }}"
+    owner: slurm
+    group: slurm
+    mode: "{{ tmp_mode }}"
+    state: directory
+
+- name: Download slurm source
+  get_url:
+    url: "{{ slurm_url }}"
+    dest: "{{ tmp_path }}"
+    checksum: "{{ slurm_md5 }}"
+    validate_certs: no
+  tags: install
+
+- name: Build slurm rpms
+  command: rpmbuild -ta "{{ rpmbuild_path }}"
+  changed_when: false
+  args:
+    warn: no
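+# rpmbuild -ta builds binary rpms straight from the release tarball; by
+# default they land under /root/rpmbuild/RPMS/x86_64 (rpm_path in vars).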
+
+- name: Check whether slurm rpms are already installed
+  command: rpm -q slurm
+  ignore_errors: true
+  register: verify_result
+  changed_when: no
+  failed_when: no
+  args:
+    warn: no
+
+- name: Install rpms
+  shell: rpm -Uvh *.rpm
+  args:
+    chdir: "{{ rpm_path }}"
+    warn: no
+  when: verify_result.rc != 0
+
+- name: Add control machine name
+  lineinfile:
+    path: "{{ slurm_confpth }}"
+    regexp: "ControlMachine="
+    line: "ControlMachine={{ group_names[0] }}"
+
+- name: Firewall rule for slurm - tcp/ip,udp
+  firewalld:
+    zone: internal
+    port: "{{ item }}"
+    permanent: true
+    state: enabled
+  with_items:
+    - "{{ tcp_port1 }}"
+    - "{{ tcp_port2 }}"
+    - "{{ tcp_port3 }}"
+    - "{{ tcp_port4 }}"
+    - "{{ udp_port1 }}"
+    - "{{ udp_port2 }}"
+  tags: install
+
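+# ansible_default_ipv4 exposes the address and netmask separately; the ipaddr
+# filter below collapses them into CIDR form (e.g. 192.168.1.0/24) for the
+# rich rule.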
+- name: Get network address/subnet mask through ipaddr
+  set_fact:
+    network_address: "{{ (ansible_default_ipv4.network + '/' + ansible_default_ipv4.netmask) | ipaddr('network/prefix') }}"
+
+- name: Firewall rule slurm - allow all incoming traffic on internal network
+  firewalld:
+    zone: internal
+    rich_rule: 'rule family="{{ family }}" source address="{{ network_address }}" accept'
+    permanent: true
+    state: enabled
+  tags: install
+
+- name: Firewall reload
+  systemd:
+    name: firewalld
+    state: reloaded
+  tags: install
+
+- name: Start mariadb
+  service:
+    name: mariadb
+    state: restarted
+    enabled: yes
+  tags: install
+
+- name: Grant permissions for slurm db
+  command: mysql -u root -e "GRANT ALL ON slurm_acct_db.* TO 'slurm'@'localhost' identified by 'password' with grant option;"
+  tags: install
+  changed_when: true
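+# NOTE: the slurm DB password is hardcoded to 'password' above; in a real
+# deployment it should come from a vaulted variable.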
+
+- name: Create slurmdbd.conf file
+  copy:
+    src: slurmdbd.conf
+    dest: "{{ slurmdbd_path }}"
+    mode: "{{ slurmdbd_mode }}"
+  tags: install
+
+- name: Add slurm user name
+  lineinfile:
+    path: "{{ slurmdbd_path }}"
+    regexp: "SlurmUser="
+    line: "SlurmUser={{ slurm_user }}"
+
+- name: Add db address
+  lineinfile:
+    path: "{{ slurmdbd_path }}"
+    regexp: "DbdAddr="
+    line: "DbdAddr={{ DbdAddr }}"
+
+- name: Add db host
+  lineinfile:
+    path: "{{ slurmdbd_path }}"
+    regexp: "DbdHost="
+    line: "DbdHost={{ DbdHost }}"
+
+- name: Add log file path
+  lineinfile:
+    path: "{{ slurmdbd_path }}"
+    regexp: "LogFile="
+    line: "LogFile={{ logfile }}"
+
+- name: Add pid file path
+  lineinfile:
+    path: "{{ slurmdbd_path }}"
+    regexp: "PidFile="
+    line: "PidFile={{ pidfile }}"
+
+- name: Populate accounting database
+  command: slurmdbd
+  tags: install
+  changed_when: true
+
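+# Fetching slurm.conf back to the control node creates a hand-off buffer: the
+# start_slurm_workers and slurm_start_services roles copy it out again so
+# every node ends up with an identical file.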
+- name: Save slurm conf file in buffer
+  fetch:
+    src: "{{ slurm_confpth }}"
+    dest: "{{ buffer_path }}"
+    flat: true

+ 62 - 0
slurm/roles/slurm_manager/vars/main.yml

@@ -0,0 +1,62 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+slurm_packages:
+   - python3
+   - gcc
+   - openssl
+   - openssl-devel
+   - numactl
+   - numactl-devel
+   - hwloc
+   - lua
+   - readline
+   - readline-devel
+   - pam-devel
+   - perl-ExtUtils-MakeMaker
+   - cpanm*
+   - rpm-build
+
+dev_tools:
+   - rrdtool-devel
+   - lua-devel
+   - hwloc-devel
+
+tmp_path: "/root/slurm-tmp"
+tmp_mode: "0755"
+slurm_url: https://download.schedmd.com/slurm/slurm-20.02.3.tar.bz2
+slurm_md5: "md5:c71a300d6c5d33ef8ca60e52a203bb1e"
+rpmbuild_path: "/root/slurm-tmp/slurm-20.02.3.tar.bz2"
+rpm_loop: "/rpmbuild/RPMS/x86_64/*.rpm"
+tcp_port1: "6817/tcp"
+tcp_port2: "6818/tcp"
+tcp_port3: "6819/tcp"
+tcp_port4: "7321/tcp"
+udp_port1: "6817/udp"
+udp_port2: "7321/udp"
+family: "ipv4"
+db_user: "slurm"
+db_host: "localhost"
+slurmdbd_path: "/etc/slurm/slurmdbd.conf"
+slurmdbd_mode: "0600"
+slurm_confpth: "/etc/slurm/slurm.conf"
+slurm_user: "slurm"
+DbdAddr: "localhost"
+DbdHost: "localhost"
+logfile: "/var/log/slurm/slurmdbd.log"
+pidfile: "/var/run/slurm/slurmdbd.pid"
+buffer_path: "/tmp/slurm.conf"
+rpm_path: "/root/rpmbuild/RPMS/x86_64/"
+slurm_mode: "0644"

+ 64 - 0
slurm/roles/slurm_start_services/tasks/main.yml

@@ -0,0 +1,64 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Include common variables
+  include_vars: ../../slurm_manager/vars/main.yml
+
+- name: Copy slurm conf from buffer
+  copy:
+    src: "{{ buffer_path }}"
+    dest: "{{ slurm_confpth }}"
+    mode: "{{ slurm_mode }}"
+
+- name: Start slurmctld on manager
+  service:
+    name: slurmctld
+    state: restarted
+    enabled: yes
+  tags: install
+
+- name: Start slurmdbd on manager
+  service:
+    name: slurmdbd
+    state: restarted
+    enabled: yes
+  tags: install
+
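+# sacctmgr show returns success even when nothing matches, so idempotence is
+# decided by searching stdout rather than by return code.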
+- name: Check if cluster already exists
+  command: sacctmgr -n show cluster {{ inventory_hostname }}
+  register: slurm_clusterlist
+  changed_when: false
+
+- name: Create slurm cluster
+  command: sacctmgr -i add cluster {{ inventory_hostname }}
+  when: slurm_clusterlist.stdout.find(inventory_hostname) == -1
+
+- name: Show account
+  command: sacctmgr show account
+  register: account_added
+  changed_when: false
+
+- name: Create default slurm group
+  command: sacctmgr -i add account defaultgroup Cluster={{ inventory_hostname }} Description="Default Account" Organization="Default Org"
+  when: account_added.stdout.find('defaultgroup') == -1
+  tags: install
+
+- name: Check if user exists
+  command: sacctmgr show user
+  register: user_added
+  changed_when: false
+
+- name: Add root to the default account
+  command: sacctmgr -i add user root DefaultAccount=defaultgroup
+  when: user_added.stdout.find('defaultgroup') == -1
+  tags: install

+ 97 - 0
slurm/roles/start_slurm_workers/tasks/main.yml

@@ -0,0 +1,97 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Include common variables
+  include_vars: ../../slurm_manager/vars/main.yml
+
+- name: Copy slurm conf from buffer
+  copy:
+    src: "{{ buffer_path }}"
+    dest: "{{ slurm_confpth }}"
+    mode: "{{ slurm_mode }}"
+
+- name: Install packages for slurm
+  package:
+    name: "{{ item }}"
+    state: present
+  with_items:
+    - "{{ slurm_packages }}"
+  tags: install
+
+- name: Install development tools
+  package:
+    name: "{{ item }}"
+    enablerepo: PowerTools
+    state: present
+  with_items:
+    - "{{ dev_tools }}"
+  tags: install
+
+- name: Create temporary download folder for slurm
+  file:
+    path: "{{ tmp_path }}"
+    owner: slurm
+    group: slurm
+    mode: "{{ tmp_mode }}"
+    state: directory
+
+- name: Download slurm source
+  get_url:
+    url: "{{ slurm_url }}"
+    dest: "{{ tmp_path }}"
+    checksum: "{{ slurm_md5 }}"
+    validate_certs: no
+  tags: install
+
+- name: Build slurm rpms
+  command: rpmbuild -ta "{{ rpmbuild_path }}"
+  changed_when: false
+  args:
+    warn: no
+
+- name: Check whether slurm rpms are already installed
+  command: rpm -q slurm
+  ignore_errors: true
+  register: verify_result
+  changed_when: no
+  failed_when: no
+  args:
+    warn: no
+
+- name: Install rpms
+  shell: rpm -Uvh *.rpm
+  args:
+    chdir: "{{ rpm_path }}"
+    warn: no
+  when: verify_result.rc != 0
+
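+# Socket and core counts come from the facts gathered on each compute node
+# (this play runs with gather_facts: true in slurm.yml).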
+- name: Add socket and core info
+  lineinfile:
+    path: "{{ slurm_confpth }}"
+    regexp: "NodeName= Sockets= CoresPerSocket="
+    line: "NodeName={{ group_names[0] }} Sockets={{ hostvars[inventory_hostname]['ansible_facts']['processor_count'] }}
+      CoresPerSocket={{ hostvars[inventory_hostname]['ansible_facts']['processor_cores'] }}"
+
+- name: Save slurm conf in buffer
+  fetch:
+    src: "{{ slurm_confpth }}"
+    dest: "{{ buffer_path }}"
+    flat: true
+
+- name: Start slurmd on compute nodes
+  service:
+    name: slurmd.service
+    state: restarted
+    enabled: yes
+  tags: install

+ 16 - 9
slurm/slurm.yml

@@ -12,26 +12,33 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 ---
+
 #Playbook for installing Slurm on a cluster
 
 #collect info from everything
 - hosts: all
 
-# Apply Common Installation and Config
-- hosts: cluster
+- name: Apply common installation and config
+  hosts: manager,compute
   gather_facts: false
   roles:
+    - slurm_common
     - common
-    - slurm-common
 
-# Apply Manager Config, start services
-- hosts: manager
+- name: Apply manager config
+  hosts: manager
   gather_facts: false
   roles:
-    - slurm-manager
+    - slurm_manager
+
+- name: Start slurm workers
+  hosts: compute
+  gather_facts: true
+  roles:
+    - start_slurm_workers
 
-# Start SLURM workers
-- hosts: compute
+- name: Start services
+  hosts: manager
   gather_facts: false
   roles:
-    - start-slurm-workers
+    - slurm_start_services