adding new roles dir

Signed-off-by: John lockman <john.lockman@friday.local>
John Lockman authored 4 years ago · parent commit 28cdd7f8e4
59 changed files with 2943 additions and 0 deletions
  1. +17 −0    examples/host_inventory_file
  2. +13 −0    examples/host_inventory_file.ini
  3. +3 −0     roles/common/files/k8s.conf
  4. +8 −0     roles/common/files/kubernetes.repo
  5. +35 −0    roles/common/handlers/main.yml
  6. +109 −0   roles/common/tasks/main.yml
  7. +56 −0    roles/common/tasks/ntp.yml
  8. +42 −0    roles/common/templates/chrony.conf.j2
  9. +16 −0    roles/common/templates/ntp.conf.j2
  10. +63 −0   roles/common/vars/main.yml
  11. +9 −0    roles/compute_gpu/files/daemon.json
  12. +3 −0    roles/compute_gpu/files/k8s.conf
  13. +8 −0    roles/compute_gpu/files/kubernetes.repo
  14. +61 −0   roles/compute_gpu/tasks/main.yml
  15. +30 −0   roles/compute_gpu/vars/main.yml
  16. +84 −0   roles/firewalld/tasks/main.yml
  17. +43 −0   roles/firewalld/vars/main.yml
  18. +3 −0    roles/manager/files/k8s.conf
  19. +8 −0    roles/manager/files/kubernetes.repo
  20. +32 −0   roles/manager/tasks/main.yml
  21. +24 −0   roles/manager/vars/main.yml
  22. BIN      roles/slurm_common/files/munge.key
  23. +97 −0   roles/slurm_common/files/slurm.conf
  24. +183 −0  roles/slurm_common/tasks/main.yml
  25. +43 −0   roles/slurm_common/vars/main.yml
  26. +38 −0   roles/slurm_manager/files/slurmdbd.conf
  27. +163 −0  roles/slurm_manager/tasks/main.yml
  28. +62 −0   roles/slurm_manager/vars/main.yml
  29. +64 −0   roles/slurm_start_services/tasks/main.yml
  30. +96 −0   roles/start_slurm_workers/tasks/main.yml
  31. +5 −0    roles/startmanager/files/create_admin_user.yaml
  32. +12 −0   roles/startmanager/files/create_clusterRoleBinding.yaml
  33. +20 −0   roles/startmanager/files/data-pv.yaml
  34. +20 −0   roles/startmanager/files/data2-pv.yaml
  35. +20 −0   roles/startmanager/files/data3-pv.yaml
  36. +20 −0   roles/startmanager/files/data4-pv.yaml
  37. +3 −0    roles/startmanager/files/flannel_net.sh
  38. +16 −0   roles/startmanager/files/katib-pv.yaml
  39. +536 −0  roles/startmanager/files/kube-flannel.yaml
  40. +51 −0   roles/startmanager/files/kubeflow_persistent_volumes.yaml
  41. +16 −0   roles/startmanager/files/minio-pvc.yaml
  42. +17 −0   roles/startmanager/files/mysql-pv.yaml
  43. +7 −0    roles/startmanager/files/nfs-class.yaml
  44. +32 −0   roles/startmanager/files/nfs-deployment.yaml
  45. +4 −0    roles/startmanager/files/nfs-serviceaccount.yaml
  46. +20 −0   roles/startmanager/files/nfs_clusterrole.yaml
  47. +12 −0   roles/startmanager/files/nfs_clusterrolebinding.yaml
  48. +17 −0   roles/startmanager/files/notebook-pv.yaml
  49. +20 −0   roles/startmanager/files/persistent_volumes.yaml
  50. +12 −0   roles/startmanager/files/pvc.yaml
  51. +3 −0    roles/startmanager/files/tiller_config.sh
  52. +156 −0  roles/startmanager/tasks/main.yml
  53. +52 −0   roles/startmanager/vars/main.yml
  54. +21 −0   roles/startservices/files/metal-config.yaml
  55. +225 −0  roles/startservices/files/metallb.yaml
  56. +121 −0  roles/startservices/tasks/main.yml
  57. +47 −0   roles/startservices/vars/main.yml
  58. +27 −0   roles/startworkers/tasks/main.yml
  59. +18 −0   roles/startworkers/vars/main.yml

+ 17 - 0
examples/host_inventory_file

@@ -0,0 +1,17 @@
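+# Example cluster layout: compute000 serves as the manager node, compute001 is a
+# plain compute worker, and compute002-004 are GPU workers.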
+all: 
+  children:
+    cluster:
+      children:
+        manager:
+          hosts:
+            compute000:
+        workers:
+          children:
+            compute:
+              hosts:
+                compute001:
+            gpus:
+              hosts:
+                compute002:
+                compute003:
+                compute004:

+ 13 - 0
examples/host_inventory_file.ini

@@ -0,0 +1,13 @@
+[manager]
+friday
+
+[compute]
+compute000
+compute[002:005]
+
+[workers:children]
+compute
+
+[cluster:children]
+manager
+workers

+ 3 - 0
roles/common/files/k8s.conf

@@ -0,0 +1,3 @@
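+# Make packets crossing a Linux bridge (pod traffic) still pass through
+# iptables, where kube-proxy programs its rules.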
+net.bridge.bridge-nf-call-ip6tables = 1
+net.bridge.bridge-nf-call-iptables = 1
+

+ 8 - 0
roles/common/files/kubernetes.repo

@@ -0,0 +1,8 @@
+[kubernetes]
+name=Kubernetes
+baseurl=https://packages.cloud.google.com/yum/repos/kubernetes-el7-x86_64
+enabled=1
+gpgcheck=1
+repo_gpgcheck=1
+gpgkey=https://packages.cloud.google.com/yum/doc/yum-key.gpg https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg
+

+ 35 - 0
roles/common/handlers/main.yml

@@ -0,0 +1,35 @@
+---
+
+- name: Start and Enable docker service
+  service:
+    name: docker
+    state: restarted
+    enabled: yes
+  #tags: install
+
+- name: Start and Enable Kubernetes - kubelet
+  service:
+    name: kubelet
+    state: started
+    enabled: yes
+  #tags: install
+
+- name: Restart chrony
+  service:
+    name: chronyd
+    state: restarted
+    enabled: yes
+
+- name: Sync ntp clocks
+  command: ntpdc -np
+  register: ntp_clock
+  until:  ntp_clock.stdout.find('*') > -1
+  retries: "{{ retry_count_one }}"
+  delay: "{{ delay_count_one }}"
+
+- name: Sync chrony sources
+  command: chronyc sources
+  register: chrony_src
+  until:  chrony_src.stdout.find('^*') > -1
+  retries: "{{ retry_count }}"
+  delay: "{{ delay_count }}"

+ 109 - 0
roles/common/tasks/main.yml

@@ -0,0 +1,109 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Add kubernetes repo
+  copy:
+    src: kubernetes.repo
+    dest: "{{ k8s_repo_dest }}"
+    owner: root
+    group: root
+    mode: "{{ k8s_repo_file_mode }}"
+  tags: install
+
+- name: Add elrepo GPG key
+  rpm_key:
+    state: present
+    key: "{{ elrepo_gpg_key_url }}"
+  tags: install
+
+- name: Add elrepo (nvidia kmod drivers)
+  package:
+    name: "{{ elrepo_rpm_url }}"
+    state: present
+  tags: install
+
+- name: Add docker community edition repository
+  get_url:
+    url: "{{ docker_repo_url }}"
+    dest: "{{ docker_repo_dest }}"
+  tags: install
+
+- name: Update sysctl to handle incorrectly routed traffic when iptables is bypassed
+  copy:
+    src: k8s.conf
+    dest: "{{ k8s_conf_dest }}"
+    owner: root
+    group: root
+    mode: "{{ k8s_conf_file_mode }}"
+  tags: install
+
+- name: Update sysctl
+  command: /sbin/sysctl --system
+  changed_when: true
+  tags: install
+
+- name: Disable swap
+  command: /sbin/swapoff -a
+  changed_when: true
+  tags: install
+
+- name: Disable selinux
+  selinux:
+    state: disabled
+  tags: install
+
+- name: Install common packages
+  package:
+    name: "{{ common_packages }}"
+    state: present
+  tags: install
+
+- name: Install k8s packages
+  package:
+    name: "{{ k8s_packages }}"
+    state: present
+  tags: install
+
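+# Hold kubelet/kubeadm/kubectl at the pinned 1.16.7 release so a later
+# 'yum update' cannot pull in a mismatched Kubernetes version.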
+- name: Versionlock kubernetes
+  command: "yum versionlock '{{ item }}'"
+  args:
+    warn: false
+  with_items:
+    - "{{ k8s_packages }}"
+  changed_when: true
+  tags: install
+
+- name: Install infiniBand support
+  package:
+    name: "@Infiniband Support"
+    state: present
+  tags: install
+
+- name: Start and enable docker service
+  service:
+    name: docker
+    state: restarted
+    enabled: yes
+  tags: install
+
+- name: Start and enable kubernetes - kubelet
+  service:
+    name: kubelet
+    state: restarted
+    enabled: yes
+
+- name: Deploy time ntp/chrony
+  include_tasks: ntp.yml
+  tags: install

+ 56 - 0
roles/common/tasks/ntp.yml

@@ -0,0 +1,56 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+#- name: Deploy ntp servers
+#  block:
+#    - name: Deploy ntpd
+#      package:
+#        name: ntp
+#        state: present
+#    - name: Deploy ntpdate
+#      package:
+#        name: ntpdate
+#        state: present
+#    - name: Update ntp servers
+#      template:
+#        src: ntp.conf.j2
+#        dest: "{{ ntp_path }}"
+#        owner: root
+#        group: root
+#        mode: "{{ ntp_mode }}"
+#        backup: yes
+#      notify:
+#        - restart ntpd
+#        - sync ntp clocks
+#  when: ( ansible_distribution == "CentOS" or ansible_distribution == "RedHat" ) and ansible_distribution_major_version < os_higher_version
+
+- name: Deploy chrony server
+  block:
+    - name: Deploy chrony
+      package:
+        name: chrony
+        state: present
+    - name: Update ntp servers
+      template:
+        src: chrony.conf.j2
+        dest: "{{ chrony_path }}"
+        owner: root
+        group: root
+        mode: "{{ ntp_mode }}"
+        backup: yes
+      notify:
+        - Restart chrony
+        - Sync chrony sources
+  when: ( ansible_distribution == "CentOS" or ansible_distribution == "RedHat" ) and ansible_distribution_major_version > os_version

+ 42 - 0
roles/common/templates/chrony.conf.j2

@@ -0,0 +1,42 @@
+# Use public servers from the pool.ntp.org project.
+# Please consider joining the pool (http://www.pool.ntp.org/join.html).
+{% for item in chrony_servers %}
+pool {{ item }} iburst
+{% endfor %}
+
+
+# Record the rate at which the system clock gains or loses time.
+driftfile /var/lib/chrony/drift
+
+# Allow the system clock to be stepped in the first three updates
+# if its offset is larger than 1 second.
+makestep 1.0 3
+
+# Enable kernel synchronization of the real-time clock (RTC).
+rtcsync
+
+# Enable hardware timestamping on all interfaces that support it.
+#hwtimestamp *
+
+# Increase the minimum number of selectable sources required to adjust
+# the system clock.
+#minsources 2
+
+# Allow NTP client access from local network.
+#allow 192.168.0.0/16
+
+# Serve time even if not synchronized to a time source.
+#local stratum 10
+
+# Specify file containing keys for NTP authentication.
+keyfile /etc/chrony.keys
+
+# Get TAI-UTC offset and leap seconds from the system tz database.
+leapsectz right/UTC
+
+# Specify directory for log files.
+logdir /var/log/chrony
+
+# Select which information is logged.
+#log measurements statistics tracking
+

+ 16 - 0
roles/common/templates/ntp.conf.j2

@@ -0,0 +1,16 @@
+driftfile /var/lib/ntp/drift
+
+restrict default nomodify notrap nopeer noquery
+
+restrict 127.0.0.1
+restrict ::1
+
+{% for item in ntp_servers %}
+server  {{ item }} iburst
+{% endfor %}
+
+includefile /etc/ntp/crypto/pw
+
+keys /etc/ntp/keys
+
+

+ 63 - 0
roles/common/vars/main.yml

@@ -0,0 +1,63 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+common_packages:
+  - epel-release
+  - yum-plugin-versionlock
+  - gcc
+  - nfs-utils
+  - python3-pip
+  - docker-ce
+  - bash-completion
+  - nvidia-detect
+  - chrony
+
+k8s_packages:
+  - kubelet-1.16.7
+  - kubeadm-1.16.7
+  - kubectl-1.16.7
+
+k8s_repo_dest: /etc/yum.repos.d/
+
+elrepo_gpg_key_url: https://www.elrepo.org/RPM-GPG-KEY-elrepo.org
+
+elrepo_rpm_url: https://www.elrepo.org/elrepo-release-7.el7.elrepo.noarch.rpm
+
+docker_repo_url: https://download.docker.com/linux/centos/docker-ce.repo
+
+docker_repo_dest: /etc/yum.repos.d/docker-ce.repo
+
+k8s_conf_dest: /etc/sysctl.d/
+
+k8s_repo_file_mode: 0644
+
+k8s_conf_file_mode: 0644
+
+chrony_path: "/etc/chrony.conf"
+ntp_path: "/etc/ntp.conf"
+ntp_mode: "0644"
+os_higher_version: "8"
+os_version: "7"
+retry_count_one: "10"
+delay_count_one: "60"
+retry_count: "6"
+delay_count: "10"
+
+ntp_servers: 
+  - 0.centos.pool.ntp.org
+  - 1.centos.pool.ntp.org
+  - 2.centos.pool.ntp.org
+chrony_servers:
+  - 2.centos.pool.ntp.org

+ 9 - 0
roles/compute_gpu/files/daemon.json

@@ -0,0 +1,9 @@
+{
+  "runtimes": {
+    "nvidia": {
+      "path": "nvidia-container-runtime",
+      "runtimeArgs": []
+    }
+  },
+  "default-runtime": "nvidia"
+}

+ 3 - 0
roles/compute_gpu/files/k8s.conf

@@ -0,0 +1,3 @@
+net.bridge.bridge-nf-call-ip6tables = 1
+net.bridge.bridge-nf-call-iptables = 1
+

+ 8 - 0
roles/compute_gpu/files/kubernetes.repo

@@ -0,0 +1,8 @@
+[kubernetes]
+name=Kubernetes
+baseurl=https://packages.cloud.google.com/yum/repos/kubernetes-el7-x86_64
+enabled=1
+gpgcheck=1
+repo_gpgcheck=1
+gpgkey=https://packages.cloud.google.com/yum/doc/yum-key.gpg https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg
+

+ 61 - 0
roles/compute_gpu/tasks/main.yml

@@ -0,0 +1,61 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Add nvidia-docker2 Repo
+  get_url:
+    url: "{{ nvidia_docker_repo_url }}"
+    dest: "{{ nvidia_docker_repo_dest }}"
+  tags: install, testing
+
+- name: Add libnvidia container Repo
+  get_url:
+    url: "{{ nvidia_container_repo_url }}"
+    dest: "{{ nvidia_container_repo_dest }}"
+  tags: install, testing
+
+- name: Install nvidia driver and nvidia-docker2
+  package:
+    name: "{{ nvidia_packages }}"
+    enablerepo: libnvidia-container,nvidia-docker
+    state: present
+  tags: install
+
+- name: Reboot after installing GPU drivers
+  reboot:
+  tags: install
+
+- name: Set nvidia as default runtime
+  copy:
+    src: daemon.json
+    dest: "{{ daemon_file_dest }}"
+    owner: root
+    group: root
+    mode: "{{ daemon_file_mode }}"
+  tags: install
+
+- name: Restart and enable docker service
+  service:
+    name: docker
+    state: restarted
+    enabled: yes
+    daemon_reload: yes
+  tags: install
+
+- name: Restart and enable kubernetes - kubelet
+  service:
+    name: kubelet
+    state: restarted
+    enabled: yes
+  tags: install

+ 30 - 0
roles/compute_gpu/vars/main.yml

@@ -0,0 +1,30 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+nvidia_docker_repo_url: https://nvidia.github.io/nvidia-docker/centos7/nvidia-docker.repo
+
+nvidia_docker_repo_dest: /etc/yum.repos.d/nvidia-docker.repo
+
+nvidia_container_repo_url: https://nvidia.github.io/libnvidia-container/centos7/libnvidia-container.repo 
+
+nvidia_container_repo_dest: /etc/yum.repos.d/libnvidia-container.repo
+
+nvidia_packages:
+  - kmod-nvidia
+  - nvidia-docker2
+
+daemon_file_dest: /etc/docker/
+
+daemon_file_mode: 0644

+ 84 - 0
roles/firewalld/tasks/main.yml

@@ -0,0 +1,84 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Install firewalld
+  package:
+    name: firewalld
+    state: present
+  tags: firewalld
+
+- name: Start and enable firewalld
+  service:
+    name: firewalld
+    state: started
+    enabled: yes
+  tags: firewalld
+
+- name: Configure firewalld on master nodes
+  firewalld:
+    port: "{{ item }}/tcp"
+    permanent: yes
+    state: enabled
+  with_items: '{{ k8s_master_ports }}'
+  when: "'manager' in group_names"
+  tags: firewalld
+
+- name: Configure firewalld on compute nodes
+  firewalld:
+    port: "{{ item }}/tcp"
+    permanent: yes
+    state: enabled
+  with_items: '{{ k8s_worker_ports }}'
+  when: "'compute' in group_names"
+  tags: firewalld
+
+- name: Open flannel ports on the firewall
+  firewalld:
+    port: "{{ item }}/udp"
+    permanent: yes
+    state: enabled
+  with_items: "{{ flannel_udp_ports }}"
+  when: k8s_cni == "flannel"
+  tags: firewalld
+
+- name: Open calico UDP ports on the firewall
+  firewalld:
+    port: "{{ item }}/udp"
+    permanent: yes
+    state: enabled
+  with_items: "{{ calico_udp_ports }}"
+  when: k8s_cni == "calico"
+  tags: firewalld
+
+- name: Open calico TCP ports on the firewall
+  firewalld:
+    port: "{{ item }}/tcp"
+    permanent: yes
+    state: enabled
+  with_items: "{{ calico_tcp_ports }}"
+  when: k8s_cni == "calico"
+  tags: firewalld
+
+- name: Reload firewalld
+  command: firewall-cmd --reload
+  changed_when: true
+  tags: firewalld
+
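+# The permanent rules above stay on disk, but the service is left stopped:
+# a running firewalld commonly interferes with Kubernetes pod (CNI) traffic.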
+- name: Stop and disable firewalld
+  service:
+    name: firewalld
+    state: stopped
+    enabled: no
+  tags: firewalld

+ 43 - 0
roles/firewalld/vars/main.yml

@@ -0,0 +1,43 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+# Kubernetes SDN network
+k8s_cni: calico
+
+# Master nodes firewall ports
+k8s_master_ports:
+  - 6443
+  - 2379-2380
+  - 10250
+  - 10251
+  - 10252
+
+# Worker nodes firewall ports
+k8s_worker_ports:
+  - 10250
+  - 30000-32767
+
+# Calico CNI firewall ports
+calico_udp_ports:
+  - 4789
+calico_tcp_ports:
+  - 5473
+  - 179
+  - 5473
+
+# Flannel CNI firewall ports
+flannel_udp_ports:
+  - 8285
+  - 8472

+ 3 - 0
roles/manager/files/k8s.conf

@@ -0,0 +1,3 @@
+net.bridge.bridge-nf-call-ip6tables = 1
+net.bridge.bridge-nf-call-iptables = 1
+

+ 8 - 0
roles/manager/files/kubernetes.repo

@@ -0,0 +1,8 @@
+[kubernetes]
+name=Kubernetes
+baseurl=https://packages.cloud.google.com/yum/repos/kubernetes-el7-x86_64
+enabled=1
+gpgcheck=1
+repo_gpgcheck=1
+gpgkey=https://packages.cloud.google.com/yum/doc/yum-key.gpg https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg
+

+ 32 - 0
roles/manager/tasks/main.yml

@@ -0,0 +1,32 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Create directory for helm installer file
+  file:
+    path: "{{ helm_installer_file_directory }}"
+    state: directory
+    mode: "{{ helm_installer_file_directory_mode }}"
+
+- name: Get helm installer
+  get_url:
+    url: "{{ helm_installer_url }}"
+    dest: "{{ helm_installer_file_dest }}"
+    mode: "{{ helm_installer_file_mode }}"
+  tags: manager
+
+- name: Install helm
+  command: "/bin/bash {{ helm_installer_file_dest }}"
+  changed_when: true
+  tags: manager

+ 24 - 0
roles/manager/vars/main.yml

@@ -0,0 +1,24 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+helm_installer_file_directory: /root/bin
+
+helm_installer_file_directory_mode: 0755
+
+helm_installer_url: https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3
+
+helm_installer_file_dest: /root/bin/get_helm.sh
+
+helm_installer_file_mode: 0700

BIN
roles/slurm_common/files/munge.key


+ 97 - 0
roles/slurm_common/files/slurm.conf

@@ -0,0 +1,97 @@
+#
+# Example slurm.conf file. Please run configurator.html
+# (in doc/html) to build a configuration file customized
+# for your environment.
+#
+#
+# slurm.conf file generated by configurator.html.
+#
+# See the slurm.conf man page for more information.
+#
+ClusterName=
+ControlMachine=
+#ControlAddr=
+#BackupController=
+#BackupAddr=
+#
+SlurmUser=
+#SlurmdUser=root
+SlurmctldPort=
+SlurmdPort=
+AuthType=auth/munge
+#JobCredentialPrivateKey=
+#JobCredentialPublicCertificate=
+#StateSaveLocation=/var/spool/
+SlurmdSpoolDir=
+SwitchType=switch/none
+MpiDefault=none
+SlurmctldPidFile=
+SlurmdPidFile=
+ProctrackType=proctrack/pgid
+#PluginDir=
+#FirstJobId=
+ReturnToService=2
+#MaxJobCount=
+#PlugStackConfig=
+#PropagatePrioProcess=
+#PropagateResourceLimits=
+#PropagateResourceLimitsExcept=
+#Prolog=
+#Epilog=
+#SrunProlog=
+#SrunEpilog=
+#TaskProlog=
+#TaskEpilog=
+#TaskPlugin=
+#TrackWCKey=no
+#TreeWidth=50
+#TmpFS=
+#UsePAM=
+#
+# TIMERS
+SlurmctldTimeout=300
+SlurmdTimeout=300
+InactiveLimit=0
+MinJobAge=300
+KillWait=30
+Waittime=0
+#
+# SCHEDULING
+SchedulerType=sched/backfill
+#SchedulerAuth=
+SelectType=select/linear
+#FastSchedule=1
+PriorityType=priority/multifactor
+PriorityDecayHalfLife=14-0
+#PriorityUsageResetPeriod=14-0
+PriorityWeightFairshare=100000
+PriorityWeightAge=1000
+PriorityWeightPartition=10000
+PriorityWeightJobSize=1000
+PriorityMaxAge=14-0
+#
+# LOGGING
+SlurmctldDebug=3
+SlurmctldLogFile=
+SlurmdDebug=1
+SlurmdLogFile=
+JobCompType=jobcomp/none
+#JobCompLoc=
+#
+# ACCOUNTING
+JobAcctGatherType=jobacct_gather/linux
+JobAcctGatherFrequency=30
+#
+AccountingStorageType=accounting_storage/slurmdbd
+#AccountingStorageHost=
+#AccountingStorageLoc=
+#AccountingStoragePass=
+#AccountingStorageUser=
+#
+# COMPUTE NODES
+#NodeName=linux[1-32] Procs=1 State=UNKNOWN
+#NodeName=DEFAULT Sockets=2 CoresPerSocket=20 State=UNKNOWN
+NodeName= Sockets= CoresPerSocket=
+#NodeName=compute[002-005] CoresPerSocket=20
+PartitionName=normal Nodes=ALL Default=YES MaxTime=INFINITE State=UP
+#PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP

+ 183 - 0
roles/slurm_common/tasks/main.yml

@@ -0,0 +1,183 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Install epel repository
+  package:
+    name: epel-release
+    state: present
+  tags: install
+
+- name: Munge installation
+  package:
+    name: munge-devel
+    state: present
+  tags: install
+
+- name: Install packages for slurm
+  package:
+    name: "{{ common_packages }}"
+    state: present
+  tags: install
+
+- name: Upgrade pip
+  pip:
+    name: pip
+    executable: pip3
+    extra_args: --upgrade
+    state: latest
+  tags: install
+
+- name: Create munge key
+  command: "{{ munge_cmd }}"
+  changed_when: true
+  tags: install
+
+- name: Copy munge key
+  copy:
+    src: munge.key
+    dest: "{{ munge_dest }}"
+    owner: munge
+    group: munge
+    mode: "{{ munge_mode }}"
+  tags: install
+
+- name: Slurm configuration - slurm.conf
+  copy:
+    src: slurm.conf
+    dest: "{{ slurm_dest }}"
+    mode: "{{ slurm_mode }}"
+  tags: install
+
+- name: Add cluster name
+  lineinfile:
+    path: "{{ slurm_confpth }}"
+    regexp: "ClusterName="
+    line: "ClusterName={{ cluster_name }}"
+  tags: install
+
+- name: Add slurm user name
+  lineinfile:
+    path: "{{ slurm_confpth }}"
+    regexp: "SlurmUser="
+    line: "SlurmUser={{ slurm_user }}"
+  tags: install
+
+- name: Add slurmctld port no
+  lineinfile:
+    path: "{{ slurm_confpth }}"
+    regexp: "SlurmctldPort="
+    line: "SlurmctldPort={{ slurmctld_port }}"
+  tags: install
+
+- name: Add slurmd port no
+  lineinfile:
+    path: "{{ slurm_confpth }}"
+    regexp: "SlurmdPort="
+    line: "SlurmdPort={{ slurmd_port }}"
+  tags: install
+
+- name: Add spool path
+  lineinfile:
+    path: "{{ slurm_confpth }}"
+    regexp: "SlurmdSpoolDir="
+    line: "SlurmdSpoolDir={{ spool_pth }}"
+  tags: install
+
+- name: Add slurmctld pid file path
+  lineinfile:
+    path: "{{ slurm_confpth }}"
+    regexp: "SlurmctldPidFile="
+    line: "SlurmctldPidFile={{ slurmctld_pid }}"
+  tags: install
+
+- name: Add slurmd pid file path
+  lineinfile:
+    path: "{{ slurm_confpth }}"
+    regexp: "SlurmdPidFile="
+    line: "SlurmdPidFile={{ slurmd_pid }}"
+  tags: install
+
+- name: Add slurmctld log file path
+  lineinfile:
+    path: "{{ slurm_confpth }}"
+    regexp: "SlurmctldLogFile="
+    line: "SlurmctldLogFile={{ slurmctld_log }}"
+  tags: install
+
+- name: Add slurmd log file path
+  lineinfile:
+    path: "{{ slurm_confpth }}"
+    regexp: "SlurmdLogFile="
+    line: "SlurmdLogFile={{ slurmd_log }}"
+  tags: install
+
+- name: Create slurm group
+  group:
+    name: slurm
+    state: present
+  tags: install
+
+- name: Add the user 'slurm' with uid 6001 and a primary group of 'slurm'
+  user:
+    name: slurm
+    comment: Slurm User Account
+    uid: "{{ slurm_uid }}"
+    group: slurm
+  tags: install
+
+- name: Create slurm log directory
+  file:
+    path: "{{ slurm_logpth }}"
+    state: directory
+    owner: slurm
+    group: slurm
+    mode: "{{ gen_mode }}"
+    recurse: yes
+  tags: install
+
+- name: Give slurm user permission to spool
+  file:
+    path: "{{ spool_pth }}"
+    owner: slurm
+    group: slurm
+    state: directory
+    mode: "{{ gen_mode }}"
+    recurse: yes
+  tags: install
+
+- name: Give slurm user permission to slurmctld
+  file:
+    path: "{{ slurmctld_pid }}"
+    owner: slurm
+    group: slurm
+    mode: "{{ gen_mode }}"
+    state: touch
+  tags: install
+
+- name: Give slurm user permission to slurmd
+  file:
+    path: "{{ slurmd_pid }}"
+    owner: slurm
+    group: slurm
+    mode: "{{ gen_mode }}"
+    state: touch
+  tags: install
+
+- name: Start munge service
+  service:
+    name: munge
+    state: restarted
+    enabled: yes
+  tags: install

+ 43 - 0
roles/slurm_common/vars/main.yml

@@ -0,0 +1,43 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+epel_url: https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm
+
+common_packages:
+   - munge
+   - munge-libs
+   - mariadb-server
+   - mariadb-devel
+   - python3
+   - python-pip
+
+munge_dest: "/etc/munge/"
+munge_cmd: "/usr/sbin/create-munge-key -f"
+munge_mode: "0400"
+slurm_mode: "0644"
+slurm_dest: "/etc/slurm/"
+slurm_confpth: "/etc/slurm/slurm.conf"
+slurm_user: "slurm"
+slurmctld_port: "6817"
+slurmd_port: "6818"
+slurm_uid: "6001"
+slurm_logpth: "/var/log/slurm/"
+gen_mode: "0755"
+spool_pth: "/var/spool/slurm/"
+slurmctld_pid: "/var/run/slurmctld.pid"
+slurmd_pid: "/var/run/slurmd.pid"
+cluster_name: "manager,compute"
+slurmctld_log: "/var/log/slurm/slurmctld.log"
+slurmd_log: "/var/log/slurm/slurmd.log"

+ 38 - 0
roles/slurm_manager/files/slurmdbd.conf

@@ -0,0 +1,38 @@
+#
+# Example slurmdbd.conf file.
+#
+# See the slurmdbd.conf man page for more information.
+#
+# Archive info
+#ArchiveJobs=yes
+#ArchiveDir="/tmp"
+#ArchiveSteps=yes
+#ArchiveScript=
+#JobPurge=12
+#StepPurge=1
+#
+# Authentication info
+AuthType=auth/munge
+#AuthInfo=/var/run/munge/munge.socket.2
+#
+# slurmDBD info
+DbdAddr=
+DbdHost=
+#DbdPort=7031
+SlurmUser=
+#MessageTimeout=300
+DebugLevel=verbose
+#DefaultQOS=normal,standby
+LogFile=
+PidFile=
+#PluginDir=/usr/lib/slurm
+#PrivateData=accounts,users,usage,jobs
+#TrackWCKey=yes
+#
+# Database info
+StorageType=accounting_storage/mysql
+#StorageHost=
+#StoragePort=
+#StoragePass=
+#StorageUser=
+#StorageLoc=

+ 163 - 0
roles/slurm_manager/tasks/main.yml

@@ -0,0 +1,163 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+#- name: Install packages for slurm
+# package:
+#   name: "{{ slurm_packages }}"
+#   state: present
+# tags: install
+
+#- name: Install development tools
+# package:
+#   name: "{{ dev_tools }}"
+#   state: present
+# tags: install
+
+- name: Create temporary download folder for slurm
+  file:
+    path: "{{ tmp_path }}"
+    owner: slurm
+    group: slurm
+    mode: "{{ tmp_mode }}"
+    state: directory
+
+- name: Download slurm source
+  get_url:
+    url: "{{ slurm_url }}"
+    dest: "{{ tmp_path }}"
+    checksum: "{{ slurm_md5 }}"
+    validate_certs: no
+  tags: install
+ 
+- name: Build slurm rpms
+  command: rpmbuild -ta "{{ rpmbuild_path }}"
+  changed_when: false
+  args:
+    warn: no
+
+#- name: Verify package md5
+#command: rpm -qa
+#  ignore_errors: true
+#  register: verify_result
+#  changed_when: no
+#  failed_when: no
+#  args:
+#    warn: no
+
+- name: Install rpms
+  # shell (not command) is needed here so the leading ~ and the *.rpm glob expand
+  shell: "rpm -Uvh ~{{ rpm_loop }}"
+  args:
+    chdir: "{{ rpm_path }}"
+    warn: no
+  #when: verify_result.rc != 0
+
+- name: Add control machine name
+  lineinfile:
+    path: "{{ slurm_confpth }}"
+    regexp: "ControlMachine="
+    line: "ControlMachine={{ group_names[0] }}"
+
+- name: Firewall rule for slurm - tcp/ip,udp
+  firewalld:
+    zone: internal
+    port: "{{ item }}"
+    permanent: true
+    state: enabled
+  with_items:
+    - "{{ tcp_port1 }}"
+    - "{{ tcp_port2 }}"
+    - "{{ tcp_port3 }}"
+    - "{{ tcp_port4 }}"
+    - "{{ udp_port1 }}"
+    - "{{ udp_port2 }}"
+  tags: install
+
+- name: Get network address/subnet mask through ipaddr
+  set_fact:
+    network_address: "{{ (ansible_default_ipv4.network + '/' + ansible_default_ipv4.netmask) | ipaddr('network/prefix') }}"
+
+- name: Firewall rule slurm - allow all incoming traffic on internal network
+  firewalld:
+    zone: internal
+    rich_rule: 'rule family="{{ family }}" source address="{{ network_address }}" accept'
+    permanent: true
+    state: enabled
+  tags: install
+
+- name: Firewall reload
+  systemd:
+    name: firewalld
+    state: reloaded
+  tags: install
+
+- name: Start mariadb
+  service:
+    name: mariadb
+    state: restarted
+    enabled: yes
+  tags: install
+
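+# NOTE: the grant below hardcodes the DB password ('password'); slurmdbd reads
+# its copy of this credential from StoragePass in slurmdbd.conf.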
+- name: Grant permissions for slurm db
+  command: mysql -u root -e "GRANT ALL ON slurm_acct_db.* TO 'slurm'@'localhost' identified by 'password' with grant option;"
+  tags: install
+  changed_when: true
+
+- name: Create slurmdbd.conf file
+  copy:
+    src: slurmdbd.conf
+    dest: "{{ slurmdbd_path }}"
+    mode: "{{ slurmdbd_mode }}"
+  tags: install
+
+- name: Add slurm user name
+  lineinfile:
+    path: "{{ slurmdbd_path }}"
+    regexp: "SlurmUser="
+    line: "SlurmUser={{ slurm_user }}"
+
+- name: Add db address
+  lineinfile:
+    path: "{{ slurmdbd_path }}"
+    regexp: "DbdAddr="
+    line: "DbdAddr={{ DbdAddr }}"
+
+- name: Add db host
+  lineinfile:
+    path: "{{ slurmdbd_path }}"
+    regexp: "DbdHost="
+    line: "DbdHost={{ DbdHost }}"
+
+- name: Add log file path
+  lineinfile:
+    path: "{{ slurmdbd_path }}"
+    regexp: "LogFile="
+    line: "LogFile={{ logfile }}"
+
+- name: Add pid file path
+  lineinfile:
+    path: "{{ slurmdbd_path }}"
+    regexp: "PidFile="
+    line: "PidFile={{ pidfile }}"
+
+- name: Populate accounting database
+  command: slurmdbd
+  tags: install
+  changed_when: true
+
+- name: Save slurm conf file in buffer
+  fetch:
+    src: "{{ slurm_confpth }}"
+    dest: "{{ buffer_path }}"
+    flat: true

+ 62 - 0
roles/slurm_manager/vars/main.yml

@@ -0,0 +1,62 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+slurm_packages:
+   - python3
+   - gcc
+   - openssl
+   - openssl-devel
+   - numactl
+   - numactl-devel
+   - hwloc
+   - lua
+   - readline
+   - readline-devel
+   - pam-devel
+   - perl-ExtUtils-MakeMaker
+   - cpanm*
+   - rpm-build
+
+dev_tools:
+   - rrdtool-devel
+   - lua-devel
+   - hwloc-devel
+
+tmp_path: "/root/slurm-tmp"
+tmp_mode: "0755"
+slurm_url: https://download.schedmd.com/slurm/slurm-20.02.3.tar.bz2
+slurm_md5: "md5:c71a300d6c5d33ef8ca60e52a203bb1e"
+rpmbuild_path: "/root/slurm-tmp/slurm-20.02.3.tar.bz2"
+rpm_loop: "/rpmbuild/RPMS/x86_64/*.rpm"
+tcp_port1: "6817/tcp"
+tcp_port2: "6818/tcp"
+tcp_port3: "6819/tcp"
+tcp_port4: "7321/tcp"
+udp_port1: "6817/udp"
+udp_port2: "7321/udp"
+family: "ipv4"
+db_user: "slurm"
+db_host: "localhost"
+slurmdbd_path: "/etc/slurm/slurmdbd.conf"
+slurmdbd_mode: "0600"
+slurm_confpth: "/etc/slurm/slurm.conf"
+slurm_user: "slurm"
+DbdAddr: "localhost"
+DbdHost: "localhost"
+logfile: "/var/log/slurm/slurmdbd.log"
+pidfile: "/var/run/slurm/slurmdbd.pid"
+buffer_path: "/tmp/slurm.conf"
+rpm_path: "/root/rpmbuild/RPMS/x86_64/"
+slurm_mode: "0644"

+ 64 - 0
roles/slurm_start_services/tasks/main.yml

@@ -0,0 +1,64 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Include common variables
+  include_vars: ../../slurm_manager/vars/main.yml
+
+- name: Copy slurm conf from buffer
+  copy:
+    src: "{{ buffer_path }}"
+    dest: "{{ slurm_confpth }}"
+    mode: "{{ slurm_mode }}"
+
+- name: Start slurmctld on manager
+  service:
+    name: slurmctld
+    state: started
+    enabled: yes
+  tags: install
+
+- name: Enable slurmdbd on manager
+  service:
+    name: slurmdbd
+    enabled: yes
+  tags: install
+
+- name: Show cluster if exists
+  command: sacctmgr -n show cluster {{ inventory_hostname }}
+  register: slurm_clusterlist
+  changed_when: false
+
+- name: Create slurm cluster
+  command: sacctmgr -i add cluster {{ inventory_hostname }}
+  when: slurm_clusterlist.stdout.find(inventory_hostname) == -1
+
+- name: Show account
+  command: sacctmgr show account
+  register: account_added
+  changed_when: false
+
+- name: Create default slurm group
+  command: sacctmgr -i add account defaultgroup Cluster={{ inventory_hostname }} Description="Default Account" Organization="Default Org"
+  when: account_added.stdout.find('defaultgroup') == -1
+  tags: install
+
+- name: Check if user exists
+  command: sacctmgr show user
+  register: user_added
+  changed_when: false
+
+- name: Add root to the default account
+  command: sacctmgr -i add user root DefaultAccount=defaultgroup
+  when: user_added.stdout.find('defaultgroup') == -1
+  tags: install

+ 96 - 0
roles/start_slurm_workers/tasks/main.yml

@@ -0,0 +1,96 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Include common variables
+  include_vars: ../../slurm_manager/vars/main.yml
+
+- name: Copy slurm conf from buffer
+  copy:
+    src: "{{ buffer_path }}"
+    dest: "{{ slurm_confpth }}"
+    mode: "{{ slurm_mode }}"
+
+- name: Install packages for slurm
+  package:
+    name: "{{ slurm_packages }}"
+    state: present
+  tags: install
+
+- name: Install development tools
+  package:
+    name: "{{ dev_tools }}"
+    state: present
+  tags: install
+
+- name: Create temporary download folder for slurm
+  file:
+    path: "{{ tmp_path }}"
+    owner: slurm
+    group: slurm
+    mode: "{{ tmp_mode }}"
+    state: directory
+
+- name: Download slurm source
+  get_url:
+    url: "{{ slurm_url }}"
+    dest: "{{ tmp_path }}"
+    checksum: "{{ slurm_md5 }}"
+    validate_certs: no
+  tags: install
+
+- name: Build slurm rpms
+  command: rpmbuild -ta "{{ rpmbuild_path }}"
+  changed_when: false
+  args:
+    warn: no
+
+- name: Check whether the slurm rpms are already installed
+  command: rpm -q slurm
+  ignore_errors: true
+  register: verify_result
+  changed_when: no
+  failed_when: no
+  args:
+    warn: no
+
+- name: Install rpms
+  # shell (not command) is needed here so the leading ~ and the *.rpm glob expand
+  shell: "rpm -Uvh ~{{ rpm_loop }}"
+  args:
+    chdir: "{{ rpm_path }}"
+    warn: no
+  when: verify_result.rc != 0
+
+- name: Add socket and core info
+  lineinfile:
+    path: "{{ slurm_confpth }}"
+    regexp: "NodeName= Sockets= CoresPerSocket="
+    line: "NodeName={{ group_names[0] }} Sockets={{ hostvars[inventory_hostname]['ansible_facts']['processor_count'] }}
+      CoresPerSocket={{ hostvars[inventory_hostname]['ansible_facts']['processor_cores'] }}"
+
+- name: Save slurm conf in buffer
+  fetch:
+    src: "{{ slurm_confpth }}"
+    dest: "{{ buffer_path }}"
+    flat: true
+
+- name: Start slurmd on compute nodes
+  service:
+    name: slurmd.service
+    state: started
+    enabled: yes
+  tags: install

+ 5 - 0
roles/startmanager/files/create_admin_user.yaml

@@ -0,0 +1,5 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: admin-user
+  namespace: kube-system

+ 12 - 0
roles/startmanager/files/create_clusterRoleBinding.yaml

@@ -0,0 +1,12 @@
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: ClusterRoleBinding
+metadata:
+  name: admin-user
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: cluster-admin
+subjects:
+- kind: ServiceAccount
+  name: admin-user
+  namespace: kube-system

+ 20 - 0
roles/startmanager/files/data-pv.yaml

@@ -0,0 +1,20 @@
+# yaml file contents
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: data-pv
+spec:
+  capacity:
+    storage: 10Gi
+  accessModes:
+  - ReadWriteOnce
+  - ReadOnlyMany
+  - ReadWriteMany
+  nfs:
+    server: 10.0.0.1
+    path: /work/k8s/data1
+  persistentVolumeReclaimPolicy: Recycle
+  #storageClassName: local-storage
+  #hostPath:
+    #path: /home/k8s/data1
+ 

+ 20 - 0
roles/startmanager/files/data2-pv.yaml

@@ -0,0 +1,20 @@
+# yaml file contents
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: data2-pv
+spec:
+  capacity:
+    storage: 20Gi
+  accessModes:
+  - ReadWriteOnce
+  - ReadOnlyMany
+  - ReadWriteMany
+  nfs:
+    server: 10.0.0.1
+    path: /work/k8s/data2
+  persistentVolumeReclaimPolicy: Recycle
+  #storageClassName: local-storage
+  #hostPath:
+    #path: /home/k8s/
+ 

+ 20 - 0
roles/startmanager/files/data3-pv.yaml

@@ -0,0 +1,20 @@
+# yaml file contents
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: data3-pv
+spec:
+  capacity:
+    storage: 50Gi
+  accessModes:
+  - ReadWriteOnce
+  - ReadOnlyMany
+  - ReadWriteMany
+  nfs:
+    server: 10.0.0.1
+    path: /work/k8s/data3
+  persistentVolumeReclaimPolicy: Recycle
+  #storageClassName: local-storage
+  #hostPath:
+    #path: /home/k8s/
+ 

+ 20 - 0
roles/startmanager/files/data4-pv.yaml

@@ -0,0 +1,20 @@
+# yaml file contents
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: data4-pv
+spec:
+  capacity:
+    storage: 50Gi
+  accessModes:
+  - ReadWriteOnce
+  - ReadOnlyMany
+  - ReadWriteMany
+  nfs:
+    server: 10.0.0.1
+    path: /work/k8s/data4
+  persistentVolumeReclaimPolicy: Recycle
+  #storageClassName: local-storage
+  #hostPath:
+    #path: /home/k8s/
+ 

+ 3 - 0
roles/startmanager/files/flannel_net.sh

@@ -0,0 +1,3 @@
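+# Apply the upstream flannel CNI manifest and its RBAC rules to the cluster.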
+kubectl apply -f https://raw.githubusercontent.com/coreos/flannel/v0.10.0/Documentation/kube-flannel.yml
+kubectl apply -f https://raw.githubusercontent.com/coreos/flannel/master/Documentation/k8s-manifests/kube-flannel-rbac.yml
+

+ 16 - 0
roles/startmanager/files/katib-pv.yaml

@@ -0,0 +1,16 @@
+# yaml file contents
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: katib-pv
+spec:
+  capacity:
+    storage: 20Gi
+  accessModes:
+  - ReadWriteOnce
+  - ReadOnlyMany
+  - ReadWriteMany
+  persistentVolumeReclaimPolicy: Recycle
+  #storageClassName: local-storage
+  hostPath:
+    path: /home/k8s/katibsql

+ 536 - 0
roles/startmanager/files/kube-flannel.yaml

@@ -0,0 +1,536 @@
+---
+apiVersion: policy/v1beta1
+kind: PodSecurityPolicy
+metadata:
+  name: psp.flannel.unprivileged
+  annotations:
+    seccomp.security.alpha.kubernetes.io/allowedProfileNames: docker/default
+    seccomp.security.alpha.kubernetes.io/defaultProfileName: docker/default
+    apparmor.security.beta.kubernetes.io/allowedProfileNames: runtime/default
+    apparmor.security.beta.kubernetes.io/defaultProfileName: runtime/default
+spec:
+  privileged: false
+  volumes:
+    - configMap
+    - secret
+    - emptyDir
+    - hostPath
+  allowedHostPaths:
+    - pathPrefix: "/etc/cni/net.d"
+    - pathPrefix: "/etc/kube-flannel"
+    - pathPrefix: "/run/flannel"
+  readOnlyRootFilesystem: false
+  # Users and groups
+  runAsUser:
+    rule: RunAsAny
+  supplementalGroups:
+    rule: RunAsAny
+  fsGroup:
+    rule: RunAsAny
+  # Privilege Escalation
+  allowPrivilegeEscalation: false
+  defaultAllowPrivilegeEscalation: false
+  # Capabilities
+  allowedCapabilities: ['NET_ADMIN']
+  defaultAddCapabilities: []
+  requiredDropCapabilities: []
+  # Host namespaces
+  hostPID: false
+  hostIPC: false
+  hostNetwork: true
+  hostPorts:
+  - min: 0
+    max: 65535
+  # SELinux
+  seLinux:
+    # SELinux is unused in CaaSP
+    rule: 'RunAsAny'
+---
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1beta1
+metadata:
+  name: flannel
+rules:
+  - apiGroups: ['policy']
+    resources: ['podsecuritypolicies']
+    verbs: ['use']
+    resourceNames: ['psp.flannel.unprivileged']
+  - apiGroups:
+      - ""
+    resources:
+      - pods
+    verbs:
+      - get
+  - apiGroups:
+      - ""
+    resources:
+      - nodes
+    verbs:
+      - list
+      - watch
+  - apiGroups:
+      - ""
+    resources:
+      - nodes/status
+    verbs:
+      - patch
+---
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1beta1
+metadata:
+  name: flannel
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: flannel
+subjects:
+- kind: ServiceAccount
+  name: flannel
+  namespace: kube-system
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: flannel
+  namespace: kube-system
+---
+kind: ConfigMap
+apiVersion: v1
+metadata:
+  name: kube-flannel-cfg
+  namespace: kube-system
+  labels:
+    tier: node
+    app: flannel
+data:
+  cni-conf.json: |
+    {
+      "name": "cbr0",
+      "plugins": [
+        {
+          "type": "flannel",
+          "delegate": {
+            "hairpinMode": true,
+            "isDefaultGateway": true
+          }
+        },
+        {
+          "type": "portmap",
+          "capabilities": {
+            "portMappings": true
+          }
+        }
+      ]
+    }
+  net-conf.json: |
+    {
+      "Network": "10.244.0.0/16",
+      "Backend": {
+        "Type": "vxlan"
+      }
+    }
+---
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: kube-flannel-ds-amd64
+  namespace: kube-system
+  labels:
+    tier: node
+    app: flannel
+spec:
+  selector:
+    matchLabels:
+      tier: node
+      app: flannel
+  template:
+    metadata:
+      labels:
+        tier: node
+        app: flannel
+    spec:
+      hostNetwork: true
+      nodeSelector:
+        beta.kubernetes.io/arch: amd64
+      tolerations:
+      - operator: Exists
+        effect: NoSchedule
+      serviceAccountName: flannel
+      initContainers:
+      - name: install-cni
+        image: quay.io/coreos/flannel:v0.11.0-amd64
+        command:
+        - cp
+        args:
+        - -f
+        - /etc/kube-flannel/cni-conf.json
+        - /etc/cni/net.d/10-flannel.conflist
+        volumeMounts:
+        - name: cni
+          mountPath: /etc/cni/net.d
+        - name: flannel-cfg
+          mountPath: /etc/kube-flannel/
+      containers:
+      - name: kube-flannel
+        image: quay.io/coreos/flannel:v0.11.0-amd64
+        command:
+        - /opt/bin/flanneld
+        args:
+        - --ip-masq
+        - --kube-subnet-mgr
+        - --iface=ib0
+        resources:
+          requests:
+            cpu: "100m"
+            memory: "50Mi"
+          limits:
+            cpu: "100m"
+            memory: "50Mi"
+        securityContext:
+          privileged: false
+          capabilities:
+             add: ["NET_ADMIN"]
+        env:
+        - name: POD_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.name
+        - name: POD_NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        volumeMounts:
+        - name: run
+          mountPath: /run/flannel
+        - name: flannel-cfg
+          mountPath: /etc/kube-flannel/
+      volumes:
+        - name: run
+          hostPath:
+            path: /run/flannel
+        - name: cni
+          hostPath:
+            path: /etc/cni/net.d
+        - name: flannel-cfg
+          configMap:
+            name: kube-flannel-cfg
+---
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: kube-flannel-ds-arm64
+  namespace: kube-system
+  labels:
+    tier: node
+    app: flannel
+spec:
+  selector:
+    matchLabels:
+      tier: node
+      app: flannel
+  template:
+    metadata:
+      labels:
+        tier: node
+        app: flannel
+    spec:
+      hostNetwork: true
+      nodeSelector:
+        beta.kubernetes.io/arch: arm64
+      tolerations:
+      - operator: Exists
+        effect: NoSchedule
+      serviceAccountName: flannel
+      initContainers:
+      - name: install-cni
+        image: quay.io/coreos/flannel:v0.11.0-arm64
+        command:
+        - cp
+        args:
+        - -f
+        - /etc/kube-flannel/cni-conf.json
+        - /etc/cni/net.d/10-flannel.conflist
+        volumeMounts:
+        - name: cni
+          mountPath: /etc/cni/net.d
+        - name: flannel-cfg
+          mountPath: /etc/kube-flannel/
+      containers:
+      - name: kube-flannel
+        image: quay.io/coreos/flannel:v0.11.0-arm64
+        command:
+        - /opt/bin/flanneld
+        args:
+        - --ip-masq
+        - --kube-subnet-mgr
+        - --iface=ib0
+        resources:
+          requests:
+            cpu: "100m"
+            memory: "50Mi"
+          limits:
+            cpu: "100m"
+            memory: "50Mi"
+        securityContext:
+          privileged: false
+          capabilities:
+             add: ["NET_ADMIN"]
+        env:
+        - name: POD_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.name
+        - name: POD_NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        volumeMounts:
+        - name: run
+          mountPath: /run/flannel
+        - name: flannel-cfg
+          mountPath: /etc/kube-flannel/
+      volumes:
+        - name: run
+          hostPath:
+            path: /run/flannel
+        - name: cni
+          hostPath:
+            path: /etc/cni/net.d
+        - name: flannel-cfg
+          configMap:
+            name: kube-flannel-cfg
+---
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: kube-flannel-ds-arm
+  namespace: kube-system
+  labels:
+    tier: node
+    app: flannel
+spec:
+  selector:
+    matchLabels:
+      tier: node
+      app: flannel
+  template:
+    metadata:
+      labels:
+        tier: node
+        app: flannel
+    spec:
+      hostNetwork: true
+      nodeSelector:
+        beta.kubernetes.io/arch: arm
+      tolerations:
+      - operator: Exists
+        effect: NoSchedule
+      serviceAccountName: flannel
+      initContainers:
+      - name: install-cni
+        image: quay.io/coreos/flannel:v0.11.0-arm
+        command:
+        - cp
+        args:
+        - -f
+        - /etc/kube-flannel/cni-conf.json
+        - /etc/cni/net.d/10-flannel.conflist
+        volumeMounts:
+        - name: cni
+          mountPath: /etc/cni/net.d
+        - name: flannel-cfg
+          mountPath: /etc/kube-flannel/
+      containers:
+      - name: kube-flannel
+        image: quay.io/coreos/flannel:v0.11.0-arm
+        command:
+        - /opt/bin/flanneld
+        args:
+        - --ip-masq
+        - --kube-subnet-mgr
+        - --iface=ib0
+        resources:
+          requests:
+            cpu: "100m"
+            memory: "50Mi"
+          limits:
+            cpu: "100m"
+            memory: "50Mi"
+        securityContext:
+          privileged: false
+          capabilities:
+             add: ["NET_ADMIN"]
+        env:
+        - name: POD_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.name
+        - name: POD_NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        volumeMounts:
+        - name: run
+          mountPath: /run/flannel
+        - name: flannel-cfg
+          mountPath: /etc/kube-flannel/
+      volumes:
+        - name: run
+          hostPath:
+            path: /run/flannel
+        - name: cni
+          hostPath:
+            path: /etc/cni/net.d
+        - name: flannel-cfg
+          configMap:
+            name: kube-flannel-cfg
+---
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: kube-flannel-ds-ppc64le
+  namespace: kube-system
+  labels:
+    tier: node
+    app: flannel
+spec:
+  selector:
+    matchLabels:
+      tier: node
+      app: flannel
+  template:
+    metadata:
+      labels:
+        tier: node
+        app: flannel
+    spec:
+      hostNetwork: true
+      nodeSelector:
+        beta.kubernetes.io/arch: ppc64le
+      tolerations:
+      - operator: Exists
+        effect: NoSchedule
+      serviceAccountName: flannel
+      initContainers:
+      - name: install-cni
+        image: quay.io/coreos/flannel:v0.11.0-ppc64le
+        command:
+        - cp
+        args:
+        - -f
+        - /etc/kube-flannel/cni-conf.json
+        - /etc/cni/net.d/10-flannel.conflist
+        volumeMounts:
+        - name: cni
+          mountPath: /etc/cni/net.d
+        - name: flannel-cfg
+          mountPath: /etc/kube-flannel/
+      containers:
+      - name: kube-flannel
+        image: quay.io/coreos/flannel:v0.11.0-ppc64le
+        command:
+        - /opt/bin/flanneld
+        args:
+        - --ip-masq
+        - --kube-subnet-mgr
+        - --iface=ib0
+        resources:
+          requests:
+            cpu: "100m"
+            memory: "50Mi"
+          limits:
+            cpu: "100m"
+            memory: "50Mi"
+        securityContext:
+          privileged: false
+          capabilities:
+             add: ["NET_ADMIN"]
+        env:
+        - name: POD_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.name
+        - name: POD_NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        volumeMounts:
+        - name: run
+          mountPath: /run/flannel
+        - name: flannel-cfg
+          mountPath: /etc/kube-flannel/
+      volumes:
+        - name: run
+          hostPath:
+            path: /run/flannel
+        - name: cni
+          hostPath:
+            path: /etc/cni/net.d
+        - name: flannel-cfg
+          configMap:
+            name: kube-flannel-cfg
+---
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: kube-flannel-ds-s390x
+  namespace: kube-system
+  labels:
+    tier: node
+    app: flannel
+spec:
+  selector:
+    matchLabels:
+      tier: node
+      app: flannel
+  template:
+    metadata:
+      labels:
+        tier: node
+        app: flannel
+    spec:
+      hostNetwork: true
+      nodeSelector:
+        beta.kubernetes.io/arch: s390x
+      tolerations:
+      - operator: Exists
+        effect: NoSchedule
+      serviceAccountName: flannel
+      initContainers:
+      - name: install-cni
+        image: quay.io/coreos/flannel:v0.11.0-s390x
+        command:
+        - cp
+        args:
+        - -f
+        - /etc/kube-flannel/cni-conf.json
+        - /etc/cni/net.d/10-flannel.conflist
+        volumeMounts:
+        - name: cni
+          mountPath: /etc/cni/net.d
+        - name: flannel-cfg
+          mountPath: /etc/kube-flannel/
+      containers:
+      - name: kube-flannel
+        image: quay.io/coreos/flannel:v0.11.0-s390x
+        command:
+        - /opt/bin/flanneld
+        args:
+        - --ip-masq
+        - --kube-subnet-mgr
+        - --iface=ib0
+        resources:
+          requests:
+            cpu: "100m"
+            memory: "50Mi"
+          limits:
+            cpu: "100m"
+            memory: "50Mi"
+        securityContext:
+          privileged: false
+          capabilities:
+             add: ["NET_ADMIN"]
+        env:
+        - name: POD_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.name
+        - name: POD_NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        volumeMounts:
+        - name: run
+          mountPath: /run/flannel
+        - name: flannel-cfg
+          mountPath: /etc/kube-flannel/
+      volumes:
+        - name: run
+          hostPath:
+            path: /run/flannel
+        - name: cni
+          hostPath:
+            path: /etc/cni/net.d
+        - name: flannel-cfg
+          configMap:
+            name: kube-flannel-cfg

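All three architecture variants above pin flanneld to --iface=ib0, so the overlay only comes up on nodes whose InfiniBand interface is actually named ib0; without that flag flanneld falls back to the interface of the default route. A quick post-deploy readiness check, sketched as an Ansible task in the style of the roles below (assumes kubectl and a root kubeconfig are already configured, as the startmanager role sets up):

    - name: Verify flannel DaemonSets are ready        # hypothetical check task
      command: kubectl get daemonset -n kube-system -l app=flannel
      register: flannel_ds_status
      changed_when: false
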
+ 51 - 0
roles/startmanager/files/kubeflow_persistent_volumes.yaml

@@ -0,0 +1,51 @@
+---
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: data1-pv
+spec:
+  capacity:
+    storage: 10Gi
+  accessModes:
+  - ReadWriteOnce
+  - ReadOnlyMany
+  - ReadWriteMany
+  nfs:
+    server: 10.0.0.1
+    path: /work/k8s/data1
+  persistentVolumeReclaimPolicy: Recycle
+ 
+---
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: data2-pv
+spec:
+  capacity:
+    storage: 20Gi
+  accessModes:
+  - ReadWriteOnce
+  - ReadOnlyMany
+  - ReadWriteMany
+  nfs:
+    server: 10.0.0.1
+    path: /work/k8s/data2
+  persistentVolumeReclaimPolicy: Recycle
+ 
+---
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: data3-pv
+spec:
+  capacity:
+    storage: 50Gi
+  accessModes:
+  - ReadWriteOnce
+  - ReadOnlyMany
+  - ReadWriteMany
+  nfs:
+    server: 10.0.0.1
+    path: /work/k8s/data3
+  persistentVolumeReclaimPolicy: Recycle
+ 

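These NFS-backed volumes carry no storageClassName, so a claim must also leave the class empty to bind to one of them rather than trigger a provisioner. A minimal matching claim might look like this (data1-pvc is a hypothetical name; the request must fit the 10/20/50Gi capacities above):

    apiVersion: v1
    kind: PersistentVolumeClaim
    metadata:
      name: data1-pvc            # hypothetical claim name
    spec:
      accessModes:
      - ReadWriteMany
      storageClassName: ""       # bind a pre-provisioned PV, skip dynamic provisioning
      resources:
        requests:
          storage: 10Gi          # fits data1-pv's 10Gi capacity
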
+ 16 - 0
roles/startmanager/files/minio-pvc.yaml

@@ -0,0 +1,16 @@
+# hostPath-backed PersistentVolume for MinIO
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: minio-pv
+spec:
+  capacity:
+    storage: 20Gi
+  accessModes:
+  - ReadWriteOnce
+  - ReadOnlyMany
+  - ReadWriteMany
+  persistentVolumeReclaimPolicy: Recycle
+  #storageClassName: local-storage
+  hostPath:
+    path: /home/k8s

+ 17 - 0
roles/startmanager/files/mysql-pv.yaml

@@ -0,0 +1,17 @@
+# hostPath-backed PersistentVolume for MySQL
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: mysql-pv
+spec:
+  capacity:
+    storage: 20Gi
+  accessModes:
+  - ReadWriteOnce
+  - ReadOnlyMany
+  - ReadWriteMany
+  persistentVolumeReclaimPolicy: Recycle
+  #storageClassName: local-storage
+  hostPath:
+    path: /home/k8s/
+ 

+ 7 - 0
roles/startmanager/files/nfs-class.yaml

@@ -0,0 +1,7 @@
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: managed-nfs-storage
+provisioner: fuseim.pri/ifs # or choose another name; must match the deployment's PROVISIONER_NAME env var
+parameters:
+  archiveOnDelete: "false"

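Claims reference this class by name; the provisioner then creates a PV under the exported NFS path automatically. A sketch of such a claim (example-claim is a hypothetical name):

    apiVersion: v1
    kind: PersistentVolumeClaim
    metadata:
      name: example-claim        # hypothetical
    spec:
      accessModes:
      - ReadWriteMany
      storageClassName: managed-nfs-storage
      resources:
        requests:
          storage: 1Gi
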
+ 32 - 0
roles/startmanager/files/nfs-deployment.yaml

@@ -0,0 +1,32 @@
+kind: Deployment
+apiVersion: apps/v1
+metadata:
+  name: nfs-client-provisioner
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: nfs-client-provisioner
+  strategy:
+    type: Recreate
+  template:
+    metadata:
+      labels:
+        app: nfs-client-provisioner
+    spec:
+      serviceAccountName: nfs-client-provisioner
+      containers:
+        - name: nfs-client-provisioner
+          image: quay.io/external_storage/nfs-client-provisioner:latest
+          volumeMounts:
+            - name: nfs-client-root
+              mountPath: /persistentvolumes
+          env:
+            - name: PROVISIONER_NAME
+              value: fuseim.pri/ifs
+            - name: NFS_SERVER
+              value: 10.0.0.1
+            - name: NFS_PATH
+              value: /work/k8s
+      volumes:
+        - name: nfs-client-root
+          nfs:
+            server: 10.0.0.1
+            path: /work/k8s

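Note the coupling: the PROVISIONER_NAME value here must equal the provisioner field in nfs-class.yaml above, and NFS_SERVER/NFS_PATH must match the nfs volume stanza, or dynamically provisioned claims will sit in Pending. A hedged sanity check, again as an Ansible task:

    - name: Confirm the NFS client provisioner pod is running   # hypothetical check task
      command: kubectl get pods -l app=nfs-client-provisioner
      register: nfs_provisioner_status
      changed_when: false
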
+ 4 - 0
roles/startmanager/files/nfs-serviceaccount.yaml

@@ -0,0 +1,4 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: nfs-client-provisioner

+ 20 - 0
roles/startmanager/files/nfs_clusterrole.yaml

@@ -0,0 +1,20 @@
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: nfs-client-provisioner-runner
+rules:
+  - apiGroups: [""]
+    resources: ["persistentvolumes"]
+    verbs: ["get", "list", "watch", "create", "delete"]
+  - apiGroups: [""]
+    resources: ["persistentvolumeclaims"]
+    verbs: ["get", "list", "watch", "update"]
+  - apiGroups: ["storage.k8s.io"]
+    resources: ["storageclasses"]
+    verbs: ["get", "list", "watch"]
+  - apiGroups: [""]
+    resources: ["events"]
+    verbs: ["create", "update", "patch"]
+  - apiGroups: [""]
+    resources: ["endpoints"]
+    verbs: ["get", "list", "watch", "create", "update", "patch"]

+ 12 - 0
roles/startmanager/files/nfs_clusterrolebinding.yaml

@@ -0,0 +1,12 @@
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: run-nfs-client-provisioner
+subjects:
+  - kind: ServiceAccount
+    name: nfs-client-provisioner
+    namespace: default
+roleRef:
+  kind: ClusterRole
+  name: nfs-client-provisioner-runner
+  apiGroup: rbac.authorization.k8s.io

+ 17 - 0
roles/startmanager/files/notebook-pv.yaml

@@ -0,0 +1,17 @@
+# hostPath-backed PersistentVolume for notebooks
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: notebooks-pv
+spec:
+  capacity:
+    storage: 20Gi
+  accessModes:
+  - ReadWriteOnce
+  - ReadOnlyMany
+  - ReadWriteMany
+  #persistentVolumeReclaimPolicy: Recycle
+  #storageClassName: local-storage
+  hostPath:
+    path: /home/k8s/
+ 

+ 20 - 0
roles/startmanager/files/persistent_volumes.yaml

@@ -0,0 +1,20 @@
+# NFS-backed PersistentVolume
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: nfs-pv
+spec:
+  capacity:
+    storage: 20Gi
+  accessModes:
+  #- ReadWriteOnce
+  #- ReadOnlyMany
+  - ReadWriteMany
+  nfs:
+    server: 10.0.0.1
+    path: /work/k8s
+  #persistentVolumeReclaimPolicy: Recycle
+  #storageClassName: local-storage
+  #hostPath:
+    #path: /home/k8s
+ 

+ 12 - 0
roles/startmanager/files/pvc.yaml

@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: pets-pvc
+  namespace: kubeflow
+spec:
+  accessModes:
+  - ReadWriteMany
+  storageClassName: ""
+  resources:
+    requests:
+      storage: 20Gi

+ 3 - 0
roles/startmanager/files/tiller_config.sh

@@ -0,0 +1,3 @@
+kubectl create serviceaccount --namespace kube-system tiller
+kubectl create clusterrolebinding tiller-cluster-rule --clusterrole=cluster-admin --serviceaccount=kube-system:tiller
+kubectl patch deploy --namespace kube-system tiller-deploy -p '{"spec":{"template":{"spec":{"serviceAccount":"tiller"}}}}'

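The patch on the last line assumes a tiller-deploy Deployment already exists, i.e. helm init has already been run. On a fresh Helm 2 setup the flow is usually the other way around: create the service account, then deploy Tiller with it. A sketch (Helm 2 only; assumes helm is on the manager's PATH):

    - name: Deploy Tiller with the service account created above   # hypothetical task, Helm 2 only
      command: helm init --service-account tiller
      changed_when: true
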
+ 156 - 0
roles/startmanager/tasks/main.yml

@@ -0,0 +1,156 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Disable swap (if not already disabled)
+  command: /usr/sbin/swapoff -a
+  changed_when: true
+  tags: init
+
+- name: Start and enable docker service
+  systemd:
+    name: docker
+    state: started
+    enabled: yes
+    daemon_reload: yes
+  tags: docker
+
+- name: Initialize kubeadm
+  command: "/bin/kubeadm init --pod-network-cidr='{{ pod_network_cidr_ip }}' --apiserver-advertise-address='{{ manager_ip }}'"
+  changed_when: true
+  register: init_output
+  tags: init
+
+- name: Setup directory for Kubernetes environment for root
+  file:
+    path: "{{ k8s_root_directory }}"
+    state: directory
+    mode: "{{ k8s_root_directory_mode }}"
+  tags: init
+
+- name: Copy Kubernetes config for root
+  copy:
+    src: "{{ k8s_config_src }}"
+    dest: "{{ k8s_config_dest }}"
+    owner: root
+    group: root
+    mode: "{{ k8s_config_file_mode }}"
+    remote_src: yes
+  tags: init
+
+- name: Update the kubernetes config file permissions
+  shell: "chown $(id -u):$(id -g) '{{ k8s_config_dest }}'"
+  args:
+    warn: false
+  changed_when: true
+  tags: init
+
+- name: Cluster token
+  shell: >
+    set -o pipefail && \
+      kubeadm token list | cut -d ' ' -f1 | sed -n '2p'
+  changed_when: false
+  register: K8S_TOKEN
+  tags: init
+
+- name: CA Hash
+  shell: >
+    set -o pipefail && \
+      openssl x509 -pubkey -in {{ k8s_cert_path }} | openssl rsa -pubin -outform der 2>/dev/null | openssl dgst -sha256 -hex | sed 's/^.* //'
+  changed_when: false
+  register: K8S_MANAGER_CA_HASH
+  tags: init
+
+- name: Add K8S Manager IP, Token, and Hash to dummy host
+  add_host:
+    name:   "K8S_TOKEN_HOLDER"
+    token:  "{{ K8S_TOKEN.stdout }}"
+    hash:   "{{ K8S_MANAGER_CA_HASH.stdout }}"
+    ip:     "{{ manager_ip }}"
+  tags: init
+
+- name: Print k8s token
+  debug:
+    msg: "[Manager] K8S_TOKEN_HOLDER K8S token is {{ hostvars['K8S_TOKEN_HOLDER']['token'] }}"
+    verbosity: 2
+  tags: init
+
+- name: Print k8s hash
+  debug:
+    msg: "[Manager] K8S_TOKEN_HOLDER K8S Hash is  {{ hostvars['K8S_TOKEN_HOLDER']['hash'] }}"
+    verbosity: 2
+  tags: init
+
+- name: Print k8s manager_ip
+  debug:
+    msg: "[Manager] K8S_MANAGER_IP is  {{ manager_ip }}"
+    verbosity: 2
+  tags: init
+
+- name: Setup Calico SDN network
+  command: "kubectl apply -f '{{ calico_yml_url }}'"
+  changed_when: true
+  when: k8s_cni == "calico"
+  tags: init
+
+- name: Setup Flannel SDN network
+  command: "kubectl apply -f '{{ flannel_yml_url }}'"
+  changed_when: true
+  when: k8s_cni == "flannel"
+  tags: init
+
+- name: Create yaml repo for setup
+  file:
+    path: "{{ yaml_repo_dir_path }}"
+    state: directory
+    mode: "{{ yaml_repo_dir_mode }}"
+  tags: init
+
+- name: Create service account (K8s dashboard) files
+  copy:
+    src: create_admin_user.yaml
+    dest: "{{ k8s_service_account_file_dest }}"
+    owner: root
+    group: root
+    mode: "{{ k8s_service_account_file_mode }}"
+  tags: init
+
+- name: Create service account (K8s dashboard)
+  command: "kubectl create -f '{{ k8s_service_account_file_dest }}'"
+  changed_when: true
+  tags: init
+
+- name: Create clusterRoleBinding (K8s dashboard) files
+  copy:
+    src: create_clusterRoleBinding.yaml
+    dest: "{{ k8s_clusterRoleBinding_file_dest }}"
+    owner: root
+    group: root
+    mode: "{{ k8s_clusterRoleBinding_file_mode }}"
+  tags: init
+
+- name: Create clusterRoleBinding (K8s dashboard)
+  command: "kubectl create -f '{{ k8s_clusterRoleBinding_file_dest }}'"
+  changed_when: true
+  tags: init
+
+- name: Dump bearer token for K8s dashboard login
+  shell: >
+    set -o pipefail && \
+      kubectl -n kube-system describe secret $(kubectl -n kube-system get secret | grep admin-user | awk '{print $1}') > /root/k8s/token
+  changed_when: true
+  tags: init
+
+- name: Edge / Workstation install allows pods to schedule on the manager
+  command: kubectl taint nodes --all node-role.kubernetes.io/master-
+  when: single_node
+  tags: init

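One operational caveat: kubeadm bootstrap tokens expire after 24 hours by default, so the token captured into K8S_TOKEN_HOLDER is only good for same-day joins. A worker added later could be joined with a freshly generated command instead, sketched as:

    - name: Print a fresh kubeadm join command   # hypothetical recovery task, run on the manager
      command: kubeadm token create --print-join-command
      register: kubeadm_join_cmd
      changed_when: true
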
+ 52 - 0
roles/startmanager/vars/main.yml

@@ -0,0 +1,52 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+single_node: false
+
+manager_ip: 10.0.0.101
+
+k8s_cni: calico
+
+pod_network_cidr_ip: 10.244.0.0/16
+
+k8s_root_directory: /root/.kube
+
+k8s_root_directory_mode: 0755
+
+k8s_config_src: /etc/kubernetes/admin.conf
+
+k8s_config_dest: /root/.kube/config
+
+k8s_config_file_mode: 0644
+
+k8s_cert_path: /etc/kubernetes/pki/ca.crt
+
+k8s_dummy_hostname: K8S_TOKEN_HOLDER
+
+yaml_repo_dir_path: /root/k8s
+
+yaml_repo_dir_mode: 0755
+
+k8s_service_account_file_dest: /root/k8s/create_admin_user.yaml
+
+k8s_service_account_file_mode: 0655
+
+k8s_clusterRoleBinding_file_dest: /root/k8s/create_clusterRoleBinding.yaml
+
+k8s_clusterRoleBinding_file_mode: 0655
+
+calico_yml_url: https://docs.projectcalico.org/manifests/calico.yaml
+
+flannel_yml_url: https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml

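Because these values live in the role's vars/ directory rather than defaults/, they take precedence over inventory group_vars; site-specific settings such as manager_ip are therefore easiest to override as extra vars. A hypothetical override file, passed with ansible-playbook -e @site_overrides.yml:

    # site_overrides.yml -- hypothetical extra-vars file
    manager_ip: 192.168.1.10
    k8s_cni: flannel
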
+ 21 - 0
roles/startservices/files/metal-config.yaml

@@ -0,0 +1,21 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  namespace: metallb-system
+  name: config
+data:
+  config: |
+    address-pools:
+    - name: default
+      protocol: layer2
+      addresses:
+      - 192.168.2.150/32
+      - 192.168.2.151/32
+      - 192.168.2.152/32
+      - 192.168.2.153/32
+      - 192.168.2.154/32
+      - 192.168.2.155/32
+      - 192.168.2.156/32
+      - 192.168.2.157/32
+      - 192.168.2.158/32
+      - 192.168.2.159/32

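The ten /32 entries form one contiguous block; MetalLB's config also accepts a range form, which is equivalent here and easier to read:

    address-pools:
    - name: default
      protocol: layer2
      addresses:
      - 192.168.2.150-192.168.2.159   # same pool, range syntax
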
+ 225 - 0
roles/startservices/files/metallb.yaml

@@ -0,0 +1,225 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: metallb-system
+  labels:
+    app: metallb
+---
+
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  namespace: metallb-system
+  name: controller
+  labels:
+    app: metallb
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  namespace: metallb-system
+  name: speaker
+  labels:
+    app: metallb
+
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: metallb-system:controller
+  labels:
+    app: metallb
+rules:
+- apiGroups: [""]
+  resources: ["services"]
+  verbs: ["get", "list", "watch", "update"]
+- apiGroups: [""]
+  resources: ["services/status"]
+  verbs: ["update"]
+- apiGroups: [""]
+  resources: ["events"]
+  verbs: ["create", "patch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: metallb-system:speaker
+  labels:
+    app: metallb
+rules:
+- apiGroups: [""]
+  resources: ["services", "endpoints", "nodes"]
+  verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  namespace: metallb-system
+  name: config-watcher
+  labels:
+    app: metallb
+rules:
+- apiGroups: [""]
+  resources: ["configmaps"]
+  verbs: ["get", "list", "watch"]
+- apiGroups: [""]
+  resources: ["events"]
+  verbs: ["create"]
+---
+
+## Role bindings
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: metallb-system:controller
+  labels:
+    app: metallb
+subjects:
+- kind: ServiceAccount
+  name: controller
+  namespace: metallb-system
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: metallb-system:controller
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: metallb-system:speaker
+  labels:
+    app: metallb
+subjects:
+- kind: ServiceAccount
+  name: speaker
+  namespace: metallb-system
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: metallb-system:speaker
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  namespace: metallb-system
+  name: config-watcher
+  labels:
+    app: metallb
+subjects:
+- kind: ServiceAccount
+  name: controller
+- kind: ServiceAccount
+  name: speaker
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: config-watcher
+---
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  namespace: metallb-system
+  name: speaker
+  labels:
+    app: metallb
+    component: speaker
+spec:
+  selector:
+    matchLabels:
+      app: metallb
+      component: speaker
+  template:
+    metadata:
+      labels:
+        app: metallb
+        component: speaker
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "7472"
+    spec:
+      serviceAccountName: speaker
+      terminationGracePeriodSeconds: 0
+      hostNetwork: true
+      containers:
+      - name: speaker
+        image: metallb/speaker:v0.7.3
+        imagePullPolicy: IfNotPresent
+        args:
+        - --port=7472
+        - --config=config
+        env:
+        - name: METALLB_NODE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: spec.nodeName
+        ports:
+        - name: monitoring
+          containerPort: 7472
+        resources:
+          limits:
+            cpu: 100m
+            memory: 100Mi
+          
+        securityContext:
+          allowPrivilegeEscalation: false
+          readOnlyRootFilesystem: true
+          capabilities:
+            drop:
+            - all
+            add:
+            - net_raw
+
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  namespace: metallb-system
+  name: controller
+  labels:
+    app: metallb
+    component: controller
+spec:
+  revisionHistoryLimit: 3
+  selector:
+    matchLabels:
+      app: metallb
+      component: controller
+  template:
+    metadata:
+      labels:
+        app: metallb
+        component: controller
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "7472"
+    spec:
+      serviceAccountName: controller
+      terminationGracePeriodSeconds: 0
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 65534 # nobody
+      containers:
+      - name: controller
+        image: metallb/controller:v0.7.3
+        imagePullPolicy: IfNotPresent
+        args:
+        - --port=7472
+        - --config=config
+        ports:
+        - name: monitoring
+          containerPort: 7472
+        resources:
+          limits:
+            cpu: 100m
+            memory: 100Mi
+          
+        securityContext:
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop:
+            - all
+          readOnlyRootFilesystem: true
+
+---
+
+

+ 121 - 0
roles/startservices/tasks/main.yml

@@ -0,0 +1,121 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Wait for CoreDNS to restart
+  command: kubectl rollout status deployment/coredns -n kube-system
+  changed_when: false
+  ignore_errors: True
+  tags: init
+
+- name: Deploy MetalLB from upstream manifest
+  command: "kubectl apply -f '{{ metallb_yaml_url }}'"
+  changed_when: true
+  tags: init
+
+- name: Create MetalLB Setup Config Files
+  copy:
+    src: metal-config.yaml
+    dest: "{{ metallb_config_file_dest }}"
+    owner: root
+    group: root
+    mode: "{{ metallb_config_file_mode }}"
+  tags: init
+
+- name: Create MetalLB Setup Deployment Files
+  copy:
+    src: metallb.yaml
+    dest: "{{ metallb_deployment_file_dest }}"
+    owner: root
+    group: root
+    mode: "{{ metallb_deployment_file_mode }}"
+  tags: init
+
+- name: Deploy MetalLB from local deployment file
+  command: "kubectl apply -f '{{ metallb_deployment_file_dest }}'"
+  changed_when: true
+  tags: init
+
+- name: Create default setup for MetalLB
+  command: "kubectl apply -f '{{ metallb_config_file_dest }}'"
+  changed_when: true
+  tags: init
+
+- name: Start k8s dashboard
+  command: "kubectl create -f '{{ k8s_dashboard_yaml_url }}'"
+  changed_when: true
+  register: result
+  tags: init
+
+- name: Helm - add stable repo
+  command: "helm repo add stable '{{ helm_stable_repo_url }}'"
+  changed_when: true
+  tags: init
+
+- name: Helm - add Nvidia k8s-device-plugin (nvdp) repo
+  command: "helm repo add nvdp '{{ nvidia_k8s_device_plugin_repo_url }}'"
+  changed_when: true
+  tags: init
+
+- name: Helm - add Nvidia GPU discovery (nvgfd) repo
+  command: "helm repo add nvgfd '{{ nvidia_gpu_discovery_repo_url }}'"
+  changed_when: true
+  tags: init
+
+- name: Helm - update repo
+  command: helm repo update
+  changed_when: true
+  tags: init
+
+- name: Start NFS Client Provisioner
+  command: "helm install stable/nfs-client-provisioner --set nfs.server='{{ nfs_server }}' --set nfs.path='{{ nfs_path }}' --generate-name"
+  changed_when: true
+  register: result
+  tags: init
+
+- name: Set NFS-Client Provisioner as DEFAULT StorageClass
+  shell: >
+    kubectl patch storageclasses.storage.k8s.io nfs-client \
+    -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'
+  changed_when: true
+  tags: init
+
+- name: Prometheus deployment
+  command: >
+    helm install stable/prometheus \
+    --set alertmanager.persistentVolume.storageClass=nfs-client,server.persistentVolume.storageClass=nfs-client,server.service.type=LoadBalancer \
+    --generate-name
+  changed_when: true
+  tags: init
+
+- name: Install MPI Operator
+  command: "kubectl create -f '{{ mpi_operator_yaml_url }}'"
+  changed_when: true
+  tags: init
+
+- name: Install nvidia-device-plugin
+  command: "helm install --version='{{ nvidia_device_plugin_version }}' --generate-name --set migStrategy='{{ mig_strategy }}' nvdp/nvidia-device-plugin"
+  changed_when: true
+  tags: init
+
+- name: Install GPU Feature Discovery
+  command: "helm install --version='{{ gpu_feature_discovery_version }}' --generate-name --set migStrategy='{{ mig_strategy }}' nvgfd/gpu-feature-discovery"
+  changed_when: true
+  tags: init
+
+- name: Deploy Xilinx Device plugin
+  command: "kubectl create -f '{{ fpga_device_plugin_yaml_url }}'"
+  changed_when: true
+  register: fpga_enable
+  tags: init

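After the default-StorageClass patch above, nfs-client should show up annotated as "(default)"; claims stuck in Pending at this point usually trace back to the NFS provisioner settings. A final check, sketched as before:

    - name: Confirm nfs-client is the default StorageClass   # hypothetical check task
      command: kubectl get storageclass
      register: sc_list
      changed_when: false
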
+ 47 - 0
roles/startservices/vars/main.yml

@@ -0,0 +1,47 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+metallb_config_file_dest: /root/k8s/metal-config.yaml
+
+metallb_config_file_mode: 0655
+
+metallb_deployment_file_dest: /root/k8s/metallb.yaml
+
+metallb_deployment_file_mode: 0655
+
+metallb_yaml_url: https://raw.githubusercontent.com/google/metallb/v0.8.1/manifests/metallb.yaml
+
+k8s_dashboard_yaml_url: https://raw.githubusercontent.com/kubernetes/dashboard/v2.0.0/aio/deploy/recommended.yaml
+
+helm_stable_repo_url: https://charts.helm.sh/stable
+
+#nfs_server: "{{ ansible_host }}"
+nfs_server: 10.0.0.1
+
+nfs_path: /home/k8snfs
+
+mpi_operator_yaml_url: https://raw.githubusercontent.com/kubeflow/mpi-operator/master/deploy/v1alpha2/mpi-operator.yaml
+
+nvidia_k8s_device_plugin_repo_url: https://nvidia.github.io/k8s-device-plugin
+
+nvidia_gpu_discovery_repo_url: https://nvidia.github.io/gpu-feature-discovery
+
+nvidia_device_plugin_version: 0.7.0
+
+mig_strategy: none
+
+gpu_feature_discovery_version: 0.2.0
+
+fpga_device_plugin_yaml_url: https://raw.githubusercontent.com/Xilinx/FPGA_as_a_Service/master/k8s-fpga-device-plugin/fpga-device-plugin.yml

+ 27 - 0
roles/startworkers/tasks/main.yml

@@ -0,0 +1,27 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Disable swap (if not already disabled)
+  command: /usr/sbin/swapoff -a
+  changed_when: true
+  tags: init
+
+- name: Execute kubeadm join command
+  shell: >
+    kubeadm join --token={{ hostvars['K8S_TOKEN_HOLDER']['token'] }}
+    --discovery-token-ca-cert-hash sha256:{{ hostvars['K8S_TOKEN_HOLDER']['hash'] }}
+    {{ hostvars['K8S_TOKEN_HOLDER']['ip'] }}:{{ apiserver_bind_port }}
+  when: not single_node
+  tags: init

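The join consumes the token, hash, and IP that the startmanager role stashed on the K8S_TOKEN_HOLDER dummy host, so that role must run first within the same playbook. To confirm the worker registered, a check delegated to the manager could look like this (a sketch; assumes an inventory group named manager):

    - name: Verify the node joined the cluster   # hypothetical check task
      command: kubectl get nodes
      register: node_list
      changed_when: false
      delegate_to: "{{ groups['manager'][0] }}"
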
+ 18 - 0
roles/startworkers/vars/main.yml

@@ -0,0 +1,18 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+single_node: false
+
+apiserver_bind_port: 6443