
Merge pull request #167 from j0hnL/refactor

Refactor
Lucas A. Wilson · 4 years ago · commit f03772004b
77 changed files with 282 additions and 363 deletions
  1. docs/INSTALL.md (+6 -6)
  2. kubernetes/host_inventory_file (+2 -8)
  3. examples/host_inventory_file.ini (+13 -0)
  4. kubernetes/kubernetes.yml (+0 -60)
  5. kubernetes/roles/common/handlers/main.yml (+0 -21)
  6. omnia.yml (+71 -8)
  7. platforms/jupyterhub.yml (+0 -0)
  8. platforms/kubeflow.yml (+0 -0)
  9. platforms/roles/jupyterhub/files/jupyter_config.yaml (+0 -0)
  10. platforms/roles/jupyterhub/tasks/main.yml (+0 -0)
  11. platforms/roles/jupyterhub/vars/main.yml (+0 -0)
  12. platforms/roles/kubeflow/tasks/main.yml (+0 -0)
  13. platforms/roles/kubeflow/vars/main.yml (+0 -0)
  14. roles/common/files/k8s.conf (+0 -0)
  15. roles/common/files/kubernetes.repo (+0 -0)
  16. roles/common/handlers/main.yml (+35 -0)
  17. kubernetes/roles/common/tasks/main.yml (+12 -6)
  18. slurm/roles/common/tasks/ntp.yml (+23 -23)
  19. roles/common/templates/chrony.conf.j2 (+0 -0)
  20. roles/common/templates/ntp.conf.j2 (+0 -0)
  21. kubernetes/roles/common/vars/main.yml (+20 -2)
  22. roles/compute_gpu/files/daemon.json (+0 -0)
  23. roles/compute_gpu/files/k8s.conf (+0 -0)
  24. roles/compute_gpu/files/kubernetes.repo (+0 -0)
  25. kubernetes/roles/compute_gpu/tasks/main.yml (+13 -4)
  26. kubernetes/roles/compute_gpu/vars/main.yml (+5 -1)
  27. kubernetes/roles/firewalld/tasks/main.yml (+8 -1)
  28. roles/firewalld/vars/main.yml (+0 -0)
  29. roles/manager/files/k8s.conf (+0 -0)
  30. roles/manager/files/kubernetes.repo (+0 -0)
  31. roles/manager/tasks/main.yml (+0 -0)
  32. roles/manager/vars/main.yml (+0 -0)
  33. roles/slurm_common/files/munge.key (+0 -0)
  34. roles/slurm_common/files/slurm.conf (+0 -0)
  35. slurm/roles/slurm_common/tasks/main.yml (+31 -12)
  36. slurm/roles/slurm_common/vars/main.yml (+2 -1)
  37. roles/slurm_manager/files/slurmdbd.conf (+0 -0)
  38. slurm/roles/slurm_manager/tasks/main.yml (+22 -33)
  39. slurm/roles/slurm_manager/vars/main.yml (+1 -1)
  40. roles/slurm_start_services/tasks/main.yml (+0 -0)
  41. slurm/roles/start_slurm_workers/tasks/main.yml (+3 -4)
  42. roles/startmanager/files/create_admin_user.yaml (+0 -0)
  43. roles/startmanager/files/create_clusterRoleBinding.yaml (+0 -0)
  44. roles/startmanager/files/data-pv.yaml (+0 -0)
  45. roles/startmanager/files/data2-pv.yaml (+0 -0)
  46. roles/startmanager/files/data3-pv.yaml (+0 -0)
  47. roles/startmanager/files/data4-pv.yaml (+0 -0)
  48. roles/startmanager/files/flannel_net.sh (+0 -0)
  49. roles/startmanager/files/katib-pv.yaml (+0 -0)
  50. roles/startmanager/files/kube-flannel.yaml (+0 -0)
  51. roles/startmanager/files/kubeflow_persistent_volumes.yaml (+0 -0)
  52. roles/startmanager/files/minio-pvc.yaml (+0 -0)
  53. roles/startmanager/files/mysql-pv.yaml (+0 -0)
  54. roles/startmanager/files/nfs-class.yaml (+0 -0)
  55. roles/startmanager/files/nfs-deployment.yaml (+0 -0)
  56. roles/startmanager/files/nfs-serviceaccount.yaml (+0 -0)
  57. roles/startmanager/files/nfs_clusterrole.yaml (+0 -0)
  58. roles/startmanager/files/nfs_clusterrolebinding.yaml (+0 -0)
  59. roles/startmanager/files/notebook-pv.yaml (+0 -0)
  60. roles/startmanager/files/persistent_volumes.yaml (+0 -0)
  61. roles/startmanager/files/pvc.yaml (+0 -0)
  62. roles/startmanager/files/tiller_config.sh (+0 -0)
  63. kubernetes/roles/startmanager/tasks/main.yml (+9 -13)
  64. kubernetes/roles/startmanager/vars/main.yml (+1 -3)
  65. roles/startservices/files/metal-config.yaml (+0 -0)
  66. roles/startservices/files/metallb.yaml (+0 -0)
  67. roles/startservices/tasks/main.yml (+0 -0)
  68. kubernetes/roles/startservices/vars/main.yml (+5 -4)
  69. roles/startworkers/tasks/main.yml (+0 -0)
  70. roles/startworkers/vars/main.yml (+0 -0)
  71. slurm/roles/common/README.md (+0 -2)
  72. slurm/roles/common/handlers/main.yml (+0 -40)
  73. slurm/roles/common/tasks/main.yml (+0 -17)
  74. slurm/roles/common/vars/main.yml (+0 -31)
  75. slurm/slurm.yml (+0 -44)
  76. slurm/slurm_inventory_file (+0 -18)
  77. tools/scuttle (+0 -0)
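
At a high level, this refactor appears to fold the separate kubernetes/kubernetes.yml and slurm/slurm.yml playbooks into a single tag-driven omnia.yml, move the JupyterHub and Kubeflow plays under platforms/, and lift the shared role directories to the repository root. Going by the updated docs/INSTALL.md below, the new entry points look roughly like this (the inventory path is illustrative):
```
# Kubernetes only
ansible-playbook -i examples/host_inventory_file.ini omnia.yml --skip-tags "slurm"

# Slurm only
ansible-playbook -i examples/host_inventory_file.ini omnia.yml --skip-tags "k8s"
```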

+ 6 - 6
docs/INSTALL.md

@@ -1,24 +1,24 @@
 ## TL;DR Installation
  
-### Kubernetes
+### Kubernetes Only
 Install Kubernetes and all dependencies
 ```
-ansible-playbook -i host_inventory_file kubernetes/kubernetes.yml
+ansible-playbook -i host_inventory_file omnia.yml --skip-tags "slurm"
 ```
 
 Initialize K8s cluster
 ```
-ansible-playbook -i host_inventory_file kubernetes/kubernetes.yml --tags "init"
+ansible-playbook -i host_inventory_file omnia.yml --tags "init"
 ```
 
 ### Install Kubeflow 
 ```
-ansible-playbook -i host_inventory_file kubernetes/kubeflow.yaml
+ansible-playbook -i host_inventory_file platform/kubeflow.yaml
 ```
 
-### Slurm
+### Slurm Only
 ```
-ansible-playbook -i host_inventory_file slurm/slurm.yml
+ansible-playbook -i host_inventory_file omnia.yml --skip-tags "k8s"
 ```
 
 # Omnia  

+ 2 - 8
kubernetes/host_inventory_file

@@ -9,15 +9,9 @@ all:
           children:
             compute:
               hosts:
-                compute003:
+                compute001:
             gpus:
               hosts:
                 compute002:
+                compute003:
                 compute004:
-                #compute005:
-      vars:
-        single_node: false
-        manager_ip: 10.0.0.100
-        nfs_server: 10.0.0.100
-        nfs_path: /work
-        MIG_STRATEGY: none

+ 13 - 0
examples/host_inventory_file.ini

@@ -0,0 +1,13 @@
+[manager]
+friday
+
+[compute]
+compute000
+compute[002:005]
+
+[workers:children]
+compute
+
+[cluster:children]
+manager
+workers
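
The new INI-style inventory groups the compute hosts under workers and everything under cluster; a quick, purely illustrative way to confirm how those groups resolve is Ansible's host-pattern listing:
```
# List the hosts Ansible would target for each group (no changes are made)
ansible -i examples/host_inventory_file.ini workers --list-hosts
ansible -i examples/host_inventory_file.ini cluster --list-hosts
```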

+ 0 - 60
kubernetes/kubernetes.yml

@@ -1,60 +0,0 @@
-#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
----
-#Playbook for kubernetes cluster
-
-- name: Gather facts from all the nodes
-  hosts: all
-
-- name: Apply common installation and config
-  hosts: manager, compute
-  gather_facts: false
-  roles:
-    - common
-
-- name: Apply GPU node config
-  hosts: compute
-  gather_facts: false
-  roles:
-    - compute_gpu
-
-- name: Apply manager config
-  hosts: manager
-  gather_facts: false
-  roles:
-    - manager
-
-- name: Apply firewalld config on manager and compute nodes
-  hosts: manager, compute
-  gather_facts: false
-  roles:
-    - firewalld
-
-- name: Start K8s on manager server
-  hosts: manager
-  gather_facts: false
-  roles:
-    - startmanager
-
-- name: Start K8s worker servers on compute nodes
-  hosts: compute
-  gather_facts: false
-  roles:
-    - startworkers
-
-- name: Start K8s worker servers on manager nodes
-  hosts: manager
-  gather_facts: false
-  roles:
-    - startservices

+ 0 - 21
kubernetes/roles/common/handlers/main.yml

@@ -1,21 +0,0 @@
----
-
-#- name: Enable docker service
-  #service:
-    #name: docker
-    #enabled: yes
-#
-- name: Start and Enable docker service
-  service:
-    name: docker
-    state: restarted
-    enabled: yes
-  #tags: install
-
-- name: Start and Enable Kubernetes - kubelet
-  service:
-    name: kubelet
-    state: started
-    enabled: yes
-  #tags: install
-

+ 71 - 8
omnia.yml

@@ -15,12 +15,75 @@
 
 # Omnia playbook. Will be updated later.
 
-- name: omnia
-  hosts: localhost
-  connection: local
-  gather_facts: no
-  tasks:
-    - name: Hello
-      debug:
-        msg: "Hello omnia.yml"
+- name: Gather facts from all the nodes
+  hosts: all
 
+- name: Apply common installation and config
+  hosts: manager, compute
+  gather_facts: false
+  roles:
+    - common
+ 
+- name: Apply GPU node config
+  hosts: gpus
+  gather_facts: false
+  roles:
+    - compute_gpu
+
+- name: Apply K8s manager config
+  hosts: manager
+  gather_facts: true
+  roles:
+    - manager
+
+- name: Apply K8s firewalld config on manager and compute nodes
+  hosts: manager, compute
+  gather_facts: false
+  roles:
+    - firewalld
+
+- name: Start K8s on manager server
+  hosts: manager
+  gather_facts: true
+  roles:
+    - startmanager
+
+- name: Start K8s worker servers on compute nodes
+  hosts: compute
+  gather_facts: false
+  roles:
+    - startworkers
+
+- name: Start K8s worker servers on manager nodes
+  hosts: manager
+  gather_facts: false
+  roles:
+    - startservices
+
+- name: Apply SLURM manager config
+  hosts: manager
+  gather_facts: false
+  roles:
+    - slurm_manager
+  tags: slurm
+
+- name: Apply common Slurm installation and config
+  hosts: manager, compute
+  gather_facts: false
+  roles:
+    - slurm_common
+  tags: slurm
+
+- name: Start slurm workers
+  hosts: compute
+  gather_facts: false
+  roles:
+    - start_slurm_workers
+  tags: slurm
+
+- name: Start Slurm services
+  hosts: manager
+  gather_facts: false
+  roles:
+    - slurm_start_services
+  tags: slurm
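
Because the Slurm plays are now selected by tag, a convenient way to preview what a tag filter will actually run, without executing anything, is the task listing mode; for example (inventory path illustrative):
```
# Show only the Slurm-related plays and tasks from the unified playbook
ansible-playbook -i examples/host_inventory_file.ini omnia.yml --tags "slurm" --list-tasks
```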

kubernetes/jupyterhub.yml → platforms/jupyterhub.yml


kubernetes/kubeflow.yml → platforms/kubeflow.yml


kubernetes/roles/jupyterhub/files/jupyter_config.yaml → platforms/roles/jupyterhub/files/jupyter_config.yaml


kubernetes/roles/jupyterhub/tasks/main.yml → platforms/roles/jupyterhub/tasks/main.yml


kubernetes/roles/jupyterhub/vars/main.yml → platforms/roles/jupyterhub/vars/main.yml


kubernetes/roles/kubeflow/tasks/main.yml → platforms/roles/kubeflow/tasks/main.yml


kubernetes/roles/kubeflow/vars/main.yml → platforms/roles/kubeflow/vars/main.yml


kubernetes/roles/common/files/k8s.conf → roles/common/files/k8s.conf


kubernetes/roles/common/files/kubernetes.repo → roles/common/files/kubernetes.repo


+ 35 - 0
roles/common/handlers/main.yml

@@ -0,0 +1,35 @@
+---
+
+- name: Start and Enable docker service
+  service:
+    name: docker
+    state: restarted
+    enabled: yes
+  #tags: install
+
+- name: Start and Enable Kubernetes - kubelet
+  service:
+    name: kubelet
+    state: started
+    enabled: yes
+  #tags: install
+
+- name: Restart chrony
+  service:
+    name: chronyd
+    state: restarted
+    enabled: yes
+
+- name: Sync tp clocks
+  command: ntpdc -np
+  register: ntp_clock
+  until:  ntp_clock.stdout.find('*') > -1
+  retries: "{{ retry_count_one }}"
+  delay: "{{ delay_count_one }}"
+
+- name: Sync chrony sources
+  command: chronyc sources
+  register: chrony_src
+  until:  chrony_src.stdout.find('^*') > -1
+  retries: "{{ retry_count }}"
+  delay: "{{ delay_count }}"
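
The new chrony handlers poll until chronyc sources reports a selected source, i.e. a line containing ^*, retrying per the retry_count/delay_count vars. A rough manual equivalent of that check, handy when a node never syncs, might be:
```
# Poll until chrony has selected a time source; the handler looks for '^*' in the output
until chronyc sources | grep -qF '^*'; do
  sleep 10   # delay_count is 10 in the role vars; the handler gives up after retry_count attempts
done
```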

+ 12 - 6
kubernetes/roles/common/tasks/main.yml

@@ -66,15 +66,18 @@
 
 - name: Install common packages
   package:
-    name: "{{ item }}"
+    name: "{{ common_packages }}"
+    state: present
+  tags: install
+
+- name: Install k8s packages
+  package:
+    name: "{{ k8s_packages }}"
     state: present
-  with_items:
-    - "{{ common_packages }}"
-    - "{{ k8s_packages }}"
   tags: install
 
 - name: Versionlock kubernetes
-  command: "dnf versionlock '{{ item }}'"
+  command: "yum versionlock '{{ item }}'"
   args:
     warn: false
   with_items:
@@ -100,4 +103,7 @@
     name: kubelet
     state: restarted
     enabled: yes
-  tags: install
+
+- name: Deploy time ntp/chrony
+  include_tasks: ntp.yml
+  tags: install
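
Note that the versionlock step now shells out to yum rather than dnf. Assuming the yum-plugin-versionlock plugin is installed, the manual equivalent for the pinned Kubernetes packages would be along these lines:
```
# Pin a Kubernetes package so routine updates cannot move it (requires yum-plugin-versionlock);
# repeat for each entry in k8s_packages, e.g. the kubelet pin from the role vars:
yum versionlock 'kubelet-1.16.7'
```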

+ 23 - 23
slurm/roles/common/tasks/ntp.yml

@@ -13,28 +13,28 @@
 #  limitations under the License.
 ---
 
-  - name: Deploy ntp servers
-    block:
-      - name: Deploy ntpd
-        package:
-          name: ntp
-          state: present
-      - name: Deploy ntpdate
-        package:
-          name: ntpdate
-          state: present
-      - name: Update ntp servers
-        template:
-          src: ntp.conf.j2
-          dest: "{{ ntp_path }}"
-          owner: root
-          group: root
-          mode: "{{ ntp_mode }}"
-          backup: yes
-        notify:
-          - restart ntpd
-          - sync ntp clocks
-    when:  ( ansible_distribution == "CentOS" or   ansible_distribution == "RedHat" ) and ansible_distribution_major_version  < os_higher_version
+#- name: Deploy ntp servers
+#block:
+#- name: Deploy ntpd
+#package:
+#name: ntp
+#state: present
+#- name: Deploy ntpdate
+#package:
+#name: ntpdate
+#state: present
+#- name: Update ntp servers
+#template:
+#src: ntp.conf.j2
+#dest: "{{ ntp_path }}"
+#owner: root
+#group: root
+#mode: "{{ ntp_mode }}"
+          #backup: yes
+          #notify:
+          #- restart ntpd
+            #- sync ntp clocks
+            #when:  ( ansible_distribution == "CentOS" or   ansible_distribution == "RedHat" ) and ansible_distribution_major_version  < os_higher_version
 
   - name: Deploy chrony server
     block:
@@ -53,4 +53,4 @@
         notify:
           - restart chrony
           - sync chrony sources
-    when:  ( ansible_distribution == "CentOS" or   ansible_distribution == "RedHat" ) and ansible_distribution_major_version  > os_version
+    when:  ( ansible_distribution == "CentOS" or   ansible_distribution == "RedHat" ) and ansible_distribution_major_version  > os_version

slurm/roles/common/templates/chrony.conf.j2 → roles/common/templates/chrony.conf.j2


slurm/roles/common/templates/ntp.conf.j2 → roles/common/templates/ntp.conf.j2


+ 20 - 2
kubernetes/roles/common/vars/main.yml

@@ -22,6 +22,7 @@ common_packages:
   - docker-ce
   - bash-completion
   - nvidia-detect
+  - chrony
 
 k8s_packages:
   - kubelet-1.16.7
@@ -32,7 +33,7 @@ k8s_repo_dest: /etc/yum.repos.d/
 
 elrepo_gpg_key_url: https://www.elrepo.org/RPM-GPG-KEY-elrepo.org
 
-elrepo_rpm_url: https://www.elrepo.org/elrepo-release-8.el8.elrepo.noarch.rpm
+elrepo_rpm_url: https://www.elrepo.org/elrepo-release-7.el7.elrepo.noarch.rpm
 
 docker_repo_url: https://download.docker.com/linux/centos/docker-ce.repo
 
@@ -42,4 +43,21 @@ k8s_conf_dest: /etc/sysctl.d/
 
 k8s_repo_file_mode: 0644
 
-k8s_conf_file_mode: 0644
+k8s_conf_file_mode: 0644
+
+chrony_path: "/etc/chrony.conf"
+ntp_path: "/etc/ntp.conf"
+ntp_mode: "0644"
+os_higher_version: "8"
+os_version: "7"
+retry_count_one: "10"
+delay_count_one: "60"
+retry_count: "6"
+delay_count: "10"
+
+ntp_servers: 
+  - 0.centos.pool.ntp.org
+  - 1.centos.pool.ntp.org
+  - 2.centos.pool.ntp.org
+chrony_servers:
+  - 2.centos.pool.ntp.org

kubernetes/roles/compute_gpu/files/daemon.json → roles/compute_gpu/files/daemon.json


kubernetes/roles/compute_gpu/files/k8s.conf → roles/compute_gpu/files/k8s.conf


kubernetes/roles/compute_gpu/files/kubernetes.repo → roles/compute_gpu/files/kubernetes.repo


+ 13 - 4
kubernetes/roles/compute_gpu/tasks/main.yml

@@ -19,12 +19,21 @@
     dest: "{{ nvidia_docker_repo_dest }}"
   tags: install, testing
 
+- name: Add libnvidia container Repo
+  get_url:
+    url: "{{ nvidia_container_repo_url }}"
+    dest: "{{ nvidia_container_repo_dest }}"
+  tags: install, testing
+
 - name: Install nvidia driver and nvidia-docker2
   package:
-    name: "{{ item }}"
+    name: "{{ nvidia_packages }}"
+    enablerepo: libnvidia-container,nvidia-docker
     state: present
-  with_items:
-    - "{{ nvidia_packages }}"
+  tags: install
+
+- name: Reboot after installing GPU drivers
+  reboot:
   tags: install
 
 - name: Set nvidia as default runtime
@@ -49,4 +58,4 @@
     name: kubelet
     state: restarted
     enabled: yes
-  tags: install
+  tags: install
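
For reference, a rough shell equivalent of the updated GPU flow (enable the two NVIDIA repos added above, install the packages listed in nvidia_packages, then reboot) would be:
```
# Repo IDs match the enablerepo list in the task; package names come from nvidia_packages
yum install -y --enablerepo=libnvidia-container,nvidia-docker kmod-nvidia nvidia-docker2
reboot   # the play now reboots the node right after the driver install
```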

+ 5 - 1
kubernetes/roles/compute_gpu/vars/main.yml

@@ -17,10 +17,14 @@ nvidia_docker_repo_url: https://nvidia.github.io/nvidia-docker/centos7/nvidia-do
 
 nvidia_docker_repo_dest: /etc/yum.repos.d/nvidia-docker.repo
 
+nvidia_container_repo_url: https://nvidia.github.io/libnvidia-container/centos7/libnvidia-container.repo 
+
+nvidia_container_repo_dest: /etc/yum.repos.d/libnvidia-container.repo
+
 nvidia_packages:
   - kmod-nvidia
   - nvidia-docker2
 
 daemon_file_dest: /etc/docker/
 
-daemon_file_mode: 0644
+daemon_file_mode: 0644

+ 8 - 1
kubernetes/roles/firewalld/tasks/main.yml

@@ -74,4 +74,11 @@
 - name: Reload firewalld
   command: firewall-cmd --reload
   changed_when: true
-  tags: firewalld
+  tags: firewalld
+
+- name: Stop and disable firewalld
+  service:
+    name: firewalld
+    state: stopped
+    enabled: no
+  tags: firewalld

kubernetes/roles/firewalld/vars/main.yml → roles/firewalld/vars/main.yml


kubernetes/roles/manager/files/k8s.conf → roles/manager/files/k8s.conf


kubernetes/roles/manager/files/kubernetes.repo → roles/manager/files/kubernetes.repo


kubernetes/roles/manager/tasks/main.yml → roles/manager/tasks/main.yml


kubernetes/roles/manager/vars/main.yml → roles/manager/vars/main.yml


slurm/roles/slurm_common/files/munge.key → roles/slurm_common/files/munge.key


slurm/roles/slurm_common/files/slurm.conf → roles/slurm_common/files/slurm.conf


+ 31 - 12
slurm/roles/slurm_common/tasks/main.yml

@@ -22,22 +22,29 @@
 - name: Munge installation
   package:
     name: munge-devel
-    enablerepo: PowerTools
     state: present
+  tags: install
 
 - name: Install packages for slurm
   package:
-    name: "{{ item }}"
+    name: "{{ common_packages }}"
     state: present
-  with_items:
-    - "{{ common_packages }}"
   tags: install
 
-- name: Create munge key
+- name: pip upgrade pip
+  pip:
+    name: pip
+    executable: pip3
+    extra_args: --upgrade
+    state: latest
+  tags: install
+
+- name: create munge key
   command: "{{ munge_cmd }}"
   changed_when: true
+  tags: install
 
-- name: Copy munge key
+- name: copy munge key
   copy:
     src: munge.key
     dest: "{{ munge_dest }}"
@@ -46,66 +53,75 @@
     mode: "{{ munge_mode }}"
   tags: install
 
-- name: Slurm configuration - slurm.conf
+- name: slurm configuration - slurm.conf
   copy:
     src: slurm.conf
     dest: "{{ slurm_dest }}"
     mode: "{{ slurm_mode }}"
   tags: install
 
-- name: Add cluster name
+- name: add cluster name
   lineinfile:
     path: "{{ slurm_confpth }}"
-    regexp: "ClusterName="
-    line: "ClusterName={{ cluster_name }}"
+    regexp: "clustername="
+    line: "clustername={{ cluster_name }}"
+  tags: install
 
-- name: Add slurm user name
+- name: add slurm user name
   lineinfile:
     path: "{{ slurm_confpth }}"
     regexp: "SlurmUser="
     line: "SlurmUser={{ slurm_user }}"
+  tags: install
 
 - name: Add slurmctld port no
   lineinfile:
     path: "{{ slurm_confpth }}"
     regexp: "SlurmctldPort="
     line: "SlurmctldPort={{ slurmctld_port }}"
+  tags: install
 
 - name: Add slurmd port no
   lineinfile:
     path: "{{ slurm_confpth }}"
     regexp: "SlurmdPort="
     line: "SlurmdPort={{ slurmd_port }}"
+  tags: install
 
 - name: Add spool path
   lineinfile:
     path: "{{ slurm_confpth }}"
     regexp: "SlurmdSpoolDir="
     line: "SlurmdSpoolDir={{ spool_pth }}"
+  tags: install
 
 - name: Add slurmctld pid file path
   lineinfile:
     path: "{{ slurm_confpth }}"
     regexp: "SlurmctldPidFile="
     line: "SlurmctldPidFile={{ slurmctld_pid }}"
+  tags: install
 
 - name: Add slurmd pid file path
   lineinfile:
     path: "{{ slurm_confpth }}"
     regexp: "SlurmdPidFile="
     line: "SlurmdPidFile={{ slurmd_pid }}"
+  tags: install
 
 - name: Add slurmctld log file path
   lineinfile:
     path: "{{ slurm_confpth }}"
     regexp: "SlurmctldLogFile="
     line: "SlurmctldLogFile={{ slurmctld_log }}"
+  tags: install
 
 - name: Add slurmd log file path
   lineinfile:
     path: "{{ slurm_confpth }}"
     regexp: "SlurmdLogFile="
     line: "SlurmdLogFile={{ slurmd_log }}"
+  tags: install
 
 - name: Create slurm group
   group:
@@ -139,6 +155,7 @@
     state: directory
     mode: "{{ gen_mode }}"
     recurse: yes
+  tags: install
 
 - name: Give slurm user permission to slurmctld
   file:
@@ -147,6 +164,7 @@
     group: slurm
     mode: "{{ gen_mode }}"
     state: touch
+  tags: install
 
 - name: Give slurm user permission to slurmd
   file:
@@ -155,10 +173,11 @@
     group: slurm
     mode: "{{ gen_mode }}"
     state: touch
+  tags: install
 
 - name: Start munge service
   service:
     name: munge
     state: restarted
     enabled: yes
-  tags: install
+  tags: install
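
Two of the new steps are easy to reproduce by hand when debugging a node: the pip self-upgrade and the munge key generation (the latter command is munge_cmd in the role vars):
```
pip3 install --upgrade pip        # the "pip upgrade pip" task
/usr/sbin/create-munge-key -f     # the "create munge key" task
```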

+ 2 - 1
slurm/roles/slurm_common/vars/main.yml

@@ -21,6 +21,7 @@ common_packages:
    - mariadb-server
    - mariadb-devel
    - python3
+   - python-pip
 
 munge_dest: "/etc/munge/"
 munge_cmd: "/usr/sbin/create-munge-key -f"
@@ -39,4 +40,4 @@ slurmctld_pid: "/var/run/slurmctld.pid"
 slurmd_pid: "/var/run/slurmd.pid"
 cluster_name : "manager,compute"
 slurmctld_log: "/var/log/slurm/slurmctld.log"
-slurmd_log: "/var/log/slurm/slurmd.log"
+slurmd_log: "/var/log/slurm/slurmd.log"

slurm/roles/slurm_manager/files/slurmdbd.conf → roles/slurm_manager/files/slurmdbd.conf


+ 22 - 33
slurm/roles/slurm_manager/tasks/main.yml

@@ -13,22 +13,17 @@
 #  limitations under the License.
 ---
 
-- name: Install packages for slurm
-  package:
-    name: "{{ item }}"
-    state: present
-  with_items:
-    - "{{ slurm_packages }}"
-  tags: install
-
-- name: Install development tools
-  package:
-    name: "{{ item }}"
-    enablerepo: PowerTools
-    state: present
-  with_items:
-    - "{{ dev_tools }}"
-  tags: install
+#- name: Install packages for slurm
+# package:
+#   name: "{{ slurm_packages }}"
+#   state: present
+# tags: install
+
+#- name: Install development tools
+# package:
+#   name: "{{ dev_tools }}"
+#   state: present
+# tags: install
 
 - name: Create temporary download folder for slurm
   file:
@@ -45,28 +40,28 @@
     checksum: "{{ slurm_md5 }}"
     validate_certs: no
   tags: install
-
+ 
 - name: Build slurm rpms
   command: rpmbuild -ta "{{ rpmbuild_path }}"
   changed_when: false
   args:
     warn: no
 
-- name: Verify package md5
-  command: rpm -qa
-  ignore_errors: true
-  register: verify_result
-  changed_when: no
-  failed_when: no
-  args:
-    warn: no
+#- name: Verify package md5
+#command: rpm -qa
+#  ignore_errors: true
+#  register: verify_result
+#  changed_when: no
+#  failed_when: no
+#  args:
+#    warn: no
 
 - name: Install rpms
   command: rpm -Uvh ~"{{ rpm_loop }}"
   args:
     chdir: "{{ rpm_path }}"
     warn: no
-  when: verify_result.rc != 0
+    #  when: verify_result.rc != 0
 
 - name: Add control machine name
   lineinfile:
@@ -74,12 +69,6 @@
     regexp: "ControlMachine="
     line: "ControlMachine={{ group_names[0] }}"
 
-- name: Add slurm user name
-  lineinfile:
-    path: "{{ slurmdbd_path }}"
-    regexp: "SlurmUser="
-    line: "SlurmUser={{ slurm_user }}"
-
 - name: Firewall rule for slurm - tcp/ip,udp
   firewalld:
     zone: internal
@@ -171,4 +160,4 @@
   fetch:
     src: "{{ slurm_confpth }}"
     dest: "{{ buffer_path }}"
-    flat: true
+    flat: true
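
With the package-based install commented out, this role now relies entirely on building Slurm RPMs from source. A manual sketch of that sequence (the tarball name assumes the slurm-20.02.0 release that the old slurm_inventory_file pinned; the RPM directory is rpm_path from the role vars):
```
rpmbuild -ta slurm-20.02.0.tar.bz2          # "Build slurm rpms"
rpm -Uvh /root/rpmbuild/RPMS/x86_64/*.rpm   # "Install rpms"
```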

+ 1 - 1
slurm/roles/slurm_manager/vars/main.yml

@@ -59,4 +59,4 @@ logfile: "/var/log/slurm/slurmdbd.log"
 pidfile: "/var/run/slurm/slurmdbd.pid"
 buffer_path: "/tmp/slurm.conf"
 rpm_path: "/root/rpmbuild/RPMS/x86_64/"
-slurm_mode: "0644"
+slurm_mode: "0644"

slurm/roles/slurm_start_services/tasks/main.yml → roles/slurm_start_services/tasks/main.yml


+ 3 - 4
slurm/roles/start_slurm_workers/tasks/main.yml

@@ -24,7 +24,7 @@
 
 - name: Install packages for slurm
   package:
-    name: "{{ item }}"
+    name: "{{ slurm_packages }}"
     state: present
   with_items:
     - "{{ slurm_packages }}"
@@ -32,8 +32,7 @@
 
 - name: Install development tools
   package:
-    name: "{{ item }}"
-    enablerepo: PowerTools
+    name: "{{ item | join (',') }}"
     state: present
   with_items:
     - "{{ dev_tools }}"
@@ -94,4 +93,4 @@
   service:
     name: slurmd.service
     enabled: yes
-  tags: install
+  tags: install

kubernetes/roles/startmanager/files/create_admin_user.yaml → roles/startmanager/files/create_admin_user.yaml


kubernetes/roles/startmanager/files/create_clusterRoleBinding.yaml → roles/startmanager/files/create_clusterRoleBinding.yaml


kubernetes/roles/startmanager/files/data-pv.yaml → roles/startmanager/files/data-pv.yaml


kubernetes/roles/startmanager/files/data2-pv.yaml → roles/startmanager/files/data2-pv.yaml


kubernetes/roles/startmanager/files/data3-pv.yaml → roles/startmanager/files/data3-pv.yaml


kubernetes/roles/startmanager/files/data4-pv.yaml → roles/startmanager/files/data4-pv.yaml


kubernetes/roles/startmanager/files/flannel_net.sh → roles/startmanager/files/flannel_net.sh


kubernetes/roles/startmanager/files/katib-pv.yaml → roles/startmanager/files/katib-pv.yaml


kubernetes/roles/startmanager/files/kube-flannel.yaml → roles/startmanager/files/kube-flannel.yaml


kubernetes/roles/startmanager/files/kubeflow_persistent_volumes.yaml → roles/startmanager/files/kubeflow_persistent_volumes.yaml


kubernetes/roles/startmanager/files/minio-pvc.yaml → roles/startmanager/files/minio-pvc.yaml


kubernetes/roles/startmanager/files/mysql-pv.yaml → roles/startmanager/files/mysql-pv.yaml


kubernetes/roles/startmanager/files/nfs-class.yaml → roles/startmanager/files/nfs-class.yaml


kubernetes/roles/startmanager/files/nfs-deployment.yaml → roles/startmanager/files/nfs-deployment.yaml


kubernetes/roles/startmanager/files/nfs-serviceaccount.yaml → roles/startmanager/files/nfs-serviceaccount.yaml


kubernetes/roles/startmanager/files/nfs_clusterrole.yaml → roles/startmanager/files/nfs_clusterrole.yaml


kubernetes/roles/startmanager/files/nfs_clusterrolebinding.yaml → roles/startmanager/files/nfs_clusterrolebinding.yaml


kubernetes/roles/startmanager/files/notebook-pv.yaml → roles/startmanager/files/notebook-pv.yaml


kubernetes/roles/startmanager/files/persistent_volumes.yaml → roles/startmanager/files/persistent_volumes.yaml


kubernetes/roles/startmanager/files/pvc.yaml → roles/startmanager/files/pvc.yaml


kubernetes/roles/startmanager/files/tiller_config.sh → roles/startmanager/files/tiller_config.sh


+ 9 - 13
kubernetes/roles/startmanager/tasks/main.yml

@@ -13,21 +13,17 @@
 #  limitations under the License.
 ---
 
-- name: Disable swap (if not already disabled)
+- name: Disable swap
   command: /usr/sbin/swapoff -a
   changed_when: true
   tags: init
 
-- name: Start and enable docker service
-  systemd:
-    name: docker
-    state: started
-    enabled: yes
-    daemon_reload: yes
-  tags: docker
+- name: Get netaddr
+  setup:
+    filter: ansible_default_ipv4.address
 
 - name: Initialize kubeadm
-  command: "/bin/kubeadm init --pod-network-cidr='{{ pod_network_cidr_ip }}' --apiserver-advertise-address='{{ manager_ip }}'"
+  command: "/bin/kubeadm init --pod-network-cidr='{{ pod_network_cidr_ip }}' --apiserver-advertise-address='{{ ansible_default_ipv4.address }}'"
   changed_when: true
   register: init_output
   tags: init
@@ -77,7 +73,7 @@
     name:   "K8S_TOKEN_HOLDER"
     token:  "{{ K8S_TOKEN.stdout }}"
     hash:   "{{ K8S_MANAGER_CA_HASH.stdout }}"
-    ip:     "{{ manager_ip }}"
+    ip:     "{{ ansible_default_ipv4.address }}"
   tags: init
 
 - name: Print k8s token
@@ -92,9 +88,9 @@
     verbosity: 2
   tags: init
 
-- name: Print k8s manager_ip
+- name: Print k8s ansible_default_ipv4.address
   debug:
-    msg: "[Manager] K8S_MANAGER_IP is  {{ manager_ip }}"
+    msg: "[Manager] K8S_MANAGER_IP is  {{ ansible_default_ipv4.address }}"
     verbosity: 2
   tags: init
 
@@ -153,4 +149,4 @@
 - name: Edge / Workstation Install allows pods to scheudle on manager
   command: kubectl taint nodes --all node-role.kubernetes.io/master-
   when: single_node
-  tags: init
+  tags: init
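
The hard-coded manager_ip is gone; kubeadm now advertises the node's detected default IPv4 address. Expanded with this role's defaults, the command the task runs is roughly the following (NODE_IP stands in for ansible_default_ipv4.address):
```
NODE_IP=192.0.2.10   # placeholder for the fact-gathered default IPv4 of the manager
/bin/kubeadm init --pod-network-cidr='10.244.0.0/16' --apiserver-advertise-address="${NODE_IP}"
```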

+ 1 - 3
kubernetes/roles/startmanager/vars/main.yml

@@ -15,8 +15,6 @@
 
 single_node: false
 
-manager_ip: "{{ ansible_host }}"
-
 k8s_cni: calico
 
 pod_network_cidr_ip: 10.244.0.0/16
@@ -49,4 +47,4 @@ k8s_clusterRoleBinding_file_mode: 0655
 
 calico_yml_url: https://docs.projectcalico.org/manifests/calico.yaml
 
-flannel_yml_url: https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml
+flannel_yml_url: https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml

kubernetes/roles/startservices/files/metal-config.yaml → roles/startservices/files/metal-config.yaml


kubernetes/roles/startservices/files/metallb.yaml → roles/startservices/files/metallb.yaml


kubernetes/roles/startservices/tasks/main.yml → roles/startservices/tasks/main.yml


+ 5 - 4
kubernetes/roles/startservices/vars/main.yml

@@ -23,13 +23,14 @@ metallb_deployment_file_mode: 0655
 
 metallb_yaml_url: https://raw.githubusercontent.com/google/metallb/v0.8.1/manifests/metallb.yaml
 
-k8s_dashboard_yaml_url: https://raw.githubusercontent.com/kubernetes/dashboard/v2.0.0-beta6/aio/deploy/recommended.yaml
+k8s_dashboard_yaml_url: https://raw.githubusercontent.com/kubernetes/dashboard/v2.0.0/aio/deploy/recommended.yaml
 
 helm_stable_repo_url: https://charts.helm.sh/stable
 
-nfs_server: "{{ ansible_host }}"
+#nfs_server: "{{ ansible_host }}"
+nfs_server: 10.0.0.1
 
-nfs_path: /work
+nfs_path: /home/k8snfs
 
 mpi_operator_yaml_url: https://raw.githubusercontent.com/kubeflow/mpi-operator/master/deploy/v1alpha2/mpi-operator.yaml
 
@@ -43,4 +44,4 @@ mig_strategy: none
 
 gpu_feature_discovery_version: 0.2.0
 
-fpga_device_plugin_yaml_url: https://raw.githubusercontent.com/Xilinx/FPGA_as_a_Service/master/k8s-fpga-device-plugin/fpga-device-plugin.yml
+fpga_device_plugin_yaml_url: https://raw.githubusercontent.com/Xilinx/FPGA_as_a_Service/master/k8s-fpga-device-plugin/fpga-device-plugin.yml
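
The NFS-backed services now point at a fixed server and export (10.0.0.1:/home/k8snfs) rather than deriving the server from ansible_host. An illustrative pre-flight check from any node, assuming the NFS client utilities are installed:
```
showmount -e 10.0.0.1   # should list /home/k8snfs among the exports
```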

kubernetes/roles/startworkers/tasks/main.yml → roles/startworkers/tasks/main.yml


kubernetes/roles/startworkers/vars/main.yml → roles/startworkers/vars/main.yml


+ 0 - 2
slurm/roles/common/README.md

@@ -1,2 +0,0 @@
-includes :
-- ntp deployment using ntpd and configuration basic template with handler will restart and wait for at least one server to sync .

+ 0 - 40
slurm/roles/common/handlers/main.yml

@@ -1,40 +0,0 @@
-#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
----
-
-- name: Restart ntpd
-  service:
-    name: ntpd
-    state: restarted
-    enabled: yes
-
-- name: Restart chrony
-  service:
-    name: chronyd
-    state: restarted
-    enabled: yes
-
-- name: Sync tp clocks
-  command: ntpdc -np
-  register: ntp_clock
-  until:  ntp_clock.stdout.find('*') > -1
-  retries: "{{ retry_count_one }}"
-  delay: "{{ delay_count_one }}"
-
-- name: Sync chrony sources
-  command: chronyc sources
-  register: chrony_src
-  until:  chrony_src.stdout.find('^*') > -1
-  retries: "{{ retry_count }}"
-  delay: "{{ delay_count }}"

+ 0 - 17
slurm/roles/common/tasks/main.yml

@@ -1,17 +0,0 @@
-#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
----
-
-- name: Deploy time ntp/chrony
-  include_tasks: ntp.yml

+ 0 - 31
slurm/roles/common/vars/main.yml

@@ -1,31 +0,0 @@
-#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
----
-
-chrony_path: "/etc/chrony.conf"
-ntp_path: "/etc/ntp.conf"
-ntp_mode: "0644"
-os_higher_version: "8"
-os_version: "7"
-retry_count_one: "10"
-delay_count_one: "60"
-retry_count: "6"
-delay_count: "10"
-
-ntp_servers: 
-  - 0.centos.pool.ntp.org
-  - 1.centos.pool.ntp.org
-  - 2.centos.pool.ntp.org
-chrony_servers:
-  - 2.centos.pool.ntp.org

+ 0 - 44
slurm/slurm.yml

@@ -1,44 +0,0 @@
-#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
----
-
-#Playbook for installing Slurm on a cluster
-
-#collect info from everything
-- hosts: all
-
-- name: Apply common installation and config
-  hosts: manager,compute
-  gather_facts: false
-  roles:
-    - slurm_common
-    - common
-
-- name: Apply manager config
-  hosts: manager
-  gather_facts: false
-  roles:
-    - slurm_manager
-
-- name: Start slurm workers
-  hosts: compute
-  gather_facts: true
-  roles:
-    - start_slurm_workers
-
-- name: Start services
-  hosts: manager
-  gather_facts: false
-  roles:
-    - slurm_start_services

+ 0 - 18
slurm/slurm_inventory_file

@@ -1,18 +0,0 @@
-[manager]
-friday
-
-[manager:vars]
-slurm_url=https://download.schedmd.com/slurm/slurm-20.02.0.tar.bz2
-slurm_md5=md5:8ed2257471ff24ca213b510a4c1c3563
-
-[compute]
-compute000
-compute[002:005]
-
-
-[workers:children]
-compute
-
-[cluster:children]
-manager
-workers

kubernetes/scuttle → tools/scuttle