
Merge pull request #41 from dellhpc/release-0.1

Release 0.1
John Lockman committed 5 years ago (commit b19625f221)
67 changed files with 596 additions and 249 deletions
  1. .github/ISSUE_TEMPLATE/feature_request.md (+2 -1)
  2. README.md (+4 -7)
  3. docs/INSTALL.md (+52 -42)
  4. docs/PREINSTALL.md (+9 -0)
  5. docs/README.md (+24 -0)
  6. example.yaml (+0 -30)
  7. build-kubernetes-cluster.yml (+6 -0)
  8. host_inventory_file (+6 -5)
  9. kubernetes/roles/common/files/k8s.conf (+0 -0)
  10. kubernetes/roles/common/files/kubernetes.repo (+0 -0)
  11. kubernetes/roles/common/files/nvidia (+0 -0)
  12. kubernetes/roles/common/handlers/main.yml (+0 -0)
  13. roles/common/tasks/main.yml (+9 -3)
  14. kubernetes/roles/common/vars/main.yml (+0 -0)
  15. kubernetes/roles/computeGPU/files/k8s.conf (+0 -0)
  16. kubernetes/roles/computeGPU/files/kubernetes.repo (+0 -0)
  17. kubernetes/roles/computeGPU/files/nvidia (+0 -0)
  18. kubernetes/roles/computeGPU/handlers/main.yml (+0 -0)
  19. kubernetes/roles/computeGPU/tasks/main.yml (+0 -0)
  20. kubernetes/roles/computeGPU/vars/main.yml (+0 -0)
  21. kubernetes/roles/master/files/k8s.conf (+0 -0)
  22. kubernetes/roles/master/files/kubernetes.repo (+0 -0)
  23. kubernetes/roles/master/files/nvidia (+0 -0)
  24. roles/master/tasks/main.yml (+1 -1)
  25. kubernetes/roles/startmaster/files/create_admin_user.yaml (+0 -0)
  26. kubernetes/roles/startmaster/files/create_clusterRoleBinding.yaml (+0 -0)
  27. kubernetes/roles/startmaster/files/data-pv.yaml (+0 -0)
  28. kubernetes/roles/startmaster/files/data2-pv.yaml (+0 -0)
  29. kubernetes/roles/startmaster/files/data3-pv.yaml (+0 -0)
  30. kubernetes/roles/startmaster/files/data4-pv.yaml (+0 -0)
  31. kubernetes/roles/startmaster/files/enable_gpu_k8s.sh (+0 -0)
  32. kubernetes/roles/startmaster/files/flannel_net.sh (+0 -0)
  33. kubernetes/roles/startmaster/files/katib-pv.yaml (+0 -0)
  34. kubernetes/roles/startmaster/files/kube-flannel.yaml (+0 -0)
  35. kubernetes/roles/startmaster/files/kubeflow_persistent_volumes.yaml (+0 -0)
  36. kubernetes/roles/startmaster/files/minio-pvc.yaml (+0 -0)
  37. kubernetes/roles/startmaster/files/mysql-pv.yaml (+0 -0)
  38. kubernetes/roles/startmaster/files/nfs-class.yaml (+0 -0)
  39. kubernetes/roles/startmaster/files/nfs-deployment.yaml (+0 -0)
  40. kubernetes/roles/startmaster/files/nfs-serviceaccount.yaml (+0 -0)
  41. kubernetes/roles/startmaster/files/nfs_clusterrole.yaml (+0 -0)
  42. kubernetes/roles/startmaster/files/nfs_clusterrolebinding.yaml (+0 -0)
  43. kubernetes/roles/startmaster/files/notebook-pv.yaml (+0 -0)
  44. kubernetes/roles/startmaster/files/persistent_volumes.yaml (+0 -0)
  45. kubernetes/roles/startmaster/files/pvc.yaml (+0 -0)
  46. kubernetes/roles/startmaster/files/tiller_config.sh (+0 -0)
  47. roles/startmaster/tasks/main.yml (+7 -56)
  48. roles/startservices/files/jhub-db-pv.yaml (+1 -1)
  49. kubernetes/roles/startservices/files/jupyter-pvc.yaml (+0 -0)
  50. kubernetes/roles/startservices/files/jupyter_config.yaml (+0 -0)
  51. kubernetes/roles/startservices/files/metal-config.yaml (+0 -0)
  52. kubernetes/roles/startservices/files/metallb.yaml (+0 -0)
  53. kubernetes/roles/startservices/tasks/main.yml (+84 -0)
  54. kubernetes/roles/startworkers/tasks/main.yml (+0 -0)
  55. scuttle (+2 -2)
  56. roles/startservices/files/metal-config.yaml (+0 -21)
  57. roles/startservices/tasks/main.yml (+0 -80)
  58. slurm/roles/slurm-common/files/munge.key (BIN)
  59. slurm/roles/slurm-common/files/slurm.conf (+97 -0)
  60. slurm/roles/slurm-common/tasks/main.yaml (+91 -0)
  61. slurm/roles/slurm-master/tasks/main.yaml (+98 -0)
  62. slurm/roles/start-slurm-workers/tasks/main.yml (+9 -0)
  63. slurm/slurm-cluster.yaml (+23 -0)
  64. slurm/slurm_inventory_file (+18 -0)
  65. tools/README.md (+7 -0)
  66. tools/change_personality (+35 -0)
  67. tools/install_tools.yml (+11 -0)

+ 2 - 1
.github/ISSUE_TEMPLATE/feature_request.md

@@ -2,8 +2,9 @@
 name: Feature request
 about: Suggest an idea for this project
 title: ''
-labels: ''
+labels: 'enhancement'
 assignees: ''
+project: 'Future Features'
 
 ---
 

+ 4 - 7
README.md

@@ -1,13 +1,10 @@
 # Omnia
-#### Ansible playbook-based deployment of Slurm and Kubernetes on factory-provisioned Dell EMC PowerEdge servers
+#### Ansible playbook-based deployment of Slurm and Kubernetes on Dell EMC PowerEdge servers running an RPM-based Linux OS
 
-Omnia (Latin: all or everything) is a deployment tool to turn Dell EMC PowerEdge servers with factory-installed OS images into a functioning Slurm/Kubernetes cluster.
+Omnia (Latin: all or everything) is a deployment tool to turn Dell EMC PowerEdge servers with RPM-based Linux images into a functioning Slurm/Kubernetes cluster.
 
-## Installing Omnia
-To install Omnia, see [INSTALL](docs/INSTALL.md)
-
-## Contributing
-To contribute to the Omnia project, see [CONTRIBUTING](CONTRIBUTING.md)
+## Omnia Documentation
+For Omnia documentation, including installation and contribution instructions, see [docs](docs/README.md).
 
 ### Current maintainers:
 * Lucas A. Wilson (Dell Technologies)

+ 52 - 42
docs/INSTALL.md

@@ -1,55 +1,65 @@
-Dancing to the beat of a different drum.
+# Installing Omnia
 
-# Short Version:
+## TL;DR
 
+### Kubernetes
 Install Kubernetes and all dependencies
 ```
-ansible-playbook -i host_inventory_file build-kubernetes-cluster.yml
+ansible-playbook -i host_inventory_file kubernetes/kubernetes.yml
 ```
 
-Initialize K8S cluster
+Initialize K8s cluster
 ```
-ansible-playbook -i host_inventory_file build-kubernetes-cluster.yml --tags "init"
+ansible-playbook -i host_inventory_file kubernetes/kubernetes.yml --tags "init"
+```
+### Slurm
+```
+ansible-playbook -i host_inventory_file slurm/slurm.yml
 ```
-
-
-# What this does:
 
 ## Build/Install
+Omnia is a collection of [Ansible](https://www.ansible.com/) playbooks which perform:
+* Installation of [Slurm](https://slurm.schedmd.com/) and/or [Kubernetes](https://kubernetes.io/) on servers already provisioned with a standard [CentOS](https://www.centos.org/) image.
+* Installation of auxiliary scripts for administrator functions such as moving nodes between Slurm and Kubernetes personalities.
 
-### Add additional repositories:
-
-- Kubernetes (Google)
-- El Repo (nvidia drivers)
-- Nvidia (nvidia-docker)
-- EPEL (Extra Packages for Enterprise Linux)
-
-### Install common packages
- - gcc
- - python-pip
- - docker
- - kubelet
- - kubeadm
- - kubectl
- - nvidia-detect
- - kmod-nvidia
- - nvidia-x11-drv
- - nvidia-container-runtime
- - ksonnet (CLI framework for K8S configs)
-
-### Enable GPU Device Plugins (nvidia-container-runtime-hook)
-
-### Modify kubeadm config to allow GPUs as schedulable resource 
+### Kubernetes
 
-### Start and enable services
- - Docker
- - Kubelet
+* Add additional repositories:
+    - Kubernetes (Google)
+    - El Repo (nvidia drivers)
+    - Nvidia (nvidia-docker)
+    - EPEL (Extra Packages for Enterprise Linux)
+* Install common packages
+    - gcc
+    - python-pip
+    - docker
+    - kubelet
+    - kubeadm
+    - kubectl
+    - nvidia-detect
+    - kmod-nvidia
+    - nvidia-x11-drv
+    - nvidia-container-runtime
+    - ksonnet (CLI framework for K8S configs)
+* Enable GPU Device Plugins (nvidia-container-runtime-hook)
+* Modify kubeadm config to allow GPUs as schedulable resource 
+* Start and enable services
+    - Docker
+    - Kubelet
+* Initialize Cluster
+    * Head/master
+        - Start K8S and pass startup token to compute/slaves
+        - Initialize networking (Currently using WeaveNet)
+        - Setup K8S Dashboard
+        - Create dynamic/persistent volumes
+    * Compute/slaves
+        - Join k8s cluster
 
-## Initialize Cluster
-### Head/master
-- Start K8S pass startup token to compute/slaves
-- Initialize networking (Currently using WeaveNet)
--Setup K8S Dashboard
-- Create dynamic/persistent volumes
-### Compute/slaves
-- Join k8s cluster
+### Slurm
+* Download and build Slurm source
+* Install necessary dependencies
+    - Python3
+    - munge
+    - MariaDB
+    - MariaDB development libraries
+* Build Slurm configuration files
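
The TL;DR above runs the Kubernetes and Slurm playbooks end to end. As a quick sanity check once they finish, a small playbook along these lines (not part of this PR; host group names follow the repo's inventory) can confirm that both schedulers came up:

```
---
# Illustrative post-install check: confirm Kubernetes nodes registered and Slurm sees its partition.
- hosts: master
  gather_facts: false
  tasks:
    - name: List Kubernetes nodes
      command: kubectl get nodes -o wide
      register: k8s_nodes
      changed_when: false

    - name: Show Slurm partition and node state
      command: sinfo
      register: slurm_state
      changed_when: false

    - debug:
        var: k8s_nodes.stdout_lines

    - debug:
        var: slurm_state.stdout_lines
```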

+ 9 - 0
docs/PREINSTALL.md

@@ -16,3 +16,12 @@ Omnia can configure systems which use Ethernet- or Infiniband-based fabric to co
 ![Example system configuration with Ethernet fabric](images/example-system-ethernet.png)
 
 ![Example system configuration with Infiniband fabric](images/example-system-infiniband.png)
+
+## Network Setup
+Omnia assumes that servers are already connected to the network and have access to the internet.
+### Network Topology
+Possible network configurations include:
+* A flat topology where all nodes are connected to a switch that includes an uplink to the internet. This requires multiple externally-facing IP addresses.
+* A hierarchical topology where compute nodes are connected to a common switch, but the master node has a second network connection to the internet. All outbound/inbound traffic would be routed through the master node. This requires setting up firewall rules for IP masquerade; see [here](https://www.server-world.info/en/note?os=CentOS_7&p=firewalld&f=2) for an example.
+### IP and Hostname Assignment
+The recommended setup is to assign IP addresses to individual servers. This can be done manually by logging onto each node, or via DHCP.
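
For the hierarchical topology described above, the master node needs IP masquerade between its external and internal interfaces. A minimal sketch, assuming firewalld is in use; the zone name `public` is an assumption, not taken from this PR:

```
---
# Illustrative only: enable NAT/masquerade on the master so compute nodes can reach the internet.
- hosts: master
  tasks:
    - name: Enable masquerade on the externally facing zone
      firewalld:
        zone: public        # assumed zone for the external interface
        masquerade: yes
        permanent: yes
        state: enabled

    - name: Reload firewalld so the permanent rule takes effect
      command: firewall-cmd --reload
```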

+ 24 - 0
docs/README.md

@@ -0,0 +1,24 @@
+# Omnia Documentation
+Omnia (Latin: all or everything) is a deployment tool to turn Dell EMC PowerEdge servers with standard RPM-based Linux OS images into a functioning Slurm/Kubernetes cluster. Omnia is a collection of [Ansible](https://ansible.org) playbooks for installing and configuring Slurm or Kubernetes on an inventory of servers, along with additional software packages and services.
+
+## Installing Omnia
+Omnia requires that servers already have an RPM-based Linux OS running on them, and are all connected to the Internet. Currently all Omnia testing is done on [CentOS](https://centos.org). Please see [PREINSTALL](PREINSTALL.md) for instructions on network setup.
+
+Once servers have a functioning OS and networking, you can use Omnia to install and start Slurm and/or Kubernetes. Please see [INSTALL](INSTALL.md) for instructions.
+
+## Contributing to Omnia
+The Omnia project was started to give members of the [Dell Technologies HPC Community](https://dellhpc.org) a way to easily set up clusters of Dell EMC servers, and also to contribute useful tools, fixes, and functionality back to the HPC Community.
+
+### Open to All
+While we started Omnia within the Dell Technologies HPC Community, that doesn't mean that it's limited to Dell EMC servers, networking, and storage. This is an open project, and we want to encourage *everyone* to use and contribute to Omnia!
+
+### Anyone Can Contribute!
+It's not just new features and bug fixes that can be contributed to the Omnia project! Anyone should feel comfortable contributing. We are asking for all types of contributions:
+* New feature code
+* Bug fixes
+* Documentation updates
+* Feature suggestions
+* Feedback
+* Validation that it works for your particular configuration
+
+If you would like to contribute, see [CONTRIBUTING](../CONTRIBUTING.md).

+ 0 - 30
example.yaml

@@ -1,30 +0,0 @@
-apiVersion: "kubeflow.org/v1alpha2"
-kind: "TFJob"
-metadata:
-  name: "example-job"
-spec:
-  replicaSpecs:
-    - replicas: 1
-      tfReplicaType: MASTER
-      template:
-        spec:
-          containers:
-            - image: gcr.io/tf-on-k8s-dogfood/tf_sample:dc944ff
-              name: tensorflow
-          restartPolicy: OnFailure
-    - replicas: 1
-      tfReplicaType: WORKER
-      template:
-        spec:
-          containers:
-            - image: gcr.io/tf-on-k8s-dogfood/tf_sample:dc944ff
-              name: tensorflow
-          restartPolicy: OnFailure
-    - replicas: 2
-      tfReplicaType: PS
-      template:
-        spec:
-          containers:
-            - image: gcr.io/tf-on-k8s-dogfood/tf_sample:dc944ff
-              name: tensorflow
-          restartPolicy: OnFailure

+ 6 - 0
build-kubernetes-cluster.yml

@@ -33,3 +33,9 @@
   gather_facts: false
   roles:
     - startworkers
+
+# Start K8s services on the master
+- hosts: master
+  gather_facts: false
+  roles:
+    - startservices

+ 6 - 5
host_inventory_file

@@ -2,13 +2,14 @@
 friday
 
 [compute]
-compute[000:005]
-#compute000
-#compute001
-#compute002
+compute000
+compute[002:005]
 
 [gpus]
-compute[003:005]
+#compute001
+compute002
+compute004
+compute005
 
 [workers:children]
 compute

roles/common/files/k8s.conf → kubernetes/roles/common/files/k8s.conf


roles/common/files/kubernetes.repo → kubernetes/roles/common/files/kubernetes.repo


roles/common/files/nvidia → kubernetes/roles/common/files/nvidia


roles/common/handlers/main.yml → kubernetes/roles/common/handlers/main.yml


+ 9 - 3
roles/common/tasks/main.yml

@@ -45,18 +45,24 @@
 - name: install common packages
   yum: 
     name:
+      - yum-plugin-versionlock
       - gcc
       - nfs-utils
       - python-pip
       - docker
       - bash-completion
-      - kubelet
-      - kubeadm
-      - kubectl
+      - kubelet-1.16.7
+      - kubeadm-1.16.7
+      - kubectl-1.16.7
       - nvidia-detect
     state: present
   tags: install
 
+- name: versionlock kubernetes
+  command: yum versionlock kubelet-1.16.7 kubectl-1.16.7 kubeadm-1.16.7
+  tags: install
+  
+
 - name: install InfiniBand Support
   yum:
     name: "@Infiniband Support"

roles/common/vars/main.yml → kubernetes/roles/common/vars/main.yml


roles/computeGPU/files/k8s.conf → kubernetes/roles/computeGPU/files/k8s.conf


roles/computeGPU/files/kubernetes.repo → kubernetes/roles/computeGPU/files/kubernetes.repo


roles/computeGPU/files/nvidia → kubernetes/roles/computeGPU/files/nvidia


roles/computeGPU/handlers/main.yml → kubernetes/roles/computeGPU/handlers/main.yml


roles/computeGPU/tasks/main.yml → kubernetes/roles/computeGPU/tasks/main.yml


roles/computeGPU/vars/main.yml → kubernetes/roles/computeGPU/vars/main.yml


roles/master/files/k8s.conf → kubernetes/roles/master/files/k8s.conf


roles/master/files/kubernetes.repo → kubernetes/roles/master/files/kubernetes.repo


roles/master/files/nvidia → kubernetes/roles/master/files/nvidia


+ 1 - 1
roles/master/tasks/main.yml

@@ -19,7 +19,7 @@
 
 - name: Get Helm Installer
   get_url:
-    url: https://raw.githubusercontent.com/kubernetes/helm/master/scripts/get 
+    url: https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3
     dest: /root/bin/get_helm.sh
     mode: 700 
   tags: master
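
The updated URL above fetches the Helm 3 install script. A hedged sketch of the follow-up steps that would typically run it and confirm the result; the existing role may already do this, it is just not visible in this hunk:

```
- name: Run the Helm installer script
  command: /root/bin/get_helm.sh
  tags: master

- name: Verify the installed Helm version
  command: helm version --short
  register: helm_version
  changed_when: false
  tags: master
```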

roles/startmaster/files/create_admin_user.yaml → kubernetes/roles/startmaster/files/create_admin_user.yaml


roles/startmaster/files/create_clusterRoleBinding.yaml → kubernetes/roles/startmaster/files/create_clusterRoleBinding.yaml


roles/startmaster/files/data-pv.yaml → kubernetes/roles/startmaster/files/data-pv.yaml


roles/startmaster/files/data2-pv.yaml → kubernetes/roles/startmaster/files/data2-pv.yaml


roles/startmaster/files/data3-pv.yaml → kubernetes/roles/startmaster/files/data3-pv.yaml


roles/startmaster/files/data4-pv.yaml → kubernetes/roles/startmaster/files/data4-pv.yaml


roles/startmaster/files/enable_gpu_k8s.sh → kubernetes/roles/startmaster/files/enable_gpu_k8s.sh


roles/startmaster/files/flannel_net.sh → kubernetes/roles/startmaster/files/flannel_net.sh


roles/startmaster/files/katib-pv.yaml → kubernetes/roles/startmaster/files/katib-pv.yaml


roles/startmaster/files/kube-flannel.yaml → kubernetes/roles/startmaster/files/kube-flannel.yaml


roles/startmaster/files/kubeflow_persistent_volumes.yaml → kubernetes/roles/startmaster/files/kubeflow_persistent_volumes.yaml


roles/startmaster/files/minio-pvc.yaml → kubernetes/roles/startmaster/files/minio-pvc.yaml


roles/startmaster/files/mysql-pv.yaml → kubernetes/roles/startmaster/files/mysql-pv.yaml


roles/startmaster/files/nfs-class.yaml → kubernetes/roles/startmaster/files/nfs-class.yaml


roles/startmaster/files/nfs-deployment.yaml → kubernetes/roles/startmaster/files/nfs-deployment.yaml


roles/startmaster/files/nfs-serviceaccount.yaml → kubernetes/roles/startmaster/files/nfs-serviceaccount.yaml


roles/startmaster/files/nfs_clusterrole.yaml → kubernetes/roles/startmaster/files/nfs_clusterrole.yaml


roles/startmaster/files/nfs_clusterrolebinding.yaml → kubernetes/roles/startmaster/files/nfs_clusterrolebinding.yaml


roles/startmaster/files/notebook-pv.yaml → kubernetes/roles/startmaster/files/notebook-pv.yaml


roles/startmaster/files/persistent_volumes.yaml → kubernetes/roles/startmaster/files/persistent_volumes.yaml


roles/startmaster/files/pvc.yaml → kubernetes/roles/startmaster/files/pvc.yaml


roles/startmaster/files/tiller_config.sh → kubernetes/roles/startmaster/files/tiller_config.sh


+ 7 - 56
roles/startmaster/tasks/main.yml

@@ -51,68 +51,26 @@
     msg: "[Master] K8S_MASTER_IP is  {{ hostvars['K8S_TOKEN_HOLDER']['ip'] }}"
   tags: init
 
-  
-- name: Setup Flannel SDN network
-  shell: kubectl apply -f https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml
+- name: Setup Calico SDN network
+  shell: kubectl apply -f https://docs.projectcalico.org/manifests/calico.yaml
   tags: init
+  
+#- name: Setup Flannel SDN network
+  #shell: kubectl apply -f https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml
+  #tags: init
 
 - name: Enabled GPU support in Kubernetes
-  #script: enable_gpu_k8s.sh
   shell: kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/1.0.0-beta4/nvidia-device-plugin.yml
                            #https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v1.11/nvidia-device-plugin.yml
   register: gpu_enable
   tags: init
 
-
 - name: Create yaml repo for setup
   file: 
     path: /root/k8s 
     state: directory
   tags: init
 
-
-#- name: Persistent Volume Setup Files
-  #copy: src=persistent_volumes.yaml dest=/root/k8s/persistent_volumes.yaml owner=root group=root mode=655
-  #tags: init
-#
-#- name: Persistent Volume Setup - Apply 
-  #shell: kubectl apply -f /root/k8s/persistent_volumes.yaml
-  #tags: init
-  #
-
-#- name: Copy Service Account (NFS Setup)
-  #copy: src=nfs-serviceaccount.yaml dest=/root/k8s/nfs-serviceaccount.yaml owner=root group=root mode=655
-  #tags: init
-#
-#- name: Copy Cluster Role (NFS Setup)
-  #copy: src=nfs_clusterrole.yaml dest=/root/k8s/nfs_clusterrole.yaml owner=root group=root mode=655
-  #tags: init
-#
-#- name: Copy Cluster Role Binding (NFS Setup)
-  #copy: src=nfs_clusterrolebinding.yaml dest=/root/k8s/nfs_clusterrolebinding.yaml owner=root group=root mode=655
-  #tags: init
-#
-#- name: Copy NFS Storage Deployment (NFS Setup)
-  #copy: src=nfs-deployment.yaml dest=/root/k8s/nfs-deployment.yaml owner=root group=root mode=655
-  #tags: init
-#
-#- name: Copy NFS Storage Class (NFS Setup)
-  #copy: src=nfs-class.yaml dest=/root/k8s/nfs-class.yaml owner=root group=root mode=655
-  #tags: init
-#
-#- name: Deploy NFS (NFS Setup)
-  #shell: kubectl create -f /root/k8s/nfs-deployment.yaml -f /root/k8s/nfs-class.yaml -f /root/k8s/nfs-serviceaccount.yaml -f /root/k8s/nfs_clusterrole.yaml -f /root/k8s/nfs_clusterrolebinding.yaml 
-  #tags: init
-
-#- name: Patch NFS Setup (NFS Setup)
-  #shell: kubectl patch deployment nfs-client-provisioner -p '{"spec":{"template":{"spec":{"serviceAccount":"nfs-client-provisioner"}}}}'
-  #tags: init
-
-#- name: Patch NFS Setup (NFS Setup)
-  #shell: "kubectl patch storageclass managed-nfs-storage -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'"
-  #tags: init
-
-  
 - name: Create Service Account (K8S Dashboard) Files
   copy: src=create_admin_user.yaml dest=/root/k8s/create_admin_user.yaml owner=root group=root mode=655
   tags: init
@@ -129,17 +87,10 @@
   shell: kubectl create -f /root/k8s/create_clusterRoleBinding.yaml
   tags: init
 
-- name: Start K8S Dashboard
-  shell: kubectl create -f https://raw.githubusercontent.com/kubernetes/dashboard/v2.0.0-beta6/aio/deploy/recommended.yaml
-  tags: init
-
 - name: Dump Bearer Token for K8S Dashboard Login
   shell: kubectl -n kube-system describe secret $(kubectl -n kube-system get secret | grep admin-user | awk '{print $1}') > /root/k8s/token
   tags: init
 
-#- name: Proxy K8S Dashboard to 8001 on localhost
-  #shell: nohup kubectl proxy </dev/null >/dev/null 2>&1 & 
-  #tags: init
-
+# If more debug information is needed during init uncomment the following 2 lines
 #- debug: var=init_output.stdout_lines
   #tags: init
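
Since the SDN switched from Flannel to Calico above, waiting for the network rollout before deploying services can avoid races. A sketch; the object name `calico-node` is an assumption based on the upstream Calico manifest, not this PR:

```
- name: Wait for the Calico node daemonset to roll out
  shell: kubectl rollout status daemonset/calico-node -n kube-system --timeout=300s
  tags: init

- name: Wait for all nodes to report Ready
  shell: kubectl wait --for=condition=Ready nodes --all --timeout=300s
  tags: init
```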

+ 1 - 1
roles/startservices/files/jhub-db-pv.yaml

@@ -12,5 +12,5 @@ spec:
   nfs:
     server: 10.0.0.1
     path: /work/k8s/jhub-db
-  persistentVolumeReclaimPolicy: Retain
+  persistentVolumeReclaimPolicy: Recycle
 

roles/startservices/files/jupyter-pvc.yaml → kubernetes/roles/startservices/files/jupyter-pvc.yaml


roles/startservices/files/jupyter_config.yaml → kubernetes/roles/startservices/files/jupyter_config.yaml


roles/startmaster/files/metal-config.yaml → kubernetes/roles/startservices/files/metal-config.yaml


roles/startservices/files/metallb.yaml → kubernetes/roles/startservices/files/metallb.yaml


+ 84 - 0
kubernetes/roles/startservices/tasks/main.yml

@@ -0,0 +1,84 @@
+---
+#- name: Kick CoreDNS (this is a hack that needs to be fixed)
+  #shell:  kubectl get pods -n kube-system --no-headers=true | awk '/coredns/{print $1}'|xargs kubectl delete -n kube-system pod
+  #tags: init
+
+- name: Wait for CoreDNS to restart 
+  shell: kubectl rollout status deployment/coredns -n kube-system
+  tags: init
+
+- name: Deploy MetalLB
+  shell: kubectl apply -f https://raw.githubusercontent.com/google/metallb/v0.8.1/manifests/metallb.yaml
+  tags: init
+
+- name: Create MetalLB Setup Config Files
+  copy: src=metal-config.yaml dest=/root/k8s/metal-config.yaml owner=root group=root mode=655
+  tags: init
+
+- name: Create MetalLB Setup Deployment Files
+  copy: src=metallb.yaml dest=/root/k8s/metallb.yaml owner=root group=root mode=655
+  tags: init
+
+- name: Deploy MetalLB
+  shell: kubectl apply -f /root/k8s/metallb.yaml
+  tags: init
+
+- name: Create default setup for MetalLB
+  shell: kubectl apply -f /root/k8s/metal-config.yaml
+  tags: init
+
+#- name: Helm - create service account
+  #shell: kubectl create serviceaccount --namespace kube-system tiller
+  #tags: init
+
+#- name: Helm - create clusterRole Binding for tiller-cluster-rule
+  #shell: kubectl create clusterrolebinding tiller-cluster-rule --clusterrole=cluster-admin --serviceaccount=kube-system:tiller
+  #tags: init
+
+#- name: Helm - create clusterRoleBinding for admin
+  #shell: kubectl create clusterrolebinding tiller-cluster-admin --clusterrole=cluster-admin --serviceaccount=kube-system:tiller
+  #tags: init
+
+#- name: Helm - init
+  #shell: helm init  --upgrade
+  #tags: init
+
+#- name: Wait for tiller to start 
+  #shell: kubectl rollout status deployment/tiller-deploy -n kube-system
+  #tags: init
+
+#- name: Helm - patch cluster Role Binding for tiller
+  #shell:  kubectl --namespace kube-system patch deploy tiller-deploy -p '{"spec":{"template":{"spec":{"serviceAccount":"tiller"}}}}'
+  #tags: init
+
+#- name: Wait for tiller to start 
+  #shell: kubectl rollout status deployment/tiller-deploy -n kube-system
+  #tags: init
+
+- name: Start K8S Dashboard
+  shell: kubectl create -f https://raw.githubusercontent.com/kubernetes/dashboard/v2.0.0-beta6/aio/deploy/recommended.yaml
+  tags: init
+
+- name: Start NFS Client Provisioner
+  shell: helm install stable/nfs-client-provisioner --set nfs.server=10.0.0.1 --set nfs.path=/work --generate-name
+  tags: init
+
+- name: JupyterHub Persistent Volume Creation (files)  
+  copy: src=jhub-db-pv.yaml dest=/root/k8s/jhub-db-pv.yaml owner=root group=root mode=655
+  tags: init
+
+- name: jupyterHub Persistent Volume creation
+  shell: kubectl create -f /root/k8s/jhub-db-pv.yaml
+  tags: init
+
+- name: JupyterHub Custom Config (files)  
+  copy: src=jupyter_config.yaml dest=/root/k8s/jupyter_config.yaml owner=root group=root mode=655
+  tags: init
+ 
+- name: jupyterHub deploy
+  shell: helm install jupyterhub/jupyterhub  --namespace default --version 0.8.2 --values /root/k8s/jupyter_config.yaml --generate-name
+  tags: init
+
+- name: Prometheus deployment
+  shell: helm install stable/prometheus --set alertmanager.persistentVolume.storageClass=nfs-client,server.persistentVolume.storageClass=nfs-client,server.service.type=LoadBalancer --generate-name
+  tags: init
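
The `helm install` tasks above reference the `stable` and `jupyterhub` chart repositories without adding them first. A hedged sketch of that prerequisite; the repository URLs are assumptions reflecting the upstream chart repos of that era, not taken from this PR:

```
- name: Add Helm chart repositories used by the service deployments
  shell: "{{ item }}"
  with_items:
    - helm repo add stable https://kubernetes-charts.storage.googleapis.com/
    - helm repo add jupyterhub https://jupyterhub.github.io/helm-chart/
    - helm repo update
  tags: init
```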

roles/startworkers/tasks/main.yml → kubernetes/roles/startworkers/tasks/main.yml


+ 2 - 2
scuttle

@@ -14,6 +14,6 @@ ifconfig cni0 down
 clush -ab "ifconfig cni0 down"
 ifconfig flannel.1 down
 clush -ab "ifconfig flannel.1 down"
-brctl delbr flannel.1
-clush -ab "brctl delbr flannel.1"
+#brctl delbr flannel.1
+#clush -ab "brctl delbr flannel.1"
 clush -ab "brctl delbr cni0"

+ 0 - 21
roles/startservices/files/metal-config.yaml

@@ -1,21 +0,0 @@
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  namespace: metallb-system
-  name: config
-data:
-  config: |
-    address-pools:
-    - name: default
-      protocol: layer2
-      addresses:
-      - 10.0.0.150/32
-      - 10.0.0.151/32
-      - 10.0.0.152/32
-      - 10.0.0.153/32
-      - 10.0.0.154/32
-      - 10.0.0.155/32
-      - 10.0.0.156/32
-      - 10.0.0.157/32
-      - 10.0.0.158/32
-      - 10.0.0.159/32

+ 0 - 80
roles/startservices/tasks/main.yml

@@ -1,80 +0,0 @@
----
-- name: Kick CoreDNS (needed for kubernetes <= v1.15.10)
-  shell:  kubectl get pods -n kube-system --no-headers=true | awk '/coredns/{print $1}'|xargs kubectl delete -n kube-system pod
-  tags: init
-
-- name: Wait for CoreDNS to restart 
-  shell: kubectl rollout status deployment/coredns -n kube-system
-  tags: init
-
-- name: Deploy MetalLB
-  shell: kubectl apply -f https://raw.githubusercontent.com/google/metallb/v0.8.1/manifests/metallb.yaml
-  tags: init
-
-- name: Create MetalLB Setup Config Files
-  copy: src=metal-config.yaml dest=/root/k8s/metal-config.yaml owner=root group=root mode=655
-  tags: init
-
-- name: Create MetalLB Setup Deployment Files
-  copy: src=metallb.yaml dest=/root/k8s/metallb.yaml owner=root group=root mode=655
-  tags: init
-
-- name: Deploy MetalLB
-  shell: kubectl apply -f /root/k8s/metallb.yaml
-  tags: init
-
-- name: Create default setup for MetalLB
-  shell: kubectl apply -f /root/k8s/metal-config.yaml
-  tags: init
-
-- name: Helm - create service account
-  shell: kubectl create serviceaccount --namespace kube-system tiller
-  tags: init
-
-- name: Helm - create clusterRole Binding for tiller-cluster-rule
-  shell: kubectl create clusterrolebinding tiller-cluster-rule --clusterrole=cluster-admin --serviceaccount=kube-system:tiller
-  tags: init
-
-- name: Helm - create clusterRoleBinding for admin
-  shell: kubectl create clusterrolebinding tiller-cluster-admin --clusterrole=cluster-admin --serviceaccount=kube-system:tiller
-  tags: init
-
-- name: Helm - init
-  shell: helm init  --upgrade
-  tags: init
-
-- name: Wait for tiller to start 
-  shell: kubectl rollout status deployment/tiller-deploy -n kube-system
-  tags: init
-
-- name: Helm - patch cluster Role Binding for tiller
-  shell:  kubectl --namespace kube-system patch deploy tiller-deploy -p '{"spec":{"template":{"spec":{"serviceAccount":"tiller"}}}}'
-  tags: init
-
-- name: Wait for tiller to start 
-  shell: kubectl rollout status deployment/tiller-deploy -n kube-system
-  tags: init
-
-- name: Start K8S Dashboard
-  shell: kubectl create -f https://raw.githubusercontent.com/kubernetes/dashboard/v2.0.0-beta6/aio/deploy/recommended.yaml
-  tags: init
-
-- name: Start NFS Client Provisioner
-  shell: helm install --name nfs  stable/nfs-client-provisioner --set nfs.server=10.0.0.1 --set nfs.path=/work
-  tags: init
-
-- name: JupyterHub Persistent Volume Creation (files)  
-  copy: src=jhub-db-pv.yaml dest=/root/k8s/jhub-db-pv.yaml owner=root group=root mode=655
-  tags: init
-
-- name: jupyterHub Persistent Volume creation
-  shell: kubectl create -f /root/k8s/jhub-db-pv.yaml
-  tags: init
-
-- name: JupyterHub Custom Config (files)  
-  copy: src=jupyter_config.yaml dest=/root/k8s/jupyter_config.yaml owner=root group=root mode=655
-  tags: init
- 
-- name: jupyterHub deploy
-  shell: helm install jupyterhub/jupyterhub  --namespace default --version 0.8.2 --values /root/k8s/jupyter_config.yaml
-  tags: init

BIN
slurm/roles/slurm-common/files/munge.key


+ 97 - 0
slurm/roles/slurm-common/files/slurm.conf

@@ -0,0 +1,97 @@
+#
+# Example slurm.conf file. Please run configurator.html
+# (in doc/html) to build a configuration file customized
+# for your environment.
+#
+#
+# slurm.conf file generated by configurator.html.
+#
+# See the slurm.conf man page for more information.
+#
+ClusterName=friday
+ControlMachine=friday
+ControlAddr=10.0.0.1
+#BackupController=
+#BackupAddr=
+#
+SlurmUser=slurm
+#SlurmdUser=root
+SlurmctldPort=6817
+SlurmdPort=6818
+AuthType=auth/munge
+#JobCredentialPrivateKey=
+#JobCredentialPublicCertificate=
+StateSaveLocation=/var/spool/slurm/ctld
+SlurmdSpoolDir=/var/spool/slurm/
+SwitchType=switch/none
+MpiDefault=none
+SlurmctldPidFile=/var/run/slurmctld.pid
+SlurmdPidFile=/var/run/slurmd.pid
+ProctrackType=proctrack/pgid
+#PluginDir=
+#FirstJobId=
+ReturnToService=2
+#MaxJobCount=
+#PlugStackConfig=
+#PropagatePrioProcess=
+#PropagateResourceLimits=
+#PropagateResourceLimitsExcept=
+#Prolog=
+#Epilog=
+#SrunProlog=
+#SrunEpilog=
+#TaskProlog=
+#TaskEpilog=
+#TaskPlugin=
+#TrackWCKey=no
+#TreeWidth=50
+#TmpFS=
+#UsePAM=
+#
+# TIMERS
+SlurmctldTimeout=300
+SlurmdTimeout=300
+InactiveLimit=0
+MinJobAge=300
+KillWait=30
+Waittime=0
+#
+# SCHEDULING
+SchedulerType=sched/backfill
+#SchedulerAuth=
+SelectType=select/linear
+#FastSchedule=1
+PriorityType=priority/multifactor
+PriorityDecayHalfLife=14-0
+#PriorityUsageResetPeriod=14-0
+PriorityWeightFairshare=100000
+PriorityWeightAge=1000
+PriorityWeightPartition=10000
+PriorityWeightJobSize=1000
+PriorityMaxAge=14-0
+#
+# LOGGING
+SlurmctldDebug=3
+SlurmctldLogFile=/var/log/slurm/slurmctld.log
+SlurmdDebug=1
+SlurmdLogFile=/var/log/slurm/slurmd.log
+JobCompType=jobcomp/none
+#JobCompLoc=
+#
+# ACCOUNTING
+JobAcctGatherType=jobacct_gather/linux
+JobAcctGatherFrequency=30
+#
+AccountingStorageType=accounting_storage/slurmdbd
+#AccountingStorageHost=
+#AccountingStorageLoc=
+#AccountingStoragePass=
+#AccountingStorageUser=
+#
+# COMPUTE NODES
+#NodeName=linux[1-32] Procs=1 State=UNKNOWN
+#NodeName=DEFAULT Sockets=2 CoresPerSocket=20 State=UNKNOWN
+NodeName=compute000 Sockets=2 CoresPerSocket=8
+NodeName=compute[002-005] CoresPerSocket=20
+PartitionName=normal Nodes=ALL Default=YES MaxTime=INFINITE State=UP
+#PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP
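
The `NodeName` lines above must match the hardware Slurm detects on each compute node. A small sketch (not part of this PR) that prints the definition a node would report, which is useful for filling in `Sockets`/`CoresPerSocket`:

```
---
# Illustrative check: print the NodeName line slurmd detects on each compute host.
- hosts: compute
  gather_facts: false
  tasks:
    - name: Print the node definition slurmd detects on this host
      command: slurmd -C
      register: slurmd_hw
      changed_when: false

    - debug:
        var: slurmd_hw.stdout_lines
```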

+ 91 - 0
slurm/roles/slurm-common/tasks/main.yaml

@@ -0,0 +1,91 @@
+---
+
+- name: install packages for slurm
+  yum: 
+    name:
+      - munge
+      - mariadb
+      - mariadb-devel
+      - python3
+    state: present
+  tags: install
+
+- name: create munge key
+  command: /usr/sbin/create-munge-key -f
+  tags: install
+
+- name: Copy munge key
+  copy:
+    src: munge.key
+    dest: /etc/munge
+    owner: munge
+    group: munge
+    mode: 0400
+  tags: install
+
+- name: Copy example Slurm Configuration - slurm.conf
+  copy:
+    src: slurm.conf
+    dest: /etc/slurm/
+    mode: 0644
+  tags: install
+
+
+- name: create SLURM Group
+  group: 
+    name: slurm 
+    state: present
+  tags: install
+
+- name: Add the user 'slurm' with uid 6001 and a primary group of 'slurm'
+  user:
+    name: slurm
+    comment: Slurm User Account
+    uid: 6001
+    group: slurm
+  tags: install
+
+- name: create SLURM log directory
+  file:
+    path: /var/log/slurm
+    state: directory
+    owner: slurm
+    group: slurm
+    mode: 0755
+    recurse: yes
+  tags: install
+
+- name: give slurm user permission to spool
+  file: 
+    path: /var/spool/slurm
+    owner: slurm
+    group: slurm
+    state: directory
+    mode: 0755
+    recurse: yes
+
+- name: give slurm user permission to slurmctld
+  file: 
+    path: /var/run/slurmctld.pid
+    owner: slurm
+    group: slurm
+    mode: 0755
+    state: touch
+
+- name: give slurm user permission to slurmd
+  file: 
+    path: /var/run/slurmd.pid
+    owner: slurm
+    group: slurm
+    mode: 0755
+    state: touch
+
+- name: start munge service
+  service:
+    name: munge 
+    state: restarted
+    enabled: yes
+  tags: install
+
+
+
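
After the munge key is copied and the service restarted above, a quick credential round-trip confirms authentication works. A sketch of such a check, not part of this PR:

```
- name: Verify munge can encode and decode a credential
  shell: munge -n | unmunge
  register: munge_check
  changed_when: false
  tags: install
```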

+ 98 - 0
slurm/roles/slurm-master/tasks/main.yaml

@@ -0,0 +1,98 @@
+---
+
+- name: Download Slurm source
+  get_url:
+    url: "{{ slurm_url }}"
+    dest: /root/Downloads/
+    checksum: "{{ slurm_md5 }}" 
+  tags: install
+
+- name: Build SLURM RPMs
+  command: rpmbuild -ta /root/Downloads/slurm-20.02.0.tar.bz2 
+  tags: install
+
+- name: Copy RPMs to NFS share
+  copy:
+    src: "{{ item }}" 
+    dest: /home/rpms/
+  with_fileglob:
+    - /root/rpmbuild/RPMS/x86_64/slurm*20*.rpm
+  tags: install
+
+- name: Install SLURM RPMs on Master
+  yum: 
+    name: "{{ item }}"
+    #name: "{{ query('fileglob', ['/home/rpms/slurm*20*.rpm']) }}" <-- how it should work to avoid loop
+  with_fileglob:
+    - /home/rpms/slurm*20*.rpm
+  tags: install
+
+- name: Firewall Rule slurm allow 6817/tcp
+  command: firewall-cmd  --zone=internal --add-port=6817/tcp --permanent
+  tags: install
+
+- name: Firewall Rule slurm allow 6818/tcp
+  command: firewall-cmd  --zone=internal --add-port=6818/tcp --permanent
+  tags: install
+
+- name: Firewall Rule slurm allow 6819/tcp
+  command: firewall-cmd  --zone=internal --add-port=6819/tcp --permanent
+  tags: install
+
+- name: Firewall Rule slurm allow all incoming traffic on internal network
+  command: firewall-cmd --permanent --zone=internal --add-rich-rule='rule family="ipv4" source address="192.168.1.0/24" accept'
+  tags: install
+
+- name: Firewall Reload
+  command: firewall-cmd  --reload
+  tags: install
+
+
+- name: Start MariaDB 
+  service:
+    name: mariadb
+    state: restarted
+    enabled: yes
+  tags: install
+
+- name: Grant Permissions for SLURM DB
+  command: mysql -u root -e "GRANT ALL ON slurm_acct_db.* TO 'slurm'@'localhost' identified by 'password' with grant option;"
+  tags: install
+
+- name: Create slurmdbd.conf file
+  copy:
+    src: /etc/slurm/slurmdbd.conf.example
+    dest: /etc/slurm/slurmdbd.conf
+    mode: 0600
+  tags: install
+
+- name: Populate Accounting Database
+  command: slurmdbd
+  tags: install
+
+- name: Create Slurm Cluster
+  command: sacctmgr -i add cluster {{inventory_hostname}}
+  tags: install
+
+- name: Create Default Slurm Group
+  command: sacctmgr -i add account defaultgroup Cluster={{inventory_hostname}} Description="Default Account" Organization="Default Org"
+  tags: install
+
+- name: Add root to the Default Account 
+  command: sacctmgr -i add user root DefaultAccount=defaultgroup
+  tags: install
+
+- name: Start slurmctld on Master
+  service: 
+    name: slurmctld
+    state: restarted
+    enabled: yes
+  tags: install
+
+- name: Enable Slurmdbd on Master
+  service: 
+    name: slurmdbd
+    state: restarted
+    enabled: yes
+  tags: install
+
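
The inline comment above notes that the per-RPM loop should ideally be a single transaction. One way to get there without the controller-side `fileglob` lookup, sketched under the assumption that `/home/rpms` is visible on the target host, as in this role:

```
- name: Find built Slurm RPMs on the share
  find:
    paths: /home/rpms
    patterns: "slurm*20*.rpm"
  register: slurm_rpms
  tags: install

- name: Install all Slurm RPMs in one yum transaction
  yum:
    name: "{{ slurm_rpms.files | map(attribute='path') | list }}"
    state: present
  tags: install
```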

+ 9 - 0
slurm/roles/start-slurm-workers/tasks/main.yml

@@ -0,0 +1,9 @@
+---
+- name: Install SLURM RPMs on compute
+  yum:
+    name: "{{ item }}"
+    #name: "{{ query('fileglob', ['/home/rpms/slurm*20*.rpm']) }}" <-- how it should work to avoid loop
+  with_fileglob:
+    - /home/rpms/slurm*20*.rpm
+  tags: install
+
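
This role stops after installing the RPMs; starting the compute daemon is not shown in this diff. If it is not handled elsewhere, a follow-up along these lines (a sketch, not the author's code) would bring the workers online:

```
- name: Start slurmd on compute nodes
  service:
    name: slurmd
    state: restarted
    enabled: yes
  tags: install
```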

+ 23 - 0
slurm/slurm-cluster.yaml

@@ -0,0 +1,23 @@
+---
+#Playbook for installing Slurm on a cluster 
+
+#collect info from everything
+- hosts: all
+
+# Apply Common Installation and Config
+- hosts: cluster
+  gather_facts: false
+  roles:
+    - slurm-common
+
+# Apply Master Config, start services
+- hosts: master
+  gather_facts: false
+  roles:
+    - slurm-master
+
+# Start SLURM workers
+- hosts: compute
+  gather_facts: false
+  roles:
+    - start-slurm-workers

+ 18 - 0
slurm/slurm_inventory_file

@@ -0,0 +1,18 @@
+[master]
+friday
+
+[master:vars]
+slurm_url=https://download.schedmd.com/slurm/slurm-20.02.0.tar.bz2
+slurm_md5=md5:8ed2257471ff24ca213b510a4c1c3563
+
+[compute]
+compute000
+compute[002:005]
+
+
+[workers:children]
+compute
+
+[cluster:children]
+master
+workers

+ 7 - 0
tools/README.md

@@ -0,0 +1,7 @@
+# Tools for Omnia
+
+## change_personality
+```
+change_personality k|s <node_list>
+```
+Change the personality of a node (or list of nodes) to Kubernetes (`k`) or Slurm (`s`). The system does not wait for currently running jobs to complete before making nodes available to the new personality.

+ 35 - 0
tools/change_personality

@@ -0,0 +1,35 @@
+#!/bin/bash
+
+#Usage: change_personality <k|s> <node_name>
+#       k = Kubernetes
+#       s = Slurm
+
+new_personality=$1
+dnsdomain=`dnsdomainname`
+shift
+
+if [ $new_personality == "k" ] 
+then
+# Change Personality to Kubernetes
+  echo "[INFO] Changing personality to Kubernetes"
+  for node in $*
+  do
+    echo -n "$node"
+    scontrol update nodename=$node state=DRAIN reason="used for k8s"
+    kubectl uncordon $node.$dnsdomain > /dev/null  
+    echo " [OK]"
+  done
+elif [ $new_personality == "s" ]
+then
+# Change Personality to Slurm
+  echo "[INFO] Changing personality to Slurm"
+  for node in $*
+  do
+    echo -n "$node"
+    kubectl cordon $node.$dnsdomain > /dev/null
+    scontrol update nodename=$node state=IDLE reason="used for Slurm"
+    echo " [OK]"
+  done
+else
+  echo "[ERROR] $new_personality is not a valid personality. Use 'k' or 's'"
+fi 

+ 11 - 0
tools/install_tools.yml

@@ -0,0 +1,11 @@
+---
+
+- hosts: master
+  tasks:
+  - name: Install Change Personality Script
+    copy:
+      src: tools/change_personality
+      dest: /usr/sbin/
+      owner: root
+      group: root
+      mode: '0700'