Browse Source

Issue #200: Fixed k8s_nfs_client_setup bug and other minor bugs

Signed-off-by: Blesson James <blesson_james@dellteam.com>
John Lockman 4 years ago
parent
commit
9c75d8a4c3

+ 4 - 4
omnia.yml

@@ -22,7 +22,7 @@
 
 
 - name: Gather facts from all the nodes
 - name: Gather facts from all the nodes
   hosts: all
   hosts: all
-    
+
 - name: Apply common installation and config
 - name: Apply common installation and config
   hosts: manager, compute
   hosts: manager, compute
   gather_facts: false
   gather_facts: false
@@ -56,7 +56,7 @@
   gather_facts: false
   gather_facts: false
   roles:
   roles:
     - k8s_nfs_server_setup
     - k8s_nfs_server_setup
-  tags: 
+  tags:
     - kubernetes
     - kubernetes
     - nfs
     - nfs
 
 
@@ -65,7 +65,7 @@
   gather_facts: false
   gather_facts: false
   roles:
   roles:
     - k8s_nfs_client_setup
     - k8s_nfs_client_setup
-  tags: 
+  tags:
     - kubernetes
     - kubernetes
     - nfs
     - nfs
 
 
@@ -143,4 +143,4 @@
       set_fact:
       set_fact:
         ssh_to: "{{ groups['manager'] }}"
         ssh_to: "{{ groups['manager'] }}"
   roles:
   roles:
-    - cluster_preperation
+    - cluster_preperation

+ 1 - 13
platforms/roles/kubeflow/tasks/main.yml

@@ -114,20 +114,8 @@
     regexp: 'NodePort'
     regexp: 'NodePort'
     replace: 'LoadBalancer'
     replace: 'LoadBalancer'
 
 
-- name: Remove cert-manager application block
-  replace:
-    path: "{{ kubeflow_config_file }}"
-    regexp: "{{ cert_manager_block }}"
-    replace: "\n"
-
-- name: Remove seldon-core-operator application block
-  replace:
-    path: "{{ kubeflow_config_file }}"
-    regexp: "{{ seldon_core_operator_block }}"
-    replace: "\n"
-
 - name: Apply kubeflow configuration
 - name: Apply kubeflow configuration
   command:
   command:
     cmd: "/usr/bin/kfctl apply -V -f '{{ kubeflow_config_file }}'"
     cmd: "/usr/bin/kfctl apply -V -f '{{ kubeflow_config_file }}'"
     chdir: "{{ omnia_kubeflow_dir_path }}"
     chdir: "{{ omnia_kubeflow_dir_path }}"
-  changed_when: true
+  changed_when: true

+ 0 - 22
platforms/roles/kubeflow/vars/main.yml

@@ -32,25 +32,3 @@ kfserving_gateway_yaml_file_path: "{{ omnia_kubeflow_dir_path }}/kustomize/kfser
 argo_yaml_file_path: "{{ omnia_kubeflow_dir_path }}/kustomize/argo/base/service.yaml"
 argo_yaml_file_path: "{{ omnia_kubeflow_dir_path }}/kustomize/argo/base/service.yaml"
 
 
 kubeflow_config_file: "{{ omnia_kubeflow_dir_path }}/kfctl_k8s_istio.v1.0.2.yaml"
 kubeflow_config_file: "{{ omnia_kubeflow_dir_path }}/kfctl_k8s_istio.v1.0.2.yaml"
-
-cert_manager_block: >
-    - kustomizeConfig:
-          overlays:
-          - self-signed
-          - application
-          parameters:
-          - name: namespace
-            value: cert-manager
-          repoRef:
-            name: manifests
-            path: cert-manager/cert-manager
-        name: cert-manager
-
-seldon_core_operator_block: >
-    - kustomizeConfig:
-          overlays:
-          - application
-          repoRef:
-            name: manifests
-            path: seldon/seldon-core-operator
-        name: seldon-core-operator

+ 2 - 2
roles/common/tasks/nvidia.yml

@@ -26,7 +26,7 @@
     enabled: yes
     enabled: yes
   tags: install
   tags: install
 
 
-- name: Add nvidia-container-runtime Repo 
+- name: Add nvidia-container-runtime Repo
   yum_repository:
   yum_repository:
     name: nvidia-container-runtime
     name: nvidia-container-runtime
     description:  nvidia-container-runtime
     description:  nvidia-container-runtime
@@ -39,7 +39,7 @@
     enabled: yes
     enabled: yes
   tags: install
   tags: install
 
 
-- name: Add nvidia-docker Repo 
+- name: Add nvidia-docker Repo
   yum_repository:
   yum_repository:
     name: nvidia-docker
     name: nvidia-docker
     description:  nvidia-docker
     description:  nvidia-docker

+ 1 - 0
roles/common/vars/main.yml

@@ -24,6 +24,7 @@ common_packages:
   - chrony
   - chrony
   - pciutils
   - pciutils
   - docker-ce
   - docker-ce
+  - openssl
 
 
 custom_fact_dir: /etc/ansible/facts.d
 custom_fact_dir: /etc/ansible/facts.d
 
 

+ 2 - 2
roles/k8s_common/tasks/main.yml

@@ -21,8 +21,8 @@
     enabled: yes
     enabled: yes
     gpgcheck: no
     gpgcheck: no
     repo_gpgcheck: no
     repo_gpgcheck: no
-    gpgkey: 
-      - https://packages.cloud.google.com/yum/doc/yum-key.gpg 
+    gpgkey:
+      - https://packages.cloud.google.com/yum/doc/yum-key.gpg
       - https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg
       - https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg
   tags: install
   tags: install
 
 

+ 4 - 4
roles/k8s_firewalld/tasks/main.yml

@@ -50,7 +50,7 @@
     permanent: yes
     permanent: yes
     state: enabled
     state: enabled
   with_items: "{{ flannel_udp_ports }}"
   with_items: "{{ flannel_udp_ports }}"
-  when: k8s_cni == "flannel"
+  when: hostvars['127.0.0.1']['k8s_cni'] == "flannel"
   tags: firewalld
   tags: firewalld
 
 
 - name: Open calico UDP ports on the firewall
 - name: Open calico UDP ports on the firewall
@@ -59,7 +59,7 @@
     permanent: yes
     permanent: yes
     state: enabled
     state: enabled
   with_items: "{{ calico_udp_ports }}"
   with_items: "{{ calico_udp_ports }}"
-  when: k8s_cni == "calico"
+  when: hostvars['127.0.0.1']['k8s_cni'] == "calico"
   tags: firewalld
   tags: firewalld
 
 
 - name: Open calico TCP ports on the firewall
 - name: Open calico TCP ports on the firewall
@@ -68,7 +68,7 @@
     permanent: yes
     permanent: yes
     state: enabled
     state: enabled
   with_items: "{{ calico_tcp_ports }}"
   with_items: "{{ calico_tcp_ports }}"
-  when: k8s_cni == "calico"
+  when: hostvars['127.0.0.1']['k8s_cni'] == "calico"
   tags: firewalld
   tags: firewalld
 
 
 - name: Reload firewalld
 - name: Reload firewalld
@@ -81,4 +81,4 @@
     name: firewalld
     name: firewalld
     state: stopped
     state: stopped
     enabled: no
     enabled: no
-  tags: firewalld
+  tags: firewalld

+ 1 - 4
roles/k8s_firewalld/vars/main.yml

@@ -13,9 +13,6 @@
 #  limitations under the License.
 #  limitations under the License.
 ---
 ---
 
 
-# Kubernetes SDN network
-k8s_cni: calico
-
 # Master nodes firewall ports
 # Master nodes firewall ports
 k8s_master_ports:
 k8s_master_ports:
   - 6443
   - 6443
@@ -39,4 +36,4 @@ calico_tcp_ports:
 # Flannel CNI firewall ports
 # Flannel CNI firewall ports
 flannel_udp_ports:
 flannel_udp_ports:
   - 8285
   - 8285
-  - 8472
+  - 8472

+ 14 - 3
roles/k8s_nfs_client_setup/tasks/main.yml

@@ -19,6 +19,15 @@
     state: present
     state: present
   tags: nfs_client
   tags: nfs_client
 
 
+- name: Check mounted share
+  shell: mount | grep nfs
+  changed_when: false
+  args:
+    warn: false
+  register: mounted_share
+  ignore_errors: True
+  tags: nfs_client
+
 - name: Creating directory to mount NFS Share
 - name: Creating directory to mount NFS Share
   file:
   file:
     path: "{{ nfs_mnt_dir }}"
     path: "{{ nfs_mnt_dir }}"
@@ -27,14 +36,16 @@
   tags: nfs_client
   tags: nfs_client
 
 
 - name: Mounting NFS Share
 - name: Mounting NFS Share
-  command: "mount {{ groups['manager'] }}:{{ nfs_mnt_dir }} {{ nfs_mnt_dir }}"
+  command: "mount {{ groups['manager'][0] }}:{{ nfs_mnt_dir }} {{ nfs_mnt_dir }}"
   changed_when: true
   changed_when: true
   args:
   args:
     warn: false
     warn: false
+  when: groups['manager'][0] not in mounted_share.stdout
   tags: nfs_client
   tags: nfs_client
 
 
 - name: Configuring Automount NFS Shares on reboot
 - name: Configuring Automount NFS Shares on reboot
   lineinfile:
   lineinfile:
     path: "{{ fstab_file_path }}"
     path: "{{ fstab_file_path }}"
-    line: "{{ groups['manager'] }}:{{ nfs_mnt_dir }}     {{ nfs_mnt_dir }}  nfs     nosuid,rw,sync,hard,intr 0 0"
-  tags: nfs_client
+    line: "{{ groups['manager'][0] }}:{{ nfs_mnt_dir }}     {{ nfs_mnt_dir }}  nfs     nosuid,rw,sync,hard,intr 0 0"
+  when: groups['manager'][0] not in mounted_share.stdout
+  tags: nfs_client

+ 22 - 5
roles/k8s_start_manager/tasks/main.yml

@@ -22,9 +22,17 @@
   setup:
   setup:
     filter: ansible_default_ipv4.address
     filter: ansible_default_ipv4.address
 
 
+- name: Check K8s nodes status
+  command: kubectl get nodes
+  changed_when: false
+  ignore_errors: True
+  register: k8s_nodes
+  tags: init
+
 - name: Initialize kubeadm
 - name: Initialize kubeadm
   command: "/bin/kubeadm init --pod-network-cidr='{{ pod_network_cidr_ip }}' --apiserver-advertise-address='{{ ansible_default_ipv4.address }}'"
   command: "/bin/kubeadm init --pod-network-cidr='{{ pod_network_cidr_ip }}' --apiserver-advertise-address='{{ ansible_default_ipv4.address }}'"
   changed_when: true
   changed_when: true
+  when: "'master' not in k8s_nodes.stdout"
   register: init_output
   register: init_output
   tags: init
   tags: init
 
 
@@ -74,6 +82,7 @@
     token:  "{{ K8S_TOKEN.stdout }}"
     token:  "{{ K8S_TOKEN.stdout }}"
     hash:   "{{ K8S_MANAGER_CA_HASH.stdout }}"
     hash:   "{{ K8S_MANAGER_CA_HASH.stdout }}"
     ip:     "{{ ansible_default_ipv4.address }}"
     ip:     "{{ ansible_default_ipv4.address }}"
+    k8s_nodes:  "{{ k8s_nodes.stdout }}"
   tags: init
   tags: init
 
 
 - name: Print k8s token
 - name: Print k8s token
@@ -96,12 +105,12 @@
 
 
 - name: Setup Calico SDN network
 - name: Setup Calico SDN network
   command: "kubectl apply -f '{{ calico_yml_url }}'"
   command: "kubectl apply -f '{{ calico_yml_url }}'"
-  when: k8s_cni == "calico"
+  when: hostvars['127.0.0.1']['k8s_cni'] == "calico"
   tags: init
   tags: init
 
 
 - name: Setup Flannel SDN network
 - name: Setup Flannel SDN network
   command: "kubectl apply -f '{{ flannel_yml_url }}'"
   command: "kubectl apply -f '{{ flannel_yml_url }}'"
-  when: k8s_cni == "flannel"
+  when: hostvars['127.0.0.1']['k8s_cni'] == "flannel"
   tags: init
   tags: init
 
 
 - name: Create yaml repo for setup
 - name: Create yaml repo for setup
@@ -120,9 +129,16 @@
     mode: "{{ k8s_service_account_file_mode }}"
     mode: "{{ k8s_service_account_file_mode }}"
   tags: init
   tags: init
 
 
+- name: Check K8s service accounts status
+  command: "kubectl get serviceaccounts"
+  changed_when: false
+  register: k8s_service_accounts
+  tags: init
+
 - name: Create service account (K8s dashboard)
 - name: Create service account (K8s dashboard)
   command: "kubectl create -f '{{ k8s_service_account_file_dest }}'"
   command: "kubectl create -f '{{ k8s_service_account_file_dest }}'"
   changed_when: true
   changed_when: true
+  when: "'default' not in k8s_service_accounts.stdout"
   tags: init
   tags: init
 
 
 - name: Create clusterRoleBinding (K8s dashboard) files
 - name: Create clusterRoleBinding (K8s dashboard) files
@@ -137,6 +153,7 @@
 - name: Create clusterRoleBinding (K8s dashboard)
 - name: Create clusterRoleBinding (K8s dashboard)
   command: "kubectl create -f '{{ k8s_clusterRoleBinding_file_dest }}'"
   command: "kubectl create -f '{{ k8s_clusterRoleBinding_file_dest }}'"
   changed_when: true
   changed_when: true
+  ignore_errors: True
   tags: init
   tags: init
 
 
 - name: Dump bearer token for K8s dashboard login
 - name: Dump bearer token for K8s dashboard login
@@ -146,7 +163,7 @@
   changed_when: true
   changed_when: true
   tags: init
   tags: init
 
 
-- name: Edge / Workstation Install allows pods to schedule on manager
+- name: Edge / Workstation Install allows pods to schedule on manager
   command: kubectl taint nodes --all node-role.kubernetes.io/master-
   command: kubectl taint nodes --all node-role.kubernetes.io/master-
-  when: single_node
-  tags: init
+  when: groups['manager'][0] == groups['compute'][0] and groups['compute']|length == 1
+  tags: init

+ 1 - 5
roles/k8s_start_manager/vars/main.yml

@@ -13,10 +13,6 @@
 #  limitations under the License.
 #  limitations under the License.
 ---
 ---
 
 
-single_node: false
-
-k8s_cni: calico
-
 pod_network_cidr_ip: 10.244.0.0/16
 pod_network_cidr_ip: 10.244.0.0/16
 
 
 k8s_root_directory: /root/.kube
 k8s_root_directory: /root/.kube
@@ -47,4 +43,4 @@ k8s_clusterRoleBinding_file_mode: 0655
 
 
 calico_yml_url: https://docs.projectcalico.org/manifests/calico.yaml
 calico_yml_url: https://docs.projectcalico.org/manifests/calico.yaml
 
 
-flannel_yml_url: https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml
+flannel_yml_url: https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml

+ 17 - 3
roles/k8s_start_services/tasks/main.yml

@@ -19,9 +19,16 @@
   ignore_errors: True
   ignore_errors: True
   tags: init
   tags: init
 
 
+- name: Get K8s pods
+  command: kubectl get pods --all-namespaces
+  changed_when: false
+  register: k8s_pods
+  tags: init
+
 - name: Deploy MetalLB
 - name: Deploy MetalLB
   command: "kubectl apply -f '{{ metallb_yaml_url }}'"
   command: "kubectl apply -f '{{ metallb_yaml_url }}'"
   changed_when: true
   changed_when: true
+  when: "'metallb' not in k8s_pods.stdout"
   tags: init
   tags: init
 
 
 - name: Create MetalLB Setup Config Files
 - name: Create MetalLB Setup Config Files
@@ -45,17 +52,19 @@
 - name: Deploy MetalLB
 - name: Deploy MetalLB
   command: "kubectl apply -f '{{ metallb_deployment_file_dest }}'"
   command: "kubectl apply -f '{{ metallb_deployment_file_dest }}'"
   changed_when: true
   changed_when: true
+  when: "'metallb' not in k8s_pods.stdout"
   tags: init
   tags: init
 
 
 - name: Create default setup for MetalLB
 - name: Create default setup for MetalLB
   command: "kubectl apply -f '{{ metallb_config_file_dest }}'"
   command: "kubectl apply -f '{{ metallb_config_file_dest }}'"
   changed_when: true
   changed_when: true
+  when: "'metallb' not in k8s_pods.stdout"
   tags: init
   tags: init
 
 
 - name: Start k8s dashboard
 - name: Start k8s dashboard
   command: "kubectl create -f '{{ k8s_dashboard_yaml_url }}'"
   command: "kubectl create -f '{{ k8s_dashboard_yaml_url }}'"
   changed_when: true
   changed_when: true
-  register: result
+  when: "'kubernetes-dashboard' not in k8s_pods.stdout"
   tags: init
   tags: init
 
 
 - name: Helm - add stable repo
 - name: Helm - add stable repo
@@ -81,7 +90,7 @@
 - name: Start NFS Client Provisioner
 - name: Start NFS Client Provisioner
   command: "helm install stable/nfs-client-provisioner --set nfs.server='{{ nfs_server }}' --set nfs.path='{{ nfs_path }}' --generate-name"
   command: "helm install stable/nfs-client-provisioner --set nfs.server='{{ nfs_server }}' --set nfs.path='{{ nfs_path }}' --generate-name"
   changed_when: true
   changed_when: true
-  register: result
+  when: "'nfs-client-provisioner' not in k8s_pods.stdout"
   tags: init
   tags: init
 
 
 - name: Set NFS-Client Provisioner as DEFAULT StorageClass
 - name: Set NFS-Client Provisioner as DEFAULT StorageClass
@@ -97,25 +106,30 @@
     --set alertmanager.persistentVolume.storageClass=nfs-client,server.persistentVolume.storageClass=nfs-client,server.service.type=LoadBalancer \
     --set alertmanager.persistentVolume.storageClass=nfs-client,server.persistentVolume.storageClass=nfs-client,server.service.type=LoadBalancer \
     --generate-name
     --generate-name
   changed_when: true
   changed_when: true
+  when: "'prometheus' not in k8s_pods.stdout"
   tags: init
   tags: init
 
 
 - name: Install MPI Operator
 - name: Install MPI Operator
   command: "kubectl create -f '{{ mpi_operator_yaml_url }}'"
   command: "kubectl create -f '{{ mpi_operator_yaml_url }}'"
   changed_when: true
   changed_when: true
+  when: "'mpi-operator' not in k8s_pods.stdout"
   tags: init
   tags: init
 
 
 - name: Install nvidia-device-plugin
 - name: Install nvidia-device-plugin
   command: "helm install --version='{{ nvidia_device_plugin_version }}' --generate-name --set migStrategy='{{ mig_strategy }}' nvdp/nvidia-device-plugin"
   command: "helm install --version='{{ nvidia_device_plugin_version }}' --generate-name --set migStrategy='{{ mig_strategy }}' nvdp/nvidia-device-plugin"
   changed_when: true
   changed_when: true
+  when: "'nvidia-device-plugin' not in k8s_pods.stdout"
   tags: init
   tags: init
 
 
 - name: Install GPU Feature Discovery
 - name: Install GPU Feature Discovery
   command: "helm install --version='{{ gpu_feature_discovery_version }}' --generate-name --set migStrategy='{{ mig_strategy }}' nvgfd/gpu-feature-discovery"
   command: "helm install --version='{{ gpu_feature_discovery_version }}' --generate-name --set migStrategy='{{ mig_strategy }}' nvgfd/gpu-feature-discovery"
   changed_when: true
   changed_when: true
+  when: "'node-feature-discovery' not in k8s_pods.stdout"
   tags: init
   tags: init
 
 
 - name: Deploy Xilinx Device plugin
 - name: Deploy Xilinx Device plugin
   command: "kubectl create -f '{{ fpga_device_plugin_yaml_url }}'"
   command: "kubectl create -f '{{ fpga_device_plugin_yaml_url }}'"
   changed_when: true
   changed_when: true
   register: fpga_enable
   register: fpga_enable
-  tags: init
+  when: "'fpga-device-plugin' not in k8s_pods.stdout"
+  tags: init

+ 8 - 2
roles/k8s_start_workers/tasks/main.yml

@@ -18,10 +18,16 @@
   changed_when: true
   changed_when: true
   tags: init
   tags: init
 
 
+- name: Get hostname
+  command: hostname
+  changed_when: true
+  register: node_hostname
+  tags: init
+
 - name: Execute kubeadm join command
 - name: Execute kubeadm join command
   shell: >
   shell: >
     kubeadm join --token={{ hostvars['K8S_TOKEN_HOLDER']['token'] }}
     kubeadm join --token={{ hostvars['K8S_TOKEN_HOLDER']['token'] }}
     --discovery-token-ca-cert-hash sha256:{{ hostvars['K8S_TOKEN_HOLDER']['hash'] }}
     --discovery-token-ca-cert-hash sha256:{{ hostvars['K8S_TOKEN_HOLDER']['hash'] }}
     {{ hostvars['K8S_TOKEN_HOLDER']['ip'] }}:{{ apiserver_bind_port }}
     {{ hostvars['K8S_TOKEN_HOLDER']['ip'] }}:{{ apiserver_bind_port }}
-  when: not single_node
-  tags: init
+  when: groups['manager'][0] != groups['compute'][0] and groups['compute']|length >= 1 and node_hostname.stdout not in hostvars['K8S_TOKEN_HOLDER']['k8s_nodes']
+  tags: init

+ 1 - 3
roles/k8s_start_workers/vars/main.yml

@@ -13,6 +13,4 @@
 #  limitations under the License.
 #  limitations under the License.
 ---
 ---
 
 
-single_node: false
-
-apiserver_bind_port: 6443
+apiserver_bind_port: 6443

+ 6 - 0
site/CONTRIBUTORS.md

@@ -0,0 +1,6 @@
+# Omnia Maintainers
+- Luke Wilson and John Lockman (Dell Technologies)
+<img src="images/delltech.jpg" height="90px" alt="Dell Technologies">
+
+# Omnia Contributors
+<img src="images/delltech.jpg" height="90px" alt="Dell Technologies"> <img src="images/pisa.png" height="100px" alt="Universita di Pisa">

+ 110 - 0
site/INSTALL.md

@@ -0,0 +1,110 @@
+## TL;DR Installation
+ 
+### Kubernetes
+Install Slurm and Kubernetes, along with all dependencies
+```
+ansible-playbook -i host_inventory_file omnia.yml
+```
+
+Install Slurm only
+```
+ansible-playbook -i host_inventory_file omnia.yml --skip-tags "k8s"
+```
+
+Install Kubernetes only
+```
+ansible-playbook -i host_inventory_file omnia.yml --skip-tags "slurm"
+```
+
+Initialize Kubernetes cluster (packages already installed)
+```
+ansible-playbook -i host_inventory_file omnia.yml --skip-tags "slurm" --tags "init"
+```
+
+### Install Kubeflow 
+```
+ansible-playbook -i host_inventory_file platforms/kubeflow.yml
+```
+
+# Omnia  
+Omnia is a collection of [Ansible](https://www.ansible.com/) playbooks which perform:
+* Installation of [Slurm](https://slurm.schedmd.com/) and/or [Kubernetes](https://kubernetes.io/) on servers already provisioned with a standard [CentOS](https://www.centos.org/) image.
+* Installation of auxiliary scripts for administrator functions such as moving nodes between Slurm and Kubernetes personalities.
+
+Omnia playbooks perform several tasks:
+`common` playbook handles installation of software 
+* Add yum repositories:
+    - Kubernetes (Google)
+    - El Repo (for Nvidia drivers)
+    - EPEL (Extra Packages for Enterprise Linux)
+* Install Packages from repos:
+    - bash-completion
+    - docker
+    - gcc
+    - python-pip
+    - kubelet
+    - kubeadm
+    - kubectl
+    - nfs-utils
+    - nvidia-detect
+    - yum-plugin-versionlock
+* Restart and enable system level services
+    - Docker
+    - Kubelet
+
+`computeGPU` playbook installs Nvidia drivers and nvidia-container-runtime-hook
+* Add yum repositories:
+    - Nvidia (container runtime)
+* Install Packages from repos:
+    - kmod-nvidia
+    - nvidia-container-runtime-hook
+* Restart and enable system level services
+    - Docker
+    - Kubelet
+* Configuration:
+    - Enable GPU Device Plugins (nvidia-container-runtime-hook)
+    - Modify kubeadm config to allow GPUs as schedulable resource 
+* Restart and enable system level services
+    - Docker
+    - Kubelet
+
+`master` playbook
+* Install Helm v3
+* (optional) add firewall rules for Slurm and kubernetes
+
+Everything from this point on can be called by using the `init` tag
+```
+ansible-playbook -i host_inventory_file kubernetes/kubernetes.yml --tags "init"
+```
+
+`startmaster` playbook
+* turn off swap
+* Initialize Kubernetes
+    * Head/master
+        - Start K8S pass startup token to compute/slaves
+        - Initialize software defined networking (Calico)
+
+`startworkers` playbook
+* turn off swap
+* Join k8s cluster
+
+`startservices` playbook
+* Setup K8S Dashboard
+* Add `stable` repo to helm
+* Add `jupyterhub` repo to helm
+* Update helm repos
+* Deploy NFS client Provisioner
+* Deploy Jupyterhub
+* Deploy Prometheus
+* Install MPI Operator
+
+
+### Slurm
+* Downloads and builds Slurm from source
+* Install package dependencies
+    - Python3
+    - munge
+    - MariaDB
+    - MariaDB development libraries
+* Build Slurm configuration files
+

+ 27 - 0
site/PREINSTALL.md

@@ -0,0 +1,27 @@
+# Pre-Installation Preparation
+
+## Assumptions
+Omnia assumes that prior to installation:
+* Systems have a base operating system (currently CentOS 7 or 8)
+* Network(s) has been cabled and nodes can reach the internet
+* SSH Keys for `root` have been installed on all nodes to allow for password-less SSH
+* Ansible is installed on either the master node or a separate deployment node
+```
+yum install ansible
+```
+
+## Example system designs
+Omnia can configure systems which use Ethernet- or Infiniband-based fabric to connect the compute servers.
+
+![Example system configuration with Ethernet fabric](images/example-system-ethernet.png)
+
+![Example system configuration with Infiniband fabric](images/example-system-infiniband.png)
+
+## Network Setup
+Omnia assumes that servers are already connected to the network and have access to the internet.
+### Network Topology
+Possible network configurations include:
+* A flat topology where all nodes are connected to a switch which includes an uplink to the internet. This requires multiple externally-facing IP addresses
+* A hierarchical topology where compute nodes are connected to a common switch, but the master node contains a second network connection which is connected to the internet. All outbound/inbound traffic would be routed through the master node. This requires setting up firewall rules for IP masquerade, see [here](https://www.server-world.info/en/note?os=CentOS_7&p=firewalld&f=2) for an example.
+### IP and Hostname Assignment
+The recommended setup is to assign IP addresses to individual servers. This can be done manually by logging onto each node, or via DHCP.

File diff suppressed because it is too large
+ 43 - 0
site/README.md


+ 4 - 0
site/_config.yml

@@ -0,0 +1,4 @@
+theme: jekyll-theme-minimal
+title: Omnia
+description: Ansible playbook-based tools for deploying Slurm and Kubernetes clusters for High Performance Computing, Machine Learning, Deep Learning, and High-Performance Data Analytics
+logo: images/omnia-logo.png

BIN
site/images/delltech.jpg


BIN
site/images/example-system-ethernet.png


BIN
site/images/example-system-infiniband.png


BIN
site/images/omnia-branch-structure.png


BIN
site/images/omnia-k8s.png


BIN
site/images/omnia-logo.png


BIN
site/images/omnia-overview.png


BIN
site/images/omnia-slurm.png


BIN
site/images/pisa.png


+ 10 - 0
site/metalLB/README.md

@@ -0,0 +1,10 @@
+# MetalLB 
+
+MetalLB is a load-balancer implementation for bare metal Kubernetes clusters, using standard routing protocols.
+https://metallb.universe.tf/
+
+Omnia installs MetalLB by manifest in the playbook `startservices`. A default configuration is provided for layer2 protocol and an example for providing an address pool. Modify metal-config.yaml to suit your network requirements and apply the changes with:
+
+``` 
+kubectl apply -f metal-config.yaml
+```

+ 21 - 0
site/metalLB/metal-config.yaml

@@ -0,0 +1,21 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  namespace: metallb-system
+  name: config
+data:
+  config: |
+    address-pools:
+    - name: default
+      protocol: layer2
+      addresses:
+      - 192.168.2.150/32
+      - 192.168.2.151/32
+      - 192.168.2.152/32
+      - 192.168.2.153/32
+      - 192.168.2.154/32
+      - 192.168.2.155/32
+      - 192.168.2.156/32
+      - 192.168.2.157/32
+      - 192.168.2.158/32
+      - 192.168.2.159/32