
Issue #200: Fixed k8s_nfs_client_setup bug and other minor bugs

Signed-off-by: Blesson James <blesson_james@dellteam.com>
John Lockman committed 4 years ago (commit 9c75d8a4c3)

+ 4 - 4
omnia.yml

@@ -22,7 +22,7 @@
 
 - name: Gather facts from all the nodes
   hosts: all
-    
+
 - name: Apply common installation and config
   hosts: manager, compute
   gather_facts: false
@@ -56,7 +56,7 @@
   gather_facts: false
   roles:
     - k8s_nfs_server_setup
-  tags: 
+  tags:
     - kubernetes
     - nfs
 
@@ -65,7 +65,7 @@
   gather_facts: false
   roles:
     - k8s_nfs_client_setup
-  tags: 
+  tags:
     - kubernetes
     - nfs
 
@@ -143,4 +143,4 @@
       set_fact:
         ssh_to: "{{ groups['manager'] }}"
   roles:
-    - cluster_preperation
+    - cluster_preperation

+ 1 - 13
platforms/roles/kubeflow/tasks/main.yml

@@ -114,20 +114,8 @@
     regexp: 'NodePort'
     replace: 'LoadBalancer'
 
-- name: Remove cert-manager application block
-  replace:
-    path: "{{ kubeflow_config_file }}"
-    regexp: "{{ cert_manager_block }}"
-    replace: "\n"
-
-- name: Remove seldon-core-operator application block
-  replace:
-    path: "{{ kubeflow_config_file }}"
-    regexp: "{{ seldon_core_operator_block }}"
-    replace: "\n"
-
 - name: Apply kubeflow configuration
   command:
     cmd: "/usr/bin/kfctl apply -V -f '{{ kubeflow_config_file }}'"
     chdir: "{{ omnia_kubeflow_dir_path }}"
-  changed_when: true
+  changed_when: true

+ 0 - 22
platforms/roles/kubeflow/vars/main.yml

@@ -32,25 +32,3 @@ kfserving_gateway_yaml_file_path: "{{ omnia_kubeflow_dir_path }}/kustomize/kfser
 argo_yaml_file_path: "{{ omnia_kubeflow_dir_path }}/kustomize/argo/base/service.yaml"
 
 kubeflow_config_file: "{{ omnia_kubeflow_dir_path }}/kfctl_k8s_istio.v1.0.2.yaml"
-
-cert_manager_block: >
-    - kustomizeConfig:
-          overlays:
-          - self-signed
-          - application
-          parameters:
-          - name: namespace
-            value: cert-manager
-          repoRef:
-            name: manifests
-            path: cert-manager/cert-manager
-        name: cert-manager
-
-seldon_core_operator_block: >
-    - kustomizeConfig:
-          overlays:
-          - application
-          repoRef:
-            name: manifests
-            path: seldon/seldon-core-operator
-        name: seldon-core-operator

+ 2 - 2
roles/common/tasks/nvidia.yml

@@ -26,7 +26,7 @@
     enabled: yes
   tags: install
 
-- name: Add nvidia-container-runtime Repo 
+- name: Add nvidia-container-runtime Repo
   yum_repository:
     name: nvidia-container-runtime
     description:  nvidia-container-runtime
@@ -39,7 +39,7 @@
     enabled: yes
   tags: install
 
-- name: Add nvidia-docker Repo 
+- name: Add nvidia-docker Repo
   yum_repository:
     name: nvidia-docker
     description:  nvidia-docker

+ 1 - 0
roles/common/vars/main.yml

@@ -24,6 +24,7 @@ common_packages:
   - chrony
   - pciutils
   - docker-ce
+  - openssl
 
 custom_fact_dir: /etc/ansible/facts.d
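
A note on consumption: a list like `common_packages` is typically fed to a single install task elsewhere in the role. A minimal sketch, assuming a generic `package` task (not the role's actual source):

```
- name: Install common packages
  package:
    name: "{{ common_packages }}"
    state: present
  tags: install
```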
 

+ 2 - 2
roles/k8s_common/tasks/main.yml

@@ -21,8 +21,8 @@
     enabled: yes
     gpgcheck: no
     repo_gpgcheck: no
-    gpgkey: 
-      - https://packages.cloud.google.com/yum/doc/yum-key.gpg 
+    gpgkey:
+      - https://packages.cloud.google.com/yum/doc/yum-key.gpg
       - https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg
   tags: install
 

+ 4 - 4
roles/k8s_firewalld/tasks/main.yml

@@ -50,7 +50,7 @@
     permanent: yes
     state: enabled
   with_items: "{{ flannel_udp_ports }}"
-  when: k8s_cni == "flannel"
+  when: hostvars['127.0.0.1']['k8s_cni'] == "flannel"
   tags: firewalld
 
 - name: Open calico UDP ports on the firewall
@@ -59,7 +59,7 @@
     permanent: yes
     state: enabled
   with_items: "{{ calico_udp_ports }}"
-  when: k8s_cni == "calico"
+  when: hostvars['127.0.0.1']['k8s_cni'] == "calico"
   tags: firewalld
 
 - name: Open calico TCP ports on the firewall
@@ -68,7 +68,7 @@
     permanent: yes
     state: enabled
   with_items: "{{ calico_tcp_ports }}"
-  when: k8s_cni == "calico"
+  when: hostvars['127.0.0.1']['k8s_cni'] == "calico"
   tags: firewalld
 
 - name: Reload firewalld
@@ -81,4 +81,4 @@
     name: firewalld
     state: stopped
     enabled: no
-  tags: firewalld
+  tags: firewalld

+ 1 - 4
roles/k8s_firewalld/vars/main.yml

@@ -13,9 +13,6 @@
 #  limitations under the License.
 ---
 
-# Kubernetes SDN network
-k8s_cni: calico
-
 # Master nodes firewall ports
 k8s_master_ports:
   - 6443
@@ -39,4 +36,4 @@ calico_tcp_ports:
 # Flannel CNI firewall ports
 flannel_udp_ports:
   - 8285
-  - 8472
+  - 8472
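
With the role-level default removed, `k8s_cni` must now come from the control node, since the firewall tasks read `hostvars['127.0.0.1']['k8s_cni']`. A minimal sketch of one way to provide it, assuming `127.0.0.1` appears in the inventory (the play and variable placement are illustrative):

```
- name: Record the CNI choice on the control node
  hosts: 127.0.0.1
  connection: local
  gather_facts: false
  tasks:
    - name: Set the cluster-wide CNI selection
      set_fact:
        k8s_cni: calico    # or "flannel"
```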

+ 14 - 3
roles/k8s_nfs_client_setup/tasks/main.yml

@@ -19,6 +19,15 @@
     state: present
   tags: nfs_client
 
+- name: Check mounted share
+  shell: mount | grep nfs
+  changed_when: false
+  args:
+    warn: false
+  register: mounted_share
+  ignore_errors: True
+  tags: nfs_client
+
 - name: Creating directory to mount NFS Share
   file:
     path: "{{ nfs_mnt_dir }}"
@@ -27,14 +36,16 @@
   tags: nfs_client
 
 - name: Mounting NFS Share
-  command: "mount {{ groups['manager'] }}:{{ nfs_mnt_dir }} {{ nfs_mnt_dir }}"
+  command: "mount {{ groups['manager'][0] }}:{{ nfs_mnt_dir }} {{ nfs_mnt_dir }}"
   changed_when: true
   args:
     warn: false
+  when: groups['manager'][0] not in mounted_share.stdout
   tags: nfs_client
 
 - name: Configuring Automount NFS Shares on reboot
   lineinfile:
     path: "{{ fstab_file_path }}"
-    line: "{{ groups['manager'] }}:{{ nfs_mnt_dir }}     {{ nfs_mnt_dir }}  nfs     nosuid,rw,sync,hard,intr 0 0"
-  tags: nfs_client
+    line: "{{ groups['manager'][0] }}:{{ nfs_mnt_dir }}     {{ nfs_mnt_dir }}  nfs     nosuid,rw,sync,hard,intr 0 0"
+  when: groups['manager'][0] not in mounted_share.stdout
+  tags: nfs_client
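
The new guard greps raw `mount` output for the manager's address before mounting and before editing fstab. For comparison, a hedged sketch using Ansible's built-in `mount` module, which performs both steps idempotently on its own (an alternative, not what the role does):

```
- name: Mount the NFS share and persist it in fstab
  mount:
    path: "{{ nfs_mnt_dir }}"
    src: "{{ groups['manager'][0] }}:{{ nfs_mnt_dir }}"
    fstype: nfs
    opts: nosuid,rw,sync,hard,intr
    state: mounted
  tags: nfs_client
```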

+ 22 - 5
roles/k8s_start_manager/tasks/main.yml

@@ -22,9 +22,17 @@
   setup:
     filter: ansible_default_ipv4.address
 
+- name: Check K8s nodes status
+  command: kubectl get nodes
+  changed_when: false
+  ignore_errors: True
+  register: k8s_nodes
+  tags: init
+
 - name: Initialize kubeadm
   command: "/bin/kubeadm init --pod-network-cidr='{{ pod_network_cidr_ip }}' --apiserver-advertise-address='{{ ansible_default_ipv4.address }}'"
   changed_when: true
+  when: "'master' not in k8s_nodes.stdout"
   register: init_output
   tags: init
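
The `'master' not in k8s_nodes.stdout` guard relies on `kubectl get nodes` reporting the control plane's role in its ROLES column; illustrative output (hostname and version are placeholders):

```
NAME      STATUS   ROLES    AGE   VERSION
node001   Ready    master   12d   v1.16.7
```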
 
@@ -74,6 +82,7 @@
     token:  "{{ K8S_TOKEN.stdout }}"
     hash:   "{{ K8S_MANAGER_CA_HASH.stdout }}"
     ip:     "{{ ansible_default_ipv4.address }}"
+    k8s_nodes:  "{{ k8s_nodes.stdout }}"
   tags: init
 
 - name: Print k8s token
@@ -96,12 +105,12 @@
 
 - name: Setup Calico SDN network
   command: "kubectl apply -f '{{ calico_yml_url }}'"
-  when: k8s_cni == "calico"
+  when: hostvars['127.0.0.1']['k8s_cni'] == "calico"
   tags: init
 
 - name: Setup Flannel SDN network
   command: "kubectl apply -f '{{ flannel_yml_url }}'"
-  when: k8s_cni == "flannel"
+  when: hostvars['127.0.0.1']['k8s_cni'] == "flannel"
   tags: init
 
 - name: Create yaml repo for setup
@@ -120,9 +129,16 @@
     mode: "{{ k8s_service_account_file_mode }}"
   tags: init
 
+- name: Check K8s service accounts status
+  command: "kubectl get serviceaccounts"
+  changed_when: false
+  register: k8s_service_accounts
+  tags: init
+
 - name: Create service account (K8s dashboard)
   command: "kubectl create -f '{{ k8s_service_account_file_dest }}'"
   changed_when: true
+  when: "'default' not in k8s_service_accounts.stdout"
   tags: init
 
 - name: Create clusterRoleBinding (K8s dashboard) files
@@ -137,6 +153,7 @@
 - name: Create clusterRoleBinding (K8s dashboard)
   command: "kubectl create -f '{{ k8s_clusterRoleBinding_file_dest }}'"
   changed_when: true
+  ignore_errors: True
   tags: init
 
 - name: Dump bearer token for K8s dashboard login
@@ -146,7 +163,7 @@
   changed_when: true
   tags: init
 
-- name: Edge / Workstation Install allows pods to schedule on manager
+- name: Edge / Workstation Install allows pods to schedule on manager
   command: kubectl taint nodes --all node-role.kubernetes.io/master-
-  when: single_node
-  tags: init
+  when: groups['manager'][0] == groups['compute'][0] and groups['compute']|length == 1
+  tags: init
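
The rewritten condition infers a single-node ("Edge/Workstation") install from the inventory instead of a `single_node` flag: it is true exactly when the only compute host is also the manager. An inventory that triggers it (hostname is a placeholder):

```
[manager]
node001

[compute]
node001
```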

+ 1 - 5
roles/k8s_start_manager/vars/main.yml

@@ -13,10 +13,6 @@
 #  limitations under the License.
 ---
 
-single_node: false
-
-k8s_cni: calico
-
 pod_network_cidr_ip: 10.244.0.0/16
 
 k8s_root_directory: /root/.kube
@@ -47,4 +43,4 @@ k8s_clusterRoleBinding_file_mode: 0655
 
 calico_yml_url: https://docs.projectcalico.org/manifests/calico.yaml
 
-flannel_yml_url: https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml
+flannel_yml_url: https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml

+ 17 - 3
roles/k8s_start_services/tasks/main.yml

@@ -19,9 +19,16 @@
   ignore_errors: True
   tags: init
 
+- name: Get K8s pods
+  command: kubectl get pods --all-namespaces
+  changed_when: false
+  register: k8s_pods
+  tags: init
+
 - name: Deploy MetalLB
   command: "kubectl apply -f '{{ metallb_yaml_url }}'"
   changed_when: true
+  when: "'metallb' not in k8s_pods.stdout"
   tags: init
 
 - name: Create MetalLB Setup Config Files
@@ -45,17 +52,19 @@
 - name: Deploy MetalLB
   command: "kubectl apply -f '{{ metallb_deployment_file_dest }}'"
   changed_when: true
+  when: "'metallb' not in k8s_pods.stdout"
   tags: init
 
 - name: Create default setup for MetalLB
   command: "kubectl apply -f '{{ metallb_config_file_dest }}'"
   changed_when: true
+  when: "'metallb' not in k8s_pods.stdout"
   tags: init
 
 - name: Start k8s dashboard
   command: "kubectl create -f '{{ k8s_dashboard_yaml_url }}'"
   changed_when: true
-  register: result
+  when: "'kubernetes-dashboard' not in k8s_pods.stdout"
   tags: init
 
 - name: Helm - add stable repo
@@ -81,7 +90,7 @@
 - name: Start NFS Client Provisioner
   command: "helm install stable/nfs-client-provisioner --set nfs.server='{{ nfs_server }}' --set nfs.path='{{ nfs_path }}' --generate-name"
   changed_when: true
-  register: result
+  when: "'nfs-client-provisioner' not in k8s_pods.stdout"
   tags: init
 
 - name: Set NFS-Client Provisioner as DEFAULT StorageClass
@@ -97,25 +106,30 @@
     --set alertmanager.persistentVolume.storageClass=nfs-client,server.persistentVolume.storageClass=nfs-client,server.service.type=LoadBalancer \
     --generate-name
   changed_when: true
+  when: "'prometheus' not in k8s_pods.stdout"
   tags: init
 
 - name: Install MPI Operator
   command: "kubectl create -f '{{ mpi_operator_yaml_url }}'"
   changed_when: true
+  when: "'mpi-operator' not in k8s_pods.stdout"
   tags: init
 
 - name: Install nvidia-device-plugin
   command: "helm install --version='{{ nvidia_device_plugin_version }}' --generate-name --set migStrategy='{{ mig_strategy }}' nvdp/nvidia-device-plugin"
   changed_when: true
+  when: "'nvidia-device-plugin' not in k8s_pods.stdout"
   tags: init
 
 - name: Install GPU Feature Discovery
   command: "helm install --version='{{ gpu_feature_discovery_version }}' --generate-name --set migStrategy='{{ mig_strategy }}' nvgfd/gpu-feature-discovery"
   changed_when: true
+  when: "'node-feature-discovery' not in k8s_pods.stdout"
   tags: init
 
 - name: Deploy Xilinx Device plugin
   command: "kubectl create -f '{{ fpga_device_plugin_yaml_url }}'"
   changed_when: true
   register: fpga_enable
-  tags: init
+  when: "'fpga-device-plugin' not in k8s_pods.stdout"
+  tags: init
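
Every deploy task is now guarded by a substring test against a single `kubectl get pods --all-namespaces` snapshot taken at the top of the role. If finer-grained probes were ever wanted, a hedged per-component variant might look like this (namespace matches the MetalLB config later in this commit; the task itself is illustrative):

```
- name: Check whether MetalLB is already deployed
  command: kubectl get pods -n metallb-system
  changed_when: false
  ignore_errors: true
  register: metallb_pods
  tags: init
```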

+ 8 - 2
roles/k8s_start_workers/tasks/main.yml

@@ -18,10 +18,16 @@
   changed_when: true
   tags: init
 
+- name: Get hostname
+  command: hostname
+  changed_when: true
+  register: node_hostname
+  tags: init
+
 - name: Execute kubeadm join command
   shell: >
     kubeadm join --token={{ hostvars['K8S_TOKEN_HOLDER']['token'] }}
     --discovery-token-ca-cert-hash sha256:{{ hostvars['K8S_TOKEN_HOLDER']['hash'] }}
     {{ hostvars['K8S_TOKEN_HOLDER']['ip'] }}:{{ apiserver_bind_port }}
-  when: not single_node
-  tags: init
+  when: groups['manager'][0] != groups['compute'][0] and groups['compute']|length >= 1 and node_hostname.stdout not in hostvars['K8S_TOKEN_HOLDER']['k8s_nodes']
+  tags: init
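
The join condition reads facts published on the dummy `K8S_TOKEN_HOLDER` host by the manager role; the `k8s_start_manager` hunk above shows the new `k8s_nodes` field being added to that task. A sketch of the publishing side, assuming `add_host` is the mechanism (the surrounding lines are reconstructed for illustration):

```
- name: Share join parameters with worker plays
  add_host:
    name: K8S_TOKEN_HOLDER
    token: "{{ K8S_TOKEN.stdout }}"
    hash: "{{ K8S_MANAGER_CA_HASH.stdout }}"
    ip: "{{ ansible_default_ipv4.address }}"
    k8s_nodes: "{{ k8s_nodes.stdout }}"
```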

+ 1 - 3
roles/k8s_start_workers/vars/main.yml

@@ -13,6 +13,4 @@
 #  limitations under the License.
 ---
 
-single_node: false
-
-apiserver_bind_port: 6443
+apiserver_bind_port: 6443

+ 6 - 0
site/CONTRIBUTORS.md

@@ -0,0 +1,6 @@
+# Omnia Maintainers
+- Luke Wilson and John Lockman (Dell Technologies)
+<img src="images/delltech.jpg" height="90px" alt="Dell Technologies">
+
+# Omnia Contributors
+<img src="images/delltech.jpg" height="90px" alt="Dell Technologies"> <img src="images/pisa.png" height="100px" alt="Universita di Pisa">

+ 110 - 0
site/INSTALL.md

@@ -0,0 +1,110 @@
+## TL;DR Installation
+ 
+### Kubernetes
+Install Slurm and Kubernetes, along with all dependencies
+```
+ansible-playbook -i host_inventory_file omnia.yml
+```
+
+Install Slurm only
+```
+ansible-playbook -i host_inventory_file omnia.yml --skip-tags "k8s"
+```
+
+Install Kubernetes only
+```
+ansible-playbook -i host_inventory_file omnia.yml --skip-tags "slurm"
+```
+
+Initialize Kubernetes cluster (packages already installed)
+```
+ansible-playbook -i host_inventory_file omnia.yml --skip-tags "slurm" --tags "init"
+```
+
+### Install Kubeflow 
+```
+ansible-playbook -i host_inventory_file platforms/kubeflow.yml
+```
+
+# Omnia  
+Omnia is a collection of [Ansible](https://www.ansible.com/) playbooks which perform:
+* Installation of [Slurm](https://slurm.schedmd.com/) and/or [Kubernetes](https://kubernetes.io/) on servers already provisioned with a standard [CentOS](https://www.centos.org/) image.
+* Installation of auxiliary scripts for administrator functions such as moving nodes between Slurm and Kubernetes personalities.
+
+Omnia playbooks perform several tasks:
+`common` playbook handles installation of software (see the sketch after this list):
+* Add yum repositories:
+    - Kubernetes (Google)
+    - El Repo (for Nvidia drivers)
+    - EPEL (Extra Packages for Enterprise Linux)
+* Install Packages from repos:
+    - bash-completion
+    - docker
+    - gcc
+    - python-pip
+    - kubelet
+    - kubeadm
+    - kubectl
+    - nfs-utils
+    - nvidia-detect
+    - yum-plugin-versionlock
+* Restart and enable system level services
+    - Docker
+    - Kubelet
+
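+A minimal sketch of one of the repo additions above (parameters are illustrative, not the actual role source):
+```
+- name: Add Kubernetes yum repository
+  yum_repository:
+    name: kubernetes
+    description: Kubernetes (Google)
+    baseurl: https://packages.cloud.google.com/yum/repos/kubernetes-el7-x86_64
+    gpgcheck: no
+  tags: install
+```
+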
+`computeGPU` playbook installs Nvidia drivers and nvidia-container-runtime-hook (see the sketch after this list)
+* Add yum repositories:
+    - Nvidia (container runtime)
+* Install Packages from repos:
+    - kmod-nvidia
+    - nvidia-container-runtime-hook
+* Restart and enable system level services
+    - Docker
+    - Kubelet
+* Configuration:
+    - Enable GPU Device Plugins (nvidia-container-runtime-hook)
+    - Modify kubeadm config to allow GPUs as schedulable resource 
+* Restart and enable system level services
+    - Docker
+    - Kubelet
+
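+A condensed sketch of the package step described above; the repo addition itself is visible in this commit's `roles/common/tasks/nvidia.yml` hunk (task wording here is illustrative):
+```
+- name: Install Nvidia driver and container runtime hook
+  package:
+    name:
+      - kmod-nvidia
+      - nvidia-container-runtime-hook
+    state: present
+  tags: install
+```
+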
+`master` playbook
+* Install Helm v3 (see the sketch after this list)
+* (optional) add firewall rules for Slurm and kubernetes
+
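+A hedged sketch of the Helm v3 step, using the commonly documented get-helm-3 script (whether this role installs Helm this way is an assumption):
+```
+curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3
+chmod 700 get_helm.sh
+./get_helm.sh
+```
+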
+Everything from this point on can be called by using the `init` tag
+```
+ansible-playbook -i host_inventory_file kubernetes/kubernetes.yml --tags "init"
+```
+
+`startmaster` playbook
+* turn off swap
+* Initialize Kubernetes
+    * Head/master
+        - Start K8S and pass the startup token to compute/slaves
+        - Initialize software defined networking (Calico)
+
+`startworkers` playbook
+* turn off swap
+* Join k8s cluster
+
+`startservices` playbook
+* Setup K8S Dashboard
+* Add `stable` repo to helm (commands sketched after this list)
+* Add `jupyterhub` repo to helm
+* Update helm repos
+* Deploy NFS client Provisioner
+* Deploy Jupyterhub
+* Deploy Prometheus
+* Install MPI Operator
+
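+The helm repo steps above map to standard commands; a sketch (repo URLs are the publicly documented ones, assumed here):
+```
+helm repo add stable https://charts.helm.sh/stable
+helm repo add jupyterhub https://jupyterhub.github.io/helm-chart/
+helm repo update
+```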
+
+### Slurm
+* Download and build Slurm from source
+* Install package dependencies
+    - Python3
+    - munge
+    - MariaDB
+    - MariaDB development libraries
+* Build Slurm configuration files
+

+ 27 - 0
site/PREINSTALL.md

@@ -0,0 +1,27 @@
+# Pre-Installation Preparation
+
+## Assumptions
+Omnia assumes that prior to installation:
+* Systems have a base operating system (currently CentOS 7 or 8)
+* Network(s) have been cabled and nodes can reach the internet
+* SSH keys for `root` have been installed on all nodes to allow password-less SSH (see the sketch below)
+* Ansible is installed on either the master node or a separate deployment node
+```
+yum install ansible
+```
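+
+Password-less root SSH can be staged with standard tooling; a minimal sketch (hostname is a placeholder):
+```
+ssh-keygen -t rsa
+ssh-copy-id root@node001
+```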
+
+## Example system designs
+Omnia can configure systems which use Ethernet- or Infiniband-based fabric to connect the compute servers.
+
+![Example system configuration with Ethernet fabric](images/example-system-ethernet.png)
+
+![Example system configuration with Infiniband fabric](images/example-system-infiniband.png)
+
+## Network Setup
+Omnia assumes that servers are already connected to the network and have access to the internet.
+### Network Topology
+Possible network configurations include:
+* A flat topology where all nodes are connected to a switch which includes an uplink to the internet. This requires multiple externally-facing IP addresses.
+* A hierarchical topology where compute nodes are connected to a common switch, but the master node contains a second network connection which is connected to the internet. All outbound/inbound traffic would be routed through the master node. This requires setting up firewall rules for IP masquerade, see [here](https://www.server-world.info/en/note?os=CentOS_7&p=firewalld&f=2) for an example.
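+
+For the hierarchical case, a hedged sketch of the masquerade rule on the master node's externally-facing zone (zone name is an assumption; see the linked guide for a full setup):
+```
+firewall-cmd --permanent --zone=public --add-masquerade
+firewall-cmd --reload
+```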
+### IP and Hostname Assignment
+The recommended setup is to assign IP addresses to individual servers. This can be done manually by logging onto each node, or via DHCP.

File diff suppressed because it is too large
+ 43 - 0
site/README.md


+ 4 - 0
site/_config.yml

@@ -0,0 +1,4 @@
+theme: jekyll-theme-minimal
+title: Omnia
+description: Ansible playbook-based tools for deploying Slurm and Kubernetes clusters for High Performance Computing, Machine Learning, Deep Learning, and High-Performance Data Analytics
+logo: images/omnia-logo.png

BIN
site/images/delltech.jpg


BIN
site/images/example-system-ethernet.png


BIN
site/images/example-system-infiniband.png


BIN
site/images/omnia-branch-structure.png


BIN
site/images/omnia-k8s.png


BIN
site/images/omnia-logo.png


BIN
site/images/omnia-overview.png


BIN
site/images/omnia-slurm.png


BIN
site/images/pisa.png


+ 10 - 0
site/metalLB/README.md

@@ -0,0 +1,10 @@
+# MetalLB 
+
+MetalLB is a load-balancer implementation for bare metal Kubernetes clusters, using standard routing protocols.
+https://metallb.universe.tf/
+
+Omnia installs MetalLB by manifest in the playbook `startservices`. A default configuration is provided for the layer2 protocol, along with an example address pool. Modify `metal-config.yaml` to suit your network requirements and apply the changes with:
+
+``` 
+kubectl apply -f metal-config.yaml
+```
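+
+Once applied, a quick sanity check that the ConfigMap landed (standard kubectl; the namespace matches metal-config.yaml):
+```
+kubectl get configmap config -n metallb-system
+```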

+ 21 - 0
site/metalLB/metal-config.yaml

@@ -0,0 +1,21 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  namespace: metallb-system
+  name: config
+data:
+  config: |
+    address-pools:
+    - name: default
+      protocol: layer2
+      addresses:
+      - 192.168.2.150/32
+      - 192.168.2.151/32
+      - 192.168.2.152/32
+      - 192.168.2.153/32
+      - 192.168.2.154/32
+      - 192.168.2.155/32
+      - 192.168.2.156/32
+      - 192.168.2.157/32
+      - 192.168.2.158/32
+      - 192.168.2.159/32