
Issue #200: Fixed k8s_nfs_client_setup bug and other minor bugs

Signed-off-by: Blesson James <blesson_james@dellteam.com>
John Lockman 4 years ago
parent
commit
9c75d8a4c3

+ 4 - 4
omnia.yml

@@ -22,7 +22,7 @@
 
 - name: Gather facts from all the nodes
   hosts: all
-    
+
 - name: Apply common installation and config
   hosts: manager, compute
   gather_facts: false
@@ -56,7 +56,7 @@
   gather_facts: false
   roles:
     - k8s_nfs_server_setup
-  tags: 
+  tags:
     - kubernetes
     - nfs
 
@@ -65,7 +65,7 @@
   gather_facts: false
   roles:
     - k8s_nfs_client_setup
-  tags: 
+  tags:
     - kubernetes
     - nfs
 
@@ -143,4 +143,4 @@
       set_fact:
         ssh_to: "{{ groups['manager'] }}"
   roles:
-    - cluster_preperation
+    - cluster_preperation

+ 1 - 13
platforms/roles/kubeflow/tasks/main.yml

@@ -114,20 +114,8 @@
     regexp: 'NodePort'
     replace: 'LoadBalancer'
 
-- name: Remove cert-manager application block
-  replace:
-    path: "{{ kubeflow_config_file }}"
-    regexp: "{{ cert_manager_block }}"
-    replace: "\n"
-
-- name: Remove seldon-core-operator application block
-  replace:
-    path: "{{ kubeflow_config_file }}"
-    regexp: "{{ seldon_core_operator_block }}"
-    replace: "\n"
-
 - name: Apply kubeflow configuration
   command:
     cmd: "/usr/bin/kfctl apply -V -f '{{ kubeflow_config_file }}'"
     chdir: "{{ omnia_kubeflow_dir_path }}"
-  changed_when: true
+  changed_when: true

+ 0 - 22
platforms/roles/kubeflow/vars/main.yml

@@ -32,25 +32,3 @@ kfserving_gateway_yaml_file_path: "{{ omnia_kubeflow_dir_path }}/kustomize/kfser
 argo_yaml_file_path: "{{ omnia_kubeflow_dir_path }}/kustomize/argo/base/service.yaml"
 
 kubeflow_config_file: "{{ omnia_kubeflow_dir_path }}/kfctl_k8s_istio.v1.0.2.yaml"
-
-cert_manager_block: >
-    - kustomizeConfig:
-          overlays:
-          - self-signed
-          - application
-          parameters:
-          - name: namespace
-            value: cert-manager
-          repoRef:
-            name: manifests
-            path: cert-manager/cert-manager
-        name: cert-manager
-
-seldon_core_operator_block: >
-    - kustomizeConfig:
-          overlays:
-          - application
-          repoRef:
-            name: manifests
-            path: seldon/seldon-core-operator
-        name: seldon-core-operator

+ 2 - 2
roles/common/tasks/nvidia.yml

@@ -26,7 +26,7 @@
     enabled: yes
   tags: install
 
-- name: Add nvidia-container-runtime Repo 
+- name: Add nvidia-container-runtime Repo
   yum_repository:
     name: nvidia-container-runtime
     description:  nvidia-container-runtime
@@ -39,7 +39,7 @@
     enabled: yes
   tags: install
 
-- name: Add nvidia-docker Repo 
+- name: Add nvidia-docker Repo
   yum_repository:
     name: nvidia-docker
     description:  nvidia-docker

+ 1 - 0
roles/common/vars/main.yml

@@ -24,6 +24,7 @@ common_packages:
   - chrony
   - pciutils
   - docker-ce
+  - openssl
 
 custom_fact_dir: /etc/ansible/facts.d
 

+ 2 - 2
roles/k8s_common/tasks/main.yml

@@ -21,8 +21,8 @@
     enabled: yes
     gpgcheck: no
     repo_gpgcheck: no
-    gpgkey: 
-      - https://packages.cloud.google.com/yum/doc/yum-key.gpg 
+    gpgkey:
+      - https://packages.cloud.google.com/yum/doc/yum-key.gpg
       - https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg
   tags: install
 

+ 4 - 4
roles/k8s_firewalld/tasks/main.yml

@@ -50,7 +50,7 @@
     permanent: yes
     state: enabled
   with_items: "{{ flannel_udp_ports }}"
-  when: k8s_cni == "flannel"
+  when: hostvars['127.0.0.1']['k8s_cni'] == "flannel"
   tags: firewalld
 
 - name: Open calico UDP ports on the firewall
@@ -59,7 +59,7 @@
     permanent: yes
     state: enabled
   with_items: "{{ calico_udp_ports }}"
-  when: k8s_cni == "calico"
+  when: hostvars['127.0.0.1']['k8s_cni'] == "calico"
   tags: firewalld
 
 - name: Open calico TCP ports on the firewall
@@ -68,7 +68,7 @@
     permanent: yes
     state: enabled
   with_items: "{{ calico_tcp_ports }}"
-  when: k8s_cni == "calico"
+  when: hostvars['127.0.0.1']['k8s_cni'] == "calico"
   tags: firewalld
 
 - name: Reload firewalld
@@ -81,4 +81,4 @@
     name: firewalld
     state: stopped
     enabled: no
-  tags: firewalld
+  tags: firewalld

+ 1 - 4
roles/k8s_firewalld/vars/main.yml

@@ -13,9 +13,6 @@
 #  limitations under the License.
 ---
 
-# Kubernetes SDN network
-k8s_cni: calico
-
 # Master nodes firewall ports
 k8s_master_ports:
   - 6443
@@ -39,4 +36,4 @@ calico_tcp_ports:
 # Flannel CNI firewall ports
 flannel_udp_ports:
   - 8285
-  - 8472
+  - 8472

+ 14 - 3
roles/k8s_nfs_client_setup/tasks/main.yml

@@ -19,6 +19,15 @@
     state: present
   tags: nfs_client
 
+- name: Check mounted share
+  shell: mount | grep nfs
+  changed_when: false
+  args:
+    warn: false
+  register: mounted_share
+  ignore_errors: True
+  tags: nfs_client
+
 - name: Creating directory to mount NFS Share
   file:
     path: "{{ nfs_mnt_dir }}"
@@ -27,14 +36,16 @@
   tags: nfs_client
 
 - name: Mounting NFS Share
-  command: "mount {{ groups['manager'] }}:{{ nfs_mnt_dir }} {{ nfs_mnt_dir }}"
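+  # groups['manager'] is a list of hostnames; mount from the first (and only) manager node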
+  command: "mount {{ groups['manager'][0] }}:{{ nfs_mnt_dir }} {{ nfs_mnt_dir }}"
   changed_when: true
   args:
     warn: false
+  when: groups['manager'][0] not in mounted_share.stdout
   tags: nfs_client
 
 - name: Configuring Automount NFS Shares on reboot
   lineinfile:
     path: "{{ fstab_file_path }}"
-    line: "{{ groups['manager'] }}:{{ nfs_mnt_dir }}     {{ nfs_mnt_dir }}  nfs     nosuid,rw,sync,hard,intr 0 0"
-  tags: nfs_client
+    line: "{{ groups['manager'][0] }}:{{ nfs_mnt_dir }}     {{ nfs_mnt_dir }}  nfs     nosuid,rw,sync,hard,intr 0 0"
+  when: groups['manager'][0] not in mounted_share.stdout
+  tags: nfs_client

+ 22 - 5
roles/k8s_start_manager/tasks/main.yml

@@ -22,9 +22,17 @@
   setup:
     filter: ansible_default_ipv4.address
 
+- name: Check K8s nodes status
+  command: kubectl get nodes
+  changed_when: false
+  ignore_errors: True
+  register: k8s_nodes
+  tags: init
+
 - name: Initialize kubeadm
   command: "/bin/kubeadm init --pod-network-cidr='{{ pod_network_cidr_ip }}' --apiserver-advertise-address='{{ ansible_default_ipv4.address }}'"
   changed_when: true
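+  # Skip kubeadm init when a master node is already registered, so the play can be rerun safely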
+  when: "'master' not in k8s_nodes.stdout"
   register: init_output
   tags: init
 
@@ -74,6 +82,7 @@
     token:  "{{ K8S_TOKEN.stdout }}"
     hash:   "{{ K8S_MANAGER_CA_HASH.stdout }}"
     ip:     "{{ ansible_default_ipv4.address }}"
+    k8s_nodes:  "{{ k8s_nodes.stdout }}"
   tags: init
 
 - name: Print k8s token
@@ -96,12 +105,12 @@
 
 - name: Setup Calico SDN network
   command: "kubectl apply -f '{{ calico_yml_url }}'"
-  when: k8s_cni == "calico"
+  when: hostvars['127.0.0.1']['k8s_cni'] == "calico"
   tags: init
 
 - name: Setup Flannel SDN network
   command: "kubectl apply -f '{{ flannel_yml_url }}'"
-  when: k8s_cni == "flannel"
+  when: hostvars['127.0.0.1']['k8s_cni'] == "flannel"
   tags: init
 
 - name: Create yaml repo for setup
@@ -120,9 +129,16 @@
     mode: "{{ k8s_service_account_file_mode }}"
   tags: init
 
+- name: Check K8s service accounts status
+  command: "kubectl get serviceaccounts"
+  changed_when: false
+  register: k8s_service_accounts
+  tags: init
+
 - name: Create service account (K8s dashboard)
   command: "kubectl create -f '{{ k8s_service_account_file_dest }}'"
   changed_when: true
+  when: "'default' not in k8s_service_accounts.stdout"
   tags: init
 
 - name: Create clusterRoleBinding (K8s dashboard) files
@@ -137,6 +153,7 @@
 - name: Create clusterRoleBinding (K8s dashboard)
   command: "kubectl create -f '{{ k8s_clusterRoleBinding_file_dest }}'"
   changed_when: true
+  ignore_errors: True
   tags: init
 
 - name: Dump bearer token for K8s dashboard login
@@ -146,7 +163,7 @@
   changed_when: true
   tags: init
 
-- name: Edge / Workstation Install allows pods to schedule on manager
+- name: Edge / Workstation Install allows pods to schedule on manager
   command: kubectl taint nodes --all node-role.kubernetes.io/master-
-  when: single_node
-  tags: init
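+  # Single-node cluster: the manager is also the only compute node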
+  when: groups['manager'][0] == groups['compute'][0] and groups['compute']|length == 1
+  tags: init

+ 1 - 5
roles/k8s_start_manager/vars/main.yml

@@ -13,10 +13,6 @@
 #  limitations under the License.
 ---
 
-single_node: false
-
-k8s_cni: calico
-
 pod_network_cidr_ip: 10.244.0.0/16
 
 k8s_root_directory: /root/.kube
@@ -47,4 +43,4 @@ k8s_clusterRoleBinding_file_mode: 0655
 
 calico_yml_url: https://docs.projectcalico.org/manifests/calico.yaml
 
-flannel_yml_url: https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml
+flannel_yml_url: https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml

+ 17 - 3
roles/k8s_start_services/tasks/main.yml

@@ -19,9 +19,16 @@
   ignore_errors: True
   tags: init
 
+- name: Get K8s pods
+  command: kubectl get pods --all-namespaces
+  changed_when: false
+  register: k8s_pods
+  tags: init
+
 - name: Deploy MetalLB
   command: "kubectl apply -f '{{ metallb_yaml_url }}'"
   changed_when: true
+  when: "'metallb' not in k8s_pods.stdout"
   tags: init
 
 - name: Create MetalLB Setup Config Files
@@ -45,17 +52,19 @@
 - name: Deploy MetalLB
   command: "kubectl apply -f '{{ metallb_deployment_file_dest }}'"
   changed_when: true
+  when: "'metallb' not in k8s_pods.stdout"
   tags: init
 
 - name: Create default setup for MetalLB
   command: "kubectl apply -f '{{ metallb_config_file_dest }}'"
   changed_when: true
+  when: "'metallb' not in k8s_pods.stdout"
   tags: init
 
 - name: Start k8s dashboard
   command: "kubectl create -f '{{ k8s_dashboard_yaml_url }}'"
   changed_when: true
-  register: result
+  when: "'kubernetes-dashboard' not in k8s_pods.stdout"
   tags: init
 
 - name: Helm - add stable repo
@@ -81,7 +90,7 @@
 - name: Start NFS Client Provisioner
   command: "helm install stable/nfs-client-provisioner --set nfs.server='{{ nfs_server }}' --set nfs.path='{{ nfs_path }}' --generate-name"
   changed_when: true
-  register: result
+  when: "'nfs-client-provisioner' not in k8s_pods.stdout"
   tags: init
 
 - name: Set NFS-Client Provisioner as DEFAULT StorageClass
@@ -97,25 +106,30 @@
     --set alertmanager.persistentVolume.storageClass=nfs-client,server.persistentVolume.storageClass=nfs-client,server.service.type=LoadBalancer \
     --generate-name
   changed_when: true
+  when: "'prometheus' not in k8s_pods.stdout"
   tags: init
 
 - name: Install MPI Operator
   command: "kubectl create -f '{{ mpi_operator_yaml_url }}'"
   changed_when: true
+  when: "'mpi-operator' not in k8s_pods.stdout"
   tags: init
 
 - name: Install nvidia-device-plugin
   command: "helm install --version='{{ nvidia_device_plugin_version }}' --generate-name --set migStrategy='{{ mig_strategy }}' nvdp/nvidia-device-plugin"
   changed_when: true
+  when: "'nvidia-device-plugin' not in k8s_pods.stdout"
   tags: init
 
 - name: Install GPU Feature Discovery
   command: "helm install --version='{{ gpu_feature_discovery_version }}' --generate-name --set migStrategy='{{ mig_strategy }}' nvgfd/gpu-feature-discovery"
   changed_when: true
+  when: "'node-feature-discovery' not in k8s_pods.stdout"
   tags: init
 
 - name: Deploy Xilinx Device plugin
   command: "kubectl create -f '{{ fpga_device_plugin_yaml_url }}'"
   changed_when: true
   register: fpga_enable
-  tags: init
+  when: "'fpga-device-plugin' not in k8s_pods.stdout"
+  tags: init

+ 8 - 2
roles/k8s_start_workers/tasks/main.yml

@@ -18,10 +18,16 @@
   changed_when: true
   tags: init
 
+- name: Get hostname
+  command: hostname
+  changed_when: false
+  register: node_hostname
+  tags: init
+
 - name: Execute kubeadm join command
   shell: >
     kubeadm join --token={{ hostvars['K8S_TOKEN_HOLDER']['token'] }}
     --discovery-token-ca-cert-hash sha256:{{ hostvars['K8S_TOKEN_HOLDER']['hash'] }}
     {{ hostvars['K8S_TOKEN_HOLDER']['ip'] }}:{{ apiserver_bind_port }}
-  when: not single_node
-  tags: init
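+  # Join only on multi-node clusters, and only when this host is not already in the cluster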
+  when: groups['manager'][0] != groups['compute'][0] and groups['compute']|length >= 1 and node_hostname.stdout not in hostvars['K8S_TOKEN_HOLDER']['k8s_nodes']
+  tags: init

+ 1 - 3
roles/k8s_start_workers/vars/main.yml

@@ -13,6 +13,4 @@
 #  limitations under the License.
 ---
 
-single_node: false
-
-apiserver_bind_port: 6443
+apiserver_bind_port: 6443

+ 6 - 0
site/CONTRIBUTORS.md

@@ -0,0 +1,6 @@
+# Omnia Maintainers
+- Luke Wilson and John Lockman (Dell Technologies)
+<img src="images/delltech.jpg" height="90px" alt="Dell Technologies">
+
+# Omnia Contributors
+<img src="images/delltech.jpg" height="90px" alt="Dell Technologies"> <img src="images/pisa.png" height="100px" alt="Universita di Pisa">

+ 110 - 0
site/INSTALL.md

@@ -0,0 +1,110 @@
+## TL;DR Installation
+ 
+### Kubernetes
+Install Slurm and Kubernetes, along with all dependencies
+```
+ansible-playbook -i host_inventory_file omnia.yml
+```
+
+Install Slurm only
+```
+ansible-playbook -i host_inventory_file omnia.yml --skip-tags "k8s"
+```
+
+Install Kubernetes only
+```
+ansible-playbook -i host_inventory_file omnia.yml --skip-tags "slurm"
+```
+
+Initialize Kubernetes cluster (packages already installed)
+```
+ansible-playbook -i host_inventory_file omnia.yml --skip-tags "slurm" --tags "init"
+```
+
+### Install Kubeflow 
+```
+ansible-playbook -i host_inventory_file platforms/kubeflow.yml
+```
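+
+All of the commands above expect an inventory file that defines the `manager` and `compute` groups used by the playbooks. A minimal sketch (hostnames are placeholders):
+```
+[manager]
+master-node
+
+[compute]
+compute-node-1
+compute-node-2
+```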
+
+# Omnia  
+Omnia is a collection of [Ansible](https://www.ansible.com/) playbooks which perform:
+* Installation of [Slurm](https://slurm.schedmd.com/) and/or [Kubernetes](https://kubernetes.io/) on servers already provisioned with a standard [CentOS](https://www.centos.org/) image.
+* Installation of auxiliary scripts for administrator functions such as moving nodes between Slurm and Kubernetes personalities.
+
+Omnia playbooks perform several tasks:
+`common` playbook handles installation of software 
+* Add yum repositories:
+    - Kubernetes (Google)
+    - El Repo (for Nvidia drivers)
+    - EPEL (Extra Packages for Enterprise Linux)
+* Install Packages from repos:
+    - bash-completion
+    - docker
+    - gcc
+    - python-pip
+    - kubelet
+    - kubeadm
+    - kubectl
+    - nfs-utils
+    - nvidia-detect
+    - yum-plugin-versionlock
+* Restart and enable system level services
+    - Docker
+    - Kubelet
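+
+Service restarts like these are typically expressed with Ansible's `systemd` module; a minimal sketch, not necessarily the literal task from the role:
+```
+- name: Restart and enable docker
+  systemd:
+    name: docker
+    state: restarted
+    enabled: yes
+```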
+
+`computeGPU` playbook installs Nvidia drivers and nvidia-container-runtime-hook
+* Add yum repositories:
+    - Nvidia (container runtime)
+* Install Packages from repos:
+    - kmod-nvidia
+    - nvidia-container-runtime-hook
+* Configuration:
+    - Enable GPU Device Plugins (nvidia-container-runtime-hook)
+    - Modify kubeadm config to allow GPUs as schedulable resource 
+* Restart and enable system level services
+    - Docker
+    - Kubelet
+
+`master` playbook
+* Install Helm v3
+* (optional) add firewall rules for Slurm and kubernetes
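+
+Helm v3 is commonly installed with the upstream installer script; a sketch, assuming the master node has internet access:
+```
+curl -fsSL https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 -o get_helm.sh
+chmod +x get_helm.sh && ./get_helm.sh
+```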
+
+Everything from this point on can be called by using the `init` tag
+```
+ansible-playbook -i host_inventory_file kubernetes/kubernetes.yml --tags "init"
+```
+
+`startmaster` playbook
+* turn off swap
+* Initialize Kubernetes
+    * Head/master
+        - Start K8S and pass the startup token to compute/slaves
+        - Initialize software defined networking (Calico)
+
+`startworkers` playbook
+* turn off swap
+* Join k8s cluster
+
+`startservices` playbook
+* Setup K8S Dashboard
+* Add `stable` repo to helm
+* Add `jupyterhub` repo to helm
+* Update helm repos
+* Deploy NFS client Provisioner
+* Deploy Jupyterhub
+* Deploy Prometheus
+* Install MPI Operator
+
+
+### Slurm
+* Downloads and builds Slurm from source
+* Install package dependencies
+    - Python3
+    - munge
+    - MariaDB
+    - MariaDB development libraries
+* Build Slurm configuration files
+

+ 27 - 0
site/PREINSTALL.md

@@ -0,0 +1,27 @@
+# Pre-Installation Preparation
+
+## Assumptions
+Omnia assumes that prior to installation:
+* Systems have a base operating system (currently CentOS 7 or 8)
+* Network(s) has been cabled and nodes can reach the internet
+* SSH Keys for `root` have been installed on all nodes to allow for password-less SSH (see the example below)
+* Ansible is installed on either the master node or a separate deployment node
+```
+yum install ansible
+```
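+
+One common way to set up the password-less SSH assumed above, run from the deployment node (hostname is a placeholder):
+```
+ssh-keygen -t rsa -b 4096
+ssh-copy-id root@compute-node-1
+```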
+
+## Example system designs
+Omnia can configure systems which use Ethernet- or Infiniband-based fabric to connect the compute servers.
+
+![Example system configuration with Ethernet fabric](images/example-system-ethernet.png)
+
+![Example system configuration with Infiniband fabric](images/example-system-infiniband.png)
+
+## Network Setup
+Omnia assumes that servers are already connected to the network and have access to the internet.
+### Network Topology
+Possible network configurations include:
+* A flat topology where all nodes are connected to a switch which includes an uplink to the internet. This requires multiple externally-facing IP addresses
+* A hierarchical topology where compute nodes are connected to a common switch, but the master node contains a second network connection which is connected to the internet. All outbound/inbound traffic would be routed through the master node. This requires setting up firewall rules for IP masquerade, see [here](https://www.server-world.info/en/note?os=CentOS_7&p=firewalld&f=2) for an example.
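+
+A minimal firewalld sketch of the IP masquerade rules described above, run on the master node (the `public` zone is an assumption about your setup):
+```
+firewall-cmd --zone=public --add-masquerade --permanent
+firewall-cmd --reload
+```
+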
+### IP and Hostname Assignment
+The recommended setup is to assign IP addresses to individual servers. This can be done manually by logging onto each node, or via DHCP.

File diff suppressed because it is too large
+ 43 - 0
site/README.md


+ 4 - 0
site/_config.yml

@@ -0,0 +1,4 @@
+theme: jekyll-theme-minimal
+title: Omnia
+description: Ansible playbook-based tools for deploying Slurm and Kubernetes clusters for High Performance Computing, Machine Learning, Deep Learning, and High-Performance Data Analytics
+logo: images/omnia-logo.png

Binary
site/images/delltech.jpg


Binary
site/images/example-system-ethernet.png


Binary
site/images/example-system-infiniband.png


Binary
site/images/omnia-branch-structure.png


Binary
site/images/omnia-k8s.png


Binary
site/images/omnia-logo.png


Binary
site/images/omnia-overview.png


Binary
site/images/omnia-slurm.png


Binary
site/images/pisa.png


+ 10 - 0
site/metalLB/README.md

@@ -0,0 +1,10 @@
+# MetalLB 
+
+MetalLB is a load-balancer implementation for bare metal Kubernetes clusters, using standard routing protocols.
+https://metallb.universe.tf/
+
+Omnia installs MetalLB by manifest in the playbook `startservices`. A default configuration is provided for the layer2 protocol, along with an example address pool. Modify metal-config.yaml to suit your network requirements and apply the changes with:
+
+``` 
+kubectl apply -f metal-config.yaml
+```

+ 21 - 0
site/metalLB/metal-config.yaml

@@ -0,0 +1,21 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  namespace: metallb-system
+  name: config
+data:
+  config: |
+    address-pools:
+    - name: default
+      protocol: layer2
+      addresses:
+      - 192.168.2.150/32
+      - 192.168.2.151/32
+      - 192.168.2.152/32
+      - 192.168.2.153/32
+      - 192.168.2.154/32
+      - 192.168.2.155/32
+      - 192.168.2.156/32
+      - 192.168.2.157/32
+      - 192.168.2.158/32
+      - 192.168.2.159/32