Jelajahi Sumber

Issue #454: Supporting multiple configuration files for different ethernet switches

Signed-off-by: sakshiarora13 <sakshi_arora1@dell.com>
Lucas A. Wilson 3 tahun lalu
induk
melakukan
9f61a353e4

+ 193 - 0
control_plane/input_params/ethernet_tor_vars.yml

@@ -0,0 +1,193 @@
+# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+
+# ---Default configurations written for Dell PowerSwitch S3048-ON---
+# ---Change the configurations as per the switch model to avoid failures---
+# ---Use this configuration file for all S3* and S4* switches---
+# ---S3* Switches: S3048-ON---
+# ---S4* Switches: S4048-ON, S4048T-ON, S4112F-ON, S4112T-ON, S4128F-ON,
+# ---------------- S4148F-ON, S4128T-ON, S4148T-ON, S4148FE, S4148U, S4248FBL-ON ---
+
+# Global configuration for switch
+os10_config:
+    - "interface vlan1"
+    - "exit"
+
+# Interface configuration for switch
+os10_interface:
+    ethernet 1/1/1:
+      desc: "Port 1"
+      admin: up
+    ethernet 1/1/2:
+      desc: "Port 2"
+      admin: up
+    ethernet 1/1/3:
+      desc: "Port 3"
+      admin: up
+    ethernet 1/1/4:
+      desc: "Port 4"
+      admin: up
+    ethernet 1/1/5:
+      desc: "Port 5"
+      admin: up
+    ethernet 1/1/6:
+      desc: "Port 6"
+      admin: up
+    ethernet 1/1/7:
+      desc: "Port 7"
+      admin: up
+    ethernet 1/1/8:
+      desc: "Port 8"
+      admin: up
+    ethernet 1/1/9:
+      desc: "Port 9"
+      admin: up
+    ethernet 1/1/10:
+      desc: "Port 10"
+      admin: up
+    ethernet 1/1/11:
+      desc: "Port 11"
+      admin: up
+    ethernet 1/1/12:
+      desc: "Port 12"
+      admin: up
+    ethernet 1/1/13:
+      desc: "Port 13"
+      admin: up
+    ethernet 1/1/14:
+      desc: "Port 14"
+      admin: up
+    ethernet 1/1/15:
+      desc: "Port 15"
+      admin: up
+    ethernet 1/1/16:
+      desc: "Port 16"
+      admin: up
+    ethernet 1/1/17:
+      desc: "Port 17"
+      admin: up
+    ethernet 1/1/18:
+      desc: "Port 18"
+      admin: up
+    ethernet 1/1/19:
+      desc: "Port 19"
+      admin: up
+    ethernet 1/1/20:
+      desc: "Port 20"
+      admin: up
+    ethernet 1/1/21:
+      desc: "Port 21"
+      admin: up
+    ethernet 1/1/22:
+      desc: "Port 22"
+      admin: up
+    ethernet 1/1/23:
+      desc: "Port 23"
+      admin: up
+    ethernet 1/1/24:
+      desc: "Port 24"
+      admin: up
+    ethernet 1/1/25:
+      desc: "Port 25"
+      admin: up
+    ethernet 1/1/26:
+      desc: "Port 26"
+      admin: up
+    ethernet 1/1/27:
+      desc: "Port 27"
+      admin: up
+    ethernet 1/1/28:
+      desc: "Port 28"
+      admin: up
+    ethernet 1/1/29:
+      desc: "Port 29"
+      admin: up
+    ethernet 1/1/30:
+      desc: "Port 30"
+      admin: up
+    ethernet 1/1/31:
+      desc: "Port 31"
+      admin: up
+    ethernet 1/1/32:
+      desc: "Port 32"
+      admin: up
+    ethernet 1/1/33:
+      desc: "Port 33"
+      admin: up
+    ethernet 1/1/34:
+      desc: "Port 34"
+      admin: up
+    ethernet 1/1/35:
+      desc: "Port 35"
+      admin: up
+    ethernet 1/1/36:
+      desc: "Port 36"
+      admin: up
+    ethernet 1/1/37:
+      desc: "Port 37"
+      admin: up
+    ethernet 1/1/38:
+      desc: "Port 38"
+      admin: up
+    ethernet 1/1/39:
+      desc: "Port 39"
+      admin: up
+    ethernet 1/1/40:
+      desc: "Port 40"
+      admin: up
+    ethernet 1/1/41:
+      desc: "Port 41"
+      admin: up
+    ethernet 1/1/42:
+      desc: "Port 42"
+      admin: up
+    ethernet 1/1/43:
+      desc: "Port 43"
+      admin: up
+    ethernet 1/1/44:
+      desc: "Port 44"
+      admin: up
+    ethernet 1/1/45:
+      desc: "Port 45"
+      admin: up
+    ethernet 1/1/46:
+      desc: "Port 46"
+      admin: up
+    ethernet 1/1/47:
+      desc: "Port 47"
+      admin: up
+    ethernet 1/1/48:
+      desc: "Port 48"
+      admin: up
+    ethernet 1/1/49:
+      desc: "Port 49"
+      admin: up
+    ethernet 1/1/50:
+      desc: "Port 50"
+      admin: up
+    ethernet 1/1/51:
+      desc: "Port 51"
+      admin: up
+    ethernet 1/1/52:
+      desc: "Port 52"
+      admin: up
+    vlan 1:
+      admin: up
+
+# save_changes_to_startup is a boolean flag. By default, this option is set to false.
+# When set to true, it will save the switch's running configuration to the startup configuration file
+# after the role applies its configuration. This will allow the configuration to persist after a
+# restart or power failure.
+save_changes_to_startup: false

+ 50 - 31
control_plane/input_params/ethernet_vars.yml

@@ -13,105 +13,124 @@
 #  limitations under the License.
 
 
-# ---Default configurations written for Dell PowerSwitch S4128T-ON---
+# ---Default configurations written for Dell PowerSwitch S5232F-ON---
 # ---Change the configurations as per the switch model to avoid failures---
+# ---Use this configuration file for all switches other than S3* and S4* switches---
+
 
 # Global configuration for switch
 os10_config:
     - "interface vlan1"
     - "exit"
 
+# By default, all ports are set up in 10g-4x breakout mode
+# Possible values of breakout/fanout mode: 10g-4x, 25g-4x, 40g-1x, 50g-2x, 100g-1x
+breakout_value: 10g-4x
+
 # Interface configuration for switch
+# By default, all ports are brought up in admin UP state
 os10_interface:
     ethernet 1/1/1:
-      desc: "Port 1"
       admin: up
+      fanout: "{{ breakout_value }}"
     ethernet 1/1/2:
-      desc: "Port 2"
       admin: up
+      fanout: "{{ breakout_value }}"
     ethernet 1/1/3:
-      desc: "Port 3"
       admin: up
+      fanout: "{{ breakout_value }}"
     ethernet 1/1/4:
-      desc: "Port 4"
       admin: up
+      fanout: "{{ breakout_value }}"
     ethernet 1/1/5:
-      desc: "Port 5"
       admin: up
+      fanout: "{{ breakout_value }}"
     ethernet 1/1/6:
-      desc: "Port 6"
       admin: up
+      fanout: "{{ breakout_value }}"
     ethernet 1/1/7:
-      desc: "Port 7"
       admin: up
+      fanout: "{{ breakout_value }}"
     ethernet 1/1/8:
-      desc: "Port 8"
       admin: up
+      fanout: "{{ breakout_value }}"
     ethernet 1/1/9:
-      desc: "Port 9"
       admin: up
+      fanout: "{{ breakout_value }}"
     ethernet 1/1/10:
-      desc: "Port 10"
       admin: up
+      fanout: "{{ breakout_value }}"
     ethernet 1/1/11:
-      desc: "Port 11"
       admin: up
+      fanout: "{{ breakout_value }}"
     ethernet 1/1/12:
-      desc: "Port 12"
       admin: up
+      fanout: "{{ breakout_value }}"
     ethernet 1/1/13:
-      desc: "Port 13"
       admin: up
+      fanout: "{{ breakout_value }}"
     ethernet 1/1/14:
-      desc: "Port 14"
       admin: up
+      fanout: "{{ breakout_value }}"
     ethernet 1/1/15:
-      desc: "Port 15"
       admin: up
+      fanout: "{{ breakout_value }}"
     ethernet 1/1/16:
-      desc: "Port 16"
       admin: up
+      fanout: "{{ breakout_value }}"
     ethernet 1/1/17:
-      desc: "Port 17"
       admin: up
+      fanout: "{{ breakout_value }}"
     ethernet 1/1/18:
-      desc: "Port 18"
       admin: up
+      fanout: "{{ breakout_value }}"
     ethernet 1/1/19:
-      desc: "Port 19"
       admin: up
+      fanout: "{{ breakout_value }}"
     ethernet 1/1/20:
-      desc: "Port 20"
       admin: up
+      fanout: "{{ breakout_value }}"
     ethernet 1/1/21:
-      desc: "Port 21"
       admin: up
+      fanout: "{{ breakout_value }}"
     ethernet 1/1/22:
-      desc: "Port 22"
       admin: up
+      fanout: "{{ breakout_value }}"
     ethernet 1/1/23:
-      desc: "Port 23"
       admin: up
+      fanout: "{{ breakout_value }}"
     ethernet 1/1/24:
-      desc: "Port 24"
       admin: up
+      fanout: "{{ breakout_value }}"
     ethernet 1/1/25:
-      desc: "Port 25"
       admin: up
+      fanout: "{{ breakout_value }}"
     ethernet 1/1/26:
-      desc: "Port 26"
       admin: up
+      fanout: "{{ breakout_value }}"
     ethernet 1/1/27:
-      desc: "Port 27"
       admin: up
+      fanout: "{{ breakout_value }}"
     ethernet 1/1/28:
-      desc: "Port 28"
       admin: up
+      fanout: "{{ breakout_value }}"
     ethernet 1/1/29:
-      desc: "Port 29"
       admin: up
+      fanout: "{{ breakout_value }}"
     ethernet 1/1/30:
-      desc: "Port 30"
+      admin: up
+      fanout: "{{ breakout_value }}"
+    ethernet 1/1/31:
+      admin: up
+      fanout: "{{ breakout_value }}"
+    ethernet 1/1/32:
+      desc: "Port 32"
+      admin: up
+    ethernet 1/1/33:
+      desc: "Port 33"
+      admin: up
+    ethernet 1/1/34:
+      desc: "Port 34"
       admin: up
     vlan 1:
       admin: up

+ 14 - 13
control_plane/roles/control_plane_k8s/tasks/k8s_init.yml

@@ -27,30 +27,31 @@
 - name: Get K8s nodes status
   command: kubectl get nodes
   changed_when: false
-  ignore_errors: True
+  failed_when: false
   register: k8s_nodes
 
 - name: Get K8s pods status
   command: kubectl get pods --all-namespaces
   changed_when: false
-  ignore_errors: True
+  failed_when: false
   register: k8s_pods
 
 - name: Docker login
   command: docker login -u {{ docker_username }} -p {{ docker_password }}
   changed_when: true
   register: docker_login_output
-  ignore_errors: True
+  failed_when: false
   when: docker_username or docker_password
+  no_log: true
 
 - name: Docker login check
   fail:
     msg: "{{ docker_login_fail_msg }}"
   when: docker_login_output is failed
 
-- name: Initialize kubeadm
+- name: Initialize kubeadm (This process may take 5-10min)
   block:
-    - name: Initialize kubeadm
+    - name: Initialize kubeadm (This process may take 5-10min)
       command: "/bin/kubeadm init --pod-network-cidr='{{ appliance_k8s_pod_net_cidr }}' \
         --apiserver-advertise-address='{{ ansible_default_ipv4.address }}'"
       changed_when: true
@@ -60,7 +61,7 @@
       command: "kubeadm reset -f"
       changed_when: true
 
-    - name: Initialize kubeadm
+    - name: Initialize kubeadm (This process may take 5-10min)
       command: "/bin/kubeadm init --pod-network-cidr='{{ appliance_k8s_pod_net_cidr }}' \
           --apiserver-advertise-address='{{ ansible_default_ipv4.address }}'"
       changed_when: true
@@ -69,7 +70,7 @@
     - name: Get K8s pods status
       command: kubectl get pods --all-namespaces
       changed_when: false
-      ignore_errors: True
+      failed_when: false
       register: k8s_pods
   when: "'master' not in k8s_nodes.stdout"
 
@@ -99,20 +100,20 @@
     set -o pipefail && \
       kubeadm token list | cut -d ' ' -f1 | sed -n '2p'
   changed_when: false
-  register: K8S_TOKEN
+  register: k8s_token
 
 - name: CA Hash
   shell: >
     set -o pipefail && \
       openssl x509 -pubkey -in {{ k8s_cert_path }} | openssl rsa -pubin -outform der 2>/dev/null | openssl dgst -sha256 -hex | sed 's/^.* //'
   changed_when: false
-  register: K8S_MANAGER_CA_HASH
+  register: k8s_manager_ca_hash
 
 - name: Add K8S Manager IP, Token, and Hash to dummy host
   add_host:
     name:   "K8S_TOKEN_HOLDER"
-    token:  "{{ K8S_TOKEN.stdout }}"
-    hash:   "{{ K8S_MANAGER_CA_HASH.stdout }}"
+    token:  "{{ k8s_token.stdout }}"
+    hash:   "{{ k8s_manager_ca_hash.stdout }}"
     ip:     "{{ ansible_default_ipv4.address }}"
 
 - name: Create yaml repo for setup
@@ -129,10 +130,10 @@
 - name: Setup Calico SDN network - custom-resources
   command: "kubectl create -f {{ calico_yml_url }}"
   changed_when: true
-  ignore_errors: True
+  failed_when: false
   when: "'calico-system' not in k8s_pods.stdout"
 
 - name: Edge / Workstation Install allows pods to schedule on manager
   command: kubectl taint nodes --all node-role.kubernetes.io/master-
   changed_when: true
-  ignore_errors: True
+  failed_when: false

+ 1 - 1
control_plane/roles/control_plane_k8s/tasks/k8s_services.yml

@@ -16,7 +16,7 @@
 - name: Wait for CoreDNS to restart
   command: kubectl rollout status deployment/coredns -n kube-system
   changed_when: false
-  ignore_errors: True
+  failed_when: false
 
 - name: Get K8s pods
   command: kubectl get pods --all-namespaces

+ 47 - 10
control_plane/roles/network_ethernet/tasks/pre_requisites.yml

@@ -13,6 +13,21 @@
 # limitations under the License.
 ---
 
+- name: Install paramiko
+  command: pip3 install paramiko -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com
+  delegate_to: localhost
+  changed_when: false
+
+- name: Check if ethernet_tor_vars.yml exists
+  stat:
+    path: "{{ tor_config_file }}"
+  register: stat_result
+
+- name: Fail if config file doesn't exist
+  fail:
+    msg: "{{ fail_msg_tor_config_file }}"
+  when: not stat_result.stat.exists
+
 - name: Check if ethernet_vars.yml exists
   stat:
     path: "{{ config_file }}"
@@ -23,15 +38,37 @@
     msg: "{{ fail_msg_config_file }}"
   when: not stat_result.stat.exists
 
-- name: Include switch config variable file
-  include_vars: "{{ config_file }}"
+- name: Check switch model name
+  dellos10_command:
+    commands: 'show system | grep "Current Type"'
+  register: model_type
 
-- name: Assert save_changes_to_startup variable
-  assert:
-    that: "save_changes_to_startup == true or save_changes_to_startup == false"
-    success_msg: "{{ success_msg_save_config }}"
-    fail_msg: "{{ fail_msg_save_config }}"
+- name: Save switch model name
+  set_fact:
+    model_type: "{{ model_type.stdout[0].split(' ')[-1] }}"
 
-- name: Install paramiko
-  command: pip3 install paramiko -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com
-  changed_when: false
+- name: Pre-requisite tasks for S3* and S4* switches
+  block:
+    - name: Include ethernet_tor_vars.yml config variable file
+      include_vars: "{{ tor_config_file }}"
+
+    - name: Assert save_changes_to_startup variable
+      assert:
+        that: "save_changes_to_startup == true or save_changes_to_startup == false"
+        success_msg: "{{ success_msg_tor_save_config }}"
+        fail_msg: "{{ fail_msg_tor_save_config }}"
+
+  when: "'S3' in model_type or 'S4' in model_type"
+
+- name: Pre-requisite tasks for other ethernet switches
+  block:
+    - name: Include ethernet_vars.yml config variable file
+      include_vars: "{{ config_file }}"
+
+    - name: Assert save_changes_to_startup variable
+      assert:
+        that: "save_changes_to_startup == true or save_changes_to_startup == false"
+        success_msg: "{{ success_msg_save_config }}"
+        fail_msg: "{{ fail_msg_save_config }}"
+
+  when: "'S3' not in model_type and 'S4' not in model_type"

+ 4 - 0
control_plane/roles/network_ethernet/vars/main.yml

@@ -16,7 +16,11 @@
 
 # Usage: pre_requisites.yml
 base_vars_file: "{{ role_path }}/../../input_params/base_vars.yml"
+tor_config_file: "{{ role_path }}/../../input_params/ethernet_tor_vars.yml"
 config_file: "{{ role_path }}/../../input_params/ethernet_vars.yml"
+fail_msg_tor_config_file: TOR Ethernet config file doesn't exist.
 fail_msg_config_file: Ethernet config file doesn't exist.
+success_msg_tor_save_config: TOR Ethernet config file validated.
+fail_msg_tor_save_config: save_changes_to_startup variable can only be set to true or false in ethernet_tor_vars.yml
 success_msg_save_config: Ethernet config file validated.
 fail_msg_save_config: save_changes_to_startup variable can only be set to true or false.

+ 1 - 1
control_plane/roles/network_ib/tasks/authenticate.yml

@@ -45,7 +45,7 @@
   rescue:
     - name: Filtered response creation
       set_fact:
-        filtered_dict: "{{filtered_dict |combine({item.key: item.value})}}"
+        filtered_dict: "{{ filtered_dict |combine({ item.key: item.value }) }}"
       when: item.key not in 'invocation'
       with_dict: "{{ login }}"
       no_log: true

+ 1 - 1
control_plane/roles/network_ib/tasks/interface_config.yml

@@ -24,7 +24,7 @@
       {
       "commands":
        [
-         "interface {{ item.key}}",
+         "interface {{ item.key }}",
          "description {{ item.value.description | default('<none>') }}",
          "{{ item.value.config | default([]) |join(', ') }}"
        ]

+ 24 - 19
roles/k8s_start_manager/tasks/main.yml

@@ -1,4 +1,4 @@
-#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -31,21 +31,25 @@
 - name: Get K8s nodes status
   command: kubectl get nodes
   changed_when: false
-  ignore_errors: True
+  failed_when: false
   register: k8s_nodes
   tags: init
 
 - name: Get K8s ready compute nodes
-  shell: kubectl get node --selector='!node-role.kubernetes.io/master' | grep -v 'NotReady'
+  shell: >
+    set -o pipefail && \
+    kubectl get node --selector='!node-role.kubernetes.io/master' | grep -v 'NotReady'
   changed_when: false
-  ignore_errors: True
+  failed_when: false
   register: k8s_nodes_ready
   tags: init
 
 - name: Get K8s not ready compute nodes
-  shell: kubectl get node --selector='!node-role.kubernetes.io/master' | grep 'NotReady'
+  shell: >
+    set -o pipefail && \
+    kubectl get node --selector='!node-role.kubernetes.io/master' | grep 'NotReady'
   changed_when: false
-  ignore_errors: True
+  failed_when: false
   register: k8s_nodes_not_ready
   tags: init
 
@@ -53,17 +57,18 @@
   command: docker login -u {{ hostvars['127.0.0.1']['docker_username'] }} -p {{ hostvars['127.0.0.1']['docker_password'] }}
   changed_when: true
   register: docker_login_output
-  ignore_errors: True
+  failed_when: false
   when: hostvars['127.0.0.1']['docker_username'] or hostvars['127.0.0.1']['docker_password']
+  no_log: true
 
 - name: Docker login check
   fail:
     msg: "{{ docker_login_fail_msg }}"
   when: docker_login_output is failed
 
-- name: Initialize kubeadm
+- name: Initialize kubeadm (This process may take 5-10min)
   block:
-    - name: Initialize kubeadm
+    - name: Initialize kubeadm (This process may take 5-10min)
       command: "/bin/kubeadm init --pod-network-cidr='{{ hostvars['127.0.0.1']['k8s_pod_network_cidr'] }}' \
         --apiserver-advertise-address='{{ ansible_default_ipv4.address }}'"
       changed_when: true
@@ -74,7 +79,7 @@
       command: "kubeadm reset -f"
       changed_when: true
 
-    - name: Initialize kubeadm
+    - name: Initialize kubeadm (This process may take 5-10min)
       command: "/bin/kubeadm init --pod-network-cidr='{{ hostvars['127.0.0.1']['k8s_pod_network_cidr'] }}' \
         --apiserver-advertise-address='{{ ansible_default_ipv4.address }}'"
       changed_when: true
@@ -111,7 +116,7 @@
     set -o pipefail && \
       kubeadm token list | cut -d ' ' -f1 | sed -n '2p'
   changed_when: false
-  register: K8S_TOKEN
+  register: k8s_token
   tags: init
 
 - name: CA Hash
@@ -119,14 +124,14 @@
     set -o pipefail && \
       openssl x509 -pubkey -in {{ k8s_cert_path }} | openssl rsa -pubin -outform der 2>/dev/null | openssl dgst -sha256 -hex | sed 's/^.* //'
   changed_when: false
-  register: K8S_MANAGER_CA_HASH
+  register: k8s_manager_ca_hash
   tags: init
 
 - name: Add K8S Manager IP, Token, and Hash to dummy host
   add_host:
     name:   "K8S_TOKEN_HOLDER"
-    token:  "{{ K8S_TOKEN.stdout }}"
-    hash:   "{{ K8S_MANAGER_CA_HASH.stdout }}"
+    token:  "{{ k8s_token.stdout }}"
+    hash:   "{{ k8s_manager_ca_hash.stdout }}"
     ip:     "{{ ansible_default_ipv4.address }}"
     k8s_nodes:  "{{ k8s_nodes.stdout }}"
     k8s_nodes_ready:  "{{ k8s_nodes_ready.stdout }}"
@@ -214,16 +219,16 @@
 - name: Create clusterRoleBinding (K8s dashboard) files
   copy:
     src: create_clusterRoleBinding.yaml
-    dest: "{{ k8s_clusterRoleBinding_file_dest }}"
+    dest: "{{ cluster_role_binding_file_dest }}"
     owner: root
     group: root
-    mode: "{{ k8s_clusterRoleBinding_file_mode }}"
+    mode: "{{ cluster_role_binding_file_mode }}"
   tags: init
 
 - name: Create clusterRoleBinding (K8s dashboard)
-  command: "kubectl create -f '{{ k8s_clusterRoleBinding_file_dest }}'"
+  command: "kubectl create -f '{{ cluster_role_binding_file_dest }}'"
   changed_when: true
-  ignore_errors: True
+  failed_when: false
   tags: init
 
 - name: Dump bearer token for K8s dashboard login
@@ -236,5 +241,5 @@
 - name: Edge / Workstation Install allows pods to scheudle on manager
   command: kubectl taint nodes --all node-role.kubernetes.io/master-
   when: groups['manager'][0] == groups['compute'][0] and groups['compute']|length == 1
-  ignore_errors: True
+  failed_when: false
   tags: init

+ 3 - 3
roles/k8s_start_manager/vars/main.yml

@@ -1,4 +1,4 @@
-#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -37,9 +37,9 @@ k8s_service_account_file_dest: /root/k8s/create_admin_user.yaml
 
 k8s_service_account_file_mode: 0655
 
-k8s_clusterRoleBinding_file_dest: /root/k8s/create_clusterRoleBinding.yaml
+cluster_role_binding_file_dest: /root/k8s/create_clusterRoleBinding.yaml
 
-k8s_clusterRoleBinding_file_mode: 0655
+cluster_role_binding_file_mode: 0655
 
 calico_yml_url: https://docs.projectcalico.org/manifests/calico.yaml
 

+ 42 - 53
roles/k8s_start_services/tasks/main.yml

@@ -17,38 +17,10 @@
   include_vars: ../../slurm_exporter/vars/main.yml
 
 - name: Wait for CoreDNS to restart
-  block:
-    - name: Wait for CoreDNS to restart
-      command: kubectl rollout status deployment/coredns -n kube-system  --timeout=4m
-      changed_when: false
-      tags: init
-  rescue:
-    - name: Get K8s pods
-      command: kubectl get pods --all-namespaces
-      register: k8s_pods
-      tags: init
-
-    - name: Pull docker images
-      command: docker pull {{ item }}
-      with_items: "{{ kube_system_docker_images }}"
-      when:
-        - hostvars['127.0.0.1']['docker_username'] and hostvars['127.0.0.1']['docker_password']
-        - "'ImagePullBackOff' in k8s_pods.stdout"
-      register: docker_image_pull_result
-      until: docker_image_pull_result is not failed
-      retries: 5
-
-    - name: Wait for CoreDNS to restart
-      command: kubectl rollout status deployment/coredns -n kube-system
-      when: hostvars['127.0.0.1']['docker_username'] and hostvars['127.0.0.1']['docker_password']
-      tags: init
-
-    - name: Fail message
-      fail:
-        msg: "{{ docker_pull_limit_msg }}"
-      when:
-        - "'ImagePullBackOff' in k8s_pods.stdout or 'ErrImagePull' in k8s_pods.stdout"
-        - not hostvars['127.0.0.1']['docker_username'] and not hostvars['127.0.0.1']['docker_password']
+  command: kubectl rollout status deployment/coredns -n kube-system  --timeout=5m
+  changed_when: false
+  failed_when: false
+  tags: init
 
 - name: Get K8s pods
   command: kubectl get pods --all-namespaces
@@ -139,7 +111,7 @@
 - name: Set NFS-Client Provisioner as DEFAULT StorageClass
   shell: >
     kubectl patch storageclasses.storage.k8s.io nfs-client \
-    -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'
+    -p '{ "metadata": { "annotations":{ "storageclass.kubernetes.io/is-default-class":"true" }}}'
   changed_when: true
   tags: init
 
@@ -224,35 +196,52 @@
   when: "'amdgpu-device-plugin' not in k8s_pods.stdout"
   tags: init
 
+- name: Deploy Volcano Scheduling
+  command: "kubectl apply -f '{{ volcano_scheduling_yaml_url }}'"
+  changed_when: true
+  when: "'volcano-system' not in k8s_pods.stdout"
+  tags: init
+
 - name: Install Spark Operator
   command: "helm repo add spark-operator '{{ spark_operator_repo }}'"
   changed_when: true
   tags: init
 
 - name: Install Spark Operator Namespace
-  command: "helm install my-release spark-operator/spark-operator --namespace spark-operator --create-namespace"
+  command: helm install my-release spark-operator/spark-operator --set image.tag={{ operator_image_tag }} --namespace spark-operator --create-namespace
   changed_when: true
   when: "'spark-operator' not in k8s_pods.stdout"
   tags: init
 
-- name: Deploy Volcano Scheduling
-  command: "kubectl apply -f '{{ volcano_scheduling_yaml_url }}'"
-  changed_when: true
-  when: "'volcano-system' not in k8s_pods.stdout"
-  tags: init
+- name: Wait for k8s pod to come to ready state
+  block:
+    - name: Wait for k8s pod to come to ready state
+      command: "kubectl wait --for=condition=ready -n {{ item.namespace }} pod -l app={{ item.app }} --timeout={{ item.timeout }}"
+      with_items:
+        - { namespace: "default", app: "nfs-client-provisioner", timeout: "10m" }
+        - { namespace: "volcano-system", app: "volcano-scheduler", timeout: "5m" }
+      changed_when: false
+      tags: install
+  rescue:
+    - name: Get K8s pods
+      command: kubectl get pods --all-namespaces
+      changed_when: false
+      register: k8s_pods
+      tags: init
 
-- name: Get K8s pods
-  command: kubectl get pods --all-namespaces
-  changed_when: false
-  register: k8s_pods
-  tags: init
+    - name: Fail message
+      fail:
+        msg: "{{ docker_pull_limit_msg }}"
+      when:
+        - "'ImagePullBackOff' in k8s_pods.stdout or 'ErrImagePull' in k8s_pods.stdout"
+        - not hostvars['127.0.0.1']['docker_username'] and not hostvars['127.0.0.1']['docker_password']
 
-- name: Pull K8s services docker images
-  command: docker pull {{ item }}
-  with_items: "{{ k8s_services_docker_images }}"
-  when:
-    - "'ImagePullBackOff' in k8s_pods.stdout"
-    - hostvars['127.0.0.1']['docker_username'] and hostvars['127.0.0.1']['docker_password']
-  register: docker_image_pull_result
-  until: docker_image_pull_result is not failed
-  retries: 5
+    - name: Pull K8s services docker images
+      command: docker pull {{ item }}
+      with_items: "{{ k8s_docker_images }}"
+      when:
+        - "'ImagePullBackOff' in k8s_pods.stdout or 'ErrImagePull' in k8s_pods.stdout"
+        - hostvars['127.0.0.1']['docker_username'] and hostvars['127.0.0.1']['docker_password']
+      register: docker_image_pull_result
+      until: docker_image_pull_result is not failed
+      retries: 5

+ 6 - 6
roles/k8s_start_services/vars/main.yml

@@ -1,4 +1,4 @@
-#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -13,15 +13,14 @@
 #  limitations under the License.
 ---
 
-kube_system_docker_images:
+k8s_docker_images:
   - docker.io/calico/kube-controllers:v3.19.1
   - docker.io/calico/cni:v3.19.1
   - docker.io/calico/pod2daemon-flexvol:v3.19.1
   - docker.io/calico/node:v3.19.1
   - xilinxatg/xilinx_k8s_fpga_plugin:2020.11.24
   - nvidia/k8s-device-plugin:v0.7.0
-
-k8s_services_docker_images:
+  - quay.io/external_storage/nfs-client-provisioner:v3.1.0-k8s1.11
   - docker.io/rocm/k8s-device-plugin
   - kubernetesui/dashboard:v2.0.5
   - kubernetesui/metrics-scraper:v1.0.6
@@ -36,7 +35,6 @@ k8s_services_docker_images:
   - volcanosh/vc-controller-manager:latest
   - volcanosh/vc-scheduler:latest
   - volcanosh/vc-webhook-manager:latest
-  - quay.io/external_storage/nfs-client-provisioner:v3.1.0-k8s1.11
 
 docker_pull_limit_msg: "You have reached your docker pull rate limit. Please provide docker credentials in omnia_config.yml and try again"
 
@@ -88,4 +86,6 @@ prometheus_path_on_host: /var/lib/prometheus-2.23.0.linux-amd64/
 
 spark_operator_repo: https://googlecloudplatform.github.io/spark-on-k8s-operator
 
-volcano_scheduling_yaml_url: https://raw.githubusercontent.com/volcano-sh/volcano/master/installer/volcano-development.yaml
+operator_image_tag: v1beta2-1.2.3-3.1.1
+
+volcano_scheduling_yaml_url: https://raw.githubusercontent.com/volcano-sh/volcano/v1.3.0/installer/volcano-development.yaml

+ 1 - 1
roles/k8s_start_workers/tasks/main.yml

@@ -27,7 +27,7 @@
 - name: Reset kubeadm
   command: kubeadm reset -f
   changed_when: true
-  ignore_errors: True
+  failed_when: false
   when:
     - groups['manager'][0] != groups['compute'][0]
     - groups['compute']|length >= 1