Browse source

Merge pull request #453 from blesson-james/devel

Issue #452: Fixed volcano and spark-operator to a stable version
Lucas A. Wilson 3 years ago
Parent
commit
2359bf8a43

+ 14 - 13
control_plane/roles/control_plane_k8s/tasks/k8s_init.yml

@@ -27,30 +27,31 @@
 - name: Get K8s nodes status
   command: kubectl get nodes
   changed_when: false
-  ignore_errors: True
+  failed_when: false
   register: k8s_nodes
 
 - name: Get K8s pods status
   command: kubectl get pods --all-namespaces
   changed_when: false
-  ignore_errors: True
+  failed_when: false
   register: k8s_pods
 
 - name: Docker login
   command: docker login -u {{ docker_username }} -p {{ docker_password }}
   changed_when: true
   register: docker_login_output
-  ignore_errors: True
+  failed_when: false
   when: docker_username or docker_password
+  no_log: true
 
 - name: Docker login check
   fail:
     msg: "{{ docker_login_fail_msg }}"
   when: docker_login_output is failed
 
-- name: Initialize kubeadm
+- name: Initialize kubeadm (This process may take 5-10min)
   block:
-    - name: Initialize kubeadm
+    - name: Initialize kubeadm (This process may take 5-10min)
       command: "/bin/kubeadm init --pod-network-cidr='{{ appliance_k8s_pod_net_cidr }}' \
         --apiserver-advertise-address='{{ ansible_default_ipv4.address }}'"
       changed_when: true
@@ -60,7 +61,7 @@
       command: "kubeadm reset -f"
       changed_when: true
 
-    - name: Initialize kubeadm
+    - name: Initialize kubeadm (This process may take 5-10min)
       command: "/bin/kubeadm init --pod-network-cidr='{{ appliance_k8s_pod_net_cidr }}' \
           --apiserver-advertise-address='{{ ansible_default_ipv4.address }}'"
       changed_when: true
@@ -69,7 +70,7 @@
     - name: Get K8s pods status
       command: kubectl get pods --all-namespaces
       changed_when: false
-      ignore_errors: True
+      failed_when: false
       register: k8s_pods
   when: "'master' not in k8s_nodes.stdout"
 
@@ -99,20 +100,20 @@
     set -o pipefail && \
       kubeadm token list | cut -d ' ' -f1 | sed -n '2p'
   changed_when: false
-  register: K8S_TOKEN
+  register: k8s_token
 
 - name: CA Hash
   shell: >
     set -o pipefail && \
       openssl x509 -pubkey -in {{ k8s_cert_path }} | openssl rsa -pubin -outform der 2>/dev/null | openssl dgst -sha256 -hex | sed 's/^.* //'
   changed_when: false
-  register: K8S_MANAGER_CA_HASH
+  register: k8s_manager_ca_hash
 
 - name: Add K8S Manager IP, Token, and Hash to dummy host
   add_host:
     name:   "K8S_TOKEN_HOLDER"
-    token:  "{{ K8S_TOKEN.stdout }}"
-    hash:   "{{ K8S_MANAGER_CA_HASH.stdout }}"
+    token:  "{{ k8s_token.stdout }}"
+    hash:   "{{ k8s_manager_ca_hash.stdout }}"
     ip:     "{{ ansible_default_ipv4.address }}"
 
 - name: Create yaml repo for setup
@@ -129,10 +130,10 @@
 - name: Setup Calico SDN network - custom-resources
   command: "kubectl create -f {{ calico_yml_url }}"
   changed_when: true
-  ignore_errors: True
+  failed_when: false
   when: "'calico-system' not in k8s_pods.stdout"
 
 - name: Edge / Workstation Install allows pods to schedule on manager
   command: kubectl taint nodes --all node-role.kubernetes.io/master-
   changed_when: true
-  ignore_errors: True
+  failed_when: false

+ 1 - 1
control_plane/roles/control_plane_k8s/tasks/k8s_services.yml

@@ -16,7 +16,7 @@
 - name: Wait for CoreDNS to restart
   command: kubectl rollout status deployment/coredns -n kube-system
   changed_when: false
-  ignore_errors: True
+  failed_when: false
 
 - name: Get K8s pods
   command: kubectl get pods --all-namespaces

+ 24 - 19
roles/k8s_start_manager/tasks/main.yml

@@ -1,4 +1,4 @@
-#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -31,21 +31,25 @@
 - name: Get K8s nodes status
   command: kubectl get nodes
   changed_when: false
-  ignore_errors: True
+  failed_when: false
   register: k8s_nodes
   tags: init
 
 - name: Get K8s ready compute nodes
-  shell: kubectl get node --selector='!node-role.kubernetes.io/master' | grep -v 'NotReady'
+  shell: >
+    set -o pipefail && \
+    kubectl get node --selector='!node-role.kubernetes.io/master' | grep -v 'NotReady'
   changed_when: false
-  ignore_errors: True
+  failed_when: false
   register: k8s_nodes_ready
   tags: init
 
 - name: Get K8s not ready compute nodes
-  shell: kubectl get node --selector='!node-role.kubernetes.io/master' | grep 'NotReady'
+  shell: >
+    set -o pipefail && \
+    kubectl get node --selector='!node-role.kubernetes.io/master' | grep 'NotReady'
   changed_when: false
-  ignore_errors: True
+  failed_when: false
   register: k8s_nodes_not_ready
   tags: init
 
@@ -53,17 +57,18 @@
   command: docker login -u {{ hostvars['127.0.0.1']['docker_username'] }} -p {{ hostvars['127.0.0.1']['docker_password'] }}
   changed_when: true
   register: docker_login_output
-  ignore_errors: True
+  failed_when: false
   when: hostvars['127.0.0.1']['docker_username'] or hostvars['127.0.0.1']['docker_password']
+  no_log: true
 
 - name: Docker login check
   fail:
     msg: "{{ docker_login_fail_msg }}"
   when: docker_login_output is failed
 
-- name: Initialize kubeadm
+- name: Initialize kubeadm (This process may take 5-10min)
   block:
-    - name: Initialize kubeadm
+    - name: Initialize kubeadm (This process may take 5-10min)
       command: "/bin/kubeadm init --pod-network-cidr='{{ hostvars['127.0.0.1']['k8s_pod_network_cidr'] }}' \
         --apiserver-advertise-address='{{ ansible_default_ipv4.address }}'"
       changed_when: true
@@ -74,7 +79,7 @@
       command: "kubeadm reset -f"
       changed_when: true
 
-    - name: Initialize kubeadm
+    - name: Initialize kubeadm (This process may take 5-10min)
       command: "/bin/kubeadm init --pod-network-cidr='{{ hostvars['127.0.0.1']['k8s_pod_network_cidr'] }}' \
         --apiserver-advertise-address='{{ ansible_default_ipv4.address }}'"
       changed_when: true
@@ -111,7 +116,7 @@
     set -o pipefail && \
       kubeadm token list | cut -d ' ' -f1 | sed -n '2p'
   changed_when: false
-  register: K8S_TOKEN
+  register: k8s_token
   tags: init
 
 - name: CA Hash
@@ -119,14 +124,14 @@
     set -o pipefail && \
       openssl x509 -pubkey -in {{ k8s_cert_path }} | openssl rsa -pubin -outform der 2>/dev/null | openssl dgst -sha256 -hex | sed 's/^.* //'
   changed_when: false
-  register: K8S_MANAGER_CA_HASH
+  register: k8s_manager_ca_hash
   tags: init
 
 - name: Add K8S Manager IP, Token, and Hash to dummy host
   add_host:
     name:   "K8S_TOKEN_HOLDER"
-    token:  "{{ K8S_TOKEN.stdout }}"
-    hash:   "{{ K8S_MANAGER_CA_HASH.stdout }}"
+    token:  "{{ k8s_token.stdout }}"
+    hash:   "{{ k8s_manager_ca_hash.stdout }}"
     ip:     "{{ ansible_default_ipv4.address }}"
     k8s_nodes:  "{{ k8s_nodes.stdout }}"
     k8s_nodes_ready:  "{{ k8s_nodes_ready.stdout }}"
@@ -214,16 +219,16 @@
 - name: Create clusterRoleBinding (K8s dashboard) files
   copy:
     src: create_clusterRoleBinding.yaml
-    dest: "{{ k8s_clusterRoleBinding_file_dest }}"
+    dest: "{{ cluster_role_binding_file_dest }}"
     owner: root
     group: root
-    mode: "{{ k8s_clusterRoleBinding_file_mode }}"
+    mode: "{{ cluster_role_binding_file_mode }}"
   tags: init
 
 - name: Create clusterRoleBinding (K8s dashboard)
-  command: "kubectl create -f '{{ k8s_clusterRoleBinding_file_dest }}'"
+  command: "kubectl create -f '{{ cluster_role_binding_file_dest }}'"
   changed_when: true
-  ignore_errors: True
+  failed_when: false
   tags: init
 
 - name: Dump bearer token for K8s dashboard login
@@ -236,5 +241,5 @@
 - name: Edge / Workstation Install allows pods to scheudle on manager
   command: kubectl taint nodes --all node-role.kubernetes.io/master-
   when: groups['manager'][0] == groups['compute'][0] and groups['compute']|length == 1
-  ignore_errors: True
+  failed_when: false
   tags: init

+ 3 - 3
roles/k8s_start_manager/vars/main.yml

@@ -1,4 +1,4 @@
-#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -37,9 +37,9 @@ k8s_service_account_file_dest: /root/k8s/create_admin_user.yaml
 
 k8s_service_account_file_mode: 0655
 
-k8s_clusterRoleBinding_file_dest: /root/k8s/create_clusterRoleBinding.yaml
+cluster_role_binding_file_dest: /root/k8s/create_clusterRoleBinding.yaml
 
-k8s_clusterRoleBinding_file_mode: 0655
+cluster_role_binding_file_mode: 0655
 
 calico_yml_url: https://docs.projectcalico.org/manifests/calico.yaml
 

+ 42 - 53
roles/k8s_start_services/tasks/main.yml

@@ -17,38 +17,10 @@
   include_vars: ../../slurm_exporter/vars/main.yml
 
 - name: Wait for CoreDNS to restart
-  block:
-    - name: Wait for CoreDNS to restart
-      command: kubectl rollout status deployment/coredns -n kube-system  --timeout=4m
-      changed_when: false
-      tags: init
-  rescue:
-    - name: Get K8s pods
-      command: kubectl get pods --all-namespaces
-      register: k8s_pods
-      tags: init
-
-    - name: Pull docker images
-      command: docker pull {{ item }}
-      with_items: "{{ kube_system_docker_images }}"
-      when:
-        - hostvars['127.0.0.1']['docker_username'] and hostvars['127.0.0.1']['docker_password']
-        - "'ImagePullBackOff' in k8s_pods.stdout"
-      register: docker_image_pull_result
-      until: docker_image_pull_result is not failed
-      retries: 5
-
-    - name: Wait for CoreDNS to restart
-      command: kubectl rollout status deployment/coredns -n kube-system
-      when: hostvars['127.0.0.1']['docker_username'] and hostvars['127.0.0.1']['docker_password']
-      tags: init
-
-    - name: Fail message
-      fail:
-        msg: "{{ docker_pull_limit_msg }}"
-      when:
-        - "'ImagePullBackOff' in k8s_pods.stdout or 'ErrImagePull' in k8s_pods.stdout"
-        - not hostvars['127.0.0.1']['docker_username'] and not hostvars['127.0.0.1']['docker_password']
+  command: kubectl rollout status deployment/coredns -n kube-system  --timeout=5m
+  changed_when: false
+  failed_when: false
+  tags: init
 
 - name: Get K8s pods
   command: kubectl get pods --all-namespaces
@@ -139,7 +111,7 @@
 - name: Set NFS-Client Provisioner as DEFAULT StorageClass
   shell: >
     kubectl patch storageclasses.storage.k8s.io nfs-client \
-    -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'
+    -p '{ "metadata": { "annotations":{ "storageclass.kubernetes.io/is-default-class":"true" }}}'
   changed_when: true
   tags: init
 
@@ -224,35 +196,52 @@
   when: "'amdgpu-device-plugin' not in k8s_pods.stdout"
   tags: init
 
+- name: Deploy Volcano Scheduling
+  command: "kubectl apply -f '{{ volcano_scheduling_yaml_url }}'"
+  changed_when: true
+  when: "'volcano-system' not in k8s_pods.stdout"
+  tags: init
+
 - name: Install Spark Operator
   command: "helm repo add spark-operator '{{ spark_operator_repo }}'"
   changed_when: true
   tags: init
 
 - name: Install Spark Operator Namespace
-  command: "helm install my-release spark-operator/spark-operator --namespace spark-operator --create-namespace"
+  command: helm install my-release spark-operator/spark-operator --set image.tag={{ operator_image_tag }} --namespace spark-operator --create-namespace
   changed_when: true
   when: "'spark-operator' not in k8s_pods.stdout"
   tags: init
 
-- name: Deploy Volcano Scheduling
-  command: "kubectl apply -f '{{ volcano_scheduling_yaml_url }}'"
-  changed_when: true
-  when: "'volcano-system' not in k8s_pods.stdout"
-  tags: init
+- name: Wait for k8s pod to come to ready state
+  block:
+    - name: Wait for k8s pod to come to ready state
+      command: "kubectl wait --for=condition=ready -n {{ item.namespace }} pod -l app={{ item.app }} --timeout={{ item.timeout }}"
+      with_items:
+        - { namespace: "default", app: "nfs-client-provisioner", timeout: "10m" }
+        - { namespace: "volcano-system", app: "volcano-scheduler", timeout: "5m" }
+      changed_when: false
+      tags: install
+  rescue:
+    - name: Get K8s pods
+      command: kubectl get pods --all-namespaces
+      changed_when: false
+      register: k8s_pods
+      tags: init
 
-- name: Get K8s pods
-  command: kubectl get pods --all-namespaces
-  changed_when: false
-  register: k8s_pods
-  tags: init
+    - name: Fail message
+      fail:
+        msg: "{{ docker_pull_limit_msg }}"
+      when:
+        - "'ImagePullBackOff' in k8s_pods.stdout or 'ErrImagePull' in k8s_pods.stdout"
+        - not hostvars['127.0.0.1']['docker_username'] and not hostvars['127.0.0.1']['docker_password']
 
-- name: Pull K8s services docker images
-  command: docker pull {{ item }}
-  with_items: "{{ k8s_services_docker_images }}"
-  when:
-    - "'ImagePullBackOff' in k8s_pods.stdout"
-    - hostvars['127.0.0.1']['docker_username'] and hostvars['127.0.0.1']['docker_password']
-  register: docker_image_pull_result
-  until: docker_image_pull_result is not failed
-  retries: 5
+    - name: Pull K8s services docker images
+      command: docker pull {{ item }}
+      with_items: "{{ k8s_docker_images }}"
+      when:
+        - "'ImagePullBackOff' in k8s_pods.stdout or 'ErrImagePull' in k8s_pods.stdout"
+        - hostvars['127.0.0.1']['docker_username'] and hostvars['127.0.0.1']['docker_password']
+      register: docker_image_pull_result
+      until: docker_image_pull_result is not failed
+      retries: 5

+ 6 - 6
roles/k8s_start_services/vars/main.yml

@@ -1,4 +1,4 @@
-#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -13,15 +13,14 @@
 #  limitations under the License.
 ---
 
-kube_system_docker_images:
+k8s_docker_images:
   - docker.io/calico/kube-controllers:v3.19.1
   - docker.io/calico/cni:v3.19.1
   - docker.io/calico/pod2daemon-flexvol:v3.19.1
   - docker.io/calico/node:v3.19.1
   - xilinxatg/xilinx_k8s_fpga_plugin:2020.11.24
   - nvidia/k8s-device-plugin:v0.7.0
-
-k8s_services_docker_images:
+  - quay.io/external_storage/nfs-client-provisioner:v3.1.0-k8s1.11
   - docker.io/rocm/k8s-device-plugin
   - kubernetesui/dashboard:v2.0.5
   - kubernetesui/metrics-scraper:v1.0.6
@@ -36,7 +35,6 @@ k8s_services_docker_images:
   - volcanosh/vc-controller-manager:latest
   - volcanosh/vc-scheduler:latest
   - volcanosh/vc-webhook-manager:latest
-  - quay.io/external_storage/nfs-client-provisioner:v3.1.0-k8s1.11
 
 docker_pull_limit_msg: "You have reached your docker pull rate limit. Please provide docker credentials in omnia_config.yml and try again"
 
@@ -88,4 +86,6 @@ prometheus_path_on_host: /var/lib/prometheus-2.23.0.linux-amd64/
 
 spark_operator_repo: https://googlecloudplatform.github.io/spark-on-k8s-operator
 
-volcano_scheduling_yaml_url: https://raw.githubusercontent.com/volcano-sh/volcano/master/installer/volcano-development.yaml
+operator_image_tag: v1beta2-1.2.3-3.1.1
+
+volcano_scheduling_yaml_url: https://raw.githubusercontent.com/volcano-sh/volcano/v1.3.0/installer/volcano-development.yaml

+ 1 - 1
roles/k8s_start_workers/tasks/main.yml

@@ -27,7 +27,7 @@
 - name: Reset kubeadm
   command: kubeadm reset -f
   changed_when: true
-  ignore_errors: True
+  failed_when: false
   when:
     - groups['manager'][0] != groups['compute'][0]
     - groups['compute']|length >= 1