@@ -17,38 +17,10 @@
include_vars: ../../slurm_exporter/vars/main.yml
- name: Wait for CoreDNS to restart
- block:
- - name: Wait for CoreDNS to restart
- command: kubectl rollout status deployment/coredns -n kube-system --timeout=4m
- changed_when: false
- tags: init
- rescue:
- - name: Get K8s pods
- command: kubectl get pods --all-namespaces
- register: k8s_pods
- tags: init
- - name: Pull docker images
- command: docker pull {{ item }}
- with_items: "{{ kube_system_docker_images }}"
- when:
- - hostvars['']['docker_username'] and hostvars['']['docker_password']
- - "'ImagePullBackOff' in k8s_pods.stdout"
- register: docker_image_pull_result
- until: docker_image_pull_result is not failed
- retries: 5
- - name: Wait for CoreDNS to restart
- command: kubectl rollout status deployment/coredns -n kube-system
- when: hostvars['']['docker_username'] and hostvars['']['docker_password']
- tags: init
- - name: Fail message
- fail:
- msg: "{{ docker_pull_limit_msg }}"
- when:
- - "'ImagePullBackOff' in k8s_pods.stdout or 'ErrImagePull' in k8s_pods.stdout"
- - not hostvars['']['docker_username'] and not hostvars['']['docker_password']
+ command: kubectl rollout status deployment/coredns -n kube-system --timeout=5m
+ changed_when: false  # status query only; never reported as a change
+ failed_when: false  # best-effort: a rollout timeout or error does not fail the play
+ tags: init
- name: Get K8s pods
command: kubectl get pods --all-namespaces
@@ -139,7 +111,7 @@
- name: Set NFS-Client Provisioner as DEFAULT StorageClass
shell: >
kubectl patch storageclasses.storage.k8s.io nfs-client \
- -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'
+ -p '{ "metadata": { "annotations":{ "storageclass.kubernetes.io/is-default-class":"true" }}}'
changed_when: true
tags: init
@@ -224,35 +196,52 @@
when: "'amdgpu-device-plugin' not in k8s_pods.stdout"
tags: init
+- name: Deploy Volcano Scheduling
+ command: "kubectl apply -f '{{ volcano_scheduling_yaml_url }}'"
+ changed_when: true  # always reported as changed when the task runs
+ when: "'volcano-system' not in k8s_pods.stdout"  # skip if already present in the registered pod listing
+ tags: init
- name: Install Spark Operator
command: "helm repo add spark-operator '{{ spark_operator_repo }}'"
changed_when: true
tags: init
- name: Install Spark Operator Namespace
- command: "helm install my-release spark-operator/spark-operator --namespace spark-operator --create-namespace"
+ command: helm install my-release spark-operator/spark-operator --set image.tag={{ operator_image_tag }} --namespace spark-operator --create-namespace  # image tag taken from operator_image_tag
changed_when: true
when: "'spark-operator' not in k8s_pods.stdout"
tags: init
-- name: Deploy Volcano Scheduling
- command: "kubectl apply -f '{{ volcano_scheduling_yaml_url }}'"
- changed_when: true
- when: "'volcano-system' not in k8s_pods.stdout"
- tags: init
+# Wait for the provisioner/scheduler pods; on failure, diagnose image-pull errors.
+- name: Wait for k8s pod to come to ready state
+  tags: [init, install]
+  block:
+    - name: Wait for k8s pod to come to ready state
+      command: "kubectl wait --for=condition=ready -n {{ item.namespace }} pod -l app={{ item.app }} --timeout={{ item.timeout }}"
+      with_items:
+        - { namespace: "default", app: "nfs-client-provisioner", timeout: "10m" }
+        - { namespace: "volcano-system", app: "volcano-scheduler", timeout: "5m" }
+      changed_when: false
+  rescue:
+    - name: Get K8s pods
+      command: kubectl get pods --all-namespaces
+      changed_when: false
+      register: k8s_pods
-- name: Get K8s pods
-  command: kubectl get pods --all-namespaces
-  changed_when: false
-  register: k8s_pods
-  tags: init
+    - name: Fail message
+      fail:
+        msg: "{{ docker_pull_limit_msg }}"
+      when:
+        - "'ImagePullBackOff' in k8s_pods.stdout or 'ErrImagePull' in k8s_pods.stdout"
+        - not (hostvars['']['docker_username'] and hostvars['']['docker_password'])  # fail if either credential is missing
-- name: Pull K8s services docker images
-  command: docker pull {{ item }}
-  with_items: "{{ k8s_services_docker_images }}"
-  when:
-    - "'ImagePullBackOff' in k8s_pods.stdout"
-    - hostvars['']['docker_username'] and hostvars['']['docker_password']
-  register: docker_image_pull_result
-  until: docker_image_pull_result is not failed
-  retries: 5
+    - name: Pull K8s services docker images
+      command: docker pull {{ item }}
+      with_items: "{{ k8s_docker_images }}"
+      when:
+        - "'ImagePullBackOff' in k8s_pods.stdout or 'ErrImagePull' in k8s_pods.stdout"
+        - hostvars['']['docker_username'] and hostvars['']['docker_password']
+      register: docker_image_pull_result
+      until: docker_image_pull_result is not failed  # retried up to 5 times per image
+      retries: 5