|
@@ -1,4 +1,4 @@
|
|
|
-# Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
|
|
|
+# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
|
|
|
#
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
# you may not use this file except in compliance with the License.
|
|
@@ -13,235 +13,11 @@
|
|
|
# limitations under the License.
|
|
|
---
|
|
|
|
|
|
-- name: Include common variables
|
|
|
- include_vars: ../../slurm_exporter/vars/main.yml
|
|
|
+- name: Deploy K8s services
|
|
|
+ include_tasks: deploy_k8s_services.yml
|
|
|
+ when: "'manager' in group_names"
|
|
|
+ tags: install
|
|
|
|
|
|
-- name: Wait for CoreDNS to restart
|
|
|
- command: kubectl rollout status deployment/coredns -n kube-system --timeout=5m
|
|
|
- changed_when: false
|
|
|
- failed_when: false
|
|
|
- tags: init
|
|
|
-
|
|
|
-- name: Get K8s pods
|
|
|
- command: kubectl get pods --all-namespaces
|
|
|
- changed_when: false
|
|
|
- register: k8s_pods
|
|
|
- tags: init
|
|
|
-
|
|
|
-- name: Deploy MetalLB
|
|
|
- command: "kubectl apply -f '{{ metallb_yaml_url }}'"
|
|
|
- changed_when: true
|
|
|
- when: "'metallb' not in k8s_pods.stdout"
|
|
|
- tags: init
|
|
|
-
|
|
|
-- name: Create MetalLB Setup Config Files
|
|
|
- copy:
|
|
|
- src: metal-config.yaml
|
|
|
- dest: "{{ metallb_config_file_dest }}"
|
|
|
- owner: root
|
|
|
- group: root
|
|
|
- mode: "{{ metallb_config_file_mode }}"
|
|
|
- tags: init
|
|
|
-
|
|
|
-- name: Create MetalLB Setup Deployment Files
|
|
|
- copy:
|
|
|
- src: metallb.yaml
|
|
|
- dest: "{{ metallb_deployment_file_dest }}"
|
|
|
- owner: root
|
|
|
- group: root
|
|
|
- mode: "{{ metallb_deployment_file_mode }}"
|
|
|
- tags: init
|
|
|
-
|
|
|
-- name: Deploy MetalLB
|
|
|
- command: "kubectl apply -f '{{ metallb_deployment_file_dest }}'"
|
|
|
- changed_when: true
|
|
|
- when: "'metallb' not in k8s_pods.stdout"
|
|
|
- tags: init
|
|
|
-
|
|
|
-- name: Create default setup for MetalLB
|
|
|
- command: "kubectl apply -f '{{ metallb_config_file_dest }}'"
|
|
|
- changed_when: true
|
|
|
- when: "'metallb' not in k8s_pods.stdout"
|
|
|
- tags: init
|
|
|
-
|
|
|
-- name: Start k8s dashboard
|
|
|
- command: "kubectl create -f '{{ k8s_dashboard_yaml_url }}'"
|
|
|
- changed_when: true
|
|
|
- when: "'kubernetes-dashboard' not in k8s_pods.stdout"
|
|
|
- tags: init
|
|
|
-
|
|
|
-- name: Copy k8s_dashboard_admin.yml file
|
|
|
- copy:
|
|
|
- src: k8s_dashboard_admin.yaml
|
|
|
- dest: "{{ k8s_dashboard_admin_file_dest }}"
|
|
|
- owner: root
|
|
|
- group: root
|
|
|
- mode: "{{ k8s_dashboard_admin_file_mode }}"
|
|
|
-
|
|
|
-- name: Create admin user for K8s dashboard
|
|
|
- command: "kubectl apply -f {{ k8s_dashboard_admin_file_dest }}"
|
|
|
- changed_when: true
|
|
|
-
|
|
|
-- name: Helm - add stable repo
|
|
|
- command: "helm repo add stable '{{ helm_stable_repo_url }}'"
|
|
|
- changed_when: true
|
|
|
- tags: init
|
|
|
-
|
|
|
-- name: Helm - add Nvidia k8s-device-plugin (nvdp) repo
|
|
|
- command: "helm repo add nvdp '{{ nvidia_k8s_device_plugin_repo_url }}'"
|
|
|
- changed_when: true
|
|
|
- tags: init
|
|
|
-
|
|
|
-- name: Helm - add Nvidia GPU discovery (nvgfd) repo
|
|
|
- command: "helm repo add nvgfd '{{ nvidia_gpu_discovery_repo_url }}'"
|
|
|
- changed_when: true
|
|
|
- tags: init
|
|
|
-
|
|
|
-- name: Helm - update repo
|
|
|
- command: helm repo update
|
|
|
- changed_when: true
|
|
|
- tags: init
|
|
|
-
|
|
|
-- name: Start NFS Client Provisioner
|
|
|
- command: "helm install stable/nfs-client-provisioner --set nfs.server='{{ nfs_server }}' --set nfs.path='{{ nfs_path }}' --generate-name"
|
|
|
- changed_when: true
|
|
|
- when: "'nfs-client-provisioner' not in k8s_pods.stdout"
|
|
|
- tags: init
|
|
|
-
|
|
|
-- name: Set NFS-Client Provisioner as DEFAULT StorageClass
|
|
|
- shell: >
|
|
|
- kubectl patch storageclasses.storage.k8s.io nfs-client \
|
|
|
- -p '{ "metadata": { "annotations":{ "storageclass.kubernetes.io/is-default-class":"true" }}}'
|
|
|
- changed_when: true
|
|
|
- tags: init
|
|
|
-
|
|
|
-- name: Check if prometheus is installed on the host
|
|
|
- stat:
|
|
|
- path: "{{ prometheus_path_on_host }}"
|
|
|
- register: prometheus_status
|
|
|
- changed_when: False
|
|
|
- ignore_errors: yes
|
|
|
- tags: init
|
|
|
-
|
|
|
-- name: Delete prometheus installed on host if it exists
|
|
|
- file:
|
|
|
- path: "{{ prometheus_path_on_host }}"
|
|
|
- state: absent
|
|
|
- when: prometheus_status.stat.exists
|
|
|
- tags: init
|
|
|
-
|
|
|
-- name: Copy the slurm exporter config file
|
|
|
- copy:
|
|
|
- src: "{{ slurm_exporter_config_file }}"
|
|
|
- dest: "{{ slurm_exporter_config_file_path }}"
|
|
|
- owner: root
|
|
|
- group: root
|
|
|
- mode: "{{ slurm_exporter_file_mode }}"
|
|
|
- tags: init
|
|
|
-
|
|
|
-- name: Fetch the public IP of the host
|
|
|
- shell: >
|
|
|
- set -o pipefail && \
|
|
|
- ip route get 8.8.8.8 | awk '{print $7}'
|
|
|
- register: public_ip
|
|
|
- changed_when: False
|
|
|
- tags: init
|
|
|
-
|
|
|
-- name: Add the host IP to config file
|
|
|
- replace:
|
|
|
- path: "{{ slurm_exporter_config_file_path }}{{ slurm_exporter_config_file }}"
|
|
|
- regexp: "localhost:8080"
|
|
|
- replace: "{{ public_ip.stdout }}:{{ slurm_exporter_port }}"
|
|
|
- tags: init
|
|
|
-
|
|
|
-- name: Prometheus deployment
|
|
|
- command: >
|
|
|
- helm install stable/prometheus \
|
|
|
- --set-file extraScrapeConfigs="{{ slurm_exporter_config_file_path }}{{ slurm_exporter_config_file }}" \
|
|
|
- --set alertmanager.persistentVolume.storageClass=nfs-client,server.persistentVolume.storageClass=nfs-client,server.service.type=LoadBalancer \
|
|
|
- --generate-name
|
|
|
- changed_when: true
|
|
|
- when: "'prometheus' not in k8s_pods.stdout"
|
|
|
- tags: init
|
|
|
-
|
|
|
-- name: Install MPI Operator
|
|
|
- command: "kubectl create -f '{{ mpi_operator_yaml_url }}'"
|
|
|
- changed_when: true
|
|
|
- when: "'mpi-operator' not in k8s_pods.stdout"
|
|
|
- tags: init
|
|
|
-
|
|
|
-- name: Install nvidia-device-plugin
|
|
|
- command: "helm install --version='{{ nvidia_device_plugin_version }}' --generate-name --set migStrategy='{{ mig_strategy }}' nvdp/nvidia-device-plugin"
|
|
|
- changed_when: true
|
|
|
- when: "'nvidia-device-plugin' not in k8s_pods.stdout"
|
|
|
- tags: init
|
|
|
-
|
|
|
-- name: Install GPU Feature Discovery
|
|
|
- command: "helm install --version='{{ gpu_feature_discovery_version }}' --generate-name --set migStrategy='{{ mig_strategy }}' nvgfd/gpu-feature-discovery"
|
|
|
- changed_when: true
|
|
|
- when: "'node-feature-discovery' not in k8s_pods.stdout"
|
|
|
- tags: init
|
|
|
-
|
|
|
-- name: Deploy Xilinx Device plugin
|
|
|
- command: "kubectl create -f '{{ fpga_device_plugin_yaml_url }}'"
|
|
|
- changed_when: true
|
|
|
- register: fpga_enable
|
|
|
- when: "'fpga-device-plugin' not in k8s_pods.stdout"
|
|
|
- tags: init
|
|
|
-
|
|
|
-- name: Deploy ROCm Device plugin
|
|
|
- command: "kubectl create -f '{{ rocm_device_plugin_yaml_url }}'"
|
|
|
- changed_when: true
|
|
|
- register: amd_gpu_enable
|
|
|
- when: "'amdgpu-device-plugin' not in k8s_pods.stdout"
|
|
|
- tags: init
|
|
|
-
|
|
|
-- name: Deploy Volcano Scheduling
|
|
|
- command: "kubectl apply -f '{{ volcano_scheduling_yaml_url }}'"
|
|
|
- changed_when: true
|
|
|
- when: "'volcano-system' not in k8s_pods.stdout"
|
|
|
- tags: init
|
|
|
-
|
|
|
-- name: Install Spark Operator
|
|
|
- command: "helm repo add spark-operator '{{ spark_operator_repo }}'"
|
|
|
- changed_when: true
|
|
|
- tags: init
|
|
|
-
|
|
|
-- name: Install Spark Operator Namespace
|
|
|
- command: helm install my-release spark-operator/spark-operator --set image.tag={{ operator_image_tag }} --namespace spark-operator --create-namespace
|
|
|
- changed_when: true
|
|
|
- when: "'spark-operator' not in k8s_pods.stdout"
|
|
|
- tags: init
|
|
|
-
|
|
|
-- name: Wait for k8s pod to come to ready state
|
|
|
- block:
|
|
|
- - name: Wait for k8s pod to come to ready state
|
|
|
- command: "kubectl wait --for=condition=ready -n {{ item.namespace }} pod -l app={{ item.app }} --timeout={{ item.timeout }}"
|
|
|
- with_items:
|
|
|
- - { namespace: "default", app: "nfs-client-provisioner", timeout: "10m" }
|
|
|
- - { namespace: "volcano-system", app: "volcano-scheduler", timeout: "5m" }
|
|
|
- changed_when: false
|
|
|
- tags: install
|
|
|
- rescue:
|
|
|
- - name: Get K8s pods
|
|
|
- command: kubectl get pods --all-namespaces
|
|
|
- changed_when: false
|
|
|
- register: k8s_pods
|
|
|
- tags: init
|
|
|
-
|
|
|
- - name: Fail message
|
|
|
- fail:
|
|
|
- msg: "{{ docker_pull_limit_msg }}"
|
|
|
- when:
|
|
|
- - "'ImagePullBackOff' in k8s_pods.stdout or 'ErrImagePull' in k8s_pods.stdout"
|
|
|
- - not hostvars['127.0.0.1']['docker_username'] and not hostvars['127.0.0.1']['docker_password']
|
|
|
-
|
|
|
- - name: Pull K8s services docker images
|
|
|
- command: docker pull {{ item }}
|
|
|
- with_items: "{{ k8s_docker_images }}"
|
|
|
- when:
|
|
|
- - "'ImagePullBackOff' in k8s_pods.stdout or 'ErrImagePull' in k8s_pods.stdout"
|
|
|
- - hostvars['127.0.0.1']['docker_username'] and hostvars['127.0.0.1']['docker_password']
|
|
|
- register: docker_image_pull_result
|
|
|
- until: docker_image_pull_result is not failed
|
|
|
- retries: 5
|
|
|
+- name: Check K8s pods
|
|
|
+ include_tasks: check_k8s_pods.yml
|
|
|
+ tags: install
|