# Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
# Deploys cluster add-on services through kubectl and helm: MetalLB, the
# Kubernetes dashboard, the NFS client provisioner, Prometheus (with the
# slurm exporter scrape config), the MPI operator, the NVIDIA / Xilinx / ROCm
# device plugins, Volcano and the Spark operator.
#
# Most deployments are guarded by a substring test against the output of
# `kubectl get pods --all-namespaces` (registered as `k8s_pods`) so that a
# component whose pods are already listed is not deployed again.

- name: Include common variables
  include_vars: ../../slurm_exporter/vars/main.yml

# Best effort: a failed or timed-out rollout check never fails the play.
- name: Wait for CoreDNS to restart
  command: kubectl rollout status deployment/coredns -n kube-system --timeout=5m
  changed_when: false
  failed_when: false
  tags: init

# The registered listing feeds every "not in k8s_pods.stdout" guard below.
- name: Get K8s pods
  command: kubectl get pods --all-namespaces
  changed_when: false
  register: k8s_pods
  tags: init

# NOTE(review): two tasks are named "Deploy MetalLB" (this one applies the
# manifest from a URL, the later one applies the copied deployment file).
# Names are left untouched; confirm whether both applies are intended.
- name: Deploy MetalLB
  command: "kubectl apply -f '{{ metallb_yaml_url }}'"
  changed_when: true
  when: "'metallb' not in k8s_pods.stdout"
  tags: init

- name: Create MetalLB Setup Config Files
  copy:
    src: metal-config.yaml
    dest: "{{ metallb_config_file_dest }}"
    owner: root
    group: root
    mode: "{{ metallb_config_file_mode }}"
  tags: init

- name: Create MetalLB Setup Deployment Files
  copy:
    src: metallb.yaml
    dest: "{{ metallb_deployment_file_dest }}"
    owner: root
    group: root
    mode: "{{ metallb_deployment_file_mode }}"
  tags: init

- name: Deploy MetalLB
  command: "kubectl apply -f '{{ metallb_deployment_file_dest }}'"
  changed_when: true
  when: "'metallb' not in k8s_pods.stdout"
  tags: init

- name: Create default setup for MetalLB
  command: "kubectl apply -f '{{ metallb_config_file_dest }}'"
  changed_when: true
  when: "'metallb' not in k8s_pods.stdout"
  tags: init

- name: Start k8s dashboard
  command: "kubectl create -f '{{ k8s_dashboard_yaml_url }}'"
  changed_when: true
  when: "'kubernetes-dashboard' not in k8s_pods.stdout"
  tags: init

# NOTE(review): this task and the next one carry no `tags: init`, unlike the
# surrounding tasks, so they are selected differently by --tags/--skip-tags.
# Left as-is; confirm that this is intentional.
- name: Copy k8s_dashboard_admin.yml file
  copy:
    src: k8s_dashboard_admin.yaml
    dest: "{{ k8s_dashboard_admin_file_dest }}"
    owner: root
    group: root
    mode: "{{ k8s_dashboard_admin_file_mode }}"

# The path is quoted like every other `kubectl ... -f` call in this file.
- name: Create admin user for K8s dashboard
  command: "kubectl apply -f '{{ k8s_dashboard_admin_file_dest }}'"
  changed_when: true

- name: Helm - add stable repo
  command: "helm repo add stable '{{ helm_stable_repo_url }}'"
  changed_when: true
  tags: init

- name: Helm - add Nvidia k8s-device-plugin (nvdp) repo
  command: "helm repo add nvdp '{{ nvidia_k8s_device_plugin_repo_url }}'"
  changed_when: true
  tags: init

- name: Helm - add Nvidia GPU discovery (nvgfd) repo
  command: "helm repo add nvgfd '{{ nvidia_gpu_discovery_repo_url }}'"
  changed_when: true
  tags: init

- name: Helm - update repo
  command: helm repo update
  changed_when: true
  tags: init

- name: Start NFS Client Provisioner
  command: >-
    helm install stable/nfs-client-provisioner
    --set nfs.server='{{ nfs_server }}'
    --set nfs.path='{{ nfs_path }}'
    --generate-name
  changed_when: true
  when: "'nfs-client-provisioner' not in k8s_pods.stdout"
  tags: init

# The folded scalar already joins the lines with spaces, so no trailing
# backslash continuation is needed.
- name: Set NFS-Client Provisioner as DEFAULT StorageClass
  shell: >-
    kubectl patch storageclasses.storage.k8s.io nfs-client
    -p '{ "metadata": { "annotations":{ "storageclass.kubernetes.io/is-default-class":"true" }}}'
  changed_when: true
  tags: init

- name: Check if prometheus is installed on the host
  stat:
    path: "{{ prometheus_path_on_host }}"
  register: prometheus_status
  changed_when: false
  ignore_errors: true
  tags: init

# Errors from the stat task above are ignored, in which case the registered
# result has no `stat` key; test for it before dereferencing `.exists`.
- name: Delete prometheus installed on host if it exists
  file:
    path: "{{ prometheus_path_on_host }}"
    state: absent
  when:
    - prometheus_status.stat is defined
    - prometheus_status.stat.exists
  tags: init

- name: Copy the slurm exporter config file
  copy:
    src: "{{ slurm_exporter_config_file }}"
    dest: "{{ slurm_exporter_config_file_path }}"
    owner: root
    group: root
    mode: "{{ slurm_exporter_file_mode }}"
  tags: init

# `set -o pipefail` is not supported by every /bin/sh, so bash is requested
# explicitly. The address is taken from the field following the `src` keyword
# rather than from a fixed column, because the column shifts when the route
# has no `via <gateway>` pair.
- name: Fetch the public IP of the host
  shell: >-
    set -o pipefail &&
    ip route get 8.8.8.8 |
    awk '{ for (i = 1; i < NF; i++) if ($i == "src") { print $(i + 1); exit } }'
  args:
    executable: /bin/bash
  register: public_ip
  changed_when: false
  tags: init

# Rewrites the placeholder endpoint in the copied scrape config so that it
# points at this host's address and the slurm exporter port.
- name: Add the host IP to config file
  replace:
    path: "{{ slurm_exporter_config_file_path }}{{ slurm_exporter_config_file }}"
    regexp: "localhost:8080"
    replace: "{{ public_ip.stdout }}:{{ slurm_exporter_port }}"
  tags: init

- name: Prometheus deployment
  command: >-
    helm install stable/prometheus
    --set-file extraScrapeConfigs="{{ slurm_exporter_config_file_path }}{{ slurm_exporter_config_file }}"
    --set alertmanager.persistentVolume.storageClass=nfs-client,server.persistentVolume.storageClass=nfs-client,server.service.type=LoadBalancer
    --generate-name
  changed_when: true
  when: "'prometheus' not in k8s_pods.stdout"
  tags: init

- name: Install MPI Operator
  command: "kubectl create -f '{{ mpi_operator_yaml_url }}'"
  changed_when: true
  when: "'mpi-operator' not in k8s_pods.stdout"
  tags: init

- name: Install nvidia-device-plugin
  command: >-
    helm install
    --version='{{ nvidia_device_plugin_version }}'
    --generate-name
    --set migStrategy='{{ mig_strategy }}'
    nvdp/nvidia-device-plugin
  changed_when: true
  when: "'nvidia-device-plugin' not in k8s_pods.stdout"
  tags: init

# NOTE(review): the guard looks for 'node-feature-discovery' although the
# chart installed is gpu-feature-discovery; confirm this is the pod name to
# test for.
- name: Install GPU Feature Discovery
  command: >-
    helm install
    --version='{{ gpu_feature_discovery_version }}'
    --generate-name
    --set migStrategy='{{ mig_strategy }}'
    nvgfd/gpu-feature-discovery
  changed_when: true
  when: "'node-feature-discovery' not in k8s_pods.stdout"
  tags: init

- name: Deploy Xilinx Device plugin
  command: "kubectl create -f '{{ fpga_device_plugin_yaml_url }}'"
  changed_when: true
  register: fpga_enable
  when: "'fpga-device-plugin' not in k8s_pods.stdout"
  tags: init

- name: Deploy ROCm Device plugin
  command: "kubectl create -f '{{ rocm_device_plugin_yaml_url }}'"
  changed_when: true
  register: amd_gpu_enable
  when: "'amdgpu-device-plugin' not in k8s_pods.stdout"
  tags: init

- name: Deploy Volcano Scheduling
  command: "kubectl apply -f '{{ volcano_scheduling_yaml_url }}'"
  changed_when: true
  when: "'volcano-system' not in k8s_pods.stdout"
  tags: init

# Despite its name, this task only registers the spark-operator helm repo;
# the chart itself is installed by the next task.
- name: Install Spark Operator
  command: "helm repo add spark-operator '{{ spark_operator_repo }}'"
  changed_when: true
  tags: init

- name: Install Spark Operator Namespace
  command: >-
    helm install my-release spark-operator/spark-operator
    --set image.tag={{ operator_image_tag }}
    --namespace spark-operator
    --create-namespace
  changed_when: true
  when: "'spark-operator' not in k8s_pods.stdout"
  tags: init

# Waits for the listed pods to become ready. If the wait fails, the rescue
# section refreshes the pod listing and, when an image pull error is reported,
# either fails with `docker_pull_limit_msg` (no docker credentials set on
# 127.0.0.1) or pulls the images directly (credentials set).
# NOTE(review): the wait task is tagged `install` while the rescue's pod
# listing is tagged `init` and the other rescue tasks are untagged; confirm
# the combination behaves as intended under --tags/--skip-tags.
- name: Wait for k8s pod to come to ready state
  block:
    - name: Wait for k8s pod to come to ready state
      command: "kubectl wait --for=condition=ready -n {{ item.namespace }} pod -l app={{ item.app }} --timeout={{ item.timeout }}"
      with_items:
        - namespace: "default"
          app: "nfs-client-provisioner"
          timeout: "10m"
        - namespace: "volcano-system"
          app: "volcano-scheduler"
          timeout: "5m"
      changed_when: false
      tags: install
  rescue:
    - name: Get K8s pods
      command: kubectl get pods --all-namespaces
      changed_when: false
      register: k8s_pods
      tags: init

    - name: Fail message
      fail:
        msg: "{{ docker_pull_limit_msg }}"
      when:
        - "'ImagePullBackOff' in k8s_pods.stdout or 'ErrImagePull' in k8s_pods.stdout"
        - not hostvars['127.0.0.1']['docker_username'] and not hostvars['127.0.0.1']['docker_password']

    # Retried up to 5 times until the pull stops failing.
    - name: Pull K8s services docker images
      command: "docker pull {{ item }}"
      with_items: "{{ k8s_docker_images }}"
      when:
        - "'ImagePullBackOff' in k8s_pods.stdout or 'ErrImagePull' in k8s_pods.stdout"
        - hostvars['127.0.0.1']['docker_username'] and hostvars['127.0.0.1']['docker_password']
      register: docker_image_pull_result
      until: docker_image_pull_result is not failed
      retries: 5