# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
# Load the variable files that live under three sibling roles. Tasks further
# down reference variables that are not defined in this file; presumably they
# come from these vars files (verify against the roles).
- name: Include common variables
  include_vars:
    file: ../../slurm_exporter/vars/main.yml

- name: Include k8s_nfs_server_setup variables
  include_vars:
    file: ../../k8s_nfs_server_setup/vars/main.yml

- name: Include powervault_me4_nfs variables
  include_vars:
    file: ../../powervault_me4_nfs/vars/main.yml
# Best-effort wait for the CoreDNS rollout: the result never fails the play
# (failed_when is false) and is never reported as a change.
- name: Wait for CoreDNS to restart
  command: >-
    kubectl rollout status deployment/coredns
    -n kube-system --timeout=5m
  failed_when: false
  changed_when: false
  tags: init

# Capture the pod listing for all namespaces. Later tasks do substring checks
# against k8s_pods.stdout to decide whether a component must be installed.
- name: Get K8s pods
  command: >-
    kubectl get pods --all-namespaces
  register: k8s_pods
  changed_when: false
  tags: init
# MetalLB setup. Every kubectl step is skipped when the pod listing captured
# in k8s_pods already contains the substring "metallb"; the two copy tasks
# always run.
# NOTE(review): two tasks in this group share the name "Deploy MetalLB" (one
# applies the upstream URL, the other the copied local file).
- name: Deploy MetalLB
  command: >-
    kubectl apply -f '{{ metallb_yaml_url }}'
  when:
    - "'metallb' not in k8s_pods.stdout"
  changed_when: true
  tags: init

- name: Create MetalLB Setup Config Files
  copy:
    dest: "{{ metallb_config_file_dest }}"
    src: metal-config.yaml
    mode: "{{ metallb_config_file_mode }}"
    group: root
    owner: root
  tags: init

- name: Create MetalLB Setup Deployment Files
  copy:
    dest: "{{ metallb_deployment_file_dest }}"
    src: metallb.yaml
    mode: "{{ metallb_deployment_file_mode }}"
    group: root
    owner: root
  tags: init

# Apply the deployment manifest that was just copied to the host.
- name: Deploy MetalLB
  command: >-
    kubectl apply -f '{{ metallb_deployment_file_dest }}'
  when:
    - "'metallb' not in k8s_pods.stdout"
  changed_when: true
  tags: init

# Apply the configuration manifest that was just copied to the host.
- name: Create default setup for MetalLB
  command: >-
    kubectl apply -f '{{ metallb_config_file_dest }}'
  when:
    - "'metallb' not in k8s_pods.stdout"
  changed_when: true
  tags: init
# Create the dashboard resources only when no pod name in the earlier listing
# contains "kubernetes-dashboard" (kubectl create would fail on existing
# objects).
- name: Start k8s dashboard
  command: "kubectl create -f '{{ k8s_dashboard_yaml_url }}'"
  changed_when: true
  when: "'kubernetes-dashboard' not in k8s_pods.stdout"
  tags: init

# Copy the admin-user manifest shipped with the role onto the host.
- name: Copy k8s_dashboard_admin.yml file
  copy:
    src: k8s_dashboard_admin.yaml
    dest: "{{ k8s_dashboard_admin_file_dest }}"
    owner: root
    group: root
    mode: "{{ k8s_dashboard_admin_file_mode }}"

# This task has no "when" guard, so it runs on every play. The path is quoted
# like the other kubectl invocations in this file so it is always passed as a
# single argument, and the change status is derived from kubectl's own
# per-object report instead of being hard-coded to "changed".
- name: Create admin user for K8s dashboard
  command: "kubectl apply -f '{{ k8s_dashboard_admin_file_dest }}'"
  register: k8s_dashboard_admin_apply
  changed_when: >-
    'created' in k8s_dashboard_admin_apply.stdout or
    'configured' in k8s_dashboard_admin_apply.stdout
# Register the Helm repositories under the aliases "stable", "nvdp" and
# "nvgfd" (charts with these prefixes are installed further down), then
# refresh the local chart index.
- name: Helm - add stable repo
  command: >-
    helm repo add stable '{{ helm_stable_repo_url }}'
  changed_when: true
  tags: init

- name: Helm - add Nvidia k8s-device-plugin (nvdp) repo
  command: >-
    helm repo add nvdp '{{ nvidia_k8s_device_plugin_repo_url }}'
  changed_when: true
  tags: init

- name: Helm - add Nvidia GPU discovery (nvgfd) repo
  command: >-
    helm repo add nvgfd '{{ nvidia_gpu_discovery_repo_url }}'
  changed_when: true
  tags: init

- name: Helm - update repo
  command: >-
    helm repo update
  changed_when: true
  tags: init
# Exactly one of the two installs below can run: both require that no pod name
# contains "nfs-client-provisioner", and they are split on the
# powervault_status value read from the 127.0.0.1 hostvars.
- name: Start NFS Client Provisioner using NFS on manager node
  command: >-
    helm install stable/nfs-client-provisioner
    --set nfs.server='{{ nfs_server_manager_node }}'
    --set nfs.path='{{ nfs_share_dir }}'
    --generate-name
  when:
    - "'nfs-client-provisioner' not in k8s_pods.stdout"
    - not hostvars['127.0.0.1']['powervault_status']
  changed_when: true
  tags: init

- name: Start NFS Client Provisioner using NFS on NFS Node
  command: >-
    helm install stable/nfs-client-provisioner
    --set nfs.server='{{ nfs_server_nfs_node }}'
    --set nfs.path='{{ me4_nfs_share_k8s }}'
    --generate-name
  when:
    - "'nfs-client-provisioner' not in k8s_pods.stdout"
    - hostvars['127.0.0.1']['powervault_status']
  changed_when: true
  tags: init
# Annotate the "nfs-client" StorageClass as the cluster default.
# The folded scalar already joins its lines with a single space, so no
# backslash continuation is used. The invocation needs no shell features
# (pipes, redirects, expansion), so it runs through the command module.
- name: Set NFS-Client Provisioner as DEFAULT StorageClass
  command: >-
    kubectl patch storageclasses.storage.k8s.io nfs-client
    -p '{ "metadata": { "annotations":{ "storageclass.kubernetes.io/is-default-class":"true" }}}'
  changed_when: true
  tags: init
# Probe for a Prometheus path on the host. A failing probe is tolerated
# (ignore_errors) and the probe itself never counts as a change.
- name: Check if prometheus is installed on the host
  stat:
    path: "{{ prometheus_path_on_host }}"
  register: prometheus_status
  changed_when: false
  ignore_errors: true
  tags: init

# Remove the path only when the probe produced a result and the path exists.
# The "is defined" guard keeps a tolerated stat failure (which leaves no
# "stat" key in the registered result) from aborting the play here.
- name: Delete prometheus installed on host if it exists
  file:
    path: "{{ prometheus_path_on_host }}"
    state: absent
  when:
    - prometheus_status.stat is defined
    - prometheus_status.stat.exists
  tags: init
# Put the slurm exporter configuration file on the host, owned by root.
# Presumably this is the file a later task rewrites in place (that task builds
# its path from the same two variables) - confirm.
- name: Copy the slurm exporter config file
  copy:
    dest: "{{ slurm_exporter_config_file_path }}"
    src: "{{ slurm_exporter_config_file }}"
    mode: "{{ slurm_exporter_file_mode }}"
    group: root
    owner: root
  tags: init
# Ask the kernel which source address it would use to reach 8.8.8.8 and
# register it as public_ip.stdout.
# The address is located by the "src" keyword rather than by a fixed field
# number, because the position shifts when the route has no "via" hop or when
# extra attributes are printed. The folded scalar joins the lines with
# spaces, so no backslash continuation is needed.
- name: Fetch the public IP of the host
  shell: >-
    set -o pipefail &&
    ip route get 8.8.8.8 |
    awk '{for (i = 1; i < NF; i++) if ($i == "src") addr = $(i + 1)} END {print addr}'
  register: public_ip
  changed_when: false
  tags: init
# Rewrite the placeholder endpoint "localhost:8080" in the exporter config
# file with the address registered in public_ip and the exporter port.
- name: Add the host IP to config file
  replace:
    regexp: "localhost:8080"
    replace: "{{ public_ip.stdout }}:{{ slurm_exporter_port }}"
    path: "{{ slurm_exporter_config_file_path }}{{ slurm_exporter_config_file }}"
  tags: init
# Install the stable/prometheus chart when no pod name contains "prometheus".
# The slurm exporter config file is passed as extra scrape configuration, both
# persistent volumes use the nfs-client StorageClass, and the server service
# is exposed as a LoadBalancer.
# The folded scalar joins the lines with single spaces; shell-style backslash
# continuations are not needed for the command module and are omitted.
- name: Prometheus deployment
  command: >-
    helm install stable/prometheus
    --set-file extraScrapeConfigs="{{ slurm_exporter_config_file_path }}{{ slurm_exporter_config_file }}"
    --set alertmanager.persistentVolume.storageClass=nfs-client,server.persistentVolume.storageClass=nfs-client,server.service.type=LoadBalancer
    --generate-name
  changed_when: true
  when: "'prometheus' not in k8s_pods.stdout"
  tags: init
# Create the MPI operator resources unless a pod name in the earlier listing
# already contains "mpi-operator".
- name: Install MPI Operator
  command: >-
    kubectl create -f '{{ mpi_operator_yaml_url }}'
  when:
    - "'mpi-operator' not in k8s_pods.stdout"
  changed_when: true
  tags: init
# Install the NVIDIA charts from the nvdp / nvgfd Helm repositories, pinning
# the chart version and passing the same migStrategy value to both.
- name: Install nvidia-device-plugin
  command: >-
    helm install
    --version='{{ nvidia_device_plugin_version }}'
    --generate-name
    --set migStrategy='{{ mig_strategy }}'
    nvdp/nvidia-device-plugin
  when:
    - "'nvidia-device-plugin' not in k8s_pods.stdout"
  changed_when: true
  tags: init

# NOTE(review): the guard looks for "node-feature-discovery" rather than
# "gpu-feature-discovery" - confirm this is the intended pod name to test.
- name: Install GPU Feature Discovery
  command: >-
    helm install
    --version='{{ gpu_feature_discovery_version }}'
    --generate-name
    --set migStrategy='{{ mig_strategy }}'
    nvgfd/gpu-feature-discovery
  when:
    - "'node-feature-discovery' not in k8s_pods.stdout"
  changed_when: true
  tags: init
# Create the FPGA and AMD GPU device-plugin resources when their pods are
# absent from the earlier listing. Each result is registered (fpga_enable,
# amd_gpu_enable); no task in this file reads those variables.
- name: Deploy Xilinx Device plugin
  command: >-
    kubectl create -f '{{ fpga_device_plugin_yaml_url }}'
  register: fpga_enable
  when:
    - "'fpga-device-plugin' not in k8s_pods.stdout"
  changed_when: true
  tags: init

- name: Deploy ROCm Device plugin
  command: >-
    kubectl create -f '{{ rocm_device_plugin_yaml_url }}'
  register: amd_gpu_enable
  when:
    - "'amdgpu-device-plugin' not in k8s_pods.stdout"
  changed_when: true
  tags: init
# Apply the Volcano manifest unless the pod listing already mentions
# "volcano-system".
- name: Deploy Volcano Scheduling
  command: >-
    kubectl apply -f '{{ volcano_scheduling_yaml_url }}'
  when:
    - "'volcano-system' not in k8s_pods.stdout"
  changed_when: true
  tags: init
# Register the Spark operator chart repository under the alias
# "spark-operator". The task is named for what it does (a repo add), matching
# the other "Helm - add ..." tasks in this file.
- name: Helm - add Spark Operator repo
  command: "helm repo add spark-operator '{{ spark_operator_repo }}'"
  changed_when: true
  tags: init

# Install the chart as release "my-release" into the spark-operator namespace
# (created on demand) unless a pod name already contains "spark-operator".
# The image tag is quoted like the other templated arguments in this file.
- name: Install Spark Operator
  command: >-
    helm install my-release spark-operator/spark-operator
    --set image.tag='{{ operator_image_tag }}'
    --namespace spark-operator --create-namespace
  changed_when: true
  when: "'spark-operator' not in k8s_pods.stdout"
  tags: init