# main.yml
# Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
  # Load the variable file that lives under the slurm_exporter role so the
  # tasks below can reference its values.
  - name: Include common variables
    include_vars:
      file: ../../slurm_exporter/vars/main.yml
  # Ask kubectl for the rollout status of the CoreDNS deployment. The task is
  # read-only (never reports "changed") and a failure is tolerated so the rest
  # of the task list still runs.
  - name: Wait for CoreDNS to restart
    tags: init
    command: >-
      kubectl rollout status deployment/coredns
      -n kube-system
    ignore_errors: true
    changed_when: false
  # Snapshot the pod list of every namespace once; later tasks search
  # `k8s_pods.stdout` to decide whether a component still has to be installed.
  - name: Get K8s pods
    tags: init
    command: >-
      kubectl get pods
      --all-namespaces
    register: k8s_pods
    changed_when: false
  # Apply the MetalLB manifest referenced by `metallb_yaml_url`, but only when
  # no pod name in the earlier snapshot contains "metallb".
  - name: Deploy MetalLB
    tags: init
    when: "'metallb' not in k8s_pods.stdout"
    command: >-
      kubectl apply
      -f '{{ metallb_yaml_url }}'
    changed_when: true
  # Place the role's bundled metal-config.yaml at the configured destination,
  # owned by root with the configured file mode.
  - name: Create MetalLB Setup Config Files
    tags: init
    copy:
      dest: "{{ metallb_config_file_dest }}"
      src: metal-config.yaml
      mode: "{{ metallb_config_file_mode }}"
      group: root
      owner: root
  # Place the role's bundled metallb.yaml at the configured destination,
  # owned by root with the configured file mode.
  - name: Create MetalLB Setup Deployment Files
    tags: init
    copy:
      dest: "{{ metallb_deployment_file_dest }}"
      src: metallb.yaml
      mode: "{{ metallb_deployment_file_mode }}"
      group: root
      owner: root
  # Apply the MetalLB deployment manifest that was copied to
  # `metallb_deployment_file_dest`. Runs only when no pod name in the earlier
  # snapshot contains "metallb".
  # The task name is made distinct from the earlier URL-based "Deploy MetalLB"
  # task so that log output and `--start-at-task` are unambiguous.
  - name: Deploy MetalLB from local deployment file
    command: "kubectl apply -f '{{ metallb_deployment_file_dest }}'"
    changed_when: true
    when: "'metallb' not in k8s_pods.stdout"
    tags: init
  # Apply the MetalLB configuration file copied to `metallb_config_file_dest`,
  # gated on the same "metallb" pod-name check as the deployment tasks.
  - name: Create default setup for MetalLB
    tags: init
    when: "'metallb' not in k8s_pods.stdout"
    command: >-
      kubectl apply
      -f '{{ metallb_config_file_dest }}'
    changed_when: true
  # Create the dashboard resources from `k8s_dashboard_yaml_url` unless a pod
  # whose name contains "kubernetes-dashboard" is already listed.
  - name: Start k8s dashboard
    tags: init
    when: "'kubernetes-dashboard' not in k8s_pods.stdout"
    command: >-
      kubectl create
      -f '{{ k8s_dashboard_yaml_url }}'
    changed_when: true
  # Place the role's bundled k8s_dashboard_admin.yaml at the configured
  # destination, owned by root with the configured file mode.
  # This task carries no `init` tag, unlike most tasks in this file.
  - name: Copy k8s_dashboard_admin.yml file
    copy:
      dest: "{{ k8s_dashboard_admin_file_dest }}"
      src: k8s_dashboard_admin.yaml
      mode: "{{ k8s_dashboard_admin_file_mode }}"
      group: root
      owner: root
  # Apply the dashboard admin manifest copied to
  # `k8s_dashboard_admin_file_dest`.
  # The templated path is single-quoted, as in every other `kubectl -f` call
  # in this file, so a destination containing whitespace stays one argument.
  - name: Create admin user for K8s dashboard
    command: "kubectl apply -f '{{ k8s_dashboard_admin_file_dest }}'"
    changed_when: true
  # Register the three Helm chart repositories used by the install tasks
  # further down: "stable", "nvdp" and "nvgfd".
  - name: Helm - add stable repo
    tags: init
    command: >-
      helm repo add stable
      '{{ helm_stable_repo_url }}'
    changed_when: true

  - name: Helm - add Nvidia k8s-device-plugin (nvdp) repo
    tags: init
    command: >-
      helm repo add nvdp
      '{{ nvidia_k8s_device_plugin_repo_url }}'
    changed_when: true

  - name: Helm - add Nvidia GPU discovery (nvgfd) repo
    tags: init
    command: >-
      helm repo add nvgfd
      '{{ nvidia_gpu_discovery_repo_url }}'
    changed_when: true
  # Refresh the local index of the configured Helm repositories.
  - name: Helm - update repo
    tags: init
    command: >-
      helm repo update
    changed_when: true
  # Install the stable/nfs-client-provisioner chart, pointing it at
  # `nfs_server` / `nfs_path`, unless a pod whose name contains
  # "nfs-client-provisioner" is already listed. Helm picks the release name.
  - name: Start NFS Client Provisioner
    tags: init
    when: "'nfs-client-provisioner' not in k8s_pods.stdout"
    command: >-
      helm install stable/nfs-client-provisioner
      --set nfs.server='{{ nfs_server }}'
      --set nfs.path='{{ nfs_path }}'
      --generate-name
    changed_when: true
  # Annotate the `nfs-client` StorageClass as the cluster default.
  # `command` is used instead of `shell` because no shell feature (pipe,
  # redirect, expansion) is needed. The folded scalar already joins the two
  # lines with a space, so no backslash continuation is written.
  - name: Set NFS-Client Provisioner as DEFAULT StorageClass
    command: >-
      kubectl patch storageclasses.storage.k8s.io nfs-client
      -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'
    changed_when: true
    tags: init
  # Record in `prometheus_status` whether `prometheus_path_on_host` exists.
  # Errors from the lookup are ignored and the task never reports "changed".
  - name: Check if prometheus is installed on the host
    tags: init
    stat:
      path: "{{ prometheus_path_on_host }}"
    register: prometheus_status
    ignore_errors: true
    changed_when: false
  # Remove `prometheus_path_on_host` when the preceding stat task found it.
  # That stat task runs with `ignore_errors`, so `prometheus_status.stat` may
  # be missing if the lookup failed; the first condition guards against
  # dereferencing it in that case.
  - name: Delete prometheus installed on host if it exists
    file:
      path: "{{ prometheus_path_on_host }}"
      state: absent
    when:
      - prometheus_status.stat is defined
      - prometheus_status.stat.exists
    tags: init
  # Copy the slurm exporter scrape-config file to
  # `slurm_exporter_config_file_path`, owned by root with the configured mode.
  - name: Copy the slurm exporter config file
    tags: init
    copy:
      dest: "{{ slurm_exporter_config_file_path }}"
      src: "{{ slurm_exporter_config_file }}"
      mode: "{{ slurm_exporter_file_mode }}"
      group: root
      owner: root
  # Determine the source address the host would use to reach 8.8.8.8 and
  # expose it as `public_ip.stdout`.
  #  - bash is requested explicitly because `set -o pipefail` is not
  #    available in every /bin/sh implementation.
  #  - the address is taken from the token following "src" instead of a fixed
  #    field number, since the field position shifts when the route has no
  #    "via <gateway>" hop.
  #  - an empty result fails the task, so an empty address is never written
  #    into the scrape config by the following task.
  - name: Fetch the public IP of the host
    shell: |
      set -o pipefail
      ip route get 8.8.8.8 | awk '{ for (i = 1; i < NF; i++) if ($i == "src") { print $(i + 1); exit } }'
    args:
      executable: /bin/bash
    register: public_ip
    changed_when: false
    failed_when: public_ip.rc != 0 or public_ip.stdout | length == 0
    tags: init
  # Rewrite every "localhost:8080" in the copied exporter config to
  # "<host IP>:<slurm_exporter_port>".
  # NOTE(review): the target path is the plain concatenation of the directory
  # and file variables, so this presumably relies on the directory value
  # ending in "/" - confirm in the vars file.
  - name: Add the host IP to config file
    tags: init
    replace:
      path: "{{ slurm_exporter_config_file_path }}{{ slurm_exporter_config_file }}"
      replace: "{{ public_ip.stdout }}:{{ slurm_exporter_port }}"
      regexp: 'localhost:8080'
  # Install the stable/prometheus chart unless a pod whose name contains
  # "prometheus" is already listed. The slurm exporter config is passed as
  # extra scrape configs, both persistent volumes use the `nfs-client`
  # StorageClass, and the server is exposed through a LoadBalancer service.
  # The folded scalar joins these lines with single spaces on its own, so the
  # backslash continuations were redundant and have been dropped.
  - name: Prometheus deployment
    command: >-
      helm install stable/prometheus
      --set-file extraScrapeConfigs="{{ slurm_exporter_config_file_path }}{{ slurm_exporter_config_file }}"
      --set alertmanager.persistentVolume.storageClass=nfs-client,server.persistentVolume.storageClass=nfs-client,server.service.type=LoadBalancer
      --generate-name
    changed_when: true
    when: "'prometheus' not in k8s_pods.stdout"
    tags: init
  # Create the MPI Operator resources from `mpi_operator_yaml_url` unless a
  # pod whose name contains "mpi-operator" is already listed.
  - name: Install MPI Operator
    tags: init
    when: "'mpi-operator' not in k8s_pods.stdout"
    command: >-
      kubectl create
      -f '{{ mpi_operator_yaml_url }}'
    changed_when: true
  # Install the nvdp/nvidia-device-plugin chart at the pinned version with the
  # configured MIG strategy, unless a pod whose name contains
  # "nvidia-device-plugin" is already listed. Helm picks the release name.
  - name: Install nvidia-device-plugin
    tags: init
    when: "'nvidia-device-plugin' not in k8s_pods.stdout"
    command: >-
      helm install
      --version='{{ nvidia_device_plugin_version }}'
      --generate-name
      --set migStrategy='{{ mig_strategy }}'
      nvdp/nvidia-device-plugin
    changed_when: true
  # Install the nvgfd/gpu-feature-discovery chart at the pinned version with
  # the configured MIG strategy. Helm picks the release name.
  # NOTE(review): the guard searches the pod list for
  # "node-feature-discovery", not "gpu-feature-discovery" - confirm that this
  # is the pod name the chart is expected to create.
  - name: Install GPU Feature Discovery
    tags: init
    when: "'node-feature-discovery' not in k8s_pods.stdout"
    command: >-
      helm install
      --version='{{ gpu_feature_discovery_version }}'
      --generate-name
      --set migStrategy='{{ mig_strategy }}'
      nvgfd/gpu-feature-discovery
    changed_when: true
  # Create the Xilinx FPGA device plugin from its manifest URL unless a pod
  # whose name contains "fpga-device-plugin" is already listed; the task
  # result is kept in `fpga_enable`.
  - name: Deploy Xilinx Device plugin
    tags: init
    when: "'fpga-device-plugin' not in k8s_pods.stdout"
    command: >-
      kubectl create
      -f '{{ fpga_device_plugin_yaml_url }}'
    register: fpga_enable
    changed_when: true

  # Create the ROCm (AMD GPU) device plugin from its manifest URL unless a pod
  # whose name contains "amdgpu-device-plugin" is already listed; the task
  # result is kept in `amd_gpu_enable`.
  - name: Deploy ROCm Device plugin
    tags: init
    when: "'amdgpu-device-plugin' not in k8s_pods.stdout"
    command: >-
      kubectl create
      -f '{{ rocm_device_plugin_yaml_url }}'
    register: amd_gpu_enable
    changed_when: true
  # Register the Spark Operator Helm repository. This step only adds the
  # repo; the task is named to match the other "Helm - add ... repo" tasks in
  # this file rather than claiming to install the operator.
  - name: Helm - add Spark Operator repo
    command: "helm repo add spark-operator '{{ spark_operator_repo }}'"
    changed_when: true
    tags: init

  # Install the spark-operator chart as release "my-release" into the
  # "spark-operator" namespace, creating the namespace if needed. Skipped when
  # "spark-operator" already appears in the earlier pod listing.
  - name: Install Spark Operator
    command: "helm install my-release spark-operator/spark-operator --namespace spark-operator --create-namespace"
    changed_when: true
    when: "'spark-operator' not in k8s_pods.stdout"
    tags: init
  # Apply the Volcano manifest from `volcano_scheduling_yaml_url` unless
  # "volcano-system" already appears in the earlier pod listing.
  - name: Deploy Volcano Scheduling
    tags: init
    when: "'volcano-system' not in k8s_pods.stdout"
    command: >-
      kubectl apply
      -f '{{ volcano_scheduling_yaml_url }}'
    changed_when: true