# Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
  # Load the variable file found two directories up, under
  # slurm_exporter/vars/ (presumably the slurm_exporter role; verify layout).
  - name: Include common variables
    include_vars:
      file: ../../slurm_exporter/vars/main.yml
  # Wait (up to 4 minutes) for the CoreDNS deployment rollout to complete.
  # If that fails, the rescue section lists all pods and then either
  #   * pulls the kube-system images with docker and waits again, when both
  #     docker_username and docker_password are set on host 127.0.0.1, or
  #   * fails with docker_pull_limit_msg, when the pods show an image-pull
  #     error and the credentials are incomplete.
  - name: Wait for CoreDNS to restart
    block:
      - name: Wait for CoreDNS to restart
        command: kubectl rollout status deployment/coredns -n kube-system --timeout=4m
        changed_when: false
        tags: init

    rescue:
      # Read-only query; its output drives the guards below.
      - name: Get K8s pods
        command: kubectl get pods --all-namespaces
        changed_when: false
        register: k8s_pods
        tags: init

      # Both image-pull failure states are handled, matching the guard on
      # the "Fail message" task.
      - name: Pull docker images
        command: docker pull {{ item }}
        with_items: "{{ kube_system_docker_images }}"
        when:
          - hostvars['127.0.0.1']['docker_username'] and hostvars['127.0.0.1']['docker_password']
          - "'ImagePullBackOff' in k8s_pods.stdout or 'ErrImagePull' in k8s_pods.stdout"
        register: docker_image_pull_result
        until: docker_image_pull_result is not failed
        retries: 5

      # No --timeout here, unlike the first attempt.
      - name: Wait for CoreDNS to restart
        command: kubectl rollout status deployment/coredns -n kube-system
        changed_when: false
        when: hostvars['127.0.0.1']['docker_username'] and hostvars['127.0.0.1']['docker_password']
        tags: init

      # Exact complement of the credential check used above, so a
      # half-configured login (only username or only password) is reported
      # instead of letting the rescue finish without doing anything.
      - name: Fail message
        fail:
          msg: "{{ docker_pull_limit_msg }}"
        when:
          - "'ImagePullBackOff' in k8s_pods.stdout or 'ErrImagePull' in k8s_pods.stdout"
          - not (hostvars['127.0.0.1']['docker_username'] and hostvars['127.0.0.1']['docker_password'])
  # Capture the pod listing for every namespace. The "when" guards of the
  # deployment tasks below search k8s_pods.stdout for a service name and
  # skip the task when it is already present.
  - name: Get K8s pods
    command: >-
      kubectl get pods
      --all-namespaces
    register: k8s_pods
    changed_when: false
    tags:
      - init
  # Apply the MetalLB manifest referenced by metallb_yaml_url, unless the
  # pod listing already mentions "metallb".
  - name: Deploy MetalLB
    command: >-
      kubectl apply
      -f '{{ metallb_yaml_url }}'
    when:
      - "'metallb' not in k8s_pods.stdout"
    changed_when: true
    tags:
      - init
  # Copy the MetalLB configuration file shipped with this role to the
  # target, owned by root with the mode taken from metallb_config_file_mode.
  - name: Create MetalLB Setup Config Files
    copy:
      dest: "{{ metallb_config_file_dest }}"
      src: metal-config.yaml
      mode: "{{ metallb_config_file_mode }}"
      owner: root
      group: root
    tags:
      - init

  # Copy the MetalLB deployment manifest the same way.
  - name: Create MetalLB Setup Deployment Files
    copy:
      dest: "{{ metallb_deployment_file_dest }}"
      src: metallb.yaml
      mode: "{{ metallb_deployment_file_mode }}"
      owner: root
      group: root
    tags:
      - init
  # Apply the copied deployment manifest, then the copied configuration.
  # Both are skipped when the pod listing already mentions "metallb".
  - name: Deploy MetalLB
    command: >-
      kubectl apply
      -f '{{ metallb_deployment_file_dest }}'
    when:
      - "'metallb' not in k8s_pods.stdout"
    changed_when: true
    tags:
      - init

  - name: Create default setup for MetalLB
    command: >-
      kubectl apply
      -f '{{ metallb_config_file_dest }}'
    when:
      - "'metallb' not in k8s_pods.stdout"
    changed_when: true
    tags:
      - init
  # Create the dashboard resources from k8s_dashboard_yaml_url. Uses
  # "kubectl create", so it is guarded by the pod listing.
  - name: Start k8s dashboard
    command: >-
      kubectl create
      -f '{{ k8s_dashboard_yaml_url }}'
    when:
      - "'kubernetes-dashboard' not in k8s_pods.stdout"
    changed_when: true
    tags:
      - init
  # Copy the dashboard admin manifest shipped with this role and apply it.
  # NOTE(review): unlike the other tasks in this file, these two carry no
  # "init" tag, so they are skipped under --tags init; confirm intent.
  - name: Copy k8s_dashboard_admin.yml file
    copy:
      src: k8s_dashboard_admin.yaml
      dest: "{{ k8s_dashboard_admin_file_dest }}"
      owner: root
      group: root
      mode: "{{ k8s_dashboard_admin_file_mode }}"

  # The path is single-quoted like every other "kubectl ... -f" call in
  # this file, so a destination containing whitespace stays one argument.
  - name: Create admin user for K8s dashboard
    command: "kubectl apply -f '{{ k8s_dashboard_admin_file_dest }}'"
    changed_when: true
  # Register the Helm repositories named stable, nvdp and nvgfd, then
  # refresh the local repository index.
  - name: Helm - add stable repo
    command: >-
      helm repo add stable
      '{{ helm_stable_repo_url }}'
    changed_when: true
    tags:
      - init

  - name: Helm - add Nvidia k8s-device-plugin (nvdp) repo
    command: >-
      helm repo add nvdp
      '{{ nvidia_k8s_device_plugin_repo_url }}'
    changed_when: true
    tags:
      - init

  - name: Helm - add Nvidia GPU discovery (nvgfd) repo
    command: >-
      helm repo add nvgfd
      '{{ nvidia_gpu_discovery_repo_url }}'
    changed_when: true
    tags:
      - init

  - name: Helm - update repo
    command: >-
      helm repo update
    changed_when: true
    tags:
      - init
  # Install the stable/nfs-client-provisioner chart under a generated
  # release name, pointing it at nfs_server / nfs_path. Skipped when the
  # pod listing already mentions the provisioner.
  - name: Start NFS Client Provisioner
    command: >-
      helm install stable/nfs-client-provisioner
      --set nfs.server='{{ nfs_server }}'
      --set nfs.path='{{ nfs_path }}'
      --generate-name
    when:
      - "'nfs-client-provisioner' not in k8s_pods.stdout"
    changed_when: true
    tags:
      - init
  # Annotate the "nfs-client" StorageClass as the cluster default.
  # The command is folded into a single line by ">-" (no shell backslash
  # continuation, whose effect inside a folded scalar depends on the
  # indentation of the following line), and it runs through the command
  # module because no shell feature is used. The single-quoted JSON patch
  # is passed to kubectl as one argument.
  - name: Set NFS-Client Provisioner as DEFAULT StorageClass
    command: >-
      kubectl patch storageclasses.storage.k8s.io nfs-client
      -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'
    changed_when: true
    tags: init
  # Look for a Prometheus installation at prometheus_path_on_host.
  # Errors from the lookup are ignored, so the registered result may lack
  # the "stat" key; the delete task below guards against that.
  - name: Check if prometheus is installed on the host
    stat:
      path: "{{ prometheus_path_on_host }}"
    register: prometheus_status
    changed_when: false
    ignore_errors: true
    tags: init

  # Remove the path found above. The first condition keeps this task from
  # raising an undefined-attribute error when the stat task itself failed.
  - name: Delete prometheus installed on host if it exists
    file:
      path: "{{ prometheus_path_on_host }}"
      state: absent
    when:
      - prometheus_status.stat is defined
      - prometheus_status.stat.exists
    tags: init
  # Place the slurm exporter config file at slurm_exporter_config_file_path,
  # owned by root with the mode taken from slurm_exporter_file_mode.
  - name: Copy the slurm exporter config file
    copy:
      dest: "{{ slurm_exporter_config_file_path }}"
      src: "{{ slurm_exporter_config_file }}"
      mode: "{{ slurm_exporter_file_mode }}"
      owner: root
      group: root
    tags:
      - init
  # Determine the source address the host would use to reach 8.8.8.8 and
  # register it as public_ip (consumed via public_ip.stdout below).
  #  * bash is requested explicitly because "set -o pipefail" is not
  #    available in every /bin/sh.
  #  * awk prints the field following the "src" keyword instead of a fixed
  #    column, so the result does not depend on whether the route output
  #    contains a "via <gateway>" part.
  #  * An empty result is treated as a failure rather than being written
  #    into the exporter configuration.
  - name: Fetch the public IP of the host
    shell: |
      set -o pipefail
      ip route get 8.8.8.8 | awk '{for (i = 1; i < NF; i++) if ($i == "src") print $(i + 1)}'
    args:
      executable: /bin/bash
    register: public_ip
    changed_when: false
    failed_when: public_ip.rc != 0 or (public_ip.stdout | trim | length == 0)
    tags: init
  # Rewrite every "localhost:8080" in the copied exporter config to
  # "<public_ip>:<slurm_exporter_port>".
  # NOTE(review): the path is directory + file name concatenated with no
  # separator, so slurm_exporter_config_file_path presumably ends in "/".
  - name: Add the host IP to config file
    replace:
      regexp: "localhost:8080"
      replace: "{{ public_ip.stdout }}:{{ slurm_exporter_port }}"
      path: "{{ slurm_exporter_config_file_path }}{{ slurm_exporter_config_file }}"
    tags:
      - init
  # Install the stable/prometheus chart under a generated release name:
  #  * the slurm exporter config file is supplied as extraScrapeConfigs,
  #  * alertmanager and server volumes use the nfs-client StorageClass,
  #  * the server service is exposed as a LoadBalancer.
  # Skipped when the pod listing already mentions "prometheus".
  # ">-" folds the lines into one command, so no backslash continuations
  # (whose handling inside a folded scalar depends on indentation) are
  # needed.
  - name: Prometheus deployment
    command: >-
      helm install stable/prometheus
      --set-file extraScrapeConfigs="{{ slurm_exporter_config_file_path }}{{ slurm_exporter_config_file }}"
      --set alertmanager.persistentVolume.storageClass=nfs-client,server.persistentVolume.storageClass=nfs-client,server.service.type=LoadBalancer
      --generate-name
    changed_when: true
    when: "'prometheus' not in k8s_pods.stdout"
    tags: init
  # Create the MPI operator resources from mpi_operator_yaml_url unless
  # the pod listing already mentions "mpi-operator".
  - name: Install MPI Operator
    command: >-
      kubectl create
      -f '{{ mpi_operator_yaml_url }}'
    when:
      - "'mpi-operator' not in k8s_pods.stdout"
    changed_when: true
    tags:
      - init
  # Install the nvdp/nvidia-device-plugin chart at the pinned version with
  # the configured MIG strategy, under a generated release name.
  - name: Install nvidia-device-plugin
    command: >-
      helm install
      --version='{{ nvidia_device_plugin_version }}'
      --generate-name
      --set migStrategy='{{ mig_strategy }}'
      nvdp/nvidia-device-plugin
    when:
      - "'nvidia-device-plugin' not in k8s_pods.stdout"
    changed_when: true
    tags:
      - init

  # Install the nvgfd/gpu-feature-discovery chart the same way.
  # NOTE(review): the guard searches for "node-feature-discovery", not the
  # chart name; presumably a pod created by this chart carries that name.
  - name: Install GPU Feature Discovery
    command: >-
      helm install
      --version='{{ gpu_feature_discovery_version }}'
      --generate-name
      --set migStrategy='{{ mig_strategy }}'
      nvgfd/gpu-feature-discovery
    when:
      - "'node-feature-discovery' not in k8s_pods.stdout"
    changed_when: true
    tags:
      - init
  # Create the Xilinx FPGA device plugin from its manifest URL; the task
  # result is registered as fpga_enable.
  - name: Deploy Xilinx Device plugin
    command: >-
      kubectl create
      -f '{{ fpga_device_plugin_yaml_url }}'
    when:
      - "'fpga-device-plugin' not in k8s_pods.stdout"
    register: fpga_enable
    changed_when: true
    tags:
      - init

  # Create the AMD ROCm device plugin from its manifest URL; the task
  # result is registered as amd_gpu_enable.
  - name: Deploy ROCm Device plugin
    command: >-
      kubectl create
      -f '{{ rocm_device_plugin_yaml_url }}'
    when:
      - "'amdgpu-device-plugin' not in k8s_pods.stdout"
    register: amd_gpu_enable
    changed_when: true
    tags:
      - init
  # Despite its name, this task only registers the spark-operator Helm
  # repository; the chart is installed by the next task.
  - name: Install Spark Operator
    command: >-
      helm repo add spark-operator
      '{{ spark_operator_repo }}'
    changed_when: true
    tags:
      - init

  # Install the chart as release "my-release" into the spark-operator
  # namespace, creating the namespace if needed. Skipped when the pod
  # listing already mentions "spark-operator".
  - name: Install Spark Operator Namespace
    command: >-
      helm install my-release spark-operator/spark-operator
      --namespace spark-operator
      --create-namespace
    when:
      - "'spark-operator' not in k8s_pods.stdout"
    changed_when: true
    tags:
      - init
  # Apply the Volcano manifest from volcano_scheduling_yaml_url unless the
  # pod listing already mentions "volcano-system".
  - name: Deploy Volcano Scheduling
    command: >-
      kubectl apply
      -f '{{ volcano_scheduling_yaml_url }}'
    when:
      - "'volcano-system' not in k8s_pods.stdout"
    changed_when: true
    tags:
      - init
  # Refresh k8s_pods after the deployments above; the image-pull task that
  # follows inspects this newer listing.
  - name: Get K8s pods
    command: >-
      kubectl get pods
      --all-namespaces
    register: k8s_pods
    changed_when: false
    tags:
      - init
  # Pull each image in k8s_services_docker_images with docker when the
  # refreshed pod listing shows an image-pull failure and both docker
  # credentials are set on host 127.0.0.1. Each pull is retried up to 5
  # times until it succeeds.
  # Both "ImagePullBackOff" and "ErrImagePull" trigger the pull, matching
  # the states the CoreDNS rescue's "Fail message" task reports.
  - name: Pull K8s services docker images
    command: docker pull {{ item }}
    with_items: "{{ k8s_services_docker_images }}"
    when:
      - "'ImagePullBackOff' in k8s_pods.stdout or 'ErrImagePull' in k8s_pods.stdout"
      - hostvars['127.0.0.1']['docker_username'] and hostvars['127.0.0.1']['docker_password']
    register: docker_image_pull_result
    until: docker_image_pull_result is not failed
    retries: 5