# deploy_k8s_services.yml
# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
  # Load the variable files of three sibling roles so the tasks below can
  # reference their values.
  - name: Include common variables
    include_vars:
      file: ../../slurm_exporter/vars/main.yml

  - name: Include k8s_nfs_server_setup variables
    include_vars:
      file: ../../k8s_nfs_server_setup/vars/main.yml

  - name: Include powervault_me4_nfs variables
    include_vars:
      file: ../../powervault_me4_nfs/vars/main.yml
  # Best-effort wait (up to 5 minutes) for the CoreDNS rollout. Because
  # failed_when is false, a timeout or kubectl error does not stop the play.
  - name: Wait for CoreDNS to restart
    command: >-
      kubectl rollout status deployment/coredns
      -n kube-system --timeout=5m
    failed_when: false
    changed_when: false
    tags: init
  # Capture the pod listing of every namespace. Later tasks search
  # k8s_pods.stdout for a component name to decide whether to install it.
  - name: Get K8s pods
    command:
      cmd: kubectl get pods --all-namespaces
    register: k8s_pods
    changed_when: false
    tags: init
  # - name: Get metallb repo
  #   command: "helm repo add metallb '{{ metallb_helm_url }}'"
  #   changed_when: false
  #   tags: init
  # Copy metal-config.yaml to the destination path; it is passed to
  # "kubectl create -f" by the "Apply Metallb config" task.
  - name: Create MetalLB Setup Config Files
    copy:
      src: metal-config.yaml
      dest: "{{ metallb_config_file_dest }}"
      mode: "{{ metallb_config_file_mode }}"
      owner: root
      group: root
    tags: init
  # - name: Create MetalLB Setup Deployment Files
  #   copy:
  #     src: metallb.yaml
  #     dest: "{{ metallb_deployment_file_dest }}"
  #     owner: root
  #     group: root
  #     mode: "{{ metallb_deployment_file_mode }}"
  #   tags: init
  # Create the MetalLB namespace from the upstream v0.10.3 manifest.
  # Skipped when the pod listing in k8s_pods already mentions "metallb".
  - name: Create Metallb namespace
    when: "'metallb' not in k8s_pods.stdout"
    command: >-
      kubectl create -f
      https://raw.githubusercontent.com/metallb/metallb/v0.10.3/manifests/namespace.yaml
    changed_when: true
    tags: init
  # Create the metallb-memberlist secret with a random key, once in the
  # current default namespace and once in metallb-system.
  #
  # The shell module is required here: the command module performs no shell
  # processing, so "$(openssl rand -base64 128)" would be stored verbatim as
  # the secret key instead of being replaced by random data. The substitution
  # is double-quoted so the shell expands it and keeps the multi-line base64
  # output as a single argument.
  #
  # Both tasks are skipped when the pod listing in k8s_pods already mentions
  # "metallb".
  - name: Generate Metallb default secret
    shell: >-
      kubectl create secret generic metallb-memberlist
      --from-literal=secretkey="$(openssl rand -base64 128)"
    changed_when: true
    when: "'metallb' not in k8s_pods.stdout"
    tags: init

  - name: Generate Metallb metallb-system secret
    shell: >-
      kubectl create secret generic -n metallb-system metallb-memberlist
      --from-literal=secretkey="$(openssl rand -base64 128)"
    changed_when: true
    when: "'metallb' not in k8s_pods.stdout"
    tags: init
  # Deploy MetalLB from the upstream v0.10.3 manifest unless the pod listing
  # already mentions "metallb".
  # Commented-out Helm-based alternative:
  #   command: "helm install metallb metallb/metallb -f '{{ metallb_config_file_dest }}'"
  - name: Deploy Metallb
    when: "'metallb' not in k8s_pods.stdout"
    command: >-
      kubectl create -f
      https://raw.githubusercontent.com/metallb/metallb/v0.10.3/manifests/metallb.yaml
    changed_when: true
    tags: init
  # Create the resources described in the copied MetalLB config file.
  # Skipped when the pod listing already mentions "metallb".
  - name: Apply Metallb config
    when: "'metallb' not in k8s_pods.stdout"
    command: >-
      kubectl create -f '{{ metallb_config_file_dest }}'
    changed_when: true
    tags: init
  # Create the dashboard resources from k8s_dashboard_yaml_url unless the pod
  # listing already mentions "kubernetes-dashboard".
  - name: Start k8s dashboard
    when: "'kubernetes-dashboard' not in k8s_pods.stdout"
    command: >-
      kubectl create -f '{{ k8s_dashboard_yaml_url }}'
    changed_when: true
    tags: init
  # Copy the dashboard admin manifest to the target path and apply it.
  # Neither task carries the "init" tag used by the surrounding tasks.
  - name: Copy k8s_dashboard_admin.yml file
    copy:
      src: k8s_dashboard_admin.yaml
      dest: "{{ k8s_dashboard_admin_file_dest }}"
      mode: "{{ k8s_dashboard_admin_file_mode }}"
      owner: root
      group: root

  - name: Create admin user for K8s dashboard
    command: >-
      kubectl apply -f {{ k8s_dashboard_admin_file_dest }}
    changed_when: true
  # Register the Helm repositories used below, then refresh the local index.
  # The two NVIDIA repositories are added only when the local inventory fact
  # reports more than zero NVIDIA GPUs.
  - name: Helm - add stable repo
    command: >-
      helm repo add stable '{{ helm_stable_repo_url }}'
    changed_when: true
    tags: init

  - name: Helm - add Nvidia k8s-device-plugin (nvdp) repo
    when: ansible_local.inventory.nvidia_gpu > 0
    command: >-
      helm repo add nvdp '{{ nvidia_k8s_device_plugin_repo_url }}'
    changed_when: true
    tags: init

  - name: Helm - add Nvidia GPU discovery (nvgfd) repo
    when: ansible_local.inventory.nvidia_gpu > 0
    command: >-
      helm repo add nvgfd '{{ nvidia_gpu_discovery_repo_url }}'
    changed_when: true
    tags: init

  - name: Helm - update repo
    command:
      cmd: helm repo update
    changed_when: true
    tags: init
  # Install the nfs-client-provisioner chart as release "nfs-omnia". Exactly
  # one of the two tasks can run: the powervault_status value stored for host
  # 127.0.0.1 selects between the manager-node export and the NFS-node export.
  # Both are skipped when the pod listing already mentions
  # "nfs-client-provisioner".
  - name: Start NFS Client Provisioner using NFS on manager node
    when:
      - "'nfs-client-provisioner' not in k8s_pods.stdout"
      - not hostvars['127.0.0.1']['powervault_status']
    command: >-
      helm install nfs-omnia stable/nfs-client-provisioner
      --set nfs.server='{{ nfs_server_manager_node }}'
      --set nfs.path='{{ nfs_share_dir }}'
    changed_when: true
    tags: init

  - name: Start NFS Client Provisioner using NFS on NFS Node
    when:
      - "'nfs-client-provisioner' not in k8s_pods.stdout"
      - hostvars['127.0.0.1']['powervault_status']
    command: >-
      helm install nfs-omnia stable/nfs-client-provisioner
      --set nfs.server='{{ nfs_server_nfs_node }}'
      --set nfs.path='{{ me4_nfs_share_k8s }}'
    changed_when: true
    tags: init
  # Annotate the "nfs-client" StorageClass as the cluster default.
  # This task has no "when" guard, so the patch is issued on every run.
  - name: Set NFS-Client Provisioner as DEFAULT StorageClass
    tags: init
    changed_when: true
    shell: >
      kubectl patch storageclasses.storage.k8s.io nfs-client \
      -p '{ "metadata": { "annotations":{ "storageclass.kubernetes.io/is-default-class":"true" }}}'
  # Record whether prometheus_path_on_host exists. Errors from this check are
  # ignored so the play continues.
  - name: Check if prometheus is installed on the host
    stat:
      path: "{{ prometheus_path_on_host }}"
    register: prometheus_status
    ignore_errors: true
    changed_when: false
    tags: init
  # Remove the host-level Prometheus path found by the preceding stat task.
  # The "is defined" guard matters because that stat task ignores its own
  # errors: if it failed, prometheus_status has no "stat" key, and evaluating
  # ".stat.exists" directly would abort the play here.
  - name: Delete prometheus installed on host if it exists
    file:
      path: "{{ prometheus_path_on_host }}"
      state: absent
    when:
      - prometheus_status.stat is defined
      - prometheus_status.stat.exists
    tags: init
  # Place the slurm exporter scrape config on the target. A later task
  # rewrites the "localhost:8080" entry inside this copy.
  - name: Copy the slurm exporter config file
    copy:
      src: "{{ slurm_exporter_config_file }}"
      dest: "{{ slurm_exporter_config_file_path }}"
      mode: "{{ slurm_exporter_file_mode }}"
      owner: root
      group: root
    tags: init
  # Determine the source address the kernel would use to reach 8.8.8.8 and
  # register it as public_ip.
  #
  # The address is taken from the field after the "src" keyword rather than
  # from a fixed column: when the route has no "via <gateway>" hop the columns
  # shift and a fixed index would return the wrong field.
  #
  # bash is requested explicitly because "set -o pipefail" is not supported by
  # every /bin/sh implementation.
  - name: Fetch the public IP of the host
    shell: >
      set -o pipefail &&
      ip route get 8.8.8.8 |
      awk '{ for (i = 1; i < NF; i++) if ($i == "src") { print $(i + 1); exit } }'
    args:
      executable: /bin/bash
    register: public_ip
    changed_when: false
    tags: init
  # Point the copied scrape config at this host: every "localhost:8080" match
  # becomes "<public_ip>:<slurm_exporter_port>".
  - name: Add the host IP to config file
    replace:
      path: "{{ slurm_exporter_config_file_path }}{{ slurm_exporter_config_file }}"
      regexp: "localhost:8080"
      replace: "{{ public_ip.stdout }}:{{ slurm_exporter_port }}"
    tags: init
  # Install the stable/prometheus chart under a generated release name.
  # It passes the slurm exporter scrape config as extraScrapeConfigs, uses the
  # nfs-client StorageClass for both persistent volumes, and exposes the
  # server through a LoadBalancer service.
  # Skipped when the pod listing already mentions "prometheus".
  - name: Prometheus deployment
    when: "'prometheus' not in k8s_pods.stdout"
    command: >
      helm install stable/prometheus \
      --set-file extraScrapeConfigs="{{ slurm_exporter_config_file_path }}{{ slurm_exporter_config_file }}" \
      --set alertmanager.persistentVolume.storageClass=nfs-client,server.persistentVolume.storageClass=nfs-client,server.service.type=LoadBalancer \
      --generate-name
    changed_when: true
    tags: init
  # Create the MPI Operator resources from mpi_operator_yaml_url unless the
  # pod listing already mentions "mpi-operator".
  - name: Install MPI Operator
    when: "'mpi-operator' not in k8s_pods.stdout"
    command: >-
      kubectl create -f '{{ mpi_operator_yaml_url }}'
    changed_when: true
    tags: init
  # Install the NVIDIA charts from the nvdp / nvgfd repositories. Both tasks
  # run only when the local inventory fact reports more than zero NVIDIA GPUs.
  - name: Install nvidia-device-plugin
    when:
      - "'nvidia-device-plugin' not in k8s_pods.stdout"
      - ansible_local.inventory.nvidia_gpu > 0
    command: >-
      helm install
      --version='{{ nvidia_device_plugin_version }}'
      --generate-name
      --set migStrategy='{{ mig_strategy }}'
      nvdp/nvidia-device-plugin
    changed_when: true
    tags: init

  # NOTE(review): the guard looks for "node-feature-discovery" pods, not
  # "gpu-feature-discovery" - confirm this is the pod name the chart creates.
  - name: Install GPU Feature Discovery
    when:
      - "'node-feature-discovery' not in k8s_pods.stdout"
      - ansible_local.inventory.nvidia_gpu > 0
    command: >-
      helm install
      --version='{{ gpu_feature_discovery_version }}'
      --generate-name
      --set migStrategy='{{ mig_strategy }}'
      nvgfd/gpu-feature-discovery
    changed_when: true
    tags: init
  # Create the Xilinx FPGA and AMD ROCm device plugins from their manifest
  # URLs, each only when the pod listing does not already mention it.
  # The registered results (fpga_enable, amd_gpu_enable) are not read by any
  # task in this section.
  - name: Deploy Xilinx Device plugin
    when: "'fpga-device-plugin' not in k8s_pods.stdout"
    command: >-
      kubectl create -f '{{ fpga_device_plugin_yaml_url }}'
    register: fpga_enable
    changed_when: true
    tags: init

  - name: Deploy ROCm Device plugin
    when: "'amdgpu-device-plugin' not in k8s_pods.stdout"
    command: >-
      kubectl create -f '{{ rocm_device_plugin_yaml_url }}'
    register: amd_gpu_enable
    changed_when: true
    tags: init
  # - name: Deploy Volcano Scheduling
  #   command: "kubectl apply -f '{{ volcano_scheduling_yaml_url }}'"
  #   changed_when: true
  #   when: "'volcano-system' not in k8s_pods.stdout"
  #   tags: init
  # Despite its name, the first task only registers the spark-operator Helm
  # repository. The second task performs the install, as release "my-release"
  # in the spark-operator namespace (created if missing).
  - name: Install Spark Operator
    command: >-
      helm repo add spark-operator '{{ spark_operator_repo }}'
    changed_when: true
    tags: init

  - name: Install Spark Operator Namespace
    when: "'spark-operator' not in k8s_pods.stdout"
    command: >-
      helm install my-release spark-operator/spark-operator
      --set image.tag={{ operator_image_tag }}
      --namespace spark-operator
      --create-namespace
    changed_when: true
    tags: init