# Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
  # Query the rollout status of the coredns deployment in kube-system.
  # This is a read-only check, so it never reports a change, and a failure is
  # tolerated so that the remaining init tasks still run.
  - name: Wait for CoreDNS to restart
    command: kubectl rollout status deployment/coredns -n kube-system
    changed_when: false
    # Canonical lowercase boolean (was `True`).
    ignore_errors: true
    tags: init
  # Capture the pod listing across all namespaces in `k8s_pods`; later tasks
  # search its stdout to decide whether a component still needs deploying.
  - name: Get K8s pods
    command: >-
      kubectl get pods --all-namespaces
    register: k8s_pods
    # Listing pods does not modify anything.
    changed_when: false
    tags: init
  # Apply the MetalLB manifest referenced by `metallb_yaml_url`, but only when
  # the pod listing held in `k8s_pods` does not mention "metallb".
  - name: Deploy MetalLB
    when: "'metallb' not in k8s_pods.stdout"
    command: >-
      kubectl apply -f '{{ metallb_yaml_url }}'
    changed_when: true
    tags: init
  # Copy metal-config.yaml to `metallb_config_file_dest`, owned by root:root,
  # with the permissions given by `metallb_config_file_mode`.
  - name: Create MetalLB Setup Config Files
    copy:
      dest: "{{ metallb_config_file_dest }}"
      src: metal-config.yaml
      mode: "{{ metallb_config_file_mode }}"
      group: root
      owner: root
    tags: init
  # Copy metallb.yaml to `metallb_deployment_file_dest`, owned by root:root,
  # with the permissions given by `metallb_deployment_file_mode`.
  - name: Create MetalLB Setup Deployment Files
    copy:
      dest: "{{ metallb_deployment_file_dest }}"
      src: metallb.yaml
      mode: "{{ metallb_deployment_file_mode }}"
      group: root
      owner: root
    tags: init
  # Apply the manifest stored at `metallb_deployment_file_dest`, skipped when
  # "metallb" already appears in the captured pod listing.
  - name: Deploy MetalLB
    when: "'metallb' not in k8s_pods.stdout"
    command: >-
      kubectl apply -f '{{ metallb_deployment_file_dest }}'
    changed_when: true
    tags: init
  # Apply the configuration stored at `metallb_config_file_dest`, skipped when
  # "metallb" already appears in the captured pod listing.
  - name: Create default setup for MetalLB
    when: "'metallb' not in k8s_pods.stdout"
    command: >-
      kubectl apply -f '{{ metallb_config_file_dest }}'
    changed_when: true
    tags: init
  # Create the resources described by `k8s_dashboard_yaml_url` unless the
  # captured pod listing already contains "kubernetes-dashboard".
  - name: Start k8s dashboard
    when: "'kubernetes-dashboard' not in k8s_pods.stdout"
    command: >-
      kubectl create -f '{{ k8s_dashboard_yaml_url }}'
    changed_when: true
    tags: init
  # Register `helm_stable_repo_url` with Helm under the name "stable".
  # Runs unconditionally and is always reported as changed.
  - name: Helm - add stable repo
    command: >-
      helm repo add stable '{{ helm_stable_repo_url }}'
    tags: init
    changed_when: true
  # Register `nvidia_k8s_device_plugin_repo_url` with Helm under the name
  # "nvdp". Runs unconditionally and is always reported as changed.
  - name: Helm - add Nvidia k8s-device-plugin (nvdp) repo
    command: >-
      helm repo add nvdp '{{ nvidia_k8s_device_plugin_repo_url }}'
    tags: init
    changed_when: true
  # Register `nvidia_gpu_discovery_repo_url` with Helm under the name "nvgfd".
  # Runs unconditionally and is always reported as changed.
  - name: Helm - add Nvidia GPU discovery (nvgfd) repo
    command: >-
      helm repo add nvgfd '{{ nvidia_gpu_discovery_repo_url }}'
    tags: init
    changed_when: true
  # Run `helm repo update` after the repositories above have been added.
  # Always reported as changed.
  - name: Helm - update repo
    command: >-
      helm repo update
    tags: init
    changed_when: true
  # Install the stable/nfs-client-provisioner chart with the server and path
  # taken from `nfs_server` and `nfs_path`. Skipped when the captured pod
  # listing already contains "nfs-client-provisioner".
  - name: Start NFS Client Provisioner
    when: "'nfs-client-provisioner' not in k8s_pods.stdout"
    # Folded scalar: the lines below are joined with single spaces.
    command: >-
      helm install stable/nfs-client-provisioner
      --set nfs.server='{{ nfs_server }}'
      --set nfs.path='{{ nfs_path }}'
      --generate-name
    changed_when: true
    tags: init
  # Patch the nfs-client StorageClass with the
  # storageclass.kubernetes.io/is-default-class=true annotation.
  #
  # No shell features are needed, so this uses `command` rather than `shell`.
  # The folded scalar joins the two lines with a single space, so no backslash
  # continuation is used. A trailing backslash in a folded scalar only behaves
  # as a continuation when the next line is more indented; at equal
  # indentation it turns into an escaped space and corrupts the argument list.
  - name: Set NFS-Client Provisioner as DEFAULT StorageClass
    command: >-
      kubectl patch storageclasses.storage.k8s.io nfs-client
      -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'
    changed_when: true
    tags: init
  # Stat `prometheus_path_on_host` and keep the result in `prometheus_status`
  # for the deletion task that follows. The check never reports a change, and
  # errors are ignored so a failed stat does not abort the play.
  - name: Check if prometheus is installed on the host
    stat:
      path: "{{ prometheus_path_on_host }}"
    register: prometheus_status
    # Canonical lowercase booleans (were `False` / `yes`).
    changed_when: false
    ignore_errors: true
    tags: init
  # Remove `prometheus_path_on_host` when the preceding stat found it.
  #
  # The stat task ignores errors, so `prometheus_status` may lack a `stat`
  # key if that task failed. The first condition guards against it so this
  # task is skipped instead of failing on an undefined attribute. List
  # conditions are ANDed and evaluated in order.
  - name: Delete prometheus installed on host if it exists
    file:
      path: "{{ prometheus_path_on_host }}"
      state: absent
    when:
      - prometheus_status.stat is defined
      - prometheus_status.stat.exists
    tags: init
  # Copy `slurm_exporter_config_file` to `slurm_exporter_config_file_path`,
  # owned by root:root, with permissions from `slurm_exporter_file_mode`.
  - name: Copy the slurm exporter config file
    copy:
      dest: "{{ slurm_exporter_config_file_path }}"
      src: "{{ slurm_exporter_config_file }}"
      mode: "{{ slurm_exporter_file_mode }}"
      group: root
      owner: root
    tags: init
  # Determine the source address the host would use to reach 8.8.8.8 and
  # store it in `public_ip` (consumed via `public_ip.stdout`).
  #
  # - A literal block keeps each command on its own line, so no backslash
  #   continuation is needed inside the YAML scalar.
  # - `set -o pipefail` is a bash feature, so the executable is pinned to
  #   /bin/bash rather than relying on whatever /bin/sh happens to be.
  # - The address is taken as the field following "src" instead of the fixed
  #   seventh field, which shifts when the route has no "via <gateway>" hop.
  - name: Fetch the public IP of the host
    shell: |
      set -o pipefail
      ip route get 8.8.8.8 | awk '{for (i = 1; i < NF; i++) if ($i == "src") print $(i + 1)}'
    args:
      executable: /bin/bash
    register: public_ip
    # Read-only lookup; canonical lowercase boolean (was `False`).
    changed_when: false
    tags: init
  # Substitute every match of "localhost" in the copied slurm exporter config
  # with the address captured in `public_ip.stdout`. The target path is the
  # destination directory variable concatenated with the file name variable.
  - name: Add the host IP to config file
    replace:
      regexp: "localhost"
      replace: "{{ public_ip.stdout }}"
      path: "{{ slurm_exporter_config_file_path }}{{ slurm_exporter_config_file }}"
    tags: init
  # Install the stable/prometheus chart unless the captured pod listing
  # already contains "prometheus".
  #
  # - extraScrapeConfigs is loaded from the slurm exporter config file.
  # - Alertmanager and server persistent volumes use the nfs-client
  #   StorageClass; the server service is exposed as a LoadBalancer.
  #
  # The folded scalar joins the lines with single spaces, so the backslash
  # continuations were dropped; inside a folded scalar they only work when
  # the continuation lines are more indented. The comma-joined `--set` was
  # split into one flag per value for readability.
  - name: Prometheus deployment
    command: >-
      helm install stable/prometheus
      --set-file extraScrapeConfigs="{{ slurm_exporter_config_file_path }}{{ slurm_exporter_config_file }}"
      --set alertmanager.persistentVolume.storageClass=nfs-client
      --set server.persistentVolume.storageClass=nfs-client
      --set server.service.type=LoadBalancer
      --generate-name
    changed_when: true
    when: "'prometheus' not in k8s_pods.stdout"
    tags: init
  # Create the resources described by `mpi_operator_yaml_url` unless the
  # captured pod listing already contains "mpi-operator".
  - name: Install MPI Operator
    when: "'mpi-operator' not in k8s_pods.stdout"
    command: >-
      kubectl create -f '{{ mpi_operator_yaml_url }}'
    changed_when: true
    tags: init
  # Install the nvdp/nvidia-device-plugin chart at
  # `nvidia_device_plugin_version` with migStrategy set from `mig_strategy`.
  # Skipped when the captured pod listing contains "nvidia-device-plugin".
  - name: Install nvidia-device-plugin
    when: "'nvidia-device-plugin' not in k8s_pods.stdout"
    # Folded scalar: the lines below are joined with single spaces.
    command: >-
      helm install
      --version='{{ nvidia_device_plugin_version }}'
      --generate-name
      --set migStrategy='{{ mig_strategy }}'
      nvdp/nvidia-device-plugin
    changed_when: true
    tags: init
  # Install the nvgfd/gpu-feature-discovery chart at
  # `gpu_feature_discovery_version` with migStrategy set from `mig_strategy`.
  # NOTE(review): the guard looks for "node-feature-discovery" rather than
  # "gpu-feature-discovery" - presumably the chart's pods carry that name;
  # confirm against a deployed cluster.
  - name: Install GPU Feature Discovery
    when: "'node-feature-discovery' not in k8s_pods.stdout"
    # Folded scalar: the lines below are joined with single spaces.
    command: >-
      helm install
      --version='{{ gpu_feature_discovery_version }}'
      --generate-name
      --set migStrategy='{{ mig_strategy }}'
      nvgfd/gpu-feature-discovery
    changed_when: true
    tags: init
  # Create the resources described by `fpga_device_plugin_yaml_url` unless the
  # captured pod listing already contains "fpga-device-plugin". The task
  # result is registered as `fpga_enable`.
  - name: Deploy Xilinx Device plugin
    when: "'fpga-device-plugin' not in k8s_pods.stdout"
    command: >-
      kubectl create -f '{{ fpga_device_plugin_yaml_url }}'
    register: fpga_enable
    changed_when: true
    tags: init
  # Create the resources described by `rocm_device_plugin_yaml_url` unless the
  # captured pod listing already contains "amdgpu-device-plugin". The task
  # result is registered as `amd_gpu_enable`.
  - name: Deploy ROCm Device plugin
    when: "'amdgpu-device-plugin' not in k8s_pods.stdout"
    command: >-
      kubectl create -f '{{ rocm_device_plugin_yaml_url }}'
    register: amd_gpu_enable
    changed_when: true
    tags: init