test_omnia_validation.yml 15 KB


# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
  # Testcase OMNIA_1.1_US_CRD_TC_001
  # Execute omnia.yml with separate servers for the manager, compute, login and nfs nodes, using default parameters.
  #
  # This play runs on the manager and compute hosts and verifies that the
  # kubelet and munge units are active. The *_msg variables used below are
  # expected to come from the listed vars_files (not visible in this file).
  - name: OMNIA_1.1_US_CRD_TC_001
    hosts: manager, compute
    vars_files:
      - test_vars/test_k8s_common_vars.yml
      - test_vars/test_slurm_common_vars.yml
    tasks:
      # Only `name` is passed to systemd, so no state change is requested;
      # the registered result is inspected by the assertion that follows.
      - name: Checking K8s service status
        tags: VERIFY_OMNIA_01
        register: kubelet_service
        systemd:
          name: kubelet

      - name: Validating K8s service status
        tags: VERIFY_OMNIA_01
        assert:
          success_msg: "{{ kubelet_service_success_msg }}"
          fail_msg: "{{ kubelet_service_fail_msg }}"
          that:
            - kubelet_service.status.ActiveState == 'active'

      # Same query-then-assert pattern for the munge unit.
      - name: Checking munge service status
        tags: VERIFY_OMNIA_01
        register: munge_service
        systemd:
          name: munge

      - name: Validating munge service status
        tags: VERIFY_OMNIA_01
        assert:
          success_msg: "{{ munge_service_success_msg }}"
          fail_msg: "{{ munge_service_fail_msg }}"
          that:
            - munge_service.status.ActiveState == 'active'
  # OMNIA_1.1_US_CRD_TC_001, manager-node checks: Kubernetes system pods,
  # add-on pods, the slurm exporter, mariadb, the slurm controller daemons and
  # a sample pod submission. The *_msg variables are expected to come from the
  # listed vars_files (not visible in this file).
  - name: OMNIA_1.1_US_CRD_TC_001
    hosts: manager
    vars_files:
      - test_vars/test_k8s_start_manager_workers_vars.yml
      - test_vars/test_k8s_start_services_vars.yml
      - test_vars/test_slurmexporter_vars.yml
      - test_vars/test_slurm_start_services_vars.yml
      - test_vars/test_login_server_vars.yml
      - test_vars/test_slurm_manager_vars.yml
      - test_vars/test_login_node_vars.yml
    tasks:
      # Errors are ignored here so that the assertions below report the
      # failure through their own fail_msg.
      - name: Checking kube-system pods
        command: kubectl get pods --namespace kube-system --field-selector=status.phase=Running
        register: kube_system_pods
        changed_when: false
        ignore_errors: true
        tags: VERIFY_OMNIA_01

      - name: Validating controller-manager and scheduler and coreDNS pods status
        assert:
          that:
            - "'kube-scheduler' in kube_system_pods.stdout"
            - "'kube-controller' in kube_system_pods.stdout"
          fail_msg: "{{ controller_scheduler_status_fail_msg }}"
          success_msg: "{{ controller_scheduler_status_success_msg }}"
        tags: VERIFY_OMNIA_01

      - name: Validating coreDNS pods status
        assert:
          that:
            - "'coredns' in kube_system_pods.stdout"
          fail_msg: "{{ coredns_status_fail_msg }}"
          success_msg: "{{ coredns_status_success_msg }}"
        tags: VERIFY_OMNIA_01

      - name: Checking all running pods
        command: kubectl get pods --all-namespaces --field-selector=status.phase=Running
        register: running_pods
        changed_when: false
        ignore_errors: true
        tags: VERIFY_OMNIA_01

      - name: Validating Metallb, Prometheus and MPI pods
        assert:
          that:
            - "'metallb' in running_pods.stdout"
            - "'prometheus' in running_pods.stdout"
            - "'mpi-operator' in running_pods.stdout"
          fail_msg: "{{ metallb_prometheus_mpi_pods_fail_msg }}"
          success_msg: "{{ metallb_prometheus_mpi_pods_success_msg }}"
        tags: VERIFY_OMNIA_01

      - name: Validating K8s dashboard
        assert:
          that:
            - "'kubernetes-dashboard' in running_pods.stdout"
          fail_msg: "{{ kubernetes_dashboard_fail_msg }}"
          success_msg: "{{ kubernetes_dashboard_success_msg }}"
        tags: VERIFY_OMNIA_01

      - name: Verify slurm exporter status
        systemd:
          name: prometheus-slurm-exporter
        register: slurm_exporter_status
        tags: VERIFY_OMNIA_01

      - name: Validate slurm exporter service status
        assert:
          that:
            - slurm_exporter_status.status.ActiveState == 'active'
          fail_msg: "{{ slurm_exporter_service_fail_msg }}"
          success_msg: "{{ slurm_exporter_service_success_msg }}"
        tags: VERIFY_OMNIA_01

      # Every task runs in its own process, so a variable exported by one shell
      # task is not visible to the next one. The pod name is therefore captured
      # directly from kubectl's stdout. A lookup failure is tolerated here and
      # reported by the validation task below.
      - name: Get pod name
        command: >-
          kubectl get pods --namespace default
          -l "app=prometheus,component=server"
          -o jsonpath="{.items[0].metadata.name}"
        register: pod_name
        changed_when: false
        failed_when: false
        tags: VERIFY_OMNIA_01

      - name: Check if prometheus-server is in running state
        command: kubectl get pods {{ pod_name.stdout }}
        register: slurm_exporter_pod_status
        ignore_errors: true
        changed_when: false
        tags: VERIFY_OMNIA_01

      # kubectl prints "Error from server" on stderr, so the stdout check alone
      # can never fail. An empty pod name would make the command above list
      # every pod and succeed, so the name itself is checked as well.
      - name: Validate slurm exporter job in k8s services
        assert:
          that:
            - "pod_name.stdout | default('') | length > 0"
            - slurm_exporter_pod_status is succeeded
            - "'Error from server' not in slurm_exporter_pod_status.stdout"
          fail_msg: "{{ slurm_exporter_job_fail_msg }}"
          success_msg: "{{ slurm_exporter_job_success_msg }}"
        tags: VERIFY_OMNIA_01

      - name: Checking mariadb service status
        systemd:
          name: mariadb
        register: mariadb_service
        tags: VERIFY_OMNIA_01

      - name: Validating mariadb service status
        assert:
          that:
            - mariadb_service.status.ActiveState == 'active'
          fail_msg: "{{ mariadb_service_fail_msg }}"
          success_msg: "{{ mariadb_service_success_msg }}"
        tags: VERIFY_OMNIA_01

      - name: Checking slurmctld service status
        systemd:
          name: slurmctld
        register: slurmctld_service
        tags: VERIFY_OMNIA_01

      - name: Checking slurmdbd service status
        systemd:
          name: slurmdbd
        register: slurmdbd_service
        tags: VERIFY_OMNIA_01

      - name: Check if slurm is installed
        command: sinfo -V
        register: slurm_version
        changed_when: false
        ignore_errors: true
        tags: VERIFY_OMNIA_01

      - name: Validating slurmctld service status
        assert:
          that:
            - slurmctld_service.status.ActiveState == 'active'
          fail_msg: "{{ slurmctld_service_fail_msg }}"
          success_msg: "{{ slurmctld_service_success_msg }}"
        tags: VERIFY_OMNIA_01

      - name: Validating slurmdbd service status
        assert:
          that:
            - slurmdbd_service.status.ActiveState == 'active'
          fail_msg: "{{ slurmdbd_service_fail_msg }}"
          success_msg: "{{ slurmdbd_service_success_msg }}"
        tags: VERIFY_OMNIA_01

      # The command module does not go through a shell, so a missing binary
      # never produces "command not found" on stdout. The task result is
      # checked too, so a missing sinfo actually fails the validation.
      - name: Validate slurm installation
        assert:
          that:
            - slurm_version is succeeded
            - "'command not found' not in slurm_version.stdout"
          fail_msg: "{{ slurm_status_fail_msg }}"
          success_msg: "{{ slurm_status_success_msg }}"
        tags: VERIFY_OMNIA_01

      # Failures are suppressed here; whether the pod exists is decided by the
      # lookup and validation tasks that follow.
      # NOTE(review): the nginx pod is not removed afterwards - confirm whether
      # cleanup happens elsewhere.
      - name: Submit kubernetes job
        command: kubectl run nginx --image=nginx --restart=Never
        changed_when: false
        failed_when: false
        tags: VERIFY_OMNIA_01

      - name: Check submitted kubernetes job status
        command: kubectl get pod nginx
        register: kubo_job
        changed_when: false
        ignore_errors: true
        tags: VERIFY_OMNIA_01

      # kubectl reports a missing pod on stderr with a non-zero exit code, so
      # the task result is asserted in addition to the original stdout check.
      - name: Validate kubernetes job submission
        assert:
          that:
            - kubo_job is succeeded
            - "'pods nginx not found' not in kubo_job.stdout"
          fail_msg: "{{ kubernetes_job_status_fail_msg }}"
          success_msg: "{{ kubernetes_job_status_success_msg }}"
        tags: VERIFY_OMNIA_01
  # OMNIA_1.1_US_CRD_TC_001, compute-node checks: slurm client tools are
  # installed and the slurmd unit is active. The *_msg variables are expected
  # to come from the listed vars_files (not visible in this file).
  - name: OMNIA_1.1_US_CRD_TC_001
    hosts: compute
    vars_files:
      - test_vars/test_slurm_workers_vars.yml
    tasks:
      - name: Check if slurm is installed
        command: sinfo -V
        register: slurm_version
        changed_when: false
        ignore_errors: true
        tags: VERIFY_OMNIA_01

      # The systemd module is used rather than the generic service module:
      # the assertion below reads status.ActiveState, which is a systemd
      # return value, and systemd accepts a bare `name` as a status query.
      - name: Checking slurmd service status
        systemd:
          name: slurmd.service
        register: slurmd_service
        tags: VERIFY_OMNIA_01

      # The command module does not go through a shell, so a missing binary
      # never produces "command not found" on stdout. The task result is
      # checked too, so a missing sinfo actually fails the validation.
      - name: Validate slurm installation
        assert:
          that:
            - slurm_version is succeeded
            - "'command not found' not in slurm_version.stdout"
          fail_msg: "{{ slurm_status_fail_msg }}"
          success_msg: "{{ slurm_status_success_msg }}"
        tags: VERIFY_OMNIA_01

      - name: Validating slurmd service status
        assert:
          that:
            - slurmd_service.status.ActiveState == 'active'
          fail_msg: "{{ slurmd_service_fail_msg }}"
          success_msg: "{{ slurmd_service_success_msg }}"
        tags: VERIFY_OMNIA_01
  # OMNIA_1.1_US_CRD_TC_001, FreeIPA checks on the manager and login node:
  # the ipa CLI is installed, the expected firewalld ports are open, and the
  # admin user can obtain a Kerberos ticket. The *_msg variables and
  # ipa_admin_password are expected to be defined outside this play
  # (vars_files or inventory; not visible in this file).
  - name: OMNIA_1.1_US_CRD_TC_001
    hosts: manager, login_node
    vars_files:
      - test_vars/test_login_common_vars.yml
    tasks:
      - name: Checking installed Freeipa version
        command: ipa --version
        register: ipa_version
        changed_when: false
        ignore_errors: true
        tags: VERIFY_OMNIA_01

      # The command module does not go through a shell, so a missing binary
      # never produces "command not found" on stdout. The task result is
      # checked too, so a missing ipa CLI actually fails the validation.
      - name: Validating Freeipa Installation
        assert:
          that:
            - ipa_version is succeeded
            - "'command not found' not in ipa_version.stdout"
          fail_msg: "{{ ipa_install_fail_msg }}"
          success_msg: "{{ ipa_install_success_msg }}"
        tags: VERIFY_OMNIA_01

      # firewalld must be running for firewall-cmd to answer the query below.
      - name: Start and enable firewalld
        service:
          name: firewalld
          state: started
          enabled: true
        tags: VERIFY_OMNIA_01

      - name: Checking firewalld open ports on manager/login node
        command: firewall-cmd --list-ports
        changed_when: false
        register: login_common_firewalld_ports
        tags: VERIFY_OMNIA_01

      # Ports are matched against the whitespace-separated tokens of the
      # output rather than as substrings; otherwise '389/tcp' would be
      # satisfied by '7389/tcp' and '80/tcp' by e.g. '8080/tcp'.
      - name: Validating firewalld open ports on manager/login node
        assert:
          that:
            - "'80/tcp' in login_common_firewalld_ports.stdout.split()"
            - "'443/tcp' in login_common_firewalld_ports.stdout.split()"
            - "'389/tcp' in login_common_firewalld_ports.stdout.split()"
            - "'636/tcp' in login_common_firewalld_ports.stdout.split()"
            - "'88/tcp' in login_common_firewalld_ports.stdout.split()"
            - "'464/tcp' in login_common_firewalld_ports.stdout.split()"
            - "'88/udp' in login_common_firewalld_ports.stdout.split()"
            - "'464/udp' in login_common_firewalld_ports.stdout.split()"
            - "'53/tcp' in login_common_firewalld_ports.stdout.split()"
            - "'53/udp' in login_common_firewalld_ports.stdout.split()"
            - "'123/udp' in login_common_firewalld_ports.stdout.split()"
            - "'7389/tcp' in login_common_firewalld_ports.stdout.split()"
          fail_msg: "{{ login_common_ports_status_fail_msg }}"
          success_msg: "{{ login_common_ports_status_success_msg }}"
        tags: VERIFY_OMNIA_01

      # NOTE(review): this leaves firewalld stopped and disabled regardless of
      # its state before the test ran - confirm this is the intended end state.
      - name: Stop and disable firewalld
        service:
          name: firewalld
          state: stopped
          enabled: false
        tags: VERIFY_OMNIA_01

      - name: Check Freeipa server/client configuration
        command: ipa help topics
        register: ipa_config
        changed_when: false
        ignore_errors: true
        tags: VERIFY_OMNIA_01

      # failed_when: false makes this validation report-only: the play
      # continues even when the condition is not met.
      - name: Validating Freeipa server/client Configuration
        assert:
          that:
            - "'command not found' not in ipa_config.stdout"
          fail_msg: "{{ ipa_configuration_fail_msg }}"
          success_msg: "{{ ipa_configuration_success_msg }}"
        failed_when: false
        tags: VERIFY_OMNIA_01

      # Despite its name, this task authenticates as the IPA admin user.
      # The password is fed to kinit on stdin instead of being interpolated
      # into a shell pipeline, so shell metacharacters in the password cannot
      # break or alter the command; no_log keeps it out of the task output.
      - name: Ensure host is present
        command: kinit admin
        args:
          stdin: "{{ ipa_admin_password }}"
        register: authen
        changed_when: false
        ignore_errors: true
        no_log: true
        tags: VERIFY_OMNIA_01

      - name: Validate admin user in ipa server/client
        assert:
          that:
            - authen.rc == 0
          fail_msg: "{{ admin_user_authentication_status_fail_msg }}"
          success_msg: "{{ admin_user_authentication_status_success_msg }}"
        tags: VERIFY_OMNIA_01
  # OMNIA_1.1_US_CRD_TC_001, login-node checks: the slurmd unit is active and
  # a slurm job can be submitted. `nodes`, `ntasks` and the *_msg variables
  # are expected to come from the listed vars_files (not visible in this file).
  - name: OMNIA_1.1_US_CRD_TC_001
    hosts: login_node
    gather_facts: false
    vars_files:
      - test_vars/test_login_node_vars.yml
      - test_vars/test_slurm_workers_vars.yml
    tasks:
      # The systemd module is used rather than the generic service module:
      # the assertion below reads status.ActiveState, which is a systemd
      # return value, and systemd accepts a bare `name` as a status query
      # without needing gathered facts to pick a service manager.
      - name: Checking slurmd service status
        systemd:
          name: slurmd.service
        register: slurmd_service
        tags: VERIFY_OMNIA_01

      - name: Validating slurmd service status
        assert:
          that:
            - slurmd_service.status.ActiveState == 'active'
          fail_msg: "{{ slurmd_service_fail_msg }}"
          success_msg: "{{ slurmd_service_success_msg }}"
        tags: VERIFY_OMNIA_01

      # Runs `hostname` through srun on the "normal" partition; errors are
      # ignored so the validation task below can report the outcome.
      - name: Submit slurm jobs
        command: srun --nodes "{{ nodes }}" --ntasks-per-node "{{ ntasks }}" --partition normal hostname
        register: job_status
        changed_when: false
        ignore_errors: true
        tags: VERIFY_OMNIA_01

      # failed_when: false makes this validation report-only: the play
      # continues even when the expected host name is absent from the output.
      # NOTE(review): 'compute.ipa.test' is a hard-coded host name - confirm
      # it matches the environment under test.
      - name: Validate slurm job submission
        assert:
          that: "'compute.ipa.test' in job_status.stdout"
          fail_msg: "{{ slurm_job_status_fail_msg }}"
          success_msg: "{{ slurm_job_status_success_msg }}"
        failed_when: false
        tags: VERIFY_OMNIA_01
  # OMNIA_1.1_US_CRD_TC_001, NFS-node checks: the rpcbind, nfs-server,
  # nfs-lock and nfs-idmap units are active and the expected directory is
  # exported. `nfs_dir` and the *_msg variables are expected to come from the
  # listed vars_files (not visible in this file).
  - name: OMNIA_1.1_US_CRD_TC_001
    hosts: nfs_node
    vars_files:
      - test_vars/test_nfs_node_vars.yml
    tasks:
      # Each unit is queried with a bare `name` (no state change requested)
      # and the registered status is asserted in the task that follows.
      - name: Checking rpcbind service status
        systemd:
          name: rpcbind
        register: rpcbind_service
        tags: VERIFY_OMNIA_01

      - name: Validating rpcbind service status
        assert:
          that:
            - rpcbind_service.status.ActiveState == 'active'
          fail_msg: "{{ rpcbind_service_fail_msg }}"
          success_msg: "{{ rpcbind_service_success_msg }}"
        tags: VERIFY_OMNIA_01

      - name: Checking nfs-server service status
        systemd:
          name: nfs-server
        register: nfs_server_service
        tags: VERIFY_OMNIA_01

      - name: Validating nfs-server service status
        assert:
          that:
            - nfs_server_service.status.ActiveState == 'active'
          fail_msg: "{{ nfs_server_service_fail_msg }}"
          success_msg: "{{ nfs_server_service_success_msg }}"
        tags: VERIFY_OMNIA_01

      - name: Checking nfs-lock service status
        systemd:
          name: nfs-lock
        register: nfs_lock_service
        tags: VERIFY_OMNIA_01

      - name: Validating nfs-lock service status
        assert:
          that:
            - nfs_lock_service.status.ActiveState == 'active'
          fail_msg: "{{ nfs_lock_service_fail_msg }}"
          success_msg: "{{ nfs_lock_service_success_msg }}"
        tags: VERIFY_OMNIA_01

      - name: Checking nfs-idmap service status
        systemd:
          name: nfs-idmap
        register: nfs_idmap_service
        tags: VERIFY_OMNIA_01

      - name: Validating nfs-idmap service status
        assert:
          that:
            - nfs_idmap_service.status.ActiveState == 'active'
          fail_msg: "{{ nfs_idmap_service_fail_msg }}"
          success_msg: "{{ nfs_idmap_service_success_msg }}"
        tags: VERIFY_OMNIA_01

      - name: Check if nfs server setup is complete
        command: exportfs -v
        changed_when: false
        register: nfs_share
        tags: VERIFY_OMNIA_01

      # `that` is already evaluated as a Jinja expression, so the variable is
      # referenced directly. Wrapping it in "{{ }}" triggers Ansible's warning
      # about templating delimiters in conditionals and breaks if the value
      # ever contains a quote character.
      - name: Validate nfs server setup
        assert:
          that:
            - nfs_dir in nfs_share.stdout
          fail_msg: "{{ nfs_server_fail_msg }}"
          success_msg: "{{ nfs_server_success_msg }}"
        tags: VERIFY_OMNIA_01