Browse Source

Merge branch 'devel' into bugfix

Shubhangi-dell cách đây 3 năm
mục cha
commit
36a44da5de
100 tập tin đã thay đổi với 2257 bổ sung và 791 xóa
  1. 55 35
      .all-contributorsrc
  2. 1 0
      .gitattributes
  3. 8 1
      .github/workflows/ansible-lint.yml
  4. 29 11
      README.md
  5. 7 1
      control_plane/collect_node_info.yml
  6. 2 0
      control_plane/input_params/base_vars.yml
  7. 24 31
      control_plane/input_params/login_vars.yml
  8. 2 3
      control_plane/roles/collect_device_info/files/create_inventory.yml
  9. 41 26
      control_plane/roles/collect_node_info/files/add_host.yml
  10. 95 33
      control_plane/roles/collect_node_info/files/create_inventory.yml
  11. 17 14
      control_plane/roles/collect_node_info/tasks/main.yml
  12. 8 11
      control_plane/roles/control_plane_common/tasks/pre_requisite.yml
  13. 1 2
      control_plane/roles/control_plane_common/vars/main.yml
  14. 2 0
      control_plane/roles/control_plane_customiso/files/temp_leap15.xml
  15. 9 2
      control_plane/roles/control_plane_ib/files/infiniband_container_configure.yml
  16. 2 2
      control_plane/roles/control_plane_ib/files/k8s_infiniband.yml
  17. 0 2
      control_plane/roles/control_plane_ib/files/start.sh
  18. 1 4
      control_plane/roles/control_plane_ib/tasks/check_prerequisites.yml
  19. 2 13
      control_plane/roles/control_plane_ib/tasks/configure_infiniband_container.yml
  20. 9 1
      control_plane/roles/control_plane_ib/tasks/infiniband_container_image.yml
  21. 1 2
      control_plane/roles/control_plane_ib/tasks/main.yml
  22. 1 2
      control_plane/roles/control_plane_ib/vars/main.yml
  23. 11 7
      control_plane/roles/control_plane_k8s/tasks/k8s_init.yml
  24. 0 4
      control_plane/roles/control_plane_k8s/tasks/k8s_installation.yml
  25. 21 16
      control_plane/roles/control_plane_k8s/tasks/k8s_installation_leap.yml
  26. 7 2
      control_plane/roles/control_plane_k8s/vars/main.yml
  27. 33 21
      control_plane/roles/control_plane_monitoring/tasks/configure_k8s_prom_grafana.yml
  28. 31 19
      control_plane/roles/control_plane_monitoring/tasks/configure_loki_grafana.yml
  29. 21 9
      control_plane/roles/control_plane_security/tasks/install_389ds.yml
  30. 11 5
      control_plane/roles/control_plane_security/tasks/install_ipa_server.yml
  31. 4 2
      control_plane/roles/network_ib/tasks/global_config.yml
  32. 2 0
      control_plane/roles/provision_cobbler/files/temp_leap15.xml
  33. 8 8
      control_plane/roles/provision_idrac/tasks/check_prerequisites.yml
  34. 2 0
      control_plane/roles/webui_awx/files/requirements.yml
  35. 3 1
      control_plane/roles/webui_awx/tasks/awx_configuration.yml
  36. 3 3
      control_plane/roles/webui_awx/vars/main.yml
  37. 64 0
      control_plane/test/test_awx.yml
  38. 28 0
      control_plane/test/test_os_check.yml
  39. 1 0
      control_plane/test/test_os_inventory
  40. 27 0
      control_plane/test/test_os_validation.yml
  41. 2 0
      control_plane/test/test_security_inventory
  42. 284 0
      control_plane/test/test_security_ipa_task_validation.yml
  43. 20 0
      control_plane/test/test_vars/test_awx_vars.yml
  44. 4 0
      control_plane/test/test_vars/test_idrac_vars.yml
  45. 32 0
      control_plane/test/test_vars/test_security_ipa_vars.yml
  46. 20 0
      control_plane/tools/control_plane_cleanup.yml
  47. 13 2
      control_plane/tools/roles/cluster_preperation/tasks/main.yml
  48. 21 5
      control_plane/tools/roles/cluster_preperation/tasks/passwordless_ssh.yml
  49. 4 2
      control_plane/tools/roles/cluster_preperation/vars/main.yml
  50. 52 0
      control_plane/tools/roles/control_plane_cleanup/tasks/decrypt_vault_files.yml
  51. 40 0
      control_plane/tools/roles/control_plane_cleanup/tasks/delete_files_vault_keys.yml
  52. 26 0
      control_plane/tools/roles/control_plane_cleanup/tasks/main.yml
  53. 133 0
      control_plane/tools/roles/control_plane_cleanup/tasks/remove_containers_images.yml
  54. 31 0
      control_plane/tools/roles/control_plane_cleanup/tasks/reset_kubeadm_cluster.yml
  55. 55 0
      control_plane/tools/roles/control_plane_cleanup/vars/main.yml
  56. 6 1
      docs/EXAMPLE_SYSTEM_DESIGNS.md
  57. 10 0
      docs/FAQ.md
  58. 33 23
      docs/INSTALL_OMNIA.md
  59. 101 49
      docs/INSTALL_OMNIA_CONTROL_PLANE.md
  60. 1 1
      docs/MONITOR_CLUSTERS.md
  61. 75 63
      docs/README.md
  62. 74 0
      docs/Security/ENABLE_SECURITY_LOGIN_NODE.md
  63. 128 0
      docs/Security/ENABLE_SECURITY_MANAGEMENT_STATION.md
  64. 0 27
      docs/Security/Enable_Security_LoginNode.md
  65. 0 79
      docs/Security/Enable_Security_ManagementStation.md
  66. 0 0
      docs/Security/LOGIN_USER_CREATION.md
  67. BIN
      docs/Telemetry_Visualization/Images/ParallelCoordinates.png
  68. BIN
      docs/Telemetry_Visualization/Images/PowerMap.png
  69. BIN
      docs/Telemetry_Visualization/Images/SankeyViewer.png
  70. BIN
      docs/Telemetry_Visualization/Images/Spirallayout.gif
  71. 12 18
      docs/Telemetry_Visualization/Visualization.md
  72. 44 0
      docs/Telemetry_Visualization/VISUALIZATION.md
  73. 8 3
      docs/control_plane/device_templates/CONFIGURE_INFINIBAND_SWITCHES.md
  74. 3 1
      docs/control_plane/device_templates/PROVISION_SERVERS.md
  75. 2 2
      docs/control_plane/input_parameters/PROVISION_SERVERS.md
  76. BIN
      docs/images/Omnia_Architecture.png
  77. BIN
      docs/images/Omnia_Flow.png
  78. BIN
      docs/images/Omnia_NetworkConfig_Inet.png
  79. BIN
      docs/images/Omnia_NetworkConfig_NoInet.png
  80. 2 2
      examples/PyTorch/pytorch-deploy.yaml
  81. 0 54
      examples/k8s-TensorFlow-resnet50-multinode-MPIOperator.yaml
  82. 30 0
      roles/cluster_validation/tasks/install_packages.yml
  83. 11 1
      roles/cluster_validation/tasks/main.yml
  84. 5 1
      roles/cluster_validation/vars/main.yml
  85. 2 0
      roles/common/tasks/main.yml
  86. 12 1
      roles/hostname_validation/tasks/main.yml
  87. 110 0
      roles/hostname_validation/tasks/validate_login_node_vars.yml
  88. 2 2
      roles/k8s_start_manager/vars/main.yml
  89. 10 10
      roles/k8s_start_services/files/metal-config.yaml
  90. 17 11
      roles/k8s_start_services/tasks/configure_nginx_prom_grafana.yml
  91. 16 17
      roles/k8s_start_services/tasks/deploy_k8s_services.yml
  92. 3 1
      roles/k8s_start_services/vars/main.yml
  93. 16 11
      roles/k8s_start_workers/tasks/main.yml
  94. 2 1
      roles/k8s_start_workers/vars/main.yml
  95. 14 0
      roles/login_node/tasks/configure_alerting.yml
  96. 21 11
      roles/login_node/tasks/install_389ds.yml
  97. 26 19
      roles/login_node/tasks/install_ipa_client.yml
  98. 36 23
      roles/login_server/tasks/install_ipa_server.yml
  99. 94 57
      roles/powervault_me4_nfs/tasks/nfs_node_configure.yml
  100. 0 0
      roles/powervault_me4_nfs/vars/main.yml

+ 55 - 35
.all-contributorsrc

@@ -39,7 +39,8 @@
         "mentoring",
         "projectManagement",
         "review",
-        "talk"
+        "talk",
+        "bug"
       ]
     },
     {
@@ -66,7 +67,8 @@
         "code",
         "test",
         "bug",
-        "security"
+        "security",
+        "talk"
       ]
     },
     {
@@ -76,10 +78,47 @@
       "profile": "https://github.com/sakshiarora13",
       "contributions": [
         "code",
+        "bug",
+        "talk"
+      ]
+    },
+    {
+      "login": "abhishek-sa1",
+      "name": "Abhishek SA",
+      "avatar_url": "https://avatars.githubusercontent.com/u/94038029?v=4",
+      "profile": "https://github.com/abhishek-sa1",
+      "contributions": [
+        "code",
+        "bug",
+        "doc",
+        "test",
+        "maintenance"
+      ]
+    },
+    {
+      "login": "Shubhangi-dell",
+      "name": "Shubhangi Srivastava",
+      "avatar_url": "https://avatars.githubusercontent.com/u/72869337?v=4",
+      "profile": "https://github.com/Shubhangi-dell",
+      "contributions": [
+        "code",
+        "maintenance",
         "bug"
       ]
     },
     {
+      "login": "cgoveas",
+      "name": "Cassey Goveas",
+      "avatar_url": "https://avatars.githubusercontent.com/u/88071888?v=4",
+      "profile": "https://github.com/cgoveas",
+      "contributions": [
+        "doc",
+        "bug",
+        "maintenance",
+        "talk"
+      ]
+    },
+    {
       "login": "araji",
       "name": "araji",
       "avatar_url": "https://avatars.githubusercontent.com/u/216020?v=4",
@@ -140,37 +179,6 @@
       ]
     },
     {
-      "login": "Shubhangi-dell",
-      "name": "Shubhangi Srivastava",
-      "avatar_url": "https://avatars.githubusercontent.com/u/72869337?v=4",
-      "profile": "https://github.com/Shubhangi-dell",
-      "contributions": [
-        "code",
-        "maintenance",
-        "bug"
-      ]
-    },
-    {
-      "login": "cgoveas",
-      "name": "Cassey Goveas",
-      "avatar_url": "https://avatars.githubusercontent.com/u/88071888?v=4",
-      "profile": "https://github.com/cgoveas",
-      "contributions": [
-        "doc",
-        "bug"
-      ]
-    },
-    {
-      "login": "abhishek-sa1",
-      "name": "Abhishek SA",
-      "avatar_url": "https://avatars.githubusercontent.com/u/94038029?v=4",
-      "profile": "https://github.com/abhishek-sa1",
-      "contributions": [
-        "code",
-        "bug"
-      ]
-    },
-    {
       "login": "Franklin-Johnson",
       "name": "Franklin-Johnson",
       "avatar_url": "https://avatars.githubusercontent.com/u/84760103?v=4",
@@ -363,7 +371,8 @@
       "avatar_url": "https://avatars.githubusercontent.com/u/18387748?v=4",
       "profile": "http://www.myweb.ttu.edu/ngu00336/",
       "contributions": [
-        "code"
+        "code",
+        "plugin"
       ]
     },
     {
@@ -382,7 +391,18 @@
       "avatar_url": "https://avatars.githubusercontent.com/u/100141664?v=4",
       "profile": "https://github.com/shemasr",
       "contributions": [
-        "bug"
+        "bug",
+        "code",
+        "test"
+      ]
+    },
+    {
+      "login": "Khushboodholi",
+      "name": "Khushboodholi",
+      "avatar_url": "https://avatars.githubusercontent.com/u/12014935?v=4",
+      "profile": "https://github.com/Khushboodholi",
+      "contributions": [
+        "code"
       ]
     }
   ],

+ 1 - 0
.gitattributes

@@ -0,0 +1 @@
+*.yml linguist-detectable

+ 8 - 1
.github/workflows/ansible-lint.yml

@@ -17,7 +17,7 @@ jobs:
 
     - name: ansible-lint 
       # replace "master" with any valid ref
-      uses: ansible/ansible-lint-action@master
+      uses: ansible/ansible-lint-action@c37fb7b4bda2c8cb18f4942716bae9f11b0dc9bc
       with:
         # [required]
         # Paths to ansible files (i.e., playbooks, tasks, handlers etc..)
@@ -30,6 +30,13 @@ jobs:
         targets: |
           /github/workspace/omnia.yml
           /github/workspace/control_plane/control_plane.yml
+          /github/workspace/control_plane/collect_device_info.yml
+          /github/workspace/control_plane/collect_node_info.yml
+          /github/workspace/control_plane/ethernet.yml
+          /github/workspace/control_plane/idrac.yml
+          /github/workspace/control_plane/infiniband.yml
+          /github/workspace/control_plane/powervault_me4.yml
+          /github/workspace/telemetry/telemetry.yml
           /github/workspace/platforms/jupyterhub.yml
           /github/workspace/platforms/kubeflow.yml
           /github/workspace/tools/install_tools.yml

Những thay đổi đã bị hủy bỏ vì nó quá lớn
+ 29 - 11
README.md


+ 7 - 1
control_plane/collect_node_info.yml

@@ -1,4 +1,4 @@
-# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,3 +19,9 @@
   gather_facts: false
   roles:
     - collect_node_info
+
+- import_playbook: "{{ playbook_dir }}/roles/collect_node_info/files/create_inventory.yml"
+  vars:
+    host_username: "{{ hostvars['127.0.0.1']['host_username'] }}"
+    host_password: "{{ hostvars['127.0.0.1']['provision_password'] }}"
+    mapping_file: "{{ hostvars['127.0.0.1']['mapping_file'] | bool }}"

+ 2 - 0
control_plane/input_params/base_vars.yml

@@ -94,6 +94,8 @@ awx_organization: "DellEMC"
 
 ### Usage: webui_grafana ###
 # At this location grafana persistent volume will be created.
+# If using telemetry, all telemetry related files will also be stored and
+# both timescale and mysql databases will be mounted to this location.
 mount_location: /opt/omnia/
 
 ### Usage: provision_cobbler, provision_idrac ###

+ 24 - 31
control_plane/input_params/login_vars.yml

@@ -29,25 +29,6 @@ provision_password: ""
 # Required field
 cobbler_password: ""
 
-### Usage: provision_idrac ###
-
-# The username for idrac
-# The username must not contain -,\, ',"
-# Required only if idrac_support: true
-idrac_username: ""
-
-# Password used for idrac
-# The password must not contain -,\, ',"
-# Required only if idrac_support: true
-idrac_password: ""
-
-### Usage: webui_awx ###
-
-# Password used for awx UI
-# The Length of the password should be at least 8.
-# The password must not contain -,\, ',"
-#awx_password: ""
-
 ### Usage: webui_grafana ###
 
 # The username for grafana UI
@@ -61,6 +42,30 @@ grafana_username: ""
 # The password should not be kept 'admin'
 grafana_password: ""
 
+# The directory server operations require an administrative user.
+# This user is referred to as the Directory Manager and has full access to the Directory for system management tasks
+# and will be added to the instance of directory server created for IPA.
+# The password must be at least 8 characters long
+# The password must not contain -,\, ',"
+ms_directory_manager_password: ""
+
+# ms_kerberos_admin_password used by IPA admin user in Rocky OS and used by 389-ds for kerberos admin password in leap OS
+# The IPA server requires an administrative user, named 'admin'.
+# This user is a regular system account used for IPA server administration
+ms_kerberos_admin_password: ""
+
+### Usage: provision_idrac ###
+
+# The username for idrac
+# The username must not contain -,\, ',"
+# Required only if idrac_support: true
+idrac_username: ""
+
+# Password used for idrac
+# The password must not contain -,\, ',"
+# Required only if idrac_support: true
+idrac_password: ""
+
 ### Usage: network_ethernet ###
 
 # The username for ethernet switch
@@ -92,15 +97,3 @@ powervault_me4_username: ""
 # one numeric character and one non-alphanumeric character.
 # The password must not contain -,\, ',", . , < , comma(,)
 powervault_me4_password: ""
-
-# The directory server operations require an administrative user.
-# This user is referred to as the Directory Manager and has full access to the Directory for system management tasks
-# and will be added to the instance of directory server created for IPA.
-# The password must be at least 8 characters long
-# The password must not contain -,\, ',"
-ms_directory_manager_password: ""
-
-# ms_kerberos_admin_password used by IPA admin user in Rocky OS and used by 389-ds for kerberos admin password in leap OS
-# The IPA server requires an administrative user, named 'admin'.
-# This user is a regular system account used for IPA server administration
-ms_kerberos_admin_password: ""

+ 2 - 3
control_plane/roles/collect_device_info/files/create_inventory.yml

@@ -1,4 +1,4 @@
-# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 ---
 
 # This role will not group the devices if user provides invalid credentials
@@ -276,7 +275,7 @@
           when:
             - inventory_hostname not in infiniband_switches.stdout
             - not login.failed
-            - infinibandswitch_info.json.data['Product name'] == infiniband_search_key
+            - (infinibandswitch_info.json.results is defined and infinibandswitch_info.json.results[0].data['Product name'] == infiniband_search_key) or (infinibandswitch_info.json.data is defined and infinibandswitch_info.json.data['Product name'] == infiniband_search_key)
       rescue:
         - name: Failed while adding device to ib_inventory
           debug:

+ 41 - 26
control_plane/roles/collect_node_info/files/add_host.yml

@@ -1,4 +1,4 @@
-# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,40 +12,55 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 ---
+
 - name: Initialise host description
   set_fact:
     host_description: "Description Unavailable"
     
 - name: Fetch description
   set_fact:
-    host_description: "CPU:{{ hostvars[item]['ansible_processor_count'] }}
-    Cores:{{ hostvars[item]['ansible_processor_cores'] }}
-    Memory:{{ hostvars[item]['ansible_memtotal_mb'] }}MB
-    BIOS:{{ hostvars[item]['ansible_bios_version'] }}"
-  ignore_errors: yes
+    host_description: "Service Tag: {{ service_tag }}, Operating System: {{ ansible_distribution }}"
+  failed_when: false
+  when: hostname_check.stdout is defined
 
-- name: Fetch the hosts in awx node inventory
-  command: >-
-    awx --conf.host {{ awx_host }} --conf.username {{ awx_username }} --conf.password {{ awx_password }}
-    --conf.insecure hosts list --inventory node_inventory
-  changed_when: false
-  no_log: true
-  when:
-     - host_description != "Description Unavailable"
-  register: hosts
-  ignore_errors: yes
+- block:
+    - name: Fetch the hosts in awx node inventory
+      command: >-
+        awx --conf.host {{ awx_host }} --conf.username {{ awx_username }} --conf.password {{ awx_password }}
+        --conf.insecure hosts list --inventory node_inventory
+      changed_when: false
+      delegate_to: localhost
+      no_log: true
+      run_once: true
+      register: fetch_hosts
+  rescue:
+    - name: Failed to fetch hosts in AWX
+      fail:
+        msg: "{{ fetch_hosts.stderr }}"
   
-- name: Add the host to awx node inventory if not present
-  command: >-
-    awx --conf.host {{ awx_host }} --conf.username {{ awx_username }} --conf.password {{ awx_password }}
-    --conf.insecure hosts create --name {{ item }} --inventory node_inventory
-  changed_when: true
-  when: item not in hosts.stdout
-  no_log: true
-  ignore_errors: yes
+- block:
+    - name: Add the host to awx node inventory if not present
+      command: >-
+        awx --conf.host {{ awx_host }} --conf.username {{ awx_username }} --conf.password {{ awx_password }}
+        --conf.insecure hosts create --name {{ inventory_hostname }} --description "{{ host_description }}" --inventory node_inventory
+      changed_when: true
+      register: add_host_awx
+      delegate_to: localhost
+      no_log: true
+      when:
+        - hostname_check.stdout is defined
+        - fetch_hosts.stdout is defined
+        - inventory_hostname not in fetch_hosts.stdout
+  rescue:
+    - name: Failed to add host to AWX
+      fail:
+        msg: "{{ add_host_awx.stderr }}"
+      when: add_host_awx is defined
 
 - name: Host added msg
   debug:
-    msg: "{{ host_added_msg + item }}"
+    msg: "{{ hostvars['localhost']['host_added_msg'] + inventory_hostname }}"
   when:
-    - host_description != "Description Unavailable"
+    - host_description != "Description Unavailable"
+    - add_host_awx is defined
+    - add_host_awx is not failed

+ 95 - 33
control_plane/roles/collect_node_info/files/create_inventory.yml

@@ -1,4 +1,4 @@
-# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,7 +13,7 @@
 # limitations under the License.
 ---
 - name: Find reachable hosts
-  hosts: all
+  hosts: node_inventory
   gather_facts: false
   ignore_unreachable: true
   ignore_errors: true
@@ -47,7 +47,7 @@
       command: "cat {{ omnia_config_file }}"
       changed_when: false
       register: config_content
-      #no_log: True
+      no_log: true
 
     - name: Decrpyt omnia_config.yml
       command: >-
@@ -78,7 +78,7 @@
       register: hostname_check
       changed_when: false
       ignore_errors: true
-
+      
     - name: Check if IP is present in mapping file
       command: grep "{{ inventory_hostname }}" ../../provision_cobbler/files/new_host_mapping_file.csv
       delegate_to: localhost
@@ -95,23 +95,39 @@
     - name: Get the static hostname from mapping file
       shell: awk -F',' '$3 == "{{ inventory_hostname }}" { print $2 }' ../../provision_cobbler/files/new_host_mapping_file.csv
       delegate_to: localhost
-      when: ('localhost' in hostname_check.stdout) and (mapping_file_present != "" ) and ( mapping_file | bool == true )
+      when: 
+        - ( hostname_check.stdout is defined )
+        - ( 'localhost' in hostname_check.stdout ) 
+        - ( mapping_file_present != "" ) 
+        - ( mapping_file | bool == true )
       register: host_name
       ignore_errors: true
 
     - name: Set the hostname from mapping file
       command: hostnamectl set-hostname "{{ host_name.stdout + '.' + hostvars['localhost']['domain_name'] }}"
-      when: ('localhost' in hostname_check.stdout) and (mapping_file_present != "" ) and  (mapping_file | bool == true )
+      when: 
+        - ( hostname_check.stdout is defined )
+        - ( 'localhost' in hostname_check.stdout ) 
+        - ( mapping_file_present != "" ) 
+        - ( mapping_file | bool == true )
       ignore_errors: true
 
     - name: Set the hostname if hostname not present mapping file
       command: hostnamectl set-hostname "compute{{ inventory_hostname.split('.')[-2] + '-' + inventory_hostname.split('.')[-1] + '.' + hostvars['localhost']['domain_name'] }}"
-      when: ('localhost' in hostname_check.stdout) and (file_present.rc != 0) and (mapping_file | bool == true )
+      when: 
+        - ( hostname_check.stdout is defined )
+        - ( 'localhost' in hostname_check.stdout )
+        - ( file_present.rc is defined ) 
+        - ( file_present.rc != 0 ) 
+        - ( mapping_file | bool == true )
       ignore_errors: true
 
-    - name: Set the system hostname
+    - name: Set the system hostname if mapping file not present
       command: hostnamectl set-hostname "compute{{ inventory_hostname.split('.')[-2] + '-' + inventory_hostname.split('.')[-1]+'.'+ hostvars['localhost']['domain_name'] }}"
-      when: ('localhost' in hostname_check.stdout) and (mapping_file | bool == false)
+      when: 
+        - ( hostname_check.stdout is defined )
+        - ( 'localhost' in hostname_check.stdout ) 
+        - ( mapping_file | bool == false )
       ignore_errors: true
 
     - name: Add new hostname to /etc/hosts from mapping file
@@ -119,7 +135,11 @@
         dest: /etc/hosts
         line: "{{ inventory_hostname }} {{ host_name.stdout + '.' + hostvars['localhost']['domain_name'] }}"
         state: present
-      when: ('localhost' in hostname_check.stdout) and ( mapping_file_present != "" ) and ( mapping_file | bool == true )
+      when: 
+        - ( hostname_check.stdout is defined )
+        - ( 'localhost' in hostname_check.stdout ) 
+        - ( mapping_file_present != "" ) 
+        - ( mapping_file | bool == true )
       ignore_errors: true
 
     - name: Add new hostname to /etc/hosts if hostname not present mapping file
@@ -127,38 +147,74 @@
         dest: /etc/hosts
         line: "{{ inventory_hostname }} compute{{ inventory_hostname.split('.')[-2] + '-' + inventory_hostname.split('.')[-1]+'.'+ hostvars['localhost']['domain_name'] }}"
         state: present
-      when: ('localhost' in hostname_check.stdout) and ( file_present.rc != 0 ) and ( mapping_file | bool == true )
+      when: 
+        - ( hostname_check.stdout is defined )
+        - ( 'localhost' in hostname_check.stdout )
+        - ( file_present.rc is defined ) 
+        - ( file_present.rc != 0 ) 
+        - ( mapping_file | bool == true )
       ignore_errors: true
 
-    - name: Add new hostname to /etc/hosts
+    - name: Add new hostname to /etc/hosts if mapping file not present
       lineinfile:
         dest: /etc/hosts
         line: "{{ inventory_hostname }} compute{{ inventory_hostname.split('.')[-2] + '-' + inventory_hostname.split('.')[-1] +'.'+ hostvars['localhost']['domain_name'] }}"
         state: present
-      when: ('localhost' in hostname_check.stdout) and (mapping_file | bool == false )
+      when: 
+        - ( hostname_check.stdout is defined )
+        - ( 'localhost' in hostname_check.stdout ) 
+        - ( mapping_file | bool == false )
       ignore_errors: true
 
+    - name: Initialize service tag
+      set_fact:
+        service_tag: "Not Found"
+
+    - name: Install dmidecode package
+      package:
+        name: dmidecode
+        state: present
+
+    - name: Get service tag
+      shell: >
+          set -o pipefail && \
+          dmidecode -t 1 | grep Serial
+      changed_when: false
+      failed_when: false
+      register: service_tag_details
+      when: hostname_check.stdout is defined
+
+    - name: Set fact service tag
+      set_fact:
+        service_tag: "{{ service_tag_details.stdout.split(':')[1].strip() }}"
+      when: service_tag_details.stdout is defined
+
 - name: Update inventory
-  hosts: localhost
+  hosts: reachable
   connection: local
   gather_facts: false
   tasks:
     - name: Encrypt omnia_config.yml file
       command: >-
-        ansible-vault encrypt "{{ omnia_config_file }}"
-        --vault-password-file "{{ omnia_config_vault_file }}"
+        ansible-vault encrypt "{{ hostvars['localhost']['omnia_config_file'] }}"
+        --vault-password-file "{{ hostvars['localhost']['omnia_config_vault_file'] }}"
       changed_when: false
+      delegate_to: localhost
+      run_once: true
 
     - name: Update omnia_config.yml permissions
       file:
-        path: "{{ omnia_config_file }}"
-        mode: "{{ file_perm }}"
+        path: "{{ hostvars['localhost']['omnia_config_file'] }}"
+        mode: "{{ hostvars['localhost']['file_perm'] }}"
+      delegate_to: localhost
+      run_once: true
 
     - name: Check if tower_config_file file is encrypted
       command: cat "{{ playbook_dir }}/../../webui_awx/files/.tower_cli.cfg"
       changed_when: false
       no_log: true
       register: tower_config_content
+      delegate_to: localhost
       run_once: true
 
     - name: Decrypt tower_config_file
@@ -167,17 +223,21 @@
         --vault-password-file "{{ playbook_dir }}/../../webui_awx/files/.tower_vault_key"
       changed_when: false
       when: "'$ANSIBLE_VAULT;' in tower_config_content.stdout"
+      delegate_to: localhost
       run_once: true
 
-    - name: Change file permissions
+    - name: Change file permissions - tower_config_file
       file:
         path: "{{ playbook_dir }}/../../webui_awx/files/.tower_cli.cfg"
-        mode: "{{ file_perm }}"
+        mode: "{{ hostvars['localhost']['file_perm'] }}"
+      delegate_to: localhost
+      run_once: true
 
     - name: Fetch awx host
       command: grep "host:" "{{ playbook_dir }}/../../webui_awx/files/.tower_cli.cfg"
       register: fetch_awx_host
       changed_when: false
+      delegate_to: localhost
       run_once: true
 
     - name: Fetch awx username
@@ -185,14 +245,16 @@
       register: fetch_awx_username
       changed_when: false
       run_once: true
-      no_log: true
+      delegate_to: localhost
+      run_once: true
 
     - name: Fetch awx password
       command: grep "password:" "{{ playbook_dir }}/../../webui_awx/files/.tower_cli.cfg"
       register: fetch_awx_password
       changed_when: false
       run_once: true
-      no_log: true
+      delegate_to: localhost
+      run_once: true
 
     - name: Set awx variables
       set_fact:
@@ -208,15 +270,15 @@
       changed_when: false
       when: "'$ANSIBLE_VAULT;' in tower_config_content.stdout"
       run_once: true
+      delegate_to: localhost
+      run_once: true
+
+    - name: Change file permissions - tower_config_file
+      file:
+        path: "{{ playbook_dir }}/../../webui_awx/files/.tower_cli.cfg"
+        mode: "{{ hostvars['localhost']['file_perm'] }}"
+      delegate_to: localhost
+      run_once: true
 
-    - name: Update inventory file
-      block:
-        - name: Fetch facts and add new hosts
-          include_tasks: add_host.yml
-          with_items: "{{ groups['reachable'] }}"
-      when: "'reachable' in groups"
-
-    - name: Show unreachable hosts
-      debug:
-        msg: "{{ host_unreachable_msg }} + {{ groups['ungrouped'] }}"
-      when: "'ungrouped' in groups"
+    - name: Fetch facts and add new hosts
+      include_tasks: add_host.yml

+ 17 - 14
control_plane/roles/collect_node_info/tasks/main.yml

@@ -1,4 +1,4 @@
-# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -74,17 +74,20 @@
       when: "'$ANSIBLE_VAULT;' in config_content.stdout"
       run_once: true
 
-    - name: Add inventory playbook
-      block:
-        - name: add hosts with description to inventory file
-          command: >-
-            ansible-playbook -i {{ provisioned_hosts_file }}
-            {{ role_path }}/files/create_inventory.yml
-            --extra-vars "host_username={{ host_username }} host_password={{ provision_password }} mapping_file={{ mapping_file | bool }}"
-          no_log: True
-          register: register_error
-      rescue:
-        - name: Fail if host addition was not successful
-          fail:
-            msg: "{{ register_error.stderr + register_error.stdout | regex_replace(host_username) | regex_replace(provision_password) }}"
+    - name: Change file permissions
+      file:
+        path: "{{ login_vars_file }}"
+        mode: "{{ file_perm }}"
+
+    - name: Check the provisioned_hosts_file output
+      command: cat {{ provisioned_hosts_file }}
+      changed_when: false
+      register: os_hosts
+      
+    - name: Create device_inventory
+      add_host:
+        name: "{{ item }}"
+        groups: "node_inventory"
+      with_items: "{{ os_hosts.stdout_lines }}"
+      when: item | trim | length > 1
   when: provisioned_file.stat.exists

+ 8 - 11
control_plane/roles/control_plane_common/tasks/pre_requisite.yml

@@ -86,17 +86,14 @@
       register: sestatus_current
       changed_when: false
 
-    - name: Disable SElinux
-      replace:
-        path: "{{ selinux_config_path }}"
-        regexp: 'SELINUX=[a-z]+'
-        replace: 'SELINUX=disabled'
-      when: '"SELinux status:                 enabled" in sestatus_current.stdout_lines'
+    - name: Check SELinux status
+      debug:
+        msg: "{{ selinux_warning }}"
+      when: '"SELinux status:                 disabled" in sestatus_current.stdout_lines'
 
-    - name: Status of SElinux
-      fail:
-        msg: "{{ selinux_status }}"
+    - name: Set SElinux to permissive mode
+      command: setenforce 0
       when: '"SELinux status:                 enabled" in sestatus_current.stdout_lines'
-      register: selinux_value
+
   tags: init
-  when: os_supported_leap not in mgmt_os
+  when: os_supported_leap not in mgmt_os

+ 1 - 2
control_plane/roles/control_plane_common/vars/main.yml

@@ -61,13 +61,12 @@ os_supported_rocky_version: "8.4"
 fail_os_status: "Unsupported OS or OS version. OS should be {{ os_supported_centos }} {{ os_supported_centos_version }} or {{ os_supported_rocky }} {{ os_supported_rocky_version }} or {{ os_supported_leap }} {{ os_supported_leap_version }}"
 success_os_status: "Management Station OS validated"
 internet_status: "Failed. No Internet connection. Make sure network is up."
-selinux_status: "SElinux is not disabled. Disable it in /etc/sysconfig/selinux and reboot the system"
+selinux_warning: "Warning! SELinux status is disabled by user. No SELinux policy is loaded"
 ansible_python_version_status: "For {{ mgmt_os }} {{ ansible_distribution_version }}, python bindings of firewalld, dnf, selinux are not available if python is installed from source and not from dnf or zypper. So please make sure python3.6 is installed using dnf or zypper. And ansible uses the python version 3.6 installed using dnf or zypper"
 python_version_support: '3.6.8'
 default_ansible_config_file_path: /etc/ansible/ansible.cfg
 invalid_run_tag_msg: "Failed. init tag should be used with run tags"
 invalid_skip_tag_msg: "Failed. init tag can't be used with skip tags"
-selinux_config_path: /etc/sysconfig/selinux
 
 # Usage: verify_login_inputs.yml
 login_vars_filename: "input_params/login_vars.yml"

+ 2 - 0
control_plane/roles/control_plane_customiso/files/temp_leap15.xml

@@ -57,6 +57,8 @@
       <package>openssh</package>
       <package>firewalld</package>
       <package>chrony</package>
+      <package>dmidecode</package>
+      <package>vim</package>
     </packages>
   </software>
   <ssh_import t="map">

+ 9 - 2
control_plane/roles/control_plane_ib/files/infiniband_container_configure.yml

@@ -19,5 +19,12 @@
   gather_facts: false
   tasks:
   - name: Start dhcpd services
-    command: dhcpd {{ ib_nic }}
-    changed_when: false
+    service:
+      name: dhcpd
+      state: started
+      enabled: yes
+
+  - name: Run opensm
+    shell: nohup /usr/sbin/opensm -F /etc/rdma/opensm.conf &
+    changed_when: true
+    failed_when: false

+ 2 - 2
control_plane/roles/control_plane_ib/files/k8s_infiniband.yml

@@ -35,8 +35,8 @@ spec:
         - name: infiniband-container
           image: 'localhost/infiniband-container:latest'
           imagePullPolicy: Never
-          command: [ "/start.sh" ]
-          args: [ "/sbin/init" ]
+          command:
+            - /sbin/init
           volumeMounts:
             - name: omnia-storage
               mountPath: /root/omnia

+ 0 - 2
control_plane/roles/control_plane_ib/files/start.sh

@@ -1,5 +1,3 @@
 #!/bin/bash
 
-/usr/libexec/rdma-init-kernel
-
 exec /usr/sbin/opensm -F /etc/rdma/opensm.conf

+ 1 - 4
control_plane/roles/control_plane_ib/tasks/check_prerequisites.yml

@@ -17,9 +17,6 @@
   set_fact:
     infiniband_container_status: false
     infiniband_container_image_status: false
-    infiniband_container_config_status: false
-    infiniband_backup_map_status: false
-    infiniband_new_node_status: false
 
 - name: Inspect the infiniband_container image
   command: "buildah images"
@@ -72,4 +69,4 @@
 - name: Update infiniband_container container status
   set_fact:
     infiniband_container_status: true
-  when: "'infiniband-container' in infiniband_container_result.stdout"
+  when: "'infiniband-container' in infiniband_container_result.stdout"

+ 2 - 13
control_plane/roles/control_plane_ib/tasks/configure_infiniband_container.yml

@@ -1,4 +1,4 @@
-# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,17 +13,6 @@
 # limitations under the License.
 ---
 
-- name: Check infiniband pod status
-  command: kubectl get pods -n network-config
-  changed_when: false
-  register: infiniband_pod_status
-  failed_when: false
-
-- name: Deploy infiniband pod
-  command: "kubectl apply -f {{ role_path }}/files/k8s_infiniband.yml"
-  changed_when: true
-  when: infiniband_container_status and  (not infiniband_container_config_status)
-
 - name: Wait for infiniband pod to come to ready state
   command: kubectl wait --for=condition=ready -n network-config pod -l app=infiniband
   changed_when: false
@@ -35,5 +24,5 @@
 
 - name: Configuring infiniband container
   command: 'kubectl exec --stdin --tty -n network-config {{ infiniband_pod_name.stdout }} \
-    -- ansible-playbook /root/omnia/control_plane/roles/control_plane_ib/files/infiniband_container_configure.yml -e ib_nic="{{ ib_network_nic }}"'
+    -- ansible-playbook /root/omnia/control_plane/roles/control_plane_ib/files/infiniband_container_configure.yml'
   changed_when: false

+ 9 - 1
control_plane/roles/control_plane_ib/tasks/infiniband_container_image.yml

@@ -18,7 +18,8 @@
   changed_when: true
   args:
     chdir: "{{ role_path }}/files/"
-
+  when: not infiniband_container_image_status
+  
 - name: Update image name in k8s_infiniband.yml
   replace:
     path: "{{ ib_kube_config_file }}"
@@ -55,6 +56,13 @@
     regexp: "        - name: opensm-logs\n          hostPath:\n            path:.*"
     replace: "        - name: opensm-logs\n          hostPath:\n            path: {{ subnet_manager.log_directory }} "
 
+- name: Check infiniband pod status
+  command: kubectl get pods -n network-config
+  changed_when: false
+  register: infiniband_pod_status
+  failed_when: false
+
 - name: Deploy infiniband pod
   command: "kubectl apply -f {{ ib_kube_config_file }}"
   changed_when: true
+  when: not infiniband_container_status

+ 1 - 2
control_plane/roles/control_plane_ib/tasks/main.yml

@@ -56,5 +56,4 @@
           when: not infiniband_container_status
   when:
     - device_support_status
-    - ib_switch_support
-    - mgmt_os in os_supported_rocky
+    - ib_switch_support

+ 1 - 2
control_plane/roles/control_plane_ib/vars/main.yml

@@ -25,5 +25,4 @@ mount_path: /root/omnia
 infiniband_message_skipped: "The container is already present"
 infiniband_message_installed: "The container is installed"
 ib_kube_config_file: "{{ role_path }}/files/k8s_infiniband.yml"
-ib_container_name: infiniband-container
-infiniband_message_installed: "The container is installed"
+ib_container_name: infiniband-container

+ 11 - 7
control_plane/roles/control_plane_k8s/tasks/k8s_init.yml

@@ -35,13 +35,17 @@
   failed_when: false
   register: k8s_pods
 
-- name: Docker login
-  command: docker login -u {{ docker_username }} -p {{ docker_password }}
-  changed_when: true
-  register: docker_login_output
-  failed_when: false
-  when: docker_username or docker_password
-  no_log: true
+- block:
+    - name: Docker login
+      command: docker login -u {{ docker_username }} -p {{ docker_password }}
+      changed_when: true
+      register: docker_login_output
+      when: docker_username or docker_password
+      no_log: true
+  rescue:
+    - name: Warning - docker login failed
+      debug:
+        msg: "Warning: {{ docker_login_output.stderr }}" 
 
 - name: Docker login check
   fail:

+ 0 - 4
control_plane/roles/control_plane_k8s/tasks/k8s_installation.yml

@@ -19,10 +19,6 @@
     fstype: swap
     state: absent
 
-- name: Disable selinux
-  selinux:
-    state: disabled
-
 - name: Copy k8s.conf file
   copy:
     src: k8s.conf

+ 21 - 16
control_plane/roles/control_plane_k8s/tasks/k8s_installation_leap.yml

@@ -54,6 +54,14 @@
   command: /sbin/sysctl --system
   changed_when: true
 
+- name: Add crio repo
+  zypper_repository:
+    repo: "{{ crio_repo_leap }}"
+    state: present
+    disable_gpg_check: yes
+    auto_import_keys: yes
+    autorefresh: yes
+
 - name: Installing cri-o
   package:
     name: cri-o
@@ -120,7 +128,7 @@
 
 - name: Install Kubeadm
   ansible.builtin.expect:
-    command: zypper install --oldpackage "{{ kubeadm_version }}"
+    command: zypper install --replacefiles --force --oldpackage "{{ kubeadm_version }}"
     responses:
         (.*) [1/2/c/d/?](.): '2'
         (.*)(y): 'y'
@@ -130,7 +138,7 @@
 
 - name: Install Kubelet
   ansible.builtin.expect:
-    command: zypper install --oldpackage "{{ kubelet_version }}"
+    command: zypper install --replacefiles --force --oldpackage "{{ kubelet_version }}"
     responses:
         (.*) [1/2/c/d/?](.): '2'
         (.*)(y): 'y'
@@ -138,38 +146,35 @@
   until: kubelet_status is not failed
   retries: "{{ max_retries }}"
 
+- name: Fetch status of kubectl
+  command: kubectl version
+  failed_when: false
+  changed_when: false
+  register: kubectl_status
+
 - name: Install Kubectl
   zypper:
      name: "{{ kubectl_version }}"
      state: present
+     replacefiles: true
      oldpackage: yes
      force: yes
   register: kubectl_status
   until: kubectl_status is not failed
   retries: "{{ max_retries }}"
+  when: version_kubectl not in kubectl_status.stdout
 
 - name: Install common packages
   zypper:
     name: "{{ common_packages }}"
     state: present
 
-- name: Versionlocking kubeadm
-  command: zypper addlock "{{ kubeadm_version }}"
-  args:
-    warn: false
-  changed_when: false
-
-- name: Versionlocking kubectl
-  command: zypper addlock "{{ kubelet_version }}"
-  args:
-    warn: false
-  changed_when: false
-
-- name: Versionlocking kubelet
-  command: zypper addlock "{{ kubectl_version }}"
+- name: Versionlock k8s packages
+  command: zypper addlock {{ item }}
   args:
     warn: false
   changed_when: false
+  with_items: "{{ k8s_package_names }}"
 
 - name: Add docker community edition repository for docker-ce-cli
   get_url:

+ 7 - 2
control_plane/roles/control_plane_k8s/vars/main.yml

@@ -1,4 +1,4 @@
-#  Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -31,7 +31,12 @@ min_retries: 3
 max_retries: 10
 min_delay: 10
 wait_time: 30
- 
+crio_repo_leap: "https://download.opensuse.org/repositories/devel:kubic/15.3/devel:kubic.repo"
+k8s_package_names:
+  - kubeadm
+  - kubelet
+  - kubectl
+version_kubectl: "v1.21.0"
 # Usage: k8s_installation.yml
 common_packages:
   - openssl

+ 33 - 21
control_plane/roles/control_plane_monitoring/tasks/configure_k8s_prom_grafana.yml

@@ -23,28 +23,40 @@
   changed_when: false
   register: kube_prom_svc_port
 
-- name: Create prometheus datasource in grafana
-  community.grafana.grafana_datasource:
-    name: control-plane-prometheus
-    grafana_url: "http://{{ grafana_svc_ip.stdout }}:{{ grafana_svc_port.stdout }}"
-    grafana_user: "{{ grafana_username }}"
-    grafana_password: "{{ grafana_password }}"
-    ds_type: prometheus
-    ds_url: "http://{{ kube_prom_svc_ip.stdout }}:{{ kube_prom_svc_port.stdout }}"
-    access: proxy
-  no_log: true
+- block:
+    - name: Create prometheus datasource in grafana
+      community.grafana.grafana_datasource:
+        name: control-plane-prometheus
+        grafana_url: "http://{{ grafana_svc_ip.stdout }}:{{ grafana_svc_port.stdout }}"
+        grafana_user: "{{ grafana_username }}"
+        grafana_password: "{{ grafana_password }}"
+        ds_type: prometheus
+        ds_url: "http://{{ kube_prom_svc_ip.stdout }}:{{ kube_prom_svc_port.stdout }}"
+        access: proxy
+      no_log: true
+      register: create_prom_datasource
+  rescue:
+    - name: Create prometheus datasource in grafana failed
+      fail:
+        msg: "Error: {{ create_prom_datasource.msg }}"
 
-- name: Import K8s grafana dashboards
-  community.grafana.grafana_dashboard:
-    grafana_url: "http://{{ grafana_svc_ip.stdout }}:{{ grafana_svc_port.stdout }}"
-    grafana_user: "{{ grafana_username }}"
-    grafana_password: "{{ grafana_password }}"
-    state: present
-    commit_message: Updated by ansible
-    overwrite: yes
-    path: "{{ role_path }}/files/{{ item }}"
-  with_items: "{{ grafana_dashboard_json_files }}"
-  no_log: true
+- block:
+    - name: Import K8s grafana dashboards
+      community.grafana.grafana_dashboard:
+        grafana_url: "http://{{ grafana_svc_ip.stdout }}:{{ grafana_svc_port.stdout }}"
+        grafana_user: "{{ grafana_username }}"
+        grafana_password: "{{ grafana_password }}"
+        state: present
+        commit_message: Updated by ansible
+        overwrite: yes
+        path: "{{ role_path }}/files/{{ item }}"
+      with_items: "{{ grafana_dashboard_json_files }}"
+      no_log: true
+      register: import_prom_dashboards
+  rescue:
+    - name: Import K8s grafana dashboards failed
+      fail:
+        msg: "Error: {{ import_prom_dashboards.msg }}"
 
 - name: Save grafana svc ip
   replace:

+ 31 - 19
control_plane/roles/control_plane_monitoring/tasks/configure_loki_grafana.yml

@@ -27,23 +27,35 @@
   changed_when: false
   register: loki_svc_port
 
-- name: Create loki datasource in grafana
-  community.grafana.grafana_datasource:
-    name: control-plane-loki
-    grafana_url: "http://{{ grafana_svc_ip.stdout }}:{{ grafana_svc_port.stdout }}"
-    grafana_user: "{{ grafana_username }}"
-    grafana_password: "{{ grafana_password }}"
-    ds_type: loki
-    ds_url: "http://{{ loki_svc_ip.stdout }}:{{ loki_svc_port.stdout }}"
-  no_log: true
+- block:
+    - name: Create loki datasource in grafana
+      community.grafana.grafana_datasource:
+        name: control-plane-loki
+        grafana_url: "http://{{ grafana_svc_ip.stdout }}:{{ grafana_svc_port.stdout }}"
+        grafana_user: "{{ grafana_username }}"
+        grafana_password: "{{ grafana_password }}"
+        ds_type: loki
+        ds_url: "http://{{ loki_svc_ip.stdout }}:{{ loki_svc_port.stdout }}"
+      no_log: true
+      register: create_loki_datasource
+  rescue:
+    - name: Create loki datasource in grafana failed
+      fail:
+        msg: "Error: {{ create_loki_datasource.msg }}"
 
-- name: Import loki dashboard in grafana
-  community.grafana.grafana_dashboard:
-    grafana_url: "http://{{ grafana_svc_ip.stdout }}:{{ grafana_svc_port.stdout }}"
-    grafana_user: "{{ grafana_username }}"
-    grafana_password: "{{ grafana_password }}"
-    state: present
-    commit_message: Updated by ansible
-    overwrite: yes
-    path: "{{ role_path }}/files/loki_dashboard.json"
-  no_log: true
+- block:
+    - name: Import loki dashboard in grafana
+      community.grafana.grafana_dashboard:
+        grafana_url: "http://{{ grafana_svc_ip.stdout }}:{{ grafana_svc_port.stdout }}"
+        grafana_user: "{{ grafana_username }}"
+        grafana_password: "{{ grafana_password }}"
+        state: present
+        commit_message: Updated by ansible
+        overwrite: yes
+        path: "{{ role_path }}/files/loki_dashboard.json"
+      no_log: true
+      register: import_loki_dashboard
+  rescue:
+    - name: Import loki datasource in grafana failed
+      fail:
+        msg: "Error: {{ import_loki_dashboard.msg }}"

+ 21 - 9
control_plane/roles/control_plane_security/tasks/install_389ds.yml

@@ -198,11 +198,17 @@
         - name: Create admin principal failed
           fail:
             msg: "Error: {{ create_admin_principal.stderr }}"
-
-    - name: Authenticate as admin
-      shell: set -o pipefail && echo {{ ms_kerberos_admin_password }} | kinit admin
-      no_log: true
-      changed_when: false
+    
+    - block:
+        - name: Authenticate as admin
+          shell: set -o pipefail && echo {{ ms_kerberos_admin_password }} | kinit admin
+          no_log: true
+          changed_when: false
+          register: authenticate_admin
+      rescue:
+        - name: Authenticate as admin failed
+          fail:
+            msg: "Error: {{ authenticate_admin.stderr }}"
 
     - name: Install sssd packages
       zypper:
@@ -244,8 +250,14 @@
         state: started
         enabled: yes
 
-    - name: Configure password policy in 389-ds
-      command: dsconf -w {{ ms_directory_manager_password }} -D "cn=Directory Manager" ldap://{{ server_hostname_ms }} pwpolicy set --pwdlockoutduration {{ lockout_duration }} --pwdmaxfailures {{ max_failures }} --pwdresetfailcount {{ failure_reset_interval }}
-      no_log: true
-      changed_when: true
+    - block:
+        - name: Configure password policy in 389-ds
+          command: dsconf -w {{ ms_directory_manager_password }} -D "cn=Directory Manager" ldap://{{ server_hostname_ms }} pwpolicy set --pwdlockoutduration {{ lockout_duration }} --pwdmaxfailures {{ max_failures }} --pwdresetfailcount {{ failure_reset_interval }}
+          no_log: true
+          changed_when: true
+          register: configure_pwpolicy
+      rescue:
+        - name: Configure password policy in 389-ds failed
+          fail:
+            msg: "Error: {{ configure_pwpolicy.stderr }}"
   when: not ds389_status

+ 11 - 5
control_plane/roles/control_plane_security/tasks/install_ipa_server.yml

@@ -26,11 +26,17 @@
       fail:
         msg: "Error: {{ install_ipa_server.stderr_lines }}"
 
-- name: Authenticate as admin
-  shell: set -o pipefail && echo $'{{ ms_kerberos_admin_password }}' | kinit {{ ms_ipa_admin_username }}
-  no_log: true
-  changed_when: false
-
+- block:
+    - name: Authenticate as admin
+      shell: set -o pipefail && echo $'{{ ms_kerberos_admin_password }}' | kinit {{ ms_ipa_admin_username }}
+      no_log: true
+      changed_when: false
+      register: authenticate_admin
+  rescue:
+    - name: Authenticate as admin failed
+      fail:
+        msg: "Error: {{ authenticate_admin.stderr }}"
+  
 - name: Replace the /etc/resolv.conf file
   copy:
     src: "{{ temp_resolv_conf_path }}"

+ 4 - 2
control_plane/roles/network_ib/tasks/global_config.yml

@@ -1,4 +1,4 @@
-# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -32,4 +32,6 @@
 - name: Status check for - "{{ inventory_hostname }}" - "{{ item }}"
   fail:
     msg: "{{ global_conf.json }}"
-  when: global_conf.json.status != "OK"
+  when: 
+    - ( global_conf.json.status is defined and global_conf.json.status != "OK" ) or 
+      ( global_conf.json.results is defined and global_conf.json.results[0].status != "OK" )

+ 2 - 0
control_plane/roles/provision_cobbler/files/temp_leap15.xml

@@ -59,6 +59,8 @@
       <package>openssh</package>
       <package>firewalld</package>
       <package>chrony</package>
+      <package>dmidecode</package>
+      <package>vim</package>
     </packages>
   </software>
   <ssh_import t="map">

+ 8 - 8
control_plane/roles/provision_idrac/tasks/check_prerequisites.yml

@@ -1,4 +1,4 @@
-# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -174,9 +174,9 @@
             idrac_license_name: "{{ idrac_info.system_info.License[my_idx1].LicenseDescription }}"
           with_items: "{{ idrac_info.system_info.License }}"
           when:
-            - '"iDRAC" in idrac_info.system_info.License[my_idx1].LicenseDescription'
-            - '"Enterprise" in idrac_info.system_info.License[my_idx1].LicenseDescription'
-            - '"License" in idrac_info.system_info.License[my_idx1].LicenseDescription'
+            - '"idrac" in idrac_info.system_info.License[my_idx1].LicenseDescription | lower'
+            - '"enterprise" in idrac_info.system_info.License[my_idx1].LicenseDescription | lower'
+            - '"license" in idrac_info.system_info.License[my_idx1].LicenseDescription | lower'
             - '"Healthy" in idrac_info.system_info.License[my_idx1].PrimaryStatus'
           loop_control:
             index_var: my_idx1
@@ -184,12 +184,12 @@
         - name: Set datacenter license status
           set_fact:
             datacenter_license: true
-            idrac_license_name: "{{ idrac_info.system_info.License[my_idx1].LicenseDescription }}"
+            idrac_license_name: "{{ idrac_info.system_info.License[my_idx2].LicenseDescription }}"
           with_items: "{{ idrac_info.system_info.License }}"
           when:
-            - '"iDRAC" in idrac_info.system_info.License[my_idx2].LicenseDescription'
-            - '"Datacenter" in idrac_info.system_info.License[my_idx2].LicenseDescription'
-            - '"License" in idrac_info.system_info.License[my_idx2].LicenseDescription'
+            - '"idrac" in idrac_info.system_info.License[my_idx2].LicenseDescription | lower'
+            - '"data" in idrac_info.system_info.License[my_idx2].LicenseDescription | lower'
+            - '"license" in idrac_info.system_info.License[my_idx2].LicenseDescription | lower'
             - '"Healthy" in idrac_info.system_info.License[my_idx2].PrimaryStatus'
           loop_control:
             index_var: my_idx2

+ 2 - 0
control_plane/roles/webui_awx/files/requirements.yml

@@ -10,3 +10,5 @@ collections:
     version: 2.2.3
   - name: community.grafana
     version: 1.3.0
+  - name: ansible.utils
+    version: 2.5.2

+ 3 - 1
control_plane/roles/webui_awx/tasks/awx_configuration.yml

@@ -1,4 +1,4 @@
-# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 ---
+
 - name: Check if {{ tower_config_file }} file is encrypted
   command: cat {{ tower_config_file }}
   changed_when: false
@@ -143,6 +144,7 @@
     project: "{{ project_name }}"
     playbook: "{{ item.playbook }}"
     ask_skip_tags_on_launch: true
+    extra_vars: "{{ item.extra_vars }}"
     credentials:
       - "{{ item.credential }}"
     state: present

+ 3 - 3
control_plane/roles/webui_awx/vars/main.yml

@@ -99,7 +99,7 @@ job_template_details:
   - { name: infiniband_template, inventory: infiniband_inventory, playbook: control_plane/infiniband.yml, credential: infiniband_credential, flag: "{{ ib_switch_support }}" }
   - { name: powervault_me4_template, inventory: powervault_me4_inventory, playbook: control_plane/powervault_me4.yml, credential: powervault_me4_credential, flag: "{{ powervault_support }}" }
 omnia_job_template_details:
-  - { name: deploy_omnia_template, inventory: node_inventory, playbook: omnia.yml, credential: node_credential }
+  - { name: deploy_omnia_template, inventory: node_inventory, playbook: omnia.yml, credential: node_credential, extra_vars: "ansible_python_interpreter=/usr/bin/python3" }
 scheduled_templates:
-  - { name: NodeInventorySchedule, template: node_inventory_job, schedule_rule: "DTSTART:20210815T120000Z RRULE:FREQ=MINUTELY;INTERVAL=10", flag: true }
-  - { name: DeviceInventorySchedule, template: device_inventory_job, schedule_rule: "DTSTART:20210815T060000Z RRULE:FREQ=DAILY;INTERVAL=1", flag: "{{ device_support_status }}"}
+  - { name: NodeInventorySchedule, template: node_inventory_job, schedule_rule: "DTSTART:20210815T120000Z RRULE:FREQ=HOURLY;INTERVAL=1", flag: true }
+  - { name: DeviceInventorySchedule, template: device_inventory_job, schedule_rule: "DTSTART:20210815T060000Z RRULE:FREQ=DAILY;INTERVAL=1", flag: "{{ device_support_status }}"}

+ 64 - 0
control_plane/test/test_awx.yml

@@ -0,0 +1,64 @@
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+# Testcase to verify count of AWX pods
+- name: OMNIA_1.2_AWX_TC_001
+  hosts: localhost
+  vars_files:
+    - vars/test_awx_vars.yml
+  tasks:
+    - name: Get AWX pods
+      shell: "kubectl get pods -n awx"
+      register: awx_pods
+
+    - name: Set the count of fetched pods
+      set_fact:
+         count: "{{ awx_pods.stdout_lines|length - 1 }}"
+
+    - name: Validate count of AWX pods with defined count
+      assert:   
+        that:
+          - "{{ awx_pod_count }} == {{ count }}" 
+        success_msg: "Pod count validated"
+        fail_msg: "Some pods missing"
+  tags: AWX_TC_001
+
+# Testcase to verify AWX version
+- name: OMNIA_1.2_AWX_TC_002
+  hosts: localhost
+  vars_files:
+    - vars/test_awx_vars.yml
+  tasks:
+    - name: Get AWX image info
+      shell: "buildah images | grep 'awx'"
+      register: awx_image_info
+
+    - name: Get awx image details
+      set_fact:
+        awx_images: "{{ item }}"
+      with_items: 
+        - "{{ awx_image_info.stdout_lines }}"
+      run_once: true
+      ignore_errors: true
+      when: item | regex_search(awx_latest_version)
+
+    - name: Get version for awx
+      assert:
+        that:
+          - awx_image_info.stdout_lines[{{ item }}] | regex_search( "{{ awx_latest_version }}")
+        success_msg: "Version check successful"
+        fail_msg: "Version check failed"
+      ignore_errors: yes
+      with_sequence: start=0 end={{ awx_image_info.stdout_lines |length - 1 }}
+  tags: AWX_TC_002

+ 28 - 0
control_plane/test/test_os_check.yml

@@ -0,0 +1,28 @@
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+# Testcase to validate OS provisioned against the user given OS
+- name: OS Validation
+  hosts: localhost
+  vars_files:
+    - test_vars/test_idrac_vars.yml
+    - ../input_params/base_vars.yml
+  tasks:
+    - name: Copy all provisioned hosts to test_os_inventory
+      copy:
+        src: "{{ provisioned_hosts_path }}"
+        dest: "{{ test_os_inventory_path }}"
+        remote_src: yes
+    - name: Execute OS validation script
+      command: ansible-playbook test_os_validation.yml -i "{{ test_os_inventory_path }}"

+ 1 - 0
control_plane/test/test_os_inventory

@@ -0,0 +1 @@
+xx.xx.xx.xx

+ 27 - 0
control_plane/test/test_os_validation.yml

@@ -0,0 +1,27 @@
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+# Testcase to validate OS name after provisioning
+- name: OS Verification
+  hosts: all
+  vars_files:
+    - test_vars/test_idrac_vars.yml
+    - ../input_params/base_vars.yml
+  tasks:
+    - debug:
+        msg: "{{ os_validation_success_msg }}"
+      when: provision_os in ansible_distribution | lower
+    - debug:
+        msg: "{{ os_validation_fail_msg }}"
+      when: not provision_os in ansible_distribution | lower

+ 2 - 0
control_plane/test/test_security_inventory

@@ -0,0 +1,2 @@
+[manager]
+xx.xx.xx.xx

+ 284 - 0
control_plane/test/test_security_ipa_task_validation.yml

@@ -0,0 +1,284 @@
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+# Testcase to verify ipa installation in manager node
+- name: OMNIA_1.2_SEC2_TC_013
+  hosts: manager
+  vars_files:
+    - test_vars/test_security_ipa_vars.yml
+  tasks:
+    - name: Gather IPA version
+      command: ipa --version
+      register: ipa_version_op
+      tags: SECURITY_TC_001
+
+    - name: Validate the ipa version
+      assert:
+        that:
+          - "'VERSION' in ipa_version_op.stdout"
+        fail_msg: "{{ ipa_install_verification_fail_msg }}"
+        success_msg: "{{ ipa_install_verification_success_msg }}"
+      tags: SECURITY_TC_001
+
+# OMNIA_1.2_SEC2_TC_014
+# Testcase to verify user creation in Free IPA
+- name: OMNIA_1.2_SEC2_TC_014
+  hosts: manager
+  vars_files:
+    - test_vars/test_security_ipa_vars.yml
+  tasks:
+    - name: Generate random user
+      command: echo "{{ user_var + random_number }}"
+      register: user_name
+      tags: SECURITY_TC_002
+
+    - name: Create a random user for testing in free IPA
+      command: ipa user-add {{ user_name.stdout }}  --first='userFirst' --last='userLast' --random
+      register: create_user_op
+      tags: SECURITY_TC_002
+
+    - name: Get the list of users created in free IPA
+      command: ipa user-find
+      register: user_find_op
+      tags: SECURITY_TC_002
+ 
+    - name: Verify the presence of created user in free IPA
+      assert:
+        that:
+          - "'{{ user_name.stdout }}' in user_find_op.stdout"
+        fail_msg: "{{ user_creation_fail_msg }}"
+        success_msg: "{{ user_creation_success_msg }}"
+      tags: SECURITY_TC_002
+
+# Test case to verify group creation in Free IPA
+- name: OMNIA_1.2_SEC2_TC_015
+  hosts: manager
+  vars_files:
+    - test_vars/test_security_ipa_vars.yml
+  tasks:
+    - name: Generate random group
+      command: echo "{{ group_var + random_number }}"
+      register: group_name
+      tags: SECURITY_TC_003
+
+    - name: Create a random group for testing in free IPA
+      command: ipa group-add {{ group_name.stdout }}
+      register: create_group_op
+      tags: SECURITY_TC_003
+
+    - name: Get the list of groups created in free IPA
+      command: ipa group-find
+      register: group_find_op
+      tags: SECURITY_TC_003
+  
+    - name: Verify the presence of created group in free IPA
+      assert:
+        that:
+          - "'{{ group_name.stdout }}' in group_find_op.stdout"
+        fail_msg: "{{ group_creation_fail_msg }}"
+        success_msg: "{{ group_creation_success_msg }}"
+      tags: SECURITY_TC_003
+
+# Testcase to verify whether user is added to group
+- name: OMNIA_1.2_SEC2_TC_016
+  hosts: manager
+  vars_files:
+    - test_vars/test_security_ipa_vars.yml
+  tasks:
+    - name: Generate random user
+      command: echo "{{ user_var + random_number }}"
+      register: user_name
+      tags: SECURITY_TC_004
+
+    - name: Create user to add into group in free IPA
+      command: ipa user-add {{ user_name.stdout }}  --first='user' --last='Random' --random
+      register: create_user_op
+      tags: SECURITY_TC_004
+
+    - name: Get the list of users created in free IPA
+      command: ipa user-find
+      register: user_find_op
+      tags: SECURITY_TC_004
+
+    - name: Verify the presence of created user in free IPA
+      assert:
+        that:
+          - "'{{ user_name.stdout }}' in user_find_op.stdout"
+        fail_msg: "{{ user_creation_fail_msg }}"
+        success_msg: "{{ user_creation_success_msg }}"
+      tags: SECURITY_TC_004
+    
+    - name: Generate random group
+      command: echo "{{ group_var + random_number }}"
+      register: group_name
+      tags: SECURITY_TC_004
+
+    - name: Create a group to add users in free IPA
+      command: ipa group-add {{ group_name.stdout }}
+      register: create_group_op
+      tags: SECURITY_TC_004
+
+    - name: Get the list of groups created in free IPA
+      command: ipa group-find
+      register: group_find_op
+      tags: SECURITY_TC_004
+
+    - name: Verify the presence of created group in free IPA
+      assert:
+        that:
+          - "'{{ group_name.stdout }}' in group_find_op.stdout"
+        fail_msg: "{{ group_creation_fail_msg }}"
+        success_msg: "{{ group_creation_success_msg }}"
+      tags: SECURITY_TC_004
+
+    - name: Add created user to created group in free IPA
+      command: ipa group-add-member {{ group_name.stdout }} --users={{ user_name.stdout}}
+      register: group_add_member_op
+      tags: SECURITY_TC_004
+ 
+    - name: Get details of group in which user is added in free IPA
+      command: ipa group-show {{ group_name.stdout }}
+      register: group_show_op
+      tags: SECURITY_TC_004
+
+    - name: Verify user presence in the group from free IPA
+      assert:
+        that:
+          - "'{{ user_name.stdout }}' in group_show_op.stdout"
+        fail_msg: "{{ add_user_to_group_fail_msg }}"
+        success_msg: "{{ add_user_to_group_success_msg }}" 
+      tags: SECURITY_TC_004
+
+# Testcase to verify user deletion in Free IPA
+- name: OMNIA_1.2_SEC2_TC_017
+  hosts: manager
+  vars_files:
+    - test_vars/test_security_ipa_vars.yml  
+  tasks:
+    - name: Generate random user
+      command: echo "{{ user_var + random_number }}"
+      register: user_name
+      tags: SECURITY_TC_005
+
+    - name: Create a user to perform deletion
+      command: ipa user-add {{ user_name.stdout }}  --first='userFirst' --last='userLast' --random
+      register: create_user_op
+      tags: SECURITY_TC_005
+
+    - name: Get the list of users created in free IPA
+      command: ipa user-find
+      register: user_find_op
+      tags: SECURITY_TC_005
+
+    - name: Verify the presence of created user in free IPA
+      assert:
+        that:
+          - "'{{ user_name.stdout }}' in user_find_op.stdout"
+        fail_msg: "{{ user_creation_fail_msg }}"
+        success_msg: "{{ user_creation_success_msg }}"
+      tags: SECURITY_TC_005
+   
+    - name: Delete created user in free IPA
+      command: ipa user-del  {{ user_name.stdout }}
+      register: user_del_op
+      tags: SECURITY_TC_005
+ 
+    - name: Select all the remaining users from free IPA
+      command: ipa user-find
+      register: user_find_op_after_del 
+      tags: SECURITY_TC_005
+
+    - name: Verify the absence of deleted user in free IPA
+      assert:
+        that:
+          - "'{{ user_name.stdout }}' not in user_find_op_after_del.stdout"		
+        fail_msg: "{{ user_deletion_fail_msg }}"
+        success_msg: "{{ user_deletion_success_msg }}"
+      tags: SECURITY_TC_005
+
+# Testcase to verify group deletion in Free IPA
+- name: OMNIA_1.2_SEC2_TC_018
+  hosts: manager
+  vars_files:
+    - test_vars/test_security_ipa_vars.yml
+  tasks:
+    - name: Generate random group
+      command: echo "{{ group_var + random_number }}"
+      register: group_name
+      tags: SECURITY_TC_006
+
+    - name: Create a group to perform deletion in free IPA
+      command: ipa group-add {{ group_name.stdout }}
+      register: create_group_op
+      tags: SECURITY_TC_006
+
+    - name: Get the list of groups created in free IPA
+      command: ipa group-find
+      register: group_find_op
+      tags: SECURITY_TC_006
+
+    - name: Verify the presence of created group in free IPA
+      assert:
+        that:
+          - "'{{ group_name.stdout }}' in group_find_op.stdout"
+        fail_msg: "{{ group_creation_fail_msg }}"
+        success_msg: "{{ group_creation_success_msg }}"
+      tags: SECURITY_TC_006
+
+    - name: Delete created group in free IPA
+      command: ipa group-del {{ group_name.stdout }}
+      register: group_del_op
+      tags: SECURITY_TC_006
+  
+    - name: Select all the remaining groups from free IPA
+      command: ipa group-find
+      register: group_find_op_after_del
+      tags: SECURITY_TC_006
+
+    - name: Verify the absence of deleted group in free IPA
+      assert:
+        that:
+          - "'{{ group_name.stdout }}' not in group_find_op_after_del.stdout"		
+        fail_msg: "{{ group_deletion_fail_msg }}"
+        success_msg: "{{ group_deletion_success_msg }}"
+      tags: SECURITY_TC_006
+
+# Testcase to verify role creation in Free IPA
+- name: OMNIA_1.2_SEC2_TC_019
+  hosts: manager
+  vars_files:
+    - test_vars/test_security_ipa_vars.yml
+  tasks:
+    - name: Generate random role
+      command: echo "{{ role_var + random_number }}"
+      register: role_name
+      tags: SECURITY_TC_007
+
+    - name: Create a random role for testing in free IPA
+      command: ipa role-add {{ role_name.stdout }} --desc='User Defined Role'
+      register: create_role_op
+      tags: SECURITY_TC_007
+
+    - name: Get the list of roles created in free IPA
+      command: ipa role-find
+      register: role_find_op
+      tags: SECURITY_TC_007
+  
+    - name: Verify the presence of created role in free IPA
+      assert:
+        that:
+          - "'{{ role_name.stdout }}' in role_find_op.stdout"
+        fail_msg: "{{ role_creation_fail_msg }}"
+        success_msg: "{{ role_creation_success_msg }}"
+      tags: SECURITY_TC_007

+ 20 - 0
control_plane/test/test_vars/test_awx_vars.yml

@@ -0,0 +1,20 @@
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+awx_pod_count: "3"
+awx_latest_version: "19.4.0"
+awx_version_check_success_msg: "Version check successful"
+awx_version_check_fail_msg: "Version check failed"
+awx_pod_count_success_msg: "Pod count validated"
+awx_pod_count_fail_msg: "Some pods missing"

+ 4 - 0
control_plane/test/test_vars/test_idrac_vars.yml

@@ -73,3 +73,7 @@ provisioned_ip_fail_msg: "IP is not added to provisioned_idrac_inventory"
 failed_msg: "Failed. Please check input parameters and try again!"
 firmware_fail_msg: "Firmware update is failed"
 firmware_success_msg: "Firmware updated is success"
+provisioned_hosts_path: "/root/omnia/control_plane/roles/collect_node_info/files/provisioned_hosts.yml"
+test_os_inventory_path: "/root/omnia/control_plane/test/test_os_inventory"
+os_validation_success_msg: "OS validation is successful"
+os_validation_fail_msg: "OS validation failed"

+ 32 - 0
control_plane/test/test_vars/test_security_ipa_vars.yml

@@ -0,0 +1,32 @@
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+ipa_install_verification_success_msg: "IPA installation successful"
+ipa_install_verification_fail_msg: "IPA is not installed"
+user_var: "user"
+random_number: "{{ 10000 | random }}"
+user_creation_success_msg: "User creation successful"
+user_creation_fail_msg: "User creation failed"
+group_var: "group"
+group_creation_success_msg: "Group creation successful"
+group_creation_fail_msg: "Group creation failed"
+role_var: "role"
+role_creation_success_msg: "Role creation successful"
+role_creation_fail_msg: "Role creation failed"
+add_user_to_group_success_msg: "User successfully added to group"
+add_user_to_group_fail_msg: "User add to group failed"
+user_deletion_success_msg: "User deletion successful"
+user_deletion_fail_msg: "User deletion failed"
+group_deletion_success_msg: "Group deletion successful"
+group_deletion_fail_msg: "Group deletion failed"

+ 20 - 0
control_plane/tools/control_plane_cleanup.yml

@@ -0,0 +1,20 @@
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Cleanup control_plane
+  hosts: localhost
+  connection: local
+  roles:
+    - control_plane_cleanup

+ 13 - 2
control_plane/tools/roles/cluster_preperation/tasks/main.yml

@@ -1,4 +1,4 @@
-#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -22,6 +22,17 @@
     regexp: '#   StrictHostKeyChecking ask'
     replace: 'StrictHostKeyChecking no'
 
+- name: Disable strict mode checking
+  replace:
+    path: /etc/ssh/ssh_config
+    regexp: '^StrictModes\ '
+    replace: 'StrictModes no'
+
+- name: Restart sshd
+  service:
+    name: sshd
+    state: restarted
+
 - name: Install sshpass
   package:
     name: sshpass
@@ -33,4 +44,4 @@
       include_tasks: passwordless_ssh.yml
       with_items: "{{ ssh_to }}"
       loop_control:
-        pause: 5
+        pause: 5

+ 21 - 5
control_plane/tools/roles/cluster_preperation/tasks/passwordless_ssh.yml

@@ -1,4 +1,4 @@
-#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -25,7 +25,7 @@
   when: "'manager' in group_names"
 
 - name: Verify whether passwordless ssh is set on the remote host
-  shell: sshpass ssh -o "PasswordAuthentication=no" root@{{ current_host }} 'hostname'
+  command: sshpass ssh -o "PasswordAuthentication=no" root@{{ current_host }} 'hostname'
   register: ssh_output
   async: 30
   poll: 5
@@ -45,12 +45,28 @@
   register: verify_rsa_id_file
   when: not ssh_status
 
-- name: Generate ssh key pair
-  command: ssh-keygen -t rsa -b 4096 -f "{{ rsa_id_file }}" -q -N "{{ passphrase }}"
+- name: Create rsa_id file if it doesn't exist
+  ansible.builtin.file:
+    path: "{{ rsa_id_file }}"
+    state: touch
+    mode: "{{ ssh_file_mode }}"
   when:
     - not ssh_status
     - not verify_rsa_id_file.stat.exists
 
+- name: Generate ssh key pair
+  shell: ssh-keygen -t rsa -b 4096 -f "{{ rsa_id_file }}" -q -N "{{ passphrase }}" <<<y >/dev/null 2>&1
+  when:
+    - not ssh_status
+
+- name: Creating ssh config file with IdentityFile value
+  copy:
+    dest: "{{ config_file }}"
+    content: |
+      Host *
+          IdentityFile "{{ rsa_id_file }}"
+    mode: "{{ ssh_file_mode }}"
+
 - name: Add the key identity
   shell: |
     eval `ssh-agent -s`
@@ -85,4 +101,4 @@
   rescue:
     - name: Passwordless ssh failed
       fail:
-        msg: "{{ register_error.stderr | regex_replace(hostvars['127.0.0.1']['cobbler_password']) | regex_replace(auth_key_path) }}"
+        msg: "{{ register_error.stderr | regex_replace(hostvars['127.0.0.1']['cobbler_password']) | regex_replace(auth_key_path) }}"

+ 4 - 2
control_plane/tools/roles/cluster_preperation/vars/main.yml

@@ -1,4 +1,4 @@
-#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -16,4 +16,6 @@
 #Usage: passwordless_ssh.yml
 rsa_id_file: "/root/.ssh/id_rsa"
 passphrase: ""
-auth_key_path: "/root/.ssh/authorized_keys"
+auth_key_path: "/root/.ssh/authorized_keys"
+config_file: "/root/.ssh/config"
+ssh_file_mode: "0600"

+ 52 - 0
control_plane/tools/roles/control_plane_cleanup/tasks/decrypt_vault_files.yml

@@ -0,0 +1,52 @@
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Check if {{ login_vars_file }} file is encrypted
+  command: cat {{ login_vars_file }}
+  changed_when: false
+  no_log: true
+  register: config_content
+
+- name: Decrypt {{ login_vars_file }}
+  command: >-
+    ansible-vault decrypt {{ login_vars_file }}
+    --vault-password-file {{ login_vault_file }}
+  when: "'$ANSIBLE_VAULT;' in config_content.stdout"
+  changed_when: false
+
+- name: Check idrac_tools_vars.yml file is encrypted
+  command: cat "{{ idrac_tools_vars_filename }}"
+  changed_when: false
+  no_log: true
+  register: config_content
+
+- name: Decrypt idrac_tools_vars.yml
+  command: >-
+    ansible-vault decrypt "{{ idrac_tools_vars_filename }}"
+    --vault-password-file "{{ idrac_tools_vaultname }}"
+  when: "'$ANSIBLE_VAULT;' in config_content.stdout"
+  changed_when: false
+
+- name: Check if omnia config file is encrypted
+  command: cat {{ config_filename }}
+  changed_when: false
+  register: config_content
+  no_log: True
+
+- name: Decrypt omnia_config.yml
+  command: >-
+    ansible-vault decrypt {{ config_filename }} --vault-password-file {{ config_vaultname }}
+  when: "'$ANSIBLE_VAULT;' in config_content.stdout"
+  changed_when: false

+ 40 - 0
control_plane/tools/roles/control_plane_cleanup/tasks/delete_files_vault_keys.yml

@@ -0,0 +1,40 @@
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Unmount /mnt/leap
+  command: "umount /mnt/leap"
+  changed_when: true
+  failed_when: false
+
+- name: Unmount /mnt/rocky
+  command: "umount /mnt/rocky"
+  changed_when: true
+  failed_when: false
+
+- name: Delete folders and files
+  file:
+    path: "{{ item }}"
+    state: absent
+  failed_when: false
+  with_items:
+    - "{{ del_files }}"
+
+- name: Delete all vault keys
+  file:
+    path: "{{ item }}"
+    state: absent
+  failed_when: false
+  with_items:
+    - "{{ vault_keys }}"

+ 26 - 0
control_plane/tools/roles/control_plane_cleanup/tasks/main.yml

@@ -0,0 +1,26 @@
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Remove containers and images
+  include_tasks: remove_containers_images.yml
+
+- name: Decrypt vault files
+  include_tasks: decrypt_vault_files.yml
+
+- name: Reset kubeadm cluster
+  include_tasks: reset_kubeadm_cluster.yml
+
+- name: Delete vault keys and files
+  include_tasks: delete_files_vault_keys.yml

+ 133 - 0
control_plane/tools/roles/control_plane_cleanup/tasks/remove_containers_images.yml

@@ -0,0 +1,133 @@
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Get K8s pods
+  command: "kubectl get pods --all-namespaces"
+  changed_when: false
+  register: k8s_pods
+
+- name: Get the image list
+  command: "buildah images"
+  changed_when: false
+  register: image_list
+
+- name: Get the infiniband pod name
+  command: 'kubectl get pod -n network-config -l app=infiniband -o jsonpath="{.items[0].metadata.name}"'
+  changed_when: false
+  failed_when: false
+  register: infiniband_pod_name
+
+- name: Delete infiniband container
+  command: "kubectl delete -f {{ k8s_infiniband }}"
+  when: "infiniband_pod_name.stdout in k8s_pods.stdout"
+  failed_when: false
+
+- name: Delete infiniband image
+  command: "buildah rmi -f {{ infiniband_container }}:latest"
+  when: infiniband_container in image_list.stdout
+
+- name: Check if awx.yml file exists
+  stat:
+     path: "{{ awx_file }}"
+  register: awx_exists
+
+- name: Delete awx.yml
+  command: "kubectl delete -f {{ awx_file }}"
+  when: awx_exists.stat.exists
+  failed_when: false
+
+- name: Getting pods
+  command: kubectl get pods -n awx
+  changed_when: false
+  register: awx_pods
+
+- name: UnDeploying awx-operator
+  command: make undeploy
+  args:
+    chdir: "{{ awx_operator }}"
+  when: awx_pods.stdout | regex_search('awx-operator-controller-manager-([A-Za-z0-9]{10})-([A-Za-z0-9]{5})')
+
+- name: Get the postgres volume claim name
+  command: "kubectl get pvc -n awx"
+  changed_when: false
+  register: postgresclaimname
+
+- name: Delete the postgres volume claim
+  command: "kubectl delete pvc {{ postgresclaimname.stdout }} -n awx"
+  when: postgresclaimname.stdout
+  failed_when: false
+
+- name: Check if awx_postgres_pv.yml file exists
+  stat:
+      path: "{{ awx_postgres_pv }}"
+  register: awx_postgres_pv_exists
+
+- name: Delete awx_postgres_pv.yml
+  command: "kubectl delete -f {{ awx_postgres_pv }}"
+  when: awx_postgres_pv_exists.stat.exists
+  failed_when: false
+
+- name: Check if awx_projects_pv.yml file exists
+  stat:
+    path: "{{ awx_projects_pv }}"
+  register: awx_projects_pv_exists
+
+- name: Delete awx_projects_pv.yml
+  command: "kubectl delete -f {{ awx_projects_pv }}"
+  when: awx_projects_pv_exists.stat.exists
+  failed_when: false
+
+- name: Delete awx image
+  command: "buildah rmi -f {{ awx_image }}"
+  when: awx_image in image_list.stdout
+
+- name: Check for awx namespace
+  command: kubectl get namespace -n awx
+  changed_when: false
+  register: awx_namespace
+
+- name: Delete Namespace awx
+  command: kubectl delete namespace awx -n awx
+  when: "'awx' in awx_namespace.stdout"
+
+- name: Get cobbler pod name
+  command: 'kubectl get pod -n cobbler -l app=cobbler -o jsonpath="{.items[0].metadata.name}"'
+  changed_when: false
+  failed_when: false
+  register: cobbler_pod_name
+
+- name: Delete cobbler container
+  command: "kubectl delete -f {{ k8s_cobbler }}"
+  when: "cobbler_pod_name.stdout in k8s_pods.stdout"
+  failed_when: false
+
+- name: Delete cobbler image
+  command: "buildah rmi -f {{ cobbler_image }}:latest"
+  when: cobbler_image in image_list.stdout
+
+- name: Get mngmnt_network pod name
+  command: 'kubectl get pod -n network-config -l app=mngmnt-network -o jsonpath="{.items[0].metadata.name}"'
+  changed_when: false
+  failed_when: false
+  register: mngmnt_network_pod_name
+
+- name: Delete management network container
+  command: "kubectl delete -f {{ k8s_mngmnt_network }}"
+  when: "mngmnt_network_pod_name.stdout in k8s_pods.stdout"
+  failed_when: false
+
+- name: Delete management network image
+  command: "buildah rmi -f {{ mngmnt_network_container }}:latest"
+  when: mngmnt_network_container in image_list.stdout

+ 31 - 0
control_plane/tools/roles/control_plane_cleanup/tasks/reset_kubeadm_cluster.yml

@@ -0,0 +1,31 @@
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Reset kubeadm cluster
+  command: "kubeadm reset -f"
+  changed_when: false
+
+- name: Remove CNI and kubeconfig files
+  file:
+    path: "{{ item }}"
+    state: absent
+  failed_when: false
+  with_items:
+    - "{{ k8_files }}"
+
+- name: Reset iptables
+  shell: "iptables -F && iptables -t nat -F && iptables -t mangle -F && iptables -X"
+  changed_when: true
+  failed_when: false

+ 55 - 0
control_plane/tools/roles/control_plane_cleanup/vars/main.yml

@@ -0,0 +1,55 @@
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# Usage: remove_container_images.yml
+k8s_mngmnt_network: "{{ playbook_dir }}/../roles/control_plane_device/files/k8s_mngmnt_network.yml"
+k8s_cobbler: "{{ playbook_dir }}/../roles/provision_cobbler/files/k8s_cobbler.yml"
+awx_projects_pv: "{{ playbook_dir }}/../roles/webui_awx/files/awx_projects_pv.yml"
+awx_postgres_pv:  "{{ playbook_dir }}/../roles/webui_awx/files/awx_postgres_pv.yml"
+awx_operator: "{{ playbook_dir }}/../../../awx-operator/"
+awx_file: "{{ playbook_dir }}/../roles/webui_awx/files/awx.yml"
+k8s_infiniband: "{{ playbook_dir }}/../roles/control_plane_ib/files/k8s_infiniband.yml"
+infiniband_container: infiniband-container
+mngmnt_network_container: mngmnt_network_container
+cobbler_image: cobbler
+awx_image: custom-awx-ee
+
+# Usage: decrypt_vault_files.yml
+login_vars_file: "{{ playbook_dir }}/../input_params/login_vars.yml"
+login_vault_file: "{{ playbook_dir }}/../input_params/.login_vault_key"
+idrac_tools_vars_filename: "{{ playbook_dir }}/../input_params/idrac_tools_vars.yml"
+idrac_tools_vaultname: "{{ playbook_dir }}/../input_params/.idrac_vault_key"
+config_filename: "{{ playbook_dir }}/../../omnia_config.yml"
+config_vaultname: "{{ playbook_dir }}/../../.omnia_vault_key"
+
+# Usage: reset_kubeadm_cluster.yml
+k8_files:
+   - $HOME/.kube/config
+   - /etc/cni/net.d
+
+# Usage: delete_files_vault_keys.yml
+del_files:
+    - /var/nfs_repo
+    - /var/nfs_awx
+    - /root/dsu
+    - /tmp/unattended_centos8.iso
+    - /tmp/iso
+    - /mnt/leap
+    - /mnt/rocky
+    - "{{ playbook_dir }}/..roles/control_plane_security/files/.ipavars.yml"
+    - "{{ playbook_dir }}/../roles/provision_idrac/files/management_station_ip.txt"
+vault_keys:
+    - "{{ playbook_dir }}/../roles/webui_awx/files/.tower_cli.cfg"
+    - "{{ playbook_dir }}/../roles/webui_awx/files/.tower_vault_key"

+ 6 - 1
docs/EXAMPLE_SYSTEM_DESIGNS.md

@@ -6,7 +6,12 @@ Omnia can configure systems which use Ethernet or Infiniband-based fabric to con
 ![Example system configuration with Infiniband fabric](images/example-system-infiniband.png)
 
 ## Network Setup
-Omnia assumes that servers are already connected to the network and have access to the internet.
+With Omnia 1.2, only the management station requires internet access. In such a situation, the network topology would follow the below diagram:
+![Network Connections when only the Management Station is connected to Internet](images/Omnia_NetworkConfig_NoInet.png)
+
+If the user would like to have all compute nodes connect to the internet, the following network diagram can be followed.
+![Network Connections when all servers are connected to the internet](images/Omnia_NetworkConfig_Inet.png)
+
 ### Network Topology
 Possible network configurations include:
 * A flat topology where all nodes are connected to a switch which includes an uplink to the internet. This requires multiple externally-facing IP addresses

+ 10 - 0
docs/FAQ.md

@@ -9,6 +9,9 @@ Potential Causes:
 Resolution:  
 Wait for AWX UI to be accessible at http://\<management-station-IP>:8081, and then run the `control_plane.yml` file again, where __management-station-IP__ is the IP address of the management node.
 
+## Why does Omnia Control Plane fail at Task: `control_plane_common: Assert Value of idrac_support if mngmt_network container needed`?
+When `device_config_support` is set to true, `idrac_support` also needs to be set to true. 
+
 ## What to do if the nodes in a Kubernetes cluster reboot:
 Wait for 15 minutes after the Kubernetes cluster reboots. Next, verify the status of the cluster using the following commands:
 * `kubectl get nodes` on the manager node to get the real-time k8s cluster status.  
@@ -33,14 +36,21 @@ Resolution:
 3. Verify that the downloaded .iso file is valid and correct.
 4. Delete the Cobbler container using `docker rm -f cobbler` and rerun `control_plane.yml`.
 
+## How to enable DHCP routing on Compute Nodes:
+
+To enable routing, update the `primary_dns` and `secondary_dns` in `base_vars` with the appropriate IPs (hostnames are currently not supported). For compute nodes that are not directly connected to the internet (i.e., only the host network is configured), this configuration allows for internet connectivity.
+
 ## Why does PXE boot fail with tftp timeout or service timeout errors?  
 Potential Causes:
 * RAID is configured on the server.
 * Two or more servers in the same network have Cobbler services running.  
+* The target compute node does not have a configured PXE device with an active NIC.
 
 Resolution:  
 1. Create a Non-RAID or virtual disk on the server.  
 2. Check if other systems except for the management node have cobblerd running. If yes, then stop the Cobbler container using the following commands: `docker rm -f cobbler` and `docker image rm -f cobbler`.
+3. On the server, go to `BIOS Setup -> Network Settings -> PXE Device`. For each listed device (typically 4), configure an active NIC under `PXE device settings`.
+
 
 ## What to do when Slurm services do not start automatically after the cluster reboots:
 

+ 33 - 23
docs/INSTALL_OMNIA.md

@@ -5,7 +5,10 @@ The following sections provide details on installing Omnia using CLI.
 To install the Omnia control plane and manage workloads on your cluster using the Omnia control plane, see [Install the Omnia Control Plane](INSTALL_OMNIA_CONTROL_PLANE.md) and [Monitor Kubernetes and Slurm](MONITOR_CLUSTERS.md) for more information.
 
 ## Prerequisites
-* The login, manager, and compute nodes must be running CentOS 7.9 2009 OS.
+* The login, manager, and compute nodes must be running CentOS 7.9 2009 OS/ Rocky 8.x/ LeapOS 15.3.
+>> __Note:__ If you are using LeapOS, the following repositories will be enabled when running `omnia.yml`:
+>> * OSS ([Repository](http://download.opensuse.org/distribution/leap/15.3/repo/oss/) + [Update](http://download.opensuse.org/update/leap/15.3/oss/))
+>> * Non-OSS ([Repository](http://download.opensuse.org/distribution/leap/15.3/repo/non-oss/) + [Update](http://download.opensuse.org/update/leap/15.3/non-oss/))
 * If you have configured the `omnia_config.yml` file to enable the login node, the login node must be part of the cluster. 
 * All nodes must be connected to the network and must have access to the Internet.
 * Set the hostnames of all the nodes in the cluster.
@@ -42,21 +45,23 @@ To install the Omnia control plane and manage workloads on your cluster using th
 	export PATH=$PATH:/usr/local/bin
 	```  
 	
-**Note**: To deploy Omnia, Python 3.6 provides bindings to system tools such as RPM, DNF, and SELinux. As versions greater than 3.6 do not provide these bindings to system tools, ensure that you install Python 3.6 with dnf.  
+>> **Note**: To deploy Omnia, Python 3.6 provides bindings to system tools such as RPM, DNF, and SELinux. As versions greater than 3.6 do not provide these bindings to system tools, ensure that you install Python 3.6 with dnf.  
+
+>> **Note**: If Ansible version 2.9 or later is installed, ensure it is uninstalled before installing a newer version of Ansible. Run the following commands to uninstall Ansible before upgrading to a newer version.  
+>> 1. `pip uninstall ansible`
+>> 2. `pip uninstall ansible-base (if ansible 2.9 is installed)`
+>> 3. `pip uninstall ansible-core (if ansible version 2.10 or later is installed)`
+
+>> __Note:__ If you are using LeapOS, zypper may need to be updated before installing Omnia using the command: `zypper update -y`
 
-**Note**: If Ansible version 2.9 or later is installed, ensure it is uninstalled before installing a newer version of Ansible. Run the following commands to uninstall Ansible before upgrading to a newer version.  
-1. `pip uninstall ansible`
-2. `pip uninstall ansible-base (if ansible 2.9 is installed)`
-3. `pip uninstall ansible-core (if ansible 2.10  > version is installed)`
 
-	 
 * On the management station, run the following commands to install Git:
 	```
 	dnf install epel-release -y
 	dnf install git -y
 	```
 
-**Note**: If there are errors while executing the Ansible playbook commands, then re-run the commands.  
+>> **Note**: If there are errors while executing the Ansible playbook commands, then re-run the commands.  
 
 ## Steps to install Omnia using CLI
 
@@ -71,7 +76,7 @@ From release branch:
 git clone -b release https://github.com/dellhpc/omnia.git 
 ```-->  
 
-__Note:__ After the Omnia repository is cloned, a folder named __omnia__ is created. Ensure that you do not rename this folder.
+>> __Note:__ After the Omnia repository is cloned, a folder named __omnia__ is created. Ensure that you do not rename this folder.
 
 2. Change the directory to __omnia__: `cd omnia`
 
@@ -90,19 +95,22 @@ __Note:__ After the Omnia repository is cloned, a folder named __omnia__ is crea
 | domain_name                | omnia.test    | Sets the intended domain name                                                                                                                                                                                                                        |
 | realm_name                 | OMNIA.TEST    | Sets the intended realm name                                                                                                                                                                                                                         |
 | directory_manager_password |               | Password authenticating admin level access to the Directory for system   management tasks. It will be added to the instance of directory server   created for IPA. <br> Required Length: 8 characters. <br> The   password must not contain -,\, '," |
-| kerberos_admin_password         |               | "admin" user password for the IPA server on RockyOS. If LeapOS is in use, it is used as the "kerberos admin" user password for 389-ds <br> This field is not relevant to Management Stations running `LeapOS`                                                                                                                                                                                                                            |
+| kerberos_admin_password    |               | "admin" user password for the IPA server on RockyOS. If LeapOS is in use, it is used as the "kerberos admin" user password for 389-ds <br> This field is not relevant to Management Stations running `LeapOS`                                                                                                                                                                                                                            |
 | enable_secure_login_node   |  **false**, true             | Boolean value deciding whether security features are enabled on the Login Node. For more information, see [here](docs/Security/Enable_Security_LoginNode.md).                                                                                                                                                                                                                           |
 	
 	
 >> __NOTE:__  Without the login node, Slurm jobs can be scheduled only through the manager node.
 
 4. Create an inventory file in the *omnia* folder. Add login node IP address under the *[login_node]* group, manager node IP address under the *[manager]* group, compute node IP addresses under the *[compute]* group, and NFS node IP address under the *[nfs_node]* group. A template file named INVENTORY is provided in the *omnia\docs* folder.  
-	**NOTE**: Ensure that all the four groups (login_node, manager, compute, nfs_node) are present in the template, even if the IP addresses are not updated under login_node and nfs_node groups. 
+>>	**NOTE**: Ensure that all the four groups (login_node, manager, compute, nfs_node) are present in the template, even if the IP addresses are not updated under login_node and nfs_node groups. 
 
 5. To install Omnia:
-```
-ansible-playbook omnia.yml -i inventory 
-```
+
+| Leap OS                     	| CentOS, Rocky                                             	|
+|-----------------------------	|-----------------------------------------------------------	|
+| `ansible-playbook omnia.yml -i inventory -e 'ansible_python_interpreter=/usr/bin/python3'`   	| `ansible-playbook omnia.yml -i inventory`	|
+		
+
 
 6. By default, no skip tags are selected, and both Kubernetes and Slurm will be deployed.  
 
@@ -118,15 +126,15 @@ ansible-playbook omnia.yml -i inventory
 	The default path of the Ansible configuration file is `/etc/ansible/`. If the file is not present in the default path, then edit the `ansible_config_file_path` variable to update the configuration path.
 
 7. To provide passwords for mariaDB Database (for Slurm accounting), Kubernetes Pod Network CIDR, and Kubernetes CNI, edit the `omnia_config.yml` file.  
-__Note:__ 
+>> __Note:__ 
 * Supported values for Kubernetes CNI are calico and flannel. The default value of CNI considered by Omnia is calico. 
 * The default value of Kubernetes Pod Network CIDR is 10.244.0.0/16. If 10.244.0.0/16 is already in use within your network, select a different Pod Network CIDR. For more information, see __https://docs.projectcalico.org/getting-started/kubernetes/quickstart__.
 
-**NOTE**: If you want to view or edit the `omnia_config.yml` file, run the following command:  
+>> **NOTE**: If you want to view or edit the `omnia_config.yml` file, run the following command:  
 - `ansible-vault view omnia_config.yml --vault-password-file .omnia_vault_key` -- To view the file. 
 - `ansible-vault edit omnia_config.yml --vault-password-file .omnia_vault_key` -- To edit the file.
 
-**NOTE**: It is suggested that you use the ansible-vault view or edit commands and that you do not use the ansible-vault decrypt or encrypt commands. If you have used the ansible-vault decrypt or encrypt commands, provide 644 permission to `omnia_config.yml`.  
+>> **NOTE**: It is suggested that you use the ansible-vault view or edit commands and that you do not use the ansible-vault decrypt or encrypt commands. If you have used the ansible-vault decrypt or encrypt commands, provide 644 permission to `omnia_config.yml`.  
 
 Omnia considers `slurm` as the default username for MariaDB.  
 
@@ -160,7 +168,6 @@ The following __kubernetes__ roles are provided by Omnia when __omnia.yml__ file
 - **k8s_start_services** role
 	- Kubernetes services are deployed such as Kubernetes Dashboard, Prometheus, MetalLB and NFS client provisioner
 
-__Note:__ 
 
 * Whenever k8s_version, k8s_cni or k8s_pod_network_cidr needs to be modified after the HPC cluster is setup, the OS in the manager and compute nodes in the cluster must be re-flashed before executing omnia.yml again.
 * After Kubernetes is installed and configured, few Kubernetes and calico/flannel related ports are opened in the manager and compute nodes. This is required for Kubernetes Pod-to-Pod and Pod-to-Service communications. Calico/flannel provides a full networking stack for Kubernetes pods.
@@ -194,11 +201,13 @@ The following __Slurm__ roles are provided by Omnia when __omnia.yml__ file is r
 To enable the login node, the *login_node_required* variable must be set to "true" in the *omnia_config.yml* file.  
 - **login_common** role: The firewall ports are opened on the manager and login nodes.  
 - **login_server** role: FreeIPA server is installed and configured on the manager node to provide authentication using LDAP and Kerberos principles.  
-- **login_node** role: FreeIPA client is installed and configured on the login node and is integrated with the server running on the manager node.  
+- **login_node** role: For Rocky, FreeIPA client is installed and configured on the login node and is integrated with the server running on the manager node. For LeapOS, 389ds will be installed instead.
+
+>> __Note:__ If LeapOS is being deployed, login_common and login_server roles will be skipped.  
 
-**NOTE**: To skip the installation of:
-* The login node-In the `omnia_config.yml` file, set the *login_node_required* variable to "false".  
-* The FreeIPA server and client: Use `--skip-tags freeipa` while executing the *omnia.yml* file. 
+>> **NOTE**: To skip the installation of:
+>> * The login node-In the `omnia_config.yml` file, set the *login_node_required* variable to "false".  
+>> * The FreeIPA server and client: Use `--skip-tags freeipa` while executing the *omnia.yml* file. 
 
 ### Installing JupyterHub and Kubeflow playbooks  
 If you want to install JupyterHub and Kubeflow playbooks, you have to first install the JupyterHub playbook and then install the Kubeflow playbook.
@@ -207,11 +216,12 @@ Commands to install JupyterHub and Kubeflow:
 * `ansible-playbook platforms/jupyterhub.yml -i inventory`
 * `ansible-playbook platforms/kubeflow.yml -i inventory`
 
-__Note:__ When the Internet connectivity is unstable or slow, it may take more time to pull the images to create the Kubeflow containers. If the time limit is exceeded, the **Apply Kubeflow configurations** task may fail. To resolve this issue, you must redeploy Kubernetes cluster and reinstall Kubeflow by completing the following steps:
+>> __Note:__ When the Internet connectivity is unstable or slow, it may take more time to pull the images to create the Kubeflow containers. If the time limit is exceeded, the **Apply Kubeflow configurations** task may fail. To resolve this issue, you must redeploy Kubernetes cluster and reinstall Kubeflow by completing the following steps:
 * Format the OS on manager and compute nodes.
 * In the `omnia_config.yml` file, change the k8s_cni variable value from calico to flannel.
 * Run the Kubernetes and Kubeflow playbooks. 
 
+
 ## Add a new compute node to the cluster
 
 To update the INVENTORY file present in `omnia` directory with the new node IP address under the compute group. Ensure the other nodes which are already a part of the cluster are also present in the compute group along with the new node. Then, run `omnia.yml` to add the new node to the cluster and update the configurations of the manager node.

Những thay đổi đã bị hủy bỏ vì nó quá lớn
+ 101 - 49
docs/INSTALL_OMNIA_CONTROL_PLANE.md


+ 1 - 1
docs/MONITOR_CLUSTERS.md

@@ -20,7 +20,7 @@ To access any of the dashboards, ensure that a compatible web browser is install
 	5. `logout and login back`
 	6. To launch Firefox from terminal, run `firefox&`
 
-**NOTE**: When the PuTTY or MobaXterm session ends, you must run **export DISPLAY=:10.0** command each time, else Firefox cannot be launched again.  
+>> **NOTE**: When the PuTTY or MobaXterm session ends, you must run **export DISPLAY=:10.0** command each time, else Firefox cannot be launched again.  
 
 ## Access FreeIPA Dashboard  
 The FreeIPA Dashboard can be accessed from the management station, manager, and login nodes. To access the dashboard:

+ 75 - 63
docs/README.md

@@ -15,7 +15,6 @@
 
 ## What Omnia does
 Omnia can build clusters that use Slurm or Kubernetes (or both!) for workload management. Omnia will install software from a variety of sources, including:
-- Standard CentOS and [ELRepo](http://elrepo.org) repositories
 - Helm repositories
 - Source code compilation
 - [OperatorHub](https://operatorhub.io)
@@ -23,19 +22,21 @@ Omnia can build clusters that use Slurm or Kubernetes (or both!) for workload ma
 Whenever possible, Omnia will leverage existing projects rather than reinvent the wheel.
 
 ### Omnia stacks
-Omnia can install Kubernetes or Slurm (or both), along with additional drivers, services, libraries, and user applications.
+Omnia can deploy firmware, install Kubernetes or Slurm (or both), along with additional drivers, services, libraries, and user applications.
 ![Omnia Kubernetes Stack](images/omnia-k8s.png)
 
 ![Omnia Slurm Stack](images/omnia-slurm.png)  
 
 ## What's new in this release
-* Extended support of Leap OS on Management station, login, compute and NFS nodes.
-* Omnia now supports Powervault configurations with 2 network interfaces.
-* Omnia now supports multi profile creation and multi cluster provisioning using Cobbler.
-* Provisioning of Rocky custom ISO on supported PowerEdge servers using iDRAC.
-* Configuring Dell EMC networking switches, Mellanox InfiniBand switches, and PowerVault storage devices in the cluster. 
-* An option to configure a login node with the same configurations as the compute nodes in the cluster. With appropriate user privileges provided by the cluster administrator, users can log in to the login node and schedule Slurm jobs. The authentication mechanism in the login node uses the FreeIPA solution.
-* Options to enable the security settings on the iDRAC such as system lockdown mode, secure boot mode, 2-factor authentication (2FA), and LDAP directory services.
+- Support for Rocky 8.x with latest python/ansible on the Management Station
+- Support for Leap 15.3 on the cluster
+- Support for Rocky 8.x on the cluster
+- Added Grafana integration for better monitoring capability
+- Added Loki Log aggregation of Var Logs
+- Added Slurm/K8s Monitoring capability
+- Added security features to comply with NIST 800-53 Revision 5 and NIST 800-171 Revision 2
+- Added the ability to collect telemetry information from SLURM and iDRAC
+- Added Grafana plugins to view real time graphs of cluster/node statistics
 
 ## Deploying clusters using the Omnia control plane
 The Omnia Control Plane will automate the entire cluster deployment process, starting with provisioning the operating system on the supported devices and updating the firmware versions of PowerEdge Servers. 
@@ -51,18 +52,10 @@ The following table lists the software and operating system requirements on the
 
 Requirements  |   Version
 ----------------------------------  |   -------
-OS pre-installed on the management station  |  Rocky 8.x/ Leap 15.x
+OS pre-installed on the management station  |  Rocky 8.x
 OS deployed by Omnia on bare-metal Dell EMC PowerEdge Servers | Rocky 8.x Minimal Edition/ Leap 15.x
-Cobbler  |  3.2.2
-Ansible AWX  |  19.4.0
-Slurm Workload Manager  |  20.11.2
-Kubernetes on the management station  |  1.21.0
-Kubernetes on the manager and compute nodes	|	1.16.7 or 1.19.3
-Kubeflow  |  1
-Prometheus  |  2.23.0
 Ansible  |  2.9.21
 Python  |  3.6.15
-CRI-O  |  1.21.0
 
 ## Hardware managed by Omnia
 The following table lists the supported devices managed by Omnia. Other devices than those listed in the following table will be discovered by Omnia, but features offered by Omnia will not be applicable.
@@ -78,51 +71,70 @@ Mellanox InfiniBand Switches	|	NVIDIA MQM8700-HS2F Quantum HDR InfiniBand Switch
 ## Software deployed by Omnia
 The following table lists the software and its compatible version managed by Omnia. To avoid any impact on the proper functioning of Omnia, other versions than those listed are not supported.
 
-Software	|	License	|	Compatible Version	|	Description
------------	|	-------	|	----------------	|	-----------------
-LeapOS 15.3	|	-	|	15.x|	Operating system on entire cluster
-CentOS Linux release 7.9.2009 (Core)	|	-	|	7.9	|	Operating system on entire cluster except for management station
-Rocky 8.x	|	-	|	8.x	|	Operating system on entire cluster except for management station
-Rocky 8.x	|	-	|	8.x	|	Operating system on the management station
-MariaDB	|	GPL 2.0	|	5.5.68	|	Relational database used by Slurm
-Slurm	|	GNU General Public	|	20.11.7	|	HPC Workload Manager
-Docker CE	|	Apache-2.0	|	20.10.2	|	Docker Service
-FreeIPA	|	GNU General Public License v3	|	4.6.8	|	Authentication system used in the login node
-OpenSM	|	GNU General Public License 2	|	3.3.24	|	-
-NVIDIA container runtime	|	Apache-2.0	|	3.4.2	|	Nvidia container runtime library
-Python PIP	|	MIT License	|	21.1.2	|	Python Package
-Python3	|	-	|	3.6.8 (3.6.15 if LeapOS is being used)	|	-
-Kubelet	|	Apache-2.0	|	1.16.7,1.19, 1.21 (LeapOS only supports 1.21) 	|	Provides external, versioned ComponentConfig API types for configuring the kubelet
-Kubeadm	|	Apache-2.0	|	1.16.7,1.19, 1.21 (LeapOS only supports 1.21)	|	"fast paths" for creating Kubernetes clusters
-Kubectl	|	Apache-2.0	|	1.16.7,1.19, 1.21 (LeapOS only supports 1.21)	|	Command line tool for Kubernetes
-JupyterHub	|	Modified BSD License	|	1.1.0	|	Multi-user hub
-kubernetes Controllers	|	Apache-2.0	|	1.16.7,1.19 (1.21 if LeapOS is being used)	|	Orchestration tool	
-Kfctl	|	Apache-2.0	|	1.0.2	|	CLI for deploying and managing Kubeflow
-Kubeflow	|	Apache-2.0	|	1	|	Cloud Native platform for machine learning
-Helm	|	Apache-2.0	|	3.5.0	|	Kubernetes Package Manager
-Helm Chart	|	-	|	0.9.0	|	-
-TensorFlow	|	Apache-2.0	|	2.1.0	|	Machine Learning framework
-Horovod	|	Apache-2.0	|	0.21.1	|	Distributed deep learning training framework for Tensorflow
-MPI	|	Copyright (c) 2018-2019 Triad National Security,LLC. All rights reserved.	|	0.3.0	|	HPC library
-CoreDNS	|	Apache-2.0	|	1.6.2	|	DNS server that chains plugins
-CNI	|	Apache-2.0	|	0.3.1	|	Networking for Linux containers
-AWX	|	Apache-2.0	|	19.4.0	|	Web-based User Interface
-AWX.AWX	|	Apache-2.0	|	19.4.0	|	Galaxy collection to perform awx configuration
-AWXkit	|	Apache-2.0	|	to be updated	|	To perform configuration through CLI commands
-Cri-o	|	Apache-2.0	|	1.21	|	Container Service
-Buildah	|	Apache-2.0	|	1.22.4	|	Tool to build and run containers
-PostgreSQL	|	Copyright (c) 1996-2020, PostgreSQL Global Development Group	|	10.15	|	Database Management System
-Redis	|	BSD-3-Clause License	|	6.0.10	|	In-memory database
-NGINX	|	BSD-2-Clause License	|	1.14	|	-
-dellemc.os10	|	GNU-General Public License v3.1	|	1.1.1	|	It provides networking hardware abstraction through a common set of APIs
-OMSDK	|	Apache-2.0	|	1.2.488	|	Dell EMC OpenManage Python SDK (OMSDK) is a python library that helps developers and customers to automate the lifecycle management of PowerEdge Servers
-| Loki                                  | Apache License 2.0               | 2.4.1  | Loki is a log aggregation system   designed to store and query logs from all your applications and   infrastructure                            |
-| Promtail                              | Apache License 2.1               | 2.4.1  | Promtail is an agent which ships the contents of local logs to   a private Grafana Loki instance or Grafana Cloud.                             |
-| kube-prometheus-stack                 | Apache License 2.2               | 25.0.0 | Kube Prometheus Stack is a collection of Kubernetes manifests,   Grafana dashboards, and Prometheus rules.                                     |
-| mailx                                 | MIT License                      | 12.5   | mailx is a Unix utility program for sending and receiving   mail.                                                                              |
-| postfix                               | IBM Public License               | 3.5.8  | Mail Transfer Agent (MTA) designed to determine routes and   send emails                                                                       |
-| xorriso                               | GPL version 3                    | 1.4.8  | xorriso copies file objects from POSIX compliant filesystems   into Rock Ridge enhanced ISO 9660 filesystems.                                  |
-| Dell EMC   OpenManage Ansible Modules | GNU- General Public License v3.0 | 5.0.0  | OpenManage Ansible Modules simplifies and automates   provisioning, deployment, and updates of PowerEdge servers and modular   infrastructure. |
+| Software	                                  	| 	License	                                                                    | 	Compatible Version	                            | 	Description                                                                                                                                                 |
+|-------------------------------------------	|-----------------------------------------------------------------------------	|-------------------------------------------------	|--------------------------------------------------------------------------------------------------------------------------------------------------------------	|
+| LeapOS 15.3	                               	| 	-	                                                                        | 	15.x                                            | 	Operating system on entire cluster                                                                                                                          |
+| CentOS Linux release 7.9.2009 (Core)	      	| 	-	                                                                        | 	7.9	                                            | 	Operating system on entire cluster except for management station                                                                                            |
+| Rocky 8.x	                                 	| 	-	                                                                        | 	8.x	                                            | 	Operating system on entire cluster except for management station                                                                                            |
+| Rocky 8.x	                                 	| 	-	                                                                        | 	8.x	                                            | 	Operating system on the management station                                                                                                                  |
+| MariaDB	                                   	| 	GPL 2.0	                                                                    | 	5.5.68	                                        | 	Relational database used by Slurm                                                                                                                           |
+| Slurm	                                     	| 	GNU General Public	                                                        | 	20.11.7	                                        | 	HPC Workload Manager                                                                                                                                        |
+| Docker CE	                                 	| 	Apache-2.0	                                                                | 	20.10.2	                                        | 	Docker Service                                                                                                                                              |
+| FreeIPA	                                   	| 	GNU General Public License v3	                                            | 	4.6.8	                                        | 	Authentication system used in the login node                                                                                                                |
+| OpenSM	                                    | 	GNU General Public License 2	                                            | 	3.3.24	                                        | 	-                                                                                                                                                           |
+| NVIDIA container runtime	                  	| 	Apache-2.0	                                                                | 	3.4.2	                                        | 	Nvidia container runtime library                                                                                                                            |
+| Python PIP	                                | 	MIT License	                                                                | 	21.1.2	                                        | 	Python Package                                                                                                                                              |
+| Python3	                                   	| 	-	                                                                        | 	3.6.8 (3.6.15 if LeapOS is being used)	        | 	-                                                                                                                                                           |
+| Kubelet	                                   	| 	Apache-2.0	                                                                | 	1.16.7,1.19, 1.21  	                            | 	Provides external, versioned ComponentConfig API types for configuring   the kubelet                                                                        |
+| Kubeadm	                                   	| 	Apache-2.0	                                                                | 	1.16.7,1.19, 1.21 	                            | 	"fast paths" for creating Kubernetes clusters                                                                                                               |
+| Kubectl	                                   	| 	Apache-2.0	                                                                | 	1.16.7,1.19, 1.21 	                            | 	Command line tool for Kubernetes                                                                                                                            |
+| kubernetes.core	                           	| 	GPL 3.0	                                                                    | 	2.2.3 	                                        | 	Performs CRUD operations on K8s objects                                                                                                                     |
+| JupyterHub	                                | 	Modified BSD License	                                                    | 	1.1.0	                                        | 	Multi-user hub                                                                                                                                              |
+| kubernetes Controllers	                    | 	Apache-2.0	                                                                | 	1.16.7,1.19 (1.21 if LeapOS is being used)	    | 	Orchestration tool	                                                                                                                                        |
+| Kfctl	                                     	| 	Apache-2.0	                                                                | 	1.0.2	                                        | 	CLI for deploying and managing Kubeflow                                                                                                                     |
+| Kubeflow	                                  	| 	Apache-2.0	                                                                | 	1	                                            | 	Cloud Native platform for machine learning                                                                                                                  |
+| Helm	                                      	| 	Apache-2.0	                                                                | 	3.5.0	                                        | 	Kubernetes Package Manager                                                                                                                                  |
+| Helm Chart	                                | 	-	                                                                        | 	0.9.0	                                        | 	-                                                                                                                                                           |
+| TensorFlow	                                | 	Apache-2.0	                                                                | 	2.1.0	                                        | 	Machine Learning framework                                                                                                                                  |
+| Horovod	                                   	| 	Apache-2.0	                                                                | 	0.21.1	                                        | 	Distributed deep learning training framework for Tensorflow                                                                                                 |
+| MPI	                                       	| 	Copyright (c) 2018-2019 Triad National Security,LLC. All rights   reserved.	| 	0.3.0	                                        | 	HPC library                                                                                                                                                 |
+| CoreDNS	                                   	| 	Apache-2.0	                                                                | 	1.6.2	                                        | 	DNS server that chains plugins                                                                                                                              |
+| CNI	                                       	| 	Apache-2.0	                                                                | 	0.3.1	                                        | 	Networking for Linux containers                                                                                                                             |
+| AWX	                                       	| 	Apache-2.0	                                                                | 	20.0.0	                                        | 	Web-based User Interface                                                                                                                                    |
+| AWX.AWX	                                   	| 	Apache-2.0	                                                                | 	19.4.0	                                        | 	Galaxy collection to perform awx configuration                                                                                                              |
+| AWXkit	                                    | 	Apache-2.0	                                                                | 	18.0.0	                                        | 	To perform configuration through CLI commands                                                                                                               |
+| CRI-O	                                     	| 	Apache-2.0	                                                                | 	1.21, 1.22.0  									| 	Container Service                                                                                                                                           |
+| Buildah	                                   	| 	Apache-2.0	                                                                | 	1.22.4	                                        | 	Tool to build and run containers                                                                                                                            |
+| PostgreSQL	                                | 	Copyright (c) 1996-2020, PostgreSQL Global Development Group	            | 	10.15	                                        | 	Database Management System                                                                                                                                  |
+| Redis	                                     	| 	BSD-3-Clause License	                                                    | 	6.0.10	                                        | 	In-memory database                                                                                                                                          |
+| NGINX	                                     	| 	BSD-2-Clause License	                                                    | 	1.14	                                        | 	-                                                                                                                                                           |
+| dellemc.os10	                              	| 	GNU-General Public License v3.1	                                            | 	1.1.1	                                        | 	It provides networking hardware abstraction through a common set of APIs                                                                                    |
+| grafana	                                   	| 	Apache-2.0	                                                                | 	8.3.2	                                        | 	Grafana is the open source analytics & monitoring solution for every   database.                                                                            |
+| community.grafana	                         	| 	GPL 3.0	                                                                    | 	1.3.0	                                        | 	Technical Support for open source grafana                                                                                                                   |
+| OMSDK	                                     	| 	Apache-2.0	                                                                | 	1.2.488	                                        | 	Dell EMC OpenManage Python SDK (OMSDK) is a python library that helps   developers and customers to automate the lifecycle management of PowerEdge   Servers|
+| activemq	                                  	| 	Apache-2.0	                                                                | 	5.10.0	                                        | 	Most popular multi protocol, message broker                                                                                                                 |
+|  Loki                                     	|  Apache License 2.0                                                         	|  2.4.1                                          	|  Loki is a log aggregation   system   designed to store and query   logs from all your applications and     infrastructure                                   	|
+|  Promtail                                 	|  Apache License 2.0                                                         	|  2.4.1                                          	|  Promtail is an agent which ships   the contents of local logs to   a   private Grafana Loki instance or Grafana Cloud.                                      	|
+|  kube-prometheus-stack                    	|  Apache License 2.0                                                         	|  25.0.0                                         	|  Kube Prometheus Stack is a   collection of Kubernetes manifests,     Grafana dashboards, and Prometheus rules.                                              	|
+|  mailx                                    	|  MIT License                                                                	|  12.5                                           	|  mailx is a Unix utility program   for sending and receiving   mail.                                                                                         	|
+|  postfix                                  	|  IBM Public License                                                         	|  3.5.8                                          	|  Mail Transfer Agent (MTA) designed   to determine routes and   send   emails                                                                                	|
+|  xorriso                                  	|  GPL version 3                                                              	|  1.4.8                                          	|  xorriso copies file objects from   POSIX compliant filesystems   into Rock   Ridge enhanced ISO 9660 filesystems.                                           	|
+|  Dell EMC     OpenManage Ansible Modules  	|  GNU- General Public License   v3.0                                         	|  5.0.0                                          	|  OpenManage Ansible Modules   simplifies and automates     provisioning, deployment, and updates of PowerEdge servers and   modular   infrastructure.        	|
+|  389-ds                                   	|  GPL version 3                                                              	|  1.4.4                                          	|   LDAP server used for   authentication, access control.                                                                                                     	|
+|  sssd                                     	|  GPL version 3                                                              	|  1.16.1                                         	|  A set of daemons used to manage   access to remote directory services and authentication mechanisms.                                                        	|
+|  krb5                                     	|  MIT License                                                                	|  1.19.2                                         	|  Authentication protocol providing   strong authentication for client/server applications by using secret-key   cryptography                                 	|
+|  openshift                                	|  Apache 2.0                                                                 	|  0.12.1                                         	|  an on-premises  platform as a   service built around Linux containers orchestrated and managed   by Kubernetes                                              	|
+| golang                                    	| BSD-3-Clause License                                                        	| 1.17                                            	| Go is a statically typed, compiled programming language designed at   Google                                                                                 	|
+| mysql                                     	| GPL 2.0                                                                     	| 8                                               	| MySQL is an open-source relational database management system.                                                                                               	|
+| postgreSQL                                	| PostgreSQL License                                                          	| 12                                              	| PostgreSQL, also known as Postgres, is a free and open-source relational   database management system emphasizing extensibility and SQL compliance.          	|
+| idrac-telemetry-reference tools           	| Apache-2.0                                                                  	| 0.1                                             	| Reference toolset for PowerEdge telemetry metric collection and   integration with analytics and visualization solutions.                                    	|
+| jansson                                   	| MIT License                                                                 	| 2.14                                            	| C library for encoding, decoding and manipulating JSON data                                                                                                  	|
+| libjwt                                    	| MPL-2.0 License                                                             	| 1.13.0                                          	| JWT C Library                                                                                                                                                	|
+| apparmor                                  	| GNU General Public License                                                  	| 3.0.3                                           	| Controls access based on paths of the program files                                                                                                          	|
+| nsfcac/grafana-plugin                     	| Apache-2.0                                                                  	| 2.1.0                                           	| Grafana plugin for HPC data visualization                                                                                                                    	|
+| apparmor                                  	| GNU General Public License                                                  	| 3.0.3                                           	| Controls access based on paths of the program files                                                                                                          	|
+| snoopy                                    	| GPL 2.0                                                                     	| 2.4.15                                          	| Snoopy is a small library that logs all program executions on your   Linux/BSD system                                                                        	|
+
 
 # Known issues  
 * **Issue**: Hosts are not displayed on the AWX UI.  

Những thay đổi đã bị hủy bỏ vì nó quá lớn
+ 74 - 0
docs/Security/ENABLE_SECURITY_LOGIN_NODE.md


Những thay đổi đã bị hủy bỏ vì nó quá lớn
+ 128 - 0
docs/Security/ENABLE_SECURITY_MANAGEMENT_STATION.md


+ 0 - 27
docs/Security/Enable_Security_LoginNode.md

@@ -1,27 +0,0 @@
-# Enabling Security on the Login Node 
-
-* Ensure that `enable_secure_login_node` is set to **true** in `omnia_config.yml`
-* Set the following parameters in `omnia_security_config.yml`
-
-|  Parameter Name        |  Default Value  |  Additional Information                                                                                                                                          |
-|------------------------|-----------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| max_failures           | 3               | Failures allowed before lockout. <br> This value cannot currently   be changed.                                                                                  |
-| failure_reset_interval | 60              | Period (in seconds) after which the number of failed login attempts is   reset <br> Accepted Values: 30-60                                                       |
-| lockout_duration       | 10              | Period (in seconds) for which users are locked out. <br> Accepted   Values: 5-10                                                                                 |
-| session_timeout        | 180             | Period (in seconds) after which idle users get logged out automatically   <br> Accepted Values: 30-90                                                            |
-| alert_email_address    |                 | Email address used for sending alerts in case of authentication failure. Currently, only one email ID is accepted in this field.   <br> If this variable is left blank, authentication failure alerts will   be disabled. |
-| allow_deny             | Allow           | This variable sets whether the user list is Allowed or Denied. <br>   Accepted Values: Allow, Deny                                                               |
-| user                   |                 | Array of users that are allowed or denied based on the `allow_deny`   value. Multiple users must be separated by a space.                                        |
-
-* Set the following parameters in `control_plane/input_params/security_vars.yml`
-
-|  Parameter Name        |  Default Value  |  Additional Information                                                                                                                                          |
-|------------------------|-----------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| allow_deny             | Allow           | This variable sets whether the user list is Allowed or Denied. <br>   Accepted Values: Allow, Deny                                                               |
-| user                   |                 | Array of users that are allowed or denied based on the `allow_deny`   value. Multiple users must be separated by a space.                                        |
-
-
-## Kernel Lockdown
-
-* RockyOS has Kernel Lockdown mode (Integrity) enabled by default
-* SUSE/Leap allows users to set Kernel Lockdown mode to Confidentiality or Integrity.

+ 0 - 79
docs/Security/Enable_Security_ManagementStation.md

@@ -1,79 +0,0 @@
-# Enabling Security on the Management Station
-
-Omnia uses FreeIPA on RockyOS to enable security features like authorisation and access control.
-
-## Enabling Authentication on the Management Station:
-
-Set the parameter 'enable_security_support' to true in `base_vars.yml`
-
-## Prerequisites Before Enabling Security:
-
-* Enter the relevant values in `login_vars.yml`:
-
-| Parameter Name             | Default Value | Additional Information                                                                           |
-|----------------------------|---------------|--------------------------------------------------------------------------------------------------|
-| ms_directory_manager_password |               | Password of the Directory Manager with full access to the directory for system management tasks. |
-| ms_kerberos_admin_password         |               | "admin" user password for the IPA server on RockyOS. If LeapOS is in use, it is used as the "kerberos admin" user password for 389-ds <br> This field is not relevant to Management Stations running `LeapOS`                                                         |
-
-
-
-* Enter the relevant values in `security_vars.yml:
-
-If `RockyOS` is in use on the Management Station:
-
-|  Parameter Name        |  Default Value  |  Additional Information                                                                                                                                                                                                                                                                                                                                      |
-|------------------------|-----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-|  domain_name           |  omnia.test     |  The domain name should not contain   an underscore ( _ )                                                                                                                                                                                                                                                                                                    |
-|  realm_name            |  OMNIA.TEST     |  The realm name should follow the   following rules per https://www.freeipa.org/page/Deployment_Recommendations   <br> * The realm name must not conflict with any other existing   Kerberos realm name (e.g. name used by Active Directory). <br> * The   realm name should be upper-case (EXAMPLE.COM) version of primary DNS domain   name (example.com). |
-| max_failures           | 3               | Failures allowed before lockout. <br> This value cannot currently   be changed.                                                                                                                                                                                                                                                                              |
-| failure_reset_interval | 60              | Period (in seconds) after which the number of failed login attempts is   reset <br> Accepted Values: 30-60                                                                                                                                                                                                                                                   |
-| lockout_duration       | 10              | Period (in seconds) for which users are locked out. <br> Accepted   Values: 5-10                                                                                                                                                                                                                                                                             |
-| session_timeout        | 180             | Period (in seconds) after which idle users get logged out automatically   <br> Accepted Values: 30-90                                                                                                                                                                                                                                                        |
-| alert_email_address    |                 | Email address used for sending alerts in case of authentication failure. Currently, only one email address is supported in this field.   <br> If this variable is left blank, authentication failure alerts will   be disabled.                                                                                                                                                                                             |
-| allow_deny             | Allow           | This variable sets whether the user list is Allowed or Denied. <br>   Accepted Values: Allow, Deny                                                                                                                                                                                                                                                           |
-| user                   |                 | Array of users that are allowed or denied based on the `allow_deny`   value. Multiple users must be separated by a space.                                                                                                                                                                                                                                    |
-
-
-## Log Aggregation via Grafana
-
-[Loki](https://grafana.com/docs/loki/latest/fundamentals/overview/) is a datastore used to efficiently hold log data for security purposes. Using the `promtail` agent, logs are collated and streamed via a HTTP API.
-
->> __Note:__ When `control_plane.yml` is run, Loki is automatically set up as a data source on the Grafana UI.
-
-
-
-### Querying Loki 
-
-Loki uses basic regex based syntax to filter for specific jobs, dates or timestamps.
-
-* Select the Explore ![Explore Icon](../Telemetry_Visualization/Images/ExploreIcon.PNG) tab to select control-plane-loki from the drop down.
-* Using [LogQL queries](https://grafana.com/docs/loki/latest/logql/log_queries/), all logs in `/var/log` can be accessed using filters (Eg: `{job=”Omnia”}` )
-
-## Viewing Logs on the Dashboard
-
-All log files can be viewed via the Dashboard tab (![Dashboard Icon](../Telemetry_Visualization/Images/DashBoardIcon.PNG)). The Default Dashboard displays `omnia.log` and `syslog`. Custom dashboards can be created per user requirements.
-
-Below is a list of all logs available to Loki and can be accessed on the dashboard:
-
-| Name               | Location                                  | Purpose                      | Additional Information                                                                             |
-|--------------------|-------------------------------------------|------------------------------|----------------------------------------------------------------------------------------------------|
-| Omnia Logs         | /var/log/omnia.log                        | Omnia Log                    | This log is configured by Default                                                                  |
-| syslogs            | /var/log/messages                         | System Logging               | This log is configured by Default                                                                  |
-| Audit Logs         | /var/log/audit/audit.log                  | All Login Attempts           | This log is configured by Default                                                                  |
-| CRON logs          | /var/log/cron                             | CRON Job Logging             | This log is configured by Default                                                                  |
-| Pods logs          | /var/log/pods/ * / * / * log                    | k8s pods                     | This log is configured by Default                                                                  |
-| Access Logs        | /var/log/dirsrv/slapd-<Realm Name>/access | Directory Server Utilization | This log is available when FreeIPA is set up ( ie when   enable_security_support is set to 'true') |
-| Error Log          | /var/log/dirsrv/slapd-<Realm Name>/errors | Directory Server Errors      | This log is available when FreeIPA is set up ( ie when   enable_security_support is set to 'true') |
-| CA Transaction Log | /var/log/pki/pki-tomcat/ca/transactions   | FreeIPA PKI Transactions     | This log is available when FreeIPA is set up ( ie when   enable_security_support is set to 'true') |
-| KRB5KDC            | /var/log/krb5kdc.log                      | KDC Utilization              | This log is available when FreeIPA is set up ( ie when   enable_security_support is set to 'true') |
-| Secure logs        | /var/log/secure                           | Login Error Codes            | This log is available when FreeIPA is set up ( ie when   enable_security_support is set to 'true') |
-| HTTPD logs         | /var/log/httpd/*                          | FreeIPA API Call             | This log is available when FreeIPA is set up ( ie when   enable_security_support is set to 'true') |
-| DNF logs           | /var/log/dnf.log                          | Installation Logs            | This log is configured on Rocky OS                                                                 |
-| Zypper Logs        | /var/log/zypper.log                       | Installation Logs            | This log is configured on Leap OS                                                                  |
-
-
-
-
-
-
-

docs/Security/login_user_creation.md → docs/Security/LOGIN_USER_CREATION.md


BIN
docs/Telemetry_Visualization/Images/ParallelCoordinates.png


BIN
docs/Telemetry_Visualization/Images/PowerMap.png


BIN
docs/Telemetry_Visualization/Images/SankeyViewer.png


BIN
docs/Telemetry_Visualization/Images/Spirallayout.gif


+ 12 - 18
docs/Telemetry_Visualization/Visualization.md

@@ -2,30 +2,29 @@
 
 Using Grafana, users can poll multiple devices and create graphs/visualizations of key system metrics such as temperature, System power consumption, Memory Usage, IO Usage, CPU Usage, Total Memory Power, System Output Power, Total Fan Power, Total Storage Power, System Input Power, Total CPU Power, RPM Readings, Total Heat Dissipation, Power to Cool ratio, System Air Flow Efficiency etc.
 
-A lot of these metrics are collected using iDRAC telemetry. iDRAC telemetry allows you to stream telemetry data from your servers to a centralized log/metrics servers. For more information on iDRAC telemetry, click [here]( https://github.com/dell/iDRAC-Telemetry-Reference-Tools).
+A lot of these metrics are collected using iDRAC telemetry. iDRAC telemetry allows you to stream telemetry data from your servers to a centralized log/metrics server. For more information on iDRAC telemetry, click [here]( https://github.com/dell/iDRAC-Telemetry-Reference-Tools).
 
 ## Prerequisites
 
 1. To set up Grafana, ensure that `control_plane/input_params/login_vars.yml` is updated with the Grafana Username and Password.
-2. All parameters in `telemetry/input_params/login_vars.yml` need to be filled in:
+2. All parameters in `telemetry/input_params/telemetry_login_vars.yml` need to be filled in:
 
 | Parameter Name        | Default Value | Information |
 |-----------------------|---------------|-------------|
-| timescaledb_user      | 		        |  Username used for connecting to timescale db. Minimum Legth: 2 characters.          |
-| timescaledb_password  | 		        |  Password used for connecting to timescale db. Minimum Legth: 2 characters.           |
-| mysqldb_user          | 		        |  Username used for connecting to mysql db. Minimum Legth: 2 characters.         |
-| mysqldb_password      | 		        |  Password used for connecting to mysql db. Minimum Legth: 2 characters.            |
+| timescaledb_user      | 		        |  Username used for connecting to timescale db. Minimum Length: 2 characters.          |
+| timescaledb_password  | 		        |  Password used for connecting to timescale db. Minimum Length: 2 characters.           |
+| mysqldb_user          | 		        |  Username used for connecting to mysql db. Minimum Length: 2 characters.         |
+| mysqldb_password      | 		        |  Password used for connecting to mysql db. Minimum Length: 2 characters.            |
| mysqldb_root_password | 		        |  Password used for connecting to mysql db for root user. Minimum Length: 2 characters.         |
 
-3. All parameters in `telemetry/input_params/base_vars.yml` need to be filled in:
+3. All parameters in `telemetry/input_params/telemetry_base_vars.yml` need to be filled in:
 
 | Parameter Name          | Default Value     | Information |
 |-------------------------|-------------------|-------------|
-| mount_location          | idrac_telemetrysource_services_db | Sets the location all telemetry related files will be stored and both timescale and mysql databases will be mounted.            |
 | idrac_telemetry_support | true              | This variable is used to enable iDRAC telemetry support and visualizations. Accepted Values: true/false            |
 | slurm_telemetry_support | true              | This variable is used to enable slurm telemetry support and visualizations. Slurm Telemetry support can only be activated when idrac_telemetry_support is set to true. Accepted Values: True/False.        |
 | timescaledb_name        | telemetry_metrics | Postgres DB with timescale extension is used for storing iDRAC and slurm telemetry metrics.            |
-| mysqldb_name			  | idrac_telemetrysource_services_db             | MySQL DB is used to store IPs and credentials of iDRACs having datacenter license           |
+| mysqldb_name			  | idrac_telemetrysource_services_db | MySQL DB is used to store IPs and credentials of iDRACs having datacenter license           |
 
 3. Find the IP of the Grafana UI using:
  
@@ -47,10 +46,12 @@ Use any one of the following browsers to access the Grafana UI (https://< Grafan
 * Omnia control plane should be executed and node_inventory should be created in awx.
 * The slurm manager and compute nodes are fetched at run time from node_inventory.
 * Slurm should be installed on the nodes, if not there is no point in executing slurm telemetry.
+* A minimum of one cluster is required for Slurm Telemetry to work.
+* Once telemetry is running, delete the pods and images on control plane if a cluster change is intended.
 
 ## Initiating Telemetry
 
-1. Once `control_plane.yml` and `telemetry.yml` are executed, run the following commands from `omnia/telemetry`:
+1. Once `control_plane.yml` and `omnia.yml` are executed, run the following commands from `omnia/telemetry`:
 
 `ansible-playbook telemetry.yml`
 
@@ -59,11 +60,4 @@ Use any one of the following browsers to access the Grafana UI (https://< Grafan
 ## Adding a New Node to Telemetry
 After initiation, new nodes can be added to telemetry by running the following commands from `omnia/telemetry`:
 		
-` ansible-playbook add_idrac_node.yml `
-		
-
-
-
-
-
-
+` ansible-playbook add_idrac_node.yml`

+ 44 - 0
docs/Telemetry_Visualization/VISUALIZATION.md

@@ -0,0 +1,44 @@
+# Viewing Performance Stats on Grafana
+
+Using [Texas Technical University data visualization lab](https://idatavisualizationlab.github.io/HPCC), data polled from iDRAC and Slurm can be processed to generate live graphs. These Graphs can be accessed on the Grafana UI.
+
+Once `control_plane.yml` is executed and Grafana is set up, use `telemetry.yml` to initiate the Graphs. Data polled via Slurm and iDRAC is streamed into internal databases. This data is processed to create the 4 graphs listed below.
+
+>> __Note__: This feature only works on Nodes using iDRACs with a datacenter license running a minimum firmware of 4.0.
+
+## All your data in a glance
+
+Using the following graphs, data can be visualized to gather correlational information. These graphs refresh every 5 seconds (Except SankeyViewer). 
+
+>> __Note:__ The timestamps used for the time metric are based on the `timezone` set in `control_plane/input_params/base_vars.yml`. 
+
+1. [Parallel Coordinates](https://idatavisualizationlab.github.io/HPCC/#ParallelCoordinates) <br>
+Parallel coordinates are a great way to capture a system's status. It shows all ranges of individual metrics like CPU temp, Fan Speed, Memory Usage etc. The graph can be narrowed by time or metric ranges to get specific correlations such as CPU Temp vs Fan Speed etc.
+
+![Parallel Coordinates](Images/ParallelCoordinates.png)
+
+<br>
+
+2. [Spiral Layout](https://idatavisualizationlab.github.io/HPCC/#Spiral_Layout) <br>
+Spiral Layouts are best for viewing the change in a single metric over time. It is often used to check trends in metrics over a business day. Data visualized in this graph can be sorted using other metrics like Job IDs etc to understand the pattern of utilization on your devices.
+
+![Spiral Layout](Images/Spirallayout.gif)
+
+<br>
+
+3. [Sankey Viewer](https://idatavisualizationlab.github.io/HPCC/#SankeyViewer) <br>
+Sankey Viewers are perfect for viewing utilization by nodes/users/jobs. It provides point in time information for quick troubleshooting.
+
+>> __Note:__ Due to the tremendous data processing undertaken by SankeyViewer, the graph does not auto-refresh. It can be manually refreshed by refreshing the internet tab or by clicking the refresh button on the top-right corner of the page.
+
+![Sankey Viewer](Images/SankeyViewer.png)
+
+<br>
+
+4. [Power Map](https://idatavisualizationlab.github.io/HPCC/#PowerMap) <br>
+Power Maps are an excellent way to see utilization along the axis of time for different nodes/users/jobs. Hovering over the graph allows the user to narrow down information by Job/User or Node.
+
+![Power Map](Images/PowerMap.png)
+
+<br>
+

Những thay đổi đã bị hủy bỏ vì nó quá lớn
+ 8 - 3
docs/control_plane/device_templates/CONFIGURE_INFINIBAND_SWITCHES.md


+ 3 - 1
docs/control_plane/device_templates/PROVISION_SERVERS.md

@@ -13,7 +13,7 @@ Edit the following files under the `control_plane/input_params` directory to pro
 	File name	|	Variables</br> [Required/ Optional]	|	Default, choices	|	Description
 	-------	|	----------------	|	-----------------	|	-----------------
 	idrac_vars.yml	|	idrac_system_profile</br> [Required]	|	<ul><li>**Performance**</li> <li>PerformancePerWatt(DAPC)</li> <li>PerformancePerWatt(OS)</li> <li>WorkstationPerformance</li></ul>	|	The system profile used for BIOS configuration. 
-	<br>	|	firmware_update_required</br> [Required]	|	<ul><li>**true**</li> <li>false</li></ul>	|	By default, Omnia updates the firmware on the servers. To disable the firmware update, set the variable to "false".
+	<br>	|	firmware_update_required</br> [Required]	|	<ul><li>**false**</li> <li>true</li></ul>	|	By default, Omnia updates the firmware on the servers. To disable the firmware update, set the variable to "false".
 	<br>	|	poweredge_model</br> [Required if "firmware_update_required" is set to "true"]	|	<ul><li>**C6420**</li> <li>R640</li><li>R740</li><li>C4140</li> <li>And other supported PowerEdge servers</li></ul>	|	Enter the required PowerEdge server models to update the firmware. For example, enter `R640,R740,C4140` to update firmware on these models of PowerEdge servers. For a complete list of supported PowerEdge servers, see the *Hardware managed by Omnia* section in the Readme file.
 	<br>	|	uefi_secure_boot</br> [Optional]	|	<ul><li>**disabled**</li> <li>enabled</li></ul>	|	Option to enable or disable the secure boot mode.
 	<br>	|	system_lockdown</br> [Optional]	|	<ul><li>**disabled**</li> <li>enabled</li></ul>	|	Option to enable or disable system lockdown.
@@ -42,6 +42,8 @@ For the `idrac.yml` file to successfully provision the custom ISO on the PowerEd
 * The Lifecycle Controller Remote Services of PowerEdge Servers is in the 'ready' state.
 * The Redfish services are enabled in the iDRAC settings under **Services**.
 * The PowerEdge Servers have the iDRAC Enterprise or Datacenter license. If the license is not found, servers will be PXE booted and provisioned using Cobbler.  
+* If `provision_method` is set to PXE in `base_vars.yml`, ensure that all PXE devices have a configured, active NIC. To verify/ configure NIC availability: On the server, go to `BIOS Setup -> Network Settings -> PXE Device`. For each listed device (typically 4), configure/ check for an active NIC under `PXE device settings`
+* iDRAC 9 based Dell EMC PowerEdge Servers with firmware versions 5.00.10.20 and above. (With the latest BIOS available)
 
 The **provision_idrac** file configures and validates the following:
 * Required input parameters and prerequisites.

+ 2 - 2
docs/control_plane/input_parameters/PROVISION_SERVERS.md

@@ -7,13 +7,13 @@ Edit the following files under the `control_plane/input_params` directory to pro
 	a. `provision_password`- password used while provisioning OS on bare metal servers.  
 	b. `cobbler_password`- password for Cobbler.    
 	c. `idrac_username` and `idrac_password`- iDRAC username and password.   
-	**NOTE**: Minimum length of the password must be at least eight characters and a maximum of 30 characters. Do not use these characters while entering a password: -, \\, "", and \'
>>	**NOTE**: The password must be a minimum of eight characters and a maximum of 30 characters. Do not use these characters while entering a password: -, \\, "", and \'
 2. Edit the following variables in the `idrac_vars.yml` file.  
 
 	File name	|	Variables</br> [Required/ Optional]	|	Default, choices	|	Description
 	-------	|	----------------	|	-----------------	|	-----------------
 	idrac_vars.yml	|	idrac_system_profile</br> [Required]	|	<ul><li>**Performance**</li> <li>PerformancePerWatt(DAPC)</li> <li>PerformancePerWatt(OS)</li> <li>WorkstationPerformance</li></ul>	|	The system profile used for BIOS configuration. 
-	<br>	|	firmware_update_required</br> [Required]	|	<ul><li>**true**</li> <li>false</li></ul>	|	By default, Omnia updates the firmware on the servers. To disable the firmware update, set the variable to "false".
+	<br>	|	firmware_update_required</br> [Required]	|	<ul><li>**false**</li> <li>true</li></ul>	|	By default, Omnia updates the firmware on the servers. To disable the firmware update, set the variable to "false".
 	<br>	|	poweredge_model</br> [Required if "firmware_update_required" is set to "true"]	|	<ul><li>**C6420**</li> <li>R640</li><li>R740</li><li>C4140</li> <li>And other supported PowerEdge servers</li></ul>	|	Enter the required PowerEdge server models to update the firmware. For example, enter `R640,R740,C4140` to update firmware on these models of PowerEdge servers. For a complete list of supported PowerEdge servers, see the *Hardware managed by Omnia* section in the Readme file.
 	<br>	|	uefi_secure_boot</br> [Optional]	|	<ul><li>**disabled**</li> <li>enabled</li></ul>	|	Option to enable or disable the secure boot mode.
 	<br>	|	system_lockdown</br> [Optional]	|	<ul><li>**disabled**</li> <li>enabled</li></ul>	|	Option to enable or disable system lockdown.

BIN
docs/images/Omnia_Architecture.png


BIN
docs/images/Omnia_Flow.png


BIN
docs/images/Omnia_NetworkConfig_Inet.png


BIN
docs/images/Omnia_NetworkConfig_NoInet.png


+ 2 - 2
examples/PyTorch/pytorch-deploy.yaml

@@ -12,9 +12,9 @@ spec:
         volumeMounts:
         - mountPath: /pyscript
           name: torch-job-volume
-        command: ["bash","-c","python /pyscript/pytorchcpu-example.py"]
+        command: ["bash","-c","python /pyscript/pytorch-example.py"]
       restartPolicy: Never
       volumes:
       - name: torch-job-volume
         hostPath:
-          path: /home/k8s/torch-example
+          path: /home/k8snfs/torch-example

+ 0 - 54
examples/k8s-TensorFlow-resnet50-multinode-MPIOperator.yaml

@@ -1,54 +0,0 @@
-# Run multi-node training benchmark w/ Nvidia NGC Container: nvcr.io/nvidia/tensorflow:19.06-py3
-#
-# 2 C4140 compute nodes
-#  - 8 V100 GPUs
-#  - ConnectX-5
-#  - IPoIB EDR Infiniband in Ethernet mode
-#
-apiVersion: kubeflow.org/v1alpha1
-kind: MPIJob
-metadata:
-  name: tensorflow-benchmarks-resnet50
-spec:
-  replicas: 2
-  template:
-    spec:
-      containers:
-      - image: nvcr.io/nvidia/tensorflow:19.06-py3
-        name: tensorflow-benchmarks
-        volumeMounts:
-          - mountPath: /foo
-            name: work-volume
-          - mountPath: /data
-            name: mem-volume
-        resources:
-          limits:
-            nvidia.com/gpu: 4
-        command:
-          - mpirun
-          - --allow-run-as-root
-          - --map-by
-          - numa
-          - python
-          - /foo/tensorflow/benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py
-          - --batch_size=512
-          - --model=resnet50
-          - --variable_update=horovod
-          - --optimizer=momentum
-          - --nodistortions
-          - --gradient_repacking=8
-          - --weight_decay=1e-4
-          - --use_fp16=true
-          - --data_dir=/data/tensorflow/
-          - --data_name=imagenet
-      volumes:
-      - name: work-volume
-        hostPath:
-          # directory locally mounted on host
-          path: /work
-          type: Directory
-      - name: mem-volume
-        hostPath:
-          # dev shm directory on host
-          path: /dev/shm
-          type: Directory

+ 30 - 0
roles/cluster_validation/tasks/install_packages.yml

@@ -0,0 +1,30 @@
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Set fact for ansible version
+  set_fact:
+    ansible_collection_used: true
+  when: "ansible_version.full is version_compare(ansible_base_version, '>')"
+
+- name: Install netaddr
+  pip:
+    name: netaddr
+    state: present
+    executable: pip3
+
+- name: Install ansible galaxy collection ansible.utils
+  command: ansible-galaxy collection install "{{ ipaddr_collection }}"
+  changed_when: false
+  when: ansible_collection_used

+ 11 - 1
roles/cluster_validation/tasks/main.yml

@@ -27,6 +27,7 @@
     control_plane_status: false
     powervault_status: false
     nfs_node_status: false
+    ansible_collection_used: false
 
 - name: Check AWX instance
   command: awx --version
@@ -46,6 +47,15 @@
     - not awx_version_check.failed
     - awx_search_key in awx_hostname.stdout
 
+- name: Install Packages
+  include_tasks: install_packages.yml
+  when: not control_plane_status
+
+- name: Set ansible_collection_used to true in awx
+  set_fact:
+    ansible_collection_used: true
+  when: control_plane_status
+
 - name: Set NFS node status
   set_fact:
     nfs_node_status: true
@@ -90,4 +100,4 @@
         regexp: '#log_path = /var/log/ansible.log'
         replace: 'log_path = /var/log/omnia.log'
       when: ansible_conf_exists.stat.exists
-  when: not control_plane_status
+  when: not control_plane_status

+ 5 - 1
roles/cluster_validation/vars/main.yml

@@ -99,4 +99,8 @@ allow_deny_fail_msg: "Failed. Incorrect Access format in security_vars.yml"
 restrict_program_support_success_msg: "restrict_program_support successfully validated"
 restrict_program_support_failure_msg: "Failed. Accepted values are true or false."
 restrict_softwares_success_msg: "restrict_softwares successfully validated"
-restrict_softwares_failure_msg: "Warning. Values should be comma separated. The supported services are telnet, lpd, bluetooth, rlogin, rexec. Please check restrict_softwares variable"
+restrict_softwares_failure_msg: "Warning. Values should be comma separated. The supported services are telnet, lpd, bluetooth, rlogin, rexec. Please check restrict_softwares variable"
+
+# Usage: install_packages.yml
+ansible_base_version: '2.9'
+ipaddr_collection: ansible.utils:2.5.2

+ 2 - 0
roles/common/tasks/main.yml

@@ -131,6 +131,8 @@
         name: NVIDIA
         repo: "{{ nvidia_repo }}"
         state: present
+        disable_gpg_check: yes
+        auto_import_keys: yes
         autorefresh: yes
       tags: install
 

+ 12 - 1
roles/hostname_validation/tasks/main.yml

@@ -26,4 +26,15 @@
   when:
     - os_leap in ansible_distribution | lower
     - hostvars['127.0.0.1']['login_node_required']
-    - login_node_group in group_names
+    - login_node_group in group_names
+
+- name: Include cluster_validation vars file
+  include_vars: "{{ role_path }}/../cluster_validation/vars/main.yml"
+  
+- name: Validate login node variables if rocky MS and leap login node
+  include_tasks: validate_login_node_vars.yml
+  run_once: true
+  when: 
+    - os_leap in ansible_distribution | lower
+    - hostvars['127.0.0.1']['login_node_required']
+    - hostvars['127.0.0.1']['ipa_server_ms']

+ 110 - 0
roles/hostname_validation/tasks/validate_login_node_vars.yml

@@ -0,0 +1,110 @@
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Check if omnia config file is encrypted
+  command: cat {{ role_path }}/../../{{ config_filename }}
+  changed_when: false
+  register: config_content
+  no_log: True
+  delegate_to: localhost
+
+- name: Decrpyt omnia_config.yml
+  command: >-
+    ansible-vault decrypt {{ role_path }}/../../{{ config_filename }}
+    --vault-password-file {{ role_path }}/../../{{ config_vaultname }}
+  when: "'$ANSIBLE_VAULT;' in config_content.stdout"
+  delegate_to: localhost
+
+- name: Include variable file omnia_config.yml
+  include_vars: "{{ role_path }}/../../{{ config_filename }}"
+  delegate_to: localhost
+  no_log: True
+
+- name: Validate login node parameters when login_node_reqd is set to true
+  fail:
+    msg: "{{ input_config_failure_msg }} for login_node"
+  when:
+    - ( domain_name | length < 1 or
+      realm_name | length < 1 or
+      directory_manager_password | length < 1 or
+      kerberos_admin_password | length < 1 ) 
+    - login_node_required
+  delegate_to: localhost
+
+- name: Verify the value of enable_secure_login_node
+  assert:
+    that:
+      - enable_secure_login_node == true or enable_secure_login_node == false
+    success_msg: "{{ secure_login_node_success_msg }}"
+    fail_msg: "{{ secure_login_node_fail_msg }}"
+  delegate_to: localhost
+
+- name: Login node to contain exactly 1 node
+  assert:
+    that:
+      - "groups['login_node'] | length | int == 1"
+    fail_msg: "{{ login_node_group_fail_msg }}"
+    success_msg: "{{ login_node_group_success_msg }}"
+  delegate_to: localhost
+
+- name: Validate the domain name
+  assert:
+    that:
+      - domain_name is regex("^(?!-)[A-Za-z0-9-]+([\\-\\.]{1}[a-z0-9]+)*\\.[A-Za-z]{2,}$")
+    success_msg: "{{ domain_name_success_msg }}"
+    fail_msg: "{{ domain_name_fail_msg }}"
+  delegate_to: localhost
+
+- name: Validate the realm name
+  assert:
+    that:
+      - realm_name is regex("^(?!-)[A-Z0-9-]+([\\-\\.]{1}[a-z0-9]+)*\\.[A-Z]{2,}$")
+      - '"." in realm_name'
+    success_msg: "{{ realm_name_success_msg }}"
+    fail_msg: "{{ realm_name_fail_msg }}"
+  delegate_to: localhost
+
+- name: Assert directory_manager_password
+  assert:
+    that:
+      - directory_manager_password | length > min_length | int - 1
+      - directory_manager_password | length < max_length | int + 1
+      - '"-" not in directory_manager_password '
+      - '"\\" not in directory_manager_password '
+      - '"\"" not in directory_manager_password '
+      - " \"'\" not in directory_manager_password "
+    success_msg: "{{ success_msg_directory_manager_password }}"
+    fail_msg: "{{ fail_msg_directory_manager_password }}"
+  delegate_to: localhost
+
+- name: Assert kerberos_admin_password
+  assert:
+    that:
+      - kerberos_admin_password | length > min_length | int - 1
+      - kerberos_admin_password | length < max_length | int + 1
+      - '"-" not in kerberos_admin_password '
+      - '"\\" not in kerberos_admin_password '
+      - '"\"" not in kerberos_admin_password '
+      - " \"'\" not in kerberos_admin_password "
+    success_msg: "{{ success_msg_kerberos_admin_password }}"
+    fail_msg: "{{ fail_msg_kerberos_admin_password }}"
+  delegate_to: localhost
+
+- name: Encrypt input config file
+  command: >-
+    ansible-vault encrypt {{ role_path }}/../../{{ config_filename }}
+    --vault-password-file {{ role_path }}/../../{{ config_vaultname }}
+  changed_when: false
+  delegate_to: localhost

+ 2 - 2
roles/k8s_start_manager/vars/main.yml

@@ -23,7 +23,7 @@ k8s_config_src: /etc/kubernetes/admin.conf
 
 k8s_config_dest: /root/.kube/config
 
-k8s_config_file_mode: 0644
+k8s_config_file_mode: 0600
 
 k8s_cert_path: /etc/kubernetes/pki/ca.crt
 
@@ -47,4 +47,4 @@ calico_yml_file_mode: 0644
 
 flannel_yml_file_path: /root/k8s/kube-flannel.yaml
 
-flannel_yml_file_mode: 0644
+flannel_yml_file_mode: 0644

+ 10 - 10
roles/k8s_start_services/files/metal-config.yaml

@@ -9,13 +9,13 @@ data:
     - name: default
       protocol: layer2
       addresses:
-      - 192.168.2.150/32
-      - 192.168.2.151/32
-      - 192.168.2.152/32
-      - 192.168.2.153/32
-      - 192.168.2.154/32
-      - 192.168.2.155/32
-      - 192.168.2.156/32
-      - 192.168.2.157/32
-      - 192.168.2.158/32
-      - 192.168.2.159/32
+      - 192.168.2.150/24
+      - 192.168.2.151/24
+      - 192.168.2.152/24
+      - 192.168.2.153/24
+      - 192.168.2.154/24
+      - 192.168.2.155/24
+      - 192.168.2.156/24
+      - 192.168.2.157/24
+      - 192.168.2.158/24
+      - 192.168.2.159/24

+ 17 - 11
roles/k8s_start_services/tasks/configure_nginx_prom_grafana.yml

@@ -60,14 +60,20 @@
     state: restarted
     enabled: yes
 
-- name: Create prometheus datasource in grafana
-  community.grafana.grafana_datasource:
-    name: "hpc-prometheus-{{ ansible_default_ipv4.address }}"
-    grafana_url: "http://{{ grafana_svc_ip }}:{{ grafana_svc_port }}"
-    grafana_user: "{{ hostvars['127.0.0.1']['grafana_username'] }}"
-    grafana_password: "{{ hostvars['127.0.0.1']['grafana_password'] }}"
-    ds_type: prometheus
-    ds_url: "http://{{ ansible_default_ipv4.address }}:{{ nginx_listen_port }}"
-    access: direct
-  delegate_to: localhost
-  no_log: true
+- block:
+    - name: Create prometheus datasource in grafana
+      community.grafana.grafana_datasource:
+        name: "hpc-prometheus-{{ ansible_default_ipv4.address }}"
+        grafana_url: "http://{{ grafana_svc_ip }}:{{ grafana_svc_port }}"
+        grafana_user: "{{ hostvars['127.0.0.1']['grafana_username'] }}"
+        grafana_password: "{{ hostvars['127.0.0.1']['grafana_password'] }}"
+        ds_type: prometheus
+        ds_url: "http://{{ ansible_default_ipv4.address }}:{{ nginx_listen_port }}"
+        access: direct
+      delegate_to: localhost
+      no_log: true
+      register: create_k8s_prom_datasource
+  rescue:
+    - name: Create prometheus datasource in grafana failed
+      fail:
+        msg: "Error: {{ create_k8s_prom_datasource.msg }}"

+ 16 - 17
roles/k8s_start_services/tasks/deploy_k8s_services.yml

@@ -1,4 +1,4 @@
-#  Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -34,10 +34,9 @@
   register: k8s_pods
   tags: init
 
-- name: Deploy MetalLB
-  command: "kubectl apply -f '{{ metallb_yaml_url }}'"
-  changed_when: true
-  when: "'metallb' not in k8s_pods.stdout"
+- name: Get metallb repo
+  command: "helm repo add metallb '{{ metallb_helm_url }}'"
+  changed_when: false
   tags: init
 
 - name: Create MetalLB Setup Config Files
@@ -58,14 +57,8 @@
     mode: "{{ metallb_deployment_file_mode }}"
   tags: init
 
-- name: Deploy MetalLB
-  command: "kubectl apply -f '{{ metallb_deployment_file_dest }}'"
-  changed_when: true
-  when: "'metallb' not in k8s_pods.stdout"
-  tags: init
-
-- name: Create default setup for MetalLB
-  command: "kubectl apply -f '{{ metallb_config_file_dest }}'"
+- name: Deploy Metallb
+  command: "helm install metallb metallb/metallb  -f '{{ metallb_config_file_dest }}'"
   changed_when: true
   when: "'metallb' not in k8s_pods.stdout"
   tags: init
@@ -96,11 +89,13 @@
 - name: Helm - add Nvidia k8s-device-plugin (nvdp) repo
   command: "helm repo add nvdp '{{ nvidia_k8s_device_plugin_repo_url }}'"
   changed_when: true
+  when: ansible_local.inventory.nvidia_gpu > 0
   tags: init
 
 - name: Helm - add Nvidia GPU discovery (nvgfd) repo
   command: "helm repo add nvgfd '{{ nvidia_gpu_discovery_repo_url }}'"
   changed_when: true
+  when: ansible_local.inventory.nvidia_gpu > 0
   tags: init
 
 - name: Helm - update repo
@@ -109,7 +104,7 @@
   tags: init
 
 - name: Start NFS Client Provisioner using NFS on manager node
-  command: "helm install stable/nfs-client-provisioner --set nfs.server='{{ nfs_server_manager_node }}' --set nfs.path='{{ nfs_share_dir }}' --generate-name"
+  command: "helm install nfs-omnia stable/nfs-client-provisioner --set nfs.server='{{ nfs_server_manager_node }}' --set nfs.path='{{ nfs_share_dir }}'"
   changed_when: true
   when:
     - "'nfs-client-provisioner' not in k8s_pods.stdout"
@@ -117,7 +112,7 @@
   tags: init
 
 - name: Start NFS Client Provisioner using NFS on NFS Node
-  command: "helm install stable/nfs-client-provisioner --set nfs.server='{{ nfs_server_nfs_node }}' --set nfs.path='{{ me4_nfs_share_k8s }}' --generate-name"
+  command: "helm install nfs-omnia stable/nfs-client-provisioner --set nfs.server='{{ nfs_server_nfs_node }}' --set nfs.path='{{ me4_nfs_share_k8s }}'"
   changed_when: true
   when:
     - "'nfs-client-provisioner' not in k8s_pods.stdout"
@@ -189,13 +184,17 @@
 - name: Install nvidia-device-plugin
   command: "helm install --version='{{ nvidia_device_plugin_version }}' --generate-name --set migStrategy='{{ mig_strategy }}' nvdp/nvidia-device-plugin"
   changed_when: true
-  when: "'nvidia-device-plugin' not in k8s_pods.stdout"
+  when:
+    - "'nvidia-device-plugin' not in k8s_pods.stdout"
+    - ansible_local.inventory.nvidia_gpu > 0
   tags: init
 
 - name: Install GPU Feature Discovery
   command: "helm install --version='{{ gpu_feature_discovery_version }}' --generate-name --set migStrategy='{{ mig_strategy }}' nvgfd/gpu-feature-discovery"
   changed_when: true
-  when: "'node-feature-discovery' not in k8s_pods.stdout"
+  when:
+    - "'node-feature-discovery' not in k8s_pods.stdout"
+    - ansible_local.inventory.nvidia_gpu > 0
   tags: init
 
 - name: Deploy Xilinx Device plugin

+ 3 - 1
roles/k8s_start_services/vars/main.yml

@@ -53,6 +53,8 @@ metallb_deployment_file_mode: 0655
 
 metallb_yaml_url: https://raw.githubusercontent.com/google/metallb/v0.8.1/manifests/metallb.yaml
 
+metallb_helm_url: https://metallb.github.io/metallb
+
 k8s_dashboard_admin_file_dest: /root/k8s/k8s_dashboard_admin.yaml
 
 k8s_dashboard_admin_file_mode: 0655
@@ -91,7 +93,7 @@ prometheus_path_on_host: /var/lib/prometheus-2.23.0.linux-amd64/
 
 spark_operator_repo: https://googlecloudplatform.github.io/spark-on-k8s-operator
 
-operator_image_tag: v1beta2-1.2.3-3.1.1
+operator_image_tag: v1beta2-1.3.3-3.1.1
 
 volcano_scheduling_yaml_url: https://raw.githubusercontent.com/volcano-sh/volcano/v1.3.0/installer/volcano-development.yaml
 

+ 16 - 11
roles/k8s_start_workers/tasks/main.yml

@@ -1,4 +1,4 @@
-#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -34,13 +34,18 @@
     - node_hostname.stdout in hostvars['K8S_TOKEN_HOLDER']['k8s_nodes_not_ready']
   tags: init
 
-- name: Execute kubeadm join command
-  shell: >
-    kubeadm join --token={{ hostvars['K8S_TOKEN_HOLDER']['token'] }}
-    --discovery-token-ca-cert-hash sha256:{{ hostvars['K8S_TOKEN_HOLDER']['hash'] }}
-    {{ hostvars['K8S_TOKEN_HOLDER']['ip'] }}:{{ apiserver_bind_port }}
-  when:
-    - groups['manager'][0] != groups['compute'][0]
-    - groups['compute']|length >= 1
-    - node_hostname.stdout not in hostvars['K8S_TOKEN_HOLDER']['k8s_nodes_ready']
-  tags: init
+- block:
+    - name: Execute kubeadm join command
+      shell: >
+        kubeadm join --token={{ hostvars['K8S_TOKEN_HOLDER']['token'] }}
+        --discovery-token-ca-cert-hash sha256:{{ hostvars['K8S_TOKEN_HOLDER']['hash'] }}
+        {{ hostvars['K8S_TOKEN_HOLDER']['ip'] }}:{{ apiserver_bind_port }}
+      when:
+        - groups['manager'][0] != groups['compute'][0]
+        - groups['compute']|length >= 1
+        - node_hostname.stdout not in hostvars['K8S_TOKEN_HOLDER']['k8s_nodes_ready']
+      tags: init
+  rescue:
+    - name: Execute kubeadm join command failed
+      fail:
+        msg: "{{ kubeadm_join_fail_msg }}"

+ 2 - 1
roles/k8s_start_workers/vars/main.yml

@@ -1,4 +1,4 @@
-#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -14,3 +14,4 @@
 ---
 
 apiserver_bind_port: 6443
+kubeadm_join_fail_msg: "kubeadm join command failed in the compute node {{ inventory_hostname }}. Retry omnia.yml execution after performing kubeadm reset manually."

+ 14 - 0
roles/login_node/tasks/configure_alerting.yml

@@ -62,6 +62,20 @@
   changed_when: false
   register: ansible_playbook_path
 
+- name: Start cron service in leap
+  systemd:
+    name: cron
+    state: started
+    enabled: yes
+  when: os_leap in ansible_distribution | lower
+
+- name: Start crond service in rocky
+  systemd:
+    name: crond
+    state: started
+    enabled: yes
+  when: os_leap not in ansible_distribution | lower
+
 - name: Schedule cron job for alerting
   cron:
     name: Auth failure alerting

+ 21 - 11
roles/login_node/tasks/install_389ds.yml

@@ -96,8 +96,6 @@
       no_log: true
       when: ldap1_search_key in ldap1_status.stdout
 
-    
-
     - name: Creating 389 directory server instance
       shell: dscreate -v from-file {{ ldap1_config_path }} | tee {{ ldap1_output_path }}
       changed_when: true
@@ -223,10 +221,16 @@
           fail:
             msg: "Error: {{ create_admin_principal.stderr }}"
 
-    - name: Authenticate as admin
-      shell: set -o pipefail && echo {{ kerberos_admin_password }} | kinit {{ ipa_admin_username }}
-      no_log: true
-      changed_when: false
+    - block:
+        - name: Authenticate as admin
+          shell: set -o pipefail && echo {{ kerberos_admin_password }} | kinit {{ ipa_admin_username }}
+          no_log: true
+          changed_when: false
+          register: authenticate_admin
+      rescue:
+        - name: Authenticate as admin failed
+          fail:
+            msg: "Error: {{ authenticate_admin.stderr }}"
 
     - name: Install sssd packages
       zypper:
@@ -269,8 +273,14 @@
         enabled: yes
   when: not ds389_status
 
-- name: Configure password policy in 389-ds
-  command: dsconf -w {{ directory_manager_password }} -D "cn=Directory Manager" ldap://{{ server_hostname_fqdn }} pwpolicy set --pwdlockoutduration {{ hostvars['127.0.0.1']['lockout_duration'] }} --pwdmaxfailures {{ hostvars['127.0.0.1']['max_failures'] }} --pwdresetfailcount {{ hostvars['127.0.0.1']['failure_reset_interval'] }}
-  changed_when: true
-  no_log: true
-  when: hostvars['127.0.0.1']['enable_secure_login_node']
+- block:
+    - name: Configure password policy in 389-ds
+      command: dsconf -w {{ directory_manager_password }} -D "cn=Directory Manager" ldap://{{ server_hostname_fqdn }} pwpolicy set --pwdlockoutduration {{ hostvars['127.0.0.1']['lockout_duration'] }} --pwdmaxfailures {{ hostvars['127.0.0.1']['max_failures'] }} --pwdresetfailcount {{ hostvars['127.0.0.1']['failure_reset_interval'] }}
+      changed_when: true
+      no_log: true
+      when: hostvars['127.0.0.1']['enable_secure_login_node']
+      register: configure_pwpolicy
+  rescue:
+    - name: Configure password policy in 389-ds failed
+      fail:
+        msg: "Error: {{ configure_pwpolicy.stderr }}"

+ 26 - 19
roles/login_node/tasks/install_ipa_client.yml

@@ -55,23 +55,30 @@
   changed_when: false
   failed_when: false
 
-- name: Install ipa client in CentOS 7.9
-  command: >-
-    ipa-client-install --domain '{{ required_domain_name }}' --server '{{ required_server_hostname }}'
-    --principal admin --password '{{ required_ipa_admin_pwd }}' --force-join --enable-dns-updates --force-ntpd -U
-  changed_when: true
-  no_log: true
-  when:
-    - ( ansible_distribution | lower == os_centos )
-    - ( ansible_distribution_version < os_version )
+- block:
+    - name: Install ipa client in CentOS 7.9
+      command: >-
+        ipa-client-install --domain '{{ required_domain_name }}' --server '{{ required_server_hostname }}'
+        --principal admin --password '{{ required_ipa_admin_pwd }}' --force-join --enable-dns-updates --force-ntpd -U
+      changed_when: true
+      no_log: true
+      register: install_ipa_client
+      when:
+        - ( ansible_distribution | lower == os_centos )
+        - ( ansible_distribution_version < os_version )
 
-- name: Install ipa client in Rocky 8.4
-  command: >-
-    ipa-client-install --domain '{{ required_domain_name }}' --server '{{ required_server_hostname }}'
-    --principal admin --password '{{ required_ipa_admin_pwd }}' --force-join --enable-dns-updates --no-ntp -U
-  changed_when: true
-  no_log: true
-  when:
-    - ( ansible_distribution | lower == os_centos ) or
-      ( ansible_distribution | lower == os_rocky )
-    - ( ansible_distribution_version >= os_version )
+    - name: Install ipa client in Rocky 8
+      command: >-
+        ipa-client-install --domain '{{ required_domain_name }}' --server '{{ required_server_hostname }}'
+        --principal admin --password '{{ required_ipa_admin_pwd }}' --force-join --enable-dns-updates --no-ntp -U
+      changed_when: true
+      no_log: true
+      register: install_ipa_client
+      when:
+        - ( ansible_distribution | lower == os_centos ) or
+          ( ansible_distribution | lower == os_rocky )
+        - ( ansible_distribution_version >= os_version )
+  rescue:
+    - name: Install ipa client failed
+      fail:
+        msg: "Error: {{ install_ipa_client.stderr_lines }}"

+ 36 - 23
roles/login_server/tasks/install_ipa_server.yml

@@ -30,31 +30,44 @@
   changed_when: false
   failed_when: false
 
-- name: Install ipa server in CentOS 7.9
-  command: >-
-    ipa-server-install -n '{{ hostvars['127.0.0.1']['domain_name'] }}' --hostname='{{ server_hostname }}' -a '{{ hostvars['127.0.0.1']['kerberos_admin_password'] }}'
-    -p '{{ hostvars['127.0.0.1']['directory_manager_password'] }}' -r '{{ hostvars['127.0.0.1']['realm_name'] }}' --setup-dns --auto-forwarders --auto-reverse -U
-  changed_when: true
-  no_log: true
-  when:
-    - ( ansible_distribution | lower == os_centos )
-    - ( ansible_distribution_version < os_version )
+- block:
+    - name: Install ipa server in CentOS 7.9
+      command: >-
+        ipa-server-install -n '{{ hostvars['127.0.0.1']['domain_name'] }}' --hostname='{{ server_hostname }}' -a '{{ hostvars['127.0.0.1']['kerberos_admin_password'] }}'
+        -p '{{ hostvars['127.0.0.1']['directory_manager_password'] }}' -r '{{ hostvars['127.0.0.1']['realm_name'] }}' --setup-dns --auto-forwarders --auto-reverse -U
+      changed_when: true
+      no_log: true
+      register: install_ipa_server
+      when:
+        - ( ansible_distribution | lower == os_centos )
+        - ( ansible_distribution_version < os_version )
 
-- name: Install ipa server in CentOS > 8 or Rocky 8.4
-  command: >-
-    ipa-server-install -n '{{ hostvars['127.0.0.1']['domain_name'] }}' --hostname='{{ server_hostname }}' -a '{{ hostvars['127.0.0.1']['kerberos_admin_password'] }}'
-    -p '{{ hostvars['127.0.0.1']['directory_manager_password'] }}' -r '{{ hostvars['127.0.0.1']['realm_name'] }}' --setup-dns --no-forwarders --no-reverse --no-ntp -U
-  changed_when: true
-  no_log: true
-  when:
-    - ( ansible_distribution | lower == os_centos ) or
-      ( ansible_distribution | lower == os_rocky )
-    - ( ansible_distribution_version >= os_version )
+    - name: Install ipa server in CentOS > 8 or Rocky 8
+      command: >-
+        ipa-server-install -n '{{ hostvars['127.0.0.1']['domain_name'] }}' --hostname='{{ server_hostname }}' -a '{{ hostvars['127.0.0.1']['kerberos_admin_password'] }}'
+        -p '{{ hostvars['127.0.0.1']['directory_manager_password'] }}' -r '{{ hostvars['127.0.0.1']['realm_name'] }}' --setup-dns --no-forwarders --no-reverse --no-ntp -U
+      changed_when: true
+      no_log: true
+      register: install_ipa_server
+      when:
+        - ( ansible_distribution | lower == os_centos ) or
+          ( ansible_distribution | lower == os_rocky )
+        - ( ansible_distribution_version >= os_version )
+  rescue:
+    - name: Install ipa server failed
+      fail:
+        msg: "Error: {{ install_ipa_server.stderr_lines }}"
 
-- name: Authenticate as admin
-  shell: set -o pipefail && echo $'{{ hostvars['127.0.0.1']['kerberos_admin_password'] }}' | kinit admin
-  no_log: true
-  changed_when: false
+- block:
+    - name: Authenticate as admin
+      shell: set -o pipefail && echo $'{{ hostvars['127.0.0.1']['kerberos_admin_password'] }}' | kinit admin
+      no_log: true
+      changed_when: false
+      register: authenticate_admin
+  rescue:
+    - name: Authenticate as admin failed
+      fail:
+        msg: "Error: {{ authenticate_admin.stderr }}"
 
 - name: Replace the /etc/resolv.conf file
   copy:

+ 94 - 57
roles/powervault_me4_nfs/tasks/nfs_node_configure.yml

@@ -1,5 +1,4 @@
-
-#  Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -29,59 +28,97 @@
   changed_when: false
   failed_when: false
 
-- name: Install packages
-  package:
-    name: iscsi-initiator-utils
-    state: present
-  tags: install
-
-- name: Install packages
-  package:
-    name: sg3_utils
-    state: present
-  tags: install
-
-- name: Set bootproto value
-  lineinfile:
-    path: "{{ nic_path }}"
-    regexp: '^BOOTPROTO='
-    line: 'BOOTPROTO=none'
-  register: result
-
-- name: Set onboot value
-  lineinfile:
-    path: "{{ nic_path }}"
-    regexp: '^ONBOOT='
-    line: 'ONBOOT=yes'
-
-- name: Add ip address
-  lineinfile:
-    path: "{{ nic_path }}"
-    insertafter: '^ONBOOT=yes'
-    line: 'IPADDR={{ pv_nic_ip }}'
-
-- name: Add netmask address
-  lineinfile:
-    path: "{{ nic_path }}"
-    insertafter: '^IPADDR={{ pv_nic_ip }}'
-    line: NETMASK=255.255.255.0
-
-- name: Down the nic
-  command: ifdown {{ pv_nic }}
-  changed_when: true
-  failed_when: false
-  tags: install
-
-- name: Up the nic
-  command: ifup {{ pv_nic }}
-  changed_when: true
-  tags: install
-
-- name: Show ip
-  shell: >
-    set -o pipefail && \
-    ifconfig {{ pv_nic }} | grep 'inet' |cut -d: -f2 |  awk '{ print $2}'
-  changed_when: false
+- name: NFS node configuration on leap
+  block:
+    - name: Install open-iscsi
+      zypper:
+        name: open-iscsi
+        state: present
+      tags: install
+
+    - name: Install sg3_utils
+      zypper:
+        name: sg3_utils
+        state: present
+      tags: install
+
+    - name: Start the iSCSI deamon
+      systemd:
+        name: iscsid
+        state: started
+
+    - block:
+      - name: Configure nic
+        command: ip a add {{ pv_nic_ip }}/255.255.255.0 dev {{ pv_nic }}
+        register: nic_status
+        changed_when: false
+      rescue:
+      - name: Check if nic configured or not
+        fail:
+          msg: "{{ nic_conf_failed_msg }}"
+        when: nic_status_search not in nic_status.stderr
+
+    - name: Up the nic
+      command: ip link set dev {{ pv_nic }} up
+      changed_when: false
+  when: os_supported_leap in ansible_distribution | lower
+
+- name: NFS node configuration on rocky
+  block:
+    - name: Install packages
+      package:
+        name: iscsi-initiator-utils
+        state: present
+      tags: install
+
+    - name: Install packages
+      package:
+        name: sg3_utils
+        state: present
+      tags: install
+
+    - name: Set bootproto value
+      lineinfile:
+        path: "{{ nic_path }}"
+        regexp: '^BOOTPROTO='
+        line: 'BOOTPROTO=none'
+      register: result
+
+    - name: Set onboot value
+      lineinfile:
+        path: "{{ nic_path }}"
+        regexp: '^ONBOOT='
+        line: 'ONBOOT=yes'
+
+    - name: Add ip address
+      lineinfile:
+        path: "{{ nic_path }}"
+        insertafter: '^ONBOOT=yes'
+        line: 'IPADDR={{ pv_nic_ip }}'
+
+    - name: Add netmask address
+      lineinfile:
+        path: "{{ nic_path }}"
+        insertafter: '^IPADDR={{ pv_nic_ip }}'
+        line: NETMASK=255.255.255.0
+
+    - name: Down the nic
+      command: ifdown {{ pv_nic }}
+      changed_when: true
+      failed_when: false
+      tags: install
+
+    - name: Up the nic
+      command: ifup {{ pv_nic }}
+      changed_when: true
+      tags: install
+
+    - name: Show ip
+      shell: >
+        set -o pipefail && \
+        ifconfig {{ pv_nic }} | grep 'inet' |cut -d: -f2 |  awk '{ print $2}'
+      changed_when: false
+  when: os_supported_leap not in ansible_distribution | lower
 
 - name: Discover nodes
   command: iscsiadm -m discovery -t sendtargets -p {{ item }}
@@ -121,7 +158,7 @@
 - name: IQDN id
   shell: >
     set -o pipefail && \
-    cat /etc/iscsi/initiatorname.iscsi | cut -f2 -d"="
+    grep "InitiatorName=" /etc/iscsi/initiatorname.iscsi | cut -f2 -d"="
   register: iqdn_id
   changed_when: false
   tags: install
@@ -135,4 +172,4 @@
   command: iscsiadm -m node --login {{ pv_name }} -p {{ ip_port }}
   changed_when: true
   failed_when: false
-  tags: install
+  tags: install

+ 0 - 0
roles/powervault_me4_nfs/vars/main.yml


Một số tệp đã không được hiển thị bởi vì quá nhiều tập tin thay đổi trong này khác