Browse Source

Merge branch 'devel' into delete_test_omnia

Lucas A. Wilson 3 years ago
parent
commit
5c2cf6fc8e
35 changed files with 319 additions and 369 deletions
  1. control_plane/control_plane.yml (+2 -1)
  2. control_plane/ethernet.yml (+2 -1)
  3. control_plane/roles/control_plane_common/tasks/fetch_base_inputs.yml (+5 -3)
  4. control_plane/roles/control_plane_common/tasks/password_config.yml (+1 -1)
  5. control_plane/roles/control_plane_common/tasks/verify_omnia_params.yml (+1 -1)
  6. control_plane/roles/control_plane_common/vars/main.yml (+2 -2)
  7. control_plane/roles/control_plane_sm/tasks/create_pod.yml (+2 -2)
  8. control_plane/roles/control_plane_sm/tasks/pre_requisites.yml (+2 -1)
  9. control_plane/roles/control_plane_sm/vars/main.yml (+0 -1)
  10. control_plane/roles/provision_idrac/tasks/check_prerequisites.yml (+27 -25)
  11. control_plane/roles/webui_awx/tasks/check_awx_status.yml (+0 -44)
  12. control_plane/roles/webui_awx/tasks/firewall_settings.yml (+0 -40)
  13. control_plane/roles/webui_awx/tasks/install_awx_cli.yml (+0 -34)
  14. control_plane/roles/webui_awx/tasks/ui_accessibility.yml (+0 -85)
  15. docs/FAQ.md (+43 -6)
  16. docs/INSTALL_OMNIA.md (+5 -4)
  17. docs/INSTALL_OMNIA_CONTROL_PLANE.md (+33 -29)
  18. docs/README.md (+22 -10)
  19. docs/control_plane/device_templates/CONFIGURE_INFINIBAND_SWITCHES.md (+5 -3)
  20. docs/control_plane/device_templates/CONFIGURE_POWERSWITCHES.md (+2 -2)
  21. docs/control_plane/device_templates/CONFIGURE_POWERVAULT_STORAGE.md (+3 -2)
  22. docs/control_plane/device_templates/PROVISION_SERVERS.md (+4 -3)
  23. docs/control_plane/input_parameters/INFINIBAND_SWITCHES.md (+4 -3)
  24. docs/control_plane/input_parameters/POWERSWITCHES.md (+1 -1)
  25. docs/control_plane/input_parameters/POWERVAULT_STORAGE.md (+3 -2)
  26. docs/login_node/login_user_creation.md (+59 -0)
  27. control_plane/roles/webui_awx/tasks/clone_awx.yml (+6 -8)
  28. examples/login_node_example/roles/login_user/tasks/main.yml (+30 -0)
  29. examples/login_node_example/roles/login_user/vars/main.yml (+28 -0)
  30. omnia_config.yml (+4 -4)
  31. roles/cluster_validation/tasks/fetch_password.yml (+11 -37)
  32. roles/k8s_common/vars/main.yml (+2 -2)
  33. roles/k8s_manager/tasks/main.yml (+2 -2)
  34. roles/login_server/tasks/install_ipa_server.yml (+3 -5)
  35. roles/slurm_exporter/tasks/main.yml (+5 -5)

+ 2 - 1
control_plane/control_plane.yml

@@ -25,4 +25,5 @@
     - control_plane_ib
     - control_plane_sm
     - control_plane_customiso
-    - control_plane_repo
+    - control_plane_repo
+    - deploy_job_templates

+ 2 - 1
control_plane/ethernet.yml

@@ -21,5 +21,6 @@
     - dellemc.os10
    vars:
      ansible_network_os: dellemc.os10.os10
+     ansible_command_timeout: 180
    roles:
-    - network_ethernet
+    - network_ethernet

+ 5 - 3
control_plane/roles/control_plane_common/tasks/fetch_base_inputs.yml

@@ -37,7 +37,7 @@
       host_network_dhcp_end_range | length < 1 or
       provision_method | length < 1 or
       default_lease_time | length < 1
-      
+
 - name: Validate default lease time
   assert:
     that:
@@ -205,12 +205,14 @@
 - name : Assert iso_file_path
   fail:
     msg: "{{ invalid_iso_file_path }}"
-  when: ( not result_path_iso_file.stat.exists ) and ( ".iso" not in  iso_file_path )
+  when: not result_path_iso_file.stat.exists
 
 - name: Fail when iso path valid but image not right
   fail:
     msg: "{{ invalid_iso_file_path }}"
-  when: ( result_path_iso_file.stat.exists ) and ( ".iso" not in iso_file_path )
+  when:
+    - result_path_iso_file.stat.exists
+    - '".iso" not in iso_file_path'
 
 #### management_net_dhcp_start_end_range
 - name: Assert management network nic
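The rewritten `when:` above uses Ansible's list form, whose entries are implicitly AND-ed. A minimal sketch of the two equivalent forms (illustrative task bodies, same variables as the hunk above):

```yaml
# List form: every entry must be true for the task to run.
- name: Fail when iso path is valid but image is not right (list form)
  debug:
    msg: "both conditions hold"
  when:
    - result_path_iso_file.stat.exists
    - '".iso" not in iso_file_path'

# Single-expression form, equivalent to the list above.
- name: Fail when iso path is valid but image is not right (expression form)
  debug:
    msg: "both conditions hold"
  when: result_path_iso_file.stat.exists and ".iso" not in iso_file_path
```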

+ 1 - 1
control_plane/roles/control_plane_common/tasks/password_config.yml

@@ -32,7 +32,7 @@
 
 - name: Validate input parameters are not empty
   fail:
-    msg: "{{ input_config_failure_msg }}"
+    msg: "{{ login_input_config_failure_msg }}"
   register: input_config_check
   when:
     - provision_password | length < 1 or

+ 1 - 1
control_plane/roles/control_plane_common/tasks/verify_omnia_params.yml

@@ -61,7 +61,7 @@
 
 - name: Validate login node parameters when login_node_reqd is set to true
   fail:
-    msg: "{{ input_config_failure_msg }} for login_node"
+    msg: "{{ omnia_input_config_failure_msg }}"
   when:
     - ( domain_name | length < 1 or
       realm_name | length < 1 or

+ 2 - 2
control_plane/roles/control_plane_common/vars/main.yml

@@ -57,7 +57,7 @@ min_username_length: 4
 file_perm: '0755'
 vault_file_perm: '0644'
 nic_min_length: 3
-input_config_failure_msg: "Please provide all the required parameters in login_vars.yml"
+login_input_config_failure_msg: "Failed. Please provide all the required parameters in login_vars.yml"
 fail_msg_provision_password: "Failed. Incorrect provision_password format provided in login_vars.yml"
 fail_msg_cobbler_password: "Failed. Incorrect cobbler_password format provided in login_vars.yml file"
 fail_msg_idrac_credentials: "Failed. Incorrect idrac_username or idrac_password format provided in login_vars.yml"
@@ -83,7 +83,7 @@ success_msg_directory_manager_password: "directory_manager_password successfully
 fail_msg_directory_manager_password: "Failed. Incorrect format provided for directory_manager_password"
 success_msg_ipa_admin_password: "ipa_admin_password successfully validated"
 fail_msg_ipa_admin_password: "Failed. Incorrect format provided for ipa_admin_password"
-input_config_failure_msg: "Input parameters cannot be empty"
+omnia_input_config_failure_msg: "Failed. Please provide all the required parameters in omnia_config.yml for login_node"
 login_node_required_success_msg: "Login_node_required successfully validated"
 login_node_required_fail_msg: "Failed. login_node_required can be either true or false"
 

+ 2 - 2
control_plane/roles/control_plane_sm/tasks/create_pod.yml

@@ -43,7 +43,7 @@
       replace:
         path: "{{ sm_kube_config_file }}"
         regexp: "          image:.*"
-        replace: "          image: 'localhost/{{sm_docker_image_name}}:{{ sm_docker_image_tag }}'"
+        replace: "          image: 'localhost/{{ sm_docker_image_name }}:{{ sm_docker_image_tag }}'"
       tags: install
 
     - name: Replace cache directory in sm config file
@@ -64,4 +64,4 @@
       command: "kubectl apply -f {{ sm_kube_config_file }}"
       tags: install
 
-  when: "'subnet-manager' not in k8s_pods.stdout"
+  when: "'subnet-manager' not in k8s_pods.stdout"

+ 2 - 1
control_plane/roles/control_plane_sm/tasks/pre_requisites.yml

@@ -45,5 +45,6 @@
   copy:
     src: "{{ opensm_conf_file }}"
     dest: "{{ opensm_conf_file_dest }}"
+    mode: preserve
     force: yes
-  tags: install
+  tags: install

+ 0 - 1
control_plane/roles/control_plane_sm/vars/main.yml

@@ -25,4 +25,3 @@ sm_docker_image_tag: latest
 #Usage: create_pod.yml
 sm_container_name: opensm-container
 sm_kube_config_file: "{{ role_path }}/files/k8s_sm.yml"
-opensm_conf_file: "{{ role_path }}/../../input_params/opensm.conf"

+ 27 - 25
control_plane/roles/provision_idrac/tasks/check_prerequisites.yml

@@ -167,31 +167,33 @@
         idrac_password: "{{ idrac_password }}"
       register: idrac_info
 
-    - name: Set enterprise license status
-      set_fact:
-        enterprise_license: true
-        idrac_license_name: "{{ idrac_info.system_info.License[my_idx1].LicenseDescription }}"
-      with_items: "{{ idrac_info.system_info.License }}"
-      when:
-        - '"iDRAC" in idrac_info.system_info.License[my_idx1].LicenseDescription'
-        - '"Enterprise" in idrac_info.system_info.License[my_idx1].LicenseDescription'
-        - '"License" in idrac_info.system_info.License[my_idx1].LicenseDescription'
-        - '"Healthy" in idrac_info.system_info.License[my_idx1].PrimaryStatus'
-      loop_control:
-        index_var: my_idx1
-
-    - name: Set datacenter license status
-      set_fact:
-        datacenter_license: true
-        idrac_license_name: "{{ idrac_info.system_info.License[my_idx1].LicenseDescription }}"
-      with_items: "{{ idrac_info.system_info.License }}"
-      when:
-        - '"iDRAC" in idrac_info.system_info.License[my_idx2].LicenseDescription'
-        - '"Datacenter" in idrac_info.system_info.License[my_idx2].LicenseDescription'
-        - '"License" in idrac_info.system_info.License[my_idx2].LicenseDescription'
-        - '"Healthy" in idrac_info.system_info.License[my_idx2].PrimaryStatus'
-      loop_control:
-        index_var: my_idx2
+    - block:
+        - name: Set enterprise license status
+          set_fact:
+            enterprise_license: true
+            idrac_license_name: "{{ idrac_info.system_info.License[my_idx1].LicenseDescription }}"
+          with_items: "{{ idrac_info.system_info.License }}"
+          when:
+            - '"iDRAC" in idrac_info.system_info.License[my_idx1].LicenseDescription'
+            - '"Enterprise" in idrac_info.system_info.License[my_idx1].LicenseDescription'
+            - '"License" in idrac_info.system_info.License[my_idx1].LicenseDescription'
+            - '"Healthy" in idrac_info.system_info.License[my_idx1].PrimaryStatus'
+          loop_control:
+            index_var: my_idx1
+
+        - name: Set datacenter license status
+          set_fact:
+            datacenter_license: true
+            idrac_license_name: "{{ idrac_info.system_info.License[my_idx2].LicenseDescription }}"
+          with_items: "{{ idrac_info.system_info.License }}"
+          when:
+            - '"iDRAC" in idrac_info.system_info.License[my_idx2].LicenseDescription'
+            - '"Datacenter" in idrac_info.system_info.License[my_idx2].LicenseDescription'
+            - '"License" in idrac_info.system_info.License[my_idx2].LicenseDescription'
+            - '"Healthy" in idrac_info.system_info.License[my_idx2].PrimaryStatus'
+          loop_control:
+            index_var: my_idx2
+      when: idrac_info.system_info.License is defined
 
     - name: Change provision mode in absence of license
       set_fact:
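The refactor above moves both license checks into a `block:` guarded by a single `when:`, so neither task dereferences `idrac_info.system_info.License` when iDRAC reports no license data. A minimal sketch of the pattern (hypothetical inner task):

```yaml
- block:
    - name: Safely index into the license list
      debug:
        msg: "{{ idrac_info.system_info.License | length }} license entries found"
  # A when: on a block is inherited by every inner task and is
  # evaluated before the task arguments are templated.
  when: idrac_info.system_info.License is defined
```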

+ 0 - 44
control_plane/roles/webui_awx/tasks/check_awx_status.yml

@@ -1,44 +0,0 @@
-# Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
----
-
-#Tasks for verifying if AWX is already installed on the system
-- name: Initialize variables
-  set_fact:
-    awx_status: false
-  tags: install
-
-- name: Check awx_task status on the machine
-  docker_container_info:
-    name: awx_task
-  register: awx_task_result
-  tags: install
-  vars:
-    ansible_python_interpreter: "/usr/bin/python3"
-
-- name: Check awx_web status on the machine
-  docker_container_info:
-    name: awx_web
-  register: awx_web_result
-  tags: install
-  vars:
-    ansible_python_interpreter: "/usr/bin/python3"
-
-- name: Update awx status
-  set_fact:
-    awx_status: true
-  when:
-    - awx_task_result.exists
-    - awx_web_result.exists
-  tags: install

+ 0 - 40
control_plane/roles/webui_awx/tasks/firewall_settings.yml

@@ -1,40 +0,0 @@
-# Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
----
-
-#Tasks for modifying firewall configurations for AWX
-
-- name: Masquerading on public zone
-  firewalld:
-    masquerade: yes
-    state: enabled
-    permanent: 'true'
-    zone: public
-  tags: install
-
-- name: Add HTTP and HTTPS services to firewalld
-  firewalld:
-    service: "{{ item }}"
-    permanent: true
-    state: enabled
-  with_items:
-    - http
-    - https
-  tags: install
-
-- name: Reboot firewalld
-  systemd:
-    name: firewalld
-    state: reloaded
-  tags: install

+ 0 - 34
control_plane/roles/webui_awx/tasks/install_awx_cli.yml

@@ -1,34 +0,0 @@
-# Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
----
-
-# Tasks for installing AWX-CLI
-- name: Add AWX CLI repo
-  block:
-    - name: Get repo
-      get_url:
-        url: "{{ awx_cli_repo }}"
-        dest: "{{ awx_cli_repo_path }}"
-    - name: Disable gpgcheck
-      replace:
-        path: "{{ awx_cli_repo_path }}"
-        regexp: 'gpgcheck=1'
-        replace: 'gpgcheck=0'
-  tags: install
-
-- name: Install AWX-CLI
-  package:
-    name: ansible-tower-cli
-    state: present
-  tags: install

+ 0 - 85
control_plane/roles/webui_awx/tasks/ui_accessibility.yml

@@ -1,85 +0,0 @@
-# Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
----
-
-# Check accessibility of AWX-UI
-- name: Re-install if in migrating state
-  block:
-    - name: Wait for AWX UI to be up
-      uri:
-        url: "{{ awx_ip }}"
-        status_code: "{{ return_status }}"
-        return_content: yes
-      register: register_error
-      until: awx_ui_msg in register_error.content
-      retries: 20
-      delay: 15
-      changed_when: no
-      no_log: True
-
-  rescue:
-    - name: Starting rescue
-      debug:
-        msg: "Attempting to re-install AWX"
-
-    - name: Remove old containers
-      docker_container:
-        name: "{{ item }}"
-        state: absent
-      loop:
-        - awx_task
-        - awx_web
-
-    - name: Restart docker
-      service:
-        name: docker
-        state: restarted
-
-    - name: Re-install AWX
-      block:
-        - name: Run AWX install.yml file
-          command: ansible-playbook -i inventory install.yml --extra-vars "admin_password={{ admin_password }}"
-          args:
-            chdir: "{{ awx_installer_path }}"
-          register: awx_installation
-          no_log: True
-
-      rescue:
-        - name: Check AWX status on machine
-          include_tasks: check_awx_status.yml
-
-        - name: Fail if container are not running
-          fail:
-            msg: "AWX installation failed with error msg:
-             {{ awx_installation.stdout | regex_replace(admin_password) }}."
-          when: not awx_status
-
-    - name: Check if AWX UI is up
-      block:
-        - name: Wait for AWX UI to be up
-          uri:
-            url: "{{ awx_ip }}"
-            status_code: "{{ return_status }}"
-            return_content: yes
-          register: register_error
-          until: awx_ui_msg in register_error.content
-          retries: 240
-          delay: 15
-          changed_when: no
-          no_log: True
-      rescue:
-        - name: Message
-          fail:
-            msg: "{{ register_error | regex_replace(awx_user) | regex_replace(admin_password) }}"
-  tags: install

+ 43 - 6
docs/FAQ.md

@@ -112,7 +112,7 @@ Resolution:
 It is recommended that the ansible-vault view or edit commands are used and not the ansible-vault decrypt or encrypt commands.
 
 ## What to do if the LC is not ready?
-* Verify the state of the LC in all servers by running `racadm getremoteservicesstatus`
+* Verify that the LC is in a ready state for all servers: `racadm getremoteservicesstatus`
 * Launch iDRAC template.
 
 ## What to do if the network CIDR entry of iDRAC IP in /etc/exports file is missing?
@@ -127,15 +127,52 @@ It is recommended that the ansible-vault view or edit commands are used and not
 ## Is Disabling 2FA supported by Omnia?
 * Omnia does not disable 2FA; if required, 2FA must be disabled manually.
 
-## Is provisioning server using BOSS controller supported by Omnia?
-* Provisioning server using BOSS controller is not supported by Omnia. It will be supported in upcoming releases.
-
 ## The provisioning of PowerEdge servers failed. How do I clean up before starting over?
 1. Delete the respective iDRAC IP addresses from the *provisioned_idrac_inventory* on the AWX UI or delete the *provisioned_idrac_inventory* to delete the iDRAC IP addresses of all the servers in the cluster.
 2. Launch the iDRAC template from the AWX UI.
 
-## What to do when WARNING message regarding older firmware displayed during idrac_template execution and idrac_template task failed?
+## What to do if PowerVault throws the error: `Error: The specified disk is not available. - Unavailable disk (0.x) in disk range '0.x-x'`?
+1. Verify that the disk in question is not part of any pool: `show disks`
+2. If the disk is part of a pool, remove it and try again.
+
+## Why does PowerVault throw the error: `You cannot create a linear disk group when a virtual disk group exists on the system.`?
+At any given time, only one type of disk group can exist on the system; all disk groups have to be exclusively linear or exclusively virtual. To fix the issue, either delete the existing disk group or change the type of pool you are creating.
+
+## Is provisioning a server using the BOSS controller supported by Omnia?
+* Provisioning a server using the BOSS controller is not currently supported by Omnia; support is planned for an upcoming release.
+
+
+## What to do when iDRAC template execution throws a warning regarding older firmware versions?
 Potential Cause: Older firmware version in PowerEdge servers. Omnia supports only iDRAC 8 based Dell EMC PowerEdge Servers with firmware versions 2.75.75.75 and above and iDRAC 9 based Dell EMC PowerEdge Servers with Firmware versions 4.40.40.00 and above.
 
-1. Update idrac firmware version in PowerEdge servers manually to the supported version.
+1. Update iDRAC firmware version in PowerEdge servers manually to the supported version.
 2. Re-run idrac_template.
+
+## What steps have to be taken to re-run control_plane.yml after a Kubernetes reset?
+1. Delete the folder: `/var/nfs_awx`
+2. Delete the file:  `/<project name>/control_plane/roles/webui_awx/files/.tower_cli.cfg`
+
+Once complete, it's safe to re-run control_plane.yml.
+
+## Why does the Initialize Kubeadm task fail with 'nnode.Registration.name: Invalid value: \"<Host name>\"'?
+
+Potential Cause: The control_plane playbook does not support hostnames containing an underscore, such as 'mgmt_station'.
+
+As defined in RFC 952 (and refined by RFC 1123), the only legal characters are the following:
+1. Alphanumeric (a-z and 0-9): Both uppercase and lowercase letters are acceptable, and the hostname is case insensitive. In other words, dvader.empire.gov is identical to DVADER.EMPIRE.GOV and Dvader.Empire.Gov.
+
+2. Hyphen (-): Neither the first nor the last character in a hostname field should be a hyphen.
+
+3. Period (.): The period should be used only to delimit fields in a hostname (e.g., dvader.empire.gov).
+
+## What to do when JupyterHub pods are in 'ImagePullBackOff' or 'ErrImagePull' status after executing jupyterhub.yml?
+Potential Cause: Your Docker pull limit has been exceeded. For more information, click [here](https://www.docker.com/increase-rate-limits)
+1. Delete the JupyterHub deployment by executing the following command on the manager node: `helm delete jupyterhub -n jupyterhub`
+2. Re-execute jupyterhub.yml after 8-9 hours.
+
+## What to do when Kubeflow pods are in 'ImagePullBackOff' or 'ErrImagePull' status after executing kubeflow.yml?
+Potential Cause: Your Docker pull limit has been exceeded. For more information, click [here](https://www.docker.com/increase-rate-limits)
+1. Delete the Kubeflow deployment by executing the following command on the manager node: `kfctl delete -V -f /root/k8s/omnia-kubeflow/kfctl_k8s_istio.v1.0.2.yaml`
+2. Re-execute kubeflow.yml after 8-9 hours.
+
+
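The two cleanup steps in the Kubernetes-reset FAQ entry above can be scripted. A minimal sketch as an Ansible play, keeping the FAQ's `<project name>` placeholder:

```yaml
- name: Clean up AWX state before re-running control_plane.yml
  hosts: localhost
  tasks:
    - name: Delete the AWX NFS folder and the cached tower CLI config
      file:
        path: "{{ item }}"
        state: absent
      loop:
        - /var/nfs_awx
        - "/<project name>/control_plane/roles/webui_awx/files/.tower_cli.cfg"
```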

+ 5 - 4
docs/INSTALL_OMNIA.md

@@ -9,7 +9,7 @@ To install the Omnia control plane and manage workloads on your cluster using th
 * If you have configured the `omnia_config.yml` file to enable the login node, the login node must be part of the cluster. 
 * All nodes must be connected to the network and must have access to the Internet.
 * Set the hostnames of all the nodes in the cluster.
-	* If the login node is enabled, then set the hostnames in the format: __hostname.domainname__. For example, "manager.omnia.test" is a valid hostname.
+	* If the login node is enabled, then set the hostnames in the format: __hostname.domainname__. For example, "manager.omnia.test" is a valid hostname. **Do not** use underscores ( _ ) in the host names.
 	* Include the hostnames under /etc/hosts in the format: </br>*ipaddress hostname.domainname*. For example, "192.168.12.1 manager.example.com" is a valid entry.
 * SSH Keys for root are installed on all nodes to allow for password-less SSH.
 * The user should have root privileges to perform installations and configurations.
@@ -112,9 +112,8 @@ __Note:__
 * The default value of Kubernetes Pod Network CIDR is 10.244.0.0/16. If 10.244.0.0/16 is already in use within your network, select a different Pod Network CIDR. For more information, see __https://docs.projectcalico.org/getting-started/kubernetes/quickstart__.
 
 **NOTE**: If you want to view or edit the `omnia_config.yml` file, run the following command:  
-					
 - `ansible-vault view omnia_config.yml --vault-password-file .omnia_vault_key` -- To view the file. 
-- `ansible-vault edit omnia_config.yml --vault-password-file .omnia_vault_key` -- To edit the file.  
+- `ansible-vault edit omnia_config.yml --vault-password-file .omnia_vault_key` -- To edit the file.
 
 **NOTE**: It is suggested that you use the ansible-vault view or edit commands and that you do not use the ansible-vault decrypt or encrypt commands. If you have used the ansible-vault decrypt or encrypt commands, provide 644 permission to `omnia_config.yml`.  
 
@@ -151,11 +150,13 @@ The following __kubernetes__ roles are provided by Omnia when __omnia.yml__ file
 	- Kubernetes services are deployed such as Kubernetes Dashboard, Prometheus, MetalLB and NFS client provisioner
 
 __Note:__ 
+
+* Whenever k8s_version, k8s_cni, or k8s_pod_network_cidr needs to be modified after the HPC cluster is set up, the OS on the manager and compute nodes must be re-flashed before executing omnia.yml again.
 * After Kubernetes is installed and configured, a few Kubernetes and Calico/Flannel-related ports are opened in the manager and compute nodes. This is required for Kubernetes Pod-to-Pod and Pod-to-Service communications. Calico/Flannel provides a full networking stack for Kubernetes pods.
 * If Kubernetes Pods are unable to communicate with the servers (i.e., unable to access the Internet) when the DNS servers are not responding, the Kubernetes Pod Network CIDR may be overlapping with the host network, which causes the DNS issue. To resolve this issue:
 	1. Disable firewalld.service.
 	2. If the issue persists, then perform the following actions:  
-		a. In your Kubernetes cluster, run `kubeadm reset -f` on the nodes.  
+		a. Format the OS on manager and compute nodes.  
 		b. In the management station, edit the *omnia_config.yml* file to change the Kubernetes Pod Network CIDR or CNI value. Suggested IP range is 192.168.0.0/16 and ensure you provide an IP which is not in use in your host network.  
 		c. Execute `omnia.yml` and skip slurm using `--skip-tags slurm`.
 

File diff suppressed because it is too large
+ 33 - 29
docs/INSTALL_OMNIA_CONTROL_PLANE.md


+ 22 - 10
docs/README.md

@@ -51,7 +51,7 @@ Requirements  |   Version
 ----------------------------------  |   -------
 OS pre-installed on the management station  |  CentOS 8.4
 OS deployed by Omnia on bare-metal Dell EMC PowerEdge Servers | CentOS 7.9 2009 Minimal Edition
-Cobbler  |  2.8.5
+Cobbler  |  3.2.1
 Ansible AWX  |  19.1.0
 Slurm Workload Manager  |  20.11.2
 Kubernetes on the management station  |  1.21.0
@@ -64,7 +64,7 @@ The following table lists the supported devices managed by Omnia. Other devices
 
 Device type	|	Supported models	
 -----------	|	-------	
-Dell EMC PowerEdge Servers	|	PowerEdge C4140, C6420, C6520, R240, R340, R440, R540, R640, R650, R740, R740xd, R740xd2, R750, R750xa, R840, R940, R940xa
+Dell EMC PowerEdge Servers	|	PowerEdge C4140, C6420, R240, R340, R440, R540, R640, R740, R740xd, R740xd2, R840, R940, R940xa
 Dell EMC PowerVault Storage	|	PowerVault ME4084, ME4024, and ME4012 Storage Arrays
 Dell EMC Networking Switches	|	PowerSwitch S3048-ON and PowerSwitch S5232F-ON
 Mellanox InfiniBand Switches	|	NVIDIA MQM8700-HS2F Quantum HDR InfiniBand Switch 40 QSFP56
@@ -153,16 +153,17 @@ stp_rpvst_default_behaviour	|	boolean: false, true	|	Configures RPVST default be
 # Known issues  
 * **Issue**: Hosts are not displayed on the AWX UI.  
 	**Resolution**:  
-	* Verify if the *provisioned_hosts.yml* file is present in the *omnia/appliance/roles/inventory/files* folder.
-	* Verify whether the hosts are listed in the *provisioned_hosts.yml* file.  
-		* If hosts are not listed, then servers are not PXE booted yet.
-		* If hosts are listed, then an IP address has been assigned to them by DHCP. However, hosts are not displayed on the AWX UI as the PXE boot is still in process or is not initiated.
-	* Check for the reachable and unreachable hosts using the **provisioned_report.yml** tool present in the *omnia/appliance/tools* folder. To run provisioned_report.yml, in the omnia/appliance directory, run `playbook -i roles/inventory/files/provisioned_hosts.yml tools/provisioned_report.yml`.
+	* Verify if the provisioned_hosts.yml file is present in the omnia/control_plane/roles/collect_node_info/files/ folder.
+	* Verify whether the hosts are listed in the provisioned_hosts.yml file.
+		* If hosts are not listed, then the servers have not been PXE booted yet.
+		* If hosts are listed, then an IP address has been assigned to them by DHCP. However, hosts are not displayed on the AWX UI as the PXE boot is still in process or is not initiated.
+	* Check for the reachable and unreachable hosts using the provision_report.yml tool present in the omnia/control_plane/tools folder. To run provision_report.yml, in the omnia/control_plane/ directory, run `ansible-playbook -i roles/collect_node_info/files/provisioned_hosts.yml tools/provision_report.yml`.
 
 * **Issue**: There are **ImagePullBack** or **ErrPullImage** errors in the status of Kubernetes pods.  
 	**Cause**: The errors occur when the Docker pull limit is exceeded.  
 	**Resolution**:
 	* For **omnia.yml** and **control_plane.yml**: Provide the docker username and password for the Docker Hub account in the *omnia_config.yml* file and execute the playbook. 
+	* For the HPC cluster, during omnia.yml execution, a Kubernetes secret 'dockerregcred' is created in the default namespace and patched to the service account. Users must copy this secret into their respective namespace while deploying custom applications and reference it as imagePullSecrets in the pod spec to avoid ErrImagePull (see the example manifest after this file's diff). [Click here for more info](https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/)
 	* **Note**: If the playbook is already executed and the pods are in __ImagePullBack__ error, then run `kubeadm reset -f` in all the nodes before re-executing the playbook with the docker credentials.
 
 * **Issue**: The `kubectl` command stops working after a reboot and displays the following error message: *The connection to the server head_node_ip:port was refused - did you specify the right host or port?*  
@@ -172,15 +173,26 @@ stp_rpvst_default_behaviour	|	boolean: false, true	|	Configures RPVST default be
 	* `systemctl restart kubelet`  
 	
 * **Issue**: If control_plane.yml fails at the webui_awx role, then the previous IP address and password are not cleared when control_plane.yml is re-run.   
-	**Resolution**: In the *webui_awx/files* directory, delete the *.tower_cli.cfg* and *.tower_vault_key* files, and then re-run `control_plane.yml`.  
+	**Resolution**: In the *webui_awx/files* directory, delete the *.tower_cli.cfg* and *.tower_vault_key* files, and then re-run `control_plane.yml`.
 
 * **Issue**: The FreeIPA server and client installation fails.  
 	**Cause**: The hostnames of the manager and login nodes are not set in the correct format.  
 	**Resolution**: If you have enabled the option to install the login node in the cluster, set the hostnames of the nodes in the format: *hostname.domainname*. For example, *manager.omnia.test* is a valid hostname for the login node. **Note**: To find the cause for the failure of the FreeIPA server and client installation, see *ipaserver-install.log* in the manager node or */var/log/ipaclient-install.log* in the login node.  
 	
-* **Issue**: The inventoy details are not updated in AWX when device or host credentials are invalid.  
-	**Resolution**: Provide valid credentials of the devices and hosts in the cluster.  
+* **Issue**: The inventory details are not updated in AWX when device or host credentials are invalid.  
+	**Resolution**: Provide valid credentials of the devices and hosts in the cluster. 
+
+* **Issue**: The Host list is empty after executing the control_plane playbook.  
+	**Resolution**: Ensure that all devices used are in DHCP enabled mode.
+	
+* **Issue**: The task 'Install Packages' fails on the NFS node with the message: `Failure in talking to yum: Cannot find a valid baseurl for repo: base/7/x86_64.`  
+	**Cause**: There are connections missing on the NFS node.  
+	**Resolution**: Ensure that there are three NICs being used on the NFS node:
+	1. For provisioning the OS
+	2. For connecting to the internet (Management purposes)
+	3. For connecting to PowerVault (Data Connection)  
 	
+
 # [Frequently asked questions](FAQ.md)
 
 # Limitations
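For the 'dockerregcred' note above, custom workloads reference the copied secret through `imagePullSecrets` in the pod spec. A minimal sketch with hypothetical pod name, namespace, and image:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: custom-app          # hypothetical workload
  namespace: my-namespace   # namespace into which dockerregcred was copied
spec:
  imagePullSecrets:
    - name: dockerregcred   # secret created during omnia.yml execution
  containers:
    - name: app
      image: example/custom-app:latest   # hypothetical image
```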

+ 5 - 3
docs/control_plane/device_templates/CONFIGURE_INFINIBAND_SWITCHES.md

@@ -3,10 +3,12 @@ In your HPC cluster, connect the Mellanox InfiniBand switches using the Fat-Tree
 
 Omnia uses the server-based Subnet Manager (SM). SM runs as a Kubernetes pod on the management station. To enable the SM, Omnia configures the required parameters in the `opensm.conf` file. Based on the requirement, the parameters can be edited.  
 
-**NOTE**: Install the InfiniBand hardware drivers by running the command: `yum groupinstall "Infiniband Support" -y`.   
+**NOTE**: Install the InfiniBand hardware drivers by running the command: `yum groupinstall "Infiniband Support" -y`.  
+
 ## Edit the "input_params" file 
-Under the `control_plane/input_params` directory, edit the following files:
-1. `base_vars.yml` file
+Under the `control_plane/input_params` directory, edit the following files:  
+
+1. `base_vars.yml` file    
 
 	File name	|	Variables	|	Default, choices	|	Description
 	-----------	|	-------	|	----------------	|	-----------------

File diff suppressed because it is too large
+ 2 - 2
docs/control_plane/device_templates/CONFIGURE_POWERSWITCHES.md


+ 3 - 2
docs/control_plane/device_templates/CONFIGURE_POWERVAULT_STORAGE.md

@@ -29,7 +29,8 @@ Under the `control_plane/input_params` directory, edit the following files:
 	powervault_me4_disk_partition_size [Required] |	<ul><li>**5**</li><li>Any value between 5-99</li></ul> |	Enter the partition size which would be used as an NFS share.  
 	powervault_me4_volume_size [Required] |	<ul><li>**100GB**</li><li>Custom value</li></ul> |	Enter the volume size in the format: *SizeTB*, *SizeGB*, *SizeMB*, or *SizeB*.  
 	powervault_me4_pool [Required] |	<ul><li>**a** (or A)</li><li>b (or B)</li></ul> |	Enter the pool for the volume.  
-	powervault_me4_server_nic [Required] |	<ul><li>**em1**</li></ul> |	Enter the NIC of the server to which the PowerVault Storage is connected.    
+	powervault_me4_server_nic [Required] |	<ul><li>**em1**</li></ul> |	Enter the NIC of the server to which the PowerVault Storage is connected.
+	powervault_me4_pool_type [Required] |	<ul><li>**Virtual**</li><li>Linear </li></ul> |	Select the type of pool to be deployed on PowerVault. Ensure that all pools on the device are exclusively virtual or linear.
 	
 ## Configuring PowerVault Storage
 
@@ -38,4 +39,4 @@ Under the `control_plane/input_params` directory, edit the following files:
 2. Copy the Cluster-IP address of the awx-ui. 
 3. To retrieve the AWX UI password, run `kubectl get secret awx-admin-password -n awx -o jsonpath="{.data.password}" | base64 --decode`.
 4. Open the default web browser on the management station and enter `http://<IP>:8052`, where IP is the awx-ui IP address and 8052 is the awx-ui port number. Log in to the AWX UI using the username as `admin` and the retrieved password.  
-5. Under __RESOURCES__ -> __Templates__, launch the **powervault_me4_template**.
+5. Under __RESOURCES__ -> __Templates__, launch the **powervault_me4_template**.

+ 4 - 3
docs/control_plane/device_templates/PROVISION_SERVERS.md

@@ -31,7 +31,7 @@ Based on the inputs provided in the `login_vars.yml` and `base_vars.yml` files,
 2. Copy the Cluster-IP address of the awx-ui. 
 3. To retrieve the AWX UI password, run `kubectl get secret awx-admin-password -n awx -o jsonpath="{.data.password}" | base64 --decode`.
 4. Open the default web browser on the management station and enter `http://<IP>:8052`, where IP is the awx-ui IP address and 8052 is the awx-ui port number. Log in to the AWX UI using the username as `admin` and the retrieved password.  
-5. Under __RESOURCES__ -> __Templates__, launch the **idrac_template**.  
+5. Under __RESOURCES__ -> __Templates__, launch the **idrac_template**.
 
 Omnia role used to provision custom ISO on PowerEdge Servers using iDRAC: *provision_idrac*  
 
@@ -96,9 +96,10 @@ Omnia provides the following options to enhance security on the provisioned Powe
 	idrac_2fa.yml	|	dns_domain_name</br> [Required]	|		|	DNS domain name to be set for iDRAC. 
 	<br>	|	ipv4_static_dns1, ipv4_static_dns2</br> [Required] 	|		|	DNS1 and DNS2 static IPv4 addresses.
 	<br>	|	smtp_server_ip</br> [Required]	|		|	Server IP address used for SMTP.
-	<br>	|	smtp_username</br> [Required]	|		|	Username for SMTP.
-	<br>	|	smtp_password</br> [Required]	|		|	Password for SMTP.
 	<br>	|	use_email_address_2fa</br> [Required]	|		|	Email address used for enabling 2FA. After 2FA is enabled, an authentication code is sent to the provided email address. 
+	<br>	| smtp_authentication [Required]	| <ul> <li>__Disabled__</li> <li>Enabled </li> </ul> | Enable SMTP authentication 
+	<br>	|	smtp_username</br> [Optional]	|		|	Username for SMTP.
+	<br>	|	smtp_password</br> [Optional]	|		|	Password for SMTP.
 
 	**NOTE**: 2FA will be enabled on the iDRAC only if SMTP server details are valid and a test email notification is working using SMTP.  
 * **LDAP Directory Services**: To enable or disable the LDAP directory services, set the *ldap_directory_services* variable to "enabled" in the `idrac_vars.yml` file.  

File diff suppressed because it is too large
+ 4 - 3
docs/control_plane/input_parameters/INFINIBAND_SWITCHES.md


+ 1 - 1
docs/control_plane/input_parameters/POWERSWITCHES.md

@@ -12,7 +12,7 @@ Under the `control_plane/input_params` directory, edit the following files:
 	a. `ethernet_switch_username`- username for Ethernet switches.  
 	**NOTE**: The username must not contain the following characters: -, \\, "", and \'  
 	b. `ethernet_switch_password`- password for Ethernet switches.   
-	**NOTE**: Minimum length of the password must be eight characters and the maximum limit is 30 characters. Do not use these characters while entering a password: -, \\, "", and \'    
+	**NOTE**: Minimum length of the password must be eight characters and the maximum limit is 30 characters. Do not use these characters while entering a password: -, \\, "", and \'  
 
 3. `ethernet_tor_vars.yml` or `ethernet_vars.yml` file: If **ethernet_switch_support** is set to "true" in the *base_vars.yml* file, then update the following variables.  
 

+ 3 - 2
docs/control_plane/input_parameters/POWERVAULT_STORAGE.md

@@ -28,8 +28,9 @@ Under the `control_plane/input_params` directory, edit the following files:
 	powervault_me4_disk_group_name |	<ul><li>**omnia**</li><li>User-defined name</li></ul> |	Enter the group name of the disk.
 	powervault_me4_disk_partition_size [Required] |	<ul><li>**5**</li><li>Any value between 5-99</li></ul> |	Enter the partition size which would be used as an NFS share.  
 	powervault_me4_volume_size [Required] |	<ul><li>**100GB**</li><li>Custom value</li></ul> |	Enter the volume size in the format: *SizeTB*, *SizeGB*, *SizeMB*, or *SizeB*.  
-	powervault_me4_pool [Required] |	<ul><li>**a** (or A)</li><li>b (or B)</li></ul> |	Enter the pool for the volume.  
-	powervault_me4_server_nic [Required] |	<ul><li>**em1**</li></ul> |	Enter the NIC of the server to which the PowerVault Storage is connected.   
+	powervault_me4_pool [Required] |	<ul><li>**a** (or A)</li><li>b (or B)</li></ul> |	Enter the pool for the volume.
+	powervault_me4_pool_type [Required] |	<ul><li>Virtual</li><li>**Linear** </li></ul> |	Select the type of pool to be deployed on PowerVault. Ensure that all pools on the device are exclusively virtual or linear.
+		
 	
 ## Deploy Omnia Control Plane
 Before you configure the PowerVault Storage devices, you must complete the deployment of Omnia control plane. Go to Step 8 in the [Steps to install the Omnia Control Plane](../../INSTALL_OMNIA_CONTROL_PLANE.md#steps-to-deploy-the-omnia-control-plane) file to run the `ansible-playbook control_plane.yml` file.  

+ 59 - 0
docs/login_node/login_user_creation.md

@@ -0,0 +1,59 @@
+# How to create a user using FreeIPA
+
+## Prerequisites:
+1. Make sure the FreeIPA server and client are installed.
+2. The admin user has to be initialized using Kerberos authentication:
+
+   `kinit admin` (when prompted, provide the password)
+
+
+## Adding the new user
+1. SSH to the manager node:
+
+`ssh xxxxx@192.168.1.5`
+
+2. Use the command below to create a user:
+
+`ipa user-add '<new username>' --first='<first name>'
+    --last='<last name>' --homedir='<home directory path (optional)>'
+    --random`
+
+3. The output displays the random password that was set:
+```
+ "----------------------",
+            "Added user \"omniauser\"",
+            "----------------------",
+            "  User login: omniauser",
+            "  First name: omnia",
+            "  Last name: user",
+            "  Full name: omnia user",
+            "  Display name: omnia user",
+            "  Initials: ou",
+            "  Home directory: /home/omniauser",
+            "  GECOS: omnia user",
+            "  Login shell: /bin/sh",
+            "  Principal name: omniauser@MYIPA.TEST",
+            "  Principal alias: omniauser@MYIPA.TEST",
+            "  User password expiration: 20210804180355Z",
+            "  Email address: omniauser@myipa.test",
+            "  Random password: 0Qr:Ir;:q_vFKP+*b|0)0D",
+            "  UID: 893800014",
+            "  GID: 893800014",
+            "  Password: True",
+            "  Member of groups: ipausers",
+            "  Kerberos keys available: True"			
+			
+```
+			
+4. The random password displayed can be used to log in to the login node as the newly created user:
+
+`ssh omniauser@192.168.1.6`
+
+5. Change the password on first login, and then log in with the new password.
+
+6. To allow the newly created user to execute Slurm jobs, run the command:
+
+   `usermod -a -G slurm 'new_login_user'`
+7. The user now has the permissions required to execute Slurm jobs. Jobs can be executed:
+
+`srun --nodes 1 --ntasks-per-node 1 --partition normal hostname`
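Step 6 above can also be done idempotently with Ansible's `user` module instead of raw `usermod`. A sketch using the username from the example output:

```yaml
- name: Add the new user to the slurm group
  user:
    name: omniauser
    groups: slurm
    append: yes   # equivalent to usermod -a -G slurm omniauser
```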

+ 6 - 8
control_plane/roles/webui_awx/tasks/clone_awx.yml

@@ -1,4 +1,4 @@
-# Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,10 +13,8 @@
 # limitations under the License.
 ---
 
-- name: Clone AWX repo
-  git:
-    repo: "{{ awx_git_repo }}"
-    dest: "{{ awx_repo_path }}"
-    force: yes
-    version: 15.0.0
-  tags: install
+- name: Create user and assign slurm permission
+  hosts: manager
+  gather_facts: false
+  roles:
+    - login_user

+ 30 - 0
examples/login_node_example/roles/login_user/tasks/main.yml

@@ -0,0 +1,30 @@
+# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Create a user
+  command: >-
+    ipa user-add '{{ new_login_user }}' --first='{{ user_first_name }}'
+    --last='{{ user_last_name }}' --homedir='{{ user_home_dir }}' 
+    --random
+  register: output
+  changed_when: false
+
+- name: Debug
+  debug:
+    msg: "{{ output }}"
+
+- name: Add the user to slurm group
+  command: usermod -a -G slurm '{{ new_login_user }}'
+  changed_when: false

+ 28 - 0
examples/login_node_example/roles/login_user/vars/main.yml

@@ -0,0 +1,28 @@
+# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+# New user to be created on the login node
+# Make sure the username is in lowercase
+# For more details, check the FreeIPA website
+new_login_user: "omniauser"
+
+# User home directory path
+user_home_dir: "/home/omniauser"
+
+# User's first name
+user_first_name: "omnia"
+
+# User's last name
+user_last_name: "user"

+ 4 - 4
omnia_config.yml

@@ -1,4 +1,4 @@
-# Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -57,11 +57,11 @@ login_node_required: true
 
 # This variable is used to accept the domain name the user intends to configure
 # Eg: ipa.test
-domain_name: ""
+domain_name: "omnia.test"
 
 # A Kerberos realm is the domain over which a Kerberos authentication server has the authority to authenticate a user, host or service. 
 # A realm name is often, but not always the upper case version of the name of the DNS domain over which it presides
-realm_name: ""
+realm_name: "OMNIA.TEST"
 
 # The directory server operations require an administrative user. 
 # This user is referred to as the Directory Manager and has full access to the Directory for system management tasks 
@@ -72,4 +72,4 @@ directory_manager_password: ""
 
 # The IPA server requires an administrative user, named 'admin'. 
 # This user is a regular system account used for IPA server administration
-ipa_admin_password: ""
+ipa_admin_password: ""

+ 11 - 37
roles/cluster_validation/tasks/fetch_password.yml

@@ -1,4 +1,4 @@
-#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -12,6 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 ---
+
 - name: Check if omnia_vault_key exists
   stat:
     path: "{{ role_path }}/../../{{ config_vaultname }}"
@@ -72,12 +73,12 @@
 - name: Assert mariadb_password
   assert:
     that:
-        - mariadb_password | length > min_length | int - 1
-        - mariadb_password | length < max_length | int + 1
-        - '"-" not in mariadb_password '
-        - '"\\" not in mariadb_password '
-        - '"\"" not in mariadb_password '
-        - " \"'\" not in mariadb_password "
+      - mariadb_password | length > min_length | int - 1
+      - mariadb_password | length < max_length | int + 1
+      - '"-" not in mariadb_password '
+      - '"\\" not in mariadb_password '
+      - '"\"" not in mariadb_password '
+      - " \"'\" not in mariadb_password "
     success_msg: "{{ success_msg_mariadb_password }}"
     fail_msg: "{{ fail_msg_mariadb_password }}"
 
@@ -89,7 +90,8 @@
 
 - name: Assert kubernetes cni
   assert:
-    that: "('calico' in k8s_cni) or ('flannel' in k8s_cni)"
+    that:
+      - "('calico' in k8s_cni) or ('flannel' in k8s_cni)"
     success_msg: "{{ success_msg_k8s_cni }}"
     fail_msg: "{{ fail_msg_k8s_cni }}"
 
@@ -112,38 +114,10 @@
     ansible_conf_file_path: "{{ ansible_config_file_path }}"
   no_log: True
 
-- name: Check whether ansible config file exists
-  stat:
-    path: "{{ ansible_conf_file_path }}/ansible.cfg"
-  register: ansible_conf_exists
-
-- name: Create the directory if it does not exist
-  file:
-    path: "{{ ansible_conf_file_path }}"
-    state: directory
-    mode: "{{ file_perm }}"
-  when: not ansible_conf_exists.stat.exists
-
-- name: Create ansible config file if it does not exist
-  copy:
-    dest: "{{ ansible_conf_file_path }}/ansible.cfg"
-    mode: "{{ file_perm }}"
-    content: |
-      [defaults]
-      log_path = /var/log/omnia.log
-  when: not ansible_conf_exists.stat.exists
-
-- name: Set omnia.log file
-  replace:
-    path: "{{ ansible_conf_file_path }}/ansible.cfg"
-    regexp: '#log_path = /var/log/ansible.log'
-    replace: 'log_path = /var/log/omnia.log'
-  when: ansible_conf_exists.stat.exists
-
 - name: Verify the value of login_node_required
   assert:
     that:
-      - 'login_node_required | type_debug == "bool"'
+      - login_node_required == true or login_node_required == false
     success_msg: "{{ login_node_required_success_msg }}"
     fail_msg: "{{ login_node_required_fail_msg }}"
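The rewritten assertion spells out the two boolean literals. Assuming a Jinja2 version that ships the `boolean` test, an equivalent check would be:

```yaml
- name: Verify the value of login_node_required
  assert:
    that:
      - login_node_required is boolean   # true only for actual bool values
    success_msg: "{{ login_node_required_success_msg }}"
    fail_msg: "{{ login_node_required_fail_msg }}"
```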
 

+ 2 - 2
roles/k8s_common/vars/main.yml

@@ -1,4 +1,4 @@
-#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -24,4 +24,4 @@ k8s_conf_dest: /etc/sysctl.d/
 
 k8s_repo_file_mode: 0644
 
-k8s_conf_file_mode: 0644
+k8s_conf_file_mode: 0644

+ 2 - 2
roles/k8s_manager/tasks/main.yml

@@ -1,4 +1,4 @@
-#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -36,4 +36,4 @@
   register: install_helm
   until: install_helm is not failed
   retries: 20
-  tags: manager
+  tags: manager

+ 3 - 5
roles/login_server/tasks/install_ipa_server.yml

@@ -32,10 +32,8 @@
 
 - name: Install ipa server
   command: >-
-    ipa-server-install -n '{{ hostvars['127.0.0.1']['domain_name'] }}' --hostname='{{ server_hostname }}'
-    -a '{{ hostvars['127.0.0.1']['ipa_admin_password'] }}'
-    -p '{{ hostvars['127.0.0.1']['directory_manager_password'] }}' -r '{{ hostvars['127.0.0.1']['realm_name'] }}'
-    --setup-dns --auto-forwarders --auto-reverse -U
+    ipa-server-install -n '{{ hostvars['127.0.0.1']['domain_name'] }}' --hostname='{{ server_hostname }}' -a '{{ hostvars['127.0.0.1']['ipa_admin_password'] }}'
+    -p '{{ hostvars['127.0.0.1']['directory_manager_password'] }}' -r '{{ hostvars['127.0.0.1']['realm_name'] }}' --setup-dns --auto-forwarders --auto-reverse -U
   changed_when: true
   no_log: true
 
@@ -49,4 +47,4 @@
     src: "{{ temp_resolv_conf_path }}"
     dest: "{{ resolv_conf_path }}"
     mode: "{{ file_mode }}"
-    remote_src: yes
+    remote_src: yes

+ 5 - 5
roles/slurm_exporter/tasks/main.yml

@@ -16,8 +16,8 @@
 - name: Verify if slurm-exporter is already installed
   command: ls /usr/bin/prometheus-slurm-exporter
   register: slurm_exporter_status
-  changed_when: False
-  ignore_errors: yes
+  changed_when: false
+  failed_when: false
 
 - name: Install slurm exporter
   include_tasks: install_slurm_exporter.yml
@@ -29,11 +29,11 @@
 - name: Verify if kubernetes is already installed
   command: ls /usr/bin/kubectl
   register: k8s_installation_status
-  changed_when: False
-  ignore_errors: yes
+  changed_when: false
+  failed_when: false
 
 - name: Install prometheus on host
   include_tasks: install_prometheus.yml
   when:
     - "'kubernetes' in ansible_skip_tags"
-    - "'No such file' in k8s_installation_status.stderr"
+    - "'No such file' in k8s_installation_status.stderr"