Browse Source

Merge branch 'devel' into devel

Shubhangi-dell 3 years ago
parent
commit
1932df4d18

+ 14 - 2
.all-contributorsrc

@@ -60,7 +60,9 @@
       "contributions": [
         "ideas",
         "doc",
-        "code"
+        "code",
+        "review",
+        "maintenance"
       ]
     },
     {
@@ -139,7 +141,8 @@
       "profile": "https://github.com/Shubhangi-dell",
       "contributions": [
         "code",
-        "maintenance"
+        "maintenance",
+        "bug"
       ]
     },
     {
@@ -310,6 +313,15 @@
       "contributions": [
         "code"
       ]
+    },
+    {
+      "login": "abhishek-sa1",
+      "name": "abhishek-sa1",
+      "avatar_url": "https://avatars.githubusercontent.com/u/94038029?v=4",
+      "profile": "https://github.com/abhishek-sa1",
+      "contributions": [
+        "code"
+      ]
     }
   ],
   "contributorsPerLine": 7,

File diff suppressed because it is too large
+ 3 - 2
README.md


+ 7 - 4
control_plane/roles/collect_device_info/files/create_inventory.yml

@@ -119,6 +119,12 @@
       when: "'$ANSIBLE_VAULT;' in config_content.stdout"
       run_once: true
 
+    # Install the paramiko Python package with pip3. The task is delegated to
+    # localhost and executed only once for the play.
+    # NOTE(review): the package index is addressed over plain HTTP and marked
+    # with --trusted-host - confirm that this mirror is intentional.
+    - name: Install paramiko
+      command: pip3 install paramiko -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com
+      delegate_to: localhost
+      run_once: true
+      # Never reported as "changed", whether or not pip installed anything.
+      changed_when: false
+
     - name: Initialize variables
       set_fact:
         idrac_inventory_status: false
@@ -179,7 +185,6 @@
             --conf.insecure hosts list --inventory ethernet_inventory -f human --filter "name"
           changed_when: false
           no_log: true
-          run_once: true
           register: ethernet_switches
 
         - name: Assert ethernet switch
@@ -230,7 +235,6 @@
             --conf.insecure hosts list --inventory infiniband_inventory -f human --filter "name"
           changed_when: false
           no_log: true
-          run_once: true
           register: infiniband_switches
 
         - name: Authenticate infiniband Switch
@@ -305,7 +309,6 @@
             --conf.insecure hosts list --inventory powervault_me4_inventory -f human --filter "name"
           changed_when: false
           no_log: true
-          run_once: true
           register: me4_storage
 
         - name: Get auth string for powervault
@@ -366,4 +369,4 @@
         - name: Failed while adding device to powervault_me4_inventory
           debug:
             msg: "{{ powervault_me4_fail_msg }}"
-      when: powervault_me4_status
+      when: powervault_me4_status

+ 4 - 7
control_plane/roles/collect_node_info/files/create_inventory.yml

@@ -100,20 +100,17 @@
       ignore_errors: true
 
     - name: Set the hostname from mapping file
-      hostname:
-        name: "{{ host_name.stdout + '.' + hostvars['localhost']['domain_name'] }}"
+      command: hostnamectl set-hostname "{{ host_name.stdout + '.' + hostvars['localhost']['domain_name'] }}"
       when: ('localhost' in hostname_check.stdout) and (mapping_file_present != "" ) and  (mapping_file | bool == true )
       ignore_errors: true
 
     - name: Set the hostname if hostname not present mapping file
-      hostname:
-        name: "compute{{ inventory_hostname.split('.')[-2] + '-' + inventory_hostname.split('.')[-1] + '.' + hostvars['localhost']['domain_name'] }}"
+      command: hostnamectl set-hostname "compute{{ inventory_hostname.split('.')[-2] + '-' + inventory_hostname.split('.')[-1] + '.' + hostvars['localhost']['domain_name'] }}"
       when: ('localhost' in hostname_check.stdout) and (file_present.rc != 0) and (mapping_file | bool == true )
       ignore_errors: true
 
     - name: Set the system hostname
-      hostname:
-        name: "compute{{ inventory_hostname.split('.')[-2] + '-' + inventory_hostname.split('.')[-1]+'.'+ hostvars['localhost']['domain_name'] }}"
+      command: hostnamectl set-hostname "compute{{ inventory_hostname.split('.')[-2] + '-' + inventory_hostname.split('.')[-1]+'.'+ hostvars['localhost']['domain_name'] }}"
       when: ('localhost' in hostname_check.stdout) and (mapping_file | bool == false)
       ignore_errors: true
 
@@ -222,4 +219,4 @@
     - name: Show unreachable hosts
       debug:
         msg: "{{ host_unreachable_msg }} + {{ groups['ungrouped'] }}"
-      when: "'ungrouped' in groups"
+      when: "'ungrouped' in groups"

+ 7 - 2
control_plane/roles/control_plane_k8s/tasks/k8s_firewalld.yml

@@ -26,7 +26,7 @@
 
 - name: Configure firewalld on master nodes
   firewalld:
-    port: "{{ item }}/tcp"
+    port: "{{ item }}"
     permanent: yes
     state: enabled
   with_items: '{{ k8s_master_ports }}'
@@ -45,6 +45,11 @@
     state: enabled
   with_items: "{{ calico_tcp_ports }}"
 
+# Enable masquerading in the permanent firewalld configuration of the default
+# zone. The firewalld module only reports "changed" when the setting was not
+# already present, unlike a raw `firewall-cmd --add-masquerade --permanent`.
+- name: Masquerade the firewall
+  firewalld:
+    masquerade: yes
+    permanent: yes
+    state: enabled
+  tags: firewalld
+
 - name: Reload firewalld
   command: firewall-cmd --reload
   changed_when: true
@@ -53,4 +58,4 @@
   service:
     name: firewalld
     state: stopped
-    enabled: no
+    enabled: no

+ 9 - 6
control_plane/roles/control_plane_k8s/vars/main.yml

@@ -38,11 +38,14 @@ docker_repo_dest: /etc/yum.repos.d/docker-ce.repo
 
 # Usage: k8s_firewalld.yml
 k8s_master_ports:
-  - 6443
-  - 2379-2380
-  - 10250
-  - 10251
-  - 10252
+  - 6443/tcp
+  - 2379-2380/tcp
+  - 10250/tcp
+  - 10251/tcp
+  - 10252/tcp
+  - 10255/tcp
+  - 8472/udp
+  - 30000-32767/tcp
 calico_udp_ports:
   - 4789
 calico_tcp_ports:
@@ -95,4 +98,4 @@ metallb_run_as_user_port: "65534"
 k8s_dashboard_yaml_url: https://raw.githubusercontent.com/kubernetes/dashboard/v2.2.0/aio/deploy/recommended.yaml
 k8s_dashboard_admin_file_dest: /root/k8s/k8s_dashboard_admin.yaml
 k8s_dashboard_admin_file_mode: 0655
-nfs_path: /var/nfs_awx
+nfs_path: /var/nfs_awx

+ 2 - 1
control_plane/roles/network_ethernet/tasks/pre_requisites.yml

@@ -16,6 +16,7 @@
 - name: Install paramiko
   command: pip3 install paramiko -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com
   delegate_to: localhost
+  run_once: true
   changed_when: false
 
 - name: Check if ethernet_tor_vars.yml exists
@@ -71,4 +72,4 @@
         success_msg: "{{ success_msg_save_config }}"
         fail_msg: "{{ fail_msg_save_config }}"
 
-  when: "'S3' not in model_type and 'S4' not in model_type"
+  when: "'S3' not in model_type and 'S4' not in model_type"

+ 2 - 2
control_plane/roles/provision_cobbler/files/Dockerfile

@@ -51,8 +51,8 @@ RUN dnf install -y python3-sphinx
 RUN pip3 install wheel
 
 #Copy Configuration files
-COPY cobbler_settings /etc/cobbler/settings.yaml
-COPY temp_dhcp.template  /etc/cobbler/dhcp.template
+COPY settings.yaml /etc/cobbler/settings.yaml
+COPY dhcp.template  /etc/cobbler/dhcp.template
 COPY modules.conf  /etc/cobbler/modules.conf
 COPY tftp /etc/xinetd.d/tftp
 COPY .users.digest /etc/cobbler/users.digest

+ 1 - 1
control_plane/roles/provision_cobbler/tasks/provision_password.yml

@@ -76,7 +76,7 @@
       replace:
         path: "{{ role_path }}/files/{{ cobbler_kickstart_file }}"
         regexp: '^url --url http://ip/cblr/links/rocky-x86_64/'
-        replace: url --url http://{{ hpc_ip }}/cblr/links/Rocky8-x86_64/
+        replace: url --url http://{{ hpc_ip }}/cblr/links/rocky-x86_64/
       tags: install
 
     - name: Configure kickstart file - nic

+ 23 - 12
docs/FAQ.md

@@ -9,18 +9,18 @@ Potential Causes:
 Resolution:  
 Wait for AWX UI to be accessible at http://\<management-station-IP>:8081, and then run the `control_plane.yml` file again, where __management-station-IP__ is the IP address of the management node.
 
-## What to do if the nodes in a Kubernetes cluster reboot?  
+## What to do if the nodes in a Kubernetes cluster reboot:
 Wait for 15 minutes after the Kubernetes cluster reboots. Next, verify the status of the cluster using the following commands:
 * `kubectl get nodes` on the manager node to get the real-time k8s cluster status.  
 * `kubectl get pods --all-namespaces` on the manager node to check which pods are in the **Running** state.
 * `kubectl cluster-info` on the manager node to verify that both the k8s master and kubeDNS are in the **Running** state.
 
-## What to do when the Kubernetes services are not in the __Running__  state?  
+## What to do when the Kubernetes services are not in the __Running__  state:
 1. Run `kubectl get pods --all-namespaces` to verify that all pods are in the **Running** state.
 2. If the pods are not in the **Running** state, delete the pods using the command:`kubectl delete pods <name of pod>`
 3. Run the corresponding playbook that was used to install Kubernetes: `omnia.yml`, `jupyterhub.yml`, or `kubeflow.yml`.
 
-## What to do when the JupyterHub or Prometheus UI is not accessible?  
+## What to do when the JupyterHub or Prometheus UI is not accessible:
 Run the command `kubectl get pods --namespace default` to ensure **nfs-client** pod and all Prometheus server pods are in the **Running** state. 
 
 ## While configuring Cobbler, why does the `control_plane.yml` fail during the Run import command?  
@@ -42,7 +42,7 @@ Resolution:
 1. Create a Non-RAID or virtual disk on the server.  
 2. Check if other systems except for the management node have cobblerd running. If yes, then stop the Cobbler container using the following commands: `docker rm -f cobbler` and `docker image rm -f cobbler`.
 
-## What to do when the Slurm services do not start automatically after the cluster reboots?  
+## What to do when Slurm services do not start automatically after the cluster reboots:
 
 * Manually restart the slurmd services on the manager node by running the following commands:
 ```
@@ -103,6 +103,17 @@ Resolution:
 2. In the omnia_config.yml file, change the k8s_cni variable value from `calico` to `flannel`.
 3. Run the Kubernetes and Kubeflow playbooks.  
 
+## What to do if jobs hang in 'pending' state on the AWX UI:
+
+Run `kubectl rollout restart deployment awx -n awx` from the management station and try to re-run the job.
+
+If the above solution **doesn't work**,
+1. Delete all the inventories, groups, and organizations from the AWX UI.
+2. Delete the folder: `/var/nfs_awx`.
+3. Delete the file: `omnia/control_plane/roles/webui_awx/files/.tower_cli.cfg`.
+4. Re-run *control_plane.yml*.
+  
+
 ## Why is permission denied when executing the `idrac.yml` file or other .yml files from AWX?
 Potential Cause: The "PermissionError: [Errno 13] Permission denied" error is displayed if you have used the ansible-vault decrypt or encrypt commands.  
 Resolution:
@@ -111,17 +122,17 @@ Resolution:
 
 It is recommended that the ansible-vault view or edit commands are used and not the ansible-vault decrypt or encrypt commands.
 
-## What to do if the LC is not ready?
+## What to do if the LC is not ready:
 * Verify that the LC is in a ready state for all servers: `racadm getremoteservicesstatus`
 * Launch iDRAC template.
 
-## What to do if the network CIDR entry of iDRAC IP in /etc/exports file is missing?
+## What to do if the network CIDR entry of iDRAC IP in /etc/exports file is missing:
 * Add an additional network CIDR range of idrac IPs in the */etc/exports* file if the iDRAC IP is not in the management network range provided in base_vars.yml.
 
-## What to do if a custom ISO file is not present on the device?
+## What to do if a custom ISO file is not present on the device:
 * Re-run the *control_plane.yml* file.
 
-## What to do if the *management_station_ip.txt* file under *provision_idrac/files* folder is missing?
+## What to do if the *management_station_ip.txt* file under *provision_idrac/files* folder is missing:
 * Re-run the *control_plane.yml* file.
 
 ## Is Disabling 2FA supported by Omnia?
@@ -131,7 +142,7 @@ It is recommended that the ansible-vault view or edit commands are used and not
 1. Delete the respective iDRAC IP addresses from the *provisioned_idrac_inventory* on the AWX UI or delete the *provisioned_idrac_inventory* to delete the iDRAC IP addresses of all the servers in the cluster.
 2. Launch the iDRAC template from the AWX UI.
 
-## What to do if PowerVault throws the error: `Error: The specified disk is not available. - Unavailable disk (0.x) in disk range '0.x-x'`
+## What to do if PowerVault throws the error: `Error: The specified disk is not available. - Unavailable disk (0.x) in disk range '0.x-x'`:
 1. Verify that the disk in question is not part of any pool: `show disks`
 2. If the disk is part of a pool, remove it and try again.
 
@@ -142,7 +153,7 @@ At any given time only one type of disk group can be created on the system. That
 * Provisioning server using BOSS controller is not supported by Omnia. It will be supported in upcoming releases.
 
 
-## What to do when iDRAC template execution throws a warning regarding older firmware versions?
+## What to do when iDRAC template execution throws a warning regarding older firmware versions:
 Potential Cause: Older firmware version in PowerEdge servers. Omnia supports only iDRAC 8 based Dell EMC PowerEdge Servers with firmware versions 2.75.75.75 and above and iDRAC 9 based Dell EMC PowerEdge Servers with Firmware versions 4.40.40.00 and above.
 
 1. Update iDRAC firmware version in PowerEdge servers manually to the supported version.
@@ -165,12 +176,12 @@ As defined in RFC 822, the only legal characters are the following:
 
 3. Period (.): The period should be used only to delimit fields in a hostname (e.g., dvader.empire.gov)
 
-## What to do when JupyterHub pods are in 'ImagePullBackOff' or 'ErrImagePull' status after executing jupyterhub.yml?
+## What to do when JupyterHub pods are in 'ImagePullBackOff' or 'ErrImagePull' status after executing jupyterhub.yml:
 Potential Cause: Your Docker pull limit has been exceeded. For more information, click [here](https://www.docker.com/increase-rate-limits)
 1. Delete Jupyterhub deployment by executing the following command in manager node: `helm delete jupyterhub -n jupyterhub`
 2. Re-execute jupyterhub.yml after 8-9 hours.
 
-## What to do when Kubeflow pods are in 'ImagePullBackOff' or 'ErrImagePull' status after executing kubeflow.yml?
+## What to do when Kubeflow pods are in 'ImagePullBackOff' or 'ErrImagePull' status after executing kubeflow.yml:
 Potential Cause: Your Docker pull limit has been exceeded. For more information, click [here](https://www.docker.com/increase-rate-limits)
 1. Delete Kubeflow deployment by executing the following command in manager node: `kfctl delete -V -f /root/k8s/omnia-kubeflow/kfctl_k8s_istio.v1.0.2.yaml`
 2. Re-execute kubeflow.yml after 8-9 hours

File diff suppressed because it is too large
+ 8 - 8
docs/INSTALL_OMNIA_CONTROL_PLANE.md


+ 1 - 1
docs/README.md

@@ -1,7 +1,7 @@
 **Omnia** (Latin: all or everything) is a deployment tool to configure Dell EMC PowerEdge servers running standard RPM-based Linux OS images into clusters capable of supporting HPC, AI, and data analytics workloads. It uses Slurm, Kubernetes, and other packages to manage jobs and run diverse workloads on the same converged solution. It is a collection of [Ansible](https://ansible.com) playbooks, is open source, and is constantly being extended to enable comprehensive workloads.
 
 #### Current release version
-1.2
+1.1.1
 
 #### Previous release version
 1.1  

+ 1 - 1
docs/control_plane/device_templates/CONFIGURE_INFINIBAND_SWITCHES.md

@@ -22,7 +22,7 @@ Under the `control_plane/input_params` directory, edit the following files:
 	-----------	|	-------	|	----------------	|	-----------------
 	base_vars.yml	|	ib_switch_support	|	<ul><li>false</li><li>**true**</li></ul>	|	To enable Mellanox InfiniBand switch configuration, set the variable to "true".
 	<br>	|	ib_network_nic	|	<ul><li>**ib0**</li></ul>	|	NIC or Ethernet card that must be connected to configure Mellanox InfiniBand switches.  
-	<br>	|	ib_network_dhcp_start_range, ib_network_dhcp_end_range	|		|	DHCP range for the DHCP server to assign IPv4 addresses.
+	<br>	|	ib_network_dhcp_start_range, ib_network_dhcp_end_range	| **172.25.0.100**, **172.25.0.200**		|	DHCP range for the DHCP server to assign IPv4 addresses.
 	
 2. `login_vars.yml` file  
 	a. `ib_username` and `ib_password`- username and password for InfiniBand Switches.   

+ 1 - 1
docs/control_plane/device_templates/CONFIGURE_POWERVAULT_STORAGE.md

@@ -18,7 +18,7 @@ Under the `control_plane/input_params` directory, edit the following files:
 
 	Variables	|	Default, choices	|	Description
 	----------------	|	-----------------	|	-----------------
-	locale	|	<ul><li>English</li></ul>	|	Represents the selected language. In Omnia 1.1, only English is supported.
+	locale	|	<ul><li>English</li></ul>	|	Represents the selected language. Currently, only English is supported.
 	powervault_me4_system_name [Optional]	|	<ul><li>**Uninitialized_Name**</li><li>User-defined name</li></ul>	|	The system name used to identify the PowerVault Storage device. The name should be less than 30 characters and must not contain spaces.
 	powervault_me4_snmp_notify_level [Required]	|	<ul><li>**none**</li><li>crit</li><li>error</li><li>warn</li><li>resolved</li><li>info</li></ul>	|	Select the SNMP notification levels for PowerVault Storage devices. 
 	powervault_me4_raid_levels	[Required] |	<ul><li>**raid1**</li>Examples:<li>r5/raid5: 3-16</li><li>r6/raid6: 4-16</li><li>r10/raid10: 4-16</li><li>adapt: 12-128</li></ul> |	Enter the required RAID levels and the minimum and maximum number of disks for each RAID levels. 

+ 1 - 1
docs/control_plane/input_parameters/INFINIBAND_SWITCHES.md

@@ -14,7 +14,7 @@ File name	|	Variables	|	Default, choices	|	Description
 -----------	|	-------	|	----------------	|	-----------------
 base_vars.yml	|	ib_switch_support	|	<ul><li>false</li><li>**true**</li></ul>	|	To enable Mellanox InfiniBand switch configuration, set the variable to "true".
 <br>	|	ib_network_nic	|	<ul><li>**ib0**</li></ul>	|	NIC or Ethernet card that must be connected to configure Mellanox InfiniBand switches.  
-<br>	|	ib_network_dhcp_start_range, ib_network_dhcp_end_range	|		|	DHCP range for the DHCP server to assign IPv4 addresses.
+<br>	|	ib_network_dhcp_start_range, ib_network_dhcp_end_range	| **172.25.0.100**, **172.25.0.200**		|	DHCP range for the DHCP server to assign IPv4 addresses.
 
 2. Edit the `login_vars.yml` file to enter the following details:  
 	a. `ib_username` and `ib_password`- username and password for InfiniBand Switches.   

+ 1 - 1
docs/control_plane/input_parameters/POWERVAULT_STORAGE.md

@@ -18,7 +18,7 @@ Under the `control_plane/input_params` directory, edit the following files:
 
 	Variables	|	Default, choices	|	Description
 	----------------	|	-----------------	|	-----------------
-	locale	|	<ul><li>English</li></ul>	|	Represents the selected language. In Omnia 1.1, only English is supported.
+	locale	|	<ul><li>English</li></ul>	|	Represents the selected language. Currently, only English is supported.
 	powervault_me4_system_name [Optional]	|	<ul><li>**Uninitialized_Name**</li><li>User-defined name</li></ul>	|	The system name used to identify the PowerVault Storage device. The name should be less than 30 characters and must not contain spaces.
 	powervault_me4_snmp_notify_level [Required]	|	<ul><li>**none**</li><li>crit</li><li>error</li><li>warn</li><li>resolved</li><li>info</li></ul>	|	Select the SNMP notification levels for PowerVault Storage devices. 
 	powervault_me4_raid_levels	[Required] |	<ul><li>**raid1**</li>Examples:<li>r5/raid5: 3-16</li><li>r6/raid6: 4-16</li><li>r10/raid10: 4-16</li><li>adapt: 12-128</li></ul> |	Enter the required RAID levels and the minimum and maximum number of disks for each RAID levels.

+ 141 - 0
platforms/roles/kubeflow/tasks/deploy_kubeflow.yml

@@ -0,0 +1,141 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+# Fetch the kfctl release archive from its URL and unpack it into the
+# destination directory.
+- name: Download kfctl release from the Kubeflow releases page
+  unarchive:
+    src: "{{ kfctl_download_url }}"
+    dest: "{{ kfctl_download_dest_path }}"
+    mode: "{{ kfctl_download_file_mode }}"
+    remote_src: true
+
+# The working directory is removed and recreated so every run starts from an
+# empty tree.
+- name: Delete omnia kubeflow directory if exists
+  file:
+    path: "{{ omnia_kubeflow_dir_path }}"
+    state: absent
+
+- name: Create omnia kubeflow directory
+  file:
+    path: "{{ omnia_kubeflow_dir_path }}"
+    state: directory
+    mode: "{{ omnia_kubeflow_dir_mode }}"
+    recurse: true
+
+- name: Build kubeflow configuration
+  command:
+    cmd: /usr/bin/kfctl build -V -f "{{ kubeflow_config_yaml_url }}"
+    chdir: "{{ omnia_kubeflow_dir_path }}"
+  changed_when: true
+
+# The replacements below are scoped with `after`/`before`, so only the text
+# between the named service account line and the next marker is rewritten.
+
+- name: Modify CPU limit for istio-ingressgateway-service-account
+  replace:
+    path: "{{ istio_noauth_yaml_file_path }}"
+    after: 'serviceAccountName: istio-ingressgateway-service-account'
+    before: '---'
+    regexp: 'cpu: 100m'
+    replace: 'cpu: 2'
+
+- name: Modify memory limit for istio-ingressgateway-service-account
+  replace:
+    path: "{{ istio_noauth_yaml_file_path }}"
+    after: 'serviceAccountName: istio-ingressgateway-service-account'
+    before: '---'
+    regexp: 'memory: 128Mi'
+    replace: 'memory: 512Mi'
+
+- name: Modify CPU request for istio-ingressgateway-service-account
+  replace:
+    path: "{{ istio_noauth_yaml_file_path }}"
+    after: 'serviceAccountName: istio-ingressgateway-service-account'
+    before: '---'
+    regexp: 'cpu: 10m'
+    replace: 'cpu: 1'
+
+- name: Modify memory request for istio-ingressgateway-service-account
+  replace:
+    path: "{{ istio_noauth_yaml_file_path }}"
+    after: 'serviceAccountName: istio-ingressgateway-service-account'
+    before: '---'
+    regexp: 'memory: 40Mi'
+    replace: 'memory: 256Mi'
+
+# Order matters for the two egress gateway tasks: the 128Mi value has to be
+# rewritten first, because the second task produces a new 128Mi value that
+# must not be rewritten again.
+- name: Modify memory limit for istio-egressgateway-service-account
+  replace:
+    path: "{{ istio_noauth_yaml_file_path }}"
+    after: 'serviceAccountName: istio-egressgateway-service-account'
+    before: '---'
+    regexp: 'memory: 128Mi'
+    replace: 'memory: 256Mi'
+
+- name: Modify memory request for istio-egressgateway-service-account
+  replace:
+    path: "{{ istio_noauth_yaml_file_path }}"
+    after: 'serviceAccountName: istio-egressgateway-service-account'
+    before: '---'
+    regexp: 'memory: 40Mi'
+    replace: 'memory: 128Mi'
+
+- name: Modify CPU limit for kfserving-gateway
+  replace:
+    path: "{{ kfserving_gateway_yaml_file_path }}"
+    after: 'serviceAccountName: istio-ingressgateway-service-account'
+    before: 'env:'
+    regexp: 'cpu: 100m'
+    replace: 'cpu: 2'
+
+- name: Modify memory limit for kfserving-gateway
+  replace:
+    path: "{{ kfserving_gateway_yaml_file_path }}"
+    after: 'serviceAccountName: istio-ingressgateway-service-account'
+    before: 'env:'
+    regexp: 'memory: 128Mi'
+    replace: 'memory: 512Mi'
+
+- name: Modify CPU request for kfserving-gateway
+  replace:
+    path: "{{ kfserving_gateway_yaml_file_path }}"
+    after: 'serviceAccountName: istio-ingressgateway-service-account'
+    before: 'env:'
+    regexp: 'cpu: 10m'
+    replace: 'cpu: 1'
+
+- name: Modify memory request for kfserving-gateway
+  replace:
+    path: "{{ kfserving_gateway_yaml_file_path }}"
+    after: 'serviceAccountName: istio-ingressgateway-service-account'
+    before: 'env:'
+    regexp: 'memory: 40Mi'
+    replace: 'memory: 256Mi'
+
+# Unscoped replacements: every occurrence of NodePort in these files is
+# rewritten to LoadBalancer.
+- name: Change argo base service from NodePort to LoadBalancer
+  replace:
+    path: "{{ argo_yaml_file_path }}"
+    regexp: 'NodePort'
+    replace: 'LoadBalancer'
+
+- name: Change istio-install base istio-noauth service from NodePort to LoadBalancer
+  replace:
+    path: "{{ istio_noauth_yaml_file_path }}"
+    regexp: 'NodePort'
+    replace: 'LoadBalancer'
+
+# Retried up to 20 times, 10 seconds apart, until kfctl apply stops failing.
+- name: Apply kubeflow configuration
+  command:
+    cmd: "/usr/bin/kfctl apply -V -f '{{ kubeflow_config_file }}'"
+    chdir: "{{ omnia_kubeflow_dir_path }}"
+  changed_when: true
+  register: apply_kubeflow_config
+  until: apply_kubeflow_config is not failed
+  retries: 20
+  delay: 10

+ 45 - 0
platforms/roles/kubeflow/tasks/firewalld_config.yml

@@ -0,0 +1,45 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+# Make sure firewalld is installed and running before any rule is added.
+- name: Install firewalld
+  package:
+    name: firewalld
+    state: present
+  tags: firewalld
+
+- name: Start and enable firewalld
+  service:
+    name: firewalld
+    state: started
+    enabled: true
+  tags: firewalld
+
+# Every entry of kubeflow_firewalld_ports is opened as a TCP port in the
+# permanent configuration.
+- name: Configure firewalld on master nodes
+  firewalld:
+    port: "{{ item }}/tcp"
+    permanent: true
+    state: enabled
+  with_items: '{{ kubeflow_firewalld_ports }}'
+  tags: firewalld
+
+# Enable masquerading in the permanent configuration of the default zone.
+# The firewalld module only reports "changed" when the setting was missing,
+# unlike a raw `firewall-cmd --add-masquerade --permanent`.
+- name: Masquerade the firewall
+  firewalld:
+    masquerade: true
+    permanent: true
+    state: enabled
+  tags: firewalld
+
+# The permanent rules above become active only after this reload.
+- name: Reload firewalld
+  command: firewall-cmd --reload
+  changed_when: true
+  tags: firewalld

+ 4 - 125
platforms/roles/kubeflow/tasks/main.yml

@@ -13,129 +13,8 @@
 #  limitations under the License.
 ---
 
-- name: Download kfctl release from the Kubeflow releases page
-  unarchive:
-    src: "{{ kfctl_download_url }}"
-    dest: "{{ kfctl_download_dest_path }}"
-    mode: "{{ kfctl_download_file_mode }}"
-    remote_src: yes
+- name: Configure firewalld ports
+  import_tasks: firewalld_config.yml
 
-- name: Delete omnia kubeflow directory if exists
-  file:
-    path: "{{ omnia_kubeflow_dir_path }}"
-    state: absent
-
-- name: Create omnia kubeflow directory
-  file:
-    path: "{{ omnia_kubeflow_dir_path }}"
-    state: directory
-    mode: "{{ omnia_kubeflow_dir_mode }}"
-    recurse: yes
-
-- name: Build kubeflow configuration
-  command:
-    cmd: /usr/bin/kfctl build -V -f "{{ kubeflow_config_yaml_url }}"
-    chdir: "{{ omnia_kubeflow_dir_path }}"
-  changed_when: true
-
-- name: Modify CPU limit for istio-ingressgateway-service-account
-  replace:
-    path: "{{ istio_noauth_yaml_file_path }}"
-    after: 'serviceAccountName: istio-ingressgateway-service-account'
-    before: '---'
-    regexp: 'cpu: 100m'
-    replace: 'cpu: 2'
-
-- name: Modify memory limit for istio-ingressgateway-service-account
-  replace:
-    path: "{{ istio_noauth_yaml_file_path }}"
-    after: 'serviceAccountName: istio-ingressgateway-service-account'
-    before: '---'
-    regexp: 'memory: 128Mi'
-    replace: 'memory: 512Mi'
-
-- name: Modify CPU request for istio-ingressgateway-service-account
-  replace:
-    path: "{{ istio_noauth_yaml_file_path }}"
-    after: 'serviceAccountName: istio-ingressgateway-service-account'
-    before: '---'
-    regexp: 'cpu: 10m'
-    replace: 'cpu: 1'
-
-- name: Modify memory request for istio-ingressgateway-service-account
-  replace:
-    path: "{{ istio_noauth_yaml_file_path }}"
-    after: 'serviceAccountName: istio-ingressgateway-service-account'
-    before: '---'
-    regexp: 'memory: 40Mi'
-    replace: 'memory: 256Mi'
-
-- name: Modify memory request for istio-engressgateway-service-account
-  replace:
-    path: "{{ istio_noauth_yaml_file_path }}"
-    after: 'serviceAccountName: istio-egressgateway-service-account'
-    before: '---'
-    regexp: 'memory: 128Mi'
-    replace: 'memory: 256Mi'
-
-- name: Modify memory request for istio-engressgateway-service-account
-  replace:
-    path: "{{ istio_noauth_yaml_file_path }}"
-    after: 'serviceAccountName: istio-egressgateway-service-account'
-    before: '---'
-    regexp: 'memory: 40Mi'
-    replace: 'memory: 128Mi'
-
-- name: Modify CPU limit for kfserving-gateway
-  replace:
-    path: "{{ kfserving_gateway_yaml_file_path }}"
-    after: 'serviceAccountName: istio-ingressgateway-service-account'
-    before: 'env:'
-    regexp: 'cpu: 100m'
-    replace: 'cpu: 2'
-
-- name: Modify memory limit for kfserving-gateway
-  replace:
-    path: "{{ kfserving_gateway_yaml_file_path }}"
-    after: 'serviceAccountName: istio-ingressgateway-service-account'
-    before: 'env:'
-    regexp: 'memory: 128Mi'
-    replace: 'memory: 512Mi'
-
-- name: Modify CPU request for kfserving-gateway
-  replace:
-    path: "{{ kfserving_gateway_yaml_file_path }}"
-    after: 'serviceAccountName: istio-ingressgateway-service-account'
-    before: 'env:'
-    regexp: 'cpu: 10m'
-    replace: 'cpu: 1'
-
-- name: Modify memory request for kfserving-gateway
-  replace:
-    path: "{{ kfserving_gateway_yaml_file_path }}"
-    after: 'serviceAccountName: istio-ingressgateway-service-account'
-    before: 'env:'
-    regexp: 'memory: 40Mi'
-    replace: 'memory: 256Mi'
-
-- name: Change argo base service from NodePort to LoadBalancer
-  replace:
-    path: "{{ argo_yaml_file_path }}"
-    regexp: 'NodePort'
-    replace: 'LoadBalancer'
-
-- name: Change istio-install base istio-noauth service from NodePort to LoadBalancer
-  replace:
-    path: "{{ istio_noauth_yaml_file_path }}"
-    regexp: 'NodePort'
-    replace: 'LoadBalancer'
-
-- name: Apply kubeflow configuration
-  command:
-    cmd: "/usr/bin/kfctl apply -V -f '{{ kubeflow_config_file }}'"
-    chdir: "{{ omnia_kubeflow_dir_path }}"
-  changed_when: true
-  register: apply_kubeflow_config
-  until: apply_kubeflow_config is not failed
-  retries: 20
-  delay: 10
+- name: Deploy kubeflow
+  import_tasks: deploy_kubeflow.yml

+ 14 - 0
platforms/roles/kubeflow/vars/main.yml

@@ -13,6 +13,20 @@
 #  limitations under the License.
 ---
 
+kubeflow_firewalld_ports:
+  - 15020
+  - 80
+  - 443
+  - 31400
+  - 15011
+  - 8060
+  - 853
+  - 15029
+  - 15030
+  - 15031
+  - 15032
+  - 15443
+
 kfctl_download_url: https://github.com/kubeflow/kfctl/releases/download/v1.0.2/kfctl_v1.0.2-0-ga476281_linux.tar.gz
 
 kfctl_download_dest_path: /usr/bin/

+ 7 - 2
roles/k8s_firewalld/tasks/main.yml

@@ -28,7 +28,7 @@
 
 - name: Configure firewalld on master nodes
   firewalld:
-    port: "{{ item }}/tcp"
+    port: "{{ item }}"
     permanent: yes
     state: enabled
   with_items: '{{ k8s_master_ports }}'
@@ -71,6 +71,11 @@
   when: hostvars['127.0.0.1']['k8s_cni'] == "calico"
   tags: firewalld
 
+# Enable masquerading in the permanent firewalld configuration of the default
+# zone. The firewalld module only reports "changed" when the setting was not
+# already present, unlike a raw `firewall-cmd --add-masquerade --permanent`.
+- name: Masquerade the firewall
+  firewalld:
+    masquerade: yes
+    permanent: yes
+    state: enabled
+  tags: firewalld
+
 - name: Reload firewalld
   command: firewall-cmd --reload
   changed_when: true
@@ -81,4 +86,4 @@
     name: firewalld
     state: stopped
     enabled: no
-  tags: firewalld
+  tags: firewalld

+ 8 - 5
roles/k8s_firewalld/vars/main.yml

@@ -15,11 +15,14 @@
 
 # Master nodes firewall ports
 k8s_master_ports:
-  - 6443
-  - 2379-2380
-  - 10250
-  - 10251
-  - 10252
+  - 6443/tcp
+  - 2379-2380/tcp
+  - 10250/tcp
+  - 10251/tcp
+  - 10252/tcp
+  - 10255/tcp
+  - 30000-32767/tcp
+  - 8472/udp
 
 # Worker nodes firewall ports
 k8s_compute_ports:

+ 16 - 2
roles/login_node/tasks/install_ipa_client.yml

@@ -32,9 +32,23 @@
   changed_when: false
   failed_when: false
 
-- name: Install ipa client
+# Variant for CentOS releases older than os_version; it keeps --force-ntpd.
+- name: Install ipa client in CentOS 7.9
   command: >-
     ipa-client-install --domain '{{ hostvars['127.0.0.1']['domain_name'] }}' --server '{{ hostvars[groups['manager'][0]]['server_hostname'] }}'
     --principal admin --password '{{ hostvars['127.0.0.1']['ipa_admin_password'] }}' --force-join --enable-dns-updates --force-ntpd -U
   changed_when: true
-  no_log: true
+  no_log: true
+  # The version test compares release numbers component by component; a plain
+  # `<` on the two strings is lexicographic and would rank '10.0' below '8.0'.
+  when:
+    - ( ansible_distribution | lower == os_centos )
+    - ( ansible_distribution_version is version(os_version, '<') )
+
+# Variant for CentOS/Rocky releases at or above os_version; it passes --no-ntp
+# instead of --force-ntpd.
+- name: Install ipa client in Rocky 8.4
+  command: >-
+    ipa-client-install --domain '{{ hostvars['127.0.0.1']['domain_name'] }}' --server '{{ hostvars[groups['manager'][0]]['server_hostname'] }}'
+    --principal admin --password '{{ hostvars['127.0.0.1']['ipa_admin_password'] }}' --force-join --enable-dns-updates --no-ntp -U
+  changed_when: true
+  no_log: true
+  # The version test compares release numbers component by component; a plain
+  # `>=` on the two strings is lexicographic and would rank '10.0' below '8.0'.
+  when:
+    - ( ansible_distribution | lower == os_centos ) or
+      ( ansible_distribution | lower == os_rocky )
+    - ( ansible_distribution_version is version(os_version, '>=') )

+ 16 - 2
roles/login_server/tasks/install_ipa_server.yml

@@ -30,12 +30,26 @@
   changed_when: false
   failed_when: false
 
-- name: Install ipa server
+# Variant for CentOS releases older than os_version; it uses automatic DNS
+# forwarders and reverse zones.
+- name: Install ipa server in CentOS 7.9
   command: >-
     ipa-server-install -n '{{ hostvars['127.0.0.1']['domain_name'] }}' --hostname='{{ server_hostname }}' -a '{{ hostvars['127.0.0.1']['ipa_admin_password'] }}'
     -p '{{ hostvars['127.0.0.1']['directory_manager_password'] }}' -r '{{ hostvars['127.0.0.1']['realm_name'] }}' --setup-dns --auto-forwarders --auto-reverse -U
   changed_when: true
   no_log: true
+  # The version test compares release numbers component by component; a plain
+  # `<` on the two strings is lexicographic and would rank '10.0' below '8.0'.
+  when:
+    - ( ansible_distribution | lower == os_centos )
+    - ( ansible_distribution_version is version(os_version, '<') )
+
+# Variant for CentOS/Rocky releases at or above os_version; it disables
+# forwarders, reverse zones and NTP configuration.
+- name: Install ipa server in CentOS > 8 or Rocky 8.4
+  command: >-
+    ipa-server-install -n '{{ hostvars['127.0.0.1']['domain_name'] }}' --hostname='{{ server_hostname }}' -a '{{ hostvars['127.0.0.1']['ipa_admin_password'] }}'
+    -p '{{ hostvars['127.0.0.1']['directory_manager_password'] }}' -r '{{ hostvars['127.0.0.1']['realm_name'] }}' --setup-dns --no-forwarders --no-reverse --no-ntp -U
+  changed_when: true
+  no_log: true
+  # The version test compares release numbers component by component; a plain
+  # `>=` on the two strings is lexicographic and would rank '10.0' below '8.0'.
+  when:
+    - ( ansible_distribution | lower == os_centos ) or
+      ( ansible_distribution | lower == os_rocky )
+    - ( ansible_distribution_version is version(os_version, '>=') )
 
 - name: Authenticate as admin
   shell: set -o pipefail && echo $'{{ hostvars['127.0.0.1']['ipa_admin_password'] }}' | kinit admin
@@ -47,4 +61,4 @@
     src: "{{ temp_resolv_conf_path }}"
     dest: "{{ resolv_conf_path }}"
     mode: "{{ file_mode }}"
-    remote_src: yes
+    remote_src: yes

+ 26 - 0
roles/slurm_common/tasks/main.yml

@@ -26,6 +26,32 @@
     backup: yes
     mode: "{{ common_mode }}"
 
+# Enable the powertools repository on CentOS/Rocky releases at or above
+# os_version.
+- name: Enable powertools repo in Rocky 8.4
+  command: dnf config-manager --set-enabled powertools -y
+  # Explicit for consistency with the other command tasks; a command task
+  # reports "changed" by default, so the outcome is the same.
+  changed_when: true
+  # The version test compares release numbers component by component; a plain
+  # `>=` on the two strings is lexicographic and would rank '10.0' below '8.0'.
+  when:
+    - ( ansible_distribution | lower == os_centos ) or
+      ( ansible_distribution | lower == os_rocky )
+    - ( ansible_distribution_version is version(os_version, '>=') )
+
+# Install the common_python2_packages list on CentOS releases older than
+# os_version.
+- name: Add python dependent packages for CentOS 7.9
+  package:
+    name: "{{ common_python2_packages }}"
+    state: present
+  tags: install
+  # The version test compares release numbers component by component; a plain
+  # `<` on the two strings is lexicographic and would rank '10.0' below '8.0'.
+  when:
+    - ( ansible_distribution | lower == os_centos )
+    - ( ansible_distribution_version is version(os_version, '<') )
+
+# Install the common_python3_packages list on CentOS/Rocky releases at or
+# above os_version.
+- name: Add python dependent packages for CentOS version > 8 and Rocky 8.4
+  package:
+    name: "{{ common_python3_packages }}"
+    state: present
+  tags: install
+  # The version test compares release numbers component by component; a plain
+  # `>=` on the two strings is lexicographic and would rank '10.0' below '8.0'.
+  when:
+    - ( ansible_distribution | lower == os_centos ) or
+      ( ansible_distribution | lower == os_rocky )
+    - ( ansible_distribution_version is version(os_version, '>=') )
+
 - name: Install packages for slurm
   package:
     name: "{{ common_packages }}"

+ 10 - 0
roles/slurm_common/vars/main.yml

@@ -22,9 +22,15 @@ common_packages:
    - mariadb-server
    - mariadb-devel
    - man2html
+
+common_python2_packages:
    - MySQL-python
    - python-netaddr
 
+common_python3_packages:
+   - MySQL-python3
+   - python3-netaddr
+
 hostname_dest: "/etc/hostname"
 hosts_dest: "/etc/hosts"
 munge_dest: "/etc/munge/"
@@ -50,3 +56,7 @@ slurmd_pid: "/var/run/slurmd.pid"
 cluster_name : "manager,compute"
 slurmctld_log: "/var/log/slurm/slurmctld.log"
 slurmd_log: "/var/log/slurm/slurmd.log"
+
+# Lower-cased distribution names and the version boundary that this role's
+# task conditions compare against ansible_distribution and
+# ansible_distribution_version.
+os_centos: 'centos'
+os_rocky: 'rocky'
+os_version: '8.0'