Issue #781: Docs Out of Date

Signed-off-by: cgoveas <cassandra.goveas@dell.com>
Author: cgoveas, 3 years ago
Commit: fc127ee25f

+ 7 - 0
control_plane/roles/control_plane_customiso/files/temp_leap15.xml

@@ -34,6 +34,13 @@
       <hostname>localhost.localdomain</hostname>
       <resolv_conf_policy>auto</resolv_conf_policy>
     </dns>
+    <interfaces config:type="list">
+      <interface>
+        <bootproto>dhcp</bootproto>
+        <name>link</name>
+        <startmode>auto</startmode>
+      </interface>
+    </interfaces>
   </networking>
   <services-manager t="map">
     <services t="map">
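
The added `<interfaces>` section tells AutoYaST to bring the first available link up via DHCP at firstboot. As a quick sanity check before the profile is packed into the custom ISO, the edited XML can be checked for well-formedness; a minimal sketch, assuming `xmllint` (from libxml2) is installed:

```sh
# Check that the modified AutoYaST profile is still well-formed XML.
# This verifies syntax only, not conformance to the AutoYaST schema.
xmllint --noout control_plane/roles/control_plane_customiso/files/temp_leap15.xml
```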

+ 4 - 16
control_plane/roles/control_plane_customiso/tasks/check_prerequisites.yml

@@ -14,46 +14,37 @@
 ---
 
 - name: Include control_plane_common vars
-  include_vars: ../../control_plane_common/vars/main.yml
-  tags: install
+  include_vars: ../../control_plane_common/vars/main.yml  
 
 - name: Include provision_cobbler vars
   include_vars: ../../provision_cobbler/vars/main.yml
-  tags: install
 
 - name: Install xorriso package - rocky
   package:
     name: "{{ iso_package }}"
     state: present
-  tags: install
-  when:
-    - ansible_distribution | lower == os_supported_centos or
-      ansible_distribution | lower == os_supported_rocky
+  when: os_supported_leap not in mgmt_os
 
 - name: Install xorriso package - leap
   zypper:
     name: "{{ iso_package }}"
     state: present
-  tags: install
-  when: os_supported_leap in ansible_distribution | lower
+  when: os_supported_leap in mgmt_os
 
 - name: Install ansible-galaxy modules
   command: ansible-galaxy collection install {{ item }}
   changed_when: true
-  tags: install
   with_items: "{{ idrac_collections }}"
 
 - name: Install omsdk using pip
   pip:
     name: omsdk
     state: present
-  tags: install
 
 - name: Check iso mount folder
   stat:
     path: "{{ iso_mount_path }}{{ grub_cfg_path }}"
   register: check_mount_iso
-  tags: install
 
 - name: Include provision_cobbler vars
   include_tasks: ../../provision_cobbler/tasks/mount_iso.yml
@@ -64,7 +55,6 @@
     path: "{{ iso_mount_path }}{{ grub_cfg_path }}"
   register: recheck_mount_iso
   when: not check_mount_iso.stat.exists
-  tags: install
 
 - name: Incorrect iso mount
   fail:
@@ -72,8 +62,7 @@
   when:
     - not check_mount_iso.stat.exists
     - not recheck_mount_iso.stat.exists
-  register: iso_mount_fail
-  tags: install
+  register: iso_mount_fail 
 
 - name: Copy management station ip to {{ management_station_ip_file }}
   lineinfile:
@@ -81,4 +70,3 @@
     line: "{{ mngmnt_network_ip }}"
     mode: "{{ file_permission }}"
     create: yes
-  tags: install
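
The `when` conditions above now key off `mgmt_os` rather than `ansible_distribution`. A minimal sketch of how such a fact could be derived, assuming it is set in `control_plane_common` (the actual definition is not part of this diff):

```yaml
# Illustrative only: derive mgmt_os from Ansible facts so that checks such as
# "os_supported_leap in mgmt_os" distinguish Leap from Rocky/CentOS.
- name: Set management station OS fact (hypothetical sketch)
  set_fact:
    mgmt_os: "{{ ansible_distribution | lower }}"
```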

+ 0 - 3
control_plane/roles/control_plane_customiso/tasks/create_unattended_iso_centos.yml

@@ -19,7 +19,6 @@
     -boot-info-table -eltorito-alt-boot -e images/efiboot.img -no-emul-boot -J -R -V "CentOS 7 x86_64"  {{ tmp_iso_dir }}
   changed_when: true
   register: centos_iso_status
-  tags: install
   args:
     chdir: "{{ tmp_iso_dir }}"
 
@@ -30,10 +29,8 @@
       - custom_iso_success_keyword2 in centos_iso_status.stderr
     success_msg: "{{ custom_iso_success_msg }}"
     fail_msg: "{{ custom_iso_fail_msg }}"
-  tags: install
 
 - name: Remove the kickstart file
   file:
     path: "{{ role_path }}/files/{{ idrac_kickstart_file }}"
     state: absent
-  tags: install

+ 0 - 3
control_plane/roles/control_plane_customiso/tasks/create_unattended_iso_leap.yml

@@ -19,7 +19,6 @@
     -c boot/x86_64/loader/boot.cat -eltorito-alt-boot -e boot/x86_64/efi -no-emul-boot -o {{ nfs_share_offline_repo }}/{{ leap_iso_filename }} {{ tmp_iso_dir }}
   changed_when: true
   register: leap_iso_status
-  tags: install
   args:
     chdir: "{{ tmp_iso_dir }}"
 
@@ -30,10 +29,8 @@
       - custom_iso_success_keyword2 in leap_iso_status.stderr
     success_msg: "{{ custom_iso_success_msg }}"
     fail_msg: "{{ custom_iso_fail_msg }}"
-  tags: install
   
 - name: Remove the kickstart file
   file:
     path: "{{ role_path }}/files/{{ idrac_kickstart_file }}"
     state: absent
-  tags: install

+ 0 - 3
control_plane/roles/control_plane_customiso/tasks/create_unattended_iso_rocky.yml

@@ -24,7 +24,6 @@
     -boot-info-table -eltorito-alt-boot -e images/efiboot.img -no-emul-boot -J -R -V {{ rocky_profile_name.stdout.split(' ')[4] | replace("'","") }}  {{ tmp_iso_dir }}
   changed_when: true
   register: rocky_iso_status
-  tags: install
   args:
     chdir: "{{ tmp_iso_dir }}"
 
@@ -35,10 +34,8 @@
       - custom_iso_success_keyword2 in rocky_iso_status.stderr
     success_msg: "{{ custom_iso_success_msg }}"
     fail_msg: "{{ custom_iso_fail_msg }}"
-  tags: install
 
 - name: Remove the kickstart file
   file:
     path: "{{ role_path }}/files/{{ idrac_kickstart_file }}"
     state: absent
-  tags: install
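
A rebuilt ISO can be sanity-checked by inspecting its primary volume descriptor; a hedged example, assuming `isoinfo` from the genisoimage package (already listed among Omnia's software dependencies) and an illustrative output path:

```sh
# Print the primary volume descriptor; the "Volume id" field should match
# the -V label passed to xorriso above. The ISO path here is illustrative.
isoinfo -d -i /tmp/unattended_iso/custom.iso
```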

+ 41 - 46
control_plane/roles/control_plane_customiso/tasks/edit_iso_config.yml

@@ -17,7 +17,6 @@
   command: openssl rand -base64 12
   changed_when: false
   register: generate_random_phrase
-  tags: install
   no_log: true
 
 - name: Encrypt login password
@@ -25,21 +24,18 @@
   no_log: true
   changed_when: false
   register: encrypt_login_pass
-  tags: install
 
 - name: Create a tmp iso directory
   file:
     path: "{{ tmp_iso_dir }}"
     state: directory
     mode: "{{ file_permission }}"
-  tags: install
 
 - name: Copy files to tmp folder
   command:  rsync -AHPSXav {{ iso_mount_path }} {{ tmp_iso_dir }}
   changed_when: true
   args:
-    warn: false
-  tags: install
+    warn: false 
 
 - name: Kickstart file changes rocky and centos
   block:
@@ -47,28 +43,24 @@
       block:
         - name: Set centos kickstart file name
           set_fact:
-            idrac_kickstart_file: "{{ idrac_centos_ks }}"
-          tags: install
+            idrac_kickstart_file: "{{ idrac_centos_ks }}"     
 
         - name: Remove the kickstart file if exists
           file:
             path: "{{ role_path }}/files/{{ idrac_kickstart_file }}"
-            state: absent
-          tags: install
+            state: absent     
 
         - name: Create the centos kickstart file
           copy:
             src: "{{ role_path }}/files/temp_centos7.cfg"
             dest: "{{ role_path }}/files/{{ idrac_kickstart_file }}"
-            mode: "{{ file_permission }}"
-          tags: install
+            mode: "{{ file_permission }}"       
 
-        - name: Configure kickstart file centos - nic
+        - name: Configure kickstart file - nic
           lineinfile:
             path: "{{ role_path }}/files/{{ idrac_kickstart_file }}"
             insertafter: '^network  --bootproto=dhcp --device=link --onboot=on --activate'
-            line: 'network  --bootproto=dhcp --device={{ item }} --onboot=on --activate'
-          tags: install
+            line: 'network  --bootproto=dhcp --device={{ item }} --onboot=on --activate'    
           with_items: "{{ centos_host_nic }}"
       when: provision_os == os_supported_centos
 
@@ -76,28 +68,24 @@
       block:
         - name: Set rocky kickstart file name
           set_fact:
-            idrac_kickstart_file: "{{ idrac_rocky_ks }}"
-          tags: install
+            idrac_kickstart_file: "{{ idrac_rocky_ks }}"      
 
         - name: Remove the kickstart file if exists
           file:
             path: "{{ role_path }}/files/{{ idrac_kickstart_file }}"
-            state: absent
-          tags: install
+            state: absent        
 
         - name: Create the rocky kickstart file
           copy:
             src: "{{ role_path }}/files/temp_rocky8.cfg"
             dest: "{{ role_path }}/files/{{ idrac_kickstart_file }}"
-            mode: "{{ file_permission }}"
-          tags: install
+            mode: "{{ file_permission }}"       
 
-        - name: Configure kickstart file rocky - nic
+        - name: Configure kickstart file - nic
           lineinfile:
             path: "{{ role_path }}/files/{{ idrac_kickstart_file }}"
             insertafter: '^network  --bootproto=dhcp --device=link --onboot=on --activate'
-            line: 'network  --bootproto=dhcp --device={{ item }} --onboot=on --activate'
-          tags: install
+            line: 'network  --bootproto=dhcp --device={{ item }} --onboot=on --activate'         
           with_items: "{{ rocky_host_nic }}"
       when: provision_os == os_supported_rocky
 
@@ -106,29 +94,25 @@
         path: "{{ role_path }}/files/{{ idrac_kickstart_file }}"
         regexp: '^rootpw --iscrypted ks_password'
         replace: 'rootpw --iscrypted {{ encrypt_login_pass.stdout }}'
-      no_log: true
-      tags: install
+      no_log: true     
 
     - name: Configure kickstart file - timezone
       replace:
         path: "{{ role_path }}/files/{{ idrac_kickstart_file }}"
         regexp: '^timezone --utc ks_timezone'
-        replace: 'timezone --utc {{ timezone }}'
-      tags: install
+        replace: 'timezone --utc {{ timezone }}'  
 
     - name: Configure kickstart file - language
       replace:
         path: "{{ role_path }}/files/{{ idrac_kickstart_file }}"
         regexp: '^lang ks_language'
-        replace: 'lang {{ language }}'
-      tags: install
+        replace: 'lang {{ language }}' 
 
     - name: Copy kickstart file to iso mount path
       copy:
         src: "{{ role_path }}/files/{{ idrac_kickstart_file }}"
         dest: "{{ tmp_iso_dir }}{{ idrac_kickstart_file }}"
-        mode: preserve
-      tags: install
+        mode: preserve   
 
     - name: Remove ^M characters
       command: dos2unix {{ tmp_iso_dir }}{{ idrac_kickstart_file }}
@@ -142,8 +126,7 @@
         replace: "{{ item.replace }}"
       with_items:
         - { regexp: "append initrd=initrd.img", replace: "append initrd=initrd.img ks=cdrom:/{{ idrac_kickstart_file }}" }
-        - { regexp: "rd.live.check quiet", replace: "" }
-      tags: install
+        - { regexp: "rd.live.check quiet", replace: "" }    
 
     - name: Edit grub.cfg
       replace:
@@ -154,8 +137,7 @@
         - { regexp: "timeout=60", replace: "timeout=5" }
         - { regexp: "kernel /images/pxeboot/vmlinuz", replace: "kernel /images/pxeboot/vmlinuz ks=cdrom:/{{ idrac_kickstart_file }}" }
         - { regexp: "linuxefi /images/pxeboot/vmlinuz", replace: "linuxefi /images/pxeboot/vmlinuz ks=cdrom:/{{ idrac_kickstart_file }}" }
-        - { regexp: "rd.live.check quiet", replace: "" }
-      tags: install
+        - { regexp: "rd.live.check quiet", replace: "" } 
   when:
     - provision_os == os_supported_rocky or
       provision_os == os_supported_centos
@@ -164,37 +146,51 @@
   block:
     - name: Set leap kickstart file name
       set_fact:
-        idrac_kickstart_file: "{{ idrac_leap_ks }}"
-      tags: install
+        idrac_kickstart_file: "{{ idrac_leap_ks }}"  
   
     - name: Create the leap kickstart file
       copy:
         src: "{{ role_path }}/files/temp_leap15.xml"
         dest: "{{ role_path }}/files/{{ idrac_kickstart_file }}"
-        mode: "{{ file_permission }}"
-      tags: install
+        mode: "{{ file_permission }}"    
 
     - name: Configure kickstart file - Password
       replace:
         path: "{{ role_path }}/files/{{ idrac_kickstart_file }}"
         regexp: '      <user_password>ks_password</user_password>'
         replace: '      <user_password>{{ encrypt_login_pass.stdout }}</user_password>'
-      no_log: true
-      tags: install
+      no_log: true  
 
     - name: Configure kickstart file - timezone
       replace:
         path: "{{ role_path }}/files/{{ idrac_kickstart_file }}"
         regexp: '    <timezone>ks_timezone</timezone>'
-        replace: '    <timezone>{{ timezone }}</timezone>'
-      tags: install
+        replace: '    <timezone>{{ timezone }}</timezone>'  
+    
+    - name: Configure kickstart file - nic
+      lineinfile:
+        path: "{{ role_path }}/files/{{ idrac_kickstart_file }}"
+        insertafter: '      </interface>'
+        line: >4
+                  <interface>
+                    <bootproto>dhcp</bootproto>
+                    <name>{{ item }}</name>
+                    <startmode>auto</startmode>
+                  </interface>
+      with_items: "{{ centos_host_nic }}"
+
+    - name: Remove blank lines
+      lineinfile:
+        path: "{{ role_path }}/files/{{ idrac_kickstart_file }}"
+        regexp: '^\s*$'
+        state: absent
+      changed_when: false
 
     - name: Copy kickstart file to iso mount path
       copy:
         src: "{{ role_path }}/files/{{ idrac_kickstart_file }}"
         dest: "/tmp/leap/{{ idrac_kickstart_file }}"
-        mode: preserve
-      tags: install
+        mode: preserve  
 
     - name: Edit grub.cfg
       replace:
@@ -203,5 +199,4 @@
         replace: "{{ item.replace }}"
       with_items:
         - { regexp: "  timeout=60", replace: "  timeout=5" }
-      tags: install
   when: provision_os == os_supported_leap

+ 2 - 2
control_plane/roles/control_plane_customiso/vars/main.yml

@@ -20,8 +20,8 @@ iso_mount_path: /mnt/{{ provision_os }}/
 iso_mount_check_fail_msg: "ISO file not mounted successfully. Ensure /mnt/{{ provision_os }} path is mounted with {{ provision_os }} ISO file."
 management_station_ip_file: "management_station_ip.txt"
 idrac_collections:
-  - community.general
-  - dellemc.openmanage
+  - community.general:4.4.0
+  - dellemc.openmanage:4.4.0
 iso_package: xorriso
 
 # Usage: edit_iso_config.yml
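
The `name:version` form added above is the standard pinning syntax accepted by `ansible-galaxy`, equivalent to running, for example:

```sh
# Install exact collection versions instead of whatever is latest
ansible-galaxy collection install community.general:4.4.0
ansible-galaxy collection install dellemc.openmanage:4.4.0
```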

+ 10 - 5
control_plane/roles/webui_awx/files/requirements.yml

@@ -1,7 +1,12 @@
 ---
 collections:
-  - community.general
-  - dellemc.openmanage
-  - dellemc.os10
-  - kubernetes.core
-  - community.grafana
+  - name: community.general
+    version: 4.4.0
+  - name: dellemc.openmanage
+    version: 4.4.0
+  - name: dellemc.os10
+    version: 1.1.1
+  - name: kubernetes.core
+    version: 2.2.3
+  - name: community.grafana
+    version: 1.3.0
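
With versions pinned in the requirements file, all five collections can be installed reproducibly in one pass:

```sh
# Install every collection listed in requirements.yml at its pinned version
ansible-galaxy collection install -r control_plane/roles/webui_awx/files/requirements.yml
```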

+ 3 - 3
control_plane/roles/webui_grafana/vars/main.yml

@@ -1,4 +1,4 @@
-#  Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -15,8 +15,8 @@
 
 # Usage: pre-requisites.yml
 collections_name:
-  - kubernetes.core
-  - community.grafana
+  - kubernetes.core:2.2.3
+  - community.grafana:1.3.0
 directory_mode: '0774'
 
 # Usage: secrets.yml

File diff suppressed because it is too large
+ 34 - 17
docs/INSTALL_OMNIA_CONTROL_PLANE.md


+ 53 - 2
docs/MONITOR_CLUSTERS.md

@@ -77,7 +77,8 @@ Prometheus is installed:
 * Access Prometheus with a private IP address:
     1. Run `kubectl get services --all-namespaces`.
     2. From the list of services, find  the **prometheus-xxxx-server** service under the **Name** column, and copy the **EXTERNAL-IP** address.  
-   For example, in the below list of services, `192.168.2.150` is the external IP address for the service `prometheus-1619158141-server`.  
+   For example, in the below list of services, `192.168.2.150` is the external IP address for the service `prometheus-1619158141-server`.
+   
 		NAMESPACE	|	NAME	|	TYPE	|	CLUSTER-IP	|	EXTERNAL-IP	|	PORT(S)	|	AGE  
 		---------	|	----	|	----	|	----------	|	-----------	|	-------	|	----  
 		default	|	kubernetes	|	ClusterIP	|	10.96.0.1	|	none	|	443/TCP	|	107m  
@@ -92,4 +93,54 @@ Prometheus is installed:
 
 __Note:__ 
 * If Prometheus is installed through Slurm without installing Kubernetes, then it will be removed when Kubernetes is installed because Prometheus would be running as a pod. 
-* Only a single instance of Prometheus is installed when both Kubernetes and Slurm are installed.
+* Only a single instance of Prometheus is installed when both Kubernetes and Slurm are installed.
+
+## Accessing Cluster metrics (fetched by Prometheus) on Grafana 
+
+* Once `control_plane.yml` is run, Prometheus is added to Grafana as a datasource (hpc-prometheus). This allows Grafana to display statistics from the Compute Nodes that have been polled using Prometheus on the Management Station.
+
+* Select the dashboard (![Dashboard Icon](Telemetry_Visualization/Images/DashBoardIcon.PNG)) tab to view the list of Prometheus-based dashboards. Some default dashboards include CoreDNS, Prometheus Overview, Kubernetes Networking, etc.
+
+>> __Note:__ Both the control plane and HPC clusters can be monitored on these dashboards by toggling the datasource at the top of each dashboard. 
+
+## Accessing Control Plane metrics (fetched by Prometheus) on Grafana
+
+* Once `control_plane.yml` is run, Prometheus is added to Grafana as a datasource. This allows Grafana to display statistics from the Control Plane that have been polled using Prometheus.
+
+![Prometheus DataSource](Telemetry_Visualization/Images/Prometheus_DataSource.jpg)
+
+* Select the dashboard (![Dashboard Icon](Telemetry_Visualization/Images/DashBoardIcon.PNG)) tab to view the list of Prometheus-based dashboards. Some default dashboards include CoreDNS, Prometheus Overview, Kubernetes Networking, etc.
+
+>> __Note:__ Both the control plane and HPC clusters can be monitored on these dashboards by toggling the datasource at the top of each dashboard:
+
+| Data Source | Description | Source |
+|-------------|-------------|--------|
+| hpc-prometheus-manager-nodeIP | Monitors the Kubernetes and Slurm clusters on the manager and compute nodes. | This datasource is set up when `omnia.yml` is run. |
+| control_plane_prometheus | Monitors the single-node cluster running on the Management Station. | This datasource is set up when `control_plane.yml` is run. |
+
+
+![Prometheus DataSource](Telemetry_Visualization/Images/Prometheus_Dashboard.jpg)
+
+
+
+
+| Type        | Subtype           | Dashboard Name                    | Available DataSources                                    |
+|-------------|-------------------|-----------------------------------|----------------------------------------------------------|
+|             |                   | CoreDNS                           | control-plane-prometheus, hpc-prometheus-manager-nodeIP |
+| Kubernetes  |                   | API Types                         | control-plane-prometheus, hpc-prometheus-manager-nodeIP |
+| Kubernetes  | Compute Resources | Cluster                           | control-plane-prometheus, hpc-prometheus-manager-nodeIP |
+| Kubernetes  | Compute Resources | Namespace (Pods)                  | control-plane-prometheus, hpc-prometheus-manager-nodeIP |
+| Kubernetes  | Compute Resources | Node (Pods)                       | control-plane-prometheus, hpc-prometheus-manager-nodeIP |
+| Kubernetes  | Compute Resources | Pod                               | control-plane-prometheus, hpc-prometheus-manager-nodeIP |
+| Kubernetes  | Compute Resources | Workload                          | control-plane-prometheus, hpc-prometheus-manager-nodeIP |
+| Kubernetes  |                   | Kubelet                           | control-plane-prometheus, hpc-prometheus-manager-nodeIP |
+| Kubernetes  | Networking        | Cluster                           | control-plane-prometheus, hpc-prometheus-manager-nodeIP |
+| Kubernetes  | Networking        | Namespace (Pods)                  | control-plane-prometheus, hpc-prometheus-manager-nodeIP |
+| Kubernetes  | Networking        | Namespace (Workload)              | control-plane-prometheus, hpc-prometheus-manager-nodeIP |
+| Kubernetes  | Networking        | Pod                               | control-plane-prometheus, hpc-prometheus-manager-nodeIP |
+| Kubernetes  | Networking        | Workload                          | control-plane-prometheus, hpc-prometheus-manager-nodeIP |
+| Kubernetes  |                   | Scheduler                         | control-plane-prometheus, hpc-prometheus-manager-nodeIP |
+| Kubernetes  |                   | Stateful Sets                     | control-plane-prometheus, hpc-prometheus-manager-nodeIP |
+|             |                   | Prometheus Overview               | control-plane-prometheus, hpc-prometheus-manager-nodeIP |
+| Slurm       |                   | CPUs/GPUs, Jobs, Nodes, Scheduler | hpc-prometheus-manager-nodeIP                            |
+| Slurm       |                   | Node Exporter Server Metrics      | hpc-prometheus-manager-nodeIP                            |
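
As an illustration, either Prometheus instance can also be queried directly over its HTTP API once its address is known; a minimal sketch using the example external IP from the service listing above:

```sh
# Query the Prometheus HTTP API (address illustrative; use the EXTERNAL-IP
# reported by "kubectl get services --all-namespaces").
curl -s 'http://192.168.2.150/api/v1/query?query=up'
```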

+ 2 - 38
docs/README.md

@@ -29,6 +29,7 @@ Omnia can install Kubernetes or Slurm (or both), along with additional drivers,
 ## What's new in this release
 * Extended support of Leap OS on Management station, login, compute and NFS nodes.
 * Omnia now supports Powervault configurations with 2 network interfaces.
+* Omnia now supports multi-profile creation and multi-cluster provisioning using Cobbler.
 * Provisioning of Rocky custom ISO on supported PowerEdge servers using iDRAC.
 * Configuring Dell EMC networking switches, Mellanox InfiniBand switches, and PowerVault storage devices in the cluster. 
 * An option to configure a login node with the same configurations as the compute nodes in the cluster. With appropriate user privileges provided by the cluster administrator, users can log in to the login node and schedule Slurm jobs. The authentication mechanism in the login node uses the FreeIPA solution.
@@ -39,7 +40,7 @@ The Omnia Control Plane will automate the entire cluster deployment process, sta
 For detailed instructions, see [Install the Omnia Control Plane](INSTALL_OMNIA_CONTROL_PLANE.md).  
 
 ## Installing Omnia to servers with a pre-provisioned OS
-Omnia can be deployed on clusters that already have an RPM-based Linux OS running on them and are all connected to the Internet. Currently, all Omnia testing is done on [CentOS](https://centos.org). Please see [Example system designs](EXAMPLE_SYSTEM_DESIGNS.md) for instructions on the network setup.
+Omnia can be deployed on clusters that already have an RPM-based Linux OS running on them and are all connected to the Internet. Currently, all Omnia testing is done using the software versions mentioned [here](README.md#System-requirements). Please see [Example system designs](EXAMPLE_SYSTEM_DESIGNS.md) for instructions on the network setup.
 
 Once servers have functioning OS and networking, you can use Omnia to install and start Slurm and/or Kubernetes. For detailed instructions, see [Install Omnia using CLI](INSTALL_OMNIA.md). 
 
@@ -117,43 +118,6 @@ dellemc.os10	|	GNU-General Public License v3.1	|	1.1.1	|	It provides networking
 Genisoimage-dnf	|	GPL v3	|	1.1.11	|	Genisoimage is a pre-mastering program for creating ISO-9660 CD-ROM  filesystem images
 OMSDK	|	Apache-2.0	|	1.2.456	|	Dell EMC OpenManage Python SDK (OMSDK) is a python library that helps developers and customers to automate the lifecycle management of PowerEdge Servers
 
-# Supported interface keys of PowerSwitch S3048-ON (ToR Switch)
-The following table provides details about the interface keys supported by the S3048-ON ToR Switch. Dell EMC Networking OS10 Enterprise Edition is the supported operating system.
-
-Interface key name	|	Type	|	Description
----------	|   ----	|	-----------
-desc	|	string	|	Configures a single line interface description
-portmode	|	string	|	Configures port mode according to the device type
-switchport	|	boolean: true, false*	|	Configures an interface in L2 mode
-admin	|	string: up, down*	|	Configures the administrative state for the interface; configuring the value as administratively "up" enables the interface; configuring the value as administratively "down" disables the interface
-mtu	|	integer	|	Configures the MTU size for L2 and L3 interfaces (1280 to 65535)
-speed	|	string: auto, 1000, 10000, 25000, ...	|	Configures the speed of the interface
-fanout	|	string: dual, single; string:10g-4x, 40g-1x, 25g-4x, 100g-1x, 50g-2x (os10)	|	Configures fanout to the appropriate value
-suppress_ra	|	string: present, absent	|	Configures IPv6 router advertisements if set to present
-ip_type_dynamic	|	boolean: true, false	|	Configures IP address DHCP if set to true (ip_and_mask is ignored if set to true)
-ipv6_type_dynamic	|	boolean: true, false	|	Configures an IPv6 address for DHCP if set to true (ipv6_and_mask is ignored if set to true)
-ipv6_autoconfig	|	boolean: true, false	|	Configures stateless configuration of IPv6 addresses if set to true (ipv6_and_mask is ignored if set to true)
-vrf	|	string	|	Configures the specified VRF to be associated to the interface
-min_ra	|	string	|	Configures RA minimum interval time period
-max_ra	|	string	|	Configures RA maximum interval time period
-ip_and_mask	|	string	|	Configures the specified IP address to the interface
-ipv6_and_mask	|	string	|	Configures a specified IPv6 address to the interface
-virtual_gateway_ip	|	string	|	Configures an anycast gateway IP address for a VXLAN virtual network as well as VLAN interfaces
-virtual_gateway_ipv6	|	string	|	Configures an anycast gateway IPv6 address for VLAN interfaces
-state_ipv6	|	string: absent, present*	|	Deletes the IPV6 address if set to absent
-ip_helper	|	list	|	Configures DHCP server address objects (see ip_helper.*)
-ip_helper.ip	|	string (required)	|	Configures the IPv4 address of the DHCP server (A.B.C.D format)
-ip_helper.state	|	string: absent, present*	|	Deletes the IP helper address if set to absent
-flowcontrol	|	dictionary	|	Configures the flowcontrol attribute (see flowcontrol.*)
-flowcontrol.mode	|	string: receive, transmit	|	Configures the flowcontrol mode
-flowcontrol.enable	|	string: on, off	|	Configures the flowcontrol mode on
-flowcontrol.state	|	string: absent, present	|	Deletes the flowcontrol if set to absent
-ipv6_bgp_unnum	|	dictionary	|	Configures the IPv6 BGP unnum attributes (see ipv6_bgp_unnum.*) below
-ipv6_bgp_unnum.state	|	string: absent, present*	|	Disables auto discovery of BGP unnumbered peer if set to absent
-ipv6_bgp_unnum.peergroup_type	|	string: ebgp, ibgp	|	Specifies the type of template to inherit from
-stp_rpvst_default_behaviour	|	boolean: false, true	|	Configures RPVST default behavior of BPDU's when set to True, which is default
-
-* *(Asterisk) denotes the default value.
 
 # Known issues  
 * **Issue**: Hosts are not displayed on the AWX UI.  

+ 45 - 1
docs/Security/Enable_Security_ManagementStation.md

@@ -1,6 +1,8 @@
 # Enabling Security on the Management Station and Login Node
 
-## Enabling FreeIPA on the Management Station:
+Omnia uses FreeIPA to enable security features such as authorization and access control.
+
+## Enabling Authentication on the Management Station:
 
 Set the parameter 'enable_security_support' to true in `base_vars.yml`
 
@@ -20,4 +22,46 @@ Set the parameter 'enable_security_support' to true in `base_vars.yml`
 | ipa_admin_password         |               | "admin" user password for the IPA server                                                         |
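
For reference, a minimal sketch of the corresponding `base_vars.yml` entries (values are placeholders):

```yaml
# base_vars.yml (illustrative values)
enable_security_support: true
ipa_admin_password: "<ipa-admin-password>"   # "admin" user password for the IPA server
```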
 
 
+## Log Aggregation via Grafana
+
+[Loki](https://grafana.com/docs/loki/latest/fundamentals/overview/) is a datastore used to efficiently hold log data for security purposes. Using the `promtail` agent, logs are collated and streamed via an HTTP API.
+
+>> __Note:__ When `control_plane.yml` is run, Loki is automatically set up as a data source on the Grafana UI.
+
+
+
+### Querying Loki 
+
+Loki uses a basic regex-based syntax to filter for specific jobs, dates, or timestamps.
+
+* Select the Explore ![Explore Icon](Telemetry_Visualization/Images/ExploreIcon.PNG) tab and select control-plane-loki from the drop-down.
+* Using [LogQL queries](https://grafana.com/docs/loki/latest/logql/log_queries/), all logs in `/var/log` can be accessed using filters (e.g. `{job="Omnia"}`); see the sketch below.
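
These filters can also be exercised against Loki's HTTP API directly; a hedged sketch, assuming Loki listens on its default port 3100 on the management station:

```sh
# Fetch recent log lines for the "Omnia" job through the Loki query API
curl -G -s 'http://localhost:3100/loki/api/v1/query_range' \
  --data-urlencode 'query={job="Omnia"}'
```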
+
+## Viewing Logs on the Dashboard
+
+All log files can be viewed via the Dashboard tab (![Dashboard Icon](Telemetry_Visualization/Images/DashBoardIcon.PNG)). The Default Dashboard displays `omnia.log` and `syslog`. Custom dashboards can be created per user requirements.
+
+Below is a list of all logs available to Loki that can be accessed on the dashboard:
+
+| Name               | Location                                  | Purpose                      | Additional Information                                                                             |
+|--------------------|-------------------------------------------|------------------------------|------------------------------------------------------------------------------------------------------|
+| Omnia Logs         | /var/log/omnia.log                        | Omnia Log                    | This log is configured by default                                                                  |
+| syslogs            | /var/log/messages                         | System Logging               | This log is configured by default                                                                  |
+| Audit Logs         | /var/log/audit/audit.log                  | All Login Attempts           | This log is configured by default                                                                  |
+| CRON logs          | /var/log/cron                             | CRON Job Logging             | This log is configured by default                                                                  |
+| Pods logs          | /var/log/pods/*/*/*log                    | k8s pods                     | This log is configured by default                                                                  |
+| Access Logs        | /var/log/dirsrv/slapd-<Realm Name>/access | Directory Server Utilization | This log is available when FreeIPA is set up (i.e., when enable_security_support is set to 'true') |
+| Error Log          | /var/log/dirsrv/slapd-<Realm Name>/errors | Directory Server Errors      | This log is available when FreeIPA is set up (i.e., when enable_security_support is set to 'true') |
+| CA Transaction Log | /var/log/pki/pki-tomcat/ca/transactions   | FreeIPA PKI Transactions     | This log is available when FreeIPA is set up (i.e., when enable_security_support is set to 'true') |
+| KRB5KDC            | /var/log/krb5kdc.log                      | KDC Utilization              | This log is available when FreeIPA is set up (i.e., when enable_security_support is set to 'true') |
+| Secure logs        | /var/log/secure                           | Login Error Codes            | This log is available when FreeIPA is set up (i.e., when enable_security_support is set to 'true') |
+| HTTPD logs         | /var/log/httpd/*                          | FreeIPA API Call             | This log is available when FreeIPA is set up (i.e., when enable_security_support is set to 'true') |
+| DNF logs           | /var/log/dnf.log                          | Installation Logs            | This log is configured on Rocky OS                                                                 |
+| Zypper Logs        | /var/log/zypper.log                       | Installation Logs            | This log is configured on Leap OS                                                                  |
+
+
+
+
+
+
 

+ 0 - 105
docs/TelemetryAndMonitoring/MONITOR_CLUSTERS.md

@@ -1,105 +0,0 @@
-# Monitor Kubernetes and Slurm
-Omnia provides playbooks to configure additional software components for Kubernetes such as JupyterHub and Kubeflow. For workload management (submitting, conrolling, and managing jobs) of HPC, AI, and Data Analytics clusters, you can access Kubernetes and Slurm dashboards and other supported applications. 
-
-## Before accessing the dashboards
-To access any of the dashboards, ensure that a compatible web browser is installed. If you are connecting remotely to your Linux server by using MobaXterm version later than 8 or other X11 Clients though *ssh*, follow the below mentioned steps to launch the Firefox Browser:  
-* On the management station:
-	1. Connect using *ssh*. Run `ssh <user>@<IP-address>`, where *IP-address* is the private IP of the management station.
-	2. `dnf install mesa-libGL-devel -y`
-	3. `dnf install firefox -y`
-	4. `dnf install xorg-x11-xauth`
-	5. `export DISPLAY=:10.0`
-	6. `logout and login back`
-	7. To launch Firefox from terminal, run `firefox&`.  
-	
-* On the manager node:
-	1. Connect using *ssh*. Run `ssh <user>@<IP-address>`, where *IP-address* is the private IP of the manager node.
-	2. `yum install firefox -y`
-	3. `yum install xorg-x11-xauth`
-	4. `export DISPLAY=:10.0`
-	5. `logout and login back`
-	6. To launch Firefox from terminal, run `firefox&`
-
-**NOTE**: When the PuTTY or MobaXterm session ends, you must run **export DISPLAY=:10.0** command each time, else Firefox cannot be launched again.  
-
-## Access FreeIPA Dashboard  
-The FreeIPA Dashboard can be accessed from the management station, manager, and login nodes. To access the dashboard:
-1.	Install the Firefox Browser.
-2.	Open the Firefox Browser and enter the url: `https://<hostname>`. For example, enter `https://manager.example.com`.
-3.	Enter the username and password. If the admin or user has obtained a Kerberos ticket, then the credentials need not be provided.  
-
-**Note**: To obtain a Kerberos ticket, perform the following actions:
-1. Enter `kinit <username>`
-2. When prompted, enter the password.
-
-An administrator can create users on the login node using FreeIPA. The users will be prompted to change the passwords upon first login.
-
-## Access Kuberentes Dashboard
-1. To verify if the **Kubernetes-dashboard** service is in the Running state, run `kubectl get pods --namespace kubernetes-dashboard`.
-2. To start the Kubernetes dashboard, run `kubectl proxy`.
-3. To retrieve the encrypted token, run `kubectl get secret -n kubernetes-dashboard $(kubectl get serviceaccount admin-user -n kubernetes-dashboard -o jsonpath="{.secrets[0].name}") -o jsonpath="{.data.token}" | base64 --decode`.
-4. Copy the encrypted token value.
-5. On a web browser on the management station (for control_plane.yml) or manager node (for omnia.yml) enter http://localhost:8001/api/v1/namespaces/kubernetes-dashboard/services/https:kubernetes-dashboard:/proxy/.
-6. Select the authentication method as __Token__.
-7. On the Kuberenetes Dashboard, paste the copied encrypted token and click **Sign in** to access the Kubernetes Dashboard.
-
-## Access Kubeflow Dashboard
-1. Before accessing the Kubeflow Dashboard, run `kubectl -n kubeflow get applications -o yaml profiles`. Wait till **profiles-deployment** enters the Ready state.
-2. To retrieve the **External IP** or **CLUSTER IP**, run `kubectl get services istio-ingressgateway --namespace istio-system`.
-3. On a web browser installed on the manager node, enter the **External IP** or **Cluster IP** to open the Kubeflow Central Dashboard.  
-
-For more information about the Kubeflow Central Dashboard, see https://www.kubeflow.org/docs/components/central-dash/overview/.
-
-## Access JupyterHub Dashboard
-
-1. To verify if the JupyterHub services are running, run `kubectl get pods --namespace jupyterhub`.
-2. Ensure that the pod names starting with __hub__ and __proxy__ are in the **Running** state.
-3. To retrieve the **External IP** or **CLUSTER IP**, run `kubectl get services proxy-public --namespace jupyterhub`.
-4. On a web browser installed on the manager node, enter the **External IP** or **Cluster IP** to open the JupyterHub Dashboard.
-5. JupyterHub is running with a default dummy authenticator. Enter any username and password combination to access the dashboard.
-
-For more information about configuring username and password, and to access the JupyterHub Dashboard, see https://zero-to-jupyterhub.readthedocs.io/en/stable/jupyterhub/customization.html.
-
-## Access Prometheus UI
-
-Prometheus is installed:
-  * As a Kubernetes role (**A**), when both Slurm and Kubernetes are installed.
-  * On the host when only Slurm is installed (**B**).
-
-**A**. When Prometheus is installed as a Kubernetes role.  
-* Access Prometheus with local host:  
-    1. Run the following commands:  
-       `export POD_NAME=$(kubectl get pods --namespace default -l "app=prometheus,component=server" -o jsonpath="{.items[0].metadata.name}")`  
-       `echo $POD_NAME`  
-       `kubectl --namespace default port-forward $POD_NAME 9090`  
-    2. To launch the Prometheus UI, in the web browser, enter `http://localhost:9090`.
-  
-* Access Prometheus with a private IP address:
-    1. Run `kubectl get services --all-namespaces`.
-    2. From the list of services, find  the **prometheus-xxxx-server** service under the **Name** column, and copy the **EXTERNAL-IP** address.  
-   For example, in the below list of services, `192.168.2.150` is the external IP address for the service `prometheus-1619158141-server`.
-   
-		NAMESPACE	|	NAME	|	TYPE	|	CLUSTER-IP	|	EXTERNAL-IP	|	PORT(S)	|	AGE  
-		---------	|	----	|	----	|	----------	|	-----------	|	-------	|	----  
-		default	|	kubernetes	|	ClusterIP	|	10.96.0.1	|	none	|	443/TCP	|	107m  
-		default	|	**prometheus-1619158141-server**	|	LoadBalancer	|	10.97.40.140	|	**192.168.2.150**	|	80:31687/TCP	|	106m  
-    3. To open Firefox, run `firefox&`.
-    4. Enter the copied External IP address to access Prometheus. For example, enter `192.168.2.150` to access Prometheus UI.
-
-**B**. When Prometheus is installed on the host.
-1. Navigate to Prometheus folder. The default path is `/var/lib/prometheus-2.23.0.linux-amd64/`.
-2. Start the web server: `./prometheus`.  
-3. To launch the Prometheus UI, in the web browser, enter `http://localhost:9090`. 
-
-__Note:__ 
-* If Prometheus is installed through Slurm without installing Kubernetes, then it will be removed when Kubernetes is installed because Prometheus would be running as a pod. 
-* Only a single instance of Prometheus is installed when both Kubernetes and Slurm are installed.
-
-## Accessing Prometheus data via Grafana UI (On the Management Station)
-
-* Once `control_plane.yml` is run, Prometheus is added to Grafana as a datasource (hpc-prometheus). This allows Grafana to display statistics from the Compute Nodes that have been polled using Prometheus.
-
-* Select the dashboard (![Dashboard Icon](Images/DashBoardIcon.PNG)) tab to view the list of Prometheus based dashboards. Some default dashboards include CoreDNS, Prometheus Overview, Kuberenetes Networking etc.
-
->> __Note:__ Both the control plane and HPC clusters can be monitored on these dashboards by toggling the datasource at the top of each dashboard. 
-

+ 0 - 67
docs/TelemetryAndMonitoring/Monitor_Control_Plane.md

@@ -1,67 +0,0 @@
-# Monitoring The Management Station
-
-To monitor the Management Station, Omnia uses the Grafana UI with a Loki integration (This can be set up using the steps provided [here](Install_Telemetry.md)).  
-
-
-## Accessing Loki via Grafana
-
-[Loki](https://grafana.com/docs/loki/latest/fundamentals/overview/) is a datastore used to efficiently hold log data for security purposes. Using the `promtail` agent, logs are collated and streamed via a HTTP API.
-
->> __Note:__ When `control_plane.yml` is run, Loki is automatically set up as a data source on the Grafana UI.
-
-
-
-### Querying Loki 
-
-Loki uses basic regex based syntax to filter for specific jobs, dates or timestamps.
-
-* Select the Explore ![Explore Icon](Images/ExploreIcon.PNG) tab to select control-plane-loki from the drop down.
-* Using [LogQL queries](https://grafana.com/docs/loki/latest/logql/log_queries/), all logs in `/var/log` can be accessed using filters (Eg: `{job=”Omnia”}` )
-
-## Viewing Logs on the Dashboard
-
-All log files can be viewed via the Dashboard tab (![Dashboard Icon](Images/DashBoardIcon.PNG)). The Default Dashboard displays `omnia.log` and `syslog`. Custom dashboards can be created per user requirements.
-
-## Accessing Prometheus data via Grafana
-
-* Once `control_plane.yml` is run, Prometheus is added to Grafana as a datasource. This allows Grafana to display statistics from the Control Plane that have been polled using Prometheus.
-
-![Prometheus DataSource](Images/Prometheus_DataSource.jpg)
-
-* Select the dashboard (![Dashboard Icon](Images/DashBoardIcon.PNG)) tab to view the list of Prometheus based dashboards. Some default dashboards include CoreDNS, Prometheus Overview, Kuberenetes Networking etc.
-
->> __Note:__ Both the control plane and HPC clusters can be monitored on these dashboards by toggling the datasource at the top of each dashboard:
-
-| Data Source | Description | Source |
-|-------------|-------------|--------|
-|  hpc-prometheus-headnodeIP            | Manages the Kuberenetes and Slurm Cluster on the Manager and Compute nodes.            |  This datasource is set up when `Omnia.yml` is run.      |
-| control_plane_prometheus            | Monitors the Single Node cluster running on the Management Station            | This datasource is set up when `control_plane.yml` is run.        |
-
-
-![Prometheus DataSource](Images/Prometheus_Dashboard.jpg)
-
-
-
-
-| Type        | Subtype           | Dashboard Name                    | Available DataSources                               |
-|-------------|-------------------|-----------------------------------|-----------------------------------------------------|
-|             |                   | CoreDNS                           | control-plane-prometheus, hpc-prometheus-headnodeIP |
-| Kuberenetes |                   | API Types                         | control-plane-prometheus, hpc-prometheus-headnodeIP |
-| Kuberenetes | Compute Resources | Cluster                           | control-plane-prometheus, hpc-prometheus-headnodeIP |
-| Kuberenetes | Compute Resources | Namespace (Pods)                  | control-plane-prometheus, hpc-prometheus-headnodeIP |
-| Kuberenetes | Compute Resources | Node (Pods)                       | control-plane-prometheus, hpc-prometheus-headnodeIP |
-| Kuberenetes | Compute Resources | Pod                               | control-plane-prometheus, hpc-prometheus-headnodeIP |
-| Kuberenetes | Compute Resources | Workload                          | control-plane-prometheus, hpc-prometheus-headnodeIP |
-| Kuberenetes |                   | Kubelet                           | control-plane-prometheus, hpc-prometheus-headnodeIP |
-| Kuberenetes | Networking        | Cluster                           | control-plane-prometheus, hpc-prometheus-headnodeIP |
-| Kuberenetes | Networking        | Namespace (Pods)                  | control-plane-prometheus, hpc-prometheus-headnodeIP |
-| Kuberenetes | Networking        | Namespace (Workload)              | control-plane-prometheus, hpc-prometheus-headnodeIP |
-| Kuberenetes | Networking        | Pod                               | control-plane-prometheus, hpc-prometheus-headnodeIP |
-| Kuberenetes | Networking        | Workload                          | control-plane-prometheus, hpc-prometheus-headnodeIP |
-| Kuberenetes |                   | Scheduler                         | control-plane-prometheus, hpc-prometheus-headnodeIP |
-| Kuberenetes |                   | Stateful Sets                     | control-plane-prometheus, hpc-prometheus-headnodeIP |
-|             |                   | Prometheus Overview               | control-plane-prometheus, hpc-prometheus-headnodeIP |
-| Slurm       |                   | CPUs/GPUs, Jobs, Nodes, Scheduler | hpc-prometheus-headnodeIP                           |
-| Slurm       |                   | Node Exporter Server Metrics      | hpc-prometheus-headnodeIP                           |
-
-

docs/TelemetryAndMonitoring/Images/DashBoardIcon.PNG → docs/Telemetry_Visualization/Images/DashBoardIcon.PNG


docs/TelemetryAndMonitoring/Images/ExploreIcon.PNG → docs/Telemetry_Visualization/Images/ExploreIcon.PNG


docs/TelemetryAndMonitoring/Images/Prometheus_Dashboard.jpg → docs/Telemetry_Visualization/Images/Prometheus_Dashboard.jpg


docs/TelemetryAndMonitoring/Images/Prometheus_DataSource.jpg → docs/Telemetry_Visualization/Images/Prometheus_DataSource.jpg


+ 4 - 2
docs/TelemetryAndMonitoring/Install_Telemetry.md

@@ -1,6 +1,8 @@
-# Setting Up Telemetry
+# Setting Up Grafana
 
-Using Grafana, users can poll multiple devices and create graphs/visualizations of key statistics.
+Using Grafana, users can poll multiple devices and create graphs/visualizations of key system metrics such as temperature, system power consumption, memory usage, I/O usage, CPU usage, total memory power, system output power, total fan power, total storage power, system input power, total CPU power, RPM readings, total heat dissipation, power-to-cool ratio, and system airflow efficiency.
+
+Many of these metrics are collected using iDRAC telemetry, which allows you to stream telemetry data from your servers to a centralized log/metrics server. For more information on iDRAC telemetry, click [here](https://github.com/dell/iDRAC-Telemetry-Scripting).
 
 ## Prerequisites
 

+ 1 - 1
docs/control_plane/device_templates/PROVISION_SERVERS.md

@@ -79,7 +79,7 @@ To access the Cobbler dashboard, enter `https://<IP>/cobbler_web` where `<IP>` i
 
 >>__Note__: If you want to add more nodes, append the new nodes in the existing mapping file. However, do not modify the previous nodes in the mapping file as it may impact the existing cluster.
 
->> __Note__: During a Cobbler based deployment, only one OS is supported at a time. If the user would like to deploy both, please deploy one first, **unmount `/mnt/iso`** and then re-run Cobbler for the second OS.
+>> __Note__: With the addition of multiple profiles, the Cobbler container dynamically updates the mount point based on the value of `provision_os` in `base_vars.yml`.
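
As a hedged illustration, switching the OS that the Cobbler container serves is then a matter of editing one value (the variable name is taken from the note above; the value shown is a placeholder):

```yaml
# base_vars.yml (illustrative)
provision_os: "rocky"
```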
 
 ## Security enhancements  
 Omnia provides the following options to enhance security on the provisioned PowerEdge servers:

File diff suppressed because it is too large
+ 39 - 0
docs/control_plane/input_parameters/POWERSWITCHES.md