Browse Source

Merge branch 'dellhpc:devel' into awx_logo

abhishek-sa1 3 years ago
parent
commit
929ce13738
30 changed files with 609 additions and 116 deletions
  1. 4 4
      control_plane/input_params/base_vars.yml
  2. 2 1
      control_plane/roles/provision_cobbler/files/Dockerfile_leap
  3. 2 1
      control_plane/roles/provision_cobbler/files/Dockerfile_rocky
  4. 6 18
      control_plane/roles/provision_cobbler/files/cobbler_configurations_leap.yml
  5. 9 9
      control_plane/roles/provision_cobbler/files/cobbler_configurations_rocky.yml
  6. 10 4
      control_plane/roles/provision_cobbler/files/inventory_creation.yml
  7. 1 1
      control_plane/roles/provision_cobbler/files/k8s_cobbler.yml
  8. 85 0
      control_plane/roles/provision_cobbler/files/multi_cluster_provisioning.yml
  9. 3 2
      control_plane/roles/provision_cobbler/files/tftp.yml
  10. 37 3
      control_plane/roles/provision_cobbler/tasks/check_prerequisites.yml
  11. 3 3
      control_plane/roles/provision_cobbler/tasks/cobbler_image.yml
  12. 23 7
      control_plane/roles/provision_cobbler/tasks/configure_cobbler.yml
  13. 48 41
      control_plane/roles/provision_cobbler/tasks/main.yml
  14. 2 2
      control_plane/roles/provision_cobbler/tasks/mapping_file.yml
  15. 6 6
      control_plane/roles/provision_cobbler/tasks/mount_iso.yml
  16. 67 0
      control_plane/roles/provision_cobbler/tasks/multi_profile_creation.yml
  17. 5 1
      control_plane/roles/provision_cobbler/tasks/provision_password.yml
  18. 6 3
      control_plane/roles/provision_cobbler/vars/main.yml
  19. 13 0
      docs/FAQ.md
  20. 8 1
      docs/INSTALL_OMNIA_CONTROL_PLANE.md
  21. 9 9
      docs/README.md
  22. 23 0
      docs/Security/Enable_Security_ManagementStation.md
  23. BIN
      docs/TelemetryAndMonitoring/Images/DashBoardIcon.PNG
  24. BIN
      docs/TelemetryAndMonitoring/Images/ExploreIcon.PNG
  25. BIN
      docs/TelemetryAndMonitoring/Images/Prometheus_Dashboard.jpg
  26. BIN
      docs/TelemetryAndMonitoring/Images/Prometheus_DataSource.jpg
  27. 51 0
      docs/TelemetryAndMonitoring/Install_Telemetry.md
  28. 105 0
      docs/TelemetryAndMonitoring/MONITOR_CLUSTERS.md
  29. 67 0
      docs/TelemetryAndMonitoring/Monitor_Control_Plane.md
  30. 14 0
      docs/control_plane/device_templates/CONFIGURE_INFINIBAND_SWITCHES.md

+ 4 - 4
control_plane/input_params/base_vars.yml

@@ -90,16 +90,16 @@ provision_method: "idrac"
 provision_state: "stateful"
 provision_state: "stateful"
 
 
 # This is the operating system image that will be used for provisioning compute nodes in the cluster.
 # This is the operating system image that will be used for provisioning compute nodes in the cluster.
-# Accepted values: rocky, centos
+# Accepted values: rocky, centos, leap
 # Default value: "rocky"
 # Default value: "rocky"
 # Required field
 # Required field
 provision_os: "rocky"
 provision_os: "rocky"
 
 
 # This is the path where the user has to place the iso image that needs to be provisioned in target nodes.
 # This is the path where the user has to place the iso image that needs to be provisioned in target nodes.
-# The iso file should be Rocky8-Minimal or CentOS7-2009-minimal edition.
+# The iso file should be the Rocky8-Minimal, CentOS7-2009-minimal, or openSUSE-Leap-15.3-DVD edition.
 # Other iso files are not supported.
 # Other iso files are not supported.
 # Required field
 # Required field
-iso_file_path: "/root/Rocky-8.4-x86_64-minimal.iso"
+iso_file_path: "/root/Rocky-8.5-x86_64-minimal.iso"
 
 
 # This is the timezone that will be set during provisioning of OS
 # This is the timezone that will be set during provisioning of OS
 # Available timezones are provided in control_plane/common/files/timezone.txt
 # Available timezones are provided in control_plane/common/files/timezone.txt
@@ -175,4 +175,4 @@ ib_network_nic: "ib0"
 # The dhcp range for assigning the IPv4 address
 # The dhcp range for assigning the IPv4 address
 # Example: 172.17.0.1
 # Example: 172.17.0.1
 ib_network_dhcp_start_range: "172.25.0.100"
 ib_network_dhcp_start_range: "172.25.0.100"
-ib_network_dhcp_end_range: "172.25.0.200"
+ib_network_dhcp_end_range: "172.25.0.200"

+ 2 - 1
control_plane/roles/provision_cobbler/files/Dockerfile_leap

@@ -59,6 +59,7 @@ COPY .users.digest /etc/cobbler/users.digest
 COPY cobbler_configurations_leap.yml /root
 COPY cobbler_configurations_leap.yml /root
 COPY tftp.yml /root
 COPY tftp.yml /root
 COPY inventory_creation.yml /root
 COPY inventory_creation.yml /root
+COPY multi_cluster_provisioning.yml /root
 
 
 EXPOSE 69 80 443 25151
 EXPOSE 69 80 443 25151
 
 
@@ -67,4 +68,4 @@ VOLUME [ "/srv/www/cobbler", "/var/lib/cobbler/backup", "/mnt" ]
 RUN systemctl enable apache2
 RUN systemctl enable apache2
 RUN systemctl enable dhcpd
 RUN systemctl enable dhcpd
 
 
-CMD ["sbin/init"]
+CMD ["sbin/init"]

+ 2 - 1
control_plane/roles/provision_cobbler/files/Dockerfile_rocky

@@ -58,6 +58,7 @@ COPY .users.digest /etc/cobbler/users.digest
 COPY cobbler_configurations_rocky.yml /root
 COPY cobbler_configurations_rocky.yml /root
 COPY tftp.yml /root
 COPY tftp.yml /root
 COPY inventory_creation.yml /root
 COPY inventory_creation.yml /root
+COPY multi_cluster_provisioning.yml /root
 
 
 EXPOSE 69 80 443 25151
 EXPOSE 69 80 443 25151
 
 
@@ -66,4 +67,4 @@ VOLUME [ "/var/www/cobbler", "/var/lib/cobbler/backup", "/mnt" ]
 RUN systemctl enable httpd
 RUN systemctl enable httpd
 RUN systemctl enable dhcpd
 RUN systemctl enable dhcpd
 
 
-CMD ["sbin/init"]
+CMD ["sbin/init"]

+ 6 - 18
control_plane/roles/provision_cobbler/files/cobbler_configurations_leap.yml

@@ -1,4 +1,4 @@
-#  Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 #  you may not use this file except in compliance with the License.
@@ -77,21 +77,21 @@
     shell: export PATH="/usr/bin/curl:$PATH"
     shell: export PATH="/usr/bin/curl:$PATH"
 
 
   - name: Run import command
   - name: Run import command
-    command: cobbler import --arch=x86_64 --path=/mnt --name="{{ name_iso }}"
+    command: cobbler import --arch=x86_64 --path=/mnt/{{ provision_os }} --name="{{ provision_os }}"
     changed_when: false
     changed_when: false
 
 
   - name: Kickstart profile - leap
   - name: Kickstart profile - leap
     copy:
     copy:
       src: "/root/leap15.xml"
       src: "/root/leap15.xml"
       dest: "/var/lib/cobbler/templates/sample_autoyast.xml"
       dest: "/var/lib/cobbler/templates/sample_autoyast.xml"
-      mode: 0775
+      mode: "{{ file_perm }}"
     tags: install
     tags: install
 
 
   - name: Pxe menu
   - name: Pxe menu
     copy:
     copy:
       src: "/root/omnia/control_plane/roles/provision_cobbler/files/menu.yml"
       src: "/root/omnia/control_plane/roles/provision_cobbler/files/menu.yml"
       dest: "/etc/cobbler/boot_loader_conf/pxedefault.template"
       dest: "/etc/cobbler/boot_loader_conf/pxedefault.template"
-      mode: 0775
+      mode: "{{ file_perm }}"
     tags: install
     tags: install
 
 
   - name: Assign default grub option
   - name: Assign default grub option
@@ -127,26 +127,14 @@
     changed_when: false
     changed_when: false
     register: ansible_playbook_path
     register: ansible_playbook_path
 
 
-#  - name: Set dhcpd_lease_file variable for inventory creation
-#    cron:
-#      env: yes
-#      name: dhcpd_lease_file
-#      value: "/var/lib/dhcp/db/dhcpd.leases"
-
-#  - name: Set provision_os variable for inventory creation
-#    cron:
-#      env: yes
-#      name: provision_os
-#      value: "{{ name_iso }}"
-
   - name: Add tftp cron job
   - name: Add tftp cron job
     cron:
     cron:
       name: Start tftp service
       name: Start tftp service
       minute: "*"
       minute: "*"
-      job: "{{ ansible_playbook_path.stdout.split(' ')[1] }} /root/tftp.yml -e provision_os={{ name_iso }}"
+      job: "{{ ansible_playbook_path.stdout.split(' ')[1] }} /root/tftp.yml"
 
 
   - name: Add inventory cron job
   - name: Add inventory cron job
     cron:
     cron:
       name: Create inventory
       name: Create inventory
       minute: "*/5"
       minute: "*/5"
-      job: "{{ ansible_playbook_path.stdout.split(' ')[1] }} /root/inventory_creation.yml -e dhcpd_lease_file=\"/var/lib/dhcp/db/dhcpd.leases\""
+      job: "{{ ansible_playbook_path.stdout.split(' ')[1] }} /root/inventory_creation.yml"

+ 9 - 9
control_plane/roles/provision_cobbler/files/cobbler_configurations_rocky.yml

@@ -68,37 +68,37 @@
     shell: export PATH="/usr/bin/curl:$PATH"
     shell: export PATH="/usr/bin/curl:$PATH"
 
 
   - name: Run import command
   - name: Run import command
-    command: cobbler import --arch=x86_64 --path=/mnt --name="{{ name_iso }}"
+    command: cobbler import --arch=x86_64 --path=/mnt/{{ provision_os }} --name="{{ provision_os }}"
     changed_when: false
     changed_when: false
 
 
   - name: Kickstart profile - centos
   - name: Kickstart profile - centos
     copy:
     copy:
       src: "/root/centos7.ks"
       src: "/root/centos7.ks"
       dest: "/var/lib/cobbler/templates/sample.ks"
       dest: "/var/lib/cobbler/templates/sample.ks"
-      mode: 0775
+      mode: "{{ file_perm }}"
     tags: install
     tags: install
-    when: name_iso == "centos"
+    when: provision_os == "centos"
 
 
   - name: Kickstart profile - rocky
   - name: Kickstart profile - rocky
     copy:
     copy:
       src: "/root/rocky8.ks"
       src: "/root/rocky8.ks"
       dest: "/var/lib/cobbler/templates/sample.ks"
       dest: "/var/lib/cobbler/templates/sample.ks"
-      mode: 0775
+      mode: "{{ file_perm }}"
     tags: install
     tags: install
-    when: name_iso == "rocky"
+    when: provision_os == "rocky"
 
 
   - name: Pxe menu
   - name: Pxe menu
     copy:
     copy:
       src: "/root/omnia/control_plane/roles/provision_cobbler/files/menu.yml"
       src: "/root/omnia/control_plane/roles/provision_cobbler/files/menu.yml"
       dest: "/etc/cobbler/boot_loader_conf/pxedefault.template"
       dest: "/etc/cobbler/boot_loader_conf/pxedefault.template"
-      mode: 0775
+      mode: "{{ file_perm }}"
     tags: install
     tags: install
 
 
   - name: Assign default grub option
   - name: Assign default grub option
     replace:
     replace:
       path: "/var/lib/cobbler/grub_config/grub/grub.cfg"
       path: "/var/lib/cobbler/grub_config/grub/grub.cfg"
-      regexp: "^set default=\'local\'"
-      replace: "set default=\'1\'"
+      regexp: "^set default='local'"
+      replace: "set default='1'"
     tags: install
     tags: install
 
 
   - name: Assign default grub timeout
   - name: Assign default grub timeout
@@ -137,4 +137,4 @@
     cron:
     cron:
       name: Create inventory
       name: Create inventory
       minute: "*/5"
       minute: "*/5"
-      job: "{{ ansible_playbook_path.stdout.split(' ')[1] }} /root/inventory_creation.yml -e dhcpd_lease_file=\"/var/lib/dhcpd/dhcpd.leases\""
+      job: "{{ ansible_playbook_path.stdout.split(' ')[1] }} /root/inventory_creation.yml"

+ 10 - 4
control_plane/roles/provision_cobbler/files/inventory_creation.yml

@@ -1,10 +1,16 @@
 - hosts: localhost
 - hosts: localhost
   connection: local
   connection: local
-  gather_facts: false
+  gather_facts: true
   tasks:
   tasks:
-    - name: Read dhcp file
+    - name: Read dhcp file for rocky
       set_fact:
       set_fact:
-        var: "{{ lookup('file', '{{ dhcpd_lease_file }}').split()| unique | select| list }}"
+        var: "{{ lookup('file', '/var/lib/dhcpd/dhcpd.leases').split()| unique | select| list }}"
+      when: ansible_facts['distribution'] | lower == "rocky"
+
+    - name: Read dhcp file for leap
+      set_fact:
+        var: "{{ lookup('file', '/var/lib/dhcp/db/dhcpd.leases').split()| unique | select| list }}"
+      when: ansible_facts['distribution'] | lower == "leap"
     
     
     - name: Filter the ip
     - name: Filter the ip
       set_fact:
       set_fact:
@@ -35,4 +41,4 @@
 
 
     - name: New line at end of file
     - name: New line at end of file
       shell: echo "">> omnia/control_plane/roles/collect_node_info/files/provisioned_hosts.yml
       shell: echo "">> omnia/control_plane/roles/collect_node_info/files/provisioned_hosts.yml
-      changed_when: false
+      changed_when: false

+ 1 - 1
control_plane/roles/provision_cobbler/files/k8s_cobbler.yml

@@ -25,7 +25,7 @@ spec:
             type: Directory
             type: Directory
         - name: mnt-iso
         - name: mnt-iso
           hostPath:
           hostPath:
-            path: /mnt/iso/
+            path: /mnt/
             type: Directory
             type: Directory
       containers:
       containers:
         - name: cobbler
         - name: cobbler

+ 85 - 0
control_plane/roles/provision_cobbler/files/multi_cluster_provisioning.yml

@@ -0,0 +1,85 @@
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Initial cobbler setup
+  hosts: localhost
+  connection: local
+  gather_facts: false
+  tasks:
+    - name: Initialize variables
+      set_fact:
+        multi_profile: false
+        grub_option: 1
+
+    - name: Run import command
+      command: cobbler import --arch=x86_64 --path=/mnt/{{ provision_os }} --name="{{ provision_os }}"
+      changed_when: false
+
+    - name: Kickstart profile - centos
+      copy:
+        src: "/root/centos7.ks"
+        dest: "/var/lib/cobbler/templates/sample.ks"
+        mode: "{{ file_perm }}"
+      tags: install
+      when: provision_os == "centos"
+
+    - name: Kickstart profile - rocky
+      copy:
+        src: "/root/rocky8.ks"
+        dest: "/var/lib/cobbler/templates/sample.ks"
+        mode: "{{ file_perm }}"
+      tags: install
+      when: provision_os == "rocky"
+
+    - name: Kickstart profile - leap
+      copy:
+        src: "/root/leap15.xml"
+        dest: "/var/lib/cobbler/templates/sample_autoyast.xml"
+        mode: "{{ file_perm }}"
+      tags: install
+      when: provision_os == "leap"
+
+    - name: Get the cobbler profile list
+      command: cobbler profile list
+      changed_when: false
+      register: cobbler_profile_list
+      failed_when: false
+
+    - name: Check if cobbler_profile_list has more than one profile
+      set_fact:
+        multi_profile: true
+      when:
+        - cobbler_profile_list.stdout_lines| length > 1
+
+    - name: Check if the provision os is in cobbler_profile_list
+      set_fact:
+        grub_option: "{{ index + 1 }}"
+      when:
+        - provision_os in item
+        - multi_profile
+      loop: "{{ cobbler_profile_list.stdout_lines | flatten(levels=1) }}"
+      loop_control:
+        index_var: index
+
+    - name: Assign default grub option
+      replace:
+        path: "/var/lib/cobbler/grub_config/grub/grub.cfg"
+        regexp: "^set default=.*"
+        replace: "set default='{{ grub_option }}'"
+      tags: install
+
+    - name: Syncing of cobbler
+      command: cobbler sync
+      changed_when: false

+ 3 - 2
control_plane/roles/provision_cobbler/files/tftp.yml

@@ -1,4 +1,4 @@
-#  Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 #  you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
 - name: Start tftp and dhcp
 - name: Start tftp and dhcp
   hosts: localhost
   hosts: localhost
   connection: local
   connection: local
+  gather_facts: true
   tasks:
   tasks:
     - name: Configure tftp for leap
     - name: Configure tftp for leap
       block:
       block:
@@ -36,7 +37,7 @@
 
 
         - name: Start tftp.socket
         - name: Start tftp.socket
           command: systemctl start tftp.socket
           command: systemctl start tftp.socket
-      when: provision_os == "leap"
+      when: ansible_facts['distribution'] | lower == "leap"
 
 
     - name: Fetch tftp status
     - name: Fetch tftp status
       command: systemctl is-active tftp
       command: systemctl is-active tftp

+ 37 - 3
control_plane/roles/provision_cobbler/tasks/check_prerequisites.yml

@@ -1,4 +1,4 @@
-# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # you may not use this file except in compliance with the License.
@@ -20,6 +20,8 @@
     cobbler_config_status: false
     cobbler_config_status: false
     backup_map_status: false
     backup_map_status: false
     new_node_status: false
     new_node_status: false
+    provision_os_change: false
+    previous_os: None
   tags: install
   tags: install
 
 
 - name: Set centos kickstart file name
 - name: Set centos kickstart file name
@@ -111,11 +113,43 @@
   failed_when: false
   failed_when: false
   when: cobbler_container_status
   when: cobbler_container_status
 
 
+- name: Check if .provisioned_os file exists
+  stat:
+    path: "/root/omnia/control_plane/roles/provision_cobbler/files/.provisioned_os"
+  register: provisioned_os_file
+
+- name: Create .provisioned_os file
+  file:
+    path: "{{ role_path }}/files/.provisioned_os"
+    state: touch
+    mode: "{{ temp_file_perm }}"
+  when: not provisioned_os_file.stat.exists
+
+- name: Check status of .provisioned_os file
+  stat:
+    path: "{{ role_path }}/files/.provisioned_os"
+  register: provisioned_os_file
+
+- name: Get the previous os provisioned
+  set_fact:
+    previous_os: "{{ lookup('file', '{{ role_path }}/files/.provisioned_os').split() | last }}"
+  when:
+    - provisioned_os_file.stat.exists
+    - provisioned_os_file.stat.size > 0
+
 - name: Update cobbler config status
 - name: Update cobbler config status
   set_fact:
   set_fact:
     cobbler_config_status: true
     cobbler_config_status: true
   when:
   when:
     - cobbler_container_status
     - cobbler_container_status
-    - provision_os in cobbler_profile_list.stdout
+    - (provision_os in cobbler_profile_list.stdout) or (previous_os in cobbler_profile_list.stdout)
     - "'* * * * * /usr/bin/ansible-playbook /root/tftp.yml' in crontab_list.stdout"
     - "'* * * * * /usr/bin/ansible-playbook /root/tftp.yml' in crontab_list.stdout"
-    - "'*/5 * * * * /usr/bin/ansible-playbook /root/inventory_creation.yml' in crontab_list.stdout"
+    - "'*/5 * * * * /usr/bin/ansible-playbook /root/inventory_creation.yml' in crontab_list.stdout"
+
+- name: Set status for provision_os_change
+  set_fact:
+    provision_os_change: true
+  when:
+    - previous_os != None
+    - previous_os != provision_os
+    - cobbler_config_status

+ 3 - 3
control_plane/roles/provision_cobbler/tasks/cobbler_image.yml

@@ -1,4 +1,4 @@
-# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
 ---
 ---
 
 
 - name: Image creation (It may take 5-10 mins)
 - name: Image creation (It may take 5-10 mins)
-  command: "buildah bud  -f Dockerfile_rocky -t {{ cobbler_image_name }}:{{ cobbler_image_tag }} --network host ."
+  command: "buildah bud -f Dockerfile_rocky -t {{ cobbler_image_name }}:{{ cobbler_image_tag }} --network host ."
   changed_when: true
   changed_when: true
   args:
   args:
     chdir: "{{ role_path }}/files/"
     chdir: "{{ role_path }}/files/"
@@ -48,4 +48,4 @@
 - name: Deploy cobbler pod
 - name: Deploy cobbler pod
   command: "kubectl apply -f {{ role_path }}/files/k8s_cobbler.yml"
   command: "kubectl apply -f {{ role_path }}/files/k8s_cobbler.yml"
   changed_when: true
   changed_when: true
-  tags: install
+  tags: install

+ 23 - 7
control_plane/roles/provision_cobbler/tasks/configure_cobbler.yml

@@ -1,4 +1,4 @@
-# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # you may not use this file except in compliance with the License.
@@ -17,7 +17,10 @@
   command: "kubectl apply -f {{ role_path }}/files/k8s_cobbler.yml"
   command: "kubectl apply -f {{ role_path }}/files/k8s_cobbler.yml"
   changed_when: true
   changed_when: true
   tags: install
   tags: install
-  when: cobbler_container_status and not cobbler_config_status
+  when:
+    - cobbler_container_status
+    - not cobbler_config_status
+    - not provision_os_change
 
 
 - name: Wait for cobbler pod to come to ready state
 - name: Wait for cobbler pod to come to ready state
   command: kubectl wait --for=condition=ready -n {{ cobbler_namespace }} pod -l app=cobbler
   command: kubectl wait --for=condition=ready -n {{ cobbler_namespace }} pod -l app=cobbler
@@ -40,20 +43,28 @@
   when: not cobbler_config_status
   when: not cobbler_config_status
 
 
 - name: Configuring cobbler inside container (It may take 5-10 mins)
 - name: Configuring cobbler inside container (It may take 5-10 mins)
-  command: "kubectl exec --stdin --tty -n {{ cobbler_namespace }} {{ cobbler_pod_name.stdout }} -- ansible-playbook /root/cobbler_configurations_rocky.yml -e name_iso={{ provision_os }}"
+  command: "kubectl exec --stdin --tty -n {{ cobbler_namespace }} {{ cobbler_pod_name.stdout }} -- ansible-playbook /root/cobbler_configurations_rocky.yml -e provision_os={{ provision_os }} -e file_perm={{ file_perm }}"
   changed_when: true
   changed_when: true
   tags: install
   tags: install
-  when: 
+  when:
     - not cobbler_config_status
     - not cobbler_config_status
     - mgmt_os == os_supported_rocky
     - mgmt_os == os_supported_rocky
-      
+    - not provision_os_change
+
 - name: Configuring cobbler inside container (It may take 5-10 mins)
 - name: Configuring cobbler inside container (It may take 5-10 mins)
-  command: "kubectl exec --stdin --tty -n {{ cobbler_namespace }} {{ cobbler_pod_name.stdout }} -- ansible-playbook /root/cobbler_configurations_leap.yml -e name_iso={{ provision_os }} -e host_network_nic={{ host_network_nic }}"
+  command: "kubectl exec --stdin --tty -n {{ cobbler_namespace }} {{ cobbler_pod_name.stdout }} -- ansible-playbook /root/cobbler_configurations_leap.yml -e provision_os={{ provision_os }} -e host_network_nic={{ host_network_nic }}  -e file_perm={{ file_perm }}"
   changed_when: true
   changed_when: true
   tags: install
   tags: install
-  when: 
+  when:
     - not cobbler_config_status
     - not cobbler_config_status
     - mgmt_os == os_supported_leap
     - mgmt_os == os_supported_leap
+    - not provision_os_change
+
+- name: Configuring cobbler for multiple profile support
+  command: "kubectl exec --stdin --tty -n {{ cobbler_namespace }} {{ cobbler_pod_name.stdout }} -- ansible-playbook /root/multi_cluster_provisioning.yml -e provision_os={{ provision_os }} -e provision_os_change={{ provision_os_change }} -e file_perm={{ file_perm }}"
+  changed_when: true
+  tags: install
+  when: provision_os_change
 
 
 - name: Schedule task
 - name: Schedule task
   cron:
   cron:
@@ -68,6 +79,10 @@
   changed_when: true
   changed_when: true
   when: cobbler_config_status
   when: cobbler_config_status
 
 
+- name: Store the provisioned os in .provisioned_os file
+  shell: echo "{{ provision_os }}">> {{ role_path }}/files/.provisioned_os
+  changed_when: false
+
 - name: Remove the files
 - name: Remove the files
   file:
   file:
     path: "{{ item }}"
     path: "{{ item }}"
@@ -78,3 +93,4 @@
     - "{{ role_path }}/files/settings"
     - "{{ role_path }}/files/settings"
     - "{{ role_path }}/files/{{ cobbler_kickstart_file }}"
     - "{{ role_path }}/files/{{ cobbler_kickstart_file }}"
     - "{{ role_path }}/files/temp_host_mapping_file.csv.bak"
     - "{{ role_path }}/files/temp_host_mapping_file.csv.bak"
+    - "/mnt/tmp"

+ 48 - 41
control_plane/roles/provision_cobbler/tasks/main.yml

@@ -1,4 +1,4 @@
-#  Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 #  you may not use this file except in compliance with the License.
@@ -18,52 +18,59 @@
 - name: Check cobbler status on machine
 - name: Check cobbler status on machine
   include_tasks: check_prerequisites.yml
   include_tasks: check_prerequisites.yml
 
 
-- name: Mount iso image
-  import_tasks: mount_iso.yml
-  when: not cobbler_image_status
+- name: Multi profile creation
+  import_tasks: multi_profile_creation.yml
+  when: provision_os_change
 
 
-- name: Modify firewall settings for Cobbler
-  import_tasks: firewall_settings.yml
-  when: not cobbler_container_status
-
-- name: Include common variables
-  include_vars: ../../control_plane_common/vars/main.yml
-  when: not cobbler_container_status
+- name: Cobbler pod creation
+  block:
+    - name: Mount iso image
+      import_tasks: mount_iso.yml
+      when: not cobbler_image_status
 
 
-- name: Internet validation
-  include_tasks: ../../control_plane_common/tasks/internet_validation.yml
-  when: not cobbler_container_status
+    - name: Modify firewall settings for Cobbler
+      import_tasks: firewall_settings.yml
+      when: not cobbler_container_status
 
 
-- name: Dhcp Configuration
-  import_tasks: dhcp_configure.yml
-  when: (not cobbler_image_status) or ( backup_map_status )
+    - name: Include common variables
+      include_vars: ../../control_plane_common/vars/main.yml
+      when: not cobbler_container_status
 
 
-- name: Provision password validation
-  import_tasks: provision_password.yml
-  when: not cobbler_image_status
+    - name: Internet validation
+      include_tasks: ../../control_plane_common/tasks/internet_validation.yml
+      when: not cobbler_container_status
 
 
-- name: Mapping file validation
-  import_tasks: mapping_file.yml
-  when: (not cobbler_image_status) and (host_mapping_file) or ( backup_map_status)
+    - name: Dhcp Configuration
+      import_tasks: dhcp_configure.yml
+      when: (not cobbler_image_status) or ( backup_map_status )
 
 
-- name: Cobbler image creation
-  import_tasks: cobbler_image.yml
-  when: not cobbler_container_status
+    - name: Provision password validation
+      import_tasks: provision_password.yml
+      when: not cobbler_image_status
 
 
-- name: Cobbler configuration
-  import_tasks: configure_cobbler.yml
-  when: not cobbler_config_status
+    - name: Mapping file validation
+      import_tasks: mapping_file.yml
+      when: (not cobbler_image_status) and (host_mapping_file) or ( backup_map_status)
 
 
-- name: Cobbler container status message
-  block:
-    - name: cobbler container running
-      debug:
-        msg: "{{ message_skipped }}"
-        verbosity: 2
-      when: cobbler_container_status
-    - name: cobbler container not running
-      debug:
-        msg: "{{ message_installed }}"
-        verbosity: 2
+    - name: Cobbler image creation
+      import_tasks: cobbler_image.yml
       when: not cobbler_container_status
       when: not cobbler_container_status
-  tags: install
+
+    - name: Cobbler configuration
+      import_tasks: configure_cobbler.yml
+      when: not cobbler_config_status
+
+    - name: Cobbler container status message
+      block:
+        - name: cobbler container running
+          debug:
+            msg: "{{ message_skipped }}"
+            verbosity: 2
+          when: cobbler_container_status
+        - name: cobbler container not running
+          debug:
+            msg: "{{ message_installed }}"
+            verbosity: 2
+          when: not cobbler_container_status
+      tags: install
+  when: not provision_os_change

+ 2 - 2
control_plane/roles/provision_cobbler/tasks/mapping_file.yml

@@ -1,4 +1,4 @@
-# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # you may not use this file except in compliance with the License.
@@ -68,7 +68,7 @@
   copy:
   copy:
     src: "{{ temp_host_mapping_file }}"
     src: "{{ temp_host_mapping_file }}"
     dest: "{{ role_path }}/files/backup_host_mapping_file.csv"
     dest: "{{ role_path }}/files/backup_host_mapping_file.csv"
-    mode: 0644
+    mode: "{{ temp_file_perm }}"
 
 
 - name: Get cobbler pod name
 - name: Get cobbler pod name
   command: 'kubectl get pod -n {{ cobbler_namespace }} -l app=cobbler -o jsonpath="{.items[0].metadata.name}"'
   command: 'kubectl get pod -n {{ cobbler_namespace }} -l app=cobbler -o jsonpath="{.items[0].metadata.name}"'

+ 6 - 6
control_plane/roles/provision_cobbler/tasks/mount_iso.yml

@@ -1,4 +1,4 @@
-# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # you may not use this file except in compliance with the License.
@@ -20,19 +20,19 @@
 
 
 - name: Check iso directory status
 - name: Check iso directory status
   stat:
   stat:
-    path: "/mnt/{{ iso_dir_name }}"
+    path: "/mnt/{{ provision_os }}"
   register: check_iso_dir
   register: check_iso_dir
 
 
 - name: Create iso directory
 - name: Create iso directory
   file:
   file:
-    path: "/mnt/{{ iso_dir_name }}"
+    path: "/mnt/{{ provision_os }}"
     state: directory
     state: directory
-    mode: 0644
+    mode: "{{ temp_file_perm }}"
   tags: install
   tags: install
   when: not check_iso_dir.stat.exists
   when: not check_iso_dir.stat.exists
 
 
 - name: Check mountpoint
 - name: Check mountpoint
-  command: mountpoint /mnt/{{ iso_dir_name }}
+  command: mountpoint /mnt/
   changed_when: false
   changed_when: false
   register: result
   register: result
   failed_when: false
   failed_when: false
@@ -44,7 +44,7 @@
   tags: install
   tags: install
 
 
 - name: Mount the iso file
 - name: Mount the iso file
-  command: mount -o loop {{ iso_file_path }} /mnt/{{ iso_dir_name }}
+  command: mount -o loop {{ iso_file_path }} /mnt/{{ provision_os }}
   changed_when: false
   changed_when: false
   args:
   args:
     warn: no
     warn: no

+ 67 - 0
control_plane/roles/provision_cobbler/tasks/multi_profile_creation.yml

@@ -0,0 +1,67 @@
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Validate cobbler configuration status
+  set_fact:
+    cobbler_config_status: false
+    profile_exists: false
+  when: cobbler_config_status
+
+- name: Check prifile for {{ provision_os }} exists
+  set_fact:
+    profile_exists: true
+  when: provision_os in cobbler_profile_list.stdout
+
+- name: Create a temp iso directory to mount the iso files
+  file:
+    path: "/mnt/tmp"
+    state: directory
+    mode: "{{ temp_file_perm }}"
+  when: not profile_exists
+
+- name: Check {{ provision_os }} folder status
+  stat:
+    path: "/mnt/{{ provision_os }}"
+  register: file_status
+
+- name: Create directory for {{ provision_os }} to copy the iso files
+  file:
+    path: "/mnt/{{ provision_os }}"
+    state: directory
+    mode: "{{ temp_file_perm }}"
+  when:
+    - not file_status.stat.exists
+    - not profile_exists
+
+- name: Mount the iso file
+  command: mount -o loop {{ iso_file_path }} /mnt/tmp
+  args:
+    warn: no
+  when: not profile_exists
+
+- name: Copy the iso file inside /mnt
+  command: cp -rf /mnt/tmp/. /mnt/{{ provision_os }}
+  changed_when: false
+  when: not profile_exists
+
+- name: Unmount /mnt/tmp for further mounts
+  command: umount /mnt/tmp
+  when: not profile_exists
+
+- name: Provision password validation
+  import_tasks: provision_password.yml
+
+- name: Cobbler configuration
+  import_tasks: configure_cobbler.yml

+ 5 - 1
control_plane/roles/provision_cobbler/tasks/provision_password.yml

@@ -1,4 +1,4 @@
-#  Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 #  you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
     path: "{{ role_path }}/files/.users.digest"
     path: "{{ role_path }}/files/.users.digest"
     state: absent
     state: absent
   tags: install
   tags: install
+  when: not provision_os_change
 
 
 - name: Create a new user
 - name: Create a new user
   file:
   file:
@@ -25,18 +26,21 @@
     state: touch
     state: touch
     mode: "{{ user_mode }}"
     mode: "{{ user_mode }}"
   tags: install
   tags: install
+  when: not provision_os_change
 
 
 - name: Cobbler UI password
 - name: Cobbler UI password
   set_fact:
   set_fact:
         encrypt_password: "{{ cobbler_password | hash('sha3_256') }}"
         encrypt_password: "{{ cobbler_password | hash('sha3_256') }}"
   no_log: true
   no_log: true
   tags: install
   tags: install
+  when: not provision_os_change
 
 
 - name: Copy cobbler password to cobbler config file
 - name: Copy cobbler password to cobbler config file
   shell: printf "%s:%s:%s\n" "{{ username }}" "Cobbler" "{{ encrypt_password }}" > "{{ role_path }}/files/.users.digest"
   shell: printf "%s:%s:%s\n" "{{ username }}" "Cobbler" "{{ encrypt_password }}" > "{{ role_path }}/files/.users.digest"
   changed_when: false
   changed_when: false
   no_log: true
   no_log: true
   tags: install
   tags: install
+  when: not provision_os_change
 
 
 - name: Kickstart configuration - centos
 - name: Kickstart configuration - centos
   block:
   block:

+ 6 - 3
control_plane/roles/provision_cobbler/vars/main.yml

@@ -1,4 +1,4 @@
-# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # you may not use this file except in compliance with the License.
@@ -66,5 +66,8 @@ cobbler_image_tag: latest
 message_skipped: "Installation Skipped: Cobbler instance is already running in your system"
 message_skipped: "Installation Skipped: Cobbler instance is already running in your system"
 message_installed: "Installation Successful"
 message_installed: "Installation Successful"
 
 
-# Usage: mount_iso.yml
-iso_dir_name: iso
+# Usage: multi_cluster_provisioning.yml
+file_perm: '0775'
+
+# Usage: multi_cluster_provisioning.yml
+temp_file_perm: '0644'

+ 13 - 0
docs/FAQ.md

@@ -192,4 +192,17 @@ No. During Cobbler based deployment, only one OS is supported at a time. If the
 ## Why do Firmware Updates fail for some components with Omnia 1.1.1?
 ## Why do Firmware Updates fail for some components with Omnia 1.1.1?
 Due to the latest `catalog.xml` file, Firmware updates fail for some components on server models R640 and R740. Omnia execution doesn't get interrupted but an error gets logged. For now, please download those individual updates manually.
 Due to the latest `catalog.xml` file, Firmware updates fail for some components on server models R640 and R740. Omnia execution doesn't get interrupted but an error gets logged. For now, please download those individual updates manually.
 
 
+## Why does the Task [network_ib : Authentication failure response] fail with the message 'Status code was -1 and not [302]: Request failed: <urlopen error [Errno 111] Connection refused>' on Infiniband Switches when running `infiniband.yml`?
+To configure a new Infiniband Switch, it is required that HTTP and JSON gateway be enabled. To verify that they are enabled, run:
+
+`show web` (To check if HTTP is enabled)
+
+`show json-gw` (To check if JSON Gateway is enabled)
+
+To correct the issue, run:
+
+`web http enable` (To enable the HTTP gateway)
+
+`json-gw enable` (To enable the JSON gateway)
+
 
 

File diff suppressed because it is too large
+ 8 - 1
docs/INSTALL_OMNIA_CONTROL_PLANE.md


+ 9 - 9
docs/README.md

@@ -51,7 +51,7 @@ Requirements  |   Version
 OS pre-installed on the management station  |  CentOS 8.4/ Rocky 8.5/ Leap 15.3
 OS pre-installed on the management station  |  CentOS 8.4/ Rocky 8.5/ Leap 15.3
 OS deployed by Omnia on bare-metal Dell EMC PowerEdge Servers | Rocky 8.5 Minimal Edition/ Leap 15.3
 OS deployed by Omnia on bare-metal Dell EMC PowerEdge Servers | Rocky 8.5 Minimal Edition/ Leap 15.3
 Cobbler  |  3.2.2
 Cobbler  |  3.2.2
-Ansible AWX  |  19.1.0
+Ansible AWX  |  19.4.0
 Slurm Workload Manager  |  20.11.2
 Slurm Workload Manager  |  20.11.2
 Kubernetes on the management station  |  1.21.0
 Kubernetes on the management station  |  1.21.0
 Kubernetes on the manager and compute nodes	|	1.16.7 or 1.19.3
 Kubernetes on the manager and compute nodes	|	1.16.7 or 1.19.3
@@ -59,7 +59,7 @@ Kubeflow  |  1
 Prometheus  |  2.23.0
 Prometheus  |  2.23.0
 Ansible  |  2.9.21
 Ansible  |  2.9.21
 Python  |  3.6.15
 Python  |  3.6.15
-CRI-O  |  1.17.3
+CRI-O  |  1.21.0
 
 
 ## Hardware managed by Omnia
 ## Hardware managed by Omnia
 The following table lists the supported devices managed by Omnia. Other devices than those listed in the following table will be discovered by Omnia, but features offered by Omnia will not be applicable.
 The following table lists the supported devices managed by Omnia. Other devices than those listed in the following table will be discovered by Omnia, but features offered by Omnia will not be applicable.
@@ -89,12 +89,12 @@ FreeIPA	|	GNU General Public License v3	|	4.6.8	|	Authentication system used in
 OpenSM	|	GNU General Public License 2	|	3.3.24	|	-
 OpenSM	|	GNU General Public License 2	|	3.3.24	|	-
 NVIDIA container runtime	|	Apache-2.0	|	3.4.2	|	Nvidia container runtime library
 NVIDIA container runtime	|	Apache-2.0	|	3.4.2	|	Nvidia container runtime library
 Python PIP	|	MIT License	|	21.1.2	|	Python Package
 Python PIP	|	MIT License	|	21.1.2	|	Python Package
-Python3	|	-	|	3.6.8	|	-
-Kubelet	|	Apache-2.0	|	1.16.7,1.19,1.21	|	Provides external, versioned ComponentConfig API types for configuring the kubelet
-Kubeadm	|	Apache-2.0	|	1.16.7,1.19,1.21	|	"fast paths" for creating Kubernetes clusters
-Kubectl	|	Apache-2.0	|	1.16.7,1.19,1.21	|	Command line tool for Kubernetes
+Python3	|	-	|	3.6.8 (3.6.15 if LeapOS is being used)	|	-
+Kubelet	|	Apache-2.0	|	1.16.7,1.19, 1.21 (LeapOS only supports 1.21) 	|	Provides external, versioned ComponentConfig API types for configuring the kubelet
+Kubeadm	|	Apache-2.0	|	1.16.7,1.19, 1.21 (LeapOS only supports 1.21)	|	"fast paths" for creating Kubernetes clusters
+Kubectl	|	Apache-2.0	|	1.16.7,1.19, 1.21 (LeapOS only supports 1.21)	|	Command line tool for Kubernetes
 JupyterHub	|	Modified BSD License	|	1.1.0	|	Multi-user hub
 JupyterHub	|	Modified BSD License	|	1.1.0	|	Multi-user hub
-kubernetes Controllers	|	Apache-2.0	|	1.16.7,1.19,1.21	|	Orchestration tool	
+kubernetes Controllers	|	Apache-2.0	|	1.16.7,1.19 (1.21 if LeapOS is being used)	|	Orchestration tool	
 Kfctl	|	Apache-2.0	|	1.0.2	|	CLI for deploying and managing Kubeflow
 Kfctl	|	Apache-2.0	|	1.0.2	|	CLI for deploying and managing Kubeflow
 Kubeflow	|	Apache-2.0	|	1	|	Cloud Native platform for machine learning
 Kubeflow	|	Apache-2.0	|	1	|	Cloud Native platform for machine learning
 Helm	|	Apache-2.0	|	3.5.0	|	Kubernetes Package Manager
 Helm	|	Apache-2.0	|	3.5.0	|	Kubernetes Package Manager
@@ -104,8 +104,8 @@ Horovod	|	Apache-2.0	|	0.21.1	|	Distributed deep learning training framework for
 MPI	|	Copyright (c) 2018-2019 Triad National Security,LLC. All rights reserved.	|	0.3.0	|	HPC library
 MPI	|	Copyright (c) 2018-2019 Triad National Security,LLC. All rights reserved.	|	0.3.0	|	HPC library
 CoreDNS	|	Apache-2.0	|	1.6.2	|	DNS server that chains plugins
 CoreDNS	|	Apache-2.0	|	1.6.2	|	DNS server that chains plugins
 CNI	|	Apache-2.0	|	0.3.1	|	Networking for Linux containers
 CNI	|	Apache-2.0	|	0.3.1	|	Networking for Linux containers
-AWX	|	Apache-2.0	|	19.1.0	|	Web-based User Interface
-AWX.AWX	|	Apache-2.0	|	19.1.0	|	Galaxy collection to perform awx configuration
+AWX	|	Apache-2.0	|	19.4.0	|	Web-based User Interface
+AWX.AWX	|	Apache-2.0	|	19.4.0	|	Galaxy collection to perform awx configuration
 AWXkit	|	Apache-2.0	|	to be updated	|	To perform configuration through CLI commands
 AWXkit	|	Apache-2.0	|	to be updated	|	To perform configuration through CLI commands
 Cri-o	|	Apache-2.0	|	1.21	|	Container Service
 Cri-o	|	Apache-2.0	|	1.21	|	Container Service
 Buildah	|	Apache-2.0	|	1.21.4	|	Tool to build and run container
 Buildah	|	Apache-2.0	|	1.21.4	|	Tool to build and run container

+ 23 - 0
docs/Security/Enable_Security_ManagementStation.md

@@ -0,0 +1,23 @@
+# Enabling Security on the Management Station and Login Node
+
+## Enabling FreeIPA on the Management Station:
+
+Set the parameter 'enable_security_support' to true in `base_vars.yml`
+
+## Prerequisites Before Enabling FreeIPA:
+* Enter the relevant values in `security_vars.yml`:
+
+| Parameter Name | Default Value | Additional Information                                                                                           |
+|----------------|---------------|------------------------------------------------------------------------------------------------------------------|
+| domain_name    | omnia.test    | The domain name should not contain an underscore ( _ )                                                           |
+| realm_name     | omnia.test    | The realm name should follow the following rules per https://www.freeipa.org/page/Deployment_Recommendations <br> * The realm name must not conflict with any other existing Kerberos realm name (e.g. name used by Active Directory). <br> * The realm name should be upper-case (EXAMPLE.COM) version of primary DNS domain name (example.com).  |
+
+* Enter the relevant values in `login_vars.yml`:
+
+| Parameter Name             | Default Value | Additional Information                                                                           |
+|----------------------------|---------------|--------------------------------------------------------------------------------------------------|
+| directory_manager_password |               | Password of the Directory Manager with full access to the directory for system management tasks. |
+| ipa_admin_password         |               | "admin" user password for the IPA server                                                         |
+
+
+

BIN
docs/TelemetryAndMonitoring/Images/DashBoardIcon.PNG


BIN
docs/TelemetryAndMonitoring/Images/ExploreIcon.PNG


BIN
docs/TelemetryAndMonitoring/Images/Prometheus_Dashboard.jpg


BIN
docs/TelemetryAndMonitoring/Images/Prometheus_DataSource.jpg


+ 51 - 0
docs/TelemetryAndMonitoring/Install_Telemetry.md

@@ -0,0 +1,51 @@
+# Setting Up Telemetry
+
+Using Grafana, users can poll multiple devices and create graphs/visualizations of key statistics.
+
+## Prerequisites
+
+1. To set up Grafana, ensure that `control_plane/input_params/login_vars.yml` is updated with the Grafana Username and Password.
+2. All parameters in `telemetry/input_params/login_vars.yml` need to be filled in:
+
+| Parameter Name        | Default Value | Information |
+|-----------------------|---------------|-------------|
+| timescaledb_user      | postgres      |  Username used for connecting to timescale db. Minimum Length: 2 characters.          |
+| timescaledb_password  | postgres      |  Password used for connecting to timescale db. Minimum Length: 2 characters.           |
+| mysqldb_user          | mysql         |  Username used for connecting to mysql db. Minimum Length: 2 characters.         |
+| mysqldb_password      | mysql         |  Password used for connecting to mysql db. Minimum Length: 2 characters.            |
+| mysqldb_root_password | mysql         |  Password used for connecting to mysql db for root user. Minimum Length: 2 characters.         |
+
+3. All parameters in `telemetry/input_params/base_vars.yml` need to be filled in:
+
+| Parameter Name          | Default Value     | Information |
+|-------------------------|-------------------|-------------|
+| mount_location          | /mnt/omnia        | Sets the location all telemetry related files will be stored and both timescale and mysql databases will be mounted.            |
+| idrac_telemetry_support | true              | This variable is used to enable iDRAC telemetry support and visualizations. Accepted Values: true/false            |
+| slurm_telemetry_support | true              | This variable is used to enable slurm telemetry support and visualizations. Slurm Telemetry support can only be activated when idrac_telemetry_support is set to true. Accepted Values: True/False.        |
+| timescaledb_name        | telemetry_metrics | Postgres DB with timescale extension is used for storing iDRAC and slurm telemetry metrics.            |
+| mysqldb_name            | mysql             | MySQL DB is used to store IPs and credentials of iDRACs having datacenter license           |
+
+4. Find the IP of the Grafana UI using:
+ 
+`kubectl get svc -n grafana`
+
+## Logging into Grafana
+
+Use any one of the following browsers to access the Grafana UI (https://< Grafana UI IP >:5000):
+* Chrome/Chromium
+* Firefox
+* Safari
+* Microsoft Edge
+
+>> __Note:__ Always enable JavaScript in your browser. Running Grafana without JavaScript enabled in the browser is not supported.
+
+## Prerequisites to Enabling Slurm Telemetry
+
+* Slurm Telemetry cannot be executed without iDRAC support
+* Omnia control plane should be executed and node_inventory should be created in awx.
+* The slurm manager and compute nodes are fetched at run time from node_inventory.
+* Slurm should be installed on the nodes; otherwise, there is no point in executing slurm telemetry.
+
+
+
+

+ 105 - 0
docs/TelemetryAndMonitoring/MONITOR_CLUSTERS.md

@@ -0,0 +1,105 @@
+# Monitor Kubernetes and Slurm
+Omnia provides playbooks to configure additional software components for Kubernetes such as JupyterHub and Kubeflow. For workload management (submitting, controlling, and managing jobs) of HPC, AI, and Data Analytics clusters, you can access Kubernetes and Slurm dashboards and other supported applications. 
+
+## Before accessing the dashboards
+To access any of the dashboards, ensure that a compatible web browser is installed. If you are connecting remotely to your Linux server by using a MobaXterm version later than 8 or other X11 Clients through *ssh*, follow the steps below to launch the Firefox Browser:  
+* On the management station:
+	1. Connect using *ssh*. Run `ssh <user>@<IP-address>`, where *IP-address* is the private IP of the management station.
+	2. `dnf install mesa-libGL-devel -y`
+	3. `dnf install firefox -y`
+	4. `dnf install xorg-x11-xauth`
+	5. `export DISPLAY=:10.0`
+	6. `logout and login back`
+	7. To launch Firefox from terminal, run `firefox&`.  
+	
+* On the manager node:
+	1. Connect using *ssh*. Run `ssh <user>@<IP-address>`, where *IP-address* is the private IP of the manager node.
+	2. `yum install firefox -y`
+	3. `yum install xorg-x11-xauth`
+	4. `export DISPLAY=:10.0`
+	5. `logout and login back`
+	6. To launch Firefox from terminal, run `firefox&`
+
+**NOTE**: When the PuTTY or MobaXterm session ends, you must run **export DISPLAY=:10.0** command each time, else Firefox cannot be launched again.  
+
+## Access FreeIPA Dashboard  
+The FreeIPA Dashboard can be accessed from the management station, manager, and login nodes. To access the dashboard:
+1.	Install the Firefox Browser.
+2.	Open the Firefox Browser and enter the url: `https://<hostname>`. For example, enter `https://manager.example.com`.
+3.	Enter the username and password. If the admin or user has obtained a Kerberos ticket, then the credentials need not be provided.  
+
+**Note**: To obtain a Kerberos ticket, perform the following actions:
+1. Enter `kinit <username>`
+2. When prompted, enter the password.
+
+An administrator can create users on the login node using FreeIPA. The users will be prompted to change the passwords upon first login.
+
+## Access Kubernetes Dashboard
+1. To verify if the **Kubernetes-dashboard** service is in the Running state, run `kubectl get pods --namespace kubernetes-dashboard`.
+2. To start the Kubernetes dashboard, run `kubectl proxy`.
+3. To retrieve the encrypted token, run `kubectl get secret -n kubernetes-dashboard $(kubectl get serviceaccount admin-user -n kubernetes-dashboard -o jsonpath="{.secrets[0].name}") -o jsonpath="{.data.token}" | base64 --decode`.
+4. Copy the encrypted token value.
+5. On a web browser on the management station (for control_plane.yml) or manager node (for omnia.yml) enter http://localhost:8001/api/v1/namespaces/kubernetes-dashboard/services/https:kubernetes-dashboard:/proxy/.
+6. Select the authentication method as __Token__.
+7. On the Kubernetes Dashboard, paste the copied encrypted token and click **Sign in** to access the Kubernetes Dashboard.
+
+## Access Kubeflow Dashboard
+1. Before accessing the Kubeflow Dashboard, run `kubectl -n kubeflow get applications -o yaml profiles`. Wait till **profiles-deployment** enters the Ready state.
+2. To retrieve the **External IP** or **CLUSTER IP**, run `kubectl get services istio-ingressgateway --namespace istio-system`.
+3. On a web browser installed on the manager node, enter the **External IP** or **Cluster IP** to open the Kubeflow Central Dashboard.  
+
+For more information about the Kubeflow Central Dashboard, see https://www.kubeflow.org/docs/components/central-dash/overview/.
+
+## Access JupyterHub Dashboard
+
+1. To verify if the JupyterHub services are running, run `kubectl get pods --namespace jupyterhub`.
+2. Ensure that the pod names starting with __hub__ and __proxy__ are in the **Running** state.
+3. To retrieve the **External IP** or **CLUSTER IP**, run `kubectl get services proxy-public --namespace jupyterhub`.
+4. On a web browser installed on the manager node, enter the **External IP** or **Cluster IP** to open the JupyterHub Dashboard.
+5. JupyterHub is running with a default dummy authenticator. Enter any username and password combination to access the dashboard.
+
+For more information about configuring username and password, and to access the JupyterHub Dashboard, see https://zero-to-jupyterhub.readthedocs.io/en/stable/jupyterhub/customization.html.
+
+## Access Prometheus UI
+
+Prometheus is installed:
+  * As a Kubernetes role (**A**), when both Slurm and Kubernetes are installed.
+  * On the host when only Slurm is installed (**B**).
+
+**A**. When Prometheus is installed as a Kubernetes role.  
+* Access Prometheus with local host:  
+    1. Run the following commands:  
+       `export POD_NAME=$(kubectl get pods --namespace default -l "app=prometheus,component=server" -o jsonpath="{.items[0].metadata.name}")`  
+       `echo $POD_NAME`  
+       `kubectl --namespace default port-forward $POD_NAME 9090`  
+    2. To launch the Prometheus UI, in the web browser, enter `http://localhost:9090`.
+  
+* Access Prometheus with a private IP address:
+    1. Run `kubectl get services --all-namespaces`.
+    2. From the list of services, find  the **prometheus-xxxx-server** service under the **Name** column, and copy the **EXTERNAL-IP** address.  
+   For example, in the below list of services, `192.168.2.150` is the external IP address for the service `prometheus-1619158141-server`.
+   
+		NAMESPACE	|	NAME	|	TYPE	|	CLUSTER-IP	|	EXTERNAL-IP	|	PORT(S)	|	AGE  
+		---------	|	----	|	----	|	----------	|	-----------	|	-------	|	----  
+		default	|	kubernetes	|	ClusterIP	|	10.96.0.1	|	none	|	443/TCP	|	107m  
+		default	|	**prometheus-1619158141-server**	|	LoadBalancer	|	10.97.40.140	|	**192.168.2.150**	|	80:31687/TCP	|	106m  
+    3. To open Firefox, run `firefox&`.
+    4. Enter the copied External IP address to access Prometheus. For example, enter `192.168.2.150` to access Prometheus UI.
+
+**B**. When Prometheus is installed on the host.
+1. Navigate to Prometheus folder. The default path is `/var/lib/prometheus-2.23.0.linux-amd64/`.
+2. Start the web server: `./prometheus`.  
+3. To launch the Prometheus UI, in the web browser, enter `http://localhost:9090`. 
+
+__Note:__ 
+* If Prometheus is installed through Slurm without installing Kubernetes, then it will be removed when Kubernetes is installed because Prometheus would be running as a pod. 
+* Only a single instance of Prometheus is installed when both Kubernetes and Slurm are installed.
+
+## Accessing Prometheus data via Grafana UI (On the Management Station)
+
+* Once `control_plane.yml` is run, Prometheus is added to Grafana as a datasource (hpc-prometheus). This allows Grafana to display statistics from the Compute Nodes that have been polled using Prometheus.
+
+* Select the dashboard (![Dashboard Icon](Images/DashBoardIcon.PNG)) tab to view the list of Prometheus based dashboards. Some default dashboards include CoreDNS, Prometheus Overview, Kubernetes Networking etc.
+
+>> __Note:__ Both the control plane and HPC clusters can be monitored on these dashboards by toggling the datasource at the top of each dashboard. 
+

+ 67 - 0
docs/TelemetryAndMonitoring/Monitor_Control_Plane.md

@@ -0,0 +1,67 @@
+# Monitoring The Management Station
+
+To monitor the Management Station, Omnia uses the Grafana UI with a Loki integration (This can be set up using the steps provided [here](Install_Telemetry.md)).  
+
+
+## Accessing Loki via Grafana
+
+[Loki](https://grafana.com/docs/loki/latest/fundamentals/overview/) is a datastore used to efficiently hold log data for security purposes. Using the `promtail` agent, logs are collated and streamed via an HTTP API.
+
+>> __Note:__ When `control_plane.yml` is run, Loki is automatically set up as a data source on the Grafana UI.
+
+
+
+### Querying Loki 
+
+Loki uses basic regex based syntax to filter for specific jobs, dates or timestamps.
+
+* Select the Explore ![Explore Icon](Images/ExploreIcon.PNG) tab to select control-plane-loki from the drop down.
+* Using [LogQL queries](https://grafana.com/docs/loki/latest/logql/log_queries/), all logs in `/var/log` can be accessed using filters (e.g. `{job="Omnia"}`).
+
+## Viewing Logs on the Dashboard
+
+All log files can be viewed via the Dashboard tab (![Dashboard Icon](Images/DashBoardIcon.PNG)). The Default Dashboard displays `omnia.log` and `syslog`. Custom dashboards can be created per user requirements.
+
+## Accessing Prometheus data via Grafana
+
+* Once `control_plane.yml` is run, Prometheus is added to Grafana as a datasource. This allows Grafana to display statistics from the Control Plane that have been polled using Prometheus.
+
+![Prometheus DataSource](Images/Prometheus_DataSource.jpg)
+
+* Select the dashboard (![Dashboard Icon](Images/DashBoardIcon.PNG)) tab to view the list of Prometheus based dashboards. Some default dashboards include CoreDNS, Prometheus Overview, Kubernetes Networking etc.
+
+>> __Note:__ Both the control plane and HPC clusters can be monitored on these dashboards by toggling the datasource at the top of each dashboard:
+
+| Data Source | Description | Source |
+|-------------|-------------|--------|
+|  hpc-prometheus-headnodeIP            | Manages the Kubernetes and Slurm Cluster on the Manager and Compute nodes.            |  This datasource is set up when `omnia.yml` is run.      |
+| control-plane-prometheus            | Monitors the Single Node cluster running on the Management Station            | This datasource is set up when `control_plane.yml` is run.        |
+
+
+![Prometheus Dashboard](Images/Prometheus_Dashboard.jpg)
+
+
+
+
+| Type        | Subtype           | Dashboard Name                    | Available DataSources                               |
+|-------------|-------------------|-----------------------------------|-----------------------------------------------------|
+|             |                   | CoreDNS                           | control-plane-prometheus, hpc-prometheus-headnodeIP |
+| Kubernetes  |                   | API Types                         | control-plane-prometheus, hpc-prometheus-headnodeIP |
+| Kubernetes  | Compute Resources | Cluster                           | control-plane-prometheus, hpc-prometheus-headnodeIP |
+| Kubernetes  | Compute Resources | Namespace (Pods)                  | control-plane-prometheus, hpc-prometheus-headnodeIP |
+| Kubernetes  | Compute Resources | Node (Pods)                       | control-plane-prometheus, hpc-prometheus-headnodeIP |
+| Kubernetes  | Compute Resources | Pod                               | control-plane-prometheus, hpc-prometheus-headnodeIP |
+| Kubernetes  | Compute Resources | Workload                          | control-plane-prometheus, hpc-prometheus-headnodeIP |
+| Kubernetes  |                   | Kubelet                           | control-plane-prometheus, hpc-prometheus-headnodeIP |
+| Kubernetes  | Networking        | Cluster                           | control-plane-prometheus, hpc-prometheus-headnodeIP |
+| Kubernetes  | Networking        | Namespace (Pods)                  | control-plane-prometheus, hpc-prometheus-headnodeIP |
+| Kubernetes  | Networking        | Namespace (Workload)              | control-plane-prometheus, hpc-prometheus-headnodeIP |
+| Kubernetes  | Networking        | Pod                               | control-plane-prometheus, hpc-prometheus-headnodeIP |
+| Kubernetes  | Networking        | Workload                          | control-plane-prometheus, hpc-prometheus-headnodeIP |
+| Kubernetes  |                   | Scheduler                         | control-plane-prometheus, hpc-prometheus-headnodeIP |
+| Kubernetes  |                   | Stateful Sets                     | control-plane-prometheus, hpc-prometheus-headnodeIP |
+|             |                   | Prometheus Overview               | control-plane-prometheus, hpc-prometheus-headnodeIP |
+| Slurm       |                   | CPUs/GPUs, Jobs, Nodes, Scheduler | hpc-prometheus-headnodeIP                           |
+| Slurm       |                   | Node Exporter Server Metrics      | hpc-prometheus-headnodeIP                           |
+
+

+ 14 - 0
docs/control_plane/device_templates/CONFIGURE_INFINIBAND_SWITCHES.md

@@ -7,6 +7,20 @@ Omnia uses the server-based Subnet Manager (SM). SM runs as a Kubernetes pod on
 
 
 ## Setting up a new or factory reset switch
 ## Setting up a new or factory reset switch
 
 
+Before running `infiniband.yml`, ensure that HTTP and JSON Gateway are enabled on your switch. This can be verified by running:
+
+`show web` (To check if HTTP is enabled)
+
+`show json-gw` (To check if JSON Gateway is enabled)
+
+In case either service has been disabled, run:
+
+`web http enable` (To enable the HTTP gateway)
+
+`json-gw enable` (To enable the JSON gateway)
+
+
+
 When connecting to a new or factory reset switch, the configuration wizard requests to execute an initial configuration:
 When connecting to a new or factory reset switch, the configuration wizard requests to execute an initial configuration:
 * **(Recommended)** If the user enters 'no', they still have to provide the admin and monitor passwords. 
 * **(Recommended)** If the user enters 'no', they still have to provide the admin and monitor passwords. 
 * If the user enters 'yes', they will also be prompted to enter the hostname for the switch, DHCP details, IPv6 details, etc.
 * If the user enters 'yes', they will also be prompted to enter the hostname for the switch, DHCP details, IPv6 details, etc.