
Merge branch 'devel' into all-contributors/add-abhishek-sa1

Sujit Jadhav · 3 years ago
parent commit b6684be847
59 changed files with 1,579 additions and 424 deletions
  1. .all-contributorsrc (+6, -3)
  2. .gitattributes (+1, -0)
  3. .github/workflows/ansible-lint.yml (+7, -0)
  4. README.md (+18, -4)
  5. control_plane/collect_node_info.yml (+7, -1)
  6. control_plane/input_params/base_vars.yml (+2, -0)
  7. control_plane/roles/collect_device_info/files/create_inventory.yml (+2, -3)
  8. control_plane/roles/collect_node_info/files/add_host.yml (+41, -26)
  9. control_plane/roles/collect_node_info/files/create_inventory.yml (+89, -32)
  10. control_plane/roles/collect_node_info/tasks/main.yml (+17, -14)
  11. control_plane/roles/control_plane_common/tasks/pre_requisite.yml (+8, -11)
  12. control_plane/roles/control_plane_common/vars/main.yml (+1, -2)
  13. control_plane/roles/control_plane_k8s/tasks/k8s_init.yml (+11, -7)
  14. control_plane/roles/control_plane_k8s/tasks/k8s_installation.yml (+0, -4)
  15. control_plane/roles/control_plane_k8s/tasks/k8s_installation_leap.yml (+10, -2)
  16. control_plane/roles/control_plane_k8s/vars/main.yml (+3, -2)
  17. control_plane/roles/control_plane_monitoring/tasks/configure_k8s_prom_grafana.yml (+33, -21)
  18. control_plane/roles/control_plane_monitoring/tasks/configure_loki_grafana.yml (+31, -19)
  19. control_plane/roles/control_plane_security/tasks/install_389ds.yml (+21, -9)
  20. control_plane/roles/control_plane_security/tasks/install_ipa_server.yml (+11, -5)
  21. docs/FAQ.md (+4, -0)
  22. docs/INSTALL_OMNIA.md (+23, -17)
  23. docs/INSTALL_OMNIA_CONTROL_PLANE.md (+10, -6)
  24. docs/MONITOR_CLUSTERS.md (+1, -1)
  25. docs/README.md (+64, -49)
  26. docs/Security/ENABLE_SECURITY_LOGIN_NODE.md (+39, -3)
  27. docs/Security/ENABLE_SECURITY_MANAGEMENT_STATION.md (+32, -3)
  28. docs/Telemetry_Visualization/Images/ParallelCoordinates.png (binary)
  29. docs/Telemetry_Visualization/Images/PowerMap.png (binary)
  30. docs/Telemetry_Visualization/Images/SankeyViewer.png (binary)
  31. docs/Telemetry_Visualization/Images/Spirallayout.gif (binary)
  32. docs/Telemetry_Visualization/TELEMETRY.md (+40, -0)
  33. docs/Telemetry_Visualization/Visualization.md (+2, -2)
  34. docs/control_plane/device_templates/PROVISION_SERVERS.md (+1, -0)
  35. examples/PyTorch/pytorch-deploy.yaml (+1, -1)
  36. roles/k8s_start_services/tasks/configure_nginx_prom_grafana.yml (+17, -11)
  37. roles/login_node/tasks/install_389ds.yml (+21, -11)
  38. roles/login_node/tasks/install_ipa_client.yml (+26, -19)
  39. roles/login_server/tasks/install_ipa_server.yml (+36, -23)
  40. roles/slurm_common/tasks/main.yml (+2, -2)
  41. roles/slurm_exporter/tasks/configure_grafana.yml (+35, -23)
  42. telemetry/input_params/base_vars.yml (+0, -4)
  43. telemetry/input_params/telemetry_login_vars.yml (+0, -0)
  44. telemetry/roles/common/tasks/main.yml (+2, -2)
  45. telemetry/roles/common/tasks/pre-requisites.yml (+25, -4)
  46. telemetry/roles/common/tasks/validate_base_vars.yml (+4, -16)
  47. telemetry/roles/common/tasks/validate_login_vars.yml (+6, -6)
  48. telemetry/roles/common/vars/main.yml (+9, -5)
  49. telemetry/roles/grafana_config/files/PowerMap.json (+304, -0)
  50. telemetry/roles/grafana_config/files/Sankey.json (+482, -0)
  51. telemetry/roles/grafana_config/files/SpiralLayout.json (+14, -13)
  52. telemetry/roles/grafana_config/files/parallel-coordinate.json (+39, -26)
  53. telemetry/roles/grafana_config/vars/main.yml (+2, -0)
  54. telemetry/roles/idrac_telemetry/tasks/filter_idrac.yml (+1, -1)
  55. telemetry/roles/idrac_telemetry/tasks/initiate_telemetry.yml (+8, -8)
  56. telemetry/roles/idrac_telemetry/vars/main.yml (+0, -1)
  57. telemetry/roles/timescaledb/tasks/timescaledb_pod.yml (+7, -0)
  58. telemetry/roles/timescaledb/vars/main.yml (+1, -0)
  59. telemetry/telemetry.yml (+2, -2)

+ 6 - 3
.all-contributorsrc

@@ -39,7 +39,8 @@
         "mentoring",
         "projectManagement",
         "review",
-        "talk"
+        "talk",
+        "bug"
       ]
     },
     {
@@ -157,7 +158,8 @@
       "profile": "https://github.com/cgoveas",
       "contributions": [
         "doc",
-        "bug"
+        "bug",
+        "maintenance"
       ]
     },
     {
@@ -366,7 +368,8 @@
       "avatar_url": "https://avatars.githubusercontent.com/u/18387748?v=4",
       "profile": "http://www.myweb.ttu.edu/ngu00336/",
       "contributions": [
-        "code"
+        "code",
+        "plugin"
       ]
     },
     {

+ 1 - 0
.gitattributes

@@ -0,0 +1 @@
+*.yml linguist-detectable
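
(`linguist-detectable` marks YAML files as countable by GitHub's Linguist, so the repository's Ansible playbooks are included in its language statistics.)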

+ 7 - 0
.github/workflows/ansible-lint.yml

@@ -30,6 +30,13 @@ jobs:
         targets: |
           /github/workspace/omnia.yml
           /github/workspace/control_plane/control_plane.yml
+          /github/workspace/control_plane/collect_device_info.yml
+          /github/workspace/control_plane/collect_node_info.yml
+          /github/workspace/control_plane/ethernet.yml
+          /github/workspace/control_plane/idrac.yml
+          /github/workspace/control_plane/infiniband.yml
+          /github/workspace/control_plane/powervault_me4.yml
+          /github/workspace/telemetry/telemetry.yml
           /github/workspace/platforms/jupyterhub.yml
           /github/workspace/platforms/kubeflow.yml
           /github/workspace/tools/install_tools.yml
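
For orientation, the `targets` block above is an input to the ansible-lint step of this workflow. A minimal sketch of the overall workflow shape, assuming a containerized `ansible/ansible-lint-action` that accepts a `targets` list (the trigger, checkout step, and action reference here are assumptions, not part of this commit):

```yaml
# Hypothetical minimal workflow; only the targets entries come from the diff above.
name: Ansible Lint
on: [push, pull_request]

jobs:
  ansible-lint:
    runs-on: ubuntu-latest
    steps:
      # Check out the repository into /github/workspace
      - uses: actions/checkout@v2
      # Lint only the selected playbooks
      - name: Run ansible-lint
        uses: ansible/ansible-lint-action@main   # assumed action reference
        with:
          targets: |
            /github/workspace/omnia.yml
            /github/workspace/telemetry/telemetry.yml
```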

+ 18 - 4
README.md

File diff suppressed because it is too large


+ 7 - 1
control_plane/collect_node_info.yml

@@ -1,4 +1,4 @@
-# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,3 +19,9 @@
   gather_facts: false
   roles:
     - collect_node_info
+
+- import_playbook: "{{ playbook_dir }}/roles/collect_node_info/files/create_inventory.yml"
+  vars:
+    host_username: "{{ hostvars['127.0.0.1']['host_username'] }}"
+    host_password: "{{ hostvars['127.0.0.1']['provision_password'] }}"
+    mapping_file: "{{ hostvars['127.0.0.1']['mapping_file'] | bool }}"

+ 2 - 0
control_plane/input_params/base_vars.yml

@@ -94,6 +94,8 @@ awx_organization: "DellEMC"
 
 ### Usage: webui_grafana ###
 # At this location grafana persistent volume will be created.
+# If using telemetry, all telemetry related files will also be stored and
+# both timescale and mysql databases will be mounted to this location.
 mount_location: /opt/omnia/
 
 ### Usage: provision_cobbler, provision_idrac ###

+ 2 - 3
control_plane/roles/collect_device_info/files/create_inventory.yml

@@ -1,4 +1,4 @@
-# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 ---
 
 # This role will not group the devices if user provides invalid credentials
@@ -276,7 +275,7 @@
           when:
             - inventory_hostname not in infiniband_switches.stdout
             - not login.failed
-            - infinibandswitch_info.json.data['Product name'] == infiniband_search_key
+            - (infinibandswitch_info.json.results is defined and infinibandswitch_info.json.results[0].data['Product name'] == infiniband_search_key) or (infinibandswitch_info.json.data is defined and infinibandswitch_info.json.data['Product name'] == infiniband_search_key)
       rescue:
         - name: Failed while adding device to ib_inventory
           debug:
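
The rewritten condition above accepts two response shapes from the switch API: a list under `json.results` or a flat `json.data` object. A self-contained sketch of that defensive check against a mocked payload (every name below is hypothetical):

```yaml
---
# Sketch with a mocked response; a real run would register this from an API call.
- hosts: localhost
  gather_facts: false
  vars:
    infiniband_search_key: "MQM8700"
    infinibandswitch_info:
      json:
        results:
          - data:
              Product name: "MQM8700"
  tasks:
    - name: Match on either response shape
      debug:
        msg: "InfiniBand switch detected"
      when: >
        (infinibandswitch_info.json.results is defined and
         infinibandswitch_info.json.results[0].data['Product name'] == infiniband_search_key) or
        (infinibandswitch_info.json.data is defined and
         infinibandswitch_info.json.data['Product name'] == infiniband_search_key)
```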

+ 41 - 26
control_plane/roles/collect_node_info/files/add_host.yml

@@ -1,4 +1,4 @@
-# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,40 +12,55 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 ---
+
 - name: Initialise host description
   set_fact:
     host_description: "Description Unavailable"
     
 - name: Fetch description
   set_fact:
-    host_description: "CPU:{{ hostvars[item]['ansible_processor_count'] }}
-    Cores:{{ hostvars[item]['ansible_processor_cores'] }}
-    Memory:{{ hostvars[item]['ansible_memtotal_mb'] }}MB
-    BIOS:{{ hostvars[item]['ansible_bios_version'] }}"
-  ignore_errors: yes
+    host_description: "Service Tag: {{ service_tag }}"
+  failed_when: false
+  when: hostname_check.stdout is defined
 
-- name: Fetch the hosts in awx node inventory
-  command: >-
-    awx --conf.host {{ awx_host }} --conf.username {{ awx_username }} --conf.password {{ awx_password }}
-    --conf.insecure hosts list --inventory node_inventory
-  changed_when: false
-  no_log: true
-  when:
-     - host_description != "Description Unavailable"
-  register: hosts
-  ignore_errors: yes
+- block:
+    - name: Fetch the hosts in awx node inventory
+      command: >-
+        awx --conf.host {{ awx_host }} --conf.username {{ awx_username }} --conf.password {{ awx_password }}
+        --conf.insecure hosts list --inventory node_inventory
+      changed_when: false
+      delegate_to: localhost
+      no_log: true
+      run_once: true
+      register: fetch_hosts
+  rescue:
+    - name: Failed to fetch hosts in AWX
+      fail:
+        msg: "{{ fetch_hosts.stderr }}"
   
-- name: Add the host to awx node inventory if not present
-  command: >-
-    awx --conf.host {{ awx_host }} --conf.username {{ awx_username }} --conf.password {{ awx_password }}
-    --conf.insecure hosts create --name {{ item }} --inventory node_inventory
-  changed_when: true
-  when: item not in hosts.stdout
-  no_log: true
-  ignore_errors: yes
+- block:
+    - name: Add the host to awx node inventory if not present
+      command: >-
+        awx --conf.host {{ awx_host }} --conf.username {{ awx_username }} --conf.password {{ awx_password }}
+        --conf.insecure hosts create --name {{ inventory_hostname }} --description "{{ host_description }}" --inventory node_inventory
+      changed_when: true
+      register: add_host_awx
+      delegate_to: localhost
+      no_log: true
+      when:
+        - hostname_check.stdout is defined
+        - fetch_hosts.stdout is defined
+        - inventory_hostname not in fetch_hosts.stdout
+  rescue:
+    - name: Failed to add host to AWX
+      fail:
+        msg: "{{ add_host_awx.stderr }}"
+      when: add_host_awx is defined
 
 - name: Host added msg
   debug:
-    msg: "{{ host_added_msg + item }}"
+    msg: "{{ hostvars['localhost']['host_added_msg'] + inventory_hostname }}"
   when:
-    - host_description != "Description Unavailable"
+    - host_description != "Description Unavailable"
+    - add_host_awx is defined
+    - add_host_awx is not failed
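
A recurring change in this commit is replacing `ignore_errors: yes` with a `block`/`rescue` pair, so a failure stops the run with a readable message instead of being silently swallowed. A minimal standalone sketch of the pattern (the failing command is a placeholder):

```yaml
---
# Minimal block/rescue sketch; /bin/false is a stand-in for any fallible task.
- hosts: localhost
  gather_facts: false
  tasks:
    - block:
        - name: Run a step that may fail
          command: /bin/false
          register: step_result
          changed_when: false
      rescue:
        - name: Surface the failure with context
          fail:
            msg: "Step failed: {{ step_result.stderr | default('no stderr captured') }}"
```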

+ 89 - 32
control_plane/roles/collect_node_info/files/create_inventory.yml

@@ -13,7 +13,7 @@
 # limitations under the License.
 ---
 - name: Find reachable hosts
-  hosts: all
+  hosts: node_inventory
   gather_facts: false
   ignore_unreachable: true
   ignore_errors: true
@@ -47,7 +47,7 @@
       command: "cat {{ omnia_config_file }}"
       changed_when: false
       register: config_content
-      #no_log: True
+      no_log: true
 
     - name: Decrpyt omnia_config.yml
       command: >-
@@ -78,7 +78,7 @@
       register: hostname_check
       changed_when: false
       ignore_errors: true
-
+      
     - name: Check if IP is present in mapping file
       command: grep "{{ inventory_hostname }}" ../../provision_cobbler/files/new_host_mapping_file.csv
       delegate_to: localhost
@@ -95,23 +95,39 @@
     - name: Get the static hostname from mapping file
       shell: awk -F',' '$3 == "{{ inventory_hostname }}" { print $2 }' ../../provision_cobbler/files/new_host_mapping_file.csv
       delegate_to: localhost
-      when: ('localhost' in hostname_check.stdout) and (mapping_file_present != "" ) and ( mapping_file | bool == true )
+      when: 
+        - ( hostname_check.stdout is defined )
+        - ( 'localhost' in hostname_check.stdout ) 
+        - ( mapping_file_present != "" ) 
+        - ( mapping_file | bool == true )
       register: host_name
       ignore_errors: true
 
     - name: Set the hostname from mapping file
       command: hostnamectl set-hostname "{{ host_name.stdout + '.' + hostvars['localhost']['domain_name'] }}"
-      when: ('localhost' in hostname_check.stdout) and (mapping_file_present != "" ) and  (mapping_file | bool == true )
+      when: 
+        - ( hostname_check.stdout is defined )
+        - ( 'localhost' in hostname_check.stdout ) 
+        - ( mapping_file_present != "" ) 
+        - ( mapping_file | bool == true )
       ignore_errors: true
 
     - name: Set the hostname if hostname not present mapping file
       command: hostnamectl set-hostname "compute{{ inventory_hostname.split('.')[-2] + '-' + inventory_hostname.split('.')[-1] + '.' + hostvars['localhost']['domain_name'] }}"
-      when: ('localhost' in hostname_check.stdout) and (file_present.rc != 0) and (mapping_file | bool == true )
+      when: 
+        - ( hostname_check.stdout is defined )
+        - ( 'localhost' in hostname_check.stdout )
+        - ( file_present.rc is defined ) 
+        - ( file_present.rc != 0 ) 
+        - ( mapping_file | bool == true )
       ignore_errors: true
 
-    - name: Set the system hostname
+    - name: Set the system hostname if mapping file not present
       command: hostnamectl set-hostname "compute{{ inventory_hostname.split('.')[-2] + '-' + inventory_hostname.split('.')[-1]+'.'+ hostvars['localhost']['domain_name'] }}"
-      when: ('localhost' in hostname_check.stdout) and (mapping_file | bool == false)
+      when: 
+        - ( hostname_check.stdout is defined )
+        - ( 'localhost' in hostname_check.stdout ) 
+        - ( mapping_file | bool == false )
       ignore_errors: true
 
     - name: Add new hostname to /etc/hosts from mapping file
@@ -119,7 +135,11 @@
         dest: /etc/hosts
         line: "{{ inventory_hostname }} {{ host_name.stdout + '.' + hostvars['localhost']['domain_name'] }}"
         state: present
-      when: ('localhost' in hostname_check.stdout) and ( mapping_file_present != "" ) and ( mapping_file | bool == true )
+      when: 
+        - ( hostname_check.stdout is defined )
+        - ( 'localhost' in hostname_check.stdout ) 
+        - ( mapping_file_present != "" ) 
+        - ( mapping_file | bool == true )
       ignore_errors: true
 
     - name: Add new hostname to /etc/hosts if hostname not present mapping file
@@ -127,38 +147,69 @@
         dest: /etc/hosts
         line: "{{ inventory_hostname }} compute{{ inventory_hostname.split('.')[-2] + '-' + inventory_hostname.split('.')[-1]+'.'+ hostvars['localhost']['domain_name'] }}"
         state: present
-      when: ('localhost' in hostname_check.stdout) and ( file_present.rc != 0 ) and ( mapping_file | bool == true )
+      when: 
+        - ( hostname_check.stdout is defined )
+        - ( 'localhost' in hostname_check.stdout )
+        - ( file_present.rc is defined ) 
+        - ( file_present.rc != 0 ) 
+        - ( mapping_file | bool == true )
       ignore_errors: true
 
-    - name: Add new hostname to /etc/hosts
+    - name: Add new hostname to /etc/hosts if mapping file not present
       lineinfile:
         dest: /etc/hosts
         line: "{{ inventory_hostname }} compute{{ inventory_hostname.split('.')[-2] + '-' + inventory_hostname.split('.')[-1] +'.'+ hostvars['localhost']['domain_name'] }}"
         state: present
-      when: ('localhost' in hostname_check.stdout) and (mapping_file | bool == false )
+      when: 
+        - ( hostname_check.stdout is defined )
+        - ( 'localhost' in hostname_check.stdout ) 
+        - ( mapping_file | bool == false )
       ignore_errors: true
 
+    - name: Initialize service tag
+      set_fact:
+        service_tag: "Not Found"
+
+    - name: Get service tag
+      shell: >
+          set -o pipefail && \
+          dmidecode -t 1 | grep Serial
+      changed_when: false
+      failed_when: false
+      register: service_tag_details
+      when: hostname_check.stdout is defined
+
+    - name: Set fact service tag
+      set_fact:
+        service_tag: "{{ service_tag_details.stdout.split(':')[1].strip() }}"
+      when: service_tag_details.stdout is defined
+
 - name: Update inventory
-  hosts: localhost
+  hosts: reachable
   connection: local
   gather_facts: false
   tasks:
     - name: Encrypt omnia_config.yml file
       command: >-
-        ansible-vault encrypt "{{ omnia_config_file }}"
-        --vault-password-file "{{ omnia_config_vault_file }}"
+        ansible-vault encrypt "{{ hostvars['localhost']['omnia_config_file'] }}"
+        --vault-password-file "{{ hostvars['localhost']['omnia_config_vault_file'] }}"
       changed_when: false
+      delegate_to: localhost
+      run_once: true
 
     - name: Update omnia_config.yml permissions
       file:
-        path: "{{ omnia_config_file }}"
-        mode: "{{ file_perm }}"
+        path: "{{ hostvars['localhost']['omnia_config_file'] }}"
+        mode: "{{ hostvars['localhost']['file_perm'] }}"
+      delegate_to: localhost
+      run_once: true
 
     - name: Check if tower_config_file file is encrypted
       command: cat "{{ playbook_dir }}/../../webui_awx/files/.tower_cli.cfg"
       changed_when: false
       no_log: true
       register: tower_config_content
+      delegate_to: localhost
       run_once: true
 
     - name: Decrypt tower_config_file
@@ -167,17 +218,21 @@
         --vault-password-file "{{ playbook_dir }}/../../webui_awx/files/.tower_vault_key"
       changed_when: false
       when: "'$ANSIBLE_VAULT;' in tower_config_content.stdout"
+      delegate_to: localhost
       run_once: true
 
-    - name: Change file permissions
+    - name: Change file permissions - tower_config_file
       file:
         path: "{{ playbook_dir }}/../../webui_awx/files/.tower_cli.cfg"
-        mode: "{{ file_perm }}"
+        mode: "{{ hostvars['localhost']['file_perm'] }}"
+      delegate_to: localhost
+      run_once: true
 
     - name: Fetch awx host
       command: grep "host:" "{{ playbook_dir }}/../../webui_awx/files/.tower_cli.cfg"
       register: fetch_awx_host
       changed_when: false
+      delegate_to: localhost
       run_once: true
 
     - name: Fetch awx username
@@ -185,14 +240,16 @@
       register: fetch_awx_username
       changed_when: false
       run_once: true
-      no_log: true
+      delegate_to: localhost
+      run_once: true
 
     - name: Fetch awx password
       command: grep "password:" "{{ playbook_dir }}/../../webui_awx/files/.tower_cli.cfg"
       register: fetch_awx_password
       changed_when: false
       run_once: true
-      no_log: true
+      delegate_to: localhost
+      run_once: true
 
     - name: Set awx variables
       set_fact:
@@ -208,15 +265,15 @@
       changed_when: false
       when: "'$ANSIBLE_VAULT;' in tower_config_content.stdout"
       run_once: true
+      delegate_to: localhost
+      run_once: true
+
+    - name: Change file permissions - tower_config_file
+      file:
+        path: "{{ playbook_dir }}/../../webui_awx/files/.tower_cli.cfg"
+        mode: "{{ hostvars['localhost']['file_perm'] }}"
+      delegate_to: localhost
+      run_once: true
 
-    - name: Update inventory file
-      block:
-        - name: Fetch facts and add new hosts
-          include_tasks: add_host.yml
-          with_items: "{{ groups['reachable'] }}"
-      when: "'reachable' in groups"
-
-    - name: Show unreachable hosts
-      debug:
-        msg: "{{ host_unreachable_msg }} + {{ groups['ungrouped'] }}"
-      when: "'ungrouped' in groups"
+    - name: Fetch facts and add new hosts
+      include_tasks: add_host.yml
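
The new service-tag lookup shells out to `dmidecode` and splits the `Serial Number` line on the colon. A standalone sketch of that extraction (the play scaffolding is assumed; `dmidecode` needs root):

```yaml
---
# Sketch of the service-tag extraction pattern used above.
- hosts: localhost
  gather_facts: false
  become: true
  tasks:
    - name: Read the chassis serial / service tag
      shell: set -o pipefail && dmidecode -t 1 | grep Serial
      args:
        executable: /bin/bash
      register: service_tag_details
      changed_when: false
      failed_when: false

    - name: Parse out the tag value
      set_fact:
        service_tag: "{{ service_tag_details.stdout.split(':')[1] | trim }}"
      when:
        - service_tag_details.rc == 0
        - "':' in service_tag_details.stdout"
```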

+ 17 - 14
control_plane/roles/collect_node_info/tasks/main.yml

@@ -1,4 +1,4 @@
-# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -74,17 +74,20 @@
       when: "'$ANSIBLE_VAULT;' in config_content.stdout"
       run_once: true
 
-    - name: Add inventory playbook
-      block:
-        - name: add hosts with description to inventory file
-          command: >-
-            ansible-playbook -i {{ provisioned_hosts_file }}
-            {{ role_path }}/files/create_inventory.yml
-            --extra-vars "host_username={{ host_username }} host_password={{ provision_password }} mapping_file={{ mapping_file | bool }}"
-          no_log: True
-          register: register_error
-      rescue:
-        - name: Fail if host addition was not successful
-          fail:
-            msg: "{{ register_error.stderr + register_error.stdout | regex_replace(host_username) | regex_replace(provision_password) }}"
+    - name: Change file permissions
+      file:
+        path: "{{ login_vars_file }}"
+        mode: "{{ file_perm }}"
+
+    - name: Check the provisioned_hosts_file output
+      command: cat {{ provisioned_hosts_file }}
+      changed_when: false
+      register: os_hosts
+      
+    - name: Create device_inventory
+      add_host:
+        name: "{{ item }}"
+        groups: "node_inventory"
+      with_items: "{{ os_hosts.stdout_lines }}"
+      when: item | trim | length > 1
   when: provisioned_file.stat.exists
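
The replacement flow above drops the nested `ansible-playbook` invocation and instead reads the provisioned-hosts file and builds an in-memory `node_inventory` group with `add_host`, which later plays can target directly. A self-contained sketch (the hosts-file path is a placeholder):

```yaml
---
# Sketch: build a dynamic inventory group from a flat file of IPs.
- hosts: localhost
  gather_facts: false
  tasks:
    - name: Read the provisioned hosts file (placeholder path)
      command: cat /tmp/provisioned_hosts.yml
      register: os_hosts
      changed_when: false

    - name: Add each non-empty line to node_inventory
      add_host:
        name: "{{ item }}"
        groups: node_inventory
      with_items: "{{ os_hosts.stdout_lines }}"
      when: item | trim | length > 1

# Later plays can then target the in-memory group:
- hosts: node_inventory
  gather_facts: false
  tasks:
    - name: Show discovered hosts
      debug:
        msg: "Discovered {{ inventory_hostname }}"
```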

+ 8 - 11
control_plane/roles/control_plane_common/tasks/pre_requisite.yml

@@ -86,17 +86,14 @@
       register: sestatus_current
       changed_when: false
 
-    - name: Disable SElinux
-      replace:
-        path: "{{ selinux_config_path }}"
-        regexp: 'SELINUX=[a-z]+'
-        replace: 'SELINUX=disabled'
-      when: '"SELinux status:                 enabled" in sestatus_current.stdout_lines'
+    - name: Check SELinux status
+      debug:
+        msg: "{{ selinux_warning }}"
+      when: '"SELinux status:                 disabled" in sestatus_current.stdout_lines'
 
-    - name: Status of SElinux
-      fail:
-        msg: "{{ selinux_status }}"
+    - name: Set SElinux to permissive mode
+      command: setenforce 0
       when: '"SELinux status:                 enabled" in sestatus_current.stdout_lines'
-      register: selinux_value
+
   tags: init
-  when: os_supported_leap not in mgmt_os
+  when: os_supported_leap not in mgmt_os
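
The change above stops rewriting `/etc/sysconfig/selinux` and instead drops an enforcing system to permissive mode at runtime. A standalone sketch of that check-then-set flow (the play scaffolding is assumed):

```yaml
---
# Sketch: warn when SELinux is disabled, otherwise switch to permissive mode.
- hosts: localhost
  gather_facts: false
  become: true
  tasks:
    - name: Fetch current SELinux status
      command: sestatus
      register: sestatus_current
      changed_when: false

    - name: Warn if SELinux is disabled
      debug:
        msg: "Warning! SELinux status is disabled by user. No SELinux policy is loaded"
      when: '"SELinux status:                 disabled" in sestatus_current.stdout_lines'

    - name: Set SELinux to permissive mode (runtime only, not persisted)
      command: setenforce 0
      when: '"SELinux status:                 enabled" in sestatus_current.stdout_lines'
```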

+ 1 - 2
control_plane/roles/control_plane_common/vars/main.yml

@@ -61,13 +61,12 @@ os_supported_rocky_version: "8.4"
 fail_os_status: "Unsupported OS or OS version. OS should be {{ os_supported_centos }} {{ os_supported_centos_version }} or {{ os_supported_rocky }} {{ os_supported_rocky_version }} or {{ os_supported_leap }} {{ os_supported_leap_version }}"
 success_os_status: "Management Station OS validated"
 internet_status: "Failed. No Internet connection. Make sure network is up."
-selinux_status: "SElinux is not disabled. Disable it in /etc/sysconfig/selinux and reboot the system"
+selinux_warning: "Warning! SELinux status is disabled by user. No SELinux policy is loaded"
 ansible_python_version_status: "For {{ mgmt_os }} {{ ansible_distribution_version }}, python bindings of firewalld, dnf, selinux are not available if python is installed from source and not from dnf or zypper. So please make sure python3.6 is installed using dnf or zypper. And ansible uses the python version 3.6 installed using dnf or zypper"
 python_version_support: '3.6.8'
 default_ansible_config_file_path: /etc/ansible/ansible.cfg
 invalid_run_tag_msg: "Failed. init tag should be used with run tags"
 invalid_skip_tag_msg: "Failed. init tag can't be used with skip tags"
-selinux_config_path: /etc/sysconfig/selinux
 
 # Usage: verify_login_inputs.yml
 login_vars_filename: "input_params/login_vars.yml"

+ 11 - 7
control_plane/roles/control_plane_k8s/tasks/k8s_init.yml

@@ -35,13 +35,17 @@
   failed_when: false
   register: k8s_pods
 
-- name: Docker login
-  command: docker login -u {{ docker_username }} -p {{ docker_password }}
-  changed_when: true
-  register: docker_login_output
-  failed_when: false
-  when: docker_username or docker_password
-  no_log: true
+- block:
+    - name: Docker login
+      command: docker login -u {{ docker_username }} -p {{ docker_password }}
+      changed_when: true
+      register: docker_login_output
+      when: docker_username or docker_password
+      no_log: true
+  rescue:
+    - name: Warning - docker login failed
+      debug:
+        msg: "Warning: {{ docker_login_output.stderr }}" 
 
 - name: Docker login check
   fail:

+ 0 - 4
control_plane/roles/control_plane_k8s/tasks/k8s_installation.yml

@@ -19,10 +19,6 @@
     fstype: swap
     state: absent
 
-- name: Disable selinux
-  selinux:
-    state: disabled
-
 - name: Copy k8s.conf file
   copy:
     src: k8s.conf

+ 10 - 2
control_plane/roles/control_plane_k8s/tasks/k8s_installation_leap.yml

@@ -54,6 +54,13 @@
   command: /sbin/sysctl --system
   changed_when: true
 
+- name: Add crio repo
+  zypper_repository:
+    repo: "{{ crio_repo_leap }}"
+    state: present
+    disable_gpg_check: yes
+    autorefresh: yes
+
 - name: Installing cri-o
   package:
     name: cri-o
@@ -120,7 +127,7 @@
 
 - name: Install Kubeadm
   ansible.builtin.expect:
-    command: zypper install --oldpackage "{{ kubeadm_version }}"
+    command: zypper install --replacefiles --force --oldpackage "{{ kubeadm_version }}"
     responses:
         (.*) [1/2/c/d/?](.): '2'
         (.*)(y): 'y'
@@ -130,7 +137,7 @@
 
 - name: Install Kubelet
   ansible.builtin.expect:
-    command: zypper install --oldpackage "{{ kubelet_version }}"
+    command: zypper install --replacefiles --force --oldpackage "{{ kubelet_version }}"
     responses:
         (.*) [1/2/c/d/?](.): '2'
         (.*)(y): 'y'
@@ -142,6 +149,7 @@
   zypper:
      name: "{{ kubectl_version }}"
      state: present
+     replacefiles: true
      oldpackage: yes
      force: yes
   register: kubectl_status
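
On Leap, the commit both registers the Kubic repository (so `cri-o` resolves) and forces zypper to accept an older, file-conflicting package version. A condensed sketch of the two steps (the package spec below is a placeholder, not the exact version the role pins):

```yaml
---
# Sketch for openSUSE Leap: add the Kubic repo, then install a pinned older package.
- hosts: localhost
  become: true
  tasks:
    - name: Add the CRI-O (Kubic) repository
      community.general.zypper_repository:
        repo: "https://download.opensuse.org/repositories/devel:kubic/15.3/devel:kubic.repo"
        state: present
        disable_gpg_check: yes
        autorefresh: yes

    - name: Install a pinned kubectl (placeholder version)
      community.general.zypper:
        name: "kubectl=1.21.0"
        state: present
        oldpackage: yes
        force: yes
        replacefiles: yes
```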

+ 3 - 2
control_plane/roles/control_plane_k8s/vars/main.yml

@@ -1,4 +1,4 @@
-#  Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -31,7 +31,8 @@ min_retries: 3
 max_retries: 10
 min_delay: 10
 wait_time: 30
- 
+crio_repo_leap: "https://download.opensuse.org/repositories/devel:kubic/15.3/devel:kubic.repo"
+
 # Usage: k8s_installation.yml
 common_packages:
   - openssl

+ 33 - 21
control_plane/roles/control_plane_monitoring/tasks/configure_k8s_prom_grafana.yml

@@ -23,28 +23,40 @@
   changed_when: false
   register: kube_prom_svc_port
 
-- name: Create prometheus datasource in grafana
-  community.grafana.grafana_datasource:
-    name: control-plane-prometheus
-    grafana_url: "http://{{ grafana_svc_ip.stdout }}:{{ grafana_svc_port.stdout }}"
-    grafana_user: "{{ grafana_username }}"
-    grafana_password: "{{ grafana_password }}"
-    ds_type: prometheus
-    ds_url: "http://{{ kube_prom_svc_ip.stdout }}:{{ kube_prom_svc_port.stdout }}"
-    access: proxy
-  no_log: true
+- block:
+    - name: Create prometheus datasource in grafana
+      community.grafana.grafana_datasource:
+        name: control-plane-prometheus
+        grafana_url: "http://{{ grafana_svc_ip.stdout }}:{{ grafana_svc_port.stdout }}"
+        grafana_user: "{{ grafana_username }}"
+        grafana_password: "{{ grafana_password }}"
+        ds_type: prometheus
+        ds_url: "http://{{ kube_prom_svc_ip.stdout }}:{{ kube_prom_svc_port.stdout }}"
+        access: proxy
+      no_log: true
+      register: create_prom_datasource
+  rescue:
+    - name: Create prometheus datasource in grafana failed
+      fail:
+        msg: "Error: {{ create_prom_datasource.msg }}"
 
-- name: Import K8s grafana dashboards
-  community.grafana.grafana_dashboard:
-    grafana_url: "http://{{ grafana_svc_ip.stdout }}:{{ grafana_svc_port.stdout }}"
-    grafana_user: "{{ grafana_username }}"
-    grafana_password: "{{ grafana_password }}"
-    state: present
-    commit_message: Updated by ansible
-    overwrite: yes
-    path: "{{ role_path }}/files/{{ item }}"
-  with_items: "{{ grafana_dashboard_json_files }}"
-  no_log: true
+- block:
+    - name: Import K8s grafana dashboards
+      community.grafana.grafana_dashboard:
+        grafana_url: "http://{{ grafana_svc_ip.stdout }}:{{ grafana_svc_port.stdout }}"
+        grafana_user: "{{ grafana_username }}"
+        grafana_password: "{{ grafana_password }}"
+        state: present
+        commit_message: Updated by ansible
+        overwrite: yes
+        path: "{{ role_path }}/files/{{ item }}"
+      with_items: "{{ grafana_dashboard_json_files }}"
+      no_log: true
+      register: import_prom_dashboards
+  rescue:
+    - name: Import K8s grafana dashboards failed
+      fail:
+        msg: "Error: {{ import_prom_dashboards.msg }}"
 
 - name: Save grafana svc ip
   replace:

+ 31 - 19
control_plane/roles/control_plane_monitoring/tasks/configure_loki_grafana.yml

@@ -27,23 +27,35 @@
   changed_when: false
   register: loki_svc_port
 
-- name: Create loki datasource in grafana
-  community.grafana.grafana_datasource:
-    name: control-plane-loki
-    grafana_url: "http://{{ grafana_svc_ip.stdout }}:{{ grafana_svc_port.stdout }}"
-    grafana_user: "{{ grafana_username }}"
-    grafana_password: "{{ grafana_password }}"
-    ds_type: loki
-    ds_url: "http://{{ loki_svc_ip.stdout }}:{{ loki_svc_port.stdout }}"
-  no_log: true
+- block:
+    - name: Create loki datasource in grafana
+      community.grafana.grafana_datasource:
+        name: control-plane-loki
+        grafana_url: "http://{{ grafana_svc_ip.stdout }}:{{ grafana_svc_port.stdout }}"
+        grafana_user: "{{ grafana_username }}"
+        grafana_password: "{{ grafana_password }}"
+        ds_type: loki
+        ds_url: "http://{{ loki_svc_ip.stdout }}:{{ loki_svc_port.stdout }}"
+      no_log: true
+      register: create_loki_datasource
+  rescue:
+    - name: Create loki datasource in grafana failed
+      fail:
+        msg: "Error: {{ create_loki_datasource.msg }}"
 
-- name: Import loki dashboard in grafana
-  community.grafana.grafana_dashboard:
-    grafana_url: "http://{{ grafana_svc_ip.stdout }}:{{ grafana_svc_port.stdout }}"
-    grafana_user: "{{ grafana_username }}"
-    grafana_password: "{{ grafana_password }}"
-    state: present
-    commit_message: Updated by ansible
-    overwrite: yes
-    path: "{{ role_path }}/files/loki_dashboard.json"
-  no_log: true
+- block:
+    - name: Import loki dashboard in grafana
+      community.grafana.grafana_dashboard:
+        grafana_url: "http://{{ grafana_svc_ip.stdout }}:{{ grafana_svc_port.stdout }}"
+        grafana_user: "{{ grafana_username }}"
+        grafana_password: "{{ grafana_password }}"
+        state: present
+        commit_message: Updated by ansible
+        overwrite: yes
+        path: "{{ role_path }}/files/loki_dashboard.json"
+      no_log: true
+      register: import_loki_dashboard
+  rescue:
+    - name: Import loki datasource in grafana failed
+      fail:
+        msg: "Error: {{ import_loki_dashboard.msg }}"

+ 21 - 9
control_plane/roles/control_plane_security/tasks/install_389ds.yml

@@ -198,11 +198,17 @@
         - name: Create admin principal failed
           fail:
             msg: "Error: {{ create_admin_principal.stderr }}"
-
-    - name: Authenticate as admin
-      shell: set -o pipefail && echo {{ ms_kerberos_admin_password }} | kinit admin
-      no_log: true
-      changed_when: false
+    
+    - block:
+        - name: Authenticate as admin
+          shell: set -o pipefail && echo {{ ms_kerberos_admin_password }} | kinit admin
+          no_log: true
+          changed_when: false
+          register: authenticate_admin
+      rescue:
+        - name: Authenticate as admin failed
+          fail:
+            msg: "Error: {{ authenticate_admin.stderr }}"
 
     - name: Install sssd packages
       zypper:
@@ -244,8 +250,14 @@
         state: started
         enabled: yes
 
-    - name: Configure password policy in 389-ds
-      command: dsconf -w {{ ms_directory_manager_password }} -D "cn=Directory Manager" ldap://{{ server_hostname_ms }} pwpolicy set --pwdlockoutduration {{ lockout_duration }} --pwdmaxfailures {{ max_failures }} --pwdresetfailcount {{ failure_reset_interval }}
-      no_log: true
-      changed_when: true
+    - block:
+        - name: Configure password policy in 389-ds
+          command: dsconf -w {{ ms_directory_manager_password }} -D "cn=Directory Manager" ldap://{{ server_hostname_ms }} pwpolicy set --pwdlockoutduration {{ lockout_duration }} --pwdmaxfailures {{ max_failures }} --pwdresetfailcount {{ failure_reset_interval }}
+          no_log: true
+          changed_when: true
+          register: configure_pwpolicy
+      rescue:
+        - name: Configure password policy in 389-ds failed
+          fail:
+            msg: "Error: {{ configure_pwpolicy.stderr }}"
   when: not ds389_status

+ 11 - 5
control_plane/roles/control_plane_security/tasks/install_ipa_server.yml

@@ -26,11 +26,17 @@
       fail:
         msg: "Error: {{ install_ipa_server.stderr_lines }}"
 
-- name: Authenticate as admin
-  shell: set -o pipefail && echo $'{{ ms_kerberos_admin_password }}' | kinit {{ ms_ipa_admin_username }}
-  no_log: true
-  changed_when: false
-
+- block:
+    - name: Authenticate as admin
+      shell: set -o pipefail && echo $'{{ ms_kerberos_admin_password }}' | kinit {{ ms_ipa_admin_username }}
+      no_log: true
+      changed_when: false
+      register: authenticate_admin
+  rescue:
+    - name: Authenticate as admin failed
+      fail:
+        msg: "Error: {{ authenticate_admin.stderr }}"
+  
 - name: Replace the /etc/resolv.conf file
   copy:
     src: "{{ temp_resolv_conf_path }}"

+ 4 - 0
docs/FAQ.md

@@ -36,6 +36,10 @@ Resolution:
 3. Verify that the downloaded .iso file is valid and correct.
 4. Delete the Cobbler container using `docker rm -f cobbler` and rerun `control_plane.yml`.
 
+## How to enable DHCP routing on Compute Nodes:
+
+To enable routing, update the `primary_dns` and `secondary_dns` in `base_vars` with the appropriate IPs (hostnames are currently not supported). For compute nodes that are not directly connected to the internet (ie only host network is configured), this configuration allows for internet connectivity.
+
 ## Why does PXE boot fail with tftp timeout or service timeout errors?  
 Potential Causes:
 * RAID is configured on the server.

+ 23 - 17
docs/INSTALL_OMNIA.md

@@ -5,7 +5,10 @@ The following sections provide details on installing Omnia using CLI.
 To install the Omnia control plane and manage workloads on your cluster using the Omnia control plane, see [Install the Omnia Control Plane](INSTALL_OMNIA_CONTROL_PLANE.md) and [Monitor Kubernetes and Slurm](MONITOR_CLUSTERS.md) for more information.
 
 ## Prerequisites
-* The login, manager, and compute nodes must be running CentOS 7.9 2009 OS.
+* The login, manager, and compute nodes must be running CentOS 7.9 2009 OS/ Rocky 8.x/ LeapOS 15.3.
+>> __Note:__ If you are using LeapOS, the following repositories will be enabled when running `omnia.yml`:
+>> * OSS ([Repository](http://download.opensuse.org/distribution/leap/15.3/repo/oss/) + [Update](http://download.opensuse.org/update/leap/15.3/oss/))
+>> * Non-OSS ([Repository](http://download.opensuse.org/distribution/leap/15.3/repo/non-oss/) + [Update](http://download.opensuse.org/update/leap/15.3/non-oss/))
 * If you have configured the `omnia_config.yml` file to enable the login node, the login node must be part of the cluster. 
 * All nodes must be connected to the network and must have access to the Internet.
 * Set the hostnames of all the nodes in the cluster.
@@ -42,12 +45,12 @@ To install the Omnia control plane and manage workloads on your cluster using th
 	export PATH=$PATH:/usr/local/bin
 	```  
 	
-**Note**: To deploy Omnia, Python 3.6 provides bindings to system tools such as RPM, DNF, and SELinux. As versions greater than 3.6 do not provide these bindings to system tools, ensure that you install Python 3.6 with dnf.  
+>> **Note**: To deploy Omnia, Python 3.6 provides bindings to system tools such as RPM, DNF, and SELinux. As versions greater than 3.6 do not provide these bindings to system tools, ensure that you install Python 3.6 with dnf.  
 
-**Note**: If Ansible version 2.9 or later is installed, ensure it is uninstalled before installing a newer version of Ansible. Run the following commands to uninstall Ansible before upgrading to a newer version.  
-1. `pip uninstall ansible`
-2. `pip uninstall ansible-base (if ansible 2.9 is installed)`
-3. `pip uninstall ansible-core (if ansible 2.10  > version is installed)`
+>> **Note**: If Ansible version 2.9 or later is installed, ensure it is uninstalled before installing a newer version of Ansible. Run the following commands to uninstall Ansible before upgrading to a newer version.  
+>> 1. `pip uninstall ansible`
+>> 2. `pip uninstall ansible-base (if ansible 2.9 is installed)`
+>> 3. `pip uninstall ansible-core (if ansible 2.10  > version is installed)`
 
 	 
 * On the management station, run the following commands to install Git:
@@ -56,7 +59,7 @@ To install the Omnia control plane and manage workloads on your cluster using th
 	dnf install git -y
 	```
 
-**Note**: If there are errors while executing the Ansible playbook commands, then re-run the commands.  
+>> **Note**: If there are errors while executing the Ansible playbook commands, then re-run the commands.  
 
 ## Steps to install Omnia using CLI
 
@@ -71,7 +74,7 @@ From release branch:
 git clone -b release https://github.com/dellhpc/omnia.git 
 ```-->  
 
-__Note:__ After the Omnia repository is cloned, a folder named __omnia__ is created. Ensure that you do not rename this folder.
+>> __Note:__ After the Omnia repository is cloned, a folder named __omnia__ is created. Ensure that you do not rename this folder.
 
 2. Change the directory to __omnia__: `cd omnia`
 
@@ -97,12 +100,15 @@ __Note:__ After the Omnia repository is cloned, a folder named __omnia__ is crea
 >> __NOTE:__  Without the login node, Slurm jobs can be scheduled only through the manager node.
 
 4. Create an inventory file in the *omnia* folder. Add login node IP address under the *[login_node]* group, manager node IP address under the *[manager]* group, compute node IP addresses under the *[compute]* group, and NFS node IP address under the *[nfs_node]* group. A template file named INVENTORY is provided in the *omnia\docs* folder.  
-	**NOTE**: Ensure that all the four groups (login_node, manager, compute, nfs_node) are present in the template, even if the IP addresses are not updated under login_node and nfs_node groups. 
+>>	**NOTE**: Ensure that all the four groups (login_node, manager, compute, nfs_node) are present in the template, even if the IP addresses are not updated under login_node and nfs_node groups. 
 
 5. To install Omnia:
-```
-ansible-playbook omnia.yml -i inventory 
-```
+
+| Leap OS                     	| CentOS, Rocky                                             	|
+|-----------------------------	|-----------------------------------------------------------	|
+| `ansible-playbook omnia.yml -i inventory -e 'ansible_python_interpreter=/usr/bin/python3'`   	| `ansible-playbook omnia.yml -i inventory`	|
+		
+
 
 6. By default, no skip tags are selected, and both Kubernetes and Slurm will be deployed.  
 
@@ -118,15 +124,15 @@ ansible-playbook omnia.yml -i inventory
 	The default path of the Ansible configuration file is `/etc/ansible/`. If the file is not present in the default path, then edit the `ansible_config_file_path` variable to update the configuration path.
 
 7. To provide passwords for mariaDB Database (for Slurm accounting), Kubernetes Pod Network CIDR, and Kubernetes CNI, edit the `omnia_config.yml` file.  
-__Note:__ 
+>> __Note:__ 
 * Supported values for Kubernetes CNI are calico and flannel. The default value of CNI considered by Omnia is calico. 
 * The default value of Kubernetes Pod Network CIDR is 10.244.0.0/16. If 10.244.0.0/16 is already in use within your network, select a different Pod Network CIDR. For more information, see __https://docs.projectcalico.org/getting-started/kubernetes/quickstart__.
 
-**NOTE**: If you want to view or edit the `omnia_config.yml` file, run the following command:  
+>> **NOTE**: If you want to view or edit the `omnia_config.yml` file, run the following command:  
 - `ansible-vault view omnia_config.yml --vault-password-file .omnia_vault_key` -- To view the file. 
 - `ansible-vault edit omnia_config.yml --vault-password-file .omnia_vault_key` -- To edit the file.
 
-**NOTE**: It is suggested that you use the ansible-vault view or edit commands and that you do not use the ansible-vault decrypt or encrypt commands. If you have used the ansible-vault decrypt or encrypt commands, provide 644 permission to `omnia_config.yml`.  
+>> **NOTE**: It is suggested that you use the ansible-vault view or edit commands and that you do not use the ansible-vault decrypt or encrypt commands. If you have used the ansible-vault decrypt or encrypt commands, provide 644 permission to `omnia_config.yml`.  
 
 Omnia considers `slurm` as the default username for MariaDB.  
 
@@ -160,7 +166,6 @@ The following __kubernetes__ roles are provided by Omnia when __omnia.yml__ file
 - **k8s_start_services** role
 	- Kubernetes services are deployed such as Kubernetes Dashboard, Prometheus, MetalLB and NFS client provisioner
 
-__Note:__ 
 
 * Whenever k8s_version, k8s_cni or k8s_pod_network_cidr needs to be modified after the HPC cluster is setup, the OS in the manager and compute nodes in the cluster must be re-flashed before executing omnia.yml again.
 * After Kubernetes is installed and configured, few Kubernetes and calico/flannel related ports are opened in the manager and compute nodes. This is required for Kubernetes Pod-to-Pod and Pod-to-Service communications. Calico/flannel provides a full networking stack for Kubernetes pods.
@@ -209,11 +214,12 @@ Commands to install JupyterHub and Kubeflow:
 * `ansible-playbook platforms/jupyterhub.yml -i inventory`
 * `ansible-playbook platforms/kubeflow.yml -i inventory`
 
-__Note:__ When the Internet connectivity is unstable or slow, it may take more time to pull the images to create the Kubeflow containers. If the time limit is exceeded, the **Apply Kubeflow configurations** task may fail. To resolve this issue, you must redeploy Kubernetes cluster and reinstall Kubeflow by completing the following steps:
+>> __Note:__ When the Internet connectivity is unstable or slow, it may take more time to pull the images to create the Kubeflow containers. If the time limit is exceeded, the **Apply Kubeflow configurations** task may fail. To resolve this issue, you must redeploy Kubernetes cluster and reinstall Kubeflow by completing the following steps:
 * Format the OS on manager and compute nodes.
 * In the `omnia_config.yml` file, change the k8s_cni variable value from calico to flannel.
 * Run the Kubernetes and Kubeflow playbooks. 
 
+
 ## Add a new compute node to the cluster
 
 To update the INVENTORY file present in `omnia` directory with the new node IP address under the compute group. Ensure the other nodes which are already a part of the cluster are also present in the compute group along with the new node. Then, run `omnia.yml` to add the new node to the cluster and update the configurations of the manager node.

+ 10 - 6
docs/INSTALL_OMNIA_CONTROL_PLANE.md

@@ -34,10 +34,12 @@ Depending on the pass-through switch configured in your HPC environment, the num
   
 * For DHCP configuration, you can provide a host mapping file. If the mapping file is not provided and the variable is left blank, a default mapping file will be created. The provided details must be in the format: MAC address, Hostname, IP address, Component_role. For example, `10:11:12:13,server1,100.96.20.66,compute` and  `14:15:16:17,server2,100.96.22.199,manager` are valid entries.  
 >> __Note:__  
-	* In the *omnia/examples* folder, a **mapping_host_file.csv** template is provided which can be used for DHCP configuration. The header in the template file must not be deleted before saving the file.  
-	* The Hostname should not contain the following characters: , (comma), \. (period) or _ (underscore). However, the **domain name** is allowed commas and periods. 
-	* The Hostname cannot start or end with a hyphen (-).
-* Connect one of the Ethernet cards on the management station to the HPC switch and the other Ethernet card must be connected to the global network. 
+>>	* In the *omnia/examples* folder, a **mapping_host_file.csv** template is provided which can be used for DHCP configuration. The header in the template file must not be deleted before saving the file.  
+>>	* The Hostname should not contain the following characters: , (comma), \. (period) or _ (underscore). However, the **domain name** is allowed commas and periods. 
+>>	* The Hostname cannot start or end with a hyphen (-).
+>>	* No upper case characters are allowed in the hostname.
+>>	* The hostname cannot start with a number.
+* Connect one of the Ethernet cards on the management station connected to the HPC switch and the other Ethernet card must be connected to the internet network. 
 * You must have root privileges to perform installations and configurations using the Omnia control plane.
 * On the management station, ensure that Python 3.6 and Ansible are installed (The following commands are compatible with all 3 OS's unless marked otherwise).  
 	* Run the following commands to install Python 3.6:  
@@ -113,7 +115,7 @@ To configure the login node, refer to [Install_Omnia](INSTALL_OMNIA.md).
 >> * Supported values for Kubernetes CNI are calico and flannel. The default value of CNI considered by Omnia is calico.	
 >> * The default value of Kubernetes Pod Network CIDR is 10.244.0.0/16. If 10.244.0.0/16 is already in use within your network, select a different Pod Network CIDR. For more information, see __https://docs.projectcalico.org/getting-started/kubernetes/quickstart__.  
 >> * The default path of the Ansible configuration file is `/etc/ansible/`. If the file is not present in the default path, then edit the `ansible_conf_file_path` variable to update the configuration path.
->> * If you choose to use FreeIPA on both the Management Station and the login_node, simply follow the steps mentioned [here](docs/Security/Enable_Security_ManagementStation.md) and set login_node to true. However, if you would only like to use FreeIPA on the login_node, be sure to fill out all the relevant variables in `omnia_config.yml`.
+>> * If you choose to enable security on both the Management Station, simply follow the steps mentioned [here](docs/Security/Enable_Security_ManagementStation.md).
 
 
 5. Change the directory to **control_plane/input_params** using the command: `cd omnia/control_plane/input_params`
@@ -210,6 +212,8 @@ Below are all the parameters in `login_vars.yml`
 | ms_kerberos_admin_password      |                          | Password authenticating the 'admin' account on the IPA server. If 389ds   is in use, this field authenticates the Kerberos Admin.                                                                                                             |
 
 
+
+
 ## Default Ansible AWX configurations  
 * The role used to deploy AWX within the *control_plane.yml* file: *webui_awx*.  
 * All the pods are deployed in the specific namespace: *awx*.  
@@ -221,7 +225,7 @@ Omnia performs the following configurations on AWX:
 * For networking switches, InfiniBand switches, iDRAC, and PowerVault Storage, four inventories are available- **ethernet_inventory**, **infiniband_inventory**, **idrac_inventory**, **provisioned_idrac_inventory**, and **powervault_me4_inventory**.
 * IP addresses of the hosts are stored in **node_inventory**.
 * The device credentials are stored in **idrac_credential**, **ethernet_credential**, **infiniband_credential**, and **powervault_me4_credential**. The **node_credential** stores the credentials of nodes in the cluster. 
-* Four groups are created under **node_inventory**-manager, compute, login, and nfs. All nodes in the inventory are added to these groups from the AWX UI.
+* Four groups are created under **node_inventory**-manager, compute, login, and nfs. All nodes in the inventory are to be added to these groups from the AWX UI by the user.
 * iDRAC, networking switches, InfiniBand switches, and PowerVault storage devices can be configured using the respective templates: **idrac_template**, **ethernet_template**, **infiniband_template**, and **powervault_me4_template**. **deploy_omnia_template** is used to deploy Kubernetes and Slurm on the compute nodes. 
 * Schedules are created for the **node_inventory_job** (every **10 minutes**) and the **device_inventory_job** (**once daily**) to dynamically retrieve and update node and device details to AWX.  
 

+ 1 - 1
docs/MONITOR_CLUSTERS.md

@@ -20,7 +20,7 @@ To access any of the dashboards, ensure that a compatible web browser is install
 	5. `logout and login back`
 	6. To launch Firefox from terminal, run `firefox&`
 
-**NOTE**: When the PuTTY or MobaXterm session ends, you must run **export DISPLAY=:10.0** command each time, else Firefox cannot be launched again.  
+>> **NOTE**: When the PuTTY or MobaXterm session ends, you must run **export DISPLAY=:10.0** command each time, else Firefox cannot be launched again.  
 
 ## Access FreeIPA Dashboard  
 The FreeIPA Dashboard can be accessed from the management station, manager, and login nodes. To access the dashboard:

+ 64 - 49
docs/README.md

@@ -62,7 +62,6 @@ Kubeflow  |  1
 Prometheus  |  2.23.0
 Ansible  |  2.9.21
 Python  |  3.6.15
-CRI-O  |  1.21.0
 
 ## Hardware managed by Omnia
 The following table lists the supported devices managed by Omnia. Other devices than those listed in the following table will be discovered by Omnia, but features offered by Omnia will not be applicable.
@@ -78,54 +77,70 @@ Mellanox InfiniBand Switches	|	NVIDIA MQM8700-HS2F Quantum HDR InfiniBand Switch
 ## Software deployed by Omnia
 The following table lists the software and its compatible version managed by Omnia. To avoid any impact on the proper functioning of Omnia, other versions than those listed are not supported.
 
-Software	|	License	|	Compatible Version	|	Description
------------	|	-------	|	----------------	|	-----------------
-LeapOS 15.3	|	-	|	15.x|	Operating system on entire cluster
-CentOS Linux release 7.9.2009 (Core)	|	-	|	7.9	|	Operating system on entire cluster except for management station
-Rocky 8.x	|	-	|	8.x	|	Operating system on entire cluster except for management station
-Rocky 8.x	|	-	|	8.x	|	Operating system on the management station
-MariaDB	|	GPL 2.0	|	5.5.68	|	Relational database used by Slurm
-Slurm	|	GNU General Public	|	20.11.7	|	HPC Workload Manager
-Docker CE	|	Apache-2.0	|	20.10.2	|	Docker Service
-FreeIPA	|	GNU General Public License v3	|	4.6.8	|	Authentication system used in the login node
-OpenSM	|	GNU General Public License 2	|	3.3.24	|	-
-NVIDIA container runtime	|	Apache-2.0	|	3.4.2	|	Nvidia container runtime library
-Python PIP	|	MIT License	|	21.1.2	|	Python Package
-Python3	|	-	|	3.6.8 (3.6.15 if LeapOS is being used)	|	-
-Kubelet	|	Apache-2.0	|	1.16.7,1.19, 1.21  	|	Provides external, versioned ComponentConfig API types for configuring the kubelet
-Kubeadm	|	Apache-2.0	|	1.16.7,1.19, 1.21 	|	"fast paths" for creating Kubernetes clusters
-Kubectl	|	Apache-2.0	|	1.16.7,1.19, 1.21 	|	Command line tool for Kubernetes
-JupyterHub	|	Modified BSD License	|	1.1.0	|	Multi-user hub
-kubernetes Controllers	|	Apache-2.0	|	1.16.7,1.19 (1.21 if LeapOS is being used)	|	Orchestration tool	
-Kfctl	|	Apache-2.0	|	1.0.2	|	CLI for deploying and managing Kubeflow
-Kubeflow	|	Apache-2.0	|	1	|	Cloud Native platform for machine learning
-Helm	|	Apache-2.0	|	3.5.0	|	Kubernetes Package Manager
-Helm Chart	|	-	|	0.9.0	|	-
-TensorFlow	|	Apache-2.0	|	2.1.0	|	Machine Learning framework
-Horovod	|	Apache-2.0	|	0.21.1	|	Distributed deep learning training framework for Tensorflow
-MPI	|	Copyright (c) 2018-2019 Triad National Security,LLC. All rights reserved.	|	0.3.0	|	HPC library
-CoreDNS	|	Apache-2.0	|	1.6.2	|	DNS server that chains plugins
-CNI	|	Apache-2.0	|	0.3.1	|	Networking for Linux containers
-AWX	|	Apache-2.0	|	20.0.0	|	Web-based User Interface
-AWX.AWX	|	Apache-2.0	|	19.4.0	|	Galaxy collection to perform awx configuration
-AWXkit	|	Apache-2.0	|	18.0.0	|	To perform configuration through CLI commands
-Cri-o	|	Apache-2.0	|	1.21, 1.17.3  (LeapOS only supports  1.17.3) |	Container Service
-Buildah	|	Apache-2.0	|	1.22.4	|	Tool to build and run containers
-PostgreSQL	|	Copyright (c) 1996-2020, PostgreSQL Global Development Group	|	10.15	|	Database Management System
-Redis	|	BSD-3-Clause License	|	6.0.10	|	In-memory database
-NGINX	|	BSD-2-Clause License	|	1.14	|	-
-dellemc.os10	|	GNU-General Public License v3.1	|	1.1.1	|	It provides networking hardware abstraction through a common set of APIs
-OMSDK	|	Apache-2.0	|	1.2.488	|	Dell EMC OpenManage Python SDK (OMSDK) is a python library that helps developers and customers to automate the lifecycle management of PowerEdge Servers
-| Loki                                  | Apache License 2.0               | 2.4.1  | Loki is a log aggregation system   designed to store and query logs from all your applications and   infrastructure                            |
-| Promtail                              | Apache License 2.1               | 2.4.1  | Promtail is an agent which ships the contents of local logs to   a private Grafana Loki instance or Grafana Cloud.                             |
-| kube-prometheus-stack                 | Apache License 2.2               | 25.0.0 | Kube Prometheus Stack is a collection of Kubernetes manifests,   Grafana dashboards, and Prometheus rules.                                     |
-| mailx                                 | MIT License                      | 12.5   | mailx is a Unix utility program for sending and receiving   mail.                                                                              |
-| postfix                               | IBM Public License               | 3.5.8  | Mail Transfer Agent (MTA) designed to determine routes and   send emails                                                                       |
-| xorriso                               | GPL version 3                    | 1.4.8  | xorriso copies file objects from POSIX compliant filesystems   into Rock Ridge enhanced ISO 9660 filesystems.                                  |
-| Dell EMC   OpenManage Ansible Modules | GNU- General Public License v3.0 | 5.0.0  | OpenManage Ansible Modules simplifies and automates   provisioning, deployment, and updates of PowerEdge servers and modular   infrastructure. |
-| 389-ds                               | GPL version 3               | 1.4.4  |  LDAP server used for authentication, access control.                                                                       |
-| sssd                               | GPL version 3                    | 1.16.1  | A set of daemons used to manage access to remote directory services and authentication mechanisms.                                   |
-| krb5 | MIT License | 1.19.2  | Authentication protocol providing strong authentication for client/server applications by using secret-key cryptography |
+| Software	                                  	| 	License	                                                                    | 	Compatible Version	                            | 	Description                                                                                                                                                 |
+|-------------------------------------------	|-----------------------------------------------------------------------------	|-------------------------------------------------	|--------------------------------------------------------------------------------------------------------------------------------------------------------------	|
+| LeapOS 15.3	                               	| 	-	                                                                        | 	15.x                                            | 	Operating system on entire cluster                                                                                                                          |
+| CentOS Linux release 7.9.2009 (Core)	      	| 	-	                                                                        | 	7.9	                                            | 	Operating system on entire cluster except for management station                                                                                            |
+| Rocky 8.x	                                 	| 	-	                                                                        | 	8.x	                                            | 	Operating system on entire cluster except for management station                                                                                            |
+| Rocky 8.x	                                 	| 	-	                                                                        | 	8.x	                                            | 	Operating system on the management station                                                                                                                  |
+| MariaDB	                                   	| 	GPL 2.0	                                                                    | 	5.5.68	                                        | 	Relational database used by Slurm                                                                                                                           |
+| Slurm	                                     	| 	GNU General Public	                                                        | 	20.11.7	                                        | 	HPC Workload Manager                                                                                                                                        |
+| Docker CE	                                 	| 	Apache-2.0	                                                                | 	20.10.2	                                        | 	Docker Service                                                                                                                                              |
+| FreeIPA	                                   	| 	GNU General Public License v3	                                            | 	4.6.8	                                        | 	Authentication system used in the login node                                                                                                                |
+| OpenSM	                                    | 	GNU General Public License 2	                                            | 	3.3.24	                                        | 	-                                                                                                                                                           |
+| NVIDIA container runtime	                  	| 	Apache-2.0	                                                                | 	3.4.2	                                        | 	Nvidia container runtime library                                                                                                                            |
+| Python PIP	                                | 	MIT License	                                                                | 	21.1.2	                                        | 	Python Package                                                                                                                                              |
+| Python3	                                   	| 	-	                                                                        | 	3.6.8 (3.6.15 if LeapOS is being used)	        | 	-                                                                                                                                                           |
+| Kubelet	                                   	| 	Apache-2.0	                                                                | 	1.16.7,1.19, 1.21  	                            | 	Provides external, versioned ComponentConfig API types for configuring   the kubelet                                                                        |
+| Kubeadm	                                   	| 	Apache-2.0	                                                                | 	1.16.7,1.19, 1.21 	                            | 	"fast paths" for creating Kubernetes clusters                                                                                                               |
+| Kubectl	                                   	| 	Apache-2.0	                                                                | 	1.16.7,1.19, 1.21 	                            | 	Command line tool for Kubernetes                                                                                                                            |
+| kubernetes.core	                           	| 	GPL 3.0	                                                                    | 	2.2.3 	                                        | 	Performs CRUD operations on K8s objects                                                                                                                     |
+| JupyterHub	                                | 	Modified BSD License	                                                    | 	1.1.0	                                        | 	Multi-user hub                                                                                                                                              |
+| Kubernetes Controllers	                    | 	Apache-2.0	                                                                | 	1.16.7,1.19 (1.21 if LeapOS is being used)	    | 	Orchestration tool	                                                                                                                                        |
+| Kfctl	                                     	| 	Apache-2.0	                                                                | 	1.0.2	                                        | 	CLI for deploying and managing Kubeflow                                                                                                                     |
+| Kubeflow	                                  	| 	Apache-2.0	                                                                | 	1	                                            | 	Cloud Native platform for machine learning                                                                                                                  |
+| Helm	                                      	| 	Apache-2.0	                                                                | 	3.5.0	                                        | 	Kubernetes Package Manager                                                                                                                                  |
+| Helm Chart	                                | 	-	                                                                        | 	0.9.0	                                        | 	-                                                                                                                                                           |
+| TensorFlow	                                | 	Apache-2.0	                                                                | 	2.1.0	                                        | 	Machine Learning framework                                                                                                                                  |
+| Horovod	                                   	| 	Apache-2.0	                                                                | 	0.21.1	                                        | 	Distributed deep learning training framework for Tensorflow                                                                                                 |
+| MPI	                                       	| 	Copyright (c) 2018-2019 Triad National Security, LLC. All rights   reserved.	| 	0.3.0	                                        | 	HPC library                                                                                                                                                 |
+| CoreDNS	                                   	| 	Apache-2.0	                                                                | 	1.6.2	                                        | 	DNS server that chains plugins                                                                                                                              |
+| CNI	                                       	| 	Apache-2.0	                                                                | 	0.3.1	                                        | 	Networking for Linux containers                                                                                                                             |
+| AWX	                                       	| 	Apache-2.0	                                                                | 	20.0.0	                                        | 	Web-based User Interface                                                                                                                                    |
+| AWX.AWX	                                   	| 	Apache-2.0	                                                                | 	19.4.0	                                        | 	Galaxy collection to perform awx configuration                                                                                                              |
+| AWXkit	                                    | 	Apache-2.0	                                                                | 	18.0.0	                                        | 	To perform configuration through CLI commands                                                                                                               |
+| CRI-O	                                     	| 	Apache-2.0	                                                                | 	1.21, 1.22.0  									| 	Container Service                                                                                                                                           |
+| Buildah	                                   	| 	Apache-2.0	                                                                | 	1.22.4	                                        | 	Tool to build and run containers                                                                                                                            |
+| PostgreSQL	                                | 	Copyright (c) 1996-2020, PostgreSQL Global Development Group	            | 	10.15	                                        | 	Database Management System                                                                                                                                  |
+| Redis	                                     	| 	BSD-3-Clause License	                                                    | 	6.0.10	                                        | 	In-memory database                                                                                                                                          |
+| NGINX	                                     	| 	BSD-2-Clause License	                                                    | 	1.14	                                        | 	-                                                                                                                                                           |
+| dellemc.os10	                              	| 	GNU-General Public License v3.1	                                            | 	1.1.1	                                        | 	It provides networking hardware abstraction through a common set of APIs                                                                                    |
+| grafana	                                   	| 	Apache-2.0	                                                                | 	8.3.2	                                        | 	Grafana is the open source analytics & monitoring solution for every database.                                                                              |
+| community.grafana	                         	| 	GPL 3.0	                                                                    | 	1.3.0	                                        | 	Technical support for open-source Grafana                                                                                                                   |
+| OMSDK	                                     	| 	Apache-2.0	                                                                | 	1.2.488	                                        | 	Dell EMC OpenManage Python SDK (OMSDK) is a Python library that helps developers and customers automate the lifecycle management of PowerEdge servers       |
+| activemq	                                  	| 	Apache-2.0	                                                                | 	5.10.0	                                        | 	Popular multi-protocol message broker                                                                                                                       |
+| Loki                                      	| Apache License 2.0                                                           	| 2.4.1                                           	| Loki is a log aggregation system designed to store and query logs from all your applications and infrastructure                                              	|
+| Promtail                                  	| Apache License 2.0                                                           	| 2.4.1                                           	| Promtail is an agent which ships the contents of local logs to a private Grafana Loki instance or Grafana Cloud.                                             	|
+| kube-prometheus-stack                     	| Apache License 2.0                                                           	| 25.0.0                                          	| Kube Prometheus Stack is a collection of Kubernetes manifests, Grafana dashboards, and Prometheus rules.                                                     	|
+| mailx                                     	| MIT License                                                                  	| 12.5                                            	| mailx is a Unix utility program for sending and receiving mail.                                                                                              	|
+| postfix                                   	| IBM Public License                                                           	| 3.5.8                                           	| Mail Transfer Agent (MTA) designed to determine routes and send emails                                                                                       	|
+| xorriso                                   	| GPL version 3                                                                	| 1.4.8                                           	| xorriso copies file objects from POSIX compliant filesystems into Rock Ridge enhanced ISO 9660 filesystems.                                                  	|
+| Dell EMC OpenManage Ansible Modules       	| GNU General Public License v3.0                                              	| 5.0.0                                           	| OpenManage Ansible Modules simplify and automate provisioning, deployment, and updates of PowerEdge servers and modular infrastructure.                      	|
+| 389-ds                                    	| GPL version 3                                                                	| 1.4.4                                           	| LDAP server used for authentication and access control.                                                                                                     	|
+| sssd                                      	| GPL version 3                                                                	| 1.16.1                                          	| A set of daemons used to manage access to remote directory services and authentication mechanisms.                                                           	|
+| krb5                                      	| MIT License                                                                  	| 1.19.2                                          	| Authentication protocol providing strong authentication for client/server applications by using secret-key cryptography                                      	|
+| openshift                                 	| Apache 2.0                                                                   	| 0.12.1                                          	| An on-premises platform as a service built around Linux containers orchestrated and managed by Kubernetes                                                    	|
+| golang                                    	| BSD-3-Clause License                                                        	| 1.17                                            	| Go is a statically typed, compiled programming language designed at   Google                                                                                 	|
+| mysql                                     	| GPL 2.0                                                                     	| 8                                               	| MySQL is an open-source relational database management system.                                                                                               	|
+| PostgreSQL                                	| PostgreSQL License                                                          	| 12                                              	| PostgreSQL, also known as Postgres, is a free and open-source relational   database management system emphasizing extensibility and SQL compliance.          	|
+| idrac-telemetry-reference tools           	| Apache-2.0                                                                  	| 0.1                                             	| Reference toolset for PowerEdge telemetry metric collection and   integration with analytics and visualization solutions.                                    	|
+| jansson                                   	| MIT License                                                                 	| 2.14                                            	| C library for encoding, decoding and manipulating JSON data                                                                                                  	|
+| libjwt                                    	| MPL-2.0 License                                                             	| 1.13.0                                          	| JWT C Library                                                                                                                                                	|
+| apparmor                                  	| GNU General Public License                                                  	| 3.0.3                                           	| Controls access based on paths of the program files                                                                                                          	|
+| nsfcac/grafana-plugin                     	| Apache-2.0                                                                  	| 2.1.0                                           	| Grafana plugin providing the HPC visualization panels (parallel coordinates, spiral layout, sankey, power map)                                               	|
+| snoopy                                    	| GPL 2.0                                                                     	| 2.4.15                                          	| Snoopy is a small library that logs all program executions on your   Linux/BSD system                                                                        	|
+
 
 # Known issues  
 * **Issue**: Hosts are not displayed on the AWX UI.  

File diff suppressed because it is too large
+ 39 - 3
docs/Security/ENABLE_SECURITY_LOGIN_NODE.md


File diff suppressed because it is too large
+ 32 - 3
docs/Security/ENABLE_SECURITY_MANAGEMENT_STATION.md


BIN
docs/Telemetry_Visualization/Images/ParallelCoordinates.png


BIN
docs/Telemetry_Visualization/Images/PowerMap.png


BIN
docs/Telemetry_Visualization/Images/SankeyViewer.png


BIN
docs/Telemetry_Visualization/Images/Spirallayout.gif


+ 40 - 0
docs/Telemetry_Visualization/TELEMETRY.md

@@ -0,0 +1,40 @@
+# Viewing Performance Stats on Grafana
+
+Using the [Texas Tech University data visualization lab](https://idatavisualizationlab.github.io/HPCC) plugins, data polled from iDRAC and Slurm can be processed to generate live graphs. These graphs can be accessed on the Grafana UI.
+
+Once `control_plane.yml` is executed and Grafana is set up, use `telemetry.yml` to initiate the graphs. Data polled from Slurm and iDRAC is streamed into internal databases, where it is processed to create the four graphs listed below.
+
+>> __Note__: This feature only works on nodes whose iDRACs have a Datacenter license and run firmware version 4.0 or later.
+
+## All your data at a glance
+
+Using the following graphs, data can be visualized to gather correlational information.
+1. [Parallel Coordinates](https://idatavisualizationlab.github.io/HPCC/#ParallelCoordinates) <br>
+Parallel coordinates are a great way to capture a system's status. They show the ranges of individual metrics such as CPU temperature, fan speed, and memory usage. The graph can be narrowed by time or metric ranges to surface specific correlations, such as CPU temperature vs. fan speed.
+
+![Parallel Coordinates](Images/ParallelCoordinates.png)
+
+<br>
+
+2. [Spiral Layout](https://idatavisualizationlab.github.io/HPCC/#Spiral_Layout) <br>
+Spiral layouts are best for viewing the change in a single metric over time and are often used to check metric trends over a business day. Data visualized in this graph can be sorted by other metrics, such as job IDs, to understand utilization patterns on your devices.
+
+![Spiral Layout](Images/Spirallayout.gif)
+
+<br>
+
+3. [Sankey Viewer](https://idatavisualizationlab.github.io/HPCC/#SankeyViewer) <br>
+Sankey viewers are perfect for viewing utilization by node, user, or job. They provide point-in-time information for quick troubleshooting.
+
+![Sankey Viewer](Images/SankeyViewer.png)
+
+<br>
+
+4. [Power Map](https://idatavisualizationlab.github.io/HPCC/#PowerMap) <br>
+Power maps are an excellent way to see utilization over time for different nodes, users, or jobs. Hovering over the graph lets the user narrow down the information by job, user, or node.
+
+![Power Map](Images/PowerMap.png)
+
+<br>
+
+

+ 2 - 2
docs/Telemetry_Visualization/Visualization.md

@@ -21,11 +21,11 @@ A lot of these metrics are collected using iDRAC telemetry. iDRAC telemetry allo
 
 | Parameter Name          | Default Value     | Information |
 |-------------------------|-------------------|-------------|
-| mount_location          | /opt/omnia| Sets the location all telemetry related files will be stored and both timescale and mysql databases will be mounted.            |
+| mount_location          | /opt/omnia 		  | Sets the location where all telemetry-related files are stored and where both the timescale and mysql databases are mounted.            |
 | idrac_telemetry_support | true              | This variable is used to enable iDRAC telemetry support and visualizations. Accepted Values: true/false            |
 | slurm_telemetry_support | true              | This variable is used to enable slurm telemetry support and visualizations. Slurm Telemetry support can only be activated when idrac_telemetry_support is set to true. Accepted Values: True/False.        |
 | timescaledb_name        | telemetry_metrics | Postgres DB with timescale extension is used for storing iDRAC and slurm telemetry metrics.            |
-| mysqldb_name			  | idrac_telemetrysource_services_db             | MySQL DB is used to store IPs and credentials of iDRACs having datacenter license           |
+| mysqldb_name			  | idrac_telemetrysource_services_db | MySQL DB is used to store IPs and credentials of iDRACs having datacenter license           |
 
 3. Find the IP of the Grafana UI using:
  
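Taken together, the parameters above might be set as follows. This is an illustrative sketch using the defaults from the table; note that this changeset moves `mount_location` out of the telemetry input params and reads it from `control_plane/input_params/base_vars.yml` instead.

```yaml
# telemetry/input_params/telemetry_base_vars.yml -- illustrative values only,
# matching the defaults documented in the table above.
idrac_telemetry_support: true          # enable iDRAC telemetry and visualizations
slurm_telemetry_support: true          # requires idrac_telemetry_support: true
timescaledb_name: telemetry_metrics    # timescale-extended Postgres DB for metrics
mysqldb_name: idrac_telemetrysource_services_db  # stores iDRAC IPs and credentials

# mount_location (default /opt/omnia) now comes from
# control_plane/input_params/base_vars.yml.
```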

+ 1 - 0
docs/control_plane/device_templates/PROVISION_SERVERS.md

@@ -1,4 +1,5 @@
 # Custom ISO provisioning on Dell EMC PowerEdge Servers
 
 ## Update the input parameters
 

+ 1 - 1
examples/PyTorch/pytorch-deploy.yaml

@@ -12,7 +12,7 @@ spec:
         volumeMounts:
         - mountPath: /pyscript
           name: torch-job-volume
-        command: ["bash","-c","python /pyscript/pytorchcpu-example.py"]
+        command: ["bash","-c","python /pyscript/pytorch-example.py"]
       restartPolicy: Never
       volumes:
       - name: torch-job-volume

+ 17 - 11
roles/k8s_start_services/tasks/configure_nginx_prom_grafana.yml

@@ -60,14 +60,20 @@
     state: restarted
     enabled: yes
 
-- name: Create prometheus datasource in grafana
-  community.grafana.grafana_datasource:
-    name: "hpc-prometheus-{{ ansible_default_ipv4.address }}"
-    grafana_url: "http://{{ grafana_svc_ip }}:{{ grafana_svc_port }}"
-    grafana_user: "{{ hostvars['127.0.0.1']['grafana_username'] }}"
-    grafana_password: "{{ hostvars['127.0.0.1']['grafana_password'] }}"
-    ds_type: prometheus
-    ds_url: "http://{{ ansible_default_ipv4.address }}:{{ nginx_listen_port }}"
-    access: direct
-  delegate_to: localhost
-  no_log: true
+- block:
+    - name: Create prometheus datasource in grafana
+      community.grafana.grafana_datasource:
+        name: "hpc-prometheus-{{ ansible_default_ipv4.address }}"
+        grafana_url: "http://{{ grafana_svc_ip }}:{{ grafana_svc_port }}"
+        grafana_user: "{{ hostvars['127.0.0.1']['grafana_username'] }}"
+        grafana_password: "{{ hostvars['127.0.0.1']['grafana_password'] }}"
+        ds_type: prometheus
+        ds_url: "http://{{ ansible_default_ipv4.address }}:{{ nginx_listen_port }}"
+        access: direct
+      delegate_to: localhost
+      no_log: true
+      register: create_k8s_prom_datasource
+  rescue:
+    - name: Create prometheus datasource in grafana failed
+      fail:
+        msg: "Error: {{ create_k8s_prom_datasource.msg }}"

+ 21 - 11
roles/login_node/tasks/install_389ds.yml

@@ -96,8 +96,6 @@
       no_log: true
       when: ldap1_search_key in ldap1_status.stdout
 
-    
-
     - name: Creating 389 directory server instance
       shell: dscreate -v from-file {{ ldap1_config_path }} | tee {{ ldap1_output_path }}
       changed_when: true
@@ -223,10 +221,16 @@
           fail:
             msg: "Error: {{ create_admin_principal.stderr }}"
 
-    - name: Authenticate as admin
-      shell: set -o pipefail && echo {{ kerberos_admin_password }} | kinit {{ ipa_admin_username }}
-      no_log: true
-      changed_when: false
+    - block:
+        - name: Authenticate as admin
+          shell: set -o pipefail && echo {{ kerberos_admin_password }} | kinit {{ ipa_admin_username }}
+          no_log: true
+          changed_when: false
+          register: authenticate_admin
+      rescue:
+        - name: Authenticate as admin failed
+          fail:
+            msg: "Error: {{ authenticate_admin.stderr }}"
 
     - name: Install sssd packages
       zypper:
@@ -269,8 +273,14 @@
         enabled: yes
   when: not ds389_status
 
-- name: Configure password policy in 389-ds
-  command: dsconf -w {{ directory_manager_password }} -D "cn=Directory Manager" ldap://{{ server_hostname_fqdn }} pwpolicy set --pwdlockoutduration {{ hostvars['127.0.0.1']['lockout_duration'] }} --pwdmaxfailures {{ hostvars['127.0.0.1']['max_failures'] }} --pwdresetfailcount {{ hostvars['127.0.0.1']['failure_reset_interval'] }}
-  changed_when: true
-  no_log: true
-  when: hostvars['127.0.0.1']['enable_secure_login_node']
+- block:
+    - name: Configure password policy in 389-ds
+      command: dsconf -w {{ directory_manager_password }} -D "cn=Directory Manager" ldap://{{ server_hostname_fqdn }} pwpolicy set --pwdlockoutduration {{ hostvars['127.0.0.1']['lockout_duration'] }} --pwdmaxfailures {{ hostvars['127.0.0.1']['max_failures'] }} --pwdresetfailcount {{ hostvars['127.0.0.1']['failure_reset_interval'] }}
+      changed_when: true
+      no_log: true
+      when: hostvars['127.0.0.1']['enable_secure_login_node']
+      register: configure_pwpolicy
+  rescue:
+    - name: Configure password policy in 389-ds failed
+      fail:
+        msg: "Error: {{ configure_pwpolicy.stderr }}"

+ 26 - 19
roles/login_node/tasks/install_ipa_client.yml

@@ -55,23 +55,30 @@
   changed_when: false
   failed_when: false
 
-- name: Install ipa client in CentOS 7.9
-  command: >-
-    ipa-client-install --domain '{{ required_domain_name }}' --server '{{ required_server_hostname }}'
-    --principal admin --password '{{ required_ipa_admin_pwd }}' --force-join --enable-dns-updates --force-ntpd -U
-  changed_when: true
-  no_log: true
-  when:
-    - ( ansible_distribution | lower == os_centos )
-    - ( ansible_distribution_version < os_version )
+- block:
+    - name: Install ipa client in CentOS 7.9
+      command: >-
+        ipa-client-install --domain '{{ required_domain_name }}' --server '{{ required_server_hostname }}'
+        --principal admin --password '{{ required_ipa_admin_pwd }}' --force-join --enable-dns-updates --force-ntpd -U
+      changed_when: true
+      no_log: true
+      register: install_ipa_client
+      when:
+        - ( ansible_distribution | lower == os_centos )
+        - ( ansible_distribution_version < os_version )
 
-- name: Install ipa client in Rocky 8.4
-  command: >-
-    ipa-client-install --domain '{{ required_domain_name }}' --server '{{ required_server_hostname }}'
-    --principal admin --password '{{ required_ipa_admin_pwd }}' --force-join --enable-dns-updates --no-ntp -U
-  changed_when: true
-  no_log: true
-  when:
-    - ( ansible_distribution | lower == os_centos ) or
-      ( ansible_distribution | lower == os_rocky )
-    - ( ansible_distribution_version >= os_version )
+    - name: Install ipa client in Rocky 8
+      command: >-
+        ipa-client-install --domain '{{ required_domain_name }}' --server '{{ required_server_hostname }}'
+        --principal admin --password '{{ required_ipa_admin_pwd }}' --force-join --enable-dns-updates --no-ntp -U
+      changed_when: true
+      no_log: true
+      register: install_ipa_client
+      when:
+        - ( ansible_distribution | lower == os_centos ) or
+          ( ansible_distribution | lower == os_rocky )
+        - ( ansible_distribution_version >= os_version )
+  rescue:
+    - name: Install ipa client failed
+      fail:
+        msg: "Error: {{ install_ipa_client.stderr_lines }}"

+ 36 - 23
roles/login_server/tasks/install_ipa_server.yml

@@ -30,31 +30,44 @@
   changed_when: false
   failed_when: false
 
-- name: Install ipa server in CentOS 7.9
-  command: >-
-    ipa-server-install -n '{{ hostvars['127.0.0.1']['domain_name'] }}' --hostname='{{ server_hostname }}' -a '{{ hostvars['127.0.0.1']['kerberos_admin_password'] }}'
-    -p '{{ hostvars['127.0.0.1']['directory_manager_password'] }}' -r '{{ hostvars['127.0.0.1']['realm_name'] }}' --setup-dns --auto-forwarders --auto-reverse -U
-  changed_when: true
-  no_log: true
-  when:
-    - ( ansible_distribution | lower == os_centos )
-    - ( ansible_distribution_version < os_version )
+- block:
+    - name: Install ipa server in CentOS 7.9
+      command: >-
+        ipa-server-install -n '{{ hostvars['127.0.0.1']['domain_name'] }}' --hostname='{{ server_hostname }}' -a '{{ hostvars['127.0.0.1']['kerberos_admin_password'] }}'
+        -p '{{ hostvars['127.0.0.1']['directory_manager_password'] }}' -r '{{ hostvars['127.0.0.1']['realm_name'] }}' --setup-dns --auto-forwarders --auto-reverse -U
+      changed_when: true
+      no_log: true
+      register: install_ipa_server
+      when:
+        - ( ansible_distribution | lower == os_centos )
+        - ( ansible_distribution_version < os_version )
 
-- name: Install ipa server in CentOS > 8 or Rocky 8.4
-  command: >-
-    ipa-server-install -n '{{ hostvars['127.0.0.1']['domain_name'] }}' --hostname='{{ server_hostname }}' -a '{{ hostvars['127.0.0.1']['kerberos_admin_password'] }}'
-    -p '{{ hostvars['127.0.0.1']['directory_manager_password'] }}' -r '{{ hostvars['127.0.0.1']['realm_name'] }}' --setup-dns --no-forwarders --no-reverse --no-ntp -U
-  changed_when: true
-  no_log: true
-  when:
-    - ( ansible_distribution | lower == os_centos ) or
-      ( ansible_distribution | lower == os_rocky )
-    - ( ansible_distribution_version >= os_version )
+    - name: Install ipa server in CentOS > 8 or Rocky 8
+      command: >-
+        ipa-server-install -n '{{ hostvars['127.0.0.1']['domain_name'] }}' --hostname='{{ server_hostname }}' -a '{{ hostvars['127.0.0.1']['kerberos_admin_password'] }}'
+        -p '{{ hostvars['127.0.0.1']['directory_manager_password'] }}' -r '{{ hostvars['127.0.0.1']['realm_name'] }}' --setup-dns --no-forwarders --no-reverse --no-ntp -U
+      changed_when: true
+      no_log: true
+      register: install_ipa_server
+      when:
+        - ( ansible_distribution | lower == os_centos ) or
+          ( ansible_distribution | lower == os_rocky )
+        - ( ansible_distribution_version >= os_version )
+  rescue:
+    - name: Install ipa server failed
+      fail:
+        msg: "Error: {{ install_ipa_server.stderr_lines }}"
 
-- name: Authenticate as admin
-  shell: set -o pipefail && echo $'{{ hostvars['127.0.0.1']['kerberos_admin_password'] }}' | kinit admin
-  no_log: true
-  changed_when: false
+- block:
+    - name: Authenticate as admin
+      shell: set -o pipefail && echo $'{{ hostvars['127.0.0.1']['kerberos_admin_password'] }}' | kinit admin
+      no_log: true
+      changed_when: false
+      register: authenticate_admin
+  rescue:
+    - name: Authenticate as admin failed
+      fail:
+        msg: "Error: {{ authenticate_admin.stderr }}"
 
 - name: Replace the /etc/resolv.conf file
   copy:

+ 2 - 2
roles/slurm_common/tasks/main.yml

@@ -26,7 +26,7 @@
     backup: yes
     mode: "{{ common_mode }}"
 
-- name: Enable powertools repo in Rocky 8.4
+- name: Enable powertools repo in Rocky 8
   command: dnf config-manager --set-enabled powertools -y
   when:
     - ( ansible_distribution | lower == os_centos ) or
@@ -42,7 +42,7 @@
     - ( ansible_distribution | lower == os_centos )
     - ( ansible_distribution_version < os_version )
 
-- name: Add python dependent packages for CentOS version > 8 and Rocky 8.4
+- name: Add python dependent packages for CentOS version > 8 and Rocky 8
   package:
     name: "{{ common_python3_packages }}"
     state: present

+ 35 - 23
roles/slurm_exporter/tasks/configure_grafana.yml

@@ -16,17 +16,23 @@
 - name: Include k8s_start_services variables
   include_vars: ../../k8s_start_services/vars/main.yml
 
-- name: Create prometheus datasource in grafana
-  community.grafana.grafana_datasource:
-    name: "hpc-prometheus-{{ ansible_default_ipv4.address }}"
-    grafana_url: "http://{{ grafana_svc_ip }}:{{ grafana_svc_port }}"
-    grafana_user: "{{ hostvars['127.0.0.1']['grafana_username'] }}"
-    grafana_password: "{{ hostvars['127.0.0.1']['grafana_password'] }}"
-    ds_type: prometheus
-    ds_url: "http://{{ ansible_default_ipv4.address }}:{{ nginx_listen_port }}"
-    access: direct
-  delegate_to: localhost
-  no_log: true
+- block:
+    - name: Create prometheus datasource in grafana
+      community.grafana.grafana_datasource:
+        name: "hpc-prometheus-{{ ansible_default_ipv4.address }}"
+        grafana_url: "http://{{ grafana_svc_ip }}:{{ grafana_svc_port }}"
+        grafana_user: "{{ hostvars['127.0.0.1']['grafana_username'] }}"
+        grafana_password: "{{ hostvars['127.0.0.1']['grafana_password'] }}"
+        ds_type: prometheus
+        ds_url: "http://{{ ansible_default_ipv4.address }}:{{ nginx_listen_port }}"
+        access: direct
+      delegate_to: localhost
+      no_log: true
+      register: create_slurm_prom_datasource
+  rescue:
+    - name: Create prometheus datasource in grafana failed
+      fail:
+        msg: "Error: {{ create_slurm_prom_datasource.msg }}"
 
 - name: Replace data source in slurm dashboard
   replace:
@@ -50,15 +56,21 @@
     replace: '"title": "SLURM - CPUs/GPUs, Nodes, Jobs, Scheduler ({{ ansible_default_ipv4.address }})",'
   delegate_to: localhost
 
-- name: Import Slurm Grafana dashboards
-  community.grafana.grafana_dashboard:
-    grafana_url: "http://{{ grafana_svc_ip }}:{{ grafana_svc_port }}"
-    grafana_user: "{{ hostvars['127.0.0.1']['grafana_username'] }}"
-    grafana_password: "{{ hostvars['127.0.0.1']['grafana_password'] }}"
-    state: present
-    commit_message: Updated by ansible
-    overwrite: yes
-    path: "{{ role_path }}/files/{{ item }}"
-  with_items: "{{ slurm_dashboard_json_files }}"
-  delegate_to: localhost
-  no_log: true
+- block:
+    - name: Import Slurm Grafana dashboards
+      community.grafana.grafana_dashboard:
+        grafana_url: "http://{{ grafana_svc_ip }}:{{ grafana_svc_port }}"
+        grafana_user: "{{ hostvars['127.0.0.1']['grafana_username'] }}"
+        grafana_password: "{{ hostvars['127.0.0.1']['grafana_password'] }}"
+        state: present
+        commit_message: Updated by ansible
+        overwrite: yes
+        path: "{{ role_path }}/files/{{ item }}"
+      with_items: "{{ slurm_dashboard_json_files }}"
+      delegate_to: localhost
+      no_log: true
+      register: import_prom_dashboards
+  rescue:
+    - name: Import Slurm Grafana dashboards failed
+      fail:
+        msg: "Error: {{ import_prom_dashboards.msg }}"

+ 0 - 4
telemetry/input_params/base_vars.yml

@@ -13,10 +13,6 @@
 # limitations under the License.
 ---
 
-# At this location all telemetry related files will be stored and
-# both timescale and mysql databases will be mounted.
-mount_location: /opt/omnia/
-
 # This variable is used to enable iDRAC telemetry support and visualizations
 # Accepted values:  "true" or "false"
 idrac_telemetry_support: true

telemetry/input_params/login_vars.yml → telemetry/input_params/telemetry_login_vars.yml


+ 2 - 2
telemetry/roles/common/tasks/main.yml

@@ -16,10 +16,10 @@
 - name: Check pre-requisites for telemetry and visualizations
   include_tasks: pre-requisites.yml
 
-- name: Validate base_vars.yml
+- name: Validate telemetry_base_vars.yml
   include_tasks: validate_base_vars.yml
 
-- name: Validate login_vars.yml
+- name: Validate telemetry_login_vars.yml
   include_tasks: validate_login_vars.yml
 
 - name: Create k8s secrets for database credentials

+ 25 - 4
telemetry/roles/common/tasks/pre-requisites.yml

@@ -29,22 +29,43 @@
       fail:
         msg: "{{ control_plane_installation_required }}"
 
-- name: Check that the base_vars.yml exists
+- name: Save management station OS
+  set_fact:
+    mgmt_os: "{{ ansible_facts['distribution'] | lower }}"
+
+- block:
+    - name: Fetch SELinux mode
+      command: sestatus
+      register: sestatus_current
+      changed_when: false
+
+    - name: Check SELinux status
+      debug:
+        msg: "{{ selinux_warning }}"
+      when: '"SELinux status:                 disabled" in sestatus_current.stdout_lines'
+
+    - name: Set SELinux to permissive mode
+      command: setenforce 0
+      when: '"SELinux status:                 enabled" in sestatus_current.stdout_lines'
+
+  when: os_supported_leap not in mgmt_os
+
+- name: Check that the telemetry_base_vars.yml exists
   stat:
     path: "{{ base_vars_file }}"
   register: stat_result
 
-- name: Fail if base_vars.yml file doesn't exist
+- name: Fail if telemetry_base_vars.yml file doesn't exist
   fail:
     msg: "{{ fail_msg_base_vars }}"
   when: not stat_result.stat.exists
 
-- name: Check that telemetry/login_vars.yml exists
+- name: Check that telemetry_login_vars.yml exists
   stat:
     path: "{{ login_vars_file }}"
   register: stat_result
 
-- name: Fail if telemetry/login_vars.yml file doesn't exist
+- name: Fail if telemetry_login_vars.yml file doesn't exist
   fail:
     msg: "{{ fail_msg_login_vars }}"
   when: not stat_result.stat.exists

+ 4 - 16
telemetry/roles/common/tasks/validate_base_vars.yml

@@ -13,25 +13,13 @@
 # limitations under the License.
 ---
 
-- name: Include telemetry base_vars.yml file
+- name: Include telemetry_base_vars.yml file
   include_vars: "{{ base_vars_file }}"
   no_log: true
 
-- name: Create mount directory if it doesn't exist
-  block:
-    - name: Checking directory
-      stat:
-        path: "{{ mount_location }}"
-      register: stat_result
-
-    - name: Creating directory
-      file:
-        path: "{{ mount_location }}"
-        state: directory
-        mode: "{{ folder_perm }}"
-        group: root
-        owner: root
-      when: not stat_result.stat.exists
+- name: Include control_plane base_vars.yml file
+  include_vars: "{{ control_plane_base_vars_file }}"
+  no_log: true
 
 - name: Assert idrac telemetry support
   assert:

+ 6 - 6
telemetry/roles/common/tasks/validate_login_vars.yml

@@ -13,25 +13,25 @@
 # limitations under the License.
 ---
 
-# Include telemetry/login_vars.yml
-- name: Check login_vars file is encrypted
+# Include telemetry_login_vars.yml
+- name: Check telemetry_login_vars file is encrypted
   command: cat {{ login_vars_file }}
   changed_when: false
   register: config_content
   no_log: true
 
-- name: Decrpyt login_vars.yml
+- name: Decrypt telemetry_login_vars.yml
   command: >-
     ansible-vault decrypt {{ login_vars_file }}
     --vault-password-file {{ vault_filename }}
   changed_when: false
   when: "'$ANSIBLE_VAULT;' in config_content.stdout"
 
-- name: Include variable file telemetry/login_vars.yml
+- name: Include variable file telemetry_login_vars.yml
   include_vars: "{{ login_vars_file }}"
   no_log: true
 
-- name: Assert usernames and passwords in login_vars.yml
+- name: Assert usernames and passwords in telemetry_login_vars.yml
   block:
   - name: Assert timescaledb user name
     assert:
@@ -84,7 +84,7 @@
     no_log: true
 
   rescue:
-    - name: Validation issue in login/vars.yml
+    - name: Validation issue in telemetry_login_vars.yml
       fail:
         msg: "{{ login_vars_fail_msg }}"
 

+ 9 - 5
telemetry/roles/common/vars/main.yml

@@ -16,12 +16,16 @@
 # Usage: pre-requisites.yml
 control_plane_installation_required: "AWX and grafana installation through control_plane.yml is mandatory for telemetry.yml"
 
-base_vars_file: "{{ role_path }}/../../input_params/base_vars.yml"
-login_vars_file: "{{ role_path }}/../../input_params/login_vars.yml"
+os_supported_leap: "leap"
+selinux_warning: "Warning! SELinux is disabled by the user. No SELinux policy is loaded."
+
+base_vars_file: "{{ role_path }}/../../input_params/telemetry_base_vars.yml"
+login_vars_file: "{{ role_path }}/../../input_params/telemetry_login_vars.yml"
+control_plane_base_vars_file: "{{ role_path }}/../../../control_plane/input_params/base_vars.yml"
 ctrl_plane_login_vars_filename: "{{ role_path }}/../../../control_plane/input_params/login_vars.yml"
 
-fail_msg_base_vars: "telemetry/base_vars.yml file doesn't exist."
-fail_msg_login_vars: "telemetry/login_vars.yml file doesn't exist."
+fail_msg_base_vars: "telemetry_base_vars.yml file doesn't exist."
+fail_msg_login_vars: "telemetry_login_vars.yml file doesn't exist."
 ctrl_plane_fail_msg_login_vars: "control_plane/login_vars.yml file doesn't exist"
 pip_packages:
   - openshift
@@ -53,7 +57,7 @@ mysqldb_fail_msg: "MySQL DB name should have minimum length of 2"
 
 # Usage: validate_login_vars.yml
 vault_filename: "{{ role_path }}/../../input_params/.login_vault_key"
-login_vars_fail_msg: "Usernames and passwords in input_params/login_vars.yml should have minimum length 2"
+login_vars_fail_msg: "Usernames and passwords in input_params/telemetry_login_vars.yml should have minimum length 2"
 
 ctrl_plane_login_vault_filename: "{{ role_path }}/../../../control_plane/input_params/.login_vault_key"
 min_length_grafana: 5

+ 304 - 0
telemetry/roles/grafana_config/files/PowerMap.json

@@ -0,0 +1,304 @@
+{
+  "__inputs": [
+    {
+      "name": "DS_TELEMETRY-POSTGRES",
+      "label": "telemetry-postgres",
+      "description": "",
+      "type": "datasource",
+      "pluginId": "postgres",
+      "pluginName": "PostgreSQL"
+    }
+  ],
+  "__elements": [],
+  "__requires": [
+    {
+      "type": "grafana",
+      "id": "grafana",
+      "name": "Grafana",
+      "version": "8.3.2"
+    },
+    {
+      "type": "panel",
+      "id": "hpcviz-idvl-hpcc-stream-net",
+      "name": "stream-net",
+      "version": "1.0.0"
+    },
+    {
+      "type": "datasource",
+      "id": "postgres",
+      "name": "PostgreSQL",
+      "version": "1.0.0"
+    }
+  ],
+  "editable": false,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 0,
+  "id": null,
+  "iteration": 1647433675249,
+  "links": [],
+  "liveNow": false,
+  "panels": [
+    {
+      "datasource": {
+        "type": "postgres",
+        "uid": "telemetry-postgres"
+      },
+      "gridPos": {
+        "h": 15,
+        "w": 24,
+        "x": 0,
+        "y": 0
+      },
+      "id": 2,
+      "options": {
+        "userEncoded": true
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "table",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "rawQuery": true,
+          "rawSql": "SELECT\n  *\nFROM\n  slurm.jobs\nWHERE\n  user_id IN ($users)\n  AND start_time < ${__to:date:seconds}\n  AND end_time BETWEEN ${__from:date:seconds} and ${__to:date:seconds}",
+          "refId": "jobs",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
+        },
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "time_series",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "rawQuery": true,
+          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",$__interval),\n  avg(CAST(value AS decimal)) AS \"memory_power\",\n  CONCAT('| ',nodes.os_ip_addr) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label= 'PowerMetrics TotalMemoryPower'AND\n  system IN ($servicetag)\nGROUP BY time,name\nORDER BY time",
+          "refId": "memory_power",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
+        },
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "time_series",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "rawQuery": true,
+          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",$__interval),\n  avg(CAST(value AS decimal)) AS \"cpu_power\",\n  CONCAT('| ',nodes.os_ip_addr) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label= 'PowerMetrics TotalCPUPower'AND\n  system IN ($servicetag)\nGROUP BY time,name\nORDER BY time",
+          "refId": "cpu_power",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
+        },
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "time_series",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "rawQuery": true,
+          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",$__interval),\n  avg(CAST(value AS decimal)) AS \"cpu1_temp\",\n  CONCAT('| ',nodes.os_ip_addr) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label= 'CPU1 Temp TemperatureReading'AND\n  system IN ($servicetag)\nGROUP BY time,name\nORDER BY time",
+          "refId": "cpu1_temp",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
+        },
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "time_series",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "rawQuery": true,
+          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",$__interval),\n  avg(CAST(value AS decimal)) AS \"CPU2_Temp\",\n  CONCAT('| ',nodes.os_ip_addr) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label= 'CPU2 Temp TemperatureReading' AND\n  system IN ($servicetag)\nGROUP BY time,name\nORDER BY time",
+          "refId": "cpu2_temp",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
+        },
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "time_series",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "rawQuery": true,
+          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",$__interval),\n  avg(CAST(value AS decimal)) AS \"NIC1_Temp\",\n  CONCAT('| ',nodes.os_ip_addr) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label SIMILAR TO '% NIC 1 Port 1 Partition 1 TemperatureReading' AND\n  system IN ($servicetag)\nGROUP BY time,name\nORDER BY time",
+          "refId": "nic1_temp",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
+        }
+      ],
+      "title": "PowerMap",
+      "type": "hpcviz-idvl-hpcc-stream-net"
+    }
+  ],
+  "refresh": "5s",
+  "schemaVersion": 33,
+  "style": "dark",
+  "tags": [],
+  "templating": {
+    "list": [
+      {
+        "current": {},
+        "datasource": {
+          "type": "postgres",
+          "uid": "telemetry-postgres"
+        },
+        "definition": "SELECT DISTINCT servicetag as __value from nodes\n",
+        "hide": 0,
+        "includeAll": true,
+        "multi": true,
+        "name": "servicetag",
+        "options": [],
+        "query": "SELECT DISTINCT servicetag as __value from nodes\n",
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 1,
+        "type": "query"
+      },
+      {
+        "current": {},
+        "datasource": {
+          "type": "postgres",
+          "uid": "telemetry-postgres"
+        },
+        "definition": "SELECT\n user_id as __value, user_name as __text, nodes as IP, nodes.os_ip_addr, nodes.servicetag\nFROM\n  slurm.jobs\nINNER JOIN nodes\nON nodes.os_ip_addr = ANY(nodes)\nWHERE nodes.servicetag in ($servicetag) AND\n start_time < ${__to:date:seconds} AND end_time > ${__from:date:seconds}",
+        "hide": 0,
+        "includeAll": true,
+        "multi": true,
+        "name": "users",
+        "options": [],
+        "query": "SELECT\n user_id as __value, user_name as __text, nodes as IP, nodes.os_ip_addr, nodes.servicetag\nFROM\n  slurm.jobs\nINNER JOIN nodes\nON nodes.os_ip_addr = ANY(nodes)\nWHERE nodes.servicetag in ($servicetag) AND\n start_time < ${__to:date:seconds} AND end_time > ${__from:date:seconds}",
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 1,
+        "type": "query"
+      }
+    ]
+  },
+  "time": {
+    "from": "now-2d",
+    "to": "now"
+  },
+  "timepicker": {},
+  "timezone": "",
+  "title": "PowerMap",
+  "uid": "ou27WHLnk",
+  "version": 16,
+  "weekStart": ""
+}

+ 482 - 0
telemetry/roles/grafana_config/files/Sankey.json

@@ -0,0 +1,482 @@
+{
+  "__inputs": [
+    {
+      "name": "DS_TELEMETRY-POSTGRES",
+      "label": "telemetry-postgres",
+      "description": "",
+      "type": "datasource",
+      "pluginId": "postgres",
+      "pluginName": "PostgreSQL"
+    }
+  ],
+  "__elements": [],
+  "__requires": [
+    {
+      "type": "grafana",
+      "id": "grafana",
+      "name": "Grafana",
+      "version": "8.3.2"
+    },
+    {
+      "type": "panel",
+      "id": "hpcviz-idvl-hpcc-sankey",
+      "name": "sankey",
+      "version": "1.0.0"
+    },
+    {
+      "type": "datasource",
+      "id": "postgres",
+      "name": "PostgreSQL",
+      "version": "1.0.0"
+    }
+  ],
+  "editable": false,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 0,
+  "id": null,
+  "iteration": 1647435013504,
+  "links": [],
+  "liveNow": false,
+  "panels": [
+    {
+      "datasource": {
+        "type": "postgres",
+        "uid": "telemetry-postgres"
+      },
+      "gridPos": {
+        "h": 19,
+        "w": 24,
+        "x": 0,
+        "y": 0
+      },
+      "id": 14,
+      "options": {
+        "coreLimit": 128,
+        "displayOpt": "compute_num"
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "table",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "panelId": 9,
+          "queryType": "randomWalk",
+          "rawQuery": true,
+          "rawSql": "SELECT $__timeGroupAlias(a.timestamp,$__interval),\na.source_ip, a.jobs, a.cpus \nfrom (\n  SELECT  timestamp, CONCAT(nodes.servicetag) \n  AS nodeid, jobs, cpus, nodes.os_ip_addr AS source_ip\n  FROM slurm.node_jobs a        \n  INNER JOIN nodes               \n  ON nodes.nodeid = a.nodeid\n  WHERE $__timeFilter(a.timestamp)) \nAS a  WHERE a.source_ip IN ($NodeByUser)\nGROUP BY a.timestamp, a.source_ip, a.jobs, a.cpus ORDER BY a.timestamp",
+          "refId": "node core",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
+        },
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "table",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "rawQuery": true,
+          "rawSql": "SELECT\n  *\nFROM\n  slurm.jobs\nWHERE\n  user_id IN ($Users)\n  AND start_time < ${__to:date:seconds}\n  AND end_time BETWEEN ${__from:date:seconds} and ${__to:date:seconds}",
+          "refId": "jobs",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
+        },
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "time_series",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "rawQuery": true,
+          "rawSql": "SELECT $__timeGroupAlias(\"time\",$__interval),\navg(CAST(value AS decimal)) AS \"memory_power\",\n  CONCAT('| ',nodes.os_ip_addr) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n$__timeFilter(\"time\") AND\nlabel= 'PowerMetrics TotalMemoryPower' AND\ntimeseries_metrics.system in ($servicetag)\nGROUP BY time,name\nORDER BY time",
+          "refId": "memory_power",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
+        },
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "time_series",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "rawQuery": true,
+          "rawSql": "SELECT $__timeGroupAlias(\"time\",$__interval),\navg(CAST(value AS decimal)) AS \"power_consumption\",\n  CONCAT('| ',nodes.os_ip_addr) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n$__timeFilter(\"time\") AND\nlabel= 'PowerMetrics SystemPowerConsumption' AND\ntimeseries_metrics.system in ($servicetag)\nGROUP BY time,name\nORDER BY time",
+          "refId": "power_consumption",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
+        },
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "time_series",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "rawQuery": true,
+          "rawSql": "SELECT $__timeGroupAlias(\"time\",$__interval),\navg(CAST(value AS decimal)) AS \"cpu_power\",\n  CONCAT('| ',nodes.os_ip_addr) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n$__timeFilter(\"time\") AND\nlabel= 'PowerMetrics TotalCPUPower' AND\ntimeseries_metrics.system in ($servicetag)\nGROUP BY time,name\nORDER BY time",
+          "refId": "cpu_power",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
+        },
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "time_series",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "rawQuery": true,
+          "rawSql": "SELECT $__timeGroupAlias(\"time\",$__interval),\navg(CAST(value AS decimal)) AS \"cpu_usage\",\n  CONCAT('| ',nodes.os_ip_addr) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n$__timeFilter(\"time\") AND\nlabel= 'SystemUsage CPUUsage' AND\ntimeseries_metrics.system in ($servicetag)\nGROUP BY time,name\nORDER BY time",
+          "refId": "cpu_usage",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
+        },
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "time_series",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "rawQuery": true,
+          "rawSql": "SELECT $__timeGroupAlias(\"time\",$__interval),\navg(CAST(value AS decimal)) AS \"cpu1_temp\",\n  CONCAT('| ',nodes.os_ip_addr) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n$__timeFilter(\"time\") AND\nlabel= 'CPU1 Temp TemperatureReading' AND\ntimeseries_metrics.system in ($servicetag) \nGROUP BY time,name\nORDER BY time",
+          "refId": "cpu1_temp",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
+        },
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "time_series",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "rawQuery": true,
+          "rawSql": "SELECT $__timeGroupAlias(\"time\",$__interval),\navg(CAST(value AS decimal)) AS \"cpu2_temp\",\n  CONCAT('| ',nodes.os_ip_addr) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n$__timeFilter(\"time\") AND\nlabel= 'CPU2 Temp TemperatureReading' AND\ntimeseries_metrics.system in ($servicetag) \nGROUP BY time,name\nORDER BY time",
+          "refId": "cpu2_temp",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
+        },
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "time_series",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "rawQuery": true,
+          "rawSql": "SELECT $__timeGroupAlias(\"time\",$__interval),\navg(CAST(value AS decimal)) AS \"nic_temp\",\nCONCAT('| ',nodes.os_ip_addr) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n$__timeFilter(\"time\") AND\nlabel SIMILAR TO '% NIC 1 Port 1 Partition 1 TemperatureReading' AND\ntimeseries_metrics.system in ($servicetag)\nGROUP BY time,name\nORDER BY time",
+          "refId": "nic_temp",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
+        },
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "time_series",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "rawQuery": true,
+          "rawSql": "SELECT $__timeGroupAlias(\"time\",$__interval),\navg(CAST(value AS decimal)) AS \"fan1_speed\",\n  CONCAT('| ',nodes.os_ip_addr) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n$__timeFilter(\"time\") AND\nlabel= 'Fan 1A RPMReading' AND\ntimeseries_metrics.system in ($servicetag)\nGROUP BY time,name\nORDER BY time",
+          "refId": "fan1_speed",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
+        },
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "time_series",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "rawQuery": true,
+          "rawSql": "SELECT $__timeGroupAlias(\"timestamp\",$__interval),\navg(value) AS \"Memory_usage\", \nCONCAT('| ',nodes.os_ip_addr) AS name\nFROM slurm.memoryusage\nINNER JOIN nodes\nON nodes.nodeid = slurm.memoryusage.nodeid\nWHERE\n$__timeFilter(\"timestamp\") AND\nnodes.servicetag in ($servicetag) \nGROUP BY time,name\nORDER BY time",
+          "refId": "memory_usage",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
+        }
+      ],
+      "title": "Sankey",
+      "transformations": [],
+      "type": "hpcviz-idvl-hpcc-sankey"
+    }
+  ],
+  "refresh": "",
+  "schemaVersion": 33,
+  "style": "dark",
+  "tags": [],
+  "templating": {
+    "list": [
+      {
+        "current": {},
+        "datasource": {
+          "type": "postgres",
+          "uid": "telemetry-postgres"
+        },
+        "definition": "SELECT DISTINCT servicetag as __value from nodes",
+        "hide": 0,
+        "includeAll": true,
+        "multi": true,
+        "name": "servicetag",
+        "options": [],
+        "query": "SELECT DISTINCT servicetag as __value from nodes",
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 0,
+        "type": "query"
+      },
+      {
+        "current": {},
+        "datasource": {
+          "type": "postgres",
+          "uid": "telemetry-postgres"
+        },
+        "definition": "SELECT\n user_id as __value, user_name as __text, nodes as IP, nodes.os_ip_addr, nodes.servicetag\nFROM\n  slurm.jobs\nINNER JOIN nodes\nON nodes.os_ip_addr = ANY(nodes)\nWHERE nodes.servicetag in ($servicetag) AND\n start_time < ${__to:date:seconds} AND end_time > ${__from:date:seconds}",
+        "hide": 0,
+        "includeAll": true,
+        "multi": true,
+        "name": "Users",
+        "options": [],
+        "query": "SELECT\n user_id as __value, user_name as __text, nodes as IP, nodes.os_ip_addr, nodes.servicetag\nFROM\n  slurm.jobs\nINNER JOIN nodes\nON nodes.os_ip_addr = ANY(nodes)\nWHERE nodes.servicetag in ($servicetag) AND\n start_time < ${__to:date:seconds} AND end_time > ${__from:date:seconds}",
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 0,
+        "type": "query"
+      },
+      {
+        "current": {},
+        "datasource": {
+          "type": "postgres",
+          "uid": "telemetry-postgres"
+        },
+        "definition": "SELECT DISTINCT unnest(nodes) as node \nFROM slurm.jobs WHERE \nuser_id IN ($Users)  AND start_time < ${__to:date:seconds} AND end_time > ${__from:date:seconds}",
+        "hide": 0,
+        "includeAll": true,
+        "multi": true,
+        "name": "NodeByUser",
+        "options": [],
+        "query": "SELECT DISTINCT unnest(nodes) as node \nFROM slurm.jobs WHERE \nuser_id IN ($Users)  AND start_time < ${__to:date:seconds} AND end_time > ${__from:date:seconds}",
+        "refresh": 2,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 1,
+        "type": "query"
+      }
+    ]
+  },
+  "time": {
+    "from": "now-2d",
+    "to": "now"
+  },
+  "timepicker": {},
+  "timezone": "",
+  "title": "Sankey",
+  "uid": "27YRlmz7y",
+  "version": 35,
+  "weekStart": ""
+}
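
Note: the Sankey dashboard chains three template variables: servicetag narrows the node pool, Users lists only accounts with jobs on those nodes in the window, and NodeByUser (refresh: 2, i.e. re-queried on every time-range change) resolves the hosts those users actually occupied. The unnest() call flattens each job's nodes array into one row per host IP, which the panel's first target then filters on via a.source_ip IN ($NodeByUser). A sketch with illustrative user IDs and epoch bounds:

-- hypothetical interpolated form of the NodeByUser variable query
SELECT DISTINCT unnest(nodes) AS node   -- one row per host IP in each matching job
FROM slurm.jobs
WHERE user_id IN (1001, 1002)           -- $Users, multi-value
  AND start_time < 1647500000           -- ${__to:date:seconds}
  AND end_time > 1647300000;            -- ${__from:date:seconds}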

+ 14 - 13
telemetry/roles/grafana_config/files/SpiralLayout.json

@@ -34,7 +34,7 @@
   "fiscalYearStartMonth": 0,
   "graphTooltip": 0,
   "id": null,
-  "iteration": 1646754961002,
+  "iteration": 1647434054378,
   "links": [],
   "liveNow": false,
   "panels": [
@@ -97,7 +97,7 @@
           "hide": false,
           "metricColumn": "none",
           "rawQuery": true,
-          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",'10m'),\n  avg(CAST(value AS decimal)) AS \"memory_power\",\n  CONCAT('| ',system) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label= 'PowerMetrics TotalMemoryPower' AND\n  system IN (CAST($servicetag AS text))\nGROUP BY time,name\nORDER BY time",
+          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",$__interval),\n  avg(CAST(value AS decimal)) AS \"memory_power\",\n  CONCAT('| ',nodes.os_ip_addr) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label= 'PowerMetrics TotalMemoryPower'AND\n  system IN ($servicetag)\nGROUP BY time,name\nORDER BY time",
           "refId": "memory_power",
           "select": [
             [
@@ -128,7 +128,7 @@
           "hide": false,
           "metricColumn": "none",
           "rawQuery": true,
-          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",'10m'),\n  avg(CAST(value AS decimal)) AS \"cpu_power\",\n  CONCAT('| ',system) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label= 'PowerMetrics TotalCPUPower'AND\n  system IN ($servicetag)\nGROUP BY time,name\nORDER BY time",
+          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",$__interval),\n  avg(CAST(value AS decimal)) AS \"cpu_power\",\n  CONCAT('| ',nodes.os_ip_addr) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label= 'PowerMetrics TotalCPUPower'AND\n  system IN ($servicetag)\nGROUP BY time,name\nORDER BY time",
           "refId": "cpu_power",
           "select": [
             [
@@ -159,7 +159,7 @@
           "hide": false,
           "metricColumn": "none",
           "rawQuery": true,
-          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",'10m'),\n  avg(CAST(value AS decimal)) AS \"cpu1_temp\",\n  CONCAT('| ',system) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label= 'CPU1 Temp TemperatureReading'AND\n  system IN ($servicetag)\nGROUP BY time,name\nORDER BY time",
+          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",$__interval),\n  avg(CAST(value AS decimal)) AS \"cpu1_temp\",\n  CONCAT('| ',nodes.os_ip_addr) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label= 'CPU1 Temp TemperatureReading'AND\n  system IN ($servicetag)\nGROUP BY time,name\nORDER BY time",
           "refId": "cpu1_temp",
           "select": [
             [
@@ -190,7 +190,7 @@
           "hide": false,
           "metricColumn": "none",
           "rawQuery": true,
-          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",'10m'),\n  avg(CAST(value AS decimal)) AS \"CPU2_Temp\",\n  CONCAT('| ',system) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label= 'CPU2 Temp TemperatureReading' AND\n  system IN ($servicetag)\nGROUP BY time,name\nORDER BY time\n",
+          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",$__interval),\n  avg(CAST(value AS decimal)) AS \"CPU2_Temp\",\n  CONCAT('| ',nodes.os_ip_addr) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label= 'CPU2 Temp TemperatureReading' AND\n  system IN ($servicetag)\nGROUP BY time,name\nORDER BY time",
           "refId": "cpu2_temp",
           "select": [
             [
@@ -221,7 +221,7 @@
           "hide": false,
           "metricColumn": "none",
           "rawQuery": true,
-          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",'10m'),\n  avg(CAST(value AS decimal)) AS \"NIC1_Temp\",\n  CONCAT('| ',system) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label= 'Embedded NIC 1 Port 1 Partition 1 TemperatureReading' AND\n  system IN ($servicetag)\nGROUP BY time,name\nORDER BY time",
+          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",$__interval),\n  avg(CAST(value AS decimal)) AS \"NIC1_Temp\",\n  CONCAT('| ',nodes.os_ip_addr) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label SIMILAR TO '% NIC 1 Port 1 Partition 1 TemperatureReading' AND\n  system IN ($servicetag)\nGROUP BY time,name\nORDER BY time",
           "refId": "nic1_temp",
           "select": [
             [
@@ -243,10 +243,11 @@
           ]
         }
       ],
-      "title": "SpiralLayout",
+      "title": "Spiral-Layout",
       "type": "hpcviz-idvl-hpcc-spiral-layout"
     }
   ],
+  "refresh": "5s",
   "schemaVersion": 33,
   "style": "dark",
   "tags": [],
@@ -260,7 +261,7 @@
         },
         "definition": "SELECT DISTINCT servicetag as __value from nodes\n",
         "hide": 0,
-        "includeAll": false,
+        "includeAll": true,
         "multi": true,
         "name": "servicetag",
         "options": [],
@@ -277,13 +278,13 @@
           "type": "postgres",
           "uid": "telemetry-postgres"
         },
-        "definition": "SELECT\n user_id as __value, user_name as __text\nFROM\n  slurm.jobs\nWHERE\n start_time < ${__to:date:seconds} AND end_time > ${__from:date:seconds}",
+        "definition": "SELECT\n user_id as __value, user_name as __text, nodes as IP, nodes.os_ip_addr, nodes.servicetag\nFROM\n  slurm.jobs\nINNER JOIN nodes\nON nodes.os_ip_addr = ANY(nodes)\nWHERE nodes.servicetag in ($servicetag) AND\n start_time < ${__to:date:seconds} AND end_time > ${__from:date:seconds}",
         "hide": 0,
         "includeAll": true,
         "multi": true,
         "name": "users",
         "options": [],
-        "query": "SELECT\n user_id as __value, user_name as __text\nFROM\n  slurm.jobs\nWHERE\n start_time < ${__to:date:seconds} AND end_time > ${__from:date:seconds}",
+        "query": "SELECT\n user_id as __value, user_name as __text, nodes as IP, nodes.os_ip_addr, nodes.servicetag\nFROM\n  slurm.jobs\nINNER JOIN nodes\nON nodes.os_ip_addr = ANY(nodes)\nWHERE nodes.servicetag in ($servicetag) AND\n start_time < ${__to:date:seconds} AND end_time > ${__from:date:seconds}",
         "refresh": 1,
         "regex": "",
         "skipUrlSync": false,
@@ -293,13 +294,13 @@
     ]
   },
   "time": {
-    "from": "now-6M",
+    "from": "now-2d",
     "to": "now"
   },
   "timepicker": {},
   "timezone": "",
   "title": "SpiralLayout",
-  "uid": "ou27WHLnt",
-  "version": 4,
+  "uid": "ou27WHLni",
+  "version": 7,
   "weekStart": ""
 }
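
Note: these queries move from a fixed '10m' bucket to Grafana's $__interval, which scales with panel width and zoom level, and series are now labelled by nodes.os_ip_addr instead of the raw service tag. As a sketch of the macro expansion (the 5m interval and time window below are illustrative), the memory_power target effectively becomes:

-- $__timeGroupAlias("time",$__interval) expands to an epoch-floor bucket;
-- with an auto-chosen 5m interval (300 s):
SELECT floor(extract(epoch FROM "time")/300)*300 AS "time",
       avg(CAST(value AS decimal)) AS memory_power,
       CONCAT('| ', nodes.os_ip_addr) AS name
FROM timeseries_metrics
INNER JOIN nodes ON nodes.servicetag = timeseries_metrics.system
WHERE "time" BETWEEN '2022-03-14T00:00:00Z' AND '2022-03-16T00:00:00Z'  -- $__timeFilter
  AND label = 'PowerMetrics TotalMemoryPower'
  AND system IN ('ABC1234')  -- $servicetag
GROUP BY 1, name
ORDER BY 1;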

+ 39 - 26
telemetry/roles/grafana_config/files/parallel-coordinate.json

@@ -1,28 +1,40 @@
 {
-  "annotations": {
-    "list": [
-      {
-        "builtIn": 1,
-        "datasource": "-- Grafana --",
-        "enable": true,
-        "hide": true,
-        "iconColor": "rgba(0, 211, 255, 1)",
-        "name": "Annotations & Alerts",
-        "target": {
-          "limit": 100,
-          "matchAny": false,
-          "tags": [],
-          "type": "dashboard"
-        },
-        "type": "dashboard"
-      }
-    ]
-  },
-  "editable": true,
+  "__inputs": [
+    {
+      "name": "DS_TELEMETRY-POSTGRES",
+      "label": "telemetry-postgres",
+      "description": "",
+      "type": "datasource",
+      "pluginId": "postgres",
+      "pluginName": "PostgreSQL"
+    }
+  ],
+  "__elements": [],
+  "__requires": [
+    {
+      "type": "grafana",
+      "id": "grafana",
+      "name": "Grafana",
+      "version": "8.3.2"
+    },
+    {
+      "type": "panel",
+      "id": "hpcviz-idvl-hpcc-parallel-coordinate",
+      "name": "parallel-coordinate",
+      "version": "2.0.0"
+    },
+    {
+      "type": "datasource",
+      "id": "postgres",
+      "name": "PostgreSQL",
+      "version": "1.0.0"
+    }
+  ],
+  "editable": false,
   "fiscalYearStartMonth": 0,
   "graphTooltip": 0,
-  "id": 21,
-  "iteration": 1644431955119,
+  "id": null,
+  "iteration": 1647449602865,
   "links": [],
   "liveNow": false,
   "panels": [
@@ -225,7 +237,7 @@
           "hide": false,
           "metricColumn": "none",
           "rawQuery": true,
-          "rawSql": "SELECT $__timeGroupAlias(\"time\",$__interval),\navg(CAST(value AS decimal)) as \"NIC1_temp\",\nCONCAT('| ',system) AS name\nFROM timeseries_metrics\nWHERE  \n$__timeFilter(time) AND \nlabel= 'Embedded NIC 1 Port 1 Partition 1 TemperatureReading' AND \nsystem IN ($ServiceTag)\nGROUP BY\ntime, name\nORDER BY time;",
+          "rawSql": "SELECT $__timeGroupAlias(\"time\",$__interval),\navg(CAST(value AS decimal)) as \"NIC1_temp\",\nCONCAT('| ',system) AS name\nFROM timeseries_metrics\nWHERE  \n$__timeFilter(time) AND \nlabel SIMILAR TO '% NIC 1 Port 1 Partition 1 TemperatureReading' AND \nsystem IN ($ServiceTag)\nGROUP BY\ntime, name\nORDER BY time;",
           "refId": "nic1_temp",
           "select": [
             [
@@ -289,13 +301,14 @@
   "templating": {
     "list": [
       {
+        "current": {},
         "datasource": {
           "type": "postgres",
           "uid": "telemetry-postgres"
         },
         "definition": "SELECT DISTINCT system as __value from timeseries_metrics",
         "hide": 0,
-        "includeAll": false,
+        "includeAll": true,
         "multi": true,
         "name": "ServiceTag",
         "options": [],
@@ -313,9 +326,9 @@
     "to": "now"
   },
   "timepicker": {},
-  "timezone": "",
+  "timezone": "browser",
   "title": "Parallel-Coordinate",
   "uid": "pArBHUtnk",
-  "version": 6,
+  "version": 10,
   "weekStart": ""
 }
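
Note: the NIC temperature match is loosened from the exact label 'Embedded NIC 1 Port 1 Partition 1 TemperatureReading' to a SIMILAR TO pattern with a leading %, so any NIC-prefix spelling a given server model reports still matches. A self-contained check (both label spellings are hypothetical examples):

SELECT label
FROM (VALUES
        ('Embedded NIC 1 Port 1 Partition 1 TemperatureReading'),
        ('Integrated NIC 1 Port 1 Partition 1 TemperatureReading')
     ) AS t(label)
WHERE label SIMILAR TO '% NIC 1 Port 1 Partition 1 TemperatureReading';
-- returns both rows; the old equality match would return only the first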

+ 2 - 0
telemetry/roles/grafana_config/vars/main.yml

@@ -18,4 +18,6 @@ grafana_namespace: grafana
 telemetry_folder_name: telemetry
 dashboards:
   - parallel-coordinate.json
+  - Sankey.json
   - SpiralLayout.json
+  - PowerMap.json

+ 1 - 1
telemetry/roles/idrac_telemetry/tasks/filter_idrac.yml

@@ -60,7 +60,7 @@
         label: "{{ idrac_firmware_info.firmware_info.Firmware[index].FQDD }}"
       when:
           - '"iDRAC" in idrac_firmware_info.firmware_info.Firmware[index].FQDD'
-          - (idrac_firmware_info.firmware_info.Firmware[index].MajorVersion | int) > 4
+          - (idrac_firmware_info.firmware_info.Firmware[index].MajorVersion | int) > min_firmware_version_reqd
     when: datacenter_license is true
 
   rescue:
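
Note: with the threshold now read from min_firmware_version_reqd (set to 3 in initiate_telemetry.yml below), the filter admits iDRACs whose firmware major version is greater than 3, i.e. 4.x or later, rather than the 5.x-and-up cut-off the hardcoded 4 imposed.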

+ 8 - 8
telemetry/roles/idrac_telemetry/tasks/initiate_telemetry.yml

@@ -15,8 +15,8 @@
 
 # Include and initialize variables
 
-- name: Include telemetry base_vars.yml
-  include_vars: "{{ playbook_dir }}/input_params/base_vars.yml"
+- name: Include telemetry_base_vars.yml
+  include_vars: "{{ playbook_dir }}/input_params/telemetry_base_vars.yml"
   no_log: true
 
 - name: Initiate telemetry process if idrac_support is enabled
@@ -42,9 +42,9 @@
       idrac_telemetry_scripting_folder: iDRAC-Telemetry-Scripting
       idrac_login_input_filename: "{{ playbook_dir }}/../control_plane/input_params/login_vars.yml"
       idrac_login_vault_filename: "{{ playbook_dir }}/../control_plane/input_params/.login_vault_key"
-      login_vars_file: "{{ playbook_dir }}/input_params/login_vars.yml"
+      login_vars_file: "{{ playbook_dir }}/input_params/telemetry_login_vars.yml"
       vault_filename: "{{ playbook_dir }}/input_params/.login_vault_key"
-      min_firmware_version_reqd: 4
+      min_firmware_version_reqd: 3
       datacenter_license: false
       firmware_version: false
       file_perm: '0644'
@@ -167,20 +167,20 @@
 # Get mysqldb credentials
   - name: Get mysqldb credentials
     block:
-    - name: Check login_vars file is encrypted
+    - name: Check telemetry_login_vars file is encrypted
       command: cat {{ login_vars_file }}
       changed_when: false
       register: config_content
       # no_log: true
 
-    - name: Decrpyt login_vars.yml
+    - name: Decrypt telemetry_login_vars.yml
       command: >-
         ansible-vault decrypt {{ login_vars_file }}
         --vault-password-file {{ vault_filename }}
       changed_when: false
       when: "'$ANSIBLE_VAULT;' in config_content.stdout"
 
-    - name: Include variable file telemetry/login_vars.yml
+    - name: Include variable file telemetry_login_vars.yml
       include_vars: "{{ login_vars_file }}"
       no_log: true
 
@@ -190,7 +190,7 @@
         --vault-password-file {{ vault_filename }}
       changed_when: false
 
-    - name: Update login_vars.yml permission
+    - name: Update telemetry_login_vars.yml permission
       file:
         path: "{{ login_vars_file }}"
         mode: "{{ file_perm }}"

+ 0 - 1
telemetry/roles/idrac_telemetry/vars/main.yml

@@ -16,7 +16,6 @@
 # Usage: mysqldb_deployment.yml
 mysql_pv_name: mysqldb-storage
 mysqldb_storage: 1Gi
-mysqldb_name: idrac_telemetrysource_services_db
 mysqldb_pvc_name: mysqldb-storage-claim
 mysqldb_k8s_name: mysqldb
 statefulset_replicas: 1

+ 7 - 0
telemetry/roles/timescaledb/tasks/timescaledb_pod.yml

@@ -43,6 +43,11 @@
                 persistentVolumeClaim:
                   claimName: "{{ pvc_name }}"
 
+              - name: timezone
+                hostPath:
+                  path: "{{ zoneinfo_dir + timezone }}"
+                  type: File
+
             containers:
               - name: timescale
                 image: timescale/timescaledb:latest-pg12
@@ -52,6 +57,8 @@
                     name: telemetry-reference-tools
                   - mountPath: /var/lib/postgresql/
                     name: timescaledb-pvc
+                  - mountPath: /etc/localtime
+                    name: timezone
                 workingDir: /go/src/github.com/telemetry-reference-tools
                 env:
                   - name: node.name
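
Note: the pod now bind-mounts the control plane's zoneinfo file (zoneinfo_dir + timezone, see vars below) onto /etc/localtime, so timestamps inside the TimescaleDB container follow the host clock. One way to verify from inside the database, assuming PostgreSQL derives its default TimeZone from the environment the mount now supplies:

SHOW timezone;                                    -- effective zone in the container
SELECT now() AS local_time,
       now() AT TIME ZONE 'UTC' AS utc_time;      -- compare against UTC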

+ 1 - 0
telemetry/roles/timescaledb/vars/main.yml

@@ -24,3 +24,4 @@ timescaledb_container_port: 5432
 retries: 10
 delay: 10
 reference_tools_stable_commit: "0016fcb"
+zoneinfo_dir: "/usr/share/zoneinfo/"

+ 2 - 2
telemetry/telemetry.yml

@@ -16,7 +16,7 @@
 - name: Telemetry and visualization
   hosts: localhost
   connection: local
-  gather_facts: false
+  gather_facts: true
   roles:
   - common
   - timescaledb
@@ -49,4 +49,4 @@
   gather_facts: false
   roles:
    - slurm_telemetry
-  tags: slurm_telemetry
+  tags: slurm_telemetry
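
Note: gather_facts is switched on for the first play, presumably so roles such as timescaledb can resolve host facts (for example the control plane's timezone consumed by the new /etc/localtime mount); the slurm_telemetry play keeps gather_facts: false.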