瀏覽代碼

Merge branch 'devel' into issue-902

Sujit Jadhav 3 年之前
父節點
當前提交
65ca583281
共有 33 個文件被更改,包括 389 次插入和 307 次刪除
  1. 1 1
      .github/workflows/ansible-lint.yml
  2. 6 1
      control_plane/roles/collect_node_info/files/create_inventory.yml
  3. 9 2
      control_plane/roles/control_plane_ib/files/infiniband_container_configure.yml
  4. 2 2
      control_plane/roles/control_plane_ib/files/k8s_infiniband.yml
  5. 0 2
      control_plane/roles/control_plane_ib/files/start.sh
  6. 1 4
      control_plane/roles/control_plane_ib/tasks/check_prerequisites.yml
  7. 2 13
      control_plane/roles/control_plane_ib/tasks/configure_infiniband_container.yml
  8. 9 1
      control_plane/roles/control_plane_ib/tasks/infiniband_container_image.yml
  9. 1 2
      control_plane/roles/control_plane_ib/tasks/main.yml
  10. 1 2
      control_plane/roles/control_plane_ib/vars/main.yml
  11. 13 2
      control_plane/tools/roles/cluster_preperation/tasks/main.yml
  12. 21 5
      control_plane/tools/roles/cluster_preperation/tasks/passwordless_ssh.yml
  13. 4 2
      control_plane/tools/roles/cluster_preperation/vars/main.yml
  14. 3 0
      docs/FAQ.md
  15. 4 2
      docs/INSTALL_OMNIA.md
  16. 4 1
      docs/INSTALL_OMNIA_CONTROL_PLANE.md
  17. 17 4
      docs/Security/ENABLE_SECURITY_LOGIN_NODE.md
  18. 18 4
      docs/Security/ENABLE_SECURITY_MANAGEMENT_STATION.md
  19. 0 0
      docs/Security/LOGIN_USER_CREATION.md
  20. 47 22
      docs/Telemetry_Visualization/TELEMETRY.md
  21. 44 0
      docs/Telemetry_Visualization/VISUALIZATION.md
  22. 0 70
      docs/Telemetry_Visualization/Visualization.md
  23. 7 2
      docs/control_plane/device_templates/CONFIGURE_INFINIBAND_SWITCHES.md
  24. 1 1
      docs/control_plane/device_templates/PROVISION_SERVERS.md
  25. 1 1
      examples/PyTorch/pytorch-deploy.yaml
  26. 0 54
      examples/k8s-TensorFlow-resnet50-multinode-MPIOperator.yaml
  27. 8 2
      roles/k8s_start_services/tasks/deploy_k8s_services.yml
  28. 1 1
      roles/k8s_start_services/vars/main.yml
  29. 6 0
      roles/login_node/tasks/configure_alerting.yml
  30. 94 57
      roles/powervault_me4_nfs/tasks/nfs_node_configure.yml
  31. 4 1
      roles/powervault_me4_nfs/vars/main.yml
  32. 9 6
      roles/slurm_restd/tasks/main.yml
  33. 51 40
      telemetry/roles/grafana_config/files/SpiralLayout.json

+ 1 - 1
.github/workflows/ansible-lint.yml

@@ -17,7 +17,7 @@ jobs:
 
     - name: ansible-lint 
       # replace "master" with any valid ref
-      uses: ansible/ansible-lint-action@master
+      uses: ansible/ansible-lint-action@c37fb7b4bda2c8cb18f4942716bae9f11b0dc9bc
       with:
         # [required]
         # Paths to ansible files (i.e., playbooks, tasks, handlers etc..)

+ 6 - 1
control_plane/roles/collect_node_info/files/create_inventory.yml

@@ -1,4 +1,4 @@
-# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -170,6 +170,11 @@
       set_fact:
         service_tag: "Not Found"
 
+    - name: Install dmidecode package
+      package:
+        name: dmidecode
+        state: present
+
     - name: Get service tag
       shell: >
           set -o pipefail && \

+ 9 - 2
control_plane/roles/control_plane_ib/files/infiniband_container_configure.yml

@@ -19,5 +19,12 @@
   gather_facts: false
   tasks:
   - name: Start dhcpd services
-    command: dhcpd {{ ib_nic }}
-    changed_when: false
+    service:
+      name: dhcpd
+      state: started
+      enabled: yes
+
+  - name: Run opensm
+    shell: nohup /usr/sbin/opensm -F /etc/rdma/opensm.conf &
+    changed_when: true
+    failed_when: false

+ 2 - 2
control_plane/roles/control_plane_ib/files/k8s_infiniband.yml

@@ -35,8 +35,8 @@ spec:
         - name: infiniband-container
           image: 'localhost/infiniband-container:latest'
           imagePullPolicy: Never
-          command: [ "/start.sh" ]
-          args: [ "/sbin/init" ]
+          command:
+            - /sbin/init
           volumeMounts:
             - name: omnia-storage
               mountPath: /root/omnia

+ 0 - 2
control_plane/roles/control_plane_ib/files/start.sh

@@ -1,5 +1,3 @@
 #!/bin/bash
 
-/usr/libexec/rdma-init-kernel
-
 exec /usr/sbin/opensm -F /etc/rdma/opensm.conf

+ 1 - 4
control_plane/roles/control_plane_ib/tasks/check_prerequisites.yml

@@ -17,9 +17,6 @@
   set_fact:
     infiniband_container_status: false
     infiniband_container_image_status: false
-    infiniband_container_config_status: false
-    infiniband_backup_map_status: false
-    infiniband_new_node_status: false
 
 - name: Inspect the infiniband_container image
   command: "buildah images"
@@ -72,4 +69,4 @@
 - name: Update infiniband_container container status
   set_fact:
     infiniband_container_status: true
-  when: "'infiniband-container' in infiniband_container_result.stdout"
+  when: "'infiniband-container' in infiniband_container_result.stdout"

+ 2 - 13
control_plane/roles/control_plane_ib/tasks/configure_infiniband_container.yml

@@ -1,4 +1,4 @@
-# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,17 +13,6 @@
 # limitations under the License.
 ---
 
-- name: Check infiniband pod status
-  command: kubectl get pods -n network-config
-  changed_when: false
-  register: infiniband_pod_status
-  failed_when: false
-
-- name: Deploy infiniband pod
-  command: "kubectl apply -f {{ role_path }}/files/k8s_infiniband.yml"
-  changed_when: true
-  when: infiniband_container_status and  (not infiniband_container_config_status)
-
 - name: Wait for infiniband pod to come to ready state
   command: kubectl wait --for=condition=ready -n network-config pod -l app=infiniband
   changed_when: false
@@ -35,5 +24,5 @@
 
 - name: Configuring infiniband container
   command: 'kubectl exec --stdin --tty -n network-config {{ infiniband_pod_name.stdout }} \
-    -- ansible-playbook /root/omnia/control_plane/roles/control_plane_ib/files/infiniband_container_configure.yml -e ib_nic="{{ ib_network_nic }}"'
+    -- ansible-playbook /root/omnia/control_plane/roles/control_plane_ib/files/infiniband_container_configure.yml'
   changed_when: false

+ 9 - 1
control_plane/roles/control_plane_ib/tasks/infiniband_container_image.yml

@@ -18,7 +18,8 @@
   changed_when: true
   args:
     chdir: "{{ role_path }}/files/"
-
+  when: not infiniband_container_image_status
+  
 - name: Update image name in k8s_infiniband.yml
   replace:
     path: "{{ ib_kube_config_file }}"
@@ -55,6 +56,13 @@
     regexp: "        - name: opensm-logs\n          hostPath:\n            path:.*"
     replace: "        - name: opensm-logs\n          hostPath:\n            path: {{ subnet_manager.log_directory }} "
 
+- name: Check infiniband pod status
+  command: kubectl get pods -n network-config
+  changed_when: false
+  register: infiniband_pod_status
+  failed_when: false
+
 - name: Deploy infiniband pod
   command: "kubectl apply -f {{ ib_kube_config_file }}"
   changed_when: true
+  when: not infiniband_container_status

+ 1 - 2
control_plane/roles/control_plane_ib/tasks/main.yml

@@ -56,5 +56,4 @@
           when: not infiniband_container_status
   when:
     - device_support_status
-    - ib_switch_support
-    - mgmt_os in os_supported_rocky
+    - ib_switch_support

+ 1 - 2
control_plane/roles/control_plane_ib/vars/main.yml

@@ -25,5 +25,4 @@ mount_path: /root/omnia
 infiniband_message_skipped: "The container is already present"
 infiniband_message_installed: "The container is installed"
 ib_kube_config_file: "{{ role_path }}/files/k8s_infiniband.yml"
-ib_container_name: infiniband-container
-infiniband_message_installed: "The container is installed"
+ib_container_name: infiniband-container

+ 13 - 2
control_plane/tools/roles/cluster_preperation/tasks/main.yml

@@ -1,4 +1,4 @@
-#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -22,6 +22,17 @@
     regexp: '#   StrictHostKeyChecking ask'
     replace: 'StrictHostKeyChecking no'
 
+- name: Disable strict mode checking
+  replace:
+    path: /etc/ssh/ssh_config
+    regexp: '^StrictModes\ '
+    replace: 'StrictModes no'
+
+- name: Restart sshd
+  service:
+    name: sshd
+    state: restarted
+
 - name: Install sshpass
   package:
     name: sshpass
@@ -33,4 +44,4 @@
       include_tasks: passwordless_ssh.yml
       with_items: "{{ ssh_to }}"
       loop_control:
-        pause: 5
+        pause: 5

+ 21 - 5
control_plane/tools/roles/cluster_preperation/tasks/passwordless_ssh.yml

@@ -1,4 +1,4 @@
-#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -25,7 +25,7 @@
   when: "'manager' in group_names"
 
 - name: Verify whether passwordless ssh is set on the remote host
-  shell: sshpass ssh -o "PasswordAuthentication=no" root@{{ current_host }} 'hostname'
+  command: sshpass ssh -o "PasswordAuthentication=no" root@{{ current_host }} 'hostname'
   register: ssh_output
   async: 30
   poll: 5
@@ -45,12 +45,28 @@
   register: verify_rsa_id_file
   when: not ssh_status
 
-- name: Generate ssh key pair
-  command: ssh-keygen -t rsa -b 4096 -f "{{ rsa_id_file }}" -q -N "{{ passphrase }}"
+- name: Create rsa_id file if it doesn't exist
+  ansible.builtin.file:
+    path: "{{ rsa_id_file }}"
+    state: touch
+    mode: "{{ file_mode }}"
   when:
     - not ssh_status
     - not verify_rsa_id_file.stat.exists
 
+- name: Generate ssh key pair
+  shell: ssh-keygen -t rsa -b 4096 -f "{{ rsa_id_file }}" -q -N "{{ passphrase }}" <<<y >/dev/null 2>&1
+  when:
+    - not ssh_status
+
+- name: Creating ssh config file with IdentifyFile value
+  copy:
+    dest: "{{ config_file }}"
+    content: |
+      Host *
+          IdentityFile "{{ rsa_id_file }}"
+    mode: "{{ file_mode }}"
+
 - name: Add the key identity
   shell: |
     eval `ssh-agent -s`
@@ -85,4 +101,4 @@
   rescue:
     - name: Passwordless ssh failed
       fail:
-        msg: "{{ register_error.stderr | regex_replace(hostvars['127.0.0.1']['cobbler_password']) | regex_replace(auth_key_path) }}"
+        msg: "{{ register_error.stderr | regex_replace(hostvars['127.0.0.1']['cobbler_password']) | regex_replace(auth_key_path) }}"

+ 4 - 2
control_plane/tools/roles/cluster_preperation/vars/main.yml

@@ -1,4 +1,4 @@
-#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -16,4 +16,6 @@
 #Usage: passwordless_ssh.yml
 rsa_id_file: "/root/.ssh/id_rsa"
 passphrase: ""
-auth_key_path: "/root/.ssh/authorized_keys"
+auth_key_path: "/root/.ssh/authorized_keys"
+config_file: "/root/.ssh/config"
+file_mode: "0600"

+ 3 - 0
docs/FAQ.md

@@ -44,10 +44,13 @@ To enable routing, update the `primary_dns` and `secondary_dns` in `base_vars` w
 Potential Causes:
 * RAID is configured on the server.
 * Two or more servers in the same network have Cobbler services running.  
+* The target compute node does not have a configured PXE device with an active NIC.
 
 Resolution:  
 1. Create a Non-RAID or virtual disk on the server.  
 2. Check if other systems except for the management node have cobblerd running. If yes, then stop the Cobbler container using the following commands: `docker rm -f cobbler` and `docker image rm -f cobbler`.
+3. On the server, go to `BIOS Setup -> Network Settings -> PXE Device`. For each listed device (typically 4), configure an active NIC under `PXE device settings`
+
 
 ## What to do when Slurm services do not start automatically after the cluster reboots:
 

+ 4 - 2
docs/INSTALL_OMNIA.md

@@ -52,7 +52,9 @@ To install the Omnia control plane and manage workloads on your cluster using th
 >> 2. `pip uninstall ansible-base (if ansible 2.9 is installed)`
 >> 3. `pip uninstall ansible-core (if ansible 2.10  > version is installed)`
 
-	 
+>> __Note:__ If you are using LeapOS, zypper may need to be updated before installing Omnia using the command: `zypper update -y`
+
+
 * On the management station, run the following commands to install Git:
 	```
 	dnf install epel-release -y
@@ -93,7 +95,7 @@ git clone -b release https://github.com/dellhpc/omnia.git
 | domain_name                | omnia.test    | Sets the intended domain name                                                                                                                                                                                                                        |
 | realm_name                 | OMNIA.TEST    | Sets the intended realm name                                                                                                                                                                                                                         |
 | directory_manager_password |               | Password authenticating admin level access to the Directory for system   management tasks. It will be added to the instance of directory server   created for IPA. <br> Required Length: 8 characters. <br> The   password must not contain -,\, '," |
-| kerberos_admin_password         |               | "admin" user password for the IPA server on RockyOS. If LeapOS is in use, it is used as the "kerberos admin" user password for 389-ds <br> This field is not relevant to Management Stations running `LeapOS`                                                                                                                                                                                                                            |
+| kerberos_admin_password    |               | "admin" user password for the IPA server on RockyOS. If LeapOS is in use, it is used as the "kerberos admin" user password for 389-ds <br> This field is not relevant to Management Stations running `LeapOS`                                                                                                                                                                                                                            |
 | enable_secure_login_node   |  **false**, true             | Boolean value deciding whether security features are enabled on the Login Node. For more information, see [here](docs/Security/Enable_Security_LoginNode.md).                                                                                                                                                                                                                           |
 	
 	

File diff suppressed because it is too large
+ 4 - 1
docs/INSTALL_OMNIA_CONTROL_PLANE.md


File diff suppressed because it is too large
+ 17 - 4
docs/Security/ENABLE_SECURITY_LOGIN_NODE.md


File diff suppressed because it is too large
+ 18 - 4
docs/Security/ENABLE_SECURITY_MANAGEMENT_STATION.md


docs/Security/login_user_creation.md → docs/Security/LOGIN_USER_CREATION.md


+ 47 - 22
docs/Telemetry_Visualization/TELEMETRY.md

@@ -1,40 +1,65 @@
-# Viewing Performance Stats on Grafana
+# Setting Up Grafana
 
-Using [Texas Technical University data visualization lab](https://idatavisualizationlab.github.io/HPCC), data polled from iDRAC and Slurm can be processed to generate live graphs. These Graphs can be accessed on the Grafana UI.
+Using Grafana, users can poll multiple devices and create graphs/visualizations of key system metrics such as temperature, System power consumption, Memory Usage, IO Usage, CPU Usage, Total Memory Power, System Output Power, Total Fan Power, Total Storage Power, System Input Power, Total CPU Power, RPM Readings, Total Heat Dissipation, Power to Cool ratio, System Air Flow Efficiency etc.
 
-Once `control_plane.yml` is executed and Grafana is set up, use `telemetry.yml` to initiate the Graphs. Data polled via Slurm and iDRAC is streamed into internal databases. This data is processed to create the 4 graphs listed below.
+A lot of these metrics are collected using iDRAC telemetry. iDRAC telemetry allows you to stream telemetry data from your servers to a centralized log/metrics servers. For more information on iDRAC telemetry, click [here]( https://github.com/dell/iDRAC-Telemetry-Reference-Tools).
 
->> __Note__: This feature only works on Nodes using iDRACs with a datacenter license running a minimum firmware of 4.0.
+## Prerequisites
 
-## All your data in a glance
+1. To set up Grafana, ensure that `control_plane/input_params/login_vars.yml` is updated with the Grafana Username and Password.
+2. All parameters in `telemetry/input_params/telemetry_login_vars.yml` need to be filled in:
 
-Using the following graphs, data can be visualized to gather correlational information.
-1. [Parallel Coordinates](https://idatavisualizationlab.github.io/HPCC/#ParallelCoordinates) <br>
-Parallel coordinates are a great way to capture a systems status. It shows all ranges of individual metrics like CPU temp, Fan Speed, Memory Usage etc. The graph can be narrowed by time or metric ranges to get specific correlations such as CPU Temp vs Fan Speed etc.
+| Parameter Name        | Default Value | Information |
+|-----------------------|---------------|-------------|
+| timescaledb_user      | 		        |  Username used for connecting to timescale db. Minimum Length: 2 characters.          |
+| timescaledb_password  | 		        |  Password used for connecting to timescale db. Minimum Length: 2 characters.           |
+| mysqldb_user          | 		        |  Username used for connecting to mysql db. Minimum Length: 2 characters.         |
+| mysqldb_password      | 		        |  Password used for connecting to mysql db. Minimum Length: 2 characters.            |
+| mysqldb_root_password | 		        |  Password used for connecting to mysql db for root user. Minimum Legth: 2 characters.         |
 
-![Parallel Coordinates](Images/ParallelCoordinates.png)
+3. All parameters in `telemetry/input_params/telemetry_base_vars.yml` need to be filled in:
 
-<br>
+| Parameter Name          | Default Value     | Information |
+|-------------------------|-------------------|-------------|
+| idrac_telemetry_support | true              | This variable is used to enable iDRAC telemetry support and visualizations. Accepted Values: true/false            |
+| slurm_telemetry_support | true              | This variable is used to enable slurm telemetry support and visualizations. Slurm Telemetry support can only be activated when idrac_telemetry_support is set to true. Accepted Values: True/False.        |
+| timescaledb_name        | telemetry_metrics | Postgres DB with timescale extension is used for storing iDRAC and slurm telemetry metrics.            |
+| mysqldb_name			  | idrac_telemetrysource_services_db | MySQL DB is used to store IPs and credentials of iDRACs having datacenter license           |
 
-2. [Spiral Layout](https://idatavisualizationlab.github.io/HPCC/#Spiral_Layout) <br>
-Spiral Layouts are best for viewing the change in a single metric over time. It is often used to check trends in metrics over a business day. Data visualized in this graph can be sorted using other metrics like Job IDs etc to understand the pattern of utilization on your devices.
+3. Find the IP of the Grafana UI using:
+ 
+`kubectl get svc -n grafana`
 
-![Spiral Layout](Images/Spirallayout.gif)
+## Logging into Grafana
 
-<br>
+Use any one of the following browsers to access the Grafana UI (https://< Grafana UI IP >:5000):
+* Chrome/Chromium
+* Firefox
+* Safari
+* Microsoft Edge
 
-3. [Sankey Viewer](https://idatavisualizationlab.github.io/HPCC/#SankeyViewer) <br>
-Sankey Viewers are perfect for viewing utilization by nodes/users/jobs. It provides point in time information for quick troubleshooting.
+>> __Note:__ Always enable JavaScript in your browser. Running Grafana without JavaScript enabled in the browser is not supported.
 
-![Sankey Viewer](Images/SankeyViewer.png)
+## Prerequisites to Enabling Slurm Telemetry
 
-<br>
+* Slurm Telemetry cannot be executed without iDRAC support
+* Omnia control plane should be executed and node_inventory should be created in awx.
+* The slurm manager and compute nodes are fetched at run time from node_inventory.
+* Slurm should be installed on the nodes, if not there is no point in executing slurm telemetry.
+* A minimum of one cluster is required for Slurm Telemetry to work.
+* Once telemetry is running, delete the pods and images on control plane if a cluster change is intended.
 
-4. [Power Map](https://idatavisualizationlab.github.io/HPCC/#PowerMap) <br>
-Power Maps are an excellent way to see utilization along the axis of time for different nodes/users/jobs. Hovering over the graph allows the user to narrow down information by Job/User or Node.
+## Initiating Telemetry
 
-![Power Map](Images/PowerMap.png)
+1. Once `control_plane.yml` and `omnia.yml` are executed, run the following commands from `omnia/telemetry`:
 
-<br>
+`ansible-playbook telemetry.yml`
 
+>> __Note:__ Telemetry Collection is only initiated on iDRACs on AWX that have a datacenter license and are running a firmware version of 4 or higher.
 
+## Adding a New Node to Telemetry
+After initiation, new nodes can be added to telemetry by running the following commands from `omnia/telemetry`:
+		
+` ansible-playbook add_idrac_node.yml `
+
+	

+ 44 - 0
docs/Telemetry_Visualization/VISUALIZATION.md

@@ -0,0 +1,44 @@
+# Viewing Performance Stats on Grafana
+
+Using [Texas Technical University data visualization lab](https://idatavisualizationlab.github.io/HPCC), data polled from iDRAC and Slurm can be processed to generate live graphs. These Graphs can be accessed on the Grafana UI.
+
+Once `control_plane.yml` is executed and Grafana is set up, use `telemetry.yml` to initiate the Graphs. Data polled via Slurm and iDRAC is streamed into internal databases. This data is processed to create the 4 graphs listed below.
+
+>> __Note__: This feature only works on Nodes using iDRACs with a datacenter license running a minimum firmware of 4.0.
+
+## All your data in a glance
+
+Using the following graphs, data can be visualized to gather correlational information. These graphs refresh every 5 seconds (Except SankeyViewer). 
+
+>> __Note:__ The timestamps used for the time metric are based on the `timezone` set in `control_plane/input_params/base_vars.yml`. 
+
+1. [Parallel Coordinates](https://idatavisualizationlab.github.io/HPCC/#ParallelCoordinates) <br>
+Parallel coordinates are a great way to capture a systems status. It shows all ranges of individual metrics like CPU temp, Fan Speed, Memory Usage etc. The graph can be narrowed by time or metric ranges to get specific correlations such as CPU Temp vs Fan Speed etc.
+
+![Parallel Coordinates](Images/ParallelCoordinates.png)
+
+<br>
+
+2. [Spiral Layout](https://idatavisualizationlab.github.io/HPCC/#Spiral_Layout) <br>
+Spiral Layouts are best for viewing the change in a single metric over time. It is often used to check trends in metrics over a business day. Data visualized in this graph can be sorted using other metrics like Job IDs etc to understand the pattern of utilization on your devices.
+
+![Spiral Layout](Images/Spirallayout.gif)
+
+<br>
+
+3. [Sankey Viewer](https://idatavisualizationlab.github.io/HPCC/#SankeyViewer) <br>
+Sankey Viewers are perfect for viewing utilization by nodes/users/jobs. It provides point in time information for quick troubleshooting.
+
+>> __Note:__ Due to the tremendous data processing undertaken by SankeyViewer, the graph does not auto-refresh. It can be manually refreshed by refreshing the internet tab or by clicking the refresh button on the top-right corner of the page.
+
+![Sankey Viewer](Images/SankeyViewer.png)
+
+<br>
+
+4. [Power Map](https://idatavisualizationlab.github.io/HPCC/#PowerMap) <br>
+Power Maps are an excellent way to see utilization along the axis of time for different nodes/users/jobs. Hovering over the graph allows the user to narrow down information by Job/User or Node.
+
+![Power Map](Images/PowerMap.png)
+
+<br>
+

+ 0 - 70
docs/Telemetry_Visualization/Visualization.md

@@ -1,70 +0,0 @@
-# Setting Up Grafana
-
-Using Grafana, users can poll multiple devices and create graphs/visualizations of key system metrics such as temperature, System power consumption, Memory Usage, IO Usage, CPU Usage, Total Memory Power, System Output Power, Total Fan Power, Total Storage Power, System Input Power, Total CPU Power, RPM Readings, Total Heat Dissipation, Power to Cool ratio, System Air Flow Efficiency etc.
-
-A lot of these metrics are collected using iDRAC telemetry. iDRAC telemetry allows you to stream telemetry data from your servers to a centralized log/metrics servers. For more information on iDRAC telemetry, click [here]( https://github.com/dell/iDRAC-Telemetry-Reference-Tools).
-
-## Prerequisites
-
-1. To set up Grafana, ensure that `control_plane/input_params/login_vars.yml` is updated with the Grafana Username and Password.
-2. All parameters in `telemetry/input_params/login_vars.yml` need to be filled in:
-
-| Parameter Name        | Default Value | Information |
-|-----------------------|---------------|-------------|
-| timescaledb_user      | 		        |  Username used for connecting to timescale db. Minimum Length: 2 characters.          |
-| timescaledb_password  | 		        |  Password used for connecting to timescale db. Minimum Length: 2 characters.           |
-| mysqldb_user          | 		        |  Username used for connecting to mysql db. Minimum Length: 2 characters.         |
-| mysqldb_password      | 		        |  Password used for connecting to mysql db. Minimum Length: 2 characters.            |
-| mysqldb_root_password | 		        |  Password used for connecting to mysql db for root user. Minimum Legth: 2 characters.         |
-
-3. All parameters in `telemetry/input_params/base_vars.yml` need to be filled in:
-
-| Parameter Name          | Default Value     | Information |
-|-------------------------|-------------------|-------------|
-| mount_location          | /opt/omnia 		  | Sets the location all telemetry related files will be stored and both timescale and mysql databases will be mounted.            |
-| idrac_telemetry_support | true              | This variable is used to enable iDRAC telemetry support and visualizations. Accepted Values: true/false            |
-| slurm_telemetry_support | true              | This variable is used to enable slurm telemetry support and visualizations. Slurm Telemetry support can only be activated when idrac_telemetry_support is set to true. Accepted Values: True/False.        |
-| timescaledb_name        | telemetry_metrics | Postgres DB with timescale extension is used for storing iDRAC and slurm telemetry metrics.            |
-| mysqldb_name			  | idrac_telemetrysource_services_db | MySQL DB is used to store IPs and credentials of iDRACs having datacenter license           |
-
-3. Find the IP of the Grafana UI using:
- 
-`kubectl get svc -n grafana`
-
-## Logging into Grafana
-
-Use any one of the following browsers to access the Grafana UI (https://< Grafana UI IP >:5000):
-* Chrome/Chromium
-* Firefox
-* Safari
-* Microsoft Edge
-
->> __Note:__ Always enable JavaScript in your browser. Running Grafana without JavaScript enabled in the browser is not supported.
-
-## Prerequisites to Enabling Slurm Telemetry
-
-* Slurm Telemetry cannot be executed without iDRAC support
-* Omnia control plane should be executed and node_inventory should be created in awx.
-* The slurm manager and compute nodes are fetched at run time from node_inventory.
-* Slurm should be installed on the nodes, if not there is no point in executing slurm telemetry.
-
-## Initiating Telemetry
-
-1. Once `control_plane.yml` and `omnia.yml` are executed, run the following commands from `omnia/telemetry`:
-
-`ansible-playbook telemetry.yml`
-
->> __Note:__ Telemetry Collection is only initiated on iDRACs on AWX that have a datacenter license and are running a firmware version of 4 or higher.
-
-## Adding a New Node to Telemetry
-After initiation, new nodes can be added to telemetry by running the following commands from `omnia/telemetry`:
-		
-` ansible-playbook add_idrac_node.yml `
-
-	
-
-
-
-
-
-

+ 7 - 2
docs/control_plane/device_templates/CONFIGURE_INFINIBAND_SWITCHES.md

@@ -3,7 +3,12 @@ In your HPC cluster, connect the Mellanox InfiniBand switches using the Fat-Tree
 
 Omnia uses the server-based Subnet Manager (SM). SM runs as a Kubernetes pod on the management station. To enable the SM, Omnia configures the required parameters in the `opensm.conf` file. Based on the requirement, the parameters can be edited.  
 
->>**NOTE**: Install the InfiniBand hardware drivers by running the command: `yum groupinstall "Infiniband Support" -y`.  
+>>**NOTE**: Install the InfiniBand hardware drivers by running the below command (depending on the OS):  
+>> * `yum groupinstall "Infiniband Support" -y` (For Rocky)
+>> * `zypper install rdma-core librdmacm1 libibmad5 libibumad3` (For LeapOS)
+
+>> **NOTE:** When using LeapOS, infiniband commands such as sminfo, ibhosts etc only run correctly within the infiniband container.
+
 
 ## Setting up a new or factory reset switch
 
@@ -25,7 +30,7 @@ When connecting to a new or factory reset switch, the configuration wizard reque
 * **(Recommended)** If the user enters 'no', they still have to provide the admin and monitor passwords. 
 * If the user enters 'yes', they will also be prompted to enter the hostname for the switch, DHCP details, IPv6 details, etc.
 
->> **Note:** When initializing a factory reset switch, the user needs to ensure DHCP is enabled and an IPv6 address is not assigned. Omnia will assign an IP address to the Infiniband switch using DHCP with all other configurations.
+>> **NOTE:** When initializing a factory reset switch, the user needs to ensure DHCP is enabled and an IPv6 address is not assigned. Omnia will assign an IP address to the Infiniband switch using DHCP with all other configurations.
 
 ## Edit the "input_params" file 
 Under the `control_plane/input_params` directory, edit the following files:  

+ 1 - 1
docs/control_plane/device_templates/PROVISION_SERVERS.md

@@ -1,5 +1,4 @@
 # Custom ISO provisioning on Dell EMC PowerEdge Servers
-# Custom ISO provisioning on Dell EMC PowerEdge Servers
 
 ## Update the input parameters
 
@@ -43,6 +42,7 @@ For the `idrac.yml` file to successfully provision the custom ISO on the PowerEd
 * The Lifecycle Controller Remote Services of PowerEdge Servers is in the 'ready' state.
 * The Redfish services are enabled in the iDRAC settings under **Services**.
 * The PowerEdge Servers have the iDRAC Enterprise or Datacenter license. If the license is not found, servers will be PXE booted and provisioned using Cobbler.  
+* If `provision_method` is set to PXE in `base_vars.yml`, ensure that all PXE devices have a configured, active NIC. To verify/ configure NIC availability: On the server, go to `BIOS Setup -> Network Settings -> PXE Device`. For each listed device (typically 4), configure/ check for an active NIC under `PXE device settings`
 
 The **provision_idrac** file configures and validates the following:
 * Required input parameters and prerequisites.

+ 1 - 1
examples/PyTorch/pytorch-deploy.yaml

@@ -17,4 +17,4 @@ spec:
       volumes:
       - name: torch-job-volume
         hostPath:
-          path: /home/k8s/torch-example
+          path: /home/k8snfs/torch-example

+ 0 - 54
examples/k8s-TensorFlow-resnet50-multinode-MPIOperator.yaml

@@ -1,54 +0,0 @@
-# Run multi-node training benchmark w/ Nvidia NGC Container: nvcr.io/nvidia/tensorflow:19.06-py3
-#
-# 2 C4140 compute nodes
-#  - 8 V100 GPUs
-#  - ConnectX-5
-#  - IPoIB EDR Infiniband in Ethernet mode
-#
-apiVersion: kubeflow.org/v1alpha1
-kind: MPIJob
-metadata:
-  name: tensorflow-benchmarks-resnet50
-spec:
-  replicas: 2
-  template:
-    spec:
-      containers:
-      - image: nvcr.io/nvidia/tensorflow:19.06-py3
-        name: tensorflow-benchmarks
-        volumeMounts:
-          - mountPath: /foo
-            name: work-volume
-          - mountPath: /data
-            name: mem-volume
-        resources:
-          limits:
-            nvidia.com/gpu: 4
-        command:
-          - mpirun
-          - --allow-run-as-root
-          - --map-by
-          - numa
-          - python
-          - /foo/tensorflow/benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py
-          - --batch_size=512
-          - --model=resnet50
-          - --variable_update=horovod
-          - --optimizer=momentum
-          - --nodistortions
-          - --gradient_repacking=8
-          - --weight_decay=1e-4
-          - --use_fp16=true
-          - --data_dir=/data/tensorflow/
-          - --data_name=imagenet
-      volumes:
-      - name: work-volume
-        hostPath:
-          # directory locally mounted on host
-          path: /work
-          type: Directory
-      - name: mem-volume
-        hostPath:
-          # dev shm directory on host
-          path: /dev/shm
-          type: Directory

+ 8 - 2
roles/k8s_start_services/tasks/deploy_k8s_services.yml

@@ -89,11 +89,13 @@
 - name: Helm - add Nvidia k8s-device-plugin (nvdp) repo
   command: "helm repo add nvdp '{{ nvidia_k8s_device_plugin_repo_url }}'"
   changed_when: true
+  when: ansible_local.inventory.nvidia_gpu > 0
   tags: init
 
 - name: Helm - add Nvidia GPU discovery (nvgfd) repo
   command: "helm repo add nvgfd '{{ nvidia_gpu_discovery_repo_url }}'"
   changed_when: true
+  when: ansible_local.inventory.nvidia_gpu > 0
   tags: init
 
 - name: Helm - update repo
@@ -182,13 +184,17 @@
 - name: Install nvidia-device-plugin
   command: "helm install --version='{{ nvidia_device_plugin_version }}' --generate-name --set migStrategy='{{ mig_strategy }}' nvdp/nvidia-device-plugin"
   changed_when: true
-  when: "'nvidia-device-plugin' not in k8s_pods.stdout"
+  when:
+    - "'nvidia-device-plugin' not in k8s_pods.stdout"
+    - ansible_local.inventory.nvidia_gpu > 0
   tags: init
 
 - name: Install GPU Feature Discovery
   command: "helm install --version='{{ gpu_feature_discovery_version }}' --generate-name --set migStrategy='{{ mig_strategy }}' nvgfd/gpu-feature-discovery"
   changed_when: true
-  when: "'node-feature-discovery' not in k8s_pods.stdout"
+  when:
+    - "'node-feature-discovery' not in k8s_pods.stdout"
+    - ansible_local.inventory.nvidia_gpu > 0
   tags: init
 
 - name: Deploy Xilinx Device plugin

+ 1 - 1
roles/k8s_start_services/vars/main.yml

@@ -93,7 +93,7 @@ prometheus_path_on_host: /var/lib/prometheus-2.23.0.linux-amd64/
 
 spark_operator_repo: https://googlecloudplatform.github.io/spark-on-k8s-operator
 
-operator_image_tag: v1beta2-1.2.3-3.1.1
+operator_image_tag: v1beta2-1.3.3-3.1.1
 
 volcano_scheduling_yaml_url: https://raw.githubusercontent.com/volcano-sh/volcano/v1.3.0/installer/volcano-development.yaml
 

+ 6 - 0
roles/login_node/tasks/configure_alerting.yml

@@ -62,6 +62,12 @@
   changed_when: false
   register: ansible_playbook_path
 
+- name: Start cron service
+  systemd:
+    name: cron
+    state: started
+    enabled: yes
+ 
 - name: Schedule cron job for alerting
   cron:
     name: Auth failure alerting

+ 94 - 57
roles/powervault_me4_nfs/tasks/nfs_node_configure.yml

@@ -1,5 +1,4 @@
-
-#  Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -29,59 +28,97 @@
   changed_when: false
   failed_when: false
 
-- name: Install packages
-  package:
-    name: iscsi-initiator-utils
-    state: present
-  tags: install
-
-- name: Install packages
-  package:
-    name: sg3_utils
-    state: present
-  tags: install
-
-- name: Set bootproto value
-  lineinfile:
-    path: "{{ nic_path }}"
-    regexp: '^BOOTPROTO='
-    line: 'BOOTPROTO=none'
-  register: result
-
-- name: Set onboot value
-  lineinfile:
-    path: "{{ nic_path }}"
-    regexp: '^ONBOOT='
-    line: 'ONBOOT=yes'
-
-- name: Add ip address
-  lineinfile:
-    path: "{{ nic_path }}"
-    insertafter: '^ONBOOT=yes'
-    line: 'IPADDR={{ pv_nic_ip }}'
-
-- name: Add netmask address
-  lineinfile:
-    path: "{{ nic_path }}"
-    insertafter: '^IPADDR={{ pv_nic_ip }}'
-    line: NETMASK=255.255.255.0
-
-- name: Down the nic
-  command: ifdown {{ pv_nic }}
-  changed_when: true
-  failed_when: false
-  tags: install
-
-- name: Up the nic
-  command: ifup {{ pv_nic }}
-  changed_when: true
-  tags: install
-
-- name: Show ip
-  shell: >
-    set -o pipefail && \
-    ifconfig {{ pv_nic }} | grep 'inet' |cut -d: -f2 |  awk '{ print $2}'
-  changed_when: false
+- name: NFS node configuration on leap
+  block:
+    - name: Install open-iscsi
+      zypper:
+        name: open-iscsi
+        state: present
+      tags: install
+
+    - name: Install sg3_utils
+      zypper:
+        name: sg3_utils
+        state: present
+      tags: install
+
+    - name: Start the iSCSI deamon
+      systemd:
+        name: iscsid
+        state: started
+
+    - block:
+      - name: Configure nic
+        command: ip a add {{ pv_nic_ip }}/255.255.255.0 dev {{ pv_nic }}
+        register: nic_status
+        changed_when: false
+      rescue:
+      - name: Check if nic configured or not
+        fail:
+          msg: "{{ nic_conf_failed_msg }}"
+        when: nic_status_search not in nic_status.stderr
+
+    - name: Up the nic
+      command: ip link set dev {{ pv_nic }} up
+      changed_when: false
+  when: os_supported_leap in ansible_distribution | lower
+
+- name: NFS node configuration on rocky
+  block:
+    - name: Install packages
+      package:
+        name: iscsi-initiator-utils
+        state: present
+      tags: install
+
+    - name: Install packages
+      package:
+        name: sg3_utils
+        state: present
+      tags: install
+
+    - name: Set bootproto value
+      lineinfile:
+        path: "{{ nic_path }}"
+        regexp: '^BOOTPROTO='
+        line: 'BOOTPROTO=none'
+      register: result
+
+    - name: Set onboot value
+      lineinfile:
+        path: "{{ nic_path }}"
+        regexp: '^ONBOOT='
+        line: 'ONBOOT=yes'
+
+    - name: Add ip address
+      lineinfile:
+        path: "{{ nic_path }}"
+        insertafter: '^ONBOOT=yes'
+        line: 'IPADDR={{ pv_nic_ip }}'
+
+    - name: Add netmask address
+      lineinfile:
+        path: "{{ nic_path }}"
+        insertafter: '^IPADDR={{ pv_nic_ip }}'
+        line: NETMASK=255.255.255.0
+
+    - name: Down the nic
+      command: ifdown {{ pv_nic }}
+      changed_when: true
+      failed_when: false
+      tags: install
+
+    - name: Up the nic
+      command: ifup {{ pv_nic }}
+      changed_when: true
+      tags: install
+
+    - name: Show ip
+      shell: >
+        set -o pipefail && \
+        ifconfig {{ pv_nic }} | grep 'inet' |cut -d: -f2 |  awk '{ print $2}'
+      changed_when: false
+  when: os_supported_leap not in ansible_distribution | lower
 
 - name: Discover nodes
   command: iscsiadm -m discovery -t sendtargets -p {{ item }}
@@ -121,7 +158,7 @@
 - name: IQDN id
   shell: >
     set -o pipefail && \
-    cat /etc/iscsi/initiatorname.iscsi | cut -f2 -d"="
+    grep "InitiatorName=" /etc/iscsi/initiatorname.iscsi | cut -f2 -d"="
   register: iqdn_id
   changed_when: false
   tags: install
@@ -135,4 +172,4 @@
   command: iscsiadm -m node --login {{ pv_name }} -p {{ ip_port }}
   changed_when: true
   failed_when: false
-  tags: install
+  tags: install

+ 4 - 1
roles/powervault_me4_nfs/vars/main.yml

@@ -37,6 +37,9 @@ pv_port_ip: 192.168.25.5
 pv_nfs_file: "{{ role_path }}/../../control_plane/input_params/powervault_me4_vars.yml"
 nic_path: "/etc/sysconfig/network-scripts/ifcfg-{{ powervault_me4_server_nic }}"   
 pv_ports_file: "{{ playbook_dir }}/control_plane/roles/powervault_me4/tasks/ports.yml"
+os_supported_leap: "leap"
+nic_status_search: "File exists"
+nic_conf_failed_msg: "NIC configuration failed"
 
 # Usage: validate_nfs_config.yml
-nic_error: "Failed. The nic given is wrong. Give nic according to the provisioned OS"
+nic_error: "Failed. The nic given is wrong. Give nic according to the provisioned OS"

+ 9 - 6
roles/slurm_restd/tasks/main.yml

@@ -13,11 +13,14 @@
 #  limitations under the License.
 ---
 
-- name: Install jansson
-  include_tasks: install_jansson.yml
+- name: Execute slurm restd on rocky
+  block:
+    - name: Install jansson
+      include_tasks: install_jansson.yml
 
-- name: Install libjwt
-  include_tasks: install_libjwt.yml
+    - name: Install libjwt
+      include_tasks: install_libjwt.yml
 
-- name: Generate Token
-  include_tasks: generate_token.yml
+    - name: Generate Token
+      include_tasks: generate_token.yml
+  when: os_supported_leap not in compute_os

+ 51 - 40
telemetry/roles/grafana_config/files/SpiralLayout.json

@@ -34,7 +34,7 @@
   "fiscalYearStartMonth": 0,
   "graphTooltip": 0,
   "id": null,
-  "iteration": 1647434054378,
+  "iteration": 1647618408881,
   "links": [],
   "liveNow": false,
   "panels": [
@@ -44,7 +44,7 @@
         "uid": "telemetry-postgres"
       },
       "gridPos": {
-        "h": 15,
+        "h": 17,
         "w": 24,
         "x": 0,
         "y": 0
@@ -61,13 +61,12 @@
             "type": "postgres",
             "uid": "telemetry-postgres"
           },
-          "format": "table",
+          "format": "time_series",
           "group": [],
-          "hide": false,
           "metricColumn": "none",
           "rawQuery": true,
-          "rawSql": "SELECT\n  *\nFROM\n  slurm.jobs\nWHERE\n  user_id IN ($users)\n  AND start_time < ${__to:date:seconds}\n  AND end_time BETWEEN ${__from:date:seconds} and ${__to:date:seconds}",
-          "refId": "jobs",
+          "rawSql": "SELECT $__timeGroupAlias(\"time\",$__interval),\navg(CAST(value AS decimal)) as \"memory_power\",\nCONCAT('| ',system) AS name\nFROM timeseries_metrics\nWHERE  \n$__timeFilter(time) AND \nlabel= 'PowerMetrics TotalMemoryPower' AND \nsystem IN ($ServiceTag)\nGROUP BY\ntime, name\nORDER BY time;",
+          "refId": "memory_power",
           "select": [
             [
               {
@@ -97,8 +96,8 @@
           "hide": false,
           "metricColumn": "none",
           "rawQuery": true,
-          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",$__interval),\n  avg(CAST(value AS decimal)) AS \"memory_power\",\n  CONCAT('| ',nodes.os_ip_addr) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label= 'PowerMetrics TotalMemoryPower'AND\n  system IN ($servicetag)\nGROUP BY time,name\nORDER BY time",
-          "refId": "memory_power",
+          "rawSql": "SELECT $__timeGroupAlias(\"time\",$__interval),\navg(CAST(value AS decimal)) as \"power_consumption\",\nCONCAT('| ',system) AS name\nFROM timeseries_metrics\nWHERE  \n$__timeFilter(time) AND \nlabel= 'PowerMetrics SystemPowerConsumption' AND \nsystem IN ($ServiceTag)\nGROUP BY\ntime, name\nORDER BY time;",
+          "refId": "power_consumption",
           "select": [
             [
               {
@@ -128,7 +127,7 @@
           "hide": false,
           "metricColumn": "none",
           "rawQuery": true,
-          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",$__interval),\n  avg(CAST(value AS decimal)) AS \"cpu_power\",\n  CONCAT('| ',nodes.os_ip_addr) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label= 'PowerMetrics TotalCPUPower'AND\n  system IN ($servicetag)\nGROUP BY time,name\nORDER BY time",
+          "rawSql": "SELECT $__timeGroupAlias(\"time\",$__interval),\navg(CAST(value AS decimal)) as \"cpu_power\",\nCONCAT('| ',system) AS name\nFROM timeseries_metrics\nWHERE  \n$__timeFilter(time) AND \nlabel= 'PowerMetrics TotalCPUPower' AND \nsystem IN ($ServiceTag)\nGROUP BY\ntime, name\nORDER BY time;",
           "refId": "cpu_power",
           "select": [
             [
@@ -159,7 +158,7 @@
           "hide": false,
           "metricColumn": "none",
           "rawQuery": true,
-          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",$__interval),\n  avg(CAST(value AS decimal)) AS \"cpu1_temp\",\n  CONCAT('| ',nodes.os_ip_addr) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label= 'CPU1 Temp TemperatureReading'AND\n  system IN ($servicetag)\nGROUP BY time,name\nORDER BY time",
+          "rawSql": "SELECT $__timeGroupAlias(\"time\",$__interval),\navg(CAST(value AS decimal)) as \"cpu1_temp\",\nCONCAT('| ',system) AS name\nFROM timeseries_metrics\nWHERE  \n$__timeFilter(time) AND \nlabel= 'CPU1 Temp TemperatureReading' AND \nsystem IN ($ServiceTag)\nGROUP BY\ntime, name\nORDER BY time;",
           "refId": "cpu1_temp",
           "select": [
             [
@@ -190,7 +189,7 @@
           "hide": false,
           "metricColumn": "none",
           "rawQuery": true,
-          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",$__interval),\n  avg(CAST(value AS decimal)) AS \"CPU2_Temp\",\n  CONCAT('| ',nodes.os_ip_addr) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label= 'CPU2 Temp TemperatureReading' AND\n  system IN ($servicetag)\nGROUP BY time,name\nORDER BY time",
+          "rawSql": "SELECT $__timeGroupAlias(\"time\",$__interval),\navg(CAST(value AS decimal)) as \"cpu2_temp\",\nCONCAT('| ',system) AS name\nFROM timeseries_metrics\nWHERE  \n$__timeFilter(time) AND \nlabel= 'CPU2 Temp TemperatureReading' AND \nsystem IN ($ServiceTag)\nGROUP BY\ntime, name\nORDER BY time;",
           "refId": "cpu2_temp",
           "select": [
             [
@@ -221,7 +220,7 @@
           "hide": false,
           "metricColumn": "none",
           "rawQuery": true,
-          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",$__interval),\n  avg(CAST(value AS decimal)) AS \"NIC1_Temp\",\n  CONCAT('| ',nodes.os_ip_addr) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label SIMILAR TO '% NIC 1 Port 1 Partition 1 TemperatureReading' AND\n  system IN ($servicetag)\nGROUP BY time,name\nORDER BY time",
+          "rawSql": "SELECT $__timeGroupAlias(\"time\",$__interval),\navg(CAST(value AS decimal)) as \"NIC1_temp\",\nCONCAT('| ',system) AS name\nFROM timeseries_metrics\nWHERE  \n$__timeFilter(time) AND \nlabel SIMILAR TO '% NIC 1 Port 1 Partition 1 TemperatureReading' AND \nsystem IN ($ServiceTag)\nGROUP BY\ntime, name\nORDER BY time;",
           "refId": "nic1_temp",
           "select": [
             [
@@ -241,13 +240,44 @@
               "type": "macro"
             }
           ]
+        },
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "time_series",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "rawQuery": true,
+          "rawSql": "SELECT $__timeGroupAlias(\"time\",$__interval),\navg(CAST(value AS decimal)) as \"Fan1_speed\",\nCONCAT('| ',system) AS name\nFROM timeseries_metrics\nWHERE  \n$__timeFilter(time) AND \nlabel= 'Fan 1A RPMReading' AND \nsystem IN ($ServiceTag)\nGROUP BY\ntime, name\nORDER BY time;",
+          "refId": "fan1_speed",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
         }
       ],
       "title": "Spiral-Layout",
       "type": "hpcviz-idvl-hpcc-spiral-layout"
     }
   ],
-  "refresh": "5s",
+  "refresh": "",
   "schemaVersion": 33,
   "style": "dark",
   "tags": [],
@@ -259,32 +289,13 @@
           "type": "postgres",
           "uid": "telemetry-postgres"
         },
-        "definition": "SELECT DISTINCT servicetag as __value from nodes\n",
-        "hide": 0,
-        "includeAll": true,
-        "multi": true,
-        "name": "servicetag",
-        "options": [],
-        "query": "SELECT DISTINCT servicetag as __value from nodes\n",
-        "refresh": 1,
-        "regex": "",
-        "skipUrlSync": false,
-        "sort": 1,
-        "type": "query"
-      },
-      {
-        "current": {},
-        "datasource": {
-          "type": "postgres",
-          "uid": "telemetry-postgres"
-        },
-        "definition": "SELECT\n user_id as __value, user_name as __text, nodes as IP, nodes.os_ip_addr, nodes.servicetag\nFROM\n  slurm.jobs\nINNER JOIN nodes\nON nodes.os_ip_addr = ANY(nodes)\nWHERE nodes.servicetag in ($servicetag) AND\n start_time < ${__to:date:seconds} AND end_time > ${__from:date:seconds}",
+        "definition": "SELECT DISTINCT system as __value from timeseries_metrics",
         "hide": 0,
         "includeAll": true,
         "multi": true,
-        "name": "users",
+        "name": "ServiceTag",
         "options": [],
-        "query": "SELECT\n user_id as __value, user_name as __text, nodes as IP, nodes.os_ip_addr, nodes.servicetag\nFROM\n  slurm.jobs\nINNER JOIN nodes\nON nodes.os_ip_addr = ANY(nodes)\nWHERE nodes.servicetag in ($servicetag) AND\n start_time < ${__to:date:seconds} AND end_time > ${__from:date:seconds}",
+        "query": "SELECT DISTINCT system as __value from timeseries_metrics",
         "refresh": 1,
         "regex": "",
         "skipUrlSync": false,
@@ -294,13 +305,13 @@
     ]
   },
   "time": {
-    "from": "now-2d",
+    "from": "now-7d",
     "to": "now"
   },
   "timepicker": {},
-  "timezone": "",
-  "title": "SpiralLayout",
-  "uid": "ou27WHLni",
-  "version": 7,
+  "timezone": "browser",
+  "title": "Spiral-Layout",
+  "uid": "pArBHUtnk",
+  "version": 4,
   "weekStart": ""
 }