
Merge branch 'devel' into all-contributors/add-abhishek-sa1

Sujit Jadhav 3 years ago
parent
commit
c2a18b69f2
44 changed files with 1225 additions and 388 deletions
  1. control_plane/roles/control_plane_ib/files/Dockerfile (+3 -0)
  2. control_plane/roles/control_plane_ib/files/infiniband_container_configure.yml (+1 -11)
  3. control_plane/roles/control_plane_ib/files/k8s_infiniband.yml (+3 -3)
  4. control_plane/roles/control_plane_ib/files/start.sh (+1 -1)
  5. control_plane/roles/control_plane_security/files/temp_sssd.conf (+68 -0)
  6. control_plane/roles/control_plane_security/tasks/install_389ds.yml (+61 -9)
  7. control_plane/roles/control_plane_security/vars/main.yml (+7 -0)
  8. control_plane/roles/provision_idrac/tasks/check_prerequisites.yml (+8 -8)
  9. control_plane/roles/webui_awx/files/requirements.yml (+2 -0)
  10. docs/FAQ.md (+3 -0)
  11. docs/INSTALL_OMNIA.md (+6 -4)
  12. docs/INSTALL_OMNIA_CONTROL_PLANE.md (+78 -38)
  13. docs/README.md (+10 -7)
  14. docs/Security/ENABLE_SECURITY_LOGIN_NODE.md (+25 -0)
  15. docs/Security/ENABLE_SECURITY_MANAGEMENT_STATION.md (+85 -0)
  16. docs/Security/Enable_Security_LoginNode.md (+0 -27)
  17. docs/Security/Enable_Security_ManagementStation.md (+0 -79)
  18. docs/Telemetry_Visualization/Visualization.md (+8 -7)
  19. docs/control_plane/device_templates/PROVISION_SERVERS.md (+1 -1)
  20. docs/control_plane/input_parameters/PROVISION_SERVERS.md (+1 -1)
  21. roles/cluster_validation/tasks/install_packages.yml (+30 -0)
  22. roles/cluster_validation/tasks/main.yml (+11 -1)
  23. roles/cluster_validation/vars/main.yml (+5 -1)
  24. roles/common/vars/main.yml (+4 -4)
  25. roles/k8s_start_services/tasks/deploy_k8s_services.yml (+3 -3)
  26. roles/login_node/files/temp_sssd.conf (+68 -0)
  27. roles/login_node/tasks/install_389ds.yml (+61 -7)
  28. roles/login_node/vars/main.yml (+7 -0)
  29. roles/slurm_exporter/tasks/install_prometheus.yml (+3 -3)
  30. roles/slurm_manager/tasks/main.yml (+6 -0)
  31. telemetry/roles/grafana_config/files/SpiralLayout.json (+305 -0)
  32. telemetry/roles/grafana_config/tasks/add_dashboards.yml (+12 -1)
  33. telemetry/roles/grafana_config/vars/main.yml (+4 -1)
  34. telemetry/roles/slurm_telemetry/files/Dockerfile (+7 -0)
  35. telemetry/roles/slurm_telemetry/files/init_k8s_pod_local.sh (+6 -0)
  36. telemetry/roles/slurm_telemetry/files/k8s_slurm_telemetry.yml (+33 -0)
  37. telemetry/roles/slurm_telemetry/files/update_service_tags.yml (+0 -90)
  38. telemetry/roles/slurm_telemetry/tasks/deploy_slurm_telemetry.yml (+51 -0)
  39. telemetry/roles/slurm_telemetry/tasks/get_node_inventory.yml (+75 -63)
  40. telemetry/roles/slurm_telemetry/tasks/main.yml (+14 -7)
  41. telemetry/roles/slurm_telemetry/tasks/update_service_tags.yml (+118 -0)
  42. telemetry/roles/slurm_telemetry/tasks/update_timescaledb_details.yml (+6 -6)
  43. telemetry/roles/slurm_telemetry/vars/main.yml (+15 -1)
  44. telemetry/telemetry.yml (+10 -4)

+ 3 - 0
control_plane/roles/control_plane_ib/files/Dockerfile

@@ -1,5 +1,6 @@
 FROM rockylinux/rockylinux:docker_os
 
+RUN dnf -y update && dnf clean all
 RUN dnf install -y epel-release
 RUN dnf install dhcp-server -y \
   ansible \
@@ -20,5 +21,7 @@ COPY opensm.conf /etc/rdma/opensm.conf
 COPY start.sh /
 
 RUN systemctl enable dhcpd
+RUN chmod +x /start.sh
 
+ENTRYPOINT ["/start.sh"]
 CMD ["sbin/init"]

+ 1 - 11
control_plane/roles/control_plane_ib/files/infiniband_container_configure.yml

@@ -1,4 +1,4 @@
-# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -21,13 +21,3 @@
   - name: Start dhcpd services
     command: dhcpd {{ ib_nic }}
     changed_when: false
-
-  - name: Change mode
-    command: chmod 777 /start.sh
-    changed_when: false
-
-  - name: Run shell
-    shell: ./start.sh
-    args:
-      chdir: /
-    changed_when: false

+ 3 - 3
control_plane/roles/control_plane_ib/files/k8s_infiniband.yml

@@ -35,8 +35,8 @@ spec:
         - name: infiniband-container
           image: 'localhost/infiniband-container:latest'
           imagePullPolicy: Never
-          command:
-            - /sbin/init
+          command: [ "/start.sh" ]
+          args: [ "/sbin/init" ]
           volumeMounts:
             - name: omnia-storage
               mountPath: /root/omnia
@@ -52,4 +52,4 @@ spec:
             capabilities:
               add:
                 - NET_RAW
-            privileged: false
+            privileged: true

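In Kubernetes, `command` overrides the image's ENTRYPOINT and `args` overrides its CMD, so this hunk mirrors the Dockerfile change above (`ENTRYPOINT ["/start.sh"]` with `CMD ["sbin/init"]`). A minimal pod sketch of the same override semantics; the pod name is illustrative:

```yaml
# Minimal sketch of command/args override semantics; the pod name is illustrative.
apiVersion: v1
kind: Pod
metadata:
  name: infiniband-demo
spec:
  containers:
    - name: infiniband-container
      image: localhost/infiniband-container:latest
      imagePullPolicy: Never
      command: ["/start.sh"]   # replaces the image ENTRYPOINT
      args: ["/sbin/init"]     # replaces the image CMD; handed to /start.sh as "$1"
```
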
+ 1 - 1
control_plane/roles/control_plane_ib/files/start.sh

@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 
 /usr/libexec/rdma-init-kernel
 

+ 68 - 0
control_plane/roles/control_plane_security/files/temp_sssd.conf

@@ -0,0 +1,68 @@
+#
+# sssd.conf
+# Generated by 389 Directory Server - dsidm
+#
+# For more details see man sssd.conf and man sssd-ldap
+# Be sure to review the content of this file to ensure it is secure and correct
+# in your environment.
+
+[domain/ldap]
+# Uncomment this for more verbose logging.
+# debug_level=3
+
+# Cache hashes of user authentication for offline auth.
+cache_credentials = True
+id_provider = ldap
+auth_provider = ldap
+access_provider = ldap
+chpass_provider = ldap
+ldap_schema = rfc2307
+ldap_search_base = dc=omnia,dc=test
+ldap_uri = ldapi://%2fvar%2frun%2fslapd-ldap1.socket
+# If you have DNS SRV records, you can use the following instead. This derives
+# from your ldap_search_base.
+# ldap_uri = _srv_
+
+ldap_tls_reqcert = demand
+# To use cacert dir, place *.crt files in this path then run:
+# /usr/bin/openssl rehash /etc/openldap/certs
+# or (for older versions of openssl)
+# /usr/bin/c_rehash /etc/openldap/certs
+ldap_tls_cacertdir = /etc/openldap/certs
+
+# Path to the cacert
+# ldap_tls_cacert = /etc/openldap/certs/ca.crt
+
+# Only users who match this filter can login and authorise to this machine. Note
+# that users who do NOT match, will still have their uid/gid resolve, but they
+# can't login.
+ldap_access_filter = (memberOf=cn=server_admins,ou=groups,dc=omnia,dc=test)
+
+enumerate = false
+access_provider = ldap
+ldap_user_member_of = memberof
+ldap_user_gecos = cn
+ldap_user_uuid = nsUniqueId
+ldap_group_uuid = nsUniqueId
+# This is really important as it allows SSSD to respect nsAccountLock
+ldap_account_expire_policy = rhds
+ldap_access_order = filter, expire
+# Setup for ssh keys
+# Inside /etc/ssh/sshd_config add the lines:
+#   AuthorizedKeysCommand /usr/bin/sss_ssh_authorizedkeys
+#   AuthorizedKeysCommandUser nobody
+# You can test with the command: sss_ssh_authorizedkeys <username>
+ldap_user_ssh_public_key = nsSshPublicKey
+
+# This prevents an issue where the Directory is recursively walked on group
+# and user look ups. It makes the client faster and more responsive in almost
+# every scenario.
+ignore_group_members = False
+
+[sssd]
+services = nss, pam, ssh, sudo
+config_file_version = 2
+
+domains = ldap
+[nss]
+homedir_substring = /home

+ 61 - 9
control_plane/roles/control_plane_security/tasks/install_389ds.yml

@@ -39,13 +39,18 @@
   failed_when: false
   no_log: true
   register: ds389_status_authentication
- 
+
+- name: Gathering service facts
+  service_facts:
+
 - name: Modify ds389_status
   set_fact:
     ds389_status: true
   when: 
     - ds389_status_authentication.rc == 0
     - ds389_pwpolicy_search_key in ds389_pwpolicy_check.stdout
+    - "'sssd.service' in ansible_facts.services"
+    - sssd_install_search_key in ansible_facts.services['sssd.service'].state
 
 - block:
     - name: Install 389-ds
@@ -53,17 +58,25 @@
         name: "{{ ds389_packages }}"
         state: present 
 
+    - name: Check ldap instance is running or not
+      command: dsctl {{ ldap_instance }} status
+      changed_when: false
+      failed_when: false
+      register: ldap1_status
+
     - name: Create the ldap1.inf file
       copy:
         src: "{{ role_path }}/files/temp_ldap1.inf"
         dest: "{{ ldap1_config_path }}"
-        mode: "{{ file_mode }}"       
+        mode: "{{ file_mode }}"
+      when: ldap1_search_key in ldap1_status.stdout       
 
     - name: Configure ldap1.inf with domain name
       lineinfile:
         path: "{{ ldap1_config_path }}"
         regexp: "^suffix = dc=omnia,dc=test"
         line: "suffix = dc={{ domain_name.split('.')[0] }},dc={{ domain_name.split('.')[1] }}"
+      when: ldap1_search_key in ldap1_status.stdout
 
     - name: Configure ldap1.inf with directory manager password
       lineinfile:
@@ -71,12 +84,7 @@
         regexp: "^root_password = password"
         line: "root_password = {{ ms_directory_manager_password }}"
       no_log: true
-
-    - name: Check ldap instance is running or not
-      command: dsctl {{ ldap_instance }} status
-      changed_when: false
-      failed_when: false
-      register: ldap1_status
+      when: ldap1_search_key in ldap1_status.stdout
 
     - name: Creating 389 directory server instance
       shell: dscreate -v from-file {{ ldap1_config_path }} | tee {{ ldap1_output_path }}
@@ -132,24 +140,28 @@
         src: "{{ role_path }}/files/temp_krb5.conf"
         dest: "{{ kerberos_conf_path }}"
         mode: "{{ file_mode }}"
+      when: not principal_status.stat.exists      
 
     - name: Configure kerberos conf file with domain name
       replace:
         path: "{{ kerberos_conf_path }}"
         regexp: "omnia.test"
         replace: "{{ domain_name }}"
+      when: not principal_status.stat.exists
 
     - name: Configure kerberos conf file with realm name
       replace:
         path: "{{ kerberos_conf_path }}"
         regexp: "OMNIA.TEST"
         replace: "{{ realm_name }}"
+      when: not principal_status.stat.exists
 
     - name: Configure kerberos conf file with hostname
       replace:
         path: "{{ kerberos_conf_path }}"
         regexp: "hostname"
         replace: "{{ short_hostname.stdout }}"
+      when: not principal_status.stat.exists
 
     - block:
         - name: Setting up the kerberos database
@@ -191,7 +203,47 @@
       shell: set -o pipefail && echo {{ ms_kerberos_admin_password }} | kinit admin
       no_log: true
       changed_when: false
-    
+
+    - name: Install sssd packages
+      zypper:
+        name: "{{ sssd_packages }}"
+        state: present
+      
+    - name: Stop and disable nscd
+      systemd:
+        name: nscd
+        state: stopped
+        enabled: no
+      when: "'nscd.service' in ansible_facts.services"
+
+    - name: Check admin group in 389-ds
+      command: dsidm {{ ldap_instance }} group list
+      register: check_admin_group
+      changed_when: false
+
+    - name: Create admin group in 389-ds
+      shell: set -o pipefail && echo {{ admin_group_name }} |  dsidm {{ ldap_instance }} group create
+      changed_when: true
+      when: admin_group_name not in check_admin_group.stdout
+
+    - name: Create the sssd.conf file
+      copy:
+        src: "{{ role_path }}/files/temp_sssd.conf"
+        dest: "{{ sssd_config_path }}"
+        mode: "{{ sssd_file_mode }}"       
+
+    - name: Configure sssd.conf with domain name
+      replace:
+        path: "{{ sssd_config_path }}"
+        regexp: "dc=omnia,dc=test"
+        replace: "dc={{ domain_name.split('.')[0] }},dc={{ domain_name.split('.')[1] }}"
+
+    - name: Start sssd service
+      systemd:
+        name: sssd
+        state: started
+        enabled: yes
+
     - name: Configure password policy in 389-ds
       command: dsconf -w {{ ms_directory_manager_password }} -D "cn=Directory Manager" ldap://{{ server_hostname_ms }} pwpolicy set --pwdlockoutduration {{ lockout_duration }} --pwdmaxfailures {{ max_failures }} --pwdresetfailcount {{ failure_reset_interval }}
       no_log: true

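After the sssd.conf copy and replace steps above, a syntax check can catch a malformed substitution before the service is started. A hedged sketch, assuming the `sssctl` tool is available alongside the sssd packages:

```yaml
# Hedged sketch, assuming sssctl is installed with the sssd packages.
- name: Validate the generated sssd.conf
  command: sssctl config-check
  changed_when: false
```
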
+ 7 - 0
control_plane/roles/control_plane_security/vars/main.yml

@@ -86,6 +86,7 @@ ds389_packages:
   - python3-argcomplete
 ldap1_search_key: "No such instance"
 ds389_pwpolicy_search_key: "passwordlockoutduration: {{ lockout_duration }}"
+sssd_install_search_key: running
 ldap1_config_path: "{{ role_path }}/files/ldap1.inf"
 ldap_instance: ldap1
 ldap1_output_path: /var/log/ldap1_output.txt
@@ -100,6 +101,12 @@ kerberos_packages:
 kerberos_principal_path: /var/lib/kerberos/krb5kdc/principal
 kerberos_conf_path: /etc/krb5.conf
 kerberos_env_path: /usr/lib/mit/sbin/
+sssd_packages:
+  - sssd
+  - sssd-ldap
+admin_group_name: server_admins
+sssd_file_mode: 0600
+sssd_config_path: /etc/sssd/sssd.conf
 
 # Usage: restrict_nonessentials.yml
 service_status: ['enabled','alias','static','indirect','enabled-runtime','active','inactive']

+ 8 - 8
control_plane/roles/provision_idrac/tasks/check_prerequisites.yml

@@ -1,4 +1,4 @@
-# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -174,9 +174,9 @@
             idrac_license_name: "{{ idrac_info.system_info.License[my_idx1].LicenseDescription }}"
           with_items: "{{ idrac_info.system_info.License }}"
           when:
-            - '"iDRAC" in idrac_info.system_info.License[my_idx1].LicenseDescription'
-            - '"Enterprise" in idrac_info.system_info.License[my_idx1].LicenseDescription'
-            - '"License" in idrac_info.system_info.License[my_idx1].LicenseDescription'
+            - '"idrac" in idrac_info.system_info.License[my_idx1].LicenseDescription | lower'
+            - '"enterprise" in idrac_info.system_info.License[my_idx1].LicenseDescription | lower'
+            - '"license" in idrac_info.system_info.License[my_idx1].LicenseDescription | lower'
             - '"Healthy" in idrac_info.system_info.License[my_idx1].PrimaryStatus'
           loop_control:
             index_var: my_idx1
@@ -184,12 +184,12 @@
         - name: Set datacenter license status
           set_fact:
             datacenter_license: true
-            idrac_license_name: "{{ idrac_info.system_info.License[my_idx1].LicenseDescription }}"
+            idrac_license_name: "{{ idrac_info.system_info.License[my_idx2].LicenseDescription }}"
           with_items: "{{ idrac_info.system_info.License }}"
           when:
-            - '"iDRAC" in idrac_info.system_info.License[my_idx2].LicenseDescription'
-            - '"Datacenter" in idrac_info.system_info.License[my_idx2].LicenseDescription'
-            - '"License" in idrac_info.system_info.License[my_idx2].LicenseDescription'
+            - '"idrac" in idrac_info.system_info.License[my_idx2].LicenseDescription | lower'
+            - '"data" in idrac_info.system_info.License[my_idx2].LicenseDescription | lower'
+            - '"license" in idrac_info.system_info.License[my_idx2].LicenseDescription | lower'
             - '"Healthy" in idrac_info.system_info.License[my_idx2].PrimaryStatus'
           loop_control:
             index_var: my_idx2

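The rewritten conditions lowercase the license description before each substring check, making the match case-insensitive. An equivalent, more compact form (a sketch for comparison, not what the commit uses) is Jinja2's `search` test with an inline `(?i)` flag:

```yaml
# Sketch only: equivalent case-insensitive matching with the `search` test.
when:
  - idrac_info.system_info.License[my_idx1].LicenseDescription is search('(?i)idrac')
  - idrac_info.system_info.License[my_idx1].LicenseDescription is search('(?i)enterprise')
  - idrac_info.system_info.License[my_idx1].LicenseDescription is search('(?i)license')
  - '"Healthy" in idrac_info.system_info.License[my_idx1].PrimaryStatus'
```
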
+ 2 - 0
control_plane/roles/webui_awx/files/requirements.yml

@@ -10,3 +10,5 @@ collections:
     version: 2.2.3
   - name: community.grafana
     version: 1.3.0
+  - name: ansible.utils
+    version: 2.5.2

+ 3 - 0
docs/FAQ.md

@@ -9,6 +9,9 @@ Potential Causes:
 Resolution:  
 Wait for AWX UI to be accessible at http://\<management-station-IP>:8081, and then run the `control_plane.yml` file again, where __management-station-IP__ is the IP address of the management node.
 
+## Why does Omnia Control Plane fail at Task: `control_plane_common: Assert Value of idrac_support if mngmt_network container needed`?
+When `device_config_support` is set to true, `idrac_support` also needs to be set to true. 
+
 ## What to do if the nodes in a Kubernetes cluster reboot:
 Wait for 15 minutes after the Kubernetes cluster reboots. Next, verify the status of the cluster using the following commands:
 * `kubectl get nodes` on the manager node to get the real-time k8s cluster status.  

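The new FAQ entry encodes a simple dependency between two flags. A hedged sketch of the corresponding settings; placing them in `control_plane/input_params/base_vars.yml` is an assumption based on the docs this commit touches:

```yaml
# Illustrative values; assumed to live in control_plane/input_params/base_vars.yml.
device_config_support: true
idrac_support: true   # must also be true whenever device_config_support is true
```
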
+ 6 - 4
docs/INSTALL_OMNIA.md

@@ -194,11 +194,13 @@ The following __Slurm__ roles are provided by Omnia when __omnia.yml__ file is r
 To enable the login node, the *login_node_required* variable must be set to "true" in the *omnia_config.yml* file.  
 - **login_common** role: The firewall ports are opened on the manager and login nodes.  
 - **login_server** role: FreeIPA server is installed and configured on the manager node to provide authentication using LDAP and Kerberos principles.  
-- **login_node** role: FreeIPA client is installed and configured on the login node and is integrated with the server running on the manager node.  
+- **login_node** role: For Rocky, FreeIPA client is installed and configured on the login node and is integrated with the server running on the manager node. For LeapOS, 389ds will be installed instead.
 
-**NOTE**: To skip the installation of:
-* The login node-In the `omnia_config.yml` file, set the *login_node_required* variable to "false".  
-* The FreeIPA server and client: Use `--skip-tags freeipa` while executing the *omnia.yml* file. 
+>>__Note:__ If LeapOS is being deployed, login_common and login_server roles will be skipped.  
+
+>> **NOTE**: To skip the installation of:
+>> * The login node-In the `omnia_config.yml` file, set the *login_node_required* variable to "false".  
+>> * The FreeIPA server and client: Use `--skip-tags freeipa` while executing the *omnia.yml* file. 
 
 ### Installing JupyterHub and Kubeflow playbooks  
 If you want to install JupyterHub and Kubeflow playbooks, you have to first install the JupyterHub playbook and then install the Kubeflow playbook.

+ 78 - 38
docs/INSTALL_OMNIA_CONTROL_PLANE.md

Diff is too large to display


+ 10 - 7
docs/README.md

@@ -54,7 +54,7 @@ Requirements  |   Version
 OS pre-installed on the management station  |  Rocky 8.x/ Leap 15.x
 OS deployed by Omnia on bare-metal Dell EMC PowerEdge Servers | Rocky 8.x Minimal Edition/ Leap 15.x
 Cobbler  |  3.2.2
-Ansible AWX  |  19.4.0
+Ansible AWX  |  20.0.0
 Slurm Workload Manager  |  20.11.2
 Kubernetes on the management station  |  1.21.0
 Kubernetes on the manager and compute nodes	|	1.16.7 or 1.19.3
@@ -92,9 +92,9 @@ OpenSM	|	GNU General Public License 2	|	3.3.24	|	-
 NVIDIA container runtime	|	Apache-2.0	|	3.4.2	|	Nvidia container runtime library
 Python PIP	|	MIT License	|	21.1.2	|	Python Package
 Python3	|	-	|	3.6.8 (3.6.15 if LeapOS is being used)	|	-
-Kubelet	|	Apache-2.0	|	1.16.7,1.19, 1.21 (LeapOS only supports 1.21) 	|	Provides external, versioned ComponentConfig API types for configuring the kubelet
-Kubeadm	|	Apache-2.0	|	1.16.7,1.19, 1.21 (LeapOS only supports 1.21)	|	"fast paths" for creating Kubernetes clusters
-Kubectl	|	Apache-2.0	|	1.16.7,1.19, 1.21 (LeapOS only supports 1.21)	|	Command line tool for Kubernetes
+Kubelet	|	Apache-2.0	|	1.16.7,1.19, 1.21  	|	Provides external, versioned ComponentConfig API types for configuring the kubelet
+Kubeadm	|	Apache-2.0	|	1.16.7,1.19, 1.21 	|	"fast paths" for creating Kubernetes clusters
+Kubectl	|	Apache-2.0	|	1.16.7,1.19, 1.21 	|	Command line tool for Kubernetes
 JupyterHub	|	Modified BSD License	|	1.1.0	|	Multi-user hub
 kubernetes Controllers	|	Apache-2.0	|	1.16.7,1.19 (1.21 if LeapOS is being used)	|	Orchestration tool	
 Kfctl	|	Apache-2.0	|	1.0.2	|	CLI for deploying and managing Kubeflow
@@ -106,10 +106,10 @@ Horovod	|	Apache-2.0	|	0.21.1	|	Distributed deep learning training framework for
 MPI	|	Copyright (c) 2018-2019 Triad National Security,LLC. All rights reserved.	|	0.3.0	|	HPC library
 CoreDNS	|	Apache-2.0	|	1.6.2	|	DNS server that chains plugins
 CNI	|	Apache-2.0	|	0.3.1	|	Networking for Linux containers
-AWX	|	Apache-2.0	|	19.4.0	|	Web-based User Interface
+AWX	|	Apache-2.0	|	20.0.0	|	Web-based User Interface
 AWX.AWX	|	Apache-2.0	|	19.4.0	|	Galaxy collection to perform awx configuration
-AWXkit	|	Apache-2.0	|	to be updated	|	To perform configuration through CLI commands
-Cri-o	|	Apache-2.0	|	1.21	|	Container Service
+AWXkit	|	Apache-2.0	|	18.0.0	|	To perform configuration through CLI commands
+Cri-o	|	Apache-2.0	|	1.21, 1.17.3  (LeapOS only supports  1.17.3) |	Container Service
 Buildah	|	Apache-2.0	|	1.22.4	|	Tool to build and run containers
 PostgreSQL	|	Copyright (c) 1996-2020, PostgreSQL Global Development Group	|	10.15	|	Database Management System
 Redis	|	BSD-3-Clause License	|	6.0.10	|	In-memory database
@@ -123,6 +123,9 @@ OMSDK	|	Apache-2.0	|	1.2.488	|	Dell EMC OpenManage Python SDK (OMSDK) is a pytho
 | postfix                               | IBM Public License               | 3.5.8  | Mail Transfer Agent (MTA) designed to determine routes and   send emails                                                                       |
 | xorriso                               | GPL version 3                    | 1.4.8  | xorriso copies file objects from POSIX compliant filesystems   into Rock Ridge enhanced ISO 9660 filesystems.                                  |
 | Dell EMC   OpenManage Ansible Modules | GNU- General Public License v3.0 | 5.0.0  | OpenManage Ansible Modules simplifies and automates   provisioning, deployment, and updates of PowerEdge servers and modular   infrastructure. |
+| 389-ds                               | GPL version 3               | 1.4.4  |  LDAP server used for authentication, access control.                                                                       |
+| sssd                               | GPL version 3                    | 1.16.1  | A set of daemons used to manage access to remote directory services and authentication mechanisms.                                   |
+| krb5 | MIT License | 1.19.2  | Authentication protocol providing strong authentication for client/server applications by using secret-key cryptography |
 
 # Known issues  
 * **Issue**: Hosts are not displayed on the AWX UI.  

+ 25 - 0
docs/Security/ENABLE_SECURITY_LOGIN_NODE.md

Diff is too large to display

+ 85 - 0
docs/Security/ENABLE_SECURITY_MANAGEMENT_STATION.md

Diff is too large to display


+ 0 - 27
docs/Security/Enable_Security_LoginNode.md

@@ -1,27 +0,0 @@
-# Enabling Security on the Login Node 
-
-* Ensure that `enable_secure_login_node` is set to **true** in `omnia_config.yml`
-* Set the following parameters in `omnia_security_config.yml`
-
-|  Parameter Name        |  Default Value  |  Additional Information                                                                                                                                          |
-|------------------------|-----------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| max_failures           | 3               | Failures allowed before lockout. <br> This value cannot currently   be changed.                                                                                  |
-| failure_reset_interval | 60              | Period (in seconds) after which the number of failed login attempts is   reset <br> Accepted Values: 30-60                                                       |
-| lockout_duration       | 10              | Period (in seconds) for which users are locked out. <br> Accepted   Values: 5-10                                                                                 |
-| session_timeout        | 180             | Period (in seconds) after which idle users get logged out automatically   <br> Accepted Values: 30-90                                                            |
-| alert_email_address    |                 | Email address used for sending alerts in case of authentication failure. Currently, only one email ID is accepted in this field.   <br> If this variable is left blank, authentication failure alerts will   be disabled. |
-| allow_deny             | Allow           | This variable sets whether the user list is Allowed or Denied. <br>   Accepted Values: Allow, Deny                                                               |
-| user                   |                 | Array of users that are allowed or denied based on the `allow_deny`   value. Multiple users must be separated by a space.                                        |
-
-* Set the following parameters in `control_plane/input_params/security_vars.yml`
-
-|  Parameter Name        |  Default Value  |  Additional Information                                                                                                                                          |
-|------------------------|-----------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| allow_deny             | Allow           | This variable sets whether the user list is Allowed or Denied. <br>   Accepted Values: Allow, Deny                                                               |
-| user                   |                 | Array of users that are allowed or denied based on the `allow_deny`   value. Multiple users must be separated by a space.                                        |
-
-
-## Kernel Lockdown
-
-* RockyOS has Kernel Lockdown mode (Integrity) enabled by default
-* SUSE/Leap allows users to set Kernel Lockdown mode to Confidentiality or Integrity.

+ 0 - 79
docs/Security/Enable_Security_ManagementStation.md

@@ -1,79 +0,0 @@
-# Enabling Security on the Management Station
-
-Omnia uses FreeIPA on RockyOS to enable security features like authorisation and access control.
-
-## Enabling Authentication on the Management Station:
-
-Set the parameter 'enable_security_support' to true in `base_vars.yml`
-
-## Prerequisites Before Enabling Security:
-
-* Enter the relevant values in `login_vars.yml`:
-
-| Parameter Name             | Default Value | Additional Information                                                                           |
-|----------------------------|---------------|--------------------------------------------------------------------------------------------------|
-| ms_directory_manager_password |               | Password of the Directory Manager with full access to the directory for system management tasks. |
-| ms_kerberos_admin_password         |               | "admin" user password for the IPA server on RockyOS. If LeapOS is in use, it is used as the "kerberos admin" user password for 389-ds <br> This field is not relevant to Management Stations running `LeapOS`                                                         |
-
-
-
-* Enter the relevant values in `security_vars.yml:
-
-If `RockyOS` is in use on the Management Station:
-
-|  Parameter Name        |  Default Value  |  Additional Information                                                                                                                                                                                                                                                                                                                                      |
-|------------------------|-----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-|  domain_name           |  omnia.test     |  The domain name should not contain   an underscore ( _ )                                                                                                                                                                                                                                                                                                    |
-|  realm_name            |  OMNIA.TEST     |  The realm name should follow the   following rules per https://www.freeipa.org/page/Deployment_Recommendations   <br> * The realm name must not conflict with any other existing   Kerberos realm name (e.g. name used by Active Directory). <br> * The   realm name should be upper-case (EXAMPLE.COM) version of primary DNS domain   name (example.com). |
-| max_failures           | 3               | Failures allowed before lockout. <br> This value cannot currently   be changed.                                                                                                                                                                                                                                                                              |
-| failure_reset_interval | 60              | Period (in seconds) after which the number of failed login attempts is   reset <br> Accepted Values: 30-60                                                                                                                                                                                                                                                   |
-| lockout_duration       | 10              | Period (in seconds) for which users are locked out. <br> Accepted   Values: 5-10                                                                                                                                                                                                                                                                             |
-| session_timeout        | 180             | Period (in seconds) after which idle users get logged out automatically   <br> Accepted Values: 30-90                                                                                                                                                                                                                                                        |
-| alert_email_address    |                 | Email address used for sending alerts in case of authentication failure. Currently, only one email address is supported in this field.   <br> If this variable is left blank, authentication failure alerts will   be disabled.                                                                                                                                                                                             |
-| allow_deny             | Allow           | This variable sets whether the user list is Allowed or Denied. <br>   Accepted Values: Allow, Deny                                                                                                                                                                                                                                                           |
-| user                   |                 | Array of users that are allowed or denied based on the `allow_deny`   value. Multiple users must be separated by a space.                                                                                                                                                                                                                                    |
-
-
-## Log Aggregation via Grafana
-
-[Loki](https://grafana.com/docs/loki/latest/fundamentals/overview/) is a datastore used to efficiently hold log data for security purposes. Using the `promtail` agent, logs are collated and streamed via a HTTP API.
-
->> __Note:__ When `control_plane.yml` is run, Loki is automatically set up as a data source on the Grafana UI.
-
-
-
-### Querying Loki 
-
-Loki uses basic regex based syntax to filter for specific jobs, dates or timestamps.
-
-* Select the Explore ![Explore Icon](../Telemetry_Visualization/Images/ExploreIcon.PNG) tab to select control-plane-loki from the drop down.
-* Using [LogQL queries](https://grafana.com/docs/loki/latest/logql/log_queries/), all logs in `/var/log` can be accessed using filters (Eg: `{job=”Omnia”}` )
-
-## Viewing Logs on the Dashboard
-
-All log files can be viewed via the Dashboard tab (![Dashboard Icon](../Telemetry_Visualization/Images/DashBoardIcon.PNG)). The Default Dashboard displays `omnia.log` and `syslog`. Custom dashboards can be created per user requirements.
-
-Below is a list of all logs available to Loki and can be accessed on the dashboard:
-
-| Name               | Location                                  | Purpose                      | Additional Information                                                                             |
-|--------------------|-------------------------------------------|------------------------------|----------------------------------------------------------------------------------------------------|
-| Omnia Logs         | /var/log/omnia.log                        | Omnia Log                    | This log is configured by Default                                                                  |
-| syslogs            | /var/log/messages                         | System Logging               | This log is configured by Default                                                                  |
-| Audit Logs         | /var/log/audit/audit.log                  | All Login Attempts           | This log is configured by Default                                                                  |
-| CRON logs          | /var/log/cron                             | CRON Job Logging             | This log is configured by Default                                                                  |
-| Pods logs          | /var/log/pods/ * / * / * log                    | k8s pods                     | This log is configured by Default                                                                  |
-| Access Logs        | /var/log/dirsrv/slapd-<Realm Name>/access | Directory Server Utilization | This log is available when FreeIPA is set up ( ie when   enable_security_support is set to 'true') |
-| Error Log          | /var/log/dirsrv/slapd-<Realm Name>/errors | Directory Server Errors      | This log is available when FreeIPA is set up ( ie when   enable_security_support is set to 'true') |
-| CA Transaction Log | /var/log/pki/pki-tomcat/ca/transactions   | FreeIPA PKI Transactions     | This log is available when FreeIPA is set up ( ie when   enable_security_support is set to 'true') |
-| KRB5KDC            | /var/log/krb5kdc.log                      | KDC Utilization              | This log is available when FreeIPA is set up ( ie when   enable_security_support is set to 'true') |
-| Secure logs        | /var/log/secure                           | Login Error Codes            | This log is available when FreeIPA is set up ( ie when   enable_security_support is set to 'true') |
-| HTTPD logs         | /var/log/httpd/*                          | FreeIPA API Call             | This log is available when FreeIPA is set up ( ie when   enable_security_support is set to 'true') |
-| DNF logs           | /var/log/dnf.log                          | Installation Logs            | This log is configured on Rocky OS                                                                 |
-| Zypper Logs        | /var/log/zypper.log                       | Installation Logs            | This log is configured on Leap OS                                                                  |
-
-
-
-
-
-
-

+ 8 - 7
docs/Telemetry_Visualization/Visualization.md

@@ -11,17 +11,17 @@ A lot of these metrics are collected using iDRAC telemetry. iDRAC telemetry allo
 
 | Parameter Name        | Default Value | Information |
 |-----------------------|---------------|-------------|
-| timescaledb_user      | 		        |  Username used for connecting to timescale db. Minimum Legth: 2 characters.          |
-| timescaledb_password  | 		        |  Password used for connecting to timescale db. Minimum Legth: 2 characters.           |
-| mysqldb_user          | 		        |  Username used for connecting to mysql db. Minimum Legth: 2 characters.         |
-| mysqldb_password      | 		        |  Password used for connecting to mysql db. Minimum Legth: 2 characters.            |
+| timescaledb_user      | 		        |  Username used for connecting to timescale db. Minimum Length: 2 characters.          |
+| timescaledb_password  | 		        |  Password used for connecting to timescale db. Minimum Length: 2 characters.           |
+| mysqldb_user          | 		        |  Username used for connecting to mysql db. Minimum Length: 2 characters.         |
+| mysqldb_password      | 		        |  Password used for connecting to mysql db. Minimum Length: 2 characters.            |
 | mysqldb_root_password | 		        |  Password used for connecting to mysql db for root user. Minimum Length: 2 characters.         |
 
 3. All parameters in `telemetry/input_params/base_vars.yml` need to be filled in:
 
 | Parameter Name          | Default Value     | Information |
 |-------------------------|-------------------|-------------|
-| mount_location          | idrac_telemetrysource_services_db | Sets the location all telemetry related files will be stored and both timescale and mysql databases will be mounted.            |
+| mount_location          | /opt/omnia| Sets the location all telemetry related files will be stored and both timescale and mysql databases will be mounted.            |
 | idrac_telemetry_support | true              | This variable is used to enable iDRAC telemetry support and visualizations. Accepted Values: true/false            |
 | slurm_telemetry_support | true              | This variable is used to enable slurm telemetry support and visualizations. Slurm Telemetry support can only be activated when idrac_telemetry_support is set to true. Accepted Values: True/False.        |
 | timescaledb_name        | telemetry_metrics | Postgres DB with timescale extension is used for storing iDRAC and slurm telemetry metrics.            |
@@ -50,7 +50,7 @@ Use any one of the following browsers to access the Grafana UI (https://< Grafan
 
 ## Initiating Telemetry
 
-1. Once `control_plane.yml` and `telemetry.yml` are executed, run the following commands from `omnia/telemetry`:
+1. Once `control_plane.yml` and `omnia.yml` are executed, run the following commands from `omnia/telemetry`:
 
 `ansible-playbook telemetry.yml`
 
@@ -60,7 +60,8 @@ Use any one of the following browsers to access the Grafana UI (https://< Grafan
 After initiation, new nodes can be added to telemetry by running the following commands from `omnia/telemetry`:
 		
 ` ansible-playbook add_idrac_node.yml `
-		
+
+	
 
 
 

+ 1 - 1
docs/control_plane/device_templates/PROVISION_SERVERS.md

@@ -13,7 +13,7 @@ Edit the following files under the `control_plane/input_params` directory to pro
 	File name	|	Variables</br> [Required/ Optional]	|	Default, choices	|	Description
 	-------	|	----------------	|	-----------------	|	-----------------
 	idrac_vars.yml	|	idrac_system_profile</br> [Required]	|	<ul><li>**Performance**</li> <li>PerformancePerWatt(DAPC)</li> <li>PerformancePerWatt(OS)</li> <li>WorkstationPerformance</li></ul>	|	The system profile used for BIOS configuration. 
-	<br>	|	firmware_update_required</br> [Required]	|	<ul><li>**true**</li> <li>false</li></ul>	|	By default, Omnia updates the firmware on the servers. To disable the firmware update, set the variable to "false".
+	<br>	|	firmware_update_required</br> [Required]	|	<ul><li>**false**</li> <li>true</li></ul>	|	By default, Omnia updates the firmware on the servers. To disable the firmware update, set the variable to "false".
 	<br>	|	poweredge_model</br> [Required if "firmware_update_required" is set to "true"]	|	<ul><li>**C6420**</li> <li>R640</li><li>R740</li><li>C4140</li> <li>And other supported PowerEdge servers</li></ul>	|	Enter the required PowerEdge server models to update the firmware. For example, enter `R640,R740,C4140` to update firmware on these models of PowerEdge servers. For a complete list of supported PowerEdge servers, see the *Hardware managed by Omnia* section in the Readme file.
 	<br>	|	uefi_secure_boot</br> [Optional]	|	<ul><li>**disabled**</li> <li>enabled</li></ul>	|	Option to enable or disable the secure boot mode.
 	<br>	|	system_lockdown</br> [Optional]	|	<ul><li>**disabled**</li> <li>enabled</li></ul>	|	Option to enable or disable system lockdown.

+ 1 - 1
docs/control_plane/input_parameters/PROVISION_SERVERS.md

@@ -13,7 +13,7 @@ Edit the following files under the `control_plane/input_params` directory to pro
 	File name	|	Variables</br> [Required/ Optional]	|	Default, choices	|	Description
 	-------	|	----------------	|	-----------------	|	-----------------
 	idrac_vars.yml	|	idrac_system_profile</br> [Required]	|	<ul><li>**Performance**</li> <li>PerformancePerWatt(DAPC)</li> <li>PerformancePerWatt(OS)</li> <li>WorkstationPerformance</li></ul>	|	The system profile used for BIOS configuration. 
-	<br>	|	firmware_update_required</br> [Required]	|	<ul><li>**true**</li> <li>false</li></ul>	|	By default, Omnia updates the firmware on the servers. To disable the firmware update, set the variable to "false".
+	<br>	|	firmware_update_required</br> [Required]	|	<ul><li>**false**</li> <li>true</li></ul>	|	By default, Omnia updates the firmware on the servers. To disable the firmware update, set the variable to "false".
 	<br>	|	poweredge_model</br> [Required if "firmware_update_required" is set to "true"]	|	<ul><li>**C6420**</li> <li>R640</li><li>R740</li><li>C4140</li> <li>And other supported PowerEdge servers</li></ul>	|	Enter the required PowerEdge server models to update the firmware. For example, enter `R640,R740,C4140` to update firmware on these models of PowerEdge servers. For a complete list of supported PowerEdge servers, see the *Hardware managed by Omnia* section in the Readme file.
 	<br>	|	uefi_secure_boot</br> [Optional]	|	<ul><li>**disabled**</li> <li>enabled</li></ul>	|	Option to enable or disable the secure boot mode.
 	<br>	|	system_lockdown</br> [Optional]	|	<ul><li>**disabled**</li> <li>enabled</li></ul>	|	Option to enable or disable system lockdown.

+ 30 - 0
roles/cluster_validation/tasks/install_packages.yml

@@ -0,0 +1,30 @@
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Set fact for ansible version
+  set_fact:
+    ansible_collection_used: true
+  when: "ansible_version.full is version_compare(ansible_base_version, '>')"
+
+- name: Install netaddr
+  pip:
+    name: netaddr
+    state: present
+    executable: pip3
+
+- name: Install ansible galaxy collection ansible.utils
+  command: ansible-galaxy collection install "{{ ipaddr_collection }}"
+  changed_when: false
+  when: ansible_collection_used

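`version_compare` here is the legacy alias of the `version` test, so the guard reads as "only use the collection-qualified filters when Ansible is newer than `ansible_base_version` (2.9)". The same condition with the current test name, as a sketch:

```yaml
# Sketch: identical guard using the newer `version` test name.
- name: Set fact for ansible version
  set_fact:
    ansible_collection_used: true
  when: ansible_version.full is version(ansible_base_version, '>')
```
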
+ 11 - 1
roles/cluster_validation/tasks/main.yml

@@ -27,6 +27,7 @@
     control_plane_status: false
     powervault_status: false
     nfs_node_status: false
+    ansible_collection_used: false
 
 - name: Check AWX instance
   command: awx --version
@@ -46,6 +47,15 @@
     - not awx_version_check.failed
     - awx_search_key in awx_hostname.stdout
 
+- name: Install Packages
+  include_tasks: install_packages.yml
+  when: not control_plane_status
+
+- name: Set ansible_collection_used to true in awx
+  set_fact:
+    ansible_collection_used: true
+  when: control_plane_status
+
 - name: Set NFS node status
   set_fact:
     nfs_node_status: true
@@ -90,4 +100,4 @@
         regexp: '#log_path = /var/log/ansible.log'
         replace: 'log_path = /var/log/omnia.log'
       when: ansible_conf_exists.stat.exists
-  when: not control_plane_status
+  when: not control_plane_status

+ 5 - 1
roles/cluster_validation/vars/main.yml

@@ -99,4 +99,8 @@ allow_deny_fail_msg: "Failed. Incorrect Access format in security_vars.yml"
 restrict_program_support_success_msg: "restrict_program_support successfully validated"
 restrict_program_support_failure_msg: "Failed. Accepted values are true or false."
 restrict_softwares_success_msg: "restrict_softwares successfully validated"
-restrict_softwares_failure_msg: "Warning. Values should be comma separated. The supported services are telnet, lpd, bluetooth, rlogin, rexec. Please check restrict_softwares variable"
+restrict_softwares_failure_msg: "Warning. Values should be comma separated. The supported services are telnet, lpd, bluetooth, rlogin, rexec. Please check restrict_softwares variable"
+
+# Usage: install_packages.yml
+ansible_base_version: '2.9'
+ipaddr_collection: ansible.utils:2.5.2

+ 4 - 4
roles/common/vars/main.yml

@@ -14,10 +14,10 @@
 ---
 
 leap_repo:
-  - { name: repo-non-oss, repo: http://download.opensuse.org/distribution/leap/15.3/repo/non-oss/ }
-  - { name: repo-oss, repo: http://download.opensuse.org/distribution/leap/15.3/repo/oss/ }
-  - { name: repo-update-oss, repo: http://download.opensuse.org/update/leap/15.3/oss/ }
-  - { name: repo-update-non-oss, repo: http://download.opensuse.org/update/leap/15.3/non-oss/ }
+  - { name: repo-non-oss, repo: "http://download.opensuse.org/distribution/leap/15.3/repo/non-oss/" }
+  - { name: repo-oss, repo: "http://download.opensuse.org/distribution/leap/15.3/repo/oss/" }
+  - { name: repo-update-oss, repo: "http://download.opensuse.org/update/leap/15.3/oss/" }
+  - { name: repo-update-non-oss, repo: "http://download.opensuse.org/update/leap/15.3/non-oss/" }
 
 nvidia_repo: https://download.nvidia.com/opensuse/leap/15.3/
 docker_repo_url_leap: https://download.docker.com/linux/sles/docker-ce.repo

+ 3 - 3
roles/k8s_start_services/tasks/deploy_k8s_services.yml

@@ -1,4 +1,4 @@
-#  Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -109,7 +109,7 @@
   tags: init
 
 - name: Start NFS Client Provisioner using NFS on manager node
-  command: "helm install stable/nfs-client-provisioner --set nfs.server='{{ nfs_server_manager_node }}' --set nfs.path='{{ nfs_share_dir }}' --generate-name"
+  command: "helm install nfs-omnia stable/nfs-client-provisioner --set nfs.server='{{ nfs_server_manager_node }}' --set nfs.path='{{ nfs_share_dir }}'"
   changed_when: true
   when:
     - "'nfs-client-provisioner' not in k8s_pods.stdout"
@@ -117,7 +117,7 @@
   tags: init
 
 - name: Start NFS Client Provisioner using NFS on NFS Node
-  command: "helm install stable/nfs-client-provisioner --set nfs.server='{{ nfs_server_nfs_node }}' --set nfs.path='{{ me4_nfs_share_k8s }}' --generate-name"
+  command: "helm install nfs-omnia stable/nfs-client-provisioner --set nfs.server='{{ nfs_server_nfs_node }}' --set nfs.path='{{ me4_nfs_share_k8s }}'"
   changed_when: true
   when:
     - "'nfs-client-provisioner' not in k8s_pods.stdout"

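Replacing `--generate-name` with the fixed release name `nfs-omnia` makes the release addressable later (e.g. for `helm upgrade` or `helm uninstall`), but a second `helm install` under the same name fails. The tasks above already guard on `k8s_pods.stdout`; an alternative sketch guards on the release list instead:

```yaml
# Alternative guard (sketch): skip the install when the nfs-omnia release already exists.
- name: List helm releases
  command: helm list -q
  register: helm_releases
  changed_when: false

- name: Start NFS Client Provisioner using NFS on manager node
  command: "helm install nfs-omnia stable/nfs-client-provisioner --set nfs.server='{{ nfs_server_manager_node }}' --set nfs.path='{{ nfs_share_dir }}'"
  changed_when: true
  when: "'nfs-omnia' not in helm_releases.stdout_lines"
```
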
+ 68 - 0
roles/login_node/files/temp_sssd.conf

@@ -0,0 +1,68 @@
+#
+# sssd.conf
+# Generated by 389 Directory Server - dsidm
+#
+# For more details see man sssd.conf and man sssd-ldap
+# Be sure to review the content of this file to ensure it is secure and correct
+# in your environment.
+
+[domain/ldap]
+# Uncomment this for more verbose logging.
+# debug_level=3
+
+# Cache hashes of user authentication for offline auth.
+cache_credentials = True
+id_provider = ldap
+auth_provider = ldap
+access_provider = ldap
+chpass_provider = ldap
+ldap_schema = rfc2307
+ldap_search_base = dc=omnia,dc=test
+ldap_uri = ldapi://%2fvar%2frun%2fslapd-ldap1.socket
+# If you have DNS SRV records, you can use the following instead. This derives
+# from your ldap_search_base.
+# ldap_uri = _srv_
+
+ldap_tls_reqcert = demand
+# To use cacert dir, place *.crt files in this path then run:
+# /usr/bin/openssl rehash /etc/openldap/certs
+# or (for older versions of openssl)
+# /usr/bin/c_rehash /etc/openldap/certs
+ldap_tls_cacertdir = /etc/openldap/certs
+
+# Path to the cacert
+# ldap_tls_cacert = /etc/openldap/certs/ca.crt
+
+# Only users who match this filter can login and authorise to this machine. Note
+# that users who do NOT match, will still have their uid/gid resolve, but they
+# can't login.
+ldap_access_filter = (memberOf=cn=server_admins,ou=groups,dc=omnia,dc=test)
+
+enumerate = false
+access_provider = ldap
+ldap_user_member_of = memberof
+ldap_user_gecos = cn
+ldap_user_uuid = nsUniqueId
+ldap_group_uuid = nsUniqueId
+# This is really important as it allows SSSD to respect nsAccountLock
+ldap_account_expire_policy = rhds
+ldap_access_order = filter, expire
+# Setup for ssh keys
+# Inside /etc/ssh/sshd_config add the lines:
+#   AuthorizedKeysCommand /usr/bin/sss_ssh_authorizedkeys
+#   AuthorizedKeysCommandUser nobody
+# You can test with the command: sss_ssh_authorizedkeys <username>
+ldap_user_ssh_public_key = nsSshPublicKey
+
+# This prevents an issue where the Directory is recursively walked on group
+# and user look ups. It makes the client faster and more responsive in almost
+# every scenario.
+ignore_group_members = False
+
+[sssd]
+services = nss, pam, ssh, sudo
+config_file_version = 2
+
+domains = ldap
+[nss]
+homedir_substring = /home

+ 61 - 7
roles/login_node/tasks/install_389ds.yml

@@ -49,13 +49,18 @@
   failed_when: false
   no_log: true
   register: ds389_status_authentication
- 
+
+- name: Gathering service facts
+  service_facts:
+
 - name: Modify ds389_status
   set_fact:
     ds389_status: true
   when: 
     - ds389_status_authentication.rc == 0
     - ldap1_install_search_key in ldap1_status.stdout.split(' ')[3]
+    - "'sssd.service' in ansible_facts.services"
+    - sssd_install_search_key in ansible_facts.services['sssd.service'].state
 
 - block:
     - name: Install 389-ds
@@ -63,17 +68,25 @@
         name: "{{ ds389_packages }}"
         state: present 
 
+    - name: Check ldap instance is running or not
+      command: dsctl {{ ldap_instance }} status
+      changed_when: false
+      failed_when: false
+      register: ldap1_status
+
     - name: Create the ldap1.inf file
       copy:
         src: "{{ role_path }}/files/temp_ldap1.inf"
         dest: "{{ ldap1_config_path }}"
-        mode: "{{ file_mode }}"       
+        mode: "{{ file_mode }}"
+      when: ldap1_search_key in ldap1_status.stdout       
 
     - name: Configure ldap1.inf with domain name
       lineinfile:
         path: "{{ ldap1_config_path }}"
         regexp: "^suffix = dc=omnia,dc=test"
         line: "suffix = dc={{ domain_name.split('.')[0] }},dc={{ domain_name.split('.')[1] }}"
+      when: ldap1_search_key in ldap1_status.stdout
 
     - name: Configure ldap1.inf with directory manager password
       lineinfile:
@@ -81,12 +94,9 @@
         regexp: "^root_password = password"
         line: "root_password = {{ directory_manager_password }}"
       no_log: true
+      when: ldap1_search_key in ldap1_status.stdout
 
-    - name: Check ldap instance is running or not
-      command: dsctl {{ ldap_instance }} status
-      changed_when: false
-      failed_when: false
-      register: ldap1_status
+    
 
     - name: Creating 389 directory server instance
       shell: dscreate -v from-file {{ ldap1_config_path }} | tee {{ ldap1_output_path }}
@@ -154,24 +164,28 @@
         src: "{{ role_path }}/files/temp_krb5.conf"
         dest: "{{ kerberos_conf_path }}"
         mode: "{{ file_mode }}"
+      when: not principal_status.stat.exists
 
     - name: Configure kerberos conf file with domain name
       replace:
         path: "{{ kerberos_conf_path }}"
         regexp: "omnia.test"
         replace: "{{ domain_name }}"
+      when: not principal_status.stat.exists
 
     - name: Configure kerberos conf file with realm name
       replace:
         path: "{{ kerberos_conf_path }}"
         regexp: "OMNIA.TEST"
         replace: "{{ realm_name }}"
+      when: not principal_status.stat.exists
 
     - name: Configure kerberos conf file with hostname
       replace:
         path: "{{ kerberos_conf_path }}"
         regexp: "hostname"
         replace: "{{ server_hostname_short }}"
+      when: not principal_status.stat.exists
 
     - block:
         - name: Setting up the kerberos database
@@ -213,6 +227,46 @@
       shell: set -o pipefail && echo {{ kerberos_admin_password }} | kinit {{ ipa_admin_username }}
       no_log: true
       changed_when: false
+
+    - name: Install sssd packages
+      zypper:
+        name: "{{ sssd_packages }}"
+        state: present
+      
+    - name: Stop and disable nscd
+      systemd:
+        name: nscd
+        state: stopped
+        enabled: no
+      when: "'nscd.service' in ansible_facts.services"
+
+    - name: Check admin group in 389-ds
+      command: dsidm {{ ldap_instance }} group list
+      register: check_admin_group
+      changed_when: false
+
+    - name: Create admin group in 389-ds
+      shell: set -o pipefail && echo {{ admin_group_name }} |  dsidm {{ ldap_instance }} group create
+      changed_when: true
+      when: admin_group_name not in check_admin_group.stdout
+
+    - name: Create the sssd.conf file
+      copy:
+        src: "{{ role_path }}/files/temp_sssd.conf"
+        dest: "{{ sssd_config_path }}"
+        mode: "{{ sssd_file_mode }}"       
+
+    - name: Configure sssd.conf with domain name
+      replace:
+        path: "{{ sssd_config_path }}"
+        regexp: "dc=omnia,dc=test"
+        replace: "dc={{ domain_name.split('.')[0] }},dc={{ domain_name.split('.')[1] }}"
+
+    - name: Start sssd service
+      systemd:
+        name: sssd
+        state: started
+        enabled: yes
   when: not ds389_status
 
 - name: Configure password policy in 389-ds

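This mirrors the control_plane_security role above. Note that the `ldap_access_filter` in temp_sssd.conf only admits members of `server_admins`, the group these tasks create, so users must be added to that group before they can log in. A hedged sketch; the user DN is illustrative and the `group add_member` subcommand is assumed from 389-ds's dsidm tooling:

```yaml
# Hedged sketch; the user DN is illustrative and `group add_member` is assumed
# to be available in this dsidm version.
- name: Add a user to the server_admins group
  command: >-
    dsidm {{ ldap_instance }} group add_member {{ admin_group_name }}
    uid=exampleuser,ou=people,dc=omnia,dc=test
  changed_when: true
```
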
+ 7 - 0
roles/login_node/vars/main.yml

@@ -72,6 +72,7 @@ ds389_packages:
 ldap1_search_key: "No such instance"
 ds389_pwpolicy_search_key: "passwordlockoutduration: {{ lockout_duration }}"
 ldap1_install_search_key: running
+sssd_install_search_key: running
 ldap1_config_path: "/root/ldap1.inf"
 ldap_instance: ldap1
 ldap1_output_path: /var/log/ldap1_output.txt
@@ -86,6 +87,12 @@ kerberos_packages:
 kerberos_principal_path: /var/lib/kerberos/krb5kdc/principal
 kerberos_conf_path: /etc/krb5.conf
 kerberos_env_path: /usr/lib/mit/sbin/
+sssd_packages:
+  - sssd
+  - sssd-ldap
+admin_group_name: server_admins
+sssd_file_mode: 0600
+sssd_config_path: /etc/sssd/sssd.conf
 
 # Usage: restrict_nonessentials.yml
 service_status: ['enabled','alias','static','indirect','enabled-runtime','active','inactive']

+ 3 - 3
roles/slurm_exporter/tasks/install_prometheus.yml

@@ -55,21 +55,21 @@
 
     - name: Configure nginx.conf (1/2)
       replace:
-        path: "../../k8s_start_services/templates/nginx.conf.j2"
+        path: "{{ playbook_dir }}/roles/k8s_start_services/templates/nginx.conf.j2"
         regexp: '        server_name  .*'
         replace: "        server_name  {{ ansible_default_ipv4.address }};"
       delegate_to: localhost
 
     - name: Configure nginx.conf (2/2)
       replace:
-        path: "../../k8s_start_services/templates/nginx.conf.j2"
+        path: "{{ playbook_dir }}/roles/k8s_start_services/templates/nginx.conf.j2"
         regexp: '          proxy_pass http://.*'
         replace: "          proxy_pass {{ prometheus_ip }};"
       delegate_to: localhost
 
     - name: Load nginx conf
       template:
-        src: "../../k8s_start_services/templates/nginx.conf.j2"
+        src: "{{ playbook_dir }}/roles/k8s_start_services/templates/nginx.conf.j2"
         dest: "{{ nginx_conf_file_path }}"
         mode: "{{ nginx_conf_file_mode }}"
 

+ 6 - 0
roles/slurm_manager/tasks/main.yml

@@ -120,6 +120,12 @@
 - name: Get network address/subnet mask
   set_fact:
     network_address: "{{ (ansible_default_ipv4.network + '/' + ansible_default_ipv4.netmask) | ipaddr('network/prefix') }}"
+  when: not hostvars['127.0.0.1']['ansible_collection_used']
+
+- name: Get network address/subnet mask
+  set_fact:
+    network_address: "{{ (ansible_default_ipv4.network + '/' + ansible_default_ipv4.netmask) | ansible.utils.ipaddr('network/prefix') }}"
+  when: hostvars['127.0.0.1']['ansible_collection_used']
 
 - name: Firewall rule slurm - allow all incoming traffic on internal network
   firewalld:

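Both tasks compute the same value; only the filter namespace differs, matching whether the `ansible.utils` collection was installed (see `roles/cluster_validation/tasks/install_packages.yml` above). A worked sketch with illustrative addresses:

```yaml
# Illustrative input: host 192.168.1.10 with netmask 255.255.255.0.
- name: Show what ipaddr('network/prefix') yields
  debug:
    msg: "{{ ('192.168.1.10' + '/' + '255.255.255.0') | ansible.utils.ipaddr('network/prefix') }}"
  # prints "192.168.1.0/24"
```
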
+ 305 - 0
telemetry/roles/grafana_config/files/SpiralLayout.json

@@ -0,0 +1,305 @@
+{
+  "__inputs": [
+    {
+      "name": "DS_TELEMETRY-POSTGRES",
+      "label": "telemetry-postgres",
+      "description": "",
+      "type": "datasource",
+      "pluginId": "postgres",
+      "pluginName": "PostgreSQL"
+    }
+  ],
+  "__elements": [],
+  "__requires": [
+    {
+      "type": "grafana",
+      "id": "grafana",
+      "name": "Grafana",
+      "version": "8.3.2"
+    },
+    {
+      "type": "panel",
+      "id": "hpcviz-idvl-hpcc-spiral-layout",
+      "name": "spiral-layout",
+      "version": "2.0.0"
+    },
+    {
+      "type": "datasource",
+      "id": "postgres",
+      "name": "PostgreSQL",
+      "version": "1.0.0"
+    }
+  ],
+  "editable": false,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 0,
+  "id": null,
+  "iteration": 1646754961002,
+  "links": [],
+  "liveNow": false,
+  "panels": [
+    {
+      "datasource": {
+        "type": "postgres",
+        "uid": "telemetry-postgres"
+      },
+      "gridPos": {
+        "h": 15,
+        "w": 24,
+        "x": 0,
+        "y": 0
+      },
+      "id": 2,
+      "options": {
+        "nodeSize": 5,
+        "numberOfRing": 5,
+        "orderType": "rank"
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "table",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "rawQuery": true,
+          "rawSql": "SELECT\n  *\nFROM\n  slurm.jobs\nWHERE\n  user_id IN ($users)\n  AND start_time < ${__to:date:seconds}\n  AND end_time BETWEEN ${__from:date:seconds} and ${__to:date:seconds}",
+          "refId": "jobs",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
+        },
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "time_series",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "rawQuery": true,
+          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",'10m'),\n  avg(CAST(value AS decimal)) AS \"memory_power\",\n  CONCAT('| ',system) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label= 'PowerMetrics TotalMemoryPower' AND\n  system IN (CAST($servicetag AS text))\nGROUP BY time,name\nORDER BY time",
+          "refId": "memory_power",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
+        },
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "time_series",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "rawQuery": true,
+          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",'10m'),\n  avg(CAST(value AS decimal)) AS \"cpu_power\",\n  CONCAT('| ',system) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label= 'PowerMetrics TotalCPUPower' AND\n  system IN ($servicetag)\nGROUP BY time,name\nORDER BY time",
+          "refId": "cpu_power",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
+        },
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "time_series",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "rawQuery": true,
+          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",'10m'),\n  avg(CAST(value AS decimal)) AS \"cpu1_temp\",\n  CONCAT('| ',system) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label= 'CPU1 Temp TemperatureReading' AND\n  system IN ($servicetag)\nGROUP BY time,name\nORDER BY time",
+          "refId": "cpu1_temp",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
+        },
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "time_series",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "rawQuery": true,
+          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",'10m'),\n  avg(CAST(value AS decimal)) AS \"CPU2_Temp\",\n  CONCAT('| ',system) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label= 'CPU2 Temp TemperatureReading' AND\n  system IN ($servicetag)\nGROUP BY time,name\nORDER BY time\n",
+          "refId": "cpu2_temp",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
+        },
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "time_series",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "rawQuery": true,
+          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",'10m'),\n  avg(CAST(value AS decimal)) AS \"NIC1_Temp\",\n  CONCAT('| ',system) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label= 'Embedded NIC 1 Port 1 Partition 1 TemperatureReading' AND\n  system IN ($servicetag)\nGROUP BY time,name\nORDER BY time",
+          "refId": "nic1_temp",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
+        }
+      ],
+      "title": "SpiralLayout",
+      "type": "hpcviz-idvl-hpcc-spiral-layout"
+    }
+  ],
+  "schemaVersion": 33,
+  "style": "dark",
+  "tags": [],
+  "templating": {
+    "list": [
+      {
+        "current": {},
+        "datasource": {
+          "type": "postgres",
+          "uid": "telemetry-postgres"
+        },
+        "definition": "SELECT DISTINCT servicetag as __value from nodes\n",
+        "hide": 0,
+        "includeAll": false,
+        "multi": true,
+        "name": "servicetag",
+        "options": [],
+        "query": "SELECT DISTINCT servicetag as __value from nodes\n",
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 1,
+        "type": "query"
+      },
+      {
+        "current": {},
+        "datasource": {
+          "type": "postgres",
+          "uid": "telemetry-postgres"
+        },
+        "definition": "SELECT\n user_id as __value, user_name as __text\nFROM\n  slurm.jobs\nWHERE\n start_time < ${__to:date:seconds} AND end_time > ${__from:date:seconds}",
+        "hide": 0,
+        "includeAll": true,
+        "multi": true,
+        "name": "users",
+        "options": [],
+        "query": "SELECT\n user_id as __value, user_name as __text\nFROM\n  slurm.jobs\nWHERE\n start_time < ${__to:date:seconds} AND end_time > ${__from:date:seconds}",
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 1,
+        "type": "query"
+      }
+    ]
+  },
+  "time": {
+    "from": "now-6M",
+    "to": "now"
+  },
+  "timepicker": {},
+  "timezone": "",
+  "title": "SpiralLayout",
+  "uid": "ou27WHLnt",
+  "version": 4,
+  "weekStart": ""
+}

+ 12 - 1
telemetry/roles/grafana_config/tasks/add_dashboards.yml

@@ -13,12 +13,23 @@
 # limitations under the License.
 ---
 
+- name: Create a telemetry folder on grafana
+  community.grafana.grafana_folder:
+    url: "http://{{ grafana_svc_ip.stdout }}:{{ grafana_svc_port.stdout }}"
+    grafana_user: "{{ grafana_username }}"
+    grafana_password: "{{ grafana_password }}"
+    title: "{{ telemetry_folder_name }}"
+    state: present
+  no_log: true
+
 - name: Import dashboards for visualizations
   community.grafana.grafana_dashboard:
     grafana_url: "http://{{ grafana_svc_ip.stdout }}:{{ grafana_svc_port.stdout }}"
     grafana_user: "{{ grafana_username }}"
     grafana_password: "{{ grafana_password }}"
     state: present
+    folder: "{{ telemetry_folder_name }}"
     overwrite: yes
-    path: "{{ playbook_dir }}/roles/grafana_config/files/{{ dashboards }}"
+    path: "{{ playbook_dir }}/roles/grafana_config/files/{{ item }}"
+  with_items: "{{ dashboards }}"
   no_log: true
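
If an import silently fails, Grafana's search API gives a quick cross-check that the folder and dashboards actually landed. A hedged sketch against the same service variables (assumes basic auth is enabled on the Grafana instance):

- name: Confirm imported dashboards are searchable (sketch)
  uri:
    url: "http://{{ grafana_svc_ip.stdout }}:{{ grafana_svc_port.stdout }}/api/search?query=SpiralLayout"
    user: "{{ grafana_username }}"
    password: "{{ grafana_password }}"
    force_basic_auth: true
    return_content: true
  register: dashboard_search
  failed_when: dashboard_search.json | length == 0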

+ 4 - 1
telemetry/roles/grafana_config/vars/main.yml

@@ -15,4 +15,7 @@
 
 grafana_k8s: grafana
 grafana_namespace: grafana
-dashboards: parallel-coordinate.json
+telemetry_folder_name: telemetry
+dashboards:
+  - parallel-coordinate.json
+  - SpiralLayout.json

+ 7 - 0
telemetry/roles/slurm_telemetry/files/Dockerfile

@@ -8,13 +8,20 @@ RUN dnf -y install https://download.postgresql.org/pub/repos/yum/reporpms/EL-8-x
 RUN dnf module disable postgresql -y
 RUN dnf install postgresql13-devel -y
 RUN yum install python38-devel libpq-devel -y
+RUN dnf install sshpass -y
 
 COPY requirements.txt requirements.txt
 RUN ln -s /usr/pgsql-13/bin/pg_config /usr/bin/pg_config
 
 RUN pip3 install psycopg2-binary
 RUN pip3 install -r requirements.txt
+RUN mkdir /MonSter/
+COPY init_k8s_pod.sh /MonSter/
+RUN chmod 777 /MonSter/init_k8s_pod.sh
+
 RUN mkdir /log/
 RUN touch /log/monster.log
 
+COPY monster /MonSter/
+
 WORKDIR /MonSter/

+ 6 - 0
telemetry/roles/slurm_telemetry/files/init_k8s_pod_local.sh

@@ -0,0 +1,6 @@
+#!/bin/bash
+
+echo 'manager_node_ip manager_node_hostname' >> /etc/hosts
+ssh-keyscan -H manager_node_hostname >> /root/.ssh/known_hosts
+[ -f /root/.ssh/id_rsa ] || ssh-keygen -t rsa -b 4096 -f /root/.ssh/id_rsa -q -N ""
+sshpass -p 'os_passwd' ssh-copy-id 'root@manager_node_ip'

+ 33 - 0
telemetry/roles/slurm_telemetry/files/k8s_slurm_telemetry.yml

@@ -0,0 +1,33 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: slurm-telemetry
+  namespace: telemetry-and-visualizations
+  labels:
+    app: slurm-telemetry
+spec:
+  selector:
+    matchLabels:
+      app: slurm-telemetry
+  replicas: 1
+  strategy:
+    type: RollingUpdate
+  template:
+    metadata:
+      labels:
+        app: slurm-telemetry
+    spec:
+      volumes:
+        - name: ssh-key
+          hostPath:
+            path: /root/.ssh/
+            type: Directory
+      containers:
+        - name: slurm-telemetry
+          image: 'localhost/slurm_telemetry:latest'
+          imagePullPolicy: Never
+          command: ["/bin/sh","-c"]
+          args: ["./init_k8s_pod.sh; python3.8 tsdb.py; python3.8 mslurm.py"]
+          volumeMounts:
+            - name: ssh-key
+              mountPath: /root/.ssh/
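
Because imagePullPolicy is Never, a missing local image leaves the pod stuck in ErrImageNeverPull; a rollout check surfaces that quickly. A minimal sketch against the names used above:

- name: Check the slurm-telemetry rollout (sketch)
  command: kubectl rollout status deployment/slurm-telemetry -n telemetry-and-visualizations --timeout=120s
  changed_when: false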

+ 0 - 90
telemetry/roles/slurm_telemetry/files/update_service_tags.yml

@@ -1,90 +0,0 @@
-# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.​0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
----
-
-- name: Create inventory in awx
-  hosts: manager, compute
-  tasks:
-    - name: Check slurmctld service
-      systemd:
-        name: slurmctld
-      register: slurm_service_status
-
-    - name: Set fact slurm_service
-      set_fact:
-        slurm_service: True
-      delegate_to: localhost
-      when: "slurm_service_status.status.ActiveState == 'active'"
-
-    - name: Set fact slurm_service
-      set_fact:
-        slurm_service: False
-      delegate_to: localhost
-      when: "slurm_service_status.status.ActiveState == 'inactive'"
-
-    - name: Replace input file
-      copy:
-        src: "input_config.yml"
-        dest: /mnt/omnia/slurm/monster/config.yml
-        mode: 0644
-      delegate_to: localhost
-      when: "slurm_service_status.status.ActiveState == 'active'"
-
-    - name: Prepare input config file
-      block:
-        - name: Get service tag
-          shell: >
-            set -o pipefail && \
-            dmidecode -t 1 | grep Serial
-          changed_when: false
-          register: service_tag_details
-
-        - name: Set fact service tag
-          set_fact:
-            service_tag: "{{ service_tag_details.stdout.split(':')[1].strip() }}"
-
-        - name: Get the hostname
-          command: hostname
-          register: machine_hostname
-          changed_when: false
-
-        - name: Update Head Node IP
-          replace:
-            path: /mnt/omnia/slurm/monster/config.yml
-            regexp: '  ip:.*'
-            replace: "  ip: {{ groups['manager'][0] }}"
-          delegate_to: localhost
-
-        - name: Update Head Node hostname
-          replace:
-            path: /mnt/omnia/slurm/monster/config.yml
-            regexp: '  headnode:.*'
-            replace: "  headnode: {{ hostvars[groups['manager'][0]]['machine_hostname'].stdout }}"
-          delegate_to: localhost
-
-        - name: Update nodes hostnames
-          lineinfile:
-            path: /mnt/omnia/slurm/monster/config.yml
-            line: "  {{ machine_hostname.stdout }}: {{ ansible_default_ipv4.address }}"
-            insertafter: "hostnames:"
-          delegate_to: localhost
-
-        - name: Update service tag info
-          lineinfile:
-            path: /mnt/omnia/slurm/monster/config.yml
-            line: "  - Servicetag: {{ service_tag }}\n    Os_Ip_Addr: {{ ansible_default_ipv4.address }}"
-            insertafter: "clusternodes:"
-          delegate_to: localhost
-      when: hostvars[groups['manager'][0]]['slurm_service']

+ 51 - 0
telemetry/roles/slurm_telemetry/tasks/deploy_slurm_telemetry.yml

@@ -0,0 +1,51 @@
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Get all images
+  command: "buildah images"
+  register: images_result
+  failed_when: false
+  changed_when: false
+
+- name: Update the permission of init_k8s_pod.sh
+  file:
+    path: "{{ role_path }}/files/init_k8s_pod.sh"
+    mode: "{{ slurm_telemetry_code_dir_mode }}"
+  when: slurm_telemetry_image not in images_result.stdout
+
+- name: Create slurm_telemetry image
+  command: buildah bud -t {{ slurm_telemetry_image }}:{{ slurm_telemetry_image_tag }} -f {{ role_path }}/files/Dockerfile
+  args:
+    chdir: "{{ role_path }}/files/"
+  changed_when: true
+  when: slurm_telemetry_image not in images_result.stdout
+
+- name: Deploy slurm_telemetry pod
+  command: kubectl apply -f {{ role_path }}/files/k8s_slurm_telemetry.yml
+  changed_when: true
+
+- name: Wait for slurm_telemetry pod to come to ready state
+  command: kubectl wait --for=condition=ready -n {{ namespace }} pod -l app=slurm-telemetry --timeout=4m
+  changed_when: true
+
+- name: Delete input config file
+  file:
+    path: "{{ role_path }}/files/monster/config.yml"
+    state: absent
+
+- name: Delete init k8s pod file
+  file:
+    path: "{{ role_path }}/files/init_k8s_pod.sh"
+    state: absent

+ 75 - 63
telemetry/roles/slurm_telemetry/tasks/get_node_inventory.yml

@@ -13,76 +13,88 @@
 # limitations under the License.
 ---
 
-- name: Copy slurm telemetry code
-  copy:
-    src: "{{ role_path }}/files/monster"
-    dest: "{{ slurm_telemetry_code_dir }}"
-    mode: "{{ slurm_telemetry_code_dir_mode }}"
-    
-- name: Install jmepath
-  pip:
-    name: jmespath
-    state: present
-    executable: pip3
+- name: Get inventory details
+  block:
+  - name: Copy slurm telemetry code
+    copy:
+      src: "{{ role_path }}/files/monster"
+      dest: "{{ slurm_telemetry_code_dir }}"
+      mode: "{{ slurm_telemetry_code_dir_mode }}"
 
-- name: Get AWX service IP
-  command: kubectl get svc awx-ui -n {{ awx_namespace }} -o=jsonpath='{.spec.clusterIP}'
-  changed_when: false
-  failed_when: false
-  register: awx_svc_ip
+  - name: Install sshpass
+    package:
+      name: sshpass
+      state: present
 
-- name: AWX needs to be installed
-  fail:
-    msg: "{{ awx_fail_msg }}"
-  when: not awx_svc_ip.stdout
+  - name: Install jmespath
+    pip:
+      name: jmespath
+      state: present
+      executable: pip3
 
-- name: Get AWX service port
-  command: kubectl get svc awx-ui -n {{ awx_namespace }} -o=jsonpath='{.spec.ports[0].port}'
-  changed_when: false
-  register: awx_svc_port
+  - name: Get AWX service IP
+    command: kubectl get svc awx-ui -n {{ awx_namespace }} -o=jsonpath='{.spec.clusterIP}'
+    changed_when: false
+    failed_when: false
+    register: awx_svc_ip
 
-- name: Get AWX secret
-  shell: >
-    set -o pipefail && \
-    kubectl get secret awx-admin-password -n {{ awx_namespace }} -o jsonpath="{.data.password}" | base64 --decode
-  changed_when: false
-  register: awx_secret
+  - name: AWX needs to be installed
+    fail:
+      msg: "{{ awx_fail_msg }}"
+    when: not awx_svc_ip.stdout
 
-- name: Get node_inventory id
-  shell: >
-    set -o pipefail && \
-    awx --conf.host http://{{ awx_svc_ip.stdout }}:{{ awx_svc_port.stdout }} --conf.username {{ awx_username }} \
-    --conf.password {{ awx_secret.stdout }} --conf.insecure inventory list -f human | grep node_inventory
-  changed_when: false
-  register: inventory_id
+  - name: Get AWX service port
+    command: kubectl get svc awx-ui -n {{ awx_namespace }} -o=jsonpath='{.spec.ports[0].port}'
+    changed_when: false
+    register: awx_svc_port
 
-- name: Node inventory not found in AWX
-  fail:
-    msg: "{{ node_inventory_fail_msg }}"
-  when: not inventory_id.stdout
+  - name: Get AWX secret
+    shell: >
+      set -o pipefail && \
+      kubectl get secret awx-admin-password -n {{ awx_namespace }} -o jsonpath="{.data.password}" | base64 --decode
+    changed_when: false
+    register: awx_secret
 
-- name: Get node_inventory
-  command: awx --conf.host http://{{ awx_svc_ip.stdout }}:{{ awx_svc_port.stdout }} --conf.username {{ awx_username }} \
-    --conf.password {{ awx_secret.stdout }} --conf.insecure hosts list --inventory {{ inventory_id.stdout[0] }}
-  changed_when: false
-  register: node_inventory_output
+  - name: Get node_inventory id
+    shell: >
+      set -o pipefail && \
+      awx --conf.host http://{{ awx_svc_ip.stdout }}:{{ awx_svc_port.stdout }} --conf.username {{ awx_username }} \
+      --conf.password {{ awx_secret.stdout }} --conf.insecure inventory list -f human | grep node_inventory
+    changed_when: false
+    register: inventory_id
 
-- name: Save the json data
-  set_fact:
-    node_inventory_jsondata: "{{ node_inventory_output.stdout | from_json }}"
+  - name: Node inventory not found in AWX
+    fail:
+      msg: "{{ node_inventory_fail_msg }}"
+    when: not inventory_id.stdout
 
-- name: Add temporary hosts
-  add_host:
-    name: "{{ item.name }}"
-    groups: "{{ item.summary_fields.groups.results[0].name }}"
-  with_items: "{{ node_inventory_jsondata | json_query('results') }}"
-  no_log: true
+  - name: Get node_inventory
+    command: awx --conf.host http://{{ awx_svc_ip.stdout }}:{{ awx_svc_port.stdout }} --conf.username {{ awx_username }} \
+      --conf.password {{ awx_secret.stdout }} --conf.insecure hosts list --inventory {{ inventory_id.stdout[0] }}
+    changed_when: false
+    register: node_inventory_output
 
-- name: Update slurm telemetry code path
-  replace:
-    path: "{{ role_path }}/files/update_service_tags.yml"
-    regexp: '{{ item }}.*'
-    replace: "{{ item }} {{ slurm_telemetry_code_dir }}/monster/config.yml"
-  with_items:
-    - "dest:"
-    - "path:"
+  - name: Save the json data
+    set_fact:
+      node_inventory_jsondata: "{{ node_inventory_output.stdout | from_json }}"
+
+  - name: Add temporary hosts
+    add_host:
+      name: "{{ node_inventory_jsondata['results'][node_index].name }}"
+      groups: "{{ node_inventory_jsondata['results'][node_index].summary_fields.groups.results[0].name }}"
+      ansible_user: "{{ os_username }}"
+      ansible_password: "{{ provision_password }}"
+      ansible_become_pass: "{{ provision_password }}"
+      ansible_ssh_common_args: '-o StrictHostKeyChecking=no'
+    with_items: "{{ node_inventory_jsondata['results'] }}"
+    loop_control:
+      index_var: node_index
+    when: node_inventory_jsondata['results'][node_index].summary_fields.groups.count > 0
+    no_log: true
+
+  - name: Copy input_config file
+    copy:
+      src: "{{ role_path }}/files/input_config.yml"
+      dest: "{{ role_path }}/files/monster/config.yml"
+      mode: "{{ monster_config_file_mode }}"
+  when: slurm_telemetry_support
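
The index_var bookkeeping above can also be written as a plain loop over the AWX results, which keeps the condition and the fields on the same item. An equivalent sketch, assuming the same JSON shape returned by the awx CLI (remaining connection vars follow the original task):

- name: Add temporary hosts (equivalent sketch)
  add_host:
    name: "{{ item.name }}"
    groups: "{{ item.summary_fields.groups.results[0].name }}"
    ansible_user: "{{ os_username }}"
    ansible_password: "{{ provision_password }}"
  loop: "{{ node_inventory_jsondata.results }}"
  when: item.summary_fields.groups.count > 0
  no_log: true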

+ 14 - 7
telemetry/roles/slurm_telemetry/tasks/main.yml

@@ -13,12 +13,19 @@
 # limitations under the License.
 ---
 
-- name: Include common variables
-  include_vars: ../../common/vars/main.yml
+- name: Deploy slurm-telemetry
+  block:
+  - name: Include common variables
+    include_vars: ../../common/vars/main.yml
 
-- name: Include timescaledb variables
-  include_vars: ../../timescaledb/vars/main.yml
+  - name: Include timescaledb variables
+    include_vars: ../../timescaledb/vars/main.yml
 
-- name: Prepare MonSter input file
-  include_tasks: update_timescaledb_details.yml
-  when: hostvars[groups['manager'][0]]['slurm_service']
+  - name: Prepare MonSter input file
+    include_tasks: update_timescaledb_details.yml
+    when: hostvars[groups['manager'][0]]['slurm_service']
+
+  - name: Deploy slurm telemetry
+    include_tasks: deploy_slurm_telemetry.yml
+    when: hostvars[groups['manager'][0]]['slurm_service']
+  when: slurm_telemetry_support

+ 118 - 0
telemetry/roles/slurm_telemetry/tasks/update_service_tags.yml

@@ -0,0 +1,118 @@
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Get hosts details
+  block:
+  - name: Check slurmctld service
+    systemd:
+      name: slurmctld
+    register: slurm_service_status
+
+  - name: Set fact slurm_service
+    set_fact:
+      slurm_service: False
+    delegate_to: localhost
+
+  - name: Set fact slurm_service
+    set_fact:
+      slurm_service: True
+    delegate_to: localhost
+    when: "slurm_service_status.status.ActiveState == 'active'"
+
+  - name: Assert slurmctld status
+    fail:
+      msg: "{{ slurmctld_status_fail_msg }}"
+    when: not hostvars[groups['manager'][0]]['slurm_service']
+
+  - name: Prepare input config file
+    block:
+    - name: Get service tag
+      shell: >
+          set -o pipefail && \
+          dmidecode -t 1 | grep Serial
+      changed_when: false
+      register: service_tag_details
+
+    - name: Set fact service tag
+      set_fact:
+        service_tag: "{{ service_tag_details.stdout.split(':')[1].strip() }}"
+
+    - name: Get the hostname
+      command: hostname
+      register: machine_hostname
+      changed_when: false
+
+    - name: Update Head Node IP
+      replace:
+        path: "{{ role_path }}{{ monster_input_file_path }}"
+        regexp: '  ip:.*'
+        replace: "  ip: {{ groups['manager'][0] }}"
+      delegate_to: localhost
+
+    - name: Update Head Node hostname
+      replace:
+        path: "{{ role_path }}{{ monster_input_file_path }}"
+        regexp: '  headnode:.*'
+        replace: "  headnode: {{ hostvars[groups['manager'][0]]['machine_hostname'].stdout }}"
+      delegate_to: localhost
+
+    - name: Update nodes hostnames
+      lineinfile:
+        path: "{{ role_path }}{{ monster_input_file_path }}"
+        line: "  {{ machine_hostname.stdout }}: {{ inventory_hostname }}"
+        insertafter: "hostnames:"
+      delegate_to: localhost
+
+    - name: Update service tag info
+      lineinfile:
+        path: "{{ role_path }}{{ monster_input_file_path }}"
+        line: "  - Servicetag: {{ service_tag }}\n    Os_Ip_Addr: {{ inventory_hostname }}"
+        insertafter: "clusternodes:"
+      delegate_to: localhost
+
+    - name: Copy initialization file
+      copy:
+        src: "{{ role_path }}/files/init_k8s_pod_local.sh"
+        dest: "{{ role_path }}/files/init_k8s_pod.sh"
+        mode: "{{ monster_config_file_mode }}"
+      delegate_to: localhost
+      when: manager_group in group_names
+
+    - name: Update manager node details in init_k8s_pod.sh
+      replace:
+        path: "{{ role_path }}/files/init_k8s_pod.sh"
+        regexp: echo 'manager_node_ip manager_node_hostname' >> /etc/hosts
+        replace: echo '{{ inventory_hostname }} {{ machine_hostname.stdout }}' >> /etc/hosts
+      delegate_to: localhost
+      when: manager_group in group_names
+
+    - name: Update manager node hostname in init_k8s_pod.sh
+      replace:
+        path: "{{ role_path }}/files/init_k8s_pod.sh"
+        regexp: ssh-keyscan -H manager_node_hostname >> /root/.ssh/known_hosts
+        replace: ssh-keyscan -H {{ machine_hostname.stdout }} >> /root/.ssh/known_hosts
+      delegate_to: localhost
+      when: manager_group in group_names
+
+    - name: Update ssh-copy-id credentials in init_k8s_pod.sh
+      replace:
+        path: "{{ role_path }}/files/init_k8s_pod.sh"
+        regexp: sshpass -p 'os_passwd' ssh-copy-id 'root@manager_node_ip'
+        replace: sshpass -p "{{ hostvars['127.0.0.1']['provision_password'] }}" ssh-copy-id 'root@{{ inventory_hostname }}'
+      delegate_to: localhost
+      when: manager_group in group_names
+
+    when: hostvars[groups['manager'][0]]['slurm_service']
+  when: hostvars['127.0.0.1']['slurm_telemetry_support']

+ 6 - 6
telemetry/roles/slurm_telemetry/tasks/update_timescaledb_details.yml

@@ -25,31 +25,31 @@
 
 - name: Update timescaledb service IP
   replace:
-    path: "{{ slurm_telemetry_code_dir }}/monster/config.yml"
+    path: "{{ role_path }}{{ monster_input_file_path }}"
     regexp: "  host:.*"
     replace: "  host: {{ timescaledb_svc_ip.stdout }}"
 
 - name: Update timescaledb service port
   replace:
-    path: "{{ slurm_telemetry_code_dir }}/monster/config.yml"
+    path: "{{ role_path }}{{ monster_input_file_path }}"
     regexp: "  port:.*"
     replace: "  port: {{ timescaledb_svc_port.stdout }}"
     before: "# Slurm REST API Configuration"
 
 - name: Update timescaledb username
   replace:
-    path: "{{ slurm_telemetry_code_dir }}/monster/config.yml"
+    path: "{{ role_path }}{{ monster_input_file_path }}"
     regexp: "  username:.*"
     replace: "  username: {{ timescaledb_user }}"
 
 - name: Update timescaledb password
   replace:
-    path: "{{ slurm_telemetry_code_dir }}/monster/config.yml"
+    path: "{{ role_path }}{{ monster_input_file_path }}"
     regexp: "  password:.*"
     replace: "  password: {{ timescaledb_password }}"
 
 - name: Update timescaledb database
   replace:
-    path: "{{ slurm_telemetry_code_dir }}/monster/config.yml"
+    path: "{{ role_path }}{{ monster_input_file_path }}"
     regexp: "  database:.*"
-    replace: "  database: {{ timescaledb_name }}"
+    replace: "  database: {{ timescaledb_name }}"
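
Taken together, update_service_tags.yml and update_timescaledb_details.yml leave monster/config.yml fully populated. A hypothetical rendered fragment (addresses, hostnames, and the service tag below are illustrative; the tasks only match on the indented child keys and section markers shown):

  host: 10.233.45.10
  port: 5432
  username: timescale_user
  password: timescale_pass
  database: telemetry_metrics
# Slurm REST API Configuration
  ip: 10.0.0.5
  headnode: manager01
hostnames:
  manager01: 10.0.0.5
clusternodes:
  - Servicetag: ABC1234
    Os_Ip_Addr: 10.0.0.5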

+ 15 - 1
telemetry/roles/slurm_telemetry/vars/main.yml

@@ -19,4 +19,18 @@ slurm_telemetry_code_dir_mode: 0755
 awx_namespace: awx
 awx_username: admin
 awx_fail_msg: "AWX service not found. AWX needs to be installed"
-node_inventory_fail_msg: "AWX node inventory not found. Node inventory needs be created in AWX"
+node_inventory_fail_msg: "AWX node inventory not found. Node inventory needs to be created in AWX"
+os_username: root
+
+# usage: deploy_slurm_telemetry
+slurm_telemetry_image: slurm_telemetry
+slurm_telemetry_image_tag: latest
+monster_config_file_mode: 0644
+
+manager_group: manager
+compute_group: compute
+input_config_file_path: /mnt/omnia/slurm/monster/config.yml
+monster_input_file_path: /files/monster/config.yml
+
+slurmctld_status_success_msg: "slurmctld is running on manager node"
+slurmctld_status_fail_msg: "slurmctld is inactive. Please check manager node for slurm status"

+ 10 - 4
telemetry/telemetry.yml

@@ -33,9 +33,15 @@
         tasks_from: get_node_inventory.yml
       tags: slurm_telemetry
 
-- name: Update slurm node IPs and service tags
-  import_playbook: "{{ playbook_dir }}/roles/slurm_telemetry/files/update_service_tags.yml"
-  tags: slurm_telemetry
+- name: Get node details
+  hosts: manager, compute
+  gather_facts: false
+  tasks:
+    - name: Get service tag
+      include_role:
+        name: slurm_telemetry
+        tasks_from: update_service_tags.yml
+      tags: slurm_telemetry
 
 - name: Slurm Telemetry
   hosts: localhost
@@ -43,4 +49,4 @@
   gather_facts: false
   roles:
    - slurm_telemetry
-  tags: slurm_telemetry
+  tags: slurm_telemetry
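
With update_service_tags.yml moved from an imported playbook into role task files, the whole chain still runs under a single tag, e.g. ansible-playbook telemetry.yml -i inventory --tags slurm_telemetry (assuming the inventory defines the manager and compute groups used above).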