
Merge branch 'devel' into all-contributors/add-abhishek-sa1

Sujit Jadhav, 3 years ago
parent commit c2a18b69f2
44 changed files with 1225 additions and 388 deletions
  1. + 3 - 0     control_plane/roles/control_plane_ib/files/Dockerfile
  2. + 1 - 11    control_plane/roles/control_plane_ib/files/infiniband_container_configure.yml
  3. + 3 - 3     control_plane/roles/control_plane_ib/files/k8s_infiniband.yml
  4. + 1 - 1     control_plane/roles/control_plane_ib/files/start.sh
  5. + 68 - 0    control_plane/roles/control_plane_security/files/temp_sssd.conf
  6. + 61 - 9    control_plane/roles/control_plane_security/tasks/install_389ds.yml
  7. + 7 - 0     control_plane/roles/control_plane_security/vars/main.yml
  8. + 8 - 8     control_plane/roles/provision_idrac/tasks/check_prerequisites.yml
  9. + 2 - 0     control_plane/roles/webui_awx/files/requirements.yml
 10. + 3 - 0     docs/FAQ.md
 11. + 6 - 4     docs/INSTALL_OMNIA.md
 12. + 78 - 38   docs/INSTALL_OMNIA_CONTROL_PLANE.md
 13. + 10 - 7    docs/README.md
 14. + 25 - 0    docs/Security/ENABLE_SECURITY_LOGIN_NODE.md
 15. + 85 - 0    docs/Security/ENABLE_SECURITY_MANAGEMENT_STATION.md
 16. + 0 - 27    docs/Security/Enable_Security_LoginNode.md
 17. + 0 - 79    docs/Security/Enable_Security_ManagementStation.md
 18. + 8 - 7     docs/Telemetry_Visualization/Visualization.md
 19. + 1 - 1     docs/control_plane/device_templates/PROVISION_SERVERS.md
 20. + 1 - 1     docs/control_plane/input_parameters/PROVISION_SERVERS.md
 21. + 30 - 0    roles/cluster_validation/tasks/install_packages.yml
 22. + 11 - 1    roles/cluster_validation/tasks/main.yml
 23. + 5 - 1     roles/cluster_validation/vars/main.yml
 24. + 4 - 4     roles/common/vars/main.yml
 25. + 3 - 3     roles/k8s_start_services/tasks/deploy_k8s_services.yml
 26. + 68 - 0    roles/login_node/files/temp_sssd.conf
 27. + 61 - 7    roles/login_node/tasks/install_389ds.yml
 28. + 7 - 0     roles/login_node/vars/main.yml
 29. + 3 - 3     roles/slurm_exporter/tasks/install_prometheus.yml
 30. + 6 - 0     roles/slurm_manager/tasks/main.yml
 31. + 305 - 0   telemetry/roles/grafana_config/files/SpiralLayout.json
 32. + 12 - 1    telemetry/roles/grafana_config/tasks/add_dashboards.yml
 33. + 4 - 1     telemetry/roles/grafana_config/vars/main.yml
 34. + 7 - 0     telemetry/roles/slurm_telemetry/files/Dockerfile
 35. + 6 - 0     telemetry/roles/slurm_telemetry/files/init_k8s_pod_local.sh
 36. + 33 - 0    telemetry/roles/slurm_telemetry/files/k8s_slurm_telemetry.yml
 37. + 0 - 90    telemetry/roles/slurm_telemetry/files/update_service_tags.yml
 38. + 51 - 0    telemetry/roles/slurm_telemetry/tasks/deploy_slurm_telemetry.yml
 39. + 75 - 63   telemetry/roles/slurm_telemetry/tasks/get_node_inventory.yml
 40. + 14 - 7    telemetry/roles/slurm_telemetry/tasks/main.yml
 41. + 118 - 0   telemetry/roles/slurm_telemetry/tasks/update_service_tags.yml
 42. + 6 - 6     telemetry/roles/slurm_telemetry/tasks/update_timescaledb_details.yml
 43. + 15 - 1    telemetry/roles/slurm_telemetry/vars/main.yml
 44. + 10 - 4    telemetry/telemetry.yml

+ 3 - 0
control_plane/roles/control_plane_ib/files/Dockerfile

@@ -1,5 +1,6 @@
 FROM rockylinux/rockylinux:docker_os
 
+RUN dnf -y update && dnf clean all
 RUN dnf install -y epel-release
 RUN dnf install dhcp-server -y \
   ansible \
@@ -20,5 +21,7 @@ COPY opensm.conf /etc/rdma/opensm.conf
 COPY start.sh /
 
 RUN systemctl enable dhcpd
+RUN chmod +x /start.sh
 
+ENTRYPOINT ["/start.sh"]
 CMD ["sbin/init"]

+ 1 - 11
control_plane/roles/control_plane_ib/files/infiniband_container_configure.yml

@@ -1,4 +1,4 @@
-# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -21,13 +21,3 @@
   - name: Start dhcpd services
     command: dhcpd {{ ib_nic }}
     changed_when: false
-
-  - name: Change mode
-    command: chmod 777 /start.sh
-    changed_when: false
-
-  - name: Run shell
-    shell: ./start.sh
-    args:
-      chdir: /
-    changed_when: false

+ 3 - 3
control_plane/roles/control_plane_ib/files/k8s_infiniband.yml

@@ -35,8 +35,8 @@ spec:
         - name: infiniband-container
           image: 'localhost/infiniband-container:latest'
           imagePullPolicy: Never
-          command:
-            - /sbin/init
+          command: [ "/start.sh" ]
+          args: [ "/sbin/init" ]
           volumeMounts:
             - name: omnia-storage
               mountPath: /root/omnia
@@ -52,4 +52,4 @@ spec:
             capabilities:
               add:
                 - NET_RAW
-            privileged: false
+            privileged: true
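For context on this hunk: in a Kubernetes pod spec, `command` replaces the image ENTRYPOINT and `args` replaces the image CMD, so the container now executes `/start.sh /sbin/init` rather than relying on the Dockerfile's `ENTRYPOINT ["/start.sh"]` / `CMD ["sbin/init"]` pair added above. A minimal sketch of the same pattern, with a hypothetical pod name:

```yaml
# Hypothetical standalone pod illustrating command/args semantics;
# not part of this commit.
apiVersion: v1
kind: Pod
metadata:
  name: entrypoint-demo              # hypothetical name
spec:
  containers:
    - name: demo
      image: 'localhost/infiniband-container:latest'
      imagePullPolicy: Never
      command: [ "/start.sh" ]       # overrides the image ENTRYPOINT
      args: [ "/sbin/init" ]         # overrides the image CMD; /start.sh receives it as $1
```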

+ 1 - 1
control_plane/roles/control_plane_ib/files/start.sh

@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 
 /usr/libexec/rdma-init-kernel
 

+ 68 - 0
control_plane/roles/control_plane_security/files/temp_sssd.conf

@@ -0,0 +1,68 @@
+#
+# sssd.conf
+# Generated by 389 Directory Server - dsidm
+#
+# For more details see man sssd.conf and man sssd-ldap
+# Be sure to review the content of this file to ensure it is secure and correct
+# in your environment.
+
+[domain/ldap]
+# Uncomment this for more verbose logging.
+# debug_level=3
+
+# Cache hashes of user authentication for offline auth.
+cache_credentials = True
+id_provider = ldap
+auth_provider = ldap
+access_provider = ldap
+chpass_provider = ldap
+ldap_schema = rfc2307
+ldap_search_base = dc=omnia,dc=test
+ldap_uri = ldapi://%2fvar%2frun%2fslapd-ldap1.socket
+# If you have DNS SRV records, you can use the following instead. This derives
+# from your ldap_search_base.
+# ldap_uri = _srv_
+
+ldap_tls_reqcert = demand
+# To use cacert dir, place *.crt files in this path then run:
+# /usr/bin/openssl rehash /etc/openldap/certs
+# or (for older versions of openssl)
+# /usr/bin/c_rehash /etc/openldap/certs
+ldap_tls_cacertdir = /etc/openldap/certs
+
+# Path to the cacert
+# ldap_tls_cacert = /etc/openldap/certs/ca.crt
+
+# Only users who match this filter can login and authorise to this machine. Note
+# that users who do NOT match, will still have their uid/gid resolve, but they
+# can't login.
+ldap_access_filter = (memberOf=cn=server_admins,ou=groups,dc=omnia,dc=test)
+
+enumerate = false
+access_provider = ldap
+ldap_user_member_of = memberof
+ldap_user_gecos = cn
+ldap_user_uuid = nsUniqueId
+ldap_group_uuid = nsUniqueId
+# This is really important as it allows SSSD to respect nsAccountLock
+ldap_account_expire_policy = rhds
+ldap_access_order = filter, expire
+# Setup for ssh keys
+# Inside /etc/ssh/sshd_config add the lines:
+#   AuthorizedKeysCommand /usr/bin/sss_ssh_authorizedkeys
+#   AuthorizedKeysCommandUser nobody
+# You can test with the command: sss_ssh_authorizedkeys <username>
+ldap_user_ssh_public_key = nsSshPublicKey
+
+# This prevents an issue where the Directory is recursively walked on group
+# and user look ups. It makes the client faster and more responsive in almost
+# every scenario.
+ignore_group_members = False
+
+[sssd]
+services = nss, pam, ssh, sudo
+config_file_version = 2
+
+domains = ldap
+[nss]
+homedir_substring = /home

+ 61 - 9
control_plane/roles/control_plane_security/tasks/install_389ds.yml

@@ -39,13 +39,18 @@
   failed_when: false
   no_log: true
   register: ds389_status_authentication
- 
+
+- name: Gathering service facts
+  service_facts:
+
 - name: Modify ds389_status
   set_fact:
     ds389_status: true
   when: 
     - ds389_status_authentication.rc == 0
     - ds389_pwpolicy_search_key in ds389_pwpolicy_check.stdout
+    - "'sssd.service' in ansible_facts.services"
+    - sssd_install_search_key in ansible_facts.services['sssd.service'].state
 
 - block:
     - name: Install 389-ds
@@ -53,17 +58,25 @@
         name: "{{ ds389_packages }}"
         state: present 
 
+    - name: Check ldap instance is running or not
+      command: dsctl {{ ldap_instance }} status
+      changed_when: false
+      failed_when: false
+      register: ldap1_status
+
     - name: Create the ldap1.inf file
       copy:
         src: "{{ role_path }}/files/temp_ldap1.inf"
         dest: "{{ ldap1_config_path }}"
-        mode: "{{ file_mode }}"       
+        mode: "{{ file_mode }}"
+      when: ldap1_search_key in ldap1_status.stdout       
 
     - name: Configure ldap1.inf with domain name
       lineinfile:
         path: "{{ ldap1_config_path }}"
         regexp: "^suffix = dc=omnia,dc=test"
         line: "suffix = dc={{ domain_name.split('.')[0] }},dc={{ domain_name.split('.')[1] }}"
+      when: ldap1_search_key in ldap1_status.stdout
 
     - name: Configure ldap1.inf with directory manager password
       lineinfile:
@@ -71,12 +84,7 @@
         regexp: "^root_password = password"
         line: "root_password = {{ ms_directory_manager_password }}"
       no_log: true
-
-    - name: Check ldap instance is running or not
-      command: dsctl {{ ldap_instance }} status
-      changed_when: false
-      failed_when: false
-      register: ldap1_status
+      when: ldap1_search_key in ldap1_status.stdout
 
     - name: Creating 389 directory server instance
       shell: dscreate -v from-file {{ ldap1_config_path }} | tee {{ ldap1_output_path }}
@@ -132,24 +140,28 @@
         src: "{{ role_path }}/files/temp_krb5.conf"
         dest: "{{ kerberos_conf_path }}"
         mode: "{{ file_mode }}"
+      when: not principal_status.stat.exists      
 
     - name: Configure kerberos conf file with domain name
       replace:
         path: "{{ kerberos_conf_path }}"
         regexp: "omnia.test"
         replace: "{{ domain_name }}"
+      when: not principal_status.stat.exists
 
     - name: Configure kerberos conf file with realm name
       replace:
         path: "{{ kerberos_conf_path }}"
         regexp: "OMNIA.TEST"
         replace: "{{ realm_name }}"
+      when: not principal_status.stat.exists
 
     - name: Configure kerberos conf file with hostname
       replace:
         path: "{{ kerberos_conf_path }}"
         regexp: "hostname"
         replace: "{{ short_hostname.stdout }}"
+      when: not principal_status.stat.exists
 
     - block:
         - name: Setting up the kerberos database
@@ -191,7 +203,47 @@
       shell: set -o pipefail && echo {{ ms_kerberos_admin_password }} | kinit admin
       no_log: true
       changed_when: false
-    
+
+    - name: Install sssd packages
+      zypper:
+        name: "{{ sssd_packages }}"
+        state: present
+      
+    - name: Stop and disable nscd
+      systemd:
+        name: nscd
+        state: stopped
+        enabled: no
+      when: "'nscd.service' in ansible_facts.services"
+
+    - name: Check admin group in 389-ds
+      command: dsidm {{ ldap_instance }} group list
+      register: check_admin_group
+      changed_when: false
+
+    - name: Create admin group in 389-ds
+      shell: set -o pipefail && echo {{ admin_group_name }} |  dsidm {{ ldap_instance }} group create
+      changed_when: true
+      when: admin_group_name not in check_admin_group.stdout
+
+    - name: Create the sssd.conf file
+      copy:
+        src: "{{ role_path }}/files/temp_sssd.conf"
+        dest: "{{ sssd_config_path }}"
+        mode: "{{ sssd_file_mode }}"       
+
+    - name: Configure sssd.conf with domain name
+      replace:
+        path: "{{ sssd_config_path }}"
+        regexp: "dc=omnia,dc=test"
+        replace: "dc={{ domain_name.split('.')[0] }},dc={{ domain_name.split('.')[1] }}"
+
+    - name: Start sssd service
+      systemd:
+        name: sssd
+        state: started
+        enabled: yes
+
     - name: Configure password policy in 389-ds
       command: dsconf -w {{ ms_directory_manager_password }} -D "cn=Directory Manager" ldap://{{ server_hostname_ms }} pwpolicy set --pwdlockoutduration {{ lockout_duration }} --pwdmaxfailures {{ max_failures }} --pwdresetfailcount {{ failure_reset_interval }}
       no_log: true
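The `service_facts` gate added above works because the module populates `ansible_facts.services`, a dict keyed by unit name whose entries carry `state` and `status`. A minimal sketch of the same check in isolation (the fact name here is illustrative, not part of this commit):

```yaml
# Sketch of the service_facts pattern used in install_389ds.yml.
- name: Gathering service facts
  service_facts:

- name: Flag sssd as already running
  set_fact:
    sssd_running: true               # illustrative fact name
  when:
    - "'sssd.service' in ansible_facts.services"
    - ansible_facts.services['sssd.service'].state == 'running'
```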

+ 7 - 0
control_plane/roles/control_plane_security/vars/main.yml

@@ -86,6 +86,7 @@ ds389_packages:
   - python3-argcomplete
 ldap1_search_key: "No such instance"
 ds389_pwpolicy_search_key: "passwordlockoutduration: {{ lockout_duration }}"
+sssd_install_search_key: running
 ldap1_config_path: "{{ role_path }}/files/ldap1.inf"
 ldap_instance: ldap1
 ldap1_output_path: /var/log/ldap1_output.txt
@@ -100,6 +101,12 @@ kerberos_packages:
 kerberos_principal_path: /var/lib/kerberos/krb5kdc/principal
 kerberos_conf_path: /etc/krb5.conf
 kerberos_env_path: /usr/lib/mit/sbin/
+sssd_packages:
+  - sssd
+  - sssd-ldap
+admin_group_name: server_admins
+sssd_file_mode: 0600
+sssd_config_path: /etc/sssd/sssd.conf
 
 # Usage: restrict_nonessentials.yml
 service_status: ['enabled','alias','static','indirect','enabled-runtime','active','inactive']

+ 8 - 8
control_plane/roles/provision_idrac/tasks/check_prerequisites.yml

@@ -1,4 +1,4 @@
-# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -174,9 +174,9 @@
             idrac_license_name: "{{ idrac_info.system_info.License[my_idx1].LicenseDescription }}"
           with_items: "{{ idrac_info.system_info.License }}"
           when:
-            - '"iDRAC" in idrac_info.system_info.License[my_idx1].LicenseDescription'
-            - '"Enterprise" in idrac_info.system_info.License[my_idx1].LicenseDescription'
-            - '"License" in idrac_info.system_info.License[my_idx1].LicenseDescription'
+            - '"idrac" in idrac_info.system_info.License[my_idx1].LicenseDescription | lower'
+            - '"enterprise" in idrac_info.system_info.License[my_idx1].LicenseDescription | lower'
+            - '"license" in idrac_info.system_info.License[my_idx1].LicenseDescription | lower'
             - '"Healthy" in idrac_info.system_info.License[my_idx1].PrimaryStatus'
           loop_control:
             index_var: my_idx1
@@ -184,12 +184,12 @@
         - name: Set datacenter license status
           set_fact:
             datacenter_license: true
-            idrac_license_name: "{{ idrac_info.system_info.License[my_idx1].LicenseDescription }}"
+            idrac_license_name: "{{ idrac_info.system_info.License[my_idx2].LicenseDescription }}"
           with_items: "{{ idrac_info.system_info.License }}"
           when:
-            - '"iDRAC" in idrac_info.system_info.License[my_idx2].LicenseDescription'
-            - '"Datacenter" in idrac_info.system_info.License[my_idx2].LicenseDescription'
-            - '"License" in idrac_info.system_info.License[my_idx2].LicenseDescription'
+            - '"idrac" in idrac_info.system_info.License[my_idx2].LicenseDescription | lower'
+            - '"data" in idrac_info.system_info.License[my_idx2].LicenseDescription | lower'
+            - '"license" in idrac_info.system_info.License[my_idx2].LicenseDescription | lower'
             - '"Healthy" in idrac_info.system_info.License[my_idx2].PrimaryStatus'
           loop_control:
             index_var: my_idx2

+ 2 - 0
control_plane/roles/webui_awx/files/requirements.yml

@@ -10,3 +10,5 @@ collections:
     version: 2.2.3
   - name: community.grafana
     version: 1.3.0
+  - name: ansible.utils
+    version: 2.5.2

+ 3 - 0
docs/FAQ.md

@@ -9,6 +9,9 @@ Potential Causes:
 Resolution:  
 Wait for AWX UI to be accessible at http://\<management-station-IP>:8081, and then run the `control_plane.yml` file again, where __management-station-IP__ is the IP address of the management node.
 
+## Why does Omnia Control Plane fail at Task: `control_plane_common: Assert Value of idrac_support if mngmt_network container needed`?
+When `device_config_support` is set to true, `idrac_support` also needs to be set to true. 
+
 ## What to do if the nodes in a Kubernetes cluster reboot:
 Wait for 15 minutes after the Kubernetes cluster reboots. Next, verify the status of the cluster using the following commands:
 * `kubectl get nodes` on the manager node to get the real-time k8s cluster status.  

+ 6 - 4
docs/INSTALL_OMNIA.md

@@ -194,11 +194,13 @@ The following __Slurm__ roles are provided by Omnia when __omnia.yml__ file is r
 To enable the login node, the *login_node_required* variable must be set to "true" in the *omnia_config.yml* file.  
 - **login_common** role: The firewall ports are opened on the manager and login nodes.  
 - **login_server** role: FreeIPA server is installed and configured on the manager node to provide authentication using LDAP and Kerberos principles.  
-- **login_node** role: FreeIPA client is installed and configured on the login node and is integrated with the server running on the manager node.  
+- **login_node** role: For Rocky, FreeIPA client is installed and configured on the login node and is integrated with the server running on the manager node. For LeapOS, 389ds will be installed instead.
 
-**NOTE**: To skip the installation of:
-* The login node-In the `omnia_config.yml` file, set the *login_node_required* variable to "false".  
-* The FreeIPA server and client: Use `--skip-tags freeipa` while executing the *omnia.yml* file. 
+>>__Note:__ If LeapOS is being deployed, login_common and login_server roles will be skipped.  
+
+>> **NOTE**: To skip the installation of:
+>> * The login node-In the `omnia_config.yml` file, set the *login_node_required* variable to "false".  
+>> * The FreeIPA server and client: Use `--skip-tags freeipa` while executing the *omnia.yml* file. 
 
 ### Installing JupyterHub and Kubeflow playbooks  
 If you want to install JupyterHub and Kubeflow playbooks, you have to first install the JupyterHub playbook and then install the Kubeflow playbook.

File diff suppressed because it is too large
+ 78 - 38
docs/INSTALL_OMNIA_CONTROL_PLANE.md


+ 10 - 7
docs/README.md

@@ -54,7 +54,7 @@ Requirements  |   Version
 OS pre-installed on the management station  |  Rocky 8.x/ Leap 15.x
 OS deployed by Omnia on bare-metal Dell EMC PowerEdge Servers | Rocky 8.x Minimal Edition/ Leap 15.x
 Cobbler  |  3.2.2
-Ansible AWX  |  19.4.0
+Ansible AWX  |  20.0.0
 Slurm Workload Manager  |  20.11.2
 Kubernetes on the management station  |  1.21.0
 Kubernetes on the manager and compute nodes	|	1.16.7 or 1.19.3
@@ -92,9 +92,9 @@ OpenSM	|	GNU General Public License 2	|	3.3.24	|	-
 NVIDIA container runtime	|	Apache-2.0	|	3.4.2	|	Nvidia container runtime library
 Python PIP	|	MIT License	|	21.1.2	|	Python Package
 Python3	|	-	|	3.6.8 (3.6.15 if LeapOS is being used)	|	-
-Kubelet	|	Apache-2.0	|	1.16.7,1.19, 1.21 (LeapOS only supports 1.21) 	|	Provides external, versioned ComponentConfig API types for configuring the kubelet
-Kubeadm	|	Apache-2.0	|	1.16.7,1.19, 1.21 (LeapOS only supports 1.21)	|	"fast paths" for creating Kubernetes clusters
-Kubectl	|	Apache-2.0	|	1.16.7,1.19, 1.21 (LeapOS only supports 1.21)	|	Command line tool for Kubernetes
+Kubelet	|	Apache-2.0	|	1.16.7,1.19, 1.21  	|	Provides external, versioned ComponentConfig API types for configuring the kubelet
+Kubeadm	|	Apache-2.0	|	1.16.7,1.19, 1.21 	|	"fast paths" for creating Kubernetes clusters
+Kubectl	|	Apache-2.0	|	1.16.7,1.19, 1.21 	|	Command line tool for Kubernetes
 JupyterHub	|	Modified BSD License	|	1.1.0	|	Multi-user hub
 kubernetes Controllers	|	Apache-2.0	|	1.16.7,1.19 (1.21 if LeapOS is being used)	|	Orchestration tool	
 Kfctl	|	Apache-2.0	|	1.0.2	|	CLI for deploying and managing Kubeflow
@@ -106,10 +106,10 @@ Horovod	|	Apache-2.0	|	0.21.1	|	Distributed deep learning training framework for
 MPI	|	Copyright (c) 2018-2019 Triad National Security,LLC. All rights reserved.	|	0.3.0	|	HPC library
 CoreDNS	|	Apache-2.0	|	1.6.2	|	DNS server that chains plugins
 CNI	|	Apache-2.0	|	0.3.1	|	Networking for Linux containers
-AWX	|	Apache-2.0	|	19.4.0	|	Web-based User Interface
+AWX	|	Apache-2.0	|	20.0.0	|	Web-based User Interface
 AWX.AWX	|	Apache-2.0	|	19.4.0	|	Galaxy collection to perform awx configuration
-AWXkit	|	Apache-2.0	|	to be updated	|	To perform configuration through CLI commands
-Cri-o	|	Apache-2.0	|	1.21	|	Container Service
+AWXkit	|	Apache-2.0	|	18.0.0	|	To perform configuration through CLI commands
+Cri-o	|	Apache-2.0	|	1.21, 1.17.3  (LeapOS only supports  1.17.3) |	Container Service
 Buildah	|	Apache-2.0	|	1.22.4	|	Tool to build and run containers
 PostgreSQL	|	Copyright (c) 1996-2020, PostgreSQL Global Development Group	|	10.15	|	Database Management System
 Redis	|	BSD-3-Clause License	|	6.0.10	|	In-memory database
@@ -123,6 +123,9 @@ OMSDK	|	Apache-2.0	|	1.2.488	|	Dell EMC OpenManage Python SDK (OMSDK) is a pytho
 | postfix                               | IBM Public License               | 3.5.8  | Mail Transfer Agent (MTA) designed to determine routes and   send emails                                                                       |
 | xorriso                               | GPL version 3                    | 1.4.8  | xorriso copies file objects from POSIX compliant filesystems   into Rock Ridge enhanced ISO 9660 filesystems.                                  |
 | Dell EMC   OpenManage Ansible Modules | GNU- General Public License v3.0 | 5.0.0  | OpenManage Ansible Modules simplifies and automates   provisioning, deployment, and updates of PowerEdge servers and modular   infrastructure. |
+| 389-ds                               | GPL version 3               | 1.4.4  |  LDAP server used for authentication, access control.                                                                       |
+| sssd                               | GPL version 3                    | 1.16.1  | A set of daemons used to manage access to remote directory services and authentication mechanisms.                                   |
+| krb5 | MIT License | 1.19.2  | Authentication protocol providing strong authentication for client/server applications by using secret-key cryptography |
 
 # Known issues  
 * **Issue**: Hosts are not displayed on the AWX UI.  

File diff suppressed because it is too large
+ 25 - 0
docs/Security/ENABLE_SECURITY_LOGIN_NODE.md


File diff suppressed because it is too large
+ 85 - 0
docs/Security/ENABLE_SECURITY_MANAGEMENT_STATION.md


+ 0 - 27
docs/Security/Enable_Security_LoginNode.md

@@ -1,27 +0,0 @@
-# Enabling Security on the Login Node 
-
-* Ensure that `enable_secure_login_node` is set to **true** in `omnia_config.yml`
-* Set the following parameters in `omnia_security_config.yml`
-
-|  Parameter Name        |  Default Value  |  Additional Information                                                                                                                                          |
-|------------------------|-----------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| max_failures           | 3               | Failures allowed before lockout. <br> This value cannot currently   be changed.                                                                                  |
-| failure_reset_interval | 60              | Period (in seconds) after which the number of failed login attempts is   reset <br> Accepted Values: 30-60                                                       |
-| lockout_duration       | 10              | Period (in seconds) for which users are locked out. <br> Accepted   Values: 5-10                                                                                 |
-| session_timeout        | 180             | Period (in seconds) after which idle users get logged out automatically   <br> Accepted Values: 30-90                                                            |
-| alert_email_address    |                 | Email address used for sending alerts in case of authentication failure. Currently, only one email ID is accepted in this field.   <br> If this variable is left blank, authentication failure alerts will   be disabled. |
-| allow_deny             | Allow           | This variable sets whether the user list is Allowed or Denied. <br>   Accepted Values: Allow, Deny                                                               |
-| user                   |                 | Array of users that are allowed or denied based on the `allow_deny`   value. Multiple users must be separated by a space.                                        |
-
-* Set the following parameters in `control_plane/input_params/security_vars.yml`
-
-|  Parameter Name        |  Default Value  |  Additional Information                                                                                                                                          |
-|------------------------|-----------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| allow_deny             | Allow           | This variable sets whether the user list is Allowed or Denied. <br>   Accepted Values: Allow, Deny                                                               |
-| user                   |                 | Array of users that are allowed or denied based on the `allow_deny`   value. Multiple users must be separated by a space.                                        |
-
-
-## Kernel Lockdown
-
-* RockyOS has Kernel Lockdown mode (Integrity) enabled by default
-* SUSE/Leap allows users to set Kernel Lockdown mode to Confidentiality or Integrity.

+ 0 - 79
docs/Security/Enable_Security_ManagementStation.md

@@ -1,79 +0,0 @@
-# Enabling Security on the Management Station
-
-Omnia uses FreeIPA on RockyOS to enable security features like authorisation and access control.
-
-## Enabling Authentication on the Management Station:
-
-Set the parameter 'enable_security_support' to true in `base_vars.yml`
-
-## Prerequisites Before Enabling Security:
-
-* Enter the relevant values in `login_vars.yml`:
-
-| Parameter Name             | Default Value | Additional Information                                                                           |
-|----------------------------|---------------|--------------------------------------------------------------------------------------------------|
-| ms_directory_manager_password |               | Password of the Directory Manager with full access to the directory for system management tasks. |
-| ms_kerberos_admin_password         |               | "admin" user password for the IPA server on RockyOS. If LeapOS is in use, it is used as the "kerberos admin" user password for 389-ds <br> This field is not relevant to Management Stations running `LeapOS`                                                         |
-
-
-
-* Enter the relevant values in `security_vars.yml:
-
-If `RockyOS` is in use on the Management Station:
-
-|  Parameter Name        |  Default Value  |  Additional Information                                                                                                                                                                                                                                                                                                                                      |
-|------------------------|-----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-|  domain_name           |  omnia.test     |  The domain name should not contain   an underscore ( _ )                                                                                                                                                                                                                                                                                                    |
-|  realm_name            |  OMNIA.TEST     |  The realm name should follow the   following rules per https://www.freeipa.org/page/Deployment_Recommendations   <br> * The realm name must not conflict with any other existing   Kerberos realm name (e.g. name used by Active Directory). <br> * The   realm name should be upper-case (EXAMPLE.COM) version of primary DNS domain   name (example.com). |
-| max_failures           | 3               | Failures allowed before lockout. <br> This value cannot currently   be changed.                                                                                                                                                                                                                                                                              |
-| failure_reset_interval | 60              | Period (in seconds) after which the number of failed login attempts is   reset <br> Accepted Values: 30-60                                                                                                                                                                                                                                                   |
-| lockout_duration       | 10              | Period (in seconds) for which users are locked out. <br> Accepted   Values: 5-10                                                                                                                                                                                                                                                                             |
-| session_timeout        | 180             | Period (in seconds) after which idle users get logged out automatically   <br> Accepted Values: 30-90                                                                                                                                                                                                                                                        |
-| alert_email_address    |                 | Email address used for sending alerts in case of authentication failure. Currently, only one email address is supported in this field.   <br> If this variable is left blank, authentication failure alerts will   be disabled.                                                                                                                                                                                             |
-| allow_deny             | Allow           | This variable sets whether the user list is Allowed or Denied. <br>   Accepted Values: Allow, Deny                                                                                                                                                                                                                                                           |
-| user                   |                 | Array of users that are allowed or denied based on the `allow_deny`   value. Multiple users must be separated by a space.                                                                                                                                                                                                                                    |
-
-
-## Log Aggregation via Grafana
-
-[Loki](https://grafana.com/docs/loki/latest/fundamentals/overview/) is a datastore used to efficiently hold log data for security purposes. Using the `promtail` agent, logs are collated and streamed via a HTTP API.
-
->> __Note:__ When `control_plane.yml` is run, Loki is automatically set up as a data source on the Grafana UI.
-
-
-
-### Querying Loki 
-
-Loki uses basic regex based syntax to filter for specific jobs, dates or timestamps.
-
-* Select the Explore ![Explore Icon](../Telemetry_Visualization/Images/ExploreIcon.PNG) tab to select control-plane-loki from the drop down.
-* Using [LogQL queries](https://grafana.com/docs/loki/latest/logql/log_queries/), all logs in `/var/log` can be accessed using filters (Eg: `{job=”Omnia”}` )
-
-## Viewing Logs on the Dashboard
-
-All log files can be viewed via the Dashboard tab (![Dashboard Icon](../Telemetry_Visualization/Images/DashBoardIcon.PNG)). The Default Dashboard displays `omnia.log` and `syslog`. Custom dashboards can be created per user requirements.
-
-Below is a list of all logs available to Loki and can be accessed on the dashboard:
-
-| Name               | Location                                  | Purpose                      | Additional Information                                                                             |
-|--------------------|-------------------------------------------|------------------------------|----------------------------------------------------------------------------------------------------|
-| Omnia Logs         | /var/log/omnia.log                        | Omnia Log                    | This log is configured by Default                                                                  |
-| syslogs            | /var/log/messages                         | System Logging               | This log is configured by Default                                                                  |
-| Audit Logs         | /var/log/audit/audit.log                  | All Login Attempts           | This log is configured by Default                                                                  |
-| CRON logs          | /var/log/cron                             | CRON Job Logging             | This log is configured by Default                                                                  |
-| Pods logs          | /var/log/pods/ * / * / * log                    | k8s pods                     | This log is configured by Default                                                                  |
-| Access Logs        | /var/log/dirsrv/slapd-<Realm Name>/access | Directory Server Utilization | This log is available when FreeIPA is set up ( ie when   enable_security_support is set to 'true') |
-| Error Log          | /var/log/dirsrv/slapd-<Realm Name>/errors | Directory Server Errors      | This log is available when FreeIPA is set up ( ie when   enable_security_support is set to 'true') |
-| CA Transaction Log | /var/log/pki/pki-tomcat/ca/transactions   | FreeIPA PKI Transactions     | This log is available when FreeIPA is set up ( ie when   enable_security_support is set to 'true') |
-| KRB5KDC            | /var/log/krb5kdc.log                      | KDC Utilization              | This log is available when FreeIPA is set up ( ie when   enable_security_support is set to 'true') |
-| Secure logs        | /var/log/secure                           | Login Error Codes            | This log is available when FreeIPA is set up ( ie when   enable_security_support is set to 'true') |
-| HTTPD logs         | /var/log/httpd/*                          | FreeIPA API Call             | This log is available when FreeIPA is set up ( ie when   enable_security_support is set to 'true') |
-| DNF logs           | /var/log/dnf.log                          | Installation Logs            | This log is configured on Rocky OS                                                                 |
-| Zypper Logs        | /var/log/zypper.log                       | Installation Logs            | This log is configured on Leap OS                                                                  |
-
-
-
-
-
-
-

+ 8 - 7
docs/Telemetry_Visualization/Visualization.md

@@ -11,17 +11,17 @@ A lot of these metrics are collected using iDRAC telemetry. iDRAC telemetry allo
 
 | Parameter Name        | Default Value | Information |
 |-----------------------|---------------|-------------|
-| timescaledb_user      | 		        |  Username used for connecting to timescale db. Minimum Legth: 2 characters.          |
-| timescaledb_password  | 		        |  Password used for connecting to timescale db. Minimum Legth: 2 characters.           |
-| mysqldb_user          | 		        |  Username used for connecting to mysql db. Minimum Legth: 2 characters.         |
-| mysqldb_password      | 		        |  Password used for connecting to mysql db. Minimum Legth: 2 characters.            |
+| timescaledb_user      | 		        |  Username used for connecting to timescale db. Minimum Length: 2 characters.          |
+| timescaledb_password  | 		        |  Password used for connecting to timescale db. Minimum Length: 2 characters.           |
+| mysqldb_user          | 		        |  Username used for connecting to mysql db. Minimum Length: 2 characters.         |
+| mysqldb_password      | 		        |  Password used for connecting to mysql db. Minimum Length: 2 characters.            |
 | mysqldb_root_password | 		        |  Password used for connecting to mysql db for root user. Minimum Legth: 2 characters.         |
 
 3. All parameters in `telemetry/input_params/base_vars.yml` need to be filled in:
 
 | Parameter Name          | Default Value     | Information |
 |-------------------------|-------------------|-------------|
-| mount_location          | idrac_telemetrysource_services_db | Sets the location all telemetry related files will be stored and both timescale and mysql databases will be mounted.            |
+| mount_location          | /opt/omnia| Sets the location all telemetry related files will be stored and both timescale and mysql databases will be mounted.            |
 | idrac_telemetry_support | true              | This variable is used to enable iDRAC telemetry support and visualizations. Accepted Values: true/false            |
 | slurm_telemetry_support | true              | This variable is used to enable slurm telemetry support and visualizations. Slurm Telemetry support can only be activated when idrac_telemetry_support is set to true. Accepted Values: True/False.        |
 | timescaledb_name        | telemetry_metrics | Postgres DB with timescale extension is used for storing iDRAC and slurm telemetry metrics.            |
@@ -50,7 +50,7 @@ Use any one of the following browsers to access the Grafana UI (https://< Grafan
 
 ## Initiating Telemetry
 
-1. Once `control_plane.yml` and `telemetry.yml` are executed, run the following commands from `omnia/telemetry`:
+1. Once `control_plane.yml` and `omnia.yml` are executed, run the following commands from `omnia/telemetry`:
 
 `ansible-playbook telemetry.yml`
 
@@ -60,7 +60,8 @@ Use any one of the following browsers to access the Grafana UI (https://< Grafan
 After initiation, new nodes can be added to telemetry by running the following commands from `omnia/telemetry`:
 		
 ` ansible-playbook add_idrac_node.yml `
-		
+
+	
 
 
 

+ 1 - 1
docs/control_plane/device_templates/PROVISION_SERVERS.md

@@ -13,7 +13,7 @@ Edit the following files under the `control_plane/input_params` directory to pro
 	File name	|	Variables</br> [Required/ Optional]	|	Default, choices	|	Description
 	-------	|	----------------	|	-----------------	|	-----------------
 	idrac_vars.yml	|	idrac_system_profile</br> [Required]	|	<ul><li>**Performance**</li> <li>PerformancePerWatt(DAPC)</li> <li>PerformancePerWatt(OS)</li> <li>WorkstationPerformance</li></ul>	|	The system profile used for BIOS configuration. 
-	<br>	|	firmware_update_required</br> [Required]	|	<ul><li>**true**</li> <li>false</li></ul>	|	By default, Omnia updates the firmware on the servers. To disable the firmware update, set the variable to "false".
+	<br>	|	firmware_update_required</br> [Required]	|	<ul><li>**false**</li> <li>true</li></ul>	|	By default, Omnia updates the firmware on the servers. To disable the firmware update, set the variable to "false".
 	<br>	|	poweredge_model</br> [Required if "firmware_update_required" is set to "true"]	|	<ul><li>**C6420**</li> <li>R640</li><li>R740</li><li>C4140</li> <li>And other supported PowerEdge servers</li></ul>	|	Enter the required PowerEdge server models to update the firmware. For example, enter `R640,R740,C4140` to update firmware on these models of PowerEdge servers. For a complete list of supported PowerEdge servers, see the *Hardware managed by Omnia* section in the Readme file.
 	<br>	|	uefi_secure_boot</br> [Optional]	|	<ul><li>**disabled**</li> <li>enabled</li></ul>	|	Option to enable or disable the secure boot mode.
 	<br>	|	system_lockdown</br> [Optional]	|	<ul><li>**disabled**</li> <li>enabled</li></ul>	|	Option to enable or disable system lockdown.

+ 1 - 1
docs/control_plane/input_parameters/PROVISION_SERVERS.md

@@ -13,7 +13,7 @@ Edit the following files under the `control_plane/input_params` directory to pro
 	File name	|	Variables</br> [Required/ Optional]	|	Default, choices	|	Description
 	-------	|	----------------	|	-----------------	|	-----------------
 	idrac_vars.yml	|	idrac_system_profile</br> [Required]	|	<ul><li>**Performance**</li> <li>PerformancePerWatt(DAPC)</li> <li>PerformancePerWatt(OS)</li> <li>WorkstationPerformance</li></ul>	|	The system profile used for BIOS configuration. 
-	<br>	|	firmware_update_required</br> [Required]	|	<ul><li>**true**</li> <li>false</li></ul>	|	By default, Omnia updates the firmware on the servers. To disable the firmware update, set the variable to "false".
+	<br>	|	firmware_update_required</br> [Required]	|	<ul><li>**false**</li> <li>true</li></ul>	|	By default, Omnia updates the firmware on the servers. To disable the firmware update, set the variable to "false".
 	<br>	|	poweredge_model</br> [Required if "firmware_update_required" is set to "true"]	|	<ul><li>**C6420**</li> <li>R640</li><li>R740</li><li>C4140</li> <li>And other supported PowerEdge servers</li></ul>	|	Enter the required PowerEdge server models to update the firmware. For example, enter `R640,R740,C4140` to update firmware on these models of PowerEdge servers. For a complete list of supported PowerEdge servers, see the *Hardware managed by Omnia* section in the Readme file.
 	<br>	|	uefi_secure_boot</br> [Optional]	|	<ul><li>**disabled**</li> <li>enabled</li></ul>	|	Option to enable or disable the secure boot mode.
 	<br>	|	system_lockdown</br> [Optional]	|	<ul><li>**disabled**</li> <li>enabled</li></ul>	|	Option to enable or disable system lockdown.

+ 30 - 0
roles/cluster_validation/tasks/install_packages.yml

@@ -0,0 +1,30 @@
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Set fact for ansible version
+  set_fact:
+    ansible_collection_used: true
+  when: "ansible_version.full is version_compare(ansible_base_version, '>')"
+
+- name: Install netaddr
+  pip:
+    name: netaddr
+    state: present
+    executable: pip3
+
+- name: Install ansible galaxy collection ansible.utils
+  command: ansible-galaxy collection install "{{ ipaddr_collection }}"
+  changed_when: false
+  when: ansible_collection_used
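The guard uses `version_compare` (an older alias of Ansible's `version` test), so the collection is installed only on ansible-core releases newer than 2.9, where the `ipaddr` filter is no longer bundled and must come from `ansible.utils`. A quick illustrative check (hypothetical task, not in the commit):

```yaml
# Sketch: evaluate the same version guard on its own.
- name: Show whether the ansible.utils collection path applies
  debug:
    msg: "needs collection: {{ ansible_version.full is version_compare('2.9', '>') }}"
```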

+ 11 - 1
roles/cluster_validation/tasks/main.yml

@@ -27,6 +27,7 @@
     control_plane_status: false
     powervault_status: false
     nfs_node_status: false
+    ansible_collection_used: false
 
 - name: Check AWX instance
   command: awx --version
@@ -46,6 +47,15 @@
     - not awx_version_check.failed
     - awx_search_key in awx_hostname.stdout
 
+- name: Install Packages
+  include_tasks: install_packages.yml
+  when: not control_plane_status
+
+- name: Set ansible_collection_used to true in awx
+  set_fact:
+    ansible_collection_used: true
+  when: control_plane_status
+
 - name: Set NFS node status
   set_fact:
     nfs_node_status: true
@@ -90,4 +100,4 @@
         regexp: '#log_path = /var/log/ansible.log'
         replace: 'log_path = /var/log/omnia.log'
       when: ansible_conf_exists.stat.exists
-  when: not control_plane_status
+  when: not control_plane_status

+ 5 - 1
roles/cluster_validation/vars/main.yml

@@ -99,4 +99,8 @@ allow_deny_fail_msg: "Failed. Incorrect Access format in security_vars.yml"
 restrict_program_support_success_msg: "restrict_program_support successfully validated"
 restrict_program_support_failure_msg: "Failed. Accepted values are true or false."
 restrict_softwares_success_msg: "restrict_softwares successfully validated"
-restrict_softwares_failure_msg: "Warning. Values should be comma separated. The supported services are telnet, lpd, bluetooth, rlogin, rexec. Please check restrict_softwares variable"
+restrict_softwares_failure_msg: "Warning. Values should be comma separated. The supported services are telnet, lpd, bluetooth, rlogin, rexec. Please check restrict_softwares variable"
+
+# Usage: install_packages.yml
+ansible_base_version: '2.9'
+ipaddr_collection: ansible.utils:2.5.2

+ 4 - 4
roles/common/vars/main.yml

@@ -14,10 +14,10 @@
 ---
 
 leap_repo:
-  - { name: repo-non-oss, repo: http://download.opensuse.org/distribution/leap/15.3/repo/non-oss/ }
-  - { name: repo-oss, repo: http://download.opensuse.org/distribution/leap/15.3/repo/oss/ }
-  - { name: repo-update-oss, repo: http://download.opensuse.org/update/leap/15.3/oss/ }
-  - { name: repo-update-non-oss, repo: http://download.opensuse.org/update/leap/15.3/non-oss/ }
+  - { name: repo-non-oss, repo: "http://download.opensuse.org/distribution/leap/15.3/repo/non-oss/" }
+  - { name: repo-oss, repo: "http://download.opensuse.org/distribution/leap/15.3/repo/oss/" }
+  - { name: repo-update-oss, repo: "http://download.opensuse.org/update/leap/15.3/oss/" }
+  - { name: repo-update-non-oss, repo: "http://download.opensuse.org/update/leap/15.3/non-oss/" }
 
 nvidia_repo: https://download.nvidia.com/opensuse/leap/15.3/
 docker_repo_url_leap: https://download.docker.com/linux/sles/docker-ce.repo

+ 3 - 3
roles/k8s_start_services/tasks/deploy_k8s_services.yml

@@ -1,4 +1,4 @@
-#  Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -109,7 +109,7 @@
   tags: init
 
 - name: Start NFS Client Provisioner using NFS on manager node
-  command: "helm install stable/nfs-client-provisioner --set nfs.server='{{ nfs_server_manager_node }}' --set nfs.path='{{ nfs_share_dir }}' --generate-name"
+  command: "helm install nfs-omnia stable/nfs-client-provisioner --set nfs.server='{{ nfs_server_manager_node }}' --set nfs.path='{{ nfs_share_dir }}'"
   changed_when: true
   when:
     - "'nfs-client-provisioner' not in k8s_pods.stdout"
@@ -117,7 +117,7 @@
   tags: init
 
 - name: Start NFS Client Provisioner using NFS on NFS Node
-  command: "helm install stable/nfs-client-provisioner --set nfs.server='{{ nfs_server_nfs_node }}' --set nfs.path='{{ me4_nfs_share_k8s }}' --generate-name"
+  command: "helm install nfs-omnia stable/nfs-client-provisioner --set nfs.server='{{ nfs_server_nfs_node }}' --set nfs.path='{{ me4_nfs_share_k8s }}'"
   changed_when: true
   when:
     - "'nfs-client-provisioner' not in k8s_pods.stdout"

+ 68 - 0
roles/login_node/files/temp_sssd.conf

@@ -0,0 +1,68 @@
+#
+# sssd.conf
+# Generated by 389 Directory Server - dsidm
+#
+# For more details see man sssd.conf and man sssd-ldap
+# Be sure to review the content of this file to ensure it is secure and correct
+# in your environment.
+
+[domain/ldap]
+# Uncomment this for more verbose logging.
+# debug_level=3
+
+# Cache hashes of user authentication for offline auth.
+cache_credentials = True
+id_provider = ldap
+auth_provider = ldap
+access_provider = ldap
+chpass_provider = ldap
+ldap_schema = rfc2307
+ldap_search_base = dc=omnia,dc=test
+ldap_uri = ldapi://%2fvar%2frun%2fslapd-ldap1.socket
+# If you have DNS SRV records, you can use the following instead. This derives
+# from your ldap_search_base.
+# ldap_uri = _srv_
+
+ldap_tls_reqcert = demand
+# To use cacert dir, place *.crt files in this path then run:
+# /usr/bin/openssl rehash /etc/openldap/certs
+# or (for older versions of openssl)
+# /usr/bin/c_rehash /etc/openldap/certs
+ldap_tls_cacertdir = /etc/openldap/certs
+
+# Path to the cacert
+# ldap_tls_cacert = /etc/openldap/certs/ca.crt
+
+# Only users who match this filter can login and authorise to this machine. Note
+# that users who do NOT match, will still have their uid/gid resolve, but they
+# can't login.
+ldap_access_filter = (memberOf=cn=server_admins,ou=groups,dc=omnia,dc=test)
+
+enumerate = false
+access_provider = ldap
+ldap_user_member_of = memberof
+ldap_user_gecos = cn
+ldap_user_uuid = nsUniqueId
+ldap_group_uuid = nsUniqueId
+# This is really important as it allows SSSD to respect nsAccountLock
+ldap_account_expire_policy = rhds
+ldap_access_order = filter, expire
+# Setup for ssh keys
+# Inside /etc/ssh/sshd_config add the lines:
+#   AuthorizedKeysCommand /usr/bin/sss_ssh_authorizedkeys
+#   AuthorizedKeysCommandUser nobody
+# You can test with the command: sss_ssh_authorizedkeys <username>
+ldap_user_ssh_public_key = nsSshPublicKey
+
+# This prevents an issue where the Directory is recursively walked on group
+# and user look ups. It makes the client faster and more responsive in almost
+# every scenario.
+ignore_group_members = False
+
+[sssd]
+services = nss, pam, ssh, sudo
+config_file_version = 2
+
+domains = ldap
+[nss]
+homedir_substring = /home

+ 61 - 7
roles/login_node/tasks/install_389ds.yml

@@ -49,13 +49,18 @@
   failed_when: false
   no_log: true
   register: ds389_status_authentication
- 
+
+- name: Gathering service facts
+  service_facts:
+
 - name: Modify ds389_status
   set_fact:
     ds389_status: true
   when: 
     - ds389_status_authentication.rc == 0
     - ldap1_install_search_key in ldap1_status.stdout.split(' ')[3]
+    - "'sssd.service' in ansible_facts.services"
+    - sssd_install_search_key in ansible_facts.services['sssd.service'].state
 
 - block:
     - name: Install 389-ds
@@ -63,17 +68,25 @@
         name: "{{ ds389_packages }}"
         state: present 
 
+    - name: Check ldap instance is running or not
+      command: dsctl {{ ldap_instance }} status
+      changed_when: false
+      failed_when: false
+      register: ldap1_status
+
     - name: Create the ldap1.inf file
       copy:
         src: "{{ role_path }}/files/temp_ldap1.inf"
         dest: "{{ ldap1_config_path }}"
-        mode: "{{ file_mode }}"       
+        mode: "{{ file_mode }}"
+      when: ldap1_search_key in ldap1_status.stdout       
 
     - name: Configure ldap1.inf with domain name
       lineinfile:
         path: "{{ ldap1_config_path }}"
         regexp: "^suffix = dc=omnia,dc=test"
         line: "suffix = dc={{ domain_name.split('.')[0] }},dc={{ domain_name.split('.')[1] }}"
+      when: ldap1_search_key in ldap1_status.stdout
 
     - name: Configure ldap1.inf with directory manager password
       lineinfile:
@@ -81,12 +94,9 @@
         regexp: "^root_password = password"
         line: "root_password = {{ directory_manager_password }}"
       no_log: true
+      when: ldap1_search_key in ldap1_status.stdout
 
-    - name: Check ldap instance is running or not
-      command: dsctl {{ ldap_instance }} status
-      changed_when: false
-      failed_when: false
-      register: ldap1_status
+    
 
     - name: Creating 389 directory server instance
       shell: dscreate -v from-file {{ ldap1_config_path }} | tee {{ ldap1_output_path }}
@@ -154,24 +164,28 @@
         src: "{{ role_path }}/files/temp_krb5.conf"
         dest: "{{ kerberos_conf_path }}"
         mode: "{{ file_mode }}"
+      when: not principal_status.stat.exists
 
     - name: Configure kerberos conf file with domain name
       replace:
         path: "{{ kerberos_conf_path }}"
         regexp: "omnia.test"
         replace: "{{ domain_name }}"
+      when: not principal_status.stat.exists
 
     - name: Configure kerberos conf file with realm name
       replace:
         path: "{{ kerberos_conf_path }}"
         regexp: "OMNIA.TEST"
         replace: "{{ realm_name }}"
+      when: not principal_status.stat.exists
 
     - name: Configure kerberos conf file with hostname
       replace:
         path: "{{ kerberos_conf_path }}"
         regexp: "hostname"
         replace: "{{ server_hostname_short }}"
+      when: not principal_status.stat.exists
 
     - block:
         - name: Setting up the kerberos database
@@ -213,6 +227,46 @@
       shell: set -o pipefail && echo {{ kerberos_admin_password }} | kinit {{ ipa_admin_username }}
       no_log: true
       changed_when: false
+
+    - name: Install sssd packages
+      zypper:
+        name: "{{ sssd_packages }}"
+        state: present
+      
+    - name: Stop and disable nscd
+      systemd:
+        name: nscd
+        state: stopped
+        enabled: no
+      when: "'nscd.service' in ansible_facts.services"
+
+    - name: Check admin group in 389-ds
+      command: dsidm {{ ldap_instance }} group list
+      register: check_admin_group
+      changed_when: false
+
+    - name: Create admin group in 389-ds
+      shell: set -o pipefail && echo {{ admin_group_name }} |  dsidm {{ ldap_instance }} group create
+      changed_when: true
+      when: admin_group_name not in check_admin_group.stdout
+
+    - name: Create the sssd.conf file
+      copy:
+        src: "{{ role_path }}/files/temp_sssd.conf"
+        dest: "{{ sssd_config_path }}"
+        mode: "{{ sssd_file_mode }}"       
+
+    - name: Configure sssd.conf with domain name
+      replace:
+        path: "{{ sssd_config_path }}"
+        regexp: "dc=omnia,dc=test"
+        replace: "dc={{ domain_name.split('.')[0] }},dc={{ domain_name.split('.')[1] }}"
+
+    - name: Start sssd service
+      systemd:
+        name: sssd
+        state: started
+        enabled: yes
   when: not ds389_status
 
 - name: Configure password policy in 389-ds

+ 7 - 0
roles/login_node/vars/main.yml

@@ -72,6 +72,7 @@ ds389_packages:
 ldap1_search_key: "No such instance"
 ds389_pwpolicy_search_key: "passwordlockoutduration: {{ lockout_duration }}"
 ldap1_install_search_key: running
+sssd_install_search_key: running
 ldap1_config_path: "/root/ldap1.inf"
 ldap_instance: ldap1
 ldap1_output_path: /var/log/ldap1_output.txt
@@ -86,6 +87,12 @@ kerberos_packages:
 kerberos_principal_path: /var/lib/kerberos/krb5kdc/principal
 kerberos_conf_path: /etc/krb5.conf
 kerberos_env_path: /usr/lib/mit/sbin/
+sssd_packages:
+  - sssd
+  - sssd-ldap
+admin_group_name: server_admins
+sssd_file_mode: 0600
+sssd_config_path: /etc/sssd/sssd.conf
 
 # Usage: restrict_nonessentials.yml
 service_status: ['enabled','alias','static','indirect','enabled-runtime','active','inactive']

+ 3 - 3
roles/slurm_exporter/tasks/install_prometheus.yml

@@ -55,21 +55,21 @@
 
     - name: Configure nginx.conf (1/2)
       replace:
-        path: "../../k8s_start_services/templates/nginx.conf.j2"
+        path: "{{ playbook_dir }}/roles/k8s_start_services/templates/nginx.conf.j2"
         regexp: '        server_name  .*'
         replace: "        server_name  {{ ansible_default_ipv4.address }};"
       delegate_to: localhost
 
     - name: Configure nginx.conf (2/2)
       replace:
-        path: "../../k8s_start_services/templates/nginx.conf.j2"
+        path: "{{ playbook_dir }}/roles/k8s_start_services/templates/nginx.conf.j2"
         regexp: '          proxy_pass http://.*'
         replace: "          proxy_pass {{ prometheus_ip }};"
       delegate_to: localhost
 
     - name: Load nginx conf
       template:
-        src: "../../k8s_start_services/templates/nginx.conf.j2"
+        src: "{{ playbook_dir }}/roles/k8s_start_services/templates/nginx.conf.j2"
         dest: "{{ nginx_conf_file_path }}"
         mode: "{{ nginx_conf_file_mode }}"
 

+ 6 - 0
roles/slurm_manager/tasks/main.yml

@@ -120,6 +120,12 @@
 - name: Get network address/subnet mask
   set_fact:
     network_address: "{{ (ansible_default_ipv4.network + '/' + ansible_default_ipv4.netmask) | ipaddr('network/prefix') }}"
+  when: not hostvars['127.0.0.1']['ansible_collection_used']
+
+- name: Get network address/subnet mask
+  set_fact:
+    network_address: "{{ (ansible_default_ipv4.network + '/' + ansible_default_ipv4.netmask) | ansible.utils.ipaddr('network/prefix') }}"
+  when: hostvars['127.0.0.1']['ansible_collection_used']
 
 - name: Firewall rule slurm - allow all incoming traffic on internal network
   firewalld:
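Both `set_fact` tasks above compute the same network/prefix value; only the filter namespace differs, since newer Ansible provides `ipaddr` through the `ansible.utils` collection rather than as a bundled filter. A small sketch with literal values (assumes the `netaddr` Python package is installed; the address and fact name are illustrative):

```yaml
# Sketch: the 'network/prefix' query of the ipaddr filter.
- name: Compute network address in CIDR form
  set_fact:
    demo_network: "{{ ('10.0.0.5' + '/' + '255.255.255.0') | ansible.utils.ipaddr('network/prefix') }}"
    # demo_network -> "10.0.0.0/24"
```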

+ 305 - 0
telemetry/roles/grafana_config/files/SpiralLayout.json

@@ -0,0 +1,305 @@
+{
+  "__inputs": [
+    {
+      "name": "DS_TELEMETRY-POSTGRES",
+      "label": "telemetry-postgres",
+      "description": "",
+      "type": "datasource",
+      "pluginId": "postgres",
+      "pluginName": "PostgreSQL"
+    }
+  ],
+  "__elements": [],
+  "__requires": [
+    {
+      "type": "grafana",
+      "id": "grafana",
+      "name": "Grafana",
+      "version": "8.3.2"
+    },
+    {
+      "type": "panel",
+      "id": "hpcviz-idvl-hpcc-spiral-layout",
+      "name": "spiral-layout",
+      "version": "2.0.0"
+    },
+    {
+      "type": "datasource",
+      "id": "postgres",
+      "name": "PostgreSQL",
+      "version": "1.0.0"
+    }
+  ],
+  "editable": false,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 0,
+  "id": null,
+  "iteration": 1646754961002,
+  "links": [],
+  "liveNow": false,
+  "panels": [
+    {
+      "datasource": {
+        "type": "postgres",
+        "uid": "telemetry-postgres"
+      },
+      "gridPos": {
+        "h": 15,
+        "w": 24,
+        "x": 0,
+        "y": 0
+      },
+      "id": 2,
+      "options": {
+        "nodeSize": 5,
+        "numberOfRing": 5,
+        "orderType": "rank"
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "table",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "rawQuery": true,
+          "rawSql": "SELECT\n  *\nFROM\n  slurm.jobs\nWHERE\n  user_id IN ($users)\n  AND start_time < ${__to:date:seconds}\n  AND end_time BETWEEN ${__from:date:seconds} and ${__to:date:seconds}",
+          "refId": "jobs",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
+        },
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "time_series",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "rawQuery": true,
+          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",'10m'),\n  avg(CAST(value AS decimal)) AS \"memory_power\",\n  CONCAT('| ',system) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label= 'PowerMetrics TotalMemoryPower' AND\n  system IN (CAST($servicetag AS text))\nGROUP BY time,name\nORDER BY time",
+          "refId": "memory_power",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
+        },
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "time_series",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "rawQuery": true,
+          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",'10m'),\n  avg(CAST(value AS decimal)) AS \"cpu_power\",\n  CONCAT('| ',system) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label= 'PowerMetrics TotalCPUPower'AND\n  system IN ($servicetag)\nGROUP BY time,name\nORDER BY time",
+          "refId": "cpu_power",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
+        },
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "time_series",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "rawQuery": true,
+          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",'10m'),\n  avg(CAST(value AS decimal)) AS \"cpu1_temp\",\n  CONCAT('| ',system) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label= 'CPU1 Temp TemperatureReading'AND\n  system IN ($servicetag)\nGROUP BY time,name\nORDER BY time",
+          "refId": "cpu1_temp",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
+        },
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "time_series",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "rawQuery": true,
+          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",'10m'),\n  avg(CAST(value AS decimal)) AS \"CPU2_Temp\",\n  CONCAT('| ',system) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label= 'CPU2 Temp TemperatureReading' AND\n  system IN ($servicetag)\nGROUP BY time,name\nORDER BY time\n",
+          "refId": "cpu2_temp",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
+        },
+        {
+          "datasource": {
+            "type": "postgres",
+            "uid": "telemetry-postgres"
+          },
+          "format": "time_series",
+          "group": [],
+          "hide": false,
+          "metricColumn": "none",
+          "rawQuery": true,
+          "rawSql": "SELECT\n  $__timeGroupAlias(\"time\",'10m'),\n  avg(CAST(value AS decimal)) AS \"NIC1_Temp\",\n  CONCAT('| ',system) AS name\nFROM timeseries_metrics\nINNER JOIN nodes\nON nodes.servicetag = timeseries_metrics.system\nWHERE\n  $__timeFilter(\"time\") AND\n  label= 'Embedded NIC 1 Port 1 Partition 1 TemperatureReading' AND\n  system IN ($servicetag)\nGROUP BY time,name\nORDER BY time",
+          "refId": "nic1_temp",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "column"
+              }
+            ]
+          ],
+          "timeColumn": "time",
+          "where": [
+            {
+              "name": "$__timeFilter",
+              "params": [],
+              "type": "macro"
+            }
+          ]
+        }
+      ],
+      "title": "SpiralLayout",
+      "type": "hpcviz-idvl-hpcc-spiral-layout"
+    }
+  ],
+  "schemaVersion": 33,
+  "style": "dark",
+  "tags": [],
+  "templating": {
+    "list": [
+      {
+        "current": {},
+        "datasource": {
+          "type": "postgres",
+          "uid": "telemetry-postgres"
+        },
+        "definition": "SELECT DISTINCT servicetag as __value from nodes\n",
+        "hide": 0,
+        "includeAll": false,
+        "multi": true,
+        "name": "servicetag",
+        "options": [],
+        "query": "SELECT DISTINCT servicetag as __value from nodes\n",
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 1,
+        "type": "query"
+      },
+      {
+        "current": {},
+        "datasource": {
+          "type": "postgres",
+          "uid": "telemetry-postgres"
+        },
+        "definition": "SELECT\n user_id as __value, user_name as __text\nFROM\n  slurm.jobs\nWHERE\n start_time < ${__to:date:seconds} AND end_time > ${__from:date:seconds}",
+        "hide": 0,
+        "includeAll": true,
+        "multi": true,
+        "name": "users",
+        "options": [],
+        "query": "SELECT\n user_id as __value, user_name as __text\nFROM\n  slurm.jobs\nWHERE\n start_time < ${__to:date:seconds} AND end_time > ${__from:date:seconds}",
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 1,
+        "type": "query"
+      }
+    ]
+  },
+  "time": {
+    "from": "now-6M",
+    "to": "now"
+  },
+  "timepicker": {},
+  "timezone": "",
+  "title": "SpiralLayout",
+  "uid": "ou27WHLnt",
+  "version": 4,
+  "weekStart": ""
+}

+ 12 - 1
telemetry/roles/grafana_config/tasks/add_dashboards.yml

@@ -13,12 +13,23 @@
 # limitations under the License.
 ---
 
+- name: Create a telemetry folder on grafana
+  community.grafana.grafana_folder:
+    url: "http://{{ grafana_svc_ip.stdout }}:{{ grafana_svc_port.stdout }}"
+    grafana_user: "{{ grafana_username }}"
+    grafana_password: "{{ grafana_password }}"
+    title: "{{ telemetry_folder_name }}"
+    state: present
+  no_log: true
+
 - name: Import dashboards for visualizations
   community.grafana.grafana_dashboard:
     grafana_url: "http://{{ grafana_svc_ip.stdout }}:{{ grafana_svc_port.stdout }}"
     grafana_user: "{{ grafana_username }}"
     grafana_password: "{{ grafana_password }}"
     state: present
+    folder: "{{ telemetry_folder_name }}"
     overwrite: yes
-    path: "{{ playbook_dir }}/roles/grafana_config/files/{{ dashboards }}"
+    path: "{{ playbook_dir }}/roles/grafana_config/files/{{ item }}"
+  with_items: "{{ dashboards }}"
   no_log: true

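Reviewer note: looping with_items over the dashboards list means every file named in vars lands in the new telemetry folder. A hedged manual check that the import worked (credentials and service address hypothetical):

    curl -s -u "admin:$GRAFANA_PASSWORD" \
      "http://$GRAFANA_IP:$GRAFANA_PORT/api/search?query=SpiralLayout"
    # expect a JSON hit whose folderTitle is "telemetry"
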
+ 4 - 1
telemetry/roles/grafana_config/vars/main.yml

@@ -15,4 +15,7 @@
 
 grafana_k8s: grafana
 grafana_namespace: grafana
-dashboards: parallel-coordinate.json
+telemetry_folder_name: telemetry
+dashboards:
+  - parallel-coordinate.json
+  - SpiralLayout.json

+ 7 - 0
telemetry/roles/slurm_telemetry/files/Dockerfile

@@ -8,13 +8,20 @@ RUN dnf -y install https://download.postgresql.org/pub/repos/yum/reporpms/EL-8-x
 RUN dnf module disable postgresql -y
 RUN dnf install postgresql13-devel -y
 RUN yum install python38-devel libpq-devel -y
+RUN dnf install sshpass -y
 
 COPY requirements.txt requirements.txt
 RUN ln -s /usr/pgsql-13/bin/pg_config /usr/bin/pg_config
 
 RUN pip3 install psycopg2-binary
 RUN pip3 install -r requirements.txt
+RUN mkdir /MonSter/
+COPY init_k8s_pod.sh /MonSter/
+RUN chmod 777 /MonSter/init_k8s_pod.sh
+
 RUN mkdir /log/
 RUN touch /log/monster.log
 
+COPY monster /MonSter/
+
 WORKDIR /MonSter/

+ 6 - 0
telemetry/roles/slurm_telemetry/files/init_k8s_pod_local.sh

@@ -0,0 +1,6 @@
+#!/bin/bash
+
+echo 'manager_node_ip manager_node_hostname' >> /etc/hosts
+ssh-keyscan -H manager_node_hostname >> /root/.ssh/known_hosts
+ssh-keygen -t rsa -b 4096 -f /root/.ssh/id_rsa -q -N "" -y
+sshpass -p 'os_passwd' ssh-copy-id 'root@manager_node_ip'

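Reviewer note: manager_node_ip, manager_node_hostname and os_passwd are placeholders; the replace tasks in update_service_tags.yml rewrite them before the pod starts, so the script the container actually runs looks like this (all values hypothetical):

    echo '10.5.0.101 manager01' >> /etc/hosts
    ssh-keyscan -H manager01 >> /root/.ssh/known_hosts
    ssh-keygen -t rsa -b 4096 -f /root/.ssh/id_rsa -q -N "" -y
    sshpass -p 'secret' ssh-copy-id 'root@10.5.0.101'
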
+ 33 - 0
telemetry/roles/slurm_telemetry/files/k8s_slurm_telemetry.yml

@@ -0,0 +1,33 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: slurm-telemetry
+  namespace: telemetry-and-visualizations
+  labels:
+    app: slurm-telemetry
+spec:
+  selector:
+    matchLabels:
+      app: slurm-telemetry
+  replicas: 1
+  strategy:
+    type: RollingUpdate
+  template:
+    metadata:
+      labels:
+        app: slurm-telemetry
+    spec:
+      volumes:
+        - name: ssh-key
+          hostPath:
+            path: /root/.ssh/
+            type: Directory
+      containers:
+        - name: slurm-telemetry
+          image: 'localhost/slurm_telemetry:latest'
+          imagePullPolicy: Never
+          command: ["/bin/sh","-c"]
+          args: ["./init_k8s_pod.sh; python3.8 tsdb.py; python3.8 mslurm.py"]
+          volumeMounts:
+            - name: ssh-key
+              mountPath: /root/.ssh/

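Reviewer note: the hostPath volume hands the control plane's /root/.ssh to the container so init_k8s_pod.sh can push the key to the manager node. A quick status check once the deployment is applied (names taken from the manifest above):

    kubectl -n telemetry-and-visualizations get pods -l app=slurm-telemetry
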
+ 0 - 90
telemetry/roles/slurm_telemetry/files/update_service_tags.yml

@@ -1,90 +0,0 @@
-# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
----
-
-- name: Create inventory in awx
-  hosts: manager, compute
-  tasks:
-    - name: Check slurmctld service
-      systemd:
-        name: slurmctld
-      register: slurm_service_status
-
-    - name: Set fact slurm_service
-      set_fact:
-        slurm_service: True
-      delegate_to: localhost
-      when: "slurm_service_status.status.ActiveState == 'active'"
-
-    - name: Set fact slurm_service
-      set_fact:
-        slurm_service: False
-      delegate_to: localhost
-      when: "slurm_service_status.status.ActiveState == 'inactive'"
-
-    - name: Replace input file
-      copy:
-        src: "input_config.yml"
-        dest: /mnt/omnia/slurm/monster/config.yml
-        mode: 0644
-      delegate_to: localhost
-      when: "slurm_service_status.status.ActiveState == 'active'"
-
-    - name: Prepare input config file
-      block:
-        - name: Get service tag
-          shell: >
-            set -o pipefail && \
-            dmidecode -t 1 | grep Serial
-          changed_when: false
-          register: service_tag_details
-
-        - name: Set fact service tag
-          set_fact:
-            service_tag: "{{ service_tag_details.stdout.split(':')[1].strip() }}"
-
-        - name: Get the hostname
-          command: hostname
-          register: machine_hostname
-          changed_when: false
-
-        - name: Update Head Node IP
-          replace:
-            path: /mnt/omnia/slurm/monster/config.yml
-            regexp: '  ip:.*'
-            replace: "  ip: {{ groups['manager'][0] }}"
-          delegate_to: localhost
-
-        - name: Update Head Node hostname
-          replace:
-            path: /mnt/omnia/slurm/monster/config.yml
-            regexp: '  headnode:.*'
-            replace: "  headnode: {{ hostvars[groups['manager'][0]]['machine_hostname'].stdout }}"
-          delegate_to: localhost
-
-        - name: Update nodes hostnames
-          lineinfile:
-            path: /mnt/omnia/slurm/monster/config.yml
-            line: "  {{ machine_hostname.stdout }}: {{ ansible_default_ipv4.address }}"
-            insertafter: "hostnames:"
-          delegate_to: localhost
-
-        - name: Update service tag info
-          lineinfile:
-            path: /mnt/omnia/slurm/monster/config.yml
-            line: "  - Servicetag: {{ service_tag }}\n    Os_Ip_Addr: {{ ansible_default_ipv4.address }}"
-            insertafter: "clusternodes:"
-          delegate_to: localhost
-      when: hostvars[groups['manager'][0]]['slurm_service']

+ 51 - 0
telemetry/roles/slurm_telemetry/tasks/deploy_slurm_telemetry.yml

@@ -0,0 +1,51 @@
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Get all images
+  command: "buildah images"
+  register: images_result
+  failed_when: false
+  changed_when: false
+
+- name: Update the permission of init_k8s_pod.sh
+  file:
+    path: "{{ role_path }}/files/init_k8s_pod.sh"
+    mode: "{{ slurm_telemetry_code_dir_mode }}"
+  when: slurm_telemetry_image not in images_result.stdout
+
+- name: Create slurm_telemetry image
+  command: buildah bud -t {{ slurm_telemetry_image }}:{{ slurm_telemetry_image_tag }} -f {{ role_path }}/files/Dockerfile
+  args:
+    chdir: "{{ role_path }}/files/"
+  changed_when: true
+  when: slurm_telemetry_image not in images_result.stdout
+
+- name: Deploy slurm_telemetry pod
+  command: kubectl apply -f {{ role_path }}/files/k8s_slurm_telemetry.yml
+  changed_when: true
+
+- name: Wait for slurm_telemetry pod to come to ready state
+  command: kubectl wait --for=condition=ready -n {{ namespace }} pod -l app=slurm-telemetry --timeout=4m
+  changed_when: true
+
+- name: Delete input config file
+  file:
+    path: "{{ role_path }}/files/monster/config.yml"
+    state: absent
+
+- name: Delete init k8s pod file
+  file:
+    path: "{{ role_path }}/files/init_k8s_pod.sh"
+    state: absent

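Reviewer note: the buildah-images guard means the image is built once and then reused, so a changed Dockerfile needs a manual rebuild. A sketch, mirroring the commands in the tasks above:

    buildah rmi slurm_telemetry:latest
    buildah bud -t slurm_telemetry:latest -f Dockerfile .
    buildah images | grep slurm_telemetry
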
+ 75 - 63
telemetry/roles/slurm_telemetry/tasks/get_node_inventory.yml

@@ -13,76 +13,88 @@
 # limitations under the License.
 ---
 
-- name: Copy slurm telemetry code
-  copy:
-    src: "{{ role_path }}/files/monster"
-    dest: "{{ slurm_telemetry_code_dir }}"
-    mode: "{{ slurm_telemetry_code_dir_mode }}"
-    
-- name: Install jmepath
-  pip:
-    name: jmespath
-    state: present
-    executable: pip3
+- name: Get inventory details
+  block:
+  - name: Copy slurm telemetry code
+    copy:
+      src: "{{ role_path }}/files/monster"
+      dest: "{{ slurm_telemetry_code_dir }}"
+      mode: "{{ slurm_telemetry_code_dir_mode }}"
 
-- name: Get AWX service IP
-  command: kubectl get svc awx-ui -n {{ awx_namespace }} -o=jsonpath='{.spec.clusterIP}'
-  changed_when: false
-  failed_when: false
-  register: awx_svc_ip
+  - name: Install sshpass
+    package:
+      name: sshpass
+      state: present
 
-- name: AWX needs to be installed
-  fail:
-    msg: "{{ awx_fail_msg }}"
-  when: not awx_svc_ip.stdout
+  - name: Install jmespath
+    pip:
+      name: jmespath
+      state: present
+      executable: pip3
 
-- name: Get AWX service port
-  command: kubectl get svc awx-ui -n {{ awx_namespace }} -o=jsonpath='{.spec.ports[0].port}'
-  changed_when: false
-  register: awx_svc_port
+  - name: Get AWX service IP
+    command: kubectl get svc awx-ui -n {{ awx_namespace }} -o=jsonpath='{.spec.clusterIP}'
+    changed_when: false
+    failed_when: false
+    register: awx_svc_ip
 
-- name: Get AWX secret
-  shell: >
-    set -o pipefail && \
-    kubectl get secret awx-admin-password -n {{ awx_namespace }} -o jsonpath="{.data.password}" | base64 --decode
-  changed_when: false
-  register: awx_secret
+  - name: AWX needs to be installed
+    fail:
+      msg: "{{ awx_fail_msg }}"
+    when: not awx_svc_ip.stdout
 
-- name: Get node_inventory id
-  shell: >
-    set -o pipefail && \
-    awx --conf.host http://{{ awx_svc_ip.stdout }}:{{ awx_svc_port.stdout }} --conf.username {{ awx_username }} \
-    --conf.password {{ awx_secret.stdout }} --conf.insecure inventory list -f human | grep node_inventory
-  changed_when: false
-  register: inventory_id
+  - name: Get AWX service port
+    command: kubectl get svc awx-ui -n {{ awx_namespace }} -o=jsonpath='{.spec.ports[0].port}'
+    changed_when: false
+    register: awx_svc_port
 
-- name: Node inventory not found in AWX
-  fail:
-    msg: "{{ node_inventory_fail_msg }}"
-  when: not inventory_id.stdout
+  - name: Get AWX secret
+    shell: >
+      set -o pipefail && \
+      kubectl get secret awx-admin-password -n {{ awx_namespace }} -o jsonpath="{.data.password}" | base64 --decode
+    changed_when: false
+    register: awx_secret
 
-- name: Get node_inventory
-  command: awx --conf.host http://{{ awx_svc_ip.stdout }}:{{ awx_svc_port.stdout }} --conf.username {{ awx_username }} \
-    --conf.password {{ awx_secret.stdout }} --conf.insecure hosts list --inventory {{ inventory_id.stdout[0] }}
-  changed_when: false
-  register: node_inventory_output
+  - name: Get node_inventory id
+    shell: >
+      set -o pipefail && \
+      awx --conf.host http://{{ awx_svc_ip.stdout }}:{{ awx_svc_port.stdout }} --conf.username {{ awx_username }} \
+      --conf.password {{ awx_secret.stdout }} --conf.insecure inventory list -f human | grep node_inventory
+    changed_when: false
+    register: inventory_id
 
-- name: Save the json data
-  set_fact:
-    node_inventory_jsondata: "{{ node_inventory_output.stdout | from_json }}"
+  - name: Node inventory not found in AWX
+    fail:
+      msg: "{{ node_inventory_fail_msg }}"
+    when: not inventory_id.stdout
 
-- name: Add temporary hosts
-  add_host:
-    name: "{{ item.name }}"
-    groups: "{{ item.summary_fields.groups.results[0].name }}"
-  with_items: "{{ node_inventory_jsondata | json_query('results') }}"
-  no_log: true
+  - name: Get node_inventory
+    command: awx --conf.host http://{{ awx_svc_ip.stdout }}:{{ awx_svc_port.stdout }} --conf.username {{ awx_username }} \
+      --conf.password {{ awx_secret.stdout }} --conf.insecure hosts list --inventory {{ inventory_id.stdout[0] }}
+    changed_when: false
+    register: node_inventory_output
 
-- name: Update slurm telemetry code path
-  replace:
-    path: "{{ role_path }}/files/update_service_tags.yml"
-    regexp: '{{ item }}.*'
-    replace: "{{ item }} {{ slurm_telemetry_code_dir }}/monster/config.yml"
-  with_items:
-    - "dest:"
-    - "path:"
+  - name: Save the json data
+    set_fact:
+      node_inventory_jsondata: "{{ node_inventory_output.stdout | from_json }}"
+
+  - name: Add temporary hosts
+    add_host:
+      name: "{{ node_inventory_jsondata['results'][node_index].name }}"
+      groups: "{{ node_inventory_jsondata['results'][node_index].summary_fields.groups.results[0].name }}"
+      ansible_user: "{{ os_username }}"
+      ansible_password: "{{ provision_password }}"
+      ansible_become_pass: "{{ provision_password }}"
+      ansible_ssh_common_args: '-o StrictHostKeyChecking=no'
+    with_items: "{{ node_inventory_jsondata['results'] }}"
+    loop_control:
+      index_var: node_index
+    when: node_inventory_jsondata['results'][node_index].summary_fields.groups.count > 0
+    no_log: true
+
+  - name: Copy input_config file
+    copy:
+      src: "{{ role_path }}/files/input_config.yml"
+      dest: "{{ role_path }}/files/monster/config.yml"
+      mode: "{{ monster_config_file_mode }}"
+  when: slurm_telemetry_support

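Reviewer note: the rewritten add_host now injects per-host connection credentials (os_username/provision_password) instead of assuming key-based SSH, and skips hosts that belong to no group. The AWX lookup can be reproduced by hand roughly as follows (service address and inventory id hypothetical; the password is read the same way the task reads it):

    AWX_PASSWORD=$(kubectl get secret awx-admin-password -n awx \
      -o jsonpath="{.data.password}" | base64 --decode)
    awx --conf.host "http://$AWX_IP:$AWX_PORT" --conf.username admin \
      --conf.password "$AWX_PASSWORD" --conf.insecure \
      hosts list --inventory "$INVENTORY_ID"
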
+ 14 - 7
telemetry/roles/slurm_telemetry/tasks/main.yml

@@ -13,12 +13,19 @@
 # limitations under the License.
 ---
 
-- name: Include common variables
-  include_vars: ../../common/vars/main.yml
+- name: Deploy slurm-telemetry
+  block:
+  - name: Include common variables
+    include_vars: ../../common/vars/main.yml
 
-- name: Include timescaledb variables
-  include_vars: ../../timescaledb/vars/main.yml
+  - name: Include timescaledb variables
+    include_vars: ../../timescaledb/vars/main.yml
 
-- name: Prepare MonSter input file
-  include_tasks: update_timescaledb_details.yml
-  when: hostvars[groups['manager'][0]]['slurm_service']
+  - name: Prepare MonSter input file
+    include_tasks: update_timescaledb_details.yml
+    when: hostvars[groups['manager'][0]]['slurm_service']
+
+  - name: Deploy slurm telemetry
+    include_tasks: deploy_slurm_telemetry.yml
+    when: hostvars[groups['manager'][0]]['slurm_service']
+  when: slurm_telemetry_support

+ 118 - 0
telemetry/roles/slurm_telemetry/tasks/update_service_tags.yml

@@ -0,0 +1,118 @@
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Get hosts details
+  block:
+  - name: Check slurmctld service
+    systemd:
+      name: slurmctld
+    register: slurm_service_status
+
+  - name: Set fact slurm_service
+    set_fact:
+      slurm_service: False
+    delegate_to: localhost
+
+  - name: Set fact slurm_service
+    set_fact:
+      slurm_service: True
+    delegate_to: localhost
+    when: "slurm_service_status.status.ActiveState == 'active'"
+
+  - name: Assert slurmctld status
+    fail:
+      msg: "{{ slurmctld_status_fail_msg }}"
+    when: not hostvars[groups['manager'][0]]['slurm_service']
+
+  - name: Prepare input config file
+    block:
+    - name: Get service tag
+      shell: >
+          set -o pipefail && \
+          dmidecode -t 1 | grep Serial
+      changed_when: false
+      register: service_tag_details
+
+    - name: Set fact service tag
+      set_fact:
+        service_tag: "{{ service_tag_details.stdout.split(':')[1].strip() }}"
+
+    - name: Get the hostname
+      command: hostname
+      register: machine_hostname
+      changed_when: false
+
+    - name: Update Head Node IP
+      replace:
+        path: "{{ role_path }}{{ monster_input_file_path }}"
+        regexp: '  ip:.*'
+        replace: "  ip: {{ groups['manager'][0] }}"
+      delegate_to: localhost
+
+    - name: Update Head Node hostname
+      replace:
+        path: "{{ role_path }}{{ monster_input_file_path }}"
+        regexp: '  headnode:.*'
+        replace: "  headnode: {{ hostvars[groups['manager'][0]]['machine_hostname'].stdout }}"
+      delegate_to: localhost
+
+    - name: Update nodes hostnames
+      lineinfile:
+        path: "{{ role_path }}{{ monster_input_file_path }}"
+        line: "  {{ machine_hostname.stdout }}: {{ inventory_hostname }}"
+        insertafter: "hostnames:"
+      delegate_to: localhost
+
+    - name: Update service tag info
+      lineinfile:
+        path: "{{ role_path }}{{ monster_input_file_path }}"
+        line: "  - Servicetag: {{ service_tag }}\n    Os_Ip_Addr: {{ inventory_hostname }}"
+        insertafter: "clusternodes:"
+      delegate_to: localhost
+
+    - name: Copy initialization file
+      copy:
+        src: "{{ role_path }}/files/init_k8s_pod_local.sh"
+        dest: "{{ role_path }}/files/init_k8s_pod.sh"
+        mode: "{{ monster_config_file_mode }}"
+      delegate_to: localhost
+      when: manager_group in group_names
+
+    - name: Update manager node details in init_k8s_pod.sh
+      replace:
+        path: "{{ role_path }}/files/init_k8s_pod.sh"
+        regexp: echo 'manager_node_ip manager_node_hostname' >> /etc/hosts
+        replace: echo '{{ inventory_hostname }} {{ machine_hostname.stdout }}' >> /etc/hosts
+      delegate_to: localhost
+      when: manager_group in group_names
+
+    - name: Update manager node IP in init_k8s_pod.sh
+      replace:
+        path: "{{ role_path }}/files/init_k8s_pod.sh"
+        regexp: ssh-keyscan -H manager_node_hostname >> /root/.ssh/known_hosts
+        replace: ssh-keyscan -H {{ machine_hostname.stdout }} >> /root/.ssh/known_hosts
+      delegate_to: localhost
+      when: manager_group in group_names
+
+    - name: Update manager node IP in init_k8s_pod.sh
+      replace:
+        path: "{{ role_path }}/files/init_k8s_pod.sh"
+        regexp: sshpass -p 'os_passwd' ssh-copy-id 'root@manager_node_ip'
+        replace: sshpass -p "{{ hostvars['127.0.0.1']['provision_password'] }}" ssh-copy-id 'root@{{ inventory_hostname }}'
+      delegate_to: localhost
+      when: manager_group in group_names
+
+    when: hostvars[groups['manager'][0]]['slurm_service']
+  when: hostvars['127.0.0.1']['slurm_telemetry_support']

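Reviewer note: a sketch of what the service-tag extraction above sees on a node (output hypothetical):

    dmidecode -t 1 | grep Serial
    #   Serial Number: ABC1234
    # service_tag is the trimmed text after the colon, i.e. ABC1234
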
+ 6 - 6
telemetry/roles/slurm_telemetry/tasks/update_timescaledb_details.yml

@@ -25,31 +25,31 @@
 
 - name: Update timescaledb service IP
   replace:
-    path: "{{ slurm_telemetry_code_dir }}/monster/config.yml"
+    path: "{{ role_path }}{{ monster_input_file_path }}"
     regexp: "  host:.*"
     replace: "  host: {{ timescaledb_svc_ip.stdout }}"
 
 - name: Update timescaledb service port
   replace:
-    path: "{{ slurm_telemetry_code_dir }}/monster/config.yml"
+    path: "{{ role_path }}{{ monster_input_file_path }}"
     regexp: "  port:.*"
     replace: "  port: {{ timescaledb_svc_port.stdout }}"
     before: "# Slurm REST API Configuration"
 
 - name: Update timescaledb username
   replace:
-    path: "{{ slurm_telemetry_code_dir }}/monster/config.yml"
+    path: "{{ role_path }}{{ monster_input_file_path }}"
     regexp: "  username:.*"
     replace: "  username: {{ timescaledb_user }}"
 
 - name: Update timescaledb password
   replace:
-    path: "{{ slurm_telemetry_code_dir }}/monster/config.yml"
+    path: "{{ role_path }}{{ monster_input_file_path }}"
     regexp: "  password:.*"
     replace: "  password: {{ timescaledb_password }}"
 
 - name: Update timescaledb database
   replace:
-    path: "{{ slurm_telemetry_code_dir }}/monster/config.yml"
+    path: "{{ role_path }}{{ monster_input_file_path }}"
     regexp: "  database:.*"
-    replace: "  database: {{ timescaledb_name }}"
+    replace: "  database: {{ timescaledb_name }}"

+ 15 - 1
telemetry/roles/slurm_telemetry/vars/main.yml

@@ -19,4 +19,18 @@ slurm_telemetry_code_dir_mode: 0755
 awx_namespace: awx
 awx_username: admin
 awx_fail_msg: "AWX service not found. AWX needs to be installed"
-node_inventory_fail_msg: "AWX node inventory not found. Node inventory needs be created in AWX"
+node_inventory_fail_msg: "AWX node inventory not found. Node inventory needs to be created in AWX"
+os_username: root
+
+# usage: deploy_slurm_telemetry
+slurm_telemetry_image: slurm_telemetry
+slurm_telemetry_image_tag: latest
+monster_config_file_mode: 0644
+
+manager_group: manager
+compute_group: compute
+input_config_file_path: /mnt/omnia/slurm/monster/config.yml
+monster_input_file_path: /files/monster/config.yml
+
+slurmctld_status_success_msg: "slurmctld is running on manager node"
+slurmctld_status_fail_msg: "slurmctld is inactive. Please check manager node for slurm status"

+ 10 - 4
telemetry/telemetry.yml

@@ -33,9 +33,15 @@
         tasks_from: get_node_inventory.yml
       tags: slurm_telemetry
 
-- name: Update slurm node IPs and service tags
-  import_playbook: "{{ playbook_dir }}/roles/slurm_telemetry/files/update_service_tags.yml"
-  tags: slurm_telemetry
+- name: Get node details
+  hosts: manager, compute
+  gather_facts: false
+  tasks:
+    - name: Get service tag
+      include_role:
+        name: slurm_telemetry
+        tasks_from: update_service_tags.yml
+      tags: slurm_telemetry
 
 - name: Slurm Telemetry
   hosts: localhost
@@ -43,4 +49,4 @@
   gather_facts: false
   roles:
    - slurm_telemetry
-  tags: slurm_telemetry
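
Reviewer note: with the standalone playbook import gone, update_service_tags.yml now runs as a role include on the manager and compute hosts, so the whole slurm-telemetry flow is driven by a single tag:

    ansible-playbook telemetry/telemetry.yml --tags slurm_telemetry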
+  tags: slurm_telemetry