
Issue #172: Slurm role changes and testing framework for Slurm

Signed-off-by: VishnupriyaKrish <Vishnupriya_Krishnar@dellteam.com>
John Lockman · 4 years ago
commit af435faf43

+ 5 - 5
omnia.yml

@@ -24,11 +24,11 @@
   roles:
     - common
  
-- name: Apply GPU node config
-  hosts: gpus
-  gather_facts: false
-  roles:
-    - compute_gpu
+#- name: Apply GPU node config
+#  hosts: gpus
+#  gather_facts: false
+#  roles:
+#    - compute_gpu
 
 - name: Apply K8s manager config
   hosts: manager

roles/compute_gpu/files/daemon.json → roles/common/files/daemon.json


+ 18 - 0
roles/common/files/inventory.fact

@@ -0,0 +1,18 @@
+#!/bin/bash
+INVENTORY=$(mktemp lspci.XXXXXXXX)
+
+lspci > $INVENTORY
+
+NVIDIA_GPU=$(cat $INVENTORY | grep -i nvidia | wc -l)
+XILINX_FPGA=$(cat $INVENTORY | grep "Processing accelerators: Xilinx Corporation Device" | wc -l)
+INTEL_A10_FPGA=$(cat $INVENTORY | grep "Processing accelerators: Intel Corporation Device" | wc -l)
+
+cat << EOF
+{
+	"xilinx_fpga" : $XILINX_FPGA,
+	"nvidia_gpu" : $NVIDIA_GPU,
+	"intel_a10_fpga" : $INTEL_A10_FPGA
+}
+EOF
+
+rm -f $INVENTORY

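Note: the script above is installed under /etc/ansible/facts.d (see roles/common/tasks/main.yml below), so once the setup task re-gathers facts its JSON output is exposed as ansible_local.inventory. A minimal illustrative task, not part of this commit, that reads the discovered counts:

- name: Show discovered accelerators (illustrative only, not part of this commit)
  debug:
    msg: "nvidia_gpu={{ ansible_local.inventory.nvidia_gpu }}, xilinx_fpga={{ ansible_local.inventory.xilinx_fpga }}, intel_a10_fpga={{ ansible_local.inventory.intel_a10_fpga }}"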
+ 18 - 0
roles/common/tasks/main.yml

@@ -13,6 +13,17 @@
 #  limitations under the License.
 ---
 
+- name: Create a custom fact directory on each host
+  file:
+    path: /etc/ansible/facts.d
+    state: directory
+
+- name: Install accelerator discovery script
+  copy:
+    src: inventory.fact
+    dest: /etc/ansible/facts.d/inventory.fact
+    mode: 0755
+
 - name: Add kubernetes repo
   copy:
     src: kubernetes.repo
@@ -70,6 +81,9 @@
     state: present
   tags: install
 
+- name: Collect host facts (including accelerator information)
+  setup: ~
+
 - name: Install k8s packages
   package:
     name: "{{ k8s_packages }}"
@@ -107,3 +121,7 @@
 - name: Deploy time ntp/chrony
   include_tasks: ntp.yml
   tags: install
+
+- name: Install Nvidia drivers and software components
+  include_tasks: nvidia.yml
+  when: ansible_local.inventory.nvidia_gpu > 0

roles/compute_gpu/tasks/main.yml → roles/common/tasks/nvidia.yml


+ 13 - 0
roles/common/vars/main.yml

@@ -23,6 +23,7 @@ common_packages:
   - bash-completion
   - nvidia-detect
   - chrony
+  - pciutils
 
 k8s_packages:
   - kubelet-1.16.7
@@ -61,3 +62,15 @@ ntp_servers:
   - 2.centos.pool.ntp.org
 chrony_servers:
   - 2.centos.pool.ntp.org
+
+nvidia_docker_repo_url: https://nvidia.github.io/nvidia-docker/centos7/nvidia-docker.repo
+nvidia_docker_repo_dest: /etc/yum.repos.d/nvidia-docker.repo
+nvidia_container_repo_url: https://nvidia.github.io/libnvidia-container/centos7/libnvidia-container.repo
+nvidia_container_repo_dest: /etc/yum.repos.d/libnvidia-container.repo
+
+nvidia_packages:
+  - kmod-nvidia
+  - nvidia-docker2
+
+daemon_file_dest: /etc/docker/
+daemon_file_mode: 0644

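The tasks that consume these new variables live in roles/common/tasks/nvidia.yml, which is moved from roles/compute_gpu/tasks/main.yml and whose contents are not shown in this diff. A minimal sketch, assuming the usual get_url + package pattern (hypothetical, not the moved file's actual contents):

- name: Add nvidia-docker repo (sketch)
  get_url:
    url: "{{ nvidia_docker_repo_url }}"
    dest: "{{ nvidia_docker_repo_dest }}"

- name: Add libnvidia-container repo (sketch)
  get_url:
    url: "{{ nvidia_container_repo_url }}"
    dest: "{{ nvidia_container_repo_dest }}"

- name: Install Nvidia packages (sketch)
  package:
    name: "{{ nvidia_packages }}"
    state: present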
+ 0 - 3
roles/compute_gpu/files/k8s.conf

@@ -1,3 +0,0 @@
-net.bridge.bridge-nf-call-ip6tables = 1
-net.bridge.bridge-nf-call-iptables = 1
-

+ 0 - 8
roles/compute_gpu/files/kubernetes.repo

@@ -1,8 +0,0 @@
-[kubernetes]
-name=Kubernetes
-baseurl=https://packages.cloud.google.com/yum/repos/kubernetes-el7-x86_64
-enabled=1
-gpgcheck=1
-repo_gpgcheck=1
-gpgkey=https://packages.cloud.google.com/yum/doc/yum-key.gpg https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg
-

+ 3 - 2
roles/slurm_common/files/slurm.conf

@@ -32,6 +32,7 @@ ProctrackType=proctrack/pgid
 #FirstJobId=
 ReturnToService=2
 #MaxJobCount=
+MailProg=/usr/bin/mail
 #PlugStackConfig=
 #PropagatePrioProcess=
 #PropagateResourceLimits=
@@ -87,11 +88,11 @@ AccountingStorageType=accounting_storage/slurmdbd
 #AccountingStorageLoc=
 #AccountingStoragePass=
 #AccountingStorageUser=
-#
+AccountingStoragePort=
 # COMPUTE NODES
 #NodeName=linux[1-32] Procs=1 State=UNKNOWN
 #NodeName=DEFAULT Sockets=2 CoresPerSocket=20 State=UNKNOWN
 NodeName= Sockets= CoresPerSocket=
 #NodeName=compute[002-005] CoresPerSocket=20
 PartitionName=normal Nodes=ALL Default=YES MaxTime=INFINITE State=UP
-#PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP
+#PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP

+ 40 - 0
roles/slurm_common/handlers/main.yml

@@ -0,0 +1,40 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Restart ntpd
+  systemd:
+    name: ntpd
+    state: started
+    enabled: yes
+
+- name: Restart chrony
+  systemd:
+    name: chronyd
+    state: started
+    enabled: yes
+
+- name: Sync ntp clocks
+  command: ntpdc -np
+  register: ntp_clock
+  until:  ntp_clock.stdout.find('*') > -1
+  retries: "{{ retry_count_one }}"
+  delay: "{{ delay_count_one }}"
+
+- name: Sync chrony sources
+  command: chronyc sources
+  register: chrony_src
+  until:  chrony_src.stdout.find('^*') > -1
+  retries: "{{ retry_count }}"
+  delay: "{{ delay_count }}"

+ 56 - 35
roles/slurm_common/tasks/main.yml

@@ -13,38 +13,46 @@
 #  limitations under the License.
 ---
 
-- name: Install epel repository
-  package:
-    name: epel-release
+- name: Get hostname
+  command: hostname -s
+  register: host_name
+  changed_when: true
+
+- name: Add host name in file
+  replace:
+    dest: "{{ hostname_dest }}"
+    regexp: localhost.localdomain
+    replace: "{{ host_name.stdout }}"
+    backup: yes
+    mode: "{{ common_mode ]}"
+
+- name: Add host name in hosts file
+  lineinfile:
+    dest: "{{ hosts_dest }}"
+    line: "{{ hostvars[inventory_hostname]['ansible_default_ipv4']['address'] }} {{ host_name.stdout }}"
     state: present
-  tags: install
+    create: yes
+    mode: "{{ common_mode }}"
 
-- name: Munge installation
+- name: Install epel repository
   package:
-    name: munge-devel
+    name: epel-release
     state: present
   tags: install
 
 - name: Install packages for slurm
   package:
-    name: "{{ common_packages }}"
+    name: "{{ item }}"
     state: present
+  with_items:
+    - "{{ common_packages }}"
   tags: install
 
-- name: pip upgrade pip
-  pip:
-    name: pip
-    executable: pip3
-    extra_args: --upgrade
-    state: latest
-  tags: install
-
-- name: create munge key
+- name: Create munge key
   command: "{{ munge_cmd }}"
   changed_when: true
-  tags: install
 
-- name: copy munge key
+- name: Copy munge key
   copy:
     src: munge.key
     dest: "{{ munge_dest }}"
@@ -53,75 +61,72 @@
     mode: "{{ munge_mode }}"
   tags: install
 
-- name: slurm configuration - slurm.conf
+- name: Slurm configuration - slurm.conf
   copy:
     src: slurm.conf
     dest: "{{ slurm_dest }}"
     mode: "{{ slurm_mode }}"
   tags: install
 
-- name: add cluster name
+- name: Add cluster name
   lineinfile:
     path: "{{ slurm_confpth }}"
-    regexp: "clustername="
-    line: "clustername={{ cluster_name }}"
-  tags: install
+    regexp: "ClusterName="
+    line: "ClusterName={{ cluster_name }}"
 
-- name: add slurm user name
+- name: Add slurm user name
   lineinfile:
     path: "{{ slurm_confpth }}"
     regexp: "SlurmUser="
     line: "SlurmUser={{ slurm_user }}"
-  tags: install
 
 - name: Add slurmctld port no
   lineinfile:
     path: "{{ slurm_confpth }}"
     regexp: "SlurmctldPort="
     line: "SlurmctldPort={{ slurmctld_port }}"
-  tags: install
 
 - name: Add slurmd port no
   lineinfile:
     path: "{{ slurm_confpth }}"
     regexp: "SlurmdPort="
     line: "SlurmdPort={{ slurmd_port }}"
-  tags: install
 
 - name: Add spool path
   lineinfile:
     path: "{{ slurm_confpth }}"
     regexp: "SlurmdSpoolDir="
     line: "SlurmdSpoolDir={{ spool_pth }}"
-  tags: install
 
 - name: Add slurmctld pid file path
   lineinfile:
     path: "{{ slurm_confpth }}"
     regexp: "SlurmctldPidFile="
     line: "SlurmctldPidFile={{ slurmctld_pid }}"
-  tags: install
 
 - name: Add slurmd pid file path
   lineinfile:
     path: "{{ slurm_confpth }}"
     regexp: "SlurmdPidFile="
     line: "SlurmdPidFile={{ slurmd_pid }}"
-  tags: install
 
 - name: Add slurmctld log file path
   lineinfile:
     path: "{{ slurm_confpth }}"
     regexp: "SlurmctldLogFile="
     line: "SlurmctldLogFile={{ slurmctld_log }}"
-  tags: install
 
 - name: Add slurmd log file path
   lineinfile:
     path: "{{ slurm_confpth }}"
     regexp: "SlurmdLogFile="
     line: "SlurmdLogFile={{ slurmd_log }}"
-  tags: install
+
+- name: Add accounting storage port no
+  lineinfile:
+    path: "{{ slurm_confpth }}"
+    regexp: "AccountingStoragePort="
+    line: "AccountingStoragePort={{ acct_port }}"
 
 - name: Create slurm group
   group:
@@ -155,6 +160,15 @@
     state: directory
     mode: "{{ gen_mode }}"
     recurse: yes
+
+- name: Create slurm pid directory
+  file:
+    path: "{{ slurm_pidpth }}"
+    state: directory
+    owner: slurm
+    group: slurm
+    mode: "{{ gen_mode }}"
+    recurse: yes
   tags: install
 
 - name: Give slurm user permission to slurmctld
@@ -164,7 +178,6 @@
     group: slurm
     mode: "{{ gen_mode }}"
     state: touch
-  tags: install
 
 - name: Give slurm user permission to slurmd
   file:
@@ -173,11 +186,19 @@
     group: slurm
     mode: "{{ gen_mode }}"
     state: touch
-  tags: install
 
 - name: Start munge service
-  service:
+  systemd:
     name: munge
     state: restarted
     enabled: yes
   tags: install
+  ignore_errors: yes
+
+- name: Disable selinux
+  selinux:
+    state: disabled
+  tags: install
+
+- name: Deploy time ntp/chrony
+  include_tasks: ntp.yml

+ 56 - 0
roles/slurm_common/tasks/ntp.yml

@@ -0,0 +1,56 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+  - name: Deploy ntp servers
+    block:
+      - name: Deploy ntpd
+        package:
+          name: ntp
+          state: present
+      - name: Deploy ntpdate
+        package:
+          name: ntpdate
+          state: present
+      - name: Update ntp servers
+        template:
+          src: ntp.conf.j2
+          dest: "{{ ntp_path }}"
+          owner: root
+          group: root
+          mode: "{{ ntp_mode }}"
+          backup: yes
+        notify:
+          - Restart ntpd
+          - Sync ntp clocks
+    when: (ansible_distribution == "CentOS" or ansible_distribution == "RedHat") and ansible_distribution_major_version < os_higher_version
+
+  - name: Deploy chrony server
+    block:
+      - name: Deploy chrony
+        package:
+          name: chrony
+          state: present
+      - name: Update ntp servers
+        template:
+          src: chrony.conf.j2
+          dest: "{{ chrony_path }}"
+          owner: root
+          group: root
+          mode: "{{ ntp_mode }}"
+          backup: yes
+        notify:
+          - Restart chrony
+          - Sync chrony sources
+    when: (ansible_distribution == "CentOS" or ansible_distribution == "RedHat") and ansible_distribution_major_version > os_version

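Both plays above are gated on string-valued variables (ansible_distribution_major_version, os_higher_version, os_version), so the < and > comparisons are lexicographic. A sketch of the same condition with an explicit numeric cast, shown only as an assumption and not part of this commit:

    when: >
      (ansible_distribution == "CentOS" or ansible_distribution == "RedHat") and
      ansible_distribution_major_version | int < os_higher_version | int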
+ 41 - 0
roles/slurm_common/templates/chrony.conf.j2

@@ -0,0 +1,41 @@
+# Use public servers from the pool.ntp.org project.
+# Please consider joining the pool (http://www.pool.ntp.org/join.html).
+{% for item in chrony_servers %}
+pool {{ item }} iburst
+{% endfor %}
+
+
+# Record the rate at which the system clock gains/losses time.
+driftfile /var/lib/chrony/drift
+
+# Allow the system clock to be stepped in the first three updates
+# if its offset is larger than 1 second.
+makestep 1.0 3
+
+# Enable kernel synchronization of the real-time clock (RTC).
+rtcsync
+
+# Enable hardware timestamping on all interfaces that support it.
+#hwtimestamp *
+
+# Increase the minimum number of selectable sources required to adjust
+# the system clock.
+#minsources 2
+
+# Allow NTP client access from local network.
+#allow 192.168.0.0/16
+
+# Serve time even if not synchronized to a time source.
+#local stratum 10
+
+# Specify file containing keys for NTP authentication.
+keyfile /etc/chrony.keys
+
+# Get TAI-UTC offset and leap seconds from the system tz database.
+leapsectz right/UTC
+
+# Specify directory for log files.
+logdir /var/log/chrony
+
+# Select which information is logged.
+#log measurements statistics tracking

+ 14 - 0
roles/slurm_common/templates/ntp.conf.j2

@@ -0,0 +1,14 @@
+driftfile /var/lib/ntp/drift
+
+restrict default nomodify notrap nopeer noquery
+
+restrict 127.0.0.1
+restrict ::1
+
+{% for item in ntp_servers %}
+server  {{ item }} iburst
+{% endfor %}
+
+includefile /etc/ntp/crypto/pw
+
+keys /etc/ntp/keys

+ 25 - 2
roles/slurm_common/vars/main.yml

@@ -13,27 +13,34 @@
 #  limitations under the License.
 ---
 
-epel_url: https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm
+epel_url: https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
 
 common_packages:
    - munge
    - munge-libs
+   - munge-devel
    - mariadb-server
    - mariadb-devel
    - python3
-   - python-pip
+   - man2html
+   - MySQL-python
 
+hostname_dest: "/etc/hostname"
+hosts_dest: "/etc/hosts"
 munge_dest: "/etc/munge/"
 munge_cmd: "/usr/sbin/create-munge-key -f"
 munge_mode: "0400"
 slurm_mode: "0644"
+common_mode: "0777"
 slurm_dest: "/etc/slurm/"
 slurm_confpth: "/etc/slurm/slurm.conf"
 slurm_user: "slurm"
 slurmctld_port: "6817"
 slurmd_port: "6818"
+acct_port: "6819"
 slurm_uid: "6001"
 slurm_logpth: "/var/log/slurm/"
+slurm_pidpth: "/var/run/slurm/"
 gen_mode: "0755"
 spool_pth: "/var/spool/slurm/"
 slurmctld_pid: "/var/run/slurmctld.pid"
@@ -41,3 +48,19 @@ slurmd_pid: "/var/run/slurmd.pid"
 cluster_name : "manager,compute"
 slurmctld_log: "/var/log/slurm/slurmctld.log"
 slurmd_log: "/var/log/slurm/slurmd.log"
+chrony_path: "/etc/chrony.conf"
+ntp_path: "/etc/ntp.conf"
+ntp_mode: "0644"
+os_higher_version: "8"
+os_version: "7"
+retry_count_one: "10"
+delay_count_one: "60"
+retry_count: "6"
+delay_count: "10"
+
+ntp_servers:
+  - 0.centos.pool.ntp.org
+  - 1.centos.pool.ntp.org
+  - 2.centos.pool.ntp.org
+chrony_servers:
+  - 2.centos.pool.ntp.org

+ 3 - 3
roles/slurm_manager/files/slurmdbd.conf

@@ -18,7 +18,7 @@ AuthType=auth/munge
 # slurmDBD info
 DbdAddr=
 DbdHost=
-#DbdPort=7031
+#DbdPort=6019
 SlurmUser=
 #MessageTimeout=300
 DebugLevel=verbose
@@ -33,6 +33,6 @@ PidFile=
 StorageType=accounting_storage/mysql
 #StorageHost=
 #StoragePort=
-#StoragePass=
-#StorageUser=
+StoragePass=
+StorageUser=
 #StorageLoc=

+ 121 - 41
roles/slurm_manager/tasks/main.yml

@@ -13,17 +13,65 @@
 #  limitations under the License.
 ---
 
-#- name: Install packages for slurm
-# package:
-#   name: "{{ slurm_packages }}"
-#   state: present
-# tags: install
-
-#- name: Install development tools
-# package:
-#   name: "{{ dev_tools }}"
-#   state: present
-# tags: install
+- name: Give slurm user permission to slurmctld spool
+  file:
+    path: "{{ spool_slurmctld_pth }}"
+    owner: slurm
+    group: slurm
+    mode: "{{ tmp_mode }}"
+    state: touch
+
+- name: Give slurm ownership to cluster state
+  file:
+    path: "{{ cluster_state_path }}"
+    owner: slurm
+    group: slurm
+    mode: "{{ tmp_mode }}"
+    state: touch
+
+- name: Create slurmctld log file on master
+  file:
+    path: "{{ slurm_logpth }}"
+    owner: slurm
+    mode: "{{ tmp_mode }}"
+    state: touch
+  with_items:
+    - slurmctld.log
+
+- name: Create log files on master
+  file:
+    path: "{{ slurm_logpth }}"
+    owner: slurm
+    mode: "{{ tmp_mode }}"
+    state: touch
+  with_items:
+    - "{{ log_files_master }}"
+
+- name: Get mariadb password from vault
+  command: ansible-vault view {{ input_config_pth }} --vault-password-file {{ vault_path }}
+  register: vault_contents
+  delegate_to: localhost
+  changed_when: true
+
+- name: Extract db password from input config file
+  set_fact:
+    db_password="{{ vault_contents.stdout | regex_findall('[\n\r].*mariadb_password:\s*\"([^\n\r]*)\"') }}"
+
+- name: Install packages for slurm
+  package:
+    name: "{{ item }}"
+    state: present
+  with_items:
+    - "{{ slurm_packages }}"
+  tags: install
+
+- name: Install development tools
+  package:
+    name: "{{ item }}"
+    state: present
+  with_items:
+    - "{{ dev_tools }}"
+  tags: install
 
 - name: Create temporary download folder for slurm
   file:
@@ -40,38 +88,62 @@
     checksum: "{{ slurm_md5 }}"
     validate_certs: no
   tags: install
- 
+
 - name: Build slurm rpms
-  command: rpmbuild -ta "{{ rpmbuild_path }}"
+  command: rpmbuild -ta "{{ rpmbuild_path }}" --with mysql
   changed_when: false
   args:
     warn: no
 
-#- name: Verify package md5
-#command: rpm -qa
-#  ignore_errors: true
-#  register: verify_result
-#  changed_when: no
-#  failed_when: no
-#  args:
-#    warn: no
+- name: Verify package md5
+  command: rpm -qa
+  ignore_errors: true
+  register: verify_result
+  changed_when: no
+  failed_when: no
+  args:
+    warn: no
 
 - name: Install rpms
   command: rpm -Uvh ~"{{ rpm_loop }}"
   args:
     chdir: "{{ rpm_path }}"
     warn: no
-    #  when: verify_result.rc != 0
+  changed_when: true
+
+- name: Get the hostname
+  command: hostname -s
+  register: machine_name
+  changed_when: true
 
 - name: Add control machine name
   lineinfile:
     path: "{{ slurm_confpth }}"
     regexp: "ControlMachine="
-    line: "ControlMachine={{ group_names[0] }}"
+    line: "ControlMachine={{ machine_name.stdout }}"
 
-- name: Firewall rule for slurm - tcp/ip,udp
+- name: Add slurm user name
+  lineinfile:
+    path: "{{ slurm_confpth }}"
+    regexp: "SlurmUser="
+    line: "SlurmUser={{ slurm_user }}"
+
+- name: Install firewalld
+  package:
+    name: firewalld
+    state: present
+  tags: firewalld
+
+- name: Start and enable firewalld
+  service:
+    name: firewalld
+    state: started
+    enabled: yes
+  tags: firewalld
+
+- name: Firewall rule for slurm - tcp/udp ports
   firewalld:
-    zone: internal
+    zone: public
     port: "{{ item }}"
     permanent: true
     state: enabled
@@ -79,10 +151,11 @@
     - "{{ tcp_port1 }}"
     - "{{ tcp_port2 }}"
     - "{{ tcp_port3 }}"
-    - "{{ tcp_port4 }}"
+    - "{{ udp_port3 }}"
     - "{{ udp_port1 }}"
     - "{{ udp_port2 }}"
-  tags: install
+  when: "'manager' in group_names"
+  tags: firewalld
 
 - name: Get network address/subnet mask through ipaddr
   set_fact:
@@ -94,23 +167,22 @@
     rich_rule: 'rule family="{{ family }}" source address="{{ network_address }}" accept'
     permanent: true
     state: enabled
-  tags: install
+  tags: firewalld
 
-- name: Firewall reload
-  systemd:
-    name: firewalld
-    state: reloaded
-  tags: install
+- name: Reload firewalld
+  command: firewall-cmd --reload
+  changed_when: true
+  tags: firewalld
 
 - name: Start mariadb
-  service:
+  systemd:
     name: mariadb
     state: restarted
     enabled: yes
   tags: install
 
 - name: Grant permissions for slurm db
-  command: mysql -u root -e "GRANT ALL ON slurm_acct_db.* TO 'slurm'@'localhost' identified by 'password' with grant option;"
+  command: mysql -u root -e "GRANT ALL ON slurm_acct_db.* TO '{{ db_user }}'@'{{ db_host }}' identified by '{{ db_password[0] }}' with grant option;"
   tags: install
   changed_when: true
 
@@ -119,6 +191,7 @@
     src: slurmdbd.conf
     dest: "{{ slurmdbd_path }}"
     mode: "{{ slurmdbd_mode }}"
+    owner: slurm
   tags: install
 
 - name: Add slurm user name
@@ -139,6 +212,18 @@
     regexp: "DbdHost="
     line: "DbdHost={{ DbdHost }}"
 
+- name: Add storage password
+  lineinfile:
+    path: "{{ slurmdbd_path }}"
+    regexp: "StoragePass="
+    line: "StoragePass={{ db_password[0] }}"
+
+- name: Add storage user
+  lineinfile:
+    path: "{{ slurmdbd_path }}"
+    regexp: "StorageUser="
+    line: "StorageUser={{ slurm_user }}"
+
 - name: Add log file path
   lineinfile:
     path: "{{ slurmdbd_path }}"
@@ -151,13 +236,8 @@
     regexp: "PidFile="
     line: "PidFile={{ pidfile }}"
 
-- name: Populate accounting database
-  command: slurmdbd
-  tags: install
-  changed_when: true
-
 - name: Save slurm conf file in buffer
   fetch:
     src: "{{ slurm_confpth }}"
     dest: "{{ buffer_path }}"
-    flat: true
+    flat: true

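In the tasks above, the mariadb password is read from the vaulted input config and extracted with regex_findall, which returns a list of captured groups; that is why later tasks reference db_password[0]. A self-contained illustration with a hypothetical value (not part of this commit):

- name: Illustrate the regex extraction (hypothetical example)
  vars:
    sample_vault_text: 'mariadb_password: "secret123"'
  debug:
    msg: "{{ (sample_vault_text | regex_findall('mariadb_password:\\s*\"([^\"]*)\"'))[0] }}"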
+ 26 - 11
roles/slurm_manager/vars/main.yml

@@ -17,35 +17,48 @@ slurm_packages:
    - python3
    - gcc
    - openssl
-   - openssl-devel
    - numactl
-   - numactl-devel
    - hwloc
    - lua
    - readline
-   - readline-devel
-   - pam-devel
    - perl-ExtUtils-MakeMaker
-   - cpanm*
    - rpm-build
+   - perl-DBI
+   - perl-Switch
+   - libibumad
 
 dev_tools:
    - rrdtool-devel
    - lua-devel
    - hwloc-devel
+   - libssh2-devel
+   - pam-devel
+   - readline-devel
+   - openssl-devel
+   - numactl-devel
+   - ncurses-devel
+   - gtk2-devel
+
+log_files_master:
+   - slurm_jobacct.log
+   - slurm_jobcomp.log
 
 tmp_path: "/root/slurm-tmp"
 tmp_mode: "0755"
-slurm_url: https://download.schedmd.com/slurm/slurm-20.02.3.tar.bz2
-slurm_md5: "md5:c71a300d6c5d33ef8ca60e52a203bb1e"
-rpmbuild_path: "/root/slurm-tmp/slurm-20.02.3.tar.bz2"
+cluster_state_path: "/var/spool/slurm/cluster_state"
+spool_slurmctld_pth: "/var/spool/slurmctld"
+spool_slurmd_pth: "/var/spool/slurmd"
+slurm_logpth: "/var/log/slurm/"
+slurm_url: https://download.schedmd.com/slurm/slurm-20.11.2.tar.bz2
+slurm_md5: "md5:592b8b24ff0f24327033eec59cd438d7"
+rpmbuild_path: "/root/slurm-tmp/slurm-20.11.2.tar.bz2"
 rpm_loop: "/rpmbuild/RPMS/x86_64/*.rpm"
 tcp_port1: "6817/tcp"
 tcp_port2: "6818/tcp"
 tcp_port3: "6819/tcp"
-tcp_port4: "7321/tcp"
 udp_port1: "6817/udp"
-udp_port2: "7321/udp"
+udp_port2: "6818/udp"
+udp_port3: "6819/udp"
 family: "ipv4"
 db_user: "slurm"
 db_host: "localhost"
@@ -56,7 +69,9 @@ slurm_user: "slurm"
 DbdAddr: "localhost"
 DbdHost: "localhost"
 logfile: "/var/log/slurm/slurmdbd.log"
-pidfile: "/var/run/slurm/slurmdbd.pid"
+pidfile: "/var/run/slurmdbd.pid"
 buffer_path: "/tmp/slurm.conf"
 rpm_path: "/root/rpmbuild/RPMS/x86_64/"
 slurm_mode: "0644"
+input_config_pth: "omnia/appliance/input_config.yml"
+vault_path: "omnia/appliance/roles/common/files/.vault_key"

+ 16 - 13
roles/slurm_start_services/tasks/main.yml

@@ -16,32 +16,35 @@
 - name: Include common variables
   include_vars: ../../slurm_manager/vars/main.yml
 
+- name: Include common variables
+  include_vars: ../../slurm_common/vars/main.yml
+
 - name: Copy slurm conf from buffer
   copy:
     src: "{{ buffer_path }}"
     dest: "{{ slurm_confpth }}"
     mode: "{{ slurm_mode }}"
 
-- name: Start slurmctld on manager
-  service:
-    name: slurmctld
-    enabled: yes
-  tags: install
-
 - name: Enable slurmdbd on manager
   service:
     name: slurmdbd
-    enabled: yes
+    state: restarted
+  tags: install
+
+- name: Start slurmctld on manager
+  systemd:
+    name: slurmctld
+    state: started
   tags: install
 
 - name: Show cluster if exists
-  command: sacctmgr -n show cluster {{ inventory_hostname }}
+  command: sacctmgr -n show cluster {{ cluster_name }}
   register: slurm_clusterlist
   changed_when: false
 
 - name: Create slurm cluster
-  command: sacctmgr -i add cluster {{ inventory_hostname }}
-  when: slurm_clusterlist.stdout.find(inventory_hostname) == 1
+  command: sacctmgr -i add cluster {{ cluster_name }}
+  when: slurm_clusterlist.stdout.find(cluster_name) == 1
 
 - name: Show account
   command: sacctmgr show account
@@ -49,8 +52,8 @@
   changed_when: false
 
 - name: Create default slurm group
-  command: sacctmgr -i add account defaultgroup Cluster={{ inventory_hostname }} Description="Default Account" Organization="Default Org"
-  when: account_added.stdout.find(inventory_hostname) == 1
+  command: sacctmgr -i add account defaultgroup Cluster={{ cluster_name }} Description="Default Account" Organization="Default Org"
+  when: account_added.stdout.find(cluster_name) == 1
   tags: install
 
 - name: Check if user exists
@@ -60,5 +63,5 @@
 
 - name: Add root to the default account
   command: sacctmgr -i add user root DefaultAccount=defaultgroup
-  when: account_added.stdout.find(inventory_hostname) == 1
+  when: account_added.stdout.find(cluster_name) == 1
   tags: install

+ 44 - 7
roles/start_slurm_workers/tasks/main.yml

@@ -16,6 +16,37 @@
 - name: Include common variables
   include_vars: ../../slurm_manager/vars/main.yml
 
+- name: Give slurm user permission to slurmd spool
+  file:
+    path: "{{ spool_slurmd_pth }}"
+    owner: slurm
+    group: slurm
+    mode: "{{ tmp_mode }}"
+    state: touch
+
+- name: Create log files on compute nodes
+  file:
+    path: "{{ slurm_logpth }}"
+    owner: slurm
+    group: slurm
+    mode: "{{ tmp_mode }}"
+    state: touch
+  with_items:
+    - slurmd.log
+
+- name: Install firewalld
+  package:
+    name: firewalld
+    state: present
+  tags: firewalld
+
+- name: Stop and disable firewalld
+  service:
+    name: firewalld
+    state: stopped
+    enabled: no
+  tags: firewalld
+
 - name: Copy slurm conf from buffer
   copy:
     src: "{{ buffer_path }}"
@@ -32,7 +63,7 @@
 
 - name: Install development tools
   package:
-    name: "{{ item | join (',') }}"
+    name: "{{ item }}"
     state: present
   with_items:
     - "{{ dev_tools }}"
@@ -55,7 +86,7 @@
   tags: install
 
 - name: Build slurm rpms
-  command: rpmbuild -ta "{{ rpmbuild_path }}"
+  command: rpmbuild -ta "{{ rpmbuild_path }}" --with mysql
   changed_when: false
   args:
     warn: no
@@ -74,15 +105,21 @@
   args:
     chdir: "{{ rpm_path }}"
     warn: no
-  when: verify_result.rc != 0
+  changed_when: true
+
+- name: Get the hostname
+  command: hostname -s
+  register: machine_name
+  changed_when: true
 
 - name: Add socket and core info
   lineinfile:
     path: "{{ slurm_confpth }}"
     regexp: "NodeName= Sockets= CoresPerSocket="
-    line: "NodeName={{ group_names[0] }} Sockets={{ hostvars[inventory_hostname]['ansible_facts']['processor_count'] }}
+    line: "NodeName={{ machine_name.stdout }} Sockets={{ hostvars[inventory_hostname]['ansible_facts']['processor_count'] }}
       CoresPerSocket={{ hostvars[inventory_hostname]['ansible_facts']['processor_cores'] }}"
 
+
 - name: Save slurm conf in buffer
   fetch:
     src: "{{ slurm_confpth }}"
@@ -90,7 +127,7 @@
     flat: true
 
 - name: Start slurmd on compute nodes
-  service:
+  systemd:
     name: slurmd.service
-    enabled: yes
-  tags: install
+    state: started
+  tags: install

+ 122 - 0
test/test_slurm_common.yml

@@ -0,0 +1,122 @@
+# Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+# Testcase OMNIA_USP_US_VFSP_TC_001
+# Execute slurm_common role in manager & compute nodes with os installed centos 7.9
+- name: OMNIA_USP_US_VFSP_TC_001
+  hosts: manager, compute
+  vars_files:
+    - test_vars/test_slurm_common_vars.yml
+  tasks:
+    - block:
+        - name: Call slurm common role
+          include_role:
+            name: ../roles/slurm_common
+      tags: TC_001
+
+    - name: Fetch common packages installed
+      package_facts:
+        manager: auto
+      tags: TC_001,VERIFY_001
+
+    - name: Checking munge service status
+      systemd:
+        name: munge
+      register: munge_service
+      tags: TC_001, VERIFY_001
+
+    - name: Checking ntpd service status
+      systemd:
+        name: ntpd
+      register: ntpd_service
+      tags: TC_001, VERIFY_001
+
+    - name: Verify all required packages are installed
+      assert:
+        that: "'{{ item }}' in ansible_facts.packages"
+        success_msg: "{{ packages_status_success_msg }}"
+        fail_msg: "{{ packages_status_fail_msg }}"
+      with_items: "{{ common_packages }}"
+      tags: TC_001,VERIFY_001
+
+    - name: Validating munge service status
+      assert:
+        that:
+          - munge_service.status.ActiveState == 'active'
+        fail_msg: "{{ munge_service_fail_msg }}"
+        success_msg: "{{ munge_service_success_msg }}"
+      tags: TC_001, VERIFY_001
+
+    - name: Validating ntpd service status
+      assert:
+        that:
+          - ntpd_service.status.ActiveState == 'active'
+        fail_msg: "{{ ntpd_service_fail_msg }}"
+        success_msg: "{{ ntpd_service_success_msg }}"
+      tags: TC_001, VERIFY_001
+
+# Testcase OMNIA_USP_US_VFSP_TC_002
+# Execute slurm_common role in manager & compute nodes with common packages already installed
+- name: OMNIA_USP_US_VFSP_TC_002
+  hosts: manager, compute
+  vars_files:
+    - test_vars/test_slurm_common_vars.yml
+  tasks:
+    - block:
+        - name: Call slurm common role
+          include_role:
+            name: ../roles/slurm_common
+      tags: TC_002, VERIFY_002
+
+    - name: Fetch common packages installed
+      package_facts:
+        manager: auto
+      tags: TC_002,VERIFY_002
+
+    - name: Checking munge service status
+      systemd:
+        name: munge
+      register: munge_service
+      tags: TC_002, VERIFY_002
+
+    - name: Checking ntpd service status
+      systemd:
+        name: ntpd
+      register: ntpd_service
+      tags: TC_002, VERIFY_002
+
+    - name: Verify all required packages are installed
+      assert:
+        that: "'{{ item }}' in ansible_facts.packages"
+        success_msg: "{{ packages_status_success_msg }}"
+        fail_msg: "{{ packages_status_fail_msg }}"
+      with_items: "{{ common_packages }}"
+      tags: TC_002,VERIFY_002
+
+    - name: Validating munge service status
+      assert:
+        that:
+          - munge_service.status.ActiveState == 'active'
+        fail_msg: "{{ munge_service_fail_msg }}"
+        success_msg: "{{ munge_service_success_msg }}"
+      tags: TC_002, VERIFY_002
+
+    - name: Validating ntpd service status
+      assert:
+        that:
+          - ntpd_service.status.ActiveState == 'active'
+        fail_msg: "{{ ntpd_service_fail_msg }}"
+        success_msg: "{{ ntpd_service_success_msg }}"
+      tags: TC_002, VERIFY_002

+ 152 - 0
test/test_slurm_manager.yml

@@ -0,0 +1,152 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+# Testcase OMNIA_USP_US_VFSP_TC_003
+# Execute slurm_manager role in manager node with os installed centos 7.9
+- name: OMNIA_USP_US_VFSP_TC_003
+  hosts: manager
+  vars_files:
+    - test_vars/test_slurm_manager_vars.yml
+  tasks:
+    - block:
+        - name: Call slurm manager role
+          include_role:
+            name: ../roles/slurm_manager
+      tags: TC_003
+
+    - name: Fetch slurm packages installed
+      package_facts:
+        manager: auto
+      tags: TC_003,VERIFY_003
+
+    - name: Start and enable firewalld
+      service:
+        name: firewalld
+        state: started
+        enabled: yes
+      tags: TC_003, VERIFY_003
+
+    - name: Checking firewalld tcp/udp ports on manager node
+      command: firewall-cmd --list-ports
+      register: manager_firewalld_ports
+      when: "'manager' in group_names"
+      tags: TC_003, VERIFY_003
+
+    - name: Checking mariadb service status
+      systemd:
+        name: mariadb
+      register: mariadb_service
+      tags: TC_003, VERIFY_003
+
+    - name: Validating tcp/udp ports on manager node
+      assert:
+        that:
+          - "'6817/tcp' in manager_firewalld_ports.stdout"
+          - "'6817/udp' in manager_firewalld_ports.stdout"
+          - "'6818/tcp' in manager_firewalld_ports.stdout"
+          - "'6818/udp' in manager_firewalld_ports.stdout"
+          - "'6819/tcp' in manager_firewalld_ports.stdout"
+          - "'6819/udp' in manager_firewalld_ports.stdout"
+        fail_msg: "{{ manager_ports_status_fail_msg }}"
+        success_msg: "{{ manager_ports_status_success_msg }}"
+      when: "'manager' in group_names"
+      tags: TC_003, VERIFY_003
+
+    - name: Verify all slurm packages are installed
+      assert:
+        that: "'{{ item }}' in ansible_facts.packages"
+        success_msg: "{{ slurm_packages_status_success_msg }}"
+        fail_msg: "{{ slurm_packages_status_fail_msg }}"
+      with_items:
+          - "{{ slurm_packages }}"
+          - "{{ dev_tools }}"
+      tags: TC_003, VERIFY_003
+
+    - name: Validating mariadb service status
+      assert:
+        that:
+          - mariadb_service.status.ActiveState == 'active'
+        fail_msg: "{{ mariadb_service_fail_msg }}"
+        success_msg: "{{ mariadb_service_success_msg }}"
+      tags: TC_003, VERIFY_003
+
+# Testcase OMNIA_USP_US_VFSP_TC_004
+# Execute slurm_manager role in manager node with slurm packages already installed
+- name: OMNIA_USP_US_VFSP_TC_004
+  hosts: manager
+  vars_files:
+    - test_vars/test_slurm_manager_vars.yml
+  tasks:
+    - block:
+        - name: Call slurm manager role
+          include_role:
+            name: ../roles/slurm_manager
+      tags: TC_004
+
+    - name: Fetch slurm packages installed
+      package_facts:
+        manager: auto
+      tags: TC_004,VERIFY_004
+
+    - name: Start and enable firewalld
+      service:
+        name: firewalld
+        state: started
+        enabled: yes
+      tags: TC_004, VERIFY_004
+
+    - name: Checking firewalld tcp/udp ports on manager node
+      command: firewall-cmd --list-ports
+      register: manager_firewalld_ports
+      when: "'manager' in group_names"
+      tags: TC_004, VERIFY_004
+
+    - name: Checking mariadb service status
+      systemd:
+        name: mariadb
+      register: mariadb_service
+      tags: TC_004, VERIFY_004
+
+    - name: Validating tcp/udp ports on manager node
+      assert:
+        that:
+          - "'6817/tcp' in manager_firewalld_ports.stdout"
+          - "'6817/udp' in manager_firewalld_ports.stdout"
+          - "'6818/tcp' in manager_firewalld_ports.stdout"
+          - "'6818/udp' in manager_firewalld_ports.stdout"
+          - "'6819/tcp' in manager_firewalld_ports.stdout"
+          - "'6819/udp' in manager_firewalld_ports.stdout"
+        fail_msg: "{{ manager_ports_status_fail_msg }}"
+        success_msg: "{{ manager_ports_status_success_msg }}"
+      when: "'manager' in group_names"
+      tags: TC_004, VERIFY_004
+
+    - name: Verify all slurm packages are installed
+      assert:
+        that: "'{{ item }}' in ansible_facts.packages"
+        success_msg: "{{ slurm_packages_status_success_msg }}"
+        fail_msg: "{{ slurm_packages_status_fail_msg }}"
+      with_items:
+          - "{{ slurm_packages }}"
+          - "{{ dev_tools }}"
+      tags: TC_004, VERIFY_004
+
+    - name: Validating mariadb service status
+      assert:
+        that:
+          - mariadb_service.status.ActiveState == 'active'
+        fail_msg: "{{ mariadb_service_fail_msg }}"
+        success_msg: "{{ mariadb_service_success_msg }}"
+      tags: TC_004, VERIFY_004

+ 124 - 0
test/test_slurm_start_services.yml

@@ -0,0 +1,124 @@
+#Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+# Testcase OMNIA_USP_US_VFSP_TC_007
+# Execute slurm_start_services role in manager node with os installed centos 7.9
+- name: OMNIA_USP_US_VFSP_TC_007
+  hosts: manager
+  vars_files:
+    - test_vars/test_slurm_start_services_vars.yml
+  tasks:
+    - block:
+        - name: Call slurm start services role
+          include_role:
+            name: ../roles/slurm_start_services
+      tags: TC_007
+
+    - name: Checking slurmctld service status
+      systemd:
+        name: slurmctld
+      register: slurmctld_service
+      tags: TC_007, VERIFY_007
+
+    - name: Checking slurmdbd service status
+      systemd:
+        name: slurmdbd
+      register: slurmdbd_service
+      tags: TC_007, VERIFY_007
+
+    - name: Check if slurm is installed
+      command: sinfo -V
+      register: slurm_version
+      changed_when: false
+      ignore_errors: True
+      tags: TC_007,VERIFY_007
+
+    - name: Validating slurmctld service status
+      assert:
+        that:
+          - slurmctld_service.status.ActiveState == 'active'
+        fail_msg: "{{ slurmctld_service_fail_msg }}"
+        success_msg: "{{ slurmctld_service_success_msg }}"
+      tags: TC_007, VERIFY_007
+
+    - name: Validating slurmdbd service status
+      assert:
+        that:
+          - slurmdbd_service.status.ActiveState == 'active'
+        fail_msg: "{{ slurmdbd_service_fail_msg }}"
+        success_msg: "{{ slurmdbd_service_success_msg }}"
+      tags: TC_007, VERIFY_007
+
+    - name: Validate slurm installation
+      assert:
+        that: "'command not found' not in slurm_version.stdout"
+        fail_msg: "{{ slurm_status_fail_msg }}"
+        success_msg: "{{ slurm_status_success_msg }}"
+      tags: TC_007, VERIFY_007
+
+# Testcase OMNIA_USP_US_VFSP_TC_008
+# Execute slurm_start_services role in manager node with services already running
+- name: OMNIA_USP_US_VFSP_TC_008
+  hosts: manager
+  vars_files:
+    - test_vars/test_slurm_start_services_vars.yml
+  tasks:
+    - block:
+        - name: Call slurm start services role
+          include_role:
+            name: ../roles/slurm_start_services
+      tags: TC_008
+
+    - name: Checking slurmctld service status
+      systemd:
+        name: slurmctld
+      register: slurmctld_service
+      tags: TC_008, VERIFY_008
+
+    - name: Checking slurmdbd service status
+      systemd:
+        name: slurmdbd
+      register: slurmdbd_service
+      tags: TC_008, VERIFY_008
+
+    - name: Check if slurm is installed
+      command: sinfo -V
+      register: slurm_version
+      changed_when: false
+      ignore_errors: True
+      tags: TC_008,VERIFY_008
+
+    - name: Validating slurmctld service status
+      assert:
+        that:
+          - slurmctld_service.status.ActiveState == 'active'
+        fail_msg: "{{ slurmctld_service_fail_msg }}"
+        success_msg: "{{ slurmctld_service_success_msg }}"
+      tags: TC_008, VERIFY_008
+
+    - name: Validating slurmdbd service status
+      assert:
+        that:
+          - slurmdbd_service.status.ActiveState == 'active'
+        fail_msg: "{{ slurmdbd_service_fail_msg }}"
+        success_msg: "{{ slurmdbd_service_success_msg }}"
+      tags: TC_008, VERIFY_008
+
+    - name: Validate slurm installation
+      assert:
+        that: "'command not found' not in slurm_version.stdout"
+        fail_msg: "{{ slurm_status_fail_msg }}"
+        success_msg: "{{ slurm_status_success_msg }}"
+      tags: TC_008, VERIFY_008

+ 126 - 0
test/test_slurm_workers.yml

@@ -0,0 +1,126 @@
+#Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+# Testcase OMNIA_USP_US_VFSP_TC_005
+# Execute slurm_worker role in compute node with os installed centos 7.9
+- name: OMNIA_USP_US_VFSP_TC_005
+  hosts: compute
+  vars_files:
+    - test_vars/test_slurm_workers_vars.yml
+  tasks:
+    - block:
+        - name: Call slurm worker role
+          include_role:
+            name: ../roles/slurm_workers
+      tags: TC_005
+
+    - name: Fetch slurm packages installed
+      package_facts:
+        manager: auto
+      tags: TC_005,VERIFY_005
+
+    - name: Check if slurm is installed
+      command: sinfo -V
+      register: slurm_version
+      changed_when: false
+      ignore_errors: True
+      tags: TC_005,VERIFY_005
+
+    - name: Checking slurmd service status
+      service:
+        name: slurmd.service
+      register: slurmd_service
+      tags: TC_005, VERIFY_005
+
+    - name: Verify all slurm packages are installed
+      assert:
+        that: "'{{ item }}' in ansible_facts.packages"
+        success_msg: "{{ slurm_packages_status_success_msg }}"
+        fail_msg: "{{ slurm_packages_status_fail_msg }}"
+      with_items:
+          - "{{ slurm_packages }}"
+          - "{{ dev_tools }}"
+      tags: TC_005, VERIFY_005
+
+    - name: Validate slurm installation
+      assert:
+        that: "'command not found' not in slurm_version.stdout"
+        fail_msg: "{{ slurm_status_fail_msg }}"
+        success_msg: "{{ slurm_status_success_msg }}"
+      tags: TC_005, VERIFY_005
+
+    - name: Validating slurmd service status
+      assert:
+        that:
+          - slurmd_service.status.ActiveState == 'active'
+        fail_msg: "{{ slurmd_service_fail_msg }}"
+        success_msg: "{{ slurmd_service_success_msg }}"
+      tags: TC_005, VERIFY_005
+
+# Testcase OMNIA_USP_US_VFSP_TC_006
+# Execute slurm_workers role in compute node with slurm packages already installed
+- name: OMNIA_USP_US_VFSP_TC_006
+  hosts: compute
+  vars_files:
+    - test_vars/test_slurm_workers_vars.yml
+  tasks:
+    - block:
+        - name: Call slurm worker role
+          include_role:
+            name: ../roles/slurm_workers
+      tags: TC_006
+
+    - name: Fetch slurm packages installed
+      package_facts:
+        manager: auto
+      tags: TC_006,VERIFY_006
+
+    - name: Checking slurmd service status
+      service:
+        name: slurmd.service
+      register: slurmd_service
+      tags: TC_006, VERIFY_006
+
+    - name: Check if slurm is installed
+      command: sinfo -V
+      register: slurm_version
+      changed_when: false
+      ignore_errors: True
+      tags: TC_006,VERIFY_006
+
+    - name: Verify all slurm packages are installed
+      assert:
+        that: "'{{ item }}' in ansible_facts.packages"
+        success_msg: "{{ slurm_packages_status_success_msg }}"
+        fail_msg: "{{ slurm_packages_status_fail_msg }}"
+      with_items:
+          - "{{ slurm_packages }}"
+          - "{{ dev_tools }}"
+      tags: TC_006, VERIFY_006
+
+    - name: Validate slurm installation
+      assert:
+        that: "'command not found' not in slurm_version.stdout"
+        fail_msg: "{{ slurm_status_fail_msg }}"
+        success_msg: "{{ slurm_status_success_msg }}"
+      tags: TC_006, VERIFY_006
+
+    - name: Validating slurmd service status
+      assert:
+        that:
+          - slurmd_service.status.ActiveState == 'active'
+        fail_msg: "{{ slurmd_service_fail_msg }}"
+        success_msg: "{{ slurmd_service_success_msg }}"
+      tags: TC_006, VERIFY_006

+ 36 - 0
test/test_vars/test_slurm_common_vars.yml

@@ -0,0 +1,36 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+packages_status_success_msg: "Common packages are installed"
+
+packages_status_fail_msg: "Common packages are not installed"
+
+munge_service_fail_msg: "Munge service is not running"
+
+munge_service_success_msg: "Munge service is running"
+
+ntpd_service_fail_msg: "Ntpd service is not running"
+
+ntpd_service_success_msg: "Ntpd service is running"
+
+common_packages:
+   - munge
+   - munge-libs
+   - munge-devel
+   - mariadb-server
+   - mariadb-devel
+   - python3
+   - man2html
+   - MySQL-python

+ 56 - 0
test/test_vars/test_slurm_manager_vars.yml

@@ -0,0 +1,56 @@
+#Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+slurm_packages:
+   - python3
+   - gcc
+   - openssl
+   - numactl
+   - hwloc
+   - lua
+   - readline
+   - perl-ExtUtils-MakeMaker
+   - rpm-build
+   - perl-DBI
+   - perl-Switch
+   - libibumad
+
+dev_tools:
+   - rrdtool-devel
+   - lua-devel
+   - hwloc-devel
+   - libssh2-devel
+   - pam-devel
+   - readline-devel
+   - openssl-devel
+   - numactl-devel
+   - ncurses-devel
+   - gtk2-devel
+
+manager_ports_status_fail_msg: "Slurm ports are not opened in manager node"
+
+manager_ports_status_success_msg: "Slurm Ports are opened in manager node"
+
+slurm_packages_status_success_msg: "Slurm and dev packages are installed"
+
+slurm_packages_status_fail_msg: "Slurm and dev packages are not installed"
+
+slurm_status_fail_msg: "Slurm is not installed"
+
+slurm_status_success_msg: "Slurm is installed"
+
+mariadb_service_fail_msg: " Mariadb server is not running"
+
+mariadb_service_success_msg: " Mariadb server is up running"

+ 6 - 10
roles/compute_gpu/vars/main.yml

@@ -13,18 +13,14 @@
 #  limitations under the License.
 ---
 
-nvidia_docker_repo_url: https://nvidia.github.io/nvidia-docker/centos7/nvidia-docker.repo
+slurmctld_service_fail_msg: "Slurmctld service is not running"
 
-nvidia_docker_repo_dest: /etc/yum.repos.d/nvidia-docker.repo
+slurmctld_service_success_msg: "Slurmctld service is running"
 
-nvidia_container_repo_url: https://nvidia.github.io/libnvidia-container/centos7/libnvidia-container.repo 
+slurmdbd_service_fail_msg: "Slurmdbd service is not running"
 
-nvidia_container_repo_dest: /etc/yum.repos.d/libnvidia-container.repo
+slurmdbd_service_success_msg: "Slurmdbd service is running"
 
-nvidia_packages:
-  - kmod-nvidia
-  - nvidia-docker2
+slurm_status_success_msg: "Slurm is installed"
 
-daemon_file_dest: /etc/docker/
-
-daemon_file_mode: 0644
+slurm_status_fail_msg: " Slurm is not installed"

+ 56 - 0
test/test_vars/test_slurm_workers_vars.yml

@@ -0,0 +1,56 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+slurm_packages:
+   - python3
+   - gcc
+   - openssl
+   - numactl
+   - hwloc
+   - lua
+   - readline
+   - perl-ExtUtils-MakeMaker
+   - rpm-build
+   - perl-DBI
+   - perl-Switch
+   - libibumad
+
+dev_tools:
+   - rrdtool-devel
+   - lua-devel
+   - hwloc-devel
+   - libssh2-devel
+   - pam-devel
+   - readline-devel
+   - openssl-devel
+   - numactl-devel
+   - ncurses-devel
+   - gtk2-devel
+
+manager_ports_status_fail_msg: "Slurm ports are not opened in manager node"
+
+manager_ports_status_success_msg: "Slurm Ports are opened in manager node"
+
+slurm_packages_status_success_msg: "Slurm and dev packages are installed"
+
+slurm_packages_status_fail_msg: "Slurm and dev packages are not installed"
+
+slurm_status_fail_msg: "Slurm is not installed"
+
+slurm_status_success_msg: "Slurm is installed"
+
+slurmd_service_fail_msg: "Slurmd service is not running"
+
+slurmd_service_success_msg: "Slurmd service is running"