Browse Source

Issue #442: Changes in response to PR comments

Signed-off-by: sakshiarora13 <sakshi_arora1@dell.com>
sakshiarora13 3 years ago
parent
commit
2a9011ccf7

+ 2 - 2
control_plane/input_params/base_vars.yml

@@ -129,8 +129,8 @@ host_mapping_file_path: ""
 
 # The NIC (ethernet card) that must be connected in order to configure the InfiniBand switch.
 # Omnia will configure this NIC for the DHCP server.
-# Default value of nic is ib1
-ib_network_nic: "ib1"
+# Default value of nic is ib0
+ib_network_nic: "ib0"
 
 # The dhcp range for assigning the IPv4 address
 # Example: 172.17.0.1

+ 11 - 3
omnia.yml

@@ -112,7 +112,7 @@
   tags: kubernetes
 
 - name: Apply common Slurm installation and config
-  hosts: manager, compute
+  hosts: manager, compute, login_node
   gather_facts: false
   roles:
     - slurm_common
@@ -125,13 +125,21 @@
     - slurm_manager
   tags: slurm
 
-- name: Start Slurm workers
-  hosts: compute
+- name: Configure Slurm workers
+  hosts: compute, login_node
+  serial: 1  # one host at a time; presumably because hosts share one slurm.conf buffer - TODO confirm
   gather_facts: false
   roles:
     - slurm_workers
   tags: slurm
 
+# Applies the slurm_workers_service role to every host in the compute and
+# login_node groups; unlike the play above, no "serial" limit is set here.
+- name: Start Slurm workers
+  hosts: compute, login_node
+  tags: slurm
+  gather_facts: false
+  roles:
+    - slurm_workers_service
+
 - name: Start Slurm services
   hosts: manager
   gather_facts: false

+ 1 - 1
roles/slurm_common/tasks/main.yml

@@ -192,4 +192,4 @@
     state: restarted
     enabled: yes
   tags: install
-  ignore_errors: yes
+  failed_when: false

+ 1 - 1
roles/slurm_exporter/tasks/install_slurm_exporter.yml

@@ -55,7 +55,7 @@
   args:
     chdir: "{{ slurm_exporter_inst_dir }}"
   changed_when: False
-  ignore_errors: yes
+  failed_when: false
 
 - name: Copy executable to /usr/bin
   copy:

+ 20 - 0
roles/slurm_exporter/tasks/start_services.yml

@@ -13,6 +13,19 @@
 # limitations under the License.
 ---
 
+# Make sure the firewalld package is present before any firewall rule is added.
+- name: Install firewalld
+  tags: firewalld
+  package:
+    state: present
+    name: firewalld
+
+# Bring the firewalld service up now and enable it at boot.
+- name: Start and enable firewalld
+  tags: firewalld
+  service:
+    name: firewalld
+    enabled: true
+    state: started
+
 - name: Firewall port addition for slurm exporter
   firewalld:
     zone: public
@@ -29,6 +42,13 @@
   changed_when: true
   tags: firewalld
 
+# Stop firewalld and disable it at boot.
+# NOTE(review): this task has no condition, so a firewalld instance that was
+# already running before this role is also left stopped - confirm intended.
+- name: Stop and disable firewalld
+  tags: firewalld
+  service:
+    name: firewalld
+    enabled: false
+    state: stopped
+
 - name: Create systemd unit file
   copy:
     src: "{{ role_path }}/files/prometheus-slurm-exporter.service"

+ 3 - 3
roles/slurm_manager/tasks/main.yml

@@ -156,13 +156,13 @@
   lineinfile:
     path: "{{ slurmdbd_path }}"
     regexp: "DbdAddr="
-    line: "DbdAddr={{ DbdAddr }}"
+    line: "DbdAddr={{ dbd_addr }}"
 
 - name: Add db host
   lineinfile:
     path: "{{ slurmdbd_path }}"
     regexp: "DbdHost="
-    line: "DbdHost={{ DbdHost }}"
+    line: "DbdHost={{ dbd_host }}"
 
 - name: Add storage password
   lineinfile:
@@ -192,4 +192,4 @@
   fetch:
     src: "{{ slurm_confpth }}"
     dest: "{{ buffer_path }}"
-    flat: true
+    flat: true

+ 2 - 2
roles/slurm_manager/vars/main.yml

@@ -64,8 +64,8 @@ slurmdbd_path: "/etc/slurm/slurmdbd.conf"
 slurmdbd_mode: "0600"
 slurm_confpth: "/etc/slurm/slurm.conf"
 slurm_user: "slurm"
-DbdAddr: "localhost"
-DbdHost: "localhost"
+dbd_addr: "localhost"
+dbd_host: "localhost"
 logfile: "/var/log/slurm/slurmdbd.log"
 pidfile: "/var/run/slurmdbd.pid"
 buffer_path: "/tmp/slurm.conf"

+ 58 - 8
roles/slurm_workers/tasks/main.yml

@@ -40,6 +40,29 @@
     state: present
   tags: firewalld
 
+# Bring the firewalld service up now and enable it at boot.
+- name: Start and enable firewalld
+  tags: firewalld
+  service:
+    name: firewalld
+    enabled: true
+    state: started
+
+# Open the two slurm ports (tcp_port2 and udp_port2) in the public zone.
+# The rule is written with "permanent: true" and without "immediate".
+- name: Firewall rule for slurm - tcp/udp ports
+  tags: firewalld
+  firewalld:
+    port: "{{ item }}"
+    zone: public
+    state: enabled
+    permanent: true
+  with_items:
+    - "{{ tcp_port2 }}"
+    - "{{ udp_port2 }}"
+
+# Reload firewalld through its CLI; the task is always reported as changed.
+- name: Reload firewalld
+  tags: firewalld
+  changed_when: true
+  command: firewall-cmd --reload
+
 - name: Stop and disable firewalld
   service:
     name: firewalld
@@ -90,16 +113,43 @@
     mode: "{{ slurm_mode }}"
   with_items:
     - "{{ groups['compute'] }}"
+  when: '"compute" in group_names'
+
+# On hosts in the login_node group (and only when login_node_required is set
+# on 127.0.0.1), add one NodeName line per login node to the slurm config,
+# creating the file if it does not exist yet.
+- name: Add login node core & socket info in slurm config file
+  lineinfile:
+    dest: "{{ slurm_confpth }}"
+    state: present
+    create: true
+    mode: "{{ slurm_mode }}"
+    # Folded scalar: the three parts are joined with single spaces.
+    line: >-
+      NodeName={{ hostvars[item].node_name }}
+      Sockets={{ hostvars[item].sockets }}
+      CoresPerSocket={{ hostvars[item].cores }}
+  with_items: "{{ groups['login_node'] }}"
+  when:
+    - hostvars["127.0.0.1"]["login_node_required"]
+    - '"login_node" in group_names'
+
+# On compute hosts, swap the "Nodes=ALL" placeholder of the "normal"
+# partition for this host's name (machine_name is registered outside this
+# view - presumably the node's hostname; verify). The result is kept in
+# "output" for the task that follows.
+- name: Update hostnames of compute node when ALL in partition nodes
+  register: output
+  replace:
+    path: "{{ slurm_confpth }}"
+    regexp: 'PartitionName=normal Nodes=ALL'
+    replace: 'PartitionName=normal Nodes={{ machine_name.stdout }}'
+  when:
+    - hostvars["127.0.0.1"]["login_node_required"]
+    - '"compute" in group_names'
+
+# On compute hosts, append this host's name to the node list that precedes
+# " Default=YES MaxTime=INFINITE State=UP". This only runs when the previous
+# task (registered as "output") did not replace the "Nodes=ALL" placeholder.
+# The "changed" test is used instead of inspecting the module's message text,
+# so the condition does not depend on the exact wording of that message.
+# NOTE(review): nothing here checks whether the hostname is already listed,
+# so re-running the play may append it again - confirm.
+- name: Update hostnames of compute node in partition nodes
+  replace:
+    path: "{{ slurm_confpth }}"
+    regexp: ' Default=YES MaxTime=INFINITE State=UP'
+    replace: ',{{ machine_name.stdout }} Default=YES MaxTime=INFINITE State=UP'
+  when:
+    - hostvars["127.0.0.1"]["login_node_required"]
+    - '"compute" in group_names'
+    - output is not changed
 
 - name: Save slurm conf in buffer
   fetch:
     src: "{{ slurm_confpth }}"
     dest: "{{ buffer_path }}"
-    flat: true
-
-- name: Start slurmd on compute nodes
-  systemd:
-    name: slurmd.service
-    state: started
-    enabled: yes
-  tags: install
+    flat: true

+ 36 - 0
roles/slurm_workers_service/tasks/main.yml

@@ -0,0 +1,36 @@
+#  Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+# Load the variables of the slurm_manager role; the tasks below use names
+# defined there, such as buffer_path and slurm_confpth.
+- name: Include common variables
+  include_vars:
+    file: ../../slurm_manager/vars/main.yml
+
+# Copy the buffered slurm.conf (buffer_path) to this node's slurm config path.
+- name: Copy slurm conf from buffer
+  copy:
+    dest: "{{ slurm_confpth }}"
+    src: "{{ buffer_path }}"
+    mode: "{{ slurm_mode }}"
+
+# Fetch this node's slurm.conf back into the buffer file; "flat" writes it to
+# dest itself rather than under a per-host directory.
+# NOTE(review): this directly follows a copy of the same buffer to the same
+# path, so it looks redundant - confirm whether it is still needed.
+- name: Save slurm conf in buffer
+  fetch:
+    dest: "{{ buffer_path }}"
+    src: "{{ slurm_confpth }}"
+    flat: true
+
+# Start slurmd now and enable it at boot. The play that applies this role
+# targets both the compute and login_node groups, so the task name covers
+# both kinds of node.
+- name: Start slurmd on compute and login nodes
+  systemd:
+    name: slurmd.service
+    state: started
+    enabled: true
+  tags: install