Browse Source

Issue #422: Double instance of subnet manager running in IB cluster

Signed-off-by: sakshiarora13 <sakshi_arora1@dell.com>
Lucas A. Wilson 3 years ago
parent
commit
15e969cd7a

+ 7 - 7
control_plane/inifiniband.yml

@@ -11,11 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
----
 
-- name: Infiniband Configuration
-  hosts: all
-  gather_facts: false
-  connection: local
-  roles:
-    - network_ib
+---
+-  name: Infiniband Configuration
+   hosts: all
+   gather_facts: false
+   connection: local
+   roles:
+    - network_ib

+ 1 - 1
control_plane/roles/control_plane_sm/files/Dockerfile

@@ -6,7 +6,7 @@ RUN dnf install -y epel-release
 
 RUN dnf groupinstall "Infiniband Support" -y
 
-RUN dnf install -y opensm
+RUN dnf install -y opensm-3.3.23
 
 COPY opensm.conf /etc/rdma/opensm.conf
 

+ 24 - 0
control_plane/roles/network_ib/tasks/disable_switch_sm.yml

@@ -0,0 +1,24 @@
+# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Ensure Subnet Manager is disabled on switch
+  block:
+  - name: Set parameters to disable sm
+    set_fact:
+      sm_config: "no ib sm"
+
+  - name: Disable subnet manager on switch
+    include_tasks: global_config.yml
+    with_items: "{{ sm_config }}"

+ 3 - 0
control_plane/roles/network_ib/tasks/main.yml

@@ -31,6 +31,9 @@
     - name: Configure SNMP
       include_tasks: snmp_config.yml
 
+    - name: Ensure SM is disabled on switch
+      include_tasks: disable_switch_sm.yml
+
     - name: Save running-config to startup-config
       include_tasks: save_config.yml
 

+ 5 - 2
roles/k8s_start_services/tasks/main.yml

@@ -13,6 +13,9 @@
 #  limitations under the License.
 ---
 
+- name: Include common variables
+  include_vars: ../../slurm_exporter/vars/main.yml
+
 - name: Wait for CoreDNS to restart
   command: kubectl rollout status deployment/coredns -n kube-system
   changed_when: false
@@ -147,8 +150,8 @@
 - name: Add the host IP to config file
   replace:
     path: "{{ slurm_exporter_config_file_path }}{{ slurm_exporter_config_file }}"
-    regexp: "localhost"
-    replace: "{{ public_ip.stdout }}"
+    regexp: "localhost:8080"
+    replace: "{{ public_ip.stdout }}:{{ slurm_exporter_port }}"
   tags: init
 
 - name: Prometheus deployment

+ 1 - 1
roles/slurm_exporter/files/prometheus-slurm-exporter.service

@@ -2,7 +2,7 @@
 Description = Start prometheus slurm exporter
 
 [Service]
-ExecStart = /usr/bin/prometheus-slurm-exporter
+ExecStart = /usr/bin/prometheus-slurm-exporter "--listen-address=0.0.0.0:8080"
 Restart = always
 RestartSec = 15
 

+ 1 - 1
roles/slurm_exporter/tasks/install_prometheus.yml

@@ -37,4 +37,4 @@
           scrape_interval:  30s
           scrape_timeout:   30s
           static_configs:
-            - targets: ['localhost:8080']
+            - targets: ['localhost:{{ slurm_exporter_port }}']

+ 23 - 0
roles/slurm_exporter/tasks/start_services.yml

@@ -13,6 +13,22 @@
 # limitations under the License.
 ---
 
+- name: Firewall port addition for slurm exporter
+  firewalld:
+    zone: public
+    port: "{{ item }}"
+    permanent: true
+    state: enabled
+  with_items:
+    - "{{ slurm_exporter_port }}/tcp"
+    - "{{ slurm_exporter_port }}/udp"
+  tags: firewalld
+
+- name: Reload firewalld
+  command: firewall-cmd --reload
+  changed_when: true
+  tags: firewalld
+
 - name: Create systemd unit file
   copy:
     src: "{{ role_path }}/files/prometheus-slurm-exporter.service"
@@ -20,6 +36,13 @@
     remote_src: no
     mode: "{{ file_permission }}"
 
+- name: Update the port in service file
+  replace:
+    path: "{{ systemd_path_dest }}/prometheus-slurm-exporter.service"
+    regexp: '0\.0\.0\.0:8080'  # dots escaped so they match literally
+    replace: "0.0.0.0:{{ slurm_exporter_port }}"
+  tags: init
+
 - name: Start services
   systemd:
     name: prometheus-slurm-exporter

+ 1 - 0
roles/slurm_exporter/vars/main.yml

@@ -35,6 +35,7 @@ prometheus_config_file: "{{ prometheus_inst_path }}/prometheus.yml"
 #Usage: start_services.yml
 file_permission: "0755"
 systemd_path_dest: "/etc/systemd/system/"
+slurm_exporter_port: "8081"
 
 #Usage: configure_prometheus_pod.yml
 slurm_config_file: "slurm_exporter_config.yaml"