Browse Source

Issue #223: Fix for compute node configuration overwriting issue

Signed-off-by: VishnupriyaKrish <Vishnupriya_Krishnar@Dellteam.com>
Lucas A. Wilson 4 years ago
parent
commit
5399ee2710

+ 6 - 6
omnia.yml

@@ -118,9 +118,9 @@
     - slurm_start_services
   tags: slurm
 
-#- name: Install slurm exporter
-# hosts: manager
-# gather_facts: false
-# roles:
-#   - slurm_exporter
-# tags: slurm
+- name: Install slurm exporter
+  hosts: manager  # only the `manager` inventory group is targeted
+  gather_facts: false  # fact gathering is skipped for this play
+  roles:
+    - slurm_exporter
+  tags: slurm  # same tag as the preceding slurm play

+ 1 - 6
roles/slurm_common/files/slurm.conf

@@ -90,9 +90,4 @@ AccountingStorageType=accounting_storage/slurmdbd
 #AccountingStorageUser=
 AccountingStoragePort=
 # COMPUTE NODES
-#NodeName=linux[1-32] Procs=1 State=UNKNOWN
-#NodeName=DEFAULT Sockets=2 CoresPerSocket=20 State=UNKNOWN
-NodeName= Sockets= CoresPerSocket=
-#NodeName=compute[002-005] CoresPerSocket=20
-PartitionName=normal Nodes=ALL Default=YES MaxTime=INFINITE State=UP
-#PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP
+PartitionName=normal Nodes=ALL Default=YES MaxTime=INFINITE State=UP

+ 1 - 0
roles/slurm_common/vars/main.yml

@@ -21,6 +21,7 @@ common_packages:
    - mariadb-devel
    - man2html
    - MySQL-python
+   - python-netaddr
 
 hostname_dest: "/etc/hostname"
 hosts_dest: "/etc/hosts"

+ 27 - 2
roles/slurm_exporter/tasks/configure_prometheus_pod.yml

@@ -13,6 +13,31 @@
 #  limitations under the License.
 ---
 
+# Stage the exporter configuration outside the role directory so that it can
+# be edited for this host before it is applied.
+- name: Copy the slurm exporter config file
+  copy:
+    src: "{{ slurm_config_file }}"
+    # Name the target file explicitly: the following tasks edit and apply
+    # <installation_dir>/<slurm_config_file>, and a bare directory path
+    # without a trailing slash would be written as a plain file if that
+    # directory did not exist yet.
+    dest: "{{ installation_dir }}/{{ slurm_config_file }}"
+    owner: root
+    group: root
+    mode: "{{ file_permission }}"
+
+# Rewrite the target URL line of the staged exporter config so that it points
+# at this host's metrics endpoint on port 8080.
+- name: Add the host IP to config file
+  lineinfile:
+    dest: "{{ installation_dir }}/{{ slurm_config_file }}"
+    regexp: "http:"
+    line: "        - http://{{ inventory_hostname }}:8080/metrics"
+    state: present
+    backup: true
+    # With backrefs enabled the module only replaces a line that matches
+    # `regexp`; when nothing matches the file is left untouched. Insert
+    # anchors (insertafter/insertbefore) are ignored in this mode, so none
+    # is given here.
+    backrefs: true
+
+- name: Verify if slurm exporter is already configured
+  command: kubectl get service prometheus-slurmexporter-metrics-2
+  register: service_status  # stderr of this lookup is inspected by the apply task
+  changed_when: False  # read-only query, never reported as a change
+  ignore_errors: yes  # a failing lookup must not abort the play
+
 - name: Apply slurm exporter configuration to prometheus
-  command: kubectl apply -f "{{ role_path }}/files/{{ slurm_config_file }}" --validate=false
-  changed_when: False
+  command: kubectl apply -f "{{ installation_dir }}/{{ slurm_config_file }}" --validate=false
+  changed_when: true  # whenever this task runs it is reported as a change
+  when: "'Error from server (NotFound)' in service_status.stderr"  # skip unless the service lookup reported NotFound

+ 1 - 0
roles/slurm_manager/vars/main.yml

@@ -25,6 +25,7 @@ slurm_packages:
    - perl-DBI
    - perl-Switch
    - libibumad
+   - git
 
 dev_tools:
    - rrdtool-devel

+ 11 - 1
roles/slurm_start_services/tasks/main.yml

@@ -24,6 +24,16 @@
     src: "{{ buffer_path }}"
     dest: "{{ slurm_confpth }}"
     mode: "{{ slurm_mode }}"
+ 
+- name: Add compute hosts info in hosts file
+  lineinfile:  # no regexp: a line is appended only when that exact line is missing
+    dest: "{{ hosts_dest }}"
+    line: "{{ hostvars[item].compute_host }} {{ hostvars[item].compute_ip }}"
+    state: present
+    create: yes  # the file is created when it does not exist yet
+    mode: "{{ common_mode }}"
+  with_items:  # NOTE(review): hosts files list the address first - confirm compute_host holds it
+    - "{{ groups['compute'] }}"
 
 - name: Enable slurmdbd on manager
   service:
@@ -62,4 +72,4 @@
 
 - name: Add root to the default account
   command: sacctmgr -i add user root DefaultAccount=defaultgroup
-  when: user_added.rc != 0
+  when: user_added.rc != 0  # user_added is registered outside this hunk; run only on a non-zero return code

+ 19 - 5
roles/slurm_workers/tasks/main.yml

@@ -113,12 +113,26 @@
   register: machine_name
   changed_when: true
 
-- name: Add socket and core info
+# Publish this node's identity as host facts so that other hosts can read
+# them through hostvars[<node>].compute_host and hostvars[<node>].compute_ip.
+# NOTE(review): compute_ip is filled from machine_name.stdout, the same value
+# that is stored as the Slurm node name - confirm which of the two facts
+# really holds the address before relying on their names.
+- name: Set compute node hostname/host ip to add in manager hosts file
+  set_fact:
+    compute_host: "{{ inventory_hostname }}"
+    compute_ip: "{{ machine_name.stdout }}"
+
+# Record the values that make up this node's NodeName entry in slurm.conf.
+# processor_count and processor_cores are read from ansible_facts, so facts
+# must have been gathered for this host before the task runs.
+- name: Get socket and core info from compute nodes
+  set_fact:
+    node_name: "{{ machine_name.stdout }}"
+    sockets: "{{ hostvars[inventory_hostname]['ansible_facts']['processor_count'] }}"
+    cores: "{{ hostvars[inventory_hostname]['ansible_facts']['processor_cores'] }}"
+
+- name: Add compute nodes core & socket info in slurm config file
   lineinfile:
-    path: "{{ slurm_confpth }}"
-    regexp: "NodeName= Sockets= CoresPerSocket="
-    line: "NodeName={{ machine_name.stdout }} Sockets={{ hostvars[inventory_hostname]['ansible_facts']['processor_count'] }}
-      CoresPerSocket={{ hostvars[inventory_hostname]['ansible_facts']['processor_cores'] }}"
+    dest: "{{ slurm_confpth }}"
+    line: "NodeName={{ hostvars[item].node_name }} Sockets={{ hostvars[item].sockets }} CoresPerSocket={{ hostvars[item].cores }}"
+    state: present  # no regexp: each line is added only when that exact line is missing
+    create: yes  # the file is created when it does not exist yet
+    mode: "{{ slurm_mode }}"
+  with_items:  # one NodeName line per host in the `compute` group, built from that host's facts
+    - "{{ groups['compute'] }}"
 
 - name: Save slurm conf in buffer
   fetch: