Browse Source

Issue #229: Updated to stable commit version of slurm exporter and added prometheus extra scrape configs for slurm exporter

Signed-off-by: K <Deepika_K2@Dell.com>
Lucas A. Wilson 4 năm trước cách đây
mục cha
commit
e08d7c81ed

+ 4 - 0
roles/k8s_start_services/files/extraScrapeConfigs.yaml

@@ -0,0 +1,4 @@
+# Extra Prometheus scrape job for the Slurm exporter. The host part of the
+# target is a placeholder that the k8s_start_services tasks rewrite before
+# the file is handed to helm.
+- job_name: 'Slurm-exporter-prometheus'
+  static_configs:
+    - targets:
+        - 'localhost:8080'

+ 16 - 0
roles/k8s_start_services/tasks/main.yml

@@ -99,10 +99,26 @@
     -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'
   changed_when: true
   tags: init
+  
+# Stage the extra scrape-config file on the target host so the tasks that
+# follow can patch it and pass it to helm via --set-file.
+- name: Copy the slurm exporter config file
+  copy:
+    src: "{{ slurm_exporter_config_file }}"
+    dest: "{{ slurm_exporter_config_file_path }}"
+    owner: root
+    group: root
+    mode: "{{ slurm_exporter_file_mode }}"
+  # Tagged like the task that edits this file, so a run limited to the
+  # "init" tag does not try to patch a file that was never copied.
+  tags: init
+
+# Swap the placeholder host name in the copied scrape config for this host.
+# NOTE(review): assumes inventory_hostname is an address Prometheus can
+# reach - confirm against the inventory format.
+- name: Add the host IP to config file
+  tags: init
+  replace:
+    path: "{{ slurm_exporter_config_file_path ~ slurm_exporter_config_file }}"
+    regexp: 'localhost'
+    replace: "{{ inventory_hostname }}"
 
 - name: Prometheus deployment
   command: >
     helm install stable/prometheus \
+    --set-file extraScrapeConfigs="{{ slurm_exporter_config_file_path }}{{ slurm_exporter_config_file }}" \
     --set alertmanager.persistentVolume.storageClass=nfs-client,server.persistentVolume.storageClass=nfs-client,server.service.type=LoadBalancer \
     --generate-name
   changed_when: true

+ 6 - 0
roles/k8s_start_services/vars/main.yml

@@ -46,3 +46,9 @@ gpu_feature_discovery_version: 0.2.0
 fpga_device_plugin_yaml_url: https://raw.githubusercontent.com/Xilinx/FPGA_as_a_Service/master/k8s-fpga-device-plugin/fpga-device-plugin.yml
 
 rocm_device_plugin_yaml_url: https://raw.githubusercontent.com/RadeonOpenCompute/k8s-device-plugin/master/k8s-ds-amdgpu-dp.yaml 
+
+# Name of the extra scrape-config file shipped in this role's files/ directory.
+slurm_exporter_config_file: extraScrapeConfigs.yaml
+
+# Directory on the target host that receives the file. The trailing slash is
+# required: tasks build the full path by concatenating this with the file name.
+slurm_exporter_config_file_path: /var/lib/
+
+# Quoted so the mode is not subject to YAML 1.1 implicit octal typing and
+# reaches the copy module as the octal string "0655".
+slurm_exporter_file_mode: "0655"

+ 1 - 6
roles/slurm_common/files/slurm.conf

@@ -90,9 +90,4 @@ AccountingStorageType=accounting_storage/slurmdbd
 #AccountingStorageUser=
 AccountingStoragePort=
 # COMPUTE NODES
-#NodeName=linux[1-32] Procs=1 State=UNKNOWN
-#NodeName=DEFAULT Sockets=2 CoresPerSocket=20 State=UNKNOWN
-NodeName= Sockets= CoresPerSocket=
-#NodeName=compute[002-005] CoresPerSocket=20
-PartitionName=normal Nodes=ALL Default=YES MaxTime=INFINITE State=UP
-#PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP
+PartitionName=normal Nodes=ALL Default=YES MaxTime=INFINITE State=UP

+ 1 - 0
roles/slurm_common/vars/main.yml

@@ -21,6 +21,7 @@ common_packages:
    - mariadb-devel
    - man2html
    - MySQL-python
+   - python-netaddr
 
 hostname_dest: "/etc/hostname"
 hosts_dest: "/etc/hosts"

+ 1 - 1
roles/slurm_exporter/tasks/install_slurm_exporter.yml

@@ -29,7 +29,7 @@
   git:
     repo: "{{ slurm_exporter_git_repo }}"
     dest: "{{ slurm_exporter_inst_dir }}"
-    version: "master"
+    version: "{{ stable_commit_id }}"
 
 - name: export GOPATH
   shell: echo $GOPATH

+ 1 - 5
roles/slurm_exporter/tasks/main.yml

@@ -21,8 +21,4 @@
 
 - name: Install prometheus on host
   include_tasks: install_prometheus.yml
-  when: "'kubernetes' in ansible_skip_tags"
-
-- name: Apply slurm exporter config to prometheus pod
-  include_tasks: configure_prometheus_pod.yml
-  tags: kubernetes
+  when: "'kubernetes' in ansible_skip_tags"

+ 1 - 0
roles/slurm_exporter/vars/main.yml

@@ -23,6 +23,7 @@ slurm_exporter_git_repo: "https://github.com/vpenso/prometheus-slurm-exporter.gi
 go_modules_path: "{{ slurm_exporter_inst_dir }}/go/modules"
 slurm_exporter_exec: "{{ slurm_exporter_inst_dir }}/bin/prometheus-slurm-exporter"
 system_path: "/usr/bin"
+# Commit of the slurm exporter repository that the git checkout task pins to
+# (used as the git module's "version" instead of a moving branch).
+stable_commit_id: "00a7dee"
 
 #Usage: install_prometheus.yml
 prometheus_git_repo: "https://github.com/prometheus/prometheus/releases/download/v2.23.0/prometheus-2.23.0.linux-amd64.tar.gz"

+ 11 - 1
roles/slurm_start_services/tasks/main.yml

@@ -24,6 +24,16 @@
     src: "{{ buffer_path }}"
     dest: "{{ slurm_confpth }}"
     mode: "{{ slurm_mode }}"
+ 
+# Ensure the hosts file holds one line per member of the "compute" group,
+# built from the compute_host / compute_ip facts stored in that member's
+# hostvars. The file is created if it does not exist yet.
+- name: Add compute hosts info in hosts file
+  lineinfile:
+    path: "{{ hosts_dest }}"
+    create: true
+    mode: "{{ common_mode }}"
+    state: present
+    line: "{{ hostvars[item]['compute_host'] }} {{ hostvars[item]['compute_ip'] }}"
+  with_items: "{{ groups['compute'] }}"
 
 - name: Enable slurmdbd on manager
   service:
@@ -62,4 +72,4 @@
 
 - name: Add root to the default account
   command: sacctmgr -i add user root DefaultAccount=defaultgroup
-  when: user_added.rc != 0
+  when: user_added.rc != 0

+ 19 - 5
roles/slurm_workers/tasks/main.yml

@@ -113,12 +113,26 @@
   register: machine_name
   changed_when: true
 
-- name: Add socket and core info
+# Store this node's inventory name and the registered machine_name output as
+# host facts; they are read back through hostvars when the hosts file is
+# filled in.
+- name: Set compute node hostname/host ip to add in manager hosts file
+  set_fact:
+    compute_ip: "{{ machine_name.stdout }}"
+    compute_host: "{{ inventory_hostname }}"
+
+# Keep this node's name together with its processor_count and
+# processor_cores facts as host facts, so they can be looked up through
+# hostvars when the NodeName lines are written.
+- name: Get socket and core info from compute nodes
+  set_fact:
+    node_name: "{{ machine_name.stdout }}"
+    sockets: "{{ hostvars[inventory_hostname].ansible_facts.processor_count }}"
+    cores: "{{ hostvars[inventory_hostname].ansible_facts.processor_cores }}"
+
+# Ensure the slurm config file contains one NodeName line for every host in
+# the "compute" group, filled from the node_name / sockets / cores facts
+# stored in each host's hostvars. The file is created if it is missing.
+- name: Add compute nodes core & socket info in slurm config file
   lineinfile:
-    path: "{{ slurm_confpth }}"
-    regexp: "NodeName= Sockets= CoresPerSocket="
-    line: "NodeName={{ machine_name.stdout }} Sockets={{ hostvars[inventory_hostname]['ansible_facts']['processor_count'] }}
-      CoresPerSocket={{ hostvars[inventory_hostname]['ansible_facts']['processor_cores'] }}"
+    dest: "{{ slurm_confpth }}"
+    line: "NodeName={{ hostvars[item].node_name }} Sockets={{ hostvars[item].sockets }} CoresPerSocket={{ hostvars[item].cores }}"
+    state: present
+    create: yes
+    mode: "{{ slurm_mode }}"
+  with_items:
+    - "{{ groups['compute'] }}"
 
 - name: Save slurm conf in buffer
   fetch: