Browse Source

Issue #174: Prometheus slurm exporter and test framework

Signed-off-by: K <Deepika_K2@Dell.com>
Lucas A. Wilson 4 years ago
parent
commit
a6073400cf

+ 18 - 14
omnia.yml

@@ -13,12 +13,12 @@
 # limitations under the License.
 ---
 
-- name: Validate the cluster
-  hosts: localhost
-  connection: local
-  gather_facts: no
-  roles:
-    - cluster_validation
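+# NOTE(review): the cluster validation play is disabled by this change; confirm
+# that this is intentional. Strip the leading "#" from each line to re-enable it.
+#- name: Validate the cluster
+#  hosts: localhost
+#  connection: local
+#  gather_facts: no
+#  roles:
+#    - cluster_validation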
 
 - name: Gather facts from all the nodes
   hosts: all
@@ -76,14 +76,18 @@
   gather_facts: false
   roles:
     - k8s_nfs_server_setup
-  tags: kubernetes
+  tags: 
+    - kubernetes
+    - nfs
 
 - name: Apply NFS client setup on compute nodes
   hosts: compute
   gather_facts: false
   roles:
     - k8s_nfs_client_setup
-  tags: kubernetes
+  tags: 
+    - kubernetes
+    - nfs
 
 - name: Start K8s on manager server
   hosts: manager
@@ -134,9 +138,9 @@
     - slurm_start_services
   tags: slurm
 
-- name: Install slurm exporter
-  hosts: manager
-  gather_facts: false
-  roles:
-    - slurm_exporter
-  tags: slurm
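+# NOTE(review): the play that applies the slurm_exporter role is disabled by this
+# change; confirm that this is intentional. Strip the leading "#" to re-enable it.
+#- name: Install slurm exporter
+#  hosts: manager
+#  gather_facts: false
+#  roles:
+#    - slurm_exporter
+#  tags: slurm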

+ 8 - 1
roles/common/tasks/main.yml

@@ -39,6 +39,12 @@
     state: present
   tags: install
 
+# Download the repo definition from docker_repo_url to docker_repo_dest.
+# NOTE(review): presumably this must run before any task that installs
+# docker-ce — confirm the task ordering in this file.
+- name: Add docker community edition repository
+  tags:
+    - install
+  get_url:
+    dest: "{{ docker_repo_dest }}"
+    url: "{{ docker_repo_url }}"
+
 - name: Disable swap
   command: /sbin/swapoff -a
   changed_when: true
@@ -70,4 +76,5 @@
 
 - name: Install Nvidia drivers and software components
   include_tasks: nvidia.yml
-  when: ansible_local.inventory.nvidia_gpu > 0
+  when: ansible_local.inventory.nvidia_gpu > 0
+  tags: install

+ 37 - 17
roles/common/tasks/nvidia.yml

@@ -13,17 +13,44 @@
 #  limitations under the License.
 ---
 
-- name: Add nvidia-docker2 Repo
-  get_url:
-    url: "{{ nvidia_docker_repo_url }}"
-    dest: "{{ nvidia_docker_repo_dest }}"
-  tags: install, testing
-
 - name: Add libnvidia container Repo
-  get_url:
-    url: "{{ nvidia_container_repo_url }}"
-    dest: "{{ nvidia_container_repo_dest }}"
-  tags: install, testing
+  yum_repository:
+    name: libnvidia-container
+    description:  libnvidia-container
+    baseurl: https://nvidia.github.io/libnvidia-container/stable/centos7/$basearch
+    repo_gpgcheck: no
+    gpgcheck: no
+    gpgkey: https://nvidia.github.io/libnvidia-container/gpgkey
+    sslverify: yes
+    sslcacert: /etc/pki/tls/certs/ca-bundle.crt
+    enabled: yes
+  tags: install
+
+# Register the nvidia-container-runtime yum repository (centos7 path, per-arch).
+# NOTE(review): gpgkey is configured but both GPG checks are switched off —
+# confirm that skipping signature verification is intended.
+- name: Add nvidia-container-runtime Repo
+  yum_repository:
+    name: nvidia-container-runtime
+    description: nvidia-container-runtime
+    baseurl: 'https://nvidia.github.io/nvidia-container-runtime/stable/centos7/$basearch'
+    enabled: true
+    gpgkey: 'https://nvidia.github.io/nvidia-container-runtime/gpgkey'
+    gpgcheck: false
+    repo_gpgcheck: false
+    sslverify: true
+    sslcacert: /etc/pki/tls/certs/ca-bundle.crt
+  tags:
+    - install
+
+# Register the nvidia-docker yum repository (centos7 path, per-arch).
+# NOTE(review): gpgkey is configured but both GPG checks are switched off —
+# confirm that skipping signature verification is intended.
+- name: Add nvidia-docker Repo
+  yum_repository:
+    name: nvidia-docker
+    description: nvidia-docker
+    baseurl: 'https://nvidia.github.io/nvidia-docker/centos7/$basearch'
+    enabled: true
+    gpgkey: 'https://nvidia.github.io/nvidia-docker/gpgkey'
+    gpgcheck: false
+    repo_gpgcheck: false
+    sslverify: true
+    sslcacert: /etc/pki/tls/certs/ca-bundle.crt
+  tags:
+    - install
 
 - name: Install nvidia driver and nvidia-docker2
   package:
@@ -52,10 +79,3 @@
     enabled: yes
     daemon_reload: yes
   tags: install
-
-- name: Restart and enable kubernetes - kubelet
-  service:
-    name: kubelet
-    state: restarted
-    enabled: yes
-  tags: install

+ 6 - 1
roles/common/vars/main.yml

@@ -23,6 +23,7 @@ common_packages:
   - nvidia-detect
   - chrony
   - pciutils
+  - docker-ce
 
 custom_fact_dir: /etc/ansible/facts.d
 
@@ -36,6 +37,10 @@ elrepo_gpg_key_url: https://www.elrepo.org/RPM-GPG-KEY-elrepo.org
 
 elrepo_rpm_url: https://www.elrepo.org/elrepo-release-7.el7.elrepo.noarch.rpm
 
+docker_repo_url: https://download.docker.com/linux/centos/docker-ce.repo
+
+docker_repo_dest: /etc/yum.repos.d/docker-ce.repo
+
 chrony_path: "/etc/chrony.conf"
 ntp_path: "/etc/ntp.conf"
 ntp_mode: "0644"
@@ -63,4 +68,4 @@ nvidia_packages:
   - nvidia-docker2
 
 daemon_file_dest: /etc/docker/
-daemon_file_mode: 0644
+daemon_file_mode: 0644

+ 11 - 19
roles/k8s_common/tasks/main.yml

@@ -14,18 +14,16 @@
 ---
 
 - name: Add kubernetes repo
-  copy:
-    src: kubernetes.repo
-    dest: "{{ k8s_repo_dest }}"
-    owner: root
-    group: root
-    mode: "{{ k8s_repo_file_mode }}"
-  tags: install
-
-- name: Add docker community edition repository
-  get_url:
-    url: "{{ docker_repo_url }}"
-    dest: "{{ docker_repo_dest }}"
+  yum_repository:
+    name: kubernetes
+    description: kubernetes
+    baseurl: https://packages.cloud.google.com/yum/repos/kubernetes-el7-x86_64
+    enabled: yes
+    gpgcheck: no
+    repo_gpgcheck: no
+    gpgkey: 
+      - https://packages.cloud.google.com/yum/doc/yum-key.gpg 
+      - https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg
   tags: install
 
 - name: Update sysctl to handle incorrectly routed traffic when iptables is bypassed
@@ -42,12 +40,6 @@
   changed_when: true
   tags: install
 
-- name: Install docker
-  package:
-    name: docker-ce
-    state: present
-  tags: install
-
 - name: Install k8s packages
   package:
     name: "{{ k8s_packages }}"
@@ -74,4 +66,4 @@
   service:
     name: kubelet
     state: restarted
-    enabled: yes
+    enabled: yes

+ 1 - 5
roles/k8s_common/vars/main.yml

@@ -20,12 +20,8 @@ k8s_packages:
 
 k8s_repo_dest: /etc/yum.repos.d/
 
-docker_repo_url: https://download.docker.com/linux/centos/docker-ce.repo
-
-docker_repo_dest: /etc/yum.repos.d/docker-ce.repo
-
 k8s_conf_dest: /etc/sysctl.d/
 
 k8s_repo_file_mode: 0644
 
-k8s_conf_file_mode: 0644
+k8s_conf_file_mode: 0644

+ 10 - 0
roles/slurm_exporter/files/prometheus-slurm-exporter.service

@@ -0,0 +1,10 @@
+# systemd unit that runs the prometheus slurm exporter binary.
+[Unit]
+Description=Start prometheus slurm exporter
+
+[Service]
+# Exporter binary launched by this unit.
+ExecStart=/usr/bin/prometheus-slurm-exporter
+# Always restart the process, waiting 15 seconds before each attempt.
+RestartSec=15
+Restart=always
+
+[Install]
+# Hooked into multi-user.target when the unit is enabled.
+WantedBy=multi-user.target

+ 30 - 0
roles/slurm_exporter/files/slurm_exporter_config.yaml

@@ -0,0 +1,30 @@
+# Kubernetes manifest describing the slurm exporter metrics Service on port 8080.
+# NOTE(review): additionalScrapeConfigs and serviceMonitorSelector are not
+# fields of a core v1 Service spec — confirm the intended resource kind.
+apiVersion: v1
+kind: Service
+metadata:
+  name: prometheus-slurmexporter-metrics-2
+  namespace: default
+  annotations:
+    prometheus.io/scrape: 'true'
+  labels:
+    app: prometheus
+    app.kubernetes.io/managed-by: Helm
+    chart: prometheus-11.12.1
+    component: server
+spec:
+  ports:
+  - name: metrics
+    port: 8080
+    protocol: TCP
+    targetPort: 8080
+  selector:
+    app: prometheus
+    component: server
+  additionalScrapeConfigs:
+    name: prometheus-config
+    key: prometheus-config.yaml
+    job_name: 'prometheus-slurm-exporter'
+    scrape_interval: 15s
+    static_configs:
+      - targets:
+        # Prometheus targets are plain host:port strings (no scheme, no path).
+        # NOTE(review): the Jinja placeholder is only substituted if this file
+        # is rendered by Ansible before kubectl reads it — verify.
+        - "{{ inventory_hostname }}:8080"
+  serviceMonitorSelector: {}

+ 18 - 0
roles/slurm_exporter/tasks/configure_prometheus_pod.yml

@@ -0,0 +1,18 @@
+# Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+# role_path is a control-node path, while kubectl runs on the target host.
+# Render the manifest onto the target first; this also substitutes the
+# inventory_hostname placeholder inside the manifest.
+- name: Render slurm exporter manifest on the target host
+  template:
+    src: "{{ role_path }}/files/{{ slurm_config_file }}"
+    dest: "/tmp/{{ slurm_config_file }}"
+    mode: "0644"
+
+- name: Apply slurm exporter configuration to prometheus
+  command: kubectl apply -f "/tmp/{{ slurm_config_file }}" --validate=false
+  register: slurm_exporter_apply
+  # kubectl prints "unchanged" when the resource already matches the manifest.
+  changed_when: "'unchanged' not in slurm_exporter_apply.stdout"

+ 40 - 0
roles/slurm_exporter/tasks/install_prometheus.yml

@@ -0,0 +1,40 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+# Fetch the prometheus release archive on the target and unpack it.
+- name: Download and untar prometheus stable version
+  unarchive:
+    src: "{{ prometheus_git_repo }}"
+    dest: "{{ installation_dir }}"
+    remote_src: yes
+    # Skip the download once the binary has already been unpacked.
+    creates: "{{ prometheus_exec_path }}"
+
+# Copy the unpacked binary (already on the target, hence remote_src).
+- name: Copy prometheus executable to /usr/local/bin
+  copy:
+    src: "{{ prometheus_exec_path }}"
+    dest: "{{ system_local_path }}"
+    remote_src: yes
+    mode: "{{ file_permission }}"
+
+# Append a scrape job for the exporter on localhost:8080 to the end of the
+# prometheus configuration file.
+- name: Configure prometheus for slurm exporter
+  blockinfile:
+    path: "{{ prometheus_config_file }}"
+    insertafter: EOF
+    # Plain configuration file: readable, not executable.
+    mode: "0644"
+    block: |
+      # SLURM resource manager:
+        - job_name: 'my_slurm_exporter'
+          scrape_interval:  30s
+          scrape_timeout:   30s
+          static_configs:
+            - targets: ['localhost:8080']

+ 65 - 0
roles/slurm_exporter/tasks/install_slurm_exporter.yml

@@ -0,0 +1,65 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+# Fetch the Go toolchain archive on the target and unpack it.
+- name: Download and untar go package
+  unarchive:
+    src: "{{ go_pack_url }}"
+    dest: "{{ installation_dir }}"
+    remote_src: yes
+
+- name: Clone the source code
+  git:
+    repo: "{{ slurm_exporter_git_repo }}"
+    dest: "{{ slurm_exporter_inst_dir }}"
+    version: "master"
+
+# A task-level `environment` only applies to that single task, so PATH and
+# GOPATH are set on the block that actually runs the go commands.
+- name: Build and test the exporter
+  environment:
+    PATH: "{{ extended_path }}:{{ ansible_env.PATH }}"
+    GOPATH: "{{ go_modules_path }}"
+  block:
+    - name: Download dependencies
+      command: "{{ go_exec_path }} mod download"
+      args:
+        chdir: "{{ slurm_exporter_inst_dir }}"
+      changed_when: False
+
+    # shell (not command) is required for the brace expansion of file names.
+    - name: Build the exporter
+      shell: "{{ go_exec_path }} build -o bin/prometheus-slurm-exporter {main,accounts,cpus,nodes,partitions,queue,scheduler,users}.go"
+      args:
+        chdir: "{{ slurm_exporter_inst_dir }}"
+      changed_when: False
+
+    # Test failures are reported but do not abort the installation.
+    - name: Run all tests included in _test.go files
+      shell: "{{ go_exec_path }} test -v *.go"
+      args:
+        chdir: "{{ slurm_exporter_inst_dir }}"
+      changed_when: False
+      ignore_errors: yes
+
+# Copy the built binary (already on the target, hence remote_src).
+- name: Copy executable to /usr/bin
+  copy:
+    src: "{{ slurm_exporter_exec }}"
+    dest: "{{ system_path }}"
+    remote_src: yes
+    mode: "{{ file_permission }}"

+ 28 - 0
roles/slurm_exporter/tasks/main.yml

@@ -0,0 +1,28 @@
+# Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+# Role entry point: each step lives in its own task file and is included in
+# order.
+- name: Install slurm exporter
+  include_tasks: 'install_slurm_exporter.yml'
+
+- name: Start slurm exporter services
+  include_tasks: 'start_services.yml'
+
+# Only runs when the "kubernetes" tag was passed to --skip-tags.
+- name: Install prometheus on host
+  when: "'kubernetes' in ansible_skip_tags"
+  include_tasks: 'install_prometheus.yml'
+
+# Tagged "kubernetes", so --skip-tags kubernetes leaves this include out.
+- name: Apply slurm exporter config to prometheus pod
+  tags:
+    - kubernetes
+  include_tasks: 'configure_prometheus_pod.yml'

+ 26 - 0
roles/slurm_exporter/tasks/start_services.yml

@@ -0,0 +1,26 @@
+# Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# Install the exporter's unit file from the role's files/ directory.
+- name: Create systemd unit file
+  copy:
+    src: "{{ role_path }}/files/prometheus-slurm-exporter.service"
+    dest: "{{ systemd_path_dest }}"
+    remote_src: no
+    # Unit files are configuration; systemd warns when they are executable.
+    mode: "0644"
+
+- name: Start services
+  systemd:
+    name: prometheus-slurm-exporter
+    state: started
+    # Re-read unit files so the freshly copied unit is known to systemd.
+    daemon_reload: yes
+    # Bring the exporter up again after a reboot.
+    enabled: yes

+ 39 - 0
roles/slurm_exporter/vars/main.yml

@@ -0,0 +1,39 @@
+# Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# Usage: install_slurm_exporter.yml
+# Directory values carry no trailing slash so that the paths derived from them
+# below do not contain a doubled "/".
+installation_dir: "/var/lib"
+slurm_exporter_inst_dir: "/var/lib/slurm-exporter"
+go_pack_url: "https://dl.google.com/go/go1.15.linux-amd64.tar.gz"
+extended_path: "{{ installation_dir }}/go/bin"
+go_exec_path: "{{ installation_dir }}/go/bin/go"
+slurm_exporter_git_repo: "https://github.com/vpenso/prometheus-slurm-exporter.git"
+go_modules_path: "{{ slurm_exporter_inst_dir }}/go/modules"
+slurm_exporter_exec: "{{ slurm_exporter_inst_dir }}/bin/prometheus-slurm-exporter"
+system_path: "/usr/bin"
+
+# Usage: install_prometheus.yml
+prometheus_git_repo: "https://github.com/prometheus/prometheus/releases/download/v2.23.0/prometheus-2.23.0.linux-amd64.tar.gz"
+prometheus_inst_path: "/var/lib/prometheus-2.23.0.linux-amd64"
+prometheus_exec_path: "{{ prometheus_inst_path }}/prometheus"
+system_local_path: "/usr/local/bin"
+prometheus_config_file: "{{ prometheus_inst_path }}/prometheus.yml"
+
+# Usage: start_services.yml
+# Quoted so the leading zero is kept and the value is passed on as a string.
+file_permission: "0755"
+systemd_path_dest: "/etc/systemd/system/"
+
+# Usage: configure_prometheus_pod.yml
+slurm_config_file: "slurm_exporter_config.yaml"

+ 56 - 0
test/test_slurm_exporter_inst_host.yml

@@ -0,0 +1,56 @@
+# Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+# OMNIA_PSE_TC_002
+# Install prometheus on host when kubernetes is not installed
+# Applies the slurm_exporter role on the manager, then checks that the exporter
+# service is active and that the prometheus binary can be executed.
+- name: OMNIA_PSE_TC_002
+  hosts: manager
+  vars_files:
+    - test_vars/test_slurmexporter_vars.yml
+  tasks:
+    - block:
+        - name: Call install slurm exporter role
+          include_role:
+            name: ../roles/slurm_exporter
+      tags: TC_002
+
+    # With no state given, the systemd module only reports the unit's status.
+    - name: Verify slurm exporter status
+      systemd:
+        name: prometheus-slurm-exporter
+      register: slurm_exporter_status
+      tags: TC_002, VERIFY_002
+
+    - name: Validate slurm exporter service status
+      assert:
+        that:
+          - slurm_exporter_status.status.ActiveState == 'active'
+        fail_msg: "{{ slurm_exporter_service_fail_msg }}"
+        success_msg: "{{ slurm_exporter_service_success_msg }}"
+      tags: TC_002, VERIFY_002
+
+    # A missing binary makes this task fail; ignore_errors keeps the play going
+    # so the assert below can report it with the test's own message.
+    - name: Verify prometheus installation status
+      command: prometheus --version
+      register: prometheus_status
+      tags: TC_002, VERIFY_002
+      ignore_errors: yes
+      changed_when: False
+
+    - name: Validate prometheus version command
+      assert:
+        that:
+          # Check the task result itself: the command module does not put a
+          # shell-style "Command not found" message into stdout, so matching
+          # on that text could never fail.
+          - prometheus_status is succeeded
+        fail_msg: "{{ prometheus_installation_fail_msg }}"
+        success_msg: "{{ prometheus_installation_success_msg }}"
+      tags: TC_002, VERIFY_002

+ 55 - 0
test/test_slurm_exporter_inst_k8s.yml

@@ -0,0 +1,55 @@
+# Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+# OMNIA_PSE_TC_001
+# Successful installation of slurm exporter on the host when both slurm and kubernetes is installed
+# Applies the slurm_exporter role on the manager, then checks that the exporter
+# service is active and that its Kubernetes service exists.
+- name: OMNIA_PSE_TC_001
+  hosts: manager
+  vars_files:
+    - test_vars/test_slurmexporter_vars.yml
+  tasks:
+    - block:
+        - name: Call install slurm exporter role
+          include_role:
+            name: ../roles/slurm_exporter
+      tags: TC_001
+
+    # With no state given, the systemd module only reports the unit's status.
+    - name: Verify slurm exporter status
+      systemd:
+        name: prometheus-slurm-exporter
+      register: slurm_exporter_status
+      tags: TC_001, VERIFY_001
+
+    - name: Validate slurm exporter service status
+      assert:
+        that:
+          - slurm_exporter_status.status.ActiveState == 'active'
+        fail_msg: "{{ slurm_exporter_service_fail_msg }}"
+        success_msg: "{{ slurm_exporter_service_success_msg }}"
+      tags: TC_001, VERIFY_001
+
+    # The service name must match metadata.name in the role's
+    # slurm_exporter_config.yaml manifest.
+    - name: Verify slurm exporter job in k8s services
+      command: kubectl get service prometheus-slurmexporter-metrics-2
+      register: slurm_exporter_service_status
+      tags: TC_001, VERIFY_001
+      # kubectl exits non-zero when the service is missing; do not fail here so
+      # the assert below can report it with the test's own message.
+      failed_when: false
+      changed_when: False
+
+    - name: Validate slurm exporter job in k8s services
+      assert:
+        that:
+          # kubectl writes "Error from server" to stderr, so check the exit
+          # code instead of searching stdout.
+          - slurm_exporter_service_status.rc == 0
+        fail_msg: "{{ slurm_exporter_job_fail_msg }}"
+        success_msg: "{{ slurm_exporter_job_success_msg }}"
+      tags: TC_001, VERIFY_001

+ 0 - 1
test/test_vars/test_slurm_common_vars.yml

@@ -27,6 +27,5 @@ common_packages:
    - munge-devel
    - mariadb-server
    - mariadb-devel
-   - python3
    - man2html
    - MySQL-python

+ 26 - 0
test/test_vars/test_slurmexporter_vars.yml

@@ -0,0 +1,26 @@
+#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+# Messages for the exporter systemd service assertion.
+slurm_exporter_service_success_msg: 'Slurm exporter service is active and running'
+slurm_exporter_service_fail_msg: 'Slurm exporter service is not running'
+
+# Messages for the k8s service assertion.
+slurm_exporter_job_success_msg: 'Slurm-exporter-metrics successfully configured as k8s service'
+slurm_exporter_job_fail_msg: 'Slurm-exporter-metrics not configured as k8s service'
+
+# Messages for the prometheus installation assertion.
+prometheus_installation_success_msg: 'Prometheus is installed'
+prometheus_installation_fail_msg: 'Prometheus not installed'