
Issue #888: Update slurm telemetry support

Signed-off-by: DeepikaKrishnaiah <deepika_k2@dell.com>
DeepikaKrishnaiah, 3 years ago
commit c3cc80d8ab

+ 4 - 4
roles/common/vars/main.yml

@@ -14,10 +14,10 @@
 ---
 
 leap_repo:
-  - { name: repo-non-oss, repo: http://download.opensuse.org/distribution/leap/15.3/repo/non-oss/ }
-  - { name: repo-oss, repo: http://download.opensuse.org/distribution/leap/15.3/repo/oss/ }
-  - { name: repo-update-oss, repo: http://download.opensuse.org/update/leap/15.3/oss/ }
-  - { name: repo-update-non-oss, repo: http://download.opensuse.org/update/leap/15.3/non-oss/ }
+  - { name: repo-non-oss, repo: "http://download.opensuse.org/distribution/leap/15.3/repo/non-oss/" }
+  - { name: repo-oss, repo: "http://download.opensuse.org/distribution/leap/15.3/repo/oss/" }
+  - { name: repo-update-oss, repo: "http://download.opensuse.org/update/leap/15.3/oss/" }
+  - { name: repo-update-non-oss, repo: "http://download.opensuse.org/update/leap/15.3/non-oss/" }
 
 nvidia_repo: https://download.nvidia.com/opensuse/leap/15.3/
 docker_repo_url_leap: https://download.docker.com/linux/sles/docker-ce.repo
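
Note: a minimal sketch of how these Leap 15.3 repositories could be registered by hand with zypper, using the names and URLs from the vars above; the roles themselves may consume them through Ansible's zypper_repository module instead.

    zypper addrepo --refresh http://download.opensuse.org/distribution/leap/15.3/repo/oss/ repo-oss
    zypper addrepo --refresh http://download.opensuse.org/update/leap/15.3/oss/ repo-update-oss
    zypper refresh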

+ 7 - 0
telemetry/roles/slurm_telemetry/files/Dockerfile

@@ -8,13 +8,20 @@ RUN dnf -y install https://download.postgresql.org/pub/repos/yum/reporpms/EL-8-x
 RUN dnf module disable postgresql -y
 RUN dnf install postgresql13-devel -y
 RUN yum install python38-devel libpq-devel -y
+RUN dnf install sshpass -y
 
 COPY requirements.txt requirements.txt
 RUN ln -s /usr/pgsql-13/bin/pg_config /usr/bin/pg_config
 
 RUN pip3 install psycopg2-binary
 RUN pip3 install -r requirements.txt
+RUN mkdir /MonSter/
+COPY init_k8s_pod.sh /MonSter/
+RUN chmod 777 /MonSter/init_k8s_pod.sh
+
 RUN mkdir /log/
 RUN touch /log/monster.log
 
+COPY monster /MonSter/
+
 WORKDIR /MonSter/
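
Note: a rough local build-and-smoke-test sketch for this image. The role stages monster/ and init_k8s_pod.sh into files/ before building, so both must exist in the build context; python3.8 on PATH inside the image is assumed, as the deployment's args imply.

    cd telemetry/roles/slurm_telemetry/files/
    buildah bud -t slurm_telemetry:latest -f Dockerfile .
    podman run --rm localhost/slurm_telemetry:latest python3.8 --version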

+ 6 - 0
telemetry/roles/slurm_telemetry/files/init_k8s_pod_local.sh

@@ -0,0 +1,6 @@
+#!/bin/bash
+
+echo 'manager_node_ip manager_node_hostname' >> /etc/hosts
+ssh-keyscan -H manager_node_hostname >> /root/.ssh/known_hosts
+ssh-keygen -t rsa -b 4096 -f /root/.ssh/id_rsa -q -N "" -y
+sshpass -p 'os_passwd' ssh-copy-id 'root@manager_node_ip'
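
Note: the placeholder lines above are rewritten by update_service_tags.yml after this file is copied to init_k8s_pod.sh. With illustrative values (manager IP 10.0.0.5, hostname node001, password mypass, all assumed here), the substituted lines would read:

    echo '10.0.0.5 node001' >> /etc/hosts
    ssh-keyscan -H node001 >> /root/.ssh/known_hosts
    sshpass -p "mypass" ssh-copy-id 'root@10.0.0.5'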

+ 33 - 0
telemetry/roles/slurm_telemetry/files/k8s_slurm_telemetry.yml

@@ -0,0 +1,33 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: slurm-telemetry
+  namespace: telemetry-and-visualizations
+  labels:
+    app: slurm-telemetry
+spec:
+  selector:
+    matchLabels:
+      app: slurm-telemetry
+  replicas: 1
+  strategy:
+    type: RollingUpdate
+  template:
+    metadata:
+      labels:
+        app: slurm-telemetry
+    spec:
+      volumes:
+        - name: ssh-key
+          hostPath:
+            path: /root/.ssh/
+            type: Directory
+      containers:
+        - name: slurm-telemetry
+          image: 'localhost/slurm_telemetry:latest'
+          imagePullPolicy: Never
+          command: ["/bin/sh","-c"]
+          args: ["./init_k8s_pod.sh; python3.8 tsdb.py; python3.8 mslurm.py"]
+          volumeMounts:
+            - name: ssh-key
+              mountPath: /root/.ssh/
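
Note: once applied, the deployment can be verified with standard kubectl commands against the namespace and label defined above:

    kubectl get pods -n telemetry-and-visualizations -l app=slurm-telemetry
    kubectl logs -n telemetry-and-visualizations deploy/slurm-telemetry --tail=20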

+ 0 - 90
telemetry/roles/slurm_telemetry/files/update_service_tags.yml

@@ -1,90 +0,0 @@
-# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
----
-
-- name: Create inventory in awx
-  hosts: manager, compute
-  tasks:
-    - name: Check slurmctld service
-      systemd:
-        name: slurmctld
-      register: slurm_service_status
-
-    - name: Set fact slurm_service
-      set_fact:
-        slurm_service: True
-      delegate_to: localhost
-      when: "slurm_service_status.status.ActiveState == 'active'"
-
-    - name: Set fact slurm_service
-      set_fact:
-        slurm_service: False
-      delegate_to: localhost
-      when: "slurm_service_status.status.ActiveState == 'inactive'"
-
-    - name: Replace input file
-      copy:
-        src: "input_config.yml"
-        dest: /mnt/omnia/slurm/monster/config.yml
-        mode: 0644
-      delegate_to: localhost
-      when: "slurm_service_status.status.ActiveState == 'active'"
-
-    - name: Prepare input config file
-      block:
-        - name: Get service tag
-          shell: >
-            set -o pipefail && \
-            dmidecode -t 1 | grep Serial
-          changed_when: false
-          register: service_tag_details
-
-        - name: Set fact service tag
-          set_fact:
-            service_tag: "{{ service_tag_details.stdout.split(':')[1].strip() }}"
-
-        - name: Get the hostname
-          command: hostname
-          register: machine_hostname
-          changed_when: false
-
-        - name: Update Head Node IP
-          replace:
-            path: /mnt/omnia/slurm/monster/config.yml
-            regexp: '  ip:.*'
-            replace: "  ip: {{ groups['manager'][0] }}"
-          delegate_to: localhost
-
-        - name: Update Head Node hostname
-          replace:
-            path: /mnt/omnia/slurm/monster/config.yml
-            regexp: '  headnode:.*'
-            replace: "  headnode: {{ hostvars[groups['manager'][0]]['machine_hostname'].stdout }}"
-          delegate_to: localhost
-
-        - name: Update nodes hostnames
-          lineinfile:
-            path: /mnt/omnia/slurm/monster/config.yml
-            line: "  {{ machine_hostname.stdout }}: {{ ansible_default_ipv4.address }}"
-            insertafter: "hostnames:"
-          delegate_to: localhost
-
-        - name: Update service tag info
-          lineinfile:
-            path: /mnt/omnia/slurm/monster/config.yml
-            line: "  - Servicetag: {{ service_tag }}\n    Os_Ip_Addr: {{ ansible_default_ipv4.address }}"
-            insertafter: "clusternodes:"
-          delegate_to: localhost
-      when: hostvars[groups['manager'][0]]['slurm_service']

+ 51 - 0
telemetry/roles/slurm_telemetry/tasks/deploy_slurm_telemetry.yml

@@ -0,0 +1,51 @@
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Get all images
+  command: "buildah images"
+  register: images_result
+  failed_when: false
+  changed_when: false
+
+- name: Update the permission of init_k8s_pod.sh
+  file:
+    path: "{{ role_path }}/files/init_k8s_pod.sh"
+    mode: "{{ slurm_telemetry_code_dir_mode }}"
+  when: slurm_telemetry_image not in images_result.stdout
+
+- name: Create slurm_telemetry image
+  command: buildah bud -t {{ slurm_telemetry_image }}:{{ slurm_telemetry_image_tag }} -f {{ role_path }}/files/Dockerfile
+  args:
+    chdir: "{{ role_path }}/files/"
+  changed_when: true
+  when: slurm_telemetry_image not in images_result.stdout
+
+- name: Deploy slurm_telemetry pod
+  command: kubectl apply -f {{ role_path }}/files/k8s_slurm_telemetry.yml
+  changed_when: true
+
+- name: Wait for slurm_telemetry pod to come to ready state
+  command: kubectl wait --for=condition=ready -n {{ namespace }} pod -l app=slurm-telemetry --timeout=4m
+  changed_when: true
+
+- name: Delete input config file
+  file:
+    path: "{{ role_path }}/files/monster/config.yml"
+    state: absent
+
+- name: Delete init k8s pod file
+  file:
+    path: "{{ role_path }}/files/init_k8s_pod.sh"
+    state: absent
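
Note: run from the role's files/ directory, the tasks above reduce to roughly this shell sequence (a sketch, with the namespace taken from the deployment manifest rather than the {{ namespace }} variable):

    if ! buildah images | grep -q slurm_telemetry; then
      buildah bud -t slurm_telemetry:latest -f Dockerfile .
    fi
    kubectl apply -f k8s_slurm_telemetry.yml
    kubectl wait --for=condition=ready -n telemetry-and-visualizations pod -l app=slurm-telemetry --timeout=4m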

+ 75 - 63
telemetry/roles/slurm_telemetry/tasks/get_node_inventory.yml

@@ -13,76 +13,88 @@
 # limitations under the License.
 ---
 
-- name: Copy slurm telemetry code
-  copy:
-    src: "{{ role_path }}/files/monster"
-    dest: "{{ slurm_telemetry_code_dir }}"
-    mode: "{{ slurm_telemetry_code_dir_mode }}"
-    
-- name: Install jmepath
-  pip:
-    name: jmespath
-    state: present
-    executable: pip3
+- name: Get inventory details
+  block:
+  - name: Copy slurm telemetry code
+    copy:
+      src: "{{ role_path }}/files/monster"
+      dest: "{{ slurm_telemetry_code_dir }}"
+      mode: "{{ slurm_telemetry_code_dir_mode }}"
 
-- name: Get AWX service IP
-  command: kubectl get svc awx-ui -n {{ awx_namespace }} -o=jsonpath='{.spec.clusterIP}'
-  changed_when: false
-  failed_when: false
-  register: awx_svc_ip
+  - name: Install sshpass
+    package:
+      name: sshpass
+      state: present
 
-- name: AWX needs to be installed
-  fail:
-    msg: "{{ awx_fail_msg }}"
-  when: not awx_svc_ip.stdout
+  - name: Install jmespath
+    pip:
+      name: jmespath
+      state: present
+      executable: pip3
 
-- name: Get AWX service port
-  command: kubectl get svc awx-ui -n {{ awx_namespace }} -o=jsonpath='{.spec.ports[0].port}'
-  changed_when: false
-  register: awx_svc_port
+  - name: Get AWX service IP
+    command: kubectl get svc awx-ui -n {{ awx_namespace }} -o=jsonpath='{.spec.clusterIP}'
+    changed_when: false
+    failed_when: false
+    register: awx_svc_ip
 
-- name: Get AWX secret
-  shell: >
-    set -o pipefail && \
-    kubectl get secret awx-admin-password -n {{ awx_namespace }} -o jsonpath="{.data.password}" | base64 --decode
-  changed_when: false
-  register: awx_secret
+  - name: AWX needs to be installed
+    fail:
+      msg: "{{ awx_fail_msg }}"
+    when: not awx_svc_ip.stdout
 
-- name: Get node_inventory id
-  shell: >
-    set -o pipefail && \
-    awx --conf.host http://{{ awx_svc_ip.stdout }}:{{ awx_svc_port.stdout }} --conf.username {{ awx_username }} \
-    --conf.password {{ awx_secret.stdout }} --conf.insecure inventory list -f human | grep node_inventory
-  changed_when: false
-  register: inventory_id
+  - name: Get AWX service port
+    command: kubectl get svc awx-ui -n {{ awx_namespace }} -o=jsonpath='{.spec.ports[0].port}'
+    changed_when: false
+    register: awx_svc_port
 
-- name: Node inventory not found in AWX
-  fail:
-    msg: "{{ node_inventory_fail_msg }}"
-  when: not inventory_id.stdout
+  - name: Get AWX secret
+    shell: >
+      set -o pipefail && \
+      kubectl get secret awx-admin-password -n {{ awx_namespace }} -o jsonpath="{.data.password}" | base64 --decode
+    changed_when: false
+    register: awx_secret
 
-- name: Get node_inventory
-  command: awx --conf.host http://{{ awx_svc_ip.stdout }}:{{ awx_svc_port.stdout }} --conf.username {{ awx_username }} \
-    --conf.password {{ awx_secret.stdout }} --conf.insecure hosts list --inventory {{ inventory_id.stdout[0] }}
-  changed_when: false
-  register: node_inventory_output
+  - name: Get node_inventory id
+    shell: >
+      set -o pipefail && \
+      awx --conf.host http://{{ awx_svc_ip.stdout }}:{{ awx_svc_port.stdout }} --conf.username {{ awx_username }} \
+      --conf.password {{ awx_secret.stdout }} --conf.insecure inventory list -f human | grep node_inventory
+    changed_when: false
+    register: inventory_id
 
-- name: Save the json data
-  set_fact:
-    node_inventory_jsondata: "{{ node_inventory_output.stdout | from_json }}"
+  - name: Node inventory not found in AWX
+    fail:
+      msg: "{{ node_inventory_fail_msg }}"
+    when: not inventory_id.stdout
 
-- name: Add temporary hosts
-  add_host:
-    name: "{{ item.name }}"
-    groups: "{{ item.summary_fields.groups.results[0].name }}"
-  with_items: "{{ node_inventory_jsondata | json_query('results') }}"
-  no_log: true
+  - name: Get node_inventory
+    command: awx --conf.host http://{{ awx_svc_ip.stdout }}:{{ awx_svc_port.stdout }} --conf.username {{ awx_username }}
+      --conf.password {{ awx_secret.stdout }} --conf.insecure hosts list --inventory {{ inventory_id.stdout[0] }}
+    changed_when: false
+    register: node_inventory_output
 
-- name: Update slurm telemetry code path
-  replace:
-    path: "{{ role_path }}/files/update_service_tags.yml"
-    regexp: '{{ item }}.*'
-    replace: "{{ item }} {{ slurm_telemetry_code_dir }}/monster/config.yml"
-  with_items:
-    - "dest:"
-    - "path:"
+  - name: Save the json data
+    set_fact:
+      node_inventory_jsondata: "{{ node_inventory_output.stdout | from_json }}"
+
+  - name: Add temporary hosts
+    add_host:
+      name: "{{ node_inventory_jsondata['results'][node_index].name }}"
+      groups: "{{ node_inventory_jsondata['results'][node_index].summary_fields.groups.results[0].name }}"
+      ansible_user: "{{ os_username }}"
+      ansible_password: "{{ provision_password }}"
+      ansible_become_pass: "{{ provision_password }}"
+      ansible_ssh_common_args: '-o StrictHostKeyChecking=no'
+    with_items: "{{ node_inventory_jsondata['results'] }}"
+    loop_control:
+      index_var: node_index
+    when: node_inventory_jsondata['results'][node_index].summary_fields.groups.count > 0
+    no_log: true
+
+  - name: Copy input_config file
+    copy:
+      src: "{{ role_path }}/files/input_config.yml"
+      dest: "{{ role_path }}/files/monster/config.yml"
+      mode: "{{ monster_config_file_mode }}"
+  when: slurm_telemetry_support
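
Note: for debugging, the AWX lookups above can be replayed by hand. The awx namespace and admin username come from the role vars; the id printed by the last command is what the tasks grep out of the inventory list.

    AWX_IP=$(kubectl get svc awx-ui -n awx -o=jsonpath='{.spec.clusterIP}')
    AWX_PORT=$(kubectl get svc awx-ui -n awx -o=jsonpath='{.spec.ports[0].port}')
    AWX_PASS=$(kubectl get secret awx-admin-password -n awx -o jsonpath="{.data.password}" | base64 --decode)
    awx --conf.host http://$AWX_IP:$AWX_PORT --conf.username admin \
        --conf.password "$AWX_PASS" --conf.insecure inventory list -f human | grep node_inventory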

+ 14 - 7
telemetry/roles/slurm_telemetry/tasks/main.yml

@@ -13,12 +13,19 @@
 # limitations under the License.
 ---
 
-- name: Include common variables
-  include_vars: ../../common/vars/main.yml
+- name: Deploy slurm-telemetry
+  block:
+  - name: Include common variables
+    include_vars: ../../common/vars/main.yml
 
-- name: Include timescaledb variables
-  include_vars: ../../timescaledb/vars/main.yml
+  - name: Include timescaledb variables
+    include_vars: ../../timescaledb/vars/main.yml
 
-- name: Prepare MonSter input file
-  include_tasks: update_timescaledb_details.yml
-  when: hostvars[groups['manager'][0]]['slurm_service']
+  - name: Prepare MonSter input file
+    include_tasks: update_timescaledb_details.yml
+    when: hostvars[groups['manager'][0]]['slurm_service']
+
+  - name: Deploy slurm telemetry
+    include_tasks: deploy_slurm_telemetry.yml
+    when: hostvars[groups['manager'][0]]['slurm_service']
+  when: slurm_telemetry_support

+ 116 - 0
telemetry/roles/slurm_telemetry/tasks/update_service_tags.yml

@@ -0,0 +1,116 @@
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Get hosts details
+  block:
+  - name: Check slurmctld service
+    systemd:
+      name: slurmctld
+    register: slurm_service_status
+
+  - name: Set fact slurm_service
+    set_fact:
+      slurm_service: False
+    delegate_to: localhost
+
+  - name: Set fact slurm_service
+    set_fact:
+      slurm_service: True
+    delegate_to: localhost
+    when: "slurm_service_status.status.ActiveState == 'active'"
+
+  - name: Assert slurmctld status
+    fail:
+      msg: "{{ slurmctld_status_fail_msg }}"
+    when: not slurm_service
+
+  - name: Prepare input config file
+    block:
+      - name: Get service tag
+        shell: >
+          set -o pipefail && \
+          dmidecode -t 1 | grep Serial
+        changed_when: false
+        register: service_tag_details
+
+      - name: Set fact service tag
+        set_fact:
+          service_tag: "{{ service_tag_details.stdout.split(':')[1].strip() }}"
+
+      - name: Get the hostname
+        command: hostname
+        register: machine_hostname
+        changed_when: false
+
+      - name: Update Head Node IP
+        replace:
+          path: "{{ role_path }}{{ monster_input_file_path }}"
+          regexp: '  ip:.*'
+          replace: "  ip: {{ groups['manager'][0] }}"
+        delegate_to: localhost
+
+      - name: Update Head Node hostname
+        replace:
+          path: "{{ role_path }}{{ monster_input_file_path }}"
+          regexp: '  headnode:.*'
+          replace: "  headnode: {{ hostvars[groups['manager'][0]]['machine_hostname'].stdout }}"
+        delegate_to: localhost
+
+      - name: Update nodes hostnames
+        lineinfile:
+          path: "{{ role_path }}{{ monster_input_file_path }}"
+          line: "  {{ machine_hostname.stdout }}: {{ inventory_hostname }}"
+          insertafter: "hostnames:"
+        delegate_to: localhost
+
+      - name: Update service tag info
+        lineinfile:
+          path: "{{ role_path }}{{ monster_input_file_path }}"
+          line: "  - Servicetag: {{ service_tag }}\n    Os_Ip_Addr: {{ inventory_hostname }}"
+          insertafter: "clusternodes:"
+        delegate_to: localhost
+
+      - name: Copy initialization file
+        copy:
+          src: "{{ role_path }}/files/init_k8s_pod_local.sh"
+          dest: "{{ role_path }}/files/init_k8s_pod.sh"
+          mode: "{{ monster_config_file_mode }}"
+
+      - name: Update manager node details in init_k8s_pod.sh
+        replace:
+          path: "{{ role_path }}/files/init_k8s_pod.sh"
+          regexp: echo 'manager_node_ip manager_node_hostname' >> /etc/hosts
+          replace: echo '{{ inventory_hostname }} {{ machine_hostname.stdout }}' >> /etc/hosts
+        delegate_to: localhost
+        when: manager_group in group_names
+
+      - name: Update manager node hostname in init_k8s_pod.sh
+        replace:
+          path: "{{ role_path }}/files/init_k8s_pod.sh"
+          regexp: ssh-keyscan -H manager_node_hostname >> /root/.ssh/known_hosts
+          replace: ssh-keyscan -H {{ machine_hostname.stdout }} >> /root/.ssh/known_hosts
+        delegate_to: localhost
+        when: manager_group in group_names
+
+      - name: Update manager node credentials in init_k8s_pod.sh
+        replace:
+          path: "{{ role_path }}/files/init_k8s_pod.sh"
+          regexp: sshpass -p 'os_passwd' ssh-copy-id 'root@manager_node_ip'
+          replace: sshpass -p "{{ hostvars['127.0.0.1']['provision_password'] }}" ssh-copy-id 'root@{{ inventory_hostname }}'
+        delegate_to: localhost
+        when: manager_group in group_names
+
+    when: hostvars[groups['manager'][0]]['slurm_service']
+  when: slurm_telemetry_support
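
Note: the service tag extraction in the tasks above is equivalent to this pipeline; the dmidecode output line "Serial Number: ABC1234" is an illustrative value only.

    dmidecode -t 1 | grep Serial | cut -d':' -f2 | xargs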

+ 6 - 6
telemetry/roles/slurm_telemetry/tasks/update_timescaledb_details.yml

@@ -25,31 +25,31 @@
 
 - name: Update timescaledb service IP
   replace:
-    path: "{{ slurm_telemetry_code_dir }}/monster/config.yml"
+    path: "{{ role_path }}{{ monster_input_file_path }}"
     regexp: "  host:.*"
     replace: "  host: {{ timescaledb_svc_ip.stdout }}"
 
 - name: Update timescaledb service port
   replace:
-    path: "{{ slurm_telemetry_code_dir }}/monster/config.yml"
+    path: "{{ role_path }}{{ monster_input_file_path }}"
     regexp: "  port:.*"
     replace: "  port: {{ timescaledb_svc_port.stdout }}"
     before: "# Slurm REST API Configuration"
 
 - name: Update timescaledb username
   replace:
-    path: "{{ slurm_telemetry_code_dir }}/monster/config.yml"
+    path: "{{ role_path }}{{ monster_input_file_path }}"
     regexp: "  username:.*"
     replace: "  username: {{ timescaledb_user }}"
 
 - name: Update timescaledb password
   replace:
-    path: "{{ slurm_telemetry_code_dir }}/monster/config.yml"
+    path: "{{ role_path }}{{ monster_input_file_path }}"
     regexp: "  password:.*"
     replace: "  password: {{ timescaledb_password }}"
 
 - name: Update timescaledb database
   replace:
-    path: "{{ slurm_telemetry_code_dir }}/monster/config.yml"
+    path: "{{ role_path }}{{ monster_input_file_path }}"
     regexp: "  database:.*"
-    replace: "  database: {{ timescaledb_name }}"
+    replace: "  database: {{ timescaledb_name }}"

+ 15 - 1
telemetry/roles/slurm_telemetry/vars/main.yml

@@ -19,4 +19,18 @@ slurm_telemetry_code_dir_mode: 0755
 awx_namespace: awx
 awx_username: admin
 awx_fail_msg: "AWX service not found. AWX needs to be installed"
-node_inventory_fail_msg: "AWX node inventory not found. Node inventory needs be created in AWX"
+node_inventory_fail_msg: "AWX node inventory not found. Node inventory needs to be created in AWX"
+os_username: root
+
+# usage: deploy_slurm_telemetry
+slurm_telemetry_image: slurm_telemetry
+slurm_telemetry_image_tag: latest
+monster_config_file_mode: 0644
+
+manager_group: manager
+compute_group: compute
+input_config_file_path: /mnt/omnia/slurm/monster/config.yml
+monster_input_file_path: /files/monster/config.yml
+
+slurmctld_status_success_msg: "slurmctld is running on manager node"
+slurmctld_status_fail_msg: "slurmctld is inactive. Please check manager node for slurm status"

+ 10 - 4
telemetry/telemetry.yml

@@ -33,9 +33,15 @@
         tasks_from: get_node_inventory.yml
       tags: slurm_telemetry
 
-- name: Update slurm node IPs and service tags
-  import_playbook: "{{ playbook_dir }}/roles/slurm_telemetry/files/update_service_tags.yml"
-  tags: slurm_telemetry
+- name: Get node details
+  hosts: manager, compute
+  gather_facts: false
+  tasks:
+    - name: Get service tag
+      include_role:
+        name: slurm_telemetry
+        tasks_from: update_service_tags.yml
+      tags: slurm_telemetry
 
 - name: Slurm Telemetry
   hosts: localhost
@@ -43,4 +49,4 @@
   gather_facts: false
   roles:
    - slurm_telemetry
-  tags: slurm_telemetry
+  tags: slurm_telemetry
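
Note: with these changes in place, the slurm telemetry path is exercised end to end via the telemetry playbook; the invocation below is a sketch, since inventory and credential handling depend on the wider Omnia setup.

    ansible-playbook telemetry/telemetry.yml --tags slurm_telemetry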