- """
- MIT License
- Copyright (c) 2022 Texas Tech University
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
- """
- """
- This file is part of MonSter.
- Author:
- Jie Li, jie.li@ttu.edu
- """
import json
import logger
import hostlist

log = logger.get_logger(__name__)

def parse_jobs_metrics(jobs_data: dict,
                       os_idrac_hostname_mapping: dict):
    """parse_jobs_metrics Parse Jobs Metrics

    Parse the jobs metrics retrieved from the Slurm API.

    Args:
        jobs_data (dict): job data retrieved from the Slurm API
        os_idrac_hostname_mapping (dict): OS-iDRAC hostname mapping

    Returns:
        list: parsed jobs info, one tuple per job
    """
    jobs_metrics = []
    all_jobs = jobs_data['jobs']
    attributes = ['job_id', 'array_job_id', 'array_task_id', 'name', 'job_state',
                  'user_id', 'user_name', 'group_id', 'cluster', 'partition',
                  'command', 'current_working_directory', 'batch_flag',
                  'batch_host', 'nodes', 'node_count', 'cpus', 'tasks',
                  'tasks_per_node', 'cpus_per_task', 'memory_per_node',
                  'memory_per_cpu', 'priority', 'time_limit', 'deadline',
                  'submit_time', 'preempt_time', 'suspend_time',
                  'eligible_time', 'start_time', 'end_time', 'resize_time',
                  'restart_cnt', 'exit_code', 'derived_exit_code']

    for job in all_jobs:
        nodes = job['nodes']
        hostnames = hostlist.expand_hostlist(nodes)
        # Map OS hostnames to iDRAC hostnames.
        if os_idrac_hostname_mapping:
            try:
                hostnames = [os_idrac_hostname_mapping[i] for i in hostnames]
            except Exception as err:
                log.error(f"Cannot map OS hostname to iDRAC hostname: {err}")
        metrics = []
        for attribute in attributes:
            if attribute == 'nodes':
                metrics.append(hostnames)
            else:
                # Some attribute values are larger than 2147483647, which
                # does not fit in INT4 and cannot be saved in TSDB.
                if type(job[attribute]) is int and job[attribute] > 2147483647:
                    metrics.append(2147483647)
                else:
                    metrics.append(job[attribute])
        tuple_metrics = tuple(metrics)
        jobs_metrics.append(tuple_metrics)

    return jobs_metrics
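
# Example call (illustration only; a real payload comes from the Slurm REST
# API, each job record carries every key listed in `attributes` above, and
# the hostname mapping below is hypothetical):
#
#   mapping = {'cpu-1': 'idrac-cpu-1', 'cpu-2': 'idrac-cpu-2'}
#   rows = parse_jobs_metrics(jobs_data, mapping)
#   # Each row is a tuple whose fields follow the `attributes` order, with
#   # the 'nodes' field replaced by the expanded (and mapped) hostname list.
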

def parse_node_metrics(nodes_data: dict,
                       node_id_mapping: dict,
                       os_idrac_hostname_mapping: dict):
    """parse_node_metrics Parse Node Metrics

    Parse the node metrics retrieved from the Slurm API.

    Args:
        nodes_data (dict): node data retrieved from the Slurm API
        node_id_mapping (dict): node-id mapping
        os_idrac_hostname_mapping (dict): OS-iDRAC hostname mapping

    Returns:
        dict: parsed node metrics keyed by node id
    """
    all_node_metrics = {}
    state_mapping = {
        'allocated': 1,
        'idle': 0,
        'down': -1
    }
    all_nodes = nodes_data['nodes']
    for node in all_nodes:
        hostname = node['hostname']
        # Map the OS hostname to the iDRAC hostname. The hostnames in
        # node_id_mapping are iDRAC hostnames.
        if os_idrac_hostname_mapping:
            try:
                hostname = os_idrac_hostname_mapping[hostname]
            except Exception as err:
                log.error(f"Cannot map OS hostname to iDRAC hostname: {err}")
        # Only process nodes that are in the node_id_mapping dict.
        if hostname in node_id_mapping:
            node_id = node_id_mapping[hostname]
            # CPU load. Some down nodes report a cpu_load larger than
            # 2147483647, which does not fit in INT4 and cannot be saved
            # in TSDB.
            cpu_load = int(node['cpu_load'])
            if cpu_load > 2147483647:
                cpu_load = 2147483647
            # Memory usage
            free_memory = node['free_memory']
            real_memory = node['real_memory']
            memory_used = real_memory - free_memory
            memory_usage = (memory_used / real_memory) * 100
            f_memory_usage = float("{:.2f}".format(memory_usage))
            # State
            state = node['state']
            f_state = state_mapping[state]
            node_data = {
                'cpu_load': cpu_load,
                'memoryusage': f_memory_usage,
                'memory_used': memory_used,
                'state': f_state
            }
            all_node_metrics.update({
                node_id: node_data
            })
    return all_node_metrics
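
# Example call (illustration only; the node record and mappings below are
# hypothetical, trimmed to the fields this function reads):
#
#   nodes_data = {'nodes': [{'hostname': 'cpu-1', 'cpu_load': 52,
#                            'free_memory': 96000, 'real_memory': 192000,
#                            'state': 'allocated'}]}
#   parse_node_metrics(nodes_data, {'cpu-1': 1}, {})
#   # -> {1: {'cpu_load': 52, 'memoryusage': 50.0,
#   #         'memory_used': 96000, 'state': 1}}
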

def parse_node_jobs(jobs_metrics: dict,
                    node_id_mapping: dict,
                    os_idrac_hostname_mapping: dict):
    """parse_node_jobs Parse Node-Jobs

    Parse the node-jobs correlation from the job metrics.

    Args:
        jobs_metrics (dict): job metrics retrieved from the Slurm API
        node_id_mapping (dict): node-id mapping
        os_idrac_hostname_mapping (dict): OS-iDRAC hostname mapping

    Returns:
        dict: node-jobs correlation
    """
    node_jobs = {}
    all_jobs = jobs_metrics['jobs']

    # Get the job-nodes correlation.
    job_nodes = {}
    for job in all_jobs:
        valid_flag = True
        if job['job_state'] == "RUNNING":
            job_id = job['job_id']
            nodes = job['nodes']
            # Get node ids
            hostnames = hostlist.expand_hostlist(nodes)
            # Map OS hostnames to iDRAC hostnames.
            if os_idrac_hostname_mapping:
                try:
                    hostnames = [os_idrac_hostname_mapping[i] for i in hostnames]
                except Exception as err:
                    log.error(f"Cannot map OS hostname to iDRAC hostname: {err}")

            # Check whether each hostname is in node_id_mapping.
            # If not, ignore this job's info.
            for hostname in hostnames:
                if hostname not in node_id_mapping:
                    valid_flag = False
                    break

            if valid_flag:
                node_ids = [node_id_mapping[i] for i in hostnames]
                node_ids.sort()
                # Get the CPU count for each node.
                allocated_nodes = job['job_resources']['allocated_nodes']
                cpu_counts = [resource['cpus'] for node, resource in allocated_nodes.items()]
                job_nodes.update({
                    job_id: {
                        'nodes': node_ids,
                        'cpus': cpu_counts
                    }
                })

    # Get the node-jobs correlation.
    for job, nodes_cpus in job_nodes.items():
        for i, node in enumerate(nodes_cpus['nodes']):
            if node not in node_jobs:
                node_jobs.update({
                    node: {
                        'jobs': [job],
                        'cpus': [nodes_cpus['cpus'][i]]
                    }
                })
            else:
                node_jobs[node]['jobs'].append(job)
                node_jobs[node]['cpus'].append(nodes_cpus['cpus'][i])
    return node_jobs
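
if __name__ == "__main__":
    # Minimal smoke test with hypothetical data (illustration only; in MonSter
    # the payload comes from the Slurm REST API and node_id_mapping is built
    # from the monitored cluster's configuration).
    sample_node_id_mapping = {'node1': 1, 'node2': 2}
    sample_jobs = {
        'jobs': [
            {'job_id': 101, 'job_state': 'RUNNING', 'nodes': 'node[1-2]',
             'job_resources': {'allocated_nodes': {'0': {'cpus': 8},
                                                   '1': {'cpus': 8}}}}
        ]
    }
    # Expected output:
    # {1: {'jobs': [101], 'cpus': [8]}, 2: {'jobs': [101], 'cpus': [8]}}
    print(parse_node_jobs(sample_jobs, sample_node_id_mapping, {}))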