parse.py 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219
  1. """
  2. MIT License
  3. Copyright (c) 2022 Texas Tech University
  4. Permission is hereby granted, free of charge, to any person obtaining a copy
  5. of this software and associated documentation files (the "Software"), to deal
  6. in the Software without restriction, including without limitation the rights
  7. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8. copies of the Software, and to permit persons to whom the Software is
  9. furnished to do so, subject to the following conditions:
  10. The above copyright notice and this permission notice shall be included in all
  11. copies or substantial portions of the Software.
  12. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  13. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  14. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  15. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  16. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  17. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  18. SOFTWARE.
  19. """
  20. """
  21. This file is part of MonSter.
  22. Author:
  23. Jie Li, jie.li@ttu.edu
  24. """
  25. import json
  26. import logger
  27. import hostlist
  28. log = logger.get_logger(__name__)
  29. def parse_jobs_metrics(jobs_data: dict,
  30. os_idrac_hostname_mapping: dict):
  31. """parse_jobs_metrics Parse Jobs Metrics
  32. Parse jobs metrics get from Slurm API
  33. Args:
  34. jobs_data (dict): Job data get from Slurm APi
  35. os_idrac_hostname_mapping (dict): OS-iDRAC hostname mapping
  36. Returns:
  37. list: Parsed jobs info
  38. """
  39. jobs_metrics = []
  40. all_jobs = jobs_data['jobs']
  41. attributes = ['job_id', 'array_job_id', 'array_task_id', 'name','job_state',
  42. 'user_id', 'user_name', 'group_id', 'cluster', 'partition',
  43. 'command', 'current_working_directory', 'batch_flag',
  44. 'batch_host', 'nodes', 'node_count', 'cpus', 'tasks',
  45. 'tasks_per_node', 'cpus_per_task', 'memory_per_node',
  46. 'memory_per_cpu', 'priority', 'time_limit', 'deadline',
  47. 'submit_time', 'preempt_time', 'suspend_time',
  48. 'eligible_time', 'start_time', 'end_time', 'resize_time',
  49. 'restart_cnt', 'exit_code', 'derived_exit_code']
  50. for job in all_jobs:
  51. nodes = job['nodes']
  52. hostnames = hostlist.expand_hostlist(nodes)
  53. # Mapping OS hostnames to iDRAC hostnames.
  54. if os_idrac_hostname_mapping:
  55. try:
  56. hostnames = [os_idrac_hostname_mapping[i] for i in hostnames]
  57. except Exception as err:
  58. log.error(f"Cannot mapping OS-iDRAC hostname: {err}")
  59. metrics = []
  60. for attribute in attributes:
  61. if attribute == 'nodes':
  62. metrics.append(hostnames)
  63. else:
  64. # Some attributes values are larger than 2147483647, which is
  65. # not INT4, and cannot saved in TSDB
  66. if type(job[attribute]) is int and job[attribute] > 2147483647:
  67. metrics.append(2147483647)
  68. else:
  69. metrics.append(job[attribute])
  70. tuple_metrics = tuple(metrics)
  71. jobs_metrics.append(tuple_metrics)
  72. return jobs_metrics
  73. def parse_node_metrics(nodes_data: dict,
  74. node_id_mapping: dict,
  75. os_idrac_hostname_mapping: dict):
  76. """parse_node_metrics Parse Node Metircs
  77. Parse Nodes metrics get from Slurm API
  78. Args:
  79. nodes_data (dict): Nodes data get from Slurm APi
  80. node_id_mapping (dict): Node-Id mapping
  81. os_idrac_hostname_mapping (dict): OS-iDRAC hostname mapping
  82. Returns:
  83. dict: Parsed node metrics
  84. """
  85. all_node_metrics = {}
  86. state_mapping = {
  87. 'allocated': 1,
  88. 'idle':0,
  89. 'down': -1
  90. }
  91. all_nodes = nodes_data['nodes']
  92. for node in all_nodes:
  93. hostname = node['hostname']
  94. # Mapping the OS hostname to iDRAC hostname. The hostname in
  95. # node_id_mapping is using iDRAC hostname.
  96. if os_idrac_hostname_mapping:
  97. try:
  98. hostname = os_idrac_hostname_mapping[hostname]
  99. except Exception as err:
  100. log.error(f"Cannot map OS-hostname and IP: {err}")
  101. # Only process those nodes that are in node_id_mapping dict.
  102. if hostname in node_id_mapping:
  103. node_id = node_id_mapping[hostname]
  104. # CPU load
  105. cpu_load = int(node['cpu_load'])
  106. # Some down nodes report cpu_load large than 2147483647, which is
  107. # not INT4 and cannot saved in TSDB
  108. if cpu_load > 2147483647:
  109. cpu_load = 2147483647
  110. # Memory usage
  111. free_memory = node['free_memory']
  112. real_memory = node['real_memory']
  113. memory_usage = ((real_memory - free_memory)/real_memory) * 100
  114. memory_used = real_memory - free_memory
  115. f_memory_usage = float("{:.2f}".format(memory_usage))
  116. # Status
  117. state = node['state']
  118. f_state = state_mapping[state]
  119. node_data = {
  120. 'cpu_load': cpu_load,
  121. 'memoryusage': f_memory_usage,
  122. 'memory_used': memory_used,
  123. 'state': f_state
  124. }
  125. all_node_metrics.update({
  126. node_id: node_data
  127. })
  128. return all_node_metrics
  129. def parse_node_jobs(jobs_metrics: dict,
  130. node_id_mapping:dict,
  131. os_idrac_hostname_mapping: dict):
  132. """parse_node_jobs Parse Node-Jobs
  133. Parse nodes-job correlation
  134. Args:
  135. jobs_metrics (dict): Job metrics get from Slurm APi
  136. node_id_mapping (dict): Node-Id mapping
  137. os_idrac_hostname_mapping (dict): OS-iDRAC hostname mapping
  138. Returns:
  139. dict: node-jobs correlation
  140. """
  141. node_jobs = {}
  142. all_jobs = jobs_metrics['jobs']
  143. # Get job-nodes correlation
  144. job_nodes = {}
  145. for job in all_jobs:
  146. valid_flag = True
  147. if job['job_state'] == "RUNNING":
  148. job_id = job['job_id']
  149. nodes = job['nodes']
  150. # Get node ids
  151. hostnames = hostlist.expand_hostlist(nodes)
  152. # Mapping OS hostnames to iDRAC hostnames.
  153. if os_idrac_hostname_mapping:
  154. try:
  155. hostnames = [os_idrac_hostname_mapping[i] for i in hostnames]
  156. except Exception as err:
  157. log.error(f"Cannot mapping OS-iDRAC hostname: {err}")
  158. # Check if hostname is in node_id_mapping.
  159. # If not, ignore this job info.
  160. for hostname in hostnames:
  161. if hostname not in node_id_mapping:
  162. valid_flag = False
  163. break
  164. if valid_flag:
  165. node_ids = [node_id_mapping[i] for i in hostnames]
  166. node_ids.sort()
  167. # Get cpu counts for each node
  168. allocated_nodes = job['job_resources']['allocated_nodes']
  169. cpu_counts = [resource['cpus'] for node, resource in allocated_nodes.items()]
  170. job_nodes.update({
  171. job_id: {
  172. 'nodes': node_ids,
  173. 'cpus': cpu_counts
  174. }
  175. })
  176. # Get nodes-job correlation
  177. for job, nodes_cpus in job_nodes.items():
  178. for i, node in enumerate(nodes_cpus['nodes']):
  179. if node not in node_jobs:
  180. node_jobs.update({
  181. node: {
  182. 'jobs':[job],
  183. 'cpus':[nodes_cpus['cpus'][i]]
  184. }
  185. })
  186. else:
  187. node_jobs[node]['jobs'].append(job)
  188. node_jobs[node]['cpus'].append(nodes_cpus['cpus'][i])
  189. return node_jobs