slurm.py 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151
  1. """
  2. MIT License
  3. Copyright (c) 2022 Texas Tech University
  4. Permission is hereby granted, free of charge, to any person obtaining a copy
  5. of this software and associated documentation files (the "Software"), to deal
  6. in the Software without restriction, including without limitation the rights
  7. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8. copies of the Software, and to permit persons to whom the Software is
  9. furnished to do so, subject to the following conditions:
  10. The above copyright notice and this permission notice shall be included in all
  11. copies or substantial portions of the Software.
  12. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  13. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  14. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  15. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  16. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  17. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  18. SOFTWARE.
  19. """
  20. """
  21. This file is part of MonSter.
  22. Author:
  23. Jie Li, jie.li@ttu.edu
  24. """
  25. import time
  26. import json
  27. import logger
  28. import requests
  29. import subprocess
  30. from requests.adapters import HTTPAdapter
  31. log = logger.get_logger(__name__)
  def read_slurm_token(slurm_config: dict):
      """read_slurm_token Read Slurm token

      Read the cached token record from ``./token.json``. The cached token is
      reused while it is less than 3600 seconds old; if it is out of date, or
      the file is missing or malformed, a new token is requested from Slurm
      via ``get_slurm_token``.

      Args:
          slurm_config (dict): Slurm Configuration

      Returns:
          str: Slurm JWT token
      """
      try:
          with open('./token.json', 'r') as f:
              token_record = json.load(f)
          token_age = int(time.time()) - token_record['time']
          cached_token = token_record['token']
      except (OSError, ValueError, LookupError, TypeError):
          # Missing/unreadable file (OSError), invalid JSON (ValueError) or a
          # record without the expected fields (LookupError/TypeError):
          # fall back to a fresh token. KeyboardInterrupt and SystemExit are
          # deliberately not intercepted.
          return get_slurm_token(slurm_config)

      if token_age >= 3600:
          # Cached token is out of date.
          return get_slurm_token(slurm_config)
      return cached_token
  def get_slurm_token(slurm_config: dict):
      """get_slurm_token Get Slurm Token

      Get JWT token from Slurm by running ``scontrol token lifespan=3600`` on
      the head node over ssh. This requires the public key on this node to be
      added to the target cluster headnode. The token and the time it was
      obtained are written to ``./token.json``.

      Any failure (ssh error, unexpected output, file write error) is printed
      and the request is retried every 60 seconds until it succeeds, so this
      function blocks until a token is available.

      Args:
          slurm_config (dict): Slurm Configuration. ``slurm_config['headnode']``
              is the ssh destination.

      Returns:
          str: token
      """
      import shlex

      while True:
          try:
              # Setting command parameters
              slurm_headnode = slurm_config['headnode']
              print("Get a new token...")
              # Build an argument list instead of a shell string so the head
              # node value is never interpreted by a local shell. shlex.split
              # keeps the previous word splitting (e.g. "-p 2222 user@host").
              command = ["ssh", *shlex.split(slurm_headnode),
                         "scontrol token lifespan=3600"]
              # check=True turns a failing ssh/scontrol into an exception that
              # is reported and retried below.
              completed = subprocess.run(command, stdout=subprocess.PIPE,
                                         check=True)
              rtn_str = completed.stdout.decode('utf-8')
              # The first output line is expected to be "<NAME>=<token>".
              # Split on the first '=' only so a token containing '=' is kept
              # intact.
              _, separator, token = rtn_str.splitlines()[0].partition('=')
              token = token.strip()
              if not separator or not token:
                  raise ValueError(f"Unexpected scontrol output: {rtn_str!r}")
              token_record = {
                  'time': int(time.time()),
                  'token': token
              }
              with open('./token.json', 'w') as f:
                  json.dump(token_record, f, indent=4)
              return token
          except Exception as err:
              # Best effort: report the cause and keep trying.
              print(f"Get Slurm token error: {err!r}. Try in 60s.")
              time.sleep(60)
  def call_slurm_api(slurm_config: dict, token: str, url: str,
                     timeout: float = 30.0):
      """call_slurm_api Call Slurm API

      Call Slurm API and get the data from the specified url. Request and
      JSON decoding errors are logged and an empty dict is returned instead
      of raising.

      Args:
          slurm_config (dict): Slurm Configuration. ``slurm_config['user']``
              is sent as the ``X-SLURM-USER-NAME`` header.
          token (str): Slurm JWT token, sent as ``X-SLURM-USER-TOKEN``.
          url (str): Url of Slurm API
          timeout (float): Seconds to wait for the server before giving up.
              Pass ``None`` to wait indefinitely (the previous behaviour).

      Returns:
          dict: slurm metrics (the decoded JSON body), or ``{}`` on failure.
      """
      metrics = {}
      headers = {"X-SLURM-USER-NAME": slurm_config['user'],
                 "X-SLURM-USER-TOKEN": token}
      adapter = HTTPAdapter(max_retries=3)
      with requests.Session() as session:
          session.mount(url, adapter)
          try:
              # Without a timeout a stalled server would block the caller
              # forever; a timeout surfaces as a logged error instead.
              response = session.get(url, headers=headers, timeout=timeout)
              metrics = response.json()
          except Exception as err:
              log.error(f"Fetch slurm metrics error: {err}")
      return metrics
  def get_slurm_url(slurm_config: dict, url_type: str):
      """get_slurm_url Get Slurm Url

      Build the url for reading nodes or jobs info from slurm. The url is
      ``http://<ip>:<port>`` followed by the path stored under
      ``slurm_nodes`` or ``slurm_jobs`` in the configuration.

      Args:
          slurm_config (dict): Slurm Configuration
          url_type (str): Url type. nodes or jobs

      Returns:
          str: the requested url

      Raises:
          ValueError: if url_type is neither 'nodes' nor 'jobs'
      """
      host, port = slurm_config['ip'], slurm_config['port']
      allowed_types = ['nodes', 'jobs']
      if url_type not in allowed_types:
          raise ValueError(f"Invalid url type. Expected one of: {allowed_types}")
      # Only the path entry for the requested type is read from the config.
      path_key = 'slurm_nodes' if url_type == 'nodes' else 'slurm_jobs'
      return f"http://{host}:{port}{slurm_config[path_key]}"