spank.h 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447
  1. /*****************************************************************************\
  2. * spank.h - Stackable Plug-in Architecture for Node job Kontrol
  3. *****************************************************************************
  4. * Copyright (C) 2002-2007 The Regents of the University of California.
  5. * Copyright (C) 2008-2010 Lawrence Livermore National Security.
  6. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  7. * CODE-OCEC-09-009. All rights reserved.
  8. *
  9. * This file is part of Slurm, a resource management program.
  10. * For details, see <https://slurm.schedmd.com/>.
  11. * Please also read the included file: DISCLAIMER.
  12. *
  13. * Slurm is free software; you can redistribute it and/or modify it under
  14. * the terms of the GNU General Public License as published by the Free
  15. * Software Foundation; either version 2 of the License, or (at your option)
  16. * any later version.
  17. *
  18. * In addition, as a special exception, the copyright holders give permission
  19. * to link the code of portions of this program with the OpenSSL library under
  20. * certain conditions as described in each individual source file, and
  21. * distribute linked combinations including the two. You must obey the GNU
  22. * General Public License in all respects for all of the code used other than
  23. * OpenSSL. If you modify file(s) with this exception, you may extend this
  24. * exception to your version of the file(s), but you are not obligated to do
  25. * so. If you do not wish to do so, delete this exception statement from your
  26. * version. If you delete this exception statement from all source files in
  27. * the program, then also delete it here.
  28. *
  29. * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
  30. * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  31. * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  32. * details.
  33. *
  34. * You should have received a copy of the GNU General Public License along
  35. * with Slurm; if not, write to the Free Software Foundation, Inc.,
  36. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  37. \*****************************************************************************/
  38. #ifndef SPANK_H
  39. #define SPANK_H
  40. /* SPANK handle. Plug-in's context for running Slurm job. It is a pointer to struct spank_handle, which is not defined in this header.
  41. */
  42. typedef struct spank_handle * spank_t;
  43. /* Prototype for all spank plugin operations: each takes the spank handle, an argument count (ac) and an argument vector (argv), and returns an int.
  44. */
  45. typedef int (spank_f) (spank_t spank, int ac, char *argv[]);
  46. /* SPANK plugin operations. SPANK plugin should have at least one of
  47. * these functions defined non-NULL.
  48. *
  49. * Plug-in callbacks are completed at the following points in slurmd:
  50. *
  51. * slurmd
  52. * `-> init()
  53. * |
  54. * `-> job_prolog()
  55. * |
  56. * | `-> slurmstepd
  57. * | `-> init ()
  58. * | -> process spank options
  59. * | -> init_post_opt ()
  60. * | + drop privileges (initgroups(), seteuid(), chdir())
  61. * | `-> user_init ()
  62. * | + for each task
  63. * | | + fork ()
  64. * | | |
  65. * | | + reclaim privileges
  66. * | | `-> task_init_privileged ()
  67. * | | |
  68. * | | + become_user ()
  69. * | | `-> task_init ()
  70. * | | |
  71. * | | + execve ()
  72. * | |
  73. * | + reclaim privileges
  74. * | + for each task
  75. * | | `-> task_post_fork ()
  76. * | |
  77. * | + for each task
  78. * | | + wait ()
  79. * | | `-> task_exit ()
  80. * | `-> exit ()
  81. * |
  82. * `---> job_epilog()
  83. * |
  84. * `-> slurmd_exit()
  85. *
  86. * In srun, only the init(), init_post_opt(), local_user_init(), and exit()
  87. * callbacks are used.
  88. *
  89. * In sbatch/salloc only the init(), init_post_opt(), and exit() callbacks
  90. * are used.
  91. *
  92. * In slurmd proper, only the init(), slurmd_exit(), and
  93. * job_prolog/epilog callbacks are used.
  94. *
  95. */
  96. extern spank_f slurm_spank_init; /* slurmd, slurmstepd, srun, sbatch/salloc */
  97. extern spank_f slurm_spank_job_prolog; /* slurmd: job prolog */
  98. extern spank_f slurm_spank_init_post_opt; /* slurmstepd (after spank options are processed), srun, sbatch/salloc */
  99. extern spank_f slurm_spank_local_user_init; /* srun */
  100. extern spank_f slurm_spank_user_init; /* slurmstepd: after privileges are dropped */
  101. extern spank_f slurm_spank_task_init_privileged; /* slurmstepd: per task, after fork () with privileges reclaimed */
  102. extern spank_f slurm_spank_task_init; /* slurmstepd: per task, after become_user () and before execve () */
  103. extern spank_f slurm_spank_task_post_fork; /* slurmstepd: per task, after privileges are reclaimed */
  104. extern spank_f slurm_spank_task_exit; /* slurmstepd: per task, after wait () */
  105. extern spank_f slurm_spank_job_epilog; /* slurmd: job epilog */
  106. extern spank_f slurm_spank_slurmd_exit; /* slurmd */
  107. extern spank_f slurm_spank_exit; /* slurmstepd, srun, sbatch/salloc */
  108. /* Items which may be obtained from the spank handle using the
  109. * spank_get_item () call. The expected list of variable arguments may
  110. * be found in the comments below.
  111. *
  112. * For example, S_JOB_NCPUS takes (uint16_t *), a pointer to uint16_t, so
  113. * the get item call would look like:
  114. *
  115. * uint16_t ncpus;
  116. * spank_err_t rc = spank_get_item (spank, S_JOB_NCPUS, &ncpus);
  117. *
  118. * while S_JOB_PID_TO_GLOBAL_ID takes (pid_t, uint32_t *), so it would
  119. * be called as:
  120. *
  121. * uint32_t global_id;
  122. * spank_err_t rc;
  123. * rc = spank_get_item (spank, S_JOB_PID_TO_GLOBAL_ID, pid, &global_id);
  124. */
  125. enum spank_item {
  126. S_JOB_UID, /* User id (uid_t *) */
  127. S_JOB_GID, /* Primary group id (gid_t *) */
  128. S_JOB_ID, /* Slurm job id (uint32_t *) */
  129. S_JOB_STEPID, /* Slurm job step id (uint32_t *) */
  130. S_JOB_NNODES, /* Total number of nodes in job (uint32_t *) */
  131. S_JOB_NODEID, /* Relative id of this node (uint32_t *) */
  132. S_JOB_LOCAL_TASK_COUNT, /* Number of local tasks (uint32_t *) */
  133. S_JOB_TOTAL_TASK_COUNT, /* Total number of tasks in job (uint32_t *) */
  134. S_JOB_NCPUS, /* Number of CPUs used by this job (uint16_t *) */
  135. S_JOB_ARGV, /* Command args (int *, char ***) */
  136. S_JOB_ENV, /* Job env array (char ***) */
  137. S_TASK_ID, /* Local task id (int *) */
  138. S_TASK_GLOBAL_ID, /* Global task id (uint32_t *) */
  139. S_TASK_EXIT_STATUS, /* Exit status of task if exited (int *) */
  140. S_TASK_PID, /* Task pid (pid_t *) */
  141. S_JOB_PID_TO_GLOBAL_ID, /* Global task id from pid (pid_t, uint32_t *) */
  142. S_JOB_PID_TO_LOCAL_ID, /* Local task id from pid (pid_t, uint32_t *) */
  143. S_JOB_LOCAL_TO_GLOBAL_ID,/* Local id to global id (uint32_t, uint32_t *) */
  144. S_JOB_GLOBAL_TO_LOCAL_ID,/* Global id to local id (uint32_t, uint32_t *) */
  145. S_JOB_SUPPLEMENTARY_GIDS,/* Array of supplementary gids (gid_t **, int *) */
  146. S_SLURM_VERSION, /* Current Slurm version (char **) */
  147. S_SLURM_VERSION_MAJOR, /* Slurm version major release (char **) */
  148. S_SLURM_VERSION_MINOR, /* Slurm version minor release (char **) */
  149. S_SLURM_VERSION_MICRO, /* Slurm version micro release (char **) */
  150. S_STEP_CPUS_PER_TASK, /* CPUs allocated per task; =1 if the --overcommit
  151. * option is used (uint32_t *) */
  152. S_JOB_ALLOC_CORES, /* Job allocated cores in list format (char **) */
  153. S_JOB_ALLOC_MEM, /* Job allocated memory in MB (uint64_t *) */
  154. S_STEP_ALLOC_CORES, /* Step alloc'd cores in list format (char **) */
  155. S_STEP_ALLOC_MEM, /* Step alloc'd memory in MB (uint64_t *) */
  156. S_SLURM_RESTART_COUNT, /* Job restart count (uint32_t *) */
  157. S_JOB_ARRAY_ID, /* Slurm job array id, or 0 (uint32_t *) */
  158. S_JOB_ARRAY_TASK_ID, /* Slurm job array task id (uint32_t *) */
  159. };
  160. typedef enum spank_item spank_item_t;
  161. /* SPANK error codes. spank_strerror () returns the string representation of a code.
  162. */
  163. enum spank_err {
  164. ESPANK_SUCCESS = 0, /* Success. */
  165. ESPANK_ERROR = 1, /* Generic error. */
  166. ESPANK_BAD_ARG = 2, /* Bad argument. */
  167. ESPANK_NOT_TASK = 3, /* Not in task context. */
  168. ESPANK_ENV_EXISTS = 4, /* Environment variable exists and overwrite == 0 */
  169. ESPANK_ENV_NOEXIST = 5, /* No such environment variable */
  170. ESPANK_NOSPACE = 6, /* Buffer too small. */
  171. ESPANK_NOT_REMOTE = 7, /* Function may only be called in remote context */
  172. ESPANK_NOEXIST = 8, /* Id/pid doesn't exist on this node */
  173. ESPANK_NOT_EXECD = 9, /* Lookup by pid requested, but no tasks running */
  174. ESPANK_NOT_AVAIL = 10,/* SPANK item not available from this callback */
  175. ESPANK_NOT_LOCAL = 11,/* Function only valid in local/alloc context */
  176. };
  177. typedef enum spank_err spank_err_t;
  178. /*
  179. * SPANK plugin context, as returned by spank_context ()
  180. */
  181. enum spank_context {
  182. S_CTX_ERROR, /* Error obtaining current context */
  183. S_CTX_LOCAL, /* Local context (srun) */
  184. S_CTX_REMOTE, /* Remote context (slurmstepd) */
  185. S_CTX_ALLOCATOR, /* Allocator context (sbatch/salloc) */
  186. S_CTX_SLURMD, /* slurmd context */
  187. S_CTX_JOB_SCRIPT /* prolog/epilog context */
  188. };
  189. #define HAVE_S_CTX_SLURMD 1 /* slurmd context (S_CTX_SLURMD) supported */
  190. #define HAVE_S_CTX_JOB_SCRIPT 1 /* job script (prolog/epilog) context (S_CTX_JOB_SCRIPT) supported */
  191. typedef enum spank_context spank_context_t;
  192. /*
  193. * SPANK plugin options
  194. */
  195. /*
  196. * SPANK option callback. `val' is an integer value provided by
  197. * the plugin to distinguish between plugin-local options, `optarg'
  198. * is an argument passed by the user (if applicable), and `remote'
  199. * specifies whether this call is being made locally (e.g. in srun)
  200. * or remotely (e.g. in slurmstepd/slurmd).
  201. */
  202. typedef int (*spank_opt_cb_f) (int val, const char *optarg, int remote);
  203. struct spank_option {
  204. char * name; /* long option provided by plugin (maximum length: SPANK_OPTION_MAXLEN) */
  205. char * arginfo; /* one word description of argument if required */
  206. char * usage; /* Usage text */
  207. int has_arg; /* Does option require argument? */
  208. int val; /* plugin-provided value handed to the callback as `val' */
  209. spank_opt_cb_f cb; /* Callback function to check option value */
  210. };
  211. /*
  212. * Plugins may export a spank_options option table as symbol "spank_options".
  213. * This method only works in "local" and "remote" mode. To register options
  214. * in "allocator" mode (sbatch/salloc), use the preferred
  215. * spank_option_register function described below.
  216. */
  217. extern struct spank_option spank_options [];
  218. /*
  219. * SPANK plugin option table must end with the following entry:
  220. */
  221. #define SPANK_OPTIONS_TABLE_END { NULL, NULL, NULL, 0, 0, NULL }
  222. /*
  223. * Maximum allowed length of SPANK option name:
  224. */
  225. #define SPANK_OPTION_MAXLEN 75
  226. /* SPANK interface prototypes
  227. */
  228. #ifdef __cplusplus
  229. extern "C" {
  230. #endif
  231. /*
  232. * Return the string representation of a spank_err_t error code.
  233. */
  234. const char *spank_strerror (spank_err_t err);
  235. /*
  236. * Determine whether a given spank plugin symbol is supported
  237. * in this version of SPANK interface.
  238. *
  239. * Returns:
  240. * = 1 The symbol is supported
  241. * = 0 The symbol is not supported
  242. * = -1 Invalid argument
  243. */
  244. int spank_symbol_supported (const char *symbol);
  245. /*
  246. * Determine whether plugin is loaded in "remote" context
  247. *
  248. * Returns:
  249. * = 1 remote context, i.e. plugin is loaded in slurmstepd.
  250. * = 0 not remote context
  251. * < 0 spank handle was not valid.
  252. */
  253. int spank_remote (spank_t spank);
  254. /*
  255. * Return the context in which the calling plugin is loaded.
  256. *
  257. * Returns the spank_context for the calling plugin, or S_CTX_ERROR
  258. * if the current context cannot be determined.
  259. */
  260. spank_context_t spank_context (void);
  261. /*
  262. * Register a plugin-provided option dynamically. This function
  263. * is only valid when called from slurm_spank_init(), and must
  264. * be guaranteed to be called in all contexts in which it is
  265. * used (local, remote, allocator).
  266. *
  267. * This function is the only method to register options in
  268. * allocator context.
  269. *
  270. * May be called multiple times to register many options.
  271. *
  272. * Returns ESPANK_SUCCESS on successful registration of the option
  273. * or ESPANK_BAD_ARG if not called from slurm_spank_init().
  274. */
  275. spank_err_t spank_option_register (spank_t spank, struct spank_option *opt);
  276. /*
  277. * Check whether spank plugin option [opt] has been activated.
  278. * If the option takes an argument, then the option argument
  279. * (if found) will be returned in *optarg.
  280. * This function can be invoked from the following functions:
  281. * slurm_spank_job_prolog, slurm_spank_local_user_init, slurm_spank_user_init,
  282. * slurm_spank_task_init_privileged, slurm_spank_task_init,
  283. * slurm_spank_task_exit, and slurm_spank_job_epilog.
  284. *
  285. * Returns
  286. * ESPANK_SUCCESS if the option was used by user. In this case
  287. * *optarg will contain the option argument if opt->has_arg != 0.
  288. * ESPANK_ERROR if the option wasn't used.
  289. * ESPANK_BAD_ARG if an invalid argument was passed to the function,
  290. * such as NULL opt, NULL opt->name, or NULL optarg when opt->has_arg != 0.
  291. * ESPANK_NOT_AVAIL if called from improper context.
  292. */
  293. spank_err_t spank_option_getopt (spank_t spank, struct spank_option *opt,
  294. char **optarg);
  295. /* Get the value for the current job or task item specified,
  296. * storing the result in the subsequent pointer argument(s).
  297. * Refer to the spank_item_t comments for argument types.
  298. * For S_JOB_ARGV, S_JOB_ENV, and S_SLURM_VERSION* items
  299. * the result returned to the caller should not be freed or
  300. * modified.
  301. *
  302. * Returns ESPANK_SUCCESS on success, ESPANK_NOT_TASK if an S_TASK*
  303. * item is requested from outside a task context, ESPANK_BAD_ARG
  304. * if invalid args are passed to spank_get_item or spank_get_item
  305. * is called from an invalid context, and ESPANK_NOT_REMOTE
  306. * if not called from slurmstepd context or slurm_spank_local_user_init.
  307. */
  308. spank_err_t spank_get_item (spank_t spank, spank_item_t item, ...);
  309. /* Place a copy of environment variable "var" from the job's environment
  310. * into buffer "buf" of size "len".
  311. *
  312. * Returns ESPANK_SUCCESS on success, o/w spank_err_t on failure:
  313. * ESPANK_BAD_ARG = spank handle invalid or len < 0.
  314. * ESPANK_ENV_NOEXIST = environment variable doesn't exist in job's env.
  315. * ESPANK_NOSPACE = buffer too small, truncation occurred.
  316. * ESPANK_NOT_REMOTE = not called in remote context (i.e. from slurmstepd).
  317. */
  318. spank_err_t spank_getenv (spank_t spank, const char *var, char *buf, int len);
  319. /*
  320. * Set the environment variable "var" to "val" in the environment of
  321. * the current job or task in the spank handle. If overwrite != 0 an
  322. * existing value for var will be overwritten.
  323. *
  324. * Returns ESPANK_SUCCESS on success, o/w spank_err_t on failure:
  325. * ESPANK_ENV_EXISTS = var exists in job env and overwrite == 0.
  326. * ESPANK_BAD_ARG = spank handle invalid or var/val are NULL.
  327. * ESPANK_NOT_REMOTE = not called from slurmstepd.
  328. */
  329. spank_err_t spank_setenv (spank_t spank, const char *var, const char *val,
  330. int overwrite);
  331. /*
  332. * Unset environment variable "var" in the environment of current job or
  333. * task in the spank handle.
  334. *
  335. * Returns ESPANK_SUCCESS on success, o/w spank_err_t on failure:
  336. * ESPANK_BAD_ARG = spank handle invalid or var is NULL.
  337. * ESPANK_NOT_REMOTE = not called from slurmstepd.
  338. */
  339. spank_err_t spank_unsetenv (spank_t spank, const char *var);
  340. /*
  341. * Set an environment variable "name" to "value" in the "job control"
  342. * environment, which is an extra set of environment variables
  343. * included in the environment of the Slurm prolog and epilog
  344. * programs. Environment variables set via this function will
  345. * be prepended with SPANK_ to differentiate them from other env
  346. * vars, and to avoid security issues.
  347. *
  348. * Returns ESPANK_SUCCESS on success, o/w spank_err_t on failure:
  349. * ESPANK_ENV_EXISTS = name exists in control env and overwrite == 0.
  350. * ESPANK_NOT_LOCAL = not called in local context
  351. */
  352. spank_err_t spank_job_control_setenv (spank_t sp, const char *name,
  353. const char *value, int overwrite);
  354. /*
  355. * Place a copy of environment variable "name" from the job control
  356. * environment into a buffer buf of size len.
  357. *
  358. * Returns ESPANK_SUCCESS on success, o/w spank_err_t on failure:
  359. * ESPANK_BAD_ARG = invalid spank handle or len <= 0
  360. * ESPANK_ENV_NOEXIST = environment var does not exist in control env
  361. * ESPANK_NOSPACE = buffer too small, truncation occurred.
  362. * ESPANK_NOT_LOCAL = not called in local context
  363. */
  364. spank_err_t spank_job_control_getenv (spank_t sp, const char *name,
  365. char *buf, int len);
  366. /*
  367. * Unset environment variable "name" in the job control environment.
  368. *
  369. * Returns ESPANK_SUCCESS on success, o/w spank_err_t on failure:
  370. * ESPANK_BAD_ARG = invalid spank handle or name is NULL
  371. * ESPANK_NOT_LOCAL = not called in local context
  372. */
  373. spank_err_t spank_job_control_unsetenv (spank_t sp, const char *name);
  374. /*
  375. * Slurm logging functions which are exported to plugins.
  376. */
  377. extern void slurm_info (const char *format, ...)
  378. __attribute__ ((format (printf, 1, 2)));
  379. extern void slurm_error (const char *format, ...)
  380. __attribute__ ((format (printf, 1, 2)));
  381. extern void slurm_verbose (const char *format, ...)
  382. __attribute__ ((format (printf, 1, 2)));
  383. extern void slurm_debug (const char *format, ...)
  384. __attribute__ ((format (printf, 1, 2)));
  385. extern void slurm_debug2 (const char *format, ...)
  386. __attribute__ ((format (printf, 1, 2)));
  387. extern void slurm_debug3 (const char *format, ...)
  388. __attribute__ ((format (printf, 1, 2)));
  389. /*
  390. * Print at the same log level as error(), but without prefixing the message
  391. * with "error: ". Useful to report back to srun commands from SPANK plugins,
  392. * as info() will only go to the logs.
  393. */
  394. extern void slurm_spank_log(const char *, ...)
  395. __attribute__ ((format (printf, 1, 2)));
  396. #ifdef __cplusplus
  397. }
  398. #endif
  399. /*
  400. * All spank plugins must issue the following for the Slurm plugin
  401. * loader. It defines plugin_name (the stringified __name), plugin_type ("spank") and plugin_version (__ver); the expansion already ends with ';'.
  402. */
  403. #define SPANK_PLUGIN(__name, __ver) \
  404. const char plugin_name [] = #__name; \
  405. const char plugin_type [] = "spank"; \
  406. const unsigned int plugin_version = __ver;
  407. #endif /* !SPANK_H */