slurm-dashboard-node-exporter.json 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534
  1. {
  2. "__inputs": [],
  3. "__elements": [],
  4. "__requires": [
  5. {
  6. "type": "grafana",
  7. "id": "grafana",
  8. "name": "Grafana",
  9. "version": "8.3.2"
  10. },
  11. {
  12. "type": "panel",
  13. "id": "graph",
  14. "name": "Graph (old)",
  15. "version": ""
  16. },
  17. {
  18. "type": "datasource",
  19. "id": "prometheus",
  20. "name": "Prometheus",
  21. "version": "1.0.0"
  22. },
  23. {
  24. "type": "panel",
  25. "id": "stat",
  26. "name": "Stat",
  27. "version": ""
  28. },
  29. {
  30. "type": "panel",
  31. "id": "text",
  32. "name": "Text",
  33. "version": ""
  34. }
  35. ],
  36. "annotations": {
  37. "list": [
  38. {
  39. "builtIn": 1,
  40. "datasource": "-- Grafana --",
  41. "enable": true,
  42. "hide": true,
  43. "iconColor": "rgba(0, 211, 255, 1)",
  44. "name": "Annotations & Alerts",
  45. "target": {
  46. "limit": 100,
  47. "matchAny": false,
  48. "tags": [],
  49. "type": "dashboard"
  50. },
  51. "type": "dashboard"
  52. }
  53. ]
  54. },
  55. "description": "Dashboard to view multiple servers",
  56. "editable": true,
  57. "fiscalYearStartMonth": 0,
  58. "gnetId": 405,
  59. "graphTooltip": 0,
  60. "id": null,
  61. "iteration": 1640761048436,
  62. "links": [],
  63. "liveNow": false,
  64. "panels": [
  65. {
  66. "editable": true,
  67. "error": false,
  68. "gridPos": {
  69. "h": 3,
  70. "w": 24,
  71. "x": 0,
  72. "y": 0
  73. },
  74. "id": 11,
  75. "maxPerRow": 6,
  76. "options": {
  77. "content": "",
  78. "mode": "html"
  79. },
  80. "pluginVersion": "8.3.2",
  81. "repeat": "node",
  82. "style": {},
  83. "title": "$node",
  84. "type": "text"
  85. },
  86. {
  87. "datasource": {
  88. "uid": "hpc-prometheus"
  89. },
  90. "fieldConfig": {
  91. "defaults": {
  92. "color": {
  93. "mode": "thresholds"
  94. },
  95. "mappings": [
  96. {
  97. "options": {
  98. "match": "null",
  99. "result": {
  100. "text": "N/A"
  101. }
  102. },
  103. "type": "special"
  104. }
  105. ],
  106. "thresholds": {
  107. "mode": "absolute",
  108. "steps": [
  109. {
  110. "color": "green",
  111. "value": null
  112. },
  113. {
  114. "color": "red",
  115. "value": 80
  116. }
  117. ]
  118. },
  119. "unit": "none"
  120. },
  121. "overrides": []
  122. },
  123. "gridPos": {
  124. "h": 3,
  125. "w": 24,
  126. "x": 0,
  127. "y": 3
  128. },
  129. "id": 20,
  130. "links": [],
  131. "maxDataPoints": 100,
  132. "options": {
  133. "colorMode": "none",
  134. "graphMode": "none",
  135. "justifyMode": "auto",
  136. "orientation": "horizontal",
  137. "reduceOptions": {
  138. "calcs": [
  139. "mean"
  140. ],
  141. "fields": "",
  142. "values": false
  143. },
  144. "textMode": "auto"
  145. },
  146. "pluginVersion": "8.3.2",
  147. "repeat": "node",
  148. "targets": [
  149. {
  150. "expr": "count(node_cpu_seconds_total{instance=~\"$node\", mode=\"system\"}) or count(node_cpu{instance=~\"$node\", mode=\"system\"})",
  151. "instant": true,
  152. "interval": "",
  153. "intervalFactor": 2,
  154. "legendFormat": "",
  155. "metric": "",
  156. "refId": "A",
  157. "step": 14400,
  158. "target": ""
  159. }
  160. ],
  161. "title": "CPU Cores",
  162. "type": "stat"
  163. },
  164. {
  165. "aliasColors": {},
  166. "bars": false,
  167. "dashLength": 10,
  168. "dashes": false,
  169. "datasource": {
  170. "uid": "hpc-prometheus"
  171. },
  172. "decimals": 3,
  173. "editable": true,
  174. "error": false,
  175. "fill": 10,
  176. "fillGradient": 0,
  177. "grid": {},
  178. "gridPos": {
  179. "h": 7,
  180. "w": 24,
  181. "x": 0,
  182. "y": 6
  183. },
  184. "hiddenSeries": false,
  185. "id": 7,
  186. "legend": {
  187. "alignAsTable": false,
  188. "avg": false,
  189. "current": false,
  190. "hideEmpty": false,
  191. "max": false,
  192. "min": false,
  193. "rightSide": false,
  194. "show": true,
  195. "total": false,
  196. "values": false
  197. },
  198. "lines": true,
  199. "linewidth": 0,
  200. "links": [],
  201. "maxPerRow": 6,
  202. "nullPointMode": "connected",
  203. "options": {
  204. "alertThreshold": true
  205. },
  206. "percentage": true,
  207. "pluginVersion": "8.3.2",
  208. "pointradius": 5,
  209. "points": false,
  210. "renderer": "flot",
  211. "repeat": "node",
  212. "seriesOverrides": [],
  213. "spaceLength": 10,
  214. "stack": true,
  215. "steppedLine": false,
  216. "targets": [
  217. {
  218. "expr": "sum(irate(node_cpu_seconds_total{mode=\"system\",instance=~'$node'}[5m])) or sum(irate(node_cpu{mode=\"system\",instance=~'$node'}[5m]))",
  219. "interval": "",
  220. "intervalFactor": 2,
  221. "legendFormat": "system",
  222. "metric": "",
  223. "refId": "A",
  224. "step": 1200,
  225. "target": ""
  226. },
  227. {
  228. "expr": "sum(irate(node_cpu_seconds_total{mode=\"user\",instance=~'$node'}[5m])) or sum(irate(node_cpu{mode=\"user\",instance=~'$node'}[5m]))",
  229. "interval": "",
  230. "intervalFactor": 2,
  231. "legendFormat": "user",
  232. "refId": "B",
  233. "step": 1200
  234. },
  235. {
  236. "expr": "sum(irate(node_cpu_seconds_total{mode=\"nice\",instance=~'$node'}[5m])) or sum(irate(node_cpu{mode=\"nice\",instance=~'$node'}[5m]))",
  237. "interval": "",
  238. "intervalFactor": 2,
  239. "legendFormat": "nice",
  240. "refId": "C",
  241. "step": 1200
  242. },
  243. {
  244. "expr": "sum(irate(node_cpu_seconds_total{mode=\"iowait\",instance=~'$node'}[5m])) or sum(irate(node_cpu{mode=\"iowait\",instance=~'$node'}[5m]))",
  245. "interval": "",
  246. "intervalFactor": 2,
  247. "legendFormat": "iowait",
  248. "refId": "E",
  249. "step": 1200
  250. },
  251. {
  252. "expr": "sum(irate(node_cpu_seconds_total{mode=\"steal\",instance=~'$node'}[5m])) or sum(irate(node_cpu{mode=\"steal\",instance=~'$node'}[5m]))",
  253. "intervalFactor": 2,
  254. "legendFormat": "steal",
  255. "refId": "H",
  256. "step": 1200
  257. },
  258. {
  259. "expr": "sum(irate(node_cpu_seconds_total{mode=\"idle\",instance=~'$node'}[5m])) or sum(irate(node_cpu{mode=\"idle\",instance=~'$node'}[5m]))",
  260. "interval": "",
  261. "intervalFactor": 2,
  262. "legendFormat": "idle",
  263. "refId": "D",
  264. "step": 1200
  265. },
  266. {
  267. "expr": "sum(irate(node_cpu_seconds_total{mode=\"irq\",instance=~'$node'}[5m])) or sum(irate(node_cpu{mode=\"irq\",instance=~'$node'}[5m]))",
  268. "interval": "",
  269. "intervalFactor": 2,
  270. "legendFormat": "irq",
  271. "refId": "F",
  272. "step": 1200
  273. },
  274. {
  275. "expr": "sum(irate(node_cpu_seconds_total{mode=\"softirq\",instance=~'$node'}[5m])) or sum(irate(node_cpu{mode=\"softirq\",instance=~'$node'}[5m]))",
  276. "interval": "",
  277. "intervalFactor": 2,
  278. "legendFormat": "softirq",
  279. "refId": "G",
  280. "step": 1200
  281. },
  282. {
  283. "expr": "sum(irate(node_cpu_seconds_total{mode=\"guest\",instance=~'$node'}[5m])) or sum(irate(node_cpu{mode=\"guest\",instance=~'$node'}[5m]))",
  284. "interval": "",
  285. "intervalFactor": 2,
  286. "legendFormat": "guest",
  287. "refId": "I",
  288. "step": 1200
  289. }
  290. ],
  291. "thresholds": [
  292. {
  293. "colorMode": "custom",
  294. "fill": true,
  295. "fillColor": "rgba(216, 200, 27, 0.27)",
  296. "op": "gt",
  297. "value": 0
  298. }
  299. ],
  300. "timeRegions": [],
  301. "title": "CPU",
  302. "tooltip": {
  303. "msResolution": false,
  304. "shared": true,
  305. "sort": 0,
  306. "value_type": "individual"
  307. },
  308. "type": "graph",
  309. "xaxis": {
  310. "mode": "time",
  311. "show": true,
  312. "values": []
  313. },
  314. "yaxes": [
  315. {
  316. "format": "short",
  317. "label": "%",
  318. "logBase": 1,
  319. "max": 100,
  320. "min": 0,
  321. "show": true
  322. },
  323. {
  324. "format": "short",
  325. "logBase": 1,
  326. "show": true
  327. }
  328. ],
  329. "yaxis": {
  330. "align": false
  331. }
  332. },
  333. {
  334. "aliasColors": {
  335. "Slab": "#E5A8E2",
  336. "Swap": "#E24D42"
  337. },
  338. "bars": false,
  339. "dashLength": 10,
  340. "dashes": false,
  341. "datasource": {
  342. "uid": "hpc-prometheus"
  343. },
  344. "decimals": 2,
  345. "editable": true,
  346. "error": false,
  347. "fill": 1,
  348. "fillGradient": 0,
  349. "grid": {},
  350. "gridPos": {
  351. "h": 7,
  352. "w": 24,
  353. "x": 0,
  354. "y": 13
  355. },
  356. "hiddenSeries": false,
  357. "id": 17,
  358. "legend": {
  359. "avg": false,
  360. "current": false,
  361. "max": false,
  362. "min": false,
  363. "show": true,
  364. "total": false,
  365. "values": false
  366. },
  367. "lines": true,
  368. "linewidth": 2,
  369. "links": [],
  370. "maxPerRow": 6,
  371. "nullPointMode": "connected",
  372. "options": {
  373. "alertThreshold": true
  374. },
  375. "percentage": false,
  376. "pluginVersion": "8.3.2",
  377. "pointradius": 5,
  378. "points": false,
  379. "renderer": "flot",
  380. "repeat": "node",
  381. "seriesOverrides": [
  382. {
  383. "alias": "/Apps|Buffers|Cached|Free|Slab|SwapCached|PageTables|VmallocUsed/",
  384. "fill": 5,
  385. "stack": true
  386. },
  387. {
  388. "alias": "Swap",
  389. "fill": 5,
  390. "stack": true
  391. }
  392. ],
  393. "spaceLength": 10,
  394. "stack": false,
  395. "steppedLine": false,
  396. "targets": [
  397. {
  398. "expr": "( node_memory_MemTotal_bytes{instance=~'$node'} - node_memory_MemFree_bytes{instance=~'$node'} - node_memory_Buffers_bytes{instance=~'$node'} - node_memory_Cached_bytes{instance=~'$node'} - node_memory_SwapCached_bytes{instance=~'$node'} - node_memory_Slab_bytes{instance=~'$node'} - node_memory_PageTables_bytes{instance=~'$node'} - node_memory_VmallocUsed_bytes{instance=~'$node'} ) or ( node_memory_MemTotal{instance=~'$node'} - node_memory_MemFree{instance=~'$node'} - node_memory_Buffers{instance=~'$node'} - node_memory_Cached{instance=~'$node'} - node_memory_SwapCached{instance=~'$node'} - node_memory_Slab{instance=~'$node'} - node_memory_PageTables{instance=~'$node'} - node_memory_VmallocUsed{instance=~'$node'} )",
  399. "interval": "",
  400. "intervalFactor": 2,
  401. "legendFormat": "Apps",
  402. "metric": "",
  403. "refId": "A",
  404. "step": 1200,
  405. "target": ""
  406. },
  407. {
  408. "expr": "node_memory_Buffers_bytes{instance=~'$node'} or node_memory_Buffers{instance=~'$node'}",
  409. "interval": "",
  410. "intervalFactor": 2,
  411. "legendFormat": "Buffers",
  412. "refId": "B",
  413. "step": 1200
  414. },
  415. {
  416. "expr": "node_memory_Cached_bytes{instance=~'$node'} or node_memory_Cached{instance=~'$node'}",
  417. "interval": "",
  418. "intervalFactor": 2,
  419. "legendFormat": "Cached",
  420. "refId": "D",
  421. "step": 1200
  422. },
  423. {
  424. "expr": "node_memory_MemFree_bytes{instance=~'$node'} or node_memory_MemFree{instance=~'$node'}",
  425. "hide": false,
  426. "interval": "",
  427. "intervalFactor": 2,
  428. "legendFormat": "Free",
  429. "refId": "E",
  430. "step": 1200
  431. },
  432. {
  433. "expr": "node_memory_Slab_bytes{instance=~'$node'} or node_memory_Slab{instance=~'$node'}",
  434. "interval": "",
  435. "intervalFactor": 2,
  436. "legendFormat": "Slab",
  437. "refId": "F",
  438. "step": 1200
  439. },
  440. {
  441. "expr": "node_memory_SwapCached_bytes{instance=~'$node'} or node_memory_SwapCached{instance=~'$node'}",
  442. "interval": "",
  443. "intervalFactor": 2,
  444. "legendFormat": "SwapCached",
  445. "refId": "G",
  446. "step": 1200
  447. },
  448. {
  449. "expr": "node_memory_PageTables_bytes{instance=~'$node'} or node_memory_PageTables{instance=~'$node'}",
  450. "interval": "",
  451. "intervalFactor": 2,
  452. "legendFormat": "PageTables",
  453. "refId": "H",
  454. "step": 1200
  455. },
  456. {
  457. "expr": "node_memory_VmallocUsed_bytes{instance=~'$node'} or node_memory_VmallocUsed{instance=~'$node'}",
  458. "interval": "",
  459. "intervalFactor": 2,
  460. "legendFormat": "VmallocUsed",
  461. "metric": "",
  462. "refId": "I",
  463. "step": 1200
  464. },
  465. {
  466. "expr": "(node_memory_SwapTotal_bytes{instance=~'$node'} - node_memory_SwapFree_bytes{instance=~'$node'}) or (node_memory_SwapTotal{instance=~'$node'} - node_memory_SwapFree{instance=~'$node'})",
  467. "interval": "",
  468. "intervalFactor": 2,
  469. "legendFormat": "Swap",
  470. "metric": "",
  471. "refId": "C",
  472. "step": 1200
  473. },
  474. {
  475. "expr": "node_memory_Committed_AS_bytes{instance=~'$node'} or node_memory_Committed_AS{instance=~'$node'}",
  476. "interval": "",
  477. "intervalFactor": 2,
  478. "legendFormat": "Committed",
  479. "metric": "",
  480. "refId": "J",
  481. "step": 1200
  482. },
  483. {
  484. "expr": "node_memory_Mapped_bytes{instance=~'$node'} or node_memory_Mapped{instance=~'$node'}",
  485. "interval": "",
  486. "intervalFactor": 2,
  487. "legendFormat": "Mapped",
  488. "refId": "K",
  489. "step": 1200
  490. },
  491. {
  492. "expr": "node_memory_Active_bytes{instance=~'$node'} or node_memory_Active{instance=~'$node'}",
  493. "interval": "",
  494. "intervalFactor": 2,
  495. "legendFormat": "Active",
  496. "metric": "",
  497. "refId": "L",
  498. "step": 1200
  499. },
  500. {
  501. "expr": "node_memory_Inactive_bytes{instance=~'$node'} or node_memory_Inactive{instance=~'$node'}",
  502. "interval": "",
  503. "intervalFactor": 2,
  504. "legendFormat": "Inactive",
  505. "metric": "",
  506. "refId": "M",
  507. "step": 1200
  508. }
  509. ],
  510. "thresholds": [],
  511. "timeRegions": [],
  512. "title": "Memory",
  513. "tooltip": {
  514. "msResolution": false,
  515. "shared": true,
  516. "sort": 0,
  517. "value_type": "individual"
  518. },
  519. "type": "graph",
  520. "xaxis": {
  521. "mode": "time",
  522. "show": true,
  523. "values": []
  524. },
  525. "yaxes": [
  526. {
  527. "format": "bytes",
  528. "label": "GB",
  529. "logBase": 1,
  530. "show": true
  531. },
  532. {
  533. "format": "short",
  534. "logBase": 1,
  535. "show": true
  536. }
  537. ],
  538. "yaxis": {
  539. "align": false
  540. }
  541. },
  542. {
  543. "aliasColors": {},
  544. "bars": false,
  545. "dashLength": 10,
  546. "dashes": false,
  547. "datasource": {
  548. "uid": "hpc-prometheus"
  549. },
  550. "editable": true,
  551. "error": false,
  552. "fill": 1,
  553. "fillGradient": 0,
  554. "grid": {},
  555. "gridPos": {
  556. "h": 7,
  557. "w": 24,
  558. "x": 0,
  559. "y": 20
  560. },
  561. "hiddenSeries": false,
  562. "id": 13,
  563. "legend": {
  564. "avg": false,
  565. "current": false,
  566. "max": false,
  567. "min": false,
  568. "show": true,
  569. "total": false,
  570. "values": false
  571. },
  572. "lines": true,
  573. "linewidth": 2,
  574. "links": [],
  575. "maxPerRow": 6,
  576. "nullPointMode": "connected",
  577. "options": {
  578. "alertThreshold": true
  579. },
  580. "percentage": false,
  581. "pluginVersion": "8.3.2",
  582. "pointradius": 5,
  583. "points": false,
  584. "renderer": "flot",
  585. "repeat": "node",
  586. "seriesOverrides": [],
  587. "spaceLength": 10,
  588. "stack": false,
  589. "steppedLine": false,
  590. "targets": [
  591. {
  592. "expr": "node_load1{instance=~\"$node\"}",
  593. "interval": "",
  594. "intervalFactor": 2,
  595. "legendFormat": "load",
  596. "metric": "",
  597. "refId": "A",
  598. "step": 1200,
  599. "target": ""
  600. }
  601. ],
  602. "thresholds": [],
  603. "timeRegions": [],
  604. "title": "Load",
  605. "tooltip": {
  606. "msResolution": false,
  607. "shared": true,
  608. "sort": 0,
  609. "value_type": "cumulative"
  610. },
  611. "type": "graph",
  612. "xaxis": {
  613. "mode": "time",
  614. "show": true,
  615. "values": []
  616. },
  617. "yaxes": [
  618. {
  619. "format": "short",
  620. "logBase": 1,
  621. "show": true
  622. },
  623. {
  624. "format": "short",
  625. "logBase": 1,
  626. "show": true
  627. }
  628. ],
  629. "yaxis": {
  630. "align": false
  631. }
  632. },
  633. {
  634. "aliasColors": {},
  635. "bars": false,
  636. "datasource": {
  637. "uid": "hpc-prometheus"
  638. },
  639. "decimals": 3,
  640. "editable": true,
  641. "error": false,
  642. "fill": 1,
  643. "grid": {},
  644. "gridPos": {
  645. "h": 7,
  646. "w": 24,
  647. "x": 0,
  648. "y": 27
  649. },
  650. "id": 9,
  651. "legend": {
  652. "avg": false,
  653. "current": false,
  654. "max": false,
  655. "min": false,
  656. "show": true,
  657. "total": false,
  658. "values": false
  659. },
  660. "lines": true,
  661. "linewidth": 2,
  662. "links": [],
  663. "maxPerRow": 6,
  664. "nullPointMode": "connected",
  665. "percentage": false,
  666. "pluginVersion": "8.3.2",
  667. "pointradius": 5,
  668. "points": false,
  669. "renderer": "flot",
  670. "repeat": "node",
  671. "seriesOverrides": [],
  672. "stack": false,
  673. "steppedLine": false,
  674. "targets": [
  675. {
  676. "expr": "100.0 - 100 * (node_filesystem_avail_bytes{instance=~'$node',device !~'tmpfs',device!~'.*by-uuid.*'} / node_filesystem_size_bytes{instance=~'$node',device !~'tmpfs',device!~'.*by-uuid.*'}) or 100.0 - 100 * (node_filesystem_avail{instance=~'$node',device !~'tmpfs',device!~'.*by-uuid.*'} / node_filesystem_size{instance=~'$node',device !~'tmpfs',device!~'.*by-uuid.*'})",
  677. "interval": "",
  678. "intervalFactor": 2,
  679. "legendFormat": "{{mountpoint}}",
  680. "metric": "",
  681. "refId": "A",
  682. "step": 1200,
  683. "target": ""
  684. }
  685. ],
  686. "thresholds": [],
  687. "title": "Disk Space Used",
  688. "tooltip": {
  689. "msResolution": true,
  690. "shared": true,
  691. "value_type": "cumulative"
  692. },
  693. "type": "graph",
  694. "xaxis": {
  695. "show": true
  696. },
  697. "yaxes": [
  698. {
  699. "format": "percent",
  700. "logBase": 1,
  701. "max": 100,
  702. "min": 0,
  703. "show": true
  704. },
  705. {
  706. "format": "short",
  707. "logBase": 1,
  708. "show": true
  709. }
  710. ]
  711. },
  712. {
  713. "aliasColors": {},
  714. "bars": false,
  715. "datasource": {
  716. "uid": "hpc-prometheus"
  717. },
  718. "editable": true,
  719. "error": false,
  720. "fill": 1,
  721. "grid": {},
  722. "gridPos": {
  723. "h": 7,
  724. "w": 24,
  725. "x": 0,
  726. "y": 34
  727. },
  728. "id": 19,
  729. "legend": {
  730. "avg": false,
  731. "current": false,
  732. "max": false,
  733. "min": false,
  734. "show": true,
  735. "total": false,
  736. "values": false
  737. },
  738. "lines": true,
  739. "linewidth": 2,
  740. "links": [],
  741. "maxPerRow": 6,
  742. "nullPointMode": "connected",
  743. "percentage": false,
  744. "pluginVersion": "8.3.2",
  745. "pointradius": 5,
  746. "points": false,
  747. "renderer": "flot",
  748. "repeat": "node",
  749. "seriesOverrides": [],
  750. "stack": false,
  751. "steppedLine": false,
  752. "targets": [
  753. {
  754. "expr": "irate(node_disk_io_time_seconds_total{instance=~\"$node\"}[5m])*100 or irate(node_disk_io_time_ms{instance=~\"$node\"}[5m])/10",
  755. "interval": "",
  756. "intervalFactor": 2,
  757. "legendFormat": "{{device}}",
  758. "metric": "",
  759. "refId": "A",
  760. "step": 1200,
  761. "target": ""
  762. }
  763. ],
  764. "thresholds": [],
  765. "title": "Disk Utilization per Device",
  766. "tooltip": {
  767. "msResolution": false,
  768. "shared": false,
  769. "value_type": "cumulative"
  770. },
  771. "type": "graph",
  772. "xaxis": {
  773. "show": true
  774. },
  775. "yaxes": [
  776. {
  777. "format": "percent",
  778. "logBase": 1,
  779. "max": 100,
  780. "show": true
  781. },
  782. {
  783. "format": "short",
  784. "logBase": 1,
  785. "show": true
  786. }
  787. ]
  788. },
  789. {
  790. "aliasColors": {},
  791. "bars": false,
  792. "datasource": {
  793. "uid": "hpc-prometheus"
  794. },
  795. "editable": true,
  796. "error": false,
  797. "fill": 1,
  798. "grid": {},
  799. "gridPos": {
  800. "h": 7,
  801. "w": 24,
  802. "x": 0,
  803. "y": 41
  804. },
  805. "id": 14,
  806. "legend": {
  807. "avg": false,
  808. "current": false,
  809. "max": false,
  810. "min": false,
  811. "show": true,
  812. "total": false,
  813. "values": false
  814. },
  815. "lines": true,
  816. "linewidth": 2,
  817. "links": [],
  818. "maxPerRow": 6,
  819. "nullPointMode": "connected",
  820. "percentage": false,
  821. "pluginVersion": "8.3.2",
  822. "pointradius": 5,
  823. "points": false,
  824. "renderer": "flot",
  825. "repeat": "node",
  826. "seriesOverrides": [
  827. {
  828. "alias": "/.*_read$/",
  829. "transform": "negative-Y"
  830. }
  831. ],
  832. "stack": false,
  833. "steppedLine": false,
  834. "targets": [
  835. {
  836. "expr": "irate(node_disk_reads_completed_total{instance=~'$node'}[5m]) or irate(node_disk_reads_completed{instance=~'$node'}[5m])",
  837. "interval": "",
  838. "intervalFactor": 4,
  839. "legendFormat": "{{device}}_read",
  840. "metric": "",
  841. "refId": "A",
  842. "step": 2400,
  843. "target": ""
  844. },
  845. {
  846. "expr": "irate(node_disk_writes_completed_total{instance=~'$node'}[5m]) or irate(node_disk_writes_completed{instance=~'$node'}[5m])",
  847. "intervalFactor": 2,
  848. "legendFormat": "{{device}}_write",
  849. "metric": "",
  850. "refId": "B",
  851. "step": 1200
  852. }
  853. ],
  854. "thresholds": [],
  855. "title": "Disk IOs per Device",
  856. "tooltip": {
  857. "msResolution": false,
  858. "shared": false,
  859. "value_type": "cumulative"
  860. },
  861. "type": "graph",
  862. "xaxis": {
  863. "show": true
  864. },
  865. "yaxes": [
  866. {
  867. "format": "short",
  868. "label": "IO/second read (-) / write (+)",
  869. "logBase": 1,
  870. "show": true
  871. },
  872. {
  873. "format": "short",
  874. "logBase": 1,
  875. "show": true
  876. }
  877. ]
  878. },
  879. {
  880. "aliasColors": {},
  881. "bars": false,
  882. "datasource": {
  883. "uid": "hpc-prometheus"
  884. },
  885. "editable": true,
  886. "error": false,
  887. "fill": 1,
  888. "grid": {},
  889. "gridPos": {
  890. "h": 7,
  891. "w": 24,
  892. "x": 0,
  893. "y": 48
  894. },
  895. "id": 18,
  896. "legend": {
  897. "avg": false,
  898. "current": false,
  899. "max": false,
  900. "min": false,
  901. "show": true,
  902. "total": false,
  903. "values": false
  904. },
  905. "lines": true,
  906. "linewidth": 2,
  907. "links": [],
  908. "maxPerRow": 6,
  909. "nullPointMode": "connected",
  910. "percentage": false,
  911. "pluginVersion": "8.3.2",
  912. "pointradius": 5,
  913. "points": false,
  914. "renderer": "flot",
  915. "repeat": "node",
  916. "seriesOverrides": [
  917. {
  918. "alias": "/.*_read$/",
  919. "transform": "negative-Y"
  920. }
  921. ],
  922. "stack": false,
  923. "steppedLine": false,
  924. "targets": [
  925. {
  926. "expr": "irate(node_disk_read_bytes_total{instance=~'$node'}[5m]) or irate(node_disk_sectors_read{instance=~'$node'}[5m]) * 512",
  927. "interval": "",
  928. "intervalFactor": 4,
  929. "legendFormat": "{{device}}_read",
  930. "refId": "B",
  931. "step": 2400
  932. },
  933. {
  934. "expr": "irate(node_disk_written_bytes_total{instance=~'$node'}[5m]) or irate(node_disk_sectors_written{instance=~'$node'}[5m]) * 512",
  935. "interval": "",
  936. "intervalFactor": 4,
  937. "legendFormat": "{{device}}_write",
  938. "metric": "",
  939. "refId": "A",
  940. "step": 2400,
  941. "target": ""
  942. }
  943. ],
  944. "thresholds": [],
  945. "title": "Disk Throughput per Device",
  946. "tooltip": {
  947. "msResolution": false,
  948. "shared": false,
  949. "value_type": "cumulative"
  950. },
  951. "type": "graph",
  952. "xaxis": {
  953. "show": true
  954. },
  955. "yaxes": [
  956. {
  957. "format": "bytes",
  958. "label": "Bytes/second read (-) / write (+)",
  959. "logBase": 1,
  960. "show": true
  961. },
  962. {
  963. "format": "short",
  964. "logBase": 1,
  965. "show": true
  966. }
  967. ]
  968. },
  969. {
  970. "aliasColors": {},
  971. "bars": false,
  972. "datasource": {
  973. "uid": "hpc-prometheus"
  974. },
  975. "editable": true,
  976. "error": false,
  977. "fill": 1,
  978. "grid": {},
  979. "gridPos": {
  980. "h": 7,
  981. "w": 24,
  982. "x": 0,
  983. "y": 55
  984. },
  985. "id": 25,
  986. "legend": {
  987. "avg": false,
  988. "current": false,
  989. "max": false,
  990. "min": false,
  991. "show": true,
  992. "total": false,
  993. "values": false
  994. },
  995. "lines": true,
  996. "linewidth": 2,
  997. "links": [],
  998. "maxPerRow": 6,
  999. "nullPointMode": "connected",
  1000. "percentage": false,
  1001. "pluginVersion": "8.3.2",
  1002. "pointradius": 5,
  1003. "points": false,
  1004. "renderer": "flot",
  1005. "repeat": "node",
  1006. "seriesOverrides": [],
  1007. "stack": false,
  1008. "steppedLine": false,
  1009. "targets": [
  1010. {
  1011. "expr": "node_disk_io_now{instance=~\"$node\"}",
  1012. "interval": "",
  1013. "intervalFactor": 1,
  1014. "legendFormat": "{{device}}",
  1015. "metric": "",
  1016. "refId": "A",
  1017. "step": 1200,
  1018. "target": ""
  1019. }
  1020. ],
  1021. "thresholds": [],
  1022. "title": "Disk Queue Length",
  1023. "tooltip": {
  1024. "msResolution": false,
  1025. "shared": true,
  1026. "value_type": "cumulative"
  1027. },
  1028. "type": "graph",
  1029. "xaxis": {
  1030. "show": true
  1031. },
  1032. "yaxes": [
  1033. {
  1034. "format": "short",
  1035. "logBase": 1,
  1036. "min": 0,
  1037. "show": true
  1038. },
  1039. {
  1040. "format": "short",
  1041. "logBase": 1,
  1042. "show": true
  1043. }
  1044. ]
  1045. },
  1046. {
  1047. "aliasColors": {},
  1048. "bars": false,
  1049. "datasource": {
  1050. "uid": "hpc-prometheus"
  1051. },
  1052. "editable": true,
  1053. "error": false,
  1054. "fill": 1,
  1055. "grid": {},
  1056. "gridPos": {
  1057. "h": 7,
  1058. "w": 24,
  1059. "x": 0,
  1060. "y": 62
  1061. },
  1062. "id": 22,
  1063. "legend": {
  1064. "avg": false,
  1065. "current": false,
  1066. "max": false,
  1067. "min": false,
  1068. "show": true,
  1069. "total": false,
  1070. "values": false
  1071. },
  1072. "lines": true,
  1073. "linewidth": 2,
  1074. "links": [],
  1075. "maxPerRow": 6,
  1076. "nullPointMode": "connected",
  1077. "percentage": false,
  1078. "pluginVersion": "8.3.2",
  1079. "pointradius": 5,
  1080. "points": false,
  1081. "renderer": "flot",
  1082. "repeat": "node",
  1083. "seriesOverrides": [],
  1084. "stack": false,
  1085. "steppedLine": false,
  1086. "targets": [
  1087. {
  1088. "expr": "irate(node_context_switches_total{instance=~\"$node\"}[5m]) or irate(node_context_switches{instance=~\"$node\"}[5m])",
  1089. "interval": "",
  1090. "intervalFactor": 2,
  1091. "legendFormat": "context switches",
  1092. "metric": "",
  1093. "refId": "A",
  1094. "step": 1200,
  1095. "target": ""
  1096. }
  1097. ],
  1098. "thresholds": [],
  1099. "title": "Context Switches",
  1100. "tooltip": {
  1101. "msResolution": false,
  1102. "shared": true,
  1103. "value_type": "cumulative"
  1104. },
  1105. "type": "graph",
  1106. "xaxis": {
  1107. "show": true
  1108. },
  1109. "yaxes": [
  1110. {
  1111. "format": "short",
  1112. "logBase": 1,
  1113. "show": true
  1114. },
  1115. {
  1116. "format": "short",
  1117. "logBase": 1,
  1118. "show": true
  1119. }
  1120. ]
  1121. },
  1122. {
  1123. "aliasColors": {},
  1124. "bars": false,
  1125. "datasource": {
  1126. "uid": "hpc-prometheus"
  1127. },
  1128. "editable": true,
  1129. "error": false,
  1130. "fill": 1,
  1131. "grid": {},
  1132. "gridPos": {
  1133. "h": 7,
  1134. "w": 24,
  1135. "x": 0,
  1136. "y": 69
  1137. },
  1138. "id": 12,
  1139. "legend": {
  1140. "avg": false,
  1141. "current": false,
  1142. "max": false,
  1143. "min": false,
  1144. "show": true,
  1145. "total": false,
  1146. "values": false
  1147. },
  1148. "lines": true,
  1149. "linewidth": 2,
  1150. "links": [],
  1151. "maxPerRow": 6,
  1152. "nullPointMode": "connected",
  1153. "percentage": false,
  1154. "pluginVersion": "8.3.2",
  1155. "pointradius": 5,
  1156. "points": false,
  1157. "renderer": "flot",
  1158. "repeat": "node",
  1159. "seriesOverrides": [
  1160. {
  1161. "alias": "/.*_in/",
  1162. "transform": "negative-Y"
  1163. }
  1164. ],
  1165. "stack": false,
  1166. "steppedLine": false,
  1167. "targets": [
  1168. {
  1169. "expr": "irate(node_network_receive_bytes_total{instance=~'$node'}[5m])*8 or irate(node_network_receive_bytes{instance=~'$node'}[5m])*8",
  1170. "interval": "",
  1171. "intervalFactor": 2,
  1172. "legendFormat": "{{device}}_in",
  1173. "metric": "",
  1174. "refId": "A",
  1175. "step": 1200,
  1176. "target": ""
  1177. },
  1178. {
  1179. "expr": "irate(node_network_transmit_bytes_total{instance=~'$node'}[5m])*8 or irate(node_network_transmit_bytes{instance=~'$node'}[5m])*8",
  1180. "interval": "",
  1181. "intervalFactor": 2,
  1182. "legendFormat": "{{device}}_out",
  1183. "refId": "B",
  1184. "step": 1200
  1185. }
  1186. ],
  1187. "thresholds": [],
  1188. "title": "Network Traffic",
  1189. "tooltip": {
  1190. "msResolution": false,
  1191. "shared": true,
  1192. "value_type": "cumulative"
  1193. },
  1194. "type": "graph",
  1195. "xaxis": {
  1196. "show": true
  1197. },
  1198. "yaxes": [
  1199. {
  1200. "format": "bps",
  1201. "label": "bits/sec in (-) / bits/sec out (+)",
  1202. "logBase": 1,
  1203. "show": true
  1204. },
  1205. {
  1206. "format": "short",
  1207. "logBase": 1,
  1208. "show": true
  1209. }
  1210. ]
  1211. },
  1212. {
  1213. "aliasColors": {},
  1214. "bars": false,
  1215. "datasource": {
  1216. "uid": "hpc-prometheus"
  1217. },
  1218. "editable": true,
  1219. "error": false,
  1220. "fill": 1,
  1221. "grid": {},
  1222. "gridPos": {
  1223. "h": 7,
  1224. "w": 24,
  1225. "x": 0,
  1226. "y": 76
  1227. },
  1228. "id": 21,
  1229. "legend": {
  1230. "avg": false,
  1231. "current": false,
  1232. "max": false,
  1233. "min": false,
  1234. "show": true,
  1235. "total": false,
  1236. "values": false
  1237. },
  1238. "lines": true,
  1239. "linewidth": 2,
  1240. "links": [],
  1241. "maxPerRow": 6,
  1242. "nullPointMode": "connected",
  1243. "percentage": false,
  1244. "pluginVersion": "8.3.2",
  1245. "pointradius": 5,
  1246. "points": false,
  1247. "renderer": "flot",
  1248. "repeat": "node",
  1249. "seriesOverrides": [],
  1250. "stack": false,
  1251. "steppedLine": false,
  1252. "targets": [
  1253. {
  1254. "expr": "node_netstat_Tcp_CurrEstab{instance=~'$node'}",
  1255. "intervalFactor": 2,
  1256. "legendFormat": "established",
  1257. "refId": "A",
  1258. "step": 1200,
  1259. "target": ""
  1260. }
  1261. ],
  1262. "thresholds": [],
  1263. "title": "Netstat: TCP Established Connections",
  1264. "tooltip": {
  1265. "msResolution": false,
  1266. "shared": true,
  1267. "value_type": "cumulative"
  1268. },
  1269. "type": "graph",
  1270. "xaxis": {
  1271. "show": true
  1272. },
  1273. "yaxes": [
  1274. {
  1275. "format": "short",
  1276. "logBase": 1,
  1277. "show": true
  1278. },
  1279. {
  1280. "format": "short",
  1281. "logBase": 1,
  1282. "show": true
  1283. }
  1284. ]
  1285. },
  1286. {
  1287. "aliasColors": {},
  1288. "bars": false,
  1289. "datasource": {
  1290. "uid": "hpc-prometheus"
  1291. },
  1292. "editable": true,
  1293. "error": false,
  1294. "fill": 1,
  1295. "grid": {},
  1296. "gridPos": {
  1297. "h": 7,
  1298. "w": 24,
  1299. "x": 0,
  1300. "y": 83
  1301. },
  1302. "id": 23,
  1303. "legend": {
  1304. "avg": false,
  1305. "current": false,
  1306. "max": false,
  1307. "min": false,
  1308. "show": true,
  1309. "total": false,
  1310. "values": false
  1311. },
  1312. "lines": true,
  1313. "linewidth": 2,
  1314. "links": [],
  1315. "maxPerRow": 6,
  1316. "nullPointMode": "connected",
  1317. "percentage": false,
  1318. "pluginVersion": "8.3.2",
  1319. "pointradius": 5,
  1320. "points": false,
  1321. "renderer": "flot",
  1322. "repeat": "node",
  1323. "seriesOverrides": [
  1324. {
  1325. "alias": "/.*Out.*/",
  1326. "transform": "negative-Y"
  1327. },
  1328. {
  1329. "alias": "Udp_NoPorts",
  1330. "yaxis": 2
  1331. }
  1332. ],
  1333. "stack": false,
  1334. "steppedLine": false,
  1335. "targets": [
  1336. {
  1337. "expr": "irate(node_netstat_Udp_InDatagrams{instance=~\"$node\"}[5m])",
  1338. "intervalFactor": 2,
  1339. "legendFormat": "Udp_InDatagrams",
  1340. "refId": "A",
  1341. "step": 1200,
  1342. "target": ""
  1343. },
  1344. {
  1345. "expr": "irate(node_netstat_Udp_InErrors{instance=~\"$node\"}[5m])",
  1346. "intervalFactor": 2,
  1347. "legendFormat": "Udp_InErrors",
  1348. "refId": "B",
  1349. "step": 1200
  1350. },
  1351. {
  1352. "expr": "irate(node_netstat_Udp_OutDatagrams{instance=~\"$node\"}[5m])",
  1353. "interval": "",
  1354. "intervalFactor": 2,
  1355. "legendFormat": "Udp_OutDatagrams",
  1356. "refId": "C",
  1357. "step": 1200
  1358. },
  1359. {
  1360. "expr": "irate(node_netstat_Udp_NoPorts{instance=~\"$node\"}[5m])",
  1361. "intervalFactor": 2,
  1362. "legendFormat": "Udp_NoPorts",
  1363. "refId": "D",
  1364. "step": 1200
  1365. }
  1366. ],
  1367. "thresholds": [],
  1368. "title": "UDP Stats",
  1369. "tooltip": {
  1370. "msResolution": false,
  1371. "shared": true,
  1372. "value_type": "cumulative"
  1373. },
  1374. "type": "graph",
  1375. "xaxis": {
  1376. "show": true
  1377. },
  1378. "yaxes": [
  1379. {
  1380. "format": "short",
  1381. "logBase": 1,
  1382. "show": true
  1383. },
  1384. {
  1385. "format": "short",
  1386. "logBase": 1,
  1387. "show": true
  1388. }
  1389. ]
  1390. },
  1391. {
  1392. "aliasColors": {},
  1393. "bars": false,
  1394. "datasource": {
  1395. "uid": "hpc-prometheus"
  1396. },
  1397. "editable": true,
  1398. "error": false,
  1399. "fill": 1,
  1400. "grid": {},
  1401. "gridPos": {
  1402. "h": 7,
  1403. "w": 24,
  1404. "x": 0,
  1405. "y": 90
  1406. },
  1407. "id": 24,
  1408. "legend": {
  1409. "avg": false,
  1410. "current": false,
  1411. "max": false,
  1412. "min": false,
  1413. "show": true,
  1414. "total": false,
  1415. "values": false
  1416. },
  1417. "lines": true,
  1418. "linewidth": 2,
  1419. "links": [],
  1420. "maxPerRow": 6,
  1421. "nullPointMode": "connected",
  1422. "percentage": false,
  1423. "pluginVersion": "8.3.2",
  1424. "pointradius": 5,
  1425. "points": false,
  1426. "renderer": "flot",
  1427. "repeat": "node",
  1428. "seriesOverrides": [],
  1429. "stack": false,
  1430. "steppedLine": false,
  1431. "targets": [
  1432. {
  1433. "expr": "node_nf_conntrack_entries_limit{instance=~\"$node\"} - node_nf_conntrack_entries{instance=~\"$node\"}",
  1434. "intervalFactor": 2,
  1435. "legendFormat": "free",
  1436. "refId": "A",
  1437. "step": 1200,
  1438. "target": ""
  1439. }
  1440. ],
  1441. "thresholds": [],
  1442. "title": "Conntrack",
  1443. "tooltip": {
  1444. "msResolution": false,
  1445. "shared": true,
  1446. "value_type": "cumulative"
  1447. },
  1448. "type": "graph",
  1449. "xaxis": {
  1450. "show": true
  1451. },
  1452. "yaxes": [
  1453. {
  1454. "format": "short",
  1455. "logBase": 1,
  1456. "show": true
  1457. },
  1458. {
  1459. "format": "short",
  1460. "logBase": 1,
  1461. "show": true
  1462. }
  1463. ]
  1464. }
  1465. ],
  1466. "schemaVersion": 33,
  1467. "style": "dark",
  1468. "tags": [
  1469. "prometheus"
  1470. ],
  1471. "templating": {
  1472. "list": [
  1473. {
  1474. "allFormat": "glob",
  1475. "current": {},
  1476. "datasource": {
  1477. "uid": "hpc-prometheus"
  1478. },
  1479. "definition": "",
  1480. "hide": 0,
  1481. "includeAll": false,
  1482. "label": "",
  1483. "multi": true,
  1484. "multiFormat": "regex values",
  1485. "name": "node",
  1486. "options": [],
  1487. "query": {
  1488. "query": "label_values(node_exporter_build_info, instance)",
  1489. "refId": "hpc-prometheus-node-Variable-Query"
  1490. },
  1491. "refresh": 1,
  1492. "regex": "",
  1493. "skipUrlSync": false,
  1494. "sort": 1,
  1495. "type": "query"
  1496. }
  1497. ]
  1498. },
  1499. "time": {
  1500. "from": "now-7d",
  1501. "to": "now"
  1502. },
  1503. "timepicker": {
  1504. "now": true,
  1505. "refresh_intervals": [
  1506. "5s",
  1507. "10s",
  1508. "30s",
  1509. "1m",
  1510. "5m",
  1511. "15m",
  1512. "30m",
  1513. "1h",
  1514. "2h",
  1515. "1d"
  1516. ],
  1517. "time_options": [
  1518. "5m",
  1519. "15m",
  1520. "1h",
  1521. "6h",
  1522. "12h",
  1523. "24h",
  1524. "2d",
  1525. "7d",
  1526. "30d"
  1527. ]
  1528. },
  1529. "timezone": "browser",
  1530. "title": "SLURM - Node Exporter Server Metrics",
  1531. "uid": "67UjInAnk",
  1532. "version": 1,
  1533. "weekStart": ""
  1534. }