{ "__inputs": [], "__elements": [], "__requires": [ { "type": "panel", "id": "gauge", "name": "Gauge", "version": "" }, { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "8.3.2" }, { "type": "panel", "id": "graph", "name": "Graph (old)", "version": "" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" }, { "type": "panel", "id": "stat", "name": "Stat", "version": "" } ], "annotations": { "list": [ { "$$hashKey": "object:1345", "builtIn": 1, "datasource": "-- Grafana --", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "target": { "limit": 100, "matchAny": false, "tags": [], "type": "dashboard" }, "type": "dashboard" } ] }, "description": "slurm-exporter", "editable": true, "fiscalYearStartMonth": 0, "gnetId": 4323, "graphTooltip": 0, "id": null, "links": [], "liveNow": false, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "uid": "hpc-prometheus" }, "fieldConfig": { "defaults": { "links": [] }, "overrides": [] }, "fill": 0, "fillGradient": 0, "gridPos": { "h": 10, "w": 24, "x": 0, "y": 0 }, "hiddenSeries": false, "id": 58, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "slurm_account_fairshare", "interval": "", "legendFormat": "{{account}}", "refId": "A" } ], "thresholds": [], "timeRegions": [], "title": "Fair Share per Account", "tooltip": { "shared": true, "sort": 1, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:567", "format": "short", "logBase": 1, "max": "1", "min": "0", "show": true }, { "$$hashKey": "object:568", "format": "short", "logBase": 1, "show": false } ], "yaxis": { "align": false } }, { "collapsed": false, "datasource": { "uid": "hpc-prometheus" }, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 10 }, "id": 12, "panels": [], "title": "Cluster Nodes", "type": "row" }, { "aliasColors": { "Total Nodes": "purple" }, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "uid": "hpc-prometheus" }, "fieldConfig": { "defaults": { "links": [] }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 11, "w": 12, "x": 0, "y": 11 }, "hiddenSeries": false, "id": 1, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "slurm_nodes_alloc + slurm_nodes_comp", "format": "time_series", "intervalFactor": 2, "legendFormat": "Allocated Nodes (including compl)", "refId": "A" }, { "expr": "slurm_nodes_mix", "intervalFactor": 2, "legendFormat": "Mixed Nodes", "refId": "B" }, { "expr": "slurm_nodes_idle", "format": "time_series", "intervalFactor": 2, "legendFormat": "Idle Nodes", "refId": "C" }, { "expr": "slurm_nodes_alloc + slurm_nodes_down + slurm_nodes_drain + slurm_nodes_idle + slurm_nodes_mix + slurm_nodes_comp + slurm_nodes_maint + slurm_nodes_resv", "format": "time_series", "instant": false, "intervalFactor": 2, "legendFormat": "Total Nodes", "refId": "D" } ], "thresholds": [], "timeRegions": [], "title": "Nodes", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, "yaxes": [ { "format": "short", "logBase": 1, "show": true }, { "format": "short", "logBase": 1, "show": true } ], "yaxis": { "align": false } }, { "aliasColors": { "Down Nodes": "#e24d42", "Nodes in *fail* state": "#6d1f62" }, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "uid": "hpc-prometheus" }, "fieldConfig": { "defaults": { "links": [] }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 11, "w": 12, "x": 12, "y": 11 }, "hiddenSeries": false, "id": 5, "legend": { "alignAsTable": true, "avg": true, "current": true, "hideEmpty": false, "hideZero": false, "max": true, "min": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "slurm_nodes_down", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "Down Nodes", "refId": "A" }, { "expr": "slurm_nodes_drain", "format": "time_series", "intervalFactor": 2, "legendFormat": "Draining Nodes", "refId": "B" }, { "expr": "slurm_nodes_err != 0", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "Nodes in *error* state", "refId": "C" }, { "expr": "slurm_nodes_fail != 0", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "Nodes in *fail* state", "refId": "D" } ], "thresholds": [], "timeRegions": [], "title": "Fail/Down/Drain/Err Nodes", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:464", "format": "short", "logBase": 1, "min": "0", "show": true }, { "$$hashKey": "object:465", "format": "short", "logBase": 1, "show": true } ], "yaxis": { "align": false } }, { "collapsed": false, "datasource": { "uid": "hpc-prometheus" }, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }, "id": 13, "panels": [], "title": "SLURM Jobs", "type": "row" }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "uid": "hpc-prometheus" }, "fieldConfig": { "defaults": { "links": [] }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 11, "w": 12, "x": 0, "y": 23 }, "hiddenSeries": false, "id": 2, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": true, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "slurm_queue_completing != 0", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "Completing Jobs", "refId": "A" }, { "expr": "slurm_queue_running", "format": "time_series", "intervalFactor": 2, "legendFormat": "Running Jobs", "refId": "B" }, { "expr": "slurm_queue_pending", "format": "time_series", "intervalFactor": 2, "legendFormat": "Pending Jobs", "refId": "C" }, { "expr": "slurm_queue_completed != 0", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "Completed Jobs", "refId": "D" } ], "thresholds": [], "timeRegions": [], "title": "RUNNING/COMPL/PEND Jobs", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, "yaxes": [ { "format": "short", "logBase": 1, "min": "0", "show": true }, { "format": "short", "logBase": 1, "show": true } ], "yaxis": { "align": false } }, { "aliasColors": { "Timed out Jobs": "#890f02" }, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "uid": "hpc-prometheus" }, "fieldConfig": { "defaults": { "links": [] }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 11, "w": 12, "x": 12, "y": 23 }, "hiddenSeries": false, "id": 6, "legend": { "alignAsTable": true, "avg": true, "current": true, "hideZero": false, "max": true, "min": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "slurm_queue_timeout", "format": "time_series", "intervalFactor": 2, "legendFormat": "Timed out Jobs", "refId": "A" }, { "expr": "slurm_queue_failed", "format": "time_series", "instant": false, "intervalFactor": 2, "legendFormat": "Failed Jobs", "refId": "B" }, { "expr": "slurm_queue_node_fail", "format": "time_series", "intervalFactor": 2, "legendFormat": "Failed jobs (due to NodeFail)", "refId": "C" }, { "expr": "slurm_queue_suspended", "format": "time_series", "intervalFactor": 2, "legendFormat": "Suspended Jobs", "refId": "D" }, { "expr": "slurm_queue_cancelled", "format": "time_series", "intervalFactor": 2, "legendFormat": "Cancelled Jobs", "refId": "E" }, { "expr": "slurm_queue_preempted", "format": "time_series", "intervalFactor": 2, "legendFormat": "Preempted Jobs", "refId": "F" } ], "thresholds": [], "timeRegions": [], "title": "FAIL/SUSP/CANC/PREEMPT/TIMEDOUT Jobs", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:1455", "format": "short", "logBase": 1, "min": "0", "show": true }, { "$$hashKey": "object:1456", "format": "short", "logBase": 1, "show": true } ], "yaxis": { "align": false } }, { "collapsed": false, "datasource": { "uid": "hpc-prometheus" }, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 34 }, "id": 36, "panels": [], "title": "Users and Accounts", "type": "row" }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "uid": "hpc-prometheus" }, "fieldConfig": { "defaults": { "links": [] }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 24, "x": 0, "y": 35 }, "hiddenSeries": false, "id": 56, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null as zero", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "slurm_partition_jobs_pending > 0", "interval": "", "legendFormat": "{{partition}}", "refId": "A" } ], "thresholds": [], "timeRegions": [], "title": "Pending Jobs per Partition", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:1530", "format": "short", "logBase": 1, "min": "0", "show": true }, { "$$hashKey": "object:1531", "format": "short", "logBase": 1, "show": false } ], "yaxis": { "align": false } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "uid": "hpc-prometheus" }, "fieldConfig": { "defaults": { "links": [] }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 43 }, "hiddenSeries": false, "id": 46, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null as zero", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "slurm_account_jobs_running{account!~'none'} != 0", "format": "time_series", "instant": false, "interval": "", "legendFormat": "{{account}}", "refId": "A" } ], "thresholds": [], "timeRegions": [], "title": "Running Jobs per Account", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:353", "format": "short", "logBase": 1, "min": "0", "show": true }, { "$$hashKey": "object:354", "format": "short", "logBase": 1, "show": false } ], "yaxis": { "align": false } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "uid": "hpc-prometheus" }, "fieldConfig": { "defaults": { "links": [] }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 43 }, "hiddenSeries": false, "id": 42, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "slurm_account_jobs_pending{account!~'none'} > 0", "instant": false, "interval": "", "legendFormat": "{{account}}", "refId": "A" } ], "thresholds": [], "timeRegions": [], "title": "Pending Jobs per Account", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:651", "format": "short", "logBase": 1, "min": "0", "show": true }, { "$$hashKey": "object:652", "format": "short", "logBase": 1, "show": false } ], "yaxis": { "align": false } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "uid": "hpc-prometheus" }, "fieldConfig": { "defaults": { "links": [] }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 50 }, "hiddenSeries": false, "id": 40, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "slurm_user_jobs_running > 10", "instant": false, "interval": "", "legendFormat": "{{user}}", "refId": "A" } ], "thresholds": [], "timeRegions": [], "title": "Running Jobs per Users", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:867", "format": "short", "logBase": 1, "min": "0", "show": true }, { "$$hashKey": "object:868", "format": "short", "logBase": 1, "show": false } ], "yaxis": { "align": false } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "uid": "hpc-prometheus" }, "fieldConfig": { "defaults": { "links": [] }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 50 }, "hiddenSeries": false, "id": 44, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null as zero", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "slurm_user_jobs_pending != 0", "instant": false, "interval": "", "legendFormat": "{{user}}", "refId": "A" } ], "thresholds": [], "timeRegions": [], "title": "Pending Jobs per Users", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:1014", "format": "short", "logBase": 1, "min": "0", "show": true }, { "$$hashKey": "object:1015", "format": "short", "logBase": 1, "show": false } ], "yaxis": { "align": false } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "uid": "hpc-prometheus" }, "fieldConfig": { "defaults": { "links": [] }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 57 }, "hiddenSeries": false, "id": 54, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null as zero", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "slurm_account_cpus_running{account!~'none'}", "instant": false, "interval": "", "legendFormat": "{{account}}", "refId": "A" } ], "thresholds": [], "timeRegions": [], "title": "Utilized CPUs per Account", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:1161", "format": "short", "logBase": 1, "min": "0", "show": true }, { "$$hashKey": "object:1162", "format": "short", "logBase": 1, "show": false } ], "yaxis": { "align": false } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "uid": "hpc-prometheus" }, "fieldConfig": { "defaults": { "links": [] }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 57 }, "hiddenSeries": false, "id": 52, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null as zero", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "slurm_user_cpus_running > 0", "instant": false, "interval": "", "legendFormat": "{{user}}", "refId": "A" } ], "thresholds": [], "timeRegions": [], "title": "Utilized CPUs per user", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:1382", "format": "short", "logBase": 1, "min": "0", "show": true }, { "$$hashKey": "object:1383", "format": "short", "logBase": 1, "show": false } ], "yaxis": { "align": false } }, { "collapsed": false, "datasource": { "uid": "hpc-prometheus" }, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 64 }, "id": 14, "panels": [], "title": "CPU cores allocation", "type": "row" }, { "aliasColors": { "slurm_alloc_cpu_cores{cluster=\"kronos\",job=\"kronos_cores\"}": "#ea6460", "slurm_cpu_cores_total{cluster=\"kronos\",job=\"kronos_cores\"}": "#052b51", "slurm_idle_cpu_cores{cluster=\"kronos\",job=\"kronos_cores\"}": "#f2c96d" }, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "uid": "hpc-prometheus" }, "fieldConfig": { "defaults": { "links": [] }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 9, "w": 24, "x": 0, "y": 65 }, "hiddenSeries": false, "id": 10, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "slurm_cpus_total", "format": "time_series", "intervalFactor": 2, "legendFormat": "Total number of CPU cores", "refId": "B" }, { "expr": "slurm_cpus_alloc", "format": "time_series", "intervalFactor": 2, "legendFormat": "Allocated CPU cores", "refId": "A" }, { "expr": "slurm_cpus_idle", "format": "time_series", "hide": true, "interval": "", "intervalFactor": 2, "legendFormat": "Idle CPU cores", "refId": "C" } ], "thresholds": [], "timeRegions": [], "title": "CPU Allocation", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:403", "format": "short", "logBase": 1, "min": "0", "show": true }, { "$$hashKey": "object:404", "format": "short", "logBase": 1, "show": true } ], "yaxis": { "align": false } }, { "aliasColors": { "debug": "super-light-purple" }, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "uid": "hpc-prometheus" }, "fieldConfig": { "defaults": { "links": [] }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 74 }, "hiddenSeries": false, "id": 48, "legend": { "alignAsTable": true, "avg": true, "current": true, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null as zero", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "slurm_partition_cpus_allocated != 0", "interval": "", "legendFormat": "{{partition}}", "refId": "A" } ], "thresholds": [], "timeRegions": [], "title": "CPUs Allocated per Partition", "tooltip": { "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:117", "format": "short", "logBase": 1, "show": true }, { "$$hashKey": "object:118", "format": "short", "logBase": 1, "show": false } ], "yaxis": { "align": false } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "uid": "hpc-prometheus" }, "fieldConfig": { "defaults": { "links": [] }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 74 }, "hiddenSeries": false, "id": 50, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null as zero", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "slurm_partition_cpus_idle", "interval": "", "legendFormat": "{{partition}}", "refId": "A" } ], "thresholds": [], "timeRegions": [], "title": "CPUs Idle per Partition", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:213", "format": "short", "logBase": 1, "show": true }, { "$$hashKey": "object:214", "format": "short", "logBase": 1, "show": false } ], "yaxis": { "align": false } }, { "collapsed": false, "datasource": { "uid": "hpc-prometheus" }, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 82 }, "id": 15, "panels": [], "title": "SLURM Scheduler Details", "type": "row" }, { "datasource": { "uid": "hpc-prometheus" }, "description": "The number of current active slurmctld threads.", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [ { "options": { "match": "null", "result": { "text": "N/A" } }, "type": "special" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "none" }, "overrides": [] }, "gridPos": { "h": 5, "w": 8, "x": 0, "y": 83 }, "id": 7, "links": [], "maxDataPoints": 100, "options": { "colorMode": "none", "graphMode": "none", "justifyMode": "auto", "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto" }, "pluginVersion": "8.3.2", "targets": [ { "expr": "slurm_scheduler_threads", "format": "time_series", "intervalFactor": 2, "legendFormat": "Slurm Scheduler Threads", "refId": "A" } ], "title": "Slurm Scheduler Threads", "type": "stat" }, { "datasource": { "uid": "hpc-prometheus" }, "description": "The agent mechanism helps to control communication between the Slrum daemons and the controller for a best effort.", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [ { "options": { "match": "null", "result": { "text": "N/A" } }, "type": "special" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "none" }, "overrides": [] }, "gridPos": { "h": 5, "w": 8, "x": 8, "y": 83 }, "id": 8, "links": [], "maxDataPoints": 100, "options": { "colorMode": "none", "graphMode": "none", "justifyMode": "auto", "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto" }, "pluginVersion": "8.3.2", "targets": [ { "expr": "slurm_scheduler_queue_size", "format": "time_series", "intervalFactor": 2, "legendFormat": "Agent Queue Size", "refId": "A" } ], "title": "Agent Queue Size", "type": "stat" }, { "datasource": { "uid": "hpc-prometheus" }, "fieldConfig": { "defaults": { "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 100 } ] }, "unit": "none" }, "overrides": [] }, "gridPos": { "h": 5, "w": 8, "x": 16, "y": 83 }, "id": 26, "options": { "orientation": "auto", "reduceOptions": { "calcs": [ "last" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true }, "pluginVersion": "8.3.2", "targets": [ { "expr": "slurm_scheduler_dbd_queue_size", "legendFormat": "DBD Agent Queue length", "refId": "A" } ], "title": "DBD Agent Queue Length", "type": "gauge" }, { "collapsed": false, "datasource": { "uid": "hpc-prometheus" }, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 88 }, "id": 16, "panels": [], "title": "SLURM Scheduler Cycles", "type": "row" }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "uid": "hpc-prometheus" }, "fieldConfig": { "defaults": { "links": [] }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 89 }, "hiddenSeries": false, "id": 4, "legend": { "alignAsTable": true, "avg": true, "current": false, "max": false, "min": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "slurm_scheduler_last_cycle", "format": "time_series", "intervalFactor": 2, "legendFormat": "Scheduler Last Cycle Time", "refId": "A" }, { "expr": "slurm_scheduler_mean_cycle", "format": "time_series", "intervalFactor": 2, "legendFormat": "Scheduler Mean Cycle Time", "refId": "B" } ], "thresholds": [], "timeRegions": [], "title": "Scheduler Cycles", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, "yaxes": [ { "format": "µs", "logBase": 1, "show": true }, { "format": "short", "logBase": 1, "show": true } ], "yaxis": { "align": false } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "uid": "hpc-prometheus" }, "fieldConfig": { "defaults": { "links": [] }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 89 }, "hiddenSeries": false, "id": 3, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "slurm_scheduler_backfill_last_cycle", "format": "time_series", "intervalFactor": 2, "legendFormat": "Scheduler Backfill Last Cycle", "refId": "A" }, { "expr": "slurm_scheduler_backfill_mean_cycle", "format": "time_series", "intervalFactor": 2, "legendFormat": "Scheduler Backfill Mean Cycle", "refId": "B" } ], "thresholds": [], "timeRegions": [], "title": "Backfill Scheduler Cycles", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, "yaxes": [ { "format": "µs", "logBase": 1, "show": true }, { "format": "short", "logBase": 1, "show": true } ], "yaxis": { "align": false } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "uid": "hpc-prometheus" }, "fieldConfig": { "defaults": { "links": [] }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 96 }, "hiddenSeries": false, "id": 9, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "slurm_scheduler_backfill_depth_mean", "format": "time_series", "intervalFactor": 2, "legendFormat": "Mean of processed jobs during backfilling scheduling cycles", "refId": "A" } ], "thresholds": [], "timeRegions": [], "title": "Scheduler Backfill Depth Mean", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, "yaxes": [ { "format": "short", "logBase": 1, "show": true }, { "format": "short", "logBase": 1, "show": true } ], "yaxis": { "align": false } }, { "datasource": { "uid": "hpc-prometheus" }, "description": "Number of heterogeneous job components started thanks to backfilling since last Slurm start", "fieldConfig": { "defaults": { "displayName": "", "mappings": [ { "id": 0, "op": "=", "text": "N/A", "type": 1, "value": "null" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "short" }, "overrides": [] }, "gridPos": { "h": 6, "w": 6, "x": 12, "y": 96 }, "id": 34, "links": [], "options": { "orientation": "horizontal", "reduceOptions": { "calcs": [ "last" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true }, "pluginVersion": "8.3.2", "targets": [ { "expr": "slurm_scheduler_backfilled_heterogeneous_total", "legendFormat": "Heterogeneous job components", "refId": "A" } ], "title": " Total backfilled heterogeneous Job components", "type": "gauge" }, { "collapsed": false, "datasource": { "uid": "hpc-prometheus" }, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 102 }, "id": 32, "panels": [], "title": "Total Backfilled Jobs", "type": "row" }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "uid": "hpc-prometheus" }, "description": "Number of jobs started thanks to backfilling since last Slurm start.", "fieldConfig": { "defaults": { "links": [] }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 103 }, "hiddenSeries": false, "id": 28, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { "$$hashKey": "object:1065", "alias": "DELTA: Total number Backfilled Jobs (since last Slurm start)", "yaxis": 2 } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "delta(slurm_scheduler_backfilled_jobs_since_start_total[10m])", "legendFormat": "DELTA: Total number Backfilled Jobs (since last Slurm start)", "refId": "A" }, { "expr": "slurm_scheduler_backfilled_jobs_since_start_total", "legendFormat": "Total number Backfilled Jobs (since last Slurm start)", "refId": "B" } ], "thresholds": [], "timeRegions": [], "title": "Total Backfilled Jobs (since last slurm start)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:1078", "format": "short", "logBase": 1, "show": true }, { "$$hashKey": "object:1079", "format": "short", "logBase": 1, "show": true } ], "yaxis": { "align": false } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "uid": "hpc-prometheus" }, "description": "Number of jobs started thanks to backfilling since last time stats where reset", "fieldConfig": { "defaults": { "links": [] }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 103 }, "hiddenSeries": false, "id": 30, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { "$$hashKey": "object:979", "alias": "DELTA: Total Backfilled Jobs (since last stats cycle start)", "yaxis": 2 } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "slurm_scheduler_backfilled_jobs_since_cycle_total", "legendFormat": " Total Backfilled Jobs (since last stats cycle start)", "refId": "A" }, { "expr": "delta(slurm_scheduler_backfilled_jobs_since_cycle_total[10m])", "legendFormat": "DELTA: Total Backfilled Jobs (since last stats cycle start)", "refId": "B" } ], "thresholds": [], "timeRegions": [], "title": "Total Backfilled Jobs (since last stats cycle start)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:992", "format": "short", "logBase": 1, "show": true }, { "$$hashKey": "object:993", "format": "short", "logBase": 1, "show": true } ], "yaxis": { "align": false } } ], "refresh": "30s", "schemaVersion": 33, "style": "dark", "tags": [ "Slurm" ], "templating": { "list": [] }, "time": { "from": "now-2d", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "", "title": "SLURM - CPUs/GPUs, Nodes, Jobs, Scheduler", "uid": "bX7jn6dZk", "version": 1, "weekStart": "" }