2023-04-05 17:00:01 +02:00
|
|
|
{ config, lib, ... }:
|
|
|
|
|
|
|
|
{
|
2023-09-21 21:38:34 +02:00
|
|
|
imports = [ ../module/slurm-exporter.nix ];
|
|
|
|
|
2023-04-05 17:00:01 +02:00
|
|
|
# Grafana instance, listening on loopback port 2342 and published under the
# "/grafana" sub-path of jungle.bsc.es.
services.grafana = {
  enable = true;

  settings.server = {
    # Local listener
    http_addr = "127.0.0.1";
    http_port = 2342;

    # Public URL layout: everything lives below <domain>/grafana
    domain = "jungle.bsc.es";
    root_url = "%(protocol)s://%(domain)s/grafana";
    serve_from_sub_path = true;
  };

  # Turn on the public dashboards feature toggle and anonymous access
  settings.feature_toggles.publicDashboards = true;
  settings."auth.anonymous".enabled = true;
};
|
|
|
|
|
2023-10-20 16:04:15 +02:00
|
|
|
# Make Grafana alerts also use the proxy: the Grafana unit receives the
# environment variables defined by networking.proxy.envVars.
systemd.services.grafana = {
  environment = config.networking.proxy.envVars;
};
|
|
|
|
|
2023-04-05 17:00:01 +02:00
|
|
|
# Prometheus server: loopback only, port 9001, data retained for one year.
services.prometheus = {
  enable = true;
  listenAddress = "127.0.0.1";
  port = 9001;
  retentionTime = "1y";
};
|
|
|
|
|
|
|
|
# Override the sandboxing defaults of the IPMI exporter unit.
# NOTE(review): presumably required so the exporter can run as root and reach
# the IPMI device -- confirm.
systemd.services.prometheus-ipmi-exporter.serviceConfig = {
  DynamicUser = lib.mkForce false;
  PrivateDevices = lib.mkForce false;
};

# We need access to the devices to monitor the disk space
systemd.services.prometheus-node-exporter.serviceConfig = {
  PrivateDevices = lib.mkForce false;
  ProtectHome = lib.mkForce "read-only";
};
|
|
|
|
|
2023-04-05 17:00:01 +02:00
|
|
|
# Docker daemon metrics address (loopback, port 9323); the "xeon07" scrape job
# lists the same address among its targets.
virtualisation.docker.daemon.settings.metrics-addr = "127.0.0.1:9323";
|
|
|
|
|
2023-04-18 16:03:46 +02:00
|
|
|
# Assign the NVMe devices to the "disk" group. This is required to allow the
# smartctl exporter to read the nvme0 character device; see the commit
# message on:
# https://github.com/NixOS/nixpkgs/commit/12c26aca1fd55ab99f831bedc865a626eee39f80
services.udev = {
  extraRules = ''
    SUBSYSTEM=="nvme", KERNEL=="nvme[0-9]*", GROUP="disk"
  '';
};
|
|
|
|
|
2023-04-05 17:00:01 +02:00
|
|
|
services.prometheus = {
|
|
|
|
|
|
|
|
# Exporters running on this host; every one of them listens on loopback only.
exporters = {
  # IPMI exporter, run as root:root with the settings from ./ipmi.yml
  ipmi = {
    enable = true;
    listenAddress = "127.0.0.1";
    user = "root";
    group = "root";
    configFile = ./ipmi.yml;
    # Uncomment to get debug logging:
    #extraFlags = [ "--log.level=debug" ];
  };

  # Node exporter on port 9002, with the systemd collector enabled
  node = {
    enable = true;
    listenAddress = "127.0.0.1";
    port = 9002;
    enabledCollectors = [ "systemd" ];
  };

  # smartctl exporter (no port set here, so the module default applies)
  smartctl = {
    enable = true;
    listenAddress = "127.0.0.1";
  };

  # Blackbox exporter, configured from ./blackbox.yml
  blackbox = {
    enable = true;
    listenAddress = "127.0.0.1";
    configFile = ./blackbox.yml;
  };
};
|
|
|
|
|
|
|
|
scrapeConfigs = [
|
|
|
|
{
  # Metrics endpoints of this host, all reached over loopback.
  job_name = "xeon07";
  static_configs =
    let
      exporters = config.services.prometheus.exporters;
      # Builds the "127.0.0.1:<port>" target string for a local port
      onLoopback = port: "127.0.0.1:${toString port}";
    in
    [
      {
        targets = map onLoopback [
          exporters.node.port
          exporters.ipmi.port
          9323 # Same address as virtualisation.docker.daemon.settings.metrics-addr
          9252 # NOTE(review): the service behind this port is not visible here -- confirm
          exporters.smartctl.port
          9341 # Slurm exporter
          exporters.blackbox.port
        ];
      }
    ];
}
|
|
|
|
{
  # Remote endpoints on 10.0.40.40 and 10.0.40.42
  job_name = "ceph";
  static_configs = [
    {
      targets = [
        "10.0.40.40:9283" # Ceph statistics
        "10.0.40.40:9002" # Node exporter
        "10.0.40.42:9002" # Node exporter
      ];
    }
  ];
}
|
2023-10-03 08:58:07 +02:00
|
|
|
{
  # HTTP probes: each listed URL is handed to the local blackbox exporter,
  # which is queried on /probe with module=http_2xx.
  job_name = "blackbox-http";
  metrics_path = "/probe";
  params.module = [ "http_2xx" ];

  # URLs to probe
  static_configs = [
    {
      targets = [
        "https://www.google.com/robots.txt"
        "https://pm.bsc.es/"
        "https://pm.bsc.es/gitlab/"
        "https://jungle.bsc.es/"
        "https://gitlab.bsc.es/"
      ];
    }
  ];

  relabel_configs = [
    # Pass the listed address as the "target=<xyz>" URL parameter
    {
      source_labels = [ "__address__" ];
      target_label = "__param_target";
    }
    # Use the remote host we are querying as the "instance" label
    {
      source_labels = [ "__param_target" ];
      target_label = "instance";
    }
    # Point the scrape itself at the local blackbox exporter
    {
      target_label = "__address__";
      replacement = "127.0.0.1:${toString config.services.prometheus.exporters.blackbox.port}";
    }
  ];
}
|
|
|
|
{
  # ICMP probes: each listed host is handed to the local blackbox exporter,
  # which is queried on /probe with module=icmp.
  job_name = "blackbox-icmp";
  metrics_path = "/probe";
  params.module = [ "icmp" ];

  # Hosts to probe
  static_configs = [
    {
      targets = [
        "1.1.1.1"
        "8.8.8.8"
        "ssfhead"
        "anella-bsc.cesca.cat"
      ];
    }
  ];

  relabel_configs = [
    # Pass the listed address as the "target=<xyz>" URL parameter
    {
      source_labels = [ "__address__" ];
      target_label = "__param_target";
    }
    # Use the remote host we are querying as the "instance" label
    {
      source_labels = [ "__param_target" ];
      target_label = "instance";
    }
    # Point the scrape itself at the local blackbox exporter
    {
      target_label = "__address__";
      replacement = "127.0.0.1:${toString config.services.prometheus.exporters.blackbox.port}";
    }
  ];
}
|
2024-04-29 11:22:45 +02:00
|
|
|
{
  # Single static target on loopback port 3000
  job_name = "gitea";
  static_configs = [
    {
      targets = [ "127.0.0.1:3000" ];
    }
  ];
}
|
2023-08-17 18:55:40 +02:00
|
|
|
{
  # Scrape the IPMI info of the hosts remotely via LAN. Each target is handed
  # to the local IPMI exporter through the "target" and "module" URL
  # parameters of its /ipmi endpoint.
  job_name = "ipmi-lan";
  scrape_interval = "1m";
  scrape_timeout = "30s";
  metrics_path = "/ipmi";
  scheme = "http";
  relabel_configs = [
    {
      # Takes the address, minus a trailing ":80" if present, and sets it in
      # the "target=<xyz>" URL parameter. Relabel regexes are fully anchored,
      # so the first group must be lazy: with a greedy "(.*)" the optional
      # "(:80)" group can never match and the suffix is never stripped.
      source_labels = [ "__address__" ];
      separator = ";";
      regex = "(.*?)(:80)?";
      target_label = "__param_target";
      # The backslash keeps Nix from interpolating; Prometheus sees "${1}"
      replacement = "\${1}";
      action = "replace";
    }
    {
      # Sets the "instance" label with the remote host we are querying
      source_labels = [ "__param_target" ];
      separator = ";";
      regex = "(.*)";
      target_label = "instance";
      replacement = "\${1}";
      action = "replace";
    }
    {
      # Sets the fixed "module=lan" URL param
      separator = ";";
      regex = "(.*)";
      target_label = "__param_module";
      replacement = "lan";
      action = "replace";
    }
    {
      # Sets the target to query as the localhost IPMI exporter. The port is
      # read from the exporter option (as the "xeon07" job already does) so
      # both stay in sync if it is ever changed.
      separator = ";";
      regex = ".*";
      target_label = "__address__";
      replacement = "127.0.0.1:${toString config.services.prometheus.exporters.ipmi.port}";
      action = "replace";
    }
  ];

  # Load the list of targets from another file
  file_sd_configs = [
    {
      files = [ "${./targets.yml}" ];
      refresh_interval = "30s";
    }
  ];
}
|
2023-04-05 17:00:01 +02:00
|
|
|
];
|
|
|
|
};
|
|
|
|
}
|