jungle/m/hut/monitoring.nix
Rodrigo Arias Mallo 3a3c3050ef Monitor fox, gateway and UPC anella via ICMP
Fox should reply once the machine is connected to the UPC network.
Monitoring also the gateway and UPC anella allows us to estimate if the
whole network is down or just fox.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-06-02 11:26:51 +02:00

284 lines
8.5 KiB
Nix

{ config, lib, ... }:
{
imports = [
../module/slurm-exporter.nix
../module/meteocat-exporter.nix
../module/upc-qaire-exporter.nix
./gpfs-probe.nix
./nix-daemon-exporter.nix
];
age.secrets.grafanaJungleRobotPassword = {
file = ../../secrets/jungle-robot-password.age;
owner = "grafana";
mode = "400";
};
age.secrets.ipmiYml.file = ../../secrets/ipmi.yml.age;
services.grafana = {
enable = true;
settings = {
server = {
domain = "jungle.bsc.es";
root_url = "%(protocol)s://%(domain)s/grafana";
serve_from_sub_path = true;
http_port = 2342;
http_addr = "127.0.0.1";
};
smtp = {
enabled = true;
from_address = "jungle-robot@bsc.es";
user = "jungle-robot";
# Read the password from a file, which is only readable by grafana user
# https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/#file-provider
password = "$__file{${config.age.secrets.grafanaJungleRobotPassword.path}}";
host = "mail.bsc.es:465";
startTLS_policy = "NoStartTLS";
};
feature_toggles.publicDashboards = true;
"auth.anonymous".enabled = true;
log.level = "warn";
};
};
# Make grafana alerts also use the proxy
systemd.services.grafana.environment = config.networking.proxy.envVars;
services.prometheus = {
enable = true;
port = 9001;
retentionTime = "5y";
listenAddress = "127.0.0.1";
};
systemd.services.prometheus-ipmi-exporter.serviceConfig.DynamicUser = lib.mkForce false;
systemd.services.prometheus-ipmi-exporter.serviceConfig.PrivateDevices = lib.mkForce false;
# We need access to the devices to monitor the disk space
systemd.services.prometheus-node-exporter.serviceConfig.PrivateDevices = lib.mkForce false;
systemd.services.prometheus-node-exporter.serviceConfig.ProtectHome = lib.mkForce "read-only";
virtualisation.docker.daemon.settings = {
metrics-addr = "127.0.0.1:9323";
};
# Required to allow the smartctl exporter to read the nvme0 character device,
# see the commit message on:
# https://github.com/NixOS/nixpkgs/commit/12c26aca1fd55ab99f831bedc865a626eee39f80
services.udev.extraRules = ''
SUBSYSTEM=="nvme", KERNEL=="nvme[0-9]*", GROUP="disk"
'';
services.prometheus = {
exporters = {
ipmi = {
enable = true;
group = "root";
user = "root";
configFile = config.age.secrets.ipmiYml.path;
# extraFlags = [ "--log.level=debug" ];
listenAddress = "127.0.0.1";
};
node = {
enable = true;
enabledCollectors = [ "systemd" "logind" ];
port = 9002;
listenAddress = "127.0.0.1";
};
smartctl = {
enable = true;
listenAddress = "127.0.0.1";
};
blackbox = {
enable = true;
listenAddress = "127.0.0.1";
configFile = ./blackbox.yml;
};
};
scrapeConfigs = [
{
job_name = "xeon07";
static_configs = [{
targets = [
"127.0.0.1:${toString config.services.prometheus.exporters.node.port}"
"127.0.0.1:${toString config.services.prometheus.exporters.ipmi.port}"
"127.0.0.1:9323"
"127.0.0.1:9252"
"127.0.0.1:${toString config.services.prometheus.exporters.smartctl.port}"
"127.0.0.1:9341" # Slurm exporter
"127.0.0.1:9966" # GPFS custom exporter
"127.0.0.1:9999" # Nix-daemon custom exporter
"127.0.0.1:9929" # Meteocat custom exporter
"127.0.0.1:9928" # UPC Qaire custom exporter
"127.0.0.1:${toString config.services.prometheus.exporters.blackbox.port}"
];
}];
}
{
job_name = "ceph";
static_configs = [{
targets = [
"10.0.40.40:9283" # Ceph statistics
"10.0.40.40:9002" # Node exporter
"10.0.40.42:9002" # Node exporter
];
}];
}
{
job_name = "blackbox-http";
metrics_path = "/probe";
params = { module = [ "http_2xx" ]; };
static_configs = [{
targets = [
"https://www.google.com/robots.txt"
"https://pm.bsc.es/"
"https://pm.bsc.es/gitlab/"
"https://jungle.bsc.es/"
"https://gitlab.bsc.es/"
];
}];
relabel_configs = [
{
# Takes the address and sets it in the "target=<xyz>" URL parameter
source_labels = [ "__address__" ];
target_label = "__param_target";
}
{
# Sets the "instance" label with the remote host we are querying
source_labels = [ "__param_target" ];
target_label = "instance";
}
{
# Shows the host target address instead of the blackbox address
target_label = "__address__";
replacement = "127.0.0.1:${toString config.services.prometheus.exporters.blackbox.port}";
}
];
}
{
job_name = "blackbox-icmp";
metrics_path = "/probe";
params = { module = [ "icmp" ]; };
static_configs = [{
targets = [
"1.1.1.1"
"8.8.8.8"
"ssfhead"
"anella-bsc.cesca.cat"
"upc-anella.cesca.cat"
"fox.ac.upc.edu"
"arenys5.ac.upc.edu"
];
}];
relabel_configs = [
{
# Takes the address and sets it in the "target=<xyz>" URL parameter
source_labels = [ "__address__" ];
target_label = "__param_target";
}
{
# Sets the "instance" label with the remote host we are querying
source_labels = [ "__param_target" ];
target_label = "instance";
}
{
# Shows the host target address instead of the blackbox address
target_label = "__address__";
replacement = "127.0.0.1:${toString config.services.prometheus.exporters.blackbox.port}";
}
];
}
{
job_name = "gitea";
static_configs = [{ targets = [ "127.0.0.1:3000" ]; }];
}
{
# Scrape the IPMI info of the hosts remotely via LAN
job_name = "ipmi-lan";
scrape_interval = "1m";
scrape_timeout = "30s";
metrics_path = "/ipmi";
scheme = "http";
relabel_configs = [
{
# Takes the address and sets it in the "target=<xyz>" URL parameter
source_labels = [ "__address__" ];
separator = ";";
regex = "(.*)(:80)?";
target_label = "__param_target";
replacement = "\${1}";
action = "replace";
}
{
# Sets the "instance" label with the remote host we are querying
source_labels = [ "__param_target" ];
separator = ";";
regex = "(.*)-ipmi"; # Remove "-ipm̀i" at the end
target_label = "instance";
replacement = "\${1}";
action = "replace";
}
{
# Sets the fixed "module=lan" URL param
separator = ";";
regex = "(.*)";
target_label = "__param_module";
replacement = "lan";
action = "replace";
}
{
# Sets the target to query as the localhost IPMI exporter
separator = ";";
regex = ".*";
target_label = "__address__";
replacement = "127.0.0.1:9290";
action = "replace";
}
];
# Load the list of targets from another file
file_sd_configs = [
{
files = [ "${./targets.yml}" ];
refresh_interval = "30s";
}
];
}
{
job_name = "ipmi-raccoon";
metrics_path = "/ipmi";
static_configs = [
{ targets = [ "127.0.0.1:9291" ]; }
];
params = {
target = [ "84.88.51.142" ];
module = [ "raccoon" ];
};
}
{
job_name = "raccoon";
static_configs = [
{
targets = [ "127.0.0.1:19002" ]; # Node exporter
}
];
}
{
job_name = "ipmi-fox";
metrics_path = "/ipmi";
static_configs = [
{ targets = [ "127.0.0.1:9290" ]; }
];
params = {
target = [ "fox-ipmi" ];
module = [ "fox" ];
};
}
];
};
}