jungle-backup/m/hut/monitoring.nix
Rodrigo Arias Mallo 9f43a0e13b Remove fox monitoring via IPMI
We will need to setup an VPN to be able to access fox in its new
location, so for now we simply remove the IPMI monitoring.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-06-02 11:26:53 +02:00

273 lines
8.2 KiB
Nix

{ config, lib, ... }:
{
imports = [
../module/slurm-exporter.nix
../module/meteocat-exporter.nix
../module/upc-qaire-exporter.nix
./gpfs-probe.nix
./nix-daemon-exporter.nix
];
age.secrets.grafanaJungleRobotPassword = {
file = ../../secrets/jungle-robot-password.age;
owner = "grafana";
mode = "400";
};
age.secrets.ipmiYml.file = ../../secrets/ipmi.yml.age;
services.grafana = {
enable = true;
settings = {
server = {
domain = "jungle.bsc.es";
root_url = "%(protocol)s://%(domain)s/grafana";
serve_from_sub_path = true;
http_port = 2342;
http_addr = "127.0.0.1";
};
smtp = {
enabled = true;
from_address = "jungle-robot@bsc.es";
user = "jungle-robot";
# Read the password from a file, which is only readable by grafana user
# https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/#file-provider
password = "$__file{${config.age.secrets.grafanaJungleRobotPassword.path}}";
host = "mail.bsc.es:465";
startTLS_policy = "NoStartTLS";
};
feature_toggles.publicDashboards = true;
"auth.anonymous".enabled = true;
log.level = "warn";
};
};
# Make grafana alerts also use the proxy
systemd.services.grafana.environment = config.networking.proxy.envVars;
services.prometheus = {
enable = true;
port = 9001;
retentionTime = "5y";
listenAddress = "127.0.0.1";
};
systemd.services.prometheus-ipmi-exporter.serviceConfig.DynamicUser = lib.mkForce false;
systemd.services.prometheus-ipmi-exporter.serviceConfig.PrivateDevices = lib.mkForce false;
# We need access to the devices to monitor the disk space
systemd.services.prometheus-node-exporter.serviceConfig.PrivateDevices = lib.mkForce false;
systemd.services.prometheus-node-exporter.serviceConfig.ProtectHome = lib.mkForce "read-only";
virtualisation.docker.daemon.settings = {
metrics-addr = "127.0.0.1:9323";
};
# Required to allow the smartctl exporter to read the nvme0 character device,
# see the commit message on:
# https://github.com/NixOS/nixpkgs/commit/12c26aca1fd55ab99f831bedc865a626eee39f80
services.udev.extraRules = ''
SUBSYSTEM=="nvme", KERNEL=="nvme[0-9]*", GROUP="disk"
'';
services.prometheus = {
exporters = {
ipmi = {
enable = true;
group = "root";
user = "root";
configFile = config.age.secrets.ipmiYml.path;
# extraFlags = [ "--log.level=debug" ];
listenAddress = "127.0.0.1";
};
node = {
enable = true;
enabledCollectors = [ "systemd" "logind" ];
port = 9002;
listenAddress = "127.0.0.1";
};
smartctl = {
enable = true;
listenAddress = "127.0.0.1";
};
blackbox = {
enable = true;
listenAddress = "127.0.0.1";
configFile = ./blackbox.yml;
};
};
scrapeConfigs = [
{
job_name = "xeon07";
static_configs = [{
targets = [
"127.0.0.1:${toString config.services.prometheus.exporters.node.port}"
"127.0.0.1:${toString config.services.prometheus.exporters.ipmi.port}"
"127.0.0.1:9323"
"127.0.0.1:9252"
"127.0.0.1:${toString config.services.prometheus.exporters.smartctl.port}"
"127.0.0.1:9341" # Slurm exporter
"127.0.0.1:9966" # GPFS custom exporter
"127.0.0.1:9999" # Nix-daemon custom exporter
"127.0.0.1:9929" # Meteocat custom exporter
"127.0.0.1:9928" # UPC Qaire custom exporter
"127.0.0.1:${toString config.services.prometheus.exporters.blackbox.port}"
];
}];
}
{
job_name = "ceph";
static_configs = [{
targets = [
"10.0.40.40:9283" # Ceph statistics
"10.0.40.40:9002" # Node exporter
"10.0.40.42:9002" # Node exporter
];
}];
}
{
job_name = "blackbox-http";
metrics_path = "/probe";
params = { module = [ "http_2xx" ]; };
static_configs = [{
targets = [
"https://www.google.com/robots.txt"
"https://pm.bsc.es/"
"https://pm.bsc.es/gitlab/"
"https://jungle.bsc.es/"
"https://gitlab.bsc.es/"
];
}];
relabel_configs = [
{
# Takes the address and sets it in the "target=<xyz>" URL parameter
source_labels = [ "__address__" ];
target_label = "__param_target";
}
{
# Sets the "instance" label with the remote host we are querying
source_labels = [ "__param_target" ];
target_label = "instance";
}
{
# Shows the host target address instead of the blackbox address
target_label = "__address__";
replacement = "127.0.0.1:${toString config.services.prometheus.exporters.blackbox.port}";
}
];
}
{
job_name = "blackbox-icmp";
metrics_path = "/probe";
params = { module = [ "icmp" ]; };
static_configs = [{
targets = [
"1.1.1.1"
"8.8.8.8"
"ssfhead"
"anella-bsc.cesca.cat"
"upc-anella.cesca.cat"
"fox.ac.upc.edu"
"arenys5.ac.upc.edu"
];
}];
relabel_configs = [
{
# Takes the address and sets it in the "target=<xyz>" URL parameter
source_labels = [ "__address__" ];
target_label = "__param_target";
}
{
# Sets the "instance" label with the remote host we are querying
source_labels = [ "__param_target" ];
target_label = "instance";
}
{
# Shows the host target address instead of the blackbox address
target_label = "__address__";
replacement = "127.0.0.1:${toString config.services.prometheus.exporters.blackbox.port}";
}
];
}
{
job_name = "gitea";
static_configs = [{ targets = [ "127.0.0.1:3000" ]; }];
}
{
# Scrape the IPMI info of the hosts remotely via LAN
job_name = "ipmi-lan";
scrape_interval = "1m";
scrape_timeout = "30s";
metrics_path = "/ipmi";
scheme = "http";
relabel_configs = [
{
# Takes the address and sets it in the "target=<xyz>" URL parameter
source_labels = [ "__address__" ];
separator = ";";
regex = "(.*)(:80)?";
target_label = "__param_target";
replacement = "\${1}";
action = "replace";
}
{
# Sets the "instance" label with the remote host we are querying
source_labels = [ "__param_target" ];
separator = ";";
regex = "(.*)-ipmi"; # Remove "-ipm̀i" at the end
target_label = "instance";
replacement = "\${1}";
action = "replace";
}
{
# Sets the fixed "module=lan" URL param
separator = ";";
regex = "(.*)";
target_label = "__param_module";
replacement = "lan";
action = "replace";
}
{
# Sets the target to query as the localhost IPMI exporter
separator = ";";
regex = ".*";
target_label = "__address__";
replacement = "127.0.0.1:9290";
action = "replace";
}
];
# Load the list of targets from another file
file_sd_configs = [
{
files = [ "${./targets.yml}" ];
refresh_interval = "30s";
}
];
}
{
job_name = "ipmi-raccoon";
metrics_path = "/ipmi";
static_configs = [
{ targets = [ "127.0.0.1:9291" ]; }
];
params = {
target = [ "84.88.51.142" ];
module = [ "raccoon" ];
};
}
{
job_name = "raccoon";
static_configs = [
{
targets = [ "127.0.0.1:19002" ]; # Node exporter
}
];
}
];
};
}