Add monitoring in tent
Reviewed-by: Aleix Boné <abonerib@bsc.es> Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
This commit is contained in:
parent
d0fd8cde46
commit
61e6d3232b
13
m/tent/blackbox.yml
Normal file
13
m/tent/blackbox.yml
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
modules:
|
||||||
|
http_2xx:
|
||||||
|
prober: http
|
||||||
|
timeout: 5s
|
||||||
|
http:
|
||||||
|
follow_redirects: true
|
||||||
|
valid_status_codes: [] # Defaults to 2xx
|
||||||
|
method: GET
|
||||||
|
icmp:
|
||||||
|
prober: icmp
|
||||||
|
timeout: 5s
|
||||||
|
icmp:
|
||||||
|
preferred_ip_protocol: "ip4"
|
||||||
@ -6,6 +6,7 @@
|
|||||||
../module/emulation.nix
|
../module/emulation.nix
|
||||||
../module/debuginfod.nix
|
../module/debuginfod.nix
|
||||||
../module/ssh-hut-extern.nix
|
../module/ssh-hut-extern.nix
|
||||||
|
./monitoring.nix
|
||||||
];
|
];
|
||||||
|
|
||||||
# Select this using the ID to avoid mismatches
|
# Select this using the ID to avoid mismatches
|
||||||
|
|||||||
187
m/tent/monitoring.nix
Normal file
187
m/tent/monitoring.nix
Normal file
@ -0,0 +1,187 @@
|
|||||||
|
{ config, lib, pkgs, ... }:
|
||||||
|
|
||||||
|
{
|
||||||
|
imports = [
|
||||||
|
../module/meteocat-exporter.nix
|
||||||
|
../module/upc-qaire-exporter.nix
|
||||||
|
];
|
||||||
|
|
||||||
|
services.grafana = {
|
||||||
|
enable = true;
|
||||||
|
settings = {
|
||||||
|
server = {
|
||||||
|
domain = "localhost";
|
||||||
|
#domain = "jungle.bsc.es";
|
||||||
|
#root_url = "%(protocol)s://%(domain)s/grafana";
|
||||||
|
#serve_from_sub_path = true;
|
||||||
|
http_port = 2342;
|
||||||
|
http_addr = "127.0.0.1";
|
||||||
|
};
|
||||||
|
feature_toggles.publicDashboards = true;
|
||||||
|
"auth.anonymous".enabled = true;
|
||||||
|
log.level = "warn";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
services.prometheus = {
|
||||||
|
enable = true;
|
||||||
|
port = 9001;
|
||||||
|
retentionTime = "5y";
|
||||||
|
listenAddress = "127.0.0.1";
|
||||||
|
};
|
||||||
|
|
||||||
|
# We need access to the devices to monitor the disk space
|
||||||
|
systemd.services.prometheus-node-exporter.serviceConfig.PrivateDevices = lib.mkForce false;
|
||||||
|
systemd.services.prometheus-node-exporter.serviceConfig.ProtectHome = lib.mkForce "read-only";
|
||||||
|
|
||||||
|
# Credentials for IPMI exporter
|
||||||
|
age.secrets.ipmiYml = {
|
||||||
|
file = ../../secrets/ipmi.yml.age;
|
||||||
|
owner = "ipmi-exporter";
|
||||||
|
};
|
||||||
|
|
||||||
|
# Create an IPMI group and assign the ipmi0 device
|
||||||
|
users.groups.ipmi = {};
|
||||||
|
services.udev.extraRules = ''
|
||||||
|
SUBSYSTEM=="ipmi", KERNEL=="ipmi0", GROUP="ipmi", MODE="0660"
|
||||||
|
'';
|
||||||
|
|
||||||
|
# Add a new ipmi-exporter user that can read the ipmi0 device
|
||||||
|
users.users.ipmi-exporter = {
|
||||||
|
isSystemUser = true;
|
||||||
|
group = "ipmi";
|
||||||
|
};
|
||||||
|
|
||||||
|
# Disable dynamic user so we have the ipmi-exporter user available for the credentials
|
||||||
|
systemd.services.prometheus-ipmi-exporter.serviceConfig = {
|
||||||
|
DynamicUser = lib.mkForce false;
|
||||||
|
PrivateDevices = lib.mkForce false;
|
||||||
|
User = lib.mkForce "ipmi-exporter";
|
||||||
|
Group = lib.mkForce "ipmi";
|
||||||
|
RestrictNamespaces = lib.mkForce false;
|
||||||
|
# Map our uid to 0 in a user namespace (unshare --map-user 0) so the exporter stops complaining
|
||||||
|
ExecStart = let
|
||||||
|
cfg = config.services.prometheus.exporters.ipmi;
|
||||||
|
in lib.mkForce (lib.concatStringsSep " " ([
|
||||||
|
"${pkgs.util-linux}/bin/unshare --map-user 0"
|
||||||
|
"${pkgs.prometheus-ipmi-exporter}/bin/ipmi_exporter"
|
||||||
|
"--web.listen-address ${cfg.listenAddress}:${toString cfg.port}"
|
||||||
|
"--config.file ${lib.escapeShellArg cfg.configFile}"
|
||||||
|
] ++ cfg.extraFlags));
|
||||||
|
};
|
||||||
|
|
||||||
|
services.prometheus = {
|
||||||
|
exporters = {
|
||||||
|
ipmi = {
|
||||||
|
enable = true;
|
||||||
|
configFile = config.age.secrets.ipmiYml.path;
|
||||||
|
#extraFlags = [ "--log.level=debug" ];
|
||||||
|
listenAddress = "127.0.0.1";
|
||||||
|
};
|
||||||
|
node = {
|
||||||
|
enable = true;
|
||||||
|
enabledCollectors = [ "logind" ];
|
||||||
|
port = 9002;
|
||||||
|
listenAddress = "127.0.0.1";
|
||||||
|
};
|
||||||
|
blackbox = {
|
||||||
|
enable = true;
|
||||||
|
listenAddress = "127.0.0.1";
|
||||||
|
configFile = ./blackbox.yml;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
scrapeConfigs = [
|
||||||
|
{
|
||||||
|
job_name = "local";
|
||||||
|
static_configs = [{
|
||||||
|
targets = [
|
||||||
|
"127.0.0.1:9002" # Node exporter
|
||||||
|
#"127.0.0.1:9115" # Blackbox exporter
|
||||||
|
"127.0.0.1:9290" # IPMI exporter for local node
|
||||||
|
"127.0.0.1:9928" # UPC Qaire custom exporter
|
||||||
|
"127.0.0.1:9929" # Meteocat custom exporter
|
||||||
|
];
|
||||||
|
}];
|
||||||
|
}
|
||||||
|
{
|
||||||
|
job_name = "blackbox-http";
|
||||||
|
metrics_path = "/probe";
|
||||||
|
params = { module = [ "http_2xx" ]; };
|
||||||
|
static_configs = [{
|
||||||
|
targets = [
|
||||||
|
"https://www.google.com/robots.txt"
|
||||||
|
"https://pm.bsc.es/"
|
||||||
|
"https://pm.bsc.es/gitlab/"
|
||||||
|
"https://jungle.bsc.es/"
|
||||||
|
"https://gitlab.bsc.es/"
|
||||||
|
];
|
||||||
|
}];
|
||||||
|
relabel_configs = [
|
||||||
|
{
|
||||||
|
# Takes the address and sets it in the "target=<xyz>" URL parameter
|
||||||
|
source_labels = [ "__address__" ];
|
||||||
|
target_label = "__param_target";
|
||||||
|
}
|
||||||
|
{
|
||||||
|
# Sets the "instance" label with the remote host we are querying
|
||||||
|
source_labels = [ "__param_target" ];
|
||||||
|
target_label = "instance";
|
||||||
|
}
|
||||||
|
{
|
||||||
|
# Sends the scrape request to the local blackbox exporter instead of the probed target
|
||||||
|
target_label = "__address__";
|
||||||
|
replacement = "127.0.0.1:9115";
|
||||||
|
}
|
||||||
|
];
|
||||||
|
}
|
||||||
|
{
|
||||||
|
job_name = "blackbox-icmp";
|
||||||
|
metrics_path = "/probe";
|
||||||
|
params = { module = [ "icmp" ]; };
|
||||||
|
static_configs = [{
|
||||||
|
targets = [
|
||||||
|
"1.1.1.1"
|
||||||
|
"8.8.8.8"
|
||||||
|
"ssfhead"
|
||||||
|
"raccoon"
|
||||||
|
"anella-bsc.cesca.cat"
|
||||||
|
"upc-anella.cesca.cat"
|
||||||
|
"fox.ac.upc.edu"
|
||||||
|
"arenys5.ac.upc.edu"
|
||||||
|
"arenys0-2.ac.upc.edu"
|
||||||
|
"epi01.bsc.es"
|
||||||
|
];
|
||||||
|
}];
|
||||||
|
relabel_configs = [
|
||||||
|
{
|
||||||
|
# Takes the address and sets it in the "target=<xyz>" URL parameter
|
||||||
|
source_labels = [ "__address__" ];
|
||||||
|
target_label = "__param_target";
|
||||||
|
}
|
||||||
|
{
|
||||||
|
# Sets the "instance" label with the remote host we are querying
|
||||||
|
source_labels = [ "__param_target" ];
|
||||||
|
target_label = "instance";
|
||||||
|
}
|
||||||
|
{
|
||||||
|
# Sends the scrape request to the local blackbox exporter instead of the probed target
|
||||||
|
target_label = "__address__";
|
||||||
|
replacement = "127.0.0.1:9115";
|
||||||
|
}
|
||||||
|
];
|
||||||
|
}
|
||||||
|
{
|
||||||
|
job_name = "ipmi-raccoon";
|
||||||
|
metrics_path = "/ipmi";
|
||||||
|
static_configs = [
|
||||||
|
{ targets = [ "127.0.0.1:9290" ]; }
|
||||||
|
];
|
||||||
|
params = {
|
||||||
|
target = [ "raccoon-ipmi" ];
|
||||||
|
module = [ "raccoon" ];
|
||||||
|
};
|
||||||
|
}
|
||||||
|
];
|
||||||
|
};
|
||||||
|
}
|
||||||
Binary file not shown.
@ -2,6 +2,7 @@ let
|
|||||||
keys = import ../keys.nix;
|
keys = import ../keys.nix;
|
||||||
adminsKeys = builtins.attrValues keys.admins;
|
adminsKeys = builtins.attrValues keys.admins;
|
||||||
hut = [ keys.hosts.hut ] ++ adminsKeys;
|
hut = [ keys.hosts.hut ] ++ adminsKeys;
|
||||||
|
mon = [ keys.hosts.hut keys.hosts.tent ] ++ adminsKeys;
|
||||||
# Only expose ceph keys to safe nodes and admins
|
# Only expose ceph keys to safe nodes and admins
|
||||||
safe = keys.hostGroup.safe ++ adminsKeys;
|
safe = keys.hostGroup.safe ++ adminsKeys;
|
||||||
in
|
in
|
||||||
@ -12,7 +13,7 @@ in
|
|||||||
"gitlab-bsc-docker-token.age".publicKeys = hut;
|
"gitlab-bsc-docker-token.age".publicKeys = hut;
|
||||||
"nix-serve.age".publicKeys = hut;
|
"nix-serve.age".publicKeys = hut;
|
||||||
"jungle-robot-password.age".publicKeys = hut;
|
"jungle-robot-password.age".publicKeys = hut;
|
||||||
"ipmi.yml.age".publicKeys = hut;
|
"ipmi.yml.age".publicKeys = mon;
|
||||||
|
|
||||||
"ceph-user.age".publicKeys = safe;
|
"ceph-user.age".publicKeys = safe;
|
||||||
"munge-key.age".publicKeys = safe;
|
"munge-key.age".publicKeys = safe;
|
||||||
|
|||||||
Reference in New Issue
Block a user