From a7775f9a8d5532aaf0c9796dd71b7ab87f81aacb Mon Sep 17 00:00:00 2001 From: Rodrigo Arias Mallo Date: Thu, 12 Jun 2025 10:32:31 +0200 Subject: [PATCH] Add monitoring in tent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Aleix Boné Reviewed-by: Aleix Roca Nonell --- m/tent/blackbox.yml | 13 +++ m/tent/configuration.nix | 1 + m/tent/monitoring.nix | 187 +++++++++++++++++++++++++++++++++++++++ secrets/ipmi.yml.age | Bin 1294 -> 1563 bytes secrets/secrets.nix | 3 +- 5 files changed, 203 insertions(+), 1 deletion(-) create mode 100644 m/tent/blackbox.yml create mode 100644 m/tent/monitoring.nix diff --git a/m/tent/blackbox.yml b/m/tent/blackbox.yml new file mode 100644 index 0000000..d716359 --- /dev/null +++ b/m/tent/blackbox.yml @@ -0,0 +1,13 @@ +modules: + http_2xx: + prober: http + timeout: 5s + http: + follow_redirects: true + valid_status_codes: [] # Defaults to 2xx + method: GET + icmp: + prober: icmp + timeout: 5s + icmp: + preferred_ip_protocol: "ip4" diff --git a/m/tent/configuration.nix b/m/tent/configuration.nix index 07b9244..1495755 100644 --- a/m/tent/configuration.nix +++ b/m/tent/configuration.nix @@ -6,6 +6,7 @@ ../module/emulation.nix ../module/debuginfod.nix ../module/ssh-hut-extern.nix + ./monitoring.nix ]; # Select the this using the ID to avoid mismatches diff --git a/m/tent/monitoring.nix b/m/tent/monitoring.nix new file mode 100644 index 0000000..e8b38dc --- /dev/null +++ b/m/tent/monitoring.nix @@ -0,0 +1,187 @@ +{ config, lib, pkgs, ... 
}:
+
+{
+  imports = [
+    ../module/meteocat-exporter.nix
+    ../module/upc-qaire-exporter.nix
+  ];
+
+  services.grafana = { # Grafana, listening only on 127.0.0.1:2342
+    enable = true;
+    settings = {
+      server = {
+        domain = "localhost";
+        #domain = "jungle.bsc.es";
+        #root_url = "%(protocol)s://%(domain)s/grafana";
+        #serve_from_sub_path = true;
+        http_port = 2342;
+        http_addr = "127.0.0.1";
+      };
+      feature_toggles.publicDashboards = true;
+      "auth.anonymous".enabled = true;
+      log.level = "warn";
+    };
+  };
+
+  services.prometheus = { # Prometheus server, listening only on 127.0.0.1:9001
+    enable = true;
+    port = 9001;
+    retentionTime = "5y";
+    listenAddress = "127.0.0.1";
+  };
+
+  # The node exporter needs to see the real devices (and /home, read-only) to monitor the disk space
+  systemd.services.prometheus-node-exporter.serviceConfig.PrivateDevices = lib.mkForce false;
+  systemd.services.prometheus-node-exporter.serviceConfig.ProtectHome = lib.mkForce "read-only";
+
+  # Credentials for the IPMI exporter, owned by the ipmi-exporter user defined below
+  age.secrets.ipmiYml = {
+    file = ../../secrets/ipmi.yml.age;
+    owner = "ipmi-exporter";
+  };
+
+  # Create an ipmi group and give it read/write access (mode 0660) to the ipmi0 device
+  users.groups.ipmi = {};
+  services.udev.extraRules = ''
+    SUBSYSTEM=="ipmi", KERNEL=="ipmi0", GROUP="ipmi", MODE="0660"
+  '';
+
+  # Add a system user ipmi-exporter in the ipmi group, so it can access the ipmi0 device
+  users.users.ipmi-exporter = {
+    isSystemUser = true;
+    group = "ipmi";
+  };
+
+  # Run as the static ipmi-exporter user instead of a dynamic one, so the credentials secret can be owned by it
+  systemd.services.prometheus-ipmi-exporter.serviceConfig = {
+    DynamicUser = lib.mkForce false;
+    PrivateDevices = lib.mkForce false;
+    User = lib.mkForce "ipmi-exporter";
+    Group = lib.mkForce "ipmi";
+    RestrictNamespaces = lib.mkForce false; # NOTE(review): presumably needed by the unshare call below, confirm
+    # Start the exporter through `unshare --map-user 0` so it sees itself as uid 0 and stops complaining (presumably about not running as root)
+    ExecStart = let
+      cfg = config.services.prometheus.exporters.ipmi;
+    in lib.mkForce (lib.concatStringsSep " " ([
+      "${pkgs.util-linux}/bin/unshare --map-user 0"
+      "${pkgs.prometheus-ipmi-exporter}/bin/ipmi_exporter"
+      "--web.listen-address ${cfg.listenAddress}:${toString cfg.port}"
+      "--config.file ${lib.escapeShellArg cfg.configFile}"
+    ] ++ cfg.extraFlags));
+  };
+
+  services.prometheus = { # Exporters and scrape jobs for the Prometheus server enabled above
+    exporters = {
+      ipmi = {
+        enable = true;
+        configFile = config.age.secrets.ipmiYml.path;
+        #extraFlags = [ "--log.level=debug" ];
+        listenAddress = "127.0.0.1";
+      };
+      node = {
+        enable = true;
+        enabledCollectors = [ "logind" ];
+        port = 9002;
+        listenAddress = "127.0.0.1";
+      };
+      blackbox = {
+        enable = true;
+        listenAddress = "127.0.0.1";
+        configFile = ./blackbox.yml;
+      };
+    };
+
+    scrapeConfigs = [
+      {
+        job_name = "local";
+        static_configs = [{
+          targets = [
+            "127.0.0.1:9002" # Node exporter
+            #"127.0.0.1:9115" # Blackbox exporter
+            "127.0.0.1:9290" # IPMI exporter for local node
+            "127.0.0.1:9928" # UPC Qaire custom exporter
+            "127.0.0.1:9929" # Meteocat custom exporter
+          ];
+        }];
+      }
+      {
+        job_name = "blackbox-http";
+        metrics_path = "/probe";
+        params = { module = [ "http_2xx" ]; }; # Module defined in ./blackbox.yml
+        static_configs = [{
+          targets = [
+            "https://www.google.com/robots.txt"
+            "https://pm.bsc.es/"
+            "https://pm.bsc.es/gitlab/"
+            "https://jungle.bsc.es/"
+            "https://gitlab.bsc.es/"
+          ];
+        }];
+        relabel_configs = [
+          {
+            # Copies the target address into the "target=" URL parameter of the probe request
+            source_labels = [ "__address__" ];
+            target_label = "__param_target";
+          }
+          {
+            # Sets the "instance" label to the remote host we are querying
+            source_labels = [ "__param_target" ];
+            target_label = "instance";
+          }
+          {
+            # Sends the scrape to the local blackbox exporter (127.0.0.1:9115) instead of the target itself
+            target_label = "__address__";
+            replacement = "127.0.0.1:9115";
+          }
+        ];
+      }
+      {
+        job_name = "blackbox-icmp";
+        metrics_path = "/probe";
+        params = { module = [ "icmp" ]; }; # Module defined in ./blackbox.yml
+        static_configs = [{
+          targets = [
+            "1.1.1.1"
+            "8.8.8.8"
+            "ssfhead"
+            "raccoon"
+            "anella-bsc.cesca.cat"
+            "upc-anella.cesca.cat"
+            "fox.ac.upc.edu"
+            "arenys5.ac.upc.edu"
+            "arenys0-2.ac.upc.edu"
+            "epi01.bsc.es"
+          ];
+        }];
+        relabel_configs = [
+          {
+            # Copies the target address into the "target=" URL parameter of the probe request
+            source_labels = [ "__address__" ];
+
target_label = "__param_target"; + } + { + # Sets the "instance" label with the remote host we are querying + source_labels = [ "__param_target" ]; + target_label = "instance"; + } + { + # Shows the host target address instead of the blackbox address + target_label = "__address__"; + replacement = "127.0.0.1:9115"; + } + ]; + } + { + job_name = "ipmi-raccoon"; + metrics_path = "/ipmi"; + static_configs = [ + { targets = [ "127.0.0.1:9290" ]; } + ]; + params = { + target = [ "raccoon-ipmi" ]; + module = [ "raccoon" ]; + }; + } + ]; + }; +} diff --git a/secrets/ipmi.yml.age b/secrets/ipmi.yml.age index e445aaed3f2865c4dac62c68d45c607eb37fea4b..02d1218774bb6ebfcbdcf14b93c3f410e9efa2b4 100644 GIT binary patch delta 1499 zcmeC(9$d~&9$s5!XPLpGtV<6*VEKBE!Wi0 zm&-86FFUEkEy}>W$}~LK*;C)UGRQJhJ1{w=JlD&`sGuOjIX$(qs>&lTluOr6p}06h zH#Nn`)YQ;Y!QIp+!n9n$&m*tG#k;7~-(0`Q%{eA314=^kC&x32*N2olC0qLC=vRa%=B1`O zdloDx`avrx}}g1Sk5ql^P^ETbOxN6}hAohU9oT z7ACox_?3i))u&b%=jD54MRJ`!lkc^A&VO#6KWC5a={y;y^!!AzqoCr84%bZu`&<5h zs_whJd+TwTZ~PsXd_G>2?3}&!96L+j4y}k-m;Nd+EZ|h=bi{|foxaDliG1sp8_gSljUc8!O)bYNB z&)rV+M{SGlwvCc4Tu0P+&7_6fCf||q>EOThWyRa?crO{q4!VvmdP9m%gpZY(@j zB{%DdN|Nu>+JkeQ5{@+7aIb$L^iA<$`StTF4om#pc!T%mq@aCw548V1b@TIudq;$B zWp+0G%E@?g*Y~S>%hwwILgxG1`07l~ITrhCU!MBKUv!r4RxZy#HPNWwiDC;+#UEro z|N9K{56x}q%B`$N4O5=K+0WOpI!mk3XMK8AL|cu@y}Ld4zL@>8n*X`*GBdBMa($g^ zgP{`LA~pb(zORrMs4&WYoU*=Xz1OJjlKoBMVeUr_ zY||GwXU5ix$=<&{%VY+3R^v?mE8lGI`EGn=QZ^~K^4Z~QmbSOpl(LyGX>PrLRM2Xc z`}9PwdCrDEPv^L7eC^_sCeZmavxYs{PeI_^lvIl)905|x6*u>+Fx3UW!tDbr4Zsoeq;fu3R9{9$g>b}LOZ0UD*Cu5hwQ2t)IleumwE3LKP z-nkj0pyZG$AEEu!weGj`jBmS}*TkCEPAxrj?^bA-Ucod~*A;f}ADuj-A$Q~XQq(ygS(Jk30`)XU8{#VuIdapFhu@DfYc zbjz^F07vZ&p>w*15d-0 z;K%|~XCLQeQ)lPGRP#jRoG?o_pU5&7=lm$`s<4RU$?=Th;cg`!1#YS4X@TJeZk4V9 zF2Ui386k=JA*DXruKJZp9*+6><&NPlmARqGT%|t6nVv=I#zu*55zc0L*&%6eKBeh~ zc}Bt7i3JAcDan@Z1y1g%8A%4&lg~1W*K4O4XH|MSxw&Neq@_lBdiezvyBT-z=d7HLZdyvxEkl?dKZ=iWt3HAxcg~4hox7gcyoaOvvmD(LH5IA$8;6#E%wM^=~zg_Rf=hB+Jgn??DhXXKlkdb?yK*M}4v 
zr}|Wcg>tE_kDbhEr1iY*%cE_-*4}-nd*gqCNL|#j%Yi9RR!!-idTaM-d+(+D8t>=* zYHNKri;HW;w@rp^8)bY&4@6o1lITCg{r3Aqw|A$t zwC29+zh``=KWBgDciB|4lQq#NTvvQ=$+4SSFB9hMH)AD-(wF&u^Wu4?tX#(}A$vb* zQ}x7(+f(zsTZ(=4*B+bLa<5O~!)Aw8h52(f?4Bfls46-5cFe3D(N8n>NVeK-YGTlm zdwxsNVng3P?R$JSi%RPaiiM|qVCP6^E#lsM>c@XyXWzo(TCw}0gSOpX{Arr2@dyR>3My1aai1^>Dv^-DdrUm608qU*wEF?|iF zs9t*AF#p~1M>7MNo-3DZ*cLMzpH57#Kfy- zCHcgF<(euBiOQ zJ4?d*e{pX!``1<-<&)d%C;iGx%-QQ8FM315^6j>If9v-FPFpIf*LsTgn7>ccOq`Rh zHOJMs)9~fvO>+E_9Xk&1IU4nMjY8@CN1=*3S@iq#NSo(tVuQ2o zb92j<70tL%rT5vM+m_7#^!0vG)~>gbY>9zPhvm;Y8rk3cy8FMf?vFgl#*-{s zFAs&Ohj`8FzCUqV-nZZbcJJow7f5ei`(w(#QtgNJKd&yhR;{vdYOBus4(F@c9705-xGK>z>% diff --git a/secrets/secrets.nix b/secrets/secrets.nix index 58ec2d6..30fe1dd 100644 --- a/secrets/secrets.nix +++ b/secrets/secrets.nix @@ -2,6 +2,7 @@ let keys = import ../keys.nix; adminsKeys = builtins.attrValues keys.admins; hut = [ keys.hosts.hut ] ++ adminsKeys; + mon = [ keys.hosts.hut keys.hosts.tent ] ++ adminsKeys; # Only expose ceph keys to safe nodes and admins safe = keys.hostGroup.safe ++ adminsKeys; in @@ -12,7 +13,7 @@ in "gitlab-bsc-docker-token.age".publicKeys = hut; "nix-serve.age".publicKeys = hut; "jungle-robot-password.age".publicKeys = hut; - "ipmi.yml.age".publicKeys = hut; + "ipmi.yml.age".publicKeys = mon; "ceph-user.age".publicKeys = safe; "munge-key.age".publicKeys = safe;