diff --git a/m/tent/blackbox.yml b/m/tent/blackbox.yml new file mode 100644 index 0000000..d716359 --- /dev/null +++ b/m/tent/blackbox.yml @@ -0,0 +1,13 @@ +modules: + http_2xx: + prober: http + timeout: 5s + http: + follow_redirects: true + valid_status_codes: [] # Defaults to 2xx + method: GET + icmp: + prober: icmp + timeout: 5s + icmp: + preferred_ip_protocol: "ip4" diff --git a/m/tent/configuration.nix b/m/tent/configuration.nix index 07b9244..1495755 100644 --- a/m/tent/configuration.nix +++ b/m/tent/configuration.nix @@ -6,6 +6,7 @@ ../module/emulation.nix ../module/debuginfod.nix ../module/ssh-hut-extern.nix + ./monitoring.nix ]; # Select the this using the ID to avoid mismatches diff --git a/m/tent/monitoring.nix b/m/tent/monitoring.nix new file mode 100644 index 0000000..e8b38dc --- /dev/null +++ b/m/tent/monitoring.nix @@ -0,0 +1,187 @@ +{ config, lib, pkgs, ... }: + +{ + imports = [ + ../module/meteocat-exporter.nix + ../module/upc-qaire-exporter.nix + ]; + + services.grafana = { + enable = true; + settings = { + server = { + domain = "localhost"; + #domain = "jungle.bsc.es"; + #root_url = "%(protocol)s://%(domain)s/grafana"; + #serve_from_sub_path = true; + http_port = 2342; + http_addr = "127.0.0.1"; + }; + feature_toggles.publicDashboards = true; + "auth.anonymous".enabled = true; + log.level = "warn"; + }; + }; + + services.prometheus = { + enable = true; + port = 9001; + retentionTime = "5y"; + listenAddress = "127.0.0.1"; + }; + + # We need access to the devices to monitor the disk space + systemd.services.prometheus-node-exporter.serviceConfig.PrivateDevices = lib.mkForce false; + systemd.services.prometheus-node-exporter.serviceConfig.ProtectHome = lib.mkForce "read-only"; + + # Credentials for IPMI exporter + age.secrets.ipmiYml = { + file = ../../secrets/ipmi.yml.age; + owner = "ipmi-exporter"; + }; + + # Create an IPMI group and assign the ipmi0 device to it + users.groups.ipmi = {}; + services.udev.extraRules = '' + SUBSYSTEM=="ipmi", 
KERNEL=="ipmi0", GROUP="ipmi", MODE="0660" + ''; + + # Add a new ipmi-exporter user that can read the ipmi0 device + users.users.ipmi-exporter = { + isSystemUser = true; + group = "ipmi"; + }; + + # Disable dynamic user so we have the ipmi-exporter user available for the credentials + systemd.services.prometheus-ipmi-exporter.serviceConfig = { + DynamicUser = lib.mkForce false; + PrivateDevices = lib.mkForce false; + User = lib.mkForce "ipmi-exporter"; + Group = lib.mkForce "ipmi"; + RestrictNamespaces = lib.mkForce false; + # Run under "unshare --map-user 0" so the exporter sees uid 0 and shuts up + ExecStart = let + cfg = config.services.prometheus.exporters.ipmi; + in lib.mkForce (lib.concatStringsSep " " ([ + "${pkgs.util-linux}/bin/unshare --map-user 0" + "${pkgs.prometheus-ipmi-exporter}/bin/ipmi_exporter" + "--web.listen-address ${cfg.listenAddress}:${toString cfg.port}" + "--config.file ${lib.escapeShellArg cfg.configFile}" + ] ++ cfg.extraFlags)); + }; + + services.prometheus = { + exporters = { + ipmi = { + enable = true; + configFile = config.age.secrets.ipmiYml.path; + #extraFlags = [ "--log.level=debug" ]; + listenAddress = "127.0.0.1"; + }; + node = { + enable = true; + enabledCollectors = [ "logind" ]; + port = 9002; + listenAddress = "127.0.0.1"; + }; + blackbox = { + enable = true; + listenAddress = "127.0.0.1"; + configFile = ./blackbox.yml; + }; + }; + + scrapeConfigs = [ + { + job_name = "local"; + static_configs = [{ + targets = [ + "127.0.0.1:9002" # Node exporter + #"127.0.0.1:9115" # Blackbox exporter + "127.0.0.1:9290" # IPMI exporter for local node + "127.0.0.1:9928" # UPC Qaire custom exporter + "127.0.0.1:9929" # Meteocat custom exporter + ]; + }]; + } + { + job_name = "blackbox-http"; + metrics_path = "/probe"; + params = { module = [ "http_2xx" ]; }; + static_configs = [{ + targets = [ + "https://www.google.com/robots.txt" + "https://pm.bsc.es/" + "https://pm.bsc.es/gitlab/" + "https://jungle.bsc.es/" + "https://gitlab.bsc.es/" + ]; + }]; + relabel_configs = [ + { + # Takes the 
address and sets it in the "target=" URL parameter + source_labels = [ "__address__" ]; + target_label = "__param_target"; + } + { + # Sets the "instance" label with the remote host we are querying + source_labels = [ "__param_target" ]; + target_label = "instance"; + } + { + # Points the scrape at the local blackbox exporter; the probed host stays in "target=" + target_label = "__address__"; + replacement = "127.0.0.1:9115"; + } + ]; + } + { + job_name = "blackbox-icmp"; + metrics_path = "/probe"; + params = { module = [ "icmp" ]; }; + static_configs = [{ + targets = [ + "1.1.1.1" + "8.8.8.8" + "ssfhead" + "raccoon" + "anella-bsc.cesca.cat" + "upc-anella.cesca.cat" + "fox.ac.upc.edu" + "arenys5.ac.upc.edu" + "arenys0-2.ac.upc.edu" + "epi01.bsc.es" + ]; + }]; + relabel_configs = [ + { + # Takes the address and sets it in the "target=" URL parameter + source_labels = [ "__address__" ]; + target_label = "__param_target"; + } + { + # Sets the "instance" label with the remote host we are querying + source_labels = [ "__param_target" ]; + target_label = "instance"; + } + { + # Points the scrape at the local blackbox exporter; the probed host stays in "target=" + target_label = "__address__"; + replacement = "127.0.0.1:9115"; + } + ]; + } + { + job_name = "ipmi-raccoon"; + metrics_path = "/ipmi"; + static_configs = [ + { targets = [ "127.0.0.1:9290" ]; } + ]; + params = { + target = [ "raccoon-ipmi" ]; + module = [ "raccoon" ]; + }; + } + ]; + }; +} diff --git a/secrets/ipmi.yml.age b/secrets/ipmi.yml.age index e445aae..02d1218 100644 Binary files a/secrets/ipmi.yml.age and b/secrets/ipmi.yml.age differ diff --git a/secrets/secrets.nix b/secrets/secrets.nix index 58ec2d6..30fe1dd 100644 --- a/secrets/secrets.nix +++ b/secrets/secrets.nix @@ -2,6 +2,7 @@ let keys = import ../keys.nix; adminsKeys = builtins.attrValues keys.admins; hut = [ keys.hosts.hut ] ++ adminsKeys; + mon = [ keys.hosts.hut keys.hosts.tent ] ++ adminsKeys; # Only expose ceph keys to safe nodes and admins safe = keys.hostGroup.safe ++ adminsKeys; 
in @@ -12,7 +13,7 @@ in "gitlab-bsc-docker-token.age".publicKeys = hut; "nix-serve.age".publicKeys = hut; "jungle-robot-password.age".publicKeys = hut; - "ipmi.yml.age".publicKeys = hut; + "ipmi.yml.age".publicKeys = mon; "ceph-user.age".publicKeys = safe; "munge-key.age".publicKeys = safe;