Add monitoring in tent
Reviewed-by: Aleix Boné <abonerib@bsc.es> Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
This commit is contained in:
		
							parent
							
								
									d0fd8cde46
								
							
						
					
					
						commit
						61e6d3232b
					
				
							
								
								
									
										13
									
								
								m/tent/blackbox.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										13
									
								
								m/tent/blackbox.yml
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,13 @@ | |||||||
|  | # Probe modules for the blackbox exporter. The scrape jobs in monitoring.nix | ||||||
|  | # select one of them by name through their "module" parameter. | ||||||
|  | modules: | ||||||
|  |   # HTTP probe: GET request that follows redirects, with a 5 second timeout | ||||||
|  |   http_2xx: | ||||||
|  |     prober: http | ||||||
|  |     timeout: 5s | ||||||
|  |     http: | ||||||
|  |       follow_redirects: true | ||||||
|  |       valid_status_codes: []  # Defaults to 2xx | ||||||
|  |       method: GET | ||||||
|  |   # ICMP probe with a 5 second timeout, preferring IPv4 | ||||||
|  |   icmp: | ||||||
|  |     prober: icmp | ||||||
|  |     timeout: 5s | ||||||
|  |     icmp: | ||||||
|  |       preferred_ip_protocol: "ip4" | ||||||
| @ -6,6 +6,7 @@ | |||||||
|     ../module/emulation.nix |     ../module/emulation.nix | ||||||
|     ../module/debuginfod.nix |     ../module/debuginfod.nix | ||||||
|     ../module/ssh-hut-extern.nix |     ../module/ssh-hut-extern.nix | ||||||
|  |     ./monitoring.nix | ||||||
|   ]; |   ]; | ||||||
| 
 | 
 | ||||||
|   # Select this using the ID to avoid mismatches |   # Select this using the ID to avoid mismatches | ||||||
|  | |||||||
							
								
								
									
										187
									
								
								m/tent/monitoring.nix
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										187
									
								
								m/tent/monitoring.nix
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,187 @@ | |||||||
|  | { config, lib, pkgs, ... }: | ||||||
|  | 
 | ||||||
|  | { | ||||||
|  |   imports = [ | ||||||
|  |     ../module/meteocat-exporter.nix | ||||||
|  |     ../module/upc-qaire-exporter.nix | ||||||
|  |   ]; | ||||||
|  | 
 | ||||||
|  |   # Grafana instance listening on 127.0.0.1:2342 | ||||||
|  |   services.grafana = { | ||||||
|  |     enable = true; | ||||||
|  |     settings = { | ||||||
|  |       server = { | ||||||
|  |         domain = "localhost"; | ||||||
|  |         # Disabled: settings to serve Grafana from the /grafana sub-path of jungle.bsc.es | ||||||
|  |         #domain = "jungle.bsc.es"; | ||||||
|  |         #root_url = "%(protocol)s://%(domain)s/grafana"; | ||||||
|  |         #serve_from_sub_path = true; | ||||||
|  |         http_port = 2342; | ||||||
|  |         http_addr = "127.0.0.1"; | ||||||
|  |       }; | ||||||
|  |       # Enable the public dashboards feature toggle and anonymous authentication | ||||||
|  |       feature_toggles.publicDashboards = true; | ||||||
|  |       "auth.anonymous".enabled = true; | ||||||
|  |       # Set the log level to "warn" | ||||||
|  |       log.level = "warn"; | ||||||
|  |     }; | ||||||
|  |   }; | ||||||
|  | 
 | ||||||
|  |   # Prometheus server listening on 127.0.0.1:9001, with a retention time of 5 years | ||||||
|  |   services.prometheus = { | ||||||
|  |     enable = true; | ||||||
|  |     port = 9001; | ||||||
|  |     retentionTime = "5y"; | ||||||
|  |     listenAddress = "127.0.0.1"; | ||||||
|  |   }; | ||||||
|  | 
 | ||||||
|  |   # Override the sandboxing of the node exporter unit (mkForce replaces the values set elsewhere). | ||||||
|  |   # We need access to the devices to monitor the disk space | ||||||
|  |   systemd.services.prometheus-node-exporter.serviceConfig.PrivateDevices = lib.mkForce false; | ||||||
|  |   # NOTE(review): presumably read-only access to the home directories is needed to report their usage -- confirm | ||||||
|  |   systemd.services.prometheus-node-exporter.serviceConfig.ProtectHome = lib.mkForce "read-only"; | ||||||
|  | 
 | ||||||
|  |   # Credentials for IPMI exporter: age-encrypted configuration file owned by the | ||||||
|  |   # ipmi-exporter user. Its decrypted path is used as the exporter's configFile. | ||||||
|  |   age.secrets.ipmiYml = { | ||||||
|  |     file = ../../secrets/ipmi.yml.age; | ||||||
|  |     owner = "ipmi-exporter"; | ||||||
|  |   }; | ||||||
|  | 
 | ||||||
|  |   # Create an IPMI group and assign the ipmi0 device to it: the udev rule sets the | ||||||
|  |   # group of ipmi0 to "ipmi" with mode 0660 (read/write for owner and group only) | ||||||
|  |   users.groups.ipmi = {}; | ||||||
|  |   services.udev.extraRules = '' | ||||||
|  |     SUBSYSTEM=="ipmi", KERNEL=="ipmi0", GROUP="ipmi", MODE="0660" | ||||||
|  |   ''; | ||||||
|  | 
 | ||||||
|  |   # Add a new ipmi-exporter system user that can read the ipmi0 device through | ||||||
|  |   # its primary group "ipmi" | ||||||
|  |   users.users.ipmi-exporter = { | ||||||
|  |     isSystemUser = true; | ||||||
|  |     group = "ipmi"; | ||||||
|  |   }; | ||||||
|  | 
 | ||||||
|  |   # Overrides for the IPMI exporter unit; every value uses mkForce to replace the one set elsewhere. | ||||||
|  |   # Disable dynamic user so we have the ipmi-exporter user available for the credentials | ||||||
|  |   systemd.services.prometheus-ipmi-exporter.serviceConfig = { | ||||||
|  |     DynamicUser = lib.mkForce false; | ||||||
|  |     # Do not hide devices from the unit (the ipmi-exporter user is meant to read ipmi0) | ||||||
|  |     PrivateDevices = lib.mkForce false; | ||||||
|  |     User = lib.mkForce "ipmi-exporter"; | ||||||
|  |     Group = lib.mkForce "ipmi"; | ||||||
|  |     # NOTE(review): presumably required so that unshare below can create its namespace -- confirm | ||||||
|  |     RestrictNamespaces = lib.mkForce false; | ||||||
|  |     # Fake uid to 0 so it shuts up: the exporter is started through "unshare --map-user 0". | ||||||
|  |     # NOTE(review): presumably it complains when it is not running as root -- confirm | ||||||
|  |     # The command line is rebuilt from the exporter options: listen address and port, | ||||||
|  |     # configuration file and any extra flags. | ||||||
|  |     ExecStart = let | ||||||
|  |       cfg = config.services.prometheus.exporters.ipmi; | ||||||
|  |     in lib.mkForce (lib.concatStringsSep " " ([ | ||||||
|  |       "${pkgs.util-linux}/bin/unshare --map-user 0" | ||||||
|  |       "${pkgs.prometheus-ipmi-exporter}/bin/ipmi_exporter" | ||||||
|  |       "--web.listen-address ${cfg.listenAddress}:${toString cfg.port}" | ||||||
|  |       "--config.file ${lib.escapeShellArg cfg.configFile}" | ||||||
|  |     ] ++ cfg.extraFlags)); | ||||||
|  |   }; | ||||||
|  | 
 | ||||||
|  |   services.prometheus = { | ||||||
|  |     # Exporters enabled on this host; all of them listen on 127.0.0.1 | ||||||
|  |     exporters = { | ||||||
|  |       # IPMI exporter, configured from the decrypted ipmiYml age secret | ||||||
|  |       ipmi = { | ||||||
|  |         enable = true; | ||||||
|  |         configFile = config.age.secrets.ipmiYml.path; | ||||||
|  |         # Uncomment to get debug logs from the exporter | ||||||
|  |         #extraFlags = [ "--log.level=debug" ]; | ||||||
|  |         listenAddress = "127.0.0.1"; | ||||||
|  |       }; | ||||||
|  |       # Node exporter on port 9002 with the "logind" collector enabled | ||||||
|  |       node = { | ||||||
|  |         enable = true; | ||||||
|  |         enabledCollectors = [ "logind" ]; | ||||||
|  |         port = 9002; | ||||||
|  |         listenAddress = "127.0.0.1"; | ||||||
|  |       }; | ||||||
|  |       # Blackbox exporter, with its probe modules defined in ./blackbox.yml | ||||||
|  |       blackbox = { | ||||||
|  |         enable = true; | ||||||
|  |         listenAddress = "127.0.0.1"; | ||||||
|  |         configFile = ./blackbox.yml; | ||||||
|  |       }; | ||||||
|  |     }; | ||||||
|  | 
 | ||||||
|  |     scrapeConfigs = [ | ||||||
|  |       { | ||||||
|  |         # Scrape the exporters listening on this host directly | ||||||
|  |         job_name = "local"; | ||||||
|  |         static_configs = [{ | ||||||
|  |           targets = [ | ||||||
|  |             "127.0.0.1:9002" # Node exporter | ||||||
|  |             # Disabled: the blackbox exporter is only queried by the blackbox-* jobs | ||||||
|  |             #"127.0.0.1:9115" # Blackbox exporter | ||||||
|  |             "127.0.0.1:9290" # IPMI exporter for local node | ||||||
|  |             "127.0.0.1:9928" # UPC Qaire custom exporter | ||||||
|  |             "127.0.0.1:9929" # Meteocat custom exporter | ||||||
|  |           ]; | ||||||
|  |         }]; | ||||||
|  |       } | ||||||
|  |       { | ||||||
|  |         # HTTP probes: each target URL below is probed by the blackbox exporter | ||||||
|  |         # using its "http_2xx" module, through the exporter's /probe endpoint. | ||||||
|  |         job_name = "blackbox-http"; | ||||||
|  |         metrics_path = "/probe"; | ||||||
|  |         params = { module = [ "http_2xx" ]; }; | ||||||
|  |         static_configs = [{ | ||||||
|  |           targets = [ | ||||||
|  |             "https://www.google.com/robots.txt" | ||||||
|  |             "https://pm.bsc.es/" | ||||||
|  |             "https://pm.bsc.es/gitlab/" | ||||||
|  |             "https://jungle.bsc.es/" | ||||||
|  |             "https://gitlab.bsc.es/" | ||||||
|  |           ]; | ||||||
|  |         }]; | ||||||
|  |         # The rules below run in order and turn each target into a request to the exporter | ||||||
|  |         relabel_configs = [ | ||||||
|  |           { | ||||||
|  |             # Takes the address and sets it in the "target=<xyz>" URL parameter | ||||||
|  |             source_labels = [ "__address__" ]; | ||||||
|  |             target_label = "__param_target"; | ||||||
|  |           } | ||||||
|  |           { | ||||||
|  |             # Sets the "instance" label with the remote host we are querying | ||||||
|  |             source_labels = [ "__param_target" ]; | ||||||
|  |             target_label = "instance"; | ||||||
|  |           } | ||||||
|  |           { | ||||||
|  |             # Replaces the scrape address with the blackbox exporter address, so the | ||||||
|  |             # request is sent to the exporter and not to the probed host itself | ||||||
|  |             target_label = "__address__"; | ||||||
|  |             replacement = "127.0.0.1:9115"; | ||||||
|  |           } | ||||||
|  |         ]; | ||||||
|  |       } | ||||||
|  |       { | ||||||
|  |         # ICMP probes: each host below is probed by the blackbox exporter | ||||||
|  |         # using its "icmp" module, through the exporter's /probe endpoint. | ||||||
|  |         job_name = "blackbox-icmp"; | ||||||
|  |         metrics_path = "/probe"; | ||||||
|  |         params = { module = [ "icmp" ]; }; | ||||||
|  |         static_configs = [{ | ||||||
|  |           targets = [ | ||||||
|  |             "1.1.1.1" | ||||||
|  |             "8.8.8.8" | ||||||
|  |             "ssfhead" | ||||||
|  |             "raccoon" | ||||||
|  |             "anella-bsc.cesca.cat" | ||||||
|  |             "upc-anella.cesca.cat" | ||||||
|  |             "fox.ac.upc.edu" | ||||||
|  |             "arenys5.ac.upc.edu" | ||||||
|  |             "arenys0-2.ac.upc.edu" | ||||||
|  |             "epi01.bsc.es" | ||||||
|  |           ]; | ||||||
|  |         }]; | ||||||
|  |         # The rules below run in order and turn each target into a request to the exporter | ||||||
|  |         relabel_configs = [ | ||||||
|  |           { | ||||||
|  |             # Takes the address and sets it in the "target=<xyz>" URL parameter | ||||||
|  |             source_labels = [ "__address__" ]; | ||||||
|  |             target_label = "__param_target"; | ||||||
|  |           } | ||||||
|  |           { | ||||||
|  |             # Sets the "instance" label with the remote host we are querying | ||||||
|  |             source_labels = [ "__param_target" ]; | ||||||
|  |             target_label = "instance"; | ||||||
|  |           } | ||||||
|  |           { | ||||||
|  |             # Replaces the scrape address with the blackbox exporter address, so the | ||||||
|  |             # request is sent to the exporter and not to the probed host itself | ||||||
|  |             target_label = "__address__"; | ||||||
|  |             replacement = "127.0.0.1:9115"; | ||||||
|  |           } | ||||||
|  |         ]; | ||||||
|  |       } | ||||||
|  |       { | ||||||
|  |         # Queries the /ipmi endpoint of the local IPMI exporter (127.0.0.1:9290), | ||||||
|  |         # passing target=raccoon-ipmi and module=raccoon as URL parameters. | ||||||
|  |         # NOTE(review): presumably the exporter then reads the remote raccoon BMC using | ||||||
|  |         # the "raccoon" module of its configuration file -- confirm | ||||||
|  |         job_name = "ipmi-raccoon"; | ||||||
|  |         metrics_path = "/ipmi"; | ||||||
|  |         static_configs = [ | ||||||
|  |           { targets = [ "127.0.0.1:9290" ]; } | ||||||
|  |         ]; | ||||||
|  |         params = { | ||||||
|  |           target = [ "raccoon-ipmi" ]; | ||||||
|  |           module = [ "raccoon" ]; | ||||||
|  |         }; | ||||||
|  |       } | ||||||
|  |     ]; | ||||||
|  |   }; | ||||||
|  | } | ||||||
										
											Binary file not shown.
										
									
								
							| @ -2,6 +2,7 @@ let | |||||||
|   keys = import ../keys.nix; |   keys = import ../keys.nix; | ||||||
|   adminsKeys = builtins.attrValues keys.admins; |   adminsKeys = builtins.attrValues keys.admins; | ||||||
|   hut = [ keys.hosts.hut ] ++ adminsKeys; |   hut = [ keys.hosts.hut ] ++ adminsKeys; | ||||||
|  |   mon = [ keys.hosts.hut keys.hosts.tent ] ++ adminsKeys; | ||||||
|   # Only expose ceph keys to safe nodes and admins |   # Only expose ceph keys to safe nodes and admins | ||||||
|   safe = keys.hostGroup.safe ++ adminsKeys; |   safe = keys.hostGroup.safe ++ adminsKeys; | ||||||
| in | in | ||||||
| @ -12,7 +13,7 @@ in | |||||||
|   "gitlab-bsc-docker-token.age".publicKeys = hut; |   "gitlab-bsc-docker-token.age".publicKeys = hut; | ||||||
|   "nix-serve.age".publicKeys = hut; |   "nix-serve.age".publicKeys = hut; | ||||||
|   "jungle-robot-password.age".publicKeys = hut; |   "jungle-robot-password.age".publicKeys = hut; | ||||||
|   "ipmi.yml.age".publicKeys = hut; |   "ipmi.yml.age".publicKeys = mon; | ||||||
| 
 | 
 | ||||||
|   "ceph-user.age".publicKeys = safe; |   "ceph-user.age".publicKeys = safe; | ||||||
|   "munge-key.age".publicKeys = safe; |   "munge-key.age".publicKeys = safe; | ||||||
|  | |||||||
		Reference in New Issue
	
	Block a user