forked from rarias/bscpkgs
		
	A failure to reach the control node can cause slurmd to fail, and the
unit remains in the failed state until it is manually restarted. Instead,
try to restart the service every 30 seconds, forever:
    owl1% systemctl show slurmd | grep -E 'Restart=|RestartUSec='
    Restart=on-failure
    RestartUSec=30s
    owl1% pgrep slurmd
    5903
    owl1% sudo kill -SEGV 5903
    owl1% pgrep slurmd
    6137
Fixes: rarias/jungle#177
Reviewed-by: Aleix Boné <abonerib@bsc.es>
		
	
			
		
			
				
	
	
		
			25 lines
		
	
	
		
			738 B
		
	
	
	
		
			Nix
		
	
	
	
	
	
			
		
		
	
	
			25 lines
		
	
	
		
			738 B
		
	
	
	
		
			Nix
		
	
	
	
	
	
# NixOS module for Slurm client nodes: pulls in the shared Slurm settings,
# enables the Slurm client and tunes how systemd supervises slurmd.
{ lib, ... }:

{
  imports = [ ./slurm-common.nix ];

  services.slurm.client.enable = true;

  systemd.services.slurmd.serviceConfig = {
    # slurmd exits with a failure when it cannot contact the control server,
    # which would leave the node out of service until somebody restarts the
    # unit by hand. Let systemd restart it every time, waiting 30 seconds
    # between attempts.
    Restart = "always";
    RestartSec = "30s";

    # On stop/restart, kill every process in the unit's control group. This
    # also kills all running jobs, so only upgrade nodes that are not in use.
    # mkForce makes this value take precedence over other definitions of
    # KillMode. See:
    # https://github.com/NixOS/nixpkgs/commit/ae93ed0f0d4e7be0a286d1fca86446318c0c6ffb
    # https://bugs.schedmd.com/show_bug.cgi?id=2095#c24
    KillMode = lib.mkForce "control-group";
  };
}