From 79940876c3472b331a19fceeba4eb69742653c0e Mon Sep 17 00:00:00 2001 From: Rodrigo Arias Mallo Date: Mon, 29 Sep 2025 19:17:33 +0200 Subject: [PATCH] Restart slurmd on failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A failure to reach the control node can cause slurmd to fail and the unit remains in the failed state until it is manually restarted. Instead, try to restart the service every 30 seconds, forever: owl1% systemctl show slurmd | grep -E 'Restart=|RestartUSec=' Restart=on-failure RestartUSec=30s owl1% pgrep slurmd 5903 owl1% sudo kill -SEGV 5903 owl1% pgrep slurmd 6137 Fixes: https://jungle.bsc.es/git/rarias/jungle/issues/177 Reviewed-by: Aleix Boné --- m/module/slurm-client.nix | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/m/module/slurm-client.nix b/m/module/slurm-client.nix index 84ba4c7..deec844 100644 --- a/m/module/slurm-client.nix +++ b/m/module/slurm-client.nix @@ -12,6 +12,12 @@ # https://github.com/NixOS/nixpkgs/commit/ae93ed0f0d4e7be0a286d1fca86446318c0c6ffb # https://bugs.schedmd.com/show_bug.cgi?id=2095#c24 KillMode = lib.mkForce "control-group"; + + # If slurmd fails to contact the control server it will fail, causing the + # node to remain out of service until manually restarted. Always try to + # restart it. + Restart = "always"; + RestartSec = "30s"; }; services.slurm.client.enable = true;