From 1d025f7a3849532f4eabccf68b2ec419191514a8 Mon Sep 17 00:00:00 2001
From: Rodrigo Arias Mallo
Date: Thu, 30 Oct 2025 13:40:10 +0100
Subject: [PATCH 1/2] Don't suspend owl compute nodes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently the owl nodes are located on top of the rack and turning them
off causes a high temperature increase at that region, which
accumulates heat from the whole rack. To maximize airflow we will leave
them on at all times. This also makes allocations immediate at the
extra cost of around 200 W.

In the future, if we include more nodes in SLURM we can configure those
to turn off if needed.

Fixes: https://jungle.bsc.es/git/rarias/jungle/issues/156
Reviewed-by: Aleix Boné
---
 m/module/slurm-common.nix | 37 +------------------------------------
 1 file changed, 1 insertion(+), 36 deletions(-)

diff --git a/m/module/slurm-common.nix b/m/module/slurm-common.nix
index d0d06511..70dbf6cf 100644
--- a/m/module/slurm-common.nix
+++ b/m/module/slurm-common.nix
@@ -1,31 +1,6 @@
 { config, pkgs, ... }:
 
-let
-  suspendProgram = pkgs.writeShellScript "suspend.sh" ''
-    exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
-    set -x
-    export "PATH=/run/current-system/sw/bin:$PATH"
-    echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
-    hosts=$(scontrol show hostnames $1)
-    for host in $hosts; do
-      echo Shutting down host: $host
-      ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power off
-    done
-  '';
-
-  resumeProgram = pkgs.writeShellScript "resume.sh" ''
-    exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
-    set -x
-    export "PATH=/run/current-system/sw/bin:$PATH"
-    echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
-    hosts=$(scontrol show hostnames $1)
-    for host in $hosts; do
-      echo Starting host: $host
-      ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power on
-    done
-  '';
-
-in {
+{
   services.slurm = {
     controlMachine = "apex";
     clusterName = "jungle";
@@ -59,16 +34,6 @@ in {
       # the resources. Use the task/cgroup plugin to enable process containment.
       TaskPlugin=task/affinity,task/cgroup
 
-      # Power off unused nodes until they are requested
-      SuspendProgram=${suspendProgram}
-      SuspendTimeout=60
-      ResumeProgram=${resumeProgram}
-      ResumeTimeout=300
-      SuspendExcNodes=fox
-
-      # Turn the nodes off after 1 hour of inactivity
-      SuspendTime=3600
-
       # Reduce port range so we can allow only this range in the firewall
       SrunPortRange=60000-61000
 
-- 
2.49.0

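With automatic suspend removed, powering an owl node off or on becomes a
manual operation. A minimal sketch of what that could look like from a shell
on an admin node, reusing the ipmitool invocation from the removed
suspend.sh/resume.sh scripts (the <node>-ipmi hostname pattern and the empty
-U/-P values come from that code; real credentials are site-specific and not
shown here):

  host=owl1
  # Power the node off or back on through its BMC, exactly as the removed
  # SLURM suspend/resume scripts did.
  ipmitool -I lanplus -H "${host}-ipmi" -P "" -U "" chassis power off
  ipmitool -I lanplus -H "${host}-ipmi" -P "" -U "" chassis power on
  # Query the current power state of the node.
  ipmitool -I lanplus -H "${host}-ipmi" -P "" -U "" chassis power status
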
From fc69ef32177108be0c6970560a27ed2705e0310c Mon Sep 17 00:00:00 2001
From: Rodrigo Arias Mallo
Date: Fri, 31 Oct 2025 10:31:28 +0100
Subject: [PATCH 2/2] Enable pam_slurm_adopt in all compute nodes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prevents access to owl1 and owl2 too if the user doesn't have any jobs
running there.

Reviewed-by: Aleix Boné
---
 m/fox/configuration.nix   | 16 ----------------
 m/module/slurm-client.nix | 18 +++++++++++++++++-
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/m/fox/configuration.nix b/m/fox/configuration.nix
index 8c381f86..b9549db8 100644
--- a/m/fox/configuration.nix
+++ b/m/fox/configuration.nix
@@ -93,20 +93,4 @@
     wantedBy = [ "multi-user.target" ];
     serviceConfig.ExecStart = script;
   };
-
-  # Only allow SSH connections from users who have a SLURM allocation
-  # See: https://slurm.schedmd.com/pam_slurm_adopt.html
-  security.pam.services.sshd.rules.account.slurm = {
-    control = "required";
-    enable = true;
-    modulePath = "${pkgs.slurm}/lib/security/pam_slurm_adopt.so";
-    args = [ "log_level=debug5" ];
-    order = 999999; # Make it last one
-  };
-
-  # Disable systemd session (pam_systemd.so) as it will conflict with the
-  # pam_slurm_adopt.so module. What happens is that the shell is first adopted
-  # into the slurmstepd task and then into the systemd session, which is not
-  # what we want, otherwise it will linger even if all jobs are gone.
-  security.pam.services.sshd.startSession = lib.mkForce false;
 }
diff --git a/m/module/slurm-client.nix b/m/module/slurm-client.nix
index deec8441..66ad71cb 100644
--- a/m/module/slurm-client.nix
+++ b/m/module/slurm-client.nix
@@ -1,4 +1,4 @@
-{ lib, ... }:
+{ lib, pkgs, ... }:
 
 {
   imports = [
@@ -21,4 +21,20 @@
   };
 
   services.slurm.client.enable = true;
+
+  # Only allow SSH connections from users who have a SLURM allocation
+  # See: https://slurm.schedmd.com/pam_slurm_adopt.html
+  security.pam.services.sshd.rules.account.slurm = {
+    control = "required";
+    enable = true;
+    modulePath = "${pkgs.slurm}/lib/security/pam_slurm_adopt.so";
+    args = [ "log_level=debug5" ];
+    order = 999999; # Make it last one
+  };
+
+  # Disable systemd session (pam_systemd.so) as it will conflict with the
+  # pam_slurm_adopt.so module. What happens is that the shell is first adopted
+  # into the slurmstepd task and then into the systemd session, which is not
+  # what we want, otherwise it will linger even if all jobs are gone.
+  security.pam.services.sshd.startSession = lib.mkForce false;
 }
-- 
2.49.0
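
A rough way to exercise the pam_slurm_adopt setup once this is deployed, run
from a node where the SLURM client commands are available (the node name owl1
and the salloc options are only illustrative; the exact adoption behaviour
also depends on the site's prolog and cgroup configuration):

  # With no job running on owl1, the account check should reject the login.
  ssh owl1 true

  # Hold an allocation on owl1; an SSH session opened while it is active
  # should be accepted and adopted into the job.
  salloc -N1 -w owl1 --time=5 bash -c 'ssh owl1 hostname'

  # After the allocation ends, direct SSH access is denied again.
  ssh owl1 true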