From 018d94bd7780a59aa5dad89aff6ba7f807735eba Mon Sep 17 00:00:00 2001 From: Rodrigo Arias Mallo Date: Thu, 30 Oct 2025 13:40:10 +0100 Subject: [PATCH] Don't suspend owl compute nodes Currently the owl nodes are located on top of the rack and turning them off causes a high temperature increase at that region, which accumulates heat from the whole rack. To maximize airflow we will leave them on at all times. This also makes allocations immediate at the extra cost of around 200 W. In the future, if we include more nodes in SLURM we can configure those to turn off if needed. Fixes: https://jungle.bsc.es/git/rarias/jungle/issues/156 --- m/module/slurm-common.nix | 37 +------------------------------------ 1 file changed, 1 insertion(+), 36 deletions(-) diff --git a/m/module/slurm-common.nix b/m/module/slurm-common.nix index d0d06511..70dbf6cf 100644 --- a/m/module/slurm-common.nix +++ b/m/module/slurm-common.nix @@ -1,31 +1,6 @@ { config, pkgs, ... }: -let - suspendProgram = pkgs.writeShellScript "suspend.sh" '' - exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log - set -x - export "PATH=/run/current-system/sw/bin:$PATH" - echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log - hosts=$(scontrol show hostnames $1) - for host in $hosts; do - echo Shutting down host: $host - ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power off - done - ''; - - resumeProgram = pkgs.writeShellScript "resume.sh" '' - exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log - set -x - export "PATH=/run/current-system/sw/bin:$PATH" - echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log - hosts=$(scontrol show hostnames $1) - for host in $hosts; do - echo Starting host: $host - ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power on - done - ''; - -in { +{ services.slurm = { controlMachine = "apex"; clusterName = "jungle"; @@ -59,16 +34,6 @@ in { # the resources. Use the task/cgroup plugin to enable process containment. TaskPlugin=task/affinity,task/cgroup - # Power off unused nodes until they are requested - SuspendProgram=${suspendProgram} - SuspendTimeout=60 - ResumeProgram=${resumeProgram} - ResumeTimeout=300 - SuspendExcNodes=fox - - # Turn the nodes off after 1 hour of inactivity - SuspendTime=3600 - # Reduce port range so we can allow only this range in the firewall SrunPortRange=60000-61000