Don't suspend owl compute nodes #214

Manually merged
rarias merged 2 commits from disable-slurm-suspend into master 2025-10-31 11:58:28 +01:00
Showing only changes of commit 1d025f7a38 - Show all commits

View File

@ -1,31 +1,6 @@
{ config, pkgs, ... }: { config, pkgs, ... }:
let {
suspendProgram = pkgs.writeShellScript "suspend.sh" ''
exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
set -x
export "PATH=/run/current-system/sw/bin:$PATH"
echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
hosts=$(scontrol show hostnames $1)
for host in $hosts; do
echo Shutting down host: $host
ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power off
done
'';
resumeProgram = pkgs.writeShellScript "resume.sh" ''
exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
set -x
export "PATH=/run/current-system/sw/bin:$PATH"
echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
hosts=$(scontrol show hostnames $1)
for host in $hosts; do
echo Starting host: $host
ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power on
done
'';
in {
services.slurm = { services.slurm = {
controlMachine = "apex"; controlMachine = "apex";
clusterName = "jungle"; clusterName = "jungle";
@ -59,16 +34,6 @@ in {
# the resources. Use the task/cgroup plugin to enable process containment. # the resources. Use the task/cgroup plugin to enable process containment.
TaskPlugin=task/affinity,task/cgroup TaskPlugin=task/affinity,task/cgroup
# Power off unused nodes until they are requested
SuspendProgram=${suspendProgram}
SuspendTimeout=60
ResumeProgram=${resumeProgram}
ResumeTimeout=300
SuspendExcNodes=fox
# Turn the nodes off after 1 hour of inactivity
SuspendTime=3600
# Reduce port range so we can allow only this range in the firewall # Reduce port range so we can allow only this range in the firewall
SrunPortRange=60000-61000 SrunPortRange=60000-61000