Poweroff idle slurm nodes after 1 hour

This commit is contained in:
Rodrigo Arias 2023-09-08 13:31:23 +02:00
parent 6db5772ac4
commit 77cb3c494e
2 changed files with 45 additions and 6 deletions

View File

@ -1,6 +1,33 @@
{ lib, ... }: { pkgs, lib, ... }:
{ let
# SuspendProgram handed to slurmctld: receives a Slurm hostlist expression in
# $1, expands it with `scontrol show hostnames`, and powers each node off
# out-of-band through its IPMI interface ("<host>-ipmi").
# stdout/stderr are appended to /var/log/power_save.log for auditing.
# NOTE(review): `-P "" -U ""` sends empty IPMI credentials — confirm the BMCs
# really accept blank user/password.
suspendProgram = pkgs.writeScript "suspend.sh" ''
#!/usr/bin/env bash
exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
set -x
export "PATH=/run/current-system/sw/bin:$PATH"
echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
hosts=$(scontrol show hostnames $1)
for host in $hosts; do
echo Shutting down host: $host
ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power off
done
'';
# ResumeProgram handed to slurmctld: receives a Slurm hostlist expression in
# $1, expands it with `scontrol show hostnames`, and powers each node back on
# through its IPMI interface ("<host>-ipmi").
# stdout/stderr are appended to /var/log/power_save.log for auditing.
# NOTE(review): `-P "" -U ""` sends empty IPMI credentials — confirm the BMCs
# really accept blank user/password.
resumeProgram = pkgs.writeScript "resume.sh" ''
#!/usr/bin/env bash
exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
set -x
export "PATH=/run/current-system/sw/bin:$PATH"
# Fix: this previously logged "Suspend invoked" (copy-paste from suspend.sh),
# which made suspend and resume events indistinguishable in the shared log.
echo "$(date) Resume invoked $0 $*" >> /var/log/power_save.log
hosts=$(scontrol show hostnames $1)
for host in $hosts; do
echo Starting host: $host
ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power on
done
'';
in {
systemd.services.slurmd.serviceConfig = { systemd.services.slurmd.serviceConfig = {
# Kill all processes in the control group on stop/restart. This will kill # Kill all processes in the control group on stop/restart. This will kill
# all the jobs running, so ensure that we only upgrade when the nodes are # all the jobs running, so ensure that we only upgrade when the nodes are
@ -9,6 +36,7 @@
# https://bugs.schedmd.com/show_bug.cgi?id=2095#c24 # https://bugs.schedmd.com/show_bug.cgi?id=2095#c24
KillMode = lib.mkForce "control-group"; KillMode = lib.mkForce "control-group";
}; };
services.slurm = { services.slurm = {
client.enable = true; client.enable = true;
controlMachine = "hut"; controlMachine = "hut";
@ -18,6 +46,11 @@
"hut Sockets=2 CoresPerSocket=14 ThreadsPerCore=2" "hut Sockets=2 CoresPerSocket=14 ThreadsPerCore=2"
]; ];
# Partition layout: "owl" (the default) schedules only the owl compute
# nodes; "all" additionally includes hut, which also acts as the control
# machine (controlMachine = "hut") and is excluded from power-saving via
# SuspendExcNodes. Moved here from the server-side module so client and
# server share one definition.
partitionName = [
"owl Nodes=owl[1-2] Default=YES MaxTime=INFINITE State=UP"
"all Nodes=owl[1-2],hut Default=NO MaxTime=INFINITE State=UP"
];
# See slurm.conf(5) for more details about these options. # See slurm.conf(5) for more details about these options.
extraConfig = '' extraConfig = ''
# Use PMIx for MPI by default. It works okay with MPICH and OpenMPI, but # Use PMIx for MPI by default. It works okay with MPICH and OpenMPI, but
@ -37,6 +70,16 @@
# Enable task/affinity to allow the jobs to run in a specified subset of # Enable task/affinity to allow the jobs to run in a specified subset of
# the resources. Use the task/cgroup plugin to enable process containment. # the resources. Use the task/cgroup plugin to enable process containment.
TaskPlugin=task/affinity,task/cgroup TaskPlugin=task/affinity,task/cgroup
# Power off unused nodes until they are requested
SuspendProgram=${suspendProgram}
SuspendTimeout=60
ResumeProgram=${resumeProgram}
ResumeTimeout=300
SuspendExcNodes=hut
# Turn the nodes off after 1 hour of inactivity
SuspendTime=3600
''; '';
}; };
} }

View File

@ -3,9 +3,5 @@
{ {
services.slurm = { services.slurm = {
server.enable = true; server.enable = true;
partitionName = [
"owl Nodes=owl[1-2] Default=YES MaxTime=INFINITE State=UP"
"all Nodes=owl[1-2],hut Default=NO MaxTime=INFINITE State=UP"
];
}; };
} }