Poweroff idle slurm nodes after 1 hour
This commit is contained in:
parent
6db5772ac4
commit
77cb3c494e
@ -1,6 +1,33 @@
|
||||
{ lib, ... }:
|
||||
{ pkgs, lib, ... }:
|
||||
|
||||
{
|
||||
let
|
||||
suspendProgram = pkgs.writeScript "suspend.sh" ''
|
||||
#!/usr/bin/env bash
|
||||
exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
|
||||
set -x
|
||||
export "PATH=/run/current-system/sw/bin:$PATH"
|
||||
echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
|
||||
hosts=$(scontrol show hostnames $1)
|
||||
for host in $hosts; do
|
||||
echo Shutting down host: $host
|
||||
ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power off
|
||||
done
|
||||
'';
|
||||
|
||||
resumeProgram = pkgs.writeScript "resume.sh" ''
|
||||
#!/usr/bin/env bash
|
||||
exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
|
||||
set -x
|
||||
export "PATH=/run/current-system/sw/bin:$PATH"
|
||||
echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
|
||||
hosts=$(scontrol show hostnames $1)
|
||||
for host in $hosts; do
|
||||
echo Starting host: $host
|
||||
ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power on
|
||||
done
|
||||
'';
|
||||
|
||||
in {
|
||||
systemd.services.slurmd.serviceConfig = {
|
||||
# Kill all processes in the control group on stop/restart. This will kill
|
||||
# all the jobs running, so ensure that we only upgrade when the nodes are
|
||||
@ -9,6 +36,7 @@
|
||||
# https://bugs.schedmd.com/show_bug.cgi?id=2095#c24
|
||||
KillMode = lib.mkForce "control-group";
|
||||
};
|
||||
|
||||
services.slurm = {
|
||||
client.enable = true;
|
||||
controlMachine = "hut";
|
||||
@ -18,6 +46,11 @@
|
||||
"hut Sockets=2 CoresPerSocket=14 ThreadsPerCore=2"
|
||||
];
|
||||
|
||||
partitionName = [
|
||||
"owl Nodes=owl[1-2] Default=YES MaxTime=INFINITE State=UP"
|
||||
"all Nodes=owl[1-2],hut Default=NO MaxTime=INFINITE State=UP"
|
||||
];
|
||||
|
||||
# See slurm.conf(5) for more details about these options.
|
||||
extraConfig = ''
|
||||
# Use PMIx for MPI by default. It works okay with MPICH and OpenMPI, but
|
||||
@ -37,6 +70,16 @@
|
||||
# Enable task/affinity to allow the jobs to run in a specified subset of
|
||||
# the resources. Use the task/cgroup plugin to enable process containment.
|
||||
TaskPlugin=task/affinity,task/cgroup
|
||||
|
||||
# Power off unused nodes until they are requested
|
||||
SuspendProgram=${suspendProgram}
|
||||
SuspendTimeout=60
|
||||
ResumeProgram=${resumeProgram}
|
||||
ResumeTimeout=300
|
||||
SuspendExcNodes=hut
|
||||
|
||||
# Turn the nodes off after 1 hour of inactivity
|
||||
SuspendTime=3600
|
||||
'';
|
||||
};
|
||||
}
|
||||
|
@ -3,9 +3,5 @@
|
||||
{
|
||||
services.slurm = {
|
||||
server.enable = true;
|
||||
partitionName = [
|
||||
"owl Nodes=owl[1-2] Default=YES MaxTime=INFINITE State=UP"
|
||||
"all Nodes=owl[1-2],hut Default=NO MaxTime=INFINITE State=UP"
|
||||
];
|
||||
};
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user