Don't suspend owl compute nodes
Currently the owl nodes are located on top of the rack and turning them off causes a high temperature increase at that region, which accumulates heat from the whole rack. To maximize airflow we will leave them on at all times. This also makes allocations immediate at the extra cost of around 200 W. In the future, if we include more nodes in SLURM we can configure those to turn off if needed. Fixes: #156
This commit is contained in:
parent
7989779c8f
commit
018d94bd77
@ -1,31 +1,6 @@
|
|||||||
{ config, pkgs, ... }:
|
{ config, pkgs, ... }:
|
||||||
|
|
||||||
let
|
{
|
||||||
suspendProgram = pkgs.writeShellScript "suspend.sh" ''
|
|
||||||
exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
|
|
||||||
set -x
|
|
||||||
export "PATH=/run/current-system/sw/bin:$PATH"
|
|
||||||
echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
|
|
||||||
hosts=$(scontrol show hostnames $1)
|
|
||||||
for host in $hosts; do
|
|
||||||
echo Shutting down host: $host
|
|
||||||
ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power off
|
|
||||||
done
|
|
||||||
'';
|
|
||||||
|
|
||||||
resumeProgram = pkgs.writeShellScript "resume.sh" ''
|
|
||||||
exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
|
|
||||||
set -x
|
|
||||||
export "PATH=/run/current-system/sw/bin:$PATH"
|
|
||||||
echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
|
|
||||||
hosts=$(scontrol show hostnames $1)
|
|
||||||
for host in $hosts; do
|
|
||||||
echo Starting host: $host
|
|
||||||
ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power on
|
|
||||||
done
|
|
||||||
'';
|
|
||||||
|
|
||||||
in {
|
|
||||||
services.slurm = {
|
services.slurm = {
|
||||||
controlMachine = "apex";
|
controlMachine = "apex";
|
||||||
clusterName = "jungle";
|
clusterName = "jungle";
|
||||||
@ -59,16 +34,6 @@ in {
|
|||||||
# the resources. Use the task/cgroup plugin to enable process containment.
|
# the resources. Use the task/cgroup plugin to enable process containment.
|
||||||
TaskPlugin=task/affinity,task/cgroup
|
TaskPlugin=task/affinity,task/cgroup
|
||||||
|
|
||||||
# Power off unused nodes until they are requested
|
|
||||||
SuspendProgram=${suspendProgram}
|
|
||||||
SuspendTimeout=60
|
|
||||||
ResumeProgram=${resumeProgram}
|
|
||||||
ResumeTimeout=300
|
|
||||||
SuspendExcNodes=fox
|
|
||||||
|
|
||||||
# Turn the nodes off after 1 hour of inactivity
|
|
||||||
SuspendTime=3600
|
|
||||||
|
|
||||||
# Reduce port range so we can allow only this range in the firewall
|
# Reduce port range so we can allow only this range in the firewall
|
||||||
SrunPortRange=60000-61000
|
SrunPortRange=60000-61000
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user