8 Commits

Author SHA1 Message Date
95ca7b8750 Keep compute nodes off when power comes back
When the power comes back, we don't know if the AC unit will be
operating properly or if the room will be at a safe temperature. So,
instead of powering all the machines back, only configure the login to
power on, so we can check the state of the room and power the rest of
the machines.
2025-07-31 17:44:49 +02:00
d948f8b752 Move StartLimit* options to unit section
The StartLimitBurst and StartLimitIntervalSec options belong to the
[Unit] section, otherwise they are ignored in [Service]:

> Unknown key 'StartLimitIntervalSec' in section [Service], ignoring.

When using [Unit], the limits are properly set:

  apex% systemctl show power-policy.service | grep StartLimit
  StartLimitIntervalUSec=10min
  StartLimitBurst=10
  StartLimitAction=none

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-24 14:32:46 +02:00
8f7787e217 Set power policy to always turn on
In all machines, as soon as we recover the power, turn the machine back
on. We cannot rely on the previous state as we will shut them down
before the power is cut to prevent damage on the power supply
monitoring circuit.

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-24 11:22:38 +02:00
30b9b23112 Add NixOS module to control power policy
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-24 11:22:36 +02:00
9a056737de Move August shutdown to 3rd at 22h
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-24 11:22:33 +02:00
ac700d34a5 Disable automatic August shutdown for Fox
The UPC has different dates for the yearly power cut, and Fox can
recover properly from a power loss, so we don't need to have it turned
off before the power cut. Simply disabling the timer is enough.

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-24 11:22:10 +02:00
9b681ab7ce Add cudainfo program to test CUDA
The cudainfo program checks that we can initialize the CUDA RT library
and communicate with the driver. It can be used as standalone program or
built with cudainfo.gpuCheck so it is executed inside the build sandbox
to see if it also works fine. It uses the autoAddDriverRunpath hook to
inject in the runpath the location of the library directory for CUDA
libraries.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-23 11:52:09 +02:00
9ce394bffd Add missing symlink in cuda sandbox
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-23 11:51:47 +02:00
7 changed files with 59 additions and 3 deletions

View File

@@ -5,9 +5,12 @@
../common/xeon.nix
../common/ssf/hosts.nix
../module/ceph.nix
../module/power-policy.nix
./nfs.nix
];
power.policy = "always-on";
# Don't install grub MBR for now
boot.loader.grub.device = "nodev";

View File

@@ -11,6 +11,7 @@
./base/net.nix
./base/nix.nix
./base/ntp.nix
./base/power-policy.nix
./base/rev.nix
./base/ssh.nix
./base/users.nix

View File

@@ -1,12 +1,12 @@
{
# Shutdown all machines on August 2nd at 11:00 AM, so we can protect the
# Shutdown all machines on August 3rd at 22:00, so we can protect the
# hardware from spurious electrical peaks on the yearly electrical cut for
# manteinance that starts on August 4th.
systemd.timers.august-shutdown = {
description = "Shutdown on August 2nd for maintenance";
description = "Shutdown on August 3rd for maintenance";
wantedBy = [ "timers.target" ];
timerConfig = {
OnCalendar = "*-08-02 11:00:00";
OnCalendar = "*-08-03 22:00:00";
RandomizedDelaySec = "10min";
Unit = "systemd-poweroff.service";
};

View File

@@ -0,0 +1,9 @@
{
imports = [
../../module/power-policy.nix
];
# By default, keep the machines off as we don't know if the AC will be working
# once the electricity comes back.
power.policy = "always-off";
}

View File

@@ -6,8 +6,15 @@
../common/xeon/console.nix
../module/emulation.nix
../module/nvidia.nix
../module/power-policy.nix
];
power.policy = "always-on";
# Don't turn off on August as UPC has different dates.
# Fox works fine on power cuts.
systemd.timers.august-shutdown.enable = false;
# Select the this using the ID to avoid mismatches
boot.loader.grub.device = "/dev/disk/by-id/wwn-0x500a07514b0c1103";

33
m/module/power-policy.nix Normal file
View File

@@ -0,0 +1,33 @@
{ config, lib, pkgs, ... }:
with lib;
let
cfg = config.power.policy;
in
{
options = {
power.policy = mkOption {
type = types.nullOr (types.enum [ "always-on" "previous" "always-off" ]);
default = null;
description = "Set power policy to use via IPMI.";
};
};
config = mkIf (cfg != null) {
systemd.services."power-policy" = {
description = "Set power policy to use via IPMI";
wantedBy = [ "multi-user.target" ];
unitConfig = {
StartLimitBurst = "10";
StartLimitIntervalSec = "10m";
};
serviceConfig = {
ExecStart = "${pkgs.ipmitool}/bin/ipmitool chassis policy ${cfg}";
Type = "oneshot";
Restart = "on-failure";
RestartSec = "5s";
};
};
};
}

View File

@@ -7,9 +7,12 @@
../module/debuginfod.nix
../module/ssh-hut-extern.nix
../module/nvidia.nix
../module/power-policy.nix
../eudy/kernel/perf.nix
];
power.policy = "always-on";
# Don't install Grub on the disk yet
boot.loader.grub.device = "nodev";