jungle/m/common/slurm.nix

43 lines
1.6 KiB
Nix
Raw Normal View History

# NixOS module with the SLURM settings for the "jungle" cluster: it adjusts
# how systemd stops the slurmd service, enables the SLURM client, and declares
# the controller, the compute nodes and extra slurm.conf directives.
{ lib, ... }:

{
  systemd.services.slurmd.serviceConfig = {
    # Kill all processes in the control group on stop/restart. This will kill
    # all the running jobs, so ensure that we only upgrade when the nodes are
    # not in use. See:
    # https://github.com/NixOS/nixpkgs/commit/ae93ed0f0d4e7be0a286d1fca86446318c0c6ffb
    # https://bugs.schedmd.com/show_bug.cgi?id=2095#c24
    # mkForce is used so this value takes priority over any KillMode set by
    # another module for the same unit.
    KillMode = lib.mkForce "control-group";
  };

  services.slurm = {
    client.enable = true;
    controlMachine = "hut";
    clusterName = "jungle";

    # One entry per node definition, using the slurm.conf node syntax.
    nodeName = [
      "owl[1,2] Sockets=2 CoresPerSocket=14 ThreadsPerCore=2 Feature=owl"
      "hut Sockets=2 CoresPerSocket=14 ThreadsPerCore=2"
    ];

    # See slurm.conf(5) for more details about these options.
    # NOTE: everything between the '' delimiters (including the '#' lines) is
    # emitted verbatim as configuration text; only the common leading
    # indentation is stripped by Nix.
    extraConfig = ''
      # Use PMIx for MPI by default. It works okay with MPICH and OpenMPI, but
      # not with Intel MPI. For that use the compatibility shim libpmi.so
      # setting I_MPI_PMI_LIBRARY=$pmix/lib/libpmi.so while maintaining the PMIx
      # library in SLURM (--mpi=pmix). See more details here:
      # https://pm.bsc.es/gitlab/rarias/jungle/-/issues/16
      MpiDefault=pmix
      # When a node reboots return that node to the slurm queue as soon as it
      # becomes operative again.
      ReturnToService=2
      # Track all processes by using a cgroup
      ProctrackType=proctrack/cgroup
      # Enable task/affinity to allow the jobs to run in a specified subset of
      # the resources. Use the task/cgroup plugin to enable process containment.
      TaskPlugin=task/affinity,task/cgroup
    '';
  };
}