jungle/m/module/slurm-client.nix

{ config, pkgs, lib, ... }:
let
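  # Scripts used by Slurm's power saving support: Slurm invokes them with a
  # hostlist expression (e.g. "owl[1-2]") in $1, which we expand with scontrol
  # and then power each node off or on through its IPMI interface.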
  suspendProgram = pkgs.writeScript "suspend.sh" ''
    #!/usr/bin/env bash
    exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
    set -x
    export "PATH=/run/current-system/sw/bin:$PATH"
    echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
    hosts=$(scontrol show hostnames $1)
    for host in $hosts; do
      echo Shutting down host: $host
      ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power off
    done
  '';
  resumeProgram = pkgs.writeScript "resume.sh" ''
    #!/usr/bin/env bash
    exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
    set -x
    export "PATH=/run/current-system/sw/bin:$PATH"
    echo "$(date) Resume invoked $0 $*" >> /var/log/power_save.log
    hosts=$(scontrol show hostnames $1)
    for host in $hosts; do
      echo Starting host: $host
      ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power on
    done
  '';
in {
  systemd.services.slurmd.serviceConfig = {
    # Kill all processes in the control group on stop/restart. This will kill
    # all the jobs running, so ensure that we only upgrade when the nodes are
    # not in use. See:
    # https://github.com/NixOS/nixpkgs/commit/ae93ed0f0d4e7be0a286d1fca86446318c0c6ffb
    # https://bugs.schedmd.com/show_bug.cgi?id=2095#c24
    KillMode = lib.mkForce "control-group";
  };
  services.slurm = {
    client.enable = true;
    controlMachine = "hut";
    clusterName = "jungle";
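
    # Each node advertises 2 sockets x 14 cores x 2 threads = 56 logical CPUs
    # to Slurm.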
    nodeName = [
      "owl[1,2] Sockets=2 CoresPerSocket=14 ThreadsPerCore=2 Feature=owl"
      "hut Sockets=2 CoresPerSocket=14 ThreadsPerCore=2"
    ];
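
    # Two partitions: owl (the default) holds only the owl nodes, while all
    # also includes hut; jobs submitted without -p/--partition land on owl.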
    partitionName = [
      "owl Nodes=owl[1-2] Default=YES DefaultTime=01:00:00 MaxTime=INFINITE State=UP"
      "all Nodes=owl[1-2],hut Default=NO DefaultTime=01:00:00 MaxTime=INFINITE State=UP"
    ];

    # See slurm.conf(5) for more details about these options.
    extraConfig = ''
      # Use PMIx for MPI by default. It works fine with MPICH and OpenMPI, but
      # not with Intel MPI. For Intel MPI, use the compatibility shim libpmi.so
      # by setting I_MPI_PMI_LIBRARY=$pmix/lib/libpmi.so while keeping the PMIx
      # plugin in SLURM (--mpi=pmix). See more details here:
      # https://pm.bsc.es/gitlab/rarias/jungle/-/issues/16
      MpiDefault=pmix
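
      # As a usage sketch (the application path is illustrative), an Intel MPI
      # job would point I_MPI_PMI_LIBRARY at the libpmi.so shim from the PMIx
      # package while keeping --mpi=pmix on srun:
      #   export I_MPI_PMI_LIBRARY=$pmix/lib/libpmi.so
      #   srun --mpi=pmix ./app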
      # When a node reboots, return it to the Slurm queue as soon as it
      # becomes operational again.
      ReturnToService=2
      # Track all processes by using a cgroup
      ProctrackType=proctrack/cgroup
      # Enable task/affinity to allow the jobs to run in a specified subset of
      # the resources. Use the task/cgroup plugin to enable process containment.
      TaskPlugin=task/affinity,task/cgroup
      # Power off unused nodes until they are requested
      SuspendProgram=${suspendProgram}
      SuspendTimeout=60
      ResumeProgram=${resumeProgram}
      ResumeTimeout=300
      SuspendExcNodes=hut
      # Turn the nodes off after 1 hour of inactivity
      SuspendTime=3600
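
      # Powered-down nodes show up with a "~" suffix (e.g. "idle~") in sinfo
      # and are resumed transparently when a job is scheduled on them.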
      # Reduce the port range so that we only need to allow this range in the
      # firewall
      SrunPortRange=60000-61000
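      # On the nodes' firewall this pairs with opening the same range, e.g.
      # (a sketch, not part of this module):
      #   networking.firewall.allowedTCPPortRanges = [ { from = 60000; to = 61000; } ];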
      # Use cores as consumable resources. In SLURM terms, a core may have
      # multiple hardware threads (or CPUs).
      SelectType=select/cons_tres
      # Ignore memory constraints and only use unused cores to share a node with
      # other jobs.
      SelectTypeParameters=CR_Core
    '';
  };
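
  # Slurm daemons authenticate to each other through munge; the shared key is
  # kept as an age-encrypted secret and decrypted on the node at activation
  # time.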
  age.secrets.mungeKey = {
    file = ../../secrets/munge-key.age;
    owner = "munge";
    group = "munge";
  };

  services.munge = {
    enable = true;
    password = config.age.secrets.mungeKey.path;
  };
}