Enable memory limits in SLURM
Make sure that jobs cannot allocate more memory than is available, so we don't trigger the OOM killer. Fixes: #178
This commit is contained in:
@@ -5,8 +5,8 @@
 controlMachine = "apex";
 clusterName = "jungle";
 nodeName = [
-  "owl[1,2] Sockets=2 CoresPerSocket=14 ThreadsPerCore=2 Feature=owl"
-  "fox Sockets=8 CoresPerSocket=24 ThreadsPerCore=1"
+  "owl[1,2] Sockets=2 CoresPerSocket=14 ThreadsPerCore=2 MemSpecLimit=4096 RealMemory=128797"
+  "fox Sockets=8 CoresPerSocket=24 ThreadsPerCore=1 MemSpecLimit=4096 RealMemory=773659"
 ];

 partitionName = [
@@ -41,9 +41,9 @@
 # multiple hardware threads (or CPUs).
 SelectType=select/cons_tres

-# Ignore memory constraints and only use unused cores to share a node with
-# other jobs.
-SelectTypeParameters=CR_Core
+# Both cores and memory are consumable resources, so we can put a limit in
+# memory as well.
+SelectTypeParameters=CR_Core_Memory

 # Required for pam_slurm_adopt, see https://slurm.schedmd.com/pam_slurm_adopt.html
 # This sets up the "extern" step into which ssh-launched processes will be
@@ -58,7 +58,9 @@

 extraCgroupConfig = ''
   CgroupPlugin=cgroup/v2
   #ConstrainCores=yes
+  ConstrainRAMSpace=yes
+  ConstrainSwapSpace=yes
   AllowedRAMSpace=99
 '';
};
|
||||
|
||||
Reference in New Issue
Block a user