From 2f2bd067a9edb7558c0dd7e8e3d0f07a8deaf933 Mon Sep 17 00:00:00 2001 From: Rodrigo Arias Mallo Date: Wed, 1 Apr 2026 17:08:18 +0200 Subject: [PATCH] Enable memory limits in SLURM Make sure that jobs cannot allocate more memory than available so we don't trigger the OOM killer. Fixes: https://jungle.bsc.es/git/rarias/jungle/issues/178 --- m/module/slurm-common.nix | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/m/module/slurm-common.nix b/m/module/slurm-common.nix index 70dbf6cf..9ae4a74e 100644 --- a/m/module/slurm-common.nix +++ b/m/module/slurm-common.nix @@ -5,8 +5,8 @@ controlMachine = "apex"; clusterName = "jungle"; nodeName = [ - "owl[1,2] Sockets=2 CoresPerSocket=14 ThreadsPerCore=2 Feature=owl" - "fox Sockets=8 CoresPerSocket=24 ThreadsPerCore=1" + "owl[1,2] Sockets=2 CoresPerSocket=14 ThreadsPerCore=2 MemSpecLimit=4096 RealMemory=128797" + "fox Sockets=8 CoresPerSocket=24 ThreadsPerCore=1 MemSpecLimit=4096 RealMemory=773659" ]; partitionName = [ @@ -41,9 +41,9 @@ # multiple hardware threads (or CPUs). SelectType=select/cons_tres - # Ignore memory constraints and only use unused cores to share a node with - # other jobs. - SelectTypeParameters=CR_Core + # Both cores and memory are consumable resources, so we can put a limit on + # memory as well. + SelectTypeParameters=CR_Core_Memory # Required for pam_slurm_adopt, see https://slurm.schedmd.com/pam_slurm_adopt.html # This sets up the "extern" step into which ssh-launched processes will be @@ -58,7 +58,9 @@ extraCgroupConfig = '' CgroupPlugin=cgroup/v2 - #ConstrainCores=yes + ConstrainRAMSpace=yes + ConstrainSwapSpace=yes + AllowedRAMSpace=99 ''; };