Enable memory limits in SLURM

Make sure that jobs cannot allocate more memory than available so we
don't trigger the OOM killer.

Fixes: rarias/jungle#178
This commit is contained in:
2026-04-01 17:08:18 +02:00
parent c499a7293a
commit 2f2bd067a9

View File

@@ -5,8 +5,8 @@
controlMachine = "apex";
clusterName = "jungle";
nodeName = [
"owl[1,2] Sockets=2 CoresPerSocket=14 ThreadsPerCore=2 Feature=owl"
"fox Sockets=8 CoresPerSocket=24 ThreadsPerCore=1"
"owl[1,2] Sockets=2 CoresPerSocket=14 ThreadsPerCore=2 MemSpecLimit=4096 RealMemory=128797"
"fox Sockets=8 CoresPerSocket=24 ThreadsPerCore=1 MemSpecLimit=4096 RealMemory=773659"
];
partitionName = [
@@ -41,9 +41,9 @@
# multiple hardware threads (or CPUs).
SelectType=select/cons_tres
-# Ignore memory constraints and only use unused cores to share a node with
-# other jobs.
-SelectTypeParameters=CR_Core
+# Both cores and memory are consumable resources, so we can put a limit in
+# memory as well.
+SelectTypeParameters=CR_Core_Memory
# Required for pam_slurm_adopt, see https://slurm.schedmd.com/pam_slurm_adopt.html
# This sets up the "extern" step into which ssh-launched processes will be
@@ -58,7 +58,9 @@
extraCgroupConfig = ''
CgroupPlugin=cgroup/v2
#ConstrainCores=yes
ConstrainRAMSpace=yes
+ConstrainSwapSpace=yes
+AllowedRAMSpace=99
'';
};