3 Commits

Author SHA1 Message Date
5d38b0d3a5 Disable IPMI watchdog in fox
All checks were successful
CI / build:all (pull_request) Successful in 1h38m3s
CI / build:cross (pull_request) Successful in 1h42m59s
Fixes: #231
2026-04-01 20:30:34 +02:00
2f2bd067a9 Enable memory limits in SLURM
Make sure that jobs cannot allocate more memory than available so we
don't trigger the OOM killer.

Fixes: #178
2026-04-01 20:01:12 +02:00
c499a7293a Add dbautist to owl nodes
2026-03-31 17:20:51 +02:00
3 changed files with 12 additions and 7 deletions

View File

@@ -108,7 +108,7 @@
home = "/home/Computational/dbautist";
description = "Dylan Bautista Cases";
group = "Computational";
hosts = [ "apex" "hut" "tent" "raccoon" ];
hosts = [ "apex" "hut" "tent" "raccoon" "owl1" "owl2" ];
hashedPassword = "$6$a2lpzMRVkG9nSgIm$12G6.ka0sFX1YimqJkBAjbvhRKZ.Hl090B27pdbnQOW0wzyxVWySWhyDDCILjQELky.HKYl9gqOeVXW49nW7q/";
openssh.authorizedKeys.keys = [
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIAb+EQBoS98zrCwnGKkHKwMLdYABMTqv7q9E0+T0QmkS dbautist@bsc-848818791"

View File

@@ -25,6 +25,9 @@
boot.initrd.availableKernelModules = [ "xhci_pci" "ahci" "nvme" "usbhid" "usb_storage" "sd_mod" ];
boot.kernelModules = [ "kvm-amd" "amd_uncore" "amd_hsmp" ];
# Disable IPMI watchdog as the BMC is not stable
boot.blacklistedKernelModules = [ "ipmi_watchdog" ];
hardware.cpu.amd.updateMicrocode = lib.mkDefault config.hardware.enableRedistributableFirmware;
hardware.cpu.intel.updateMicrocode = lib.mkForce false;

View File

@@ -5,8 +5,8 @@
controlMachine = "apex";
clusterName = "jungle";
nodeName = [
"owl[1,2] Sockets=2 CoresPerSocket=14 ThreadsPerCore=2 Feature=owl"
"fox Sockets=8 CoresPerSocket=24 ThreadsPerCore=1"
"owl[1,2] Sockets=2 CoresPerSocket=14 ThreadsPerCore=2 MemSpecLimit=4096 RealMemory=128797"
"fox Sockets=8 CoresPerSocket=24 ThreadsPerCore=1 MemSpecLimit=4096 RealMemory=773659"
];
partitionName = [
@@ -41,9 +41,9 @@
# multiple hardware threads (or CPUs).
SelectType=select/cons_tres
# Ignore memory constraints and only use unused cores to share a node with
# other jobs.
SelectTypeParameters=CR_Core
# Both cores and memory are consumable resources, so we can put a limit on
# memory as well.
SelectTypeParameters=CR_Core_Memory
# Required for pam_slurm_adopt, see https://slurm.schedmd.com/pam_slurm_adopt.html
# This sets up the "extern" step into which ssh-launched processes will be
@@ -58,7 +58,9 @@
extraCgroupConfig = ''
CgroupPlugin=cgroup/v2
#ConstrainCores=yes
ConstrainRAMSpace=yes
ConstrainSwapSpace=yes
AllowedRAMSpace=99
'';
};