diff --git a/m/common/base/users.nix b/m/common/base/users.nix index d40732d9..43ae8cb0 100644 --- a/m/common/base/users.nix +++ b/m/common/base/users.nix @@ -108,7 +108,7 @@ home = "/home/Computational/dbautist"; description = "Dylan Bautista Cases"; group = "Computational"; - hosts = [ "apex" "hut" "tent" "raccoon" ]; + hosts = [ "apex" "hut" "tent" "raccoon" "owl1" "owl2" ]; hashedPassword = "$6$a2lpzMRVkG9nSgIm$12G6.ka0sFX1YimqJkBAjbvhRKZ.Hl090B27pdbnQOW0wzyxVWySWhyDDCILjQELky.HKYl9gqOeVXW49nW7q/"; openssh.authorizedKeys.keys = [ "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIAb+EQBoS98zrCwnGKkHKwMLdYABMTqv7q9E0+T0QmkS dbautist@bsc-848818791" diff --git a/m/fox/configuration.nix b/m/fox/configuration.nix index b9549db8..4f0b5188 100644 --- a/m/fox/configuration.nix +++ b/m/fox/configuration.nix @@ -25,6 +25,9 @@ boot.initrd.availableKernelModules = [ "xhci_pci" "ahci" "nvme" "usbhid" "usb_storage" "sd_mod" ]; boot.kernelModules = [ "kvm-amd" "amd_uncore" "amd_hsmp" ]; + # Disable IPMI watchdog as the BMC is not stable + boot.blacklistedKernelModules = [ "ipmi_watchdog" ]; + hardware.cpu.amd.updateMicrocode = lib.mkDefault config.hardware.enableRedistributableFirmware; hardware.cpu.intel.updateMicrocode = lib.mkForce false; diff --git a/m/module/slurm-common.nix b/m/module/slurm-common.nix index 70dbf6cf..9ae4a74e 100644 --- a/m/module/slurm-common.nix +++ b/m/module/slurm-common.nix @@ -5,8 +5,8 @@ controlMachine = "apex"; clusterName = "jungle"; nodeName = [ - "owl[1,2] Sockets=2 CoresPerSocket=14 ThreadsPerCore=2 Feature=owl" - "fox Sockets=8 CoresPerSocket=24 ThreadsPerCore=1" + "owl[1,2] Sockets=2 CoresPerSocket=14 ThreadsPerCore=2 MemSpecLimit=4096 RealMemory=128797" + "fox Sockets=8 CoresPerSocket=24 ThreadsPerCore=1 MemSpecLimit=4096 RealMemory=773659" ]; partitionName = [ @@ -41,9 +41,9 @@ # multiple hardware threads (or CPUs). SelectType=select/cons_tres - # Ignore memory constraints and only use unused cores to share a node with - # other jobs. 
- SelectTypeParameters=CR_Core + # Both cores and memory are consumable resources, so we can put a limit on + # memory as well. + SelectTypeParameters=CR_Core_Memory # Required for pam_slurm_adopt, see https://slurm.schedmd.com/pam_slurm_adopt.html # This sets up the "extern" step into which ssh-launched processes will be @@ -58,7 +58,9 @@ extraCgroupConfig = '' CgroupPlugin=cgroup/v2 - #ConstrainCores=yes + ConstrainRAMSpace=yes + ConstrainSwapSpace=yes + AllowedRAMSpace=99 ''; };