Enable pam_slurm_adopt in all compute nodes

Prevents SSH access to owl1 and owl2 as well when the user doesn't have any jobs running there.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
Don't suspend owl compute nodes
2025-10-31 11:41:50 +01:00 · 2025-10-31 11:41:44 +01:00
3 changed files with 18 additions and 53 deletions
--- a/m/fox/configuration.nix
+++ b/m/fox/configuration.nix
@@ -93,20 +93,4 @@
    wantedBy = [ "multi-user.target" ];
    serviceConfig.ExecStart = script;
  };
-
-  # Only allow SSH connections from users who have a SLURM allocation
-  # See: https://slurm.schedmd.com/pam_slurm_adopt.html
-  security.pam.services.sshd.rules.account.slurm = {
-    control = "required";
-    enable = true;
-    modulePath = "${pkgs.slurm}/lib/security/pam_slurm_adopt.so";
-    args = [ "log_level=debug5" ];
-    order = 999999; # Make it last one
-  };
-
-  # Disable systemd session (pam_systemd.so) as it will conflict with the
-  # pam_slurm_adopt.so module. What happens is that the shell is first adopted
-  # into the slurmstepd task and then into the systemd session, which is not
-  # what we want, otherwise it will linger even if all jobs are gone.
-  security.pam.services.sshd.startSession = lib.mkForce false;
 }
--- a/m/module/slurm-client.nix
+++ b/m/module/slurm-client.nix
@@ -1,4 +1,4 @@
-{ lib, ... }:
+{ lib, pkgs, ... }:

 {
  imports = [
@@ -21,4 +21,20 @@
  };

  services.slurm.client.enable = true;
+
+  # Only allow SSH connections from users who have a SLURM allocation
+  # See: https://slurm.schedmd.com/pam_slurm_adopt.html
+  security.pam.services.sshd.rules.account.slurm = {
+    control = "required";
+    enable = true;
+    modulePath = "${pkgs.slurm}/lib/security/pam_slurm_adopt.so";
+    args = [ "log_level=debug5" ];
+    order = 999999; # Make it the last one
+  };
+
+  # Disable the systemd session (pam_systemd.so) because it conflicts with the
+  # pam_slurm_adopt.so module: the shell is first adopted into the slurmstepd
+  # task and then into the systemd session, which is not what we want, because
+  # the shell would then linger even after all jobs are gone.
+  security.pam.services.sshd.startSession = lib.mkForce false;
 }
--- a/m/module/slurm-common.nix
+++ b/m/module/slurm-common.nix
@@ -1,31 +1,6 @@
 { config, pkgs, ... }:

-let
-  suspendProgram = pkgs.writeShellScript "suspend.sh" ''
-    exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
-    set -x
-    export "PATH=/run/current-system/sw/bin:$PATH"
-    echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
-    hosts=$(scontrol show hostnames $1)
-    for host in $hosts; do
-      echo Shutting down host: $host
-      ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power off
-    done
-  '';
-
-  resumeProgram = pkgs.writeShellScript "resume.sh" ''
-    exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
-    set -x
-    export "PATH=/run/current-system/sw/bin:$PATH"
-    echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
-    hosts=$(scontrol show hostnames $1)
-    for host in $hosts; do
-      echo Starting host: $host
-      ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power on
-    done
-  '';
-
-in {
+{
  services.slurm = {
    controlMachine = "apex";
    clusterName = "jungle";
@@ -59,16 +34,6 @@ in {
      # the resources. Use the task/cgroup plugin to enable process containment.
      TaskPlugin=task/affinity,task/cgroup

-      # Power off unused nodes until they are requested
-      SuspendProgram=${suspendProgram}
-      SuspendTimeout=60
-      ResumeProgram=${resumeProgram}
-      ResumeTimeout=300
-      SuspendExcNodes=fox
-
-      # Turn the nodes off after 1 hour of inactivity
-      SuspendTime=3600
-
      # Reduce port range so we can allow only this range in the firewall
      SrunPortRange=60000-61000