diff --git a/m/fox/configuration.nix b/m/fox/configuration.nix
index 60ab5b0..97ac686 100644
--- a/m/fox/configuration.nix
+++ b/m/fox/configuration.nix
@@ -56,4 +56,20 @@
     wantedBy = [ "multi-user.target" ];
     serviceConfig.ExecStart = script;
   };
+
+  # Only allow SSH connections from users who have a SLURM allocation.
+  # See: https://slurm.schedmd.com/pam_slurm_adopt.html
+  security.pam.services.sshd.rules.account.slurm = {
+    control = "required";
+    enable = true;
+    modulePath = "${pkgs.slurm}/lib/security/pam_slurm_adopt.so";
+    args = [ "log_level=debug5" ]; # NOTE(review): most verbose level; lower once validated
+    order = 999999; # Must be the last rule in the account stack
+  };
+
+  # Disable the systemd session (pam_systemd.so), as it conflicts with the
+  # pam_slurm_adopt.so module: the shell is first adopted into the slurmstepd
+  # task and then moved into the systemd session. That is not what we want, as
+  # the shell would then linger even after all of the user's jobs are gone.
+  security.pam.services.sshd.startSession = lib.mkForce false;
 }
diff --git a/m/module/slurm-client.nix b/m/module/slurm-client.nix
index 4f0c88a..f4d630e 100644
--- a/m/module/slurm-client.nix
+++ b/m/module/slurm-client.nix
@@ -27,6 +27,21 @@ let
     done
   '';
 
+  # Placeholder prolog/epilog scripts; nothing in this patch references them
+  # from a Prolog=/Epilog= setting yet. writeShellScript supplies the shebang,
+  # so they do not depend on bash being found through PATH.
+  prolog = pkgs.writeShellScript "prolog.sh" ''
+    echo "hello from the prolog"
+
+    exit 0
+  '';
+
+  epilog = pkgs.writeShellScript "epilog.sh" ''
+    echo "hello from the epilog"
+
+    exit 0
+  '';
+
 in {
   systemd.services.slurmd.serviceConfig = {
     # Kill all processes in the control group on stop/restart. This will kill
@@ -93,9 +108,30 @@ in {
       # Ignore memory constraints and only use unused cores to share a node with
       # other jobs.
       SelectTypeParameters=CR_Core
+
+      # Required for pam_slurm_adopt, see https://slurm.schedmd.com/pam_slurm_adopt.html
+      # Contain sets up the "extern" step into which ssh-launched processes will
+      # be adopted. Alloc runs the prolog at job allocation (salloc) rather than
+      # when a task runs (srun) so we can ssh early.
+      PrologFlags=Alloc,Contain,X11
+
+      # LaunchParameters=ulimit_pam_adopt will set RLIMIT_RSS in processes
+      # adopted by the extern step, similar to tasks running in regular steps
+      # LaunchParameters=ulimit_pam_adopt
+      # NOTE(review): debug5 is the most verbose level; lower once validated.
+      SlurmdDebug=debug5
+      #DebugFlags=Protocol,Cgroup
+    '';
+
+    extraCgroupConfig = ''
+      CgroupPlugin=cgroup/v2
+      #ConstrainCores=yes
     '';
   };
 
+  # Place the slurm config in /etc, as it is required by the PAM module
+  environment.etc.slurm.source = config.services.slurm.etcSlurm;
+
   age.secrets.mungeKey = {
     file = ../../secrets/munge-key.age;
     owner = "munge";
diff --git a/pkgs/overlay.nix b/pkgs/overlay.nix
index 5977ab0..2eb9229 100644
--- a/pkgs/overlay.nix
+++ b/pkgs/overlay.nix
@@ -39,6 +39,18 @@ final: prev:
       # See https://bugs.schedmd.com/show_bug.cgi?id=19324
       ./slurm-rank-expansion.patch
     ];
+    # Also build and install the pam_slurm_adopt PAM module, used to keep users
+    # from accessing nodes on which they have no job allocated.
+    postBuild = (old.postBuild or "") + ''
+      pushd contribs/pam_slurm_adopt
+        make "PAM_DIR=$out/lib/security"
+      popd
+    '';
+    postInstall = (old.postInstall or "") + ''
+      pushd contribs/pam_slurm_adopt
+        make "PAM_DIR=$out/lib/security" install
+      popd
+    '';
   });
 
   prometheus-slurm-exporter = prev.callPackage ./slurm-exporter.nix { };