Compare commits: `4752afae0c` ... `32a2b87313` (9 commits)
| SHA1 |
|---|
| 32a2b87313 |
| c1edc91e1b |
| 242fbdc268 |
| b0d90a8ee9 |
| 16300fa966 |
| 46da697690 |
| 10b2dca7b6 |
| fc69ef3217 |
| 1d025f7a38 |
The first changed file, a module wrapping a systemd service, drops the pam_slurm_adopt configuration:

```diff
@@ -93,20 +93,4 @@
     wantedBy = [ "multi-user.target" ];
     serviceConfig.ExecStart = script;
   };
-
-  # Only allow SSH connections from users who have a SLURM allocation
-  # See: https://slurm.schedmd.com/pam_slurm_adopt.html
-  security.pam.services.sshd.rules.account.slurm = {
-    control = "required";
-    enable = true;
-    modulePath = "${pkgs.slurm}/lib/security/pam_slurm_adopt.so";
-    args = [ "log_level=debug5" ];
-    order = 999999; # Make it last one
-  };
-
-  # Disable systemd session (pam_systemd.so) as it will conflict with the
-  # pam_slurm_adopt.so module. What happens is that the shell is first adopted
-  # into the slurmstepd task and then into the systemd session, which is not
-  # what we want, otherwise it will linger even if all jobs are gone.
-  security.pam.services.sshd.startSession = lib.mkForce false;
 }
```
The second file picks that block up. Its module arguments gain `pkgs`, which the `modulePath` below needs:

```diff
@@ -1,4 +1,4 @@
-{ lib, ... }:
+{ lib, pkgs, ... }:
 
 {
   imports = [
```
```diff
@@ -21,4 +21,20 @@
   };
 
   services.slurm.client.enable = true;
+
+  # Only allow SSH connections from users who have a SLURM allocation
+  # See: https://slurm.schedmd.com/pam_slurm_adopt.html
+  security.pam.services.sshd.rules.account.slurm = {
+    control = "required";
+    enable = true;
+    modulePath = "${pkgs.slurm}/lib/security/pam_slurm_adopt.so";
+    args = [ "log_level=debug5" ];
+    order = 999999; # Make it last one
+  };
+
+  # Disable systemd session (pam_systemd.so) as it will conflict with the
+  # pam_slurm_adopt.so module. What happens is that the shell is first adopted
+  # into the slurmstepd task and then into the systemd session, which is not
+  # what we want, otherwise it will linger even if all jobs are gone.
+  security.pam.services.sshd.startSession = lib.mkForce false;
 }
```
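For reference: NixOS renders each entry under `security.pam.services.sshd.rules.account` as one line of the account stack in the generated `/etc/pam.d/sshd`, sorted by `order`, so `order = 999999` puts this rule last. The rule above should come out roughly like the following; this is a sketch, not part of the diff, and the real line carries a full Nix store path:

```
account required /nix/store/...-slurm/lib/security/pam_slurm_adopt.so log_level=debug5
```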
The third file removes the IPMI-based power-save scripts:

```diff
@@ -1,31 +1,6 @@
 { config, pkgs, ... }:
 
-let
-  suspendProgram = pkgs.writeShellScript "suspend.sh" ''
-    exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
-    set -x
-    export "PATH=/run/current-system/sw/bin:$PATH"
-    echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
-    hosts=$(scontrol show hostnames $1)
-    for host in $hosts; do
-      echo Shutting down host: $host
-      ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power off
-    done
-  '';
-
-  resumeProgram = pkgs.writeShellScript "resume.sh" ''
-    exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
-    set -x
-    export "PATH=/run/current-system/sw/bin:$PATH"
-    echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
-    hosts=$(scontrol show hostnames $1)
-    for host in $hosts; do
-      echo Starting host: $host
-      ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power on
-    done
-  '';
-
-in {
+{
   services.slurm = {
     controlMachine = "apex";
     clusterName = "jungle";
```
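The removed scripts relied on `scontrol show hostnames` to expand the hostlist expression SLURM passes as `$1` into one node name per line, which is what the `for host in $hosts` loop iterated over. An illustrative run; the node names here are made up, not from this repo:

```
$ scontrol show hostnames 'node[01-03]'
node01
node02
node03
```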
The same file also drops the matching power-save settings from the generated slurm.conf:

```diff
@@ -59,16 +34,6 @@ in {
   # the resources. Use the task/cgroup plugin to enable process containment.
   TaskPlugin=task/affinity,task/cgroup
 
-  # Power off unused nodes until they are requested
-  SuspendProgram=${suspendProgram}
-  SuspendTimeout=60
-  ResumeProgram=${resumeProgram}
-  ResumeTimeout=300
-  SuspendExcNodes=fox
-
-  # Turn the nodes off after 1 hour of inactivity
-  SuspendTime=3600
-
   # Reduce port range so we can allow only this range in the firewall
   SrunPortRange=60000-61000
 
```
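The surviving `SrunPortRange` comment points at a firewall rule elsewhere in the configuration. Below is a minimal sketch of what such a rule could look like; `networking.firewall.allowedTCPPortRanges` is a standard NixOS option, but this exact snippet is an assumption, not part of this compare:

```nix
{
  # Hypothetical companion rule: open only the srun port range between nodes.
  # Must stay in sync with SrunPortRange=60000-61000 in slurm.conf.
  networking.firewall.allowedTCPPortRanges = [
    { from = 60000; to = 61000; }
  ];
}
```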