Compare commits

2 Commits: 541c16cf44 ... dde13db192

| Author | SHA1 | Date |
|---|---|---|
|  | dde13db192 |  |
|  | b8ecc178e3 |  |

```diff
@@ -93,20 +93,4 @@
     wantedBy = [ "multi-user.target" ];
     serviceConfig.ExecStart = script;
   };
-
-  # Only allow SSH connections from users who have a SLURM allocation
-  # See: https://slurm.schedmd.com/pam_slurm_adopt.html
-  security.pam.services.sshd.rules.account.slurm = {
-    control = "required";
-    enable = true;
-    modulePath = "${pkgs.slurm}/lib/security/pam_slurm_adopt.so";
-    args = [ "log_level=debug5" ];
-    order = 999999; # Make it last one
-  };
-
-  # Disable systemd session (pam_systemd.so) as it will conflict with the
-  # pam_slurm_adopt.so module. What happens is that the shell is first adopted
-  # into the slurmstepd task and then into the systemd session, which is not
-  # what we want, otherwise it will linger even if all jobs are gone.
-  security.pam.services.sshd.startSession = lib.mkForce false;
 }
```
```diff
@@ -1,4 +1,4 @@
-{ lib, ... }:
+{ lib, pkgs, ... }:
 
 {
   imports = [
@@ -21,4 +21,20 @@
   };
 
   services.slurm.client.enable = true;
+
+  # Only allow SSH connections from users who have a SLURM allocation
+  # See: https://slurm.schedmd.com/pam_slurm_adopt.html
+  security.pam.services.sshd.rules.account.slurm = {
+    control = "required";
+    enable = true;
+    modulePath = "${pkgs.slurm}/lib/security/pam_slurm_adopt.so";
+    args = [ "log_level=debug5" ];
+    order = 999999; # Make it last one
+  };
+
+  # Disable systemd session (pam_systemd.so) as it will conflict with the
+  # pam_slurm_adopt.so module. What happens is that the shell is first adopted
+  # into the slurmstepd task and then into the systemd session, which is not
+  # what we want, otherwise it will linger even if all jobs are gone.
+  security.pam.services.sshd.startSession = lib.mkForce false;
 }
```
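
For context on the rule added above: pam_slurm_adopt rejects the account phase for users that have no job running on the node and adopts accepted SSH sessions into an existing job step, so they are tracked and cleaned up with the job. Because the rule lives under `security.pam.services.sshd.rules.account.slurm`, a host that imports this shared module but should keep unrestricted SSH could switch it off again. A minimal sketch of such a per-host override; it is not part of this change and the need for it is only an assumption:

```nix
{ lib, ... }:

{
  # Hypothetical per-host override (not in this diff): drop the slurm account
  # rule on a node that should keep unrestricted SSH access.
  security.pam.services.sshd.rules.account.slurm.enable = lib.mkForce false;

  # Bring the systemd user session back as well. The shared module disables it
  # with lib.mkForce (priority 50), so a numerically lower priority is needed
  # for this definition to win.
  security.pam.services.sshd.startSession = lib.mkOverride 40 true;
}
```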

```diff
@@ -1,31 +1,6 @@
 { config, pkgs, ... }:
 
-let
-  suspendProgram = pkgs.writeShellScript "suspend.sh" ''
-    exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
-    set -x
-    export "PATH=/run/current-system/sw/bin:$PATH"
-    echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
-    hosts=$(scontrol show hostnames $1)
-    for host in $hosts; do
-      echo Shutting down host: $host
-      ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power off
-    done
-  '';
-
-  resumeProgram = pkgs.writeShellScript "resume.sh" ''
-    exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
-    set -x
-    export "PATH=/run/current-system/sw/bin:$PATH"
-    echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
-    hosts=$(scontrol show hostnames $1)
-    for host in $hosts; do
-      echo Starting host: $host
-      ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power on
-    done
-  '';
-
-in {
+{
   services.slurm = {
     controlMachine = "apex";
     clusterName = "jungle";
```
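
One detail in the scripts removed above that is easy to misread: inside a Nix indented string (`'' ... ''`), the sequence `''${` escapes interpolation, so `''${host}-ipmi` reaches the shell as a literal `${host}-ipmi` parameter expansion, while references such as `${suspendProgram}` elsewhere are substituted by Nix at evaluation time. A minimal, self-contained illustration of the same quoting; the script name and hostname are placeholders, not taken from the repository:

```nix
let
  pkgs = import <nixpkgs> { };
in
# Illustrative only: the two single quotes before the dollar sign escape Nix
# interpolation, so the shell receives an ordinary parameter expansion.
pkgs.writeShellScript "escape-demo.sh" ''
  host=node1
  echo "power target: ''${host}-ipmi"
''
```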

```diff
@@ -59,16 +34,6 @@ in {
       # the resources. Use the task/cgroup plugin to enable process containment.
       TaskPlugin=task/affinity,task/cgroup
 
-      # Power off unused nodes until they are requested
-      SuspendProgram=${suspendProgram}
-      SuspendTimeout=60
-      ResumeProgram=${resumeProgram}
-      ResumeTimeout=300
-      SuspendExcNodes=fox
-
-      # Turn the nodes off after 1 hour of inactivity
-      SuspendTime=3600
-
       # Reduce port range so we can allow only this range in the firewall
       SrunPortRange=60000-61000
 
```
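
The `SrunPortRange` line kept above pins srun's communication ports to a known range precisely so the node firewall only has to open that range. The firewall side is not part of this comparison; a minimal sketch, assuming the standard NixOS firewall options are used for it, would be:

```nix
{
  # Hypothetical companion rule (not shown in this diff): open exactly the
  # range that slurm.conf restricts srun to via SrunPortRange.
  networking.firewall.allowedTCPPortRanges = [
    { from = 60000; to = 61000; }
  ];
}
```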