Compare commits

..

10 Commits

Author SHA1 Message Date
8d65028162 Set LLVM_BUILD_NATIVE=ON 2025-10-31 19:45:10 +01:00
a173af654f Fix osu cross-compilation
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-10-31 16:23:46 +01:00
2fff7e4a7b Set mpich default compilers from targetPackages
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-10-31 16:23:46 +01:00
a761b73336 Enable meta.cross for mpich related packages
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-10-31 16:23:46 +01:00
86eb796771 Disable meta.cross for gpi-2 and tagaspi
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-10-31 16:23:46 +01:00
08633435cf Fix nativeBuildInputs for tagaspi
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-10-31 16:23:46 +01:00
39d64456a4 Fix nativeBuildInputs for gpi-2
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-10-31 16:23:46 +01:00
410040a4a0 Fix mpich cross compilation (disable fortran)
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-10-31 16:23:46 +01:00
fc69ef3217 Enable pam_slurm_adopt in all compute nodes
Prevents access to owl1 and owl2 too if the user doesn't have any jobs
running there.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-10-31 11:41:50 +01:00
1d025f7a38 Don't suspend owl compute nodes
Currently the owl nodes are located on top of the rack and turning them
off causes a high temperature increase at that region, which accumulates
heat from the whole rack. To maximize airflow we will leave them on at
all times. This also makes allocations immediate at the extra cost of
around 200 W.

In the future, if we include more nodes in SLURM we can configure those
to turn off if needed.

Fixes: rarias/jungle#156
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-10-31 11:41:44 +01:00
5 changed files with 23 additions and 55 deletions

View File

@@ -93,20 +93,4 @@
wantedBy = [ "multi-user.target" ];
serviceConfig.ExecStart = script;
};
# Only allow SSH connections from users who have a SLURM allocation
# See: https://slurm.schedmd.com/pam_slurm_adopt.html
security.pam.services.sshd.rules.account.slurm = {
control = "required";
enable = true;
modulePath = "${pkgs.slurm}/lib/security/pam_slurm_adopt.so";
args = [ "log_level=debug5" ];
order = 999999; # Make it last one
};
# Disable systemd session (pam_systemd.so) as it will conflict with the
# pam_slurm_adopt.so module. What happens is that the shell is first adopted
# into the slurmstepd task and then into the systemd session, which is not
# what we want, otherwise it will linger even if all jobs are gone.
security.pam.services.sshd.startSession = lib.mkForce false;
}

View File

@@ -1,4 +1,4 @@
{ lib, ... }:
{ lib, pkgs, ... }:
{
imports = [
@@ -21,4 +21,20 @@
};
services.slurm.client.enable = true;
# Only allow SSH connections from users who have a SLURM allocation
# See: https://slurm.schedmd.com/pam_slurm_adopt.html
security.pam.services.sshd.rules.account.slurm = {
control = "required";
enable = true;
modulePath = "${pkgs.slurm}/lib/security/pam_slurm_adopt.so";
args = [ "log_level=debug5" ];
order = 999999; # Make it last one
};
# Disable systemd session (pam_systemd.so) as it will conflict with the
# pam_slurm_adopt.so module. What happens is that the shell is first adopted
# into the slurmstepd task and then into the systemd session, which is not
# what we want, otherwise it will linger even if all jobs are gone.
security.pam.services.sshd.startSession = lib.mkForce false;
}

View File

@@ -1,31 +1,6 @@
{ config, pkgs, ... }:
let
suspendProgram = pkgs.writeShellScript "suspend.sh" ''
exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
set -x
export "PATH=/run/current-system/sw/bin:$PATH"
echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
hosts=$(scontrol show hostnames $1)
for host in $hosts; do
echo Shutting down host: $host
ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power off
done
'';
resumeProgram = pkgs.writeShellScript "resume.sh" ''
exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
set -x
export "PATH=/run/current-system/sw/bin:$PATH"
echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
hosts=$(scontrol show hostnames $1)
for host in $hosts; do
echo Starting host: $host
ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power on
done
'';
in {
{
services.slurm = {
controlMachine = "apex";
clusterName = "jungle";
@@ -59,16 +34,6 @@ in {
# the resources. Use the task/cgroup plugin to enable process containment.
TaskPlugin=task/affinity,task/cgroup
# Power off unused nodes until they are requested
SuspendProgram=${suspendProgram}
SuspendTimeout=60
ResumeProgram=${resumeProgram}
ResumeTimeout=300
SuspendExcNodes=fox
# Turn the nodes off after 1 hour of inactivity
SuspendTime=3600
# Reduce port range so we can allow only this range in the firewall
SrunPortRange=60000-61000

View File

@@ -43,7 +43,7 @@ stdenv.mkDerivation rec {
configureFlags = [
"--with-infiniband=${rdma-core-all}"
"--with-mpi=yes"
"--with-mpi=yes" # fixes mpi detection when cross-compiling
"--with-slurm"
"CFLAGS=-fPIC"
"CXXFLAGS=-fPIC"
@@ -70,6 +70,6 @@ stdenv.mkDerivation rec {
maintainers = with lib.maintainers.bsc; [ rarias ];
platforms = lib.platforms.linux;
license = lib.licenses.gpl3Plus;
cross = false;
cross = false; # infiniband detection does not work
};
}

View File

@@ -17,6 +17,7 @@
, gitUrl ? "ssh://git@bscpm04.bsc.es/llvm-ompss/llvm-mono.git"
, gitBranch ? "master"
, gitCommit ? "872ba63f86edaefc9787984ef3fae9f2f94e0124" # github-release-2025.11
, buildLlvmPackages
}:
let
@@ -119,6 +120,8 @@ in stdenv.mkDerivation {
"-DCMAKE_INSTALL_RPATH=${zlib}/lib:${gcc.cc.lib}/lib"
"-DLLVM_APPEND_VC_REV=ON"
"-DLLVM_FORCE_VC_REVISION=${source.version}"
"-DLLVM_TABLEGEN_EXE=${buildLlvmPackages.tblgen}/bin/llvm-tblgen"
"-DCLANG_TABLEGEN=${buildLlvmPackages.tblgen}/bin/clang-tblgen"
)
'';