Compare commits

...

9 Commits

Author SHA1 Message Date
32a2b87313
Fix osu cross-compilation
All checks were successful
CI / build:cross (pull_request) Successful in 12s
CI / build:all (pull_request) Successful in 15s
2025-10-31 12:01:53 +01:00
c1edc91e1b
Set mpich default compilers from targetPackages 2025-10-31 12:01:52 +01:00
242fbdc268
Enable meta.cross for mpich related packages 2025-10-31 12:01:52 +01:00
b0d90a8ee9
Disable meta.cross for gpi-2 and tagaspi 2025-10-31 12:01:52 +01:00
16300fa966
Fix nativeBuildInputs for tagaspi 2025-10-31 12:01:52 +01:00
46da697690
Fix nativeBuildInputs for gpi-2 2025-10-31 12:01:52 +01:00
10b2dca7b6
Fix mpich cross compilation (disable fortran) 2025-10-31 12:01:51 +01:00
fc69ef3217 Enable pam_slurm_adopt in all compute nodes
All checks were successful
CI / build:cross (pull_request) Successful in 5s
CI / build:all (pull_request) Successful in 15s
CI / build:all (push) Successful in 3s
CI / build:cross (push) Successful in 6s
Prevents access to owl1 and owl2 too if the user doesn't have any jobs
running there.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-10-31 11:41:50 +01:00
1d025f7a38 Don't suspend owl compute nodes
Currently the owl nodes are located on top of the rack and turning them
off causes a high temperature increase at that region, which accumulates
heat from the whole rack. To maximize airflow we will leave them on at
all times. This also makes allocations immediate at the extra cost of
around 200 W.

In the future, if we include more nodes in SLURM we can configure those
to turn off if needed.

Fixes: #156
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-10-31 11:41:44 +01:00
9 changed files with 68 additions and 69 deletions

View File

@ -93,20 +93,4 @@
wantedBy = [ "multi-user.target" ]; wantedBy = [ "multi-user.target" ];
serviceConfig.ExecStart = script; serviceConfig.ExecStart = script;
}; };
# Only allow SSH connections from users who have a SLURM allocation
# See: https://slurm.schedmd.com/pam_slurm_adopt.html
security.pam.services.sshd.rules.account.slurm = {
control = "required";
enable = true;
modulePath = "${pkgs.slurm}/lib/security/pam_slurm_adopt.so";
args = [ "log_level=debug5" ];
order = 999999; # Make it last one
};
# Disable systemd session (pam_systemd.so) as it will conflict with the
# pam_slurm_adopt.so module. What happens is that the shell is first adopted
# into the slurmstepd task and then into the systemd session, which is not
# what we want, otherwise it will linger even if all jobs are gone.
security.pam.services.sshd.startSession = lib.mkForce false;
} }

View File

@ -1,4 +1,4 @@
{ lib, ... }: { lib, pkgs, ... }:
{ {
imports = [ imports = [
@ -21,4 +21,20 @@
}; };
services.slurm.client.enable = true; services.slurm.client.enable = true;
# Only allow SSH connections from users who have a SLURM allocation
# See: https://slurm.schedmd.com/pam_slurm_adopt.html
security.pam.services.sshd.rules.account.slurm = {
control = "required";
enable = true;
modulePath = "${pkgs.slurm}/lib/security/pam_slurm_adopt.so";
args = [ "log_level=debug5" ];
order = 999999; # Make it last one
};
# Disable systemd session (pam_systemd.so) as it will conflict with the
# pam_slurm_adopt.so module. What happens is that the shell is first adopted
# into the slurmstepd task and then into the systemd session, which is not
# what we want, otherwise it will linger even if all jobs are gone.
security.pam.services.sshd.startSession = lib.mkForce false;
} }

View File

@ -1,31 +1,6 @@
{ config, pkgs, ... }: { config, pkgs, ... }:
let {
suspendProgram = pkgs.writeShellScript "suspend.sh" ''
exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
set -x
export "PATH=/run/current-system/sw/bin:$PATH"
echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
hosts=$(scontrol show hostnames $1)
for host in $hosts; do
echo Shutting down host: $host
ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power off
done
'';
resumeProgram = pkgs.writeShellScript "resume.sh" ''
exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
set -x
export "PATH=/run/current-system/sw/bin:$PATH"
echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
hosts=$(scontrol show hostnames $1)
for host in $hosts; do
echo Starting host: $host
ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power on
done
'';
in {
services.slurm = { services.slurm = {
controlMachine = "apex"; controlMachine = "apex";
clusterName = "jungle"; clusterName = "jungle";
@ -59,16 +34,6 @@ in {
# the resources. Use the task/cgroup plugin to enable process containment. # the resources. Use the task/cgroup plugin to enable process containment.
TaskPlugin=task/affinity,task/cgroup TaskPlugin=task/affinity,task/cgroup
# Power off unused nodes until they are requested
SuspendProgram=${suspendProgram}
SuspendTimeout=60
ResumeProgram=${resumeProgram}
ResumeTimeout=300
SuspendExcNodes=fox
# Turn the nodes off after 1 hour of inactivity
SuspendTime=3600
# Reduce port range so we can allow only this range in the firewall # Reduce port range so we can allow only this range in the firewall
SrunPortRange=60000-61000 SrunPortRange=60000-61000

View File

@ -9,7 +9,6 @@
, automake , automake
, libtool , libtool
, mpi , mpi
, rsync
, gfortran , gfortran
}: }:
@ -44,13 +43,24 @@ stdenv.mkDerivation rec {
configureFlags = [ configureFlags = [
"--with-infiniband=${rdma-core-all}" "--with-infiniband=${rdma-core-all}"
"--with-mpi=${mpiAll}" "--with-mpi=yes"
"--with-slurm" "--with-slurm"
"CFLAGS=-fPIC" "CFLAGS=-fPIC"
"CXXFLAGS=-fPIC" "CXXFLAGS=-fPIC"
]; ];
buildInputs = [ slurm mpiAll rdma-core-all autoconf automake libtool rsync gfortran ]; nativeBuildInputs = [
autoconf
automake
gfortran
libtool
];
buildInputs = [
slurm
mpiAll
rdma-core-all
];
hardeningDisable = [ "all" ]; hardeningDisable = [ "all" ];
@ -60,5 +70,6 @@ stdenv.mkDerivation rec {
maintainers = with lib.maintainers.bsc; [ rarias ]; maintainers = with lib.maintainers.bsc; [ rarias ];
platforms = lib.platforms.linux; platforms = lib.platforms.linux;
license = lib.licenses.gpl3Plus; license = lib.licenses.gpl3Plus;
cross = false;
}; };
} }

View File

@ -6,6 +6,13 @@
, pmix , pmix
, gfortran , gfortran
, symlinkJoin , symlinkJoin
# Disabled when cross-compiling
# To fix cross compilation, we should fill the values in:
# https://github.com/pmodels/mpich/blob/main/maint/fcrosscompile/cross_values.txt.in
# For each arch
, enableFortran ? stdenv.hostPlatform == stdenv.buildPlatform
, perl
, targetPackages
}: }:
let let
@ -15,10 +22,13 @@ let
paths = [ pmix.dev pmix.out ]; paths = [ pmix.dev pmix.out ];
}; };
in mpich.overrideAttrs (old: { in mpich.overrideAttrs (old: {
buildInput = old.buildInputs ++ [ buildInputs = old.buildInputs ++ [
libfabric libfabric
pmixAll pmixAll
]; ];
nativeBuildInputs = old.nativeBuildInputs ++ [
perl
];
configureFlags = [ configureFlags = [
"--enable-shared" "--enable-shared"
"--enable-sharedlib" "--enable-sharedlib"
@ -31,10 +41,21 @@ in mpich.overrideAttrs (old: {
] ++ lib.optionals (lib.versionAtLeast gfortran.version "10") [ ] ++ lib.optionals (lib.versionAtLeast gfortran.version "10") [
"FFLAGS=-fallow-argument-mismatch" # https://github.com/pmodels/mpich/issues/4300 "FFLAGS=-fallow-argument-mismatch" # https://github.com/pmodels/mpich/issues/4300
"FCFLAGS=-fallow-argument-mismatch" "FCFLAGS=-fallow-argument-mismatch"
] ++ lib.optionals (!enableFortran) [
"--disable-fortran"
]; ];
preFixup = ''
sed -i 's:^CC=.*:CC=${targetPackages.stdenv.cc}/bin/${targetPackages.stdenv.cc.targetPrefix}cc:' $out/bin/mpicc
sed -i 's:^CXX=.*:CXX=${targetPackages.stdenv.cc}/bin/${targetPackages.stdenv.cc.targetPrefix}c++:' $out/bin/mpicxx
'' + lib.optionalString enableFortran ''
sed -i 's:^FC=.*:FC=${targetPackages.gfortran or gfortran}/bin/${targetPackages.gfortran.targetPrefix or gfortran.targetPrefix}gfortran:' $out/bin/mpifort
'';
hardeningDisable = [ "all" ]; hardeningDisable = [ "all" ];
meta = old.meta // { meta = old.meta // {
maintainers = old.meta.maintainers ++ (with lib.maintainers.bsc; [ rarias ]); maintainers = old.meta.maintainers ++ (with lib.maintainers.bsc; [ rarias ]);
cross = true;
}; };
}) })

View File

@ -32,6 +32,11 @@ stdenv.mkDerivation rec {
"CXX=mpicxx" "CXX=mpicxx"
]; ];
env = {
MPICH_CC="${stdenv.cc}/bin/${stdenv.cc.targetPrefix}cc";
MPICH_CXX="${stdenv.cc}/bin/${stdenv.cc.targetPrefix}c++";
};
postInstall = '' postInstall = ''
mkdir -p $out/bin mkdir -p $out/bin
for f in $(find $out -executable -type f); do for f in $(find $out -executable -type f); do
@ -44,5 +49,6 @@ stdenv.mkDerivation rec {
homepage = "http://mvapich.cse.ohio-state.edu/benchmarks/"; homepage = "http://mvapich.cse.ohio-state.edu/benchmarks/";
maintainers = [ ]; maintainers = [ ];
platforms = lib.platforms.all; platforms = lib.platforms.all;
cross = true;
}; };
} }

View File

@ -35,5 +35,6 @@ stdenv.mkDerivation rec {
maintainers = with lib.maintainers.bsc; [ rarias ]; maintainers = with lib.maintainers.bsc; [ rarias ];
platforms = lib.platforms.linux; platforms = lib.platforms.linux;
license = lib.licenses.mit; license = lib.licenses.mit;
cross = true;
}; };
} }

View File

@ -5,23 +5,14 @@
, automake , automake
, autoconf , autoconf
, libtool , libtool
, mpi
, autoreconfHook , autoreconfHook
, gpi-2 , gpi-2
, boost , boost
, numactl , numactl
, rdma-core , rdma-core
, gfortran , gfortran
, symlinkJoin
}: }:
let
mpiAll = symlinkJoin {
name = "mpi-all";
paths = [ mpi.all ];
};
in
stdenv.mkDerivation rec { stdenv.mkDerivation rec {
pname = "tagaspi"; pname = "tagaspi";
enableParallelBuilding = true; enableParallelBuilding = true;
@ -35,16 +26,18 @@ stdenv.mkDerivation rec {
hash = "sha256-RGG/Re2uM293HduZfGzKUWioDtwnSYYdfeG9pVrX9EM="; hash = "sha256-RGG/Re2uM293HduZfGzKUWioDtwnSYYdfeG9pVrX9EM=";
}; };
buildInputs = [ nativeBuildInputs = [
autoreconfHook autoreconfHook
automake automake
autoconf autoconf
libtool libtool
gfortran
];
buildInputs = [
boost boost
numactl numactl
rdma-core rdma-core
gfortran
mpiAll
]; ];
dontDisableStatic = true; dontDisableStatic = true;
@ -63,5 +56,6 @@ stdenv.mkDerivation rec {
maintainers = with lib.maintainers.bsc; [ rarias ]; maintainers = with lib.maintainers.bsc; [ rarias ];
platforms = lib.platforms.linux; platforms = lib.platforms.linux;
license = lib.licenses.gpl3Plus; license = lib.licenses.gpl3Plus;
cross = false; # gpi-2 cannot cross
}; };
} }

View File

@ -68,5 +68,6 @@ in stdenv.mkDerivation {
maintainers = with lib.maintainers.bsc; [ rarias ]; maintainers = with lib.maintainers.bsc; [ rarias ];
platforms = lib.platforms.linux; platforms = lib.platforms.linux;
license = lib.licenses.gpl3Plus; license = lib.licenses.gpl3Plus;
cross = true;
}; };
} }