WIP isolation

This commit is contained in:
Rodrigo Arias 2020-10-07 09:49:42 +02:00
parent ba221c5200
commit 4ea0d16926
7 changed files with 101 additions and 31 deletions

View File

@ -16,13 +16,15 @@ let
# Attribute set whose values are all lists; presumably each list enumerates
# the alternative values used to generate configurations — TODO confirm.
varConfig = {
cc = [ bsc.icc ];
mpi = [ bsc.impi ];
# Disabled alternative: the MPICH build with debugging enabled.
#mpi = [ bsc.mpichDebug ];
blocksize = [ 1024 ];
};
# Common configuration
common = {
# Compile time nbody config
gitBranch = "garlic/tampi+send+oss+task";
gitBranch = "garlic/mpi+send";
#gitBranch = "garlic/tampi+send+oss+task";
# nbody runtime options
particles = 1024*4;
@ -30,15 +32,16 @@ let
# Resources
ntasksPerNode = "2";
nodes = "2";
nodes = "1";
# Stage configuration
enableRunexp = true;
enableTrebuchet = true;
enableSbatch = true;
enableControl = true;
enableExtrae = false;
enablePerf = false;
enableCtf = false;
enableStrace = true;
# MN4 path
nixPrefix = "/gpfs/projects/bsc15/nix";
@ -90,6 +93,11 @@ let
perfArgs = "sched record -a";
};
# nixsetup stage: passes the staged program to w.nixsetup together with the
# path of the nix-setup binary.
nixsetup = {stage, conf, ...}: with conf; w.nixsetup {
program = stageProgram stage;
# nixPrefix is expected to be provided by conf through `with conf;`.
nixsetup = "${nixPrefix}/bin/nix-setup";
};
isolate = {stage, conf, ...}: with conf; w.isolate {
program = stageProgram stage;
clusterName = "mn4";
@ -110,8 +118,23 @@ let
'';
};
# strace stage: hands the staged program to w.strace. The conf argument is
# accepted but not used by this stage.
strace = {stage, conf, ...}: w.strace {
program = stageProgram stage;
};
# argv stage: wraps the staged program with w.argv, supplying the environment
# exports and the command line arguments for the nbody run.
argv = {stage, conf, ...}: w.argv {
  program = stageProgram stage;
  # Debugging environment for the MPI layer. MPICH takes the verbosity from
  # MPICH_DBG_LEVEL and the destination from MPICH_DBG_OUTPUT; exporting
  # MPICH_DBG_OUTPUT twice would only keep the last value (stdout) and leave
  # the verbosity unset.
  env = ''
    #export I_MPI_PMI_LIBRARY=${bsc.slurm17-libpmi2}/lib/libpmi2.so
    export I_MPI_DEBUG=+1000
    #export I_MPI_FABRICS=shm
    export MPICH_DBG_LEVEL=VERBOSE
    export MPICH_DBG_CLASS=ALL
    export MPICH_DBG_OUTPUT=stdout
    export FI_LOG_LEVEL=Info
  '';
  # -t and -p take the timesteps and particles values from conf.
  argv = ''( -t ${toString conf.timesteps}
             -p ${toString conf.particles} )'';
};
@ -146,19 +169,25 @@ let
};
stages = with common; []
# Launch the experiment remotely
#++ optional enableRunexp runexp
# Use sbatch to request resources first
++ optional enableSbatch sbatch
++ optionals enableSbatch [
sbatch
nixsetup
#isolate
]
# Repeats the next stages N times
++ optionals enableControl [ isolate control ]
++ optional enableControl control
# Executes srun to launch the program in the requested nodes, and
# immediately after enters the nix environment again, as slurmstepd launches
# the next stages from outside the namespace.
++ [ srun isolate ]
++ [
#strace
srun
nixsetup
#isolate
]
# Instrumentation with extrae
++ optional enableExtrae extrae
@ -169,6 +198,9 @@ let
# Optionally profile nanos6 with the new ctf
++ optional enableCtf ctf
# Optionally enable strace
#++ optional enableStrace strace
# Execute the nbody app with the argv and env vars
++ [ argv nbodyFn ];
@ -177,7 +209,7 @@ let
launcher = launch jobs;
runexp = stage: w.runexp {
trebuchet = stage: w.trebuchet {
program = stageProgram stage;
nixPrefix = common.nixPrefix;
};
@ -187,8 +219,7 @@ let
conf = common;
};
final = runexp (isolatedRun launcher);
final = trebuchet (isolatedRun launcher);
in
# We simply run each program one after another

View File

@ -16,7 +16,7 @@ with stdenv.lib;
stdenv.mkDerivation rec {
name = "nbody";
#src = /home/Computational/rarias/bscpkgs/manual/nbody;
#src = ~/nbody;
src = builtins.fetchGit {
url = "${gitURL}";

View File

@ -16,16 +16,31 @@ nixjoin="@nixPrefix@@nixtools@/bin/nix-join"
# Variables handed to `env -i` when stage2 is launched; `env -i` starts from
# an empty environment, so only the entries listed here are passed on.
env=(
# PATH is built from the busybox bin directories plus @extraPath@.
PATH="@nixPrefix@@busybox@/bin:@busybox@/bin:@extraPath@"
# Forward the lines of the current environment that start with SLURM, PMI,
# GARLIC_OUT or USER. `|| true` prevents a non-matching grep from yielding
# a failing exit status.
$(env | grep ^SLURM || true)
$(env | grep ^PMI || true)
$(env | grep ^GARLIC_OUT || true)
$(env | grep ^USER || true)
HOME="/homeless-shelter"
)
#-m @nixPrefix@ \
join_flags="-m /etc \
-m /.statelite/tmpfs/etc \
-m /sys \
-m /var/run/munge \
-m /gpfs/projects/bsc15 \
-m /bin:@nixPrefix@@busybox@/bin"
mounts=(
#-m @nixPrefix@
#FIXME: Use only the strictly necessary from /etc
-m /etc
# The /etc/hosts file is a symlink to this etc/
-m /.statelite/tmpfs/etc
-m /sys
-m /dev
-m /proc
# nscd cache: doesn't exist (?)
#-m /var/run/nscd
# Needed for munge auth
-m /var/run/munge
# FIXME: We should only need nix and the output path
-m /gpfs/projects/bsc15
-m /bin:@nixPrefix@@busybox@/bin
)
join_flags="${mounts[@]}"
exec $nixjoin -v -i $join_flags $nixhome -- \
env -i "${env[@]}" @out@/bin/stage2

23
garlic/stages/strace.nix Normal file
View File

@ -0,0 +1,23 @@
# Stage that produces a wrapper script running the given program under
# strace.
#
# Package inputs: stdenv, bash and strace. Note that bash is not referenced
# anywhere in the body below.
{
stdenv
, bash
, strace
}:
# Stage argument: `program`, the command that the wrapper will trace.
{
program
}:
stdenv.mkDerivation {
name = "strace";
preferLocalBuild = true;
# Only the installPhase is run for this derivation.
phases = [ "installPhase" ];
# Writes $out as a /bin/sh script that execs `strace -f` on the program,
# then marks it executable.
installPhase = ''
cat > $out <<EOF
#!/bin/sh
exec ${strace}/bin/strace -f ${program}
EOF
chmod +x $out
'';
}

View File

@ -11,15 +11,15 @@
}:
stdenv.mkDerivation {
name = "runexp";
name = "trebuchet";
preferLocalBuild = true;
phases = [ "unpackPhase" "installPhase" ];
dontPatchShebangs = true;
src = ./.;
inherit sshHost nixPrefix nixtools targetCluster program;
installPhase = ''
substituteAllInPlace runexp
cp runexp $out
substituteAllInPlace trebuchet
cp trebuchet $out
chmod +x $out
'';
}

View File

@ -1,4 +1,4 @@
#!/bin/sh
#!/bin/sh -ex
# @upload-to-mn@
# This program runs the current experiment in the ./result symlink in

View File

@ -28,7 +28,7 @@ let
mpich = callPackage ./bsc/mpich/default.nix { };
mpichDebug = self.mpich.override { enableDebug = true; };
mpichDebug = self.bsc.mpich.override { enableDebug = true; };
# Updated version of libpsm2: TODO push upstream.
#libpsm2 = callPackage ./bsc/libpsm2/default.nix { };
@ -207,7 +207,8 @@ let
envRecord = callPackage ./garlic/stages/envRecord.nix { };
valgrind = callPackage ./garlic/stages/valgrind.nix { };
isolate = callPackage ./garlic/stages/isolate { };
runexp = callPackage ./garlic/stages/runexp { };
trebuchet = callPackage ./garlic/stages/trebuchet { };
strace = callPackage ./garlic/stages/strace.nix { };
};
# Tests (move to bsc ?)