stages: add baywatch stage to check the exit code
This workaround stage prevents srun from returning 0 to the upper stages when a signal arrives after MPI_Finalize. It writes the return code to a file named .srun.rc.$rank, which is later checked to ensure that it exists and contains a 0. When the program is killed, it exits with a non-zero code and the error is propagated to the baywatch stage, which aborts immediately without creating the rc file.
This commit is contained in:
parent
604cfd90a3
commit
71c06d02da
@ -76,6 +76,7 @@
|
|||||||
stages = {
|
stages = {
|
||||||
sbatch = callPackage ./stages/sbatch.nix { };
|
sbatch = callPackage ./stages/sbatch.nix { };
|
||||||
srun = callPackage ./stages/srun.nix { };
|
srun = callPackage ./stages/srun.nix { };
|
||||||
|
baywatch = callPackage ./stages/baywatch.nix { };
|
||||||
control = callPackage ./stages/control.nix { };
|
control = callPackage ./stages/control.nix { };
|
||||||
exec = callPackage ./stages/exec.nix { };
|
exec = callPackage ./stages/exec.nix { };
|
||||||
script = callPackage ./stages/script.nix { };
|
script = callPackage ./stages/script.nix { };
|
||||||
|
26
garlic/stages/baywatch.nix
Normal file
26
garlic/stages/baywatch.nix
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
{
|
||||||
|
stdenv
|
||||||
|
, garlicTools
|
||||||
|
}:
|
||||||
|
{
|
||||||
|
nextStage
|
||||||
|
}:
|
||||||
|
|
||||||
|
with garlicTools;
|
||||||
|
|
||||||
|
stdenv.mkDerivation rec {
|
||||||
|
name = "baywatch";
|
||||||
|
phases = [ "installPhase" ];
|
||||||
|
preferLocalBuild = true;
|
||||||
|
dontPatchShebangs = true;
|
||||||
|
installPhase = ''
|
||||||
|
cat > $out <<'EOF'
|
||||||
|
#!/bin/sh -e
|
||||||
|
|
||||||
|
${stageProgram nextStage}
|
||||||
|
echo $? >> .srun.rc.$SLURM_PROCID
|
||||||
|
|
||||||
|
EOF
|
||||||
|
chmod +x $out
|
||||||
|
'';
|
||||||
|
}
|
@ -35,6 +35,21 @@ stdenv.mkDerivation rec {
|
|||||||
${srunOptions} \
|
${srunOptions} \
|
||||||
${nixPrefix}${stageProgram nextStage}
|
${nixPrefix}${stageProgram nextStage}
|
||||||
|
|
||||||
|
>&2 echo srun exit code: $?
|
||||||
|
|
||||||
|
# Ensure that none failed, as srun fails to capture errors
|
||||||
|
# after MPI_Finalize
|
||||||
|
for i in $(seq 0 $(($SLURM_NTASKS - 1))); do
|
||||||
|
if [ ! -e .srun.rc.$i ]; then
|
||||||
|
>&2 echo "missing exit code for rank $i, aborting"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
if ! grep -q '^0$' .srun.rc.$i; then
|
||||||
|
>&2 echo "non-zero exit for rank $i, aborting"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
${postSrun}
|
${postSrun}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
|
@ -99,13 +99,17 @@ rec {
|
|||||||
inherit nextStage;
|
inherit nextStage;
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
|
|
||||||
|
baywatch = {nextStage, ...}: stages.baywatch {
|
||||||
|
inherit nextStage;
|
||||||
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
stdPipelineOverride = {overrides ? {}}:
|
stdPipelineOverride = {overrides ? {}}:
|
||||||
let
|
let
|
||||||
stages = stdStages // overrides;
|
stages = stdStages // overrides;
|
||||||
in
|
in
|
||||||
with stages; [ sbatch isolate control srun isolate ];
|
with stages; [ sbatch isolate control srun isolate baywatch ];
|
||||||
|
|
||||||
|
|
||||||
stdPipeline = stdPipelineOverride {};
|
stdPipeline = stdPipelineOverride {};
|
||||||
|
Loading…
Reference in New Issue
Block a user