forked from rarias/bscpkgs
stages: add baywatch stage to check the exit code
This workaround stage prevents srun from returning 0 to the upper stages when a signal arrives after MPI_Finalize. It writes the return code to a file named .srun.rc.$rank, and the srun stage later checks that the file exists and contains a 0. When the program is killed, it exits with a non-zero code and the error is propagated to the baywatch stage, which aborts immediately without creating the rc file.
This commit is contained in:
26
garlic/stages/baywatch.nix
Normal file
26
garlic/stages/baywatch.nix
Normal file
@@ -0,0 +1,26 @@
|
||||
# Baywatch stage.
#
# Builds a small wrapper script that runs the next stage and, only when
# that stage succeeds, records its exit status in a per-rank file named
# .srun.rc.<SLURM_PROCID> in the current working directory. A later check
# can then tell whether every rank really finished with status 0.
#
# Arguments:
#   stdenv, garlicTools - provided by the package set; garlicTools is
#                         brought into scope for stageProgram.
#   nextStage           - the stage whose program is executed by the wrapper.
{
  stdenv
, garlicTools
}:

{
  nextStage
}:

with garlicTools;

stdenv.mkDerivation rec {
  name = "baywatch";
  phases = [ "installPhase" ];
  preferLocalBuild = true;
  # Leave the /bin/sh shebang of the generated script untouched.
  dontPatchShebangs = true;

  # The heredoc delimiter is quoted so the build shell copies the script
  # text verbatim; only the Nix antiquotation for the next stage program
  # is substituted, at evaluation time.
  installPhase = ''
    cat > "$out" <<'EOF'
    #!/bin/sh
    # Abort on the first failing command. This is what guarantees that the
    # status line below is written only when the next stage exits with 0.
    # It is set here rather than in the shebang so it also applies when the
    # script is run through an explicit interpreter invocation.
    set -e

    # Discard any status left behind by an earlier run in this directory.
    # Otherwise a stale 0 would make a killed run look successful.
    # Only builtins are used, so nothing extra is needed in PATH.
    : > ".srun.rc.$SLURM_PROCID"

    ${stageProgram nextStage}
    # Must stay directly after the program so that $? is still its status.
    echo $? > ".srun.rc.$SLURM_PROCID"

    EOF
    chmod +x "$out"
  '';
}
|
||||
@@ -35,6 +35,21 @@ stdenv.mkDerivation rec {
|
||||
${srunOptions} \
|
||||
${nixPrefix}${stageProgram nextStage}
|
||||
|
||||
>&2 echo srun exit code: $?
|
||||
|
||||
# Ensure that none failed, as srun fails to capture errors
|
||||
# after MPI_Finalize
|
||||
for i in $(seq 0 $(($SLURM_NTASKS - 1))); do
|
||||
if [ ! -e .srun.rc.$i ]; then
|
||||
>&2 echo "missing exit code for rank $i, aborting"
|
||||
exit 1
|
||||
fi
|
||||
if ! grep -q '^0$' .srun.rc.$i; then
|
||||
>&2 echo "non-zero exit for rank $i, aborting"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
${postSrun}
|
||||
EOF
|
||||
|
||||
|
||||
Reference in New Issue
Block a user