stages: add baywatch stage to check the exit code

This workaround stage prevents srun from returning 0 to the upper stages
when a signal happens after MPI_Finalize. It writes the return code to a
file named .srun.rc.$rank and later checks that the file exists and
contains a 0.

When the program is killed, it exits with a non-zero code and the error
is propagated to the baywatch stage, which aborts immediately without
creating the rc file.
This commit is contained in:
2021-04-06 15:23:26 +02:00
parent 604cfd90a3
commit 71c06d02da
4 changed files with 47 additions and 1 deletions

View File

@@ -99,13 +99,17 @@ rec {
inherit nextStage;
}
);
# Standard-stage wrapper for baywatch: takes the stage that follows it
# (`nextStage`), ignores any other arguments, and hands it to
# `stages.baywatch`.
# NOTE(review): per the commit description this stage exists to check the
# exit code around srun; the implementation of `stages.baywatch` is not
# visible here -- confirm against its definition.
baywatch = {nextStage, ...}: stages.baywatch {
inherit nextStage;
};
};
# Builds the standard pipeline as an ordered list of stages.
# `overrides` (default: empty set) is merged on top of `stdStages`, so an
# entry in `overrides` replaces the standard stage with the same name.
stdPipelineOverride = {overrides ? {}}:
let
stages = stdStages // overrides;
in
with stages; [ sbatch isolate control srun isolate ];
with stages; [ sbatch isolate control srun isolate baywatch ];
# Standard pipeline with no stage overrides applied.
stdPipeline = stdPipelineOverride {};