stages: add baywatch stage to check the exit code
This workaround stage prevents srun from returning 0 to the upper stages when a signal arrives after MPI_Finalize. It writes the return code to a file named .srun.rc.$rank, and it is later checked that the file exists and contains a 0. When the program is killed, it exits with a non-zero code and the error is propagated to the baywatch stage, which aborts immediately without creating the rc file.
This commit is contained in:
		
							parent
							
								
									604cfd90a3
								
							
						
					
					
						commit
						71c06d02da
					
				| @ -76,6 +76,7 @@ | |||||||
|   stages = { |   stages = { | ||||||
|     sbatch     = callPackage ./stages/sbatch.nix { }; |     sbatch     = callPackage ./stages/sbatch.nix { }; | ||||||
|     srun       = callPackage ./stages/srun.nix { }; |     srun       = callPackage ./stages/srun.nix { }; | ||||||
|  |     baywatch   = callPackage ./stages/baywatch.nix { }; | ||||||
|     control    = callPackage ./stages/control.nix { }; |     control    = callPackage ./stages/control.nix { }; | ||||||
|     exec       = callPackage ./stages/exec.nix { }; |     exec       = callPackage ./stages/exec.nix { }; | ||||||
|     script     = callPackage ./stages/script.nix { }; |     script     = callPackage ./stages/script.nix { }; | ||||||
|  | |||||||
							
								
								
									
										26
									
								
								garlic/stages/baywatch.nix
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								garlic/stages/baywatch.nix
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,26 @@ | |||||||
|  | { | ||||||
|  |   stdenv | ||||||
|  | , garlicTools | ||||||
|  | }: | ||||||
|  | { | ||||||
|  |   nextStage | ||||||
|  | }: | ||||||
|  | 
 | ||||||
|  | with garlicTools; | ||||||
|  | 
 | ||||||
|  | stdenv.mkDerivation rec { | ||||||
|  |   name = "baywatch"; | ||||||
|  |   phases = [ "installPhase" ]; | ||||||
|  |   preferLocalBuild = true; | ||||||
|  |   dontPatchShebangs = true; | ||||||
|  |   installPhase = '' | ||||||
|  |     cat > $out <<'EOF' | ||||||
|  |     #!/bin/sh -e | ||||||
|  | 
 | ||||||
|  |     ${stageProgram nextStage} | ||||||
|  |     echo $? >> .srun.rc.$SLURM_PROCID | ||||||
|  | 
 | ||||||
|  |     EOF | ||||||
|  |     chmod +x $out | ||||||
|  |   ''; | ||||||
|  | } | ||||||
| @ -35,6 +35,21 @@ stdenv.mkDerivation rec { | |||||||
|       ${srunOptions} \ |       ${srunOptions} \ | ||||||
|       ${nixPrefix}${stageProgram nextStage} |       ${nixPrefix}${stageProgram nextStage} | ||||||
| 
 | 
 | ||||||
|  |     >&2 echo srun exit code: $? | ||||||
|  | 
 | ||||||
|  |     # Ensure that none failed, as srun fails to capture errors | ||||||
|  |     # after MPI_Finalize | ||||||
|  |     for i in $(seq 0 $(($SLURM_NTASKS - 1))); do | ||||||
|  |       if [ ! -e .srun.rc.$i ]; then | ||||||
|  |         >&2 echo "missing exit code for rank $i, aborting" | ||||||
|  |         exit 1 | ||||||
|  |       fi | ||||||
|  |       if ! grep -q '^0$' .srun.rc.$i; then | ||||||
|  |         >&2 echo "non-zero exit for rank $i, aborting" | ||||||
|  |         exit 1 | ||||||
|  |       fi | ||||||
|  |     done | ||||||
|  | 
 | ||||||
|     ${postSrun} |     ${postSrun} | ||||||
|     EOF |     EOF | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -99,13 +99,17 @@ rec { | |||||||
|         inherit nextStage; |         inherit nextStage; | ||||||
|       } |       } | ||||||
|     ); |     ); | ||||||
|  | 
 | ||||||
|  |     baywatch = {nextStage, ...}: stages.baywatch { | ||||||
|  |       inherit nextStage; | ||||||
|  |     }; | ||||||
|   }; |   }; | ||||||
| 
 | 
 | ||||||
|   stdPipelineOverride = {overrides ? {}}: |   stdPipelineOverride = {overrides ? {}}: | ||||||
|   let |   let | ||||||
|     stages = stdStages // overrides; |     stages = stdStages // overrides; | ||||||
|   in |   in | ||||||
|     with stages; [ sbatch isolate control srun isolate ]; |     with stages; [ sbatch isolate control srun isolate baywatch ]; | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|   stdPipeline = stdPipelineOverride {}; |   stdPipeline = stdPipelineOverride {}; | ||||||
|  | |||||||
		Reference in New Issue
	
	Block a user