forked from rarias/bscpkgs
		
	stages: add baywatch stage to check the exit code
This workaround stage prevents srun from returning 0 to the upper stages when a signal arrives after MPI_Finalize. The baywatch stage writes the return code of the program to a file named .srun.rc.$rank; after srun returns, the file for every rank is checked to exist and to contain a 0. When the program is killed, it exits with a non-zero code and the error is propagated to the baywatch stage, which aborts immediately without creating the rc file.
This commit is contained in:
		
							parent
							
								
									604cfd90a3
								
							
						
					
					
						commit
						71c06d02da
					
				| @ -76,6 +76,7 @@ | ||||
|   stages = { | ||||
|     sbatch     = callPackage ./stages/sbatch.nix { }; | ||||
|     srun       = callPackage ./stages/srun.nix { }; | ||||
|     baywatch   = callPackage ./stages/baywatch.nix { }; | ||||
|     control    = callPackage ./stages/control.nix { }; | ||||
|     exec       = callPackage ./stages/exec.nix { }; | ||||
|     script     = callPackage ./stages/script.nix { }; | ||||
|  | ||||
							
								
								
									
										26
									
								
								garlic/stages/baywatch.nix
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								garlic/stages/baywatch.nix
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,26 @@ | ||||
| { | ||||
|   stdenv | ||||
| , garlicTools | ||||
| }: | ||||
| { | ||||
|   nextStage | ||||
| }: | ||||
| 
 | ||||
| with garlicTools; | ||||
| 
 | ||||
| stdenv.mkDerivation rec { | ||||
|   name = "baywatch"; | ||||
|   phases = [ "installPhase" ]; | ||||
|   preferLocalBuild = true; | ||||
|   dontPatchShebangs = true; | ||||
|   installPhase = '' | ||||
|     cat > $out <<'EOF' | ||||
|     #!/bin/sh -e | ||||
| 
 | ||||
|     ${stageProgram nextStage} | ||||
|     echo $? >> .srun.rc.$SLURM_PROCID | ||||
| 
 | ||||
|     EOF | ||||
|     chmod +x $out | ||||
|   ''; | ||||
| } | ||||
| @ -35,6 +35,21 @@ stdenv.mkDerivation rec { | ||||
|       ${srunOptions} \ | ||||
|       ${nixPrefix}${stageProgram nextStage} | ||||
| 
 | ||||
|     >&2 echo srun exit code: $? | ||||
| 
 | ||||
|     # Ensure that none failed, as srun fails to capture errors | ||||
|     # after MPI_Finalize | ||||
|     for i in $(seq 0 $(($SLURM_NTASKS - 1))); do | ||||
|       if [ ! -e .srun.rc.$i ]; then | ||||
|         >&2 echo "missing exit code for rank $i, aborting" | ||||
|         exit 1 | ||||
|       fi | ||||
|       if ! grep -q '^0$' .srun.rc.$i; then | ||||
|         >&2 echo "non-zero exit for rank $i, aborting" | ||||
|         exit 1 | ||||
|       fi | ||||
|     done | ||||
| 
 | ||||
|     ${postSrun} | ||||
|     EOF | ||||
| 
 | ||||
|  | ||||
| @ -99,13 +99,17 @@ rec { | ||||
|         inherit nextStage; | ||||
|       } | ||||
|     ); | ||||
| 
 | ||||
|     baywatch = {nextStage, ...}: stages.baywatch { | ||||
|       inherit nextStage; | ||||
|     }; | ||||
|   }; | ||||
| 
 | ||||
|   stdPipelineOverride = {overrides ? {}}: | ||||
|   let | ||||
|     stages = stdStages // overrides; | ||||
|   in | ||||
|     with stages; [ sbatch isolate control srun isolate ]; | ||||
|     with stages; [ sbatch isolate control srun isolate baywatch ]; | ||||
| 
 | ||||
| 
 | ||||
|   stdPipeline = stdPipelineOverride {}; | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user