heat: split granularity with extended mode
The hardware counters (HWC) version is not yet complete.
This commit is contained in:
		
							parent
							
								
									3566cf0152
								
							
						
					
					
						commit
						d1c32869c1
					
				| @ -5,10 +5,16 @@ | |||||||
| , targetMachine | , targetMachine | ||||||
| , stages | , stages | ||||||
| , garlicTools | , garlicTools | ||||||
|  | , writeText | ||||||
| , enablePerf ? false | , enablePerf ? false | ||||||
| , enableCTF ? false | , enableCTF ? false | ||||||
|  | , enableHWC ? false | ||||||
|  | , enableExtended ? false | ||||||
| }: | }: | ||||||
| 
 | 
 | ||||||
|  | # TODO: Finish HWC first | ||||||
|  | assert (enableHWC == false); | ||||||
|  | 
 | ||||||
| with stdenv.lib; | with stdenv.lib; | ||||||
| with garlicTools; | with garlicTools; | ||||||
| 
 | 
 | ||||||
| @ -17,10 +23,6 @@ let | |||||||
|   varConf = with bsc; { |   varConf = with bsc; { | ||||||
|     cbs = range2 32 4096; |     cbs = range2 32 4096; | ||||||
|     rbs = range2 32 4096; |     rbs = range2 32 4096; | ||||||
|     #cbs = [ 64 256 1024 4096 ]; |  | ||||||
|     #rbs = [ 32 128 512 1024 ]; |  | ||||||
|     #cbs = [ 4096 ]; |  | ||||||
|     #rbs = [ 512 ]; |  | ||||||
|   }; |   }; | ||||||
| 
 | 
 | ||||||
|   machineConfig = targetMachine.config; |   machineConfig = targetMachine.config; | ||||||
| @ -38,12 +40,11 @@ let | |||||||
|     timesteps = 10; |     timesteps = 10; | ||||||
|     cols = 1024 * 16; # Columns |     cols = 1024 * 16; # Columns | ||||||
|     rows = 1024 * 16; # Rows |     rows = 1024 * 16; # Rows | ||||||
|     cbs = c.cbs; |     inherit (c) cbs rbs; | ||||||
|     rbs = c.rbs; |  | ||||||
|     gitBranch = "garlic/tampi+isend+oss+task"; |     gitBranch = "garlic/tampi+isend+oss+task"; | ||||||
|      |      | ||||||
|     # Repeat the execution of each unit `loops` times |     # Repeat the execution of each unit `loops` times | ||||||
|     loops = 1; |     loops = 10; | ||||||
| 
 | 
 | ||||||
|     # Resources |     # Resources | ||||||
|     qos = "debug"; |     qos = "debug"; | ||||||
| @ -55,10 +56,20 @@ let | |||||||
|     jobName = unitName; |     jobName = unitName; | ||||||
|   }; |   }; | ||||||
| 
 | 
 | ||||||
|  |   filterConfigs = c: let | ||||||
|  |     # Sizes that are too small lead to huge overheads | ||||||
|  |     goodSize = (c.cbs * c.rbs >= 1024); | ||||||
|  |     # When the extended units are not enabled, we only select those on | ||||||
|  |     # the diagonal (cbs == rbs). | ||||||
|  |     extended = if (enableExtended) then true | ||||||
|  |       else c.cbs == c.rbs; | ||||||
|  |   in | ||||||
|  |     goodSize && extended; | ||||||
|  | 
 | ||||||
|   # Compute the array of configurations |   # Compute the array of configurations | ||||||
|   configs = stdexp.buildConfigs { |   configs = filter (filterConfigs) (stdexp.buildConfigs { | ||||||
|     inherit varConf genConf; |     inherit varConf genConf; | ||||||
|   }; |   }); | ||||||
| 
 | 
 | ||||||
|   perf = {nextStage, conf, ...}: stages.perf { |   perf = {nextStage, conf, ...}: stages.perf { | ||||||
|     inherit nextStage; |     inherit nextStage; | ||||||
| @ -66,54 +77,71 @@ let | |||||||
|       "-e cycles,instructions,cache-references,cache-misses"; |       "-e cycles,instructions,cache-references,cache-misses"; | ||||||
|   }; |   }; | ||||||
| 
 | 
 | ||||||
|   ctf = {nextStage, conf, ...}: with conf; stages.exec { |   ctf = {nextStage, conf, ...}: let | ||||||
|  |     # Create the nanos6 configuration file | ||||||
|  |     nanos6ConfigFile = writeText "nanos6.toml" '' | ||||||
|  |       version.instrument = "ctf" | ||||||
|  |       turbo.enabled = false | ||||||
|  |       instrument.ctf.converter.enabled = false | ||||||
|  |     '' + optionalString (enableHWC) '' | ||||||
|  |       hardware_counters.papi.enabled = true | ||||||
|  |       hardware_counters.papi.counters = [ | ||||||
|  |         "PAPI_TOT_INS", "PAPI_TOT_CYC", | ||||||
|  |         "PAPI_L1_TCM", "PAPI_L2_TCM", "PAPI_L3_TCM" | ||||||
|  |       ] | ||||||
|  |     ''; | ||||||
|  | 
 | ||||||
|  |   in stages.exec { | ||||||
|     inherit nextStage; |     inherit nextStage; | ||||||
|  | 
 | ||||||
|  |     # And use it | ||||||
|     env = '' |     env = '' | ||||||
|       export NANOS6_CONFIG_OVERRIDE="version.instrument=ctf,\ |       export NANOS6_CONFIG=${nanos6ConfigFile} | ||||||
|         instrument.ctf.converter.enabled=false" |  | ||||||
|     ''; |     ''; | ||||||
|     # Only one process converts the trace, otherwise use: |  | ||||||
|     #  if [ $SLURM_PROCID == 0 ]; then |  | ||||||
|     #    ... |  | ||||||
|     #  fi |  | ||||||
|     post = '' |  | ||||||
|       if [ $SLURM_PROCID == 0 ]; then |  | ||||||
|         sleep 2 |  | ||||||
|         for tracedir in trace_*; do |  | ||||||
|           offset=$(grep 'offset =' $tracedir/ctf/ust/uid/1000/64-bit/metadata | \ |  | ||||||
|             grep -o '[0-9]*') |  | ||||||
|           echo "offset = $offset" |  | ||||||
| 
 | 
 | ||||||
|           start_time=$(awk '/^start_time / {print $2}' stdout.log) |     # FIXME: We should run a hook *after* srun has ended, so we can | ||||||
|           end_time=$(awk '/^end_time / {print $2}' stdout.log) |     # execute it in one process only (not in N ranks). This hack works | ||||||
|  |     # with one process only. Alternatively, we could compute the name of the | ||||||
|  |     # trace directory so that the conversion can begin in parallel. | ||||||
|  |     post = assert (conf.nodes * conf.ntasksPerNode == 1); '' | ||||||
|  |       tracedir=$(ls -d trace_* | head -1) | ||||||
|  |       echo "using tracedir=$tracedir" | ||||||
| 
 | 
 | ||||||
|           begin=$(awk "BEGIN{print $start_time*1e9 - $offset}") |       offset=$(grep 'offset =' $tracedir/ctf/ust/uid/1000/64-bit/metadata | \ | ||||||
|           end=$(awk "BEGIN{print $end_time*1e9 - $offset}") |         grep -o '[0-9]*') | ||||||
|  |       echo "offset = $offset" | ||||||
| 
 | 
 | ||||||
|           echo "only events between $begin and $end" |       start_time=$(awk '/^start_time / {print $2}' stdout.log) | ||||||
|  |       end_time=$(awk '/^end_time / {print $2}' stdout.log) | ||||||
| 
 | 
 | ||||||
|           ${bsc.cn6}/bin/cn6 -s $tracedir |       begin=$(awk "BEGIN{print $start_time*1e9 - $offset}") | ||||||
|  |       end=$(awk "BEGIN{print $end_time*1e9 - $offset}") | ||||||
| 
 | 
 | ||||||
|           ${bsc.cn6}/bin/cut $begin $end < $tracedir/prv/trace.prv |\ |       echo "only events between $begin and $end" | ||||||
|             ${bsc.cn6}/bin/hcut 1 ${toString conf.cpusPerTask} \ |  | ||||||
|             > $tracedir/prv/trace-cut.prv |  | ||||||
| 
 | 
 | ||||||
|           ${bsc.cn6}/bin/dur 6400025 0 < $tracedir/prv/trace-cut.prv |\ |       ${bsc.cn6}/bin/cn6 -s $tracedir | ||||||
|             awk '{s+=$1} END {print s}' >> .garlic/time_mode_dead.csv & |  | ||||||
| 
 | 
 | ||||||
|           ${bsc.cn6}/bin/dur 6400025 1 < $tracedir/prv/trace-cut.prv |\ |       ${bsc.cn6}/bin/cut $begin $end < $tracedir/prv/trace.prv |\ | ||||||
|             awk '{s+=$1} END {print s}' >> .garlic/time_mode_runtime.csv & |         ${bsc.cn6}/bin/hcut 1 ${toString conf.cpusPerTask} \ | ||||||
|  |         > $tracedir/prv/trace-cut.prv | ||||||
| 
 | 
 | ||||||
|           ${bsc.cn6}/bin/dur 6400025 3 < $tracedir/prv/trace-cut.prv |\ |       ${bsc.cn6}/bin/dur 6400025 0 < $tracedir/prv/trace-cut.prv |\ | ||||||
|             awk '{s+=$1} END {print s}' >> .garlic/time_mode_task.csv & |         awk '{s+=$1} END {print s}' >> .garlic/time_mode_dead.csv & | ||||||
| 
 | 
 | ||||||
|           wait |       ${bsc.cn6}/bin/dur 6400025 1 < $tracedir/prv/trace-cut.prv |\ | ||||||
|  |         awk '{s+=$1} END {print s}' >> .garlic/time_mode_runtime.csv & | ||||||
| 
 | 
 | ||||||
|           # Remove the traces at the end, as they are huge |       ${bsc.cn6}/bin/dur 6400025 3 < $tracedir/prv/trace-cut.prv |\ | ||||||
|           rm -rf $tracedir |         awk '{s+=$1} END {print s}' >> .garlic/time_mode_task.csv & | ||||||
|         done | 
 | ||||||
|       fi |       wait | ||||||
|     ''; | 
 | ||||||
|  |       # Remove the traces at the end, as they are huge | ||||||
|  |       rm -rf $tracedir | ||||||
|  |       ''; | ||||||
|  |       # TODO: To enable HWC we need to first add a taskwait before the | ||||||
|  |       # first get_time() measurement, otherwise we get the HWC of the | ||||||
|  |       # main task, which will be huge. | ||||||
|   }; |   }; | ||||||
| 
 | 
 | ||||||
|   exec = {nextStage, conf, ...}: stages.exec { |   exec = {nextStage, conf, ...}: stages.exec { | ||||||
|  | |||||||
		Reference in New Issue
	
	Block a user