heat: split granularity with extended mode

The HWC version is not yet complete.
Rodrigo Arias 2021-04-06 18:38:15 +02:00
parent 3566cf0152
commit d1c32869c1


@@ -5,10 +5,16 @@
 , targetMachine
 , stages
 , garlicTools
+, writeText
 , enablePerf ? false
 , enableCTF ? false
+, enableHWC ? false
+, enableExtended ? false
 }:
 
+# TODO: Finish HWC first
+assert (enableHWC == false);
+
 with stdenv.lib;
 with garlicTools;
 
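Both switches are ordinary function arguments, so the caller decides which variant of the experiment to build; the assert makes any request for the HWC variant fail at evaluation time until it is finished. A minimal sketch of how the flags are assumed to be toggled at the call site (the callPackage wiring, path and attribute names are illustrative, not taken from this commit):

    # Hypothetical call sites for this experiment file.
    granularity         = callPackage ./heat/granularity.nix { };
    granularityExtended = callPackage ./heat/granularity.nix { enableExtended = true; };
    # Fails the assert above until HWC support is complete:
    # granularityHWC    = callPackage ./heat/granularity.nix { enableHWC = true; };
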
@@ -17,10 +23,6 @@ let
   varConf = with bsc; {
     cbs = range2 32 4096;
     rbs = range2 32 4096;
-    #cbs = [ 64 256 1024 4096 ];
-    #rbs = [ 32 128 512 1024 ];
-    #cbs = [ 4096 ];
-    #rbs = [ 512 ];
   };
 
   machineConfig = targetMachine.config;
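The hard-coded block-size lists that were commented out are dropped in favour of range2, which as used here is assumed to expand to the powers of two between its bounds (the real helper comes from garlicTools). A rough stand-in, just to show the values being swept:

    let
      # Assumed behaviour of garlicTools.range2: powers of two from `from` up to `to`.
      range2 = from: to:
        if from > to then [ ] else [ from ] ++ range2 (from * 2) to;
    in
      range2 32 4096
    # => [ 32 64 128 256 512 1024 2048 4096 ]

Under that assumption the sweep covers 8 x 8 = 64 (cbs, rbs) combinations before filtering.
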
@@ -38,12 +40,11 @@
     timesteps = 10;
     cols = 1024 * 16; # Columns
     rows = 1024 * 16; # Rows
-    cbs = c.cbs;
-    rbs = c.rbs;
+    inherit (c) cbs rbs;
     gitBranch = "garlic/tampi+isend+oss+task";
 
     # Repeat the execution of each unit 30 times
-    loops = 1;
+    loops = 10;
 
     # Resources
     qos = "debug";
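The only behavioural change in this hunk is loops going from 1 to 10; `inherit (c) cbs rbs;` is the standard Nix shorthand for copying those two attributes from c:

    let c = { cbs = 256; rbs = 512; }; in
      { inherit (c) cbs rbs; } == { cbs = c.cbs; rbs = c.rbs; }
    # => true
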
@@ -55,10 +56,20 @@ let
     jobName = unitName;
   };
 
+  filterConfigs = c: let
+    # Too small sizes lead to huge overheads
+    goodSize = (c.cbs * c.rbs >= 1024);
+    # When the extended units are not enabled, we only select those in
+    # the diagonal.
+    extended = if (enableExtended) then true
+      else c.cbs == c.rbs;
+  in
+    goodSize && extended;
+
   # Compute the array of configurations
-  configs = stdexp.buildConfigs {
+  configs = filter (filterConfigs) (stdexp.buildConfigs {
     inherit varConf genConf;
-  };
+  });
 
   perf = {nextStage, conf, ...}: stages.perf {
     inherit nextStage;
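filterConfigs implements the split announced in the commit title: a (cbs, rbs) pair is kept only if the block is not tiny (cbs * rbs >= 1024) and, unless enableExtended is set, lies on the diagonal. A self-contained sketch of the same predicate over a few hand-picked configurations (the sample values are illustrative):

    let
      enableExtended = false;
      filterConfigs = c: let
        goodSize = c.cbs * c.rbs >= 1024;
        extended = if enableExtended then true else c.cbs == c.rbs;
      in goodSize && extended;
    in builtins.filter filterConfigs [
      { cbs = 32;  rbs = 16;  }   # dropped: 32 * 16 = 512 < 1024
      { cbs = 32;  rbs = 32;  }   # kept: on the diagonal and 1024 >= 1024
      { cbs = 128; rbs = 512; }   # kept only when enableExtended = true
    ]
    # => [ { cbs = 32; rbs = 32; } ]

Assuming range2 expands as sketched earlier, the non-extended run keeps only the 8 diagonal points out of the 64 generated configurations.
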
@@ -66,54 +77,71 @@ let
       "-e cycles,instructions,cache-references,cache-misses";
   };
 
-  ctf = {nextStage, conf, ...}: with conf; stages.exec {
+  ctf = {nextStage, conf, ...}: let
+    # Create the nanos6 configuration file
+    nanos6ConfigFile = writeText "nanos6.toml" ''
+      version.instrument = "ctf"
+      turbo.enabled = false
+      instrument.ctf.converter.enabled = false
+    '' + optionalString (enableHWC) ''
+      hardware_counters.papi.enabled = true
+      hardware_counters.papi.counters = [
+        "PAPI_TOT_INS", "PAPI_TOT_CYC",
+        "PAPI_L1_TCM", "PAPI_L2_TCM", "PAPI_L3_TCM"
+      ]
+    '';
+  in stages.exec {
     inherit nextStage;
+
+    # And use it
     env = ''
-      export NANOS6_CONFIG_OVERRIDE="version.instrument=ctf,\
-        instrument.ctf.converter.enabled=false"
+      export NANOS6_CONFIG=${nanos6ConfigFile}
     '';
-    # Only one process converts the trace, otherwise use:
-    # if [ $SLURM_PROCID == 0 ]; then
-    # ...
-    # fi
-    post = ''
-      if [ $SLURM_PROCID == 0 ]; then
-        sleep 2
-        for tracedir in trace_*; do
-          offset=$(grep 'offset =' $tracedir/ctf/ust/uid/1000/64-bit/metadata | \
-            grep -o '[0-9]*')
-          echo "offset = $offset"
-          start_time=$(awk '/^start_time / {print $2}' stdout.log)
-          end_time=$(awk '/^end_time / {print $2}' stdout.log)
-
-          begin=$(awk "BEGIN{print $start_time*1e9 - $offset}")
-          end=$(awk "BEGIN{print $end_time*1e9 - $offset}")
-
-          echo "only events between $begin and $end"
-
-          ${bsc.cn6}/bin/cn6 -s $tracedir
-
-          ${bsc.cn6}/bin/cut $begin $end < $tracedir/prv/trace.prv |\
-            ${bsc.cn6}/bin/hcut 1 ${toString conf.cpusPerTask} \
-            > $tracedir/prv/trace-cut.prv
-
-          ${bsc.cn6}/bin/dur 6400025 0 < $tracedir/prv/trace-cut.prv |\
-            awk '{s+=$1} END {print s}' >> .garlic/time_mode_dead.csv &
-
-          ${bsc.cn6}/bin/dur 6400025 1 < $tracedir/prv/trace-cut.prv |\
-            awk '{s+=$1} END {print s}' >> .garlic/time_mode_runtime.csv &
-
-          ${bsc.cn6}/bin/dur 6400025 3 < $tracedir/prv/trace-cut.prv |\
-            awk '{s+=$1} END {print s}' >> .garlic/time_mode_task.csv &
-
-          wait
-
-          # Remove the traces at the end, as they are huge
-          rm -rf $tracedir
-        done
-      fi
-    '';
+
+    # FIXME: We should run a hook *after* srun has ended, so we can
+    # execute it in one process only (not in N ranks). This hack works
+    # with one process only. Or be able to compute the name of the trace
+    # directory so we can begin the conversion in parallel
+    post = assert (conf.nodes * conf.ntasksPerNode == 1); ''
+      tracedir=$(ls -d trace_* | head -1)
+      echo "using tracedir=$tracedir"
+
+      offset=$(grep 'offset =' $tracedir/ctf/ust/uid/1000/64-bit/metadata | \
+        grep -o '[0-9]*')
+      echo "offset = $offset"
+
+      start_time=$(awk '/^start_time / {print $2}' stdout.log)
+      end_time=$(awk '/^end_time / {print $2}' stdout.log)
+
+      begin=$(awk "BEGIN{print $start_time*1e9 - $offset}")
+      end=$(awk "BEGIN{print $end_time*1e9 - $offset}")
+
+      echo "only events between $begin and $end"
+
+      ${bsc.cn6}/bin/cn6 -s $tracedir
+
+      ${bsc.cn6}/bin/cut $begin $end < $tracedir/prv/trace.prv |\
+        ${bsc.cn6}/bin/hcut 1 ${toString conf.cpusPerTask} \
+        > $tracedir/prv/trace-cut.prv
+
+      ${bsc.cn6}/bin/dur 6400025 0 < $tracedir/prv/trace-cut.prv |\
+        awk '{s+=$1} END {print s}' >> .garlic/time_mode_dead.csv &
+
+      ${bsc.cn6}/bin/dur 6400025 1 < $tracedir/prv/trace-cut.prv |\
+        awk '{s+=$1} END {print s}' >> .garlic/time_mode_runtime.csv &
+
+      ${bsc.cn6}/bin/dur 6400025 3 < $tracedir/prv/trace-cut.prv |\
+        awk '{s+=$1} END {print s}' >> .garlic/time_mode_task.csv &
+
+      wait
+
+      # Remove the traces at the end, as they are huge
+      rm -rf $tracedir
+    '';
+    # TODO: To enable HWC we need to first add a taskwait before the
+    # first get_time() measurement, otherwise we get the HWC of the
+    # main task, which will be huge.
   };
 
   exec = {nextStage, conf, ...}: stages.exec {
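The CTF stage now writes a real nanos6.toml with writeText and points NANOS6_CONFIG at it, instead of packing everything into NANOS6_CONFIG_OVERRIDE; the PAPI section is appended only when enableHWC is set, which the assert at the top still forbids. A sketch of the string the generated file is assumed to contain once HWC is allowed, evaluated outside the package (this optionalString is a local stand-in for the lib function):

    let
      enableHWC = true;
      optionalString = cond: s: if cond then s else "";
    in ''
      version.instrument = "ctf"
      turbo.enabled = false
      instrument.ctf.converter.enabled = false
    '' + optionalString enableHWC ''
      hardware_counters.papi.enabled = true
      hardware_counters.papi.counters = [
        "PAPI_TOT_INS", "PAPI_TOT_CYC",
        "PAPI_L1_TCM", "PAPI_L2_TCM", "PAPI_L3_TCM"
      ]
    ''
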