heat: split granularity with extended mode

The HWC version is not yet complete.
2021-04-06 18:38:15 +02:00 · 2021-04-06 18:38:15 +02:00 · d1c32869c1
commit d1c32869c1
parent 3566cf0152
1 changed files with 72 additions and 44 deletions
--- a/garlic/exp/heat/granul.nix
+++ b/garlic/exp/heat/granul.nix
@ -5,10 +5,16 @@
 , targetMachine
 , stages
 , garlicTools
+, writeText
 , enablePerf ? false
 , enableCTF ? false
+, enableHWC ? false
+, enableExtended ? false
 }:

+# TODO: Finish HWC first
+assert (enableHWC == false);
+
 with stdenv.lib;
 with garlicTools;

@ -17,10 +23,6 @@ let
  varConf = with bsc; {
    cbs = range2 32 4096;
    rbs = range2 32 4096;
-    #cbs = [ 64 256 1024 4096 ];
-    #rbs = [ 32 128 512 1024 ];
-    #cbs = [ 4096 ];
-    #rbs = [ 512 ];
  };

  machineConfig = targetMachine.config;
@ -38,12 +40,11 @@ let
    timesteps = 10;
    cols = 1024 * 16; # Columns
    rows = 1024 * 16; # Rows
-    cbs = c.cbs;
-    rbs = c.rbs;
+    inherit (c) cbs rbs;
    gitBranch = "garlic/tampi+isend+oss+task";
    
    # Repeat the execution of each unit 10 times
-    loops = 1;
+    loops = 10;

    # Resources
    qos = "debug";
@ -55,10 +56,20 @@ let
    jobName = unitName;
  };

+  filterConfigs = c: let
+    # Discard tiny blocks: they only lead to huge overheads
+    bigEnough = (c.cbs * c.rbs >= 1024);
+    # Units outside the diagonal (cbs != rbs) are only selected when the
+    # extended units are enabled.
+    onDiagonal = c.cbs == c.rbs;
+    wanted = enableExtended || onDiagonal;
+  in
+    bigEnough && wanted;
+
  # Compute the array of configurations
-  configs = stdexp.buildConfigs {
+  configs = filter (filterConfigs) (stdexp.buildConfigs {
    inherit varConf genConf;
-  };
+  });

  perf = {nextStage, conf, ...}: stages.perf {
    inherit nextStage;
@ -66,54 +77,71 @@ let
      "-e cycles,instructions,cache-references,cache-misses";
  };

-  ctf = {nextStage, conf, ...}: with conf; stages.exec {
+  ctf = {nextStage, conf, ...}: let
+    # Nanos6 config file; parenthesized since application binds tighter than "+"
+    nanos6ConfigFile = writeText "nanos6.toml" (''
+      version.instrument = "ctf"
+      turbo.enabled = false
+      instrument.ctf.converter.enabled = false
+    '' + optionalString (enableHWC) ''
+      hardware_counters.papi.enabled = true
+      hardware_counters.papi.counters = [
+        "PAPI_TOT_INS", "PAPI_TOT_CYC",
+        "PAPI_L1_TCM", "PAPI_L2_TCM", "PAPI_L3_TCM"
+      ]
+    '');
+
+  in stages.exec {
    inherit nextStage;
+
+    # And use it
    env = ''
-      export NANOS6_CONFIG_OVERRIDE="version.instrument=ctf,\
-        instrument.ctf.converter.enabled=false"
+      export NANOS6_CONFIG=${nanos6ConfigFile}
    '';
-    # Only one process converts the trace, otherwise use:
-    #  if [ $SLURM_PROCID == 0 ]; then
-    #    ...
-    #  fi
-    post = ''
-      if [ $SLURM_PROCID == 0 ]; then
-        sleep 2
-        for tracedir in trace_*; do
-          offset=$(grep 'offset =' $tracedir/ctf/ust/uid/1000/64-bit/metadata | \
-            grep -o '[0-9]*')
-          echo "offset = $offset"

-          start_time=$(awk '/^start_time / {print $2}' stdout.log)
-          end_time=$(awk '/^end_time / {print $2}' stdout.log)
+    # FIXME: We should run a hook *after* srun has ended, so we can
+    # execute it in one process only (not in N ranks). This hack works
+    # with one process only. Or be able to compute the name of the trace
+    # directory so we can begin the conversion in parallel
+    post = assert (conf.nodes * conf.ntasksPerNode == 1); ''
+      tracedir=$(ls -d trace_* | head -1)
+      echo "using tracedir=$tracedir"

-          begin=$(awk "BEGIN{print $start_time*1e9 - $offset}")
-          end=$(awk "BEGIN{print $end_time*1e9 - $offset}")
+      offset=$(grep 'offset =' $tracedir/ctf/ust/uid/1000/64-bit/metadata | \
+        grep -o '[0-9]*')
+      echo "offset = $offset"

-          echo "only events between $begin and $end"
+      start_time=$(awk '/^start_time / {print $2}' stdout.log)
+      end_time=$(awk '/^end_time / {print $2}' stdout.log)

-          ${bsc.cn6}/bin/cn6 -s $tracedir
+      begin=$(awk "BEGIN{print $start_time*1e9 - $offset}")
+      end=$(awk "BEGIN{print $end_time*1e9 - $offset}")

-          ${bsc.cn6}/bin/cut $begin $end < $tracedir/prv/trace.prv |\
-            ${bsc.cn6}/bin/hcut 1 ${toString conf.cpusPerTask} \
-            > $tracedir/prv/trace-cut.prv
+      echo "only events between $begin and $end"

-          ${bsc.cn6}/bin/dur 6400025 0 < $tracedir/prv/trace-cut.prv |\
-            awk '{s+=$1} END {print s}' >> .garlic/time_mode_dead.csv &
+      ${bsc.cn6}/bin/cn6 -s $tracedir

-          ${bsc.cn6}/bin/dur 6400025 1 < $tracedir/prv/trace-cut.prv |\
-            awk '{s+=$1} END {print s}' >> .garlic/time_mode_runtime.csv &
+      ${bsc.cn6}/bin/cut $begin $end < $tracedir/prv/trace.prv |\
+        ${bsc.cn6}/bin/hcut 1 ${toString conf.cpusPerTask} \
+        > $tracedir/prv/trace-cut.prv

-          ${bsc.cn6}/bin/dur 6400025 3 < $tracedir/prv/trace-cut.prv |\
-            awk '{s+=$1} END {print s}' >> .garlic/time_mode_task.csv &
+      ${bsc.cn6}/bin/dur 6400025 0 < $tracedir/prv/trace-cut.prv |\
+        awk '{s+=$1} END {print s}' >> .garlic/time_mode_dead.csv &

-          wait
+      ${bsc.cn6}/bin/dur 6400025 1 < $tracedir/prv/trace-cut.prv |\
+        awk '{s+=$1} END {print s}' >> .garlic/time_mode_runtime.csv &

-          # Remove the traces at the end, as they are huge
-          rm -rf $tracedir
-        done
-      fi
-    '';
+      ${bsc.cn6}/bin/dur 6400025 3 < $tracedir/prv/trace-cut.prv |\
+        awk '{s+=$1} END {print s}' >> .garlic/time_mode_task.csv &
+
+      wait
+
+      # Remove the traces at the end, as they are huge
+      rm -rf $tracedir
+      '';
+      # TODO: To enable HWC we need to first add a taskwait before the
+      # first get_time() measurement, otherwise we get the HWC of the
+      # main task, which will be huge.
  };

  exec = {nextStage, conf, ...}: stages.exec {