fwi: adjust input size to meet timing constraints

The previous input sizes for both the granularity and strong scaling tests
were too big to meet the timing constraints needed for garlic. This
patch sets a new, smaller input size.

Also, a minor cleanup is applied to the rest of the fwi experiments
and figures.
Author: Aleix Roca Nonell  2021-04-07 12:35:44 +02:00
parent 3e5a56ebdb
commit 989f6ee018
14 changed files with 96 additions and 267 deletions


@@ -34,6 +34,7 @@ stdenv.mkDerivation rec {
   # FIXME: Allow multiple MPI implementations
   postPatch = ''
     sed -i 's/= OPENMPI$/= INTEL/g' Makefile
+    sed -i 's/USE_O_DIRECT ?= NO/USE_O_DIRECT ?= YES/g' Makefile || true
   '';
   # FIXME: This is an ugly hack.


@@ -1,3 +1,23 @@
+# This test compares an FWI version using poor data locality (+NOREUSE) versus
+# the optimized version (used for all other experiments). A pseudocode snippet
+# follows, illustrating the fundamental difference between the two versions.
+#
+# NOREUSE
+# ----------------------
+# for (y) for (x) for (z)
+#   computA(v[y][x][z]);
+# for (y) for (x) for (z)
+#   computB(v[y][x][z]);
+# for (y) for (x) for (z)
+#   computC(v[y][x][z]);
+#
+# Optimized version
+# ----------------------
+# for (y) for (x) for (z)
+#   computA(v[y][x][z]);
+#   computB(v[y][x][z]);
+#   computC(v[y][x][z]);
+
 {
   stdenv
 , stdexp
@@ -15,34 +35,14 @@ let
   # Initial variable configuration
   varConf = {
     gitBranch = [
-      # "garlic/tampi+send+oss+task"
-      # "garlic/mpi+send+omp+task"
       "garlic/mpi+send+oss+task"
-      "garlic/mpi+send+oss+task+noreuse"
-      # "garlic/mpi+send+seq"
-      # "garlic/oss+task"
-      # "garlic/omp+task"
-      # "garlic/seq"
+      "garlic/mpi+send+oss+task+NOREUSE"
     ];
     blocksize = [ 1 2 4 8 ];
-    #blocksize = [ 1 2 ];
     n = [
-      # {nx=50; ny=4000; nz=50;}
-      # {nx=20; ny=4000; nz=20;}
-      # {nx=300; ny=8000; nz=300;} # half node, /
-      # {nx=300; ny=1000; nz=300;} # half node, /
-      # {nx=200; ny=1000; nz=200;} # half node, not enough tasks
-      # {nx=200; ny=4000; nz=200;} # --/ half node
-      # {nx=250; ny=2000; nz=250;} # / half node
       {nx=300; ny=2000; nz=300;} # / half node
-      # {nx=100; ny=2000; nz=100;} # \-// half node
-      # {nx=150; ny=2000; nz=150;} # \-/ half node
-      # {nx=200; ny=64000; nz=200;} # --/ 16 nodes
-      # {nx=200; ny=4000; nz=200;} # --/ half node
-      # {nx=200; ny=8000; nz=200;} # --/ 1 node
-      # {nx=100; ny=8000; nz=100;} # --/ half node
     ];
   };
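For readers unfamiliar with the loop-fusion idea sketched in the comment above, the following is a minimal, self-contained C illustration. The array v, its sizes and the computeA/B/C kernels are placeholders, not the actual FWI code; only the traversal structure mirrors the pseudocode.

    #include <stddef.h>

    #define NY 8
    #define NX 8
    #define NZ 8

    static float v[NY][NX][NZ];

    /* Placeholder kernels standing in for the three FWI stencil phases. */
    static void computeA(float *p) { *p += 1.0f; }
    static void computeB(float *p) { *p *= 2.0f; }
    static void computeC(float *p) { *p -= 0.5f; }

    /* NOREUSE: three full sweeps over the volume, so every element is
     * brought into cache three times. */
    static void step_noreuse(void)
    {
        for (size_t y = 0; y < NY; y++)
            for (size_t x = 0; x < NX; x++)
                for (size_t z = 0; z < NZ; z++)
                    computeA(&v[y][x][z]);
        for (size_t y = 0; y < NY; y++)
            for (size_t x = 0; x < NX; x++)
                for (size_t z = 0; z < NZ; z++)
                    computeB(&v[y][x][z]);
        for (size_t y = 0; y < NY; y++)
            for (size_t x = 0; x < NX; x++)
                for (size_t z = 0; z < NZ; z++)
                    computeC(&v[y][x][z]);
    }

    /* Optimized: the three phases are fused, so each element is loaded once
     * and reused while it is still hot in cache. */
    static void step_fused(void)
    {
        for (size_t y = 0; y < NY; y++)
            for (size_t x = 0; x < NX; x++)
                for (size_t z = 0; z < NZ; z++) {
                    computeA(&v[y][x][z]);
                    computeB(&v[y][x][z]);
                    computeC(&v[y][x][z]);
                }
    }

    int main(void)
    {
        step_noreuse();
        step_fused();
        return 0;
    }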


@@ -1,3 +1,5 @@
+# Regular granularity test for FWI
+
 {
   stdenv
 , stdexp
@@ -15,20 +17,20 @@ let
   # Initial variable configuration
   varConf = {
     gitBranch = [
-      "garlic/tampi+send+oss+task"
+      # "garlic/tampi+send+oss+task"
       "garlic/tampi+isend+oss+task"
-      "garlic/mpi+send+omp+task"
-      "garlic/mpi+send+oss+task"
+      # "garlic/mpi+send+omp+task"
+      # "garlic/mpi+send+oss+task"
       # "garlic/mpi+send+seq"
       # "garlic/oss+task"
       # "garlic/omp+task"
       # "garlic/seq"
     ];
-    blocksize = [ 1 2 4 8 16 32 ];
+    blocksize = [ 1 2 4 8 16 32 64 128 256 ];
     n = [
-      {nx=500; nz=500; ny=2000; ntpn=2; nn=1;}
+      {nx=100; nz=100; ny=8000; ntpn=2; nn=1;}
     ];
   };
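As a reminder of what the blocksize sweep measures: each task is assumed to process blocksize consecutive y-planes, so larger values mean fewer, coarser tasks and smaller values mean more, finer ones. A hypothetical C sketch of that chunking (the names and the task-spawning detail are assumptions, not taken from the FWI sources):

    #include <stdio.h>

    /* Hypothetical kernel: processes the y-planes in [y0, y1). */
    static void process_block(int y0, int y1)
    {
        printf("task over planes [%d, %d)\n", y0, y1);
    }

    /* Split ny planes into chunks of `blocksize` planes; a larger blocksize
     * yields fewer, coarser tasks, a smaller one more, finer tasks. */
    static void run_timestep(int ny, int blocksize)
    {
        for (int y0 = 0; y0 < ny; y0 += blocksize) {
            int y1 = (y0 + blocksize < ny) ? y0 + blocksize : ny;
            process_block(y0, y1); /* in the task-based variants this would be a spawned task */
        }
    }

    int main(void)
    {
        run_timestep(8000, 64); /* e.g. ny=8000 with one of the swept block sizes */
        return 0;
    }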


@@ -1,138 +0,0 @@
{
stdenv
, stdexp
, bsc
, targetMachine
, stages
}:
with stdenv.lib;
let
inherit (targetMachine) fs;
# Initial variable configuration
varConf = {
gitBranch = [
# "garlic/tampi+send+oss+task"
# "garlic/mpi+send+omp+task"
"garlic/mpi+send+oss+task"
# "garlic/mpi+send+seq"
# "garlic/oss+task"
# "garlic/omp+task"
# "garlic/seq"
];
blocksize = [ 1 ];
n = [
# {nx=500; nz=500; ny=8000;}
{nx=500; nz=500; ny=2000;}
];
nodes = [ 1 ]
numactl = [ true false ]
};
# The c value contains something like:
# {
# n = { nx=500; ny=500; nz=500; }
# blocksize = 1;
# gitBranch = "garlic/tampi+send+oss+task";
# }
machineConfig = targetMachine.config;
# Generate the complete configuration for each unit
genConf = with bsc; c: targetMachine.config // rec {
expName = "fwi";
unitName = "${expName}-test";
inherit (machineConfig) hw;
cc = icc;
inherit (c) gitBranch blocksize;
useNumactl = c.numactl
#nx = c.n.nx;
#ny = c.n.ny;
#nz = c.n.nz;
# Same but shorter:
inherit (c.n) nx ny nz;
fwiInput = bsc.apps.fwi.input.override {
inherit (c.n) nx ny nz;
};
# Other FWI parameters
ioFreq = -1;
# Repeat the execution of each unit several times
loops = 10;
#loops = 1;
# Resources
cpusPerTask = if (useNumactl) then hw.cpusPerNode else hw.cpusPerSocket;
ntasksPerNode = hw.cpusPerNode / cpusPerTask;
nodes = c.nodes;
qos = "debug";
time = "02:00:00";
jobName = unitName;
tracing = "no";
# Enable permissions to write in the local storage
extraMounts = [ fs.local.temp ];
};
# Compute the array of configurations
configs = stdexp.buildConfigs {
inherit varConf genConf;
};
exec = {nextStage, conf, ...}: stages.exec ({
inherit nextStage;
pre = ''
CDIR=$PWD
if [[ "${conf.tracing}" == "yes" ]]; then
export NANOS6_CONFIG_OVERRIDE="version.instrument=ctf"
fi
EXECDIR="${fs.local.temp}/out/$GARLIC_USER/$GARLIC_UNIT/$GARLIC_RUN"
mkdir -p $EXECDIR
cd $EXECDIR
ln -fs ${conf.fwiInput}/InputModels InputModels || true
'';
argv = [
"${conf.fwiInput}/fwi_params.txt"
"${conf.fwiInput}/fwi_frequencies.txt"
conf.blocksize
"-1" # Fordward steps
"-1" # Backward steps
conf.ioFreq # Write/read frequency
];
post = ''
rm -rf Results || true
if [[ "${conf.tracing}" == "yes" ]]; then
mv trace_* $CDIR
fi
'';
} // optionalAttrs (conf.useNumact) {
program = "${numactl}/bin/numactl --interleave=all ${stageProgram nextStage}";
});
apps = bsc.garlic.apps;
# FWI program
program = {nextStage, conf, ...}: apps.fwi.solver.override {
inherit (conf) cc gitBranch fwiInput;
};
pipeline = stdexp.stdPipeline ++ [ exec program ];
in
stdexp.genExperiment { inherit configs pipeline; }


@@ -1,3 +1,6 @@
+# Strong scaling test for FWI variants based on fork-join parallelism. This
+# experiment does not rely on block sizes.
+
 {
   stdenv
 , stdexp
@@ -15,20 +18,13 @@ let
   # Initial variable configuration
   varConf = {
     gitBranch = [
-      # "garlic/tampi+send+oss+task"
-      # "garlic/mpi+send+omp+task"
-      # "garlic/mpi+send+oss+task"
       "garlic/mpi+send+omp+fork"
-      # "garlic/mpi+send+seq"
-      # "garlic/oss+task"
-      # "garlic/omp+task"
-      # "garlic/seq"
     ];
     blocksize = [ 0 ];
     n = [
-      {nx=500; nz=500; ny=16000;}
+      {nx=100; nz=100; ny=8000;}
     ];
     nodes = [ 1 2 4 8 16 ];


@@ -1,3 +1,10 @@
+# Strong scaling test for FWI variants based on tasks, with and without I/O.
+# This experiment solves a computationally expensive input which brings the
+# storage devices to saturation when I/O is enabled. The same input is run
+# without I/O for comparison purposes. Also, the experiments are run for a
+# range of block sizes deemed efficient according to the granularity
+# experiment.
+
 {
   stdenv
 , stdexp
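The I/O toggle used by these experiments is the ioFreq parameter passed to the solver (the deleted test.nix documents it as the write/read frequency, and passes -1 to run without I/O). A rough sketch of how such a knob is typically applied; the function names and the snapshot idea are illustrative only, not the FWI implementation:

    #include <stdio.h>

    /* Illustrative only: write a snapshot every `ioFreq` timesteps, or never
     * when ioFreq is negative (the "-1" convention used to disable I/O). */
    static void maybe_write_snapshot(int step, int ioFreq)
    {
        if (ioFreq <= 0)
            return; /* I/O disabled */
        if (step % ioFreq == 0)
            printf("writing snapshot at step %d\n", step);
    }

    int main(void)
    {
        const int steps = 100;
        for (int step = 1; step <= steps; step++) {
            /* ... wavefield propagation would happen here ... */
            maybe_write_snapshot(step, 10); /* with -1 the loop never touches storage */
        }
        return 0;
    }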


@@ -1,3 +1,7 @@
+# Strong scaling test for FWI variants based exclusively on MPI. This
+# experiment does not rely on block sizes. An MPI process is instantiated per
+# core.
+
 {
   stdenv
 , stdexp
@@ -15,24 +19,17 @@ let
   # Initial variable configuration
   varConf = {
     gitBranch = [
-      # "garlic/tampi+send+oss+task"
-      # "garlic/mpi+send+omp+task"
-      # "garlic/mpi+send+oss+task"
-      # "garlic/mpi+send+omp+fork"
       "garlic/mpi+send+seq"
-      # "garlic/oss+task"
-      # "garlic/omp+task"
-      # "garlic/seq"
     ];
     blocksize = [ 0 ];
     n = [
-      {nx=500; nz=500; ny=16000;}
+      {nx=100; nz=100; ny=8000;}
     ];
-    # Not enough planes for 8 and 16 nodes
-    nodes = [ 1 2 4 ];
+    # Not enough planes for 4, 8 and 16 nodes
+    nodes = [ 1 2 ];
   };
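The "not enough planes" comment suggests the y-dimension is decomposed across ranks: with one MPI process per core, every rank must own at least a few y-planes, which rules out the larger node counts for the new input. A back-of-the-envelope check in C; the 48 cores per node and the printed values are assumptions, not taken from the experiment:

    #include <stdio.h>

    /* Y-planes owned by each MPI rank when one process runs per core.
     * Too few planes per rank makes the decomposition infeasible, which is
     * why the larger node counts are excluded. All numbers are placeholders. */
    int main(void)
    {
        const int ny = 8000;         /* y-planes of the new input */
        const int coresPerNode = 48; /* assumption, not taken from the repo */

        for (int nodes = 1; nodes <= 16; nodes *= 2) {
            int ranks = nodes * coresPerNode;
            printf("%2d nodes -> %4d ranks -> %3d planes per rank\n",
                   nodes, ranks, ny / ranks);
        }
        return 0;
    }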


@@ -1,3 +1,7 @@
+# Strong scaling test for FWI variants based on tasks. This
+# experiment explores a range of block sizes deemed efficient
+# according to the granularity experiment.
+
 {
   stdenv
 , stdexp
@@ -19,16 +23,12 @@ let
       "garlic/tampi+isend+oss+task"
       "garlic/mpi+send+omp+task"
       "garlic/mpi+send+oss+task"
-      # "garlic/mpi+send+seq"
-      # "garlic/oss+task"
-      # "garlic/omp+task"
-      # "garlic/seq"
     ];
-    blocksize = [ 1 2 4 8 ];
+    blocksize = [ 1 2 4 8 16 ];
     n = [
-      {nx=500; nz=500; ny=16000;}
+      {nx=100; nz=100; ny=8000;}
     ];
     nodes = [ 1 2 4 8 16 ];


@@ -1,3 +1,7 @@
+# This experiment compares the effect of not using I/O versus using O_DIRECT |
+# O_DSYNC enabled I/O. This is a reduced version of the strong_scaling_io
+# experiment.
+
 {
   stdenv
 , stdexp
@@ -15,8 +19,8 @@ let
   # Initial variable configuration
   varConf = {
     gitBranch = [
-      # "garlic/tampi+send+oss+task"
-      "garlic/mpi+send+omp+task"
+      "garlic/tampi+send+oss+task"
+      # "garlic/mpi+send+omp+task"
       # "garlic/mpi+send+oss+task"
       # "garlic/mpi+send+seq"
       # "garlic/oss+task"
@@ -24,14 +28,16 @@ let
       # "garlic/seq"
     ];
-    blocksize = [ 1 2 4 8 16 32 ];
-    #blocksize = [ 1 2 4 8 ];
+    blocksize = [ 1 ];
     n = [
-      #{nx=500; nz=500; ny=1000; ntpn=1; nn=1;}
-      {nx=500; nz=500; ny=2000; ntpn=2; nn=1;}
+      {nx=500; nz=500; ny=16000;}
     ];
+    nodes = [ 4 ];
+    ioFreq = [ 9999 (-1) ];
   };

   # The c value contains something like:
@@ -57,14 +63,14 @@ let
     #nz = c.n.nz;
     # Same but shorter:
-    inherit (c.n) nx ny nz ntpn nn;
+    inherit (c.n) nx ny nz;
     fwiInput = bsc.apps.fwi.input.override {
       inherit (c.n) nx ny nz;
     };
     # Other FWI parameters
-    ioFreq = -1;
+    ioFreq = c.ioFreq;
     # Repeat the execution of each unit several times
     loops = 10;
@@ -72,8 +78,8 @@ let
     # Resources
     cpusPerTask = hw.cpusPerSocket;
-    ntasksPerNode = ntpn;
-    nodes = nn;
+    ntasksPerNode = 2;
+    nodes = c.nodes;
     qos = "debug";
     time = "02:00:00";
     jobName = unitName;
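For context on the O_DIRECT | O_DSYNC combination enabled by the Makefile patch above: O_DIRECT bypasses the page cache (and requires block-aligned buffers), while O_DSYNC makes each write wait for the device, which is what pushes the storage to saturation. A minimal Linux sketch in C; the file name is illustrative, not an FWI output:

    #define _GNU_SOURCE /* O_DIRECT is Linux-specific */
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        /* O_DIRECT requires the buffer, size and offset to be aligned to the
         * device block size; 4096 is a common value. */
        const size_t blk = 4096;
        void *buf;
        if (posix_memalign(&buf, blk, blk) != 0)
            return 1;
        memset(buf, 0, blk);

        /* "snapshot.bin" is a placeholder name. */
        int fd = open("snapshot.bin", O_WRONLY | O_CREAT | O_DIRECT | O_DSYNC, 0644);
        if (fd < 0) {
            perror("open");
            free(buf);
            return 1;
        }
        if (write(fd, buf, blk) != (ssize_t)blk) /* synchronous, uncached write */
            perror("write");

        close(fd);
        free(buf);
        return 0;
    }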


@@ -98,12 +98,13 @@
   };
   fwi = {
-    test = callPackage ./fwi/test.nix { };
+    granularity = callPackage ./fwi/granularity.nix { };
     strong_scaling_task = callPackage ./fwi/strong_scaling_task.nix { };
     strong_scaling_forkjoin = callPackage ./fwi/strong_scaling_forkjoin.nix { };
     strong_scaling_mpionly = callPackage ./fwi/strong_scaling_mpionly.nix { };
+    data_reuse = callPackage ./fwi/data_reuse.nix { };
     strong_scaling_io = callPackage ./fwi/strong_scaling_io.nix { };
-    granularity = callPackage ./fwi/granularity.nix { };
+    sync_io = callPackage ./fwi/sync_io.nix { };
   };
   osu = rec {


@@ -30,7 +30,7 @@ w=5
 ####################################################################
 ### Line Graph
 ####################################################################
-png("time.png", width=w*ppi, height=h*ppi, res=ppi)
+png("mtime.png", width=w*ppi, height=h*ppi, res=ppi)

 ## Create the plot with the normalized time vs nblocks
 p = ggplot(df, aes(x = blocksize, y=mtime, group=gitBranch, color=gitBranch)) +
@@ -49,22 +49,23 @@ print(p)
 dev.off()

 ####################################################################
-### Boxplot
+### Line Graph
 ####################################################################

-png("box.png", width=w*ppi, height=h*ppi, res=ppi)
+png("time.png", width=w*ppi, height=h*ppi, res=ppi)

-# Create the plot with the normalized time vs nblocks
-p = ggplot(df, aes(x=blocksize, y=time, group=gitBranch, colour=gitBranch)) +
-    # Labels
-    labs(x="Blocksize", y="Normalized time",
-        title=sprintf("FWI Time"),
-        subtitle=input_file) +
-    # Draw boxplots
-    geom_boxplot() +
-    theme_bw() +
-    theme(plot.subtitle=element_text(size=8)) +
-    theme(legend.position = c(0.5, 0.88))
+## Create the plot with the normalized time vs nblocks
+p = ggplot(df, aes(x = blocksize, y=time, group=gitBranch, color=gitBranch)) +
+    geom_point() +
+    geom_line() +
+    theme_bw() +
+    labs(x="Blocksize", y="Time (s)", title="FWI granularity",
+        subtitle=input_file) +
+    theme(plot.subtitle=element_text(size=8)) +
+    theme(legend.position = c(0.5, 0.88))

 # Render the plot
 print(p)

-# Save the png image
+## Save the png image
 dev.off()


@@ -14,7 +14,7 @@ dataset = jsonlite::stream_in(file(input_file)) %>%
     jsonlite::flatten()

 # Select block size to display
-useBlocksize = 1
+useBlocksize = 2

 # We only need the nblocks and time
 df = select(dataset, config.blocksize, config.gitBranch, config.nodes, time) %>%
@@ -59,7 +59,7 @@ print(p)
 dev.off()

 ####################################################################
-### Line plot (timei x nodes)
+### Line plot (time x nodes)
 ####################################################################

 png("nxtime.png", width=w*ppi, height=h*ppi, res=ppi)


@@ -1,46 +0,0 @@
library(ggplot2)
library(dplyr)
library(scales)
library(jsonlite)
args=commandArgs(trailingOnly=TRUE)
# Read the timetable from args[1]
input_file = "input.json"
if (length(args)>0) { input_file = args[1] }
# Load the dataset in NDJSON format
dataset = jsonlite::stream_in(file(input_file)) %>%
jsonlite::flatten()
# We only need the nblocks and time
df = select(dataset, config.blocksize, config.gitBranch, time) %>%
rename(blocksize=config.blocksize, gitBranch=config.gitBranch) %>%
group_by(blocksize, gitBranch) %>%
mutate(mtime = median(time)) %>%
ungroup()
df$gitBranch = as.factor(df$gitBranch)
df$blocksize = as.factor(df$blocksize)
ppi=300
h=5
w=5
png("time.png", width=w*ppi, height=h*ppi, res=ppi)
#
## Create the plot with the normalized time vs nblocks
p = ggplot(df, aes(x=blocksize, y=time)) +
geom_point() +
geom_line(aes(y=mtime, group=gitBranch, color=gitBranch)) +
theme_bw() +
labs(x="Blocksize", y="Time (s)", title="FWI granularity",
subtitle=input_file) +
theme(plot.subtitle=element_text(size=8)) +
theme(legend.position = c(0.5, 0.88))
# Render the plot
print(p)
# Save the png image
dev.off()


@@ -62,10 +62,12 @@ in
   };
   fwi = with exp.fwi; {
-    test = stdPlot ./fwi/test.R [ test ];
-    strong_scaling = stdPlot ./fwi/strong_scaling.R [ strong_scaling_task strong_scaling_forkjoin strong_scaling_mpionly ];
-    strong_scaling_io = stdPlot ./fwi/strong_scaling_io.R [ strong_scaling_io ];
     granularity = stdPlot ./fwi/granularity.R [ granularity ];
+    strong_scaling = stdPlot ./fwi/strong_scaling.R [ strong_scaling_task strong_scaling_forkjoin ];
+    #strong_scaling = stdPlot ./fwi/strong_scaling.R [ strong_scaling_task strong_scaling_forkjoin strong_scaling_mpionly ];
+    data_reuse = stdPlot ./fwi/granularity.R [ data_reuse ];
+    strong_scaling_io = stdPlot ./fwi/strong_scaling_io.R [ strong_scaling_io ];
+    sync_io = stdPlot ./fwi/strong_scaling_io.R [ sync_io ];
   };
   osu = with exp.osu; {