fwi: adjust input size to meet timing constraints
The previous input size for both the granularity and strong scaling tests was too big to meet the timing constraints needed for garlic. This patch sets a new, smaller input size. Also, a minor cleanup is applied to the rest of the fwi experiments and figures.
This commit is contained in:
parent
3e5a56ebdb
commit
989f6ee018
@ -34,6 +34,7 @@ stdenv.mkDerivation rec {
|
||||
# FIXME: Allow multiple MPI implementations
|
||||
postPatch = ''
|
||||
sed -i 's/= OPENMPI$/= INTEL/g' Makefile
|
||||
sed -i 's/USE_O_DIRECT ?= NO/USE_O_DIRECT ?= YES/g' Makefile || true
|
||||
'';
|
||||
|
||||
# FIXME: This is an ugly hack.
|
||||
|
@ -1,3 +1,23 @@
|
||||
# This test compares a FWI version using poor data locality (+NOREUSE) versus
|
||||
# the optimized version (used for all other experiments). Follows a pseudocode
|
||||
# snippet illustrating the fundamental difference between versions.
|
||||
#
|
||||
# NOREUSE
|
||||
# ----------------------
|
||||
# for (y) for (x) for (z)
|
||||
# computA(v[y][x][z]);
|
||||
# for (y) for (x) for (z)
|
||||
# computB(v[y][x][z]);
|
||||
# for (y) for (x) for (z)
|
||||
# computC(v[y][x][z]);
|
||||
#
|
||||
# Optimized version
|
||||
# ----------------------
|
||||
# for (y) for (x) for (z)
|
||||
# computA(v[y][x][z]);
|
||||
# computB(v[y][x][z]);
|
||||
# computC(v[y][x][z]);
|
||||
|
||||
{
|
||||
stdenv
|
||||
, stdexp
|
||||
@ -15,34 +35,14 @@ let
|
||||
# Initial variable configuration
|
||||
varConf = {
|
||||
gitBranch = [
|
||||
# "garlic/tampi+send+oss+task"
|
||||
# "garlic/mpi+send+omp+task"
|
||||
"garlic/mpi+send+oss+task"
|
||||
"garlic/mpi+send+oss+task+noreuse"
|
||||
# "garlic/mpi+send+seq"
|
||||
# "garlic/oss+task"
|
||||
# "garlic/omp+task"
|
||||
# "garlic/seq"
|
||||
"garlic/mpi+send+oss+task+NOREUSE"
|
||||
];
|
||||
|
||||
blocksize = [ 1 2 4 8 ];
|
||||
#blocksize = [ 1 2 ];
|
||||
|
||||
n = [
|
||||
# {nx=50; ny=4000; nz=50;}
|
||||
# {nx=20; ny=4000; nz=20;}
|
||||
# {nx=300; ny=8000; nz=300;} # half node, /
|
||||
# {nx=300; ny=1000; nz=300;} # half node, /
|
||||
# {nx=200; ny=1000; nz=200;} # half node, not enough tasks
|
||||
# {nx=200; ny=4000; nz=200;} # --/ half node
|
||||
# {nx=250; ny=2000; nz=250;} # / half node
|
||||
{nx=300; ny=2000; nz=300;} # / half node
|
||||
# {nx=100; ny=2000; nz=100;} # \-// half node
|
||||
# {nx=150; ny=2000; nz=150;} # \-/ half node
|
||||
# {nx=200; ny=64000; nz=200;} # --/ 16 nodes
|
||||
# {nx=200; ny=4000; nz=200;} # --/ half node
|
||||
# {nx=200; ny=8000; nz=200;} # --/ 1 node
|
||||
# {nx=100; ny=8000; nz=100;} # --/ half node
|
||||
];
|
||||
};
|
||||
|
||||
|
@ -1,3 +1,5 @@
|
||||
# Regular granularity test for FWI
|
||||
|
||||
{
|
||||
stdenv
|
||||
, stdexp
|
||||
@ -15,20 +17,20 @@ let
|
||||
# Initial variable configuration
|
||||
varConf = {
|
||||
gitBranch = [
|
||||
"garlic/tampi+send+oss+task"
|
||||
"garlic/tampi+isend+oss+task"
|
||||
"garlic/mpi+send+omp+task"
|
||||
"garlic/mpi+send+oss+task"
|
||||
# "garlic/tampi+send+oss+task"
|
||||
"garlic/tampi+isend+oss+task"
|
||||
# "garlic/mpi+send+omp+task"
|
||||
# "garlic/mpi+send+oss+task"
|
||||
# "garlic/mpi+send+seq"
|
||||
# "garlic/oss+task"
|
||||
# "garlic/omp+task"
|
||||
# "garlic/seq"
|
||||
];
|
||||
|
||||
blocksize = [ 1 2 4 8 16 32 ];
|
||||
blocksize = [ 1 2 4 8 16 32 64 128 256 ];
|
||||
|
||||
n = [
|
||||
{nx=500; nz=500; ny=2000; ntpn=2; nn=1;}
|
||||
{nx=100; nz=100; ny=8000; ntpn=2; nn=1;}
|
||||
];
|
||||
|
||||
};
|
||||
|
@ -1,138 +0,0 @@
|
||||
{
|
||||
stdenv
|
||||
, stdexp
|
||||
, bsc
|
||||
, targetMachine
|
||||
, stages
|
||||
}:
|
||||
|
||||
with stdenv.lib;
|
||||
|
||||
let
|
||||
|
||||
inherit (targetMachine) fs;
|
||||
|
||||
# Initial variable configuration
|
||||
varConf = {
|
||||
gitBranch = [
|
||||
# "garlic/tampi+send+oss+task"
|
||||
# "garlic/mpi+send+omp+task"
|
||||
"garlic/mpi+send+oss+task"
|
||||
# "garlic/mpi+send+seq"
|
||||
# "garlic/oss+task"
|
||||
# "garlic/omp+task"
|
||||
# "garlic/seq"
|
||||
];
|
||||
|
||||
blocksize = [ 1 ];
|
||||
|
||||
n = [
|
||||
# {nx=500; nz=500; ny=8000;}
|
||||
{nx=500; nz=500; ny=2000;}
|
||||
];
|
||||
|
||||
nodes = [ 1 ]
|
||||
|
||||
numactl = [ true false ]
|
||||
|
||||
};
|
||||
|
||||
# The c value contains something like:
|
||||
# {
|
||||
# n = { nx=500; ny=500; nz=500; }
|
||||
# blocksize = 1;
|
||||
# gitBranch = "garlic/tampi+send+oss+task";
|
||||
# }
|
||||
|
||||
machineConfig = targetMachine.config;
|
||||
|
||||
# Generate the complete configuration for each unit
|
||||
genConf = with bsc; c: targetMachine.config // rec {
|
||||
expName = "fwi";
|
||||
unitName = "${expName}-test";
|
||||
inherit (machineConfig) hw;
|
||||
|
||||
cc = icc;
|
||||
inherit (c) gitBranch blocksize;
|
||||
useNumactl = c.numactl
|
||||
|
||||
#nx = c.n.nx;
|
||||
#ny = c.n.ny;
|
||||
#nz = c.n.nz;
|
||||
|
||||
# Same but shorter:
|
||||
inherit (c.n) nx ny nz;
|
||||
|
||||
fwiInput = bsc.apps.fwi.input.override {
|
||||
inherit (c.n) nx ny nz;
|
||||
};
|
||||
|
||||
# Other FWI parameters
|
||||
ioFreq = -1;
|
||||
|
||||
# Repeat the execution of each unit several times
|
||||
loops = 10;
|
||||
#loops = 1;
|
||||
|
||||
# Resources
|
||||
cpusPerTask = if (useNumactl) then hw.cpusPerNode else hw.cpusPerSocket;
|
||||
ntasksPerNode = hw.cpusPerNode / cpusPerTask;
|
||||
nodes = c.nodes;
|
||||
qos = "debug";
|
||||
time = "02:00:00";
|
||||
jobName = unitName;
|
||||
|
||||
tracing = "no";
|
||||
|
||||
# Enable permissions to write in the local storage
|
||||
extraMounts = [ fs.local.temp ];
|
||||
|
||||
};
|
||||
|
||||
# Compute the array of configurations
|
||||
configs = stdexp.buildConfigs {
|
||||
inherit varConf genConf;
|
||||
};
|
||||
|
||||
exec = {nextStage, conf, ...}: stages.exec ({
|
||||
inherit nextStage;
|
||||
pre = ''
|
||||
CDIR=$PWD
|
||||
if [[ "${conf.tracing}" == "yes" ]]; then
|
||||
export NANOS6_CONFIG_OVERRIDE="version.instrument=ctf"
|
||||
fi
|
||||
EXECDIR="${fs.local.temp}/out/$GARLIC_USER/$GARLIC_UNIT/$GARLIC_RUN"
|
||||
mkdir -p $EXECDIR
|
||||
cd $EXECDIR
|
||||
ln -fs ${conf.fwiInput}/InputModels InputModels || true
|
||||
'';
|
||||
argv = [
|
||||
"${conf.fwiInput}/fwi_params.txt"
|
||||
"${conf.fwiInput}/fwi_frequencies.txt"
|
||||
conf.blocksize
|
||||
"-1" # Fordward steps
|
||||
"-1" # Backward steps
|
||||
conf.ioFreq # Write/read frequency
|
||||
];
|
||||
post = ''
|
||||
rm -rf Results || true
|
||||
if [[ "${conf.tracing}" == "yes" ]]; then
|
||||
mv trace_* $CDIR
|
||||
fi
|
||||
'';
|
||||
} // optionalAttrs (conf.useNumact) {
|
||||
program = "${numactl}/bin/numactl --interleave=all ${stageProgram nextStage}";
|
||||
});
|
||||
|
||||
apps = bsc.garlic.apps;
|
||||
|
||||
# FWI program
|
||||
program = {nextStage, conf, ...}: apps.fwi.solver.override {
|
||||
inherit (conf) cc gitBranch fwiInput;
|
||||
};
|
||||
|
||||
pipeline = stdexp.stdPipeline ++ [ exec program ];
|
||||
|
||||
in
|
||||
|
||||
stdexp.genExperiment { inherit configs pipeline; }
|
@ -1,3 +1,6 @@
|
||||
# Strong scaling test for FWI variants based on forkjoint. This
|
||||
# experiment does not rely on block sizes.
|
||||
|
||||
{
|
||||
stdenv
|
||||
, stdexp
|
||||
@ -15,20 +18,13 @@ let
|
||||
# Initial variable configuration
|
||||
varConf = {
|
||||
gitBranch = [
|
||||
# "garlic/tampi+send+oss+task"
|
||||
# "garlic/mpi+send+omp+task"
|
||||
# "garlic/mpi+send+oss+task"
|
||||
"garlic/mpi+send+omp+fork"
|
||||
# "garlic/mpi+send+seq"
|
||||
# "garlic/oss+task"
|
||||
# "garlic/omp+task"
|
||||
# "garlic/seq"
|
||||
];
|
||||
|
||||
blocksize = [ 0 ];
|
||||
|
||||
n = [
|
||||
{nx=500; nz=500; ny=16000;}
|
||||
{nx=100; nz=100; ny=8000;}
|
||||
];
|
||||
|
||||
nodes = [ 1 2 4 8 16 ];
|
||||
|
@ -1,3 +1,10 @@
|
||||
# Strong scaling test for FWI variants based on tasks with and without I/O.
|
||||
# This experiment solves a computationally expensive input which brings the
|
||||
# storage devices to saturation when I/O is enabled. The same input is run
|
||||
# without I/O for comparison purposes. Also, the experiments are run for a
|
||||
# range of block sizes deemed as efficient according to the granularity
|
||||
# experiment.
|
||||
|
||||
{
|
||||
stdenv
|
||||
, stdexp
|
||||
|
@ -1,3 +1,7 @@
|
||||
# Strong scaling test for FWI variants based exclusively on MPI. This
|
||||
# experiment does not rely on block sizes. An MPI process is instantiated per
|
||||
# core.
|
||||
|
||||
{
|
||||
stdenv
|
||||
, stdexp
|
||||
@ -15,24 +19,17 @@ let
|
||||
# Initial variable configuration
|
||||
varConf = {
|
||||
gitBranch = [
|
||||
# "garlic/tampi+send+oss+task"
|
||||
# "garlic/mpi+send+omp+task"
|
||||
# "garlic/mpi+send+oss+task"
|
||||
# "garlic/mpi+send+omp+fork"
|
||||
"garlic/mpi+send+seq"
|
||||
# "garlic/oss+task"
|
||||
# "garlic/omp+task"
|
||||
# "garlic/seq"
|
||||
];
|
||||
|
||||
blocksize = [ 0 ];
|
||||
|
||||
n = [
|
||||
{nx=500; nz=500; ny=16000;}
|
||||
{nx=100; nz=100; ny=8000;}
|
||||
];
|
||||
|
||||
# Not enough planes for 8 and 16 nodes
|
||||
nodes = [ 1 2 4 ];
|
||||
# Not enough planes for 4, 8 and 16 nodes
|
||||
nodes = [ 1 2 ];
|
||||
|
||||
};
|
||||
|
||||
|
@ -1,3 +1,7 @@
|
||||
# Strong scaling test for FWI variants based on tasks. This
|
||||
# experiment explores a range of block sizes deemed as efficient
|
||||
# according to the granularity experiment.
|
||||
|
||||
{
|
||||
stdenv
|
||||
, stdexp
|
||||
@ -19,16 +23,12 @@ let
|
||||
"garlic/tampi+isend+oss+task"
|
||||
"garlic/mpi+send+omp+task"
|
||||
"garlic/mpi+send+oss+task"
|
||||
# "garlic/mpi+send+seq"
|
||||
# "garlic/oss+task"
|
||||
# "garlic/omp+task"
|
||||
# "garlic/seq"
|
||||
];
|
||||
|
||||
blocksize = [ 1 2 4 8 ];
|
||||
blocksize = [ 1 2 4 8 16 ];
|
||||
|
||||
n = [
|
||||
{nx=500; nz=500; ny=16000;}
|
||||
{nx=100; nz=100; ny=8000;}
|
||||
];
|
||||
|
||||
nodes = [ 1 2 4 8 16 ];
|
||||
|
@ -1,3 +1,7 @@
|
||||
# This experiment compares the effect of not using I/O versus using O_DIRECT |
|
||||
# O_DSYNC enabled I/O. This is a reduced version of the strong_scaling_io
|
||||
# experiment.
|
||||
|
||||
{
|
||||
stdenv
|
||||
, stdexp
|
||||
@ -15,8 +19,8 @@ let
|
||||
# Initial variable configuration
|
||||
varConf = {
|
||||
gitBranch = [
|
||||
# "garlic/tampi+send+oss+task"
|
||||
"garlic/mpi+send+omp+task"
|
||||
"garlic/tampi+send+oss+task"
|
||||
# "garlic/mpi+send+omp+task"
|
||||
# "garlic/mpi+send+oss+task"
|
||||
# "garlic/mpi+send+seq"
|
||||
# "garlic/oss+task"
|
||||
@ -24,14 +28,16 @@ let
|
||||
# "garlic/seq"
|
||||
];
|
||||
|
||||
blocksize = [ 1 2 4 8 16 32 ];
|
||||
#blocksize = [ 1 2 4 8 ];
|
||||
blocksize = [ 1 ];
|
||||
|
||||
n = [
|
||||
#{nx=500; nz=500; ny=1000; ntpn=1; nn=1;}
|
||||
{nx=500; nz=500; ny=2000; ntpn=2; nn=1;}
|
||||
{nx=500; nz=500; ny=16000;}
|
||||
];
|
||||
|
||||
nodes = [ 4 ];
|
||||
|
||||
ioFreq = [ 9999 (-1) ];
|
||||
|
||||
};
|
||||
|
||||
# The c value contains something like:
|
||||
@ -57,14 +63,14 @@ let
|
||||
#nz = c.n.nz;
|
||||
|
||||
# Same but shorter:
|
||||
inherit (c.n) nx ny nz ntpn nn;
|
||||
inherit (c.n) nx ny nz;
|
||||
|
||||
fwiInput = bsc.apps.fwi.input.override {
|
||||
inherit (c.n) nx ny nz;
|
||||
};
|
||||
|
||||
# Other FWI parameters
|
||||
ioFreq = -1;
|
||||
ioFreq = c.ioFreq;
|
||||
|
||||
# Repeat the execution of each unit several times
|
||||
loops = 10;
|
||||
@ -72,8 +78,8 @@ let
|
||||
|
||||
# Resources
|
||||
cpusPerTask = hw.cpusPerSocket;
|
||||
ntasksPerNode = ntpn;
|
||||
nodes = nn;
|
||||
ntasksPerNode = 2;
|
||||
nodes = c.nodes;
|
||||
qos = "debug";
|
||||
time = "02:00:00";
|
||||
jobName = unitName;
|
@ -98,12 +98,13 @@
|
||||
};
|
||||
|
||||
fwi = {
|
||||
test = callPackage ./fwi/test.nix { };
|
||||
granularity = callPackage ./fwi/granularity.nix { };
|
||||
strong_scaling_task = callPackage ./fwi/strong_scaling_task.nix { };
|
||||
strong_scaling_forkjoin = callPackage ./fwi/strong_scaling_forkjoin.nix { };
|
||||
strong_scaling_mpionly = callPackage ./fwi/strong_scaling_mpionly.nix { };
|
||||
data_reuse = callPackage ./fwi/data_reuse.nix { };
|
||||
strong_scaling_io = callPackage ./fwi/strong_scaling_io.nix { };
|
||||
granularity = callPackage ./fwi/granularity.nix { };
|
||||
sync_io = callPackage ./fwi/sync_io.nix { };
|
||||
};
|
||||
|
||||
osu = rec {
|
||||
|
@ -30,7 +30,7 @@ w=5
|
||||
####################################################################
|
||||
### Line Graph
|
||||
####################################################################
|
||||
png("time.png", width=w*ppi, height=h*ppi, res=ppi)
|
||||
png("mtime.png", width=w*ppi, height=h*ppi, res=ppi)
|
||||
|
||||
## Create the plot with the normalized time vs nblocks
|
||||
p = ggplot(df, aes(x = blocksize, y=mtime, group=gitBranch, color=gitBranch)) +
|
||||
@ -49,22 +49,23 @@ print(p)
|
||||
dev.off()
|
||||
|
||||
####################################################################
|
||||
### Boxplot
|
||||
### Line Graph
|
||||
####################################################################
|
||||
png("box.png", width=w*ppi, height=h*ppi, res=ppi)
|
||||
# Create the plot with the normalized time vs nblocks
|
||||
p = ggplot(df, aes(x=blocksize, y=time, group=gitBranch, colour=gitBranch)) +
|
||||
# Labels
|
||||
labs(x="Blocksize", y="Normalized time",
|
||||
title=sprintf("FWI Time"),
|
||||
subtitle=input_file) +
|
||||
# Draw boxplots
|
||||
geom_boxplot() +
|
||||
theme_bw() +
|
||||
theme(plot.subtitle=element_text(size=8)) +
|
||||
theme(legend.position = c(0.5, 0.88))
|
||||
png("time.png", width=w*ppi, height=h*ppi, res=ppi)
|
||||
|
||||
## Create the plot with the normalized time vs nblocks
|
||||
p = ggplot(df, aes(x = blocksize, y=time, group=gitBranch, color=gitBranch)) +
|
||||
geom_point() +
|
||||
geom_line() +
|
||||
theme_bw() +
|
||||
labs(x="Blocksize", y="Time (s)", title="FWI granularity",
|
||||
subtitle=input_file) +
|
||||
theme(plot.subtitle=element_text(size=8)) +
|
||||
theme(legend.position = c(0.5, 0.88))
|
||||
|
||||
# Render the plot
|
||||
print(p)
|
||||
## Save the png image
|
||||
|
||||
# Save the png image
|
||||
dev.off()
|
||||
|
||||
|
@ -14,7 +14,7 @@ dataset = jsonlite::stream_in(file(input_file)) %>%
|
||||
jsonlite::flatten()
|
||||
|
||||
# Select block size to display
|
||||
useBlocksize = 1
|
||||
useBlocksize = 2
|
||||
|
||||
# We only need the nblocks and time
|
||||
df = select(dataset, config.blocksize, config.gitBranch, config.nodes, time) %>%
|
||||
@ -59,7 +59,7 @@ print(p)
|
||||
dev.off()
|
||||
|
||||
####################################################################
|
||||
### Line plot (timei x nodes)
|
||||
### Line plot (time x nodes)
|
||||
####################################################################
|
||||
png("nxtime.png", width=w*ppi, height=h*ppi, res=ppi)
|
||||
|
||||
|
@ -1,46 +0,0 @@
|
||||
library(ggplot2)
|
||||
library(dplyr)
|
||||
library(scales)
|
||||
library(jsonlite)
|
||||
|
||||
args=commandArgs(trailingOnly=TRUE)
|
||||
|
||||
# Read the timetable from args[1]
|
||||
input_file = "input.json"
|
||||
if (length(args)>0) { input_file = args[1] }
|
||||
|
||||
# Load the dataset in NDJSON format
|
||||
dataset = jsonlite::stream_in(file(input_file)) %>%
|
||||
jsonlite::flatten()
|
||||
|
||||
# We only need the nblocks and time
|
||||
df = select(dataset, config.blocksize, config.gitBranch, time) %>%
|
||||
rename(blocksize=config.blocksize, gitBranch=config.gitBranch) %>%
|
||||
group_by(blocksize, gitBranch) %>%
|
||||
mutate(mtime = median(time)) %>%
|
||||
ungroup()
|
||||
|
||||
df$gitBranch = as.factor(df$gitBranch)
|
||||
df$blocksize = as.factor(df$blocksize)
|
||||
|
||||
ppi=300
|
||||
h=5
|
||||
w=5
|
||||
|
||||
png("time.png", width=w*ppi, height=h*ppi, res=ppi)
|
||||
#
|
||||
## Create the plot with the normalized time vs nblocks
|
||||
p = ggplot(df, aes(x=blocksize, y=time)) +
|
||||
geom_point() +
|
||||
geom_line(aes(y=mtime, group=gitBranch, color=gitBranch)) +
|
||||
theme_bw() +
|
||||
labs(x="Blocksize", y="Time (s)", title="FWI granularity",
|
||||
subtitle=input_file) +
|
||||
theme(plot.subtitle=element_text(size=8)) +
|
||||
theme(legend.position = c(0.5, 0.88))
|
||||
|
||||
# Render the plot
|
||||
print(p)
|
||||
|
||||
# Save the png image
|
||||
dev.off()
|
@ -62,10 +62,12 @@ in
|
||||
};
|
||||
|
||||
fwi = with exp.fwi; {
|
||||
test = stdPlot ./fwi/test.R [ test ];
|
||||
strong_scaling = stdPlot ./fwi/strong_scaling.R [ strong_scaling_task strong_scaling_forkjoin strong_scaling_mpionly ];
|
||||
strong_scaling_io = stdPlot ./fwi/strong_scaling_io.R [ strong_scaling_io ];
|
||||
granularity = stdPlot ./fwi/granularity.R [ granularity ];
|
||||
strong_scaling = stdPlot ./fwi/strong_scaling.R [ strong_scaling_task strong_scaling_forkjoin ];
|
||||
#strong_scaling = stdPlot ./fwi/strong_scaling.R [ strong_scaling_task strong_scaling_forkjoin strong_scaling_mpionly ];
|
||||
data_reuse = stdPlot ./fwi/granularity.R [ data_reuse ];
|
||||
strong_scaling_io = stdPlot ./fwi/strong_scaling_io.R [ strong_scaling_io ];
|
||||
sync_io = stdPlot ./fwi/strong_scaling_io.R [ sync_io ];
|
||||
};
|
||||
|
||||
osu = with exp.osu; {
|
||||
|
Loading…
Reference in New Issue
Block a user