From c41456412c8d09c9f8f09ef6f9297c73f9f159de Mon Sep 17 00:00:00 2001 From: Rodrigo Arias Mallo Date: Fri, 12 Mar 2021 19:33:40 +0100 Subject: [PATCH] examples: Add granularity examples --- garlic/exp/examples/granularity.nix | 188 +++++++++++++++++++++++++++ garlic/exp/index.nix | 4 + garlic/fig/examples/granularity.R | 194 ++++++++++++++++++++++++++++ garlic/fig/index.nix | 4 + 4 files changed, 390 insertions(+) create mode 100644 garlic/exp/examples/granularity.nix create mode 100644 garlic/fig/examples/granularity.R diff --git a/garlic/exp/examples/granularity.nix b/garlic/exp/examples/granularity.nix new file mode 100644 index 0000000..5425e54 --- /dev/null +++ b/garlic/exp/examples/granularity.nix @@ -0,0 +1,188 @@ +# This file defines an experiment. It is designed as a function that takes +# several parameters and returns a derivation. This derivation, when built will +# create several scripts that can be executed and launch the experiment. + +# These are the inputs to this function: an attribute set which must contain the +# following keys: +{ + stdenv +, stdexp +, bsc +, targetMachine +, stages +, garlicTools +}: + +# We import in the scope the content of the `stdenv.lib` attribute, which +# contain useful functions like `toString`, which will be used later. This is +# handy to avoid writting `stdenv.lib.tostring`. + +with stdenv.lib; + +# We also have some functions specific to the garlic benchmark which we import +# as well. Take a look at the garlic/tools.nix file for more details. +with garlicTools; + +# The `let` keyword allows us to define some local variables which will be used +# later. It works as the local variable concept in the C language. +let + + # Initial variable configuration: every attribute in this set contains lists + # of options which will be used to compute the configuration of the units. The + # cartesian product of all the values will be computed. + varConf = { + # In this case we will vary the columns and rows of the blocksize. 
This + # configuration will create 3 x 2 = 6 units. + cbs = [ 256 1024 4096 ]; + rbs = [ 512 1024 ]; + }; + + # Generate the complete configuration for each unit: genConf is a function + # that accepts the argument `c` and returns an attribute set. The attribute set + # is formed by joining the configuration of the machine (which includes + # details like the number of nodes or the architecture) and the configuration + # that we define for our units. + # + # Notice the use of the `rec` keyword, which allows us to access the elements + # of the set while it is being defined. + genConf = c: targetMachine.config // rec { + + # These attributes are user defined, and thus the user will need to handle + # them manually. They are not read by the standard pipeline: + + # Here we load the `hw` attribute from the machine configuration, so we can + # access it, for example, the number of CPUs per socket as hw.cpusPerSocket. + hw = targetMachine.config.hw; + + # These options will be used by the heat app, but we write them here so they + # are stored in the unit configuration. + timesteps = 10; + cols = 1024 * 16; # Columns + rows = 1024 * 16; # Rows + + # The blocksize is set to the values passed in the `c` parameter, which will + # be set to one of all the configurations of the cartesian product. For + # example: cbs = 256 and rbs = 512. + # We can also write `inherit (c) cbs rbs`, as it is a shorthand notation. + cbs = c.cbs; + rbs = c.rbs; + + # The git branch is specified here as well, as it will be used when we specify + # the heat app + gitBranch = "garlic/tampi+isend+oss+task"; + + # ------------------------------------------------------------------------- + + # These attributes are part of the standard pipeline, and are required for + # each experiment. They are automatically recognized by the standard + # execution pipeline. + + # The experiment name: + expName = "example-granularity-heat"; + + # The experimental unit name. 
It will be used to create a symlink in the + # index (at /gpfs/projects/bsc15/garlic/$USER/index/) so you can easily find + # the unit. Notice that the symlink is overwritten each time you run a unit + # with the same name. + # + # We use the toString function to convert the numeric value of cbs and rbs + # to a string like: "example-granularity-heat.cbs-256.rbs-512" + unitName = expName + + ".cbs-${toString cbs}" + + ".rbs-${toString rbs}"; + + # Repeat the execution of each unit a few times: this option is + # automatically taken by the experiment, which will repeat the execution of + # the program that many times. It is recommended to run the app at least 30 + # times, but we only used 10 here for demonstration purposes (as it will be + # faster to run) + loops = 10; + + # Resources: here we configure the resources in the machine. The queue to be + # used is `debug` as it is the fastest for small jobs. + qos = "debug"; + + # Then the number of MPI processes or tasks per node: + ntasksPerNode = 1; + + # And the number of nodes: + nodes = 1; + + # We use all the CPUs available in one socket to each MPI process or task. + # Notice that the number of CPUs per socket is not specified directly, but + # loaded from the configuration of the machine that will be used to run our + # experiment. The affinity mask is set accordingly. + cpusPerTask = hw.cpusPerSocket; + + # The time will limit the execution of the program in case of a deadlock + time = "02:00:00"; + + # The job name will appear in the `squeue` and helps to identify what is + # running. Currently it is set to the name of the unit. + jobName = unitName; + }; + + # Using the `varConf` and our function `genConf` we compute a list of the + # complete configuration of every unit. + configs = stdexp.buildConfigs { + inherit varConf genConf; + }; + + # Now that we have the list of configs, we need to write how that information + # is used to run our program. 
In our case we will use some params such as the + # number of rows and columns of the input problem or the blocksize as argv + # values. + + # The exec stage is used to run a program with some arguments. + exec = {nextStage, conf, ...}: stages.exec { + # All stages require the nextStage attribute, which is passed as parameter. + inherit nextStage; + + # Then, we fill the argv array with the elements that will be used when + # running our program. Notice that we load the attributes from the + # configuration which is passed as argument as well. + argv = [ + "--rows" conf.rows + "--cols" conf.cols + "--rbs" conf.rbs + "--cbs" conf.cbs + "--timesteps" conf.timesteps + ]; + + # This program requires a file called `heat.conf` in the current directory. + # To do it, we run this small script in the `pre` hook, which simply runs + # some commands before running the program. Notice that this command is + # executed in every MPI task. + pre = '' + ln -sf ${nextStage}/etc/heat.conf heat.conf || true + ''; + }; + + # The program stage is only used to specify which program we should run. + # We use this stage to specify build-time parameters such as the gitBranch, + # which will be used to fetch the source code. We use the `override` function + # of the `bsc.garlic.apps.heat` derivation to change the input parameters. + program = {nextStage, conf, ...}: bsc.garlic.apps.heat.override { + inherit (conf) gitBranch; + }; + + # Other stages may be defined here, in case that we want to do something + # additional, like running the program under `perf stat` or set some + # environment variables. + + # Once all the stages are defined, we build the pipeline array. The + # `stdexp.stdPipeline` contains the standard pipeline stages, so we don't need + # to specify them. 
We only specify how we run our program, and what program + # exactly, by adding our `exec` and `program` stages: + pipeline = stdexp.stdPipeline ++ [ exec program ]; + +# Then, we use the `configs` and the `pipeline` just defined inside the `in` +# part, to build the complete experiment: +in + + # The `stdexp.genExperiment` function generates an experiment by calling every + # stage of the pipeline with the different configs, and thus creating + # different units. The result is the top level derivation which is the + # `trebuchet`, which is the script that, when executed, launches the complete + # experiment. + stdexp.genExperiment { inherit configs pipeline; } diff --git a/garlic/exp/index.nix b/garlic/exp/index.nix index d5ddc20..a90daa0 100644 --- a/garlic/exp/index.nix +++ b/garlic/exp/index.nix @@ -103,4 +103,8 @@ impi = callPackage ./osu/impi.nix { }; bwShm = bw.override { interNode = false; }; }; + + examples = { + granularity = callPackage ./examples/granularity.nix { }; + }; } diff --git a/garlic/fig/examples/granularity.R b/garlic/fig/examples/granularity.R new file mode 100644 index 0000000..2fddf5d --- /dev/null +++ b/garlic/fig/examples/granularity.R @@ -0,0 +1,194 @@ +# This R program takes as argument the dataset that contains the results of the +# execution of the heat example experiment and produces some plots. All the +# knowledge to understand how this script works is covered by this nice R book: +# +# Winston Chang, R Graphics Cookbook: Practical Recipes for Visualizing Data, +# O’Reilly Media (2020). 2nd edition +# +# Which can be freely read online here: https://r-graphics.org/ +# +# Please, search in this book before copying some random (and probably outdated) +# reply on Stack Overflow. + +# We load some R packages to import the required functions. 
We mainly use the + # tidyverse packages, which are very good for plotting data. +library(ggplot2) +library(dplyr, warn.conflicts = FALSE) +library(scales) +library(jsonlite) +library(viridis, warn.conflicts = FALSE) + +# Here we simply load the arguments to find the input dataset. If nothing is +# specified we use the file named `input` in the current directory. +# We can run this script directly using: +# Rscript granularity.R [input_file] + +# Load the arguments (argv) +args = commandArgs(trailingOnly=TRUE) + +# Set the input dataset if given in argv[1], or use "input" as default +if (length(args)>0) { input_file = args[1] } else { input_file = "input" } + +# Here we build a dataframe from the input dataset by chaining operations using +# the magrittr operator `%>%`, which is similar to a UNIX pipe. +# First we read the input file, which is expected to be NDJSON +df = jsonlite::stream_in(file(input_file), verbose=FALSE) %>% + + # Then we flatten it, as it may contain dictionaries inside the columns + jsonlite::flatten() %>% + + # Now the dataframe contains all the configuration of the units inside the + # columns named `config.*`, for example `config.cbs`. We first select only + # the columns that we need: + select(config.cbs, config.rbs, unit, time) %>% + + # And then we rename those columns to something shorter: + rename(cbs=config.cbs, rbs=config.rbs) %>% + + # The columns contain the values that we specified in the experiment as + # integers. However, we need to tell R that those values are factors. So we + # apply to those columns the `as.factor()` function: + mutate(cbs = as.factor(cbs)) %>% + mutate(rbs = as.factor(rbs)) %>% + + # The same for the unit (which is the hash that nix has given to each unit) + mutate(unit = as.factor(unit)) %>% + + # Then, we can group our dataset by each unit. This will always work + # independently of the variables that vary from unit to unit. + group_by(unit) %>% + + # And compute some metrics which are applied to each group. 
For example we + # compute the median time within the runs of a unit: + mutate(median.time = median(time)) %>% + mutate(normalized.time = time / median.time - 1) %>% + mutate(log.median.time = log(median.time)) %>% + + # Then, we remove the grouping. This step is very important, otherwise the + # plotting functions get confused: + ungroup() + + +# These constants will be used when creating the plots. We use high quality +# images with 300 dots per inch and 5 x 5 inches of size by default. +dpi = 300 +h = 5 +w = 5 + + +# --------------------------------------------------------------------- + + +# We plot the median time (of each unit) as we vary the block size. As we vary +# both the cbs and rbs, we plot cbs while fixing rbs at a time. +p = ggplot(df, aes(x=cbs, y=median.time, color=rbs)) + + # We add a point to the median + geom_point() + + + # We also add the lines to connect the points. We need to specify which + # variable will do the grouping, otherwise we will have one line per point. 
+ geom_line(aes(group=rbs)) + + + # The bw theme is recommended for publications + theme_bw() + + + # Here we add the title and the labels of the axes + labs(x="cbs", y="Median time (s)", title="Heat granularity: median time", + subtitle=input_file) + + + # And set the subtitle font size a bit smaller, so it fits nicely + theme(plot.subtitle=element_text(size=8)) + +# Then, we save the plot both in png and pdf +ggsave("median.time.png", plot=p, width=w, height=h, dpi=dpi) +ggsave("median.time.pdf", plot=p, width=w, height=h, dpi=dpi) + + +# --------------------------------------------------------------------- + + +# Another interesting plot is the normalized time, which shows the variance of +# the execution times, and can be used to find problems: +p = ggplot(df, aes(x=cbs, y=normalized.time)) + + + # The boxplots are useful to identify outliers and problems with the + # distribution of time + geom_boxplot() + + + # We add a line to mark the 1% limit above and below the median + geom_hline(yintercept=c(-0.01, 0.01), linetype="dashed", color="red") + + + # We split the plot into subplots, one for each value of the rbs column + facet_wrap(~ rbs) + + + # The bw theme is recommended for publications + theme_bw() + + + # Here we add the title and the labels of the axes + labs(x="cbs", y="Normalized time", title="Heat granularity: normalized time", + subtitle=input_file) + + + # And set the subtitle font size a bit smaller, so it fits nicely + theme(plot.subtitle=element_text(size=8)) + +# Then, we save the plot both in png and pdf +ggsave("normalized.time.png", plot=p, width=w, height=h, dpi=dpi) +ggsave("normalized.time.pdf", plot=p, width=w, height=h, dpi=dpi) + + +# --------------------------------------------------------------------- + + +# We plot the time of each run as we vary the block size +p = ggplot(df, aes(x=cbs, y=time, color=rbs)) + + + # We add a points (scatter plot) using circles (shape=21) a bit larger + # than the default (size=3) + 
geom_point(shape=21, size=3) + + # The bw theme is recommended for publications + theme_bw() + + # Here we add the title and the labels of the axes + labs(x="cbs", y="Time (s)", title="Heat granularity: time", + subtitle=input_file) + + # And set the subtitle font size a bit smaller, so it fits nicely + theme(plot.subtitle=element_text(size=8)) + +# Then, we save the plot both in png and pdf +ggsave("time.png", plot=p, width=w, height=h, dpi=dpi) +ggsave("time.pdf", plot=p, width=w, height=h, dpi=dpi) + + +# --------------------------------------------------------------------- + + +# We can also plot both cbs and rbs in each dimension by mapping the time with a +# color. The `fill` argument instructs R to use the `median.time` as color +p = ggplot(df, aes(x=cbs, y=rbs, fill=median.time)) + + + # Then we use the geom_raster method to paint rectangles filled with color + geom_raster() + + + # The colors are set using the viridis package, using the plasma palette. Those + # colors are designed to be safe for color impaired people and also when + # converting the figures to grayscale. 
+ scale_fill_viridis(option="plasma") + + # We also force each tile to be a square + coord_fixed() + + # The bw theme is recommended for publications + theme_bw() + + # Here we add the title and the labels of the axes + labs(x="cbs", y="rbs", title="Heat granularity: time", + subtitle=input_file) + + # And set the subtitle font size a bit smaller, so it fits nicely + theme(plot.subtitle=element_text(size=8)) + +# Then, we save the plot both in png and pdf +ggsave("time.heatmap.png", plot=p, width=w, height=h, dpi=dpi) +ggsave("time.heatmap.pdf", plot=p, width=w, height=h, dpi=dpi) diff --git a/garlic/fig/index.nix b/garlic/fig/index.nix index 3219ae8..1b370ae 100644 --- a/garlic/fig/index.nix +++ b/garlic/fig/index.nix @@ -74,4 +74,8 @@ in "osu/bwShm" = osu.bwShm; "heat/cache" = heat.cache; }; + + examples = with exp.examples; { + granularity = stdPlot ./examples/granularity.R [ granularity ]; + }; }