diff --git a/doc/runtime/distributed.md b/doc/runtime/distributed.md new file mode 100644 index 0000000..ab08335 --- /dev/null +++ b/doc/runtime/distributed.md @@ -0,0 +1,34 @@ +# Distributed traces (MPI) + +The ovni trace is designed to support concurrent programs running in different +nodes in a cluster. It is often the case that the monotonic clocks +(`CLOCK_MONOTONIC`) are not synchronized between machines (in general they +measure the time since boot). + +To generate a coherent Paraver trace, the offsets of the clocks need to be +provided to the emulator too. To do so, run the `ovnisync` program using MPI on +the same nodes your workload will use. If you are using SLURM, you may want to +use something like: + + % srun ./application + % srun ovnisync + +!!! warning + + Beware that you cannot launch two MPI programs inside the same srun session, + you must invoke srun twice. + +By default, it will generate the `ovni/clock-offsets.txt` file, with the +relative offsets to the rank 0 of MPI. The emulator will automatically pick the +offsets when processing the trace. Use the ovnisync `-o` option to select a +different output path (see the `-c` option in ovniemu to load the file). + +Here is an example table with three nodes; all units are in nanoseconds. 
The +standard deviation is less than 1 us: + +``` +rank hostname offset_median offset_mean offset_std +0 xeon01 0 0.000000 0.000000 +1 xeon04 1165382584 1165382582.900000 135.286341 +2 xeon05 3118113507 3118113599.070000 180.571610 +``` diff --git a/emu.c b/emu.c index 3cbd675..19ce4e1 100644 --- a/emu.c +++ b/emu.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 Barcelona Supercomputing Center (BSC) +/* Copyright (c) 2021-2022 Barcelona Supercomputing Center (BSC) * SPDX-License-Identifier: GPL-3.0-or-later */ #define _POSIX_C_SOURCE 200112L @@ -849,14 +849,35 @@ load_clock_offsets(struct ovni_emu *emu) struct ovni_trace *trace; struct ovni_stream *stream; - f = fopen(emu->clock_offset_file, "r"); - - if(f == NULL) + if(emu->clock_offset_file != NULL) { - err("error opening clock offset file %s: %s\n", - emu->clock_offset_file, - strerror(errno)); - exit(EXIT_FAILURE); + f = fopen(emu->clock_offset_file, "r"); + + /* If provided by the user, it must exist */ + if(f == NULL) + { + err("error opening clock offset file %s: %s\n", + emu->clock_offset_file, + strerror(errno)); + exit(EXIT_FAILURE); + } + } + else + { + char path[PATH_MAX]; + if(snprintf(path, PATH_MAX, "%s/clock-offsets.txt", + emu->tracedir) >= PATH_MAX) + { + die("clock offset path too long\n"); + } + + f = fopen(path, "r"); + + if(f == NULL) + { + /* May not exist, but is fine */ + return; + } } /* Ignore header line */ diff --git a/mkdocs.yml b/mkdocs.yml index 2f27a70..5aad544 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -22,6 +22,7 @@ nav: - concepts.md - 'Runtime': - runtime/tracing.md + - runtime/distributed.md - runtime/kernel.md - runtime/trace_spec.md - 'Emulation': diff --git a/ovnisync.c b/ovnisync.c index a45a2ab..a06dc10 100644 --- a/ovnisync.c +++ b/ovnisync.c @@ -1,16 +1,19 @@ -/* Copyright (c) 2021 Barcelona Supercomputing Center (BSC) +/* Copyright (c) 2021-2022 Barcelona Supercomputing Center (BSC) * SPDX-License-Identifier: GPL-3.0-or-later */ -#define _POSIX_C_SOURCE 200112L +#define 
_POSIX_C_SOURCE 200809L +#include #include -#include -#include -#include -#include #include -#include +#include +#include +#include +#include +#include +#include #include +#include #include "ovni.h" @@ -50,6 +53,7 @@ struct options { int ndrift_samples; int drift_wait; /* in seconds */ int verbose; + char *outpath; }; static double @@ -91,11 +95,59 @@ usage(void) { fprintf(stderr, "%s: clock synchronization utility\n", progname); fprintf(stderr, "\n"); - fprintf(stderr, "Usage: %s [-d ndrift_samples] [-v] [-n nsamples] [-w drift_delay]\n", + fprintf(stderr, "Usage: %s [-o outfile] [-d ndrift_samples] [-v] [-n nsamples] [-w drift_delay]\n", progname); exit(EXIT_FAILURE); } +static int +try_mkdir(const char *path, mode_t mode) +{ + struct stat st; + + if(stat(path, &st) != 0) + { + /* Directory does not exist */ + return mkdir(path, mode); + } + else if(!S_ISDIR(st.st_mode)) + { + errno = ENOTDIR; + return -1; + } + + return 0; +} + +static int +mkpath(const char *path, mode_t mode) +{ + char *pp; + char *sp; + int status; + char *copypath = strdup(path); + + /* Remove trailing slash */ + int last = strlen(path) - 1; + while (last > 0 && copypath[last] == '/') + copypath[last--] = '\0'; + + status = 0; + pp = copypath; + while (status == 0 && (sp = strchr(pp, '/')) != 0) { + if (sp != pp) { + /* Neither root nor double slash in path */ + *sp = '\0'; + status = try_mkdir(copypath, mode); + *sp = '/'; + } + pp = sp + 1; + } + + free(copypath); + return status; +} + static void parse_options(struct options *options, int argc, char *argv[]) { @@ -106,8 +158,9 @@ parse_options(struct options *options, int argc, char *argv[]) options->nsamples = 100; options->verbose = 0; options->drift_wait = 5; + options->outpath = "ovni/clock-offsets.txt"; - while ((opt = getopt(argc, argv, "d:vn:w:")) != -1) { + while ((opt = getopt(argc, argv, "d:vn:w:o:h")) != -1) { switch (opt) { case 'd': options->ndrift_samples = atoi(optarg); @@ -121,6 +174,10 @@ parse_options(struct options 
*options, int argc, char *argv[]) case 'n': options->nsamples = atoi(optarg); break; + case 'o': + options->outpath = optarg; + break; + case 'h': default: /* '?' */ usage(); } @@ -326,46 +383,48 @@ build_offset_table(int nsamples, int rank, int verbose) } static void -print_drift_header(struct offset_table *table) +print_drift_header(FILE *out, struct offset_table *table) { int i; //char buf[64]; - printf("%-20s", "wallclock"); + fprintf(out, "%-20s", "wallclock"); for(i=0; inprocs; i++) { //sprintf(buf, "rank%d", i); - printf(" %-20s", table->offset[i]->hostname); + fprintf(out, " %-20s", table->offset[i]->hostname); } - printf("\n"); + fprintf(out, "\n"); } static void -print_drift_row(struct offset_table *table) +print_drift_row(FILE *out, struct offset_table *table) { int i; - printf("%-20f", table->offset[0]->wall_t1); + fprintf(out, "%-20f", table->offset[0]->wall_t1); for(i=0; inprocs; i++) - printf(" %-20ld", table->offset[i]->offset); + fprintf(out, " %-20ld", table->offset[i]->offset); - printf("\n"); + fprintf(out, "\n"); } static void -print_table_detailed(struct offset_table *table) +print_table_detailed(FILE *out, struct offset_table *table) { int i; struct offset *offset; - printf("%-10s %-20s %-20s %-20s %-20s\n", "rank", "hostname", "offset_median", "offset_mean", "offset_std"); + fprintf(out, "%-10s %-20s %-20s %-20s %-20s\n", + "rank", "hostname", "offset_median", "offset_mean", "offset_std"); + for(i=0; inprocs; i++) { offset = table->offset[i]; - printf("%-10d %-20s %-20ld %-20f %-20f\n", + fprintf(out, "%-10d %-20s %-20ld %-20f %-20f\n", i, offset->hostname, offset->offset, offset->delta_mean, offset->delta_std); } @@ -377,9 +436,28 @@ do_work(struct options *options, int rank) int drift_mode; int i; struct offset_table *table; + FILE *out = NULL; drift_mode = options->ndrift_samples > 1 ? 
1 : 0; + if(rank == 0) + { + if(mkpath(options->outpath, 0755) != 0) + { + fprintf(stderr, "mkpath(%s) failed: %s\n", + options->outpath, strerror(errno)); + exit(EXIT_FAILURE); + } + + out = fopen(options->outpath, "w"); + if(out == NULL) + { + fprintf(stderr, "fopen(%s) failed: %s\n", + options->outpath, strerror(errno)); + exit(EXIT_FAILURE); + } + } + for(i=0; indrift_samples; i++) { table = build_offset_table(options->nsamples, rank, options->verbose); @@ -389,13 +467,13 @@ do_work(struct options *options, int rank) if(drift_mode) { if(i == 0) - print_drift_header(table); + print_drift_header(out, table); - print_drift_row(table); + print_drift_row(out, table); } else { - print_table_detailed(table); + print_table_detailed(out, table); } free(table->_offset); @@ -406,6 +484,9 @@ do_work(struct options *options, int rank) if(drift_mode) sleep(options->drift_wait); } + + if(rank == 0) + fclose(out); } int