ovni/src/emu/ovnisort.c

608 lines
13 KiB
C

/* Copyright (c) 2021-2024 Barcelona Supercomputing Center (BSC)
* SPDX-License-Identifier: GPL-3.0-or-later */
/* This program is a really bad idea. It attempts to sort streams by using a
* window of the last N events in memory, so we can quickly access the event
* clocks. Then, as soon as we detect a region of potentially unsorted events,
* we go back until we find a suitable position and start injecting the events
* in order.
*
* The events inside an unsorted region may not be ordered, they will be sorted
* by qsort() first. The number of events that we will look back is limited by
* N.
*/
#include <fcntl.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "common.h"
#include "ovni.h"
#include "stream.h"
#include "trace.h"
/* Circular window with pointers to the most recent events of a stream. As
 * ring_add() moves head forward whenever tail catches up with it, at most
 * size - 1 events are kept at any time. */
struct ring {
/* Index of the oldest event kept in the window */
ssize_t head;
/* Index of the slot where the next event will be stored */
ssize_t tail;
/* Number of slots in the ev array */
ssize_t size;
/* Array of slots holding pointers to the events (not copies) */
struct ovni_ev **ev;
};
/* Describes one unsorted region that is about to be sorted, along with
 * the resources needed to write the result back. */
struct sortplan {
/* The first event of the region which needs sorting */
struct ovni_ev *bad0;
/* The next event which must be not affected: the sort stops right
 * before it. It is set to the event that closes the region. */
struct ovni_ev *next;
/* Pointer to the stream buffer */
uint8_t *base;
/* Ring with the last events seen in the stream */
struct ring *r;
/* File descriptor of the stream file */
int fd;
};
/* What to do with each stream: SORT rewrites the unsorted regions in place,
 * CHECK only verifies that the clocks never go backwards. */
enum operation_mode { SORT,
CHECK };
/* Trace directory taken from the command line */
static char *tracedir = NULL;
/* Selected mode; SORT unless -c is given */
static enum operation_mode operation_mode = SORT;
/* Number of slots of the look back ring; can be changed with -n */
static size_t max_look_back = 1000000;
/* Empties the ring by placing both head and tail at slot 0. The pointers
 * stored in the slots are left as they are. */
static void
ring_reset(struct ring *r)
{
	r->tail = 0;
	r->head = 0;
}
/* Stores ev in the slot pointed by tail and advances tail. When tail reaches
 * head, head is moved one slot forward, so the oldest event is forgotten. */
static void
ring_add(struct ring *r, struct ovni_ev *ev)
{
	r->ev[r->tail] = ev;

	/* Advance the tail, wrapping around at the end of the array */
	ssize_t next_tail = r->tail + 1;
	if (next_tail >= r->size)
		next_tail = 0;
	r->tail = next_tail;

	/* The tail caught up with the head: drop the oldest event */
	if (r->tail == r->head)
		r->head = r->tail + 1;

	/* The head may have been pushed past the last slot */
	if (r->head >= r->size)
		r->head = 0;
}
/* Walks the ring forward from slot start up to (not including) tail and
 * aborts the program if any event has a clock lower than the previous
 * one. */
static void
ring_check(struct ring *r, long long start)
{
	uint64_t prev = 0;
	long long i = start;

	while (i != r->tail) {
		uint64_t cur = r->ev[i]->header.clock;

		if (cur < prev) {
			die("ring not sorted at i=%lld, last_clock=%"PRIu64" clock=%"PRIu64 ,
					i, prev, cur);
		}

		prev = cur;
		i = (i + 1) % r->size;
	}
}
/* Searches the ring backwards, from the most recent event towards the
 * oldest one, for the first event with a clock strictly lower than the
 * given clock.
 *
 * Returns the index of that slot. If no such event exists but the ring has
 * not been filled yet, returns the head slot. Otherwise prints an error and
 * returns -1. */
static ssize_t
find_destination(struct ring *r, uint64_t clock)
{
	ssize_t nback = 0;
	uint64_t last_clock = 0;

	/* Slot of the most recent event, and the slot just before the
	 * oldest one, where the search must stop */
	ssize_t newest = (r->tail > 0) ? r->tail - 1 : r->size - 1;
	ssize_t stop = (r->head > 0) ? r->head - 1 : r->size - 1;

	ssize_t i = newest;
	while (i != stop) {
		last_clock = r->ev[i]->header.clock;

		if (last_clock < clock) {
			dbg("found suitable position %zd events backwards",
					nback);
			return i;
		}

		nback++;
		i = (i > 0) ? i - 1 : r->size - 1;
	}

	/* If there is no event with a lower clock and we haven't filled the
	 * ring yet, then we are at the beginning and no other event has been
	 * emitted before the sort window. So simply return the first
	 * marker. */
	if (nback < (ssize_t) r->size - 1) {
		if (r->head != 0)
			die("ring head expected to be 0");

		if (r->tail >= r->size - 1)
			die("ring tail=%zd expected to be less than %zd", r->tail, r->size - 1);

		dbg("starting of ring with nback=%zd", nback);
		return r->head;
	}

	err("cannot find a event previous to clock %"PRIu64, clock);
	err("nback=%zd, last_clock=%"PRIu64, nback, last_clock);

	return -1;
}
/* Returns 1 if ev is the OU[ event that opens an unsorted region and 0
 * otherwise. */
static int
starts_unsorted_region(struct ovni_ev *ev)
{
	if (ev->header.model != 'O')
		return 0;

	if (ev->header.category != 'U')
		return 0;

	return ev->header.value == '[';
}
/* Returns 1 if ev is the OU] event that closes an unsorted region and 0
 * otherwise. */
static int
ends_unsorted_region(struct ovni_ev *ev)
{
	if (ev->header.model != 'O')
		return 0;

	if (ev->header.category != 'U')
		return 0;

	return ev->header.value == ']';
}
/* Returns the lowest clock among the events packed one after another in
 * [src, end). The clock of the event at src is always read, even before
 * comparing src with end, so the range must hold at least one event. */
static uint64_t
find_min_clock(uint8_t *src, uint8_t *end)
{
	uint64_t lowest = ((struct ovni_ev *) src)->header.clock;

	for (uint8_t *cur = src; cur < end;) {
		struct ovni_ev *ev = (struct ovni_ev *) cur;

		if (ev->header.clock < lowest)
			lowest = ev->header.clock;

		/* Events have variable size: jump to the next one */
		cur += ovni_ev_size(ev);
	}

	return lowest;
}
/* Counts the events packed one after another in [src, end), advancing by
 * the size reported by ovni_ev_size() for each of them. */
static long
count_events(uint8_t *src, uint8_t *end)
{
	long count = 0;

	for (uint8_t *cur = src; cur < end; count++)
		cur += ovni_ev_size((struct ovni_ev *) cur);

	return count;
}
/* Fills table with pointers to the first n events packed in buf, in the
 * same order in which they appear in the buffer. */
static void
index_events(struct ovni_ev **table, long n, uint8_t *buf)
{
	uint8_t *cur = buf;
	long i = 0;

	while (i < n) {
		struct ovni_ev *ev = (struct ovni_ev *) cur;

		table[i] = ev;
		cur += ovni_ev_size(ev);
		i++;
	}
}
/* Copies the n events referenced by table into buf, one after another and
 * following the order of the table. */
static void
write_events(struct ovni_ev **table, long n, uint8_t *buf)
{
	uint8_t *dst = buf;
	long i = 0;

	while (i < n) {
		struct ovni_ev *ev = table[i];
		size_t nbytes = (size_t) ovni_ev_size(ev);

		memcpy(dst, ev, nbytes);
		dst += nbytes;

		dbg("injected event %c%c%c at %"PRIu64,
				ev->header.model,
				ev->header.category,
				ev->header.value,
				ev->header.clock);

		i++;
	}
}
static int
cmp_ev(const void *a, const void *b)
{
struct ovni_ev **pev1 = (struct ovni_ev **) a;
struct ovni_ev **pev2 = (struct ovni_ev **) b;
struct ovni_ev *ev1 = *pev1;
struct ovni_ev *ev2 = *pev2;
int64_t clock1 = (int64_t) ev1->header.clock;
int64_t clock2 = (int64_t) ev2->header.clock;
if (clock1 < clock2)
return -1;
if (clock1 > clock2)
return +1;
else
return 0;
}
/* Sorts by clock the events packed in the bufsize bytes starting at src and
 * writes them, packed again, into buf. The events are first copied to a
 * temporary buffer, so src itself is only read.
 *
 * Aborts the program if memory cannot be allocated. */
static void
sort_buf(uint8_t *src, uint8_t *buf, int64_t bufsize)
{
	struct ovni_ev *ev = (struct ovni_ev *) src;

	dbg("first event before sorting %c%c%c at %"PRIu64,
			ev->header.model,
			ev->header.category,
			ev->header.value,
			ev->header.clock);

	/* Create a copy of the array */
	uint8_t *buf2 = malloc((size_t) bufsize);
	if (buf2 == NULL)
		die("malloc failed:");

	memcpy(buf2, src, (size_t) bufsize);

	/* Build a table of pointers into the copy, so we only shuffle
	 * pointers while sorting the variable size events */
	long n = count_events(buf2, buf2 + bufsize);
	struct ovni_ev **table = calloc((size_t) n, sizeof(struct ovni_ev *));
	if (table == NULL)
		die("calloc failed:");

	index_events(table, n, buf2);
	qsort(table, (size_t) n, sizeof(struct ovni_ev *), cmp_ev);
	write_events(table, n, buf);

	/* The sorted events are in buf now, so report the first event from
	 * there and not from src, which still holds the unsorted data */
	ev = (struct ovni_ev *) buf;

	dbg("first event after sorting %c%c%c at %"PRIu64,
			ev->header.model,
			ev->header.category,
			ev->header.value,
			ev->header.clock);

	free(table);
	free(buf2);

	dbg("sorted %ld events", n);
}
/* Writes size bytes from src into the file fd. The file offset is the
 * distance from base to dst, so dst selects the position of the file to be
 * overwritten. Short writes are continued until all the bytes are written.
 *
 * Aborts the program if pwrite() fails. */
static void
write_stream(int fd, void *base, void *dst, const void *src, size_t size)
{
	const uint8_t *from = src;
	uint8_t *to = dst;
	size_t left = size;

	while (left > 0) {
		off_t offset = (off_t) to - (off_t) base;
		ssize_t written = pwrite(fd, from, left, offset);

		if (written < 0)
			die("pwrite failed:");

		/* Skip what has been written already */
		left -= (size_t) written;
		from += written;
		to += written;
	}
}
/* Makes the ring slots from start up to (not including) tail point again
 * to consecutive events, beginning with first. The events are walked using
 * their own size.
 *
 * Aborts the program if the walk reaches last before the slots are
 * exhausted, or if it doesn't end exactly at last. */
static void
rebuild_ring(struct ring *r, long long start, struct ovni_ev *first, struct ovni_ev *last)
{
	long long misplaced = 0;
	long long total = 0;
	struct ovni_ev *cur = first;
	long long i = start;

	while (i != r->tail) {
		total++;

		/* Count the slots that were pointing somewhere else */
		if (r->ev[i] != cur)
			misplaced++;

		if (cur >= last)
			die("exceeding last pointer");

		r->ev[i] = cur;

		size_t nbytes = (size_t) ovni_ev_size(cur);
		cur = (struct ovni_ev *) ((uint8_t *) cur + nbytes);

		i = (i + 1 >= r->size) ? 0 : i + 1;
	}

	if (cur != last)
		die("inconsistency: ev != last");

	dbg("rebuilt ring with %lld / %lld misplaced events", misplaced, total);
}
static int
execute_sort_plan(struct sortplan *sp)
{
uint64_t clock0 = sp->bad0->header.clock;
dbg("attempt to sort: start clock %"PRIi64, sp->bad0->header.clock);
uint64_t min_clock = find_min_clock((void *) sp->bad0, (void *) sp->next);
if (min_clock < clock0) {
clock0 = min_clock;
dbg("region not sorted, using min clock=%"PRIi64, clock0);
}
/* Cannot sort in one pass; just fail for now */
int64_t i0 = find_destination(sp->r, clock0);
if (i0 < 0) {
err("cannot find destination for region starting at clock %"PRIi64, clock0);
err("consider increasing the look back size with -n");
return -1;
}
/* Set the pointer to the first event that may be affected */
struct ovni_ev *first = sp->r->ev[i0];
long long dirty = i0;
/* Allocate a working buffer */
uintptr_t bufsize = (uintptr_t) sp->next - (uintptr_t) first;
if (bufsize <= 0)
die("bufsize is non-positive");
uint8_t *buf = malloc(bufsize);
if (!buf)
die("malloc failed:");
sort_buf((uint8_t *) first, buf, (int64_t) bufsize);
write_stream(sp->fd, sp->base, first, buf, bufsize);
free(buf);
/* Pointers from the ring buffer are invalid now, rebuild them */
rebuild_ring(sp->r, dirty, first, sp->next);
/* Invariant: The ring buffer is always sorted here. Check from the
* dirty position onwards, so we avoid scanning all events. */
ring_check(sp->r, dirty);
return 0;
}
/* Sort the events in the stream chronologically using a ring.
 *
 * The stream is read event by event while tracking a small state machine:
 *   'S': outside of any unsorted region
 *   'U': the OU[ event was just seen, the region has no events yet
 *   'X': inside a region that has at least one event
 * When the OU] event is found in state 'X' the region is sorted and
 * written to the stream file. All events are added to the ring as they are
 * read.
 *
 * Returns 0 on success and -1 if a region cannot be sorted or the stream
 * cannot be read. The stream file is closed in all cases. Aborts the
 * program if the file cannot be opened, synced or closed. */
static int
stream_winsort(struct stream *stream, struct ring *r)
{
	char *fn = stream->obspath;
	int fd = open(fn, O_WRONLY);

	if (fd < 0)
		die("open %s failed:", fn);

	ring_reset(r);

	struct sortplan sp = {0};
	sp.r = r;
	sp.fd = fd;
	sp.base = stream->buf;

	size_t empty_regions = 0;
	int updated = 0;
	char st = 'S';
	int ret = 0;

	while ((ret = stream_step(stream)) == 0) {
		struct ovni_ev *ev = stream_ev(stream);

		if (st == 'S' && starts_unsorted_region(ev)) {
			st = 'U';
		} else if (st == 'U') {
			/* Ensure that we have at least one unsorted
			 * event inside the section */
			if (ends_unsorted_region(ev)) {
				empty_regions++;
				st = 'S';
			} else {
				st = 'X';
				sp.bad0 = ev;
			}
		} else if (st == 'X') {
			if (ends_unsorted_region(ev)) {
				updated = 1;
				sp.next = ev;
				dbg("executing sort plan for stream %s",
						stream->relpath);
				if (execute_sort_plan(&sp) < 0) {
					err("sort failed for stream %s",
							stream->relpath);
					/* Don't leak the descriptor */
					close(fd);
					return -1;
				}

				/* Clear markers */
				sp.next = NULL;
				sp.bad0 = NULL;
				st = 'S';
			}
		}

		ring_add(r, ev);
	}

	if (ret < 0) {
		err("stream_step failed");
		/* Don't leak the descriptor */
		close(fd);
		return -1;
	}

	if (empty_regions > 0)
		warn("stream %s contains %zu empty sort regions",
				stream->relpath, empty_regions);

	/* Only force the data to disk if something was written */
	if (updated && fdatasync(fd) < 0)
		die("fdatasync %s failed:", fn);

	if (close(fd) < 0)
		die("close %s failed:", fn);

	return 0;
}
/* Ensures that each individual stream is sorted.
 *
 * Reads all the events of the stream and reports every event with a clock
 * lower than the clock of the previous event.
 *
 * Returns 0 if the stream is sorted (an empty stream is considered sorted)
 * and -1 if a backwards jump was found or the stream cannot be read. */
static int
stream_check(struct stream *stream)
{
	int ret = stream_step(stream);
	if (ret < 0) {
		err("stream_step failed");
		return -1;
	}

	/* Reached the end */
	if (ret != 0)
		return 0;

	struct ovni_ev *ev = stream_ev(stream);
	uint64_t last_clock = ev->header.clock;
	int backjump = 0;

	while ((ret = stream_step(stream)) == 0) {
		ev = stream_ev(stream);
		uint64_t cur_clock = ovni_ev_get_clock(ev);

		if (cur_clock < last_clock) {
			/* Clocks are unsigned, so print them as such */
			err("backwards jump in time %"PRIu64" -> %"PRIu64" for stream %s",
					last_clock, cur_clock, stream->relpath);
			/* Keep going to report all the jumps */
			backjump = 1;
		}

		last_clock = cur_clock;
	}

	if (ret < 0) {
		err("stream_step failed");
		return -1;
	}

	if (backjump)
		return -1;

	return 0;
}
/* Sorts or checks all the streams of the trace, depending on the operation
 * mode. A single ring with max_look_back slots is allocated here and shared
 * by all the streams.
 *
 * In SORT mode it returns -1 as soon as one stream cannot be sorted. In
 * CHECK mode all the streams are checked and -1 is returned at the end if
 * any of them is not sorted. Returns 0 otherwise. Aborts the program if the
 * ring cannot be allocated. */
static int
process_trace(struct trace *trace)
{
	struct ring ring;
	int ret = 0;

	ring.size = (ssize_t) max_look_back;

	/* Use calloc() so that a huge look back size fails cleanly instead
	 * of overflowing the multiplication of the allocation size */
	ring.ev = calloc((size_t) ring.size, sizeof(*ring.ev));

	if (ring.ev == NULL)
		die("calloc failed:");

	for (struct stream *stream = trace->streams; stream; stream = stream->next) {
		/* NOTE(review): presumably makes the stream accept events
		 * with clocks going backwards; confirm in stream.h */
		stream_allow_unsorted(stream);

		if (operation_mode == SORT) {
			dbg("sorting stream %s", stream->relpath);
			if (stream_winsort(stream, &ring) != 0) {
				err("sort stream %s failed", stream->relpath);
				/* When sorting, return at the first
				 * attempt, but don't leak the ring */
				free(ring.ev);
				return -1;
			}
		} else {
			if (stream_check(stream) != 0) {
				info("stream %s is not sorted", stream->relpath);
				/* When checking, report all errors and
				 * then fail */
				ret = -1;
			}
		}
	}

	free(ring.ev);

	if (operation_mode == CHECK) {
		if (ret == 0) {
			info("all streams sorted");
		} else {
			info("streams NOT sorted");
		}
	}

	return ret;
}
/* Prints the usage message, including the current look back size, and
 * exits the program with EXIT_FAILURE. */
static void
usage(void)
{
	/* The synopsis must list all the options accepted by parse_args() */
	rerr("Usage: ovnisort [-c] [-n num] tracedir\n");
	rerr("\n");
	rerr("Sorts the events in each stream of the trace given in\n");
	rerr("tracedir, so they are suitable for the emulator ovniemu.\n");
	rerr("Only the events enclosed by OU[ OU] are sorted. At most a\n");
	/* max_look_back is a size_t, which is printed with %zu */
	rerr("total of %zu events are looked back to insert the unsorted\n",
			max_look_back);
	rerr("events, so the sort procedure can fail with an error.\n");
	rerr("\n");
	rerr("Options:\n");
	rerr(" -c Enable check mode: don't sort, ensure the\n");
	rerr(" trace is already sorted.\n");
	rerr("\n");
	rerr(" -n Set the number of events to look back.\n");
	rerr(" Default: %zu\n", max_look_back);
	rerr("\n");
	rerr(" tracedir The trace directory generated by ovni.\n");
	rerr("\n");

	exit(EXIT_FAILURE);
}
/* Parses the command line options and stores the result in the
 * operation_mode, max_look_back and tracedir globals.
 *
 * On an unknown option, an invalid look back size or a missing tracedir,
 * the usage is printed and the program exits. */
static void
parse_args(int argc, char *argv[])
{
	int opt;

	while ((opt = getopt(argc, argv, "cn:")) != -1) {
		switch (opt) {
			case 'c':
				operation_mode = CHECK;
				break;
			case 'n': {
				char *end = NULL;
				long long n = strtoll(optarg, &end, 10);

				/* Reject garbage, non-positive values (a ring
				 * without slots cannot hold any event) and
				 * sizes whose pointer array doesn't fit in
				 * memory. Out of range numbers are clamped by
				 * strtoll() and fail the last test. */
				if (end == optarg || *end != '\0' || n <= 0
						|| (unsigned long long) n > SIZE_MAX / sizeof(struct ovni_ev *)) {
					err("invalid look back size: %s", optarg);
					usage();
				}

				max_look_back = (size_t) n;
				break;
			}
			default: /* '?' */
				usage();
		}
	}

	if (optind >= argc) {
		err("missing tracedir");
		usage();
	}

	tracedir = argv[optind];
}
/* Entry point: parses the arguments, loads the trace from tracedir and
 * sorts or checks its streams. Debug messages are enabled when the
 * OVNI_DEBUG environment variable is set.
 *
 * Returns 0 on success and 1 on any error. */
int
main(int argc, char *argv[])
{
	progname_set("ovnisort");

	if (getenv("OVNI_DEBUG"))
		enable_debug();

	parse_args(argc, argv);

	struct trace *trace = calloc(1, sizeof(*trace));
	if (!trace) {
		err("calloc failed:");
		return 1;
	}

	if (trace_load(trace, tracedir) != 0) {
		err("failed to load trace: %s", tracedir);
		return 1;
	}

	int failed = (process_trace(trace) != 0);

	free(trace);

	return failed ? 1 : 0;
}