Store rank information in the libovni thread

Prevents threads from finishing while another thread is still
initializing the rank information, which caused a race when reading the
rank and nranks.
This commit is contained in:
Rodrigo Arias 2024-09-13 08:52:10 +02:00
parent f6fc166a38
commit 16dbc8bf5d

View File

@ -54,6 +54,10 @@ struct ovni_rthread {
struct ovni_rcpu *cpus; struct ovni_rcpu *cpus;
int rank_set;
int rank;
int nranks;
/* Where the stream dir is finally copied */ /* Where the stream dir is finally copied */
char thdir_final[PATH_MAX]; char thdir_final[PATH_MAX];
char thdir[PATH_MAX]; char thdir[PATH_MAX];
@ -78,9 +82,6 @@ struct ovni_rproc {
int pid; int pid;
char loom[OVNI_MAX_HOSTNAME]; char loom[OVNI_MAX_HOSTNAME];
clockid_t clockid; clockid_t clockid;
int rank_set;
int rank;
int nranks;
atomic_int st; atomic_int st;
@ -201,9 +202,12 @@ ovni_proc_set_rank(int rank, int nranks)
if (rproc.st != ST_READY) if (rproc.st != ST_READY)
die("process not ready"); die("process not ready");
rproc.rank_set = 1; if (!rthread.ready)
rproc.rank = rank; die("thread not yet initialized");
rproc.nranks = nranks;
rthread.rank_set = 1;
rthread.rank = rank;
rthread.nranks = nranks;
} }
/* Create $tracedir/loom.$loom/proc.$pid and return it in path. */ /* Create $tracedir/loom.$loom/proc.$pid and return it in path. */
@ -557,10 +561,10 @@ ovni_thread_init(pid_t tid)
static void static void
set_thread_rank(JSON_Object *meta) set_thread_rank(JSON_Object *meta)
{ {
if (json_object_dotset_number(meta, "ovni.rank", rproc.rank) != 0) if (json_object_dotset_number(meta, "ovni.rank", rthread.rank) != 0)
die("json_object_set_number for rank failed"); die("json_object_set_number for rank failed");
if (json_object_dotset_number(meta, "ovni.nranks", rproc.nranks) != 0) if (json_object_dotset_number(meta, "ovni.nranks", rthread.nranks) != 0)
die("json_object_set_number for nranks failed"); die("json_object_set_number for nranks failed");
} }
@ -612,7 +616,7 @@ ovni_thread_free(void)
if (meta == NULL) if (meta == NULL)
die("json_value_get_object failed"); die("json_value_get_object failed");
if (rproc.rank_set) if (rthread.rank_set)
set_thread_rank(meta); set_thread_rank(meta);
/* It can happen there are no CPUs defined if there is another /* It can happen there are no CPUs defined if there is another