usnic: add more comments/explanation about error cases
If we really get a catastrophic error from a libfabric call, don't bother trying to continue (because data has been corrupted and there's nothing sane left to do). Just call opal_btl_usnic_exit() (which tries to call the PML error callback, but we're so early in the module_init process that this likely hasn't been set up yet, so the job will likely abort).
Этот коммит содержится в:
родитель
51583789fb
Коммит
9c926e5e82
@ -217,13 +217,23 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
|
||||
}
|
||||
|
||||
/* Loop polling for USD destination creation completion (they were
|
||||
individually started in btl_usnic_proc.c) */
|
||||
individually started in btl_usnic_proc.c)
|
||||
|
||||
In the loop below, the num_left value is decremented by a value
|
||||
we get back from the event queue. There are error cases where
|
||||
we (theoretically) can get a corrupted event entry back, and
|
||||
not know how much to decrement num_left. Hence, num_left can
|
||||
be inaccurate. In such cases, this is probably indicative of a
|
||||
larger error. Plus, we're not even all the way through module
|
||||
init yet, so the only sane thing to do is abort. */
|
||||
while (num_left > 0) {
|
||||
opal_btl_usnic_addr_context_t *context;
|
||||
|
||||
ret = fi_eq_sread(module->av_eq, &event, &entry, sizeof(entry), -1, 0);
|
||||
if (sizeof(entry) == ret) {
|
||||
context = entry.context;
|
||||
/* The usnic provider returns the number of inserts
|
||||
completed in entry.data */
|
||||
num_left -= entry.data;
|
||||
free(context);
|
||||
ret = 0;
|
||||
@ -275,8 +285,11 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
|
||||
/* we can't break here, need to finish reaping all inserts */
|
||||
continue;
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
/* If we get here, it means fi_eq_readerr() failed
|
||||
badly, which means something has gone tremendously
|
||||
wrong. Probably the only safe thing to do here is
|
||||
exit. */
|
||||
opal_show_help("help-mpi-btl-usnic.txt",
|
||||
"internal error during init",
|
||||
true,
|
||||
@ -287,13 +300,15 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
|
||||
"Returned != sizeof(err_entry)");
|
||||
ret = OPAL_ERR_OUT_OF_RESOURCE;
|
||||
error_occurred = true;
|
||||
/* we can't break here, need to finish reaping all inserts */
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/* Some kind of error from fi_eq_sread */
|
||||
else {
|
||||
/* Per above, there's really nothing sane left to do
|
||||
but exit */
|
||||
opal_btl_usnic_exit(module);
|
||||
}
|
||||
} else {
|
||||
/* If we get here, it means fi_eq_readerr() failed badly,
|
||||
which means something has gone tremendously wrong.
|
||||
Probably the only safe thing to do here is exit. */
|
||||
opal_show_help("help-mpi-btl-usnic.txt",
|
||||
"internal error during init",
|
||||
true,
|
||||
@ -304,8 +319,10 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
|
||||
"Returned != (sizeof(entry) or -FI_EAVAIL)");
|
||||
ret = OPAL_ERR_OUT_OF_RESOURCE;
|
||||
error_occurred = true;
|
||||
/* we can't break here, need to finish reaping all inserts */
|
||||
continue;
|
||||
|
||||
/* Per above, there's really nothing sane left to do but
|
||||
exit */
|
||||
opal_btl_usnic_exit(module);
|
||||
}
|
||||
}
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user