usnic: add more comments/explanation about error cases
If we really get a catastrophic error from a libfabric call, don't bother trying to continue (because data has been corrupted and there's nothing sane left to do). Just call opal_btl_usnic_exit() (which tries to call the PML error callback, but we're so early in the module_init process that this likely hasn't been set up yet, so the job will likely abort).
Этот коммит содержится в:
родитель
51583789fb
Коммит
9c926e5e82
@ -217,13 +217,23 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Loop polling for USD destination creation completion (they were
|
/* Loop polling for USD destination creation completion (they were
|
||||||
individually started in btl_usnic_proc.c) */
|
individually started in btl_usnic_proc.c)
|
||||||
|
|
||||||
|
In the loop below, the num_left value is decremented by a value
|
||||||
|
we get back from the event queue. There are error cases where
|
||||||
|
we (theoretically) can get a corrupted event entry back, and
|
||||||
|
not know how much to decrement num_left. Hence, num_left can
|
||||||
|
be inaccurate. In such cases, this is probably indicative of a
|
||||||
|
larger error. Plus, we're not even all the way through module
|
||||||
|
init yet, so the only sane thing to do is abort. */
|
||||||
while (num_left > 0) {
|
while (num_left > 0) {
|
||||||
opal_btl_usnic_addr_context_t *context;
|
opal_btl_usnic_addr_context_t *context;
|
||||||
|
|
||||||
ret = fi_eq_sread(module->av_eq, &event, &entry, sizeof(entry), -1, 0);
|
ret = fi_eq_sread(module->av_eq, &event, &entry, sizeof(entry), -1, 0);
|
||||||
if (sizeof(entry) == ret) {
|
if (sizeof(entry) == ret) {
|
||||||
context = entry.context;
|
context = entry.context;
|
||||||
|
/* The usnic provider returns the number of inserts
|
||||||
|
completed in entry.data */
|
||||||
num_left -= entry.data;
|
num_left -= entry.data;
|
||||||
free(context);
|
free(context);
|
||||||
ret = 0;
|
ret = 0;
|
||||||
@ -275,8 +285,11 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
|
|||||||
/* we can't break here, need to finish reaping all inserts */
|
/* we can't break here, need to finish reaping all inserts */
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
} else {
|
||||||
else {
|
/* If we get here, it means fi_eq_readerr() failed
|
||||||
|
badly, which means something has gone tremendously
|
||||||
|
wrong. Probably the only safe thing to do here is
|
||||||
|
exit. */
|
||||||
opal_show_help("help-mpi-btl-usnic.txt",
|
opal_show_help("help-mpi-btl-usnic.txt",
|
||||||
"internal error during init",
|
"internal error during init",
|
||||||
true,
|
true,
|
||||||
@ -287,13 +300,15 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
|
|||||||
"Returned != sizeof(err_entry)");
|
"Returned != sizeof(err_entry)");
|
||||||
ret = OPAL_ERR_OUT_OF_RESOURCE;
|
ret = OPAL_ERR_OUT_OF_RESOURCE;
|
||||||
error_occurred = true;
|
error_occurred = true;
|
||||||
/* we can't break here, need to finish reaping all inserts */
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Some kind of error from fi_eq_sread */
|
/* Per above, there's really nothing sane left to do
|
||||||
else {
|
but exit */
|
||||||
|
opal_btl_usnic_exit(module);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
/* If we get here, it means fi_eq_readerr() failed badly,
|
||||||
|
which means something has gone tremendously wrong.
|
||||||
|
Probably the only safe thing to do here is exit. */
|
||||||
opal_show_help("help-mpi-btl-usnic.txt",
|
opal_show_help("help-mpi-btl-usnic.txt",
|
||||||
"internal error during init",
|
"internal error during init",
|
||||||
true,
|
true,
|
||||||
@ -304,8 +319,10 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
|
|||||||
"Returned != (sizeof(entry) or -FI_EAVAIL)");
|
"Returned != (sizeof(entry) or -FI_EAVAIL)");
|
||||||
ret = OPAL_ERR_OUT_OF_RESOURCE;
|
ret = OPAL_ERR_OUT_OF_RESOURCE;
|
||||||
error_occurred = true;
|
error_occurred = true;
|
||||||
/* we can't break here, need to finish reaping all inserts */
|
|
||||||
continue;
|
/* Per above, there's really nothing sane left to do but
|
||||||
|
exit */
|
||||||
|
opal_btl_usnic_exit(module);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user