Properly mark a node as down and decrease the number of daemons so that any
subsequent grpcomm collectives can operate correctly. Note that only the direct grpcomm component knows how to handle down nodes.
Этот коммит содержится в:
родитель
2a9f818d24
Коммит
df8ac7b747
@@ -331,6 +331,10 @@ static void proc_errors(int fd, short args, void *cbdata)
|
|||||||
}
|
}
|
||||||
/* mark the daemon as gone */
|
/* mark the daemon as gone */
|
||||||
ORTE_FLAG_UNSET(pptr, ORTE_PROC_FLAG_ALIVE);
|
ORTE_FLAG_UNSET(pptr, ORTE_PROC_FLAG_ALIVE);
|
||||||
|
/* update the state */
|
||||||
|
pptr->state = state;
|
||||||
|
/* adjust our num_procs */
|
||||||
|
--orte_process_info.num_procs;
|
||||||
/* if we have ordered orteds to terminate or abort
|
/* if we have ordered orteds to terminate or abort
|
||||||
* is in progress, record it */
|
* is in progress, record it */
|
||||||
if (orte_orteds_term_ordered || orte_abnormal_term_ordered) {
|
if (orte_orteds_term_ordered || orte_abnormal_term_ordered) {
|
||||||
|
@@ -432,7 +432,8 @@ static void xcast_recv(int status, orte_process_name_t* sender,
|
|||||||
OBJ_RELEASE(item);
|
OBJ_RELEASE(item);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (ORTE_PROC_STATE_RUNNING < rec->state) {
|
if (ORTE_PROC_STATE_RUNNING < rec->state ||
|
||||||
|
!ORTE_FLAG_TEST(rec, ORTE_PROC_FLAG_ALIVE)) {
|
||||||
opal_output(0, "%s grpcomm:direct:send_relay proc %s not running - cannot relay",
|
opal_output(0, "%s grpcomm:direct:send_relay proc %s not running - cannot relay",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&nm->name));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&nm->name));
|
||||||
OBJ_RELEASE(rly);
|
OBJ_RELEASE(rly);
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user