1
1

Properly mark a node as down and decrease the number of daemons so that any
subsequent grpcomm collectives can operate correctly. Note that only the
direct grpcomm component knows how to deal with down nodes.
Этот коммит содержится в:
Ralph Castain 2016-10-21 09:53:37 -07:00
родитель 2a9f818d24
Коммит df8ac7b747
2 изменённых файла: 6 добавлений и 1 удаление

Просмотреть файл

@ -331,6 +331,10 @@ static void proc_errors(int fd, short args, void *cbdata)
} }
/* mark the daemon as gone */ /* mark the daemon as gone */
ORTE_FLAG_UNSET(pptr, ORTE_PROC_FLAG_ALIVE); ORTE_FLAG_UNSET(pptr, ORTE_PROC_FLAG_ALIVE);
/* update the state */
pptr->state = state;
/* adjust our num_procs */
--orte_process_info.num_procs;
/* if we have ordered orteds to terminate or abort /* if we have ordered orteds to terminate or abort
* is in progress, record it */ * is in progress, record it */
if (orte_orteds_term_ordered || orte_abnormal_term_ordered) { if (orte_orteds_term_ordered || orte_abnormal_term_ordered) {

Просмотреть файл

@ -432,7 +432,8 @@ static void xcast_recv(int status, orte_process_name_t* sender,
OBJ_RELEASE(item); OBJ_RELEASE(item);
continue; continue;
} }
if (ORTE_PROC_STATE_RUNNING < rec->state) { if (ORTE_PROC_STATE_RUNNING < rec->state ||
!ORTE_FLAG_TEST(rec, ORTE_PROC_FLAG_ALIVE)) {
opal_output(0, "%s grpcomm:direct:send_relay proc %s not running - cannot relay", opal_output(0, "%s grpcomm:direct:send_relay proc %s not running - cannot relay",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&nm->name)); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&nm->name));
OBJ_RELEASE(rly); OBJ_RELEASE(rly);