From df8ac7b7476830ed807ea3c6e59f5c06161d4546 Mon Sep 17 00:00:00 2001
From: Ralph Castain
Date: Fri, 21 Oct 2016 09:53:37 -0700
Subject: [PATCH] Properly mark a node as down and decrease the number of
 daemons so any subsequent grpcomm collectives can correctly operate. Note
 that only the direct grpcomm component knows how to deal with down nodes.

---
 orte/mca/errmgr/dvm/errmgr_dvm.c         | 4 ++++
 orte/mca/grpcomm/direct/grpcomm_direct.c | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/orte/mca/errmgr/dvm/errmgr_dvm.c b/orte/mca/errmgr/dvm/errmgr_dvm.c
index c259ac0253..2e940c9b00 100644
--- a/orte/mca/errmgr/dvm/errmgr_dvm.c
+++ b/orte/mca/errmgr/dvm/errmgr_dvm.c
@@ -331,6 +331,10 @@ static void proc_errors(int fd, short args, void *cbdata)
             }
             /* mark the daemon as gone */
             ORTE_FLAG_UNSET(pptr, ORTE_PROC_FLAG_ALIVE);
+            /* update the state */
+            pptr->state = state;
+            /* adjust our num_procs */
+            --orte_process_info.num_procs;
             /* if we have ordered orteds to terminate or abort
              * is in progress, record it */
             if (orte_orteds_term_ordered || orte_abnormal_term_ordered) {
diff --git a/orte/mca/grpcomm/direct/grpcomm_direct.c b/orte/mca/grpcomm/direct/grpcomm_direct.c
index 7804461b51..a6e41456b9 100644
--- a/orte/mca/grpcomm/direct/grpcomm_direct.c
+++ b/orte/mca/grpcomm/direct/grpcomm_direct.c
@@ -432,7 +432,8 @@ static void xcast_recv(int status, orte_process_name_t* sender,
                 OBJ_RELEASE(item);
                 continue;
             }
-            if (ORTE_PROC_STATE_RUNNING < rec->state) {
+            if (ORTE_PROC_STATE_RUNNING < rec->state ||
+                !ORTE_FLAG_TEST(rec, ORTE_PROC_FLAG_ALIVE)) {
                 opal_output(0, "%s grpcomm:direct:send_relay proc %s not running - cannot relay",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&nm->name));
                 OBJ_RELEASE(rly);