From cfbfbb75a20a9755c976f3664001862933292c21 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 27 Jul 2010 16:20:58 +0000 Subject: [PATCH] Don't abort if a race condition causes a nid not to be found This commit was SVN r23513. --- orte/mca/sensor/heartbeat/sensor_heartbeat.c | 34 +++++++++++++------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/orte/mca/sensor/heartbeat/sensor_heartbeat.c b/orte/mca/sensor/heartbeat/sensor_heartbeat.c index b5c8641dc1..8becd08079 100644 --- a/orte/mca/sensor/heartbeat/sensor_heartbeat.c +++ b/orte/mca/sensor/heartbeat/sensor_heartbeat.c @@ -105,6 +105,11 @@ static int init(void) { int rc; + OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, + "%s initializing heartbeat recvs", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + #if ORTE_ENABLE_MULTICAST /* setup multicast recv for heartbeats */ if (ORTE_SUCCESS != (rc = orte_rmcast.recv_buffer_nb(ORTE_RMCAST_SYS_CHANNEL, @@ -317,14 +322,16 @@ static void recv_rmcast_beats(int status, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(sender))); - /* get this daemon's nid */ - if (NULL == (nid = orte_util_lookup_nid(sender))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - exit(1); + /* get this daemon's nid - if it isn't here, just ignore + * as this is caused by a race condition at startup + */ + if (NULL != (nid = orte_util_lookup_nid(sender))) { + OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, + "%s updating beat time for %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(sender))); + nid->beat = gettime(); } - - /* update its time */ - nid->beat = gettime(); } static void rmcast_callback_fn(int status, @@ -353,11 +360,14 @@ static void recv_rml_beats(int status, orte_process_name_t* sender, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(sender))); - /* get this daemon's nid */ - if (NULL == (nid = orte_util_lookup_nid(sender))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - } else { - /* update its time */ + /* get this daemon's nid - if it isn't here, just ignore + * as this is caused by a race condition at startup + */ + if (NULL != (nid = orte_util_lookup_nid(sender))) { + OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, + "%s updating beat time for %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(sender))); nid->beat = gettime(); }