1
1

Don't abort if a race condition causes a nid not to be found

This commit was SVN r23513.
Этот коммит содержится в:
Ralph Castain 2010-07-27 16:20:58 +00:00
родитель 06fe2c4c20
Коммит cfbfbb75a2

Просмотреть файл

@@ -105,6 +105,11 @@ static int init(void)
{ {
int rc; int rc;
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
"%s initializing heartbeat recvs",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
#if ORTE_ENABLE_MULTICAST #if ORTE_ENABLE_MULTICAST
/* setup multicast recv for heartbeats */ /* setup multicast recv for heartbeats */
if (ORTE_SUCCESS != (rc = orte_rmcast.recv_buffer_nb(ORTE_RMCAST_SYS_CHANNEL, if (ORTE_SUCCESS != (rc = orte_rmcast.recv_buffer_nb(ORTE_RMCAST_SYS_CHANNEL,
@@ -317,14 +322,16 @@ static void recv_rmcast_beats(int status,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(sender))); ORTE_NAME_PRINT(sender)));
/* get this daemon's nid */ /* get this daemon's nid - if it isn't here, just ignore
if (NULL == (nid = orte_util_lookup_nid(sender))) { * as this is caused by a race condition at startup
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); */
exit(1); if (NULL != (nid = orte_util_lookup_nid(sender))) {
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
"%s updating beat time for %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(sender)));
nid->beat = gettime();
} }
/* update its time */
nid->beat = gettime();
} }
static void rmcast_callback_fn(int status, static void rmcast_callback_fn(int status,
@@ -353,11 +360,14 @@ static void recv_rml_beats(int status, orte_process_name_t* sender,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(sender))); ORTE_NAME_PRINT(sender)));
/* get this daemon's nid */ /* get this daemon's nid - if it isn't here, just ignore
if (NULL == (nid = orte_util_lookup_nid(sender))) { * as this is caused by a race condition at startup
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); */
} else { if (NULL != (nid = orte_util_lookup_nid(sender))) {
/* update its time */ OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
"%s updating beat time for %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(sender)));
nid->beat = gettime(); nid->beat = gettime();
} }