Don't abort if a race condition at startup causes a daemon's nid not to be found; ignore the heartbeat instead.
This commit was SVN r23513.
This commit is contained in:
parent
06fe2c4c20
commit
cfbfbb75a2
@ -105,6 +105,11 @@ static int init(void)
|
|||||||
{
|
{
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
|
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||||
|
"%s initializing heartbeat recvs",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
|
|
||||||
|
|
||||||
#if ORTE_ENABLE_MULTICAST
|
#if ORTE_ENABLE_MULTICAST
|
||||||
/* setup multicast recv for heartbeats */
|
/* setup multicast recv for heartbeats */
|
||||||
if (ORTE_SUCCESS != (rc = orte_rmcast.recv_buffer_nb(ORTE_RMCAST_SYS_CHANNEL,
|
if (ORTE_SUCCESS != (rc = orte_rmcast.recv_buffer_nb(ORTE_RMCAST_SYS_CHANNEL,
|
||||||
@ -317,14 +322,16 @@ static void recv_rmcast_beats(int status,
|
|||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(sender)));
|
ORTE_NAME_PRINT(sender)));
|
||||||
|
|
||||||
/* get this daemon's nid */
|
/* get this daemon's nid - if it isn't here, just ignore
|
||||||
if (NULL == (nid = orte_util_lookup_nid(sender))) {
|
* as this is caused by a race condition at startup
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
*/
|
||||||
exit(1);
|
if (NULL != (nid = orte_util_lookup_nid(sender))) {
|
||||||
|
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||||
|
"%s updating beat time for %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(sender)));
|
||||||
|
nid->beat = gettime();
|
||||||
}
|
}
|
||||||
|
|
||||||
/* update its time */
|
|
||||||
nid->beat = gettime();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void rmcast_callback_fn(int status,
|
static void rmcast_callback_fn(int status,
|
||||||
@ -353,11 +360,14 @@ static void recv_rml_beats(int status, orte_process_name_t* sender,
|
|||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(sender)));
|
ORTE_NAME_PRINT(sender)));
|
||||||
|
|
||||||
/* get this daemon's nid */
|
/* get this daemon's nid - if it isn't here, just ignore
|
||||||
if (NULL == (nid = orte_util_lookup_nid(sender))) {
|
* as this is caused by a race condition at startup
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
*/
|
||||||
} else {
|
if (NULL != (nid = orte_util_lookup_nid(sender))) {
|
||||||
/* update its time */
|
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||||
|
"%s updating beat time for %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(sender)));
|
||||||
nid->beat = gettime();
|
nid->beat = gettime();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in new issue
Block a user