1
1

Adjust some default values, and ensure we don't start sending too soon

This commit was SVN r23492.
Этот коммит содержится в:
Ralph Castain 2010-07-23 19:37:16 +00:00
родитель 140e427a79
Коммит ff2d573f7e
2 изменённых файлов: 24 добавлений и 10 удалений

Просмотреть файл

@ -210,10 +210,14 @@ static void send_heartbeat(int fd, short event, void *arg)
int rc; int rc;
/* if we are aborting or shutting down, ignore this */ /* if we are aborting or shutting down, ignore this */
if (orte_abnormal_term_ordered || orte_finalizing) { if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) {
return; return;
} }
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
"%s sending heartbeat",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* setup the buffer - nothing to pack as receipt alone is the "beat" */ /* setup the buffer - nothing to pack as receipt alone is the "beat" */
buf = OBJ_NEW(opal_buffer_t); buf = OBJ_NEW(opal_buffer_t);
@ -257,7 +261,7 @@ static void check_heartbeat(int fd, short dummy, void *arg)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* if we are aborting or shutting down, ignore this */ /* if we are aborting or shutting down, ignore this */
if (orte_abnormal_term_ordered || orte_finalizing) { if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) {
return; return;
} }
@ -304,14 +308,19 @@ static void recv_rmcast_beats(int status,
orte_nid_t *nid; orte_nid_t *nid;
/* if we are aborting or shutting down, ignore this */ /* if we are aborting or shutting down, ignore this */
if (orte_abnormal_term_ordered || orte_finalizing) { if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) {
return; return;
} }
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
"%s recvd heartbeat from %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(sender)));
/* get this daemon's nid */ /* get this daemon's nid */
if (NULL == (nid = orte_util_lookup_nid(sender))) { if (NULL == (nid = orte_util_lookup_nid(sender))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return; exit(1);
} }
/* update its time */ /* update its time */
@ -335,10 +344,15 @@ static void recv_rml_beats(int status, orte_process_name_t* sender,
orte_nid_t *nid; orte_nid_t *nid;
/* if we are aborting or shutting down, ignore this */ /* if we are aborting or shutting down, ignore this */
if (orte_abnormal_term_ordered || orte_finalizing) { if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) {
return; return;
} }
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
"%s recvd heartbeat from %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(sender)));
/* get this daemon's nid */ /* get this daemon's nid */
if (NULL == (nid = orte_util_lookup_nid(sender))) { if (NULL == (nid = orte_util_lookup_nid(sender))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);

Просмотреть файл

@ -59,13 +59,13 @@ static int orte_sensor_heartbeat_open(void)
/* lookup parameters */ /* lookup parameters */
mca_base_param_reg_int(c, "beat", mca_base_param_reg_int(c, "beat",
"Heartbeat rate in milliseconds (default=1)", "Heartbeat rate in milliseconds (default=100)",
false, false, 1, &tmp); false, false, 100, &tmp);
mca_sensor_heartbeat_component.beat = tmp; mca_sensor_heartbeat_component.beat = tmp;
mca_base_param_reg_int(c, "check", mca_base_param_reg_int(c, "check",
"Check for failure rate in milliseconds (default=5)", "Check for failure rate in milliseconds (default=500)",
false, false, 5, &tmp); false, false, 500, &tmp);
mca_sensor_heartbeat_component.check = tmp; mca_sensor_heartbeat_component.check = tmp;
mca_base_param_reg_int(c, "missed", mca_base_param_reg_int(c, "missed",