a94438343b
This commit was SVN r20741. The following SVN revision numbers were found above: r20740 --> open-mpi/ompi@2a70618a77
143 строки
4.3 KiB
C
143 строки
4.3 KiB
C
/*
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*
|
|
*/
|
|
|
|
#include "orte_config.h"
|
|
#include "orte/constants.h"
|
|
|
|
#ifdef HAVE_SYS_TIME_H
|
|
#include <sys/time.h>
|
|
#endif
|
|
|
|
#include "opal/dss/dss.h"
|
|
|
|
#include "orte/mca/rml/rml.h"
|
|
#include "orte/mca/rml/rml_types.h"
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
#include "orte/runtime/orte_globals.h"
|
|
#include "orte/runtime/orte_wait.h"
|
|
#include "orte/util/name_fns.h"
|
|
|
|
#include "orte/mca/plm/base/plm_private.h"
|
|
|
|
#define HEARTBEAT_CK 2
|
|
|
|
void orte_plm_base_heartbeat(int fd, short event, void *arg)
|
|
{
|
|
opal_buffer_t buf;
|
|
orte_plm_cmd_flag_t command = ORTE_PLM_HEARTBEAT_CMD;
|
|
opal_event_t *tmp = (opal_event_t*)arg;
|
|
struct timeval now;
|
|
int rc;
|
|
|
|
/* setup the buffer */
|
|
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
|
|
|
/* tell the HNP this is a heartbeat */
|
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &command, 1, ORTE_PLM_CMD))) {
|
|
ORTE_ERROR_LOG(rc);
|
|
goto CLEANUP;
|
|
}
|
|
|
|
/* send heartbeat to HNP */
|
|
if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &buf, ORTE_RML_TAG_PLM, 0))) {
|
|
ORTE_ERROR_LOG(rc);
|
|
goto CLEANUP;
|
|
}
|
|
|
|
/* reset the timer */
|
|
now.tv_sec = orte_heartbeat_rate;
|
|
now.tv_usec = 0;
|
|
opal_evtimer_add(tmp, &now);
|
|
|
|
CLEANUP:
|
|
OBJ_DESTRUCT(&buf);
|
|
}
|
|
|
|
/* this function automatically gets periodically called
|
|
* by the event library so we can check on the state
|
|
* of the various orteds
|
|
*/
|
|
static void check_heartbeat(int fd, short dummy, void *arg)
|
|
{
|
|
orte_vpid_t v;
|
|
orte_proc_t **procs;
|
|
orte_job_t *daemons;
|
|
struct timeval timeout;
|
|
bool died = false;
|
|
opal_event_t *tmp = (opal_event_t*)arg;
|
|
struct timeval now;
|
|
|
|
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
|
"%s plm:base:check_heartbeat",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
|
|
|
/* if we are aborting or shutting down, ignore this */
|
|
if (orte_abnormal_term_ordered || 0 == orte_heartbeat_rate) {
|
|
return;
|
|
}
|
|
|
|
/* get the job object for the daemons */
|
|
if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
|
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
|
return;
|
|
}
|
|
procs = (orte_proc_t**)daemons->procs->addr;
|
|
|
|
/* get current time */
|
|
gettimeofday(&timeout, NULL);
|
|
|
|
/* cycle through the daemons - make sure we check them all
|
|
* in case multiple daemons died so all of those that did die
|
|
* can be appropriately flagged
|
|
*/
|
|
for (v=1; v < daemons->num_procs; v++) {
|
|
if ((timeout.tv_sec - procs[v]->beat) > HEARTBEAT_CK*orte_heartbeat_rate) {
|
|
/* declare this orted dead */
|
|
procs[v]->state = ORTE_PROC_STATE_ABORTED;
|
|
procs[v]->exit_code = ORTE_ERROR_DEFAULT_EXIT_CODE;
|
|
if (NULL == daemons->aborted_proc) {
|
|
daemons->aborted_proc = procs[v];
|
|
}
|
|
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
|
died = true;
|
|
}
|
|
}
|
|
|
|
/* if any daemon died, abort */
|
|
if (died) {
|
|
orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1,
|
|
ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_ABORTED);
|
|
return;
|
|
}
|
|
|
|
/* reset the timer */
|
|
now.tv_sec = HEARTBEAT_CK*orte_heartbeat_rate;
|
|
now.tv_usec = 0;
|
|
opal_evtimer_add(tmp, &now);
|
|
}
|
|
|
|
void orte_plm_base_start_heart(void)
|
|
{
|
|
/* if the heartbeat rate > 0, then start the heart */
|
|
if (0 < orte_heartbeat_rate) {
|
|
ORTE_TIMER_EVENT(HEARTBEAT_CK*orte_heartbeat_rate, 0, check_heartbeat);
|
|
}
|
|
}
|