Create a new "heartbeat" module in the sensor framework and move the plm_base heartbeat code there. Add new proc and job states for heartbeat_failed. Remove the "heartbeat" cmd line option for orted as this is now done automatically if the --enable-heartbeat configure option is set.
This commit was SVN r23102.
Этот коммит содержится в:
родитель
99f223210d
Коммит
2ff1ae13e1
@ -92,6 +92,9 @@ else
|
||||
AC_MSG_RESULT([no])
|
||||
orte_want_multicast=0
|
||||
fi
|
||||
AC_DEFINE_UNQUOTED([ORTE_ENABLE_MULTICAST],
|
||||
[$orte_want_multicast],
|
||||
[Whether we want multicast messaging enabled])
|
||||
|
||||
#
|
||||
# Do we want sensors enabled?
|
||||
@ -111,4 +114,22 @@ AC_DEFINE_UNQUOTED([ORTE_ENABLE_SENSORS],
|
||||
[$orte_want_sensors],
|
||||
[Whether we want sensors enabled])
|
||||
|
||||
#
|
||||
# Do we want daemon heartbeats enabled?
|
||||
|
||||
AC_MSG_CHECKING([if want daemon heartbeats])
|
||||
AC_ARG_ENABLE([heartbeat],
|
||||
[AC_HELP_STRING([--enable-heartbeat],
|
||||
[Enable heartbeat monitoring of daemons (default: disabled)])])
|
||||
if test "$enable_heartbeat" = "yes"; then
|
||||
AC_MSG_RESULT([yes])
|
||||
orte_want_heartbeats=1
|
||||
else
|
||||
AC_MSG_RESULT([no])
|
||||
orte_want_heartbeats=0
|
||||
fi
|
||||
AC_DEFINE_UNQUOTED([ORTE_ENABLE_HEARTBEAT],
|
||||
[$orte_want_heartbeats],
|
||||
[Whether we want daemon heartbeat monitoring enabled])
|
||||
|
||||
])dnl
|
||||
|
@ -35,9 +35,7 @@
|
||||
#include "orte/mca/plm/base/plm_private.h"
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/rmaps/rmaps_types.h"
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
#endif
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
@ -721,21 +719,19 @@ static void check_job_complete(orte_job_t *jdata)
|
||||
}
|
||||
}
|
||||
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
if (jdata->abort) {
|
||||
/* the job aborted - turn off any sensors on this job */
|
||||
orte_sensor.stop(jdata->jobid);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (ORTE_JOB_STATE_UNTERMINATED > jdata->state &&
|
||||
jdata->num_terminated >= jdata->num_procs) {
|
||||
/* this job has terminated */
|
||||
jdata->state = ORTE_JOB_STATE_TERMINATED;
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
|
||||
/* turn off any sensor monitors on this job */
|
||||
orte_sensor.stop(jdata->jobid);
|
||||
#endif
|
||||
|
||||
if (0 < non_zero) {
|
||||
/* warn user */
|
||||
opal_output(orte_clean_output,
|
||||
@ -902,6 +898,11 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
|
||||
orte_proc_t proc;
|
||||
int rc;
|
||||
|
||||
/* stop local sensors for this job */
|
||||
if (ORTE_VPID_WILDCARD == vpid) {
|
||||
orte_sensor.stop(job);
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&cmd, opal_pointer_array_t);
|
||||
OBJ_CONSTRUCT(&proc, orte_proc_t);
|
||||
proc.name.jobid = job;
|
||||
|
@ -61,9 +61,8 @@
|
||||
#include "orte/mca/notifier/base/base.h"
|
||||
#include "orte/mca/rmcast/base/base.h"
|
||||
#include "orte/mca/db/base/base.h"
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#endif
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
#include "orte/runtime/orte_cr.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
@ -419,7 +418,6 @@ int orte_ess_base_orted_setup(char **hosts)
|
||||
goto error;
|
||||
}
|
||||
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
/* setup the SENSOR framework */
|
||||
if (ORTE_SUCCESS != (ret = orte_sensor_base_open())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
@ -431,8 +429,9 @@ int orte_ess_base_orted_setup(char **hosts)
|
||||
error = "ortesensor_select";
|
||||
goto error;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* start the local sensors */
|
||||
orte_sensor.start(ORTE_PROC_MY_NAME->jobid);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
||||
error:
|
||||
@ -445,6 +444,9 @@ error:
|
||||
|
||||
int orte_ess_base_orted_finalize(void)
|
||||
{
|
||||
/* stop the local sensors */
|
||||
orte_sensor.stop(ORTE_PROC_MY_NAME->jobid);
|
||||
|
||||
/* ensure all the orteds depart together */
|
||||
if (!orte_abnormal_term_ordered) {
|
||||
/* if we are abnormally terminating, don't attempt
|
||||
@ -454,9 +456,7 @@ int orte_ess_base_orted_finalize(void)
|
||||
orte_grpcomm.onesided_barrier();
|
||||
}
|
||||
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
orte_sensor_base_close();
|
||||
#endif
|
||||
orte_db_base_close();
|
||||
orte_notifier_base_close();
|
||||
|
||||
|
@ -57,9 +57,8 @@
|
||||
#include "orte/mca/notifier/base/base.h"
|
||||
#include "orte/mca/rmcast/base/base.h"
|
||||
#include "orte/mca/db/base/base.h"
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#endif
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
@ -540,7 +539,6 @@ static int rte_init(void)
|
||||
goto error;
|
||||
}
|
||||
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
/* setup the SENSOR framework */
|
||||
if (ORTE_SUCCESS != (ret = orte_sensor_base_open())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
@ -549,11 +547,12 @@ static int rte_init(void)
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_sensor_base_select())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "ortesensor_select";
|
||||
error = "orte_sensor_select";
|
||||
goto error;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* start the local sensors */
|
||||
orte_sensor.start(ORTE_PROC_MY_NAME->jobid);
|
||||
|
||||
/* if a tool has launched us and is requesting event reports,
|
||||
* then set its contact info into the comm system
|
||||
*/
|
||||
@ -603,15 +602,16 @@ static int rte_finalize(void)
|
||||
orte_job_t *job;
|
||||
int i;
|
||||
|
||||
/* stop the local sensors */
|
||||
orte_sensor.stop(ORTE_PROC_MY_NAME->jobid);
|
||||
|
||||
/* remove my contact info file */
|
||||
contact_path = opal_os_path(false, orte_process_info.top_session_dir,
|
||||
"contact.txt", NULL);
|
||||
unlink(contact_path);
|
||||
free(contact_path);
|
||||
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
orte_sensor_base_close();
|
||||
#endif
|
||||
orte_db_base_close();
|
||||
orte_notifier_base_close();
|
||||
|
||||
|
@ -55,6 +55,7 @@
|
||||
#include "orte/mca/plm/base/base.h"
|
||||
#include "orte/mca/routed/base/base.h"
|
||||
#include "orte/mca/rmaps/rmaps_types.h"
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
|
||||
#include "orte/util/context_fns.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
@ -1900,6 +1901,9 @@ CLEANUP:
|
||||
"%s odls:launch setting waitpids",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* start the sensors for this job (if any) */
|
||||
orte_sensor.start(ORTE_PROC_MY_NAME->jobid);
|
||||
|
||||
/* if the launch didn't fail, setup the waitpids on the children */
|
||||
for (item = opal_list_get_first(&orte_local_children);
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
|
@ -272,7 +272,7 @@ static int plm_alps_launch_job(orte_job_t *jdata)
|
||||
orte_plm_base_orted_append_basic_args(&argc, &argv,
|
||||
"alps",
|
||||
&proc_vpid_index,
|
||||
false, nodelist_flat);
|
||||
nodelist_flat);
|
||||
free(nodelist_flat);
|
||||
|
||||
/* tell the new daemons the base of the name list so they can compute
|
||||
|
@ -40,6 +40,5 @@ libmca_plm_la_SOURCES += \
|
||||
base/plm_base_jobid.c \
|
||||
base/plm_base_proxy.c \
|
||||
base/plm_base_orted_cmds.c \
|
||||
base/plm_base_rsh_support.c \
|
||||
base/plm_base_heartbeat.c
|
||||
base/plm_base_rsh_support.c
|
||||
endif
|
||||
|
@ -1,145 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
|
||||
#include "orte/mca/plm/base/plm_private.h"
|
||||
|
||||
#define HEARTBEAT_CK 2
|
||||
|
||||
void orte_plm_base_heartbeat(int fd, short event, void *arg)
|
||||
{
|
||||
opal_buffer_t buf;
|
||||
orte_plm_cmd_flag_t command = ORTE_PLM_HEARTBEAT_CMD;
|
||||
opal_event_t *tmp = (opal_event_t*)arg;
|
||||
struct timeval now;
|
||||
int rc;
|
||||
|
||||
/* setup the buffer */
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||
|
||||
/* tell the HNP this is a heartbeat */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &command, 1, ORTE_PLM_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
/* send heartbeat to HNP */
|
||||
if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &buf, ORTE_RML_TAG_PLM, 0))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
/* reset the timer */
|
||||
now.tv_sec = orte_heartbeat_rate;
|
||||
now.tv_usec = 0;
|
||||
opal_evtimer_add(tmp, &now);
|
||||
|
||||
CLEANUP:
|
||||
OBJ_DESTRUCT(&buf);
|
||||
}
|
||||
|
||||
/* this function automatically gets periodically called
|
||||
* by the event library so we can check on the state
|
||||
* of the various orteds
|
||||
*/
|
||||
static void check_heartbeat(int fd, short dummy, void *arg)
|
||||
{
|
||||
int v;
|
||||
orte_proc_t *proc;
|
||||
orte_job_t *daemons;
|
||||
struct timeval timeout;
|
||||
bool died = false;
|
||||
opal_event_t *tmp = (opal_event_t*)arg;
|
||||
struct timeval now;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:base:check_heartbeat",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* if we are aborting or shutting down, ignore this */
|
||||
if (orte_abnormal_term_ordered || 0 == orte_heartbeat_rate) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* get the job object for the daemons */
|
||||
if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return;
|
||||
}
|
||||
|
||||
/* get current time */
|
||||
gettimeofday(&timeout, NULL);
|
||||
|
||||
/* cycle through the daemons - make sure we check them all
|
||||
* in case multiple daemons died so all of those that did die
|
||||
* can be appropriately flagged
|
||||
*/
|
||||
for (v=1; v < daemons->procs->size; v++) {
|
||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, v))) {
|
||||
continue;
|
||||
}
|
||||
if ((timeout.tv_sec - proc->beat) > HEARTBEAT_CK*orte_heartbeat_rate) {
|
||||
/* declare this orted dead */
|
||||
proc->state = ORTE_PROC_STATE_ABORTED;
|
||||
proc->exit_code = ORTE_ERROR_DEFAULT_EXIT_CODE;
|
||||
if (NULL == daemons->aborted_proc) {
|
||||
daemons->aborted_proc = proc;
|
||||
}
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
died = true;
|
||||
}
|
||||
}
|
||||
|
||||
/* if any daemon died, abort */
|
||||
if (died) {
|
||||
orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_ABORTED,
|
||||
NULL, ORTE_PROC_STATE_UNDEF, ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
return;
|
||||
}
|
||||
|
||||
/* reset the timer */
|
||||
now.tv_sec = HEARTBEAT_CK*orte_heartbeat_rate;
|
||||
now.tv_usec = 0;
|
||||
opal_evtimer_add(tmp, &now);
|
||||
}
|
||||
|
||||
void orte_plm_base_start_heart(void)
|
||||
{
|
||||
/* if the heartbeat rate > 0, then start the heart */
|
||||
if (0 < orte_heartbeat_rate) {
|
||||
ORTE_TIMER_EVENT(HEARTBEAT_CK*orte_heartbeat_rate, 0, check_heartbeat);
|
||||
}
|
||||
}
|
@ -53,9 +53,6 @@
|
||||
#include "orte/mca/filem/filem.h"
|
||||
#include "orte/mca/filem/base/base.h"
|
||||
#include "orte/mca/rml/base/rml_contact.h"
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
#endif
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/orte_locks.h"
|
||||
@ -391,11 +388,6 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
|
||||
goto WAKEUP;
|
||||
}
|
||||
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
/* start any sensor monitoring of this job */
|
||||
orte_sensor.start(job);
|
||||
#endif
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:launch completed for job %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -742,7 +734,7 @@ int orte_plm_base_setup_orted_cmd(int *argc, char ***argv)
|
||||
int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
|
||||
char *ess,
|
||||
int *proc_vpid_index,
|
||||
bool heartbeat, char *nodes)
|
||||
char *nodes)
|
||||
{
|
||||
char *param = NULL;
|
||||
int loc_id;
|
||||
@ -788,13 +780,6 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
|
||||
opal_argv_append(argc, argv, param);
|
||||
free(param);
|
||||
}
|
||||
if (heartbeat && 0 < orte_heartbeat_rate) {
|
||||
/* tell the daemon to do a heartbeat */
|
||||
opal_argv_append(argc, argv, "--heartbeat");
|
||||
asprintf(&param, "%d", orte_heartbeat_rate);
|
||||
opal_argv_append(argc, argv, param);
|
||||
free(param);
|
||||
}
|
||||
|
||||
/* tell the orted what ESS component to use */
|
||||
opal_argv_append(argc, argv, "-mca");
|
||||
|
@ -100,9 +100,6 @@ int orte_plm_base_orted_exit(orte_daemon_cmd_flag_t command)
|
||||
"%s plm:base:orted_cmd sending orted_exit commands",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* stop all heartbeats */
|
||||
orte_heartbeat_rate = 0;
|
||||
|
||||
OBJ_CONSTRUCT(&cmd, opal_buffer_t);
|
||||
|
||||
/* since the orteds are being ordered to exit, and we are
|
||||
|
@ -146,7 +146,6 @@ static void process_msg(int fd, short event, void *data)
|
||||
orte_proc_state_t state;
|
||||
orte_exit_code_t exit_code;
|
||||
int rc=ORTE_SUCCESS, ret;
|
||||
struct timeval beat;
|
||||
orte_app_context_t *app, *child_app;
|
||||
opal_list_item_t *item;
|
||||
int dump[128];
|
||||
@ -458,29 +457,6 @@ static void process_msg(int fd, short event, void *data)
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_PLM_HEARTBEAT_CMD:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:receive got heartbeat from %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&msgpkt->sender)));
|
||||
/* lookup the daemon object */
|
||||
if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
|
||||
/* this job can not possibly have been removed, so this is an error */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
goto CLEANUP;
|
||||
}
|
||||
gettimeofday(&beat, NULL);
|
||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, msgpkt->sender.vpid))) {
|
||||
/* this proc is no longer in table - skip it */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:receive daemon %s is not in proc table",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_VPID_PRINT(msgpkt->sender.vpid)));
|
||||
break;
|
||||
}
|
||||
proc->beat = beat.tv_sec;
|
||||
break;
|
||||
|
||||
case ORTE_PLM_INIT_ROUTES_CMD:
|
||||
count=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &job, &count, ORTE_JOBID))) {
|
||||
|
@ -131,12 +131,6 @@ ORTE_DECLSPEC int orte_plm_base_append_bootproxy_args(orte_app_context_t *app, c
|
||||
orte_node_rank_t nrank, orte_local_rank_t lrank,
|
||||
orte_vpid_t nlocal, int nslots, bool overwrite);
|
||||
|
||||
/**
|
||||
* Heartbeat support
|
||||
*/
|
||||
ORTE_DECLSPEC void orte_plm_base_heartbeat(int fd, short event, void *data);
|
||||
ORTE_DECLSPEC void orte_plm_base_start_heart(void);
|
||||
|
||||
/**
|
||||
* Utilities for plm components that use proxy daemons
|
||||
*/
|
||||
@ -161,7 +155,7 @@ ORTE_DECLSPEC void orte_plm_base_recv(int status, orte_process_name_t* sender,
|
||||
ORTE_DECLSPEC int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
|
||||
char *ess_module,
|
||||
int *proc_vpid_index,
|
||||
bool heartbeat, char *nodes);
|
||||
char *nodes);
|
||||
|
||||
/*
|
||||
* Proxy functions for use by daemons and application procs
|
||||
|
@ -216,7 +216,7 @@ GETMAP:
|
||||
/* Add basic orted command line options */
|
||||
orte_plm_base_orted_append_basic_args(&argc, &argv, "env",
|
||||
&proc_vpid_index,
|
||||
false, NULL);
|
||||
NULL);
|
||||
|
||||
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
|
||||
param = opal_argv_join(argv, ' ');
|
||||
|
@ -221,7 +221,7 @@ static int plm_lsf_launch_job(orte_job_t *jdata)
|
||||
orte_plm_base_orted_append_basic_args(&argc, &argv,
|
||||
"lsf",
|
||||
&proc_vpid_index,
|
||||
false, nodelist);
|
||||
nodelist);
|
||||
free(nodelist);
|
||||
|
||||
/* tell the new daemons the base of the name list so they can compute
|
||||
|
@ -62,7 +62,7 @@ typedef uint32_t orte_proc_state_t;
|
||||
#define ORTE_PROC_STATE_COMM_FAILED 0x00002000 /* process communication has failed */
|
||||
#define ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED 0x00004000 /* process exceeded a sensor limit */
|
||||
#define ORTE_PROC_STATE_CALLED_ABORT 0x00008000 /* process called "errmgr.abort" */
|
||||
|
||||
#define ORTE_PROC_STATE_HEARTBEAT_FAILED 0x00010000 /* heartbeat failed to arrive */
|
||||
/*
|
||||
* Job state codes
|
||||
*/
|
||||
@ -93,6 +93,7 @@ typedef uint32_t orte_job_state_t;
|
||||
#define ORTE_JOB_STATE_COMM_FAILED 0x00002000 /* communication has failed */
|
||||
#define ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED 0x00004000 /* job had a process that exceeded a sensor limit */
|
||||
#define ORTE_JOB_STATE_CALLED_ABORT 0x00008000 /* at least one process called "errmgr.abort" */
|
||||
#define ORTE_JOB_STATE_HEARTBEAT_FAILED 0x00010000 /* heartbeat failed to arrive */
|
||||
|
||||
/* the job never even attempted to launch due to an error earlier in the
|
||||
* launch procedure
|
||||
@ -131,8 +132,7 @@ typedef uint8_t orte_plm_cmd_flag_t;
|
||||
#define ORTE_PLM_CMD OPAL_UINT8
|
||||
#define ORTE_PLM_LAUNCH_JOB_CMD 1
|
||||
#define ORTE_PLM_UPDATE_PROC_STATE 2
|
||||
#define ORTE_PLM_HEARTBEAT_CMD 3
|
||||
#define ORTE_PLM_INIT_ROUTES_CMD 4
|
||||
#define ORTE_PLM_INIT_ROUTES_CMD 3
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
|
@ -1074,7 +1074,7 @@ int orte_plm_process_launch(orte_job_t *jdata)
|
||||
orte_plm_base_orted_append_basic_args(&argc, &argv,
|
||||
"env",
|
||||
&proc_vpid_index,
|
||||
false, NULL);
|
||||
NULL);
|
||||
|
||||
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
|
||||
param = opal_argv_join(argv, ' ');
|
||||
|
@ -665,7 +665,7 @@ static int setup_launch(int *argcptr, char ***argvptr,
|
||||
orte_plm_base_orted_append_basic_args(&argc, &argv,
|
||||
"env",
|
||||
proc_vpid_index,
|
||||
true, NULL);
|
||||
NULL);
|
||||
|
||||
/* ensure that only the ssh plm is selected on the remote daemon */
|
||||
opal_argv_append_nosize(&argv, "-mca");
|
||||
@ -1431,14 +1431,6 @@ launch_apps:
|
||||
recv_issued = false;
|
||||
}
|
||||
|
||||
/* setup a "heartbeat" timer to periodically check on
|
||||
* the state-of-health of the orteds, if requested AND
|
||||
* we actually launched some daemons!
|
||||
*/
|
||||
if ((NULL != map) && (0 < map->num_new_daemons)) {
|
||||
orte_plm_base_start_heart();
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
@ -239,7 +239,6 @@ static void ssh_child(char *cmd, char **argv)
|
||||
*/
|
||||
int orte_plm_rshd_launch(orte_job_t *jdata)
|
||||
{
|
||||
orte_job_map_t *map = NULL;
|
||||
char **argv = NULL;
|
||||
char *cmd, *param;
|
||||
int rc, i;
|
||||
@ -379,14 +378,6 @@ cleanup:
|
||||
ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
}
|
||||
|
||||
/* setup a "heartbeat" timer to periodically check on
|
||||
* the state-of-health of the orteds, if requested AND
|
||||
* we actually launched some daemons!
|
||||
*/
|
||||
if ((NULL != map) && (0 < map->num_new_daemons)) {
|
||||
orte_plm_base_start_heart();
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
@ -309,7 +309,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
||||
/* Add basic orted command line options, including debug flags */
|
||||
orte_plm_base_orted_append_basic_args(&argc, &argv,
|
||||
"slurm", &proc_vpid_index,
|
||||
false, nodelist_flat);
|
||||
nodelist_flat);
|
||||
free(nodelist_flat);
|
||||
|
||||
/* tell the new daemons the base of the name list so they can compute
|
||||
|
@ -244,7 +244,7 @@ static int plm_tm_launch_job(orte_job_t *jdata)
|
||||
/* Add basic orted command line options */
|
||||
orte_plm_base_orted_append_basic_args(&argc, &argv, "tm",
|
||||
&proc_vpid_index,
|
||||
true, nodelist);
|
||||
nodelist);
|
||||
free(nodelist);
|
||||
|
||||
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
|
||||
@ -465,14 +465,6 @@ launch_apps:
|
||||
ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
}
|
||||
|
||||
/* setup a "heartbeat" timer to periodically check on
|
||||
* the state-of-health of the orteds, if requested AND
|
||||
* we actually launched some daemons!
|
||||
*/
|
||||
if (0 < map->num_new_daemons) {
|
||||
orte_plm_base_start_heart();
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tm:launch: finished",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
@ -552,14 +552,6 @@ launch_apps:
|
||||
ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
}
|
||||
|
||||
/* setup a "heartbeat" timer to periodically check on
|
||||
* the state-of-health of the orteds, if requested AND
|
||||
* we actually launched some daemons!
|
||||
*/
|
||||
if (0 < map->num_new_daemons) {
|
||||
orte_plm_base_start_heart();
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tm:launch: finished",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
@ -438,7 +438,7 @@ cleanup:
|
||||
orte_plm_base_orted_append_basic_args(&argc, &argv,
|
||||
"env",
|
||||
NULL,
|
||||
true, NULL);
|
||||
NULL);
|
||||
|
||||
/* Note that capacity is a starting capacity, not max */
|
||||
NSMutableArray *ret = [NSMutableArray arrayWithCapacity: argc];
|
||||
|
@ -177,6 +177,9 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_msg_packet_t);
|
||||
/* tag for receiving ack of abort msg */
|
||||
#define ORTE_RML_TAG_ABORT 38
|
||||
|
||||
/* tag for receiving heartbeats */
|
||||
#define ORTE_RML_TAG_HEARTBEAT 39
|
||||
|
||||
#define ORTE_RML_TAG_MAX 100
|
||||
|
||||
|
||||
|
@ -139,6 +139,11 @@ static void start(orte_jobid_t jobid)
|
||||
char *filename;
|
||||
file_tracker_t *ft;
|
||||
|
||||
/* cannot monitor my own job */
|
||||
if (jobid == ORTE_PROC_MY_NAME->jobid) {
|
||||
return;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"%s starting file monitoring for job %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -252,6 +257,11 @@ static void stop(orte_jobid_t jobid)
|
||||
opal_list_item_t *item;
|
||||
file_tracker_t *ft;
|
||||
|
||||
/* cannot monitor my own job */
|
||||
if (jobid == ORTE_PROC_MY_NAME->jobid) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (item = opal_list_get_first(&jobs);
|
||||
item != opal_list_get_end(&jobs);
|
||||
item = opal_list_get_next(item)) {
|
||||
|
37
orte/mca/sensor/heartbeat/Makefile.am
Обычный файл
37
orte/mca/sensor/heartbeat/Makefile.am
Обычный файл
@ -0,0 +1,37 @@
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_pkgdata_DATA = help-orte-sensor-heartbeat.txt
|
||||
|
||||
sources = \
|
||||
sensor_heartbeat.c \
|
||||
sensor_heartbeat.h \
|
||||
sensor_heartbeat_component.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if OMPI_BUILD_sensor_heartbeat_DSO
|
||||
component_noinst =
|
||||
component_install = mca_sensor_heartbeat.la
|
||||
else
|
||||
component_noinst = libmca_sensor_heartbeat.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(pkglibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_sensor_heartbeat_la_SOURCES = $(sources)
|
||||
mca_sensor_heartbeat_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_sensor_heartbeat_la_SOURCES =$(sources)
|
||||
libmca_sensor_heartbeat_la_LDFLAGS = -module -avoid-version
|
19
orte/mca/sensor/heartbeat/configure.m4
Обычный файл
19
orte/mca/sensor/heartbeat/configure.m4
Обычный файл
@ -0,0 +1,19 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_sensor_heartbeat_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_sensor_heartbeat_CONFIG], [
|
||||
# if we don't want heartbeats, don't compile
|
||||
# this component
|
||||
AS_IF([test "$orte_want_heartbeats" = "1"],
|
||||
[$1], [$2])
|
||||
])dnl
|
||||
|
14
orte/mca/sensor/heartbeat/configure.params
Обычный файл
14
orte/mca/sensor/heartbeat/configure.params
Обычный файл
@ -0,0 +1,14 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Specific to this module
|
||||
|
||||
PARAM_CONFIG_FILES="Makefile"
|
20
orte/mca/sensor/heartbeat/help-orte-sensor-heartbeat.txt
Обычный файл
20
orte/mca/sensor/heartbeat/help-orte-sensor-heartbeat.txt
Обычный файл
@ -0,0 +1,20 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English general help file for the heartbeat sensor
# (NOTE: the topic below still carries the memory usage sensor's wording)
|
||||
#
|
||||
[mem-limit-exceeded]
|
||||
A process has exceeded the specified limit on memory usage:
|
||||
|
||||
Node: %s
|
||||
Process rank: %s
|
||||
Memory used: %luGbytes
|
||||
Memory limit: %luGbytes
|
||||
|
368
orte/mca/sensor/heartbeat/sensor_heartbeat.c
Обычный файл
368
orte/mca/sensor/heartbeat/sensor_heartbeat.c
Обычный файл
@ -0,0 +1,368 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#include <errno.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
#include <stdio.h>
|
||||
|
||||
#include "opal_stdint.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/pstat/pstat.h"
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
#include "orte/mca/rmcast/rmcast.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
#include "sensor_heartbeat.h"
|
||||
|
||||
/* declare the API functions */
|
||||
static int init(void);
|
||||
static void finalize(void);
|
||||
static void start(orte_jobid_t job);
|
||||
static void stop(orte_jobid_t job);
|
||||
|
||||
/* instantiate the module: the sensor framework's function table for the
 * heartbeat component. The initializer is positional, so the four entries
 * below are supplied in the order init, finalize, start, stop - all of
 * them static functions defined later in this file.
 */
|
||||
orte_sensor_base_module_t orte_sensor_heartbeat_module = {
|
||||
init,
|
||||
finalize,
|
||||
start,
|
||||
stop
|
||||
};
|
||||
|
||||
/* declare the local functions */
|
||||
static void check_heartbeat(int fd, short event, void *arg);
|
||||
static void send_heartbeat(int fd, short event, void *arg);
|
||||
#if ORTE_ENABLE_MULTICAST
|
||||
static void recv_rmcast_beats(int status,
|
||||
orte_rmcast_channel_t channel,
|
||||
orte_rmcast_tag_t tag,
|
||||
orte_process_name_t *sender,
|
||||
opal_buffer_t *buf, void* cbdata);
|
||||
static void rmcast_callback_fn(int status,
|
||||
orte_rmcast_channel_t channel,
|
||||
orte_rmcast_tag_t tag,
|
||||
orte_process_name_t *sender,
|
||||
opal_buffer_t *buf, void* cbdata);
|
||||
#else
|
||||
static void recv_rml_beats(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
||||
void* cbdata);
|
||||
static void rml_callback_fn(int status,
|
||||
struct orte_process_name_t* peer,
|
||||
struct opal_buffer_t* buffer,
|
||||
orte_rml_tag_t tag,
|
||||
void* cbdata);
|
||||
#endif
|
||||
|
||||
/* local globals */
|
||||
static opal_event_t *send_ev = NULL, *check_ev = NULL;
|
||||
static struct timeval send_time, check_time;
|
||||
static double timeout;
|
||||
|
||||
#include MCA_timer_IMPLEMENTATION_HEADER
|
||||
static inline double gettime(void) __opal_attribute_always_inline__;
|
||||
static inline double gettime(void)
|
||||
{
|
||||
double wtime;
|
||||
#if OPAL_TIMER_USEC_NATIVE
|
||||
wtime = ((double) opal_timer_base_get_usec()) / 1000000.0;
|
||||
#else
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, NULL);
|
||||
wtime = tv.tv_sec;
|
||||
wtime += (double)tv.tv_usec / 1000000.0;
|
||||
#endif
|
||||
return wtime;
|
||||
}
|
||||
|
||||
static int init(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
#if ORTE_ENABLE_MULTICAST
|
||||
/* setup multicast recv for heartbeats */
|
||||
if (ORTE_SUCCESS != (rc = orte_rmcast.recv_buffer_nb(ORTE_RMCAST_SYS_CHANNEL,
|
||||
ORTE_RMCAST_TAG_HEARTBEAT,
|
||||
ORTE_RMCAST_PERSISTENT,
|
||||
recv_rmcast_beats, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
#else
|
||||
/* setup RML recv for the HNP to receive heartbeats */
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
|
||||
ORTE_RML_TAG_HEARTBEAT,
|
||||
ORTE_RML_NON_PERSISTENT,
|
||||
recv_rml_beats,
|
||||
NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
static void finalize(void)
|
||||
{
|
||||
if (NULL != send_ev) {
|
||||
opal_event_del(send_ev);
|
||||
free(send_ev);
|
||||
send_ev = NULL;
|
||||
}
|
||||
if (NULL != check_ev) {
|
||||
opal_event_del(check_ev);
|
||||
free(check_ev);
|
||||
check_ev = NULL;
|
||||
}
|
||||
|
||||
#if ORTE_ENABLE_MULTICAST
|
||||
orte_rmcast.cancel_recv(ORTE_RMCAST_SYS_CHANNEL, ORTE_RMCAST_TAG_HEARTBEAT);
|
||||
#else
|
||||
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_HEARTBEAT);
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Start sending and checking heartbeats
|
||||
*/
|
||||
/*
 * Arm the two periodic timers: one that sends our heartbeat and one that
 * checks the heartbeats recorded for the other daemons.
 *
 * jobid - job being started; anything other than our own jobid is ignored.
 *
 * The component's "beat" and "check" values are in milliseconds.  The
 * failure threshold is two send intervals, stored in seconds because it
 * is compared against differences of gettime() values.
 */
static void start(orte_jobid_t jobid)
{
    uint64_t usec;

    if (jobid != ORTE_PROC_MY_NAME->jobid) {
        /* heartbeats are only for daemons and HNPs */
        return;
    }

    /* setup the send */
    usec = (uint64_t)mca_sensor_heartbeat_component.beat * 1000; /* convert to microsecs */
    send_ev = (opal_event_t*)malloc(sizeof(opal_event_t));
    if (NULL == send_ev) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return;
    }
    opal_evtimer_set(send_ev, send_heartbeat, send_ev);
    send_time.tv_sec = usec / 1000000;
    send_time.tv_usec = usec % 1000000;
    opal_evtimer_add(send_ev, &send_time);

    /* define the timeout: two send intervals, converted from microseconds
     * to seconds so it matches the units gettime() returns */
    timeout = 2.0 * (double)usec / 1000000.0;

    /* setup the check */
    usec = (uint64_t)mca_sensor_heartbeat_component.check * 1000; /* convert to microsecs */
    check_ev = (opal_event_t*)malloc(sizeof(opal_event_t));
    if (NULL == check_ev) {
        /* the send timer stays armed; only the local check is unavailable */
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return;
    }
    opal_evtimer_set(check_ev, check_heartbeat, check_ev);
    check_time.tv_sec = usec / 1000000;
    check_time.tv_usec = usec % 1000000;
    opal_evtimer_add(check_ev, &check_time);
}
|
||||
|
||||
|
||||
/*
 * Disarm and free both heartbeat timers.
 *
 * jobid - job being stopped; the call is a no-op unless it is our own
 *         jobid, since heartbeats are only for daemons and HNPs.
 */
static void stop(orte_jobid_t jobid)
{
    if (ORTE_PROC_MY_NAME->jobid == jobid) {
        /* sender timer */
        if (send_ev != NULL) {
            opal_event_del(send_ev);
            free(send_ev);
            send_ev = NULL;
        }

        /* checker timer */
        if (check_ev != NULL) {
            opal_event_del(check_ev);
            free(check_ev);
            check_ev = NULL;
        }
    }
}
|
||||
|
||||
static void send_heartbeat(int fd, short event, void *arg)
|
||||
{
|
||||
opal_buffer_t *buf;
|
||||
opal_event_t *tmp = (opal_event_t*)arg;
|
||||
int rc;
|
||||
|
||||
/* if we are aborting or shutting down, ignore this */
|
||||
if (orte_abnormal_term_ordered || orte_finalizing) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* setup the buffer - nothing to pack as receipt alone is the "beat" */
|
||||
buf = OBJ_NEW(opal_buffer_t);
|
||||
|
||||
#if ORTE_ENABLE_MULTICAST
|
||||
if (ORTE_SUCCESS != (rc = orte_rmcast.send_buffer_nb(ORTE_RMCAST_SYS_CHANNEL,
|
||||
ORTE_RMCAST_TAG_HEARTBEAT, buf,
|
||||
rmcast_callback_fn, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(buf);
|
||||
return;
|
||||
}
|
||||
#else
|
||||
/* send heartbeat to HNP */
|
||||
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
|
||||
ORTE_RML_TAG_HEARTBEAT, 0,
|
||||
rml_callback_fn, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(buf);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* reset the timer */
|
||||
opal_evtimer_add(tmp, &send_time);
|
||||
}
|
||||
|
||||
/* this function automatically gets periodically called
|
||||
* by the event library so we can check on the state
|
||||
* of the various orteds
|
||||
*/
|
||||
static void check_heartbeat(int fd, short dummy, void *arg)
|
||||
{
|
||||
int v;
|
||||
orte_nid_t *nid;
|
||||
double now;
|
||||
opal_event_t *tmp = (opal_event_t*)arg;
|
||||
orte_process_name_t name;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"%s sensor:check_heartbeat",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* if we are aborting or shutting down, ignore this */
|
||||
if (orte_abnormal_term_ordered || orte_finalizing) {
|
||||
return;
|
||||
}
|
||||
|
||||
name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
|
||||
/* get current time */
|
||||
now = gettime();
|
||||
|
||||
/* cycle through the nidmap - make sure we check them all
|
||||
* in case multiple daemons are late so all of those that did
|
||||
* can be appropriately flagged
|
||||
*/
|
||||
for (v=0; v < orte_nidmap.size; v++) {
|
||||
if (NULL == (nid = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, v))) {
|
||||
continue;
|
||||
}
|
||||
if (0 == nid->beat) {
|
||||
/* haven't recvd a beat yet */
|
||||
continue;
|
||||
}
|
||||
if ((now - nid->beat) > timeout) {
|
||||
nid->missed++;
|
||||
if (mca_sensor_heartbeat_component.missed < nid->missed) {
|
||||
/* heartbeat failed */
|
||||
name.vpid = v;
|
||||
orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_HEARTBEAT_FAILED,
|
||||
&name, ORTE_PROC_STATE_HEARTBEAT_FAILED,
|
||||
ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* reset the timer */
|
||||
opal_evtimer_add(tmp, &check_time);
|
||||
}
|
||||
|
||||
#if ORTE_ENABLE_MULTICAST
|
||||
static void recv_rmcast_beats(int status,
|
||||
orte_rmcast_channel_t channel,
|
||||
orte_rmcast_tag_t tag,
|
||||
orte_process_name_t *sender,
|
||||
opal_buffer_t *buf, void* cbdata)
|
||||
{
|
||||
orte_nid_t *nid;
|
||||
|
||||
/* if we are aborting or shutting down, ignore this */
|
||||
if (orte_abnormal_term_ordered || orte_finalizing) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* get this daemon's nid */
|
||||
if (NULL == (nid = orte_util_lookup_nid(sender))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return;
|
||||
}
|
||||
|
||||
/* update its time */
|
||||
nid->beat = gettime();
|
||||
}
|
||||
|
||||
/*
 * Completion callback handed to orte_rmcast.send_buffer_nb() by
 * send_heartbeat(): releases the heartbeat buffer.  All other arguments
 * are unused.
 */
static void rmcast_callback_fn(int status,
|
||||
orte_rmcast_channel_t channel,
|
||||
orte_rmcast_tag_t tag,
|
||||
orte_process_name_t *sender,
|
||||
opal_buffer_t *buf, void* cbdata)
|
||||
{
|
||||
OBJ_RELEASE(buf);
|
||||
}
|
||||
|
||||
#else
|
||||
static void recv_rml_beats(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
||||
void* cbdata)
|
||||
{
|
||||
orte_nid_t *nid;
|
||||
|
||||
/* if we are aborting or shutting down, ignore this */
|
||||
if (orte_abnormal_term_ordered || orte_finalizing) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* get this daemon's nid */
|
||||
if (NULL == (nid = orte_util_lookup_nid(sender))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
} else {
|
||||
/* update its time */
|
||||
nid->beat = gettime();
|
||||
}
|
||||
|
||||
/* reissue the recv */
|
||||
if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
|
||||
ORTE_RML_TAG_HEARTBEAT,
|
||||
ORTE_RML_NON_PERSISTENT,
|
||||
recv_rml_beats,
|
||||
NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Completion callback handed to orte_rml.send_buffer_nb() by
 * send_heartbeat(): releases the heartbeat buffer.  All other arguments
 * are unused.
 */
static void rml_callback_fn(int status,
|
||||
struct orte_process_name_t* peer,
|
||||
struct opal_buffer_t* buffer,
|
||||
orte_rml_tag_t tag,
|
||||
void* cbdata)
|
||||
{
|
||||
OBJ_RELEASE(buffer);
|
||||
}
|
||||
#endif
|
38
orte/mca/sensor/heartbeat/sensor_heartbeat.h
Обычный файл
38
orte/mca/sensor/heartbeat/sensor_heartbeat.h
Обычный файл
@ -0,0 +1,38 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* Heartbeat sensor
|
||||
*/
|
||||
#ifndef ORTE_SENSOR_HEARTBEAT_H
|
||||
#define ORTE_SENSOR_HEARTBEAT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/* Heartbeat sensor component: the base sensor component plus the tunables
 * registered as MCA parameters by the component's open function. */
struct orte_sensor_heartbeat_component_t {
|
||||
/* base sensor component this component extends */
orte_sensor_base_component_t super;
|
||||
/* heartbeat rate, in milliseconds ("beat" parameter) */
int beat;
|
||||
/* rate at which failures are checked for, in milliseconds ("check" parameter) */
int check;
|
||||
/* number of missed heartbeats before failure is declared ("missed" parameter) */
int missed;
|
||||
};
|
||||
typedef struct orte_sensor_heartbeat_component_t orte_sensor_heartbeat_component_t;
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_sensor_heartbeat_component_t mca_sensor_heartbeat_component;
|
||||
extern orte_sensor_base_module_t orte_sensor_heartbeat_module;
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
99
orte/mca/sensor/heartbeat/sensor_heartbeat_component.c
Обычный файл
99
orte/mca/sensor/heartbeat/sensor_heartbeat_component.c
Обычный файл
@ -0,0 +1,99 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/show_help.h"
|
||||
|
||||
#include "sensor_heartbeat.h"
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
|
||||
static int orte_sensor_heartbeat_open(void);
|
||||
static int orte_sensor_heartbeat_close(void);
|
||||
static int orte_sensor_heartbeat_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
/*
 * The heartbeat component instance.  Only the base component portion
 * (version, name, open/close/query hooks and metadata) is initialized
 * here; beat, check and missed are filled in by
 * orte_sensor_heartbeat_open().
 */
orte_sensor_heartbeat_component_t mca_sensor_heartbeat_component = {
|
||||
{
|
||||
{
|
||||
ORTE_SENSOR_BASE_VERSION_1_0_0,
|
||||
|
||||
"heartbeat", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
orte_sensor_heartbeat_open, /* component open */
|
||||
orte_sensor_heartbeat_close, /* component close */
|
||||
orte_sensor_heartbeat_query /* component query */
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* component open/close/init function
|
||||
*/
|
||||
static int orte_sensor_heartbeat_open(void)
|
||||
{
|
||||
mca_base_component_t *c = &mca_sensor_heartbeat_component.super.base_version;
|
||||
int tmp;
|
||||
|
||||
/* lookup parameters */
|
||||
mca_base_param_reg_int(c, "beat",
|
||||
"Heartbeat rate in milliseconds (default=1)",
|
||||
false, false, 1, &tmp);
|
||||
if (tmp < 0) {
|
||||
opal_output(0, "Illegal value %d - must be > 0", tmp);
|
||||
return ORTE_ERR_FATAL;
|
||||
}
|
||||
mca_sensor_heartbeat_component.beat = tmp;
|
||||
|
||||
mca_base_param_reg_int(c, "check",
|
||||
"Check for failure rate in milliseconds (default=5)",
|
||||
false, false, 5, &tmp);
|
||||
mca_sensor_heartbeat_component.check = tmp;
|
||||
|
||||
mca_base_param_reg_int(c, "missed",
|
||||
"Number of missed heartbeats before failure is declared (default=5)",
|
||||
false, false, 5, &tmp);
|
||||
mca_sensor_heartbeat_component.missed = tmp;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Component query: offer the heartbeat module with a fixed priority.
 *
 * module   - receives a pointer to orte_sensor_heartbeat_module
 * priority - receives 10
 *
 * Always returns ORTE_SUCCESS.
 */
static int orte_sensor_heartbeat_query(mca_base_module_t **module, int *priority)
{
    /* the one module this component provides */
    *module = (mca_base_module_t *)&orte_sensor_heartbeat_module;

    /* use if we were built */
    *priority = 10;

    return ORTE_SUCCESS;
}
|
||||
|
||||
/**
|
||||
* Close all subsystems.
|
||||
*/
|
||||
|
||||
/* Component close: does no work and always returns ORTE_SUCCESS. */
static int orte_sensor_heartbeat_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -109,6 +109,11 @@ static void start(orte_jobid_t jobid)
|
||||
opal_list_item_t *item;
|
||||
int rc, tmp;
|
||||
|
||||
/* cannot monitor my own job */
|
||||
if (jobid == ORTE_PROC_MY_NAME->jobid) {
|
||||
return;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"%s starting memory monitoring for job %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -177,6 +182,11 @@ static void stop(orte_jobid_t jobid)
|
||||
opal_list_item_t *item;
|
||||
memusage_tracker_t *job;
|
||||
|
||||
/* cannot monitor my own job */
|
||||
if (jobid == ORTE_PROC_MY_NAME->jobid) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (item = opal_list_get_first(&jobs);
|
||||
item != opal_list_get_end(&jobs);
|
||||
item = opal_list_get_next(item)) {
|
||||
|
@ -125,7 +125,6 @@ static struct {
|
||||
int fail;
|
||||
int fail_delay;
|
||||
bool abort;
|
||||
int heartbeat;
|
||||
} orted_globals;
|
||||
|
||||
/*
|
||||
@ -149,10 +148,6 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
|
||||
&orted_globals.fail_delay, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Have the orted specified for failure delay for the provided number of seconds before failing" },
|
||||
|
||||
{ NULL, NULL, NULL, '\0', NULL, "heartbeat", 1,
|
||||
&orted_globals.heartbeat, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Seconds between orted heartbeat messages to be sent to HNP (default: 0 => no heartbeat)" },
|
||||
|
||||
{ "orte", "debug", NULL, 'd', NULL, "debug", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Debug the OpenRTE" },
|
||||
@ -785,11 +780,6 @@ int orte_daemon(int argc, char *argv[])
|
||||
opal_output(0, "%s orted: up and running - waiting for commands!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
}
|
||||
|
||||
/* if we were told to do a heartbeat, then setup to do so */
|
||||
if (0 < orted_globals.heartbeat) {
|
||||
ORTE_TIMER_EVENT(orted_globals.heartbeat, 0, orte_plm_base_heartbeat);
|
||||
}
|
||||
|
||||
/* if we were given a launch string, then process it */
|
||||
if (NULL != orted_launch_cmd) {
|
||||
opal_buffer_t launch;
|
||||
|
@ -94,7 +94,6 @@ bool orte_abnormal_term_ordered = false;
|
||||
bool orte_routing_is_enabled = false;
|
||||
bool orte_job_term_ordered = false;
|
||||
|
||||
int orte_heartbeat_rate;
|
||||
int orte_startup_timeout;
|
||||
|
||||
int orte_timeout_usec_per_proc;
|
||||
@ -828,7 +827,6 @@ static void orte_proc_construct(orte_proc_t* proc)
|
||||
proc->node = NULL;
|
||||
proc->nodename = NULL;
|
||||
proc->rml_uri = NULL;
|
||||
proc->beat = 0;
|
||||
proc->restarts = 0;
|
||||
proc->relocates = 0;
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
@ -908,6 +906,10 @@ static void orte_nid_construct(orte_nid_t *ptr)
|
||||
ptr->daemon = ORTE_VPID_INVALID;
|
||||
OBJ_CONSTRUCT(&ptr->attrs, opal_list_t);
|
||||
OBJ_CONSTRUCT(&ptr->sysinfo, opal_list_t);
|
||||
#if ORTE_ENABLE_HEARTBEAT
|
||||
ptr->beat = 0;
|
||||
ptr->missed = 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
static void orte_nid_destruct(orte_nid_t *ptr)
|
||||
|
@ -450,8 +450,6 @@ struct orte_proc_t {
|
||||
char *nodename;
|
||||
/* RML contact info */
|
||||
char *rml_uri;
|
||||
/* seconds when last heartbeat was detected */
|
||||
time_t beat;
|
||||
/* number of times this process has been restarted */
|
||||
int32_t restarts;
|
||||
/* number of times this process has been relocated */
|
||||
@ -489,6 +487,12 @@ typedef struct {
|
||||
opal_list_t attrs;
|
||||
/* list of system info */
|
||||
opal_list_t sysinfo;
|
||||
#if ORTE_ENABLE_HEARTBEAT
|
||||
/* seconds when last heartbeat was detected */
|
||||
double beat;
|
||||
/* number of missed heartbeats */
|
||||
int missed;
|
||||
#endif
|
||||
} orte_nid_t;
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_nid_t);
|
||||
|
||||
@ -587,7 +591,6 @@ ORTE_DECLSPEC extern bool orte_abnormal_term_ordered;
|
||||
ORTE_DECLSPEC extern bool orte_routing_is_enabled;
|
||||
ORTE_DECLSPEC extern bool orte_job_term_ordered;
|
||||
|
||||
ORTE_DECLSPEC extern int orte_heartbeat_rate;
|
||||
ORTE_DECLSPEC extern int orte_startup_timeout;
|
||||
|
||||
ORTE_DECLSPEC extern int orte_timeout_usec_per_proc;
|
||||
|
@ -185,10 +185,6 @@ int orte_register_params(void)
|
||||
"Have the specified orted fail after specified number of seconds (default: 0 => no delay)",
|
||||
false, false, 0, &orted_debug_failure_delay);
|
||||
|
||||
mca_base_param_reg_int_name("orte", "heartbeat_rate",
|
||||
"Seconds between checks for daemon state-of-health (default: 0 => do not check)",
|
||||
false, false, 0, &orte_heartbeat_rate);
|
||||
|
||||
mca_base_param_reg_int_name("orte", "startup_timeout",
|
||||
"Milliseconds/daemon to wait for startup before declaring failed_to_start (default: 0 => do not check)",
|
||||
false, false, 0, &orte_startup_timeout);
|
||||
|
@ -89,10 +89,8 @@
|
||||
#include "orte/mca/snapc/snapc.h"
|
||||
#include "orte/mca/snapc/base/base.h"
|
||||
#endif
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#endif
|
||||
#include "orte/mca/filem/filem.h"
|
||||
#include "orte/mca/filem/base/base.h"
|
||||
#endif
|
||||
@ -436,7 +434,6 @@ void orte_info_open_components(void)
|
||||
opal_pointer_array_add(&component_map, map);
|
||||
#endif
|
||||
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
if (ORTE_SUCCESS != orte_sensor_base_open()) {
|
||||
goto error;
|
||||
}
|
||||
@ -444,7 +441,6 @@ void orte_info_open_components(void)
|
||||
map->type = strdup("sensor");
|
||||
map->components = &mca_sensor_base_components_available;
|
||||
opal_pointer_array_add(&component_map, map);
|
||||
#endif
|
||||
|
||||
if (ORTE_SUCCESS != orte_filem_base_open()) {
|
||||
goto error;
|
||||
|
@ -210,9 +210,7 @@ int main(int argc, char *argv[])
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
opal_pointer_array_add(&mca_types, "snapc");
|
||||
#endif
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
opal_pointer_array_add(&mca_types, "sensor");
|
||||
#endif
|
||||
opal_pointer_array_add(&mca_types, "filem");
|
||||
#endif
|
||||
/* these are always included */
|
||||
|
@ -176,11 +176,12 @@ const char *orte_job_state_to_str(orte_job_state_t state)
|
||||
case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED:
|
||||
return "SENSOR BOUND EXCEEDED";
|
||||
break;
|
||||
|
||||
case ORTE_JOB_STATE_NEVER_LAUNCHED:
|
||||
return "NEVER LAUNCHED";
|
||||
case ORTE_JOB_STATE_ABORT_ORDERED:
|
||||
return "ABORT IN PROGRESS";
|
||||
case ORTE_JOB_STATE_HEARTBEAT_FAILED:
|
||||
return "HEARTBEAT FAILED";
|
||||
default:
|
||||
return "UNKNOWN STATE!";
|
||||
}
|
||||
@ -220,7 +221,9 @@ const char *orte_proc_state_to_str(orte_proc_state_t state)
|
||||
case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED:
|
||||
return "SENSOR BOUND EXCEEDED";
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_HEARTBEAT_FAILED:
|
||||
return "HEARTBEAT FAILED";
|
||||
break;
|
||||
default:
|
||||
return "UNKNOWN STATE!";
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user