From 2ff1ae13e16efb494a18559fa13e32483eed4e34 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Wed, 5 May 2010 00:48:43 +0000 Subject: [PATCH] Create a new "heartbeat" module in the sensor framework and move the plm_base heartbeat code there. Add new proc and job states for heartbeat_failed. Remove the "heartbeat" cmd line option for orted as this is now done automatically if the --enable-heartbeat configure option is set. This commit was SVN r23102. --- orte/config/orte_configure_options.m4 | 21 + orte/mca/errmgr/hnp/errmgr_hnp.c | 13 +- orte/mca/ess/base/ess_base_std_orted.c | 14 +- orte/mca/ess/hnp/ess_hnp_module.c | 16 +- orte/mca/odls/base/odls_base_default_fns.c | 4 + orte/mca/plm/alps/plm_alps_module.c | 2 +- orte/mca/plm/base/Makefile.am | 3 +- orte/mca/plm/base/plm_base_heartbeat.c | 145 ------- orte/mca/plm/base/plm_base_launch_support.c | 17 +- orte/mca/plm/base/plm_base_orted_cmds.c | 3 - orte/mca/plm/base/plm_base_receive.c | 24 -- orte/mca/plm/base/plm_private.h | 8 +- orte/mca/plm/ccp/plm_ccp_module.c | 2 +- orte/mca/plm/lsf/plm_lsf_module.c | 2 +- orte/mca/plm/plm_types.h | 6 +- orte/mca/plm/process/plm_process_module.c | 2 +- orte/mca/plm/rsh/plm_rsh_module.c | 10 +- orte/mca/plm/rshd/plm_rshd_module.c | 9 - orte/mca/plm/slurm/plm_slurm_module.c | 2 +- orte/mca/plm/tm/plm_tm_module.c | 10 +- orte/mca/plm/tmd/plm_tmd_module.c | 8 - orte/mca/plm/xgrid/src/plm_xgrid_client.m | 2 +- orte/mca/rml/rml_types.h | 3 + orte/mca/sensor/file/sensor_file.c | 10 + orte/mca/sensor/heartbeat/Makefile.am | 37 ++ orte/mca/sensor/heartbeat/configure.m4 | 19 + orte/mca/sensor/heartbeat/configure.params | 14 + .../heartbeat/help-orte-sensor-heartbeat.txt | 20 + orte/mca/sensor/heartbeat/sensor_heartbeat.c | 368 ++++++++++++++++++ orte/mca/sensor/heartbeat/sensor_heartbeat.h | 38 ++ .../heartbeat/sensor_heartbeat_component.c | 99 +++++ orte/mca/sensor/memusage/sensor_memusage.c | 10 + orte/orted/orted_main.c | 10 - orte/runtime/orte_globals.c | 6 +- 
orte/runtime/orte_globals.h | 9 +- orte/runtime/orte_mca_params.c | 4 - orte/tools/orte-info/components.c | 4 - orte/tools/orte-info/orte-info.c | 2 - orte/util/error_strings.c | 7 +- 39 files changed, 694 insertions(+), 289 deletions(-) delete mode 100644 orte/mca/plm/base/plm_base_heartbeat.c create mode 100644 orte/mca/sensor/heartbeat/Makefile.am create mode 100644 orte/mca/sensor/heartbeat/configure.m4 create mode 100644 orte/mca/sensor/heartbeat/configure.params create mode 100644 orte/mca/sensor/heartbeat/help-orte-sensor-heartbeat.txt create mode 100644 orte/mca/sensor/heartbeat/sensor_heartbeat.c create mode 100644 orte/mca/sensor/heartbeat/sensor_heartbeat.h create mode 100644 orte/mca/sensor/heartbeat/sensor_heartbeat_component.c diff --git a/orte/config/orte_configure_options.m4 b/orte/config/orte_configure_options.m4 index f7cb65490f..dfc9797fe2 100644 --- a/orte/config/orte_configure_options.m4 +++ b/orte/config/orte_configure_options.m4 @@ -92,6 +92,9 @@ else AC_MSG_RESULT([no]) orte_want_multicast=0 fi +AC_DEFINE_UNQUOTED([ORTE_ENABLE_MULTICAST], + [$orte_want_multicast], + [Whether we want multicast messaging enabled]) # # Do we want sensors enabled? @@ -111,4 +114,22 @@ AC_DEFINE_UNQUOTED([ORTE_ENABLE_SENSORS], [$orte_want_sensors], [Whether we want sensors enabled]) +# +# Do we want daemon heartbeats enabled? 
+ +AC_MSG_CHECKING([if want daemon heartbeats]) +AC_ARG_ENABLE([heartbeat], + [AC_HELP_STRING([--enable-heartbeat], + [Enable heartbeat monitoring of daemons (default: disabled)])]) +if test "$enable_heartbeat" = "yes"; then + AC_MSG_RESULT([yes]) + orte_want_heartbeats=1 +else + AC_MSG_RESULT([no]) + orte_want_heartbeats=0 +fi +AC_DEFINE_UNQUOTED([ORTE_ENABLE_HEARTBEAT], + [$orte_want_heartbeats], + [Whether we want daemon heartbeat monitoring enabled]) + ])dnl diff --git a/orte/mca/errmgr/hnp/errmgr_hnp.c b/orte/mca/errmgr/hnp/errmgr_hnp.c index 137597d1b4..bf10fee3c1 100644 --- a/orte/mca/errmgr/hnp/errmgr_hnp.c +++ b/orte/mca/errmgr/hnp/errmgr_hnp.c @@ -35,9 +35,7 @@ #include "orte/mca/plm/base/plm_private.h" #include "orte/mca/plm/plm.h" #include "orte/mca/rmaps/rmaps_types.h" -#if ORTE_ENABLE_SENSORS #include "orte/mca/sensor/sensor.h" -#endif #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/base/base.h" #include "orte/mca/errmgr/base/errmgr_private.h" @@ -721,21 +719,19 @@ static void check_job_complete(orte_job_t *jdata) } } -#if ORTE_ENABLE_SENSORS if (jdata->abort) { /* the job aborted - turn off any sensors on this job */ orte_sensor.stop(jdata->jobid); } -#endif if (ORTE_JOB_STATE_UNTERMINATED > jdata->state && jdata->num_terminated >= jdata->num_procs) { /* this job has terminated */ jdata->state = ORTE_JOB_STATE_TERMINATED; -#if ORTE_ENABLE_SENSORS + /* turn off any sensor monitors on this job */ orte_sensor.stop(jdata->jobid); -#endif + if (0 < non_zero) { /* warn user */ opal_output(orte_clean_output, @@ -902,6 +898,11 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid) orte_proc_t proc; int rc; + /* stop local sensors for this job */ + if (ORTE_VPID_WILDCARD == vpid) { + orte_sensor.stop(job); + } + OBJ_CONSTRUCT(&cmd, opal_pointer_array_t); OBJ_CONSTRUCT(&proc, orte_proc_t); proc.name.jobid = job; diff --git a/orte/mca/ess/base/ess_base_std_orted.c b/orte/mca/ess/base/ess_base_std_orted.c index dbf0e8537f..28ef0785c2 100644 
--- a/orte/mca/ess/base/ess_base_std_orted.c +++ b/orte/mca/ess/base/ess_base_std_orted.c @@ -61,9 +61,8 @@ #include "orte/mca/notifier/base/base.h" #include "orte/mca/rmcast/base/base.h" #include "orte/mca/db/base/base.h" -#if ORTE_ENABLE_SENSORS #include "orte/mca/sensor/base/base.h" -#endif +#include "orte/mca/sensor/sensor.h" #include "orte/runtime/orte_cr.h" #include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_globals.h" @@ -419,7 +418,6 @@ int orte_ess_base_orted_setup(char **hosts) goto error; } -#if ORTE_ENABLE_SENSORS /* setup the SENSOR framework */ if (ORTE_SUCCESS != (ret = orte_sensor_base_open())) { ORTE_ERROR_LOG(ret); @@ -431,8 +429,9 @@ int orte_ess_base_orted_setup(char **hosts) error = "ortesensor_select"; goto error; } -#endif - + /* start the local sensors */ + orte_sensor.start(ORTE_PROC_MY_NAME->jobid); + return ORTE_SUCCESS; error: @@ -445,6 +444,9 @@ error: int orte_ess_base_orted_finalize(void) { + /* stop the local sensors */ + orte_sensor.stop(ORTE_PROC_MY_NAME->jobid); + /* ensure all the orteds depart together */ if (!orte_abnormal_term_ordered) { /* if we are abnormally terminating, don't attempt @@ -454,9 +456,7 @@ int orte_ess_base_orted_finalize(void) orte_grpcomm.onesided_barrier(); } -#if ORTE_ENABLE_SENSORS orte_sensor_base_close(); -#endif orte_db_base_close(); orte_notifier_base_close(); diff --git a/orte/mca/ess/hnp/ess_hnp_module.c b/orte/mca/ess/hnp/ess_hnp_module.c index 740f54ee47..0fee37def1 100644 --- a/orte/mca/ess/hnp/ess_hnp_module.c +++ b/orte/mca/ess/hnp/ess_hnp_module.c @@ -57,9 +57,8 @@ #include "orte/mca/notifier/base/base.h" #include "orte/mca/rmcast/base/base.h" #include "orte/mca/db/base/base.h" -#if ORTE_ENABLE_SENSORS #include "orte/mca/sensor/base/base.h" -#endif +#include "orte/mca/sensor/sensor.h" #include "orte/mca/rmaps/base/base.h" #if OPAL_ENABLE_FT_CR == 1 @@ -540,7 +539,6 @@ static int rte_init(void) goto error; } -#if ORTE_ENABLE_SENSORS /* setup the SENSOR framework */ if (ORTE_SUCCESS 
!= (ret = orte_sensor_base_open())) { ORTE_ERROR_LOG(ret); @@ -549,11 +547,12 @@ static int rte_init(void) } if (ORTE_SUCCESS != (ret = orte_sensor_base_select())) { ORTE_ERROR_LOG(ret); - error = "ortesensor_select"; + error = "orte_sensor_select"; goto error; } -#endif - + /* start the local sensors */ + orte_sensor.start(ORTE_PROC_MY_NAME->jobid); + /* if a tool has launched us and is requesting event reports, * then set its contact info into the comm system */ @@ -603,15 +602,16 @@ static int rte_finalize(void) orte_job_t *job; int i; + /* stop the local sensors */ + orte_sensor.stop(ORTE_PROC_MY_NAME->jobid); + /* remove my contact info file */ contact_path = opal_os_path(false, orte_process_info.top_session_dir, "contact.txt", NULL); unlink(contact_path); free(contact_path); -#if ORTE_ENABLE_SENSORS orte_sensor_base_close(); -#endif orte_db_base_close(); orte_notifier_base_close(); diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index b8b236e367..3e7eafc3d6 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -55,6 +55,7 @@ #include "orte/mca/plm/base/base.h" #include "orte/mca/routed/base/base.h" #include "orte/mca/rmaps/rmaps_types.h" +#include "orte/mca/sensor/sensor.h" #include "orte/util/context_fns.h" #include "orte/util/name_fns.h" @@ -1900,6 +1901,9 @@ CLEANUP: "%s odls:launch setting waitpids", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + /* start the sensors for this job (if any) */ + orte_sensor.start(ORTE_PROC_MY_NAME->jobid); + /* if the launch didn't fail, setup the waitpids on the children */ for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); diff --git a/orte/mca/plm/alps/plm_alps_module.c b/orte/mca/plm/alps/plm_alps_module.c index feec94871c..bc54671632 100644 --- a/orte/mca/plm/alps/plm_alps_module.c +++ b/orte/mca/plm/alps/plm_alps_module.c @@ -272,7 +272,7 @@ static int 
plm_alps_launch_job(orte_job_t *jdata) orte_plm_base_orted_append_basic_args(&argc, &argv, "alps", &proc_vpid_index, - false, nodelist_flat); + nodelist_flat); free(nodelist_flat); /* tell the new daemons the base of the name list so they can compute diff --git a/orte/mca/plm/base/Makefile.am b/orte/mca/plm/base/Makefile.am index 265963cd40..753f190b5c 100644 --- a/orte/mca/plm/base/Makefile.am +++ b/orte/mca/plm/base/Makefile.am @@ -40,6 +40,5 @@ libmca_plm_la_SOURCES += \ base/plm_base_jobid.c \ base/plm_base_proxy.c \ base/plm_base_orted_cmds.c \ - base/plm_base_rsh_support.c \ - base/plm_base_heartbeat.c + base/plm_base_rsh_support.c endif diff --git a/orte/mca/plm/base/plm_base_heartbeat.c b/orte/mca/plm/base/plm_base_heartbeat.c deleted file mode 100644 index 674b9b5a0d..0000000000 --- a/orte/mca/plm/base/plm_base_heartbeat.c +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - */ - -#include "orte_config.h" -#include "orte/constants.h" - -#ifdef HAVE_SYS_TIME_H -#include -#endif - -#include "opal/dss/dss.h" -#include "opal/class/opal_pointer_array.h" - -#include "orte/mca/rml/rml.h" -#include "orte/mca/rml/rml_types.h" -#include "orte/mca/errmgr/errmgr.h" -#include "orte/runtime/orte_globals.h" -#include "orte/runtime/orte_wait.h" -#include "orte/util/name_fns.h" - -#include "orte/mca/plm/base/plm_private.h" - -#define HEARTBEAT_CK 2 - -void orte_plm_base_heartbeat(int fd, short event, void *arg) -{ - opal_buffer_t buf; - orte_plm_cmd_flag_t command = ORTE_PLM_HEARTBEAT_CMD; - opal_event_t *tmp = (opal_event_t*)arg; - struct timeval now; - int rc; - - /* setup the buffer */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - - /* tell the HNP this is a heartbeat */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &command, 1, ORTE_PLM_CMD))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - - /* send heartbeat to HNP */ - if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &buf, ORTE_RML_TAG_PLM, 0))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - - /* reset the timer */ - now.tv_sec = orte_heartbeat_rate; - now.tv_usec = 0; - opal_evtimer_add(tmp, &now); - -CLEANUP: - OBJ_DESTRUCT(&buf); -} - -/* this function automatically gets periodically called - * by the event library so we can check on the state - * of the various orteds - */ -static void check_heartbeat(int fd, short dummy, void *arg) -{ - int v; - orte_proc_t *proc; - orte_job_t *daemons; - struct timeval timeout; - bool died = false; - opal_event_t *tmp = (opal_event_t*)arg; - struct timeval now; - - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:base:check_heartbeat", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* if we are aborting or shutting down, ignore this */ - if (orte_abnormal_term_ordered || 0 == orte_heartbeat_rate) { - return; - } - - /* get the job object for the daemons */ - if (NULL == 
(daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return; - } - - /* get current time */ - gettimeofday(&timeout, NULL); - - /* cycle through the daemons - make sure we check them all - * in case multiple daemons died so all of those that did die - * can be appropriately flagged - */ - for (v=1; v < daemons->procs->size; v++) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, v))) { - continue; - } - if ((timeout.tv_sec - proc->beat) > HEARTBEAT_CK*orte_heartbeat_rate) { - /* declare this orted dead */ - proc->state = ORTE_PROC_STATE_ABORTED; - proc->exit_code = ORTE_ERROR_DEFAULT_EXIT_CODE; - if (NULL == daemons->aborted_proc) { - daemons->aborted_proc = proc; - } - ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - died = true; - } - } - - /* if any daemon died, abort */ - if (died) { - orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_ABORTED, - NULL, ORTE_PROC_STATE_UNDEF, ORTE_ERROR_DEFAULT_EXIT_CODE); - return; - } - - /* reset the timer */ - now.tv_sec = HEARTBEAT_CK*orte_heartbeat_rate; - now.tv_usec = 0; - opal_evtimer_add(tmp, &now); -} - -void orte_plm_base_start_heart(void) -{ - /* if the heartbeat rate > 0, then start the heart */ - if (0 < orte_heartbeat_rate) { - ORTE_TIMER_EVENT(HEARTBEAT_CK*orte_heartbeat_rate, 0, check_heartbeat); - } -} diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index 815c8c2089..cd799f80c4 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -53,9 +53,6 @@ #include "orte/mca/filem/filem.h" #include "orte/mca/filem/base/base.h" #include "orte/mca/rml/base/rml_contact.h" -#if ORTE_ENABLE_SENSORS -#include "orte/mca/sensor/sensor.h" -#endif #include "orte/runtime/orte_globals.h" #include "orte/runtime/runtime.h" #include "orte/runtime/orte_locks.h" @@ -391,11 +388,6 @@ int 
orte_plm_base_launch_apps(orte_jobid_t job) goto WAKEUP; } -#if ORTE_ENABLE_SENSORS - /* start any sensor monitoring of this job */ - orte_sensor.start(job); -#endif - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, "%s plm:base:launch completed for job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -742,7 +734,7 @@ int orte_plm_base_setup_orted_cmd(int *argc, char ***argv) int orte_plm_base_orted_append_basic_args(int *argc, char ***argv, char *ess, int *proc_vpid_index, - bool heartbeat, char *nodes) + char *nodes) { char *param = NULL; int loc_id; @@ -788,13 +780,6 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv, opal_argv_append(argc, argv, param); free(param); } - if (heartbeat && 0 < orte_heartbeat_rate) { - /* tell the daemon to do a heartbeat */ - opal_argv_append(argc, argv, "--heartbeat"); - asprintf(&param, "%d", orte_heartbeat_rate); - opal_argv_append(argc, argv, param); - free(param); - } /* tell the orted what ESS component to use */ opal_argv_append(argc, argv, "-mca"); diff --git a/orte/mca/plm/base/plm_base_orted_cmds.c b/orte/mca/plm/base/plm_base_orted_cmds.c index 1323d20044..cead17be84 100644 --- a/orte/mca/plm/base/plm_base_orted_cmds.c +++ b/orte/mca/plm/base/plm_base_orted_cmds.c @@ -100,9 +100,6 @@ int orte_plm_base_orted_exit(orte_daemon_cmd_flag_t command) "%s plm:base:orted_cmd sending orted_exit commands", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* stop all heartbeats */ - orte_heartbeat_rate = 0; - OBJ_CONSTRUCT(&cmd, opal_buffer_t); /* since the orteds are being ordered to exit, and we are diff --git a/orte/mca/plm/base/plm_base_receive.c b/orte/mca/plm/base/plm_base_receive.c index b62666dfeb..9a3f9b4cb5 100644 --- a/orte/mca/plm/base/plm_base_receive.c +++ b/orte/mca/plm/base/plm_base_receive.c @@ -146,7 +146,6 @@ static void process_msg(int fd, short event, void *data) orte_proc_state_t state; orte_exit_code_t exit_code; int rc=ORTE_SUCCESS, ret; - struct timeval beat; orte_app_context_t *app, *child_app; 
opal_list_item_t *item; int dump[128]; @@ -458,29 +457,6 @@ static void process_msg(int fd, short event, void *data) } break; - case ORTE_PLM_HEARTBEAT_CMD: - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:receive got heartbeat from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&msgpkt->sender))); - /* lookup the daemon object */ - if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { - /* this job can not possibly have been removed, so this is an error */ - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - goto CLEANUP; - } - gettimeofday(&beat, NULL); - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, msgpkt->sender.vpid))) { - /* this proc is no longer in table - skip it */ - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:receive daemon %s is not in proc table", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_VPID_PRINT(msgpkt->sender.vpid))); - break; - } - proc->beat = beat.tv_sec; - break; - case ORTE_PLM_INIT_ROUTES_CMD: count=1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &job, &count, ORTE_JOBID))) { diff --git a/orte/mca/plm/base/plm_private.h b/orte/mca/plm/base/plm_private.h index 806ff47948..3a5e20c796 100644 --- a/orte/mca/plm/base/plm_private.h +++ b/orte/mca/plm/base/plm_private.h @@ -131,12 +131,6 @@ ORTE_DECLSPEC int orte_plm_base_append_bootproxy_args(orte_app_context_t *app, c orte_node_rank_t nrank, orte_local_rank_t lrank, orte_vpid_t nlocal, int nslots, bool overwrite); -/** - * Heartbeat support - */ -ORTE_DECLSPEC void orte_plm_base_heartbeat(int fd, short event, void *data); -ORTE_DECLSPEC void orte_plm_base_start_heart(void); - /** * Utilities for plm components that use proxy daemons */ @@ -161,7 +155,7 @@ ORTE_DECLSPEC void orte_plm_base_recv(int status, orte_process_name_t* sender, ORTE_DECLSPEC int orte_plm_base_orted_append_basic_args(int *argc, char ***argv, char *ess_module, int *proc_vpid_index, - bool heartbeat, char *nodes); + char 
*nodes); /* * Proxy functions for use by daemons and application procs diff --git a/orte/mca/plm/ccp/plm_ccp_module.c b/orte/mca/plm/ccp/plm_ccp_module.c index 5a6a2eb221..118adb4f83 100644 --- a/orte/mca/plm/ccp/plm_ccp_module.c +++ b/orte/mca/plm/ccp/plm_ccp_module.c @@ -216,7 +216,7 @@ GETMAP: /* Add basic orted command line options */ orte_plm_base_orted_append_basic_args(&argc, &argv, "env", &proc_vpid_index, - false, NULL); + NULL); if (0 < opal_output_get_verbosity(orte_plm_globals.output)) { param = opal_argv_join(argv, ' '); diff --git a/orte/mca/plm/lsf/plm_lsf_module.c b/orte/mca/plm/lsf/plm_lsf_module.c index b98ec4a403..fb5da7c3e9 100644 --- a/orte/mca/plm/lsf/plm_lsf_module.c +++ b/orte/mca/plm/lsf/plm_lsf_module.c @@ -221,7 +221,7 @@ static int plm_lsf_launch_job(orte_job_t *jdata) orte_plm_base_orted_append_basic_args(&argc, &argv, "lsf", &proc_vpid_index, - false, nodelist); + nodelist); free(nodelist); /* tell the new daemons the base of the name list so they can compute diff --git a/orte/mca/plm/plm_types.h b/orte/mca/plm/plm_types.h index b94b4387c4..baa7833228 100644 --- a/orte/mca/plm/plm_types.h +++ b/orte/mca/plm/plm_types.h @@ -62,7 +62,7 @@ typedef uint32_t orte_proc_state_t; #define ORTE_PROC_STATE_COMM_FAILED 0x00002000 /* process communication has failed */ #define ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED 0x00004000 /* process exceeded a sensor limit */ #define ORTE_PROC_STATE_CALLED_ABORT 0x00008000 /* process called "errmgr.abort" */ - +#define ORTE_PROC_STATE_HEARTBEAT_FAILED 0x00010000 /* heartbeat failed to arrive */ /* * Job state codes */ @@ -93,6 +93,7 @@ typedef uint32_t orte_job_state_t; #define ORTE_JOB_STATE_COMM_FAILED 0x00002000 /* communication has failed */ #define ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED 0x00004000 /* job had a process that exceeded a sensor limit */ #define ORTE_JOB_STATE_CALLED_ABORT 0x00008000 /* at least one process called "errmgr.abort" */ +#define ORTE_JOB_STATE_HEARTBEAT_FAILED 0x00010000 /* heartbeat 
failed to arrive */ /* the job never even attempted to launch due to an error earlier in the * launch procedure @@ -131,8 +132,7 @@ typedef uint8_t orte_plm_cmd_flag_t; #define ORTE_PLM_CMD OPAL_UINT8 #define ORTE_PLM_LAUNCH_JOB_CMD 1 #define ORTE_PLM_UPDATE_PROC_STATE 2 -#define ORTE_PLM_HEARTBEAT_CMD 3 -#define ORTE_PLM_INIT_ROUTES_CMD 4 +#define ORTE_PLM_INIT_ROUTES_CMD 3 END_C_DECLS diff --git a/orte/mca/plm/process/plm_process_module.c b/orte/mca/plm/process/plm_process_module.c index 71f2d626a8..319c33ee2b 100644 --- a/orte/mca/plm/process/plm_process_module.c +++ b/orte/mca/plm/process/plm_process_module.c @@ -1074,7 +1074,7 @@ int orte_plm_process_launch(orte_job_t *jdata) orte_plm_base_orted_append_basic_args(&argc, &argv, "env", &proc_vpid_index, - false, NULL); + NULL); if (0 < opal_output_get_verbosity(orte_plm_globals.output)) { param = opal_argv_join(argv, ' '); diff --git a/orte/mca/plm/rsh/plm_rsh_module.c b/orte/mca/plm/rsh/plm_rsh_module.c index b4d64351c1..16822ced8c 100644 --- a/orte/mca/plm/rsh/plm_rsh_module.c +++ b/orte/mca/plm/rsh/plm_rsh_module.c @@ -665,7 +665,7 @@ static int setup_launch(int *argcptr, char ***argvptr, orte_plm_base_orted_append_basic_args(&argc, &argv, "env", proc_vpid_index, - true, NULL); + NULL); /* ensure that only the ssh plm is selected on the remote daemon */ opal_argv_append_nosize(&argv, "-mca"); @@ -1431,14 +1431,6 @@ launch_apps: recv_issued = false; } - /* setup a "heartbeat" timer to periodically check on - * the state-of-health of the orteds, if requested AND - * we actually launched some daemons! 
- */ - if ((NULL != map) && (0 < map->num_new_daemons)) { - orte_plm_base_start_heart(); - } - return rc; } diff --git a/orte/mca/plm/rshd/plm_rshd_module.c b/orte/mca/plm/rshd/plm_rshd_module.c index 303cc6a5e1..1df0a0f1b2 100644 --- a/orte/mca/plm/rshd/plm_rshd_module.c +++ b/orte/mca/plm/rshd/plm_rshd_module.c @@ -239,7 +239,6 @@ static void ssh_child(char *cmd, char **argv) */ int orte_plm_rshd_launch(orte_job_t *jdata) { - orte_job_map_t *map = NULL; char **argv = NULL; char *cmd, *param; int rc, i; @@ -379,14 +378,6 @@ cleanup: ORTE_ERROR_DEFAULT_EXIT_CODE); } - /* setup a "heartbeat" timer to periodically check on - * the state-of-health of the orteds, if requested AND - * we actually launched some daemons! - */ - if ((NULL != map) && (0 < map->num_new_daemons)) { - orte_plm_base_start_heart(); - } - return rc; } diff --git a/orte/mca/plm/slurm/plm_slurm_module.c b/orte/mca/plm/slurm/plm_slurm_module.c index 329a0f0b55..c597157e30 100644 --- a/orte/mca/plm/slurm/plm_slurm_module.c +++ b/orte/mca/plm/slurm/plm_slurm_module.c @@ -309,7 +309,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata) /* Add basic orted command line options, including debug flags */ orte_plm_base_orted_append_basic_args(&argc, &argv, "slurm", &proc_vpid_index, - false, nodelist_flat); + nodelist_flat); free(nodelist_flat); /* tell the new daemons the base of the name list so they can compute diff --git a/orte/mca/plm/tm/plm_tm_module.c b/orte/mca/plm/tm/plm_tm_module.c index 8cc454de82..1ce6a70160 100644 --- a/orte/mca/plm/tm/plm_tm_module.c +++ b/orte/mca/plm/tm/plm_tm_module.c @@ -244,7 +244,7 @@ static int plm_tm_launch_job(orte_job_t *jdata) /* Add basic orted command line options */ orte_plm_base_orted_append_basic_args(&argc, &argv, "tm", &proc_vpid_index, - true, nodelist); + nodelist); free(nodelist); if (0 < opal_output_get_verbosity(orte_plm_globals.output)) { @@ -465,14 +465,6 @@ launch_apps: ORTE_ERROR_DEFAULT_EXIT_CODE); } - /* setup a "heartbeat" timer to periodically 
check on - * the state-of-health of the orteds, if requested AND - * we actually launched some daemons! - */ - if (0 < map->num_new_daemons) { - orte_plm_base_start_heart(); - } - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:tm:launch: finished", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); diff --git a/orte/mca/plm/tmd/plm_tmd_module.c b/orte/mca/plm/tmd/plm_tmd_module.c index dbea163277..c0eb1f7923 100644 --- a/orte/mca/plm/tmd/plm_tmd_module.c +++ b/orte/mca/plm/tmd/plm_tmd_module.c @@ -552,14 +552,6 @@ launch_apps: ORTE_ERROR_DEFAULT_EXIT_CODE); } - /* setup a "heartbeat" timer to periodically check on - * the state-of-health of the orteds, if requested AND - * we actually launched some daemons! - */ - if (0 < map->num_new_daemons) { - orte_plm_base_start_heart(); - } - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:tm:launch: finished", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); diff --git a/orte/mca/plm/xgrid/src/plm_xgrid_client.m b/orte/mca/plm/xgrid/src/plm_xgrid_client.m index d30da12c48..9119a12f61 100644 --- a/orte/mca/plm/xgrid/src/plm_xgrid_client.m +++ b/orte/mca/plm/xgrid/src/plm_xgrid_client.m @@ -438,7 +438,7 @@ cleanup: orte_plm_base_orted_append_basic_args(&argc, &argv, "env", NULL, - true, NULL); + NULL); /* Note that capacity is a starting capacity, not max */ NSMutableArray *ret = [NSMutableArray arrayWithCapacity: argc]; diff --git a/orte/mca/rml/rml_types.h b/orte/mca/rml/rml_types.h index 8beca71c81..800fb657bd 100644 --- a/orte/mca/rml/rml_types.h +++ b/orte/mca/rml/rml_types.h @@ -177,6 +177,9 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_msg_packet_t); /* tag for receiving ack of abort msg */ #define ORTE_RML_TAG_ABORT 38 +/* tag for receiving heartbeats */ +#define ORTE_RML_TAG_HEARTBEAT 39 + #define ORTE_RML_TAG_MAX 100 diff --git a/orte/mca/sensor/file/sensor_file.c b/orte/mca/sensor/file/sensor_file.c index 12ef1f52d1..0fccb5fd64 100644 --- a/orte/mca/sensor/file/sensor_file.c +++ b/orte/mca/sensor/file/sensor_file.c @@ 
-139,6 +139,11 @@ static void start(orte_jobid_t jobid) char *filename; file_tracker_t *ft; + /* cannot monitor my own job */ + if (jobid == ORTE_PROC_MY_NAME->jobid) { + return; + } + OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, "%s starting file monitoring for job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -252,6 +257,11 @@ static void stop(orte_jobid_t jobid) opal_list_item_t *item; file_tracker_t *ft; + /* cannot monitor my own job */ + if (jobid == ORTE_PROC_MY_NAME->jobid) { + return; + } + for (item = opal_list_get_first(&jobs); item != opal_list_get_end(&jobs); item = opal_list_get_next(item)) { diff --git a/orte/mca/sensor/heartbeat/Makefile.am b/orte/mca/sensor/heartbeat/Makefile.am new file mode 100644 index 0000000000..a3680ac16b --- /dev/null +++ b/orte/mca/sensor/heartbeat/Makefile.am @@ -0,0 +1,37 @@ +# +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +dist_pkgdata_DATA = help-orte-sensor-heartbeat.txt + +sources = \ + sensor_heartbeat.c \ + sensor_heartbeat.h \ + sensor_heartbeat_component.c + +# Make the output library in this directory, and name it either +# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la +# (for static builds). 
+ +if OMPI_BUILD_sensor_heartbeat_DSO +component_noinst = +component_install = mca_sensor_heartbeat.la +else +component_noinst = libmca_sensor_heartbeat.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_sensor_heartbeat_la_SOURCES = $(sources) +mca_sensor_heartbeat_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_sensor_heartbeat_la_SOURCES =$(sources) +libmca_sensor_heartbeat_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/sensor/heartbeat/configure.m4 b/orte/mca/sensor/heartbeat/configure.m4 new file mode 100644 index 0000000000..8500b6480b --- /dev/null +++ b/orte/mca/sensor/heartbeat/configure.m4 @@ -0,0 +1,19 @@ +# -*- shell-script -*- +# +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_sensor_heartbeat_CONFIG([action-if-found], [action-if-not-found]) +# ----------------------------------------------------------- +AC_DEFUN([MCA_sensor_heartbeat_CONFIG], [ + # if we don't want heartbeats, don't compile + # this component + AS_IF([test "$orte_want_heartbeats" = "1"], + [$1], [$2]) +])dnl + diff --git a/orte/mca/sensor/heartbeat/configure.params b/orte/mca/sensor/heartbeat/configure.params new file mode 100644 index 0000000000..0a0f20e94b --- /dev/null +++ b/orte/mca/sensor/heartbeat/configure.params @@ -0,0 +1,14 @@ +# -*- shell-script -*- +# +# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. 
+# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module + +PARAM_CONFIG_FILES="Makefile" diff --git a/orte/mca/sensor/heartbeat/help-orte-sensor-heartbeat.txt b/orte/mca/sensor/heartbeat/help-orte-sensor-heartbeat.txt new file mode 100644 index 0000000000..3c8d15418c --- /dev/null +++ b/orte/mca/sensor/heartbeat/help-orte-sensor-heartbeat.txt @@ -0,0 +1,20 @@ +# -*- text -*- +# +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is the US/English general help file for the heartbeat sensor +# +[mem-limit-exceeded] +A process has exceeded the specified limit on memory usage: + +Node: %s +Process rank: %s +Memory used: %luGbytes +Memory limit: %luGbytes + diff --git a/orte/mca/sensor/heartbeat/sensor_heartbeat.c b/orte/mca/sensor/heartbeat/sensor_heartbeat.c new file mode 100644 index 0000000000..fbc4cebd1c --- /dev/null +++ b/orte/mca/sensor/heartbeat/sensor_heartbeat.c @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "orte/constants.h" +#include "orte/types.h" + +#include +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ +#ifdef HAVE_STRING_H +#include +#endif /* HAVE_STRING_H */ +#include + +#include "opal_stdint.h" +#include "opal/util/argv.h" +#include "opal/util/output.h" +#include "opal/mca/pstat/pstat.h" + +#include "orte/util/show_help.h" +#include "orte/util/proc_info.h" +#include "orte/util/name_fns.h" +#include "orte/util/nidmap.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/odls/odls_types.h" +#include "orte/mca/rmcast/rmcast.h" +#include "orte/mca/rml/rml.h" +#include "orte/runtime/orte_wait.h" +#include "orte/runtime/orte_globals.h" + +#include "orte/mca/sensor/base/base.h" +#include "orte/mca/sensor/base/sensor_private.h" +#include "sensor_heartbeat.h" + +/* declare the API functions */ +static int init(void); +static void finalize(void); +static void start(orte_jobid_t job); +static void stop(orte_jobid_t job); + +/* instantiate the module */ +orte_sensor_base_module_t orte_sensor_heartbeat_module = { + init, + finalize, + start, + stop +}; + +/* declare the local functions */ +static void check_heartbeat(int fd, short event, void *arg); +static void send_heartbeat(int fd, short event, void *arg); +#if ORTE_ENABLE_MULTICAST +static void recv_rmcast_beats(int status, + orte_rmcast_channel_t channel, + orte_rmcast_tag_t tag, + orte_process_name_t *sender, + opal_buffer_t *buf, void* cbdata); +static void rmcast_callback_fn(int status, + orte_rmcast_channel_t channel, + orte_rmcast_tag_t tag, + orte_process_name_t *sender, + opal_buffer_t *buf, void* cbdata); +#else +static void recv_rml_beats(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata); +static void rml_callback_fn(int status, + struct orte_process_name_t* peer, + struct opal_buffer_t* buffer, + orte_rml_tag_t tag, + 
+                            void* cbdata);
+#endif
+
+/* local globals */
+/* timer events for the send and check callbacks; NULL while not set up */
+static opal_event_t *send_ev = NULL, *check_ev = NULL;
+/* intervals used to arm and re-arm the two timers */
+static struct timeval send_time, check_time;
+/* oldest acceptable age of a beat, in seconds (the units gettime() returns) */
+static double timeout;
+
+#include MCA_timer_IMPLEMENTATION_HEADER
+/*
+ * Return the current time in seconds as a double. Uses the native
+ * microsecond timer when OPAL_TIMER_USEC_NATIVE is set, otherwise
+ * falls back to gettimeofday().
+ */
+static inline double gettime(void) __opal_attribute_always_inline__;
+static inline double gettime(void)
+{
+    double wtime;
+#if OPAL_TIMER_USEC_NATIVE
+    wtime = ((double) opal_timer_base_get_usec()) / 1000000.0;
+#else
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    wtime = tv.tv_sec;
+    wtime += (double)tv.tv_usec / 1000000.0;
+#endif
+    return wtime;
+}
+
+/*
+ * Remove a timer event from the event library, free it and clear the
+ * caller's pointer. Does nothing when *ev is already NULL.
+ */
+static void release_timer(opal_event_t **ev)
+{
+    if (NULL != *ev) {
+        opal_event_del(*ev);
+        free(*ev);
+        *ev = NULL;
+    }
+}
+
+/*
+ * Post the non-blocking receive that collects heartbeats.
+ *
+ * Multicast builds post a persistent receive on the system channel.
+ * Otherwise only the HNP posts a (non-persistent) RML receive.
+ *
+ * Returns ORTE_SUCCESS, or the error code reported by the receive call.
+ */
+static int init(void)
+{
+    /* start from success: in a non-multicast build a process that is
+     * not the HNP posts no receive and must not return garbage
+     */
+    int rc = ORTE_SUCCESS;
+
+#if ORTE_ENABLE_MULTICAST
+    /* setup multicast recv for heartbeats */
+    if (ORTE_SUCCESS != (rc = orte_rmcast.recv_buffer_nb(ORTE_RMCAST_SYS_CHANNEL,
+                                                         ORTE_RMCAST_TAG_HEARTBEAT,
+                                                         ORTE_RMCAST_PERSISTENT,
+                                                         recv_rmcast_beats, NULL))) {
+        ORTE_ERROR_LOG(rc);
+    }
+#else
+    /* setup RML recv for the HNP to receive heartbeats */
+    if (ORTE_PROC_IS_HNP) {
+        if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
+                                                          ORTE_RML_TAG_HEARTBEAT,
+                                                          ORTE_RML_NON_PERSISTENT,
+                                                          recv_rml_beats,
+                                                          NULL))) {
+            ORTE_ERROR_LOG(rc);
+        }
+    }
+#endif
+
+    return rc;
+}
+
+/*
+ * Tear the module down: release both timers and cancel the
+ * heartbeat receive.
+ */
+static void finalize(void)
+{
+    release_timer(&send_ev);
+    release_timer(&check_ev);
+
+#if ORTE_ENABLE_MULTICAST
+    orte_rmcast.cancel_recv(ORTE_RMCAST_SYS_CHANNEL, ORTE_RMCAST_TAG_HEARTBEAT);
+#else
+    orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_HEARTBEAT);
+#endif
+    return;
+}
+
+/*
+ * Start sending and checking heartbeats
+ *
+ * Only acts when jobid is this process's own job. Arms the send timer
+ * with the component's "beat" interval and the check timer with its
+ * "check" interval (both configured in milliseconds). A beat older
+ * than two send intervals is treated as missed.
+ */
+static void start(orte_jobid_t jobid)
+{
+    uint64_t usec;
+
+    if (jobid != ORTE_PROC_MY_NAME->jobid) {
+        /* heartbeats are only for daemons and HNPs */
+        return;
+    }
+
+    /* setup the send */
+    usec = (uint64_t)mca_sensor_heartbeat_component.beat * 1000; /* convert to microsecs */
+    send_ev = malloc(sizeof(*send_ev));
+    if (NULL == send_ev) {
+        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+        return;
+    }
+    opal_evtimer_set(send_ev, send_heartbeat, send_ev);
+    send_time.tv_sec = usec / 1000000;
+    send_time.tv_usec = usec % 1000000;
+    opal_evtimer_add(send_ev, &send_time);
+
+    /* define the timeout - it is compared against differences of
+     * gettime() values, which are in seconds, so convert from
+     * microseconds before storing it
+     */
+    timeout = 2.0 * (double)usec / 1000000.0;
+
+    /* setup the check */
+    usec = (uint64_t)mca_sensor_heartbeat_component.check * 1000; /* convert to microsecs */
+    check_ev = malloc(sizeof(*check_ev));
+    if (NULL == check_ev) {
+        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+        return;
+    }
+    opal_evtimer_set(check_ev, check_heartbeat, check_ev);
+    check_time.tv_sec = usec / 1000000;
+    check_time.tv_usec = usec % 1000000;
+    opal_evtimer_add(check_ev, &check_time);
+}
+
+
+/*
+ * Stop sending and checking heartbeats. Only acts when jobid is this
+ * process's own job; the heartbeat receive is left in place.
+ */
+static void stop(orte_jobid_t jobid)
+{
+    if (jobid != ORTE_PROC_MY_NAME->jobid) {
+        /* heartbeats are only for daemons and HNPs */
+        return;
+    }
+
+    release_timer(&send_ev);
+    release_timer(&check_ev);
+    return;
+}
+
+/*
+ * Timer callback: send one heartbeat and re-arm the send timer.
+ * arg is the timer event itself. The message carries no payload.
+ */
+static void send_heartbeat(int fd, short event, void *arg)
+{
+    opal_buffer_t *buf;
+    opal_event_t *tmp = (opal_event_t*)arg;
+    int rc;
+
+    /* if we are aborting or shutting down, ignore this */
+    if (orte_abnormal_term_ordered || orte_finalizing) {
+        return;
+    }
+
+    /* setup the buffer - nothing to pack as receipt alone is the "beat" */
+    buf = OBJ_NEW(opal_buffer_t);
+
+    /* on success the buffer is released by the send-complete callback;
+     * on failure we release it here. Note that the error paths return
+     * before the timer is re-armed, so no further beats are sent.
+     */
+#if ORTE_ENABLE_MULTICAST
+    if (ORTE_SUCCESS != (rc = orte_rmcast.send_buffer_nb(ORTE_RMCAST_SYS_CHANNEL,
+                                                         ORTE_RMCAST_TAG_HEARTBEAT, buf,
+                                                         rmcast_callback_fn, NULL))) {
+        ORTE_ERROR_LOG(rc);
+        OBJ_RELEASE(buf);
+        return;
+    }
+#else
+    /* send heartbeat to HNP */
+    if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
+                                          ORTE_RML_TAG_HEARTBEAT, 0,
+                                          rml_callback_fn, NULL))) {
+        ORTE_ERROR_LOG(rc);
+        OBJ_RELEASE(buf);
+        return;
+    }
+#endif
+
+    /* reset the timer */
+    opal_evtimer_add(tmp, &send_time);
+}
+
+/* this function automatically gets periodically called
+ * by the event library so we can check on the state
+ * of the various orteds
+ *
+ * A nid whose last beat is older than the timeout has its miss
+ * counter incremented; once that counter exceeds the component's
+ * "missed" limit the daemon is reported to the errmgr as
+ * HEARTBEAT_FAILED. arg is the timer event itself.
+ */
+static void check_heartbeat(int fd, short dummy, void *arg)
+{
+    int v;
+    orte_nid_t *nid;
+    double now;
+    opal_event_t *tmp = (opal_event_t*)arg;
+    orte_process_name_t name;
+
+    OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
+                         "%s sensor:check_heartbeat",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+
+    /* if we are aborting or shutting down, ignore this */
+    if (orte_abnormal_term_ordered || orte_finalizing) {
+        return;
+    }
+
+    name.jobid = ORTE_PROC_MY_NAME->jobid;
+
+    /* get current time */
+    now = gettime();
+
+    /* cycle through the nidmap - make sure we check them all
+     * in case multiple daemons are late so all of those that did
+     * can be appropriately flagged
+     */
+    for (v=0; v < orte_nidmap.size; v++) {
+        if (NULL == (nid = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, v))) {
+            continue;
+        }
+        if (0 == nid->beat) {
+            /* haven't recvd a beat yet */
+            continue;
+        }
+        if ((now - nid->beat) > timeout) {
+            nid->missed++;
+            if (mca_sensor_heartbeat_component.missed < nid->missed) {
+                /* heartbeat failed */
+                /* NOTE(review): this uses the nidmap index as the daemon's
+                 * vpid, while beats are recorded via orte_util_lookup_nid();
+                 * confirm the two always agree (nid->daemon may be the
+                 * intended value)
+                 */
+                name.vpid = v;
+                orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_HEARTBEAT_FAILED,
+                                         &name, ORTE_PROC_STATE_HEARTBEAT_FAILED,
+                                         ORTE_ERROR_DEFAULT_EXIT_CODE);
+            }
+        } else {
+            /* the last beat is recent - the daemon is alive, so only
+             * consecutive misses count towards the limit
+             */
+            nid->missed = 0;
+        }
+    }
+
+    /* reset the timer */
+    opal_evtimer_add(tmp, &check_time);
+}
+
+#if ORTE_ENABLE_MULTICAST
+/*
+ * Multicast receive callback: record the arrival time of the
+ * sender's heartbeat in its nid entry.
+ */
+static void recv_rmcast_beats(int status,
+                              orte_rmcast_channel_t channel,
+                              orte_rmcast_tag_t tag,
+                              orte_process_name_t *sender,
+                              opal_buffer_t *buf, void* cbdata)
+{
+    orte_nid_t *nid;
+
+    /* if we are aborting or shutting down, ignore this */
+    if (orte_abnormal_term_ordered || orte_finalizing) {
+        return;
+    }
+
+    /* get this daemon's nid */
+    if (NULL == (nid = orte_util_lookup_nid(sender))) {
+        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+        return;
+    }
+
+    /* update its time */
+    nid->beat = gettime();
+}
+
+/*
+ * Multicast send-complete callback: release the heartbeat buffer.
+ */
+static void rmcast_callback_fn(int status,
+                               orte_rmcast_channel_t channel,
+                               orte_rmcast_tag_t tag,
+                               orte_process_name_t *sender,
+                               opal_buffer_t *buf, void* cbdata)
+{
+    OBJ_RELEASE(buf);
+}
+
+#else
+/*
+ * RML receive callback: record the arrival time of the sender's
+ * heartbeat in its nid entry, then repost the receive (it is
+ * non-persistent, so it must be reissued after every message).
+ * Nothing is reposted once an abort or finalize is under way.
+ */
+static void recv_rml_beats(int status, orte_process_name_t* sender,
+                           opal_buffer_t* buffer, orte_rml_tag_t tag,
+                           void* cbdata)
+{
+    orte_nid_t *nid;
+    int rc;
+
+    /* if we are aborting or shutting down, ignore this */
+    if (orte_abnormal_term_ordered || orte_finalizing) {
+        return;
+    }
+
+    /* get this daemon's nid */
+    if (NULL == (nid = orte_util_lookup_nid(sender))) {
+        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+    } else {
+        /* update its time */
+        nid->beat = gettime();
+        /* a beat arrived, so earlier misses no longer count */
+        nid->missed = 0;
+    }
+
+    /* reissue the recv */
+    if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
+                                                      ORTE_RML_TAG_HEARTBEAT,
+                                                      ORTE_RML_NON_PERSISTENT,
+                                                      recv_rml_beats,
+                                                      NULL))) {
+        ORTE_ERROR_LOG(rc);
+    }
+}
+
+/*
+ * RML send-complete callback: release the heartbeat buffer.
+ */
+static void rml_callback_fn(int status,
+                            struct orte_process_name_t* peer,
+                            struct opal_buffer_t* buffer,
+                            orte_rml_tag_t tag,
+                            void* cbdata)
+{
+    OBJ_RELEASE(buffer);
+}
+#endif
diff --git a/orte/mca/sensor/heartbeat/sensor_heartbeat.h b/orte/mca/sensor/heartbeat/sensor_heartbeat.h
new file mode 100644
index 0000000000..c7a62c0010
--- /dev/null
+++ b/orte/mca/sensor/heartbeat/sensor_heartbeat.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ * @file
+ *
+ * Heartbeat sensor
+ */
+#ifndef ORTE_SENSOR_HEARTBEAT_H
+#define ORTE_SENSOR_HEARTBEAT_H
+
+#include "orte_config.h"
+
+#include "orte/mca/sensor/sensor.h"
+
+BEGIN_C_DECLS
+
+struct orte_sensor_heartbeat_component_t {
+    orte_sensor_base_component_t super;  /* base sensor component */
+    int beat;                            /* heartbeat rate, in milliseconds */
+    int check;                           /* rate of checking for failure, in milliseconds */
+    int missed;                          /* missed heartbeats before failure is declared */
+};
+typedef struct orte_sensor_heartbeat_component_t orte_sensor_heartbeat_component_t;
+
+ORTE_MODULE_DECLSPEC extern orte_sensor_heartbeat_component_t mca_sensor_heartbeat_component;
+extern orte_sensor_base_module_t orte_sensor_heartbeat_module;
+
+
+END_C_DECLS
+
+#endif
diff --git a/orte/mca/sensor/heartbeat/sensor_heartbeat_component.c b/orte/mca/sensor/heartbeat/sensor_heartbeat_component.c
new file mode 100644
index 0000000000..324e227fe7
--- /dev/null
+++ b/orte/mca/sensor/heartbeat/sensor_heartbeat_component.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "orte_config.h"
+#include "orte/constants.h"
+
+#include "opal/mca/base/base.h"
+#include "opal/util/output.h"
+#include "opal/mca/base/mca_base_param.h"
+#include "opal/class/opal_pointer_array.h"
+
+#include "orte/util/proc_info.h"
+#include "orte/util/show_help.h"
+
+#include "sensor_heartbeat.h"
+
+/*
+ * Local functions
+ */
+
+static int orte_sensor_heartbeat_open(void);
+static int orte_sensor_heartbeat_close(void);
+static int orte_sensor_heartbeat_query(mca_base_module_t **module, int *priority);
+
+/*
+ * The component instance: MCA version/name data plus the
+ * open/close/query entry points. The beat, check and missed fields
+ * are filled in by orte_sensor_heartbeat_open().
+ */
+orte_sensor_heartbeat_component_t mca_sensor_heartbeat_component = {
+    {
+        {
+            ORTE_SENSOR_BASE_VERSION_1_0_0,
+
+            "heartbeat", /* MCA component name */
+            ORTE_MAJOR_VERSION,  /* MCA component major version */
+            ORTE_MINOR_VERSION,  /* MCA component minor version */
+            ORTE_RELEASE_VERSION,  /* MCA component release version */
+            orte_sensor_heartbeat_open,  /* component open  */
+            orte_sensor_heartbeat_close, /* component close */
+            orte_sensor_heartbeat_query  /* component query */
+        },
+        {
+            /* The component is checkpoint ready */
+            MCA_BASE_METADATA_PARAM_CHECKPOINT
+        }
+    }
+};
+
+
+/**
+ * component open/close/init function
+ *
+ * Registers the "beat", "check" and "missed" MCA parameters and
+ * stores their values in the component.
+ *
+ * @return ORTE_SUCCESS, or ORTE_ERR_FATAL when the beat or check
+ *         interval is not strictly positive.
+ */
+static int orte_sensor_heartbeat_open(void)
+{
+    mca_base_component_t *c = &mca_sensor_heartbeat_component.super.base_version;
+    int tmp;
+
+    /* lookup parameters */
+    mca_base_param_reg_int(c, "beat",
+                           "Heartbeat rate in milliseconds (default=1)",
+                           false, false, 1, &tmp);
+    /* the value becomes a timer interval, so zero must be rejected
+     * as well - the message already promises "> 0"
+     */
+    if (tmp <= 0) {
+        opal_output(0, "Illegal value %d - must be > 0", tmp);
+        return ORTE_ERR_FATAL;
+    }
+    mca_sensor_heartbeat_component.beat = tmp;
+
+    mca_base_param_reg_int(c, "check",
+                           "Check for failure rate in milliseconds (default=5)",
+                           false, false, 5, &tmp);
+    /* also a timer interval - apply the same rule as for "beat" */
+    if (tmp <= 0) {
+        opal_output(0, "Illegal value %d - must be > 0", tmp);
+        return ORTE_ERR_FATAL;
+    }
+    mca_sensor_heartbeat_component.check = tmp;
+
+    mca_base_param_reg_int(c, "missed",
+                           "Number of missed heartbeats before failure is declared (default=5)",
+                           false, false, 5, &tmp);
+    mca_sensor_heartbeat_component.missed = tmp;
+
+    return ORTE_SUCCESS;
+}
+
+
+/*
+ * Component query: hand back the heartbeat module with priority 10.
+ */
+static int orte_sensor_heartbeat_query(mca_base_module_t **module, int *priority)
+{
+    *priority = 10;  /* use if we were built */
+    *module = (mca_base_module_t *)&orte_sensor_heartbeat_module;
+
+    return ORTE_SUCCESS;
+}
+
+/**
+ *  Close all subsystems.
+ */
+
+static int orte_sensor_heartbeat_close(void)
+{
+    return ORTE_SUCCESS;
+}
diff --git a/orte/mca/sensor/memusage/sensor_memusage.c b/orte/mca/sensor/memusage/sensor_memusage.c
index 1485310c4e..aca504b8fc 100644
--- a/orte/mca/sensor/memusage/sensor_memusage.c
+++ b/orte/mca/sensor/memusage/sensor_memusage.c
@@ -109,6 +109,11 @@ static void start(orte_jobid_t jobid)
     opal_list_item_t *item;
     int rc, tmp;
 
+    /* cannot monitor my own job */
+    if (jobid == ORTE_PROC_MY_NAME->jobid) {
+        return;
+    }
+
     OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                          "%s starting memory monitoring for job %s",
                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@@ -177,6 +182,11 @@ static void stop(orte_jobid_t jobid)
     opal_list_item_t *item;
     memusage_tracker_t *job;
 
+    /* cannot monitor my own job */
+    if (jobid == ORTE_PROC_MY_NAME->jobid) {
+        return;
+    }
+
     for (item = opal_list_get_first(&jobs);
          item != opal_list_get_end(&jobs);
          item = opal_list_get_next(item)) {
diff --git a/orte/orted/orted_main.c b/orte/orted/orted_main.c
index c36fb8fb55..280af2d423 100644
--- a/orte/orted/orted_main.c
+++ b/orte/orted/orted_main.c
@@ -125,7 +125,6 @@ static struct {
     int fail;
     int fail_delay;
     bool abort;
-    int heartbeat;
 } orted_globals;
 
 /*
@@ -149,10 +148,6 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
       &orted_globals.fail_delay, OPAL_CMD_LINE_TYPE_INT,
       "Have the orted specified for failure delay for the provided number of seconds before failing" },
 
-    { NULL, NULL, NULL, '\0', NULL, "heartbeat", 1,
-      &orted_globals.heartbeat, OPAL_CMD_LINE_TYPE_INT,
-      "Seconds between orted heartbeat messages to be sent to HNP (default: 0 => no heartbeat)" },
-
     { "orte", "debug", NULL, 'd', NULL,
"debug", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, "Debug the OpenRTE" }, @@ -785,11 +780,6 @@ int orte_daemon(int argc, char *argv[]) opal_output(0, "%s orted: up and running - waiting for commands!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } - /* if we were told to do a heartbeat, then setup to do so */ - if (0 < orted_globals.heartbeat) { - ORTE_TIMER_EVENT(orted_globals.heartbeat, 0, orte_plm_base_heartbeat); - } - /* if we were given a launch string, then process it */ if (NULL != orted_launch_cmd) { opal_buffer_t launch; diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index 770baba0fb..3f19bd45b5 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -94,7 +94,6 @@ bool orte_abnormal_term_ordered = false; bool orte_routing_is_enabled = false; bool orte_job_term_ordered = false; -int orte_heartbeat_rate; int orte_startup_timeout; int orte_timeout_usec_per_proc; @@ -828,7 +827,6 @@ static void orte_proc_construct(orte_proc_t* proc) proc->node = NULL; proc->nodename = NULL; proc->rml_uri = NULL; - proc->beat = 0; proc->restarts = 0; proc->relocates = 0; #if OPAL_ENABLE_FT_CR == 1 @@ -908,6 +906,10 @@ static void orte_nid_construct(orte_nid_t *ptr) ptr->daemon = ORTE_VPID_INVALID; OBJ_CONSTRUCT(&ptr->attrs, opal_list_t); OBJ_CONSTRUCT(&ptr->sysinfo, opal_list_t); +#if ORTE_ENABLE_HEARTBEAT + ptr->beat = 0; + ptr->missed = 0; +#endif } static void orte_nid_destruct(orte_nid_t *ptr) diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index 10b937aefb..5c8947160b 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -450,8 +450,6 @@ struct orte_proc_t { char *nodename; /* RML contact info */ char *rml_uri; - /* seconds when last heartbeat was detected */ - time_t beat; /* number of times this process has been restarted */ int32_t restarts; /* number of times this process has been relocated */ @@ -489,6 +487,12 @@ typedef struct { opal_list_t attrs; /* list of system info */ opal_list_t 
sysinfo; +#if ORTE_ENABLE_HEARTBEAT + /* seconds when last heartbeat was detected */ + double beat; + /* number of missed heartbeats */ + int missed; +#endif } orte_nid_t; ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_nid_t); @@ -587,7 +591,6 @@ ORTE_DECLSPEC extern bool orte_abnormal_term_ordered; ORTE_DECLSPEC extern bool orte_routing_is_enabled; ORTE_DECLSPEC extern bool orte_job_term_ordered; -ORTE_DECLSPEC extern int orte_heartbeat_rate; ORTE_DECLSPEC extern int orte_startup_timeout; ORTE_DECLSPEC extern int orte_timeout_usec_per_proc; diff --git a/orte/runtime/orte_mca_params.c b/orte/runtime/orte_mca_params.c index 5efd4fd76a..7c39820c8a 100644 --- a/orte/runtime/orte_mca_params.c +++ b/orte/runtime/orte_mca_params.c @@ -185,10 +185,6 @@ int orte_register_params(void) "Have the specified orted fail after specified number of seconds (default: 0 => no delay)", false, false, 0, &orted_debug_failure_delay); - mca_base_param_reg_int_name("orte", "heartbeat_rate", - "Seconds between checks for daemon state-of-health (default: 0 => do not check)", - false, false, 0, &orte_heartbeat_rate); - mca_base_param_reg_int_name("orte", "startup_timeout", "Milliseconds/daemon to wait for startup before declaring failed_to_start (default: 0 => do not check)", false, false, 0, &orte_startup_timeout); diff --git a/orte/tools/orte-info/components.c b/orte/tools/orte-info/components.c index a37d0c8c2f..3153f4dec1 100644 --- a/orte/tools/orte-info/components.c +++ b/orte/tools/orte-info/components.c @@ -89,10 +89,8 @@ #include "orte/mca/snapc/snapc.h" #include "orte/mca/snapc/base/base.h" #endif -#if ORTE_ENABLE_SENSORS #include "orte/mca/sensor/sensor.h" #include "orte/mca/sensor/base/base.h" -#endif #include "orte/mca/filem/filem.h" #include "orte/mca/filem/base/base.h" #endif @@ -436,7 +434,6 @@ void orte_info_open_components(void) opal_pointer_array_add(&component_map, map); #endif -#if ORTE_ENABLE_SENSORS if (ORTE_SUCCESS != orte_sensor_base_open()) { goto error; } @@ -444,7 +441,6 
@@ void orte_info_open_components(void) map->type = strdup("sensor"); map->components = &mca_sensor_base_components_available; opal_pointer_array_add(&component_map, map); -#endif if (ORTE_SUCCESS != orte_filem_base_open()) { goto error; diff --git a/orte/tools/orte-info/orte-info.c b/orte/tools/orte-info/orte-info.c index efddec7c2d..37c67b812b 100644 --- a/orte/tools/orte-info/orte-info.c +++ b/orte/tools/orte-info/orte-info.c @@ -210,9 +210,7 @@ int main(int argc, char *argv[]) #if OPAL_ENABLE_FT_CR == 1 opal_pointer_array_add(&mca_types, "snapc"); #endif -#if ORTE_ENABLE_SENSORS opal_pointer_array_add(&mca_types, "sensor"); -#endif opal_pointer_array_add(&mca_types, "filem"); #endif /* these are always included */ diff --git a/orte/util/error_strings.c b/orte/util/error_strings.c index 526f58758f..1572a21acb 100644 --- a/orte/util/error_strings.c +++ b/orte/util/error_strings.c @@ -176,11 +176,12 @@ const char *orte_job_state_to_str(orte_job_state_t state) case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED: return "SENSOR BOUND EXCEEDED"; break; - case ORTE_JOB_STATE_NEVER_LAUNCHED: return "NEVER LAUNCHED"; case ORTE_JOB_STATE_ABORT_ORDERED: return "ABORT IN PROGRESS"; + case ORTE_JOB_STATE_HEARTBEAT_FAILED: + return "HEARTBEAT FAILED"; default: return "UNKNOWN STATE!"; } @@ -220,7 +221,9 @@ const char *orte_proc_state_to_str(orte_proc_state_t state) case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: return "SENSOR BOUND EXCEEDED"; break; - + case ORTE_PROC_STATE_HEARTBEAT_FAILED: + return "HEARTBEAT FAILED"; + break; default: return "UNKNOWN STATE!"; }