1
1

Create a new "heartbeat" module in the sensor framework and move the plm_base heartbeat code there. Add new proc and job states for heartbeat_failed. Remove the "heartbeat" cmd line option for orted as this is now done automatically if the --enable-heartbeat configure option is set.

This commit was SVN r23102.
Этот коммит содержится в:
Ralph Castain 2010-05-05 00:48:43 +00:00
родитель 99f223210d
Коммит 2ff1ae13e1
39 изменённых файлов: 694 добавлений и 289 удалений

Просмотреть файл

@ -92,6 +92,9 @@ else
AC_MSG_RESULT([no])
orte_want_multicast=0
fi
AC_DEFINE_UNQUOTED([ORTE_ENABLE_MULTICAST],
[$orte_want_multicast],
[Whether we want multicast messaging enabled])
#
# Do we want sensors enabled?
@ -111,4 +114,22 @@ AC_DEFINE_UNQUOTED([ORTE_ENABLE_SENSORS],
[$orte_want_sensors],
[Whether we want sensors enabled])
#
# Do we want daemon heartbeats enabled?
AC_MSG_CHECKING([if want daemon heartbeats])
AC_ARG_ENABLE([heartbeat],
[AC_HELP_STRING([--enable-heartbeat],
[Enable heartbeat monitoring of daemons (default: disabled)])])
if test "$enable_heartbeat" = "yes"; then
AC_MSG_RESULT([yes])
orte_want_heartbeats=1
else
AC_MSG_RESULT([no])
orte_want_heartbeats=0
fi
AC_DEFINE_UNQUOTED([ORTE_ENABLE_HEARTBEAT],
[$orte_want_heartbeats],
[Whether we want daemon heartbeat monitoring enabled])
])dnl

Просмотреть файл

@ -35,9 +35,7 @@
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/rmaps/rmaps_types.h"
#if ORTE_ENABLE_SENSORS
#include "orte/mca/sensor/sensor.h"
#endif
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
@ -721,21 +719,19 @@ static void check_job_complete(orte_job_t *jdata)
}
}
#if ORTE_ENABLE_SENSORS
if (jdata->abort) {
/* the job aborted - turn off any sensors on this job */
orte_sensor.stop(jdata->jobid);
}
#endif
if (ORTE_JOB_STATE_UNTERMINATED > jdata->state &&
jdata->num_terminated >= jdata->num_procs) {
/* this job has terminated */
jdata->state = ORTE_JOB_STATE_TERMINATED;
#if ORTE_ENABLE_SENSORS
/* turn off any sensor monitors on this job */
orte_sensor.stop(jdata->jobid);
#endif
if (0 < non_zero) {
/* warn user */
opal_output(orte_clean_output,
@ -902,6 +898,11 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
orte_proc_t proc;
int rc;
/* stop local sensors for this job */
if (ORTE_VPID_WILDCARD == vpid) {
orte_sensor.stop(job);
}
OBJ_CONSTRUCT(&cmd, opal_pointer_array_t);
OBJ_CONSTRUCT(&proc, orte_proc_t);
proc.name.jobid = job;

Просмотреть файл

@ -61,9 +61,8 @@
#include "orte/mca/notifier/base/base.h"
#include "orte/mca/rmcast/base/base.h"
#include "orte/mca/db/base/base.h"
#if ORTE_ENABLE_SENSORS
#include "orte/mca/sensor/base/base.h"
#endif
#include "orte/mca/sensor/sensor.h"
#include "orte/runtime/orte_cr.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_globals.h"
@ -419,7 +418,6 @@ int orte_ess_base_orted_setup(char **hosts)
goto error;
}
#if ORTE_ENABLE_SENSORS
/* setup the SENSOR framework */
if (ORTE_SUCCESS != (ret = orte_sensor_base_open())) {
ORTE_ERROR_LOG(ret);
@ -431,8 +429,9 @@ int orte_ess_base_orted_setup(char **hosts)
error = "ortesensor_select";
goto error;
}
#endif
/* start the local sensors */
orte_sensor.start(ORTE_PROC_MY_NAME->jobid);
return ORTE_SUCCESS;
error:
@ -445,6 +444,9 @@ error:
int orte_ess_base_orted_finalize(void)
{
/* stop the local sensors */
orte_sensor.stop(ORTE_PROC_MY_NAME->jobid);
/* ensure all the orteds depart together */
if (!orte_abnormal_term_ordered) {
/* if we are abnormally terminating, don't attempt
@ -454,9 +456,7 @@ int orte_ess_base_orted_finalize(void)
orte_grpcomm.onesided_barrier();
}
#if ORTE_ENABLE_SENSORS
orte_sensor_base_close();
#endif
orte_db_base_close();
orte_notifier_base_close();

Просмотреть файл

@ -57,9 +57,8 @@
#include "orte/mca/notifier/base/base.h"
#include "orte/mca/rmcast/base/base.h"
#include "orte/mca/db/base/base.h"
#if ORTE_ENABLE_SENSORS
#include "orte/mca/sensor/base/base.h"
#endif
#include "orte/mca/sensor/sensor.h"
#include "orte/mca/rmaps/base/base.h"
#if OPAL_ENABLE_FT_CR == 1
@ -540,7 +539,6 @@ static int rte_init(void)
goto error;
}
#if ORTE_ENABLE_SENSORS
/* setup the SENSOR framework */
if (ORTE_SUCCESS != (ret = orte_sensor_base_open())) {
ORTE_ERROR_LOG(ret);
@ -549,11 +547,12 @@ static int rte_init(void)
}
if (ORTE_SUCCESS != (ret = orte_sensor_base_select())) {
ORTE_ERROR_LOG(ret);
error = "ortesensor_select";
error = "orte_sensor_select";
goto error;
}
#endif
/* start the local sensors */
orte_sensor.start(ORTE_PROC_MY_NAME->jobid);
/* if a tool has launched us and is requesting event reports,
* then set its contact info into the comm system
*/
@ -603,15 +602,16 @@ static int rte_finalize(void)
orte_job_t *job;
int i;
/* stop the local sensors */
orte_sensor.stop(ORTE_PROC_MY_NAME->jobid);
/* remove my contact info file */
contact_path = opal_os_path(false, orte_process_info.top_session_dir,
"contact.txt", NULL);
unlink(contact_path);
free(contact_path);
#if ORTE_ENABLE_SENSORS
orte_sensor_base_close();
#endif
orte_db_base_close();
orte_notifier_base_close();

Просмотреть файл

@ -55,6 +55,7 @@
#include "orte/mca/plm/base/base.h"
#include "orte/mca/routed/base/base.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/mca/sensor/sensor.h"
#include "orte/util/context_fns.h"
#include "orte/util/name_fns.h"
@ -1900,6 +1901,9 @@ CLEANUP:
"%s odls:launch setting waitpids",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* start the sensors for this job (if any) */
orte_sensor.start(ORTE_PROC_MY_NAME->jobid);
/* if the launch didn't fail, setup the waitpids on the children */
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);

Просмотреть файл

@ -272,7 +272,7 @@ static int plm_alps_launch_job(orte_job_t *jdata)
orte_plm_base_orted_append_basic_args(&argc, &argv,
"alps",
&proc_vpid_index,
false, nodelist_flat);
nodelist_flat);
free(nodelist_flat);
/* tell the new daemons the base of the name list so they can compute

Просмотреть файл

@ -40,6 +40,5 @@ libmca_plm_la_SOURCES += \
base/plm_base_jobid.c \
base/plm_base_proxy.c \
base/plm_base_orted_cmds.c \
base/plm_base_rsh_support.c \
base/plm_base_heartbeat.c
base/plm_base_rsh_support.c
endif

Просмотреть файл

@ -1,145 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "orte_config.h"
#include "orte/constants.h"
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#include "opal/dss/dss.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/util/name_fns.h"
#include "orte/mca/plm/base/plm_private.h"
#define HEARTBEAT_CK 2
/*
 * Timer callback run on a daemon: pack a heartbeat command, send it to
 * the HNP on the PLM tag, and re-arm the timer for the next beat.
 *
 * fd, event - unused event-library arguments
 * arg       - the opal_event_t of the timer that fired
 *
 * If packing or sending fails the error is logged and the timer is
 * NOT re-armed.
 */
void orte_plm_base_heartbeat(int fd, short event, void *arg)
{
    orte_plm_cmd_flag_t cmd = ORTE_PLM_HEARTBEAT_CMD;
    opal_event_t *timer = (opal_event_t*)arg;
    struct timeval interval;
    opal_buffer_t msg;
    int rc;

    /* setup the buffer */
    OBJ_CONSTRUCT(&msg, opal_buffer_t);

    /* tell the HNP this is a heartbeat */
    rc = opal_dss.pack(&msg, &cmd, 1, ORTE_PLM_CMD);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    } else if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &msg, ORTE_RML_TAG_PLM, 0))) {
        /* the send to the HNP failed */
        ORTE_ERROR_LOG(rc);
    } else {
        /* beat delivered - reset the timer */
        interval.tv_sec = orte_heartbeat_rate;
        interval.tv_usec = 0;
        opal_evtimer_add(timer, &interval);
    }

    OBJ_DESTRUCT(&msg);
}
/*
 * Periodic timer callback on the HNP that checks the health of the
 * orteds.
 *
 * Every daemon (index 1 and up in the daemon job's proc array) whose
 * last recorded beat is older than HEARTBEAT_CK heartbeat periods is
 * marked aborted.  If at least one daemon was marked, the daemon job
 * is reported as aborted to the errmgr and the timer is not re-armed;
 * otherwise the timer is re-armed for another HEARTBEAT_CK periods.
 *
 * fd, dummy - unused event-library arguments
 * arg       - the opal_event_t of the timer that fired
 */
static void check_heartbeat(int fd, short dummy, void *arg)
{
    opal_event_t *timer = (opal_event_t*)arg;
    orte_job_t *daemons;
    orte_proc_t *daemon;
    struct timeval current;
    struct timeval rearm;
    bool any_dead = false;
    int idx;

    OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                         "%s plm:base:check_heartbeat",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* if we are aborting or shutting down, ignore this */
    if (orte_abnormal_term_ordered || 0 == orte_heartbeat_rate) {
        return;
    }

    /* get the job object for the daemons */
    daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    if (NULL == daemons) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return;
    }

    /* get current time */
    gettimeofday(&current, NULL);

    /* walk ALL the daemons, even after finding a dead one, so that
     * every daemon that died gets flagged
     */
    for (idx = 1; idx < daemons->procs->size; idx++) {
        daemon = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, idx);
        if (NULL != daemon &&
            (current.tv_sec - daemon->beat) > HEARTBEAT_CK*orte_heartbeat_rate) {
            /* declare this orted dead */
            daemon->state = ORTE_PROC_STATE_ABORTED;
            daemon->exit_code = ORTE_ERROR_DEFAULT_EXIT_CODE;
            /* only the first dead daemon is recorded as the aborted proc */
            if (NULL == daemons->aborted_proc) {
                daemons->aborted_proc = daemon;
            }
            ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
            any_dead = true;
        }
    }

    if (any_dead) {
        /* at least one daemon died - abort */
        orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_ABORTED,
                                 NULL, ORTE_PROC_STATE_UNDEF, ORTE_ERROR_DEFAULT_EXIT_CODE);
    } else {
        /* everyone is alive - reset the timer */
        rearm.tv_sec = HEARTBEAT_CK*orte_heartbeat_rate;
        rearm.tv_usec = 0;
        opal_evtimer_add(timer, &rearm);
    }
}
/*
 * Arm the periodic daemon health check.  Does nothing unless the
 * heartbeat rate is greater than zero.
 */
void orte_plm_base_start_heart(void)
{
    /* a rate of zero (or less) means heartbeats are disabled */
    if (0 >= orte_heartbeat_rate) {
        return;
    }

    /* start the heart */
    ORTE_TIMER_EVENT(HEARTBEAT_CK*orte_heartbeat_rate, 0, check_heartbeat);
}

Просмотреть файл

@ -53,9 +53,6 @@
#include "orte/mca/filem/filem.h"
#include "orte/mca/filem/base/base.h"
#include "orte/mca/rml/base/rml_contact.h"
#if ORTE_ENABLE_SENSORS
#include "orte/mca/sensor/sensor.h"
#endif
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_locks.h"
@ -391,11 +388,6 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
goto WAKEUP;
}
#if ORTE_ENABLE_SENSORS
/* start any sensor monitoring of this job */
orte_sensor.start(job);
#endif
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:launch completed for job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -742,7 +734,7 @@ int orte_plm_base_setup_orted_cmd(int *argc, char ***argv)
int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
char *ess,
int *proc_vpid_index,
bool heartbeat, char *nodes)
char *nodes)
{
char *param = NULL;
int loc_id;
@ -788,13 +780,6 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
opal_argv_append(argc, argv, param);
free(param);
}
if (heartbeat && 0 < orte_heartbeat_rate) {
/* tell the daemon to do a heartbeat */
opal_argv_append(argc, argv, "--heartbeat");
asprintf(&param, "%d", orte_heartbeat_rate);
opal_argv_append(argc, argv, param);
free(param);
}
/* tell the orted what ESS component to use */
opal_argv_append(argc, argv, "-mca");

Просмотреть файл

@ -100,9 +100,6 @@ int orte_plm_base_orted_exit(orte_daemon_cmd_flag_t command)
"%s plm:base:orted_cmd sending orted_exit commands",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* stop all heartbeats */
orte_heartbeat_rate = 0;
OBJ_CONSTRUCT(&cmd, opal_buffer_t);
/* since the orteds are being ordered to exit, and we are

Просмотреть файл

@ -146,7 +146,6 @@ static void process_msg(int fd, short event, void *data)
orte_proc_state_t state;
orte_exit_code_t exit_code;
int rc=ORTE_SUCCESS, ret;
struct timeval beat;
orte_app_context_t *app, *child_app;
opal_list_item_t *item;
int dump[128];
@ -458,29 +457,6 @@ static void process_msg(int fd, short event, void *data)
}
break;
case ORTE_PLM_HEARTBEAT_CMD:
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:receive got heartbeat from %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&msgpkt->sender)));
/* lookup the daemon object */
if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
/* this job can not possibly have been removed, so this is an error */
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
goto CLEANUP;
}
gettimeofday(&beat, NULL);
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, msgpkt->sender.vpid))) {
/* this proc is no longer in table - skip it */
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:receive daemon %s is not in proc table",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_VPID_PRINT(msgpkt->sender.vpid)));
break;
}
proc->beat = beat.tv_sec;
break;
case ORTE_PLM_INIT_ROUTES_CMD:
count=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &job, &count, ORTE_JOBID))) {

Просмотреть файл

@ -131,12 +131,6 @@ ORTE_DECLSPEC int orte_plm_base_append_bootproxy_args(orte_app_context_t *app, c
orte_node_rank_t nrank, orte_local_rank_t lrank,
orte_vpid_t nlocal, int nslots, bool overwrite);
/**
* Heartbeat support
*/
ORTE_DECLSPEC void orte_plm_base_heartbeat(int fd, short event, void *data);
ORTE_DECLSPEC void orte_plm_base_start_heart(void);
/**
* Utilities for plm components that use proxy daemons
*/
@ -161,7 +155,7 @@ ORTE_DECLSPEC void orte_plm_base_recv(int status, orte_process_name_t* sender,
ORTE_DECLSPEC int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
char *ess_module,
int *proc_vpid_index,
bool heartbeat, char *nodes);
char *nodes);
/*
* Proxy functions for use by daemons and application procs

Просмотреть файл

@ -216,7 +216,7 @@ GETMAP:
/* Add basic orted command line options */
orte_plm_base_orted_append_basic_args(&argc, &argv, "env",
&proc_vpid_index,
false, NULL);
NULL);
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
param = opal_argv_join(argv, ' ');

Просмотреть файл

@ -221,7 +221,7 @@ static int plm_lsf_launch_job(orte_job_t *jdata)
orte_plm_base_orted_append_basic_args(&argc, &argv,
"lsf",
&proc_vpid_index,
false, nodelist);
nodelist);
free(nodelist);
/* tell the new daemons the base of the name list so they can compute

Просмотреть файл

@ -62,7 +62,7 @@ typedef uint32_t orte_proc_state_t;
#define ORTE_PROC_STATE_COMM_FAILED 0x00002000 /* process communication has failed */
#define ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED 0x00004000 /* process exceeded a sensor limit */
#define ORTE_PROC_STATE_CALLED_ABORT 0x00008000 /* process called "errmgr.abort" */
#define ORTE_PROC_STATE_HEARTBEAT_FAILED 0x00010000 /* heartbeat failed to arrive */
/*
* Job state codes
*/
@ -93,6 +93,7 @@ typedef uint32_t orte_job_state_t;
#define ORTE_JOB_STATE_COMM_FAILED 0x00002000 /* communication has failed */
#define ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED 0x00004000 /* job had a process that exceeded a sensor limit */
#define ORTE_JOB_STATE_CALLED_ABORT 0x00008000 /* at least one process called "errmgr.abort" */
#define ORTE_JOB_STATE_HEARTBEAT_FAILED 0x00010000 /* heartbeat failed to arrive */
/* the job never even attempted to launch due to an error earlier in the
* launch procedure
@ -131,8 +132,7 @@ typedef uint8_t orte_plm_cmd_flag_t;
#define ORTE_PLM_CMD OPAL_UINT8
#define ORTE_PLM_LAUNCH_JOB_CMD 1
#define ORTE_PLM_UPDATE_PROC_STATE 2
#define ORTE_PLM_HEARTBEAT_CMD 3
#define ORTE_PLM_INIT_ROUTES_CMD 4
#define ORTE_PLM_INIT_ROUTES_CMD 3
END_C_DECLS

Просмотреть файл

@ -1074,7 +1074,7 @@ int orte_plm_process_launch(orte_job_t *jdata)
orte_plm_base_orted_append_basic_args(&argc, &argv,
"env",
&proc_vpid_index,
false, NULL);
NULL);
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
param = opal_argv_join(argv, ' ');

Просмотреть файл

@ -665,7 +665,7 @@ static int setup_launch(int *argcptr, char ***argvptr,
orte_plm_base_orted_append_basic_args(&argc, &argv,
"env",
proc_vpid_index,
true, NULL);
NULL);
/* ensure that only the ssh plm is selected on the remote daemon */
opal_argv_append_nosize(&argv, "-mca");
@ -1431,14 +1431,6 @@ launch_apps:
recv_issued = false;
}
/* setup a "heartbeat" timer to periodically check on
* the state-of-health of the orteds, if requested AND
* we actually launched some daemons!
*/
if ((NULL != map) && (0 < map->num_new_daemons)) {
orte_plm_base_start_heart();
}
return rc;
}

Просмотреть файл

@ -239,7 +239,6 @@ static void ssh_child(char *cmd, char **argv)
*/
int orte_plm_rshd_launch(orte_job_t *jdata)
{
orte_job_map_t *map = NULL;
char **argv = NULL;
char *cmd, *param;
int rc, i;
@ -379,14 +378,6 @@ cleanup:
ORTE_ERROR_DEFAULT_EXIT_CODE);
}
/* setup a "heartbeat" timer to periodically check on
* the state-of-health of the orteds, if requested AND
* we actually launched some daemons!
*/
if ((NULL != map) && (0 < map->num_new_daemons)) {
orte_plm_base_start_heart();
}
return rc;
}

Просмотреть файл

@ -309,7 +309,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
/* Add basic orted command line options, including debug flags */
orte_plm_base_orted_append_basic_args(&argc, &argv,
"slurm", &proc_vpid_index,
false, nodelist_flat);
nodelist_flat);
free(nodelist_flat);
/* tell the new daemons the base of the name list so they can compute

Просмотреть файл

@ -244,7 +244,7 @@ static int plm_tm_launch_job(orte_job_t *jdata)
/* Add basic orted command line options */
orte_plm_base_orted_append_basic_args(&argc, &argv, "tm",
&proc_vpid_index,
true, nodelist);
nodelist);
free(nodelist);
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
@ -465,14 +465,6 @@ launch_apps:
ORTE_ERROR_DEFAULT_EXIT_CODE);
}
/* setup a "heartbeat" timer to periodically check on
* the state-of-health of the orteds, if requested AND
* we actually launched some daemons!
*/
if (0 < map->num_new_daemons) {
orte_plm_base_start_heart();
}
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm:launch: finished",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

Просмотреть файл

@ -552,14 +552,6 @@ launch_apps:
ORTE_ERROR_DEFAULT_EXIT_CODE);
}
/* setup a "heartbeat" timer to periodically check on
* the state-of-health of the orteds, if requested AND
* we actually launched some daemons!
*/
if (0 < map->num_new_daemons) {
orte_plm_base_start_heart();
}
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm:launch: finished",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

Просмотреть файл

@ -438,7 +438,7 @@ cleanup:
orte_plm_base_orted_append_basic_args(&argc, &argv,
"env",
NULL,
true, NULL);
NULL);
/* Note that capacity is a starting capacity, not max */
NSMutableArray *ret = [NSMutableArray arrayWithCapacity: argc];

Просмотреть файл

@ -177,6 +177,9 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_msg_packet_t);
/* tag for receiving ack of abort msg */
#define ORTE_RML_TAG_ABORT 38
/* tag for receiving heartbeats */
#define ORTE_RML_TAG_HEARTBEAT 39
#define ORTE_RML_TAG_MAX 100

Просмотреть файл

@ -139,6 +139,11 @@ static void start(orte_jobid_t jobid)
char *filename;
file_tracker_t *ft;
/* cannot monitor my own job */
if (jobid == ORTE_PROC_MY_NAME->jobid) {
return;
}
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
"%s starting file monitoring for job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -252,6 +257,11 @@ static void stop(orte_jobid_t jobid)
opal_list_item_t *item;
file_tracker_t *ft;
/* cannot monitor my own job */
if (jobid == ORTE_PROC_MY_NAME->jobid) {
return;
}
for (item = opal_list_get_first(&jobs);
item != opal_list_get_end(&jobs);
item = opal_list_get_next(item)) {

37
orte/mca/sensor/heartbeat/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,37 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
dist_pkgdata_DATA = help-orte-sensor-heartbeat.txt
sources = \
sensor_heartbeat.c \
sensor_heartbeat.h \
sensor_heartbeat_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_sensor_heartbeat_DSO
component_noinst =
component_install = mca_sensor_heartbeat.la
else
component_noinst = libmca_sensor_heartbeat.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_sensor_heartbeat_la_SOURCES = $(sources)
mca_sensor_heartbeat_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_sensor_heartbeat_la_SOURCES =$(sources)
libmca_sensor_heartbeat_la_LDFLAGS = -module -avoid-version

19
orte/mca/sensor/heartbeat/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,19 @@
# -*- shell-script -*-
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_sensor_heartbeat_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_sensor_heartbeat_CONFIG], [
# if we don't want heartbeats, don't compile
# this component
AS_IF([test "$orte_want_heartbeats" = "1"],
[$1], [$2])
])dnl

14
orte/mca/sensor/heartbeat/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,14 @@
# -*- shell-script -*-
#
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
PARAM_CONFIG_FILES="Makefile"

Просмотреть файл

@ -0,0 +1,20 @@
# -*- text -*-
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for the heartbeat sensor
#
[mem-limit-exceeded]
A process has exceeded the specified limit on memory usage:
Node: %s
Process rank: %s
Memory used: %luGbytes
Memory limit: %luGbytes

368
orte/mca/sensor/heartbeat/sensor_heartbeat.c Обычный файл
Просмотреть файл

@ -0,0 +1,368 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include <stdio.h>
#include "opal_stdint.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/mca/pstat/pstat.h"
#include "orte/util/show_help.h"
#include "orte/util/proc_info.h"
#include "orte/util/name_fns.h"
#include "orte/util/nidmap.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/rmcast/rmcast.h"
#include "orte/mca/rml/rml.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
#include "sensor_heartbeat.h"
/* declare the API functions */
static int init(void);
static void finalize(void);
static void start(orte_jobid_t job);
static void stop(orte_jobid_t job);
/* instantiate the module: the heartbeat sensor's four entry points,
 * all implemented as static functions later in this file
 */
orte_sensor_base_module_t orte_sensor_heartbeat_module = {
init,
finalize,
start,
stop
};
/* declare the local functions */
static void check_heartbeat(int fd, short event, void *arg);
static void send_heartbeat(int fd, short event, void *arg);
#if ORTE_ENABLE_MULTICAST
static void recv_rmcast_beats(int status,
orte_rmcast_channel_t channel,
orte_rmcast_tag_t tag,
orte_process_name_t *sender,
opal_buffer_t *buf, void* cbdata);
static void rmcast_callback_fn(int status,
orte_rmcast_channel_t channel,
orte_rmcast_tag_t tag,
orte_process_name_t *sender,
opal_buffer_t *buf, void* cbdata);
#else
static void recv_rml_beats(int status, orte_process_name_t* sender,
opal_buffer_t* buffer, orte_rml_tag_t tag,
void* cbdata);
static void rml_callback_fn(int status,
struct orte_process_name_t* peer,
struct opal_buffer_t* buffer,
orte_rml_tag_t tag,
void* cbdata);
#endif
/* local globals */
static opal_event_t *send_ev = NULL, *check_ev = NULL;
static struct timeval send_time, check_time;
static double timeout;
#include MCA_timer_IMPLEMENTATION_HEADER
static inline double gettime(void) __opal_attribute_always_inline__;

/*
 * Return the current time in seconds as a double, with microsecond
 * resolution.  Uses the OPAL microsecond timer when
 * OPAL_TIMER_USEC_NATIVE is set, otherwise gettimeofday().
 */
static inline double gettime(void)
{
#if OPAL_TIMER_USEC_NATIVE
    /* scale the microsecond count down to seconds */
    return ((double) opal_timer_base_get_usec()) / 1000000.0;
#else
    struct timeval tv;
    double secs;

    gettimeofday(&tv, NULL);
    /* whole seconds plus the fractional microsecond part */
    secs = tv.tv_sec;
    secs += (double)tv.tv_usec / 1000000.0;
    return secs;
#endif
}
/*
 * Module init: post the receive that collects incoming heartbeats.
 *
 * With multicast enabled, a persistent receive is posted on the system
 * channel.  Otherwise only the HNP posts a (non-persistent) RML
 * receive; any other process has nothing to set up.
 *
 * Returns ORTE_SUCCESS, or the error code returned when posting the
 * receive failed (the error is also logged).
 */
static int init(void)
{
    /* must start out as success: in the non-multicast build a process
     * that is not the HNP never assigns rc below
     */
    int rc = ORTE_SUCCESS;

#if ORTE_ENABLE_MULTICAST
    /* setup multicast recv for heartbeats */
    if (ORTE_SUCCESS != (rc = orte_rmcast.recv_buffer_nb(ORTE_RMCAST_SYS_CHANNEL,
                                                         ORTE_RMCAST_TAG_HEARTBEAT,
                                                         ORTE_RMCAST_PERSISTENT,
                                                         recv_rmcast_beats, NULL))) {
        ORTE_ERROR_LOG(rc);
    }
#else
    /* setup RML recv for the HNP to receive heartbeats */
    if (ORTE_PROC_IS_HNP) {
        if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                          ORTE_RML_TAG_HEARTBEAT,
                                                          ORTE_RML_NON_PERSISTENT,
                                                          recv_rml_beats,
                                                          NULL))) {
            ORTE_ERROR_LOG(rc);
        }
    }
#endif

    return rc;
}
static void finalize(void)
{
if (NULL != send_ev) {
opal_event_del(send_ev);
free(send_ev);
send_ev = NULL;
}
if (NULL != check_ev) {
opal_event_del(check_ev);
free(check_ev);
check_ev = NULL;
}
#if ORTE_ENABLE_MULTICAST
orte_rmcast.cancel_recv(ORTE_RMCAST_SYS_CHANNEL, ORTE_RMCAST_TAG_HEARTBEAT);
#else
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_HEARTBEAT);
#endif
return;
}
/*
 * Start sending and checking heartbeats.
 *
 * jobid - job for which sensing is being started; anything other than
 *         this process's own job is ignored, since heartbeats are only
 *         exchanged between daemons and HNPs.
 *
 * Arms two timers: one that fires every "beat" milliseconds to send a
 * heartbeat, and one that fires every "check" milliseconds to look for
 * late heartbeats.  A heartbeat is considered late once it is more than
 * two beat periods old.
 *
 * Calling this again while the timers are already armed is a no-op.
 */
static void start(orte_jobid_t jobid)
{
    uint64_t usecs;

    if (jobid != ORTE_PROC_MY_NAME->jobid) {
        /* heartbeats are only for daemons and HNPs */
        return;
    }

    /* already running - allocating fresh events here would leak the
     * existing ones and leave timers registered that stop() could no
     * longer delete
     */
    if (NULL != send_ev || NULL != check_ev) {
        return;
    }

    /* get both events up front so we never run half-configured */
    send_ev = (opal_event_t*)malloc(sizeof(opal_event_t));
    check_ev = (opal_event_t*)malloc(sizeof(opal_event_t));
    if (NULL == send_ev || NULL == check_ev) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        free(send_ev);
        free(check_ev);
        send_ev = NULL;
        check_ev = NULL;
        return;
    }

    /* setup the send */
    usecs = mca_sensor_heartbeat_component.beat * 1000; /* convert to microsecs */
    opal_evtimer_set(send_ev, send_heartbeat, send_ev);
    send_time.tv_sec = usecs / 1000000;
    send_time.tv_usec = usecs % 1000000;
    opal_evtimer_add(send_ev, &send_time);

    /* define the timeout as two beat periods - it is compared against
     * differences of gettime() values, so it must be in SECONDS
     */
    timeout = 2.0 * ((double)usecs / 1000000.0);

    /* setup the check */
    usecs = mca_sensor_heartbeat_component.check * 1000; /* convert to microsecs */
    opal_evtimer_set(check_ev, check_heartbeat, check_ev);
    check_time.tv_sec = usecs / 1000000;
    check_time.tv_usec = usecs % 1000000;
    opal_evtimer_add(check_ev, &check_time);
}
/*
 * Stop sending and checking heartbeats.
 *
 * jobid - job for which sensing is being stopped; anything other than
 *         this process's own job is ignored.
 *
 * Deletes and frees whichever of the two timers exist and resets their
 * pointers to NULL.
 */
static void stop(orte_jobid_t jobid)
{
    /* the send timer first, then the check timer */
    opal_event_t **timers[2] = { &send_ev, &check_ev };
    int i;

    if (jobid != ORTE_PROC_MY_NAME->jobid) {
        /* heartbeats are only for daemons and HNPs */
        return;
    }

    for (i = 0; i < 2; i++) {
        if (NULL != *timers[i]) {
            opal_event_del(*timers[i]);
            free(*timers[i]);
            *timers[i] = NULL;
        }
    }
}
/*
 * Timer callback: emit one heartbeat and re-arm the send timer.
 *
 * fd, event - unused event-library arguments
 * arg       - the opal_event_t of the send timer
 *
 * The message is an empty buffer; its arrival is the "beat".  With
 * multicast it goes out on the system channel, otherwise it is sent to
 * the HNP over the RML.  The buffer is released by the send-completion
 * callback, or here if the send could not be posted - in which case
 * the timer is not re-armed.
 */
static void send_heartbeat(int fd, short event, void *arg)
{
    opal_event_t *timer = (opal_event_t*)arg;
    opal_buffer_t *beat;
    int rc;

    /* if we are aborting or shutting down, ignore this */
    if (orte_abnormal_term_ordered || orte_finalizing) {
        return;
    }

    /* setup the buffer - nothing to pack as receipt alone is the "beat" */
    beat = OBJ_NEW(opal_buffer_t);

#if ORTE_ENABLE_MULTICAST
    rc = orte_rmcast.send_buffer_nb(ORTE_RMCAST_SYS_CHANNEL,
                                    ORTE_RMCAST_TAG_HEARTBEAT, beat,
                                    rmcast_callback_fn, NULL);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(beat);
        return;
    }
#else
    /* send heartbeat to HNP */
    rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, beat,
                                 ORTE_RML_TAG_HEARTBEAT, 0,
                                 rml_callback_fn, NULL);
    if (0 > rc) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(beat);
        return;
    }
#endif

    /* reset the timer */
    opal_evtimer_add(timer, &send_time);
}
/*
 * Periodic timer callback that looks for late heartbeats.
 *
 * Walks every entry of orte_nidmap.  Entries that have never recorded
 * a beat are skipped.  For each entry whose last beat is older than
 * the timeout, its missed counter is bumped; once that counter exceeds
 * the component's "missed" parameter the errmgr is told that the
 * heartbeat failed, naming the process by this job's id and the
 * nidmap index.  The check timer is always re-armed afterwards.
 *
 * fd, dummy - unused event-library arguments
 * arg       - the opal_event_t of the check timer
 */
static void check_heartbeat(int fd, short dummy, void *arg)
{
    opal_event_t *timer = (opal_event_t*)arg;
    orte_process_name_t late;
    orte_nid_t *node;
    double current;
    int idx;

    OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                         "%s sensor:check_heartbeat",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* if we are aborting or shutting down, ignore this */
    if (orte_abnormal_term_ordered || orte_finalizing) {
        return;
    }

    /* get current time */
    current = gettime();
    late.jobid = ORTE_PROC_MY_NAME->jobid;

    /* look at every entry, even after finding a late one, so that all
     * of the late daemons get flagged
     */
    for (idx = 0; idx < orte_nidmap.size; idx++) {
        node = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, idx);
        if (NULL == node) {
            continue;
        }
        if (0 == node->beat) {
            /* haven't recvd a beat yet */
            continue;
        }
        if ((current - node->beat) > timeout) {
            node->missed++;
            if (mca_sensor_heartbeat_component.missed < node->missed) {
                /* heartbeat failed */
                late.vpid = idx;
                orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_HEARTBEAT_FAILED,
                                         &late, ORTE_PROC_STATE_HEARTBEAT_FAILED,
                                         ORTE_ERROR_DEFAULT_EXIT_CODE);
            }
        }
    }

    /* reset the timer */
    opal_evtimer_add(timer, &check_time);
}
#if ORTE_ENABLE_MULTICAST
/*
 * Multicast receive callback for heartbeats: stamp the sender's nidmap
 * entry with the current time.
 *
 * Only sender is used; the remaining arguments are ignored.  Nothing
 * is recorded while aborting or finalizing.  An unknown sender is
 * logged as ORTE_ERR_NOT_FOUND.
 */
static void recv_rmcast_beats(int status,
                              orte_rmcast_channel_t channel,
                              orte_rmcast_tag_t tag,
                              orte_process_name_t *sender,
                              opal_buffer_t *buf, void* cbdata)
{
    orte_nid_t *node;

    /* if we are aborting or shutting down, ignore this */
    if (orte_abnormal_term_ordered || orte_finalizing) {
        return;
    }

    /* get this daemon's nid */
    node = orte_util_lookup_nid(sender);
    if (NULL != node) {
        /* update its time */
        node->beat = gettime();
        return;
    }

    ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
}
/* Callback handed to orte_rmcast.send_buffer_nb by send_heartbeat.
 * Its only job is to release the buffer that send_heartbeat created
 * with OBJ_NEW; all other arguments are ignored.
 */
static void rmcast_callback_fn(int status,
orte_rmcast_channel_t channel,
orte_rmcast_tag_t tag,
orte_process_name_t *sender,
opal_buffer_t *buf, void* cbdata)
{
OBJ_RELEASE(buf);
}
#else
/*
 * RML receive callback for heartbeats (posted by the HNP in init()):
 * stamp the sender's nidmap entry with the current time, then post the
 * receive again because it is non-persistent.
 *
 * Only sender is used; the remaining arguments are ignored.  While
 * aborting or finalizing the beat is dropped and the receive is NOT
 * reissued.  An unknown sender is logged as ORTE_ERR_NOT_FOUND, but
 * the receive is still reissued so later beats are not lost.
 */
static void recv_rml_beats(int status, orte_process_name_t* sender,
                           opal_buffer_t* buffer, orte_rml_tag_t tag,
                           void* cbdata)
{
    orte_nid_t *nid;
    int rc;     /* result of reposting the receive */

    /* if we are aborting or shutting down, ignore this */
    if (orte_abnormal_term_ordered || orte_finalizing) {
        return;
    }

    /* get this daemon's nid */
    if (NULL == (nid = orte_util_lookup_nid(sender))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
    } else {
        /* update its time */
        nid->beat = gettime();
    }

    /* reissue the recv */
    if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                      ORTE_RML_TAG_HEARTBEAT,
                                                      ORTE_RML_NON_PERSISTENT,
                                                      recv_rml_beats,
                                                      NULL))) {
        ORTE_ERROR_LOG(rc);
    }
}
/* Callback handed to orte_rml.send_buffer_nb by send_heartbeat.
 * Its only job is to release the buffer that send_heartbeat created
 * with OBJ_NEW; all other arguments are ignored.
 */
static void rml_callback_fn(int status,
struct orte_process_name_t* peer,
struct opal_buffer_t* buffer,
orte_rml_tag_t tag,
void* cbdata)
{
OBJ_RELEASE(buffer);
}
#endif

Просмотреть файл

@ -0,0 +1,38 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Heartbeat sensor
*/
#ifndef ORTE_SENSOR_HEARTBEAT_H
#define ORTE_SENSOR_HEARTBEAT_H
#include "orte_config.h"
#include "orte/mca/sensor/sensor.h"
BEGIN_C_DECLS
/* Heartbeat sensor component: the base sensor component followed by
 * the values of the component's three MCA parameters.
 */
struct orte_sensor_heartbeat_component_t {
orte_sensor_base_component_t super;
/* heartbeat rate, in milliseconds ("beat" parameter) */
int beat;
/* rate at which failures are checked for, in milliseconds ("check" parameter) */
int check;
/* number of missed heartbeats before failure is declared ("missed" parameter) */
int missed;
};
typedef struct orte_sensor_heartbeat_component_t orte_sensor_heartbeat_component_t;
ORTE_MODULE_DECLSPEC extern orte_sensor_heartbeat_component_t mca_sensor_heartbeat_component;
extern orte_sensor_base_module_t orte_sensor_heartbeat_module;
END_C_DECLS
#endif

Просмотреть файл

@ -0,0 +1,99 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/base/base.h"
#include "opal/util/output.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "sensor_heartbeat.h"
/*
* Local functions - the open, close and query entry points that are
* placed into the component structure defined below
*/
static int orte_sensor_heartbeat_open(void);
static int orte_sensor_heartbeat_close(void);
static int orte_sensor_heartbeat_query(mca_base_module_t **module, int *priority);
/*
* The heartbeat sensor component instance. Only the leading "super"
* member is initialized here: its base version block carries the
* component name, the ORTE version numbers and the open/close/query
* entry points. The trailing beat/check/missed members have no
* initializer, so they start at zero and are assigned by the component
* open function.
*/
orte_sensor_heartbeat_component_t mca_sensor_heartbeat_component = {
{
/* base version block (accessed elsewhere as super.base_version) */
{
/* NOTE(review): presumably expands to the leading framework/version
* fields of the base component struct - confirm in sensor.h */
ORTE_SENSOR_BASE_VERSION_1_0_0,
"heartbeat", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_sensor_heartbeat_open, /* component open */
orte_sensor_heartbeat_close, /* component close */
orte_sensor_heartbeat_query /* component query */
},
/* NOTE(review): presumably the component metadata member - confirm */
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
}
};
/**
 * Component open: register the heartbeat MCA parameters and cache their
 * values in the component structure.
 *
 * Parameters registered (name, default):
 *   - "beat"   (1): heartbeat rate in milliseconds
 *   - "check"  (5): rate, in milliseconds, at which failure is checked for
 *   - "missed" (5): number of missed heartbeats before failure is declared
 *
 * @return ORTE_SUCCESS when the parameters were accepted, ORTE_ERR_FATAL
 *         when the "beat" value is not strictly positive.
 */
static int orte_sensor_heartbeat_open(void)
{
    mca_base_component_t *c = &mca_sensor_heartbeat_component.super.base_version;
    int tmp;

    /* lookup parameters */
    mca_base_param_reg_int(c, "beat",
                           "Heartbeat rate in milliseconds (default=1)",
                           false, false, 1, &tmp);
    /* the rate must be strictly positive, exactly as the diagnostic
     * states - the earlier "< 0" test silently accepted a zero rate
     */
    if (tmp <= 0) {
        opal_output(0, "Illegal value %d - must be > 0", tmp);
        return ORTE_ERR_FATAL;
    }
    mca_sensor_heartbeat_component.beat = tmp;

    mca_base_param_reg_int(c, "check",
                           "Check for failure rate in milliseconds (default=5)",
                           false, false, 5, &tmp);
    mca_sensor_heartbeat_component.check = tmp;

    mca_base_param_reg_int(c, "missed",
                           "Number of missed heartbeats before failure is declared (default=5)",
                           false, false, 5, &tmp);
    mca_sensor_heartbeat_component.missed = tmp;

    return ORTE_SUCCESS;
}
/*
 * Component query: hand back the heartbeat sensor module together with
 * the priority at which this component offers it. Always succeeds.
 */
static int orte_sensor_heartbeat_query(mca_base_module_t **module, int *priority)
{
    /* offer the heartbeat module */
    *module = (mca_base_module_t *)&orte_sensor_heartbeat_module;

    /* use if we were built */
    *priority = 10;

    return ORTE_SUCCESS;
}
/**
* Component close: performs no work and always returns ORTE_SUCCESS.
*/
static int orte_sensor_heartbeat_close(void)
{
return ORTE_SUCCESS;
}

Просмотреть файл

@ -109,6 +109,11 @@ static void start(orte_jobid_t jobid)
opal_list_item_t *item;
int rc, tmp;
/* cannot monitor my own job */
if (jobid == ORTE_PROC_MY_NAME->jobid) {
return;
}
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
"%s starting memory monitoring for job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -177,6 +182,11 @@ static void stop(orte_jobid_t jobid)
opal_list_item_t *item;
memusage_tracker_t *job;
/* cannot monitor my own job */
if (jobid == ORTE_PROC_MY_NAME->jobid) {
return;
}
for (item = opal_list_get_first(&jobs);
item != opal_list_get_end(&jobs);
item = opal_list_get_next(item)) {

Просмотреть файл

@ -125,7 +125,6 @@ static struct {
int fail;
int fail_delay;
bool abort;
int heartbeat;
} orted_globals;
/*
@ -149,10 +148,6 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
&orted_globals.fail_delay, OPAL_CMD_LINE_TYPE_INT,
"Have the orted specified for failure delay for the provided number of seconds before failing" },
{ NULL, NULL, NULL, '\0', NULL, "heartbeat", 1,
&orted_globals.heartbeat, OPAL_CMD_LINE_TYPE_INT,
"Seconds between orted heartbeat messages to be sent to HNP (default: 0 => no heartbeat)" },
{ "orte", "debug", NULL, 'd', NULL, "debug", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Debug the OpenRTE" },
@ -785,11 +780,6 @@ int orte_daemon(int argc, char *argv[])
opal_output(0, "%s orted: up and running - waiting for commands!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
}
/* if we were told to do a heartbeat, then setup to do so */
if (0 < orted_globals.heartbeat) {
ORTE_TIMER_EVENT(orted_globals.heartbeat, 0, orte_plm_base_heartbeat);
}
/* if we were given a launch string, then process it */
if (NULL != orted_launch_cmd) {
opal_buffer_t launch;

Просмотреть файл

@ -94,7 +94,6 @@ bool orte_abnormal_term_ordered = false;
bool orte_routing_is_enabled = false;
bool orte_job_term_ordered = false;
int orte_heartbeat_rate;
int orte_startup_timeout;
int orte_timeout_usec_per_proc;
@ -828,7 +827,6 @@ static void orte_proc_construct(orte_proc_t* proc)
proc->node = NULL;
proc->nodename = NULL;
proc->rml_uri = NULL;
proc->beat = 0;
proc->restarts = 0;
proc->relocates = 0;
#if OPAL_ENABLE_FT_CR == 1
@ -908,6 +906,10 @@ static void orte_nid_construct(orte_nid_t *ptr)
ptr->daemon = ORTE_VPID_INVALID;
OBJ_CONSTRUCT(&ptr->attrs, opal_list_t);
OBJ_CONSTRUCT(&ptr->sysinfo, opal_list_t);
#if ORTE_ENABLE_HEARTBEAT
ptr->beat = 0;
ptr->missed = 0;
#endif
}
static void orte_nid_destruct(orte_nid_t *ptr)

Просмотреть файл

@ -450,8 +450,6 @@ struct orte_proc_t {
char *nodename;
/* RML contact info */
char *rml_uri;
/* seconds when last heartbeat was detected */
time_t beat;
/* number of times this process has been restarted */
int32_t restarts;
/* number of times this process has been relocated */
@ -489,6 +487,12 @@ typedef struct {
opal_list_t attrs;
/* list of system info */
opal_list_t sysinfo;
#if ORTE_ENABLE_HEARTBEAT
/* seconds when last heartbeat was detected */
double beat;
/* number of missed heartbeats */
int missed;
#endif
} orte_nid_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_nid_t);
@ -587,7 +591,6 @@ ORTE_DECLSPEC extern bool orte_abnormal_term_ordered;
ORTE_DECLSPEC extern bool orte_routing_is_enabled;
ORTE_DECLSPEC extern bool orte_job_term_ordered;
ORTE_DECLSPEC extern int orte_heartbeat_rate;
ORTE_DECLSPEC extern int orte_startup_timeout;
ORTE_DECLSPEC extern int orte_timeout_usec_per_proc;

Просмотреть файл

@ -185,10 +185,6 @@ int orte_register_params(void)
"Have the specified orted fail after specified number of seconds (default: 0 => no delay)",
false, false, 0, &orted_debug_failure_delay);
mca_base_param_reg_int_name("orte", "heartbeat_rate",
"Seconds between checks for daemon state-of-health (default: 0 => do not check)",
false, false, 0, &orte_heartbeat_rate);
mca_base_param_reg_int_name("orte", "startup_timeout",
"Milliseconds/daemon to wait for startup before declaring failed_to_start (default: 0 => do not check)",
false, false, 0, &orte_startup_timeout);

Просмотреть файл

@ -89,10 +89,8 @@
#include "orte/mca/snapc/snapc.h"
#include "orte/mca/snapc/base/base.h"
#endif
#if ORTE_ENABLE_SENSORS
#include "orte/mca/sensor/sensor.h"
#include "orte/mca/sensor/base/base.h"
#endif
#include "orte/mca/filem/filem.h"
#include "orte/mca/filem/base/base.h"
#endif
@ -436,7 +434,6 @@ void orte_info_open_components(void)
opal_pointer_array_add(&component_map, map);
#endif
#if ORTE_ENABLE_SENSORS
if (ORTE_SUCCESS != orte_sensor_base_open()) {
goto error;
}
@ -444,7 +441,6 @@ void orte_info_open_components(void)
map->type = strdup("sensor");
map->components = &mca_sensor_base_components_available;
opal_pointer_array_add(&component_map, map);
#endif
if (ORTE_SUCCESS != orte_filem_base_open()) {
goto error;

Просмотреть файл

@ -210,9 +210,7 @@ int main(int argc, char *argv[])
#if OPAL_ENABLE_FT_CR == 1
opal_pointer_array_add(&mca_types, "snapc");
#endif
#if ORTE_ENABLE_SENSORS
opal_pointer_array_add(&mca_types, "sensor");
#endif
opal_pointer_array_add(&mca_types, "filem");
#endif
/* these are always included */

Просмотреть файл

@ -176,11 +176,12 @@ const char *orte_job_state_to_str(orte_job_state_t state)
case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED:
return "SENSOR BOUND EXCEEDED";
break;
case ORTE_JOB_STATE_NEVER_LAUNCHED:
return "NEVER LAUNCHED";
case ORTE_JOB_STATE_ABORT_ORDERED:
return "ABORT IN PROGRESS";
case ORTE_JOB_STATE_HEARTBEAT_FAILED:
return "HEARTBEAT FAILED";
default:
return "UNKNOWN STATE!";
}
@ -220,7 +221,9 @@ const char *orte_proc_state_to_str(orte_proc_state_t state)
case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED:
return "SENSOR BOUND EXCEEDED";
break;
case ORTE_PROC_STATE_HEARTBEAT_FAILED:
return "HEARTBEAT FAILED";
break;
default:
return "UNKNOWN STATE!";
}