Add a sensor framework to ORTE that monitors applications and notifies the errmgr when they exceed specified boundaries. Two modules are included here:
1. file activity - can monitor file size, access and modification times. If these fail to change over a specified number of sampling iterations (the rate is an MCA param), then the errmgr is notified.
2. memory usage - checks the amount of memory used by a process. The limit and sampling rate can be set.
This support must be enabled by configuring with --enable-sensors. ompi_info and orte-info have been updated to include the new framework. Also includes some initial steps toward restoring the recovery capability. Most notably, the ODLS API has been extended to include a "restart_proc" entry for restarting a local process, and the various ERRMGR framework globals have been organized into a single struct as we do in the other ORTE frameworks. Fix an oversight in the ERRMGR framework where a pointer array was constructed, but not initialized. Implementation continues. This commit was SVN r23043.
Этот коммит содержится в:
родитель
2fe1bc043d
Коммит
b9893aacc5
@ -115,6 +115,10 @@
|
||||
#include "orte/mca/snapc/snapc.h"
|
||||
#include "orte/mca/snapc/base/base.h"
|
||||
#endif
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#endif
|
||||
#include "orte/mca/filem/filem.h"
|
||||
#include "orte/mca/filem/base/base.h"
|
||||
#endif
|
||||
@ -465,6 +469,16 @@ void ompi_info_open_components(void)
|
||||
opal_pointer_array_add(&component_map, map);
|
||||
#endif
|
||||
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
if (ORTE_SUCCESS != orte_sensor_base_open()) {
|
||||
goto error;
|
||||
}
|
||||
map = OBJ_NEW(orte_info_component_map_t);
|
||||
map->type = strdup("sensor");
|
||||
map->components = &mca_sensor_base_components_available;
|
||||
opal_pointer_array_add(&component_map, map);
|
||||
#endif
|
||||
|
||||
if (ORTE_SUCCESS != orte_filem_base_open()) {
|
||||
goto error;
|
||||
}
|
||||
|
@ -229,6 +229,9 @@ int main(int argc, char *argv[])
|
||||
opal_pointer_array_add(&mca_types, "plm");
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
opal_pointer_array_add(&mca_types, "snapc");
|
||||
#endif
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
opal_pointer_array_add(&mca_types, "sensor");
|
||||
#endif
|
||||
opal_pointer_array_add(&mca_types, "filem");
|
||||
#endif
|
||||
|
@ -93,4 +93,22 @@ else
|
||||
orte_want_multicast=0
|
||||
fi
|
||||
|
||||
#
|
||||
# Do we want sensors enabled?
|
||||
|
||||
AC_MSG_CHECKING([if want sensors])
|
||||
AC_ARG_ENABLE([sensors],
|
||||
[AC_HELP_STRING([--enable-sensors],
|
||||
[Enable internal sensors (default: disabled)])])
|
||||
if test "$enable_sensors" = "yes"; then
|
||||
AC_MSG_RESULT([yes])
|
||||
orte_want_sensors=1
|
||||
else
|
||||
AC_MSG_RESULT([no])
|
||||
orte_want_sensors=0
|
||||
fi
|
||||
AC_DEFINE_UNQUOTED([ORTE_ENABLE_SENSORS],
|
||||
[$orte_want_sensors],
|
||||
[Whether we want sensors enabled])
|
||||
|
||||
])dnl
|
||||
|
@ -55,12 +55,6 @@ ORTE_DECLSPEC int orte_errmgr_base_close(void);
|
||||
* Output and component variables
|
||||
*/
|
||||
ORTE_DECLSPEC extern opal_list_t orte_errmgr_base_components_available;
|
||||
ORTE_DECLSPEC extern int orte_errmgr_base_output;
|
||||
ORTE_DECLSPEC extern bool orte_errmgr_base_shutting_down;
|
||||
ORTE_DECLSPEC extern bool orte_errmgr_base_enable_recovery;
|
||||
|
||||
extern opal_pointer_array_t orte_errmgr_base_modules;
|
||||
extern bool orte_errmgr_initialized;
|
||||
|
||||
/*
|
||||
* Additional External API function declared in errmgr.h
|
||||
|
@ -38,8 +38,8 @@ int orte_errmgr_base_close(void)
|
||||
OPAL_TRACE(5);
|
||||
|
||||
/* Close all selected components */
|
||||
for(i = 0; i < orte_errmgr_base_modules.size; ++i) {
|
||||
module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i);
|
||||
for(i = 0; i < orte_errmgr_base.modules.size; ++i) {
|
||||
module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base.modules, i);
|
||||
if( NULL == module ) {
|
||||
continue;
|
||||
}
|
||||
@ -50,13 +50,13 @@ int orte_errmgr_base_close(void)
|
||||
|
||||
/* Close all remaining available components (may be one if this is a
|
||||
OMPI RTE program, or [possibly] multiple if this is ompi_info) */
|
||||
mca_base_components_close(orte_errmgr_base_output,
|
||||
mca_base_components_close(orte_errmgr_base.output,
|
||||
&orte_errmgr_base_components_available,
|
||||
NULL);
|
||||
|
||||
OBJ_DESTRUCT(&orte_errmgr_base_modules);
|
||||
OBJ_DESTRUCT(&orte_errmgr_base.modules);
|
||||
|
||||
orte_errmgr_initialized = false;
|
||||
orte_errmgr_base.initialized = false;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -73,8 +73,8 @@ int orte_errmgr_base_update_state(orte_jobid_t job,
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
if( !orte_errmgr_base_shutting_down ) {
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
|
||||
if( !orte_errmgr_base.shutting_down ) {
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
|
||||
"errmgr:base:update_state() %s) "
|
||||
"------- %s state updated for process %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -88,8 +88,8 @@ int orte_errmgr_base_update_state(orte_jobid_t job,
|
||||
/********************************
|
||||
* Call the active modules
|
||||
********************************/
|
||||
for (i = 0; i < orte_errmgr_base_modules.size; ++i) {
|
||||
module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i);
|
||||
for (i = 0; i < orte_errmgr_base.modules.size; ++i) {
|
||||
module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base.modules, i);
|
||||
if( NULL == module ) {
|
||||
continue;
|
||||
}
|
||||
@ -143,14 +143,14 @@ int orte_errmgr_base_predicted_fault(char ***proc_list,
|
||||
int i, rc;
|
||||
orte_errmgr_stack_state_t stack_state = ORTE_ERRMGR_STACK_STATE_NONE;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
|
||||
"errmgr:base:predicted_fault() %s) "
|
||||
"------- Notifying components... (%3d active components)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
orte_errmgr_base_modules.size));
|
||||
orte_errmgr_base.modules.size));
|
||||
|
||||
for(i = 0; i < orte_errmgr_base_modules.size; ++i) {
|
||||
module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i);
|
||||
for(i = 0; i < orte_errmgr_base.modules.size; ++i) {
|
||||
module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base.modules, i);
|
||||
if( NULL == module ) {
|
||||
continue;
|
||||
}
|
||||
@ -176,22 +176,22 @@ int orte_errmgr_base_suggest_map_targets(orte_proc_t *proc,
|
||||
/*
|
||||
* If the user did not ask for recovery, then do not process recovery events
|
||||
*/
|
||||
if( !orte_errmgr_base_enable_recovery ) {
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
|
||||
if( !orte_errmgr_base.enable_recovery ) {
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
|
||||
"errmgr:base:suggest_map_targets() %s) "
|
||||
"------- Recovery currently disabled! Skipping...",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ));
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
|
||||
"errmgr:base:suggest_map_targets() %s) "
|
||||
"------- Notifying components... (%3d active components)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
orte_errmgr_base_modules.size));
|
||||
orte_errmgr_base.modules.size));
|
||||
|
||||
for(i = 0; i < orte_errmgr_base_modules.size; ++i) {
|
||||
module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i);
|
||||
for(i = 0; i < orte_errmgr_base.modules.size; ++i) {
|
||||
module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base.modules, i);
|
||||
if( NULL == module ) {
|
||||
continue;
|
||||
}
|
||||
@ -211,14 +211,14 @@ int orte_errmgr_base_ft_event(int state)
|
||||
orte_errmgr_base_module_t *module = NULL;
|
||||
int i;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
|
||||
"errmgr:base:ft_event() %s) "
|
||||
"------- Notifying components... (%3d active components)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
orte_errmgr_base_modules.size));
|
||||
orte_errmgr_base.modules.size));
|
||||
|
||||
for(i = 0; i < orte_errmgr_base_modules.size; ++i) {
|
||||
module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i);
|
||||
for(i = 0; i < orte_errmgr_base.modules.size; ++i) {
|
||||
module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base.modules, i);
|
||||
if( NULL == module ) {
|
||||
continue;
|
||||
}
|
||||
@ -229,60 +229,3 @@ int orte_errmgr_base_ft_event(int state)
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
void orte_errmgr_base_update_runtime(orte_job_t *jdata,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
orte_errmgr_stack_state_t *stack_state)
|
||||
{
|
||||
orte_proc_t *loc_proc;
|
||||
int32_t i;
|
||||
|
||||
/* has this already been done */
|
||||
if (ORTE_ERRMGR_STACK_STATE_UPDATED & *stack_state) {
|
||||
return;
|
||||
}
|
||||
*stack_state |= ORTE_ERRMGR_STACK_STATE_UPDATED;
|
||||
|
||||
/*
|
||||
* orterun is trying to shutdown, so just let it
|
||||
*/
|
||||
if (orte_errmgr_base_shutting_down) {
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* orte_errmgr_base_incomplete_start() will pass a NULL since all processes
|
||||
* are effected by this fault.
|
||||
* JJH: Since we do not handle the recovery from such errors yet, just
|
||||
* skip processing, and go to the abort sequence.
|
||||
*/
|
||||
if (NULL == proc) {
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove the route to this process
|
||||
*/
|
||||
orte_routed.delete_route(proc);
|
||||
|
||||
/*
|
||||
* Set the process state in the job data structure
|
||||
*/
|
||||
loc_proc = NULL;
|
||||
for (i = 0; i < jdata->procs->size; ++i) {
|
||||
if (NULL == (loc_proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (loc_proc->name.vpid != proc->vpid) {
|
||||
continue;
|
||||
}
|
||||
|
||||
loc_proc->state = state;
|
||||
if (ORTE_PROC_STATE_UNTERMINATED < state) {
|
||||
jdata->num_terminated++;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -53,6 +53,8 @@ bool orte_errmgr_base_shutting_down = false;
|
||||
bool orte_errmgr_initialized = false;
|
||||
opal_list_t orte_errmgr_base_components_available;
|
||||
|
||||
orte_errmgr_base_t orte_errmgr_base;
|
||||
|
||||
/* Public module provides a wrapper around previous functions */
|
||||
orte_errmgr_API_t orte_errmgr = {
|
||||
orte_errmgr_base_log,
|
||||
@ -73,13 +75,14 @@ int orte_errmgr_base_open(void)
|
||||
OPAL_TRACE(5);
|
||||
|
||||
/* Only pass this way once */
|
||||
if( orte_errmgr_initialized ) {
|
||||
if( orte_errmgr_base.initialized ) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&orte_errmgr_base_modules, opal_pointer_array_t);
|
||||
OBJ_CONSTRUCT(&orte_errmgr_base.modules, opal_pointer_array_t);
|
||||
opal_pointer_array_init(&orte_errmgr_base.modules, 3, INT_MAX, 1);
|
||||
|
||||
orte_errmgr_base_output = opal_output_open(NULL);
|
||||
orte_errmgr_base.output = opal_output_open(NULL);
|
||||
|
||||
mca_base_param_reg_int_name("errmgr",
|
||||
"base_enable_recovery",
|
||||
@ -87,27 +90,53 @@ int orte_errmgr_base_open(void)
|
||||
" [Default = disabled]",
|
||||
false, false,
|
||||
0, &value);
|
||||
orte_errmgr_base_enable_recovery = OPAL_INT_TO_BOOL(value);
|
||||
orte_errmgr_base.enable_recovery = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
mca_base_param_reg_int_name("errmgr",
|
||||
"max_global_restarts",
|
||||
"Max number of times to relocate a failed process to a new node",
|
||||
false, false,
|
||||
-1, &orte_errmgr_base.max_global_restarts);
|
||||
|
||||
mca_base_param_reg_int_name("errmgr",
|
||||
"max_local_restarts",
|
||||
"Max number of times to locally restart a failed process before relocating it to a new node",
|
||||
false, false,
|
||||
-1, &orte_errmgr_base.max_local_restarts);
|
||||
|
||||
if (orte_errmgr_base.enable_recovery) {
|
||||
if (orte_errmgr_base.max_global_restarts < 0 ) {
|
||||
orte_errmgr_base.max_global_restarts = 3;
|
||||
}
|
||||
if (orte_errmgr_base.max_local_restarts < 0) {
|
||||
orte_errmgr_base.max_local_restarts = 3;
|
||||
}
|
||||
} else {
|
||||
if (orte_errmgr_base.max_local_restarts > 0 ||
|
||||
orte_errmgr_base.max_global_restarts > 0) {
|
||||
orte_errmgr_base.enable_recovery = true;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* A flag to indicate that orterun is shutting down, so skip the recovery
|
||||
* logic.
|
||||
*/
|
||||
orte_errmgr_base_shutting_down = false;
|
||||
orte_errmgr_base.shutting_down = false;
|
||||
|
||||
/*
|
||||
* Open up all available components
|
||||
*/
|
||||
if (ORTE_SUCCESS !=
|
||||
mca_base_components_open("errmgr",
|
||||
orte_errmgr_base_output,
|
||||
orte_errmgr_base.output,
|
||||
mca_errmgr_base_static_components,
|
||||
&orte_errmgr_base_components_available,
|
||||
true)) {
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
orte_errmgr_initialized = true;
|
||||
orte_errmgr_base.initialized = true;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -34,11 +34,6 @@
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
|
||||
|
||||
/*
|
||||
* List of composite modules, ordered by priority
|
||||
*/
|
||||
opal_pointer_array_t orte_errmgr_base_modules;
|
||||
|
||||
struct orte_errmgr_base_select_module_t {
|
||||
mca_base_component_t *component;
|
||||
mca_base_module_t *module;
|
||||
@ -60,8 +55,9 @@ int orte_errmgr_base_select(void)
|
||||
bool none_found;
|
||||
|
||||
OBJ_CONSTRUCT(&tmp_array, opal_pointer_array_t);
|
||||
opal_pointer_array_init(&tmp_array, 3, INT_MAX, 1);
|
||||
|
||||
opal_output_verbose(10, orte_errmgr_base_output,
|
||||
opal_output_verbose(10, orte_errmgr_base.output,
|
||||
"errmgr:base:select: Auto-selecting components");
|
||||
|
||||
/*
|
||||
@ -79,7 +75,7 @@ int orte_errmgr_base_select(void)
|
||||
* If there is a query function then use it.
|
||||
*/
|
||||
if (NULL == component->mca_query_component) {
|
||||
opal_output_verbose(5, orte_errmgr_base_output,
|
||||
opal_output_verbose(5, orte_errmgr_base.output,
|
||||
"errmgr:base:select Skipping component [%s]. It does not implement a query function",
|
||||
component->mca_component_name );
|
||||
continue;
|
||||
@ -88,7 +84,7 @@ int orte_errmgr_base_select(void)
|
||||
/*
|
||||
* Query this component for the module and priority
|
||||
*/
|
||||
opal_output_verbose(5, orte_errmgr_base_output,
|
||||
opal_output_verbose(5, orte_errmgr_base.output,
|
||||
"errmgr:base:select Querying component [%s]",
|
||||
component->mca_component_name);
|
||||
|
||||
@ -98,7 +94,7 @@ int orte_errmgr_base_select(void)
|
||||
* If no module was returned or negative priority, then skip component
|
||||
*/
|
||||
if (NULL == module || priority < 0) {
|
||||
opal_output_verbose(5, orte_errmgr_base_output,
|
||||
opal_output_verbose(5, orte_errmgr_base.output,
|
||||
"errmgr:base:select Skipping component [%s]. Query failed to return a module",
|
||||
component->mca_component_name );
|
||||
continue;
|
||||
@ -107,7 +103,7 @@ int orte_errmgr_base_select(void)
|
||||
/*
|
||||
* Append them to the temporary list, we will sort later
|
||||
*/
|
||||
opal_output_verbose(5, orte_errmgr_base_output,
|
||||
opal_output_verbose(5, orte_errmgr_base.output,
|
||||
"errmgr:base:select Query of component [%s] set priority to %d",
|
||||
component->mca_component_name, priority);
|
||||
tmp_module = (orte_errmgr_base_select_module_t *)malloc(sizeof(orte_errmgr_base_select_module_t));
|
||||
@ -156,10 +152,10 @@ int orte_errmgr_base_select(void)
|
||||
tmp_module = tmp_module_sw;
|
||||
opal_pointer_array_set_item(&tmp_array, j, NULL);
|
||||
}
|
||||
opal_output_verbose(5, orte_errmgr_base_output,
|
||||
opal_output_verbose(5, orte_errmgr_base.output,
|
||||
"errmgr:base:select Add module with priority [%s] %d",
|
||||
tmp_module->component->mca_component_name, tmp_module->priority);
|
||||
opal_pointer_array_add(&orte_errmgr_base_modules, (void*)(tmp_module->module));
|
||||
opal_pointer_array_add(&orte_errmgr_base.modules, (void*)(tmp_module->module));
|
||||
free(tmp_module);
|
||||
}
|
||||
OBJ_DESTRUCT(&tmp_array);
|
||||
@ -167,8 +163,8 @@ int orte_errmgr_base_select(void)
|
||||
/*
|
||||
* Initialize each of the Errmgr Modules
|
||||
*/
|
||||
for(i = 0; i < orte_errmgr_base_modules.size; ++i) {
|
||||
i_module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i);
|
||||
for(i = 0; i < orte_errmgr_base.modules.size; ++i) {
|
||||
i_module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base.modules, i);
|
||||
if( NULL == i_module ) {
|
||||
continue;
|
||||
}
|
||||
|
@ -39,6 +39,19 @@
|
||||
*/
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
 * Framework-global values for the ERRMGR framework, gathered into a
 * single struct (exposed as the orte_errmgr_base global).
 */
|
||||
typedef struct {
|
||||
/* output stream id returned by opal_output_open() in
 * orte_errmgr_base_open(); passed to the framework's verbose output calls */
int output;
|
||||
/* flag to indicate that orterun is shutting down, so the recovery
 * logic is skipped */
bool shutting_down;
|
||||
/* taken from the errmgr "base_enable_recovery" MCA param; base open also
 * sets it to true when either restart limit below is positive */
bool enable_recovery;
|
||||
/* modules chosen by orte_errmgr_base_select(); entries may be NULL,
 * so the loops that walk this array check for that */
opal_pointer_array_t modules;
|
||||
/* set true at the end of orte_errmgr_base_open() and false in
 * orte_errmgr_base_close(), so open only does its work once */
bool initialized;
|
||||
/* max number of times to relocate a failed process to a new node
 * (registered with -1; base open changes a negative value to 3 when
 * recovery is enabled) */
int max_global_restarts;
|
||||
/* max number of times to locally restart a failed process before
 * relocating it to a new node (registered with -1; base open changes a
 * negative value to 3 when recovery is enabled) */
int max_local_restarts;
|
||||
} orte_errmgr_base_t;
|
||||
|
||||
ORTE_DECLSPEC extern orte_errmgr_base_t orte_errmgr_base;
|
||||
|
||||
/* Define the ERRMGR command flag */
|
||||
typedef uint8_t orte_errmgr_cmd_flag_t;
|
||||
#define ORTE_ERRMGR_CMD OPAL_UINT8
|
||||
@ -70,11 +83,6 @@ ORTE_DECLSPEC int orte_errmgr_base_suggest_map_targets(orte_proc_t *proc,
|
||||
opal_list_t *node_list);
|
||||
ORTE_DECLSPEC int orte_errmgr_base_ft_event(int state);
|
||||
|
||||
ORTE_DECLSPEC void orte_errmgr_base_update_runtime(orte_job_t *jdata,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
orte_errmgr_stack_state_t *stack_state);
|
||||
|
||||
/*
|
||||
* Additional External API function declared in errmgr.h
|
||||
*/
|
||||
|
@ -30,11 +30,13 @@
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_locks.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
#include "orte/mca/odls/odls.h"
|
||||
#include "orte/mca/odls/base/base.h"
|
||||
#include "orte/mca/plm/base/base.h"
|
||||
#include "orte/mca/rmaps/rmaps_types.h"
|
||||
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
#endif
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
@ -48,6 +50,7 @@ static void update_proc(orte_job_t *jdata, orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
orte_exit_code_t exit_code);
|
||||
static void check_job_complete(orte_job_t *jdata);
|
||||
static void killprocs(orte_jobid_t job, orte_vpid_t vpid);
|
||||
|
||||
/*
|
||||
* Module functions: Global
|
||||
@ -114,10 +117,19 @@ static int update_state(orte_jobid_t job,
|
||||
/* indicate that this is the end of the line */
|
||||
*stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:hnp: job %s reported state %s"
|
||||
" for proc %s state %s exit_code %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job),
|
||||
orte_job_state_to_str(jobstate),
|
||||
(NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc),
|
||||
orte_proc_state_to_str(state), exit_code));
|
||||
|
||||
/*
|
||||
* if orterun is trying to shutdown, just let it
|
||||
*/
|
||||
if (orte_errmgr_base_shutting_down) {
|
||||
if (orte_errmgr_base.shutting_down) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
@ -144,7 +156,7 @@ static int update_state(orte_jobid_t job,
|
||||
/* update the state */
|
||||
jdata->state = jobstate;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:hnp: job %s reported state %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid),
|
||||
@ -206,6 +218,21 @@ static int update_state(orte_jobid_t job,
|
||||
hnp_abort(jdata->jobid, exit_code);
|
||||
}
|
||||
break;
|
||||
case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED:
|
||||
/* update all procs in job */
|
||||
update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
|
||||
/* order all local procs for this job to be killed */
|
||||
killprocs(jdata->jobid, ORTE_VPID_WILDCARD);
|
||||
check_job_complete(jdata); /* set the local proc states */
|
||||
/* the job object for this job will have been NULL'd
|
||||
* in the array if the job was solely local. If it isn't
|
||||
* NULL, then we need to tell everyone else to die
|
||||
*/
|
||||
if (NULL != (jdata = orte_get_job_data_object(job))) {
|
||||
hnp_abort(jdata->jobid, exit_code);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@ -258,6 +285,19 @@ static int update_state(orte_jobid_t job,
|
||||
check_job_complete(jdata);
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED:
|
||||
update_proc(jdata, proc, state, exit_code);
|
||||
killprocs(proc->jobid, proc->vpid);
|
||||
check_job_complete(jdata); /* need to set the job state */
|
||||
/* the job object for this job will have been NULL'd
|
||||
* in the array if the job was solely local. If it isn't
|
||||
* NULL, then we need to tell everyone else to die
|
||||
*/
|
||||
if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) {
|
||||
hnp_abort(jdata->jobid, exit_code);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@ -295,14 +335,14 @@ static void hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code)
|
||||
|
||||
/* if we are already in progress, then ignore this call */
|
||||
if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:hnp: abort in progress, ignoring abort on job %s with status %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job), exit_code));
|
||||
return;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:hnp: abort called on job %s with status %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job), exit_code));
|
||||
@ -367,7 +407,7 @@ static void failed_start(orte_job_t *jdata, orte_exit_code_t exit_code)
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:hnp: job %s reported incomplete start",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid)));
|
||||
@ -528,7 +568,7 @@ static void check_job_complete(orte_job_t *jdata)
|
||||
* Determine how the process state affects the job state
|
||||
*/
|
||||
if (ORTE_PROC_STATE_FAILED_TO_START == proc->state) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr_hnp:check_job_completed proc %s failed to start",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name)));
|
||||
@ -542,7 +582,7 @@ static void check_job_complete(orte_job_t *jdata)
|
||||
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
|
||||
}
|
||||
} else if (ORTE_PROC_STATE_ABORTED == proc->state) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr:hnp:check_job_completed proc %s aborted",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name)));
|
||||
@ -556,7 +596,7 @@ static void check_job_complete(orte_job_t *jdata)
|
||||
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
|
||||
}
|
||||
} else if (ORTE_PROC_STATE_ABORTED_BY_SIG == proc->state) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr:hnp:check_job_completed proc %s aborted by signal",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name)));
|
||||
@ -570,7 +610,7 @@ static void check_job_complete(orte_job_t *jdata)
|
||||
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
|
||||
}
|
||||
} else if (ORTE_PROC_STATE_TERM_WO_SYNC == proc->state) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr:hnp:check_job_completed proc %s terminated without sync",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name)));
|
||||
@ -590,7 +630,7 @@ static void check_job_complete(orte_job_t *jdata)
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
}
|
||||
} else if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr:hnp:check_job_completed proc %s killed by cmd",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name)));
|
||||
@ -609,7 +649,7 @@ static void check_job_complete(orte_job_t *jdata)
|
||||
goto CHECK_ALIVE;
|
||||
} else if (ORTE_PROC_STATE_UNTERMINATED < proc->state &&
|
||||
jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr:hnp:check_job_completed proc %s terminated and continuous",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name)));
|
||||
@ -626,11 +666,22 @@ static void check_job_complete(orte_job_t *jdata)
|
||||
}
|
||||
}
|
||||
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
if (jdata->abort) {
|
||||
/* the job aborted - turn off any sensors on this job */
|
||||
orte_sensor.stop(jdata->jobid);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (ORTE_JOB_STATE_UNTERMINATED > jdata->state &&
|
||||
jdata->num_terminated >= jdata->num_procs) {
|
||||
/* this job has terminated */
|
||||
jdata->state = ORTE_JOB_STATE_TERMINATED;
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
/* turn off any sensor monitors on this job */
|
||||
orte_sensor.stop(jdata->jobid);
|
||||
#endif
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr:hnp:check_job_completed declared job %s normally terminated - checking all jobs",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid)));
|
||||
@ -679,7 +730,7 @@ static void check_job_complete(orte_job_t *jdata)
|
||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) {
|
||||
continue;
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s releasing procs from node %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
node->name));
|
||||
@ -693,7 +744,7 @@ static void check_job_complete(orte_job_t *jdata)
|
||||
}
|
||||
node->slots_inuse--;
|
||||
node->num_procs--;
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s releasing proc %s from node %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name), node->name));
|
||||
@ -748,7 +799,7 @@ CHECK_ALIVE:
|
||||
* just return, though, as we need to ensure we cleanout the
|
||||
* job data for the job that just completed
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr:hnp:check_job_completed job %s is not terminated (%d:%d)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job->jobid),
|
||||
@ -756,7 +807,7 @@ CHECK_ALIVE:
|
||||
one_still_alive = true;
|
||||
}
|
||||
else {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr:hnp:check_job_completed job %s is terminated (%d vs %d [0x%x])",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job->jobid),
|
||||
@ -765,13 +816,13 @@ CHECK_ALIVE:
|
||||
}
|
||||
/* if a job is still alive, we just return */
|
||||
if (one_still_alive) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr:hnp:check_job_completed at least one job is not terminated",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
return;
|
||||
}
|
||||
/* if we get here, then all jobs are done, so wakeup */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr:hnp:check_job_completed all jobs terminated - waking up",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
/* set the exit status to 0 - this will only happen if it
|
||||
@ -780,3 +831,22 @@ CHECK_ALIVE:
|
||||
ORTE_UPDATE_EXIT_STATUS(0);
|
||||
orte_trigger_event(&orte_exit);
|
||||
}
|
||||
|
||||
/*
 * Ask the ODLS to kill the process identified by (job, vpid).
 *
 * Callers in this file pass ORTE_VPID_WILDCARD as the vpid to target
 * every proc of the job. A failure reported by the ODLS is logged and
 * otherwise ignored.
 */
static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
{
    opal_pointer_array_t kill_list;
    orte_proc_t target;
    int status;

    /* describe the target with a temporary, stack-based proc object */
    OBJ_CONSTRUCT(&target, orte_proc_t);
    target.name.jobid = job;
    target.name.vpid = vpid;

    /* the ODLS entry point takes a pointer array of procs */
    OBJ_CONSTRUCT(&kill_list, opal_pointer_array_t);
    opal_pointer_array_add(&kill_list, &target);

    status = orte_odls.kill_local_procs(&kill_list);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
    }

    /* release the array first, then the proc object it referenced */
    OBJ_DESTRUCT(&kill_list);
    OBJ_DESTRUCT(&target);
}
|
||||
|
||||
|
@ -53,14 +53,7 @@ orte_errmgr_base_component_t mca_errmgr_hnp_component =
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
|
||||
/* Verbosity level */
|
||||
0,
|
||||
/* opal_output handler */
|
||||
-1,
|
||||
/* Default priority */
|
||||
1
|
||||
}
|
||||
};
|
||||
|
||||
static int errmgr_hnp_open(void)
|
||||
@ -88,4 +81,3 @@ static int errmgr_hnp_component_query(mca_base_module_t **module, int *priority)
|
||||
*module = NULL;
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
|
@ -21,6 +21,7 @@
|
||||
#endif
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/dss/dss.h"
|
||||
|
||||
#include "orte/util/error_strings.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
@ -28,14 +29,13 @@
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "opal/dss/dss.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
#include "orte/mca/odls/base/base.h"
|
||||
#include "orte/mca/plm/base/base.h"
|
||||
#include "orte/mca/odls/odls.h"
|
||||
#include "orte/mca/plm/plm_types.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
|
||||
#include "errmgr_orted.h"
|
||||
|
||||
@ -47,6 +47,7 @@ static bool all_children_registered(orte_jobid_t job);
|
||||
static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf);
|
||||
static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code);
|
||||
static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobstate, orte_proc_state_t state);
|
||||
static void killprocs(orte_jobid_t job, orte_vpid_t vpid);
|
||||
|
||||
|
||||
/*
|
||||
@ -173,6 +174,11 @@ static int update_state(orte_jobid_t job,
|
||||
/* update all local child states */
|
||||
update_local_children(jobdat, jobstate, ORTE_PROC_STATE_RUNNING);
|
||||
break;
|
||||
case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED:
|
||||
/* update all procs in job */
|
||||
update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
|
||||
/* order all local procs for this job to be killed */
|
||||
killprocs(jobdat->jobid, ORTE_VPID_WILDCARD);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@ -198,7 +204,65 @@ static int update_state(orte_jobid_t job,
|
||||
}
|
||||
|
||||
/*** UPDATE COMMAND FOR A SPECIFIC PROCESS ***/
|
||||
if (ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED == state) {
|
||||
/* find this proc in the local children */
|
||||
for (item = opal_list_get_first(&orte_local_children);
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_odls_child_t*)item;
|
||||
if (child->name->jobid == proc->jobid &&
|
||||
child->name->vpid == proc->vpid) {
|
||||
child->state = state;
|
||||
}
|
||||
}
|
||||
killprocs(proc->jobid, proc->vpid);
|
||||
/* let the proc be reported back when terminated */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
if (ORTE_PROC_STATE_TERMINATED < state) {
|
||||
#if 0
|
||||
if (orte_errmgr_base.enable_recovery) {
|
||||
/* lookup the local jobdat for this job */
|
||||
jobdat = NULL;
|
||||
for (item = opal_list_get_first(&orte_local_jobdata);
|
||||
item != opal_list_get_end(&orte_local_jobdata);
|
||||
item = opal_list_get_next(item)) {
|
||||
jobdat = (orte_odls_job_t*)item;
|
||||
|
||||
/* is this the specified job? */
|
||||
if (jobdat->jobid == proc->jobid) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (NULL == jobdat) {
|
||||
/* race condition - may not have been formed yet */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
/* find this proc in the local children */
|
||||
for (item = opal_list_get_first(&orte_local_children);
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_odls_child_t*)item;
|
||||
if (child->name->jobid == proc->jobid &&
|
||||
child->name->vpid == proc->vpid) {
|
||||
/* see if this child has reached its local restart limit */
|
||||
if (child->restarts == jobdat->max_local_restarts ) {
|
||||
goto REPORT_ABORT;
|
||||
}
|
||||
/* otherwise, attempt to restart it locally */
|
||||
child->restarts++;
|
||||
if (ORTE_SUCCESS != (rc = orte_odls.restart_proc(child))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto REPORT_ABORT;
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
REPORT_ABORT:
|
||||
#endif
|
||||
/* if the job hasn't completed and the state is abnormally
|
||||
* terminated, then we need to alert the HNP right away
|
||||
*/
|
||||
@ -234,7 +298,7 @@ static int update_state(orte_jobid_t job,
|
||||
/* remove the child from our local list as it is no longer alive */
|
||||
opal_list_remove_item(&orte_local_children, &child->super);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr:orted reporting proc %s aborted to HNP",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(child->name)));
|
||||
@ -279,7 +343,7 @@ static int update_state(orte_jobid_t job,
|
||||
* else that needs it
|
||||
*/
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr:orted: sending contact info to HNP",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
@ -361,7 +425,7 @@ static int update_state(orte_jobid_t job,
|
||||
}
|
||||
|
||||
FINAL_CLEANUP:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr:orted reporting all procs in %s terminated",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jobdat->jobid)));
|
||||
@ -639,7 +703,7 @@ static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code)
|
||||
}
|
||||
}
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:hnp: job %s reported incomplete start",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jobdat->jobid)));
|
||||
@ -663,3 +727,21 @@ static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobs
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Ask the local launcher (ODLS) to kill the process named by job/vpid.
 * The name is placed in a temporary orte_proc_t and passed to
 * orte_odls.kill_local_procs() through a one-element pointer array.
 * A failure is logged but not propagated to the caller.
 */
static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
{
    opal_pointer_array_t targets;
    orte_proc_t victim;
    int status;

    /* describe the proc to be killed */
    OBJ_CONSTRUCT(&victim, orte_proc_t);
    victim.name.vpid = vpid;
    victim.name.jobid = job;

    /* package the description as the kill command */
    OBJ_CONSTRUCT(&targets, opal_pointer_array_t);
    opal_pointer_array_add(&targets, &victim);

    status = orte_odls.kill_local_procs(&targets);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
    }

    /* release the temporaries */
    OBJ_DESTRUCT(&targets);
    OBJ_DESTRUCT(&victim);
}
|
||||
|
@ -53,14 +53,7 @@ orte_errmgr_base_component_t mca_errmgr_orted_component =
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
|
||||
/* Verbosity level */
|
||||
0,
|
||||
/* opal_output handler */
|
||||
-1,
|
||||
/* Default priority */
|
||||
1
|
||||
}
|
||||
};
|
||||
|
||||
static int errmgr_orted_open(void)
|
||||
|
@ -61,7 +61,9 @@
|
||||
#include "orte/mca/notifier/base/base.h"
|
||||
#include "orte/mca/rmcast/base/base.h"
|
||||
#include "orte/mca/state/base/base.h"
|
||||
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#endif
|
||||
#include "orte/runtime/orte_cr.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
@ -417,6 +419,20 @@ int orte_ess_base_orted_setup(char **hosts)
|
||||
goto error;
|
||||
}
|
||||
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
/* setup the SENSOR framework */
|
||||
if (ORTE_SUCCESS != (ret = orte_sensor_base_open())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_sensor_open";
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_sensor_base_select())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "ortesensor_select";
|
||||
goto error;
|
||||
}
|
||||
#endif
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
||||
error:
|
||||
@ -438,6 +454,9 @@ int orte_ess_base_orted_finalize(void)
|
||||
orte_grpcomm.onesided_barrier();
|
||||
}
|
||||
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
orte_sensor_base_close();
|
||||
#endif
|
||||
orte_state_base_close();
|
||||
orte_notifier_base_close();
|
||||
|
||||
|
@ -57,6 +57,9 @@
|
||||
#include "orte/mca/notifier/base/base.h"
|
||||
#include "orte/mca/rmcast/base/base.h"
|
||||
#include "orte/mca/state/base/base.h"
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#endif
|
||||
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
@ -537,6 +540,20 @@ static int rte_init(void)
|
||||
goto error;
|
||||
}
|
||||
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
/* setup the SENSOR framework */
|
||||
if (ORTE_SUCCESS != (ret = orte_sensor_base_open())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_sensor_open";
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_sensor_base_select())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "ortesensor_select";
|
||||
goto error;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* if a tool has launched us and is requesting event reports,
|
||||
* then set its contact info into the comm system
|
||||
*/
|
||||
@ -592,6 +609,9 @@ static int rte_finalize(void)
|
||||
unlink(contact_path);
|
||||
free(contact_path);
|
||||
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
orte_sensor_base_close();
|
||||
#endif
|
||||
orte_state_base_close();
|
||||
orte_notifier_base_close();
|
||||
|
||||
|
@ -367,6 +367,12 @@ pack_add_procs:
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the max number of local restarts allowed for this job */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->max_local_restarts, 1, ORTE_VPID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the number of app_contexts for this job */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->num_apps, 1, ORTE_APP_IDX))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -813,6 +819,12 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto REPORT_ERROR;
|
||||
}
|
||||
/* unpack the max number of local restarts allowed for this job */
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->max_local_restarts, &cnt, ORTE_VPID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto REPORT_ERROR;
|
||||
}
|
||||
/* unpack the number of app_contexts for this job */
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->num_apps, &cnt, ORTE_APP_IDX))) {
|
||||
@ -2841,3 +2853,9 @@ int orte_odls_base_get_proc_stats(opal_buffer_t *answer,
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Base implementation for restarting a local process.
 *
 * Placeholder: neither argument is used and ORTE_SUCCESS is returned
 * unconditionally.
 */
int orte_odls_base_default_restart_proc(orte_odls_child_t *child,
|
||||
orte_odls_base_fork_local_proc_fn_t fork_local)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -132,6 +132,7 @@ static void orte_odls_job_constructor(orte_odls_job_t *ptr)
|
||||
ptr->num_contributors = 0;
|
||||
ptr->num_participating = -1;
|
||||
ptr->num_collected = 0;
|
||||
ptr->max_local_restarts = 0;
|
||||
}
|
||||
static void orte_odls_job_destructor(orte_odls_job_t *ptr)
|
||||
{
|
||||
|
@ -138,6 +138,9 @@ ORTE_DECLSPEC int orte_odls_base_default_require_sync(orte_process_name_t *proc,
|
||||
opal_buffer_t *buffer,
|
||||
bool drop_nidmap);
|
||||
|
||||
ORTE_DECLSPEC int orte_odls_base_default_restart_proc(orte_odls_child_t *child,
|
||||
orte_odls_base_fork_local_proc_fn_t fork_local);
|
||||
|
||||
/*
|
||||
* Preload binary/files functions
|
||||
*/
|
||||
|
@ -93,6 +93,7 @@
|
||||
static int orte_odls_default_launch_local_procs(opal_buffer_t *data);
|
||||
static int orte_odls_default_kill_local_procs(opal_pointer_array_t *procs);
|
||||
static int orte_odls_default_signal_local_procs(const orte_process_name_t *proc, int32_t signal);
|
||||
static int orte_odls_default_restart_proc(orte_odls_child_t *child);
|
||||
|
||||
static void set_handler_default(int sig);
|
||||
|
||||
@ -102,7 +103,8 @@ orte_odls_base_module_t orte_odls_default_module = {
|
||||
orte_odls_default_kill_local_procs,
|
||||
orte_odls_default_signal_local_procs,
|
||||
orte_odls_base_default_deliver_message,
|
||||
orte_odls_base_default_require_sync
|
||||
orte_odls_base_default_require_sync,
|
||||
orte_odls_default_restart_proc
|
||||
};
|
||||
|
||||
/* convenience macro for erroring out */
|
||||
@ -1101,3 +1103,17 @@ static int orte_odls_default_signal_local_procs(const orte_process_name_t *proc,
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Restart a local process through the shared base implementation,
 * supplying this component's fork function.  Returns the status of
 * the base routine; a failure is reported on the ODLS verbose stream.
 */
static int orte_odls_default_restart_proc(orte_odls_child_t *child)
{
    int status = orte_odls_base_default_restart_proc(child, odls_default_fork_local_proc);

    if (ORTE_SUCCESS == status) {
        return status;
    }

    /* the restart did not succeed - say why, then pass the code up */
    OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
                         "%s odls:default:restart_proc failed to launch on error %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(status)));
    return status;
}
|
||||
|
||||
|
@ -82,6 +82,10 @@ typedef int (*orte_odls_base_module_deliver_message_fn_t)(orte_jobid_t job, opal
|
||||
typedef int (*orte_odls_base_module_require_sync_fn_t)(orte_process_name_t *proc,
|
||||
opal_buffer_t *buffer,
|
||||
bool drop_nidmap);
|
||||
/**
|
||||
* Restart a local process
|
||||
*/
|
||||
typedef int (*orte_odls_base_module_restart_proc_fn_t)(orte_odls_child_t *child);
|
||||
|
||||
/**
|
||||
* pls module version
|
||||
@ -93,6 +97,7 @@ struct orte_odls_base_module_1_3_0_t {
|
||||
orte_odls_base_module_signal_local_process_fn_t signal_local_procs;
|
||||
orte_odls_base_module_deliver_message_fn_t deliver_message;
|
||||
orte_odls_base_module_require_sync_fn_t require_sync;
|
||||
orte_odls_base_module_restart_proc_fn_t restart_proc;
|
||||
};
|
||||
|
||||
/** shorten orte_odls_base_module_1_3_0_t declaration */
|
||||
|
@ -141,6 +141,7 @@ typedef struct orte_odls_job_t {
|
||||
int num_participating;
|
||||
int num_collected;
|
||||
struct timeval launch_msg_recvd; /* when the launch msg for this job was recvd - for timing purposes only */
|
||||
int32_t max_local_restarts; /* max number of times a local proc can be restarted */
|
||||
} orte_odls_job_t;
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_odls_job_t);
|
||||
|
||||
|
@ -220,6 +220,19 @@ static int odls_process_signal_local_proc(const orte_process_name_t *proc, int32
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*
 * Restart a local process through the shared base implementation,
 * supplying this component's fork function.  Returns the status of
 * the base routine; a failure is reported on the ODLS verbose stream.
 */
static int orte_odls_process_restart_proc(orte_odls_child_t *child)
{
    int status = orte_odls_base_default_restart_proc(child, odls_process_fork_local_proc);

    if (ORTE_SUCCESS == status) {
        return status;
    }

    /* the restart did not succeed - say why, then pass the code up */
    OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
                         "%s odls:process:restart_proc failed to launch on error %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(status)));
    return status;
}
|
||||
|
||||
|
||||
orte_odls_base_module_t orte_odls_process_module = {
|
||||
orte_odls_base_default_get_add_procs_data,
|
||||
@ -227,5 +240,6 @@ orte_odls_base_module_t orte_odls_process_module = {
|
||||
odls_process_kill_local_procs,
|
||||
odls_process_signal_local_proc,
|
||||
orte_odls_base_default_deliver_message,
|
||||
orte_odls_base_default_require_sync
|
||||
orte_odls_base_default_require_sync,
|
||||
orte_odls_process_restart_proc
|
||||
};
|
||||
|
@ -52,6 +52,9 @@
|
||||
#include "orte/mca/filem/filem.h"
|
||||
#include "orte/mca/filem/base/base.h"
|
||||
#include "orte/mca/rml/base/rml_contact.h"
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
#endif
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/orte_locks.h"
|
||||
@ -354,6 +357,11 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
|
||||
goto WAKEUP;
|
||||
}
|
||||
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
/* start any sensor monitoring of this job */
|
||||
orte_sensor.start(job);
|
||||
#endif
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:launch completed for job %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
|
35
orte/mca/sensor/Makefile.am
Обычный файл
35
orte/mca/sensor/Makefile.am
Обычный файл
@ -0,0 +1,35 @@
|
||||
#
|
||||
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# main library setup
|
||||
noinst_LTLIBRARIES = libmca_sensor.la
|
||||
libmca_sensor_la_SOURCES =
|
||||
|
||||
# header setup
|
||||
nobase_orte_HEADERS =
|
||||
|
||||
# local files
|
||||
headers = sensor.h \
|
||||
sensor_types.h
|
||||
|
||||
libmca_sensor_la_SOURCES += $(headers)
|
||||
|
||||
# Conditionally install the header files
|
||||
if WANT_INSTALL_HEADERS
|
||||
nobase_orte_HEADERS += $(headers)
|
||||
ortedir = $(includedir)/openmpi/orte/mca/sensor
|
||||
else
|
||||
ortedir = $(includedir)
|
||||
endif
|
||||
|
||||
include base/Makefile.am
|
||||
|
||||
distclean-local:
|
||||
rm -f base/static-components.h
|
26
orte/mca/sensor/base/Makefile.am
Обычный файл
26
orte/mca/sensor/base/Makefile.am
Обычный файл
@ -0,0 +1,26 @@
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
headers += \
|
||||
base/base.h
|
||||
|
||||
libmca_sensor_la_SOURCES += \
|
||||
base/sensor_base_open.c
|
||||
|
||||
if !ORTE_DISABLE_FULL_SUPPORT
|
||||
|
||||
headers += \
|
||||
base/sensor_private.h
|
||||
|
||||
libmca_sensor_la_SOURCES += \
|
||||
base/sensor_base_close.c \
|
||||
base/sensor_base_select.c
|
||||
|
||||
endif
|
52
orte/mca/sensor/base/base.h
Обычный файл
52
orte/mca/sensor/base/base.h
Обычный файл
@ -0,0 +1,52 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/** @file:
|
||||
*/
|
||||
|
||||
#ifndef MCA_SENSOR_BASE_H
|
||||
#define MCA_SENSOR_BASE_H
|
||||
|
||||
/*
|
||||
* includes
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/mca/mca.h"
|
||||
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
|
||||
|
||||
/*
|
||||
* Global functions for MCA overall collective open and close
|
||||
*/
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* function definitions
|
||||
*/
|
||||
ORTE_DECLSPEC int orte_sensor_base_open(void);
|
||||
ORTE_DECLSPEC int orte_sensor_base_select(void);
|
||||
ORTE_DECLSPEC int orte_sensor_base_close(void);
|
||||
|
||||
/*
|
||||
* globals that might be needed
|
||||
*/
|
||||
|
||||
ORTE_DECLSPEC extern opal_list_t mca_sensor_base_components_available;
|
||||
|
||||
#if !ORTE_DISABLE_FULL_SUPPORT
|
||||
|
||||
/* no base functions to protect at this time */
|
||||
|
||||
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
||||
|
||||
END_C_DECLS
|
||||
#endif
|
46
orte/mca/sensor/base/sensor_base_close.c
Обычный файл
46
orte/mca/sensor/base/sensor_base_close.c
Обычный файл
@ -0,0 +1,46 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
|
||||
int orte_sensor_base_close(void)
|
||||
{
|
||||
orte_sensor_base_module_t *i_module;
|
||||
int i;
|
||||
|
||||
for (i=0; i < orte_sensor_base.modules.size; i++) {
|
||||
if (NULL == (i_module = (orte_sensor_base_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, i))) {
|
||||
continue;
|
||||
}
|
||||
if (NULL != i_module->finalize) {
|
||||
i_module->finalize();
|
||||
}
|
||||
}
|
||||
OBJ_DESTRUCT(&orte_sensor_base.modules);
|
||||
|
||||
/* Close all remaining available components */
|
||||
|
||||
mca_base_components_close(orte_sensor_base.output,
|
||||
&mca_sensor_base_components_available, NULL);
|
||||
|
||||
/* All done */
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
108
orte/mca/sensor/base/sensor_base_open.c
Обычный файл
108
orte/mca/sensor/base/sensor_base_open.c
Обычный файл
@ -0,0 +1,108 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
|
||||
/*
|
||||
* The following file was created by configure. It contains extern
|
||||
* statements and the definition of an array of pointers to each
|
||||
* component's public mca_base_component_t struct.
|
||||
*/
|
||||
|
||||
#include "orte/mca/sensor/base/static-components.h"
|
||||
|
||||
/* base functions */
|
||||
static void start(orte_jobid_t jobid);
|
||||
static void stop(orte_jobid_t jobid);
|
||||
|
||||
/*
|
||||
* Global variables
|
||||
*/
|
||||
/* framework-wide state: output stream and the array of selected modules */
orte_sensor_base_t orte_sensor_base;
|
||||
/* public sensor API: both entries are the static dispatch functions
 * defined later in this file */
orte_sensor_base_API_module_t orte_sensor = {
|
||||
start,
|
||||
stop
|
||||
};
|
||||
/* list filled in by mca_base_components_open() in orte_sensor_base_open() */
opal_list_t mca_sensor_base_components_available;
|
||||
|
||||
/**
|
||||
* Function for finding and opening either all MCA components, or the one
|
||||
* that was specifically requested via a MCA parameter.
|
||||
*/
|
||||
int orte_sensor_base_open(void)
|
||||
{
|
||||
/* Debugging / verbose output. Always have stream open, with
|
||||
verbose set by the mca open system... */
|
||||
orte_sensor_base.output = opal_output_open(NULL);
|
||||
|
||||
/* construct the array of modules */
|
||||
OBJ_CONSTRUCT(&orte_sensor_base.modules, opal_pointer_array_t);
|
||||
opal_pointer_array_init(&orte_sensor_base.modules, 3, INT_MAX, 1);
|
||||
|
||||
/* Open up all available components */
|
||||
|
||||
if (ORTE_SUCCESS !=
|
||||
mca_base_components_open("sensor", orte_sensor_base.output,
|
||||
mca_sensor_base_static_components,
|
||||
&mca_sensor_base_components_available, true)) {
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* All done */
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Framework-level start: forward the request for this job to every
 * selected module that provides a start function.
 */
static void start(orte_jobid_t jobid)
{
    int slot;

    for (slot = 0; slot < orte_sensor_base.modules.size; ++slot) {
        orte_sensor_base_module_t *mod =
            (orte_sensor_base_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, slot);

        /* skip empty slots and modules without a start entry point */
        if (NULL == mod || NULL == mod->start) {
            continue;
        }
        mod->start(jobid);
    }
}
|
||||
|
||||
/*
 * Framework-level stop: forward the request for this job to every
 * selected module that provides a stop function.
 */
static void stop(orte_jobid_t jobid)
{
    int slot;

    for (slot = 0; slot < orte_sensor_base.modules.size; ++slot) {
        orte_sensor_base_module_t *mod =
            (orte_sensor_base_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, slot);

        /* skip empty slots and modules without a stop entry point */
        if (NULL == mod || NULL == mod->stop) {
            continue;
        }
        mod->stop(jobid);
    }
}
|
177
orte/mca/sensor/base/sensor_base_select.c
Обычный файл
177
orte/mca/sensor/base/sensor_base_select.c
Обычный файл
@ -0,0 +1,177 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
||||
#include "orte_config.h"
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
|
||||
|
||||
struct orte_sensor_base_select_module_t {
|
||||
mca_base_component_t *component;
|
||||
mca_base_module_t *module;
|
||||
int priority;
|
||||
};
|
||||
typedef struct orte_sensor_base_select_module_t orte_sensor_base_select_module_t;
|
||||
|
||||
|
||||
/**
|
||||
* Function for weeding out sensor components that don't want to run.
|
||||
*
|
||||
* Call the init function on all available components to find out if
|
||||
* they want to run. Select all components that don't fail. Failing
|
||||
* components will be closed and unloaded. The selected modules will
|
||||
* be returned to the caller in a opal_list_t.
|
||||
*/
|
||||
int orte_sensor_base_select(void)
|
||||
{
|
||||
mca_base_component_list_item_t *cli = NULL;
|
||||
mca_base_component_t *component = NULL;
|
||||
mca_base_module_t *module = NULL;
|
||||
orte_sensor_base_module_t *i_module;
|
||||
opal_list_item_t *item;
|
||||
int priority = 0, i, j, low_i;
|
||||
int exit_status = OPAL_SUCCESS;
|
||||
opal_pointer_array_t tmp_array;
|
||||
bool none_found;
|
||||
orte_sensor_base_select_module_t *tmp_module = NULL, *tmp_module_sw = NULL;
|
||||
|
||||
OBJ_CONSTRUCT(&tmp_array, opal_pointer_array_t);
|
||||
|
||||
opal_output_verbose(10, orte_sensor_base.output,
|
||||
"sensor:base:select: Auto-selecting components");
|
||||
|
||||
/*
|
||||
* Traverse the list of available components.
|
||||
* For each call their 'query' functions to determine relative priority.
|
||||
*/
|
||||
none_found = true;
|
||||
for (item = opal_list_get_first(&mca_sensor_base_components_available);
|
||||
item != opal_list_get_end(&mca_sensor_base_components_available);
|
||||
item = opal_list_get_next(item) ) {
|
||||
cli = (mca_base_component_list_item_t *) item;
|
||||
component = (mca_base_component_t *) cli->cli_component;
|
||||
|
||||
/*
|
||||
* If there is a query function then use it.
|
||||
*/
|
||||
if (NULL == component->mca_query_component) {
|
||||
opal_output_verbose(5, orte_sensor_base.output,
|
||||
"sensor:base:select Skipping component [%s]. It does not implement a query function",
|
||||
component->mca_component_name );
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Query this component for the module and priority
|
||||
*/
|
||||
opal_output_verbose(5, orte_sensor_base.output,
|
||||
"sensor:base:select Querying component [%s]",
|
||||
component->mca_component_name);
|
||||
|
||||
component->mca_query_component(&module, &priority);
|
||||
|
||||
/*
|
||||
* If no module was returned or negative priority, then skip component
|
||||
*/
|
||||
if (NULL == module || priority < 0) {
|
||||
opal_output_verbose(5, orte_sensor_base.output,
|
||||
"sensor:base:select Skipping component [%s]. Query failed to return a module",
|
||||
component->mca_component_name );
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Append them to the temporary list, we will sort later
|
||||
*/
|
||||
opal_output_verbose(5, orte_sensor_base.output,
|
||||
"sensor:base:select Query of component [%s] set priority to %d",
|
||||
component->mca_component_name, priority);
|
||||
tmp_module = (orte_sensor_base_select_module_t *)malloc(sizeof(orte_sensor_base_select_module_t));
|
||||
tmp_module->component = component;
|
||||
tmp_module->module = module;
|
||||
tmp_module->priority = priority;
|
||||
|
||||
opal_pointer_array_add(&tmp_array, (void*)tmp_module);
|
||||
none_found = false;
|
||||
}
|
||||
|
||||
if (none_found) {
|
||||
/* okay for no modules to be found */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Sort the list by decending priority
|
||||
*/
|
||||
priority = 0;
|
||||
for(j = 0; j < tmp_array.size; ++j) {
|
||||
tmp_module_sw = (orte_sensor_base_select_module_t*)opal_pointer_array_get_item(&tmp_array, j);
|
||||
if( NULL == tmp_module_sw ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
low_i = -1;
|
||||
priority = tmp_module_sw->priority;
|
||||
|
||||
for(i = 0; i < tmp_array.size; ++i) {
|
||||
tmp_module = (orte_sensor_base_select_module_t*)opal_pointer_array_get_item(&tmp_array, i);
|
||||
if( NULL == tmp_module ) {
|
||||
continue;
|
||||
}
|
||||
if( tmp_module->priority > priority ) {
|
||||
low_i = i;
|
||||
priority = tmp_module->priority;
|
||||
}
|
||||
}
|
||||
|
||||
if( low_i >= 0 ) {
|
||||
tmp_module = (orte_sensor_base_select_module_t*)opal_pointer_array_get_item(&tmp_array, low_i);
|
||||
opal_pointer_array_set_item(&tmp_array, low_i, NULL);
|
||||
j--; /* Try this entry again, if it is not the lowest */
|
||||
} else {
|
||||
tmp_module = tmp_module_sw;
|
||||
opal_pointer_array_set_item(&tmp_array, j, NULL);
|
||||
}
|
||||
opal_output_verbose(5, orte_sensor_base.output,
|
||||
"sensor:base:select Add module with priority [%s] %d",
|
||||
tmp_module->component->mca_component_name, tmp_module->priority);
|
||||
opal_pointer_array_add(&orte_sensor_base.modules, (void*)(tmp_module->module));
|
||||
free(tmp_module);
|
||||
}
|
||||
OBJ_DESTRUCT(&tmp_array);
|
||||
|
||||
/*
|
||||
* Initialize each of the modules
|
||||
*/
|
||||
for(i = 0; i < orte_sensor_base.modules.size; ++i) {
|
||||
i_module = (orte_sensor_base_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, i);
|
||||
if( NULL == i_module ) {
|
||||
continue;
|
||||
}
|
||||
if( NULL != i_module->init ) {
|
||||
i_module->init();
|
||||
}
|
||||
}
|
||||
|
||||
return exit_status;
|
||||
}
|
44
orte/mca/sensor/base/sensor_private.h
Обычный файл
44
orte/mca/sensor/base/sensor_private.h
Обычный файл
@ -0,0 +1,44 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/** @file:
|
||||
*/
|
||||
|
||||
#ifndef MCA_SENSOR_PRIVATE_H
|
||||
#define MCA_SENSOR_PRIVATE_H
|
||||
|
||||
/*
|
||||
* includes
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "opal/dss/dss_types.h"
|
||||
|
||||
#include "orte/mca/sensor/sensor_types.h"
|
||||
|
||||
|
||||
/*
|
||||
* Global functions for MCA overall collective open and close
|
||||
*/
|
||||
BEGIN_C_DECLS
|
||||
|
||||
#if !ORTE_DISABLE_FULL_SUPPORT
|
||||
|
||||
/* define a struct to hold framework-global values */
|
||||
typedef struct {
|
||||
/* output stream handle; set from opal_output_open() in orte_sensor_base_open() */
int output;
|
||||
/* modules appended by orte_sensor_base_select(); walked by the
 * framework start/stop/close routines */
opal_pointer_array_t modules;
|
||||
} orte_sensor_base_t;
|
||||
|
||||
ORTE_DECLSPEC extern orte_sensor_base_t orte_sensor_base;
|
||||
|
||||
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
||||
|
||||
END_C_DECLS
|
||||
#endif
|
37
orte/mca/sensor/file/Makefile.am
Обычный файл
37
orte/mca/sensor/file/Makefile.am
Обычный файл
@ -0,0 +1,37 @@
|
||||
#
|
||||
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_pkgdata_DATA = help-orte-sensor-file.txt
|
||||
|
||||
sources = \
|
||||
sensor_file.c \
|
||||
sensor_file.h \
|
||||
sensor_file_component.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if OMPI_BUILD_sensor_file_DSO
|
||||
component_noinst =
|
||||
component_install = mca_sensor_file.la
|
||||
else
|
||||
component_noinst = libmca_sensor_file.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(pkglibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_sensor_file_la_SOURCES = $(sources)
|
||||
mca_sensor_file_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_sensor_file_la_SOURCES =$(sources)
|
||||
libmca_sensor_file_la_LDFLAGS = -module -avoid-version
|
19
orte/mca/sensor/file/configure.m4
Обычный файл
19
orte/mca/sensor/file/configure.m4
Обычный файл
@ -0,0 +1,19 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_sensor_file_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_sensor_file_CONFIG], [
|
||||
# if we don't want sensors, don't compile
|
||||
# this component
|
||||
AS_IF([test "$orte_want_sensors" = "1"],
|
||||
[$1], [$2])
|
||||
])dnl
|
||||
|
14
orte/mca/sensor/file/configure.params
Обычный файл
14
orte/mca/sensor/file/configure.params
Обычный файл
@ -0,0 +1,14 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Specific to this module
|
||||
|
||||
PARAM_CONFIG_FILES="Makefile"
|
18
orte/mca/sensor/file/help-orte-sensor-file.txt
Обычный файл
18
orte/mca/sensor/file/help-orte-sensor-file.txt
Обычный файл
@ -0,0 +1,18 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English general help file for the file sensor
|
||||
#
|
||||
[file-stalled]
|
||||
A specified file is not changing, indicating a possibly stalled application:
|
||||
|
||||
File: %s
|
||||
Last size: %lu
|
||||
Last access: %sLast modification: %s
|
354
orte/mca/sensor/file/sensor_file.c
Обычный файл
354
orte/mca/sensor/file/sensor_file.c
Обычный файл
@ -0,0 +1,354 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stddef.h>
|
||||
#include <ctype.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#ifdef HAVE_NETDB_H
|
||||
#include <netdb.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_PARAM_H
|
||||
#include <sys/param.h>
|
||||
#endif
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <signal.h>
|
||||
#ifdef HAVE_TIME_H
|
||||
#include <time.h>
|
||||
#endif
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
#include "opal_stdint.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
#include "sensor_file.h"
|
||||
|
||||
/* declare the API functions */
|
||||
static int init(void);
|
||||
static void finalize(void);
|
||||
static void start(orte_jobid_t jobid);
|
||||
static void stop(orte_jobid_t jobid);
|
||||
|
||||
/* instantiate the module */
|
||||
orte_sensor_base_module_t orte_sensor_file_module = {
|
||||
init,
|
||||
finalize,
|
||||
start,
|
||||
stop
|
||||
};
|
||||
|
||||
/* define a tracking object */
|
||||
typedef struct {
|
||||
opal_list_item_t super;
|
||||
orte_jobid_t jobid;
|
||||
orte_vpid_t vpid;
|
||||
char *file;
|
||||
int tick;
|
||||
bool check_size;
|
||||
bool check_access;
|
||||
bool check_mod;
|
||||
int32_t file_size;
|
||||
time_t last_access;
|
||||
time_t last_mod;
|
||||
int limit;
|
||||
} file_tracker_t;
|
||||
static void ft_constructor(file_tracker_t *ft)
|
||||
{
|
||||
ft->file = NULL;
|
||||
ft->tick = 0;
|
||||
ft->file_size = 0;
|
||||
ft->last_access = 0;
|
||||
ft->last_mod = 0;
|
||||
ft->limit = 0;
|
||||
}
|
||||
static void ft_destructor(file_tracker_t *ft)
|
||||
{
|
||||
if (NULL != ft->file) {
|
||||
free(ft->file);
|
||||
}
|
||||
}
|
||||
OBJ_CLASS_INSTANCE(file_tracker_t,
|
||||
opal_list_item_t,
|
||||
ft_constructor, ft_destructor);
|
||||
|
||||
/* declare the local functions */
|
||||
static void sample(int fd, short event, void *arg);
|
||||
|
||||
/* local globals */
|
||||
static opal_event_t *sample_ev = NULL;
|
||||
static struct timeval sample_time;
|
||||
static opal_list_t jobs;
|
||||
|
||||
static int init(void)
|
||||
{
|
||||
OBJ_CONSTRUCT(&jobs, opal_list_t);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static void finalize(void)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
|
||||
if (NULL != sample_ev) {
|
||||
opal_event_del(sample_ev);
|
||||
free(sample_ev);
|
||||
sample_ev = NULL;
|
||||
}
|
||||
while (NULL != (item = opal_list_remove_first(&jobs))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&jobs);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Start monitoring of local processes
|
||||
*/
|
||||
static void start(orte_jobid_t jobid)
|
||||
{
|
||||
mca_base_component_t *c = &mca_sensor_file_component.super.base_version;
|
||||
opal_list_item_t *item;
|
||||
orte_odls_job_t *jobdat;
|
||||
orte_app_context_t *app;
|
||||
int rc, tmp;
|
||||
char *filename;
|
||||
file_tracker_t *ft;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"%s starting file monitoring for job %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jobid)));
|
||||
|
||||
/* get the local jobdat for this job */
|
||||
jobdat = NULL;
|
||||
for (item = opal_list_get_first(&orte_local_jobdata);
|
||||
item != opal_list_get_end(&orte_local_jobdata);
|
||||
item = opal_list_get_end(&orte_local_jobdata)) {
|
||||
jobdat = (orte_odls_job_t*)item;
|
||||
if (jobid == jobdat->jobid) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (NULL == jobdat) {
|
||||
/* no local procs for this job */
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"%s sensor:file no local procs for job %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jobid)));
|
||||
return;
|
||||
}
|
||||
|
||||
/* must be at least one app_context, so use the first */
|
||||
if (NULL == (app = jobdat->apps[0])) {
|
||||
/* got a problem */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return;
|
||||
}
|
||||
|
||||
/* search the environ to get the filename */
|
||||
if (ORTE_SUCCESS != (rc = mca_base_param_find_string(c, "filename", app->env, &filename))) {
|
||||
/* was a default file given */
|
||||
if (NULL == mca_sensor_file_component.file) {
|
||||
/* can't do anything without a file */
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"%s sensor:file no file for job %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jobid)));
|
||||
return;
|
||||
}
|
||||
filename = mca_sensor_file_component.file;
|
||||
}
|
||||
|
||||
/* create the tracking object */
|
||||
ft = OBJ_NEW(file_tracker_t);
|
||||
ft->jobid = jobid;
|
||||
ft->file = strdup(filename);
|
||||
|
||||
/* search the environ to see what we are checking */
|
||||
tmp = 0;
|
||||
if (ORTE_SUCCESS != (rc = mca_base_param_find_int(c, "check_size", app->env, &tmp))) {
|
||||
/* was a default value given */
|
||||
if (0 < mca_sensor_file_component.check_size) {
|
||||
ft->check_size = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_size);
|
||||
}
|
||||
} else {
|
||||
ft->check_size = OPAL_INT_TO_BOOL(tmp);
|
||||
}
|
||||
tmp = 0;
|
||||
if (ORTE_SUCCESS != (rc = mca_base_param_find_int(c, "check_access", app->env, &tmp))) {
|
||||
/* was a default value given */
|
||||
if (0 < mca_sensor_file_component.check_access) {
|
||||
ft->check_access = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_access);
|
||||
}
|
||||
} else {
|
||||
ft->check_access = OPAL_INT_TO_BOOL(tmp);
|
||||
}
|
||||
tmp = 0;
|
||||
if (ORTE_SUCCESS != (rc = mca_base_param_find_int(c, "check_mod", app->env, &tmp))) {
|
||||
/* was a default value given */
|
||||
if (0 < mca_sensor_file_component.check_mod) {
|
||||
ft->check_mod = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_mod);
|
||||
}
|
||||
} else {
|
||||
ft->check_mod = OPAL_INT_TO_BOOL(tmp);
|
||||
}
|
||||
tmp = 0;
|
||||
if (ORTE_SUCCESS != (rc = mca_base_param_find_int(c, "limit", app->env, &tmp))) {
|
||||
ft->limit = mca_sensor_file_component.limit;
|
||||
} else {
|
||||
ft->limit = tmp;
|
||||
}
|
||||
opal_list_append(&jobs, &ft->super);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"%s file %s monitored for %s%s%s with limit %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ft->file, ft->check_size ? "SIZE:" : " ",
|
||||
ft->check_access ? "ACCESS TIME:" : " ",
|
||||
ft->check_mod ? "MOD TIME" : " ", ft->limit));
|
||||
|
||||
/* start sampling */
|
||||
if (NULL == sample_ev) {
|
||||
/* startup a timer to wake us up periodically
|
||||
* for a data sample
|
||||
*/
|
||||
sample_ev = (opal_event_t*)malloc(sizeof(opal_event_t));
|
||||
opal_evtimer_set(sample_ev, sample, sample_ev);
|
||||
sample_time.tv_sec = mca_sensor_file_component.sample_rate;
|
||||
sample_time.tv_usec = 0;
|
||||
opal_evtimer_add(sample_ev, &sample_time);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Stop monitoring a job: drop the first tracker registered for it and,
 * once no trackers are left, cancel the sampling timer.
 */
static void stop(orte_jobid_t jobid)
{
    opal_list_item_t *cur = opal_list_get_first(&jobs);

    while (cur != opal_list_get_end(&jobs)) {
        if (jobid == ((file_tracker_t*)cur)->jobid) {
            opal_list_remove_item(&jobs, cur);
            OBJ_RELEASE(cur);
            break;
        }
        cur = opal_list_get_next(cur);
    }

    /* keep sampling while any job is still being tracked */
    if (!opal_list_is_empty(&jobs) || NULL == sample_ev) {
        return;
    }
    opal_event_del(sample_ev);
    free(sample_ev);
    sample_ev = NULL;
}
|
||||
|
||||
static void sample(int fd, short event, void *arg)
|
||||
{
|
||||
struct stat buf;
|
||||
opal_list_item_t *item;
|
||||
file_tracker_t *ft;
|
||||
|
||||
/* if we are not sampling any more, then just return */
|
||||
if (NULL == sample_ev) {
|
||||
return;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"%s sampling files",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
for (item = opal_list_get_first(&jobs);
|
||||
item != opal_list_get_end(&jobs);
|
||||
item = opal_list_get_next(item)) {
|
||||
ft = (file_tracker_t*)item;
|
||||
|
||||
/* stat the file and get its size */
|
||||
if (0 > stat(ft->file, &buf)) {
|
||||
/* cannot stat file */
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"%s could not stat %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ft->file));
|
||||
continue;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"%s size %lu access %s\tmod %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(unsigned long)buf.st_size, ctime(&buf.st_atime), ctime(&buf.st_mtime)));
|
||||
|
||||
if (ft->check_size) {
|
||||
if (buf.st_size == ft->file_size) {
|
||||
ft->tick++;
|
||||
goto CHECK;
|
||||
} else {
|
||||
ft->tick = 0;
|
||||
ft->file_size = buf.st_size;
|
||||
}
|
||||
}
|
||||
if (ft->check_access) {
|
||||
if (buf.st_atime == ft->last_access) {
|
||||
ft->tick++;
|
||||
goto CHECK;
|
||||
} else {
|
||||
ft->tick = 0;
|
||||
ft->last_access = buf.st_atime;
|
||||
}
|
||||
}
|
||||
if (ft->check_mod) {
|
||||
if (buf.st_mtime == ft->last_mod) {
|
||||
ft->tick++;
|
||||
goto CHECK;
|
||||
} else {
|
||||
ft->tick = 0;
|
||||
ft->last_mod = buf.st_mtime;
|
||||
}
|
||||
}
|
||||
|
||||
CHECK:
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"%s sampled file %s tick %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ft->file, ft->tick));
|
||||
|
||||
if (ft->tick == ft->limit) {
|
||||
orte_show_help("help-orte-sensor-file.txt", "file-stalled", true,
|
||||
ft->file, ft->file_size, ctime(&ft->last_access), ctime(&ft->last_mod));
|
||||
orte_errmgr.update_state(ft->jobid, ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED,
|
||||
NULL, ORTE_PROC_STATE_UNDEF,
|
||||
ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
}
|
||||
}
|
||||
|
||||
/* restart the timer */
|
||||
opal_evtimer_add(sample_ev, &sample_time);
|
||||
}
|
41
orte/mca/sensor/file/sensor_file.h
Обычный файл
41
orte/mca/sensor/file/sensor_file.h
Обычный файл
@ -0,0 +1,41 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* File movement sensor
|
||||
*/
|
||||
#ifndef ORTE_SENSOR_FILE_H
|
||||
#define ORTE_SENSOR_FILE_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
struct orte_sensor_file_component_t {
|
||||
orte_sensor_base_component_t super;
|
||||
int sample_rate;
|
||||
char *file;
|
||||
bool check_size;
|
||||
bool check_access;
|
||||
bool check_mod;
|
||||
int limit;
|
||||
};
|
||||
typedef struct orte_sensor_file_component_t orte_sensor_file_component_t;
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_sensor_file_component_t mca_sensor_file_component;
|
||||
extern orte_sensor_base_module_t orte_sensor_file_module;
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
107
orte/mca/sensor/file/sensor_file_component.c
Обычный файл
107
orte/mca/sensor/file/sensor_file_component.c
Обычный файл
@ -0,0 +1,107 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/show_help.h"
|
||||
|
||||
#include "sensor_file.h"
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
|
||||
static int orte_sensor_file_open(void);
|
||||
static int orte_sensor_file_close(void);
|
||||
static int orte_sensor_file_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
orte_sensor_file_component_t mca_sensor_file_component = {
|
||||
{
|
||||
{
|
||||
ORTE_SENSOR_BASE_VERSION_1_0_0,
|
||||
|
||||
"file", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
orte_sensor_file_open, /* component open */
|
||||
orte_sensor_file_close, /* component close */
|
||||
orte_sensor_file_query /* component query */
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* component open/close/init function
|
||||
*/
|
||||
static int orte_sensor_file_open(void)
|
||||
{
|
||||
mca_base_component_t *c = &mca_sensor_file_component.super.base_version;
|
||||
int tmp;
|
||||
|
||||
/* lookup parameters */
|
||||
mca_base_param_reg_int(c, "sample_rate",
|
||||
"Sample rate in seconds (default=10)",
|
||||
false, false, 10, &mca_sensor_file_component.sample_rate);
|
||||
|
||||
mca_base_param_reg_string(c, "filename",
|
||||
"File to be monitored",
|
||||
false, false, NULL, &mca_sensor_file_component.file);
|
||||
|
||||
mca_base_param_reg_int(c, "check_size",
|
||||
"Check the file size",
|
||||
false, false, false, &tmp);
|
||||
mca_sensor_file_component.check_size = OPAL_INT_TO_BOOL(tmp);
|
||||
|
||||
mca_base_param_reg_int(c, "check_access",
|
||||
"Check access time",
|
||||
false, false, false, &tmp);
|
||||
mca_sensor_file_component.check_access = OPAL_INT_TO_BOOL(tmp);
|
||||
|
||||
mca_base_param_reg_int(c, "check_mod",
|
||||
"Check modification time",
|
||||
false, false, false, &tmp);
|
||||
mca_sensor_file_component.check_mod = OPAL_INT_TO_BOOL(tmp);
|
||||
|
||||
mca_base_param_reg_int(c, "limit",
|
||||
"Number of times the sensor can detect no motion before declaring error (default=3)",
|
||||
false, false, 3, &mca_sensor_file_component.limit);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Component query: hand back the file sensor module with priority 0.
 */
static int orte_sensor_file_query(mca_base_module_t **module, int *priority)
{
    /* select only if specified */
    *priority = 0;
    *module = (mca_base_module_t *)&orte_sensor_file_module;

    return ORTE_SUCCESS;
}
|
||||
|
||||
/**
|
||||
* Close all subsystems.
|
||||
*/
|
||||
|
||||
static int orte_sensor_file_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
37
orte/mca/sensor/memusage/Makefile.am
Обычный файл
37
orte/mca/sensor/memusage/Makefile.am
Обычный файл
@ -0,0 +1,37 @@
|
||||
#
|
||||
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_pkgdata_DATA = help-orte-sensor-memusage.txt
|
||||
|
||||
sources = \
|
||||
sensor_memusage.c \
|
||||
sensor_memusage.h \
|
||||
sensor_memusage_component.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if OMPI_BUILD_sensor_memusage_DSO
|
||||
component_noinst =
|
||||
component_install = mca_sensor_memusage.la
|
||||
else
|
||||
component_noinst = libmca_sensor_memusage.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(pkglibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_sensor_memusage_la_SOURCES = $(sources)
|
||||
mca_sensor_memusage_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_sensor_memusage_la_SOURCES =$(sources)
|
||||
libmca_sensor_memusage_la_LDFLAGS = -module -avoid-version
|
19
orte/mca/sensor/memusage/configure.m4
Обычный файл
19
orte/mca/sensor/memusage/configure.m4
Обычный файл
@ -0,0 +1,19 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_sensor_memusage_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_sensor_memusage_CONFIG], [
    # sensors are optional, so this component is only
    # compiled when orte_want_sensors is set to 1
    AS_IF([test "$orte_want_sensors" = "1"], [$1], [$2])
])dnl
|
||||
|
14
orte/mca/sensor/memusage/configure.params
Обычный файл
14
orte/mca/sensor/memusage/configure.params
Обычный файл
@ -0,0 +1,14 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Specific to this module
|
||||
|
||||
PARAM_CONFIG_FILES="Makefile"
|
20
orte/mca/sensor/memusage/help-orte-sensor-memusage.txt
Обычный файл
20
orte/mca/sensor/memusage/help-orte-sensor-memusage.txt
Обычный файл
@ -0,0 +1,20 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English general help file for the memory usage sensor
|
||||
#
|
||||
[mem-limit-exceeded]
|
||||
A process has exceeded the specified limit on memory usage:
|
||||
|
||||
Node: %s
|
||||
Process rank: %s
|
||||
Memory used: %luGbytes
|
||||
Memory limit: %luGbytes
|
||||
|
264
orte/mca/sensor/memusage/sensor_memusage.c
Обычный файл
264
orte/mca/sensor/memusage/sensor_memusage.c
Обычный файл
@ -0,0 +1,264 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#include <errno.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
#include <stdio.h>
|
||||
|
||||
#include "opal_stdint.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/pstat/pstat.h"
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
#include "sensor_memusage.h"
|
||||
|
||||
/* declare the API functions */
|
||||
static int init(void);
|
||||
static void finalize(void);
|
||||
static void start(orte_jobid_t job);
|
||||
static void stop(orte_jobid_t job);
|
||||
|
||||
/* instantiate the module */
|
||||
orte_sensor_base_module_t orte_sensor_memusage_module = {
|
||||
init,
|
||||
finalize,
|
||||
start,
|
||||
stop
|
||||
};
|
||||
|
||||
/* define a tracking object */
|
||||
typedef struct {
|
||||
opal_list_item_t super;
|
||||
orte_jobid_t jobid;
|
||||
unsigned long memory_limit;
|
||||
} memusage_tracker_t;
|
||||
static void constructor(memusage_tracker_t *ptr)
|
||||
{
|
||||
ptr->memory_limit = 0;
|
||||
}
|
||||
OBJ_CLASS_INSTANCE(memusage_tracker_t,
|
||||
opal_list_item_t,
|
||||
constructor, NULL);
|
||||
|
||||
/* declare the local functions */
|
||||
static void sample(int fd, short event, void *arg);
|
||||
|
||||
/* local globals */
|
||||
static opal_event_t *sample_ev = NULL;
|
||||
static opal_list_t jobs;
|
||||
static struct timeval sample_time;
|
||||
|
||||
static int init(void)
|
||||
{
|
||||
OBJ_CONSTRUCT(&jobs, opal_list_t);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static void finalize(void)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
|
||||
if (NULL != sample_ev) {
|
||||
opal_event_del(sample_ev);
|
||||
free(sample_ev);
|
||||
sample_ev = NULL;
|
||||
}
|
||||
while (NULL != (item = opal_list_remove_first(&jobs))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&jobs);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Start monitoring of local processes
|
||||
*/
|
||||
static void start(orte_jobid_t jobid)
|
||||
{
|
||||
mca_base_component_t *c = &mca_sensor_memusage_component.super.base_version;
|
||||
memusage_tracker_t *job;
|
||||
orte_odls_job_t *jobdat;
|
||||
orte_app_context_t *app;
|
||||
opal_list_item_t *item;
|
||||
int rc, tmp;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"%s starting memory monitoring for job %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jobid)));
|
||||
|
||||
/* get the local jobdat for this job */
|
||||
jobdat = NULL;
|
||||
for (item = opal_list_get_first(&orte_local_jobdata);
|
||||
item != opal_list_get_end(&orte_local_jobdata);
|
||||
item = opal_list_get_end(&orte_local_jobdata)) {
|
||||
jobdat = (orte_odls_job_t*)item;
|
||||
if (jobid == jobdat->jobid) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (NULL == jobdat) {
|
||||
/* no local procs for this job */
|
||||
return;
|
||||
}
|
||||
|
||||
/* must be at least one app_context, so use the first */
|
||||
if (NULL == (app = jobdat->apps[0])) {
|
||||
/* got a problem */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return;
|
||||
}
|
||||
|
||||
/* search the environ to get memory limit */
|
||||
tmp = 0;
|
||||
if (ORTE_SUCCESS != (rc = mca_base_param_find_int(c, "memory_limit", app->env, &tmp))) {
|
||||
/* was a default value given */
|
||||
if (0 < mca_sensor_memusage_component.memory_limit) {
|
||||
tmp = mca_sensor_memusage_component.memory_limit;
|
||||
}
|
||||
}
|
||||
if (tmp <= 0) {
|
||||
/* we don't want to monitor this job */
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"%s memory monitoring for job %s is not requested",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jobid)));
|
||||
return;
|
||||
}
|
||||
|
||||
job = OBJ_NEW(memusage_tracker_t);
|
||||
job->jobid = jobid;
|
||||
job->memory_limit = tmp;
|
||||
opal_list_append(&jobs, &job->super);
|
||||
|
||||
if (NULL == sample_ev) {
|
||||
/* startup a timer to wake us up periodically
|
||||
* for a data sample
|
||||
*/
|
||||
sample_ev = (opal_event_t*)malloc(sizeof(opal_event_t));
|
||||
opal_evtimer_set(sample_ev, sample, sample_ev);
|
||||
sample_time.tv_sec = mca_sensor_memusage_component.sample_rate;
|
||||
sample_time.tv_usec = 0;
|
||||
opal_evtimer_add(sample_ev, &sample_time);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Stop monitoring a job: drop the first tracker registered for it and,
 * once no trackers are left, cancel the sampling timer.
 */
static void stop(orte_jobid_t jobid)
{
    opal_list_item_t *cur = opal_list_get_first(&jobs);

    while (cur != opal_list_get_end(&jobs)) {
        if (jobid == ((memusage_tracker_t*)cur)->jobid) {
            opal_list_remove_item(&jobs, cur);
            OBJ_RELEASE(cur);
            break;
        }
        cur = opal_list_get_next(cur);
    }

    /* keep sampling while any job is still being tracked */
    if (!opal_list_is_empty(&jobs) || NULL == sample_ev) {
        return;
    }
    opal_event_del(sample_ev);
    free(sample_ev);
    sample_ev = NULL;
}
|
||||
|
||||
static void sample(int fd, short event, void *arg)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
orte_odls_child_t *child;
|
||||
opal_pstats_t stats;
|
||||
int rc;
|
||||
memusage_tracker_t *job;
|
||||
bool monitored;
|
||||
|
||||
/* if we are not sampling any more, then just return */
|
||||
if (NULL == sample_ev) {
|
||||
return;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"sample:memusage sampling resource usage"));
|
||||
|
||||
/* loop through our local children */
|
||||
for (item = opal_list_get_first(&orte_local_children);
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_odls_child_t*)item;
|
||||
|
||||
/* is this in a job we are monitoring */
|
||||
monitored = false;
|
||||
for (item = opal_list_get_first(&jobs);
|
||||
item != opal_list_get_end(&jobs);
|
||||
item = opal_list_get_next(item)) {
|
||||
job = (memusage_tracker_t*)item;
|
||||
if (child->name->jobid == job->jobid) {
|
||||
monitored = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!monitored) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* get the process resource utilization stats */
|
||||
OBJ_CONSTRUCT(&stats, opal_pstats_t);
|
||||
if (ORTE_SUCCESS != (rc = opal_pstat.query(child->pid, &stats))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&stats);
|
||||
continue;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"sample:memusage got memory size of %lu Gbytes for proc %s",
|
||||
(unsigned long)stats.vsize/1000000, ORTE_NAME_PRINT(child->name)));
|
||||
|
||||
/* check the memory size for limit */
|
||||
if ((stats.vsize/1000000) > job->memory_limit) {
|
||||
/* memory limit exceeded */
|
||||
orte_show_help("help-orte-sensor-memusage.txt", "mem-limit-exceeded",
|
||||
true, orte_process_info.nodename, ORTE_VPID_PRINT(child->name->vpid),
|
||||
(unsigned long)stats.vsize/1000000, (unsigned long)job->memory_limit);
|
||||
orte_errmgr.update_state(child->name->jobid, ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED,
|
||||
child->name, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED,
|
||||
ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
}
|
||||
OBJ_DESTRUCT(&stats);
|
||||
}
|
||||
|
||||
/* restart the timer */
|
||||
opal_evtimer_add(sample_ev, &sample_time);
|
||||
}
|
37
orte/mca/sensor/memusage/sensor_memusage.h
Обычный файл
37
orte/mca/sensor/memusage/sensor_memusage.h
Обычный файл
@ -0,0 +1,37 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* Process Resource Utilization sensor
|
||||
*/
|
||||
#ifndef ORTE_SENSOR_MEMUSAGE_H
|
||||
#define ORTE_SENSOR_MEMUSAGE_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
struct orte_sensor_memusage_component_t {
|
||||
orte_sensor_base_component_t super;
|
||||
int sample_rate;
|
||||
uint64_t memory_limit;
|
||||
};
|
||||
typedef struct orte_sensor_memusage_component_t orte_sensor_memusage_component_t;
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_sensor_memusage_component_t mca_sensor_memusage_component;
|
||||
extern orte_sensor_base_module_t orte_sensor_memusage_module;
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
94
orte/mca/sensor/memusage/sensor_memusage_component.c
Обычный файл
94
orte/mca/sensor/memusage/sensor_memusage_component.c
Обычный файл
@ -0,0 +1,94 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/show_help.h"
|
||||
|
||||
#include "sensor_memusage.h"
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
|
||||
static int orte_sensor_memusage_open(void);
|
||||
static int orte_sensor_memusage_close(void);
|
||||
static int orte_sensor_memusage_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
orte_sensor_memusage_component_t mca_sensor_memusage_component = {
|
||||
{
|
||||
{
|
||||
ORTE_SENSOR_BASE_VERSION_1_0_0,
|
||||
|
||||
"memusage", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
orte_sensor_memusage_open, /* component open */
|
||||
orte_sensor_memusage_close, /* component close */
|
||||
orte_sensor_memusage_query /* component query */
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* component open/close/init function
|
||||
*/
|
||||
static int orte_sensor_memusage_open(void)
|
||||
{
|
||||
mca_base_component_t *c = &mca_sensor_memusage_component.super.base_version;
|
||||
int tmp;
|
||||
|
||||
/* lookup parameters */
|
||||
mca_base_param_reg_int(c, "sample_rate",
|
||||
"Sample rate in seconds (default=10)",
|
||||
false, false, 10, &tmp);
|
||||
if (tmp < 0) {
|
||||
opal_output(0, "Illegal value %d - must be > 0", tmp);
|
||||
return ORTE_ERR_FATAL;
|
||||
}
|
||||
mca_sensor_memusage_component.sample_rate = tmp;
|
||||
|
||||
mca_base_param_reg_int(c, "memory_limit",
|
||||
"Max virtual memory size in GBytes",
|
||||
false, false, 0, &tmp);
|
||||
mca_sensor_memusage_component.memory_limit = tmp;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Component query: hand back the memory-usage sensor module with priority 0.
 */
static int orte_sensor_memusage_query(mca_base_module_t **module, int *priority)
{
    /* select only if specified */
    *priority = 0;
    *module = (mca_base_module_t *)&orte_sensor_memusage_module;

    return ORTE_SUCCESS;
}
|
||||
|
||||
/**
|
||||
* Close all subsystems.
|
||||
*/
|
||||
|
||||
static int orte_sensor_memusage_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
97
orte/mca/sensor/sensor.h
Обычный файл
97
orte/mca/sensor/sensor.h
Обычный файл
@ -0,0 +1,97 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* @file:
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef MCA_SENSOR_H
|
||||
#define MCA_SENSOR_H
|
||||
|
||||
/*
|
||||
* includes
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* Component functions - all MUST be provided!
|
||||
*/
|
||||
|
||||
/* initialize the selected module */
|
||||
typedef int (*orte_sensor_base_module_init_fn_t)(void);
|
||||
|
||||
/* finalize the selected module */
|
||||
typedef void (*orte_sensor_base_module_finalize_fn_t)(void);
|
||||
|
||||
/* start collecting data */
|
||||
typedef void (*orte_sensor_base_module_start_fn_t)(orte_jobid_t jobid);
|
||||
|
||||
/* stop collecting data */
|
||||
typedef void (*orte_sensor_base_module_stop_fn_t)(orte_jobid_t jobid);
|
||||
|
||||
/* API module */
|
||||
/*
|
||||
* Ver 1.0
|
||||
*/
|
||||
struct orte_sensor_base_API_module_1_0_0_t {
|
||||
orte_sensor_base_module_start_fn_t start;
|
||||
orte_sensor_base_module_stop_fn_t stop;
|
||||
};
|
||||
|
||||
typedef struct orte_sensor_base_API_module_1_0_0_t orte_sensor_base_API_module_1_0_0_t;
|
||||
typedef orte_sensor_base_API_module_1_0_0_t orte_sensor_base_API_module_t;
|
||||
|
||||
|
||||
/*
|
||||
* Component modules Ver 1.0
|
||||
*/
|
||||
struct orte_sensor_base_module_1_0_0_t {
|
||||
orte_sensor_base_module_init_fn_t init;
|
||||
orte_sensor_base_module_finalize_fn_t finalize;
|
||||
orte_sensor_base_module_start_fn_t start;
|
||||
orte_sensor_base_module_stop_fn_t stop;
|
||||
};
|
||||
|
||||
typedef struct orte_sensor_base_module_1_0_0_t orte_sensor_base_module_1_0_0_t;
|
||||
typedef orte_sensor_base_module_1_0_0_t orte_sensor_base_module_t;
|
||||
|
||||
/*
|
||||
* the standard component data structure
|
||||
*/
|
||||
struct orte_sensor_base_component_1_0_0_t {
|
||||
mca_base_component_t base_version;
|
||||
mca_base_component_data_t base_data;
|
||||
};
|
||||
typedef struct orte_sensor_base_component_1_0_0_t orte_sensor_base_component_1_0_0_t;
|
||||
typedef orte_sensor_base_component_1_0_0_t orte_sensor_base_component_t;
|
||||
|
||||
|
||||
|
||||
/*
 * Macro for use in components that are of type sensor v1.0.0.
 *
 * Expands to a comma-separated initializer fragment: the MCA v2.0
 * version fields, then the framework name and the sensor version triple.
 */
#define ORTE_SENSOR_BASE_VERSION_1_0_0 \
    /* sensor v1.0 is chained to MCA v2.0 */ \
    MCA_BASE_VERSION_2_0_0, \
    /* sensor v1.0 */ \
    "sensor", 1, 0, 0
|
||||
|
||||
/* Global structure for accessing sensor functions
|
||||
*/
|
||||
ORTE_DECLSPEC extern orte_sensor_base_API_module_t orte_sensor; /* holds API function pointers */
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* MCA_SENSOR_H */
|
50
orte/mca/sensor/sensor_types.h
Обычный файл
50
orte/mca/sensor/sensor_types.h
Обычный файл
@ -0,0 +1,50 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/** @file:
|
||||
*/
|
||||
|
||||
#ifndef ORTE_MCA_SENSOR_TYPES_H
|
||||
#define ORTE_MCA_SENSOR_TYPES_H
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif /* HAVE_SYS_TIME_H */
|
||||
|
||||
#include "opal/dss/dss_types.h"
|
||||
|
||||
/*
|
||||
* General SENSOR types - instanced in runtime/orte_globals.c
|
||||
*/
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
 * Sensor scale identifiers.  The enumerators take the default
 * consecutive values starting at 0.
 * NOTE(review): the names suggest linear / logarithmic / sigmoid scaling;
 * no user of these values is visible here - confirm intended use.
 */
enum {
    ORTE_SENSOR_SCALE_LINEAR,
    ORTE_SENSOR_SCALE_LOG,
    ORTE_SENSOR_SCALE_SIGMOID
};
|
||||
|
||||
/*
|
||||
* Structure for passing data from sensors
|
||||
*/
|
||||
typedef struct {
|
||||
opal_object_t super;
|
||||
char *sensor;
|
||||
struct timeval timestamp;
|
||||
opal_byte_object_t data;
|
||||
} orte_sensor_data_t;
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_sensor_data_t);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
@ -305,6 +305,20 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src,
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the max local restarts */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
|
||||
(void*)(&(jobs[i]->max_local_restarts)), 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the max global restarts */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
|
||||
(void*)(&(jobs[i]->max_global_restarts)), 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
/* pack the ckpt state */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
|
||||
@ -491,6 +505,13 @@ int orte_dt_pack_proc(opal_buffer_t *buffer, const void *src,
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the number of relocates */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
|
||||
(void*)&(procs[i]->relocates), 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
/* pack the ckpt state */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
|
||||
|
@ -27,6 +27,7 @@
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "opal/dss/dss.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/error_strings.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "orte/runtime/data_type_support/orte_dt_support.h"
|
||||
@ -214,11 +215,10 @@ int orte_dt_print_job(char **output, char *prefix, orte_job_t *src, opal_data_ty
|
||||
asprintf(&pfx2, "%s", prefix);
|
||||
}
|
||||
|
||||
asprintf(&tmp, "\n%sData for job: %s\tNum apps: %ld\tControls: %0x\tStdin target: %s\tState: %0x\tAbort: %s", pfx2,
|
||||
asprintf(&tmp, "\n%sData for job: %s\tNum apps: %ld\tControls: %0x\tStdin target: %s\tState: %s\tAbort: %s", pfx2,
|
||||
ORTE_JOBID_PRINT(src->jobid),
|
||||
(long)src->num_apps, src->controls, ORTE_VPID_PRINT(src->stdin_target),
|
||||
src->state, src->abort ? "True" : "False");
|
||||
|
||||
orte_job_state_to_str(src->state), src->abort ? "True" : "False");
|
||||
asprintf(&pfx, "%s\t", pfx2);
|
||||
free(pfx2);
|
||||
|
||||
@ -248,7 +248,8 @@ int orte_dt_print_job(char **output, char *prefix, orte_job_t *src, opal_data_ty
|
||||
tmp = tmp2;
|
||||
}
|
||||
|
||||
asprintf(&tmp2, "%s\n%sNum procs: %ld\tMax Restarts: %d", tmp, pfx, (long)src->num_procs, src->max_restarts);
|
||||
asprintf(&tmp2, "%s\n%sNum procs: %ld\tMax Local Restarts: %d\tMax Global Restarts", tmp, pfx,
|
||||
(long)src->num_procs, src->max_local_restarts, src->max_global_restarts);
|
||||
free(tmp);
|
||||
tmp = tmp2;
|
||||
|
||||
@ -448,30 +449,6 @@ PRINT_PROCS:
|
||||
/*
|
||||
* PROC
|
||||
*/
|
||||
static char* orte_dt_print_proc_state(orte_proc_state_t state)
|
||||
{
|
||||
switch(state) {
|
||||
case ORTE_PROC_STATE_INIT:
|
||||
return "init";
|
||||
case ORTE_PROC_STATE_LAUNCHED:
|
||||
return "launched";
|
||||
case ORTE_PROC_STATE_RUNNING:
|
||||
return "running";
|
||||
case ORTE_PROC_STATE_TERMINATED:
|
||||
return "terminated";
|
||||
case ORTE_PROC_STATE_ABORTED:
|
||||
return "aborted";
|
||||
case ORTE_PROC_STATE_FAILED_TO_START:
|
||||
return "failed-to-start";
|
||||
case ORTE_PROC_STATE_ABORTED_BY_SIG:
|
||||
return "aborted-by-signal";
|
||||
case ORTE_PROC_STATE_TERM_WO_SYNC:
|
||||
return "terminated-without-sync";
|
||||
default:
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_type_t type)
|
||||
{
|
||||
char *tmp, *tmp2, *pfx2;
|
||||
@ -488,22 +465,12 @@ int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_
|
||||
|
||||
if (orte_xml_output) {
|
||||
/* need to create the output in XML format */
|
||||
tmp = orte_dt_print_proc_state(src->state);
|
||||
if (NULL == tmp) {
|
||||
if (0 == src->pid) {
|
||||
asprintf(output, "%s<process rank=\"%s\"/>\n", pfx2, ORTE_VPID_PRINT(src->name.vpid));
|
||||
} else {
|
||||
asprintf(output, "%s<process rank=\"%s\" pid=\"%d\"/>\n", pfx2,
|
||||
ORTE_VPID_PRINT(src->name.vpid), (int)src->pid);
|
||||
}
|
||||
} else {
|
||||
if (0 == src->pid) {
|
||||
asprintf(output, "%s<process rank=\"%s\" status=\"%s\"/>\n", pfx2,
|
||||
ORTE_VPID_PRINT(src->name.vpid), tmp);
|
||||
ORTE_VPID_PRINT(src->name.vpid), orte_proc_state_to_str(src->state));
|
||||
} else {
|
||||
asprintf(output, "%s<process rank=\"%s\" pid=\"%d\" status=\"%s\"/>\n", pfx2,
|
||||
ORTE_VPID_PRINT(src->name.vpid), (int)src->pid, tmp);
|
||||
}
|
||||
ORTE_VPID_PRINT(src->name.vpid), (int)src->pid, orte_proc_state_to_str(src->state));
|
||||
}
|
||||
free(pfx2);
|
||||
return ORTE_SUCCESS;
|
||||
@ -527,8 +494,8 @@ int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_
|
||||
free(tmp);
|
||||
tmp = tmp2;
|
||||
|
||||
asprintf(&tmp2, "%s\n%s\tState: %0x\tRestarts: %d\tApp_context: %ld\tSlot list: %s", tmp, pfx2,
|
||||
src->state, src->restarts, (long)src->app_idx,
|
||||
asprintf(&tmp2, "%s\n%s\tState: %s\tRestarts: %d\tRelocates: %d\tApp_context: %ld\tSlot list: %s", tmp, pfx2,
|
||||
orte_proc_state_to_str(src->state), src->restarts, src->relocates, (long)src->app_idx,
|
||||
(NULL == src->slot_list) ? "NULL" : src->slot_list);
|
||||
free(tmp);
|
||||
|
||||
|
@ -307,6 +307,22 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest,
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* unpack the max local restarts */
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
|
||||
(&(jobs[i]->max_local_restarts)), &n, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* unpack the max global restarts */
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
|
||||
(&(jobs[i]->max_global_restarts)), &n, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
/* unpack the ckpt state */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
|
||||
@ -522,6 +538,14 @@ int orte_dt_unpack_proc(opal_buffer_t *buffer, void *dest,
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* unpack the number of relocates */
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
|
||||
(&(procs[i]->relocates)), &n, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
/* unpack the ckpt state */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
|
||||
|
@ -640,7 +640,8 @@ static void orte_job_construct(orte_job_t* job)
|
||||
OBJ_CONSTRUCT(&job->reported_cond, opal_condition_t);
|
||||
job->not_reported = true;
|
||||
|
||||
job->max_restarts = INT32_MAX;
|
||||
job->max_local_restarts = 0;
|
||||
job->max_global_restarts = 0;
|
||||
|
||||
job->launch_msg_sent.tv_sec = 0;
|
||||
job->launch_msg_sent.tv_usec = 0;
|
||||
@ -833,6 +834,7 @@ static void orte_proc_construct(orte_proc_t* proc)
|
||||
proc->rml_uri = NULL;
|
||||
proc->beat = 0;
|
||||
proc->restarts = 0;
|
||||
proc->relocates = 0;
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
proc->ckpt_state = 0;
|
||||
proc->ckpt_snapshot_ref = NULL;
|
||||
|
@ -390,8 +390,10 @@ typedef struct {
|
||||
bool abort;
|
||||
/* proc that caused that to happen */
|
||||
struct orte_proc_t *aborted_proc;
|
||||
/* max number of times a process can be restarted */
|
||||
int32_t max_restarts;
|
||||
/* max number of times a process can be restarted locally */
|
||||
int32_t max_local_restarts;
|
||||
/* max number of times a process can be relocated to another node */
|
||||
int32_t max_global_restarts;
|
||||
/* time launch message was sent */
|
||||
struct timeval launch_msg_sent;
|
||||
/* max time for launch msg to be received */
|
||||
@ -450,6 +452,8 @@ struct orte_proc_t {
|
||||
time_t beat;
|
||||
/* number of times this process has been restarted */
|
||||
int32_t restarts;
|
||||
/* number of times this process has been relocated */
|
||||
int32_t relocates;
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
/* ckpt state */
|
||||
size_t ckpt_state;
|
||||
|
@ -89,6 +89,10 @@
|
||||
#include "orte/mca/snapc/snapc.h"
|
||||
#include "orte/mca/snapc/base/base.h"
|
||||
#endif
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#endif
|
||||
#include "orte/mca/filem/filem.h"
|
||||
#include "orte/mca/filem/base/base.h"
|
||||
#endif
|
||||
@ -432,6 +436,16 @@ void orte_info_open_components(void)
|
||||
opal_pointer_array_add(&component_map, map);
|
||||
#endif
|
||||
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
if (ORTE_SUCCESS != orte_sensor_base_open()) {
|
||||
goto error;
|
||||
}
|
||||
map = OBJ_NEW(orte_info_component_map_t);
|
||||
map->type = strdup("sensor");
|
||||
map->components = &mca_sensor_base_components_available;
|
||||
opal_pointer_array_add(&component_map, map);
|
||||
#endif
|
||||
|
||||
if (ORTE_SUCCESS != orte_filem_base_open()) {
|
||||
goto error;
|
||||
}
|
||||
|
@ -209,6 +209,9 @@ int main(int argc, char *argv[])
|
||||
opal_pointer_array_add(&mca_types, "plm");
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
opal_pointer_array_add(&mca_types, "snapc");
|
||||
#endif
|
||||
#if ORTE_ENABLE_SENSORS
|
||||
opal_pointer_array_add(&mca_types, "sensor");
|
||||
#endif
|
||||
opal_pointer_array_add(&mca_types, "filem");
|
||||
#endif
|
||||
|
@ -81,7 +81,7 @@
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/mca/rml/base/rml_contact.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
#include "orte/mca/grpcomm/grpcomm.h"
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
@ -1139,8 +1139,8 @@ static void abort_exit_callback(int fd, short ign, void *arg)
|
||||
* This keeps the errmgr from trying to recover from the shutdown
|
||||
* procedure.
|
||||
*/
|
||||
orte_errmgr_base_enable_recovery = false;
|
||||
orte_errmgr_base_shutting_down = true;
|
||||
orte_errmgr_base.enable_recovery = false;
|
||||
orte_errmgr_base.shutting_down = true;
|
||||
|
||||
/* terminate the orteds - they will automatically kill
|
||||
* their local procs
|
||||
|
@ -141,45 +141,45 @@ const char *orte_job_state_to_str(orte_job_state_t state)
|
||||
{
|
||||
switch(state) {
|
||||
case ORTE_JOB_STATE_UNDEF:
|
||||
return strdup("UNDEFINED");
|
||||
return "UNDEFINED";
|
||||
case ORTE_JOB_STATE_INIT:
|
||||
return strdup("INITIALIZED");
|
||||
return "INITIALIZED";
|
||||
case ORTE_JOB_STATE_RESTART:
|
||||
return strdup("RESTARTING");
|
||||
return "RESTARTING";
|
||||
case ORTE_JOB_STATE_LAUNCHED:
|
||||
return strdup("LAUNCHED");
|
||||
return "LAUNCHED";
|
||||
case ORTE_JOB_STATE_RUNNING:
|
||||
return strdup("RUNNING");
|
||||
return "RUNNING";
|
||||
case ORTE_JOB_STATE_SUSPENDED:
|
||||
return strdup("SUSPENDED");
|
||||
return "SUSPENDED";
|
||||
case ORTE_JOB_STATE_REGISTERED:
|
||||
return strdup("SYNC REGISTERED");
|
||||
return "SYNC REGISTERED";
|
||||
case ORTE_JOB_STATE_UNTERMINATED:
|
||||
return strdup("UNTERMINATED");
|
||||
return "UNTERMINATED";
|
||||
case ORTE_JOB_STATE_TERMINATED:
|
||||
return strdup("NORMALLY TERMINATED");
|
||||
return "NORMALLY TERMINATED";
|
||||
case ORTE_JOB_STATE_ABORTED:
|
||||
return strdup("ABORTED");
|
||||
return "ABORTED";
|
||||
case ORTE_JOB_STATE_FAILED_TO_START:
|
||||
return strdup("FAILED TO START");
|
||||
return "FAILED TO START";
|
||||
case ORTE_JOB_STATE_ABORTED_BY_SIG:
|
||||
return strdup("ABORTED BY SIGNAL");
|
||||
return "ABORTED BY SIGNAL";
|
||||
case ORTE_JOB_STATE_ABORTED_WO_SYNC:
|
||||
return strdup("TERMINATED WITHOUT SYNC");
|
||||
return "TERMINATED WITHOUT SYNC";
|
||||
case ORTE_JOB_STATE_KILLED_BY_CMD:
|
||||
return strdup("KILLED BY INTERNAL COMMAND");
|
||||
return "KILLED BY INTERNAL COMMAND";
|
||||
case ORTE_JOB_STATE_COMM_FAILED:
|
||||
return strdup("COMMUNICATION FAILURE");
|
||||
return "COMMUNICATION FAILURE";
|
||||
case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED:
|
||||
return strdup("SENSOR BOUND EXCEEDED");
|
||||
return "SENSOR BOUND EXCEEDED";
|
||||
break;
|
||||
|
||||
case ORTE_JOB_STATE_NEVER_LAUNCHED:
|
||||
return strdup("NEVER LAUNCHED");
|
||||
return "NEVER LAUNCHED";
|
||||
case ORTE_JOB_STATE_ABORT_ORDERED:
|
||||
return strdup("ABORT IN PROGRESS");
|
||||
return "ABORT IN PROGRESS";
|
||||
default:
|
||||
return strdup("UNKNOWN STATE!");
|
||||
return "UNKNOWN STATE!";
|
||||
}
|
||||
}
|
||||
|
||||
@ -187,39 +187,39 @@ const char *orte_proc_state_to_str(orte_proc_state_t state)
|
||||
{
|
||||
switch(state) {
|
||||
case ORTE_PROC_STATE_UNDEF:
|
||||
return strdup("UNDEFINED");
|
||||
return "UNDEFINED";
|
||||
case ORTE_PROC_STATE_INIT:
|
||||
return strdup("INITIALIZED");
|
||||
return "INITIALIZED";
|
||||
case ORTE_PROC_STATE_RESTART:
|
||||
return strdup("RESTARTING");
|
||||
return "RESTARTING";
|
||||
case ORTE_PROC_STATE_LAUNCHED:
|
||||
return strdup("LAUNCHED");
|
||||
return "LAUNCHED";
|
||||
case ORTE_PROC_STATE_RUNNING:
|
||||
return strdup("RUNNING");
|
||||
return "RUNNING";
|
||||
case ORTE_PROC_STATE_REGISTERED:
|
||||
return strdup("SYNC REGISTERED");
|
||||
return "SYNC REGISTERED";
|
||||
case ORTE_PROC_STATE_UNTERMINATED:
|
||||
return strdup("UNTERMINATED");
|
||||
return "UNTERMINATED";
|
||||
case ORTE_PROC_STATE_TERMINATED:
|
||||
return strdup("NORMALLY TERMINATED");
|
||||
return "NORMALLY TERMINATED";
|
||||
case ORTE_PROC_STATE_ABORTED:
|
||||
return strdup("ABORTED");
|
||||
return "ABORTED";
|
||||
case ORTE_PROC_STATE_FAILED_TO_START:
|
||||
return strdup("FAILED TO START");
|
||||
return "FAILED TO START";
|
||||
case ORTE_PROC_STATE_ABORTED_BY_SIG:
|
||||
return strdup("ABORTED BY SIGNAL");
|
||||
return "ABORTED BY SIGNAL";
|
||||
case ORTE_PROC_STATE_TERM_WO_SYNC:
|
||||
return strdup("TERMINATED WITHOUT SYNC");
|
||||
return "TERMINATED WITHOUT SYNC";
|
||||
case ORTE_PROC_STATE_KILLED_BY_CMD:
|
||||
return strdup("KILLED BY INTERNAL COMMAND");
|
||||
return "KILLED BY INTERNAL COMMAND";
|
||||
case ORTE_PROC_STATE_COMM_FAILED:
|
||||
return strdup("COMMUNICATION FAILURE");
|
||||
return "COMMUNICATION FAILURE";
|
||||
case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED:
|
||||
return strdup("SENSOR BOUND EXCEEDED");
|
||||
return "SENSOR BOUND EXCEEDED";
|
||||
break;
|
||||
|
||||
default:
|
||||
return strdup("UNKNOWN STATE!");
|
||||
return "UNKNOWN STATE!";
|
||||
}
|
||||
}
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user