1
1

Add a sensor framework to ORTE that monitors applications and notifies the errmgr when they exceed specified boundaries. Two modules are included here:

1. file activity - can monitor file size, access and modification times. If these fail to change over a specified number of sampling iterations (rate is an mca param), then the errmgr is notified.

2. memory usage - checks amount of memory used by a process. Limit and sampling rate can be set.

This support must be enabled by configuring with --enable-sensors.

ompi_info and orte-info have been updated to include the new framework.

Also includes some initial steps toward restoring the recovery capability. Most notably, the ODLS API has been extended to include a "restart_proc" entry for restarting a local process, and the various ERRMGR framework globals have been organized into a single struct, as we do in the other ORTE frameworks. This commit also fixes an oversight in the ERRMGR framework where a pointer array was constructed but not initialized.

Implementation continues.

This commit was SVN r23043.
Этот коммит содержится в:
Ralph Castain 2010-04-26 22:15:57 +00:00
родитель 2fe1bc043d
Коммит b9893aacc5
55 изменённых файлов: 2242 добавлений и 250 удалений

Просмотреть файл

@ -115,6 +115,10 @@
#include "orte/mca/snapc/snapc.h" #include "orte/mca/snapc/snapc.h"
#include "orte/mca/snapc/base/base.h" #include "orte/mca/snapc/base/base.h"
#endif #endif
#if ORTE_ENABLE_SENSORS
#include "orte/mca/sensor/sensor.h"
#include "orte/mca/sensor/base/base.h"
#endif
#include "orte/mca/filem/filem.h" #include "orte/mca/filem/filem.h"
#include "orte/mca/filem/base/base.h" #include "orte/mca/filem/base/base.h"
#endif #endif
@ -465,6 +469,16 @@ void ompi_info_open_components(void)
opal_pointer_array_add(&component_map, map); opal_pointer_array_add(&component_map, map);
#endif #endif
#if ORTE_ENABLE_SENSORS
if (ORTE_SUCCESS != orte_sensor_base_open()) {
goto error;
}
map = OBJ_NEW(orte_info_component_map_t);
map->type = strdup("sensor");
map->components = &mca_sensor_base_components_available;
opal_pointer_array_add(&component_map, map);
#endif
if (ORTE_SUCCESS != orte_filem_base_open()) { if (ORTE_SUCCESS != orte_filem_base_open()) {
goto error; goto error;
} }

Просмотреть файл

@ -229,6 +229,9 @@ int main(int argc, char *argv[])
opal_pointer_array_add(&mca_types, "plm"); opal_pointer_array_add(&mca_types, "plm");
#if OPAL_ENABLE_FT_CR == 1 #if OPAL_ENABLE_FT_CR == 1
opal_pointer_array_add(&mca_types, "snapc"); opal_pointer_array_add(&mca_types, "snapc");
#endif
#if ORTE_ENABLE_SENSORS
opal_pointer_array_add(&mca_types, "sensor");
#endif #endif
opal_pointer_array_add(&mca_types, "filem"); opal_pointer_array_add(&mca_types, "filem");
#endif #endif

Просмотреть файл

@ -93,4 +93,22 @@ else
orte_want_multicast=0 orte_want_multicast=0
fi fi
#
# Do we want sensors enabled?
AC_MSG_CHECKING([if want sensors])
AC_ARG_ENABLE([sensors],
[AC_HELP_STRING([--enable-sensors],
[Enable internal sensors (default: disabled)])])
if test "$enable_sensors" = "yes"; then
AC_MSG_RESULT([yes])
orte_want_sensors=1
else
AC_MSG_RESULT([no])
orte_want_sensors=0
fi
AC_DEFINE_UNQUOTED([ORTE_ENABLE_SENSORS],
[$orte_want_sensors],
[Whether we want sensors enabled])
])dnl ])dnl

Просмотреть файл

@ -55,12 +55,6 @@ ORTE_DECLSPEC int orte_errmgr_base_close(void);
* Output and component variables * Output and component variables
*/ */
ORTE_DECLSPEC extern opal_list_t orte_errmgr_base_components_available; ORTE_DECLSPEC extern opal_list_t orte_errmgr_base_components_available;
ORTE_DECLSPEC extern int orte_errmgr_base_output;
ORTE_DECLSPEC extern bool orte_errmgr_base_shutting_down;
ORTE_DECLSPEC extern bool orte_errmgr_base_enable_recovery;
extern opal_pointer_array_t orte_errmgr_base_modules;
extern bool orte_errmgr_initialized;
/* /*
* Additional External API function declared in errmgr.h * Additional External API function declared in errmgr.h

Просмотреть файл

@ -38,8 +38,8 @@ int orte_errmgr_base_close(void)
OPAL_TRACE(5); OPAL_TRACE(5);
/* Close all selected components */ /* Close all selected components */
for(i = 0; i < orte_errmgr_base_modules.size; ++i) { for(i = 0; i < orte_errmgr_base.modules.size; ++i) {
module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i); module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base.modules, i);
if( NULL == module ) { if( NULL == module ) {
continue; continue;
} }
@ -50,13 +50,13 @@ int orte_errmgr_base_close(void)
/* Close all remaining available components (may be one if this is a /* Close all remaining available components (may be one if this is a
OMPI RTE program, or [possibly] multiple if this is ompi_info) */ OMPI RTE program, or [possibly] multiple if this is ompi_info) */
mca_base_components_close(orte_errmgr_base_output, mca_base_components_close(orte_errmgr_base.output,
&orte_errmgr_base_components_available, &orte_errmgr_base_components_available,
NULL); NULL);
OBJ_DESTRUCT(&orte_errmgr_base_modules); OBJ_DESTRUCT(&orte_errmgr_base.modules);
orte_errmgr_initialized = false; orte_errmgr_base.initialized = false;
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }

Просмотреть файл

@ -73,8 +73,8 @@ int orte_errmgr_base_update_state(orte_jobid_t job,
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
if( !orte_errmgr_base_shutting_down ) { if( !orte_errmgr_base.shutting_down ) {
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output, OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
"errmgr:base:update_state() %s) " "errmgr:base:update_state() %s) "
"------- %s state updated for process %s", "------- %s state updated for process %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -88,8 +88,8 @@ int orte_errmgr_base_update_state(orte_jobid_t job,
/******************************** /********************************
* Call the active modules * Call the active modules
********************************/ ********************************/
for (i = 0; i < orte_errmgr_base_modules.size; ++i) { for (i = 0; i < orte_errmgr_base.modules.size; ++i) {
module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i); module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base.modules, i);
if( NULL == module ) { if( NULL == module ) {
continue; continue;
} }
@ -143,14 +143,14 @@ int orte_errmgr_base_predicted_fault(char ***proc_list,
int i, rc; int i, rc;
orte_errmgr_stack_state_t stack_state = ORTE_ERRMGR_STACK_STATE_NONE; orte_errmgr_stack_state_t stack_state = ORTE_ERRMGR_STACK_STATE_NONE;
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output, OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
"errmgr:base:predicted_fault() %s) " "errmgr:base:predicted_fault() %s) "
"------- Notifying components... (%3d active components)", "------- Notifying components... (%3d active components)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_errmgr_base_modules.size)); orte_errmgr_base.modules.size));
for(i = 0; i < orte_errmgr_base_modules.size; ++i) { for(i = 0; i < orte_errmgr_base.modules.size; ++i) {
module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i); module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base.modules, i);
if( NULL == module ) { if( NULL == module ) {
continue; continue;
} }
@ -176,22 +176,22 @@ int orte_errmgr_base_suggest_map_targets(orte_proc_t *proc,
/* /*
* If the user did not ask for recovery, then do not process recovery events * If the user did not ask for recovery, then do not process recovery events
*/ */
if( !orte_errmgr_base_enable_recovery ) { if( !orte_errmgr_base.enable_recovery ) {
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output, OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
"errmgr:base:suggest_map_targets() %s) " "errmgr:base:suggest_map_targets() %s) "
"------- Recovery currently disabled! Skipping...", "------- Recovery currently disabled! Skipping...",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) )); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ));
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output, OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
"errmgr:base:suggest_map_targets() %s) " "errmgr:base:suggest_map_targets() %s) "
"------- Notifying components... (%3d active components)", "------- Notifying components... (%3d active components)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_errmgr_base_modules.size)); orte_errmgr_base.modules.size));
for(i = 0; i < orte_errmgr_base_modules.size; ++i) { for(i = 0; i < orte_errmgr_base.modules.size; ++i) {
module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i); module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base.modules, i);
if( NULL == module ) { if( NULL == module ) {
continue; continue;
} }
@ -211,14 +211,14 @@ int orte_errmgr_base_ft_event(int state)
orte_errmgr_base_module_t *module = NULL; orte_errmgr_base_module_t *module = NULL;
int i; int i;
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output, OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
"errmgr:base:ft_event() %s) " "errmgr:base:ft_event() %s) "
"------- Notifying components... (%3d active components)", "------- Notifying components... (%3d active components)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_errmgr_base_modules.size)); orte_errmgr_base.modules.size));
for(i = 0; i < orte_errmgr_base_modules.size; ++i) { for(i = 0; i < orte_errmgr_base.modules.size; ++i) {
module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i); module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base.modules, i);
if( NULL == module ) { if( NULL == module ) {
continue; continue;
} }
@ -229,60 +229,3 @@ int orte_errmgr_base_ft_event(int state)
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
/*
 * Record a process state change in the job's runtime data.
 *
 * jdata       - job whose procs array is searched for the affected process
 * proc        - name of the affected process; when NULL no per-process
 *               update is performed
 * state       - new state stored on the matching process entry
 * stack_state - in/out flags; ORTE_ERRMGR_STACK_STATE_UPDATED is set on the
 *               first call, and any later call that sees it set returns
 *               immediately
 *
 * Note that the UPDATED flag is set before the shutdown and NULL-proc checks,
 * so it is raised even when those checks cause an early return.
 */
void orte_errmgr_base_update_runtime(orte_job_t *jdata,
orte_process_name_t *proc,
orte_proc_state_t state,
orte_errmgr_stack_state_t *stack_state)
{
orte_proc_t *loc_proc;
int32_t i;
/* has this already been done */
if (ORTE_ERRMGR_STACK_STATE_UPDATED & *stack_state) {
return;
}
*stack_state |= ORTE_ERRMGR_STACK_STATE_UPDATED;
/*
* orterun is trying to shutdown, so just let it
*/
if (orte_errmgr_base_shutting_down) {
return;
}
/*
* orte_errmgr_base_incomplete_start() will pass a NULL since all processes
* are affected by this fault.
* JJH: Since we do not handle the recovery from such errors yet, just
* skip processing, and go to the abort sequence.
*/
if (NULL == proc) {
return;
}
/*
* Remove the route to this process
*/
orte_routed.delete_route(proc);
/*
* Set the process state in the job data structure
*/
loc_proc = NULL;
for (i = 0; i < jdata->procs->size; ++i) {
if (NULL == (loc_proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
continue;
}
/* entries are matched on vpid only; the jobid is not compared here */
if (loc_proc->name.vpid != proc->vpid) {
continue;
}
loc_proc->state = state;
/* any state above UNTERMINATED counts toward the job's terminated total */
if (ORTE_PROC_STATE_UNTERMINATED < state) {
jdata->num_terminated++;
}
break;
}
}

Просмотреть файл

@ -53,6 +53,8 @@ bool orte_errmgr_base_shutting_down = false;
bool orte_errmgr_initialized = false; bool orte_errmgr_initialized = false;
opal_list_t orte_errmgr_base_components_available; opal_list_t orte_errmgr_base_components_available;
orte_errmgr_base_t orte_errmgr_base;
/* Public module provides a wrapper around previous functions */ /* Public module provides a wrapper around previous functions */
orte_errmgr_API_t orte_errmgr = { orte_errmgr_API_t orte_errmgr = {
orte_errmgr_base_log, orte_errmgr_base_log,
@ -73,13 +75,14 @@ int orte_errmgr_base_open(void)
OPAL_TRACE(5); OPAL_TRACE(5);
/* Only pass this way once */ /* Only pass this way once */
if( orte_errmgr_initialized ) { if( orte_errmgr_base.initialized ) {
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
OBJ_CONSTRUCT(&orte_errmgr_base_modules, opal_pointer_array_t); OBJ_CONSTRUCT(&orte_errmgr_base.modules, opal_pointer_array_t);
opal_pointer_array_init(&orte_errmgr_base.modules, 3, INT_MAX, 1);
orte_errmgr_base_output = opal_output_open(NULL); orte_errmgr_base.output = opal_output_open(NULL);
mca_base_param_reg_int_name("errmgr", mca_base_param_reg_int_name("errmgr",
"base_enable_recovery", "base_enable_recovery",
@ -87,27 +90,53 @@ int orte_errmgr_base_open(void)
" [Default = disabled]", " [Default = disabled]",
false, false, false, false,
0, &value); 0, &value);
orte_errmgr_base_enable_recovery = OPAL_INT_TO_BOOL(value); orte_errmgr_base.enable_recovery = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int_name("errmgr",
"max_global_restarts",
"Max number of times to relocate a failed process to a new node",
false, false,
-1, &orte_errmgr_base.max_global_restarts);
mca_base_param_reg_int_name("errmgr",
"max_local_restarts",
"Max number of times to locally restart a failed process before relocating it to a new node",
false, false,
-1, &orte_errmgr_base.max_local_restarts);
if (orte_errmgr_base.enable_recovery) {
if (orte_errmgr_base.max_global_restarts < 0 ) {
orte_errmgr_base.max_global_restarts = 3;
}
if (orte_errmgr_base.max_local_restarts < 0) {
orte_errmgr_base.max_local_restarts = 3;
}
} else {
if (orte_errmgr_base.max_local_restarts > 0 ||
orte_errmgr_base.max_global_restarts > 0) {
orte_errmgr_base.enable_recovery = true;
}
}
/* /*
* A flag to indicate that orterun is shutting down, so skip the recovery * A flag to indicate that orterun is shutting down, so skip the recovery
* logic. * logic.
*/ */
orte_errmgr_base_shutting_down = false; orte_errmgr_base.shutting_down = false;
/* /*
* Open up all available components * Open up all available components
*/ */
if (ORTE_SUCCESS != if (ORTE_SUCCESS !=
mca_base_components_open("errmgr", mca_base_components_open("errmgr",
orte_errmgr_base_output, orte_errmgr_base.output,
mca_errmgr_base_static_components, mca_errmgr_base_static_components,
&orte_errmgr_base_components_available, &orte_errmgr_base_components_available,
true)) { true)) {
return ORTE_ERROR; return ORTE_ERROR;
} }
orte_errmgr_initialized = true; orte_errmgr_base.initialized = true;
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }

Просмотреть файл

@ -34,11 +34,6 @@
#include "orte/mca/errmgr/base/errmgr_private.h" #include "orte/mca/errmgr/base/errmgr_private.h"
/*
* List of composite modules, ordered by priority
*/
opal_pointer_array_t orte_errmgr_base_modules;
struct orte_errmgr_base_select_module_t { struct orte_errmgr_base_select_module_t {
mca_base_component_t *component; mca_base_component_t *component;
mca_base_module_t *module; mca_base_module_t *module;
@ -60,8 +55,9 @@ int orte_errmgr_base_select(void)
bool none_found; bool none_found;
OBJ_CONSTRUCT(&tmp_array, opal_pointer_array_t); OBJ_CONSTRUCT(&tmp_array, opal_pointer_array_t);
opal_pointer_array_init(&tmp_array, 3, INT_MAX, 1);
opal_output_verbose(10, orte_errmgr_base_output, opal_output_verbose(10, orte_errmgr_base.output,
"errmgr:base:select: Auto-selecting components"); "errmgr:base:select: Auto-selecting components");
/* /*
@ -79,7 +75,7 @@ int orte_errmgr_base_select(void)
* If there is a query function then use it. * If there is a query function then use it.
*/ */
if (NULL == component->mca_query_component) { if (NULL == component->mca_query_component) {
opal_output_verbose(5, orte_errmgr_base_output, opal_output_verbose(5, orte_errmgr_base.output,
"errmgr:base:select Skipping component [%s]. It does not implement a query function", "errmgr:base:select Skipping component [%s]. It does not implement a query function",
component->mca_component_name ); component->mca_component_name );
continue; continue;
@ -88,7 +84,7 @@ int orte_errmgr_base_select(void)
/* /*
* Query this component for the module and priority * Query this component for the module and priority
*/ */
opal_output_verbose(5, orte_errmgr_base_output, opal_output_verbose(5, orte_errmgr_base.output,
"errmgr:base:select Querying component [%s]", "errmgr:base:select Querying component [%s]",
component->mca_component_name); component->mca_component_name);
@ -98,7 +94,7 @@ int orte_errmgr_base_select(void)
* If no module was returned or negative priority, then skip component * If no module was returned or negative priority, then skip component
*/ */
if (NULL == module || priority < 0) { if (NULL == module || priority < 0) {
opal_output_verbose(5, orte_errmgr_base_output, opal_output_verbose(5, orte_errmgr_base.output,
"errmgr:base:select Skipping component [%s]. Query failed to return a module", "errmgr:base:select Skipping component [%s]. Query failed to return a module",
component->mca_component_name ); component->mca_component_name );
continue; continue;
@ -107,7 +103,7 @@ int orte_errmgr_base_select(void)
/* /*
* Append them to the temporary list, we will sort later * Append them to the temporary list, we will sort later
*/ */
opal_output_verbose(5, orte_errmgr_base_output, opal_output_verbose(5, orte_errmgr_base.output,
"errmgr:base:select Query of component [%s] set priority to %d", "errmgr:base:select Query of component [%s] set priority to %d",
component->mca_component_name, priority); component->mca_component_name, priority);
tmp_module = (orte_errmgr_base_select_module_t *)malloc(sizeof(orte_errmgr_base_select_module_t)); tmp_module = (orte_errmgr_base_select_module_t *)malloc(sizeof(orte_errmgr_base_select_module_t));
@ -156,10 +152,10 @@ int orte_errmgr_base_select(void)
tmp_module = tmp_module_sw; tmp_module = tmp_module_sw;
opal_pointer_array_set_item(&tmp_array, j, NULL); opal_pointer_array_set_item(&tmp_array, j, NULL);
} }
opal_output_verbose(5, orte_errmgr_base_output, opal_output_verbose(5, orte_errmgr_base.output,
"errmgr:base:select Add module with priority [%s] %d", "errmgr:base:select Add module with priority [%s] %d",
tmp_module->component->mca_component_name, tmp_module->priority); tmp_module->component->mca_component_name, tmp_module->priority);
opal_pointer_array_add(&orte_errmgr_base_modules, (void*)(tmp_module->module)); opal_pointer_array_add(&orte_errmgr_base.modules, (void*)(tmp_module->module));
free(tmp_module); free(tmp_module);
} }
OBJ_DESTRUCT(&tmp_array); OBJ_DESTRUCT(&tmp_array);
@ -167,8 +163,8 @@ int orte_errmgr_base_select(void)
/* /*
* Initialize each of the Errmgr Modules * Initialize each of the Errmgr Modules
*/ */
for(i = 0; i < orte_errmgr_base_modules.size; ++i) { for(i = 0; i < orte_errmgr_base.modules.size; ++i) {
i_module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i); i_module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base.modules, i);
if( NULL == i_module ) { if( NULL == i_module ) {
continue; continue;
} }

Просмотреть файл

@ -39,6 +39,19 @@
*/ */
BEGIN_C_DECLS BEGIN_C_DECLS
/*
 * Framework-global values for the ERRMGR framework, gathered into a single
 * struct (accessed through the orte_errmgr_base instance).
 */
typedef struct {
/* output stream id assigned from opal_output_open() in orte_errmgr_base_open() */
int output;
/* flag indicating that orterun is shutting down, so recovery logic is skipped */
bool shutting_down;
/* set from the "base_enable_recovery" MCA param; also forced to true when
 * either restart limit below is given a positive value */
bool enable_recovery;
/* selected modules, added by orte_errmgr_base_select() after sorting by priority */
opal_pointer_array_t modules;
/* guards orte_errmgr_base_open() so it only runs once; cleared on close */
bool initialized;
/* max number of times to relocate a failed process to a new node
 * (registered default -1; set to 3 when recovery is enabled and left unset) */
int max_global_restarts;
/* max number of times to locally restart a failed process before relocating
 * it to a new node (registered default -1; set to 3 when recovery is enabled
 * and left unset) */
int max_local_restarts;
} orte_errmgr_base_t;
ORTE_DECLSPEC extern orte_errmgr_base_t orte_errmgr_base;
/* Define the ERRMGR command flag */ /* Define the ERRMGR command flag */
typedef uint8_t orte_errmgr_cmd_flag_t; typedef uint8_t orte_errmgr_cmd_flag_t;
#define ORTE_ERRMGR_CMD OPAL_UINT8 #define ORTE_ERRMGR_CMD OPAL_UINT8
@ -70,11 +83,6 @@ ORTE_DECLSPEC int orte_errmgr_base_suggest_map_targets(orte_proc_t *proc,
opal_list_t *node_list); opal_list_t *node_list);
ORTE_DECLSPEC int orte_errmgr_base_ft_event(int state); ORTE_DECLSPEC int orte_errmgr_base_ft_event(int state);
ORTE_DECLSPEC void orte_errmgr_base_update_runtime(orte_job_t *jdata,
orte_process_name_t *proc,
orte_proc_state_t state,
orte_errmgr_stack_state_t *stack_state);
/* /*
* Additional External API function declared in errmgr.h * Additional External API function declared in errmgr.h
*/ */

Просмотреть файл

@ -30,11 +30,13 @@
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_locks.h" #include "orte/runtime/orte_locks.h"
#include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml.h"
#include "orte/mca/odls/odls_types.h" #include "orte/mca/odls/odls.h"
#include "orte/mca/odls/base/base.h" #include "orte/mca/odls/base/base.h"
#include "orte/mca/plm/base/base.h" #include "orte/mca/plm/base/base.h"
#include "orte/mca/rmaps/rmaps_types.h" #include "orte/mca/rmaps/rmaps_types.h"
#if ORTE_ENABLE_SENSORS
#include "orte/mca/sensor/sensor.h"
#endif
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h" #include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h" #include "orte/mca/errmgr/base/errmgr_private.h"
@ -48,6 +50,7 @@ static void update_proc(orte_job_t *jdata, orte_process_name_t *proc,
orte_proc_state_t state, orte_proc_state_t state,
orte_exit_code_t exit_code); orte_exit_code_t exit_code);
static void check_job_complete(orte_job_t *jdata); static void check_job_complete(orte_job_t *jdata);
static void killprocs(orte_jobid_t job, orte_vpid_t vpid);
/* /*
* Module functions: Global * Module functions: Global
@ -114,10 +117,19 @@ static int update_state(orte_jobid_t job,
/* indicate that this is the end of the line */ /* indicate that this is the end of the line */
*stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE; *stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE;
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:hnp: job %s reported state %s"
" for proc %s state %s exit_code %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(job),
orte_job_state_to_str(jobstate),
(NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc),
orte_proc_state_to_str(state), exit_code));
/* /*
* if orterun is trying to shutdown, just let it * if orterun is trying to shutdown, just let it
*/ */
if (orte_errmgr_base_shutting_down) { if (orte_errmgr_base.shutting_down) {
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
@ -144,7 +156,7 @@ static int update_state(orte_jobid_t job,
/* update the state */ /* update the state */
jdata->state = jobstate; jdata->state = jobstate;
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output, OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:hnp: job %s reported state %s", "%s errmgr:hnp: job %s reported state %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid), ORTE_JOBID_PRINT(jdata->jobid),
@ -206,6 +218,21 @@ static int update_state(orte_jobid_t job,
hnp_abort(jdata->jobid, exit_code); hnp_abort(jdata->jobid, exit_code);
} }
break; break;
case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED:
/* update all procs in job */
update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
/* order all local procs for this job to be killed */
killprocs(jdata->jobid, ORTE_VPID_WILDCARD);
check_job_complete(jdata); /* set the local proc states */
/* the job object for this job will have been NULL'd
* in the array if the job was solely local. If it isn't
* NULL, then we need to tell everyone else to die
*/
if (NULL != (jdata = orte_get_job_data_object(job))) {
hnp_abort(jdata->jobid, exit_code);
}
break;
default: default:
break; break;
} }
@ -258,6 +285,19 @@ static int update_state(orte_jobid_t job,
check_job_complete(jdata); check_job_complete(jdata);
break; break;
case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED:
update_proc(jdata, proc, state, exit_code);
killprocs(proc->jobid, proc->vpid);
check_job_complete(jdata); /* need to set the job state */
/* the job object for this job will have been NULL'd
* in the array if the job was solely local. If it isn't
* NULL, then we need to tell everyone else to die
*/
if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) {
hnp_abort(jdata->jobid, exit_code);
}
break;
default: default:
break; break;
} }
@ -295,14 +335,14 @@ static void hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code)
/* if we are already in progress, then ignore this call */ /* if we are already in progress, then ignore this call */
if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */ if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output, OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:hnp: abort in progress, ignoring abort on job %s with status %d", "%s errmgr:hnp: abort in progress, ignoring abort on job %s with status %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(job), exit_code)); ORTE_JOBID_PRINT(job), exit_code));
return; return;
} }
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output, OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:hnp: abort called on job %s with status %d", "%s errmgr:hnp: abort called on job %s with status %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(job), exit_code)); ORTE_JOBID_PRINT(job), exit_code));
@ -367,7 +407,7 @@ static void failed_start(orte_job_t *jdata, orte_exit_code_t exit_code)
} }
} }
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output, OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:hnp: job %s reported incomplete start", "%s errmgr:hnp: job %s reported incomplete start",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid))); ORTE_JOBID_PRINT(jdata->jobid)));
@ -528,7 +568,7 @@ static void check_job_complete(orte_job_t *jdata)
* Determine how the process state affects the job state * Determine how the process state affects the job state
*/ */
if (ORTE_PROC_STATE_FAILED_TO_START == proc->state) { if (ORTE_PROC_STATE_FAILED_TO_START == proc->state) {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output, OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr_hnp:check_job_completed proc %s failed to start", "%s errmgr_hnp:check_job_completed proc %s failed to start",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name))); ORTE_NAME_PRINT(&proc->name)));
@ -542,7 +582,7 @@ static void check_job_complete(orte_job_t *jdata)
ORTE_UPDATE_EXIT_STATUS(proc->exit_code); ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
} }
} else if (ORTE_PROC_STATE_ABORTED == proc->state) { } else if (ORTE_PROC_STATE_ABORTED == proc->state) {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output, OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:hnp:check_job_completed proc %s aborted", "%s errmgr:hnp:check_job_completed proc %s aborted",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name))); ORTE_NAME_PRINT(&proc->name)));
@ -556,7 +596,7 @@ static void check_job_complete(orte_job_t *jdata)
ORTE_UPDATE_EXIT_STATUS(proc->exit_code); ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
} }
} else if (ORTE_PROC_STATE_ABORTED_BY_SIG == proc->state) { } else if (ORTE_PROC_STATE_ABORTED_BY_SIG == proc->state) {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output, OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:hnp:check_job_completed proc %s aborted by signal", "%s errmgr:hnp:check_job_completed proc %s aborted by signal",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name))); ORTE_NAME_PRINT(&proc->name)));
@ -570,7 +610,7 @@ static void check_job_complete(orte_job_t *jdata)
ORTE_UPDATE_EXIT_STATUS(proc->exit_code); ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
} }
} else if (ORTE_PROC_STATE_TERM_WO_SYNC == proc->state) { } else if (ORTE_PROC_STATE_TERM_WO_SYNC == proc->state) {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output, OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:hnp:check_job_completed proc %s terminated without sync", "%s errmgr:hnp:check_job_completed proc %s terminated without sync",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name))); ORTE_NAME_PRINT(&proc->name)));
@ -590,7 +630,7 @@ static void check_job_complete(orte_job_t *jdata)
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
} }
} else if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) { } else if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output, OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:hnp:check_job_completed proc %s killed by cmd", "%s errmgr:hnp:check_job_completed proc %s killed by cmd",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name))); ORTE_NAME_PRINT(&proc->name)));
@ -609,7 +649,7 @@ static void check_job_complete(orte_job_t *jdata)
goto CHECK_ALIVE; goto CHECK_ALIVE;
} else if (ORTE_PROC_STATE_UNTERMINATED < proc->state && } else if (ORTE_PROC_STATE_UNTERMINATED < proc->state &&
jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP) { jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP) {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output, OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:hnp:check_job_completed proc %s terminated and continuous", "%s errmgr:hnp:check_job_completed proc %s terminated and continuous",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name))); ORTE_NAME_PRINT(&proc->name)));
@ -626,11 +666,22 @@ static void check_job_complete(orte_job_t *jdata)
} }
} }
#if ORTE_ENABLE_SENSORS
if (jdata->abort) {
/* the job aborted - turn off any sensors on this job */
orte_sensor.stop(jdata->jobid);
}
#endif
if (ORTE_JOB_STATE_UNTERMINATED > jdata->state && if (ORTE_JOB_STATE_UNTERMINATED > jdata->state &&
jdata->num_terminated >= jdata->num_procs) { jdata->num_terminated >= jdata->num_procs) {
/* this job has terminated */ /* this job has terminated */
jdata->state = ORTE_JOB_STATE_TERMINATED; jdata->state = ORTE_JOB_STATE_TERMINATED;
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output, #if ORTE_ENABLE_SENSORS
/* turn off any sensor monitors on this job */
orte_sensor.stop(jdata->jobid);
#endif
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:hnp:check_job_completed declared job %s normally terminated - checking all jobs", "%s errmgr:hnp:check_job_completed declared job %s normally terminated - checking all jobs",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid))); ORTE_JOBID_PRINT(jdata->jobid)));
@ -679,7 +730,7 @@ static void check_job_complete(orte_job_t *jdata)
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) {
continue; continue;
} }
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output, OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s releasing procs from node %s", "%s releasing procs from node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
node->name)); node->name));
@ -693,7 +744,7 @@ static void check_job_complete(orte_job_t *jdata)
} }
node->slots_inuse--; node->slots_inuse--;
node->num_procs--; node->num_procs--;
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output, OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s releasing proc %s from node %s", "%s releasing proc %s from node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name), node->name)); ORTE_NAME_PRINT(&proc->name), node->name));
@ -748,7 +799,7 @@ CHECK_ALIVE:
* just return, though, as we need to ensure we cleanout the * just return, though, as we need to ensure we cleanout the
* job data for the job that just completed * job data for the job that just completed
*/ */
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output, OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:hnp:check_job_completed job %s is not terminated (%d:%d)", "%s errmgr:hnp:check_job_completed job %s is not terminated (%d:%d)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(job->jobid), ORTE_JOBID_PRINT(job->jobid),
@ -756,7 +807,7 @@ CHECK_ALIVE:
one_still_alive = true; one_still_alive = true;
} }
else { else {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output, OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:hnp:check_job_completed job %s is terminated (%d vs %d [0x%x])", "%s errmgr:hnp:check_job_completed job %s is terminated (%d vs %d [0x%x])",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(job->jobid), ORTE_JOBID_PRINT(job->jobid),
@ -765,13 +816,13 @@ CHECK_ALIVE:
} }
/* if a job is still alive, we just return */ /* if a job is still alive, we just return */
if (one_still_alive) { if (one_still_alive) {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output, OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:hnp:check_job_completed at least one job is not terminated", "%s errmgr:hnp:check_job_completed at least one job is not terminated",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
return; return;
} }
/* if we get here, then all jobs are done, so wakeup */ /* if we get here, then all jobs are done, so wakeup */
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output, OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:hnp:check_job_completed all jobs terminated - waking up", "%s errmgr:hnp:check_job_completed all jobs terminated - waking up",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* set the exit status to 0 - this will only happen if it /* set the exit status to 0 - this will only happen if it
@ -780,3 +831,22 @@ CHECK_ALIVE:
ORTE_UPDATE_EXIT_STATUS(0); ORTE_UPDATE_EXIT_STATUS(0);
orte_trigger_event(&orte_exit); orte_trigger_event(&orte_exit);
} }
/*
 * Ask the local launch subsystem to kill the given process(es).
 *
 * A single orte_proc_t carrying the supplied jobid/vpid is placed in a
 * temporary pointer array and handed to orte_odls.kill_local_procs().
 * Failures are logged; nothing is returned to the caller.
 *
 * @param job   jobid stored in the kill request
 * @param vpid  vpid stored in the kill request
 */
static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
{
    opal_pointer_array_t cmd;
    orte_proc_t proc;
    int rc;

    OBJ_CONSTRUCT(&cmd, opal_pointer_array_t);
    OBJ_CONSTRUCT(&proc, orte_proc_t);
    proc.name.jobid = job;
    proc.name.vpid = vpid;

    /* a negative index means the request could not be stored - in that
     * case the command array would not name the target, so report the
     * problem instead of issuing the kill
     */
    if (0 > opal_pointer_array_add(&cmd, &proc)) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
    } else if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) {
        ORTE_ERROR_LOG(rc);
    }

    /* both objects live on the stack, so destruct rather than release */
    OBJ_DESTRUCT(&cmd);
    OBJ_DESTRUCT(&proc);
}

Просмотреть файл

@ -53,14 +53,7 @@ orte_errmgr_base_component_t mca_errmgr_hnp_component =
{ {
/* The component is checkpoint ready */ /* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT MCA_BASE_METADATA_PARAM_CHECKPOINT
}, }
/* Verbosity level */
0,
/* opal_output handler */
-1,
/* Default priority */
1
}; };
static int errmgr_hnp_open(void) static int errmgr_hnp_open(void)
@ -88,4 +81,3 @@ static int errmgr_hnp_component_query(mca_base_module_t **module, int *priority)
*module = NULL; *module = NULL;
return ORTE_ERROR; return ORTE_ERROR;
} }

Просмотреть файл

@ -21,6 +21,7 @@
#endif #endif
#include "opal/util/output.h" #include "opal/util/output.h"
#include "opal/dss/dss.h"
#include "orte/util/error_strings.h" #include "orte/util/error_strings.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
@ -28,14 +29,13 @@
#include "orte/util/session_dir.h" #include "orte/util/session_dir.h"
#include "orte/util/show_help.h" #include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "opal/dss/dss.h"
#include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml.h"
#include "orte/mca/odls/odls_types.h" #include "orte/mca/odls/odls.h"
#include "orte/mca/odls/base/base.h" #include "orte/mca/plm/plm_types.h"
#include "orte/mca/plm/base/base.h"
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h" #include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "errmgr_orted.h" #include "errmgr_orted.h"
@ -47,6 +47,7 @@ static bool all_children_registered(orte_jobid_t job);
static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf); static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf);
static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code); static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code);
static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobstate, orte_proc_state_t state); static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobstate, orte_proc_state_t state);
static void killprocs(orte_jobid_t job, orte_vpid_t vpid);
/* /*
@ -173,6 +174,11 @@ static int update_state(orte_jobid_t job,
/* update all local child states */ /* update all local child states */
update_local_children(jobdat, jobstate, ORTE_PROC_STATE_RUNNING); update_local_children(jobdat, jobstate, ORTE_PROC_STATE_RUNNING);
break; break;
case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED:
/* update all procs in job */
update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
/* order all local procs for this job to be killed */
killprocs(jobdat->jobid, ORTE_VPID_WILDCARD);
default: default:
break; break;
} }
@ -198,7 +204,65 @@ static int update_state(orte_jobid_t job,
} }
/*** UPDATE COMMAND FOR A SPECIFIC PROCESS ***/ /*** UPDATE COMMAND FOR A SPECIFIC PROCESS ***/
if (ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED == state) {
    /* walk the local children and record the new state on every
     * entry whose name matches the reported proc
     */
    item = opal_list_get_first(&orte_local_children);
    while (item != opal_list_get_end(&orte_local_children)) {
        child = (orte_odls_child_t*)item;
        item = opal_list_get_next(item);
        if (child->name->jobid != proc->jobid ||
            child->name->vpid != proc->vpid) {
            continue;
        }
        child->state = state;
    }
    /* order the proc to be killed */
    killprocs(proc->jobid, proc->vpid);
    /* nothing more to do here - the proc gets reported back
     * once it has terminated
     */
    return ORTE_SUCCESS;
}
if (ORTE_PROC_STATE_TERMINATED < state) { if (ORTE_PROC_STATE_TERMINATED < state) {
#if 0
if (orte_errmgr_base.enable_recovery) {
/* lookup the local jobdat for this job */
jobdat = NULL;
for (item = opal_list_get_first(&orte_local_jobdata);
item != opal_list_get_end(&orte_local_jobdata);
item = opal_list_get_next(item)) {
jobdat = (orte_odls_job_t*)item;
/* is this the specified job? */
if (jobdat->jobid == proc->jobid) {
break;
}
}
if (NULL == jobdat) {
/* race condition - may not have been formed yet */
return ORTE_SUCCESS;
}
/* find this proc in the local children */
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
if (child->name->jobid == proc->jobid &&
child->name->vpid == proc->vpid) {
/* see if this child has reached its local restart limit */
if (child->restarts == jobdat->max_local_restarts ) {
goto REPORT_ABORT;
}
/* otherwise, attempt to restart it locally */
child->restarts++;
if (ORTE_SUCCESS != (rc = orte_odls.restart_proc(child))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ABORT;
}
return ORTE_SUCCESS;
}
}
}
REPORT_ABORT:
#endif
/* if the job hasn't completed and the state is abnormally /* if the job hasn't completed and the state is abnormally
* terminated, then we need to alert the HNP right away * terminated, then we need to alert the HNP right away
*/ */
@ -234,7 +298,7 @@ static int update_state(orte_jobid_t job,
/* remove the child from our local list as it is no longer alive */ /* remove the child from our local list as it is no longer alive */
opal_list_remove_item(&orte_local_children, &child->super); opal_list_remove_item(&orte_local_children, &child->super);
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output, OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:orted reporting proc %s aborted to HNP", "%s errmgr:orted reporting proc %s aborted to HNP",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name))); ORTE_NAME_PRINT(child->name)));
@ -279,7 +343,7 @@ static int update_state(orte_jobid_t job,
* else that needs it * else that needs it
*/ */
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output, OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:orted: sending contact info to HNP", "%s errmgr:orted: sending contact info to HNP",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@ -361,7 +425,7 @@ static int update_state(orte_jobid_t job,
} }
FINAL_CLEANUP: FINAL_CLEANUP:
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output, OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:orted reporting all procs in %s terminated", "%s errmgr:orted reporting all procs in %s terminated",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jobdat->jobid))); ORTE_JOBID_PRINT(jobdat->jobid)));
@ -639,7 +703,7 @@ static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code)
} }
} }
} }
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output, OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:hnp: job %s reported incomplete start", "%s errmgr:hnp: job %s reported incomplete start",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jobdat->jobid))); ORTE_JOBID_PRINT(jobdat->jobid)));
@ -663,3 +727,21 @@ static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobs
} }
} }
} }
/*
 * Ask the local launch subsystem to kill the given process(es).
 *
 * A single orte_proc_t carrying the supplied jobid/vpid is placed in a
 * temporary pointer array and handed to orte_odls.kill_local_procs().
 * Failures are logged; nothing is returned to the caller.
 *
 * @param job   jobid stored in the kill request
 * @param vpid  vpid stored in the kill request
 */
static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
{
    opal_pointer_array_t cmd;
    orte_proc_t proc;
    int rc;

    OBJ_CONSTRUCT(&cmd, opal_pointer_array_t);
    OBJ_CONSTRUCT(&proc, orte_proc_t);
    proc.name.jobid = job;
    proc.name.vpid = vpid;

    /* a negative index means the request could not be stored - in that
     * case the command array would not name the target, so report the
     * problem instead of issuing the kill
     */
    if (0 > opal_pointer_array_add(&cmd, &proc)) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
    } else if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) {
        ORTE_ERROR_LOG(rc);
    }

    /* both objects live on the stack, so destruct rather than release */
    OBJ_DESTRUCT(&cmd);
    OBJ_DESTRUCT(&proc);
}

Просмотреть файл

@ -53,14 +53,7 @@ orte_errmgr_base_component_t mca_errmgr_orted_component =
{ {
/* The component is checkpoint ready */ /* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT MCA_BASE_METADATA_PARAM_CHECKPOINT
}, }
/* Verbosity level */
0,
/* opal_output handler */
-1,
/* Default priority */
1
}; };
static int errmgr_orted_open(void) static int errmgr_orted_open(void)

Просмотреть файл

@ -61,7 +61,9 @@
#include "orte/mca/notifier/base/base.h" #include "orte/mca/notifier/base/base.h"
#include "orte/mca/rmcast/base/base.h" #include "orte/mca/rmcast/base/base.h"
#include "orte/mca/state/base/base.h" #include "orte/mca/state/base/base.h"
#if ORTE_ENABLE_SENSORS
#include "orte/mca/sensor/base/base.h"
#endif
#include "orte/runtime/orte_cr.h" #include "orte/runtime/orte_cr.h"
#include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
@ -417,6 +419,20 @@ int orte_ess_base_orted_setup(char **hosts)
goto error; goto error;
} }
#if ORTE_ENABLE_SENSORS
    /* setup the SENSOR framework: open the components first, then
     * select the modules that are willing to run
     */
    if (ORTE_SUCCESS != (ret = orte_sensor_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_sensor_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_sensor_base_select())) {
        ORTE_ERROR_LOG(ret);
        /* label names the step that failed */
        error = "orte_sensor_select";
        goto error;
    }
#endif
return ORTE_SUCCESS; return ORTE_SUCCESS;
error: error:
@ -438,6 +454,9 @@ int orte_ess_base_orted_finalize(void)
orte_grpcomm.onesided_barrier(); orte_grpcomm.onesided_barrier();
} }
#if ORTE_ENABLE_SENSORS
orte_sensor_base_close();
#endif
orte_state_base_close(); orte_state_base_close();
orte_notifier_base_close(); orte_notifier_base_close();

Просмотреть файл

@ -57,6 +57,9 @@
#include "orte/mca/notifier/base/base.h" #include "orte/mca/notifier/base/base.h"
#include "orte/mca/rmcast/base/base.h" #include "orte/mca/rmcast/base/base.h"
#include "orte/mca/state/base/base.h" #include "orte/mca/state/base/base.h"
#if ORTE_ENABLE_SENSORS
#include "orte/mca/sensor/base/base.h"
#endif
#include "orte/mca/rmaps/base/base.h" #include "orte/mca/rmaps/base/base.h"
#if OPAL_ENABLE_FT_CR == 1 #if OPAL_ENABLE_FT_CR == 1
@ -537,6 +540,20 @@ static int rte_init(void)
goto error; goto error;
} }
#if ORTE_ENABLE_SENSORS
    /* setup the SENSOR framework: open the components first, then
     * select the modules that are willing to run
     */
    if (ORTE_SUCCESS != (ret = orte_sensor_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_sensor_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_sensor_base_select())) {
        ORTE_ERROR_LOG(ret);
        /* label names the step that failed */
        error = "orte_sensor_select";
        goto error;
    }
#endif
/* if a tool has launched us and is requesting event reports, /* if a tool has launched us and is requesting event reports,
* then set its contact info into the comm system * then set its contact info into the comm system
*/ */
@ -592,6 +609,9 @@ static int rte_finalize(void)
unlink(contact_path); unlink(contact_path);
free(contact_path); free(contact_path);
#if ORTE_ENABLE_SENSORS
orte_sensor_base_close();
#endif
orte_state_base_close(); orte_state_base_close();
orte_notifier_base_close(); orte_notifier_base_close();

Просмотреть файл

@ -367,6 +367,12 @@ pack_add_procs:
return rc; return rc;
} }
/* pack the max number of local restarts allowed for this job */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->max_local_restarts, 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the number of app_contexts for this job */ /* pack the number of app_contexts for this job */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->num_apps, 1, ORTE_APP_IDX))) { if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->num_apps, 1, ORTE_APP_IDX))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
@ -813,6 +819,12 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
goto REPORT_ERROR; goto REPORT_ERROR;
} }
/* unpack the max number of local restarts allowed for this job */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->max_local_restarts, &cnt, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
/* unpack the number of app_contexts for this job */ /* unpack the number of app_contexts for this job */
cnt=1; cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->num_apps, &cnt, ORTE_APP_IDX))) { if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->num_apps, &cnt, ORTE_APP_IDX))) {
@ -2841,3 +2853,9 @@ int orte_odls_base_get_proc_stats(opal_buffer_t *answer,
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
/*
 * Restart a local process.
 *
 * Placeholder: nothing is restarted at this point.  Both arguments are
 * ignored and success is always reported.
 */
int orte_odls_base_default_restart_proc(orte_odls_child_t *child,
                                        orte_odls_base_fork_local_proc_fn_t fork_local)
{
    /* neither argument is used yet */
    (void)child;
    (void)fork_local;

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -132,6 +132,7 @@ static void orte_odls_job_constructor(orte_odls_job_t *ptr)
ptr->num_contributors = 0; ptr->num_contributors = 0;
ptr->num_participating = -1; ptr->num_participating = -1;
ptr->num_collected = 0; ptr->num_collected = 0;
ptr->max_local_restarts = 0;
} }
static void orte_odls_job_destructor(orte_odls_job_t *ptr) static void orte_odls_job_destructor(orte_odls_job_t *ptr)
{ {

Просмотреть файл

@ -138,6 +138,9 @@ ORTE_DECLSPEC int orte_odls_base_default_require_sync(orte_process_name_t *proc,
opal_buffer_t *buffer, opal_buffer_t *buffer,
bool drop_nidmap); bool drop_nidmap);
ORTE_DECLSPEC int orte_odls_base_default_restart_proc(orte_odls_child_t *child,
orte_odls_base_fork_local_proc_fn_t fork_local);
/* /*
* Preload binary/files functions * Preload binary/files functions
*/ */

Просмотреть файл

@ -93,6 +93,7 @@
static int orte_odls_default_launch_local_procs(opal_buffer_t *data); static int orte_odls_default_launch_local_procs(opal_buffer_t *data);
static int orte_odls_default_kill_local_procs(opal_pointer_array_t *procs); static int orte_odls_default_kill_local_procs(opal_pointer_array_t *procs);
static int orte_odls_default_signal_local_procs(const orte_process_name_t *proc, int32_t signal); static int orte_odls_default_signal_local_procs(const orte_process_name_t *proc, int32_t signal);
static int orte_odls_default_restart_proc(orte_odls_child_t *child);
static void set_handler_default(int sig); static void set_handler_default(int sig);
@ -102,7 +103,8 @@ orte_odls_base_module_t orte_odls_default_module = {
orte_odls_default_kill_local_procs, orte_odls_default_kill_local_procs,
orte_odls_default_signal_local_procs, orte_odls_default_signal_local_procs,
orte_odls_base_default_deliver_message, orte_odls_base_default_deliver_message,
orte_odls_base_default_require_sync orte_odls_base_default_require_sync,
orte_odls_default_restart_proc
}; };
/* convenience macro for erroring out */ /* convenience macro for erroring out */
@ -1101,3 +1103,17 @@ static int orte_odls_default_signal_local_procs(const orte_process_name_t *proc,
} }
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
/*
 * Restart a local process by delegating to the base implementation,
 * supplying this component's fork function.  A failure is reported on
 * the odls verbose stream and its code is passed back to the caller.
 */
static int orte_odls_default_restart_proc(orte_odls_child_t *child)
{
    int rc = orte_odls_base_default_restart_proc(child, odls_default_fork_local_proc);

    if (ORTE_SUCCESS == rc) {
        return rc;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
                         "%s odls:default:restart_proc failed to launch on error %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(rc)));
    return rc;
}

Просмотреть файл

@ -82,6 +82,10 @@ typedef int (*orte_odls_base_module_deliver_message_fn_t)(orte_jobid_t job, opal
typedef int (*orte_odls_base_module_require_sync_fn_t)(orte_process_name_t *proc, typedef int (*orte_odls_base_module_require_sync_fn_t)(orte_process_name_t *proc,
opal_buffer_t *buffer, opal_buffer_t *buffer,
bool drop_nidmap); bool drop_nidmap);
/**
* Restart a local process
*/
typedef int (*orte_odls_base_module_restart_proc_fn_t)(orte_odls_child_t *child);
/** /**
* pls module version * pls module version
@ -93,6 +97,7 @@ struct orte_odls_base_module_1_3_0_t {
orte_odls_base_module_signal_local_process_fn_t signal_local_procs; orte_odls_base_module_signal_local_process_fn_t signal_local_procs;
orte_odls_base_module_deliver_message_fn_t deliver_message; orte_odls_base_module_deliver_message_fn_t deliver_message;
orte_odls_base_module_require_sync_fn_t require_sync; orte_odls_base_module_require_sync_fn_t require_sync;
orte_odls_base_module_restart_proc_fn_t restart_proc;
}; };
/** shorten orte_odls_base_module_1_3_0_t declaration */ /** shorten orte_odls_base_module_1_3_0_t declaration */

Просмотреть файл

@ -141,6 +141,7 @@ typedef struct orte_odls_job_t {
int num_participating; int num_participating;
int num_collected; int num_collected;
struct timeval launch_msg_recvd; /* when the launch msg for this job was recvd - for timing purposes only */ struct timeval launch_msg_recvd; /* when the launch msg for this job was recvd - for timing purposes only */
int32_t max_local_restarts; /* max number of times a local proc can be restarted */
} orte_odls_job_t; } orte_odls_job_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_odls_job_t); ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_odls_job_t);

Просмотреть файл

@ -220,6 +220,19 @@ static int odls_process_signal_local_proc(const orte_process_name_t *proc, int32
return rc; return rc;
} }
/*
 * Restart a local process by delegating to the base implementation,
 * supplying this component's fork function.  A failure is reported on
 * the odls verbose stream and its code is passed back to the caller.
 */
static int orte_odls_process_restart_proc(orte_odls_child_t *child)
{
    int rc = orte_odls_base_default_restart_proc(child, odls_process_fork_local_proc);

    if (ORTE_SUCCESS == rc) {
        return rc;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
                         "%s odls:process:restart_proc failed to launch on error %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(rc)));
    return rc;
}
orte_odls_base_module_t orte_odls_process_module = { orte_odls_base_module_t orte_odls_process_module = {
orte_odls_base_default_get_add_procs_data, orte_odls_base_default_get_add_procs_data,
@ -227,5 +240,6 @@ orte_odls_base_module_t orte_odls_process_module = {
odls_process_kill_local_procs, odls_process_kill_local_procs,
odls_process_signal_local_proc, odls_process_signal_local_proc,
orte_odls_base_default_deliver_message, orte_odls_base_default_deliver_message,
orte_odls_base_default_require_sync orte_odls_base_default_require_sync,
orte_odls_process_restart_proc
}; };

Просмотреть файл

@ -52,6 +52,9 @@
#include "orte/mca/filem/filem.h" #include "orte/mca/filem/filem.h"
#include "orte/mca/filem/base/base.h" #include "orte/mca/filem/base/base.h"
#include "orte/mca/rml/base/rml_contact.h" #include "orte/mca/rml/base/rml_contact.h"
#if ORTE_ENABLE_SENSORS
#include "orte/mca/sensor/sensor.h"
#endif
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/runtime/runtime.h" #include "orte/runtime/runtime.h"
#include "orte/runtime/orte_locks.h" #include "orte/runtime/orte_locks.h"
@ -354,6 +357,11 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
goto WAKEUP; goto WAKEUP;
} }
#if ORTE_ENABLE_SENSORS
/* start any sensor monitoring of this job */
orte_sensor.start(job);
#endif
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:launch completed for job %s", "%s plm:base:launch completed for job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),

35
orte/mca/sensor/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,35 @@
#
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# main library setup
noinst_LTLIBRARIES = libmca_sensor.la
libmca_sensor_la_SOURCES =
# header setup
nobase_orte_HEADERS =
# local files
headers = sensor.h \
sensor_types.h
libmca_sensor_la_SOURCES += $(headers)
# Conditionally install the header files
if WANT_INSTALL_HEADERS
nobase_orte_HEADERS += $(headers)
ortedir = $(includedir)/openmpi/orte/mca/sensor
else
ortedir = $(includedir)
endif
include base/Makefile.am
distclean-local:
rm -f base/static-components.h

26
orte/mca/sensor/base/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,26 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
headers += \
base/base.h
libmca_sensor_la_SOURCES += \
base/sensor_base_open.c
if !ORTE_DISABLE_FULL_SUPPORT
headers += \
base/sensor_private.h
libmca_sensor_la_SOURCES += \
base/sensor_base_close.c \
base/sensor_base_select.c
endif

52
orte/mca/sensor/base/base.h Обычный файл
Просмотреть файл

@ -0,0 +1,52 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef MCA_SENSOR_BASE_H
#define MCA_SENSOR_BASE_H
/*
* includes
*/
#include "orte_config.h"
#include "opal/class/opal_list.h"
#include "opal/mca/mca.h"
#include "orte/mca/sensor/sensor.h"
/*
* Global functions for MCA overall collective open and close
*/
BEGIN_C_DECLS
/*
 * Framework-level entry points
 */
/* Open the framework's output stream, set up the array of selected
 * modules and open all available sensor components. */
ORTE_DECLSPEC int orte_sensor_base_open(void);
/* Query the available components, store the modules they offer in
 * priority order and call each module's init function. */
ORTE_DECLSPEC int orte_sensor_base_select(void);
/* Call each selected module's finalize function, release the module
 * array and close the remaining components. */
ORTE_DECLSPEC int orte_sensor_base_close(void);
/*
 * globals that might be needed
 */
/* list of the sensor components that were opened */
ORTE_DECLSPEC extern opal_list_t mca_sensor_base_components_available;
#if !ORTE_DISABLE_FULL_SUPPORT
/* no base functions to protect at this time */
#endif /* ORTE_DISABLE_FULL_SUPPORT */
END_C_DECLS
#endif

46
orte/mca/sensor/base/sensor_base_close.c Обычный файл
Просмотреть файл

@ -0,0 +1,46 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <stdio.h>
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
/*
 * Shut the sensor framework down: finalize every selected module,
 * release the module array and close the remaining components.
 * Always returns ORTE_SUCCESS.
 */
int orte_sensor_base_close(void)
{
    int idx;

    /* let each module that provides a finalize function clean up */
    for (idx = 0; idx < orte_sensor_base.modules.size; ++idx) {
        orte_sensor_base_module_t *mod =
            (orte_sensor_base_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, idx);

        if (NULL != mod && NULL != mod->finalize) {
            mod->finalize();
        }
    }
    OBJ_DESTRUCT(&orte_sensor_base.modules);

    /* close whatever components are still open */
    mca_base_components_close(orte_sensor_base.output,
                              &mca_sensor_base_components_available, NULL);

    return ORTE_SUCCESS;
}

108
orte/mca/sensor/base/sensor_base_open.c Обычный файл
Просмотреть файл

@ -0,0 +1,108 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/mca.h"
#include "opal/util/output.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/class/opal_pointer_array.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
/*
* The following file was created by configure. It contains extern
* statements and the definition of an array of pointers to each
* component's public mca_base_component_t struct.
*/
#include "orte/mca/sensor/base/static-components.h"
/* base functions - these fan a request out to every selected module */
static void start(orte_jobid_t jobid);
static void stop(orte_jobid_t jobid);
/*
 * Global variables
 */
/* framework-wide state: output stream and array of selected modules */
orte_sensor_base_t orte_sensor_base;
/* public sensor API, backed by the base start/stop functions above */
orte_sensor_base_API_module_t orte_sensor = {
    start,
    stop
};
/* list of components opened by orte_sensor_base_open() */
opal_list_t mca_sensor_base_components_available;
/**
 * Open the sensor framework.
 *
 * Opens the framework's output stream, prepares the array that will
 * hold the selected modules, and opens the sensor components.
 *
 * @return ORTE_SUCCESS, or ORTE_ERROR if the components could not be
 *         opened.
 */
int orte_sensor_base_open(void)
{
    int rc;

    /* the stream is always opened; verbosity is set by the mca open
     * system
     */
    orte_sensor_base.output = opal_output_open(NULL);

    /* storage for the modules picked during selection */
    OBJ_CONSTRUCT(&orte_sensor_base.modules, opal_pointer_array_t);
    opal_pointer_array_init(&orte_sensor_base.modules, 3, INT_MAX, 1);

    /* open every available component */
    rc = mca_base_components_open("sensor", orte_sensor_base.output,
                                  mca_sensor_base_static_components,
                                  &mca_sensor_base_components_available, true);

    return (ORTE_SUCCESS == rc) ? ORTE_SUCCESS : ORTE_ERROR;
}
/*
 * Forward a "start monitoring" request for the given job to every
 * selected module that implements a start function.
 */
static void start(orte_jobid_t jobid)
{
    int idx;

    for (idx = 0; idx < orte_sensor_base.modules.size; ++idx) {
        orte_sensor_base_module_t *mod =
            (orte_sensor_base_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, idx);

        /* skip empty slots and modules without a start function */
        if (NULL != mod && NULL != mod->start) {
            mod->start(jobid);
        }
    }
}
/*
 * Forward a "stop monitoring" request for the given job to every
 * selected module that implements a stop function.
 */
static void stop(orte_jobid_t jobid)
{
    int idx;

    for (idx = 0; idx < orte_sensor_base.modules.size; ++idx) {
        orte_sensor_base_module_t *mod =
            (orte_sensor_base_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, idx);

        /* skip empty slots and modules without a stop function */
        if (NULL != mod && NULL != mod->stop) {
            mod->stop(jobid);
        }
    }
}

177
orte/mca/sensor/base/sensor_base_select.c Обычный файл
Просмотреть файл

@ -0,0 +1,177 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "orte/constants.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
/* Bookkeeping record used during selection: one per component that
 * answered the query with a usable module. */
typedef struct orte_sensor_base_select_module_t {
    mca_base_component_t *component;   /* component that was queried */
    mca_base_module_t *module;         /* module it handed back */
    int priority;                      /* priority it reported */
} orte_sensor_base_select_module_t;
/**
* Function for weeding out sensor components that don't want to run.
*
* Call the init function on all available components to find out if
* they want to run. Select all components that don't fail. Failing
* components will be closed and unloaded. The selected modules will
* be returned to the caller in a opal_list_t.
*/
int orte_sensor_base_select(void)
{
mca_base_component_list_item_t *cli = NULL;
mca_base_component_t *component = NULL;
mca_base_module_t *module = NULL;
orte_sensor_base_module_t *i_module;
opal_list_item_t *item;
int priority = 0, i, j, low_i;
int exit_status = OPAL_SUCCESS;
opal_pointer_array_t tmp_array;
bool none_found;
orte_sensor_base_select_module_t *tmp_module = NULL, *tmp_module_sw = NULL;
OBJ_CONSTRUCT(&tmp_array, opal_pointer_array_t);
opal_output_verbose(10, orte_sensor_base.output,
"sensor:base:select: Auto-selecting components");
/*
* Traverse the list of available components.
* For each call their 'query' functions to determine relative priority.
*/
none_found = true;
for (item = opal_list_get_first(&mca_sensor_base_components_available);
item != opal_list_get_end(&mca_sensor_base_components_available);
item = opal_list_get_next(item) ) {
cli = (mca_base_component_list_item_t *) item;
component = (mca_base_component_t *) cli->cli_component;
/*
* If there is a query function then use it.
*/
if (NULL == component->mca_query_component) {
opal_output_verbose(5, orte_sensor_base.output,
"sensor:base:select Skipping component [%s]. It does not implement a query function",
component->mca_component_name );
continue;
}
/*
* Query this component for the module and priority
*/
opal_output_verbose(5, orte_sensor_base.output,
"sensor:base:select Querying component [%s]",
component->mca_component_name);
component->mca_query_component(&module, &priority);
/*
* If no module was returned or negative priority, then skip component
*/
if (NULL == module || priority < 0) {
opal_output_verbose(5, orte_sensor_base.output,
"sensor:base:select Skipping component [%s]. Query failed to return a module",
component->mca_component_name );
continue;
}
/*
* Append them to the temporary list, we will sort later
*/
opal_output_verbose(5, orte_sensor_base.output,
"sensor:base:select Query of component [%s] set priority to %d",
component->mca_component_name, priority);
tmp_module = (orte_sensor_base_select_module_t *)malloc(sizeof(orte_sensor_base_select_module_t));
tmp_module->component = component;
tmp_module->module = module;
tmp_module->priority = priority;
opal_pointer_array_add(&tmp_array, (void*)tmp_module);
none_found = false;
}
if (none_found) {
/* okay for no modules to be found */
return ORTE_SUCCESS;
}
/*
* Sort the list by decending priority
*/
priority = 0;
for(j = 0; j < tmp_array.size; ++j) {
tmp_module_sw = (orte_sensor_base_select_module_t*)opal_pointer_array_get_item(&tmp_array, j);
if( NULL == tmp_module_sw ) {
continue;
}
low_i = -1;
priority = tmp_module_sw->priority;
for(i = 0; i < tmp_array.size; ++i) {
tmp_module = (orte_sensor_base_select_module_t*)opal_pointer_array_get_item(&tmp_array, i);
if( NULL == tmp_module ) {
continue;
}
if( tmp_module->priority > priority ) {
low_i = i;
priority = tmp_module->priority;
}
}
if( low_i >= 0 ) {
tmp_module = (orte_sensor_base_select_module_t*)opal_pointer_array_get_item(&tmp_array, low_i);
opal_pointer_array_set_item(&tmp_array, low_i, NULL);
j--; /* Try this entry again, if it is not the lowest */
} else {
tmp_module = tmp_module_sw;
opal_pointer_array_set_item(&tmp_array, j, NULL);
}
opal_output_verbose(5, orte_sensor_base.output,
"sensor:base:select Add module with priority [%s] %d",
tmp_module->component->mca_component_name, tmp_module->priority);
opal_pointer_array_add(&orte_sensor_base.modules, (void*)(tmp_module->module));
free(tmp_module);
}
OBJ_DESTRUCT(&tmp_array);
/*
* Initialize each of the modules
*/
for(i = 0; i < orte_sensor_base.modules.size; ++i) {
i_module = (orte_sensor_base_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, i);
if( NULL == i_module ) {
continue;
}
if( NULL != i_module->init ) {
i_module->init();
}
}
return exit_status;
}

44
orte/mca/sensor/base/sensor_private.h Обычный файл
Просмотреть файл

@ -0,0 +1,44 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef MCA_SENSOR_PRIVATE_H
#define MCA_SENSOR_PRIVATE_H
/*
* includes
*/
#include "orte_config.h"
#include "opal/dss/dss_types.h"
#include "orte/mca/sensor/sensor_types.h"
/*
* Global functions for MCA overall collective open and close
*/
BEGIN_C_DECLS
#if !ORTE_DISABLE_FULL_SUPPORT
/* define a struct to hold framework-global values */
typedef struct {
    /* opal_output stream used for the framework's verbose messages */
    int output;
    /* modules chosen by orte_sensor_base_select() */
    opal_pointer_array_t modules;
} orte_sensor_base_t;
/* the single instance of the framework globals */
ORTE_DECLSPEC extern orte_sensor_base_t orte_sensor_base;
#endif /* ORTE_DISABLE_FULL_SUPPORT */
END_C_DECLS
#endif

37
orte/mca/sensor/file/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,37 @@
#
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Help text shown by the file sensor; distributed and installed as
# package data.
dist_pkgdata_DATA = help-orte-sensor-file.txt
# Sources shared by the DSO and the static build of this component.
sources = \
sensor_file.c \
sensor_file.h \
sensor_file_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_sensor_file_DSO
component_noinst =
component_install = mca_sensor_file.la
else
component_noinst = libmca_sensor_file.la
component_install =
endif
# DSO build: installed into $(pkglibdir).
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_sensor_file_la_SOURCES = $(sources)
mca_sensor_file_la_LDFLAGS = -module -avoid-version
# Static build: convenience library that is not installed.
noinst_LTLIBRARIES = $(component_noinst)
libmca_sensor_file_la_SOURCES =$(sources)
libmca_sensor_file_la_LDFLAGS = -module -avoid-version

19
orte/mca/sensor/file/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,19 @@
# -*- shell-script -*-
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_sensor_file_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Runs action-if-found when the shell variable $orte_want_sensors is
# "1", and action-if-not-found otherwise.
AC_DEFUN([MCA_sensor_file_CONFIG], [
# if we don't want sensors, don't compile
# this component
AS_IF([test "$orte_want_sensors" = "1"],
[$1], [$2])
])dnl

14
orte/mca/sensor/file/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,14 @@
# -*- shell-script -*-
#
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
PARAM_CONFIG_FILES="Makefile"

Просмотреть файл

@ -0,0 +1,18 @@
# -*- text -*-
#
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for the file sensor
#
[file-stalled]
A specified file is not changing, indicating a possibly stalled application:
File: %s
Last size: %lu
Last access: %sLast modification: %s

354
orte/mca/sensor/file/sensor_file.c Обычный файл
Просмотреть файл

@ -0,0 +1,354 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <stdio.h>
#include <stddef.h>
#include <ctype.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_NETDB_H
#include <netdb.h>
#endif
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#include <fcntl.h>
#include <errno.h>
#include <signal.h>
#ifdef HAVE_TIME_H
#include <time.h>
#endif
#include <sys/stat.h>
#include <sys/types.h>
#include "opal_stdint.h"
#include "opal/util/output.h"
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/runtime/orte_wait.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
#include "sensor_file.h"
/* declare the API functions */
static int init(void);
static void finalize(void);
static void start(orte_jobid_t jobid);
static void stop(orte_jobid_t jobid);
/* instantiate the module - the entries are positional, in the order
 * init, finalize, start, stop
 */
orte_sensor_base_module_t orte_sensor_file_module = {
init,
finalize,
start,
stop
};
/* define a tracking object - one is queued on the "jobs" list for
 * every job whose file is being watched
 */
typedef struct {
opal_list_item_t super;
/* job this tracker belongs to; used by stop() to find the tracker */
orte_jobid_t jobid;
/* NOTE(review): never assigned or read in this file */
orte_vpid_t vpid;
/* private copy of the path to stat; freed by the destructor */
char *file;
/* sample counter: incremented when a checked attribute is unchanged,
 * reset to zero when one has changed */
int tick;
/* which attributes of the file are compared between samples */
bool check_size;
bool check_access;
bool check_mod;
/* values recorded at the last sample that saw a change.
 * NOTE(review): st_size is narrowed to 32 bits when stored here */
int32_t file_size;
time_t last_access;
time_t last_mod;
/* the stall is reported when tick equals this value */
int limit;
} file_tracker_t;
/*
 * Constructor for a file tracker: put every field the sampler reads
 * into a known state.
 *
 * The check_* flags must be cleared here because start() does not
 * assign them when the parameter lookup fails and the component
 * default is false; sample() would then read an indeterminate value.
 */
static void ft_constructor(file_tracker_t *ft)
{
    ft->file = NULL;
    ft->tick = 0;
    ft->check_size = false;
    ft->check_access = false;
    ft->check_mod = false;
    ft->file_size = 0;
    ft->last_access = 0;
    ft->last_mod = 0;
    ft->limit = 0;
}
/*
 * Destructor for a file tracker: give back the private copy of the
 * file name, if one was ever stored.
 */
static void ft_destructor(file_tracker_t *ft)
{
    /* free() accepts NULL, so no separate guard is required */
    free(ft->file);
}
/* class instance for file_tracker_t: a list item with the constructor
 * and destructor defined in this file */
OBJ_CLASS_INSTANCE(file_tracker_t,
opal_list_item_t,
ft_constructor, ft_destructor);
/* declare the local functions */
static void sample(int fd, short event, void *arg);
/* local globals */
/* timer event driving sample(); allocated in start(), NULL while no
 * sampling is active */
static opal_event_t *sample_ev = NULL;
/* interval between samples, filled from the component sample_rate */
static struct timeval sample_time;
/* list of file_tracker_t objects, one per monitored job */
static opal_list_t jobs;
/* Module init: construct the list of tracked jobs.
 * Always returns ORTE_SUCCESS. */
static int init(void)
{
OBJ_CONSTRUCT(&jobs, opal_list_t);
return ORTE_SUCCESS;
}
/*
 * Module finalize: cancel and free the sampling timer if it exists,
 * release every tracker still queued, then destruct the list.
 */
static void finalize(void)
{
    opal_list_item_t *entry;

    if (NULL != sample_ev) {
        opal_event_del(sample_ev);
        free(sample_ev);
        sample_ev = NULL;
    }

    /* drain the list front to back */
    for (entry = opal_list_remove_first(&jobs);
         NULL != entry;
         entry = opal_list_remove_first(&jobs)) {
        OBJ_RELEASE(entry);
    }

    OBJ_DESTRUCT(&jobs);
}
/*
 * Start monitoring of local processes
 *
 * Finds the local job data for jobid, resolves the monitoring
 * parameters (filename, check_size, check_access, check_mod, limit) -
 * first from the environment of the job's first app_context, then from
 * the component defaults - and queues a tracker for the sampler. The
 * sampling timer is created when the first tracker is queued.
 *
 * Returns without monitoring when the job has no local data, has no
 * app_context, or no file name is available.
 */
static void start(orte_jobid_t jobid)
{
    mca_base_component_t *c = &mca_sensor_file_component.super.base_version;
    opal_list_item_t *item;
    orte_odls_job_t *jobdat;
    orte_app_context_t *app;
    int tmp;
    char *filename;
    file_tracker_t *ft;

    OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                         "%s starting file monitoring for job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jobid)));

    /* get the local jobdat for this job - walk the whole list and
     * only record an entry whose jobid matches, so that jobdat stays
     * NULL when the job is not known locally
     */
    jobdat = NULL;
    for (item = opal_list_get_first(&orte_local_jobdata);
         item != opal_list_get_end(&orte_local_jobdata);
         item = opal_list_get_next(item)) {
        if (jobid == ((orte_odls_job_t*)item)->jobid) {
            jobdat = (orte_odls_job_t*)item;
            break;
        }
    }
    if (NULL == jobdat) {
        /* no local procs for this job */
        OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                             "%s sensor:file no local procs for job %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jobid)));
        return;
    }

    /* must be at least one app_context, so use the first */
    if (NULL == (app = jobdat->apps[0])) {
        /* got a problem */
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return;
    }

    /* search the environ to get the filename */
    filename = NULL;
    if (ORTE_SUCCESS != mca_base_param_find_string(c, "filename", app->env, &filename) ||
        NULL == filename) {
        /* was a default file given */
        if (NULL == mca_sensor_file_component.file) {
            /* can't do anything without a file */
            OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                                 "%s sensor:file no file for job %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(jobid)));
            return;
        }
        filename = mca_sensor_file_component.file;
    }

    /* create the tracking object - it keeps its own copy of the name */
    ft = OBJ_NEW(file_tracker_t);
    if (NULL == ft) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return;
    }
    ft->jobid = jobid;
    ft->file = strdup(filename);
    if (NULL == ft->file) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OBJ_RELEASE(ft);
        return;
    }

    /* search the environ to see what we are checking - every flag is
     * assigned on both paths so none is ever left indeterminate
     */
    tmp = 0;
    if (ORTE_SUCCESS != mca_base_param_find_int(c, "check_size", app->env, &tmp)) {
        /* fall back to the component default */
        ft->check_size = mca_sensor_file_component.check_size;
    } else {
        ft->check_size = OPAL_INT_TO_BOOL(tmp);
    }
    tmp = 0;
    if (ORTE_SUCCESS != mca_base_param_find_int(c, "check_access", app->env, &tmp)) {
        ft->check_access = mca_sensor_file_component.check_access;
    } else {
        ft->check_access = OPAL_INT_TO_BOOL(tmp);
    }
    tmp = 0;
    if (ORTE_SUCCESS != mca_base_param_find_int(c, "check_mod", app->env, &tmp)) {
        ft->check_mod = mca_sensor_file_component.check_mod;
    } else {
        ft->check_mod = OPAL_INT_TO_BOOL(tmp);
    }
    tmp = 0;
    if (ORTE_SUCCESS != mca_base_param_find_int(c, "limit", app->env, &tmp)) {
        ft->limit = mca_sensor_file_component.limit;
    } else {
        ft->limit = tmp;
    }
    opal_list_append(&jobs, &ft->super);

    OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                         "%s file %s monitored for %s%s%s with limit %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ft->file, ft->check_size ? "SIZE:" : " ",
                         ft->check_access ? "ACCESS TIME:" : " ",
                         ft->check_mod ? "MOD TIME" : " ", ft->limit));

    /* start sampling */
    if (NULL == sample_ev) {
        /* startup a timer to wake us up periodically
         * for a data sample
         */
        sample_ev = (opal_event_t*)malloc(sizeof(opal_event_t));
        if (NULL == sample_ev) {
            /* the tracker stays queued so stop() can still release it */
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return;
        }
        opal_evtimer_set(sample_ev, sample, sample_ev);
        sample_time.tv_sec = mca_sensor_file_component.sample_rate;
        sample_time.tv_usec = 0;
        opal_evtimer_add(sample_ev, &sample_time);
    }
    return;
}
/*
 * Stop monitoring the given job: drop the first tracker queued for it
 * and, once nothing is left to watch, tear down the sampling timer.
 */
static void stop(orte_jobid_t jobid)
{
    opal_list_item_t *cur = opal_list_get_first(&jobs);

    while (cur != opal_list_get_end(&jobs)) {
        if (jobid == ((file_tracker_t*)cur)->jobid) {
            opal_list_remove_item(&jobs, cur);
            OBJ_RELEASE(cur);
            break;
        }
        cur = opal_list_get_next(cur);
    }

    /* keep sampling while any job remains, or if no timer exists */
    if (NULL == sample_ev || !opal_list_is_empty(&jobs)) {
        return;
    }

    opal_event_del(sample_ev);
    free(sample_ev);
    sample_ev = NULL;
}
/*
 * Render a timestamp into a caller-supplied buffer and return that
 * buffer. ctime() formats into a single static area, so two results
 * cannot be passed to one printf-style call without copying the first
 * one out. The trailing newline produced by ctime() is kept because
 * the help text relies on it to separate its lines.
 */
static char *file_time_str(time_t when, char *out, size_t len)
{
    const char *str = ctime(&when);

    snprintf(out, len, "%s", (NULL != str) ? str : "unknown\n");
    return out;
}

/*
 * Timer callback: stat every tracked file, update its tick counter
 * and report a stall to the user and the errmgr when the counter
 * equals the tracker's limit. Re-arms the timer before returning.
 *
 * fd, event and arg are not used.
 */
static void sample(int fd, short event, void *arg)
{
    struct stat buf;
    opal_list_item_t *item, *next;
    file_tracker_t *ft;
    char access_str[32], mod_str[32];

    /* if we are not sampling any more, then just return */
    if (NULL == sample_ev) {
        return;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                         "%s sampling files",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    for (item = opal_list_get_first(&jobs);
         item != opal_list_get_end(&jobs);
         item = next) {
        /* fetch the successor first: if reporting a stall ends up in
         * stop() for this job, the current tracker is released.
         * NOTE(review): whether the errmgr does that synchronously is
         * not visible here
         */
        next = opal_list_get_next(item);
        ft = (file_tracker_t*)item;

        /* stat the file and get its size */
        if (0 > stat(ft->file, &buf)) {
            /* cannot stat file */
            OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                                 "%s could not stat %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ft->file));
            continue;
        }

        OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                             "%s size %lu access %s\tmod %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (unsigned long)buf.st_size,
                             file_time_str(buf.st_atime, access_str, sizeof(access_str)),
                             file_time_str(buf.st_mtime, mod_str, sizeof(mod_str))));

        /* the first enabled attribute found unchanged adds a tick and
         * ends the comparison; a changed attribute resets the counter
         * and records the new value
         */
        if (ft->check_size) {
            if (buf.st_size == ft->file_size) {
                ft->tick++;
                goto CHECK;
            } else {
                ft->tick = 0;
                ft->file_size = buf.st_size;
            }
        }
        if (ft->check_access) {
            if (buf.st_atime == ft->last_access) {
                ft->tick++;
                goto CHECK;
            } else {
                ft->tick = 0;
                ft->last_access = buf.st_atime;
            }
        }
        if (ft->check_mod) {
            if (buf.st_mtime == ft->last_mod) {
                ft->tick++;
                goto CHECK;
            } else {
                ft->tick = 0;
                ft->last_mod = buf.st_mtime;
            }
        }

    CHECK:
        OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                             "%s sampled file %s tick %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ft->file, ft->tick));

        if (ft->tick == ft->limit) {
            /* the help text prints the size with %lu, so widen the
             * stored 32-bit value explicitly; each timestamp gets its
             * own buffer
             */
            orte_show_help("help-orte-sensor-file.txt", "file-stalled", true,
                           ft->file, (unsigned long)ft->file_size,
                           file_time_str(ft->last_access, access_str, sizeof(access_str)),
                           file_time_str(ft->last_mod, mod_str, sizeof(mod_str)));
            orte_errmgr.update_state(ft->jobid, ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED,
                                     NULL, ORTE_PROC_STATE_UNDEF,
                                     ORTE_ERROR_DEFAULT_EXIT_CODE);
        }
    }

    /* restart the timer, unless it was torn down while we sampled */
    if (NULL != sample_ev) {
        opal_evtimer_add(sample_ev, &sample_time);
    }
}

41
orte/mca/sensor/file/sensor_file.h Обычный файл
Просмотреть файл

@ -0,0 +1,41 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* File movement sensor
*/
#ifndef ORTE_SENSOR_FILE_H
#define ORTE_SENSOR_FILE_H
#include "orte_config.h"
#include "orte/mca/sensor/sensor.h"
BEGIN_C_DECLS
/*
 * File sensor component: the standard sensor component header plus
 * the default monitoring parameters.
 */
struct orte_sensor_file_component_t {
orte_sensor_base_component_t super;
/* seconds between samples */
int sample_rate;
/* default file to monitor; may be NULL */
char *file;
/* default choice of attributes to compare between samples */
bool check_size;
bool check_access;
bool check_mod;
/* default tick count at which a stall is reported */
int limit;
};
typedef struct orte_sensor_file_component_t orte_sensor_file_component_t;
ORTE_MODULE_DECLSPEC extern orte_sensor_file_component_t mca_sensor_file_component;
extern orte_sensor_base_module_t orte_sensor_file_module;
END_C_DECLS
#endif

107
orte/mca/sensor/file/sensor_file_component.c Обычный файл
Просмотреть файл

@ -0,0 +1,107 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/base/base.h"
#include "opal/util/output.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "sensor_file.h"
/*
* Local functions
*/
static int orte_sensor_file_open(void);
static int orte_sensor_file_close(void);
static int orte_sensor_file_query(mca_base_module_t **module, int *priority);
/*
 * Component descriptor. Only the base component header is initialized
 * here; the parameter fields that follow it in the struct are filled
 * in by orte_sensor_file_open().
 */
orte_sensor_file_component_t mca_sensor_file_component = {
{
{
ORTE_SENSOR_BASE_VERSION_1_0_0,
"file", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_sensor_file_open, /* component open */
orte_sensor_file_close, /* component close */
orte_sensor_file_query /* component query */
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
}
};
/**
* component open/close/init function
*/
static int orte_sensor_file_open(void)
{
mca_base_component_t *c = &mca_sensor_file_component.super.base_version;
int tmp;
/* lookup parameters */
mca_base_param_reg_int(c, "sample_rate",
"Sample rate in seconds (default=10)",
false, false, 10, &mca_sensor_file_component.sample_rate);
mca_base_param_reg_string(c, "filename",
"File to be monitored",
false, false, NULL, &mca_sensor_file_component.file);
mca_base_param_reg_int(c, "check_size",
"Check the file size",
false, false, false, &tmp);
mca_sensor_file_component.check_size = OPAL_INT_TO_BOOL(tmp);
mca_base_param_reg_int(c, "check_access",
"Check access time",
false, false, false, &tmp);
mca_sensor_file_component.check_access = OPAL_INT_TO_BOOL(tmp);
mca_base_param_reg_int(c, "check_mod",
"Check modification time",
false, false, false, &tmp);
mca_sensor_file_component.check_mod = OPAL_INT_TO_BOOL(tmp);
mca_base_param_reg_int(c, "limit",
"Number of times the sensor can detect no motion before declaring error (default=3)",
false, false, 3, &mca_sensor_file_component.limit);
return ORTE_SUCCESS;
}
/*
 * Component query: hand back the file sensor module together with
 * its selection priority. Always returns ORTE_SUCCESS.
 */
static int orte_sensor_file_query(mca_base_module_t **module, int *priority)
{
    *module = (mca_base_module_t *)&orte_sensor_file_module;
    /* select only if specified */
    *priority = 0;

    return ORTE_SUCCESS;
}
/**
 * Component close: performs no work and always returns ORTE_SUCCESS.
 */
static int orte_sensor_file_close(void)
{
return ORTE_SUCCESS;
}

37
orte/mca/sensor/memusage/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,37 @@
#
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Help text shown by the memusage sensor; distributed and installed as
# package data.
dist_pkgdata_DATA = help-orte-sensor-memusage.txt
# Sources shared by the DSO and the static build of this component.
sources = \
sensor_memusage.c \
sensor_memusage.h \
sensor_memusage_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_sensor_memusage_DSO
component_noinst =
component_install = mca_sensor_memusage.la
else
component_noinst = libmca_sensor_memusage.la
component_install =
endif
# DSO build: installed into $(pkglibdir).
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_sensor_memusage_la_SOURCES = $(sources)
mca_sensor_memusage_la_LDFLAGS = -module -avoid-version
# Static build: convenience library that is not installed.
noinst_LTLIBRARIES = $(component_noinst)
libmca_sensor_memusage_la_SOURCES =$(sources)
libmca_sensor_memusage_la_LDFLAGS = -module -avoid-version

19
orte/mca/sensor/memusage/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,19 @@
# -*- shell-script -*-
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_sensor_memusage_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Runs action-if-found when the shell variable $orte_want_sensors is
# "1", and action-if-not-found otherwise.
AC_DEFUN([MCA_sensor_memusage_CONFIG], [
# if we don't want sensors, don't compile
# this component
AS_IF([test "$orte_want_sensors" = "1"],
[$1], [$2])
])dnl

14
orte/mca/sensor/memusage/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,14 @@
# -*- shell-script -*-
#
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
PARAM_CONFIG_FILES="Makefile"

Просмотреть файл

@ -0,0 +1,20 @@
# -*- text -*-
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for the memory usage sensor
#
[mem-limit-exceeded]
A process has exceeded the specified limit on memory usage:
Node: %s
Process rank: %s
Memory used: %luGbytes
Memory limit: %luGbytes

264
orte/mca/sensor/memusage/sensor_memusage.c Обычный файл
Просмотреть файл

@ -0,0 +1,264 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include <stdio.h>
#include "opal_stdint.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/mca/pstat/pstat.h"
#include "orte/util/show_help.h"
#include "orte/util/proc_info.h"
#include "orte/util/name_fns.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
#include "sensor_memusage.h"
/* declare the API functions */
static int init(void);
static void finalize(void);
static void start(orte_jobid_t job);
static void stop(orte_jobid_t job);
/* instantiate the module - the entries are positional, in the order
 * init, finalize, start, stop
 */
orte_sensor_base_module_t orte_sensor_memusage_module = {
init,
finalize,
start,
stop
};
/* define a tracking object - one is queued on the "jobs" list for
 * every job whose memory usage is being watched
 */
typedef struct {
opal_list_item_t super;
/* job being monitored */
orte_jobid_t jobid;
/* bound compared against the scaled vsize of each local process
 * (reported to the user as "Gbytes") */
unsigned long memory_limit;
} memusage_tracker_t;
/* Constructor for a memusage tracker: start with a zero limit.
 * The jobid is left for the caller to assign. */
static void constructor(memusage_tracker_t *ptr)
{
ptr->memory_limit = 0;
}
/* class instance for memusage_tracker_t: a list item with a
 * constructor and no destructor */
OBJ_CLASS_INSTANCE(memusage_tracker_t,
opal_list_item_t,
constructor, NULL);
/* declare the local functions */
static void sample(int fd, short event, void *arg);
/* local globals */
/* timer event driving sample(); allocated in start(), NULL while no
 * sampling is active */
static opal_event_t *sample_ev = NULL;
/* list of memusage_tracker_t objects, one per monitored job */
static opal_list_t jobs;
/* interval between samples, filled from the component sample_rate */
static struct timeval sample_time;
/* Module init: construct the list of tracked jobs.
 * Always returns ORTE_SUCCESS. */
static int init(void)
{
OBJ_CONSTRUCT(&jobs, opal_list_t);
return ORTE_SUCCESS;
}
/*
 * Module finalize: cancel and free the sampling timer if it exists,
 * release every tracker still queued, then destruct the list.
 */
static void finalize(void)
{
    opal_list_item_t *entry;

    if (NULL != sample_ev) {
        opal_event_del(sample_ev);
        free(sample_ev);
        sample_ev = NULL;
    }

    /* drain the list front to back */
    for (entry = opal_list_remove_first(&jobs);
         NULL != entry;
         entry = opal_list_remove_first(&jobs)) {
        OBJ_RELEASE(entry);
    }

    OBJ_DESTRUCT(&jobs);
}
/*
 * Start monitoring of local processes
 *
 * Finds the local job data for jobid and resolves the memory limit -
 * first from the environment of the job's first app_context, then from
 * the component default. A limit that is not positive means the job is
 * not monitored. Otherwise a tracker is queued and the sampling timer
 * is created if it does not exist yet.
 */
static void start(orte_jobid_t jobid)
{
    mca_base_component_t *c = &mca_sensor_memusage_component.super.base_version;
    memusage_tracker_t *job;
    orte_odls_job_t *jobdat;
    orte_app_context_t *app;
    opal_list_item_t *item;
    int tmp;

    OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                         "%s starting memory monitoring for job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jobid)));

    /* get the local jobdat for this job - walk the whole list and
     * only record an entry whose jobid matches, so that jobdat stays
     * NULL when the job is not known locally
     */
    jobdat = NULL;
    for (item = opal_list_get_first(&orte_local_jobdata);
         item != opal_list_get_end(&orte_local_jobdata);
         item = opal_list_get_next(item)) {
        if (jobid == ((orte_odls_job_t*)item)->jobid) {
            jobdat = (orte_odls_job_t*)item;
            break;
        }
    }
    if (NULL == jobdat) {
        /* no local procs for this job */
        return;
    }

    /* must be at least one app_context, so use the first */
    if (NULL == (app = jobdat->apps[0])) {
        /* got a problem */
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return;
    }

    /* search the environ to get memory limit */
    tmp = 0;
    if (ORTE_SUCCESS != mca_base_param_find_int(c, "memory_limit", app->env, &tmp)) {
        /* was a default value given */
        if (0 < mca_sensor_memusage_component.memory_limit) {
            /* narrowing is deliberate: a default that does not fit
             * ends up non-positive and disables monitoring below
             */
            tmp = (int)mca_sensor_memusage_component.memory_limit;
        }
    }
    if (tmp <= 0) {
        /* we don't want to monitor this job */
        OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                             "%s memory monitoring for job %s is not requested",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jobid)));
        return;
    }

    job = OBJ_NEW(memusage_tracker_t);
    if (NULL == job) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return;
    }
    job->jobid = jobid;
    job->memory_limit = tmp;
    opal_list_append(&jobs, &job->super);

    if (NULL == sample_ev) {
        /* startup a timer to wake us up periodically
         * for a data sample
         */
        sample_ev = (opal_event_t*)malloc(sizeof(opal_event_t));
        if (NULL == sample_ev) {
            /* the tracker stays queued so stop() can still release it */
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return;
        }
        opal_evtimer_set(sample_ev, sample, sample_ev);
        sample_time.tv_sec = mca_sensor_memusage_component.sample_rate;
        sample_time.tv_usec = 0;
        opal_evtimer_add(sample_ev, &sample_time);
    }
    return;
}
/*
 * Stop monitoring the given job: drop the first tracker queued for it
 * and, once nothing is left to watch, tear down the sampling timer.
 */
static void stop(orte_jobid_t jobid)
{
    opal_list_item_t *cur = opal_list_get_first(&jobs);

    while (cur != opal_list_get_end(&jobs)) {
        if (jobid == ((memusage_tracker_t*)cur)->jobid) {
            opal_list_remove_item(&jobs, cur);
            OBJ_RELEASE(cur);
            break;
        }
        cur = opal_list_get_next(cur);
    }

    /* keep sampling while any job remains, or if no timer exists */
    if (NULL == sample_ev || !opal_list_is_empty(&jobs)) {
        return;
    }

    opal_event_del(sample_ev);
    free(sample_ev);
    sample_ev = NULL;
}
/*
 * Timer callback: for every local child that belongs to a monitored
 * job, query its process statistics and report to the user and the
 * errmgr when the scaled vsize exceeds the job's limit. Re-arms the
 * timer before returning.
 *
 * fd, event and arg are not used.
 */
static void sample(int fd, short event, void *arg)
{
    opal_list_item_t *item;
    opal_list_item_t *jitem;
    orte_odls_child_t *child;
    opal_pstats_t stats;
    int rc;
    memusage_tracker_t *job;

    /* if we are not sampling any more, then just return */
    if (NULL == sample_ev) {
        return;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                         "sample:memusage sampling resource usage"));

    /* loop through our local children */
    for (item = opal_list_get_first(&orte_local_children);
         item != opal_list_get_end(&orte_local_children);
         item = opal_list_get_next(item)) {
        child = (orte_odls_child_t*)item;

        /* is this in a job we are monitoring - the job list needs its
         * own iterator so the walk over the children is not disturbed
         */
        job = NULL;
        for (jitem = opal_list_get_first(&jobs);
             jitem != opal_list_get_end(&jobs);
             jitem = opal_list_get_next(jitem)) {
            if (child->name->jobid == ((memusage_tracker_t*)jitem)->jobid) {
                job = (memusage_tracker_t*)jitem;
                break;
            }
        }
        if (NULL == job) {
            continue;
        }

        /* get the process resource utilization stats */
        OBJ_CONSTRUCT(&stats, opal_pstats_t);
        if (ORTE_SUCCESS != (rc = opal_pstat.query(child->pid, &stats))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&stats);
            continue;
        }

        /* NOTE(review): the messages say "Gbytes" while the value is
         * vsize/1000000 - confirm the unit that pstat reports
         */
        OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                             "sample:memusage got memory size of %lu Gbytes for proc %s",
                             (unsigned long)stats.vsize/1000000, ORTE_NAME_PRINT(child->name)));

        /* check the memory size for limit */
        if ((stats.vsize/1000000) > job->memory_limit) {
            /* memory limit exceeded */
            orte_show_help("help-orte-sensor-memusage.txt", "mem-limit-exceeded",
                           true, orte_process_info.nodename, ORTE_VPID_PRINT(child->name->vpid),
                           (unsigned long)stats.vsize/1000000, (unsigned long)job->memory_limit);
            orte_errmgr.update_state(child->name->jobid, ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED,
                                     child->name, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED,
                                     ORTE_ERROR_DEFAULT_EXIT_CODE);
        }
        OBJ_DESTRUCT(&stats);
    }

    /* restart the timer, unless it was torn down while we sampled */
    if (NULL != sample_ev) {
        opal_evtimer_add(sample_ev, &sample_time);
    }
}

37
orte/mca/sensor/memusage/sensor_memusage.h Обычный файл
Просмотреть файл

@ -0,0 +1,37 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Process Resource Utilization sensor
*/
#ifndef ORTE_SENSOR_MEMUSAGE_H
#define ORTE_SENSOR_MEMUSAGE_H
#include "orte_config.h"
#include "orte/mca/sensor/sensor.h"
BEGIN_C_DECLS
/*
 * Memory usage sensor component: the standard sensor component header
 * plus the default monitoring parameters.
 */
struct orte_sensor_memusage_component_t {
orte_sensor_base_component_t super;
/* seconds between samples */
int sample_rate;
/* default memory limit; a value of 0 means no default was given */
uint64_t memory_limit;
};
typedef struct orte_sensor_memusage_component_t orte_sensor_memusage_component_t;
ORTE_MODULE_DECLSPEC extern orte_sensor_memusage_component_t mca_sensor_memusage_component;
extern orte_sensor_base_module_t orte_sensor_memusage_module;
END_C_DECLS
#endif

Просмотреть файл

@ -0,0 +1,94 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/base/base.h"
#include "opal/util/output.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "sensor_memusage.h"
/*
* Local functions
*/
static int orte_sensor_memusage_open(void);
static int orte_sensor_memusage_close(void);
static int orte_sensor_memusage_query(mca_base_module_t **module, int *priority);
/*
 * Component descriptor. Only the base component header is initialized
 * here; sample_rate and memory_limit are filled in by
 * orte_sensor_memusage_open().
 */
orte_sensor_memusage_component_t mca_sensor_memusage_component = {
{
{
ORTE_SENSOR_BASE_VERSION_1_0_0,
"memusage", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_sensor_memusage_open, /* component open */
orte_sensor_memusage_close, /* component close */
orte_sensor_memusage_query /* component query */
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
}
};
/**
 * Component open: register the MCA parameters of the memusage sensor
 * and store their values in the component struct.
 *
 * @return ORTE_SUCCESS, or ORTE_ERR_FATAL when the sample rate is not
 *         a positive number of seconds (it is used as a timer interval)
 */
static int orte_sensor_memusage_open(void)
{
    mca_base_component_t *c = &mca_sensor_memusage_component.super.base_version;
    int tmp;

    /* lookup parameters */
    mca_base_param_reg_int(c, "sample_rate",
                           "Sample rate in seconds (default=10)",
                           false, false, 10, &tmp);
    /* reject zero as well, matching the wording of the message */
    if (tmp <= 0) {
        opal_output(0, "Illegal value %d - must be > 0", tmp);
        return ORTE_ERR_FATAL;
    }
    mca_sensor_memusage_component.sample_rate = tmp;

    mca_base_param_reg_int(c, "memory_limit",
                           "Max virtual memory size in GBytes",
                           false, false, 0, &tmp);
    /* the field is unsigned: keep a negative request from wrapping
     * into a huge limit and treat it as "no default"
     */
    if (tmp < 0) {
        tmp = 0;
    }
    mca_sensor_memusage_component.memory_limit = (uint64_t)tmp;

    return ORTE_SUCCESS;
}
/*
 * Component query: hand back the memusage sensor module together with
 * its selection priority. Always returns ORTE_SUCCESS.
 */
static int orte_sensor_memusage_query(mca_base_module_t **module, int *priority)
{
    *module = (mca_base_module_t *)&orte_sensor_memusage_module;
    /* select only if specified */
    *priority = 0;

    return ORTE_SUCCESS;
}
/**
 * Component close: performs no work and always returns ORTE_SUCCESS.
 */
static int orte_sensor_memusage_close(void)
{
return ORTE_SUCCESS;
}

97
orte/mca/sensor/sensor.h Обычный файл
Просмотреть файл

@ -0,0 +1,97 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* @file:
*
*/
#ifndef MCA_SENSOR_H
#define MCA_SENSOR_H
/*
* includes
*/
#include "orte_config.h"
#include "orte/types.h"
#include "opal/mca/mca.h"
BEGIN_C_DECLS
/*
 * Component functions - all MUST be provided!
 *
 * Signatures of the entry points a sensor module exposes.
 */
/* initialize the selected module; returns an ORTE status code */
typedef int (*orte_sensor_base_module_init_fn_t)(void);
/* finalize the selected module */
typedef void (*orte_sensor_base_module_finalize_fn_t)(void);
/* start collecting data for the given job */
typedef void (*orte_sensor_base_module_start_fn_t)(orte_jobid_t jobid);
/* stop collecting data for the given job */
typedef void (*orte_sensor_base_module_stop_fn_t)(orte_jobid_t jobid);
/* API module */
/*
 * Ver 1.0
 *
 * Function table behind the global "orte_sensor" symbol: only the
 * start and stop entry points are part of the API.
 */
struct orte_sensor_base_API_module_1_0_0_t {
orte_sensor_base_module_start_fn_t start;
orte_sensor_base_module_stop_fn_t stop;
};
typedef struct orte_sensor_base_API_module_1_0_0_t orte_sensor_base_API_module_1_0_0_t;
typedef orte_sensor_base_API_module_1_0_0_t orte_sensor_base_API_module_t;
/*
 * Component modules Ver 1.0
 *
 * Function table each sensor component supplies. Components initialize
 * it positionally, so the field order below must not change.
 */
struct orte_sensor_base_module_1_0_0_t {
orte_sensor_base_module_init_fn_t init;
orte_sensor_base_module_finalize_fn_t finalize;
orte_sensor_base_module_start_fn_t start;
orte_sensor_base_module_stop_fn_t stop;
};
typedef struct orte_sensor_base_module_1_0_0_t orte_sensor_base_module_1_0_0_t;
typedef orte_sensor_base_module_1_0_0_t orte_sensor_base_module_t;
/*
 * the standard component data structure
 *
 * Sensor components embed this as their first member ("super") and
 * append their own parameter fields after it.
 */
struct orte_sensor_base_component_1_0_0_t {
mca_base_component_t base_version;
mca_base_component_data_t base_data;
};
typedef struct orte_sensor_base_component_1_0_0_t orte_sensor_base_component_1_0_0_t;
typedef orte_sensor_base_component_1_0_0_t orte_sensor_base_component_t;
/*
 * Macro for use in components that are of type sensor v1.0.0
 *
 * Expands to the leading entries of a component descriptor
 * initializer: the MCA base version, then the framework name "sensor"
 * and the framework version 1.0.0.
 */
#define ORTE_SENSOR_BASE_VERSION_1_0_0 \
/* sensor v1.0 is chained to MCA v2.0 */ \
MCA_BASE_VERSION_2_0_0, \
/* sensor v1.0 */ \
"sensor", 1, 0, 0
/* Global structure for accessing sensor functions
*/
ORTE_DECLSPEC extern orte_sensor_base_API_module_t orte_sensor; /* holds API function pointers */
END_C_DECLS
#endif /* MCA_SENSOR_H */

50
orte/mca/sensor/sensor_types.h Обычный файл
Просмотреть файл

@ -0,0 +1,50 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef ORTE_MCA_SENSOR_TYPES_H
#define ORTE_MCA_SENSOR_TYPES_H
#include "orte_config.h"
#include "orte/constants.h"
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif /* HAVE_SYS_TIME_H */
#include "opal/dss/dss_types.h"
/*
* General SENSOR types - instanced in runtime/orte_globals.c
*/
BEGIN_C_DECLS
/* Scale identifiers.
 * NOTE(review): nothing visible here uses them; presumably they name
 * how a sensor's values are scaled - confirm against the callers */
enum {
ORTE_SENSOR_SCALE_LINEAR,
ORTE_SENSOR_SCALE_LOG,
ORTE_SENSOR_SCALE_SIGMOID
};
/*
 * Structure for passing data from sensors
 */
typedef struct {
opal_object_t super;
/* presumably the name of the reporting sensor - TODO confirm */
char *sensor;
/* time associated with the data */
struct timeval timestamp;
/* the data itself, carried as a byte object */
opal_byte_object_t data;
} orte_sensor_data_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_sensor_data_t);
END_C_DECLS
#endif

Просмотреть файл

@ -305,6 +305,20 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src,
return rc; return rc;
} }
/* pack the max local restarts */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(jobs[i]->max_local_restarts)), 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the max global restarts */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(jobs[i]->max_global_restarts)), 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
#if OPAL_ENABLE_FT_CR == 1 #if OPAL_ENABLE_FT_CR == 1
/* pack the ckpt state */ /* pack the ckpt state */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
@ -491,6 +505,13 @@ int orte_dt_pack_proc(opal_buffer_t *buffer, const void *src,
return rc; return rc;
} }
/* pack the number of relocates */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)&(procs[i]->relocates), 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
#if OPAL_ENABLE_FT_CR == 1 #if OPAL_ENABLE_FT_CR == 1
/* pack the ckpt state */ /* pack the ckpt state */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,

Просмотреть файл

@ -27,6 +27,7 @@
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "opal/dss/dss.h" #include "opal/dss/dss.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
#include "orte/util/error_strings.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/runtime/data_type_support/orte_dt_support.h" #include "orte/runtime/data_type_support/orte_dt_support.h"
@ -214,11 +215,10 @@ int orte_dt_print_job(char **output, char *prefix, orte_job_t *src, opal_data_ty
asprintf(&pfx2, "%s", prefix); asprintf(&pfx2, "%s", prefix);
} }
asprintf(&tmp, "\n%sData for job: %s\tNum apps: %ld\tControls: %0x\tStdin target: %s\tState: %0x\tAbort: %s", pfx2, asprintf(&tmp, "\n%sData for job: %s\tNum apps: %ld\tControls: %0x\tStdin target: %s\tState: %s\tAbort: %s", pfx2,
ORTE_JOBID_PRINT(src->jobid), ORTE_JOBID_PRINT(src->jobid),
(long)src->num_apps, src->controls, ORTE_VPID_PRINT(src->stdin_target), (long)src->num_apps, src->controls, ORTE_VPID_PRINT(src->stdin_target),
src->state, src->abort ? "True" : "False"); orte_job_state_to_str(src->state), src->abort ? "True" : "False");
asprintf(&pfx, "%s\t", pfx2); asprintf(&pfx, "%s\t", pfx2);
free(pfx2); free(pfx2);
@ -248,7 +248,8 @@ int orte_dt_print_job(char **output, char *prefix, orte_job_t *src, opal_data_ty
tmp = tmp2; tmp = tmp2;
} }
asprintf(&tmp2, "%s\n%sNum procs: %ld\tMax Restarts: %d", tmp, pfx, (long)src->num_procs, src->max_restarts); asprintf(&tmp2, "%s\n%sNum procs: %ld\tMax Local Restarts: %d\tMax Global Restarts", tmp, pfx,
(long)src->num_procs, src->max_local_restarts, src->max_global_restarts);
free(tmp); free(tmp);
tmp = tmp2; tmp = tmp2;
@ -448,30 +449,6 @@ PRINT_PROCS:
/* /*
* PROC * PROC
*/ */
static char* orte_dt_print_proc_state(orte_proc_state_t state)
{
switch(state) {
case ORTE_PROC_STATE_INIT:
return "init";
case ORTE_PROC_STATE_LAUNCHED:
return "launched";
case ORTE_PROC_STATE_RUNNING:
return "running";
case ORTE_PROC_STATE_TERMINATED:
return "terminated";
case ORTE_PROC_STATE_ABORTED:
return "aborted";
case ORTE_PROC_STATE_FAILED_TO_START:
return "failed-to-start";
case ORTE_PROC_STATE_ABORTED_BY_SIG:
return "aborted-by-signal";
case ORTE_PROC_STATE_TERM_WO_SYNC:
return "terminated-without-sync";
default:
return NULL;
}
}
int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_type_t type) int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_type_t type)
{ {
char *tmp, *tmp2, *pfx2; char *tmp, *tmp2, *pfx2;
@ -488,22 +465,12 @@ int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_
if (orte_xml_output) { if (orte_xml_output) {
/* need to create the output in XML format */ /* need to create the output in XML format */
tmp = orte_dt_print_proc_state(src->state); if (0 == src->pid) {
if (NULL == tmp) { asprintf(output, "%s<process rank=\"%s\" status=\"%s\"/>\n", pfx2,
if (0 == src->pid) { ORTE_VPID_PRINT(src->name.vpid), orte_proc_state_to_str(src->state));
asprintf(output, "%s<process rank=\"%s\"/>\n", pfx2, ORTE_VPID_PRINT(src->name.vpid));
} else {
asprintf(output, "%s<process rank=\"%s\" pid=\"%d\"/>\n", pfx2,
ORTE_VPID_PRINT(src->name.vpid), (int)src->pid);
}
} else { } else {
if (0 == src->pid) { asprintf(output, "%s<process rank=\"%s\" pid=\"%d\" status=\"%s\"/>\n", pfx2,
asprintf(output, "%s<process rank=\"%s\" status=\"%s\"/>\n", pfx2, ORTE_VPID_PRINT(src->name.vpid), (int)src->pid, orte_proc_state_to_str(src->state));
ORTE_VPID_PRINT(src->name.vpid), tmp);
} else {
asprintf(output, "%s<process rank=\"%s\" pid=\"%d\" status=\"%s\"/>\n", pfx2,
ORTE_VPID_PRINT(src->name.vpid), (int)src->pid, tmp);
}
} }
free(pfx2); free(pfx2);
return ORTE_SUCCESS; return ORTE_SUCCESS;
@ -527,8 +494,8 @@ int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_
free(tmp); free(tmp);
tmp = tmp2; tmp = tmp2;
asprintf(&tmp2, "%s\n%s\tState: %0x\tRestarts: %d\tApp_context: %ld\tSlot list: %s", tmp, pfx2, asprintf(&tmp2, "%s\n%s\tState: %s\tRestarts: %d\tRelocates: %d\tApp_context: %ld\tSlot list: %s", tmp, pfx2,
src->state, src->restarts, (long)src->app_idx, orte_proc_state_to_str(src->state), src->restarts, src->relocates, (long)src->app_idx,
(NULL == src->slot_list) ? "NULL" : src->slot_list); (NULL == src->slot_list) ? "NULL" : src->slot_list);
free(tmp); free(tmp);

Просмотреть файл

@ -307,6 +307,22 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest,
return rc; return rc;
} }
/* unpack the max local restarts */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
(&(jobs[i]->max_local_restarts)), &n, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the max global restarts */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
(&(jobs[i]->max_global_restarts)), &n, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
#if OPAL_ENABLE_FT_CR == 1 #if OPAL_ENABLE_FT_CR == 1
/* unpack the ckpt state */ /* unpack the ckpt state */
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
@ -522,6 +538,14 @@ int orte_dt_unpack_proc(opal_buffer_t *buffer, void *dest,
return rc; return rc;
} }
/* unpack the number of relocates */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
(&(procs[i]->relocates)), &n, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
#if OPAL_ENABLE_FT_CR == 1 #if OPAL_ENABLE_FT_CR == 1
/* unpack the ckpt state */ /* unpack the ckpt state */
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,

Просмотреть файл

@ -640,7 +640,8 @@ static void orte_job_construct(orte_job_t* job)
OBJ_CONSTRUCT(&job->reported_cond, opal_condition_t); OBJ_CONSTRUCT(&job->reported_cond, opal_condition_t);
job->not_reported = true; job->not_reported = true;
job->max_restarts = INT32_MAX; job->max_local_restarts = 0;
job->max_global_restarts = 0;
job->launch_msg_sent.tv_sec = 0; job->launch_msg_sent.tv_sec = 0;
job->launch_msg_sent.tv_usec = 0; job->launch_msg_sent.tv_usec = 0;
@ -833,6 +834,7 @@ static void orte_proc_construct(orte_proc_t* proc)
proc->rml_uri = NULL; proc->rml_uri = NULL;
proc->beat = 0; proc->beat = 0;
proc->restarts = 0; proc->restarts = 0;
proc->relocates = 0;
#if OPAL_ENABLE_FT_CR == 1 #if OPAL_ENABLE_FT_CR == 1
proc->ckpt_state = 0; proc->ckpt_state = 0;
proc->ckpt_snapshot_ref = NULL; proc->ckpt_snapshot_ref = NULL;

Просмотреть файл

@ -390,8 +390,10 @@ typedef struct {
bool abort; bool abort;
/* proc that caused that to happen */ /* proc that caused that to happen */
struct orte_proc_t *aborted_proc; struct orte_proc_t *aborted_proc;
/* max number of times a process can be restarted */ /* max number of times a process can be restarted locally */
int32_t max_restarts; int32_t max_local_restarts;
/* max number of times a process can be relocated to another node */
int32_t max_global_restarts;
/* time launch message was sent */ /* time launch message was sent */
struct timeval launch_msg_sent; struct timeval launch_msg_sent;
/* max time for launch msg to be received */ /* max time for launch msg to be received */
@ -450,6 +452,8 @@ struct orte_proc_t {
time_t beat; time_t beat;
/* number of times this process has been restarted */ /* number of times this process has been restarted */
int32_t restarts; int32_t restarts;
/* number of times this process has been relocated */
int32_t relocates;
#if OPAL_ENABLE_FT_CR == 1 #if OPAL_ENABLE_FT_CR == 1
/* ckpt state */ /* ckpt state */
size_t ckpt_state; size_t ckpt_state;

Просмотреть файл

@ -89,6 +89,10 @@
#include "orte/mca/snapc/snapc.h" #include "orte/mca/snapc/snapc.h"
#include "orte/mca/snapc/base/base.h" #include "orte/mca/snapc/base/base.h"
#endif #endif
#if ORTE_ENABLE_SENSORS
#include "orte/mca/sensor/sensor.h"
#include "orte/mca/sensor/base/base.h"
#endif
#include "orte/mca/filem/filem.h" #include "orte/mca/filem/filem.h"
#include "orte/mca/filem/base/base.h" #include "orte/mca/filem/base/base.h"
#endif #endif
@ -432,6 +436,16 @@ void orte_info_open_components(void)
opal_pointer_array_add(&component_map, map); opal_pointer_array_add(&component_map, map);
#endif #endif
#if ORTE_ENABLE_SENSORS
if (ORTE_SUCCESS != orte_sensor_base_open()) {
goto error;
}
map = OBJ_NEW(orte_info_component_map_t);
map->type = strdup("sensor");
map->components = &mca_sensor_base_components_available;
opal_pointer_array_add(&component_map, map);
#endif
if (ORTE_SUCCESS != orte_filem_base_open()) { if (ORTE_SUCCESS != orte_filem_base_open()) {
goto error; goto error;
} }

Просмотреть файл

@ -209,6 +209,9 @@ int main(int argc, char *argv[])
opal_pointer_array_add(&mca_types, "plm"); opal_pointer_array_add(&mca_types, "plm");
#if OPAL_ENABLE_FT_CR == 1 #if OPAL_ENABLE_FT_CR == 1
opal_pointer_array_add(&mca_types, "snapc"); opal_pointer_array_add(&mca_types, "snapc");
#endif
#if ORTE_ENABLE_SENSORS
opal_pointer_array_add(&mca_types, "sensor");
#endif #endif
opal_pointer_array_add(&mca_types, "filem"); opal_pointer_array_add(&mca_types, "filem");
#endif #endif

Просмотреть файл

@ -81,7 +81,7 @@
#include "orte/mca/rml/rml_types.h" #include "orte/mca/rml/rml_types.h"
#include "orte/mca/rml/base/rml_contact.h" #include "orte/mca/rml/base/rml_contact.h"
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h" #include "orte/mca/errmgr/base/errmgr_private.h"
#include "orte/mca/grpcomm/grpcomm.h" #include "orte/mca/grpcomm/grpcomm.h"
#include "orte/runtime/runtime.h" #include "orte/runtime/runtime.h"
@ -1139,8 +1139,8 @@ static void abort_exit_callback(int fd, short ign, void *arg)
* This keeps the errmgr from trying to recover from the shutdown * This keeps the errmgr from trying to recover from the shutdown
* procedure. * procedure.
*/ */
orte_errmgr_base_enable_recovery = false; orte_errmgr_base.enable_recovery = false;
orte_errmgr_base_shutting_down = true; orte_errmgr_base.shutting_down = true;
/* terminate the orteds - they will automatically kill /* terminate the orteds - they will automatically kill
* their local procs * their local procs

Просмотреть файл

@ -141,45 +141,45 @@ const char *orte_job_state_to_str(orte_job_state_t state)
{ {
switch(state) { switch(state) {
case ORTE_JOB_STATE_UNDEF: case ORTE_JOB_STATE_UNDEF:
return strdup("UNDEFINED"); return "UNDEFINED";
case ORTE_JOB_STATE_INIT: case ORTE_JOB_STATE_INIT:
return strdup("INITIALIZED"); return "INITIALIZED";
case ORTE_JOB_STATE_RESTART: case ORTE_JOB_STATE_RESTART:
return strdup("RESTARTING"); return "RESTARTING";
case ORTE_JOB_STATE_LAUNCHED: case ORTE_JOB_STATE_LAUNCHED:
return strdup("LAUNCHED"); return "LAUNCHED";
case ORTE_JOB_STATE_RUNNING: case ORTE_JOB_STATE_RUNNING:
return strdup("RUNNING"); return "RUNNING";
case ORTE_JOB_STATE_SUSPENDED: case ORTE_JOB_STATE_SUSPENDED:
return strdup("SUSPENDED"); return "SUSPENDED";
case ORTE_JOB_STATE_REGISTERED: case ORTE_JOB_STATE_REGISTERED:
return strdup("SYNC REGISTERED"); return "SYNC REGISTERED";
case ORTE_JOB_STATE_UNTERMINATED: case ORTE_JOB_STATE_UNTERMINATED:
return strdup("UNTERMINATED"); return "UNTERMINATED";
case ORTE_JOB_STATE_TERMINATED: case ORTE_JOB_STATE_TERMINATED:
return strdup("NORMALLY TERMINATED"); return "NORMALLY TERMINATED";
case ORTE_JOB_STATE_ABORTED: case ORTE_JOB_STATE_ABORTED:
return strdup("ABORTED"); return "ABORTED";
case ORTE_JOB_STATE_FAILED_TO_START: case ORTE_JOB_STATE_FAILED_TO_START:
return strdup("FAILED TO START"); return "FAILED TO START";
case ORTE_JOB_STATE_ABORTED_BY_SIG: case ORTE_JOB_STATE_ABORTED_BY_SIG:
return strdup("ABORTED BY SIGNAL"); return "ABORTED BY SIGNAL";
case ORTE_JOB_STATE_ABORTED_WO_SYNC: case ORTE_JOB_STATE_ABORTED_WO_SYNC:
return strdup("TERMINATED WITHOUT SYNC"); return "TERMINATED WITHOUT SYNC";
case ORTE_JOB_STATE_KILLED_BY_CMD: case ORTE_JOB_STATE_KILLED_BY_CMD:
return strdup("KILLED BY INTERNAL COMMAND"); return "KILLED BY INTERNAL COMMAND";
case ORTE_JOB_STATE_COMM_FAILED: case ORTE_JOB_STATE_COMM_FAILED:
return strdup("COMMUNICATION FAILURE"); return "COMMUNICATION FAILURE";
case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED: case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED:
return strdup("SENSOR BOUND EXCEEDED"); return "SENSOR BOUND EXCEEDED";
break; break;
case ORTE_JOB_STATE_NEVER_LAUNCHED: case ORTE_JOB_STATE_NEVER_LAUNCHED:
return strdup("NEVER LAUNCHED"); return "NEVER LAUNCHED";
case ORTE_JOB_STATE_ABORT_ORDERED: case ORTE_JOB_STATE_ABORT_ORDERED:
return strdup("ABORT IN PROGRESS"); return "ABORT IN PROGRESS";
default: default:
return strdup("UNKNOWN STATE!"); return "UNKNOWN STATE!";
} }
} }
@ -187,39 +187,39 @@ const char *orte_proc_state_to_str(orte_proc_state_t state)
{ {
switch(state) { switch(state) {
case ORTE_PROC_STATE_UNDEF: case ORTE_PROC_STATE_UNDEF:
return strdup("UNDEFINED"); return "UNDEFINED";
case ORTE_PROC_STATE_INIT: case ORTE_PROC_STATE_INIT:
return strdup("INITIALIZED"); return "INITIALIZED";
case ORTE_PROC_STATE_RESTART: case ORTE_PROC_STATE_RESTART:
return strdup("RESTARTING"); return "RESTARTING";
case ORTE_PROC_STATE_LAUNCHED: case ORTE_PROC_STATE_LAUNCHED:
return strdup("LAUNCHED"); return "LAUNCHED";
case ORTE_PROC_STATE_RUNNING: case ORTE_PROC_STATE_RUNNING:
return strdup("RUNNING"); return "RUNNING";
case ORTE_PROC_STATE_REGISTERED: case ORTE_PROC_STATE_REGISTERED:
return strdup("SYNC REGISTERED"); return "SYNC REGISTERED";
case ORTE_PROC_STATE_UNTERMINATED: case ORTE_PROC_STATE_UNTERMINATED:
return strdup("UNTERMINATED"); return "UNTERMINATED";
case ORTE_PROC_STATE_TERMINATED: case ORTE_PROC_STATE_TERMINATED:
return strdup("NORMALLY TERMINATED"); return "NORMALLY TERMINATED";
case ORTE_PROC_STATE_ABORTED: case ORTE_PROC_STATE_ABORTED:
return strdup("ABORTED"); return "ABORTED";
case ORTE_PROC_STATE_FAILED_TO_START: case ORTE_PROC_STATE_FAILED_TO_START:
return strdup("FAILED TO START"); return "FAILED TO START";
case ORTE_PROC_STATE_ABORTED_BY_SIG: case ORTE_PROC_STATE_ABORTED_BY_SIG:
return strdup("ABORTED BY SIGNAL"); return "ABORTED BY SIGNAL";
case ORTE_PROC_STATE_TERM_WO_SYNC: case ORTE_PROC_STATE_TERM_WO_SYNC:
return strdup("TERMINATED WITHOUT SYNC"); return "TERMINATED WITHOUT SYNC";
case ORTE_PROC_STATE_KILLED_BY_CMD: case ORTE_PROC_STATE_KILLED_BY_CMD:
return strdup("KILLED BY INTERNAL COMMAND"); return "KILLED BY INTERNAL COMMAND";
case ORTE_PROC_STATE_COMM_FAILED: case ORTE_PROC_STATE_COMM_FAILED:
return strdup("COMMUNICATION FAILURE"); return "COMMUNICATION FAILURE";
case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED:
return strdup("SENSOR BOUND EXCEEDED"); return "SENSOR BOUND EXCEEDED";
break; break;
default: default:
return strdup("UNKNOWN STATE!"); return "UNKNOWN STATE!";
} }
} }