1
1

Add an MCA parameter to set the "fork agent" - i.e., a program to be run when forking off a process (e.g., valgrind). While you could specify this with "mpirun -n N fork_agent ./my_app", not everyone launches processes with ORTE via mpirun.

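Provide the ability to store recent stat histories for nodes and processes using the opal_ring_buffer class. The number of samples kept is set by the new "stat_history_size" parameter (default: 1).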

This commit was SVN r24842.
Этот коммит содержится в:
Ralph Castain 2011-06-30 03:12:38 +00:00
родитель 2e1fa3e08e
Коммит c449871ade
4 изменённых файлов: 68 добавлений и 10 удалений

Просмотреть файл

@ -45,6 +45,7 @@
#include "opal/util/argv.h"
#include "opal/util/opal_sos.h"
#include "opal/util/os_path.h"
#include "opal/util/path.h"
#include "opal/util/sys_limits.h"
#include "opal/dss/dss.h"
#include "opal/mca/paffinity/base/base.h"
@ -1888,8 +1889,30 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
}
}
} else if (NULL != orte_fork_agent) {
/* we were given a fork agent - use it */
argvsav = opal_argv_copy(app->argv);
/* free the argv */
opal_argv_free(app->argv);
app->argv = NULL;
/* now create a new one that starts with the fork agent */
app->argv = opal_argv_copy(orte_fork_agent);
/* add back the original argv */
for (inm=0; NULL != argvsav[inm]; inm++) {
opal_argv_append_nosize(&app->argv, argvsav[inm]);
}
/* the app exe name itself is in the argvsav array, so
* we can recover it from there later
*/
free(app->app);
app->app = opal_path_findv(orte_fork_agent[0], X_OK, orte_launch_environ, NULL);
if (NULL == app->app) {
opal_output(0, "%s CANNOT FIND FORK AGENT %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_fork_agent[0]);
rc = ORTE_ERR_NOT_FOUND;
goto CLEANUP;
}
}
/* setup the rest of the environment with the proc-specific items - these
* will be overwritten for each child
*/

Просмотреть файл

@ -82,8 +82,10 @@ char **orte_launch_environ;
bool orte_hnp_is_allocated = false;
bool orte_allocation_required;
/* launch agents */
char *orte_launch_agent = NULL;
char **orted_cmd_line=NULL;
char **orte_fork_agent=NULL;
/* debugger job */
orte_job_t *orte_debugger_daemon=NULL;
@ -183,6 +185,9 @@ bool orte_abort_non_zero_exit;
/* VM control */
bool orte_vm_launch = false;
/* length of stat history to keep */
int orte_stat_history_size;
#endif /* !ORTE_DISABLE_FULL_RTE */
int orte_debug_output = -1;
@ -841,14 +846,16 @@ static void orte_node_construct(orte_node_t* node)
OBJ_CONSTRUCT(&node->resources, opal_list_t);
OBJ_CONSTRUCT(&node->stats, opal_node_stats_t);
OBJ_CONSTRUCT(&node->stats, opal_ring_buffer_t);
opal_ring_buffer_init(&node->stats, orte_stat_history_size);
}
static void orte_node_destruct(orte_node_t* node)
{
int i;
opal_list_item_t *item;
opal_node_stats_t *stats;
if (NULL != node->name) {
free(node->name);
node->name = NULL;
@ -887,6 +894,11 @@ static void orte_node_destruct(orte_node_t* node)
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&node->resources);
while (NULL != (stats = (opal_node_stats_t*)opal_ring_buffer_pop(&node->stats))) {
OBJ_RELEASE(stats);
}
OBJ_DESTRUCT(&node->stats);
}
@ -919,7 +931,8 @@ static void orte_proc_construct(orte_proc_t* proc)
proc->last_failure.tv_usec = 0;
proc->reported = false;
proc->beat = 0;
OBJ_CONSTRUCT(&proc->stats, opal_pstats_t);
OBJ_CONSTRUCT(&proc->stats, opal_ring_buffer_t);
opal_ring_buffer_init(&proc->stats, orte_stat_history_size);
proc->name.epoch = ORTE_EPOCH_MIN;
#if OPAL_ENABLE_FT_CR == 1
proc->ckpt_state = 0;
@ -930,6 +943,8 @@ static void orte_proc_construct(orte_proc_t* proc)
static void orte_proc_destruct(orte_proc_t* proc)
{
opal_pstats_t *stats;
/* do NOT free the nodename field as this is
* simply a pointer to a field in the
* associated node object - the node object
@ -950,7 +965,10 @@ static void orte_proc_destruct(orte_proc_t* proc)
free(proc->rml_uri);
proc->rml_uri = NULL;
}
while (NULL != (stats = (opal_pstats_t*)opal_ring_buffer_pop(&proc->stats))) {
OBJ_RELEASE(stats);
}
OBJ_DESTRUCT(&proc->stats);
#if OPAL_ENABLE_FT_CR == 1

Просмотреть файл

@ -36,6 +36,7 @@
#include "opal/class/opal_pointer_array.h"
#include "opal/class/opal_value_array.h"
#include "opal/class/opal_ring_buffer.h"
#include "opal/threads/threads.h"
#include "orte/mca/plm/plm_types.h"
@ -299,8 +300,8 @@ typedef struct {
char *username;
/* list of known system resources for this node */
opal_list_t resources;
/* stats at last sampling */
opal_node_stats_t stats;
/* history of resource usage - sized by sensor framework */
opal_ring_buffer_t stats;
} orte_node_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_node_t);
@ -499,8 +500,8 @@ struct orte_proc_t {
bool reported;
/* if heartbeat recvd during last time period */
int beat;
/* process stats at last sampling */
opal_pstats_t stats;
/* history of resource usage - sized by sensor framework */
opal_ring_buffer_t stats;
#if OPAL_ENABLE_FT_CR == 1
/* ckpt state */
size_t ckpt_state;
@ -612,8 +613,10 @@ ORTE_DECLSPEC extern char **orte_launch_environ;
ORTE_DECLSPEC extern bool orte_hnp_is_allocated;
ORTE_DECLSPEC extern bool orte_allocation_required;
/* launch agents */
ORTE_DECLSPEC extern char *orte_launch_agent;
ORTE_DECLSPEC extern char **orted_cmd_line;
ORTE_DECLSPEC extern char **orte_fork_agent;
/* debugger job */
ORTE_DECLSPEC extern orte_job_t *orte_debugger_daemon;
@ -722,6 +725,9 @@ ORTE_DECLSPEC extern bool orte_abort_non_zero_exit;
/* VM control */
ORTE_DECLSPEC extern bool orte_vm_launch;
/* length of stat history to keep */
ORTE_DECLSPEC extern int orte_stat_history_size;
#endif /* ORTE_DISABLE_FULL_SUPPORT */
END_C_DECLS

Просмотреть файл

@ -311,6 +311,13 @@ int orte_register_params(void)
"Command used to start processes on remote nodes (default: orted)",
false, false, "orted", &orte_launch_agent);
mca_base_param_reg_string_name("orte", "fork_agent",
"Command used to fork processes on remote nodes (default: NULL)",
false, false, NULL, &strval);
if (NULL != strval) {
orte_fork_agent = opal_argv_split(strval, ' ');
}
/* whether or not to require RM allocation */
mca_base_param_reg_int_name("orte", "allocation_required",
"Whether or not an allocation by a resource manager is required [default: no]",
@ -440,7 +447,7 @@ int orte_register_params(void)
/* tool communication controls */
mca_base_param_reg_string_name("orte", "report_events",
"URI to which events are to be reported (default: NULL)]",
"URI to which events are to be reported (default: NULL)",
false, false, NULL, &orte_report_events_uri);
if (NULL != orte_report_events_uri) {
orte_report_events = true;
@ -503,6 +510,10 @@ int orte_register_params(void)
false, false, (int)false, &value);
orte_vm_launch = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int_name("orte", "stat_history_size",
"Number of stat samples to keep",
false, false, 1, &orte_stat_history_size);
#endif /* ORTE_DISABLE_FULL_SUPPORT */
return ORTE_SUCCESS;