Add an MCA param to set the "fork agent" - i.e., a program to be run when forking off a process (e.g., valgrind). While you could specify this by "mpirun -n N fork_agent ./my_app", not everyone launches procs with ORTE from mpirun.
Provide the ability to store recent stat histories using the ring_buffer class. This commit was SVN r24842.
Этот коммит содержится в:
родитель
2e1fa3e08e
Коммит
c449871ade
@ -45,6 +45,7 @@
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
#include "opal/util/os_path.h"
|
||||
#include "opal/util/path.h"
|
||||
#include "opal/util/sys_limits.h"
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/mca/paffinity/base/base.h"
|
||||
@ -1888,8 +1889,30 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
}
|
||||
|
||||
}
|
||||
} else if (NULL != orte_fork_agent) {
|
||||
/* we were given a fork agent - use it */
|
||||
argvsav = opal_argv_copy(app->argv);
|
||||
/* free the argv */
|
||||
opal_argv_free(app->argv);
|
||||
app->argv = NULL;
|
||||
/* now create a new one that starts with the fork agent */
|
||||
app->argv = opal_argv_copy(orte_fork_agent);
|
||||
/* add back the original argv */
|
||||
for (inm=0; NULL != argvsav[inm]; inm++) {
|
||||
opal_argv_append_nosize(&app->argv, argvsav[inm]);
|
||||
}
|
||||
/* the app exe name itself is in the argvsav array, so
|
||||
* we can recover it from there later
|
||||
*/
|
||||
free(app->app);
|
||||
app->app = opal_path_findv(orte_fork_agent[0], X_OK, orte_launch_environ, NULL);
|
||||
if (NULL == app->app) {
|
||||
opal_output(0, "%s CANNOT FIND FORK AGENT %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_fork_agent[0]);
|
||||
rc = ORTE_ERR_NOT_FOUND;
|
||||
goto CLEANUP;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* setup the rest of the environment with the proc-specific items - these
|
||||
* will be overwritten for each child
|
||||
*/
|
||||
|
@ -82,8 +82,10 @@ char **orte_launch_environ;
|
||||
bool orte_hnp_is_allocated = false;
|
||||
bool orte_allocation_required;
|
||||
|
||||
/* launch agents */
|
||||
char *orte_launch_agent = NULL;
|
||||
char **orted_cmd_line=NULL;
|
||||
char **orte_fork_agent=NULL;
|
||||
|
||||
/* debugger job */
|
||||
orte_job_t *orte_debugger_daemon=NULL;
|
||||
@ -183,6 +185,9 @@ bool orte_abort_non_zero_exit;
|
||||
/* VM control */
|
||||
bool orte_vm_launch = false;
|
||||
|
||||
/* length of stat history to keep */
|
||||
int orte_stat_history_size;
|
||||
|
||||
#endif /* !ORTE_DISABLE_FULL_RTE */
|
||||
|
||||
int orte_debug_output = -1;
|
||||
@ -841,14 +846,16 @@ static void orte_node_construct(orte_node_t* node)
|
||||
|
||||
OBJ_CONSTRUCT(&node->resources, opal_list_t);
|
||||
|
||||
OBJ_CONSTRUCT(&node->stats, opal_node_stats_t);
|
||||
OBJ_CONSTRUCT(&node->stats, opal_ring_buffer_t);
|
||||
opal_ring_buffer_init(&node->stats, orte_stat_history_size);
|
||||
}
|
||||
|
||||
static void orte_node_destruct(orte_node_t* node)
|
||||
{
|
||||
int i;
|
||||
opal_list_item_t *item;
|
||||
|
||||
opal_node_stats_t *stats;
|
||||
|
||||
if (NULL != node->name) {
|
||||
free(node->name);
|
||||
node->name = NULL;
|
||||
@ -887,6 +894,11 @@ static void orte_node_destruct(orte_node_t* node)
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&node->resources);
|
||||
|
||||
while (NULL != (stats = (opal_node_stats_t*)opal_ring_buffer_pop(&node->stats))) {
|
||||
OBJ_RELEASE(stats);
|
||||
}
|
||||
OBJ_DESTRUCT(&node->stats);
|
||||
}
|
||||
|
||||
|
||||
@ -919,7 +931,8 @@ static void orte_proc_construct(orte_proc_t* proc)
|
||||
proc->last_failure.tv_usec = 0;
|
||||
proc->reported = false;
|
||||
proc->beat = 0;
|
||||
OBJ_CONSTRUCT(&proc->stats, opal_pstats_t);
|
||||
OBJ_CONSTRUCT(&proc->stats, opal_ring_buffer_t);
|
||||
opal_ring_buffer_init(&proc->stats, orte_stat_history_size);
|
||||
proc->name.epoch = ORTE_EPOCH_MIN;
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
proc->ckpt_state = 0;
|
||||
@ -930,6 +943,8 @@ static void orte_proc_construct(orte_proc_t* proc)
|
||||
|
||||
static void orte_proc_destruct(orte_proc_t* proc)
|
||||
{
|
||||
opal_pstats_t *stats;
|
||||
|
||||
/* do NOT free the nodename field as this is
|
||||
* simply a pointer to a field in the
|
||||
* associated node object - the node object
|
||||
@ -950,7 +965,10 @@ static void orte_proc_destruct(orte_proc_t* proc)
|
||||
free(proc->rml_uri);
|
||||
proc->rml_uri = NULL;
|
||||
}
|
||||
|
||||
|
||||
while (NULL != (stats = (opal_pstats_t*)opal_ring_buffer_pop(&proc->stats))) {
|
||||
OBJ_RELEASE(stats);
|
||||
}
|
||||
OBJ_DESTRUCT(&proc->stats);
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
|
@ -36,6 +36,7 @@
|
||||
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
#include "opal/class/opal_value_array.h"
|
||||
#include "opal/class/opal_ring_buffer.h"
|
||||
#include "opal/threads/threads.h"
|
||||
|
||||
#include "orte/mca/plm/plm_types.h"
|
||||
@ -299,8 +300,8 @@ typedef struct {
|
||||
char *username;
|
||||
/* list of known system resources for this node */
|
||||
opal_list_t resources;
|
||||
/* stats at last sampling */
|
||||
opal_node_stats_t stats;
|
||||
/* history of resource usage - sized by sensor framework */
|
||||
opal_ring_buffer_t stats;
|
||||
} orte_node_t;
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_node_t);
|
||||
|
||||
@ -499,8 +500,8 @@ struct orte_proc_t {
|
||||
bool reported;
|
||||
/* if heartbeat recvd during last time period */
|
||||
int beat;
|
||||
/* process stats at last sampling */
|
||||
opal_pstats_t stats;
|
||||
/* history of resource usage - sized by sensor framework */
|
||||
opal_ring_buffer_t stats;
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
/* ckpt state */
|
||||
size_t ckpt_state;
|
||||
@ -612,8 +613,10 @@ ORTE_DECLSPEC extern char **orte_launch_environ;
|
||||
ORTE_DECLSPEC extern bool orte_hnp_is_allocated;
|
||||
ORTE_DECLSPEC extern bool orte_allocation_required;
|
||||
|
||||
/* launch agents */
|
||||
ORTE_DECLSPEC extern char *orte_launch_agent;
|
||||
ORTE_DECLSPEC extern char **orted_cmd_line;
|
||||
ORTE_DECLSPEC extern char **orte_fork_agent;
|
||||
|
||||
/* debugger job */
|
||||
ORTE_DECLSPEC extern orte_job_t *orte_debugger_daemon;
|
||||
@ -722,6 +725,9 @@ ORTE_DECLSPEC extern bool orte_abort_non_zero_exit;
|
||||
/* VM control */
|
||||
ORTE_DECLSPEC extern bool orte_vm_launch;
|
||||
|
||||
/* length of stat history to keep */
|
||||
ORTE_DECLSPEC extern int orte_stat_history_size;
|
||||
|
||||
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
||||
|
||||
END_C_DECLS
|
||||
|
@ -311,6 +311,13 @@ int orte_register_params(void)
|
||||
"Command used to start processes on remote nodes (default: orted)",
|
||||
false, false, "orted", &orte_launch_agent);
|
||||
|
||||
mca_base_param_reg_string_name("orte", "fork_agent",
|
||||
"Command used to fork processes on remote nodes (default: NULL)",
|
||||
false, false, NULL, &strval);
|
||||
if (NULL != strval) {
|
||||
orte_fork_agent = opal_argv_split(strval, ' ');
|
||||
}
|
||||
|
||||
/* whether or not to require RM allocation */
|
||||
mca_base_param_reg_int_name("orte", "allocation_required",
|
||||
"Whether or not an allocation by a resource manager is required [default: no]",
|
||||
@ -440,7 +447,7 @@ int orte_register_params(void)
|
||||
|
||||
/* tool communication controls */
|
||||
mca_base_param_reg_string_name("orte", "report_events",
|
||||
"URI to which events are to be reported (default: NULL)]",
|
||||
"URI to which events are to be reported (default: NULL)",
|
||||
false, false, NULL, &orte_report_events_uri);
|
||||
if (NULL != orte_report_events_uri) {
|
||||
orte_report_events = true;
|
||||
@ -503,6 +510,10 @@ int orte_register_params(void)
|
||||
false, false, (int)false, &value);
|
||||
orte_vm_launch = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
mca_base_param_reg_int_name("orte", "stat_history_size",
|
||||
"Number of stat samples to keep",
|
||||
false, false, 1, &orte_stat_history_size);
|
||||
|
||||
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user