Fix a problem observed by Brian where processes launched local to mpirun lost their environment except for MCA params.
The problem stemmed from no longer launching a local orted on the same node as mpirun. The orted saved and reused the base environment, but mpirun did not, and the odls was still using the orted's globally saved environment (which wasn't being set in mpirun). This fix establishes a globally accessible base launch environment (orte_launch_environ) that both the orted and mpirun can use. Since the odls now reads that global, the environment no longer needs to be passed to the odls launch_local_procs function, so that parameter is removed from the API (and all components are modified to handle the change). This commit was SVN r15405.
Этот коммит содержится в:
родитель
f325fed4e0
Коммит
2bded34a1d
@ -539,7 +539,7 @@ cleanup:
|
||||
* @retval error
|
||||
*/
|
||||
int
|
||||
orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data, char **base_environ)
|
||||
orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data,)
|
||||
{
|
||||
odls_bproc_child_t *child;
|
||||
opal_list_item_t* item;
|
||||
|
@ -60,7 +60,7 @@ int orte_odls_bproc_finalize(void);
|
||||
* Interface
|
||||
*/
|
||||
int orte_odls_bproc_get_add_procs_data(orte_gpr_notify_data_t **data, orte_job_map_t *map);
|
||||
int orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data, char **base_environ);
|
||||
int orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data);
|
||||
int orte_odls_bproc_kill_local_procs(orte_jobid_t job, bool set_state);
|
||||
int orte_odls_bproc_signal_local_procs(const orte_process_name_t* proc_name, int32_t signal);
|
||||
int orte_odls_bproc_deliver_message(orte_jobid_t job, orte_buffer_t *buffer, orte_rml_tag_t tag);
|
||||
|
@ -109,7 +109,7 @@ static bool is_preload_local_dup(char *local_ref, orte_filem_base_request_t *fil
|
||||
* External Interface
|
||||
*/
|
||||
static int orte_odls_default_get_add_procs_data(orte_gpr_notify_data_t **data, orte_job_map_t *map);
|
||||
static int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data, char **base_environ);
|
||||
static int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data);
|
||||
static int orte_odls_default_kill_local_procs(orte_jobid_t job, bool set_state);
|
||||
static int orte_odls_default_signal_local_procs(const orte_process_name_t *proc,
|
||||
int32_t signal);
|
||||
@ -597,8 +597,7 @@ static int odls_default_fork_local_proc(
|
||||
orte_std_cntr_t total_slots_alloc,
|
||||
bool want_processor,
|
||||
size_t processor,
|
||||
bool oversubscribed,
|
||||
char **base_environ)
|
||||
bool oversubscribed)
|
||||
{
|
||||
pid_t pid;
|
||||
orte_iof_base_io_conf_t opts;
|
||||
@ -711,9 +710,9 @@ static int odls_default_fork_local_proc(
|
||||
/* setup base environment: copy the current environ and merge
|
||||
in the app context environ */
|
||||
if (NULL != context->env) {
|
||||
environ_copy = opal_environ_merge(base_environ, context->env);
|
||||
environ_copy = opal_environ_merge(orte_launch_environ, context->env);
|
||||
} else {
|
||||
environ_copy = opal_argv_copy(base_environ);
|
||||
environ_copy = opal_argv_copy(orte_launch_environ);
|
||||
}
|
||||
|
||||
/* special case handling for --prefix: this is somewhat icky,
|
||||
@ -933,7 +932,7 @@ static int odls_default_fork_local_proc(
|
||||
* Launch all processes allocated to the current node.
|
||||
*/
|
||||
|
||||
int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data, char **base_environ)
|
||||
int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data)
|
||||
{
|
||||
int rc;
|
||||
orte_std_cntr_t i, j, kv, kv2, *sptr, total_slots_alloc;
|
||||
@ -1336,8 +1335,7 @@ DOFORK:
|
||||
if (ORTE_SUCCESS != (rc = odls_default_fork_local_proc(app, child, start,
|
||||
range, total_slots_alloc,
|
||||
want_processor,
|
||||
i, oversubscribed,
|
||||
base_environ))) {
|
||||
i, oversubscribed))) {
|
||||
/* do NOT ERROR_LOG this error - it generates
|
||||
* a message/node as most errors will be common
|
||||
* across the entire cluster. Instead, we let orterun
|
||||
|
@ -61,7 +61,7 @@ typedef int (*orte_odls_base_module_get_add_procs_data_fn_t)(orte_gpr_notify_dat
|
||||
/**
|
||||
* Locally launch the provided processes
|
||||
*/
|
||||
typedef int (*orte_odls_base_module_launch_local_processes_fn_t)(orte_gpr_notify_data_t *data, char **base_environ);
|
||||
typedef int (*orte_odls_base_module_launch_local_processes_fn_t)(orte_gpr_notify_data_t *data);
|
||||
|
||||
/**
|
||||
* Kill the local processes on this node
|
||||
|
@ -518,8 +518,7 @@ static int orte_odls_process_fork_local_proc(
|
||||
orte_std_cntr_t total_slots_alloc,
|
||||
bool want_processor,
|
||||
size_t processor,
|
||||
bool oversubscribed,
|
||||
char **base_environ)
|
||||
bool oversubscribed)
|
||||
{
|
||||
pid_t pid;
|
||||
orte_iof_base_io_conf_t opts;
|
||||
@ -580,9 +579,9 @@ static int orte_odls_process_fork_local_proc(
|
||||
/* setup base environment: copy the current environ and merge
|
||||
in the app context environ */
|
||||
if (NULL != context->env) {
|
||||
environ_copy = opal_environ_merge(base_environ, context->env);
|
||||
environ_copy = opal_environ_merge(orte_launch_environ, context->env);
|
||||
} else {
|
||||
environ_copy = opal_argv_copy(base_environ);
|
||||
environ_copy = opal_argv_copy(orte_launch_environ);
|
||||
}
|
||||
|
||||
/* special case handling for --prefix: this is somewhat icky,
|
||||
@ -757,7 +756,7 @@ static int orte_odls_process_fork_local_proc(
|
||||
* Launch all processes allocated to the current node.
|
||||
*/
|
||||
|
||||
static int orte_odls_process_launch_local_procs(orte_gpr_notify_data_t *data, char **base_environ)
|
||||
static int orte_odls_process_launch_local_procs(orte_gpr_notify_data_t *data)
|
||||
{
|
||||
int rc;
|
||||
orte_std_cntr_t i, j, kv, kv2, *sptr, total_slots_alloc;
|
||||
@ -1151,8 +1150,7 @@ DOFORK:
|
||||
if (ORTE_SUCCESS != (rc = orte_odls_process_fork_local_proc(app, child, start,
|
||||
range, total_slots_alloc,
|
||||
want_processor,
|
||||
i, oversubscribed,
|
||||
base_environ))) {
|
||||
i, oversubscribed))) {
|
||||
/* do NOT ERROR_LOG this error - it generates
|
||||
* a message/node as most errors will be common
|
||||
* across the entire cluster. Instead, we let orterun
|
||||
|
@ -109,7 +109,6 @@ static struct {
|
||||
char* vpid_start;
|
||||
char* num_procs;
|
||||
char* universe;
|
||||
char **saved_environ;
|
||||
int uri_pipe;
|
||||
opal_mutex_t mutex;
|
||||
opal_condition_t condition;
|
||||
@ -235,7 +234,7 @@ int orte_daemon(int argc, char *argv[])
|
||||
}
|
||||
|
||||
/* save the environment for use when launching application processes */
|
||||
orted_globals.saved_environ = opal_argv_copy(environ);
|
||||
orte_launch_environ = opal_argv_copy(environ);
|
||||
|
||||
/* setup to check common command line options that just report and die */
|
||||
cmd_line = OBJ_NEW(opal_cmd_line_t);
|
||||
@ -808,7 +807,7 @@ static int process_commands(orte_process_name_t* sender,
|
||||
}
|
||||
|
||||
/* launch the processes */
|
||||
if (ORTE_SUCCESS != (ret = orte_odls.launch_local_procs(ndat, orted_globals.saved_environ))) {
|
||||
if (ORTE_SUCCESS != (ret = orte_odls.launch_local_procs(ndat))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
|
||||
|
@ -35,7 +35,7 @@ bool orte_debug_flag, orte_timing, orte_infrastructure;
|
||||
bool orte_debug_daemons_flag, orte_debug_daemons_file_flag;
|
||||
bool orted_spin_flag, orte_no_daemonize_flag;
|
||||
struct timeval orte_abort_timeout;
|
||||
|
||||
char **orte_launch_environ;
|
||||
|
||||
/*
|
||||
* Whether we have completed orte_init or not
|
||||
|
@ -45,6 +45,8 @@ ORTE_DECLSPEC extern bool orte_infrastructure, orted_spin_flag, orte_no_daemoniz
|
||||
|
||||
ORTE_DECLSPEC extern struct timeval orte_abort_timeout;
|
||||
|
||||
ORTE_DECLSPEC extern char **orte_launch_environ;
|
||||
|
||||
/**
|
||||
* Whether ORTE is initialized or not
|
||||
*/
|
||||
|
@ -334,6 +334,9 @@ int orterun(int argc, char *argv[])
|
||||
|
||||
/* Need to initialize OPAL so that install_dirs are filled in */
|
||||
opal_init_util();
|
||||
|
||||
/* save the environment for launch purposes */
|
||||
orte_launch_environ = opal_argv_copy(environ);
|
||||
|
||||
/* Setup MCA params */
|
||||
|
||||
@ -384,8 +387,7 @@ int orterun(int argc, char *argv[])
|
||||
if (ORTE_SUCCESS != (rc = orte_init(ORTE_INFRASTRUCTURE, ORTE_NON_BARRIER))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/* since we are a daemon, we should *always* yield the processor when idle */
|
||||
opal_progress_set_yield_when_idle(true);
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user