Enable parallel fork/exec of local procs by providing the option of multiple odls progress threads
Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
родитель
3afadbad89
Коммит
70591bf4dc
@ -219,11 +219,6 @@ orte_iof_base_setup_parent(const orte_process_name_t* name,
|
|||||||
{
|
{
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
close(opts->p_stdin[0]);
|
|
||||||
close(opts->p_stdout[1]);
|
|
||||||
close(opts->p_stderr[1]);
|
|
||||||
close(opts->p_internal[1]);
|
|
||||||
|
|
||||||
/* connect stdin endpoint */
|
/* connect stdin endpoint */
|
||||||
if (opts->connect_stdin) {
|
if (opts->connect_stdin) {
|
||||||
/* and connect the pty to stdin */
|
/* and connect the pty to stdin */
|
||||||
|
@ -625,22 +625,186 @@ static int compute_num_procs_alive(orte_jobid_t job)
|
|||||||
return num_procs_alive;
|
return num_procs_alive;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
|
||||||
|
{
|
||||||
|
orte_odls_spawn_caddy_t *cd = (orte_odls_spawn_caddy_t*)cbdata;
|
||||||
|
orte_job_t *jobdat = cd->jdata;
|
||||||
|
orte_app_context_t *app = cd->app;
|
||||||
|
orte_proc_t *child = cd->child;
|
||||||
|
char **env = NULL, **argv = NULL, *cmd = NULL;
|
||||||
|
int rc, i;
|
||||||
|
|
||||||
|
/* thread-protect common values */
|
||||||
|
env = opal_argv_copy(app->env);
|
||||||
|
|
||||||
|
/* setup the pmix environment */
|
||||||
|
if (OPAL_SUCCESS != (rc = opal_pmix.server_setup_fork(&child->name, &env))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
goto errorout;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ensure we clear any prior info regarding state or exit status in
|
||||||
|
* case this is a restart
|
||||||
|
*/
|
||||||
|
child->exit_code = 0;
|
||||||
|
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_WAITPID);
|
||||||
|
/* if we are not forwarding output for this job, then
|
||||||
|
* flag iof as complete
|
||||||
|
*/
|
||||||
|
if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
|
||||||
|
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_IOF_COMPLETE);
|
||||||
|
} else {
|
||||||
|
ORTE_FLAG_SET(child, ORTE_PROC_FLAG_IOF_COMPLETE);
|
||||||
|
}
|
||||||
|
child->pid = 0;
|
||||||
|
if (NULL != child->rml_uri) {
|
||||||
|
free(child->rml_uri);
|
||||||
|
child->rml_uri = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* did the user request we display output in xterms? */
|
||||||
|
if (NULL != orte_xterm) {
|
||||||
|
opal_list_item_t *nmitem;
|
||||||
|
orte_namelist_t *nm;
|
||||||
|
/* see if this rank is one of those requested */
|
||||||
|
for (nmitem = opal_list_get_first(&orte_odls_globals.xterm_ranks);
|
||||||
|
nmitem != opal_list_get_end(&orte_odls_globals.xterm_ranks);
|
||||||
|
nmitem = opal_list_get_next(nmitem)) {
|
||||||
|
nm = (orte_namelist_t*)nmitem;
|
||||||
|
if (ORTE_VPID_WILDCARD == nm->name.vpid ||
|
||||||
|
child->name.vpid == nm->name.vpid) {
|
||||||
|
/* we want this one - modify the app's command to include
|
||||||
|
* the orte xterm cmd that starts with the xtermcmd */
|
||||||
|
argv = opal_argv_copy(orte_odls_globals.xtermcmd);
|
||||||
|
/* insert the rank into the correct place as a window title */
|
||||||
|
free(argv[2]);
|
||||||
|
asprintf(&argv[2], "Rank %s", ORTE_VPID_PRINT(child->name.vpid));
|
||||||
|
/* add in the argv from the app */
|
||||||
|
for (i=0; NULL != app->argv[i]; i++) {
|
||||||
|
opal_argv_append_nosize(&argv, app->argv[i]);
|
||||||
|
}
|
||||||
|
/* use the xterm cmd as the app string */
|
||||||
|
cmd = strdup(orte_odls_globals.xtermcmd[0]);
|
||||||
|
break;
|
||||||
|
} else if (jobdat->num_procs <= nm->name.vpid) { /* check for bozo case */
|
||||||
|
/* can't be done! */
|
||||||
|
orte_show_help("help-orte-odls-base.txt",
|
||||||
|
"orte-odls-base:xterm-rank-out-of-bounds",
|
||||||
|
true, nm->name.vpid, jobdat->num_procs);
|
||||||
|
child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
|
||||||
|
goto errorout;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (NULL != orte_fork_agent) {
|
||||||
|
/* we were given a fork agent - use it */
|
||||||
|
argv = opal_argv_copy(orte_fork_agent);
|
||||||
|
/* add in the argv from the app */
|
||||||
|
for (i=0; NULL != app->argv[i]; i++) {
|
||||||
|
opal_argv_append_nosize(&argv, app->argv[i]);
|
||||||
|
}
|
||||||
|
/* the app exe name itself is in the argvsav array, so
|
||||||
|
* we can recover it from there later
|
||||||
|
*/
|
||||||
|
cmd = opal_path_findv(orte_fork_agent[0], X_OK, orte_launch_environ, NULL);
|
||||||
|
if (NULL == cmd) {
|
||||||
|
orte_show_help("help-orte-odls-base.txt",
|
||||||
|
"orte-odls-base:fork-agent-not-found",
|
||||||
|
true, orte_process_info.nodename, orte_fork_agent[0]);
|
||||||
|
child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
|
||||||
|
goto errorout;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
cmd = strdup(app->app);
|
||||||
|
argv = opal_argv_copy(app->argv);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* setup the rest of the environment with the proc-specific items - these
|
||||||
|
* will be overwritten for each child
|
||||||
|
*/
|
||||||
|
if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app, &env))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
child->exit_code = rc;
|
||||||
|
goto errorout;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* if we are indexing the argv by rank, do so now */
|
||||||
|
if (cd->index_argv) {
|
||||||
|
char *param;
|
||||||
|
asprintf(&param, "%s-%d", argv[0], (int)child->name.vpid);
|
||||||
|
free(argv[0]);
|
||||||
|
argv[0] = param;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (5 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
|
||||||
|
opal_output(orte_odls_base_framework.framework_output, "%s odls:launch spawning child %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(&child->name));
|
||||||
|
|
||||||
|
/* dump what is going to be exec'd */
|
||||||
|
if (7 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
|
||||||
|
opal_dss.dump(orte_odls_base_framework.framework_output, app, ORTE_APP_CONTEXT);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ORTE_SUCCESS != (rc = cd->fork_local(child, cmd, argv, env, jobdat, cd->opts))) {
|
||||||
|
child->exit_code = rc; /* error message already output */
|
||||||
|
goto errorout;
|
||||||
|
}
|
||||||
|
if (ORTE_SUCCESS != rc) {
|
||||||
|
/* do NOT ERROR_LOG this error - it generates
|
||||||
|
* a message/node as most errors will be common
|
||||||
|
* across the entire cluster. Instead, we let orterun
|
||||||
|
* output a consolidated error message for us
|
||||||
|
*/
|
||||||
|
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
|
||||||
|
child->exit_code = rc; /* error message already output */
|
||||||
|
goto errorout;
|
||||||
|
}
|
||||||
|
|
||||||
|
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_RUNNING);
|
||||||
|
if (NULL != env) {
|
||||||
|
opal_argv_free(env);
|
||||||
|
}
|
||||||
|
if (NULL != argv) {
|
||||||
|
opal_argv_free(argv);
|
||||||
|
}
|
||||||
|
if (NULL != cmd) {
|
||||||
|
free(cmd);
|
||||||
|
}
|
||||||
|
OBJ_RELEASE(cd);
|
||||||
|
return;
|
||||||
|
|
||||||
|
errorout:
|
||||||
|
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_START);
|
||||||
|
if (NULL != env) {
|
||||||
|
opal_argv_free(env);
|
||||||
|
}
|
||||||
|
if (NULL != argv) {
|
||||||
|
opal_argv_free(argv);
|
||||||
|
}
|
||||||
|
if (NULL != cmd) {
|
||||||
|
free(cmd);
|
||||||
|
}
|
||||||
|
OBJ_RELEASE(cd);
|
||||||
|
}
|
||||||
|
|
||||||
void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
||||||
{
|
{
|
||||||
orte_app_context_t *app;
|
orte_app_context_t *app;
|
||||||
orte_proc_t *child=NULL;
|
orte_proc_t *child=NULL;
|
||||||
int rc=ORTE_SUCCESS;
|
int rc=ORTE_SUCCESS;
|
||||||
orte_std_cntr_t proc_rank;
|
|
||||||
char basedir[MAXPATHLEN];
|
char basedir[MAXPATHLEN];
|
||||||
char **argvsav=NULL;
|
char **argvsav=NULL;
|
||||||
int inm, j, idx;
|
int j, idx;
|
||||||
int total_num_local_procs = 0;
|
int total_num_local_procs = 0;
|
||||||
orte_odls_launch_local_t *caddy = (orte_odls_launch_local_t*)cbdata;
|
orte_odls_launch_local_t *caddy = (orte_odls_launch_local_t*)cbdata;
|
||||||
orte_job_t *jobdat;
|
orte_job_t *jobdat;
|
||||||
orte_jobid_t job = caddy->job;
|
orte_jobid_t job = caddy->job;
|
||||||
orte_odls_base_fork_local_proc_fn_t fork_local = caddy->fork_local;
|
orte_odls_base_fork_local_proc_fn_t fork_local = caddy->fork_local;
|
||||||
bool index_argv;
|
bool index_argv;
|
||||||
|
char *msg;
|
||||||
|
orte_odls_spawn_caddy_t *cd;
|
||||||
|
opal_event_base_t *evb;
|
||||||
|
|
||||||
opal_output_verbose(5, orte_odls_base_framework.framework_output,
|
opal_output_verbose(5, orte_odls_base_framework.framework_output,
|
||||||
"%s local:launch",
|
"%s local:launch",
|
||||||
@ -671,45 +835,12 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
|||||||
goto GETOUT;
|
goto GETOUT;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if OPAL_ENABLE_FT_CR == 1
|
|
||||||
/*
|
|
||||||
* Notify the local SnapC component regarding new job
|
|
||||||
*/
|
|
||||||
if( ORTE_SUCCESS != (rc = orte_snapc.setup_job(job) ) ) {
|
|
||||||
/* Silent Failure :/ JJH */
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if OPAL_ENABLE_FT_CR == 1
|
|
||||||
for (j=0; j < jobdat->apps->size; j++) {
|
|
||||||
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, j))) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
orte_sstore.fetch_app_deps(app);
|
|
||||||
}
|
|
||||||
orte_sstore.wait_all_deps();
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* track if we are indexing argvs so we don't check every time */
|
/* track if we are indexing argvs so we don't check every time */
|
||||||
index_argv = orte_get_attribute(&jobdat->attributes, ORTE_JOB_INDEX_ARGV, NULL, OPAL_BOOL);
|
index_argv = orte_get_attribute(&jobdat->attributes, ORTE_JOB_INDEX_ARGV, NULL, OPAL_BOOL);
|
||||||
|
|
||||||
/* compute the total number of local procs currently alive and about to be launched */
|
/* compute the total number of local procs currently alive and about to be launched */
|
||||||
total_num_local_procs = compute_num_procs_alive(job) + jobdat->num_local_procs;
|
total_num_local_procs = compute_num_procs_alive(job) + jobdat->num_local_procs;
|
||||||
|
|
||||||
for (j=0; j < jobdat->apps->size; j++) {
|
|
||||||
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, j))) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* if this app isn't being used on our node, skip it */
|
|
||||||
if (!ORTE_FLAG_TEST(app, ORTE_APP_FLAG_USED_ON_NODE)) {
|
|
||||||
opal_output_verbose(5, orte_odls_base_framework.framework_output,
|
|
||||||
"%s app %d not used on node",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* check the system limits - if we are at our max allowed children, then
|
/* check the system limits - if we are at our max allowed children, then
|
||||||
* we won't be allowed to do this anyway, so we may as well abort now.
|
* we won't be allowed to do this anyway, so we may as well abort now.
|
||||||
* According to the documentation, num_procs = 0 is equivalent to
|
* According to the documentation, num_procs = 0 is equivalent to
|
||||||
@ -735,6 +866,47 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* check to see if we have enough available file descriptors
|
||||||
|
* to launch these children - if not, then let's wait a little
|
||||||
|
* while to see if some come free. This can happen if we are
|
||||||
|
* in a tight loop over comm_spawn
|
||||||
|
*/
|
||||||
|
if (0 < opal_sys_limits.num_files) {
|
||||||
|
int limit;
|
||||||
|
limit = 4*total_num_local_procs + 6*jobdat->num_local_procs;
|
||||||
|
OPAL_OUTPUT_VERBOSE((10, orte_odls_base_framework.framework_output,
|
||||||
|
"%s checking limit on file descriptors %d need %d",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
opal_sys_limits.num_files, limit));
|
||||||
|
if (opal_sys_limits.num_files < limit) {
|
||||||
|
if (2 < caddy->retries) {
|
||||||
|
/* tried enough - give up */
|
||||||
|
child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
|
||||||
|
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
|
||||||
|
goto ERROR_OUT;
|
||||||
|
}
|
||||||
|
/* don't have enough - wait a little time */
|
||||||
|
ORTE_DETECT_TIMEOUT(1000, 1000, -1, timer_cb, caddy);
|
||||||
|
if (NULL != argvsav) {
|
||||||
|
opal_argv_free(argvsav);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (j=0; j < jobdat->apps->size; j++) {
|
||||||
|
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, j))) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* if this app isn't being used on our node, skip it */
|
||||||
|
if (!ORTE_FLAG_TEST(app, ORTE_APP_FLAG_USED_ON_NODE)) {
|
||||||
|
opal_output_verbose(5, orte_odls_base_framework.framework_output,
|
||||||
|
"%s app %d not used on node",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
/* setup the environment for this app */
|
/* setup the environment for this app */
|
||||||
if (ORTE_SUCCESS != (rc = orte_schizo.setup_fork(jobdat, app))) {
|
if (ORTE_SUCCESS != (rc = orte_schizo.setup_fork(jobdat, app))) {
|
||||||
|
|
||||||
@ -809,8 +981,30 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
|||||||
goto GETOUT;
|
goto GETOUT;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* tell all children that they are being launched via ORTE */
|
||||||
|
opal_setenv(OPAL_MCA_PREFIX"orte_launch", "1", true, &app->env);
|
||||||
|
|
||||||
|
/* if the user requested it, set the system resource limits */
|
||||||
|
if (OPAL_SUCCESS != (rc = opal_util_init_sys_limits(&msg))) {
|
||||||
|
orte_show_help("help-orte-odls-default.txt", "set limit", true,
|
||||||
|
orte_process_info.nodename, app,
|
||||||
|
__FILE__, __LINE__, msg);
|
||||||
|
/* cycle through children to find those for this jobid */
|
||||||
|
for (idx=0; idx < orte_local_children->size; idx++) {
|
||||||
|
if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID) &&
|
||||||
|
j == (int)child->app_idx) {
|
||||||
|
child->exit_code = rc;
|
||||||
|
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
goto GETOUT;
|
||||||
|
}
|
||||||
|
|
||||||
/* okay, now let's launch all the local procs for this app using the provided fork_local fn */
|
/* okay, now let's launch all the local procs for this app using the provided fork_local fn */
|
||||||
for (proc_rank = 0, idx=0; idx < orte_local_children->size; idx++) {
|
for (idx=0; idx < orte_local_children->size; idx++) {
|
||||||
if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
|
if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -859,235 +1053,56 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
|||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(&child->name)));
|
ORTE_NAME_PRINT(&child->name)));
|
||||||
|
|
||||||
/* setup the pmix environment */
|
/* set the waitpid callback here for thread protection and
|
||||||
if (OPAL_SUCCESS != (rc = opal_pmix.server_setup_fork(&child->name, &app->env))) {
|
* to ensure we can capture the callback on shortlived apps */
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
/* tell the child that it is being launched via ORTE */
|
|
||||||
opal_setenv(OPAL_MCA_PREFIX"orte_launch", "1", true, &app->env);
|
|
||||||
|
|
||||||
/* ensure we clear any prior info regarding state or exit status in
|
|
||||||
* case this is a restart
|
|
||||||
*/
|
|
||||||
child->exit_code = 0;
|
|
||||||
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_WAITPID);
|
|
||||||
/* if we are not forwarding output for this job, then
|
|
||||||
* flag iof as complete
|
|
||||||
*/
|
|
||||||
if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
|
|
||||||
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_IOF_COMPLETE);
|
|
||||||
} else {
|
|
||||||
ORTE_FLAG_SET(child, ORTE_PROC_FLAG_IOF_COMPLETE);
|
|
||||||
}
|
|
||||||
child->pid = 0;
|
|
||||||
if (NULL != child->rml_uri) {
|
|
||||||
free(child->rml_uri);
|
|
||||||
child->rml_uri = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* check to see if we have enough available file descriptors
|
|
||||||
* to launch another child - if not, then let's wait a little
|
|
||||||
* while to see if some come free. This can happen if we are
|
|
||||||
* in a tight loop over comm_spawn
|
|
||||||
*/
|
|
||||||
if (0 < opal_sys_limits.num_files) {
|
|
||||||
int limit;
|
|
||||||
limit = 4*total_num_local_procs + 6;
|
|
||||||
OPAL_OUTPUT_VERBOSE((10, orte_odls_base_framework.framework_output,
|
|
||||||
"%s checking limit on file descriptors %d need %d",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
opal_sys_limits.num_files, limit));
|
|
||||||
if (opal_sys_limits.num_files < limit) {
|
|
||||||
if (2 < caddy->retries) {
|
|
||||||
/* tried enough - give up */
|
|
||||||
child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
|
|
||||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
/* don't have enough - wait a little time */
|
|
||||||
ORTE_DETECT_TIMEOUT(1000, 1000, -1, timer_cb, caddy);
|
|
||||||
if (NULL != argvsav) {
|
|
||||||
opal_argv_free(argvsav);
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* did the user request we display output in xterms? */
|
|
||||||
if (NULL != orte_xterm) {
|
|
||||||
opal_list_item_t *nmitem;
|
|
||||||
orte_namelist_t *nm;
|
|
||||||
/* see if this rank is one of those requested */
|
|
||||||
for (nmitem = opal_list_get_first(&orte_odls_globals.xterm_ranks);
|
|
||||||
nmitem != opal_list_get_end(&orte_odls_globals.xterm_ranks);
|
|
||||||
nmitem = opal_list_get_next(nmitem)) {
|
|
||||||
nm = (orte_namelist_t*)nmitem;
|
|
||||||
if (ORTE_VPID_WILDCARD == nm->name.vpid ||
|
|
||||||
child->name.vpid == nm->name.vpid) {
|
|
||||||
/* we want this one - modify the app's command to include
|
|
||||||
* the orte xterm cmd. Need to be careful, though, that we
|
|
||||||
* don't modify the app for ALL ranks that use it! So we
|
|
||||||
* will create a copy of the argv so we can restore it later
|
|
||||||
*/
|
|
||||||
argvsav = opal_argv_copy(app->argv);
|
|
||||||
/* free the argv */
|
|
||||||
opal_argv_free(app->argv);
|
|
||||||
app->argv = NULL;
|
|
||||||
/* now create a new one that starts with the xtermcmd */
|
|
||||||
for (inm=0; inm < opal_argv_count(orte_odls_globals.xtermcmd); inm++) {
|
|
||||||
opal_argv_append_nosize(&app->argv, orte_odls_globals.xtermcmd[inm]);
|
|
||||||
}
|
|
||||||
/* insert the rank into the correct place as a window title */
|
|
||||||
free(app->argv[2]);
|
|
||||||
asprintf(&app->argv[2], "Rank %s", ORTE_VPID_PRINT(child->name.vpid));
|
|
||||||
/* add back the original argv */
|
|
||||||
for (inm=0; inm < opal_argv_count(argvsav); inm++) {
|
|
||||||
opal_argv_append_nosize(&app->argv, argvsav[inm]);
|
|
||||||
}
|
|
||||||
/* the app exe name itself is in the argvsav array, so
|
|
||||||
* we can recover it from there later
|
|
||||||
*/
|
|
||||||
free(app->app);
|
|
||||||
app->app = strdup(orte_odls_globals.xtermcmd[0]);
|
|
||||||
break;
|
|
||||||
} else if (jobdat->num_procs <= nm->name.vpid) { /* check for bozo case */
|
|
||||||
/* can't be done! */
|
|
||||||
orte_show_help("help-orte-odls-base.txt",
|
|
||||||
"orte-odls-base:xterm-rank-out-of-bounds",
|
|
||||||
true, nm->name.vpid, jobdat->num_procs);
|
|
||||||
child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
|
|
||||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
} else if (NULL != orte_fork_agent) {
|
|
||||||
/* we were given a fork agent - use it */
|
|
||||||
argvsav = opal_argv_copy(app->argv);
|
|
||||||
/* free the argv */
|
|
||||||
opal_argv_free(app->argv);
|
|
||||||
app->argv = NULL;
|
|
||||||
/* now create a new one that starts with the fork agent */
|
|
||||||
app->argv = opal_argv_copy(orte_fork_agent);
|
|
||||||
/* add back the original argv */
|
|
||||||
for (inm=0; NULL != argvsav[inm]; inm++) {
|
|
||||||
opal_argv_append_nosize(&app->argv, argvsav[inm]);
|
|
||||||
}
|
|
||||||
/* the app exe name itself is in the argvsav array, so
|
|
||||||
* we can recover it from there later
|
|
||||||
*/
|
|
||||||
free(app->app);
|
|
||||||
app->app = opal_path_findv(orte_fork_agent[0], X_OK, orte_launch_environ, NULL);
|
|
||||||
if (NULL == app->app) {
|
|
||||||
orte_show_help("help-orte-odls-base.txt",
|
|
||||||
"orte-odls-base:fork-agent-not-found",
|
|
||||||
true, orte_process_info.nodename, orte_fork_agent[0]);
|
|
||||||
child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
|
|
||||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* setup the rest of the environment with the proc-specific items - these
|
|
||||||
* will be overwritten for each child
|
|
||||||
*/
|
|
||||||
if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
child->exit_code = rc;
|
|
||||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
#if OPAL_ENABLE_FT_CR == 1
|
|
||||||
/*
|
|
||||||
* OPAL CRS components need the opportunity to take action before a process
|
|
||||||
* is forked.
|
|
||||||
* Needs access to:
|
|
||||||
* - Environment
|
|
||||||
* - Rank/ORTE Name
|
|
||||||
* - Binary to exec
|
|
||||||
*/
|
|
||||||
if( NULL != opal_crs.crs_prelaunch ) {
|
|
||||||
if( OPAL_SUCCESS != (rc = opal_crs.crs_prelaunch(child->name.vpid,
|
|
||||||
orte_sstore_base_prelaunch_location,
|
|
||||||
&(app->app),
|
|
||||||
&(app->cwd),
|
|
||||||
&(app->argv),
|
|
||||||
&(app->env) ) ) ) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
child->exit_code = rc;
|
|
||||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
/* if we are indexing the argv by rank, do so now */
|
|
||||||
if (index_argv) {
|
|
||||||
char *param;
|
|
||||||
asprintf(&param, "%s-%d", app->argv[0], (int)child->name.vpid);
|
|
||||||
free(app->argv[0]);
|
|
||||||
app->argv[0] = param;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (5 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
|
|
||||||
opal_output(orte_odls_base_framework.framework_output, "%s odls:launch spawning child %s",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(&child->name));
|
|
||||||
|
|
||||||
/* dump what is going to be exec'd */
|
|
||||||
if (7 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
|
|
||||||
opal_dss.dump(orte_odls_base_framework.framework_output, app, ORTE_APP_CONTEXT);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ORTE_SUCCESS != (rc = fork_local(app, child, app->env, jobdat))) {
|
|
||||||
child->exit_code = rc; /* error message already output */
|
|
||||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_START);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
orte_wait_cb(child, odls_base_default_wait_local_proc, NULL);
|
|
||||||
/* if we indexed the argv, we need to restore it to
|
|
||||||
* its original form
|
|
||||||
*/
|
|
||||||
if (index_argv) {
|
|
||||||
/* restore the argv[0] */
|
|
||||||
char *param;
|
|
||||||
if (NULL == (param = strrchr(app->argv[0], '-'))) {
|
|
||||||
child->exit_code = ORTE_ERR_NOT_FOUND;
|
|
||||||
rc = ORTE_ERR_NOT_FOUND;
|
|
||||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_START);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
*param = '\0';
|
|
||||||
}
|
|
||||||
if (ORTE_SUCCESS != rc) {
|
|
||||||
/* do NOT ERROR_LOG this error - it generates
|
|
||||||
* a message/node as most errors will be common
|
|
||||||
* across the entire cluster. Instead, we let orterun
|
|
||||||
* output a consolidated error message for us
|
|
||||||
*/
|
|
||||||
child->exit_code = rc; /* error message already output */
|
|
||||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_START);
|
|
||||||
continue;
|
|
||||||
} else {
|
|
||||||
ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
|
ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
|
||||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_RUNNING);
|
orte_wait_cb(child, odls_base_default_wait_local_proc, NULL);
|
||||||
|
|
||||||
|
/* dispatch this child to the next available launch thread */
|
||||||
|
cd = OBJ_NEW(orte_odls_spawn_caddy_t);
|
||||||
|
cd->jdata = jobdat;
|
||||||
|
cd->app = app;
|
||||||
|
cd->child = child;
|
||||||
|
cd->fork_local = fork_local;
|
||||||
|
cd->index_argv = index_argv;
|
||||||
|
/* setup any IOF */
|
||||||
|
cd->opts.usepty = OPAL_ENABLE_PTY_SUPPORT;
|
||||||
|
|
||||||
|
/* do we want to setup stdin? */
|
||||||
|
if (jobdat->stdin_target == ORTE_VPID_WILDCARD ||
|
||||||
|
child->name.vpid == jobdat->stdin_target) {
|
||||||
|
cd->opts.connect_stdin = true;
|
||||||
|
} else {
|
||||||
|
cd->opts.connect_stdin = false;
|
||||||
}
|
}
|
||||||
/* move to next processor */
|
if (ORTE_SUCCESS != (rc = orte_iof_base_setup_prefork(&cd->opts))) {
|
||||||
proc_rank++;
|
ORTE_ERROR_LOG(rc);
|
||||||
/* reset the exe name, if necessary */
|
child->exit_code = rc;
|
||||||
if (NULL != argvsav) {
|
OBJ_RELEASE(cd);
|
||||||
/* release the current argv array */
|
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
|
||||||
opal_argv_free(app->argv);
|
goto GETOUT;
|
||||||
/* restore the original one */
|
}
|
||||||
app->argv = argvsav;
|
if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
|
||||||
argvsav = NULL;
|
/* connect endpoints IOF */
|
||||||
/* the app exe name itself is now in the argv[0] posn */
|
rc = orte_iof_base_setup_parent(&child->name, &cd->opts);
|
||||||
free(app->app);
|
if (ORTE_SUCCESS != rc) {
|
||||||
app->app = strdup(app->argv[0]);
|
ORTE_ERROR_LOG(rc);
|
||||||
|
OBJ_RELEASE(cd);
|
||||||
|
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
|
||||||
|
goto GETOUT;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
++orte_odls_globals.next_base;
|
||||||
|
if (orte_odls_globals.num_threads <= orte_odls_globals.next_base) {
|
||||||
|
orte_odls_globals.next_base = 0;
|
||||||
|
}
|
||||||
|
evb = orte_odls_globals.ev_bases[orte_odls_globals.next_base];
|
||||||
|
opal_event_set(evb, &cd->ev, -1,
|
||||||
|
OPAL_EV_WRITE, orte_odls_base_spawn_proc, cd);
|
||||||
|
opal_event_set_priority(&cd->ev, ORTE_MSG_PRI);
|
||||||
|
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);
|
||||||
|
|
||||||
}
|
}
|
||||||
} /* complete launching all children for this app */
|
|
||||||
/* reset our working directory back to our default location - if we
|
/* reset our working directory back to our default location - if we
|
||||||
* don't do this, then we will be looking for relative paths starting
|
* don't do this, then we will be looking for relative paths starting
|
||||||
* from the last wdir option specified by the user. Thus, we would
|
* from the last wdir option specified by the user. Thus, we would
|
||||||
@ -1097,9 +1112,6 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
|||||||
*/
|
*/
|
||||||
chdir(basedir);
|
chdir(basedir);
|
||||||
}
|
}
|
||||||
if (NULL != argvsav) {
|
|
||||||
opal_argv_free(argvsav);
|
|
||||||
}
|
|
||||||
|
|
||||||
GETOUT:
|
GETOUT:
|
||||||
/* tell the state machine that all local procs for this job
|
/* tell the state machine that all local procs for this job
|
||||||
@ -1659,6 +1671,7 @@ int orte_odls_base_default_restart_proc(orte_proc_t *child,
|
|||||||
orte_app_context_t *app;
|
orte_app_context_t *app;
|
||||||
orte_job_t *jobdat;
|
orte_job_t *jobdat;
|
||||||
char basedir[MAXPATHLEN];
|
char basedir[MAXPATHLEN];
|
||||||
|
orte_iof_base_io_conf_t opts;
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||||
"%s odls:restart_proc for proc %s",
|
"%s odls:restart_proc for proc %s",
|
||||||
@ -1690,7 +1703,7 @@ int orte_odls_base_default_restart_proc(orte_proc_t *child,
|
|||||||
app = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, child->app_idx);
|
app = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, child->app_idx);
|
||||||
|
|
||||||
/* reset envars to match this child */
|
/* reset envars to match this child */
|
||||||
if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app))) {
|
if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app, &app->env))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
goto CLEANUP;
|
goto CLEANUP;
|
||||||
}
|
}
|
||||||
@ -1701,12 +1714,24 @@ int orte_odls_base_default_restart_proc(orte_proc_t *child,
|
|||||||
goto CLEANUP;
|
goto CLEANUP;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* setup any IOF */
|
||||||
|
memset(&opts, 0, sizeof(orte_iof_base_io_conf_t));
|
||||||
|
if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
|
||||||
|
/* connect endpoints IOF */
|
||||||
|
rc = orte_iof_base_setup_parent(&child->name, &opts);
|
||||||
|
if (ORTE_SUCCESS != rc) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_START);
|
||||||
|
goto CLEANUP;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||||
"%s restarting app %s",
|
"%s restarting app %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->app));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->app));
|
||||||
|
|
||||||
orte_wait_cb(child, odls_base_default_wait_local_proc, NULL);
|
orte_wait_cb(child, odls_base_default_wait_local_proc, NULL);
|
||||||
if (ORTE_SUCCESS != (rc = fork_local(app, child, app->env, jobdat))) {
|
if (ORTE_SUCCESS != (rc = fork_local(child, app->app, app->argv, app->env, jobdat, opts))) {
|
||||||
orte_wait_cb_cancel(child);
|
orte_wait_cb_cancel(child);
|
||||||
child->exit_code = ORTE_ERR_SILENT; /* error message already output */
|
child->exit_code = ORTE_ERR_SILENT; /* error message already output */
|
||||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_START);
|
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_START);
|
||||||
|
@ -15,6 +15,7 @@
|
|||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2014-2015 Research Organization for Information Science
|
* Copyright (c) 2014-2015 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
|
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -32,6 +33,7 @@
|
|||||||
#include "orte/mca/mca.h"
|
#include "orte/mca/mca.h"
|
||||||
#include "opal/mca/base/base.h"
|
#include "opal/mca/base/base.h"
|
||||||
#include "opal/mca/hwloc/hwloc-internal.h"
|
#include "opal/mca/hwloc/hwloc-internal.h"
|
||||||
|
#include "opal/runtime/opal_progress_threads.h"
|
||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
#include "opal/util/path.h"
|
#include "opal/util/path.h"
|
||||||
#include "opal/util/argv.h"
|
#include "opal/util/argv.h"
|
||||||
@ -76,6 +78,14 @@ static int orte_odls_base_register(mca_base_register_flag_t flags)
|
|||||||
MCA_BASE_VAR_SCOPE_READONLY,
|
MCA_BASE_VAR_SCOPE_READONLY,
|
||||||
&orte_odls_globals.timeout_before_sigkill);
|
&orte_odls_globals.timeout_before_sigkill);
|
||||||
|
|
||||||
|
orte_odls_globals.num_threads = 0;
|
||||||
|
(void) mca_base_var_register("orte", "odls", "base", "num_threads",
|
||||||
|
"Number of threads to use for spawning local procs",
|
||||||
|
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||||
|
OPAL_INFO_LVL_9,
|
||||||
|
MCA_BASE_VAR_SCOPE_READONLY,
|
||||||
|
&orte_odls_globals.num_threads);
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -99,6 +109,15 @@ static int orte_odls_base_close(void)
|
|||||||
}
|
}
|
||||||
OBJ_RELEASE(orte_local_children);
|
OBJ_RELEASE(orte_local_children);
|
||||||
|
|
||||||
|
if (0 < orte_odls_globals.num_threads) {
|
||||||
|
/* stop the progress threads */
|
||||||
|
for (i=0; NULL != orte_odls_globals.ev_threads[i]; i++) {
|
||||||
|
opal_progress_thread_finalize(orte_odls_globals.ev_threads[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
free(orte_odls_globals.ev_bases);
|
||||||
|
opal_argv_free(orte_odls_globals.ev_threads);
|
||||||
|
|
||||||
return mca_base_framework_components_close(&orte_odls_base_framework, NULL);
|
return mca_base_framework_components_close(&orte_odls_base_framework, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -174,6 +193,25 @@ static int orte_odls_base_open(mca_base_open_flag_t flags)
|
|||||||
opal_argv_append_nosize(&orte_odls_globals.xtermcmd, "-e");
|
opal_argv_append_nosize(&orte_odls_globals.xtermcmd, "-e");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* setup the pool of worker threads */
|
||||||
|
orte_odls_globals.ev_threads = NULL;
|
||||||
|
orte_odls_globals.next_base = 0;
|
||||||
|
if (0 == orte_odls_globals.num_threads) {
|
||||||
|
orte_odls_globals.ev_bases = (opal_event_base_t**)malloc(sizeof(opal_event_base_t*));
|
||||||
|
/* use the default event base */
|
||||||
|
orte_odls_globals.ev_bases[0] = orte_event_base;
|
||||||
|
} else {
|
||||||
|
orte_odls_globals.ev_bases =
|
||||||
|
(opal_event_base_t**)malloc(orte_odls_globals.num_threads * sizeof(opal_event_base_t*));
|
||||||
|
for (i=0; i < orte_odls_globals.num_threads; i++) {
|
||||||
|
asprintf(&tmp, "ORTE-ODLS-%d", i);
|
||||||
|
orte_odls_globals.ev_bases[i] = opal_progress_thread_init(tmp);
|
||||||
|
opal_argv_append_nosize(&orte_odls_globals.ev_threads, tmp);
|
||||||
|
free(tmp);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
/* Open up all available components */
|
/* Open up all available components */
|
||||||
return mca_base_framework_components_open(&orte_odls_base_framework, flags);
|
return mca_base_framework_components_open(&orte_odls_base_framework, flags);
|
||||||
}
|
}
|
||||||
@ -197,3 +235,11 @@ OBJ_CLASS_INSTANCE(orte_odls_launch_local_t,
|
|||||||
opal_object_t,
|
opal_object_t,
|
||||||
launch_local_const,
|
launch_local_const,
|
||||||
launch_local_dest);
|
launch_local_dest);
|
||||||
|
|
||||||
|
static void sccon(orte_odls_spawn_caddy_t *p)
|
||||||
|
{
|
||||||
|
memset(&p->opts, 0, sizeof(orte_iof_base_io_conf_t));
|
||||||
|
}
|
||||||
|
OBJ_CLASS_INSTANCE(orte_odls_spawn_caddy_t,
|
||||||
|
opal_object_t,
|
||||||
|
sccon, NULL);
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2011 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2011 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2016 Intel, Inc. All rights reserved.
|
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -36,6 +36,7 @@
|
|||||||
#include "opal/class/opal_bitmap.h"
|
#include "opal/class/opal_bitmap.h"
|
||||||
#include "opal/dss/dss_types.h"
|
#include "opal/dss/dss_types.h"
|
||||||
|
|
||||||
|
#include "orte/mca/iof/base/iof_base_setup.h"
|
||||||
#include "orte/mca/rml/rml_types.h"
|
#include "orte/mca/rml/rml_types.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
|
|
||||||
@ -56,11 +57,15 @@ typedef struct {
|
|||||||
opal_list_t xterm_ranks;
|
opal_list_t xterm_ranks;
|
||||||
/* the xterm cmd to be used */
|
/* the xterm cmd to be used */
|
||||||
char **xtermcmd;
|
char **xtermcmd;
|
||||||
|
/* thread pool */
|
||||||
|
int num_threads;
|
||||||
|
opal_event_base_t **ev_bases; // event base array for progress threads
|
||||||
|
char** ev_threads; // event progress thread names
|
||||||
|
int next_base; // counter to load-level thread use
|
||||||
} orte_odls_globals_t;
|
} orte_odls_globals_t;
|
||||||
|
|
||||||
ORTE_DECLSPEC extern orte_odls_globals_t orte_odls_globals;
|
ORTE_DECLSPEC extern orte_odls_globals_t orte_odls_globals;
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Default functions that are common to most environments - can
|
* Default functions that are common to most environments - can
|
||||||
* be overridden by specific environments if they need something
|
* be overridden by specific environments if they need something
|
||||||
@ -74,11 +79,27 @@ ORTE_DECLSPEC int
|
|||||||
orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
||||||
orte_jobid_t *job);
|
orte_jobid_t *job);
|
||||||
|
|
||||||
|
ORTE_DECLSPEC void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata);
|
||||||
|
|
||||||
/* define a function that will fork a local proc */
|
/* define a function that will fork a local proc */
|
||||||
typedef int (*orte_odls_base_fork_local_proc_fn_t)(orte_app_context_t *context,
|
typedef int (*orte_odls_base_fork_local_proc_fn_t)(orte_proc_t *child,
|
||||||
orte_proc_t *child,
|
char *app, char **argv,
|
||||||
char **environ_copy,
|
char **environ_copy,
|
||||||
orte_job_t *jdata);
|
orte_job_t *jdata,
|
||||||
|
orte_iof_base_io_conf_t opts);
|
||||||
|
|
||||||
|
/* define an object for fork/exec the local proc */
|
||||||
|
typedef struct {
|
||||||
|
opal_object_t super;
|
||||||
|
opal_event_t ev;
|
||||||
|
orte_job_t *jdata;
|
||||||
|
orte_app_context_t *app;
|
||||||
|
orte_proc_t *child;
|
||||||
|
bool index_argv;
|
||||||
|
orte_iof_base_io_conf_t opts;
|
||||||
|
orte_odls_base_fork_local_proc_fn_t fork_local;
|
||||||
|
} orte_odls_spawn_caddy_t;
|
||||||
|
OBJ_CLASS_DECLARATION(orte_odls_spawn_caddy_t);
|
||||||
|
|
||||||
/* define an object for starting local launch */
|
/* define an object for starting local launch */
|
||||||
typedef struct {
|
typedef struct {
|
||||||
|
@ -15,7 +15,7 @@
|
|||||||
* Copyright (c) 2010 IBM Corporation. All rights reserved.
|
* Copyright (c) 2010 IBM Corporation. All rights reserved.
|
||||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
|
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
||||||
*
|
*
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
@ -144,8 +144,9 @@ static int orte_odls_default_restart_proc(orte_proc_t *child);
|
|||||||
static void send_error_show_help(int fd, int exit_status,
|
static void send_error_show_help(int fd, int exit_status,
|
||||||
const char *file, const char *topic, ...)
|
const char *file, const char *topic, ...)
|
||||||
__opal_attribute_noreturn__;
|
__opal_attribute_noreturn__;
|
||||||
static int do_child(orte_app_context_t* context,
|
|
||||||
orte_proc_t *child,
|
static int do_child(orte_proc_t *child,
|
||||||
|
char *cmd, char **argv,
|
||||||
char **environ_copy,
|
char **environ_copy,
|
||||||
orte_job_t *jobdat, int write_fd,
|
orte_job_t *jobdat, int write_fd,
|
||||||
orte_iof_base_io_conf_t opts)
|
orte_iof_base_io_conf_t opts)
|
||||||
@ -318,16 +319,15 @@ static int close_open_file_descriptors(int write_fd,
|
|||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int do_child(orte_app_context_t* context,
|
static int do_child(orte_proc_t *child,
|
||||||
orte_proc_t *child,
|
char *app, char **argv,
|
||||||
char **environ_copy,
|
char **environ_copy,
|
||||||
orte_job_t *jobdat, int write_fd,
|
orte_job_t *jobdat, int write_fd,
|
||||||
orte_iof_base_io_conf_t opts)
|
orte_iof_base_io_conf_t opts)
|
||||||
{
|
{
|
||||||
int i, rc;
|
int i;
|
||||||
sigset_t sigs;
|
sigset_t sigs;
|
||||||
long fd, fdmax = sysconf(_SC_OPEN_MAX);
|
long fd, fdmax = sysconf(_SC_OPEN_MAX);
|
||||||
char *param, *msg;
|
|
||||||
|
|
||||||
#if HAVE_SETPGID
|
#if HAVE_SETPGID
|
||||||
/* Set a new process group for this child, so that any
|
/* Set a new process group for this child, so that any
|
||||||
@ -359,7 +359,7 @@ static int do_child(orte_app_context_t* context,
|
|||||||
send_error_show_help(write_fd, 1,
|
send_error_show_help(write_fd, 1,
|
||||||
"help-orte-odls-default.txt",
|
"help-orte-odls-default.txt",
|
||||||
"iof setup failed",
|
"iof setup failed",
|
||||||
orte_process_info.nodename, context->app);
|
orte_process_info.nodename, app);
|
||||||
/* Does not return */
|
/* Does not return */
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -384,18 +384,6 @@ static int do_child(orte_app_context_t* context,
|
|||||||
close(fdnull);
|
close(fdnull);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* if the user requested it, set the system resource limits */
|
|
||||||
if (OPAL_SUCCESS != (rc = opal_util_init_sys_limits(&msg))) {
|
|
||||||
send_error_show_help(write_fd, 1, "help-orte-odls-default.txt",
|
|
||||||
"set limit",
|
|
||||||
orte_process_info.nodename, context->app,
|
|
||||||
__FILE__, __LINE__, msg);
|
|
||||||
}
|
|
||||||
/* ensure we only do this once */
|
|
||||||
(void) mca_base_var_env_name("opal_set_max_sys_limits", &param);
|
|
||||||
opal_unsetenv(param, &environ_copy);
|
|
||||||
free(param);
|
|
||||||
|
|
||||||
/* close all open file descriptors w/ exception of stdin/stdout/stderr,
|
/* close all open file descriptors w/ exception of stdin/stdout/stderr,
|
||||||
the pipe used for the IOF INTERNAL messages, and the pipe up to
|
the pipe used for the IOF INTERNAL messages, and the pipe up to
|
||||||
the parent. */
|
the parent. */
|
||||||
@ -408,10 +396,10 @@ static int do_child(orte_app_context_t* context,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (context->argv == NULL) {
|
if (argv == NULL) {
|
||||||
context->argv = malloc(sizeof(char*)*2);
|
argv = malloc(sizeof(char*)*2);
|
||||||
context->argv[0] = strdup(context->app);
|
argv[0] = strdup(app);
|
||||||
context->argv[1] = NULL;
|
argv[1] = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Set signal handlers back to the default. Do this close to
|
/* Set signal handlers back to the default. Do this close to
|
||||||
@ -436,16 +424,16 @@ static int do_child(orte_app_context_t* context,
|
|||||||
|
|
||||||
/* Exec the new executable */
|
/* Exec the new executable */
|
||||||
|
|
||||||
execve(context->app, context->argv, environ_copy);
|
execve(app, argv, environ_copy);
|
||||||
send_error_show_help(write_fd, 1,
|
send_error_show_help(write_fd, 1,
|
||||||
"help-orte-odls-default.txt", "execve error",
|
"help-orte-odls-default.txt", "execve error",
|
||||||
orte_process_info.nodename, context->app, strerror(errno));
|
orte_process_info.nodename, app, strerror(errno));
|
||||||
/* Does not return */
|
/* Does not return */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static int do_parent(orte_app_context_t* context,
|
static int do_parent(orte_proc_t *child,
|
||||||
orte_proc_t *child,
|
char *app, char **argv,
|
||||||
char **environ_copy,
|
char **environ_copy,
|
||||||
orte_job_t *jobdat, int read_fd,
|
orte_job_t *jobdat, int read_fd,
|
||||||
orte_iof_base_io_conf_t opts)
|
orte_iof_base_io_conf_t opts)
|
||||||
@ -454,19 +442,10 @@ static int do_parent(orte_app_context_t* context,
|
|||||||
orte_odls_pipe_err_msg_t msg;
|
orte_odls_pipe_err_msg_t msg;
|
||||||
char file[ORTE_ODLS_MAX_FILE_LEN + 1], topic[ORTE_ODLS_MAX_TOPIC_LEN + 1], *str = NULL;
|
char file[ORTE_ODLS_MAX_FILE_LEN + 1], topic[ORTE_ODLS_MAX_TOPIC_LEN + 1], *str = NULL;
|
||||||
|
|
||||||
if (NULL != child && ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
|
close(opts.p_stdin[0]);
|
||||||
/* connect endpoints IOF */
|
close(opts.p_stdout[1]);
|
||||||
rc = orte_iof_base_setup_parent(&child->name, &opts);
|
close(opts.p_stderr[1]);
|
||||||
if (ORTE_SUCCESS != rc) {
|
close(opts.p_internal[1]);
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
close(read_fd);
|
|
||||||
|
|
||||||
if (NULL != child) {
|
|
||||||
child->state = ORTE_PROC_STATE_UNDEF;
|
|
||||||
}
|
|
||||||
return rc;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Block reading a message from the pipe */
|
/* Block reading a message from the pipe */
|
||||||
while (1) {
|
while (1) {
|
||||||
@ -503,7 +482,7 @@ static int do_parent(orte_app_context_t* context,
|
|||||||
if (OPAL_SUCCESS != rc) {
|
if (OPAL_SUCCESS != rc) {
|
||||||
orte_show_help("help-orte-odls-default.txt", "syscall fail",
|
orte_show_help("help-orte-odls-default.txt", "syscall fail",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename, context->app,
|
orte_process_info.nodename, app,
|
||||||
"opal_fd_read", __FILE__, __LINE__);
|
"opal_fd_read", __FILE__, __LINE__);
|
||||||
if (NULL != child) {
|
if (NULL != child) {
|
||||||
child->state = ORTE_PROC_STATE_UNDEF;
|
child->state = ORTE_PROC_STATE_UNDEF;
|
||||||
@ -517,7 +496,7 @@ static int do_parent(orte_app_context_t* context,
|
|||||||
if (OPAL_SUCCESS != rc) {
|
if (OPAL_SUCCESS != rc) {
|
||||||
orte_show_help("help-orte-odls-default.txt", "syscall fail",
|
orte_show_help("help-orte-odls-default.txt", "syscall fail",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename, context->app,
|
orte_process_info.nodename, app,
|
||||||
"opal_fd_read", __FILE__, __LINE__);
|
"opal_fd_read", __FILE__, __LINE__);
|
||||||
if (NULL != child) {
|
if (NULL != child) {
|
||||||
child->state = ORTE_PROC_STATE_UNDEF;
|
child->state = ORTE_PROC_STATE_UNDEF;
|
||||||
@ -531,7 +510,7 @@ static int do_parent(orte_app_context_t* context,
|
|||||||
if (NULL == str) {
|
if (NULL == str) {
|
||||||
orte_show_help("help-orte-odls-default.txt", "syscall fail",
|
orte_show_help("help-orte-odls-default.txt", "syscall fail",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename, context->app,
|
orte_process_info.nodename, app,
|
||||||
"opal_fd_read", __FILE__, __LINE__);
|
"opal_fd_read", __FILE__, __LINE__);
|
||||||
if (NULL != child) {
|
if (NULL != child) {
|
||||||
child->state = ORTE_PROC_STATE_UNDEF;
|
child->state = ORTE_PROC_STATE_UNDEF;
|
||||||
@ -580,39 +559,16 @@ static int do_parent(orte_app_context_t* context,
|
|||||||
/**
|
/**
|
||||||
* Fork/exec the specified processes
|
* Fork/exec the specified processes
|
||||||
*/
|
*/
|
||||||
static int odls_default_fork_local_proc(orte_app_context_t* context,
|
static int odls_default_fork_local_proc(orte_proc_t *child,
|
||||||
orte_proc_t *child,
|
char *app,
|
||||||
|
char **argv,
|
||||||
char **environ_copy,
|
char **environ_copy,
|
||||||
orte_job_t *jobdat)
|
orte_job_t *jobdat,
|
||||||
|
orte_iof_base_io_conf_t opts)
|
||||||
{
|
{
|
||||||
orte_iof_base_io_conf_t opts = {0};
|
int p[2];
|
||||||
int rc, p[2];
|
|
||||||
pid_t pid;
|
pid_t pid;
|
||||||
|
|
||||||
if (NULL != child) {
|
|
||||||
/* should pull this information from MPIRUN instead of going with
|
|
||||||
default */
|
|
||||||
opts.usepty = OPAL_ENABLE_PTY_SUPPORT;
|
|
||||||
|
|
||||||
/* do we want to setup stdin? */
|
|
||||||
if (NULL != child &&
|
|
||||||
(jobdat->stdin_target == ORTE_VPID_WILDCARD ||
|
|
||||||
child->name.vpid == jobdat->stdin_target)) {
|
|
||||||
opts.connect_stdin = true;
|
|
||||||
} else {
|
|
||||||
opts.connect_stdin = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ORTE_SUCCESS != (rc = orte_iof_base_setup_prefork(&opts))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
if (NULL != child) {
|
|
||||||
child->state = ORTE_PROC_STATE_FAILED_TO_START;
|
|
||||||
child->exit_code = rc;
|
|
||||||
}
|
|
||||||
return rc;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* A pipe is used to communicate between the parent and child to
|
/* A pipe is used to communicate between the parent and child to
|
||||||
indicate whether the exec ultimately succeeded or failed. The
|
indicate whether the exec ultimately succeeded or failed. The
|
||||||
child sets the pipe to be close-on-exec; the child only ever
|
child sets the pipe to be close-on-exec; the child only ever
|
||||||
@ -647,12 +603,12 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
|
|||||||
|
|
||||||
if (pid == 0) {
|
if (pid == 0) {
|
||||||
close(p[0]);
|
close(p[0]);
|
||||||
do_child(context, child, environ_copy, jobdat, p[1], opts);
|
do_child(child, app, argv, environ_copy, jobdat, p[1], opts);
|
||||||
/* Does not return */
|
/* Does not return */
|
||||||
}
|
}
|
||||||
|
|
||||||
close(p[1]);
|
close(p[1]);
|
||||||
return do_parent(context, child, environ_copy, jobdat, p[0], opts);
|
return do_parent(child, app, argv, environ_copy, jobdat, p[0], opts);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -73,7 +73,8 @@ ORTE_DECLSPEC int orte_schizo_base_setup_fork(orte_job_t *jdata,
|
|||||||
orte_app_context_t *context);
|
orte_app_context_t *context);
|
||||||
ORTE_DECLSPEC int orte_schizo_base_setup_child(orte_job_t *jobdat,
|
ORTE_DECLSPEC int orte_schizo_base_setup_child(orte_job_t *jobdat,
|
||||||
orte_proc_t *child,
|
orte_proc_t *child,
|
||||||
orte_app_context_t *app);
|
orte_app_context_t *app,
|
||||||
|
char ***env);
|
||||||
ORTE_DECLSPEC orte_schizo_launch_environ_t orte_schizo_base_check_launch_environment(void);
|
ORTE_DECLSPEC orte_schizo_launch_environ_t orte_schizo_base_check_launch_environment(void);
|
||||||
ORTE_DECLSPEC long orte_schizo_base_get_remaining_time(void);
|
ORTE_DECLSPEC long orte_schizo_base_get_remaining_time(void);
|
||||||
ORTE_DECLSPEC void orte_schizo_base_finalize(void);
|
ORTE_DECLSPEC void orte_schizo_base_finalize(void);
|
||||||
|
@ -128,14 +128,15 @@ int orte_schizo_base_setup_fork(orte_job_t *jdata,
|
|||||||
|
|
||||||
int orte_schizo_base_setup_child(orte_job_t *jdata,
|
int orte_schizo_base_setup_child(orte_job_t *jdata,
|
||||||
orte_proc_t *child,
|
orte_proc_t *child,
|
||||||
orte_app_context_t *app)
|
orte_app_context_t *app,
|
||||||
|
char ***env)
|
||||||
{
|
{
|
||||||
int rc;
|
int rc;
|
||||||
orte_schizo_base_active_module_t *mod;
|
orte_schizo_base_active_module_t *mod;
|
||||||
|
|
||||||
OPAL_LIST_FOREACH(mod, &orte_schizo_base.active_modules, orte_schizo_base_active_module_t) {
|
OPAL_LIST_FOREACH(mod, &orte_schizo_base.active_modules, orte_schizo_base_active_module_t) {
|
||||||
if (NULL != mod->module->setup_child) {
|
if (NULL != mod->module->setup_child) {
|
||||||
rc = mod->module->setup_child(jdata, child, app);
|
rc = mod->module->setup_child(jdata, child, app, env);
|
||||||
if (ORTE_SUCCESS != rc && ORTE_ERR_TAKE_NEXT_OPTION != rc) {
|
if (ORTE_SUCCESS != rc && ORTE_ERR_TAKE_NEXT_OPTION != rc) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return rc;
|
return rc;
|
||||||
|
@ -61,7 +61,8 @@ static int setup_fork(orte_job_t *jdata,
|
|||||||
orte_app_context_t *context);
|
orte_app_context_t *context);
|
||||||
static int setup_child(orte_job_t *jobdat,
|
static int setup_child(orte_job_t *jobdat,
|
||||||
orte_proc_t *child,
|
orte_proc_t *child,
|
||||||
orte_app_context_t *app);
|
orte_app_context_t *app,
|
||||||
|
char ***env);
|
||||||
|
|
||||||
orte_schizo_base_module_t orte_schizo_ompi_module = {
|
orte_schizo_base_module_t orte_schizo_ompi_module = {
|
||||||
.define_cli = define_cli,
|
.define_cli = define_cli,
|
||||||
@ -992,7 +993,8 @@ static int setup_fork(orte_job_t *jdata,
|
|||||||
|
|
||||||
static int setup_child(orte_job_t *jdata,
|
static int setup_child(orte_job_t *jdata,
|
||||||
orte_proc_t *child,
|
orte_proc_t *child,
|
||||||
orte_app_context_t *app)
|
orte_app_context_t *app,
|
||||||
|
char ***env)
|
||||||
{
|
{
|
||||||
char *param, *value;
|
char *param, *value;
|
||||||
int rc, i;
|
int rc, i;
|
||||||
@ -1026,7 +1028,7 @@ static int setup_child(orte_job_t *jdata,
|
|||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
opal_setenv("OMPI_MCA_ess_base_jobid", value, true, &app->env);
|
opal_setenv("OMPI_MCA_ess_base_jobid", value, true, env);
|
||||||
free(value);
|
free(value);
|
||||||
|
|
||||||
/* setup the vpid */
|
/* setup the vpid */
|
||||||
@ -1034,7 +1036,7 @@ static int setup_child(orte_job_t *jdata,
|
|||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
opal_setenv("OMPI_MCA_ess_base_vpid", value, true, &app->env);
|
opal_setenv("OMPI_MCA_ess_base_vpid", value, true, env);
|
||||||
|
|
||||||
/* although the vpid IS the process' rank within the job, users
|
/* although the vpid IS the process' rank within the job, users
|
||||||
* would appreciate being given a public environmental variable
|
* would appreciate being given a public environmental variable
|
||||||
@ -1044,7 +1046,7 @@ static int setup_child(orte_job_t *jdata,
|
|||||||
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
|
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
|
||||||
* We know - just live with it
|
* We know - just live with it
|
||||||
*/
|
*/
|
||||||
opal_setenv("OMPI_COMM_WORLD_RANK", value, true, &app->env);
|
opal_setenv("OMPI_COMM_WORLD_RANK", value, true, env);
|
||||||
free(value); /* done with this now */
|
free(value); /* done with this now */
|
||||||
|
|
||||||
/* users would appreciate being given a public environmental variable
|
/* users would appreciate being given a public environmental variable
|
||||||
@ -1060,7 +1062,7 @@ static int setup_child(orte_job_t *jdata,
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
asprintf(&value, "%lu", (unsigned long) child->local_rank);
|
asprintf(&value, "%lu", (unsigned long) child->local_rank);
|
||||||
opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", value, true, &app->env);
|
opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", value, true, env);
|
||||||
free(value);
|
free(value);
|
||||||
|
|
||||||
/* users would appreciate being given a public environmental variable
|
/* users would appreciate being given a public environmental variable
|
||||||
@ -1076,9 +1078,9 @@ static int setup_child(orte_job_t *jdata,
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
asprintf(&value, "%lu", (unsigned long) child->node_rank);
|
asprintf(&value, "%lu", (unsigned long) child->node_rank);
|
||||||
opal_setenv("OMPI_COMM_WORLD_NODE_RANK", value, true, &app->env);
|
opal_setenv("OMPI_COMM_WORLD_NODE_RANK", value, true, env);
|
||||||
/* set an mca param for it too */
|
/* set an mca param for it too */
|
||||||
opal_setenv("OMPI_MCA_orte_ess_node_rank", value, true, &app->env);
|
opal_setenv("OMPI_MCA_orte_ess_node_rank", value, true, env);
|
||||||
free(value);
|
free(value);
|
||||||
|
|
||||||
/* provide the identifier for the PMIx connection - the
|
/* provide the identifier for the PMIx connection - the
|
||||||
@ -1087,7 +1089,7 @@ static int setup_child(orte_job_t *jdata,
|
|||||||
* process name are the same, it isn't necessarily
|
* process name are the same, it isn't necessarily
|
||||||
* required */
|
* required */
|
||||||
orte_util_convert_process_name_to_string(&value, &child->name);
|
orte_util_convert_process_name_to_string(&value, &child->name);
|
||||||
opal_setenv("PMIX_ID", value, true, &app->env);
|
opal_setenv("PMIX_ID", value, true, env);
|
||||||
free(value);
|
free(value);
|
||||||
|
|
||||||
nrptr = &nrestarts;
|
nrptr = &nrestarts;
|
||||||
@ -1097,14 +1099,14 @@ static int setup_child(orte_job_t *jdata,
|
|||||||
* restarted so they can take appropriate action
|
* restarted so they can take appropriate action
|
||||||
*/
|
*/
|
||||||
asprintf(&value, "%d", nrestarts);
|
asprintf(&value, "%d", nrestarts);
|
||||||
opal_setenv("OMPI_MCA_orte_num_restarts", value, true, &app->env);
|
opal_setenv("OMPI_MCA_orte_num_restarts", value, true, env);
|
||||||
free(value);
|
free(value);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* if the proc should not barrier in orte_init, tell it */
|
/* if the proc should not barrier in orte_init, tell it */
|
||||||
if (orte_get_attribute(&child->attributes, ORTE_PROC_NOBARRIER, NULL, OPAL_BOOL)
|
if (orte_get_attribute(&child->attributes, ORTE_PROC_NOBARRIER, NULL, OPAL_BOOL)
|
||||||
|| 0 < nrestarts) {
|
|| 0 < nrestarts) {
|
||||||
opal_setenv("OMPI_MCA_orte_do_not_barrier", "1", true, &app->env);
|
opal_setenv("OMPI_MCA_orte_do_not_barrier", "1", true, env);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* if the proc isn't going to forward IO, then we need to flag that
|
/* if the proc isn't going to forward IO, then we need to flag that
|
||||||
@ -1116,7 +1118,7 @@ static int setup_child(orte_job_t *jdata,
|
|||||||
|
|
||||||
/* pass an envar so the proc can find any files it had prepositioned */
|
/* pass an envar so the proc can find any files it had prepositioned */
|
||||||
param = orte_process_info.proc_session_dir;
|
param = orte_process_info.proc_session_dir;
|
||||||
opal_setenv("OMPI_FILE_LOCATION", param, true, &app->env);
|
opal_setenv("OMPI_FILE_LOCATION", param, true, env);
|
||||||
|
|
||||||
/* if the user wanted the cwd to be the proc's session dir, then
|
/* if the user wanted the cwd to be the proc's session dir, then
|
||||||
* switch to that location now
|
* switch to that location now
|
||||||
@ -1144,9 +1146,9 @@ static int setup_child(orte_job_t *jdata,
|
|||||||
* again not match getcwd! This is beyond our control - we are only
|
* again not match getcwd! This is beyond our control - we are only
|
||||||
* ensuring they start out matching.
|
* ensuring they start out matching.
|
||||||
*/
|
*/
|
||||||
opal_setenv("PWD", param, true, &app->env);
|
opal_setenv("PWD", param, true, env);
|
||||||
/* update the initial wdir value too */
|
/* update the initial wdir value too */
|
||||||
opal_setenv("OMPI_MCA_initial_wdir", param, true, &app->env);
|
opal_setenv("OMPI_MCA_initial_wdir", param, true, env);
|
||||||
}
|
}
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
@ -88,7 +88,8 @@ typedef int (*orte_schizo_base_module_setup_fork_fn_t)(orte_job_t *jdata,
|
|||||||
* proc upon execution */
|
* proc upon execution */
|
||||||
typedef int (*orte_schizo_base_module_setup_child_fn_t)(orte_job_t *jdata,
|
typedef int (*orte_schizo_base_module_setup_child_fn_t)(orte_job_t *jdata,
|
||||||
orte_proc_t *child,
|
orte_proc_t *child,
|
||||||
orte_app_context_t *app);
|
orte_app_context_t *app,
|
||||||
|
char ***env);
|
||||||
|
|
||||||
|
|
||||||
typedef enum {
|
typedef enum {
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user