1
1

Small update to the process PLS.

This commit was SVN r13293.
Этот коммит содержится в:
George Bosilca 2007-01-25 00:17:54 +00:00
родитель b6307807d8
Коммит 3b988fcdfd
2 изменённых файла: 40 добавлений и 68 удалений

Просмотреть файл

@ -158,7 +158,7 @@ int orte_pls_process_component_open(void)
mca_base_param_reg_string(c, "orted",
"The command name that the process pls component will invoke for the ORTE daemon",
false, false, "orted",
false, false, "orted.exe",
&mca_pls_process_component.orted);
mca_base_param_reg_int(c, "priority",
@ -189,6 +189,9 @@ extern char **environ;
orte_pls_base_module_t *orte_pls_process_component_init(int *priority)
{
/* if we are not an HNP, then don't select us */
if (!orte_process_info.seed) {
return NULL;
}
*priority = mca_pls_process_component.priority;
return &orte_pls_process_module;

Просмотреть файл

@ -346,7 +346,7 @@ static int orte_pls_process_probe(orte_mapped_node_t * node, orte_pls_process_sh
* Fill the exec_path variable with the directory to the orted
*/
static int orte_pls_process_fill_exec_path ( char ** exec_path)
static int orte_pls_process_fill_exec_path( char ** exec_path )
{
struct stat buf;
@ -354,7 +354,7 @@ static int orte_pls_process_fill_exec_path ( char ** exec_path)
if (0 != stat(*exec_path, &buf)) {
char *path = getenv("PATH");
if (NULL == path) {
path = ("PATH is empty!");
path = "PATH is empty!";
}
opal_show_help("help-pls-process.txt", "no-local-orted",
true, path, OPAL_BINDIR);
@ -746,7 +746,7 @@ int orte_pls_process_launch(orte_jobid_t jobid)
n_item = opal_list_get_next(n_item)) {
orte_process_name_t* name;
pid_t pid;
char *exec_path;
char *exec_path = NULL;
char **exec_argv;
rmaps_node = (orte_mapped_node_t*)n_item;
@ -783,22 +783,13 @@ int orte_pls_process_launch(orte_jobid_t jobid)
goto cleanup;
}
/* fork a child to exec the process/ssh session */
/* set the process state to "launched" */
if (ORTE_SUCCESS != (rc = orte_smr.set_proc_state(name, ORTE_PROC_STATE_LAUNCHED, 0))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* pid = fork();
if (pid < 0) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
*/
/* child */
/*if (pid == 0)*/ {
{
char* name_string;
char** env;
char* var;
@ -839,23 +830,32 @@ int orte_pls_process_launch(orte_jobid_t jobid)
}
exec_argv = &argv[local_exec_index];
exec_path = opal_path_findv(exec_argv[0], 0, environ, NULL);
/* If the user provide a prefix then first try to find the application there */
if( NULL != prefix_dir ) {
char* full_path[3];
if (NULL == exec_path && NULL == prefix_dir) {
rc = orte_pls_process_fill_exec_path (&exec_path);
if (ORTE_SUCCESS != rc) {
return rc;
}
} else {
if (NULL != prefix_dir) {
strcpy(prefix_dir,"c:");
exec_path = opal_os_path( false, prefix_dir, bin_base, "orted", NULL );
}
/* If we yet did not fill up the execpath, do so now */
if (NULL == exec_path) {
rc = orte_pls_process_fill_exec_path (&exec_path);
if (ORTE_SUCCESS != rc) {
return rc;
full_path[0] = opal_os_path( false, prefix_dir, NULL );
full_path[1] = opal_os_path( false, prefix_dir, bin_base, NULL );
full_path[2] = NULL;
exec_path = opal_path_find(exec_argv[0], full_path, F_OK, NULL);
free(full_path[0]); free(full_path[1]);
}
if( NULL == exec_path ) {
/* find the application in the default PATH */
exec_path = opal_path_findv(exec_argv[0], F_OK, environ, NULL);
if( NULL == exec_path ) {
char* full_path[2];
full_path[0] = opal_os_path( false, OPAL_BINDIR, NULL );
full_path[1] = NULL;
exec_path = opal_path_find(exec_argv[0], full_path, F_OK, NULL);
free(full_path[0]);
if( NULL == exec_path ) {
rc = orte_pls_process_fill_exec_path (&exec_path);
if (ORTE_SUCCESS != rc) {
return rc;
}
}
}
}
@ -872,7 +872,7 @@ int orte_pls_process_launch(orte_jobid_t jobid)
oldenv = getenv("PATH");
if (NULL != oldenv) {
char *temp;
asprintf(&temp, "%s;%s", newenv, oldenv ); //daniel asprintf(&temp, "%s:%s", newenv, oldenv );
asprintf(&temp, "%s;%s", newenv, oldenv );
free( newenv );
newenv = temp;
}
@ -881,13 +881,13 @@ int orte_pls_process_launch(orte_jobid_t jobid)
opal_output(0, "pls:process: reset PATH: %s", newenv);
}
free(newenv);
#if 0
/* Reset LD_LIBRARY_PATH */
newenv = opal_os_path( false, prefix_dir, lib_base, NULL );
oldenv = getenv("LD_LIBRARY_PATH");
if (NULL != oldenv) {
char* temp;
asprintf(&temp, "%s:%s", newenv, oldenv);
asprintf(&temp, "%s;%s", newenv, oldenv);
free(newenv);
newenv = temp;
}
@ -897,7 +897,6 @@ int orte_pls_process_launch(orte_jobid_t jobid)
newenv);
}
free(newenv);
#endif
}
/* Since this is a local execution, we need to
@ -916,7 +915,7 @@ int orte_pls_process_launch(orte_jobid_t jobid)
remote nodes (via process/ssh). This allows a user
to specify a path that is relative to $HOME for
both the cwd and argv[0] and it will work on
all nodes -- including the local nost.
all nodes -- including the local host.
Otherwise, it would work on remote nodes and
not the local node. If the user does not start
in $HOME on the remote nodes... well... let's
@ -965,16 +964,6 @@ int orte_pls_process_launch(orte_jobid_t jobid)
//set_handler_default(SIGPIPE);
set_handler_default(SIGCHLD);
/* Unblock all signals, for many of the same reasons that
we set the default handlers, above. This is noticable
on Linux where the event library blocks SIGTERM, but we
don't want that blocked by the orted (or, more
specifically, we don't want it to be blocked by the
orted and then inherited by the ORTE processes that it
forks, making them unkillable by SIGTERM). */
//sigprocmask(0, 0, &sigs);
//sigprocmask(SIG_UNBLOCK, &sigs, 0);
/* setup environment */
env = opal_argv_copy(environ);
var = mca_base_param_environ_variable("seed",NULL,NULL);
@ -988,30 +977,10 @@ int orte_pls_process_launch(orte_jobid_t jobid)
free(param);
}
}
//execve(exec_path, exec_argv, env);
pid = _spawnve( _P_NOWAIT, exec_path, exec_argv, env); //,NULL); daniel
if (pid == -1) opal_output(0, "pls:process: execv failed spawning new process; errno=%d\n", errno);
else opal_output(0, "pls:process: execv hopefully started (pid %d)\n", pid);
#if 0
} /*else*/ { /* father */
OPAL_THREAD_LOCK(&mca_pls_process_component.lock);
/* JJH Bug:
* If we are in '--debug-daemons' we keep the ssh connection
* alive for the span of the run. If we use this option
* AND we launch on more than "num_concurrent" machines
* then we will deadlock. No connections are terminated
* until the job is complete, no job is started
* since all the orteds are waiting for all the others
* to come online, and the others ore not launched because
* we are waiting on those that have started to terminate
* their ssh tunnels. :(
*/
if (mca_pls_process_component.num_children++ >=
mca_pls_process_component.num_concurrent) {
opal_condition_wait(&mca_pls_process_component.cond, &mca_pls_process_component.lock);
}
OPAL_THREAD_UNLOCK(&mca_pls_process_component.lock);
#endif
if (pid == -1) opal_output(0, "pls:process: execv failed spawning process %s; errno=%d\n", exec_path, errno);
else opal_output(0, "pls:process: execv %s hopefully started (pid %d)\n", exec_path, pid);
/* setup callback on sigchild - wait until setup above is complete
* as the callback can occur in the call to orte_wait_cb
*/