
Make the Windows PLS component match the current requirements for a PLS module.

This commit was SVN r15019.
This commit is contained in:
George Bosilca 2007-06-12 22:34:56 +00:00
parent af64009368
commit bf6f30a42c
2 changed files with 124 additions and 306 deletions
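For orientation, the PLS module interface this component is being brought in line with amounts to a function-pointer table; the sketch below is reconstructed from the declarations visible in the diff, with stand-in typedefs instead of the real ORTE/OPAL types, and the struct name and field names are illustrative (the real orte_pls_base_module_t carries additional entries). The key API changes visible here: terminate_orteds no longer takes a jobid, and cancel_operation is removed.

/* Compilable sketch of the updated PLS module entry points.
 * All typedefs below are placeholders, not the real ORTE definitions. */
#include <stdint.h>

struct timeval;                                          /* from <sys/time.h> */
typedef int32_t orte_jobid_t;                            /* stand-in */
typedef struct orte_process_name_t orte_process_name_t;  /* opaque stand-in */
typedef struct opal_list_t opal_list_t;                  /* opaque stand-in */

typedef struct {
    int (*launch_job)(orte_jobid_t jobid);
    int (*terminate_job)(orte_jobid_t jobid, struct timeval *timeout,
                         opal_list_t *attrs);
    /* no jobid parameter anymore: operates on the active job */
    int (*terminate_orteds)(struct timeval *timeout, opal_list_t *attrs);
    int (*terminate_proc)(const orte_process_name_t *name);
    int (*signal_job)(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs);
    int (*signal_proc)(const orte_process_name_t *name, int32_t signal);
    int (*finalize)(void);                /* cancel_operation slot is gone */
} pls_module_table_sketch_t;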

View file

@@ -49,17 +49,6 @@ orte_pls_base_module_t* orte_pls_process_component_init(int *priority);
  */
 int orte_pls_process_finalize(void);
 
-/*
- * Interface
- */
-int orte_pls_process_launch(orte_jobid_t);
-int orte_pls_process_terminate_job(orte_jobid_t, struct timeval *timeout, opal_list_t*);
-int orte_pls_process_terminate_orteds(orte_jobid_t, struct timeval *timeout, opal_list_t*);
-int orte_pls_process_terminate_proc(const orte_process_name_t* proc_name);
-int orte_pls_process_signal_job(orte_jobid_t, int32_t, opal_list_t*);
-int orte_pls_process_signal_proc(const orte_process_name_t* proc_name, int32_t);
-int orte_pls_process_cancel_operation(void);
-
 /**
  * PLS Component
  */

View file

@@ -2,7 +2,7 @@
  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
  *                         University Research and Technology
  *                         Corporation.  All rights reserved.
- * Copyright (c) 2004-2006 The University of Tennessee and The University
+ * Copyright (c) 2004-2007 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
  * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
@@ -30,12 +30,6 @@
 #include <stdlib.h>
-#include <windows.h> //daniel
-#include <process.h> //
-#include <stdio.h> //
-#include <tchar.h> //daniel
-
-
 #ifdef HAVE_UNISTD_H
 #include <unistd.h>
 #endif
@@ -62,7 +56,7 @@
 #include <pwd.h>
 #endif
 
-#include "opal/install_dirs.h"
+#include "opal/mca/installdirs/installdirs.h"
 #include "opal/mca/base/mca_base_param.h"
 #include "opal/util/if.h"
 #include "opal/util/os_path.h"
@@ -80,6 +74,8 @@
 #include "orte/util/session_dir.h"
 #include "orte/runtime/orte_wait.h"
+#include "orte/runtime/orte_wakeup.h"
+#include "orte/runtime/params.h"
 #include "orte/dss/dss.h"
 #include "orte/mca/ns/ns.h"
@@ -102,6 +98,15 @@
 static int orte_pls_process_launch_threaded(orte_jobid_t jobid);
 #endif
 
+/*
+ * Interface
+ */
+static int orte_pls_process_launch(orte_jobid_t);
+static int orte_pls_process_terminate_job(orte_jobid_t, struct timeval *timeout, opal_list_t*);
+static int orte_pls_process_terminate_orteds(struct timeval *timeout, opal_list_t*);
+static int orte_pls_process_terminate_proc(const orte_process_name_t* proc_name);
+static int orte_pls_process_signal_job(orte_jobid_t, int32_t, opal_list_t*);
+static int orte_pls_process_signal_proc(const orte_process_name_t* proc_name, int32_t);
 
 orte_pls_base_module_t orte_pls_process_module = {
 #if OMPI_HAVE_POSIX_THREADS && OMPI_THREADS_HAVE_DIFFERENT_PIDS && OMPI_ENABLE_PROGRESS_THREADS
@@ -114,7 +119,6 @@ orte_pls_base_module_t orte_pls_process_module = {
     orte_pls_process_terminate_proc,
     orte_pls_process_signal_job,
     orte_pls_process_signal_proc,
-    orte_pls_process_cancel_operation,
     orte_pls_process_finalize
 };
@@ -122,6 +126,7 @@ static void set_handler_default(int sig);
 enum {
     ORTE_PLS_RSH_SHELL_BASH = 0,
+    ORTE_PLS_RSH_SHELL_ZSH,
     ORTE_PLS_RSH_SHELL_TCSH,
     ORTE_PLS_RSH_SHELL_CSH,
     ORTE_PLS_RSH_SHELL_KSH,
@@ -133,6 +138,7 @@ typedef int orte_pls_process_shell;
 static const char * orte_pls_process_shell_name[] = {
     "bash",
+    "zsh",
     "tcsh",        /* tcsh has to be first otherwise strstr finds csh */
     "csh",
     "ksh",
@@ -142,14 +148,10 @@ static const char * orte_pls_process_shell_name[] = {
 
 /* local global storage of timing variables */
-static unsigned long mintime=999999999, miniter, maxtime=0, maxiter;
-static float avgtime=0.0;
-static struct timeval *launchstart;
 static struct timeval joblaunchstart, joblaunchstop;
 
-/* local global storage of the list of active daemons */
-static opal_list_t active_daemons;
+/* global storage of active jobid being launched */
+static orte_jobid_t active_job = ORTE_JOBID_INVALID;
 
 /**
  * Check the Shell variable on the specified node
@@ -350,14 +352,14 @@ static int orte_pls_process_fill_exec_path( char ** exec_path )
 {
     struct stat buf;
 
-    asprintf(exec_path, "%s/orted", OPAL_BINDIR);
+    asprintf(exec_path, "%s/orted", opal_install_dirs.bindir);
     if (0 != stat(*exec_path, &buf)) {
         char *path = getenv("PATH");
         if (NULL == path) {
             path = "PATH is empty!";
         }
         opal_show_help("help-pls-process.txt", "no-local-orted",
-                       true, path, OPAL_BINDIR);
+                       true, path, opal_install_dirs.bindir);
         return ORTE_ERR_NOT_FOUND;
     }
     return ORTE_SUCCESS;
@@ -370,56 +372,12 @@ static int orte_pls_process_fill_exec_path( char ** exec_path )
 static void orte_pls_process_wait_daemon(pid_t pid, int status, void* cbdata)
 {
     orte_pls_daemon_info_t *info = (orte_pls_daemon_info_t*) cbdata;
-    orte_mapped_node_t *node;
-    orte_mapped_proc_t *proc;
-    opal_list_item_t *item;
     int rc;
     unsigned long deltat;
-    struct timeval launchstop;
 
-    /* if ssh exited abnormally, set the child processes to aborted
-       and print something useful to the user.  The usual reasons for
-       ssh to exit abnormally all are a pretty good indication that
-       the child processes aren't going to start up properly.
-
-       This should somehow be pushed up to the calling level, but we
-       don't really have a way to do that just yet.
-    */
     if (! WIFEXITED(status) || ! WEXITSTATUS(status) == 0) {
-        /* get the mapping for our node so we can cancel the right things */
-        rc = orte_rmaps.get_node_map(&node, info->cell,
-                                     info->nodename, info->active_job);
-        if (ORTE_SUCCESS != rc) {
-            ORTE_ERROR_LOG(rc);
-            goto cleanup;
-        }
-
-        /* set state of all processes associated with the daemon as
-           terminated */
-        for(item =  opal_list_get_first(&node->procs);
-            item != opal_list_get_end(&node->procs);
-            item =  opal_list_get_next(item)) {
-            proc = (orte_mapped_proc_t*) item;
-
-            /* Clean up the session directory as if we were the
-               process itself.  This covers the case where the
-               process died abnormally and didn't cleanup its own
-               session directory. */
-            orte_session_dir_finalize(&(proc->name));
-
-            rc = orte_smr.set_proc_state(&(proc->name),
-                                         ORTE_PROC_STATE_ABORTED, status);
-            if (ORTE_SUCCESS != rc) {
-                ORTE_ERROR_LOG(rc);
-            }
-        }
-        OBJ_RELEASE(node);
-
-cleanup:
         /* tell the user something went wrong */
-        opal_output(0, "ERROR: A daemon on node %s failed to start as expected.",
-                    info->nodename);
+        opal_output(0, "ERROR: A daemon failed to start as expected.");
         opal_output(0, "ERROR: There may be more information available from");
         opal_output(0, "ERROR: the remote shell (see above).");
@@ -440,38 +398,23 @@ static void orte_pls_process_wait_daemon(pid_t pid, int status, void* cbdata)
         } else {
             opal_output(0, "No extra status information is available: %d.", status);
         }
-        OPAL_THREAD_LOCK(&mca_pls_process_component.lock);
-        /* tell the system that this daemon is gone */
-        if (ORTE_SUCCESS != (rc = orte_pls_base_remove_daemon(info))) {
+        /* The usual reasons for ssh to exit abnormally all are a pretty good
+           indication that the child processes aren't going to start up properly.
+           Set the job state to indicate we failed to launch so orterun's exit status
+           will be non-zero and forcibly terminate the job so orterun can exit
+        */
+        if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(active_job, ORTE_JOB_STATE_FAILED_TO_START))) {
             ORTE_ERROR_LOG(rc);
         }
-        /* remove the daemon from our local list */
-        opal_list_remove_item(&active_daemons, &info->super);
-        OBJ_RELEASE(info);
-        OPAL_THREAD_UNLOCK(&mca_pls_process_component.lock);
+        if (ORTE_SUCCESS != (rc = orte_wakeup(active_job))) {
+            ORTE_ERROR_LOG(rc);
+        }
     }   /* if abnormal exit */
 
     /* release any waiting threads */
     OPAL_THREAD_LOCK(&mca_pls_process_component.lock);
 
-    /* first check timing request */
-    if (mca_pls_process_component.timing) {
-        if (0 != gettimeofday(&launchstop, NULL)) {
-            opal_output(0, "pls_process: could not obtain stop time");
-        } else {
-            deltat = (launchstop.tv_sec - launchstart[info->name->vpid].tv_sec)*1000000 +
-                     (launchstop.tv_usec - launchstart[info->name->vpid].tv_usec);
-            avgtime = avgtime + deltat;
-            if (deltat < mintime) {
-                mintime = deltat;
-                miniter = (unsigned long)info->name->vpid;
-            }
-            if (deltat > maxtime) {
-                maxtime = deltat;
-                maxiter = (unsigned long)info->name->vpid;
-            }
-        }
-    }
-
     if (mca_pls_process_component.num_children-- >=
         mca_pls_process_component.num_concurrent ||
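A note on the abnormal-exit test kept in the hunk above: `! WIFEXITED(status) || ! WEXITSTATUS(status) == 0` only works because `!` binds tighter than `==`, so `(!WEXITSTATUS(status)) == 0` happens to equal `WEXITSTATUS(status) != 0`. The standalone POSIX sketch below shows the clearer spelling; assuming the Windows build obtains these macros from a compatibility layer, since <sys/wait.h> is POSIX-only.

/* Minimal sketch: classify a child's exit status the way the callback does. */
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    pid_t pid = fork();
    if (pid == 0) {
        _exit(3);                  /* child exits with a nonzero status */
    }
    int status;
    waitpid(pid, &status, 0);      /* reap the child */
    /* equivalent to the diff's `! WIFEXITED(s) || ! WEXITSTATUS(s) == 0` */
    if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
        printf("abnormal exit: code %d\n",
               WIFEXITED(status) ? WEXITSTATUS(status) : -1);
    }
    return 0;
}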
@@ -486,17 +429,7 @@ static void orte_pls_process_wait_daemon(pid_t pid, int status, void* cbdata)
             deltat = (joblaunchstop.tv_sec - joblaunchstart.tv_sec)*1000000 +
                      (joblaunchstop.tv_usec - joblaunchstart.tv_usec);
             opal_output(0, "pls_process: total time to launch job is %lu usec", deltat);
-            if (mintime < 999999999) {
-                /* had at least one non-local node */
-                avgtime = avgtime/opal_list_get_size(&active_daemons);
-                opal_output(0, "pls_process: average time to launch one daemon %f usec", avgtime);
-                opal_output(0, "pls_process: min time to launch a daemon was %lu usec for iter %lu", mintime, miniter);
-                opal_output(0, "pls_process: max time to launch a daemon was %lu usec for iter %lu", maxtime, maxiter);
-            } else {
-                opal_output(0, "No nonlocal launches to report for timing info");
-            }
         }
-        free(launchstart);
     }
 
     OPAL_THREAD_UNLOCK(&mca_pls_process_component.lock);
@@ -510,22 +443,21 @@ static void orte_pls_process_wait_daemon(pid_t pid, int status, void* cbdata)
 
 int orte_pls_process_launch(orte_jobid_t jobid)
 {
-    orte_job_map_t *map;
+    orte_job_map_t *map = NULL;
     opal_list_item_t *n_item;
     orte_mapped_node_t *rmaps_node;
     orte_std_cntr_t num_nodes;
-    orte_vpid_t vpid;
     int node_name_index2;
     int proc_name_index;
     int local_exec_index;
     char *jobid_string = NULL;
-    char *uri, *param;
+    char *param;
     char **argv = NULL;
     char *prefix_dir;
     int argc = 0;
     int rc;
     char *lib_base = NULL, *bin_base = NULL;
-    orte_pls_daemon_info_t *dmn;
+    bool failed_launch = true;
 
     if (mca_pls_process_component.timing) {
         if (0 != gettimeofday(&joblaunchstart, NULL)) {
@@ -535,11 +467,8 @@ int orte_pls_process_launch(orte_jobid_t jobid)
         }
     }
 
-    /* setup a list that will contain the info for all the daemons
-     * so we can store it on the registry when done and use it
-     * locally to track their state
-     */
-    OBJ_CONSTRUCT(&active_daemons, opal_list_t);
+    /* set the active jobid */
+    active_job = jobid;
 
     /* Get the map for this job
      * We need the entire mapping for a couple of reasons:
@ -550,40 +479,42 @@ int orte_pls_process_launch(orte_jobid_t jobid)
rc = orte_rmaps.get_job_map(&map, jobid); rc = orte_rmaps.get_job_map(&map, jobid);
if (ORTE_SUCCESS != rc) { if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&active_daemons);
return rc; return rc;
} }
/* if the user requested that we re-use daemons, /* account for any reuse of daemons */
* launch the procs on any existing, re-usable daemons if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) {
*/ ORTE_ERROR_LOG(rc);
if (orte_pls_base.reuse_daemons) { goto cleanup;
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) { }
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(map); num_nodes = map->num_new_daemons;
OBJ_DESTRUCT(&active_daemons); if (0 == num_nodes) {
return rc; /* nothing to do - just return */
} failed_launch = false;
}
num_nodes = (orte_std_cntr_t)opal_list_get_size(&map->nodes);
if (0 == num_nodes) {
/* nothing left to do - just return */
OBJ_RELEASE(map); OBJ_RELEASE(map);
OBJ_DESTRUCT(&active_daemons);
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
if (mca_pls_process_component.debug_daemons && if (mca_pls_process_component.debug_daemons &&
mca_pls_process_component.num_concurrent < num_nodes) { mca_pls_process_component.num_concurrent < num_nodes) {
/* we can't run in this situation, so pretty print the error /**
* and exit * If we are in '--debug-daemons' we keep the ssh connection
* alive for the span of the run. If we use this option
* AND we launch on more than "num_concurrent" machines
* then we will deadlock. No connections are terminated
* until the job is complete, no job is started
* since all the orteds are waiting for all the others
* to come online, and the others ore not launched because
* we are waiting on those that have started to terminate
* their ssh tunnels. :(
* As we cannot run in this situation, pretty print the error
* and return an error code.
*/ */
opal_show_help("help-pls-process.txt", "deadlock-params", opal_show_help("help-pls-process.txt", "deadlock-params",
true, mca_pls_process_component.num_concurrent, num_nodes); true, mca_pls_process_component.num_concurrent, num_nodes);
OBJ_RELEASE(map); rc = ORTE_ERR_FATAL;
OBJ_DESTRUCT(&active_daemons); goto cleanup;
return ORTE_ERR_FATAL;
} }
/* /*
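The deadlock comment added in the hunk above refers to the launch throttle that appears later in this diff: the launcher bumps num_children and blocks on a condition variable once num_concurrent daemons are in flight, and the wait callback wakes it as daemons are reaped. Below is a minimal sketch of that pattern in plain pthreads; the names are illustrative and the while-loop form is the textbook idiom, whereas the component itself uses OPAL_THREAD_LOCK/opal_condition_wait with a post-increment test.

/* Sketch of a bounded-concurrency launch throttle. */
#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static int num_children;                /* daemons currently in flight */
static const int num_concurrent = 128;  /* size of the launch window */

static void before_spawn(void)
{
    pthread_mutex_lock(&lock);
    while (num_children >= num_concurrent) {
        /* window full: wait until a daemon is reaped */
        pthread_cond_wait(&cond, &lock);
    }
    num_children++;
    pthread_mutex_unlock(&lock);
}

static void on_daemon_reaped(void)
{
    pthread_mutex_lock(&lock);
    num_children--;
    pthread_cond_signal(&cond);         /* wake the blocked launcher */
    pthread_mutex_unlock(&lock);
}

With --debug-daemons, the ssh sessions never exit, so on_daemon_reaped() is never called and before_spawn() blocks forever once the window fills; that is exactly the deadlock the comment describes.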
@@ -605,29 +536,6 @@ int orte_pls_process_launch(orte_jobid_t jobid)
      */
     prefix_dir = map->apps[0]->prefix_dir;
 
-    /*
-     * Allocate a range of vpids for the daemons.
-     */
-    if (num_nodes == 0) {
-        return ORTE_ERR_BAD_PARAM;
-    }
-    rc = orte_ns.reserve_range(0, num_nodes, &vpid);
-    if (ORTE_SUCCESS != rc) {
-        goto cleanup;
-    }
-
-    /* setup the orted triggers for passing their launch info */
-    if (ORTE_SUCCESS != (rc = orte_smr.init_orted_stage_gates(jobid, num_nodes, NULL, NULL))) {
-        ORTE_ERROR_LOG(rc);
-        goto cleanup;
-    }
-
-    /* need integer value for command line parameter */
-    if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&jobid_string, jobid))) {
-        ORTE_ERROR_LOG(rc);
-        goto cleanup;
-    }
-
     /*
      * Build argv array
      */
@@ -641,9 +549,7 @@ int orte_pls_process_launch(orte_jobid_t jobid)
     orte_pls_base_orted_append_basic_args(&argc, &argv,
                                           &proc_name_index,
                                           &node_name_index2,
-                                          jobid_string,
-                                          (vpid + num_nodes)
-                                          );
+                                          map->num_nodes);
 
     if (mca_pls_process_component.debug) {
         param = opal_argv_join(argv, ' ');
@@ -684,63 +590,25 @@ int orte_pls_process_launch(orte_jobid_t jobid)
        and use that on the remote node.
     */
-    lib_base = opal_basename(OPAL_LIBDIR);
-    bin_base = opal_basename(OPAL_BINDIR);
+    lib_base = opal_basename(opal_install_dirs.libdir);
+    bin_base = opal_basename(opal_install_dirs.bindir);
 
     /*
      * Iterate through each of the nodes
      */
-    if (mca_pls_process_component.timing) {
-        /* allocate space to track the start times */
-        launchstart = (struct timeval*)malloc((num_nodes+vpid) * sizeof(struct timeval));
-    }
-
     for(n_item =  opal_list_get_first(&map->nodes);
         n_item != opal_list_get_end(&map->nodes);
         n_item =  opal_list_get_next(n_item)) {
-        orte_process_name_t* name;
         pid_t pid;
         char *exec_path = NULL;
         char **exec_argv;
 
         rmaps_node = (orte_mapped_node_t*)n_item;
 
-        if (mca_pls_process_component.timing) {
-            if (0 != gettimeofday(&launchstart[vpid], NULL)) {
-                opal_output(0, "pls_process: could not obtain start time");
-            }
-        }
-
-        /* new daemon - setup to record its info */
-        dmn = OBJ_NEW(orte_pls_daemon_info_t);
-        dmn->active_job = jobid;
-        opal_list_append(&active_daemons, &dmn->super);
-
-        /* setup node name */
-        free(argv[node_name_index2]);
-        argv[node_name_index2] = strdup(rmaps_node->nodename);
-
-        /* save it in the daemon info */
-        dmn->nodename = strdup(rmaps_node->nodename);
-
-        /* initialize daemons process name */
-        rc = orte_ns.create_process_name(&name, rmaps_node->cell, 0, vpid);
-        if (ORTE_SUCCESS != rc) {
-            ORTE_ERROR_LOG(rc);
-            goto cleanup;
-        }
-
-        /* save it in the daemon info */
-        dmn->cell = rmaps_node->cell;
-        if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&(dmn->name), name, ORTE_NAME))) {
-            ORTE_ERROR_LOG(rc);
-            goto cleanup;
-        }
-
-        /* set the process state to "launched" */
-        if (ORTE_SUCCESS != (rc = orte_smr.set_proc_state(name, ORTE_PROC_STATE_LAUNCHED, 0))) {
-            ORTE_ERROR_LOG(rc);
-            goto cleanup;
-        }
+        /* if this daemon already exists, don't launch it! */
+        if (rmaps_node->daemon_preexists) {
+            continue;
+        }
 
         if (mca_pls_process_component.debug) {
@@ -772,15 +640,7 @@ int orte_pls_process_launch(orte_jobid_t jobid)
             opal_output(0, "pls:process: %s is a LOCAL node\n",
                         rmaps_node->nodename);
         }
-        if (mca_pls_process_component.timing) {
-            /* since this is a local launch, the daemon will never reach
-             * the waitpid callback - so set the start value to
-             * something nonsensical
-             */
-            launchstart[vpid].tv_sec = 0;
-            launchstart[vpid].tv_usec = 0;
-        }
         exec_argv = &argv[local_exec_index];
         /* If the user provide a prefix then first try to find the application there */
         if( NULL != prefix_dir ) {
@@ -798,7 +658,7 @@ int orte_pls_process_launch(orte_jobid_t jobid)
 
         if( NULL == exec_path ) {
             char* full_path[2];
-            full_path[0] = opal_os_path( false, OPAL_BINDIR, NULL );
+            full_path[0] = opal_os_path( false, opal_install_dirs.bindir, NULL );
             full_path[1] = NULL;
             exec_path = opal_path_find(exec_argv[0], full_path, F_OK, NULL);
             free(full_path[0]);
@@ -881,9 +741,9 @@ int orte_pls_process_launch(orte_jobid_t jobid)
 #endif
 
         /* setup process name */
-        rc = orte_ns.get_proc_name_string(&name_string, name);
+        rc = orte_ns.get_proc_name_string(&name_string, rmaps_node->daemon);
         if (ORTE_SUCCESS != rc) {
-            opal_output(0, "orte_pls_process: unable to create process name");
+            opal_output(0, "orte_pls_process: unable to get daemon name as string");
             exit(-1);
         }
         free(argv[proc_name_index]);
@@ -915,31 +775,51 @@ int orte_pls_process_launch(orte_jobid_t jobid)
                 free(param);
             }
         }
 
-        pid = _spawnve( _P_NOWAIT, exec_path, exec_argv, env); //,NULL); daniel
-        if (pid == -1) opal_output(0, "pls:process: execv failed spawning process %s; errno=%d\n", exec_path, errno);
-        else opal_output(0, "pls:process: execv %s hopefully started (pid %d)\n", exec_path, pid);
+        pid = _spawnve( _P_NOWAIT, exec_path, exec_argv, env);
+        if (pid == -1) {
+            failed_launch = true;
+            rc = ORTE_ERROR;
+            goto cleanup;
+        }
+
+        /* indicate this daemon has been launched in case anyone is sitting on that trigger */
+        if (ORTE_SUCCESS != (rc = orte_smr.set_proc_state(rmaps_node->daemon, ORTE_PROC_STATE_LAUNCHED, 0))) {
+            ORTE_ERROR_LOG(rc);
+            goto cleanup;
+        }
+        opal_output(0, "pls:process: execv %s hopefully started (pid %d)\n", exec_path, pid);
+
+        OPAL_THREAD_LOCK(&mca_pls_process_component.lock);
+        /* This situation can lead to a deadlock if '--debug-daemons' is set.
+         * However, the deadlock condition is tested at the beginning of this
+         * function, so we're quite confident it should not happen here.
+         */
+        if (mca_pls_process_component.num_children++ >=
+            mca_pls_process_component.num_concurrent) {
+            opal_condition_wait(&mca_pls_process_component.cond, &mca_pls_process_component.lock);
+        }
+        OPAL_THREAD_UNLOCK(&mca_pls_process_component.lock);
 
         /* setup callback on sigchild - wait until setup above is complete
          * as the callback can occur in the call to orte_wait_cb
          */
-        orte_wait_cb(pid, orte_pls_process_wait_daemon, dmn);
+        orte_wait_cb(pid, orte_pls_process_wait_daemon, NULL);
 
         /* if required - add delay to avoid problems w/ X11 authentication */
         if (mca_pls_process_component.debug && mca_pls_process_component.delay) {
             sleep(mca_pls_process_component.delay);
         }
-        vpid++;
-
-        free(name);
     }
 
-    /* all done, so store the daemon info on the registry */
-    if (ORTE_SUCCESS != (rc = orte_pls_base_store_active_daemons(&active_daemons))) {
-        ORTE_ERROR_LOG(rc);
-    }
-
 cleanup:
-    OBJ_RELEASE(map);
+    if (NULL != map) {
+        OBJ_RELEASE(map);
+    }
 
     if (NULL != lib_base) {
         free(lib_base);
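For reference on the launch call in the hunk above: _spawnve with _P_NOWAIT is the MSVC CRT's asynchronous spawn; it returns immediately with the child's process handle, or -1 with errno set, and the component registers that handle with orte_wait_cb. The standalone Windows sketch below reaps the child synchronously with _cwait instead of a callback; the program path, arguments, and environment are illustrative.

/* Windows-only (MSVC CRT) sketch of the _P_NOWAIT spawn-and-reap pattern. */
#include <process.h>
#include <stdint.h>
#include <stdio.h>
#include <errno.h>

int main(void)
{
    /* argv[0] conventionally repeats the program name; lists are NULL-terminated */
    const char *const child_argv[] = { "cmd.exe", "/c", "echo hello", NULL };
    const char *const child_envp[] = { "EXAMPLE_VAR=1", NULL };

    intptr_t pid = _spawnve(_P_NOWAIT, "C:\\Windows\\System32\\cmd.exe",
                            child_argv, child_envp);
    if (pid == -1) {
        fprintf(stderr, "spawn failed: errno=%d\n", errno);
        return 1;
    }

    int status = 0;
    _cwait(&status, pid, _WAIT_CHILD);   /* reap, as the wait callback would */
    printf("child exited with status %d\n", status);
    return 0;
}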
@@ -948,8 +828,23 @@ int orte_pls_process_launch(orte_jobid_t jobid)
         free(bin_base);
     }
 
-    if (NULL != jobid_string) free(jobid_string);  /* done with this variable */
-    if (NULL != argv) opal_argv_free(argv);
+    if (NULL != jobid_string) {
+        free(jobid_string);  /* done with this variable */
+    }
+    if (NULL != argv) {
+        opal_argv_free(argv);
+    }
+
+    /* check for failed launch - if so, force terminate */
+    if( failed_launch ) {
+        if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(jobid, ORTE_JOB_STATE_FAILED_TO_START))) {
+            ORTE_ERROR_LOG(rc);
+        }
+        if (ORTE_SUCCESS != (rc = orte_wakeup(jobid))) {
+            ORTE_ERROR_LOG(rc);
+        }
+    }
 
     return rc;
 }
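The failed_launch flag introduced in this function is the common single-exit cleanup idiom: start pessimistic, clear the flag only once the happy path completes, and let the cleanup label both release resources and decide whether to force termination. A minimal standalone sketch of the idiom, with illustrative names:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Illustrative function using the goto-cleanup / failure-flag idiom. */
static int launch_sketch(void)
{
    int rc = 0;
    bool failed = true;          /* assume failure until proven otherwise */
    char *buf = NULL;

    buf = malloc(64);
    if (NULL == buf) {
        rc = -1;
        goto cleanup;            /* failed stays true: recovery runs below */
    }

    /* ... launch work would go here ... */
    failed = false;              /* reached the end: launch succeeded */

cleanup:
    free(buf);                   /* resources released on every path */
    if (failed) {
        fprintf(stderr, "launch failed, forcing terminate\n");
        /* the component marks the job FAILED_TO_START and wakes orterun here */
    }
    return rc;
}

int main(void) { return launch_sketch(); }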
@@ -961,60 +856,27 @@ int orte_pls_process_launch(orte_jobid_t jobid)
 int orte_pls_process_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
 {
     int rc;
-    opal_list_t daemons;
-    opal_list_item_t *item;
-
-    OPAL_TRACE(1);
-
-    /* construct the list of active daemons on this job */
-    OBJ_CONSTRUCT(&daemons, opal_list_t);
-    if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid, attrs))) {
-        ORTE_ERROR_LOG(rc);
-        goto CLEANUP;
-    }
 
     /* order them to kill their local procs for this job */
-    if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid, timeout))) {
+    if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(jobid, timeout, attrs))) {
         ORTE_ERROR_LOG(rc);
-        goto CLEANUP;
     }
 
-CLEANUP:
-    while (NULL != (item = opal_list_remove_first(&daemons))) {
-        OBJ_RELEASE(item);
-    }
-    OBJ_DESTRUCT(&daemons);
-
     return rc;
 }
 
 /**
  * Terminate the orteds for a given job
  */
-int orte_pls_process_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
+int orte_pls_process_terminate_orteds(struct timeval *timeout, opal_list_t *attrs)
 {
     int rc;
-    opal_list_t daemons;
-    opal_list_item_t *item;
-
-    OPAL_TRACE(1);
-
-    /* construct the list of active daemons on this job */
-    OBJ_CONSTRUCT(&daemons, opal_list_t);
-    if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid, attrs))) {
-        ORTE_ERROR_LOG(rc);
-        goto CLEANUP;
-    }
 
     /* now tell them to die! */
-    if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons, timeout))) {
+    if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(timeout, attrs))) {
         ORTE_ERROR_LOG(rc);
     }
 
-CLEANUP:
-    while (NULL != (item = opal_list_remove_first(&daemons))) {
-        OBJ_RELEASE(item);
-    }
-    OBJ_DESTRUCT(&daemons);
-
     return rc;
 }
@@ -1031,28 +893,12 @@ int orte_pls_process_terminate_proc(const orte_process_name_t* proc)
 int orte_pls_process_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs)
 {
     int rc;
-    opal_list_t daemons;
-    opal_list_item_t *item;
-
-    OPAL_TRACE(1);
-
-    /* construct the list of active daemons on this job */
-    OBJ_CONSTRUCT(&daemons, opal_list_t);
-    if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid, attrs))) {
-        ORTE_ERROR_LOG(rc);
-        OBJ_DESTRUCT(&daemons);
-        return rc;
-    }
 
     /* order them to pass this signal to their local procs */
-    if (ORTE_SUCCESS != (rc = orte_pls_base_orted_signal_local_procs(jobid, signal, &daemons))) {
+    if (ORTE_SUCCESS != (rc = orte_pls_base_orted_signal_local_procs(jobid, signal, attrs))) {
         ORTE_ERROR_LOG(rc);
     }
 
-    while (NULL != (item = opal_list_remove_first(&daemons))) {
-        OBJ_RELEASE(item);
-    }
-    OBJ_DESTRUCT(&daemons);
-
     return rc;
 }
@@ -1063,23 +909,6 @@ int orte_pls_process_signal_proc(const orte_process_name_t* proc, int32_t signal
     return ORTE_ERR_NOT_IMPLEMENTED;
 }
 
-/**
- * Cancel an operation involving comm to an orted
- */
-int orte_pls_process_cancel_operation(void)
-{
-    int rc;
-
-    OPAL_TRACE(1);
-
-    if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) {
-        ORTE_ERROR_LOG(rc);
-    }
-
-    return rc;
-}
-
 int orte_pls_process_finalize(void)
 {
     int rc;