
Make the Windows PLS component match the current requirements for a PLS module.

This commit was SVN r15019.
George Bosilca 2007-06-12 22:34:56 +00:00
parent af64009368
commit bf6f30a42c
2 changed files with 124 additions and 306 deletions

View File

@@ -49,17 +49,6 @@ orte_pls_base_module_t* orte_pls_process_component_init(int *priority);
*/
int orte_pls_process_finalize(void);
/*
* Interface
*/
int orte_pls_process_launch(orte_jobid_t);
int orte_pls_process_terminate_job(orte_jobid_t, struct timeval *timeout, opal_list_t*);
int orte_pls_process_terminate_orteds(orte_jobid_t, struct timeval *timeout, opal_list_t*);
int orte_pls_process_terminate_proc(const orte_process_name_t* proc_name);
int orte_pls_process_signal_job(orte_jobid_t, int32_t, opal_list_t*);
int orte_pls_process_signal_proc(const orte_process_name_t* proc_name, int32_t);
int orte_pls_process_cancel_operation(void);
/**
* PLS Component
*/
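For orientation, the declarations above, together with the module table in the second file, give the shape a PLS module must now fill in. The sketch below is reconstructed from this diff alone; the typedef names and stand-in types are illustrative, not the real ORTE definitions.

#include <stdint.h>
#include <sys/time.h>

/* Stand-in typedefs so the sketch is self-contained; the real types live
 * in the ORTE/OPAL headers. */
typedef uint32_t orte_jobid_t;
typedef struct opal_list_t opal_list_t;
typedef struct orte_process_name_t orte_process_name_t;

/* Entry points of a PLS module after this commit.  Note that
 * terminate_orteds loses its jobid argument and cancel_operation
 * disappears from the interface entirely. */
typedef struct {
    int (*launch)(orte_jobid_t jobid);
    int (*terminate_job)(orte_jobid_t jobid, struct timeval *timeout,
                         opal_list_t *attrs);
    int (*terminate_orteds)(struct timeval *timeout, opal_list_t *attrs);
    int (*terminate_proc)(const orte_process_name_t *proc_name);
    int (*signal_job)(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs);
    int (*signal_proc)(const orte_process_name_t *proc_name, int32_t signal);
    int (*finalize)(void);
} pls_module_sketch_t;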

View File

@@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* Copyright (c) 2004-2007 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
@@ -30,12 +30,6 @@
#include <stdlib.h>
#include <windows.h> //daniel
#include <process.h> //
#include <stdio.h> //
#include <tchar.h> //daniel
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
@@ -62,7 +56,7 @@
#include <pwd.h>
#endif
#include "opal/install_dirs.h"
#include "opal/mca/installdirs/installdirs.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/if.h"
#include "opal/util/os_path.h"
@@ -80,6 +74,8 @@
#include "orte/util/session_dir.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_wakeup.h"
#include "orte/runtime/params.h"
#include "orte/dss/dss.h"
#include "orte/mca/ns/ns.h"
@@ -102,6 +98,15 @@
static int orte_pls_process_launch_threaded(orte_jobid_t jobid);
#endif
/*
* Interface
*/
static int orte_pls_process_launch(orte_jobid_t);
static int orte_pls_process_terminate_job(orte_jobid_t, struct timeval *timeout, opal_list_t*);
static int orte_pls_process_terminate_orteds(struct timeval *timeout, opal_list_t*);
static int orte_pls_process_terminate_proc(const orte_process_name_t* proc_name);
static int orte_pls_process_signal_job(orte_jobid_t, int32_t, opal_list_t*);
static int orte_pls_process_signal_proc(const orte_process_name_t* proc_name, int32_t);
orte_pls_base_module_t orte_pls_process_module = {
#if OMPI_HAVE_POSIX_THREADS && OMPI_THREADS_HAVE_DIFFERENT_PIDS && OMPI_ENABLE_PROGRESS_THREADS
@@ -114,7 +119,6 @@ orte_pls_base_module_t orte_pls_process_module = {
orte_pls_process_terminate_proc,
orte_pls_process_signal_job,
orte_pls_process_signal_proc,
orte_pls_process_cancel_operation,
orte_pls_process_finalize
};
@@ -122,6 +126,7 @@ static void set_handler_default(int sig);
enum {
ORTE_PLS_RSH_SHELL_BASH = 0,
ORTE_PLS_RSH_SHELL_ZSH,
ORTE_PLS_RSH_SHELL_TCSH,
ORTE_PLS_RSH_SHELL_CSH,
ORTE_PLS_RSH_SHELL_KSH,
@@ -133,6 +138,7 @@ typedef int orte_pls_process_shell;
static const char * orte_pls_process_shell_name[] = {
"bash",
"zsh",
"tcsh", /* tcsh has to be first otherwise strstr finds csh */
"csh",
"ksh",
@@ -142,14 +148,10 @@ static const char * orte_pls_process_shell_name[] = {
/* local global storage of timing variables */
static unsigned long mintime=999999999, miniter, maxtime=0, maxiter;
static float avgtime=0.0;
static struct timeval *launchstart;
static struct timeval joblaunchstart, joblaunchstop;
/* local global storage of the list of active daemons */
static opal_list_t active_daemons;
/* global storage of active jobid being launched */
static orte_jobid_t active_job = ORTE_JOBID_INVALID;
/**
* Check the Shell variable on the specified node
@@ -350,14 +352,14 @@ static int orte_pls_process_fill_exec_path( char ** exec_path )
{
struct stat buf;
asprintf(exec_path, "%s/orted", OPAL_BINDIR);
asprintf(exec_path, "%s/orted", opal_install_dirs.bindir);
if (0 != stat(*exec_path, &buf)) {
char *path = getenv("PATH");
if (NULL == path) {
path = "PATH is empty!";
}
opal_show_help("help-pls-process.txt", "no-local-orted",
true, path, OPAL_BINDIR);
true, path, opal_install_dirs.bindir);
return ORTE_ERR_NOT_FOUND;
}
return ORTE_SUCCESS;
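This hunk is part of the commit's move from the compile-time OPAL_BINDIR macro to the runtime opal_install_dirs structure provided by the installdirs framework included above. Below is a minimal sketch of the resulting pattern; the structure layout and the sample path are simplified assumptions, not the real installdirs API:

#define _GNU_SOURCE   /* for asprintf on glibc */
#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for the installdirs data: resolved at runtime, so a
 * relocated installation still finds its binaries. */
struct install_dirs { const char *bindir; };
static struct install_dirs dirs = { "/opt/openmpi/bin" };  /* hypothetical */

int main(void)
{
    char *exec_path = NULL;
    /* same asprintf pattern as orte_pls_process_fill_exec_path() above */
    if (asprintf(&exec_path, "%s/orted", dirs.bindir) < 0) return 1;
    printf("%s\n", exec_path);   /* -> /opt/openmpi/bin/orted */
    free(exec_path);
    return 0;
}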
@@ -370,56 +372,12 @@ static int orte_pls_process_fill_exec_path( char ** exec_path )
static void orte_pls_process_wait_daemon(pid_t pid, int status, void* cbdata)
{
orte_pls_daemon_info_t *info = (orte_pls_daemon_info_t*) cbdata;
orte_mapped_node_t *node;
orte_mapped_proc_t *proc;
opal_list_item_t *item;
int rc;
unsigned long deltat;
struct timeval launchstop;
/* if ssh exited abnormally, set the child processes to aborted
and print something useful to the user. The usual reasons for
ssh to exit abnormally all are a pretty good indication that
the child processes aren't going to start up properly.
This should somehow be pushed up to the calling level, but we
don't really have a way to do that just yet.
*/
if (! WIFEXITED(status) || 0 != WEXITSTATUS(status)) {
/* get the mapping for our node so we can cancel the right things */
rc = orte_rmaps.get_node_map(&node, info->cell,
info->nodename, info->active_job);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* set state of all processes associated with the daemon as
terminated */
for(item = opal_list_get_first(&node->procs);
item != opal_list_get_end(&node->procs);
item = opal_list_get_next(item)) {
proc = (orte_mapped_proc_t*) item;
/* Clean up the session directory as if we were the
process itself. This covers the case where the
process died abnormally and didn't cleanup its own
session directory. */
orte_session_dir_finalize(&(proc->name));
rc = orte_smr.set_proc_state(&(proc->name),
ORTE_PROC_STATE_ABORTED, status);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
}
}
OBJ_RELEASE(node);
cleanup:
/* tell the user something went wrong */
opal_output(0, "ERROR: A daemon on node %s failed to start as expected.",
info->nodename);
opal_output(0, "ERROR: A daemon failed to start as expected.");
opal_output(0, "ERROR: There may be more information available from");
opal_output(0, "ERROR: the remote shell (see above).");
@@ -440,38 +398,23 @@ static void orte_pls_process_wait_daemon(pid_t pid, int status, void* cbdata)
} else {
opal_output(0, "No extra status information is available: %d.", status);
}
OPAL_THREAD_LOCK(&mca_pls_process_component.lock);
/* tell the system that this daemon is gone */
if (ORTE_SUCCESS != (rc = orte_pls_base_remove_daemon(info))) {
/* The usual reasons for ssh to exit abnormally all are a pretty good
indication that the child processes aren't going to start up properly.
Set the job state to indicate we failed to launch so orterun's exit status
will be non-zero and forcibly terminate the job so orterun can exit
*/
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(active_job, ORTE_JOB_STATE_FAILED_TO_START))) {
ORTE_ERROR_LOG(rc);
}
/* remove the daemon from our local list */
opal_list_remove_item(&active_daemons, &info->super);
OBJ_RELEASE(info);
OPAL_THREAD_UNLOCK(&mca_pls_process_component.lock);
if (ORTE_SUCCESS != (rc = orte_wakeup(active_job))) {
ORTE_ERROR_LOG(rc);
}
} /* if abnormal exit */
/* release any waiting threads */
OPAL_THREAD_LOCK(&mca_pls_process_component.lock);
/* first check timing request */
if (mca_pls_process_component.timing) {
if (0 != gettimeofday(&launchstop, NULL)) {
opal_output(0, "pls_process: could not obtain stop time");
} else {
deltat = (launchstop.tv_sec - launchstart[info->name->vpid].tv_sec)*1000000 +
(launchstop.tv_usec - launchstart[info->name->vpid].tv_usec);
avgtime = avgtime + deltat;
if (deltat < mintime) {
mintime = deltat;
miniter = (unsigned long)info->name->vpid;
}
if (deltat > maxtime) {
maxtime = deltat;
maxiter = (unsigned long)info->name->vpid;
}
}
}
if (mca_pls_process_component.num_children-- >=
mca_pls_process_component.num_concurrent ||
@@ -486,17 +429,7 @@ static void orte_pls_process_wait_daemon(pid_t pid, int status, void* cbdata)
deltat = (joblaunchstop.tv_sec - joblaunchstart.tv_sec)*1000000 +
(joblaunchstop.tv_usec - joblaunchstart.tv_usec);
opal_output(0, "pls_process: total time to launch job is %lu usec", deltat);
if (mintime < 999999999) {
/* had at least one non-local node */
avgtime = avgtime/opal_list_get_size(&active_daemons);
opal_output(0, "pls_process: average time to launch one daemon %f usec", avgtime);
opal_output(0, "pls_process: min time to launch a daemon was %lu usec for iter %lu", mintime, miniter);
opal_output(0, "pls_process: max time to launch a daemon was %lu usec for iter %lu", maxtime, maxiter);
} else {
opal_output(0, "No nonlocal launches to report for timing info");
}
}
free(launchstart);
}
OPAL_THREAD_UNLOCK(&mca_pls_process_component.lock);
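The per-daemon timing removed above, and the job-level timing kept here, both reduce two gettimeofday() samples to a microsecond delta. A self-contained sketch of that arithmetic:

#include <stdio.h>
#include <sys/time.h>

/* Same formula as the deltat computation above. */
static long usec_between(const struct timeval *start, const struct timeval *stop)
{
    return (stop->tv_sec - start->tv_sec) * 1000000L
         + (stop->tv_usec - start->tv_usec);
}

int main(void)
{
    struct timeval t0, t1;
    gettimeofday(&t0, NULL);
    /* ... the work being timed ... */
    gettimeofday(&t1, NULL);
    printf("elapsed: %ld usec\n", usec_between(&t0, &t1));
    return 0;
}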
@@ -510,22 +443,21 @@ static void orte_pls_process_wait_daemon(pid_t pid, int status, void* cbdata)
int orte_pls_process_launch(orte_jobid_t jobid)
{
orte_job_map_t *map;
orte_job_map_t *map = NULL;
opal_list_item_t *n_item;
orte_mapped_node_t *rmaps_node;
orte_std_cntr_t num_nodes;
orte_vpid_t vpid;
int node_name_index2;
int proc_name_index;
int local_exec_index;
char *jobid_string = NULL;
char *uri, *param;
char *param;
char **argv = NULL;
char *prefix_dir;
int argc = 0;
int rc;
char *lib_base = NULL, *bin_base = NULL;
orte_pls_daemon_info_t *dmn;
bool failed_launch = true;
if (mca_pls_process_component.timing) {
if (0 != gettimeofday(&joblaunchstart, NULL)) {
@@ -535,11 +467,8 @@ int orte_pls_process_launch(orte_jobid_t jobid)
}
}
/* setup a list that will contain the info for all the daemons
* so we can store it on the registry when done and use it
* locally to track their state
*/
OBJ_CONSTRUCT(&active_daemons, opal_list_t);
/* set the active jobid */
active_job = jobid;
/* Get the map for this job
* We need the entire mapping for a couple of reasons:
@@ -550,40 +479,42 @@ int orte_pls_process_launch(orte_jobid_t jobid)
rc = orte_rmaps.get_job_map(&map, jobid);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&active_daemons);
return rc;
}
/* if the user requested that we re-use daemons,
* launch the procs on any existing, re-usable daemons
*/
if (orte_pls_base.reuse_daemons) {
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(map);
OBJ_DESTRUCT(&active_daemons);
return rc;
}
}
num_nodes = (orte_std_cntr_t)opal_list_get_size(&map->nodes);
if (0 == num_nodes) {
/* nothing left to do - just return */
/* account for any reuse of daemons */
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
num_nodes = map->num_new_daemons;
if (0 == num_nodes) {
/* nothing to do - just return */
failed_launch = false;
OBJ_RELEASE(map);
OBJ_DESTRUCT(&active_daemons);
return ORTE_SUCCESS;
}
if (mca_pls_process_component.debug_daemons &&
mca_pls_process_component.num_concurrent < num_nodes) {
/* we can't run in this situation, so pretty print the error
* and exit
/**
* If we are in '--debug-daemons' we keep the ssh connection
* alive for the span of the run. If we use this option
* AND we launch on more than "num_concurrent" machines
* then we will deadlock. No connections are terminated
* until the job is complete, no job is started
* since all the orteds are waiting for all the others
* to come online, and the others are not launched because
* we are waiting on those that have started to terminate
* their ssh tunnels. :(
* As we cannot run in this situation, pretty print the error
* and return an error code.
*/
opal_show_help("help-pls-process.txt", "deadlock-params",
true, mca_pls_process_component.num_concurrent, num_nodes);
OBJ_RELEASE(map);
OBJ_DESTRUCT(&active_daemons);
return ORTE_ERR_FATAL;
rc = ORTE_ERR_FATAL;
goto cleanup;
}
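The deadlock the comment above describes comes from the component's launch throttle: at most num_concurrent daemons may be in flight, and a slot is only freed when a launch completes. Here is a POSIX-threads sketch of that pattern; the names mirror the component fields, but this is an illustration, not the ORTE implementation, which does its own accounting around opal_condition_wait:

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static int num_children   = 0;
static int num_concurrent = 128;   /* illustrative limit */

/* Called before each launch: block while the pipeline is full.  Under
 * --debug-daemons the connections never finish, so the slots never free
 * up - exactly the deadlock described above. */
void launch_slot_acquire(void)
{
    pthread_mutex_lock(&lock);
    while (num_children >= num_concurrent)
        pthread_cond_wait(&cond, &lock);
    num_children++;
    pthread_mutex_unlock(&lock);
}

/* Called from the wait callback when a daemon launch completes. */
void launch_slot_release(void)
{
    pthread_mutex_lock(&lock);
    num_children--;
    pthread_cond_signal(&cond);
    pthread_mutex_unlock(&lock);
}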
/*
@@ -605,29 +536,6 @@
*/
prefix_dir = map->apps[0]->prefix_dir;
/*
* Allocate a range of vpids for the daemons.
*/
if (num_nodes == 0) {
return ORTE_ERR_BAD_PARAM;
}
rc = orte_ns.reserve_range(0, num_nodes, &vpid);
if (ORTE_SUCCESS != rc) {
goto cleanup;
}
/* setup the orted triggers for passing their launch info */
if (ORTE_SUCCESS != (rc = orte_smr.init_orted_stage_gates(jobid, num_nodes, NULL, NULL))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* need integer value for command line parameter */
if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&jobid_string, jobid))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/*
* Build argv array
*/
@@ -641,9 +549,7 @@ int orte_pls_process_launch(orte_jobid_t jobid)
orte_pls_base_orted_append_basic_args(&argc, &argv,
&proc_name_index,
&node_name_index2,
jobid_string,
(vpid + num_nodes)
);
map->num_nodes);
if (mca_pls_process_component.debug) {
param = opal_argv_join(argv, ' ');
@@ -684,63 +590,25 @@ int orte_pls_process_launch(orte_jobid_t jobid)
and use that on the remote node.
*/
lib_base = opal_basename(OPAL_LIBDIR);
bin_base = opal_basename(OPAL_BINDIR);
lib_base = opal_basename(opal_install_dirs.libdir);
bin_base = opal_basename(opal_install_dirs.bindir);
/*
* Iterate through each of the nodes
*/
if (mca_pls_process_component.timing) {
/* allocate space to track the start times */
launchstart = (struct timeval*)malloc((num_nodes+vpid) * sizeof(struct timeval));
}
for(n_item = opal_list_get_first(&map->nodes);
n_item != opal_list_get_end(&map->nodes);
n_item = opal_list_get_next(n_item)) {
orte_process_name_t* name;
pid_t pid;
char *exec_path = NULL;
char **exec_argv;
rmaps_node = (orte_mapped_node_t*)n_item;
if (mca_pls_process_component.timing) {
if (0 != gettimeofday(&launchstart[vpid], NULL)) {
opal_output(0, "pls_process: could not obtain start time");
}
}
/* new daemon - setup to record its info */
dmn = OBJ_NEW(orte_pls_daemon_info_t);
dmn->active_job = jobid;
opal_list_append(&active_daemons, &dmn->super);
/* setup node name */
free(argv[node_name_index2]);
argv[node_name_index2] = strdup(rmaps_node->nodename);
/* save it in the daemon info */
dmn->nodename = strdup(rmaps_node->nodename);
/* initialize daemons process name */
rc = orte_ns.create_process_name(&name, rmaps_node->cell, 0, vpid);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* save it in the daemon info */
dmn->cell = rmaps_node->cell;
if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&(dmn->name), name, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* set the process state to "launched" */
if (ORTE_SUCCESS != (rc = orte_smr.set_proc_state(name, ORTE_PROC_STATE_LAUNCHED, 0))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
/* if this daemon already exists, don't launch it! */
if (rmaps_node->daemon_preexists) {
continue;
}
if (mca_pls_process_component.debug) {
@@ -772,15 +640,7 @@ int orte_pls_process_launch(orte_jobid_t jobid)
opal_output(0, "pls:process: %s is a LOCAL node\n",
rmaps_node->nodename);
}
if (mca_pls_process_component.timing) {
/* since this is a local launch, the daemon will never reach
* the waitpid callback - so set the start value to
* something nonsensical
*/
launchstart[vpid].tv_sec = 0;
launchstart[vpid].tv_usec = 0;
}
exec_argv = &argv[local_exec_index];
/* If the user provides a prefix, first try to find the application there */
if( NULL != prefix_dir ) {
@@ -798,7 +658,7 @@ int orte_pls_process_launch(orte_jobid_t jobid)
if( NULL == exec_path ) {
char* full_path[2];
full_path[0] = opal_os_path( false, OPAL_BINDIR, NULL );
full_path[0] = opal_os_path( false, opal_install_dirs.bindir, NULL );
full_path[1] = NULL;
exec_path = opal_path_find(exec_argv[0], full_path, F_OK, NULL);
free(full_path[0]);
@@ -881,9 +741,9 @@ int orte_pls_process_launch(orte_jobid_t jobid)
#endif
/* setup process name */
rc = orte_ns.get_proc_name_string(&name_string, name);
rc = orte_ns.get_proc_name_string(&name_string, rmaps_node->daemon);
if (ORTE_SUCCESS != rc) {
opal_output(0, "orte_pls_process: unable to create process name");
opal_output(0, "orte_pls_process: unable to get daemon name as string");
exit(-1);
}
free(argv[proc_name_index]);
@@ -915,31 +775,51 @@ int orte_pls_process_launch(orte_jobid_t jobid)
free(param);
}
}
pid = _spawnve( _P_NOWAIT, exec_path, exec_argv, env); //,NULL); daniel
if (pid == -1) opal_output(0, "pls:process: execv failed spawning process %s; errno=%d\n", exec_path, errno);
else opal_output(0, "pls:process: execv %s hopefully started (pid %d)\n", exec_path, pid);
pid = _spawnve( _P_NOWAIT, exec_path, exec_argv, env);
if (pid == -1) {
failed_launch = true;
rc = ORTE_ERROR;
goto cleanup;
}
/* indicate this daemon has been launched in case anyone is sitting on that trigger */
if (ORTE_SUCCESS != (rc = orte_smr.set_proc_state(rmaps_node->daemon, ORTE_PROC_STATE_LAUNCHED, 0))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
opal_output(0, "pls:process: _spawnve %s started (pid %d)\n", exec_path, pid);
OPAL_THREAD_LOCK(&mca_pls_process_component.lock);
/* This situation can lead to a deadlock if '--debug-daemons' is set.
* However, the deadlock condition is tested at the beginning of this
* function, so we're quite confident it should not happen here.
*/
if (mca_pls_process_component.num_children++ >=
mca_pls_process_component.num_concurrent) {
opal_condition_wait(&mca_pls_process_component.cond, &mca_pls_process_component.lock);
}
OPAL_THREAD_UNLOCK(&mca_pls_process_component.lock);
/* setup callback on sigchild - wait until setup above is complete
* as the callback can occur in the call to orte_wait_cb
*/
orte_wait_cb(pid, orte_pls_process_wait_daemon, dmn);
orte_wait_cb(pid, orte_pls_process_wait_daemon, NULL);
/* if required - add delay to avoid problems w/ X11 authentication */
if (mca_pls_process_component.debug && mca_pls_process_component.delay) {
sleep(mca_pls_process_component.delay);
}
vpid++;
}
free(name);
}
/* all done, so store the daemon info on the registry */
if (ORTE_SUCCESS != (rc = orte_pls_base_store_active_daemons(&active_daemons))) {
ORTE_ERROR_LOG(rc);
}
cleanup:
OBJ_RELEASE(map);
if (NULL != map) {
OBJ_RELEASE(map);
}
if (NULL != lib_base) {
free(lib_base);
@@ -948,8 +828,23 @@ int orte_pls_process_launch(orte_jobid_t jobid)
free(bin_base);
}
if (NULL != jobid_string) free(jobid_string); /* done with this variable */
if (NULL != argv) opal_argv_free(argv);
if (NULL != jobid_string) {
free(jobid_string); /* done with this variable */
}
if (NULL != argv) {
opal_argv_free(argv);
}
/* check for failed launch - if so, force terminate */
if( failed_launch ) {
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(jobid, ORTE_JOB_STATE_FAILED_TO_START))) {
ORTE_ERROR_LOG(rc);
}
if (ORTE_SUCCESS != (rc = orte_wakeup(jobid))) {
ORTE_ERROR_LOG(rc);
}
}
return rc;
}
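The launch sequence above uses the Windows CRT spawn family rather than fork/exec. A minimal Windows-only sketch of the _spawnve(_P_NOWAIT, ...) launch-and-collect pattern follows; the paths, arguments, and environment entry are hypothetical:

#include <process.h>
#include <stdio.h>

int main(void)
{
    const char *const argv[] = { "orted.exe", "--debug", NULL };
    const char *const env[]  = { "OMPI_EXAMPLE=1", NULL };   /* hypothetical */

    /* _P_NOWAIT returns immediately with the child's handle/pid. */
    intptr_t pid = _spawnve(_P_NOWAIT, "C:\\openmpi\\bin\\orted.exe",
                            argv, env);
    if (-1 == pid) {
        perror("_spawnve");
        return 1;
    }
    /* The real code registers orte_wait_cb(); here we just reap inline. */
    int status;
    _cwait(&status, pid, _WAIT_CHILD);
    printf("daemon exited with %d\n", status);
    return 0;
}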
@@ -961,60 +856,27 @@ int orte_pls_process_launch(orte_jobid_t jobid)
int orte_pls_process_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
{
int rc;
opal_list_t daemons;
opal_list_item_t *item;
OPAL_TRACE(1);
/* construct the list of active daemons on this job */
OBJ_CONSTRUCT(&daemons, opal_list_t);
if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid, attrs))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
/* order them to kill their local procs for this job */
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid, timeout))) {
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(jobid, timeout, attrs))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
CLEANUP:
while (NULL != (item = opal_list_remove_first(&daemons))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&daemons);
return rc;
}
/**
* Terminate the orteds for a given job
*/
int orte_pls_process_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
int orte_pls_process_terminate_orteds(struct timeval *timeout, opal_list_t *attrs)
{
int rc;
opal_list_t daemons;
opal_list_item_t *item;
OPAL_TRACE(1);
/* construct the list of active daemons on this job */
OBJ_CONSTRUCT(&daemons, opal_list_t);
if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid, attrs))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
/* now tell them to die! */
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons, timeout))) {
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(timeout, attrs))) {
ORTE_ERROR_LOG(rc);
}
CLEANUP:
while (NULL != (item = opal_list_remove_first(&daemons))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&daemons);
return rc;
}
@@ -1031,28 +893,12 @@ int orte_pls_process_terminate_proc(const orte_process_name_t* proc)
int orte_pls_process_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs)
{
int rc;
opal_list_t daemons;
opal_list_item_t *item;
OPAL_TRACE(1);
/* construct the list of active daemons on this job */
OBJ_CONSTRUCT(&daemons, opal_list_t);
if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid, attrs))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&daemons);
return rc;
}
/* order them to pass this signal to their local procs */
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_signal_local_procs(jobid, signal, &daemons))) {
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_signal_local_procs(jobid, signal, attrs))) {
ORTE_ERROR_LOG(rc);
}
while (NULL != (item = opal_list_remove_first(&daemons))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&daemons);
return rc;
}
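Each of the three functions above used to repeat the same construct/drain/destruct idiom around an opal_list_t of daemons before this commit moved that bookkeeping into the PLS base. A plain-C sketch of that drain loop's shape (OPAL's OBJ_* macros do reference counting; free() stands in for OBJ_RELEASE here):

#include <stdlib.h>

struct item { struct item *next; };
struct list { struct item *head; };

static struct item *list_remove_first(struct list *l)
{
    struct item *it = l->head;
    if (NULL != it) l->head = it->next;
    return it;
}

/* Same shape as:
 *   while (NULL != (item = opal_list_remove_first(&daemons)))
 *       OBJ_RELEASE(item);
 *   OBJ_DESTRUCT(&daemons);
 */
static void drain(struct list *l)
{
    struct item *it;
    while (NULL != (it = list_remove_first(l)))
        free(it);
}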
@@ -1063,23 +909,6 @@ int orte_pls_process_signal_proc(const orte_process_name_t* proc, int32_t signal
return ORTE_ERR_NOT_IMPLEMENTED;
}
/**
* Cancel an operation involving comm to an orted
*/
int orte_pls_process_cancel_operation(void)
{
int rc;
OPAL_TRACE(1);
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
int orte_pls_process_finalize(void)
{
int rc;