Add ability to trap and propagate SIGUSR1/2 to remote processes. There are a number of small changes that hit a bunch of files:
1. Changed the RMGR and PLS APIs to add "signal_job" and "signal_proc" entry points. Only the "signal_job" entries are implemented - none of the components have implementations for "signal_proc" at this time. Thus, you can signal all of the procs in a job, but cannot currently signal only one specific proc. 2. Implemented those new API functions in all components except xgrid (Brian will do so very soon). Only the rsh/ssh and fork modules have been tested, however, and only under OS-X. 3. Added signal traps and callback functions for SIGUSR1/2 to orterun/mpirun that catch those signals and call the appropriate commands to propagate them out to all processes in the job. 4. Added a new test directory under the orte branch to (eventually) hold unit and system level tests for just the run-time. Since our test branch of the repository is under restricted access, people working on the RTE were continually developing their own system-level tests - thus making it hard to help diagnose problems. I have moved the more commonly-used functions here, and added one specifically for testing the SIGUSR1/2 functionality. I will be contacting people directly to seek help with testing the changes on more environments. Other than compile issues, you should see absolutely no change in behavior on any of your systems - this additional functionality is transparent to anyone who does not issue a SIGUSR1/2 to mpirun. Ralph This commit was SVN r10258.
Этот коммит содержится в:
родитель
08823e56fa
Коммит
ee5a626d25
@ -120,6 +120,8 @@ extern "C" {
|
||||
int orte_pls_base_proxy_mca_argv(int *argc, char ***argv);
|
||||
int orte_pls_base_proxy_terminate_job(orte_jobid_t jobid);
|
||||
int orte_pls_base_proxy_terminate_proc(const orte_process_name_t *proc);
|
||||
int orte_pls_base_proxy_signal_job(orte_jobid_t jobid, int32_t signal);
|
||||
int orte_pls_base_proxy_signal_proc(const orte_process_name_t *proc, int32_t signal);
|
||||
|
||||
/**
|
||||
* Check that the cwd in an app context exists and is accessible.
|
||||
|
@ -49,7 +49,7 @@ orte_pls_base_proxy_set_node_name(orte_ras_node_t* node,
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&jobid_string, jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(values[0]);
|
||||
@ -81,7 +81,7 @@ orte_pls_base_proxy_set_node_name(orte_ras_node_t* node,
|
||||
OBJ_RELEASE(values[0]);
|
||||
free(jobid_string);
|
||||
free(key);
|
||||
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
@ -261,3 +261,155 @@ orte_pls_base_proxy_terminate_proc(const orte_process_name_t *proc)
|
||||
{
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
/**
|
||||
* This function gets called when the remote node notifies us that it has sent
|
||||
* the signal to its respective child processes.
|
||||
*/
|
||||
static void orte_pls_rsh_signal_job_rsp(
|
||||
int status,
|
||||
orte_process_name_t* peer,
|
||||
orte_buffer_t* rsp,
|
||||
orte_rml_tag_t tag,
|
||||
void* cbdata)
|
||||
{
|
||||
int rc;
|
||||
if (ORTE_SUCCESS != (rc = orte_rmgr_base_unpack_rsp(rsp))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This function gets called when the corresponding send completes. It then generates
|
||||
* a non-blocking receive so we can be notified when the action was actually completed
|
||||
* on the remote node.
|
||||
*/
|
||||
static void orte_pls_rsh_signal_job_cb(
|
||||
int status,
|
||||
orte_process_name_t* peer,
|
||||
orte_buffer_t* req,
|
||||
orte_rml_tag_t tag,
|
||||
void* cbdata)
|
||||
{
|
||||
/* wait for response */
|
||||
int rc;
|
||||
if (status < 0) {
|
||||
ORTE_ERROR_LOG(status);
|
||||
OBJ_RELEASE(req);
|
||||
return;
|
||||
}
|
||||
|
||||
if (0 > (rc = orte_rml.recv_buffer_nb(peer, ORTE_RML_TAG_RMGR_CLNT, 0, orte_pls_rsh_signal_job_rsp, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
OBJ_RELEASE(req);
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
orte_pls_base_proxy_signal_job(orte_jobid_t jobid, int32_t signal)
|
||||
{
|
||||
char *keys[2];
|
||||
char *jobid_string;
|
||||
orte_gpr_value_t** values = NULL;
|
||||
size_t i, j, num_values = 0;
|
||||
orte_process_name_t proc, *pnptr;
|
||||
int rc;
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&jobid_string, jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
asprintf(&keys[0], "%s-%s", ORTE_NODE_BOOTPROXY_KEY, jobid_string);
|
||||
keys[1] = NULL;
|
||||
|
||||
rc = orte_gpr.get(
|
||||
ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
|
||||
ORTE_NODE_SEGMENT,
|
||||
NULL,
|
||||
keys,
|
||||
&num_values,
|
||||
&values
|
||||
);
|
||||
if (rc != ORTE_SUCCESS) {
|
||||
free(jobid_string);
|
||||
return rc;
|
||||
}
|
||||
if (0 == num_values) {
|
||||
rc = ORTE_ERR_NOT_FOUND;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
for(i=0; i<num_values; i++) {
|
||||
orte_gpr_value_t* value = values[i];
|
||||
for(j=0; j<value->cnt; j++) {
|
||||
orte_gpr_keyval_t* keyval = value->keyvals[j];
|
||||
orte_buffer_t *cmd = OBJ_NEW(orte_buffer_t);
|
||||
int ret;
|
||||
if (cmd == NULL) {
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
if (strcmp(keyval->key, keys[0]) != 0)
|
||||
continue;
|
||||
|
||||
/** construct command */
|
||||
ret = orte_rmgr_base_pack_signal_job_cmd(cmd, jobid, signal);
|
||||
if (ORTE_SUCCESS != ret) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(cmd);
|
||||
rc = ret;
|
||||
continue;
|
||||
}
|
||||
|
||||
/** get the process name from the returned keyval */
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&pnptr, values[i]->keyvals[0]->value, ORTE_NAME))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(cmd);
|
||||
rc = ret;
|
||||
continue;
|
||||
}
|
||||
proc = *pnptr;
|
||||
|
||||
/** send a signal message to the bootproxy on each node */
|
||||
if (0 > (ret = orte_rml.send_buffer_nb(
|
||||
&proc,
|
||||
cmd,
|
||||
ORTE_RML_TAG_RMGR_SVC,
|
||||
0,
|
||||
orte_pls_rsh_signal_job_cb,
|
||||
NULL))) {
|
||||
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(cmd);
|
||||
rc = ret;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cleanup:
|
||||
|
||||
free(jobid_string);
|
||||
free(keys[0]);
|
||||
|
||||
if (NULL != values) {
|
||||
for(i=0; i<num_values; i++) {
|
||||
if (NULL != values[i]) {
|
||||
OBJ_RELEASE(values[i]);
|
||||
}
|
||||
}
|
||||
if (NULL != values ) free(values);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
orte_pls_base_proxy_signal_proc(const orte_process_name_t *proc, int32_t signal)
|
||||
{
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
@ -76,8 +76,8 @@ extern char **environ;
|
||||
#if OMPI_HAVE_POSIX_THREADS && OMPI_THREADS_HAVE_DIFFERENT_PIDS
|
||||
int orte_pls_bproc_launch_threaded(orte_jobid_t);
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Initialization of the bproc module with all the needed function pointers
|
||||
*/
|
||||
@ -89,6 +89,8 @@ orte_pls_base_module_t orte_pls_bproc_module = {
|
||||
#endif
|
||||
orte_pls_bproc_terminate_job,
|
||||
orte_pls_bproc_terminate_proc,
|
||||
orte_pls_bproc_signal_job,
|
||||
orte_pls_bproc_signal_proc,
|
||||
orte_pls_bproc_finalize
|
||||
};
|
||||
|
||||
@ -1029,10 +1031,58 @@ int orte_pls_bproc_terminate_proc(const orte_process_name_t* proc_name) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Signal all processes associated with this job
|
||||
*/
|
||||
int orte_pls_bproc_signal_job(orte_jobid_t jobid, int32_t signal) {
|
||||
pid_t* pids;
|
||||
size_t i, num_pids;
|
||||
int rc;
|
||||
|
||||
/* signal application process */
|
||||
if(ORTE_SUCCESS != (rc = orte_pls_base_get_proc_pids(jobid, &pids, &num_pids)))
|
||||
return rc;
|
||||
for(i=0; i<num_pids; i++) {
|
||||
if(mca_pls_bproc_component.debug) {
|
||||
opal_output(0, "orte_pls_bproc: signaling proc: %d\n", pids[i]);
|
||||
}
|
||||
kill(pids[i], (int)signal);
|
||||
}
|
||||
if(NULL != pids)
|
||||
free(pids);
|
||||
|
||||
/** dont signal daemons - this is strictly for signalling application processes */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Signal a specific process.
|
||||
*/
|
||||
int orte_pls_bproc_signal_proc(const orte_process_name_t* proc_name, int32_t signal) {
|
||||
int rc;
|
||||
pid_t pid;
|
||||
|
||||
if(ORTE_SUCCESS != (rc = orte_pls_base_get_proc_pid(proc_name, &pid)))
|
||||
return rc;
|
||||
if(kill(pid, (int)signal) != 0) {
|
||||
switch(errno) {
|
||||
case EINVAL:
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
case ESRCH:
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
case EPERM:
|
||||
return ORTE_ERR_PERM;
|
||||
default:
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Module cleanup
|
||||
*/
|
||||
int orte_pls_bproc_finalize(void)
|
||||
int orte_pls_bproc_finalize(void)
|
||||
{
|
||||
/* wait for all daemons */
|
||||
OPAL_THREAD_LOCK(&mca_pls_bproc_component.lock);
|
||||
@ -1047,9 +1097,9 @@ int orte_pls_bproc_finalize(void)
|
||||
/*
|
||||
* Handle threading issues.
|
||||
*/
|
||||
|
||||
|
||||
#if OMPI_HAVE_POSIX_THREADS && OMPI_THREADS_HAVE_DIFFERENT_PIDS
|
||||
|
||||
|
||||
struct orte_pls_bproc_stack_t {
|
||||
opal_condition_t cond;
|
||||
opal_mutex_t mutex;
|
||||
@ -1058,7 +1108,7 @@ struct orte_pls_bproc_stack_t {
|
||||
int rc;
|
||||
};
|
||||
typedef struct orte_pls_bproc_stack_t orte_pls_bproc_stack_t;
|
||||
|
||||
|
||||
static void orte_pls_bproc_stack_construct(orte_pls_bproc_stack_t* stack)
|
||||
{
|
||||
OBJ_CONSTRUCT(&stack->mutex, opal_mutex_t);
|
||||
@ -1066,23 +1116,23 @@ static void orte_pls_bproc_stack_construct(orte_pls_bproc_stack_t* stack)
|
||||
stack->rc = 0;
|
||||
stack->complete = false;
|
||||
}
|
||||
|
||||
|
||||
static void orte_pls_bproc_stack_destruct(orte_pls_bproc_stack_t* stack)
|
||||
{
|
||||
OBJ_DESTRUCT(&stack->mutex);
|
||||
OBJ_DESTRUCT(&stack->cond);
|
||||
}
|
||||
|
||||
|
||||
static OBJ_CLASS_INSTANCE(
|
||||
orte_pls_bproc_stack_t,
|
||||
opal_object_t,
|
||||
orte_pls_bproc_stack_construct,
|
||||
orte_pls_bproc_stack_destruct);
|
||||
|
||||
|
||||
|
||||
|
||||
static void orte_pls_bproc_launch_cb(int fd, short event, void* args)
|
||||
{
|
||||
|
||||
|
||||
orte_pls_bproc_stack_t *stack = (orte_pls_bproc_stack_t*)args;
|
||||
stack->rc = orte_pls_bproc_launch(stack->jobid);
|
||||
OPAL_THREAD_LOCK(&stack->mutex);
|
||||
@ -1096,13 +1146,13 @@ int orte_pls_bproc_launch_threaded(orte_jobid_t jobid)
|
||||
struct timeval tv = { 0, 0 };
|
||||
struct opal_event event;
|
||||
struct orte_pls_bproc_stack_t stack;
|
||||
|
||||
|
||||
OBJ_CONSTRUCT(&stack, orte_pls_bproc_stack_t);
|
||||
|
||||
|
||||
stack.jobid = jobid;
|
||||
opal_evtimer_set(&event, orte_pls_bproc_launch_cb, &stack);
|
||||
opal_evtimer_add(&event, &tv);
|
||||
|
||||
|
||||
OPAL_THREAD_LOCK(&stack.mutex);
|
||||
while(stack.complete == false)
|
||||
opal_condition_wait(&stack.cond, &stack.mutex);
|
||||
@ -1110,6 +1160,6 @@ int orte_pls_bproc_launch_threaded(orte_jobid_t jobid)
|
||||
OBJ_DESTRUCT(&stack);
|
||||
return stack.rc;
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -1,19 +1,19 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
*
|
||||
@ -22,16 +22,16 @@
|
||||
* @file:
|
||||
* Header file for the bproc launcher. This launcher is actually split into 2
|
||||
* modules: pls_bproc & pls_bproc_orted. The general idea behind this launcher is:
|
||||
* -# pls_bproc is called by orterun. It figures out the process mapping and
|
||||
* -# pls_bproc is called by orterun. It figures out the process mapping and
|
||||
* launches orted's on the nodes
|
||||
* -# pls_bproc_orted is called by orted. This module initializes either a pty or
|
||||
* pipes, places symlinks to them in well-known points of the filesystem, and
|
||||
* sets up the io forwarding. It then sends an ack back to orterun.
|
||||
* -# pls_bproc waits for an ack to come back from the orteds, then does several
|
||||
* parallel launches of the application processes. The number of launches is
|
||||
* parallel launches of the application processes. The number of launches is
|
||||
* equal to the maximum number of processes on a node. For example, if there
|
||||
* were 2 processes assigned to node 1, and 1 process assigned to node 2, we
|
||||
* would do a parallel launch that launches one process on each node, then
|
||||
* would do a parallel launch that launches one process on each node, then
|
||||
* another which launches another process on node 1.
|
||||
*/
|
||||
|
||||
@ -68,6 +68,8 @@ int orte_pls_bproc_finalize(void);
|
||||
int orte_pls_bproc_launch(orte_jobid_t);
|
||||
int orte_pls_bproc_terminate_job(orte_jobid_t);
|
||||
int orte_pls_bproc_terminate_proc(const orte_process_name_t* proc_name);
|
||||
int orte_pls_bproc_signal_job(orte_jobid_t, int32_t);
|
||||
int orte_pls_bproc_signal_proc(const orte_process_name_t* proc_name, int32_t);
|
||||
|
||||
/**
|
||||
* PLS bproc Component
|
||||
@ -77,7 +79,7 @@ struct orte_pls_bproc_component_t {
|
||||
/**< The base class */
|
||||
bool done_launching;
|
||||
/**< Is true if we are done launching the user's app. */
|
||||
char * orted;
|
||||
char * orted;
|
||||
/**< The orted executable. This can be an absolute path, or if not found
|
||||
* we will look for it in the user's path */
|
||||
int debug;
|
||||
|
@ -58,6 +58,8 @@ orte_pls_base_module_1_0_0_t orte_pls_bproc_orted_module = {
|
||||
orte_pls_bproc_orted_launch,
|
||||
orte_pls_bproc_orted_terminate_job,
|
||||
orte_pls_bproc_orted_terminate_proc,
|
||||
orte_pls_bproc_orted_signal_job,
|
||||
orte_pls_bproc_orted_signal_proc,
|
||||
orte_pls_bproc_orted_finalize
|
||||
};
|
||||
|
||||
@ -404,7 +406,7 @@ int orte_pls_bproc_orted_launch(orte_jobid_t jobid) {
|
||||
#endif
|
||||
|
||||
/**
|
||||
* hack for bproc4, change process group so that we do not receive signals
|
||||
* hack for bproc4, change process group so that we do not receive signals
|
||||
* from the parent/front-end process, as bproc4 does not currently allow the
|
||||
* process to intercept the signal
|
||||
*/
|
||||
@ -550,6 +552,33 @@ int orte_pls_bproc_orted_terminate_proc(const orte_process_name_t* proc)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Function to signal a job. Since this component only runs on remote nodes
|
||||
* and doesn't actually launch any processes, this function is not needed
|
||||
* so is a noop.
|
||||
* @param jobid The job to signal
|
||||
* @param signal The signal to send
|
||||
* @retval ORTE_SUCCESS
|
||||
*/
|
||||
int orte_pls_bproc_orted_signal_job(orte_jobid_t jobid, int32_t signal)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Function to signal a process. Since this component only runs on remote nodes
|
||||
* and doesn't actually launch any processes, this function is not needed
|
||||
* so is a noop.
|
||||
* @param proc the process's name
|
||||
* @param signal The signal to send
|
||||
* @retval ORTE_SUCCESS
|
||||
*/
|
||||
int orte_pls_bproc_orted_terminate_proc(const orte_process_name_t* proc, int32_t signal)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Finalizes the bproc_orted module. Cleanup tmp directory/files
|
||||
* used for I/O forwarding.
|
||||
|
@ -5,14 +5,14 @@
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
@ -43,7 +43,7 @@ extern "C" {
|
||||
int orte_pls_bproc_orted_component_open(void);
|
||||
int orte_pls_bproc_orted_component_close(void);
|
||||
orte_pls_base_module_t* orte_pls_bproc_orted_init(int *priority);
|
||||
|
||||
|
||||
/*
|
||||
* Startup / Shutdown
|
||||
*/
|
||||
@ -55,7 +55,9 @@ int orte_pls_bproc_orted_finalize(void);
|
||||
int orte_pls_bproc_orted_launch(orte_jobid_t);
|
||||
int orte_pls_bproc_orted_terminate_job(orte_jobid_t);
|
||||
int orte_pls_bproc_orted_terminate_proc(const orte_process_name_t* proc_name);
|
||||
|
||||
int orte_pls_bproc_orted_signal_job(orte_jobid_t, int32_t);
|
||||
int orte_pls_bproc_orted_signal_proc(const orte_process_name_t* proc_name, int32_t);
|
||||
|
||||
/**
|
||||
* PLS bproc_orted component
|
||||
*/
|
||||
|
@ -70,6 +70,8 @@ orte_pls_base_module_t orte_pls_bproc_seed_module = {
|
||||
#endif
|
||||
orte_pls_bproc_seed_terminate_job,
|
||||
orte_pls_bproc_seed_terminate_proc,
|
||||
orte_pls_bproc_seed_signal_job,
|
||||
orte_pls_bproc_seed_signal_proc,
|
||||
orte_pls_bproc_seed_finalize
|
||||
};
|
||||
|
||||
@ -727,6 +729,58 @@ int orte_pls_bproc_seed_terminate_proc(const orte_process_name_t* proc_name)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Signal all processes associated with this job. Daemons are not included as this function
|
||||
* only applies to application processes.
|
||||
*/
|
||||
|
||||
int orte_pls_bproc_seed_signal_job(orte_jobid_t jobid, int32_t signal)
|
||||
{
|
||||
pid_t* pids;
|
||||
pid_t my_pid = getpid();
|
||||
size_t i, num_pids;
|
||||
int rc;
|
||||
|
||||
/** signal application process */
|
||||
if(ORTE_SUCCESS != (rc = orte_pls_base_get_proc_pids(jobid, &pids, &num_pids)))
|
||||
return rc;
|
||||
for(i=0; i<num_pids; i++) {
|
||||
if(mca_pls_bproc_seed_component.debug) {
|
||||
opal_output(0, "orte_pls_bproc: killing proc: %d\n", pids[i]);
|
||||
}
|
||||
kill(pids[i], (int)signal);
|
||||
}
|
||||
if(NULL != pids)
|
||||
free(pids);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Signal a specific process.
|
||||
*/
|
||||
int orte_pls_bproc_seed_signal_proc(const orte_process_name_t* proc_name, int32_t signal)
|
||||
{
|
||||
int rc;
|
||||
pid_t pid;
|
||||
if(ORTE_SUCCESS != (rc = orte_pls_base_get_proc_pid(proc_name, &pid)))
|
||||
return rc;
|
||||
if(kill(pid, (int)signal) != 0) {
|
||||
switch(errno) {
|
||||
case EINVAL:
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
case ESRCH:
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
case EPERM:
|
||||
return ORTE_ERR_PERM;
|
||||
default:
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Module cleanup
|
||||
*/
|
||||
|
@ -1,19 +1,19 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
*
|
||||
@ -50,6 +50,8 @@ int orte_pls_bproc_seed_finalize(void);
|
||||
int orte_pls_bproc_seed_launch(orte_jobid_t);
|
||||
int orte_pls_bproc_seed_terminate_job(orte_jobid_t);
|
||||
int orte_pls_bproc_seed_terminate_proc(const orte_process_name_t* proc_name);
|
||||
int orte_pls_bproc_seed_signal_job(orte_jobid_t, int32_t);
|
||||
int orte_pls_bproc_seed_signal_proc(const orte_process_name_t* proc_name, int32_t);
|
||||
|
||||
|
||||
/**
|
||||
@ -68,7 +70,7 @@ struct orte_pls_bproc_seed_component_t {
|
||||
opal_condition_t condition;
|
||||
};
|
||||
typedef struct orte_pls_bproc_seed_component_t orte_pls_bproc_seed_component_t;
|
||||
|
||||
|
||||
ORTE_DECLSPEC extern orte_pls_bproc_seed_component_t mca_pls_bproc_seed_component;
|
||||
ORTE_DECLSPEC extern orte_pls_base_module_t orte_pls_bproc_seed_module;
|
||||
|
||||
|
@ -5,14 +5,14 @@
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
@ -48,7 +48,9 @@ int orte_pls_fork_finalize(void);
|
||||
int orte_pls_fork_launch(orte_jobid_t);
|
||||
int orte_pls_fork_terminate_job(orte_jobid_t);
|
||||
int orte_pls_fork_terminate_proc(const orte_process_name_t* proc_name);
|
||||
|
||||
int orte_pls_fork_signal_job(orte_jobid_t, int32_t);
|
||||
int orte_pls_fork_signal_proc(const orte_process_name_t* proc_name, int32_t signal);
|
||||
|
||||
/**
|
||||
* PLS Component
|
||||
*/
|
||||
|
@ -91,13 +91,15 @@ orte_pls_base_module_1_0_0_t orte_pls_fork_module = {
|
||||
#endif
|
||||
orte_pls_fork_terminate_job,
|
||||
orte_pls_fork_terminate_proc,
|
||||
orte_pls_fork_signal_job,
|
||||
orte_pls_fork_signal_proc,
|
||||
orte_pls_fork_finalize
|
||||
};
|
||||
|
||||
static void set_handler_default(int sig);
|
||||
|
||||
|
||||
static bool orte_pls_fork_child_died(pid_t pid, unsigned int timeout)
|
||||
static bool orte_pls_fork_child_died(pid_t pid, unsigned int timeout)
|
||||
{
|
||||
time_t end;
|
||||
pid_t ret;
|
||||
@ -140,14 +142,14 @@ static void orte_pls_fork_kill_processes(opal_value_array_t *pids)
|
||||
|
||||
/* The kill succeeded. Wait up to timeout_before_sigkill
|
||||
seconds to see if it died. */
|
||||
|
||||
|
||||
if (!orte_pls_fork_child_died(pid, mca_pls_fork_component.timeout_before_sigkill)) {
|
||||
kill(pid, SIGKILL);
|
||||
/* Double check that it actually died */
|
||||
if (!orte_pls_fork_child_died(pid, mca_pls_fork_component.timeout_before_sigkill)) {
|
||||
char hostname[MAXHOSTNAMELEN];
|
||||
gethostname(hostname, sizeof(hostname));
|
||||
|
||||
|
||||
opal_show_help("help-orte-pls-fork.txt",
|
||||
"orte-pls-fork:could-not-kill",
|
||||
true, hostname, pid);
|
||||
@ -300,7 +302,7 @@ static int orte_pls_fork_proc(
|
||||
write(p[1], &i, sizeof(int));
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
|
||||
/* setup base environment: copy the current environ and merge
|
||||
in the app context environ */
|
||||
if (NULL != context->env) {
|
||||
@ -466,7 +468,7 @@ static int orte_pls_fork_proc(
|
||||
}
|
||||
|
||||
/* save the pid in the registry */
|
||||
if (ORTE_SUCCESS !=
|
||||
if (ORTE_SUCCESS !=
|
||||
(rc = orte_pls_base_set_proc_pid(&proc->proc_name, pid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
@ -542,7 +544,7 @@ int orte_pls_fork_launch(orte_jobid_t jobid)
|
||||
processes to be launched to ABORTED. This will
|
||||
cause the entire job to abort. */
|
||||
for (; i < map->num_procs; ++i) {
|
||||
orte_soh.set_proc_soh(&map->procs[i]->proc_name,
|
||||
orte_soh.set_proc_soh(&map->procs[i]->proc_name,
|
||||
ORTE_PROC_STATE_ABORTED, 0);
|
||||
}
|
||||
|
||||
@ -646,6 +648,114 @@ int orte_pls_fork_terminate_proc(const orte_process_name_t* proc)
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
/**
|
||||
* Query for all processes allocated to the job and signal
|
||||
* those on the current node.
|
||||
*/
|
||||
|
||||
int orte_pls_fork_signal_job(orte_jobid_t jobid, int32_t signal)
|
||||
{
|
||||
/* query for the pids allocated on this node */
|
||||
char *segment;
|
||||
char *keys[3];
|
||||
orte_gpr_value_t** values = NULL;
|
||||
size_t i, k, num_values = 0;
|
||||
int rc;
|
||||
opal_value_array_t pids;
|
||||
pid_t pid;
|
||||
|
||||
/* setup the pid array */
|
||||
OBJ_CONSTRUCT(&pids, opal_value_array_t);
|
||||
opal_value_array_init(&pids, sizeof(pid_t));
|
||||
|
||||
/* query the job segment on the registry */
|
||||
if(ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
keys[0] = ORTE_NODE_NAME_KEY;
|
||||
keys[1] = ORTE_PROC_PID_KEY;
|
||||
keys[2] = NULL;
|
||||
|
||||
rc = orte_gpr.get(
|
||||
ORTE_GPR_KEYS_AND|ORTE_GPR_TOKENS_OR,
|
||||
segment,
|
||||
NULL,
|
||||
keys,
|
||||
&num_values,
|
||||
&values
|
||||
);
|
||||
if(rc != ORTE_SUCCESS) {
|
||||
free(segment);
|
||||
return rc;
|
||||
}
|
||||
|
||||
for(i=0; i<num_values; i++) {
|
||||
orte_gpr_value_t* value = values[i];
|
||||
pid_t pid = 0, *pidptr;
|
||||
for(k=0; k<value->cnt; k++) {
|
||||
orte_gpr_keyval_t* keyval = value->keyvals[k];
|
||||
if(strcmp(keyval->key, ORTE_NODE_NAME_KEY) == 0) {
|
||||
if(orte_dss.compare(keyval->value->data, orte_system_info.nodename, ORTE_STRING) != ORTE_EQUAL) {
|
||||
break;
|
||||
}
|
||||
} else if (strcmp(keyval->key, ORTE_PROC_PID_KEY) == 0) {
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&pidptr, keyval->value, ORTE_PID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
free(segment);
|
||||
return rc;
|
||||
}
|
||||
pid = *pidptr;
|
||||
}
|
||||
}
|
||||
if (0 != pid) {
|
||||
opal_value_array_append_item(&pids, &pid);
|
||||
}
|
||||
OBJ_RELEASE(value);
|
||||
}
|
||||
|
||||
rc = ORTE_SUCCESS;
|
||||
/* If we have processes to signal, go signal them */
|
||||
for (i = 0; i < opal_value_array_get_size(&pids); ++i) {
|
||||
pid = OPAL_VALUE_ARRAY_GET_ITEM(&pids, pid_t, i);
|
||||
if(kill(pid, (int)signal) != 0) {
|
||||
switch(errno) {
|
||||
case EINVAL:
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
rc = ORTE_ERR_BAD_PARAM;
|
||||
break;
|
||||
case ESRCH:
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
rc = ORTE_ERR_NOT_FOUND;
|
||||
break;
|
||||
case EPERM:
|
||||
ORTE_ERROR_LOG(ORTE_ERR_PERM);
|
||||
rc = ORTE_ERR_PERM;
|
||||
break;
|
||||
default:
|
||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||
rc = ORTE_ERROR;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
OBJ_DESTRUCT(&pids);
|
||||
|
||||
if(NULL != values) {
|
||||
free(values);
|
||||
}
|
||||
free(segment);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
int orte_pls_fork_signal_proc(const orte_process_name_t* proc, int32_t signal)
|
||||
{
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
int orte_pls_fork_finalize(void)
|
||||
{
|
||||
if(mca_pls_fork_component.reap) {
|
||||
|
@ -6,21 +6,21 @@
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* The Open RTE Process Launch Subsystem
|
||||
*
|
||||
*
|
||||
* The process launch subsystem (PLS) is responsible for actually
|
||||
* launching a specified application's processes across the indicated
|
||||
* resource. The PLS is invoked by the controlling program (mpirun or
|
||||
@ -35,22 +35,22 @@
|
||||
* not be available when the PLS is invoked. Thus, the PLS components
|
||||
* must include the ability to sense their environment where
|
||||
* necessary.
|
||||
*
|
||||
*
|
||||
* The PLS obtains its input information from several sources:
|
||||
*
|
||||
*
|
||||
* - the ORTE_JOB_SEGMENT of the registry. Information on this segment
|
||||
* includes: the application to be executed; the number of processes
|
||||
* of each application to be run; the context (argv and enviro arrays)
|
||||
* for each process.
|
||||
*
|
||||
*
|
||||
* - the ORTE_RESOURCE_SEGMENT of the registry. This includes:
|
||||
* identification of the launcher to be used on the indicated
|
||||
* resource; location of temporary directory and other filesystem
|
||||
* directory locations;
|
||||
*
|
||||
*
|
||||
* - MCA parameters. This includes any directive from the user as to
|
||||
* the launcher to be used and/or its configuration.
|
||||
*
|
||||
*
|
||||
* The PLS uses this information to launch the processes upon the
|
||||
* indicated resource(s). PLS components are free to ignore
|
||||
* information that is not pertinent to their operation. For example,
|
||||
@ -60,11 +60,11 @@
|
||||
* corresponding information that the mapper placed on the registry -
|
||||
* it is irrelevant to that launcher's operation (although a warning
|
||||
* to the user, in this case, might be appropriate).
|
||||
*
|
||||
*
|
||||
* The PLS is tightly coupled to the PLSNDS - the PLS name discovery
|
||||
* service - that each process uses to "discover" its official
|
||||
* name. Each PLS MUST:
|
||||
*
|
||||
*
|
||||
* - set the MCA parameter "pls_base_nds" to indicate which name
|
||||
* discovery service should be used on the remote side to discover
|
||||
* the process' name. The contents of the MCA parameter should be one
|
||||
@ -79,10 +79,10 @@
|
||||
* - where necessary, provide a function in the orte_plsnds directory
|
||||
* that can define the process name from whatever info that
|
||||
* corresponding launcher provided
|
||||
*
|
||||
*
|
||||
* More information on the requirements for the PLSNDS can be found in
|
||||
* the header file src/plsnds/plsnds.h.
|
||||
*
|
||||
*
|
||||
* Unless otherwise directed by the user and/or the system
|
||||
* configuration, the PLS will utilize a daemon-based launch to
|
||||
* maximize the availability of ORTE services. To accomplish this, the
|
||||
@ -91,17 +91,17 @@
|
||||
* sequence (with the first step being daemon launch, followed by the
|
||||
* secondary application launch). In turn, the PLS must provide a
|
||||
* component with the ability to launch via an existing daemon.
|
||||
*
|
||||
*
|
||||
* NOTE: The RMGR may override local launcher specification to utilize
|
||||
* the daemon-based launch component - it is expected that the daemons
|
||||
* in the local environment will know how to launch in that
|
||||
* environment. It is vital, therefore, that the PLS components NOT be
|
||||
* directly called by any ORTE function - instead, all PLS
|
||||
* functionality is to be accessed via the RMGR.
|
||||
*
|
||||
*
|
||||
* As part of the launch procedure, PLS components must provide the
|
||||
* following capabilities:
|
||||
*
|
||||
*
|
||||
* - set the "pls_base_nds" MCA parameter indicating which NDS is to
|
||||
* be used. This information is subsequently used by the name
|
||||
* discovery service to determine a process' official name, as
|
||||
@ -118,7 +118,7 @@
|
||||
* Since I/O forwarding is still under development, this is not yet
|
||||
* well-defined.
|
||||
* </JMS>
|
||||
*
|
||||
*
|
||||
* - pass context info to each process. The argv and enviro arrays are
|
||||
* stored on the registry by the resource allocation subsystem (RAS) -
|
||||
* this includes any process- specific deviations from the
|
||||
@ -130,13 +130,13 @@
|
||||
* that allow it, PLS components should utilize methods that support
|
||||
* scalable launch of applications involving large numbers of
|
||||
* processes.
|
||||
*
|
||||
*
|
||||
* - detect that required libraries are present on involved compute
|
||||
* nodes. This is a secondary feature for future implementations.
|
||||
*
|
||||
*
|
||||
* - preposition files and libraries where required and possible. This
|
||||
* is a secondary feature for future implementations.
|
||||
*
|
||||
*
|
||||
* When launching an application, the PLS shall update the registry
|
||||
* with information on batch jobid, assigned jobname, etc. that may
|
||||
* have been provided by the local resource's launcher. This
|
||||
@ -146,22 +146,22 @@
|
||||
* process by a spawning daemon to detect completion of process
|
||||
* startup) should be stored on the ORTE_JOB_SEGMENT in the respective
|
||||
* process' container.
|
||||
*
|
||||
*
|
||||
* Once a process is launched, two options exist for subsequent
|
||||
* operations:
|
||||
*
|
||||
*
|
||||
* - if it is an ORTE process (i.e., one that calls orte_init), the
|
||||
* process will register itself on the ORTE_JOB_SEGMENT of the
|
||||
* registry. This includes providing information on the nodename where
|
||||
* the process is located, contact information for the runtime message
|
||||
* library (RML) and other subsystems, local pid, etc.
|
||||
*
|
||||
*
|
||||
* - if it is NOT an ORTE process, then registration will not take
|
||||
* place. In this case, the ability to subsequently monitor the
|
||||
* progress/state-of-health of the process and/or provide other
|
||||
* services *may* be limited. The PLS has no further responsibilities
|
||||
* for such processes.
|
||||
*
|
||||
*
|
||||
* Once the PLS has completed launch of the application, it notifies
|
||||
* the state-of-health (SOH) monitor that a jobid has been launched
|
||||
* and is now available for monitoring. It is the SOH's
|
||||
@ -194,7 +194,7 @@
|
||||
*/
|
||||
|
||||
/**
|
||||
* Launch the indicated jobid
|
||||
* Launch the indicated jobid
|
||||
*/
|
||||
typedef int (*orte_pls_base_module_launch_fn_t)(orte_jobid_t);
|
||||
|
||||
@ -209,6 +209,17 @@ typedef int (*orte_pls_base_module_terminate_job_fn_t)(orte_jobid_t);
|
||||
*/
|
||||
typedef int (*orte_pls_base_module_terminate_proc_fn_t)(const orte_process_name_t*);
|
||||
|
||||
/**
|
||||
* Signal any processes launched for the respective jobid by
|
||||
* this component.
|
||||
*/
|
||||
typedef int (*orte_pls_base_module_signal_job_fn_t)(orte_jobid_t, int32_t);
|
||||
|
||||
/**
|
||||
* Signal a specific process.
|
||||
*/
|
||||
typedef int (*orte_pls_base_module_signal_proc_fn_t)(const orte_process_name_t*, int32_t);
|
||||
|
||||
/**
|
||||
* Cleanup all resources held by the module
|
||||
*/
|
||||
@ -221,6 +232,8 @@ struct orte_pls_base_module_1_0_0_t {
|
||||
orte_pls_base_module_launch_fn_t launch;
|
||||
orte_pls_base_module_terminate_job_fn_t terminate_job;
|
||||
orte_pls_base_module_terminate_proc_fn_t terminate_proc;
|
||||
orte_pls_base_module_signal_job_fn_t signal_job;
|
||||
orte_pls_base_module_signal_proc_fn_t signal_proc;
|
||||
orte_pls_base_module_finalize_fn_t finalize;
|
||||
};
|
||||
|
||||
@ -238,10 +251,10 @@ typedef struct orte_pls_base_module_1_0_0_t orte_pls_base_module_t;
|
||||
* @param priority (OUT) Relative priority or ranking used by MCA to
|
||||
* select a module.
|
||||
*/
|
||||
typedef struct orte_pls_base_module_1_0_0_t*
|
||||
typedef struct orte_pls_base_module_1_0_0_t*
|
||||
(*orte_pls_base_component_init_fn_t)(int *priority);
|
||||
|
||||
/**
|
||||
/**
|
||||
* pls component v1.0.0
|
||||
*/
|
||||
struct orte_pls_base_component_1_0_0_t {
|
||||
|
@ -5,14 +5,14 @@
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* These symbols are in a file by themselves to provide nice linker
|
||||
@ -56,12 +56,16 @@ extern char **environ;
|
||||
static int pls_poe_launch(orte_jobid_t jobid);
|
||||
static int pls_poe_terminate_job(orte_jobid_t jobid);
|
||||
static int pls_poe_terminate_proc(const orte_process_name_t *name);
|
||||
static int pls_poe_signal_job(orte_jobid_t jobid, int32_t signal);
|
||||
static int pls_poe_signal_proc(const orte_process_name_t *name, int32_t signal);
|
||||
static int pls_poe_finalize(void);
|
||||
|
||||
orte_pls_base_module_1_0_0_t orte_pls_poe_module = {
|
||||
pls_poe_launch,
|
||||
pls_poe_terminate_job,
|
||||
pls_poe_terminate_proc,
|
||||
pls_poe_signal_job,
|
||||
pls_poe_signal_proc,
|
||||
pls_poe_finalize
|
||||
};
|
||||
|
||||
@ -72,7 +76,7 @@ poe_set_handler_default - set signal handler to default
|
||||
static void poe_set_handler_default(int sig)
|
||||
{
|
||||
struct sigaction act;
|
||||
|
||||
|
||||
act.sa_handler = SIG_DFL;
|
||||
act.sa_flags = 0;
|
||||
sigemptyset(&act.sa_mask);
|
||||
@ -87,7 +91,7 @@ poe_argv_append_int - append integer variable to argument variable
|
||||
@param min minimum value [IN]
|
||||
@param argname argument name [IN]
|
||||
*/
|
||||
static inline int poe_argv_append_int(int *argc, char ***argv, int varname, int min, char *argname)
|
||||
static inline int poe_argv_append_int(int *argc, char ***argv, int varname, int min, char *argname)
|
||||
{
|
||||
char *tmp_string;
|
||||
if(varname >= min) {
|
||||
@ -97,7 +101,7 @@ static inline int poe_argv_append_int(int *argc, char ***argv, int varname, int
|
||||
free(tmp_string);
|
||||
} else {
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
@ -145,11 +149,11 @@ int pls_poe_launch_interactive_orted(orte_jobid_t jobid)
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Allocate a range of vpids for the daemons.
|
||||
*/
|
||||
|
||||
|
||||
num_nodes = opal_list_get_size(&nodes);
|
||||
if(num_nodes == 0) {
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
@ -158,7 +162,7 @@ int pls_poe_launch_interactive_orted(orte_jobid_t jobid)
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
|
||||
/* application */
|
||||
argv = opal_argv_copy(opal_argv_split(mca_pls_poe_component.orted, ' '));
|
||||
argc = opal_argv_count(argv);
|
||||
@ -166,7 +170,7 @@ int pls_poe_launch_interactive_orted(orte_jobid_t jobid)
|
||||
opal_argv_append(&argc, &argv, "--debug");
|
||||
}
|
||||
opal_argv_append(&argc, &argv, "--debug-daemons");
|
||||
|
||||
|
||||
opal_argv_append(&argc, &argv, "--no-daemonize");
|
||||
opal_argv_append(&argc, &argv, "--bootproxy");
|
||||
/* need integer value for command line parameter - NOT hex */
|
||||
@ -187,7 +191,7 @@ int pls_poe_launch_interactive_orted(orte_jobid_t jobid)
|
||||
opal_argv_append(&argc, &argv, tmp_string);
|
||||
free(tmp_string);
|
||||
|
||||
|
||||
|
||||
/* setup ns contact info */
|
||||
opal_argv_append(&argc, &argv, "--nsreplica");
|
||||
if(NULL != orte_process_info.ns_replica_uri) {
|
||||
@ -198,7 +202,7 @@ int pls_poe_launch_interactive_orted(orte_jobid_t jobid)
|
||||
asprintf(¶m, "\"%s\"", uri);
|
||||
opal_argv_append(&argc, &argv, param);
|
||||
free(uri);
|
||||
|
||||
|
||||
/* setup gpr contact info */
|
||||
opal_argv_append(&argc, &argv, "--gprreplica");
|
||||
if(NULL != orte_process_info.gpr_replica_uri) {
|
||||
@ -209,24 +213,24 @@ int pls_poe_launch_interactive_orted(orte_jobid_t jobid)
|
||||
asprintf(¶m, "\"%s\"", uri);
|
||||
opal_argv_append(&argc, &argv, param);
|
||||
free(uri);
|
||||
|
||||
|
||||
/*
|
||||
* Iterate through each of the nodes and spin
|
||||
* up a daemon.
|
||||
*/
|
||||
|
||||
|
||||
for(item = opal_list_get_first(&nodes);
|
||||
item != opal_list_get_end(&nodes);
|
||||
item = opal_list_get_next(item)) {
|
||||
orte_ras_node_t* node = (orte_ras_node_t*)item;
|
||||
orte_process_name_t* name;
|
||||
pid_t pid;
|
||||
|
||||
|
||||
/* setup node name */
|
||||
argv[node_name_index2] = node->node_name;
|
||||
|
||||
fprintf(hfp,"%s\n",node->node_name);
|
||||
|
||||
fprintf(hfp,"%s\n",node->node_name);
|
||||
|
||||
/* initialize daemons process name */
|
||||
rc = orte_ns.create_process_name(&name, node->node_cellid, 0, vpid);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
@ -242,17 +246,17 @@ int pls_poe_launch_interactive_orted(orte_jobid_t jobid)
|
||||
}
|
||||
argv[proc_name_index] = name_string;
|
||||
for(i=0;i<argc;i++) {
|
||||
fprintf(cfp,"%s ",argv[i]);
|
||||
fprintf(cfp,"%s ",argv[i]);
|
||||
}
|
||||
fprintf(cfp,"\n");
|
||||
|
||||
if (mca_pls_poe_component.verbose) {
|
||||
opal_output(0, "%s:cmdfile %s\n", __FUNCTION__, opal_argv_join(argv, ' '));
|
||||
}
|
||||
}
|
||||
vpid++;
|
||||
free(name);
|
||||
}
|
||||
|
||||
|
||||
fclose(cfp);
|
||||
fclose(hfp);
|
||||
|
||||
@ -283,21 +287,21 @@ int pls_poe_launch_interactive_orted(orte_jobid_t jobid)
|
||||
|
||||
if (mca_pls_poe_component.verbose) {
|
||||
opal_output(0, "%s:cmdline %s\n", __FUNCTION__, opal_argv_join(argv, ' '));
|
||||
}
|
||||
}
|
||||
|
||||
pid = fork();
|
||||
if(pid < 0) {
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
|
||||
/* child */
|
||||
if(pid == 0) {
|
||||
execv(mca_pls_poe_component.path, argv);
|
||||
opal_output(0, "orte_pls_poe: execv failed with errno=%d\n", errno);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
cleanup:
|
||||
while(NULL != (item = opal_list_remove_first(&nodes))) {
|
||||
OBJ_RELEASE(item);
|
||||
@ -340,7 +344,7 @@ static void poe_wait_job(pid_t pid, int status, void* cbdata)
|
||||
item = opal_list_get_next(item)) {
|
||||
orte_rmaps_base_map_t* map = (orte_rmaps_base_map_t*) item;
|
||||
size_t i;
|
||||
|
||||
|
||||
for(i = 0 ; i < map->num_procs ; ++i) {
|
||||
orte_session_dir_finalize(&(map->procs[i])->proc_name);
|
||||
rc = orte_soh.set_proc_soh(&(map->procs[i]->proc_name),
|
||||
@ -426,16 +430,16 @@ static int poe_create_cmd_file(
|
||||
fprintf(cfp,"%s",mca_pls_poe_component.env);
|
||||
while(environ_copy[i]!=NULL) {
|
||||
fprintf(cfp," %s",environ_copy[i++]);
|
||||
}
|
||||
}
|
||||
opal_argv_free(environ_copy);
|
||||
fprintf(cfp," %s",context->app);
|
||||
i=1;
|
||||
while(context->argv[i]!=NULL) {
|
||||
fprintf(cfp," %s",context->argv[i++]);
|
||||
}
|
||||
}
|
||||
|
||||
/* POE will complain if the file doesn't end with a newline. */
|
||||
fprintf(cfp,"\n");
|
||||
fprintf(cfp,"\n");
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
@ -457,7 +461,7 @@ static inline int poe_launch_interactive(orte_jobid_t jobid)
|
||||
int rc, pid;
|
||||
sigset_t sigs;
|
||||
|
||||
if( (NULL==(mca_pls_poe_component.cmdfile=tempnam(NULL,NULL))) ||
|
||||
if( (NULL==(mca_pls_poe_component.cmdfile=tempnam(NULL,NULL))) ||
|
||||
(NULL==(cfp=fopen(mca_pls_poe_component.cmdfile,"w"))) ) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
@ -469,7 +473,7 @@ static inline int poe_launch_interactive(orte_jobid_t jobid)
|
||||
OBJ_CONSTRUCT(&mapping_list, opal_list_t);
|
||||
rc = orte_rmaps_base_mapped_node_query(&mapping_list, &nodes, jobid);
|
||||
if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); goto cleanup; }
|
||||
|
||||
|
||||
num_nodes = opal_list_get_size(&nodes);
|
||||
|
||||
if(!strncmp(mca_pls_poe_component.resource_allocation,"hostfile",8)) {
|
||||
@ -484,14 +488,14 @@ static inline int poe_launch_interactive(orte_jobid_t jobid)
|
||||
item != opal_list_get_end(&nodes);
|
||||
item = opal_list_get_next(item)) {
|
||||
orte_ras_node_t* node = (orte_ras_node_t*)item;
|
||||
fprintf(hfp,"%s\n",node->node_name);
|
||||
fprintf(hfp,"%s\n",node->node_name);
|
||||
}
|
||||
fclose(hfp);
|
||||
}
|
||||
|
||||
rc = orte_rmgr_base_get_job_slots(jobid, &num_procs);
|
||||
if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); goto cleanup; }
|
||||
|
||||
|
||||
OBJ_CONSTRUCT(&map, opal_list_t);
|
||||
rc = orte_rmaps_base_get_map(jobid,&map);
|
||||
if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); goto cleanup; }
|
||||
@ -547,7 +551,7 @@ static inline int poe_launch_interactive(orte_jobid_t jobid)
|
||||
|
||||
if(mca_pls_poe_component.verbose>10) {
|
||||
opal_output(0, "POE cmdline %s\n", opal_argv_join(argv, ' '));
|
||||
}
|
||||
}
|
||||
|
||||
/* Start job with POE */
|
||||
|
||||
@ -570,7 +574,7 @@ static inline int poe_launch_interactive(orte_jobid_t jobid)
|
||||
} else {
|
||||
orte_wait_cb(pid, poe_wait_job, NULL);
|
||||
}
|
||||
|
||||
|
||||
|
||||
cleanup:
|
||||
while(NULL != (item = opal_list_remove_first(&map))) {
|
||||
@ -615,6 +619,17 @@ static int pls_poe_terminate_proc(const orte_process_name_t *name)
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
/**
 * Signal the processes launched for a job by the POE component.
 *
 * This component does not support signalling; the request is rejected
 * without acting on any process.
 *
 * @param jobid  Job whose processes were to be signalled (unused).
 * @param signal Signal number that was to be delivered (unused).
 * @return ORTE_ERR_NOT_IMPLEMENTED, always.
 */
static int pls_poe_signal_job(orte_jobid_t jobid, int32_t signal)
{
    /* Parameters are part of the module interface but unused here. */
    (void)jobid;
    (void)signal;

    return ORTE_ERR_NOT_IMPLEMENTED;
}
|
||||
|
||||
|
||||
static int pls_poe_signal_proc(const orte_process_name_t *name, int32_t signal)
|
||||
{
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
/**
|
||||
pls_poe_finalize - clean up temporary files
|
||||
@return error number
|
||||
|
@ -5,14 +5,14 @@
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
@ -51,7 +51,9 @@ int orte_pls_rsh_finalize(void);
|
||||
int orte_pls_rsh_launch(orte_jobid_t);
|
||||
int orte_pls_rsh_terminate_job(orte_jobid_t);
|
||||
int orte_pls_rsh_terminate_proc(const orte_process_name_t* proc_name);
|
||||
|
||||
int orte_pls_rsh_signal_job(orte_jobid_t, int32_t);
|
||||
int orte_pls_rsh_signal_proc(const orte_process_name_t* proc_name, int32_t);
|
||||
|
||||
/**
|
||||
* PLS Component
|
||||
*/
|
||||
|
@ -5,15 +5,15 @@
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* These symbols are in a file by themselves to provide nice linker
|
||||
@ -100,6 +100,8 @@ orte_pls_base_module_1_0_0_t orte_pls_rsh_module = {
|
||||
#endif
|
||||
orte_pls_rsh_terminate_job,
|
||||
orte_pls_rsh_terminate_proc,
|
||||
orte_pls_rsh_signal_job,
|
||||
orte_pls_rsh_signal_proc,
|
||||
orte_pls_rsh_finalize
|
||||
};
|
||||
|
||||
@ -463,7 +465,7 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
||||
} else {
|
||||
orte_pls_rsh_shell shell;
|
||||
orte_rmaps_base_map_t* map = (orte_rmaps_base_map_t*)opal_list_get_first(&mapping);
|
||||
orte_rmaps_base_node_t* rmaps_node =
|
||||
orte_rmaps_base_node_t* rmaps_node =
|
||||
(orte_rmaps_base_node_t*)opal_list_get_first(&map->nodes);
|
||||
orte_ras_node_t* node = rmaps_node->node;
|
||||
|
||||
@ -763,7 +765,7 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
||||
}
|
||||
} else {
|
||||
if (NULL != prefix_dir) {
|
||||
asprintf(&exec_path, "%s/%s/orted",
|
||||
asprintf(&exec_path, "%s/%s/orted",
|
||||
prefix_dir, bin_base);
|
||||
}
|
||||
/* If we have not yet filled in the exec path, do so now */
|
||||
@ -785,7 +787,7 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
||||
/* Reset PATH */
|
||||
oldenv = getenv("PATH");
|
||||
if (NULL != oldenv) {
|
||||
asprintf(&newenv, "%s/%s:%s", prefix_dir,
|
||||
asprintf(&newenv, "%s/%s:%s", prefix_dir,
|
||||
bin_base, oldenv);
|
||||
} else {
|
||||
asprintf(&newenv, "%s/%s", prefix_dir, bin_base);
|
||||
@ -799,7 +801,7 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
||||
/* Reset LD_LIBRARY_PATH */
|
||||
oldenv = getenv("LD_LIBRARY_PATH");
|
||||
if (NULL != oldenv) {
|
||||
asprintf(&newenv, "%s/%s:%s", prefix_dir,
|
||||
asprintf(&newenv, "%s/%s:%s", prefix_dir,
|
||||
lib_base, oldenv);
|
||||
} else {
|
||||
asprintf(&newenv, "%s/%s", prefix_dir, lib_base);
|
||||
@ -858,7 +860,7 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
||||
"PATH=%s/%s:$PATH ; export PATH ; "
|
||||
"LD_LIBRARY_PATH=%s/%s:$LD_LIBRARY_PATH ; export LD_LIBRARY_PATH ; "
|
||||
"%s/%s/%s",
|
||||
prefix_dir, bin_base,
|
||||
prefix_dir, bin_base,
|
||||
prefix_dir, lib_base,
|
||||
prefix_dir, bin_base,
|
||||
mca_pls_rsh_component.orted);
|
||||
@ -882,8 +884,8 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
||||
"setenv LD_LIBRARY_PATH %s/%s:$LD_LIBRARY_PATH ; "
|
||||
"%s/%s/%s",
|
||||
prefix_dir, bin_base,
|
||||
prefix_dir, lib_base,
|
||||
prefix_dir, lib_base,
|
||||
prefix_dir, lib_base,
|
||||
prefix_dir, lib_base,
|
||||
prefix_dir, bin_base,
|
||||
mca_pls_rsh_component.orted);
|
||||
}
|
||||
@ -1022,6 +1024,16 @@ int orte_pls_rsh_terminate_proc(const orte_process_name_t* proc)
|
||||
return orte_pls_base_proxy_terminate_proc(proc);
|
||||
}
|
||||
|
||||
/**
 * Signal the processes of a job on behalf of the rsh PLS module.
 *
 * The work is delegated entirely to the shared PLS proxy helper.
 *
 * @param jobid  Job whose processes are to be signalled.
 * @param signal Signal number to deliver.
 * @return Whatever orte_pls_base_proxy_signal_job() returns.
 */
int orte_pls_rsh_signal_job(orte_jobid_t jobid, int32_t signal)
{
    int rc = orte_pls_base_proxy_signal_job(jobid, signal);

    return rc;
}
|
||||
|
||||
/**
 * Signal one specific process on behalf of the rsh PLS module.
 *
 * The work is delegated entirely to the shared PLS proxy helper.
 *
 * @param proc   Name of the process to signal.
 * @param signal Signal number to deliver.
 * @return Whatever orte_pls_base_proxy_signal_proc() returns.
 */
int orte_pls_rsh_signal_proc(const orte_process_name_t* proc, int32_t signal)
{
    int rc = orte_pls_base_proxy_signal_proc(proc, signal);

    return rc;
}
|
||||
|
||||
int orte_pls_rsh_finalize(void)
|
||||
{
|
||||
if (mca_pls_rsh_component.reap) {
|
||||
|
@ -5,15 +5,15 @@
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* These symbols are in a file by themselves to provide nice linker
|
||||
@ -70,6 +70,8 @@
|
||||
static int pls_slurm_launch(orte_jobid_t jobid);
|
||||
static int pls_slurm_terminate_job(orte_jobid_t jobid);
|
||||
static int pls_slurm_terminate_proc(const orte_process_name_t *name);
|
||||
static int pls_slurm_signal_job(orte_jobid_t jobid, int32_t signal);
|
||||
static int pls_slurm_signal_proc(const orte_process_name_t *name, int32_t signal);
|
||||
static int pls_slurm_finalize(void);
|
||||
|
||||
static int pls_slurm_start_proc(int argc, char **argv, char **env,
|
||||
@ -83,6 +85,8 @@ orte_pls_base_module_1_0_0_t orte_pls_slurm_module = {
|
||||
pls_slurm_launch,
|
||||
pls_slurm_terminate_job,
|
||||
pls_slurm_terminate_proc,
|
||||
pls_slurm_signal_job,
|
||||
pls_slurm_signal_proc,
|
||||
pls_slurm_finalize
|
||||
};
|
||||
|
||||
@ -205,7 +209,7 @@ static int pls_slurm_launch(orte_jobid_t jobid)
|
||||
/* add the daemon command (as specified by user) */
|
||||
opal_argv_append(&argc, &argv, mca_pls_slurm_component.orted);
|
||||
opal_argv_append(&argc, &argv, "--no-daemonize");
|
||||
|
||||
|
||||
/* check for debug flags */
|
||||
orte_pls_base_proxy_mca_argv(&argc, &argv);
|
||||
|
||||
@ -218,8 +222,8 @@ static int pls_slurm_launch(orte_jobid_t jobid)
|
||||
opal_argv_append(&argc, &argv, "slurm");
|
||||
|
||||
/* set orte process name to be the base of the name list for the daemons */
|
||||
rc = orte_ns.create_process_name(&name,
|
||||
orte_process_info.my_name->cellid,
|
||||
rc = orte_ns.create_process_name(&name,
|
||||
orte_process_info.my_name->cellid,
|
||||
0, vpid);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -252,7 +256,7 @@ static int pls_slurm_launch(orte_jobid_t jobid)
|
||||
orte_universe_info.host, orte_universe_info.name);
|
||||
opal_argv_append(&argc, &argv, param);
|
||||
free(param);
|
||||
|
||||
|
||||
/* setup ns contact info */
|
||||
opal_argv_append(&argc, &argv, "--nsreplica");
|
||||
if (NULL != orte_process_info.ns_replica_uri) {
|
||||
@ -320,7 +324,7 @@ static int pls_slurm_launch(orte_jobid_t jobid)
|
||||
orte_rmaps_base_map_t* map = (orte_rmaps_base_map_t*) item2;
|
||||
char * app_prefix_dir = map->app->prefix_dir;
|
||||
|
||||
/* Increment the number of processes allocated to this node
|
||||
/* Increment the number of processes allocated to this node
|
||||
* This allows us to accurately test for oversubscription */
|
||||
num_processes += map->num_procs;
|
||||
|
||||
@ -354,7 +358,7 @@ static int pls_slurm_launch(orte_jobid_t jobid)
|
||||
}
|
||||
|
||||
/* save the daemons name on the node */
|
||||
if (ORTE_SUCCESS !=
|
||||
if (ORTE_SUCCESS !=
|
||||
(rc = orte_pls_base_proxy_set_node_name(node, jobid, name))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
@ -363,7 +367,7 @@ static int pls_slurm_launch(orte_jobid_t jobid)
|
||||
vpid++;
|
||||
free(name);
|
||||
}
|
||||
|
||||
|
||||
/* setup environment */
|
||||
env = opal_argv_copy(environ);
|
||||
var = mca_base_param_environ_variable("seed", NULL, NULL);
|
||||
@ -391,7 +395,7 @@ static int pls_slurm_launch(orte_jobid_t jobid)
|
||||
}
|
||||
free(var);
|
||||
#endif
|
||||
|
||||
|
||||
/* exec the daemon */
|
||||
rc = pls_slurm_start_proc(argc, argv, env, cur_prefix);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
@ -401,7 +405,7 @@ static int pls_slurm_launch(orte_jobid_t jobid)
|
||||
|
||||
/* JMS: should we stash the srun pid in the gpr somewhere for cleanup? */
|
||||
/* JMS: how do we catch when srun dies? */
|
||||
|
||||
|
||||
cleanup:
|
||||
while (NULL != (item = opal_list_remove_first(&nodes))) {
|
||||
OBJ_RELEASE(item);
|
||||
@ -423,7 +427,7 @@ static int pls_slurm_terminate_job(orte_jobid_t jobid)
|
||||
/* JMS need appropriate code here to reap */
|
||||
srun_pid = 0;
|
||||
}
|
||||
return orte_pls_base_proxy_terminate_job(jobid);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
@ -439,6 +443,27 @@ static int pls_slurm_terminate_proc(const orte_process_name_t *name)
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Signal all the processes in the child srun by sending the signal directly to it
|
||||
*/
|
||||
static int pls_slurm_signal_job(orte_jobid_t jobid, int32_t signal)
|
||||
{
|
||||
if (0 != srun_pid) {
|
||||
kill(srun_pid, (int)signal);
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Signal a specific process
|
||||
*/
|
||||
static int pls_slurm_signal_proc(const orte_process_name_t *name, int32_t signal)
|
||||
{
|
||||
return orte_pls_base_proxy_signal_proc(name, signal);
|
||||
}
|
||||
|
||||
|
||||
static int pls_slurm_finalize(void)
|
||||
{
|
||||
/* cleanup any pending recvs */
|
||||
@ -465,7 +490,7 @@ static int pls_slurm_start_proc(int argc, char **argv, char **env,
|
||||
return ORTE_ERR_IN_ERRNO;
|
||||
} else if (0 == srun_pid) {
|
||||
char *bin_base = NULL, *lib_base = NULL;
|
||||
|
||||
|
||||
/* Figure out the basenames for the libdir and bindir. There
|
||||
is a lengthy comment about this in pls_rsh_module.c
|
||||
explaining all the rationale for how / why we're doing
|
||||
@ -479,7 +504,7 @@ static int pls_slurm_start_proc(int argc, char **argv, char **env,
|
||||
the child process, so it's ok to modify environ. */
|
||||
if (NULL != prefix) {
|
||||
char *oldenv, *newenv;
|
||||
|
||||
|
||||
/* Reset PATH */
|
||||
oldenv = getenv("PATH");
|
||||
if (NULL != oldenv) {
|
||||
@ -492,7 +517,7 @@ static int pls_slurm_start_proc(int argc, char **argv, char **env,
|
||||
opal_output(0, "pls:slurm: reset PATH: %s", newenv);
|
||||
}
|
||||
free(newenv);
|
||||
|
||||
|
||||
/* Reset LD_LIBRARY_PATH */
|
||||
oldenv = getenv("LD_LIBRARY_PATH");
|
||||
if (NULL != oldenv) {
|
||||
|
@ -70,6 +70,8 @@
|
||||
static int pls_tm_launch(orte_jobid_t jobid);
|
||||
static int pls_tm_terminate_job(orte_jobid_t jobid);
|
||||
static int pls_tm_terminate_proc(const orte_process_name_t *name);
|
||||
static int pls_tm_signal_job(orte_jobid_t jobid, int32_t signal);
|
||||
static int pls_tm_signal_proc(const orte_process_name_t *name, int32_t signal);
|
||||
static int pls_tm_finalize(void);
|
||||
|
||||
static int pls_tm_connect(void);
|
||||
@ -84,6 +86,8 @@ orte_pls_base_module_1_0_0_t orte_pls_tm_module = {
|
||||
pls_tm_launch,
|
||||
pls_tm_terminate_job,
|
||||
pls_tm_terminate_proc,
|
||||
pls_tm_signal_job,
|
||||
pls_tm_signal_proc,
|
||||
pls_tm_finalize
|
||||
};
|
||||
|
||||
@ -449,6 +453,20 @@ pls_tm_terminate_proc(const orte_process_name_t *name)
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
pls_tm_signal_job(orte_jobid_t jobid, int32_t signal)
|
||||
{
|
||||
return orte_pls_base_proxy_signal_job(jobid, signal);
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
pls_tm_signal_proc(const orte_process_name_t *name, int32_t signal)
|
||||
{
|
||||
return orte_pls_base_proxy_signal_proc(name, signal);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Free stuff
|
||||
*/
|
||||
|
@ -80,6 +80,8 @@ orte_pls_base_module_t orte_pls_xcpu_module = {
|
||||
orte_pls_xcpu_launch,
|
||||
orte_pls_xcpu_terminate_job,
|
||||
orte_pls_xcpu_terminate_proc,
|
||||
orte_pls_xcpu_signal_job,
|
||||
orte_pls_xcpu_signal_proc,
|
||||
orte_pls_xcpu_finalize
|
||||
};
|
||||
|
||||
@ -328,6 +330,12 @@ int orte_pls_xcpu_terminate_job(orte_jobid_t jobid){
|
||||
int orte_pls_xcpu_terminate_proc(const orte_process_name_t* proc_name){
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
/**
 * Signal the processes of a job (xcpu PLS module).
 *
 * Stub: performs no action and reports success.
 * NOTE(review): no signal is actually sent, yet success is returned --
 * confirm that callers do not rely on delivery.
 *
 * @param jobid  Job whose processes were to be signalled (unused).
 * @param signal Signal number that was to be delivered (unused).
 * @return ORTE_SUCCESS, always.
 */
int orte_pls_xcpu_signal_job(orte_jobid_t jobid, int32_t signal)
{
    (void)jobid;
    (void)signal;

    return ORTE_SUCCESS;
}
|
||||
int orte_pls_xcpu_signal_proc(const orte_process_name_t* proc_name, int32_t signal){
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
int orte_pls_xcpu_finalize(void){
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -59,6 +59,8 @@ orte_pls_base_module_t* orte_pls_xcpu_init(int *priority); /* in component file
|
||||
int orte_pls_xcpu_launch(orte_jobid_t);
|
||||
int orte_pls_xcpu_terminate_job(orte_jobid_t);
|
||||
int orte_pls_xcpu_terminate_proc(const orte_process_name_t* proc_name);
|
||||
int orte_pls_xcpu_signal_job(orte_jobid_t, int32_t);
|
||||
int orte_pls_xcpu_signal_proc(const orte_process_name_t* proc_name, int32_t);
|
||||
int orte_pls_xcpu_finalize(void);
|
||||
|
||||
|
||||
|
@ -90,6 +90,16 @@ OMPI_DECLSPEC int orte_rmgr_base_pack_terminate_proc_cmd(
|
||||
orte_buffer_t* buffer,
|
||||
const orte_process_name_t* name);
|
||||
|
||||
OMPI_DECLSPEC int orte_rmgr_base_pack_signal_job_cmd(
|
||||
orte_buffer_t* buffer,
|
||||
orte_jobid_t job,
|
||||
int32_t signal);
|
||||
|
||||
OMPI_DECLSPEC int orte_rmgr_base_pack_signal_proc_cmd(
|
||||
orte_buffer_t* buffer,
|
||||
const orte_process_name_t* name,
|
||||
int32_t signal);
|
||||
|
||||
OMPI_DECLSPEC int orte_rmgr_base_unpack_rsp(
|
||||
orte_buffer_t* buffer);
|
||||
|
||||
@ -115,6 +125,8 @@ int orte_rmgr_base_map_not_available(orte_jobid_t);
|
||||
int orte_rmgr_base_launch_not_available(orte_jobid_t);
|
||||
int orte_rmgr_base_terminate_job_not_available(orte_jobid_t);
|
||||
int orte_rmgr_base_terminate_proc_not_available(const orte_process_name_t*);
|
||||
int orte_rmgr_base_signal_job_not_available(orte_jobid_t, int32_t);
|
||||
int orte_rmgr_base_signal_proc_not_available(const orte_process_name_t*, int32_t);
|
||||
int orte_rmgr_base_proc_stage_gate_init(orte_jobid_t job);
|
||||
int orte_rmgr_base_proc_stage_gate_subscribe(orte_jobid_t job, orte_gpr_notify_cb_fn_t, void*, orte_proc_state_t);
|
||||
int orte_rmgr_base_proc_stage_gate_mgr(
|
||||
|
@ -57,6 +57,8 @@ orte_rmgr_base_module_t orte_rmgr = {
|
||||
orte_rmgr_base_launch_not_available,
|
||||
orte_rmgr_base_terminate_job_not_available,
|
||||
orte_rmgr_base_terminate_proc_not_available,
|
||||
orte_rmgr_base_signal_job_not_available,
|
||||
orte_rmgr_base_signal_proc_not_available,
|
||||
orte_rmgr_base_spawn_not_available,
|
||||
orte_rmgr_base_proc_stage_gate_init,
|
||||
orte_rmgr_base_proc_stage_gate_mgr,
|
||||
|
@ -5,12 +5,12 @@
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
@ -29,15 +29,15 @@
|
||||
|
||||
|
||||
/*
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
int orte_rmgr_base_pack_cmd(orte_buffer_t* buffer, orte_rmgr_cmd_t cmd, orte_jobid_t jobid)
|
||||
{
|
||||
int rc;
|
||||
|
||||
|
||||
OPAL_TRACE(4);
|
||||
|
||||
|
||||
rc = orte_dss.pack(buffer, &cmd, 1, ORTE_RMGR_CMD);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -62,11 +62,11 @@ int orte_rmgr_base_pack_create_cmd(
|
||||
size_t num_context)
|
||||
{
|
||||
int rc;
|
||||
|
||||
|
||||
orte_rmgr_cmd_t cmd = ORTE_RMGR_CMD_CREATE;
|
||||
|
||||
OPAL_TRACE(4);
|
||||
|
||||
|
||||
rc = orte_dss.pack(buffer, &cmd, 1, ORTE_RMGR_CMD);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -86,17 +86,17 @@ int orte_rmgr_base_pack_create_cmd(
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
|
||||
int orte_rmgr_base_pack_terminate_proc_cmd(
|
||||
orte_buffer_t* buffer,
|
||||
const orte_process_name_t* name)
|
||||
{
|
||||
int rc;
|
||||
|
||||
orte_rmgr_cmd_t cmd = ORTE_RMGR_CMD_CREATE;
|
||||
|
||||
orte_rmgr_cmd_t cmd = ORTE_RMGR_CMD_TERM_PROC;
|
||||
|
||||
OPAL_TRACE(4);
|
||||
|
||||
|
||||
rc = orte_dss.pack(buffer, &cmd, 1, ORTE_RMGR_CMD);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -111,7 +111,73 @@ int orte_rmgr_base_pack_terminate_proc_cmd(
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
|
||||
int orte_rmgr_base_pack_signal_job_cmd(
|
||||
orte_buffer_t* buffer,
|
||||
orte_jobid_t job,
|
||||
int32_t signal)
|
||||
{
|
||||
int rc;
|
||||
|
||||
orte_rmgr_cmd_t cmd = ORTE_RMGR_CMD_SIGNAL_JOB;
|
||||
|
||||
OPAL_TRACE(4);
|
||||
|
||||
rc = orte_dss.pack(buffer, &cmd, 1, ORTE_RMGR_CMD);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
rc = orte_dss.pack(buffer, &job, 1, ORTE_JOBID);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
rc = orte_dss.pack(buffer, (void*)&signal, 1, ORTE_INT32);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
int orte_rmgr_base_pack_signal_proc_cmd(
|
||||
orte_buffer_t* buffer,
|
||||
const orte_process_name_t* name,
|
||||
int32_t signal)
|
||||
{
|
||||
int rc;
|
||||
|
||||
orte_rmgr_cmd_t cmd = ORTE_RMGR_CMD_SIGNAL_PROC;
|
||||
|
||||
OPAL_TRACE(4);
|
||||
|
||||
rc = orte_dss.pack(buffer, &cmd, 1, ORTE_RMGR_CMD);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
rc = orte_dss.pack(buffer, (void*)name, 1, ORTE_NAME);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
rc = orte_dss.pack(buffer, (void*)&signal, 1, ORTE_INT32);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
int orte_rmgr_base_unpack_rsp(
|
||||
orte_buffer_t* buffer)
|
||||
{
|
||||
@ -119,14 +185,14 @@ int orte_rmgr_base_unpack_rsp(
|
||||
size_t cnt = 1;
|
||||
|
||||
OPAL_TRACE(4);
|
||||
|
||||
|
||||
if(ORTE_SUCCESS != (rc = orte_dss.unpack(buffer,&rc,&cnt,ORTE_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
int orte_rmgr_base_unpack_create_rsp(
|
||||
orte_buffer_t* buffer,
|
||||
orte_jobid_t* jobid)
|
||||
@ -135,7 +201,7 @@ int orte_rmgr_base_unpack_create_rsp(
|
||||
size_t cnt;
|
||||
|
||||
OPAL_TRACE(4);
|
||||
|
||||
|
||||
cnt = 1;
|
||||
if(ORTE_SUCCESS != (rc = orte_dss.unpack(buffer,jobid,&cnt,ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
|
@ -5,14 +5,14 @@
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/** @file:
|
||||
@ -31,8 +31,8 @@
|
||||
*/
|
||||
int
|
||||
orte_rmgr_base_create_not_available(
|
||||
orte_app_context_t** app_context,
|
||||
size_t num_context,
|
||||
orte_app_context_t** app_context,
|
||||
size_t num_context,
|
||||
orte_jobid_t* jobid)
|
||||
{
|
||||
return ORTE_ERR_UNREACH;
|
||||
@ -80,10 +80,22 @@ orte_rmgr_base_terminate_proc_not_available(const orte_process_name_t* proc_name
|
||||
return ORTE_ERR_UNREACH;
|
||||
}
|
||||
|
||||
int
|
||||
orte_rmgr_base_signal_job_not_available(orte_jobid_t jobid, int32_t signal)
|
||||
{
|
||||
return ORTE_ERR_UNREACH;
|
||||
}
|
||||
|
||||
int
|
||||
orte_rmgr_base_signal_proc_not_available(const orte_process_name_t* proc_name, int32_t signal)
|
||||
{
|
||||
return ORTE_ERR_UNREACH;
|
||||
}
|
||||
|
||||
int
|
||||
orte_rmgr_base_spawn_not_available(
|
||||
orte_app_context_t** app_context,
|
||||
size_t num_context,
|
||||
orte_app_context_t** app_context,
|
||||
size_t num_context,
|
||||
orte_jobid_t* jobid,
|
||||
orte_rmgr_cb_fn_t cbfn,
|
||||
orte_proc_state_t cb_conditions)
|
||||
|
@ -5,12 +5,12 @@
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
@ -30,15 +30,15 @@
|
||||
|
||||
|
||||
/*
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
/*
 * Handle a QUERY request: call the active rmgr module's query() and
 * pack its return code into the response buffer as an ORTE_INT32.
 * The request buffer is not read.
 */
static int orte_rmgr_base_cmd_query(orte_buffer_t *req, orte_buffer_t *rsp)
{
    int32_t query_status;

    query_status = orte_rmgr.query();

    OPAL_TRACE(4);

    return orte_dss.pack(rsp, &query_status, 1, ORTE_INT32);
}
|
||||
|
||||
@ -51,7 +51,7 @@ static int orte_rmgr_base_cmd_create(orte_buffer_t* req, orte_buffer_t* rsp)
|
||||
size_t i, cnt, num_context;
|
||||
|
||||
OPAL_TRACE(4);
|
||||
|
||||
|
||||
cnt = 1;
|
||||
if(ORTE_SUCCESS != (rc = orte_dss.unpack(req, &num_context, &cnt, ORTE_SIZE))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -62,7 +62,7 @@ static int orte_rmgr_base_cmd_create(orte_buffer_t* req, orte_buffer_t* rsp)
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
|
||||
cnt = num_context;
|
||||
if(ORTE_SUCCESS != (rc = orte_dss.unpack(req, context, &cnt, ORTE_APP_CONTEXT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -96,7 +96,7 @@ static int orte_rmgr_base_cmd_allocate(orte_buffer_t* req, orte_buffer_t* rsp)
|
||||
size_t cnt = 1;
|
||||
|
||||
OPAL_TRACE(4);
|
||||
|
||||
|
||||
if(ORTE_SUCCESS != (rc = orte_dss.unpack(req, &jobid, &cnt, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
} else {
|
||||
@ -112,7 +112,7 @@ static int orte_rmgr_base_cmd_deallocate(orte_buffer_t* req, orte_buffer_t* rsp)
|
||||
size_t cnt = 1;
|
||||
|
||||
OPAL_TRACE(4);
|
||||
|
||||
|
||||
if(ORTE_SUCCESS != (rc = orte_dss.unpack(req, &jobid, &cnt, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
} else {
|
||||
@ -128,7 +128,7 @@ static int orte_rmgr_base_cmd_map(orte_buffer_t* req, orte_buffer_t* rsp)
|
||||
size_t cnt = 1;
|
||||
|
||||
OPAL_TRACE(4);
|
||||
|
||||
|
||||
if(ORTE_SUCCESS != (rc = orte_dss.unpack(req, &jobid, &cnt, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
} else {
|
||||
@ -144,7 +144,7 @@ static int orte_rmgr_base_cmd_launch(orte_buffer_t* req, orte_buffer_t* rsp)
|
||||
size_t cnt = 1;
|
||||
|
||||
OPAL_TRACE(4);
|
||||
|
||||
|
||||
if(ORTE_SUCCESS != (rc = orte_dss.unpack(req, &jobid, &cnt, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
} else {
|
||||
@ -161,7 +161,7 @@ static int orte_rmgr_base_cmd_term_job(orte_buffer_t* req, orte_buffer_t* rsp)
|
||||
size_t cnt = 1;
|
||||
|
||||
OPAL_TRACE(4);
|
||||
|
||||
|
||||
if(ORTE_SUCCESS != (rc = orte_dss.unpack(req, &jobid, &cnt, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
} else {
|
||||
@ -178,7 +178,7 @@ static int orte_rmgr_base_cmd_term_proc(orte_buffer_t* req, orte_buffer_t* rsp)
|
||||
size_t cnt = 1;
|
||||
|
||||
OPAL_TRACE(4);
|
||||
|
||||
|
||||
if(ORTE_SUCCESS != (rc = orte_dss.unpack(req, &name, &cnt, ORTE_NAME))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
} else {
|
||||
@ -188,6 +188,54 @@ static int orte_rmgr_base_cmd_term_proc(orte_buffer_t* req, orte_buffer_t* rsp)
|
||||
}
|
||||
|
||||
|
||||
/*
 * Handle a SIGNAL_JOB request.
 *
 * Unpacks the job id and the signal number from the request, calls the
 * active rmgr module's signal_job(), and packs that call's return code
 * into the response. If either unpack fails, the error is logged and
 * returned, and nothing is packed into the response.
 */
static int orte_rmgr_base_cmd_signal_job(orte_buffer_t *req, orte_buffer_t *rsp)
{
    orte_jobid_t target_job;
    int32_t signo;
    size_t count = 1;
    int status;

    OPAL_TRACE(4);

    status = orte_dss.unpack(req, &target_job, &count, ORTE_JOBID);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    status = orte_dss.unpack(req, &signo, &count, ORTE_INT32);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    status = orte_rmgr.signal_job(target_job, signo);

    return orte_dss.pack(rsp, &status, 1, ORTE_INT32);
}
|
||||
|
||||
|
||||
/*
 * Handle a SIGNAL_PROC request.
 *
 * Unpacks the process name and the signal number from the request, calls
 * the active rmgr module's signal_proc(), and packs that call's return
 * code into the response. If either unpack fails, the error is logged and
 * returned, and nothing is packed into the response.
 */
static int orte_rmgr_base_cmd_signal_proc(orte_buffer_t* req, orte_buffer_t* rsp)
{
    int rc;
    orte_process_name_t name;
    size_t cnt = 1;
    int32_t signal;

    OPAL_TRACE(4);

    if (ORTE_SUCCESS != (rc = orte_dss.unpack(req, &name, &cnt, ORTE_NAME))) {
        ORTE_ERROR_LOG(rc);
        /* bail out: "name" was never filled in, so it must not be used */
        return rc;
    }

    /* cnt is passed by pointer, so ask for exactly one value again */
    cnt = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(req, &signal, &cnt, ORTE_INT32))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    rc = orte_rmgr.signal_proc(&name, signal);

    return orte_dss.pack(rsp, &rc, 1, ORTE_INT32);
}
|
||||
|
||||
|
||||
int orte_rmgr_base_cmd_dispatch(orte_buffer_t* req, orte_buffer_t* rsp)
|
||||
{
|
||||
@ -196,7 +244,7 @@ int orte_rmgr_base_cmd_dispatch(orte_buffer_t* req, orte_buffer_t* rsp)
|
||||
int rc;
|
||||
|
||||
OPAL_TRACE(4);
|
||||
|
||||
|
||||
rc = orte_dss.unpack(req, &cmd, &cnt, ORTE_RMGR_CMD);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -220,6 +268,10 @@ int orte_rmgr_base_cmd_dispatch(orte_buffer_t* req, orte_buffer_t* rsp)
|
||||
return orte_rmgr_base_cmd_term_job(req,rsp);
|
||||
case ORTE_RMGR_CMD_TERM_PROC:
|
||||
return orte_rmgr_base_cmd_term_proc(req,rsp);
|
||||
case ORTE_RMGR_CMD_SIGNAL_JOB:
|
||||
return orte_rmgr_base_cmd_signal_job(req,rsp);
|
||||
case ORTE_RMGR_CMD_SIGNAL_PROC:
|
||||
return orte_rmgr_base_cmd_signal_proc(req,rsp);
|
||||
default:
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
|
@ -62,6 +62,12 @@ static int orte_rmgr_cnos_terminate_job(
|
||||
static int orte_rmgr_cnos_terminate_proc(
|
||||
const orte_process_name_t* proc_name);
|
||||
|
||||
static int orte_rmgr_cnos_signal_job(
|
||||
orte_jobid_t jobid);
|
||||
|
||||
static int orte_rmgr_cnos_signal_proc(
|
||||
const orte_process_name_t* proc_name);
|
||||
|
||||
static int orte_rmgr_cnos_spawn(
|
||||
orte_app_context_t** app_context,
|
||||
size_t num_context,
|
||||
@ -80,6 +86,8 @@ orte_rmgr_base_module_t orte_rmgr_cnos_module = {
|
||||
orte_rmgr_cnos_launch,
|
||||
orte_rmgr_cnos_terminate_job,
|
||||
orte_rmgr_cnos_terminate_proc,
|
||||
orte_rmgr_cnos_signal_job,
|
||||
orte_rmgr_cnos_signal_proc,
|
||||
orte_rmgr_cnos_spawn,
|
||||
orte_rmgr_base_proc_stage_gate_init,
|
||||
orte_rmgr_base_proc_stage_gate_mgr,
|
||||
@ -177,6 +185,17 @@ static int orte_rmgr_cnos_terminate_proc(const orte_process_name_t* proc_name)
|
||||
}
|
||||
|
||||
|
||||
/*
 * CNOS stub for signalling a whole job: does nothing and always returns
 * ORTE_ERR_NOT_SUPPORTED.
 *
 * NOTE(review): this function takes only a jobid, but it is installed in
 * the module table's signal_job slot, whose typedef
 * (orte_rmgr_base_module_signal_job_fn_t) takes
 * (orte_jobid_t job, int32_t signal). The forward declaration and this
 * definition presumably both need the int32_t signal parameter -- confirm
 * and change them together.
 */
static int orte_rmgr_cnos_signal_job(orte_jobid_t jobid)
|
||||
{
|
||||
return ORTE_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
/*
 * CNOS stub for signalling a single process: does nothing and always
 * returns ORTE_ERR_NOT_SUPPORTED.
 *
 * NOTE(review): this function takes only a process name, but it is
 * installed in the module table's signal_proc slot, whose typedef
 * (orte_rmgr_base_module_signal_proc_fn_t) takes
 * (const orte_process_name_t* proc_name, int32_t signal). The forward
 * declaration and this definition presumably both need the int32_t signal
 * parameter -- confirm and change them together.
 */
static int orte_rmgr_cnos_signal_proc(const orte_process_name_t* proc_name)
|
||||
{
|
||||
return ORTE_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
|
||||
static int orte_rmgr_cnos_spawn(
|
||||
orte_app_context_t** app_context,
|
||||
size_t num_context,
|
||||
|
@ -59,6 +59,13 @@ static int orte_rmgr_proxy_terminate_job(
|
||||
static int orte_rmgr_proxy_terminate_proc(
|
||||
const orte_process_name_t* proc_name);
|
||||
|
||||
static int orte_rmgr_proxy_signal_job(
|
||||
orte_jobid_t jobid, int32_t signal);
|
||||
|
||||
static int orte_rmgr_proxy_signal_proc(
|
||||
const orte_process_name_t* proc_name,
|
||||
int32_t signal);
|
||||
|
||||
static int orte_rmgr_proxy_spawn(
|
||||
orte_app_context_t** app_context,
|
||||
size_t num_context,
|
||||
@ -75,6 +82,8 @@ orte_rmgr_base_module_t orte_rmgr_proxy_module = {
|
||||
orte_rmgr_proxy_launch,
|
||||
orte_rmgr_proxy_terminate_job,
|
||||
orte_rmgr_proxy_terminate_proc,
|
||||
orte_rmgr_proxy_signal_job,
|
||||
orte_rmgr_proxy_signal_proc,
|
||||
orte_rmgr_proxy_spawn,
|
||||
orte_rmgr_base_proc_stage_gate_init,
|
||||
orte_rmgr_base_proc_stage_gate_mgr,
|
||||
@ -260,6 +269,90 @@ static int orte_rmgr_proxy_terminate_proc(const orte_process_name_t* proc_name)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Proxy implementation of signal_job.
 *
 * Packs a "signal job" command, sends it to ORTE_RML_NAME_SEED on the
 * ORTE_RML_TAG_RMGR_SVC tag, then receives a reply on
 * ORTE_RML_TAG_RMGR_CLNT and unpacks it.
 *
 * Returns ORTE_SUCCESS when every step succeeds; otherwise the failing
 * step's error code is logged and returned. Both buffers are destructed
 * on every path.
 */
static int orte_rmgr_proxy_signal_job(orte_jobid_t jobid, int32_t signal)
{
    orte_buffer_t request;
    orte_buffer_t reply;
    int status;

    OPAL_TRACE(1);

    /* ---- build and send the request ---- */
    OBJ_CONSTRUCT(&request, orte_buffer_t);

    status = orte_rmgr_base_pack_signal_job_cmd(&request, jobid, signal);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        goto drop_request;
    }

    status = orte_rml.send_buffer(ORTE_RML_NAME_SEED, &request, ORTE_RML_TAG_RMGR_SVC, 0);
    if (status < 0) {
        ORTE_ERROR_LOG(status);
        goto drop_request;
    }
    OBJ_DESTRUCT(&request);

    /* ---- collect and decode the reply ---- */
    OBJ_CONSTRUCT(&reply, orte_buffer_t);

    status = orte_rml.recv_buffer(ORTE_RML_NAME_SEED, &reply, ORTE_RML_TAG_RMGR_CLNT);
    if (status < 0) {
        ORTE_ERROR_LOG(status);
        goto drop_reply;
    }

    status = orte_rmgr_base_unpack_rsp(&reply);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        goto drop_reply;
    }

    OBJ_DESTRUCT(&reply);
    return ORTE_SUCCESS;

drop_request:
    OBJ_DESTRUCT(&request);
    return status;

drop_reply:
    OBJ_DESTRUCT(&reply);
    return status;
}
|
||||
|
||||
static int orte_rmgr_proxy_signal_proc(const orte_process_name_t* proc_name, int32_t signal)
|
||||
{
|
||||
orte_buffer_t cmd;
|
||||
orte_buffer_t rsp;
|
||||
int rc;
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
/* construct command */
|
||||
OBJ_CONSTRUCT(&cmd, orte_buffer_t);
|
||||
rc = orte_rmgr_base_pack_signal_proc_cmd(&cmd, proc_name, signal);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
return rc;
|
||||
}
|
||||
|
||||
if(0 > (rc = orte_rml.send_buffer(ORTE_RML_NAME_SEED, &cmd, ORTE_RML_TAG_RMGR_SVC, 0))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
return rc;
|
||||
}
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
|
||||
/* wait for response */
|
||||
OBJ_CONSTRUCT(&rsp, orte_buffer_t);
|
||||
if(0 > (rc = orte_rml.recv_buffer(ORTE_RML_NAME_SEED, &rsp, ORTE_RML_TAG_RMGR_CLNT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&rsp);
|
||||
return rc;
|
||||
}
|
||||
|
||||
rc = orte_rmgr_base_unpack_rsp(&rsp);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&rsp);
|
||||
return rc;
|
||||
}
|
||||
OBJ_DESTRUCT(&rsp);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static void orte_rmgr_proxy_wireup_stdin(orte_jobid_t jobid)
|
||||
{
|
||||
int rc;
|
||||
|
@ -126,6 +126,18 @@ typedef int (*orte_rmgr_base_module_terminate_job_fn_t)(orte_jobid_t job);
|
||||
*/
|
||||
typedef int (*orte_rmgr_base_module_terminate_proc_fn_t)(const orte_process_name_t* proc_name);
|
||||
|
||||
|
||||
/**
|
||||
* Transmit a signal to an entire job
|
||||
*/
|
||||
typedef int (*orte_rmgr_base_module_signal_job_fn_t)(orte_jobid_t job, int32_t signal);
|
||||
|
||||
/**
|
||||
* Transmit a signal to a specific process
|
||||
*/
|
||||
typedef int (*orte_rmgr_base_module_signal_proc_fn_t)(const orte_process_name_t* proc_name, int32_t signal);
|
||||
|
||||
|
||||
/*
|
||||
* Callback function for resource manager
|
||||
*/
|
||||
@ -193,6 +205,8 @@ struct orte_rmgr_base_module_1_0_0_t {
|
||||
orte_rmgr_base_module_launch_fn_t launch;
|
||||
orte_rmgr_base_module_terminate_job_fn_t terminate_job;
|
||||
orte_rmgr_base_module_terminate_proc_fn_t terminate_proc;
|
||||
orte_rmgr_base_module_signal_job_fn_t signal_job;
|
||||
orte_rmgr_base_module_signal_proc_fn_t signal_proc;
|
||||
orte_rmgr_base_module_spawn_fn_t spawn;
|
||||
orte_rmgr_base_module_proc_stage_gate_init_fn_t stage_gate_init;
|
||||
orte_rmgr_base_module_proc_stage_gate_mgr_fn_t stage_gate_mgr;
|
||||
|
@ -31,15 +31,17 @@ extern "C" {
|
||||
/*
|
||||
* Constants for command values
|
||||
*/
|
||||
#define ORTE_RMGR_CMD_QUERY 1
|
||||
#define ORTE_RMGR_CMD_CREATE 2
|
||||
#define ORTE_RMGR_CMD_ALLOCATE 3
|
||||
#define ORTE_RMGR_CMD_DEALLOCATE 4
|
||||
#define ORTE_RMGR_CMD_MAP 5
|
||||
#define ORTE_RMGR_CMD_LAUNCH 6
|
||||
#define ORTE_RMGR_CMD_TERM_JOB 7
|
||||
#define ORTE_RMGR_CMD_TERM_PROC 8
|
||||
#define ORTE_RMGR_CMD_SPAWN 9
|
||||
#define ORTE_RMGR_CMD_QUERY 1
|
||||
#define ORTE_RMGR_CMD_CREATE 2
|
||||
#define ORTE_RMGR_CMD_ALLOCATE 3
|
||||
#define ORTE_RMGR_CMD_DEALLOCATE 4
|
||||
#define ORTE_RMGR_CMD_MAP 5
|
||||
#define ORTE_RMGR_CMD_LAUNCH 6
|
||||
#define ORTE_RMGR_CMD_TERM_JOB 7
|
||||
#define ORTE_RMGR_CMD_TERM_PROC 8
|
||||
#define ORTE_RMGR_CMD_SPAWN 9
|
||||
#define ORTE_RMGR_CMD_SIGNAL_JOB 10
|
||||
#define ORTE_RMGR_CMD_SIGNAL_PROC 11
|
||||
|
||||
#define ORTE_RMGR_CMD ORTE_UINT32
|
||||
typedef uint32_t orte_rmgr_cmd_t;
|
||||
|
@ -70,6 +70,13 @@ static int orte_rmgr_urm_terminate_job(
|
||||
static int orte_rmgr_urm_terminate_proc(
|
||||
const orte_process_name_t* proc_name);
|
||||
|
||||
static int orte_rmgr_urm_signal_job(
|
||||
orte_jobid_t jobid, int32_t signal);
|
||||
|
||||
static int orte_rmgr_urm_signal_proc(
|
||||
const orte_process_name_t* proc_name,
|
||||
int32_t signal);
|
||||
|
||||
static int orte_rmgr_urm_spawn(
|
||||
orte_app_context_t** app_context,
|
||||
size_t num_context,
|
||||
@ -89,6 +96,8 @@ orte_rmgr_base_module_t orte_rmgr_urm_module = {
|
||||
orte_rmgr_urm_launch,
|
||||
orte_rmgr_urm_terminate_job,
|
||||
orte_rmgr_urm_terminate_proc,
|
||||
orte_rmgr_urm_signal_job,
|
||||
orte_rmgr_urm_signal_proc,
|
||||
orte_rmgr_urm_spawn,
|
||||
orte_rmgr_base_proc_stage_gate_init,
|
||||
orte_rmgr_base_proc_stage_gate_mgr,
|
||||
@ -216,7 +225,7 @@ static int orte_rmgr_urm_terminate_proc(const orte_process_name_t* proc_name)
|
||||
{
|
||||
OPAL_TRACE(1);
|
||||
|
||||
if ((0 == orte_ns.compare(ORTE_NS_CMP_ALL, proc_name,
|
||||
if ((0 == orte_ns.compare(ORTE_NS_CMP_ALL, proc_name,
|
||||
orte_process_info.my_name)) &&
|
||||
(orte_process_info.singleton)) {
|
||||
/* if we're trying to get ourselves killed and we're a
|
||||
@ -230,6 +239,45 @@ static int orte_rmgr_urm_terminate_proc(const orte_process_name_t* proc_name)
|
||||
}
|
||||
|
||||
|
||||
/*
 * URM implementation of signal_job.
 *
 * If this process is a singleton and the target job is its own job, there
 * is nothing to forward (the process already has the signal), so
 * ORTE_SUCCESS is returned. In every other case -- including when our own
 * jobid cannot be looked up -- the request is handed to the component's
 * pls module.
 */
static int orte_rmgr_urm_signal_job(orte_jobid_t jobid, int32_t signal)
{
    orte_jobid_t own_job;
    int lookup_rc;

    OPAL_TRACE(1);

    lookup_rc = orte_ns.get_jobid(&own_job, orte_process_info.my_name);

    /* own_job is only read once the lookup is known to have succeeded */
    if (ORTE_SUCCESS == lookup_rc &&
        orte_process_info.singleton &&
        jobid == own_job) {
        return ORTE_SUCCESS;
    }

    return mca_rmgr_urm_component.urm_pls->signal_job(jobid, signal);
}
|
||||
|
||||
static int orte_rmgr_urm_signal_proc(const orte_process_name_t* proc_name, int32_t signal)
|
||||
{
|
||||
OPAL_TRACE(1);
|
||||
|
||||
if ((0 == orte_ns.compare(ORTE_NS_CMP_ALL, proc_name,
|
||||
orte_process_info.my_name)) &&
|
||||
(orte_process_info.singleton)) {
|
||||
/** if we're trying to signal ourselves and we're a
|
||||
* singleton, calling signal_proc isn't going to work
|
||||
* properly -- there's no pls setup properly for us. Besides, we
|
||||
* already have the signal!
|
||||
*/
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
return mca_rmgr_urm_component.urm_pls->signal_proc(proc_name, signal);
|
||||
}
|
||||
|
||||
|
||||
static void orte_rmgr_urm_wireup_stdin(orte_jobid_t jobid)
|
||||
{
|
||||
int rc;
|
||||
|
14
orte/test/system/Makefile
Обычный файл
14
orte/test/system/Makefile
Обычный файл
@ -0,0 +1,14 @@
|
||||
# Builds the ORTE system-level test programs listed in PROGS.
PROGS = no_op mpi_no_op hello hello_null sigusr_trap

# "all" and "clean" name actions, not files; without .PHONY a stray file
# called "all" or "clean" would make the target look up to date.
.PHONY: all clean

all: $(PROGS)

CC = mpicc
CFLAGS = -g
CXX = mpic++
CXXFLAGS = -g
F77 = mpif77
FC = mpif77
FFLAGS = -g

# Recipe lines must begin with a tab character.
clean:
	rm -f $(PROGS) *~
|
23
orte/test/system/hello.c
Обычный файл
23
orte/test/system/hello.c
Обычный файл
@ -0,0 +1,23 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* The most basic of MPI applications
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "mpi.h"
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
int rank, size;
|
||||
|
||||
MPI_Init(&argc, &argv);
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &size);
|
||||
|
||||
printf("Hello, World, I am %d of %d\n", rank, size);
|
||||
|
||||
MPI_Finalize();
|
||||
return 0;
|
||||
}
|
23
orte/test/system/hello_null.c
Обычный файл
23
orte/test/system/hello_null.c
Обычный файл
@ -0,0 +1,23 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* A basic MPI application that passes NULL arguments to MPI_Init
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "mpi.h"
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
int rank, size;
|
||||
|
||||
MPI_Init(NULL, NULL);
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &size);
|
||||
|
||||
printf("Hello, World, I am %d of %d\n", rank, size);
|
||||
|
||||
MPI_Finalize();
|
||||
return 0;
|
||||
}
|
17
orte/test/system/mpi_no_op.c
Обычный файл
17
orte/test/system/mpi_no_op.c
Обычный файл
@ -0,0 +1,17 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* An MPI application that only initializes and finalizes MPI
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "mpi.h"
|
||||
|
||||
/* Bring MPI up and immediately take it down again; no other work is done. */
int main(int argc, char* argv[])
{
    MPI_Init(&argc, &argv);
    MPI_Finalize();

    return 0;
}
|
13
orte/test/system/no_op.c
Обычный файл
13
orte/test/system/no_op.c
Обычный файл
@ -0,0 +1,13 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* The most basic of applications
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
/* Do nothing and exit successfully; the command line is ignored. */
int main(int argc, char* argv[])
{
    (void) argc;
    (void) argv;

    return 0;
}
|
87
orte/test/system/sigusr_trap.c
Обычный файл
87
orte/test/system/sigusr_trap.c
Обычный файл
@ -0,0 +1,87 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* A test to trap user signals
|
||||
*/
|
||||
|
||||
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <unistd.h>

#include "orte/runtime/runtime.h"
|
||||
|
||||
/*
 * Signal handler that reports which user signal arrived.
 *
 * Writes "Trapped SIGUSR1" or "Trapped SIGUSR2" to standard error; any
 * other signal number is reported as "Undefined signal <n> trapped".
 *
 * Output goes through write() rather than fprintf(): stdio functions are
 * not async-signal-safe, so calling them from a handler can corrupt or
 * deadlock the stream if the signal interrupts another stdio call.
 */
void sigusr_handler(int signum)
{
    static const char usr1_msg[] = "Trapped SIGUSR1\n";
    static const char usr2_msg[] = "Trapped SIGUSR2\n";
    static const char prefix[] = "Undefined signal ";
    static const char suffix[] = " trapped\n";

    switch (signum) {
    case SIGUSR1:
        (void) write(STDERR_FILENO, usr1_msg, sizeof(usr1_msg) - 1);
        break;

    case SIGUSR2:
        (void) write(STDERR_FILENO, usr2_msg, sizeof(usr2_msg) - 1);
        break;

    default: {
        /* Format the number by hand; snprintf is not async-signal-safe. */
        char digits[16];
        size_t pos = sizeof(digits);
        unsigned int value = (signum < 0) ? 0u - (unsigned int) signum
                                          : (unsigned int) signum;

        do {
            digits[--pos] = (char) ('0' + (value % 10u));
            value /= 10u;
        } while (0u != value);
        if (signum < 0) {
            digits[--pos] = '-';
        }

        (void) write(STDERR_FILENO, prefix, sizeof(prefix) - 1);
        (void) write(STDERR_FILENO, digits + pos, sizeof(digits) - pos);
        (void) write(STDERR_FILENO, suffix, sizeof(suffix) - 1);
        break;
    }
    }
}
|
||||
|
||||
void exit_handler(int signum)
|
||||
{
|
||||
int rc;
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_finalize())) {
|
||||
fprintf(stderr, "couldn't complete finalize - error code %d\n", rc);
|
||||
exit(1);
|
||||
}
|
||||
exit(0);
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
|
||||
int rc;
|
||||
int i;
|
||||
double pi;
|
||||
|
||||
if (signal(SIGUSR1, sigusr_handler) == SIG_IGN) {
|
||||
fprintf(stderr, "Could not setup signal trap for SIGUSR1\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (signal(SIGUSR2, sigusr_handler) == SIG_IGN) {
|
||||
fprintf(stderr, "Could not setup signal trap for SIGUSR2\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (signal(SIGINT, exit_handler) == SIG_IGN) {
|
||||
fprintf(stderr, "Could not setup signal trap for SIGINT\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (signal(SIGHUP, exit_handler) == SIG_IGN) {
|
||||
fprintf(stderr, "Could not setup signal trap for SIGHUP\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (signal(SIGTERM, exit_handler) == SIG_IGN) {
|
||||
fprintf(stderr, "Could not setup signal trap for SIGTERM\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_init(true))) {
|
||||
fprintf(stderr, "couldn't complete init - error code %d\n", rc);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
i = 0;
|
||||
while (1) {
|
||||
i++;
|
||||
pi = i / 3.14159256;
|
||||
if (i > 100) i = 0;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
@ -80,11 +80,17 @@ extern char** environ;
|
||||
*/
|
||||
static struct opal_event term_handler;
|
||||
static struct opal_event int_handler;
|
||||
static struct opal_event sigusr1_handler;
|
||||
static struct opal_event sigusr2_handler;
|
||||
static orte_jobid_t jobid = ORTE_JOBID_MAX;
|
||||
static orte_pointer_array_t *apps_pa;
|
||||
static bool wait_for_job_completion = true;
|
||||
static char *abort_msg = NULL;
|
||||
static size_t abort_msg_len = 0;
|
||||
static char *sigusr1_msg = NULL;
|
||||
static size_t sigusr1_msg_len = 0;
|
||||
static char *sigusr2_msg = NULL;
|
||||
static size_t sigusr2_msg_len = 0;
|
||||
static char *orterun_basename = NULL;
|
||||
static int max_display_aborted = 1;
|
||||
static int num_aborted = 0;
|
||||
@ -245,7 +251,9 @@ opal_cmd_line_init_t cmd_line_init[] = {
|
||||
* Local functions
|
||||
*/
|
||||
static void exit_callback(int fd, short event, void *arg);
|
||||
static void signal_callback(int fd, short flags, void *arg);
|
||||
static void abort_signal_callback(int fd, short flags, void *arg);
|
||||
static void sigusr1_callback(int fd, short flags, void *arg);
|
||||
static void sigusr2_callback(int fd, short flags, void *arg);
|
||||
static int create_app(int argc, char* argv[], orte_app_context_t **app,
|
||||
bool *made_app, char ***app_env);
|
||||
static int init_globals(void);
|
||||
@ -273,6 +281,12 @@ int orterun(int argc, char *argv[])
|
||||
asprintf(&abort_msg, "%s: killing job...\n", orterun_basename);
|
||||
abort_msg_len = strlen(abort_msg);
|
||||
|
||||
/** Setup the user signal message (for use in the signal handler) */
|
||||
asprintf(&sigusr1_msg, "%s: received SIGUSR1 signal\n", orterun_basename);
|
||||
sigusr1_msg_len = strlen(sigusr1_msg);
|
||||
asprintf(&sigusr2_msg, "%s: received SIGUSR2 signal\n", orterun_basename);
|
||||
sigusr2_msg_len = strlen(sigusr2_msg);
|
||||
|
||||
/* Check for some "global" command line params */
|
||||
|
||||
parse_globals(argc, argv);
|
||||
@ -389,13 +403,22 @@ int orterun(int argc, char *argv[])
|
||||
|
||||
/* Prep to start the application */
|
||||
|
||||
/** setup callbacks for abort signals */
|
||||
opal_signal_set(&term_handler, SIGTERM,
|
||||
signal_callback, NULL);
|
||||
abort_signal_callback, NULL);
|
||||
opal_signal_add(&term_handler, NULL);
|
||||
opal_signal_set(&int_handler, SIGINT,
|
||||
signal_callback, NULL);
|
||||
abort_signal_callback, NULL);
|
||||
opal_signal_add(&int_handler, NULL);
|
||||
|
||||
/** setup callbacks for user signals */
|
||||
opal_signal_set(&sigusr1_handler, SIGUSR1,
|
||||
sigusr1_callback, NULL);
|
||||
opal_signal_add(&sigusr1_handler, NULL);
|
||||
opal_signal_set(&sigusr2_handler, SIGUSR2,
|
||||
sigusr2_callback, NULL);
|
||||
opal_signal_add(&sigusr2_handler, NULL);
|
||||
|
||||
orte_totalview_init_before_spawn();
|
||||
|
||||
/* Spawn the job */
|
||||
@ -661,6 +684,10 @@ static void exit_callback(int fd, short event, void *arg)
|
||||
opal_signal_del(&term_handler);
|
||||
opal_signal_del(&int_handler);
|
||||
|
||||
/** Remove the USR signal handlers */
|
||||
opal_signal_del(&sigusr1_handler);
|
||||
opal_signal_del(&sigusr2_handler);
|
||||
|
||||
/* Trigger the normal exit conditions */
|
||||
|
||||
orterun_globals.exit = true;
|
||||
@ -674,7 +701,7 @@ static void exit_callback(int fd, short event, void *arg)
|
||||
* the job has been aborted.
|
||||
*/
|
||||
|
||||
static void signal_callback(int fd, short flags, void *arg)
|
||||
static void abort_signal_callback(int fd, short flags, void *arg)
|
||||
{
|
||||
int ret;
|
||||
struct timeval tv = { 5, 0 };
|
||||
@ -703,6 +730,53 @@ static void signal_callback(int fd, short flags, void *arg)
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Pass user signals to the remote application processes
|
||||
*/
|
||||
|
||||
static void sigusr1_callback(int fd, short flags, void *arg)
|
||||
{
|
||||
int ret;
|
||||
static int signalled = 0;
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
if (0 != signalled++) { /** protect against multiple entry */
|
||||
return;
|
||||
}
|
||||
|
||||
write (2, sigusr1_msg, sigusr1_msg_len);
|
||||
|
||||
/** send the signal out to the processes */
|
||||
|
||||
if (ORTE_SUCCESS != (ret = orte_rmgr.signal_job(jobid, SIGUSR1))) {
|
||||
fprintf(stderr, "SIGUSR1 could not be sent to the job\n");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static void sigusr2_callback(int fd, short flags, void *arg)
|
||||
{
|
||||
int ret;
|
||||
static int signalled = 0;
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
if (0 != signalled++) { /** protect against multiple entry */
|
||||
return;
|
||||
}
|
||||
|
||||
write (2, sigusr2_msg, sigusr2_msg_len);
|
||||
|
||||
/** send the signal out to the processes */
|
||||
|
||||
if (ORTE_SUCCESS != (ret = orte_rmgr.signal_job(jobid, SIGUSR2))) {
|
||||
fprintf(stderr, "SIGUSR2 could not be sent to the job\n");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
static int init_globals(void)
|
||||
{
|
||||
struct globals_t tmp = {
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user