Bring over the rest of the ctrl-c fixes. This commit includes:
1. Add a "cancel_operation" API to the pls components that allows orterun to demand that an orted operation (e.g., terminate_job) be immediately cancelled and abandoned.
2. Change the pls orted commands from blocking to non-blocking. This allows us to interrupt those operations should an orted be non-responsive. The change also adds an orte_abort_timeout that limits how long orterun will automatically wait for the orteds to respond - if the terminate command, for example, doesn't see an orted response within that time, then we print out an appropriate error message and just give up.
3. Modify orterun to allow multiple ctrl-c's to simply abort the program even if the orteds have not responded.
4. Do some cleanup on the orte-level MCA params so that their implementation looks a lot more like that of ompi, which makes it easier to maintain. This change also includes the definition of an orte_abort_timeout struct and associated MCA param (can't have too many!) so you can set the time after which orterun gives up on waiting for the orteds to respond.
This needs more testing before migrating to 1.2.
This commit was SVN r13304.
Этот коммит содержится в:
родитель
7b6ed64c7b
Коммит
ab5ea61100
@ -28,6 +28,7 @@
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/params.h"
|
||||
#include "orte/mca/ns/ns_types.h"
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
#include "orte/mca/pls/pls.h"
|
||||
@ -85,9 +86,8 @@ int orte_errmgr_hnp_proc_aborted(orte_gpr_notify_message_t *msg)
|
||||
/* tell the pls to terminate the job AND ALL ITS DESCENDANTS */
|
||||
OBJ_CONSTRUCT(&attrs, opal_list_t);
|
||||
orte_rmgr.add_attribute(&attrs, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);
|
||||
if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job, &attrs))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job, &orte_abort_timeout, &attrs))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item);
|
||||
OBJ_DESTRUCT(&attrs);
|
||||
@ -152,7 +152,7 @@ int orte_errmgr_hnp_incomplete_start(orte_gpr_notify_message_t *msg)
|
||||
/* tell the pls to terminate the job - just kill this job, not any descendants since
|
||||
* the job is just trying to start
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job, NULL))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job, &orte_abort_timeout, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
|
@ -20,6 +20,11 @@
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
|
||||
#include "opal/event/event.h"
|
||||
#include "opal/threads/condition.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/argv.h"
|
||||
@ -36,6 +41,25 @@
|
||||
#include "orte/mca/pls/base/pls_private.h"
|
||||
|
||||
static orte_std_cntr_t orted_cmd_num_active;
|
||||
static int completion_status;
|
||||
|
||||
static void orte_pls_base_orted_default_wakeup(int fd, short event, void *arg)
|
||||
{
|
||||
/* protect for threads */
|
||||
OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);
|
||||
|
||||
/* cancel the receive - we didn't get everyone's response in time */
|
||||
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED_ACK);
|
||||
|
||||
/* set the completion status to reflect timeout error */
|
||||
completion_status = ORTE_ERR_TIMEOUT;
|
||||
|
||||
/* declare us "done" so we can exit cleanly */
|
||||
opal_condition_signal(&orte_pls_base.orted_cmd_cond);
|
||||
|
||||
/* unlock us */
|
||||
OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
|
||||
}
|
||||
|
||||
static void orte_pls_base_orted_send_cb(int status,
|
||||
orte_process_name_t* peer,
|
||||
@ -74,13 +98,34 @@ static void orte_pls_base_cmd_ack(int status, orte_process_name_t* sender,
|
||||
}
|
||||
|
||||
|
||||
int orte_pls_base_orted_exit(opal_list_t *daemons)
|
||||
int orte_pls_base_orted_cancel_operation(void)
|
||||
{
|
||||
/* protect for threads */
|
||||
OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);
|
||||
|
||||
/* cancel any waiting receive - we don't want to hear it */
|
||||
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED_ACK);
|
||||
|
||||
/* set the completion status to reflect cancellation */
|
||||
completion_status = ORTE_ERR_INTERUPTED;
|
||||
|
||||
/* declare us "done" so we can exit cleanly */
|
||||
opal_condition_signal(&orte_pls_base.orted_cmd_cond);
|
||||
|
||||
/* unlock us */
|
||||
OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_pls_base_orted_exit(opal_list_t *daemons, struct timeval *timeout)
|
||||
{
|
||||
int rc;
|
||||
orte_buffer_t cmd;
|
||||
orte_daemon_cmd_flag_t command=ORTE_DAEMON_EXIT_CMD;
|
||||
opal_list_item_t *item;
|
||||
orte_pls_daemon_info_t *dmn;
|
||||
opal_event_t* event = NULL;
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
@ -89,7 +134,8 @@ int orte_pls_base_orted_exit(opal_list_t *daemons)
|
||||
/* pack the command */
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &command, 1, ORTE_DAEMON_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* send the commands as fast as we can */
|
||||
@ -106,7 +152,8 @@ int orte_pls_base_orted_exit(opal_list_t *daemons)
|
||||
}
|
||||
orted_cmd_num_active++;
|
||||
}
|
||||
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
|
||||
/* post the receive for the ack's */
|
||||
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED_ACK,
|
||||
ORTE_RML_NON_PERSISTENT, orte_pls_base_cmd_ack, NULL);
|
||||
@ -115,29 +162,51 @@ int orte_pls_base_orted_exit(opal_list_t *daemons)
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* define the default completion status */
|
||||
completion_status = ORTE_SUCCESS;
|
||||
|
||||
/* wait for all commands to have been ack'd */
|
||||
OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);
|
||||
if (orted_cmd_num_active > 0) {
|
||||
/* setup a delay to give the orteds time to complete their departure - wake us up if they
|
||||
* don't exit by the prescribed time
|
||||
*/
|
||||
if (NULL != timeout && /* only do this if the user gave us a time to wait */
|
||||
NULL != (event = (opal_event_t*)malloc(sizeof(opal_event_t)))) {
|
||||
opal_evtimer_set(event, orte_pls_base_orted_default_wakeup, NULL);
|
||||
opal_evtimer_add(event, timeout);
|
||||
}
|
||||
|
||||
/* now go to sleep until woken up */
|
||||
opal_condition_wait(&orte_pls_base.orted_cmd_cond, &orte_pls_base.orted_cmd_lock);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
|
||||
|
||||
CLEANUP:
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
/* log an error if one occurred */
|
||||
if (ORTE_SUCCESS != completion_status) {
|
||||
ORTE_ERROR_LOG(completion_status);
|
||||
}
|
||||
|
||||
/* if started, kill the timer event so it doesn't hit us later */
|
||||
if (NULL != event) {
|
||||
opal_evtimer_del(event);
|
||||
free(event);
|
||||
}
|
||||
|
||||
/* we're done! */
|
||||
return ORTE_SUCCESS;
|
||||
return completion_status;
|
||||
}
|
||||
|
||||
|
||||
int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job)
|
||||
int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job, struct timeval *timeout)
|
||||
{
|
||||
int rc;
|
||||
orte_buffer_t cmd;
|
||||
orte_daemon_cmd_flag_t command=ORTE_DAEMON_KILL_LOCAL_PROCS;
|
||||
opal_list_item_t *item;
|
||||
orte_pls_daemon_info_t *dmn;
|
||||
|
||||
opal_event_t* event = NULL;
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
OBJ_CONSTRUCT(&cmd, orte_buffer_t);
|
||||
@ -145,13 +214,15 @@ int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job)
|
||||
/* pack the command */
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &command, 1, ORTE_DAEMON_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the jobid */
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &job, 1, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* send the commands as fast as we can */
|
||||
@ -169,6 +240,7 @@ int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job)
|
||||
|
||||
orted_cmd_num_active++;
|
||||
}
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
|
||||
/* post the receive for the ack's */
|
||||
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED_ACK,
|
||||
@ -178,19 +250,38 @@ int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job)
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* define the default completion status */
|
||||
completion_status = ORTE_SUCCESS;
|
||||
|
||||
/* wait for all commands to have been received */
|
||||
OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);
|
||||
if (orted_cmd_num_active > 0) {
|
||||
/* setup a delay to give the orteds time to complete their departure - wake us up if they
|
||||
* don't exit by the prescribed time
|
||||
*/
|
||||
if (NULL != timeout && /* only do this if the user gave us a time to wait */
|
||||
NULL != (event = (opal_event_t*)malloc(sizeof(opal_event_t)))) {
|
||||
opal_evtimer_set(event, orte_pls_base_orted_default_wakeup, NULL);
|
||||
opal_evtimer_add(event, timeout);
|
||||
}
|
||||
/* now go to sleep until woken up */
|
||||
opal_condition_wait(&orte_pls_base.orted_cmd_cond, &orte_pls_base.orted_cmd_lock);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
|
||||
|
||||
|
||||
CLEANUP:
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
/* log an error if one occurred */
|
||||
if (ORTE_SUCCESS != completion_status) {
|
||||
ORTE_ERROR_LOG(completion_status);
|
||||
}
|
||||
|
||||
/* if started, kill the timer event so it doesn't hit us later */
|
||||
if (NULL != event) {
|
||||
opal_evtimer_del(event);
|
||||
free(event);
|
||||
}
|
||||
|
||||
/* we're done! */
|
||||
return ORTE_SUCCESS;
|
||||
return completion_status;
|
||||
}
|
||||
|
||||
|
||||
|
@ -99,6 +99,8 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender,
|
||||
int32_t signal;
|
||||
opal_list_t attrs;
|
||||
opal_list_item_t *item;
|
||||
struct timeval timeout;
|
||||
int32_t secs, microsecs;
|
||||
int rc;
|
||||
|
||||
count = 1;
|
||||
@ -127,12 +129,14 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender,
|
||||
break;
|
||||
|
||||
case ORTE_PLS_TERMINATE_JOB_CMD:
|
||||
/* get the jobid to be terminated */
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &job, &count, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto SEND_ANSWER;
|
||||
}
|
||||
|
||||
/* get any attributes */
|
||||
OBJ_CONSTRUCT(&attrs, opal_list_t);
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &attrs, &count, ORTE_ATTR_LIST))) {
|
||||
@ -140,22 +144,39 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender,
|
||||
goto SEND_ANSWER;
|
||||
}
|
||||
|
||||
/* get the timeout - packed as two separate int32's */
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &secs, &count, ORTE_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto SEND_ANSWER;
|
||||
}
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &microsecs, &count, ORTE_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto SEND_ANSWER;
|
||||
}
|
||||
timeout.tv_sec = secs;
|
||||
timeout.tv_usec = microsecs;
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job, &attrs))) {
|
||||
/* issue the command */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job, &timeout, &attrs))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
/* cleanup attribute list */
|
||||
while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item);
|
||||
OBJ_DESTRUCT(&attrs);
|
||||
break;
|
||||
|
||||
case ORTE_PLS_TERMINATE_ORTEDS_CMD:
|
||||
/* get the jobid whose daemons are to be terminated */
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &job, &count, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto SEND_ANSWER;
|
||||
}
|
||||
|
||||
/* get any attributes */
|
||||
OBJ_CONSTRUCT(&attrs, opal_list_t);
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &attrs, &count, ORTE_ATTR_LIST))) {
|
||||
@ -163,10 +184,26 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender,
|
||||
goto SEND_ANSWER;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_pls.terminate_orteds(job, &attrs))) {
|
||||
/* get the timeout - packed as two separate int32's */
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &secs, &count, ORTE_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto SEND_ANSWER;
|
||||
}
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &microsecs, &count, ORTE_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto SEND_ANSWER;
|
||||
}
|
||||
timeout.tv_sec = secs;
|
||||
timeout.tv_usec = microsecs;
|
||||
|
||||
/* issue the command */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls.terminate_orteds(job, &timeout, &attrs))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
/* cleanup attribute list */
|
||||
while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item);
|
||||
OBJ_DESTRUCT(&attrs);
|
||||
break;
|
||||
@ -229,6 +266,13 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender,
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_PLS_CANCEL_OPERATION_CMD:
|
||||
/* issue the command */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls.cancel_operation())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
|
||||
}
|
||||
|
@ -26,6 +26,10 @@
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
|
||||
#include "opal/class/opal_list.h"
|
||||
|
||||
#include "orte/dss/dss_types.h"
|
||||
@ -41,18 +45,19 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
* pls proxy commands
|
||||
*/
|
||||
typedef uint8_t orte_pls_cmd_flag_t;
|
||||
#define ORTE_PLS_CMD ORTE_UINT8
|
||||
#define ORTE_PLS_LAUNCH_JOB_CMD 1
|
||||
#define ORTE_PLS_TERMINATE_JOB_CMD 2
|
||||
#define ORTE_PLS_TERMINATE_PROC_CMD 3
|
||||
#define ORTE_PLS_SIGNAL_JOB_CMD 4
|
||||
#define ORTE_PLS_SIGNAL_PROC_CMD 5
|
||||
#define ORTE_PLS_TERMINATE_ORTEDS_CMD 6
|
||||
|
||||
/*
|
||||
* pls proxy commands
|
||||
*/
|
||||
typedef uint8_t orte_pls_cmd_flag_t;
|
||||
#define ORTE_PLS_CMD ORTE_UINT8
|
||||
#define ORTE_PLS_LAUNCH_JOB_CMD 1
|
||||
#define ORTE_PLS_TERMINATE_JOB_CMD 2
|
||||
#define ORTE_PLS_TERMINATE_PROC_CMD 3
|
||||
#define ORTE_PLS_SIGNAL_JOB_CMD 4
|
||||
#define ORTE_PLS_SIGNAL_PROC_CMD 5
|
||||
#define ORTE_PLS_TERMINATE_ORTEDS_CMD 6
|
||||
#define ORTE_PLS_CANCEL_OPERATION_CMD 7
|
||||
|
||||
/*
|
||||
* object for daemon information
|
||||
*/
|
||||
@ -75,9 +80,10 @@ extern "C" {
|
||||
/**
|
||||
* Utilities for pls components that use proxy daemons
|
||||
*/
|
||||
ORTE_DECLSPEC int orte_pls_base_orted_exit(opal_list_t *daemons);
|
||||
ORTE_DECLSPEC int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job);
|
||||
ORTE_DECLSPEC int orte_pls_base_orted_signal_local_procs(opal_list_t *daemons, int32_t signal);
|
||||
int orte_pls_base_orted_cancel_operation(void);
|
||||
int orte_pls_base_orted_exit(opal_list_t *daemons, struct timeval *timeout);
|
||||
int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job, struct timeval *timeout);
|
||||
int orte_pls_base_orted_signal_local_procs(opal_list_t *daemons, int32_t signal);
|
||||
int orte_pls_base_orted_add_local_procs(opal_list_t *dmnlist, orte_gpr_notify_data_t *ndat);
|
||||
|
||||
ORTE_DECLSPEC int orte_pls_base_get_active_daemons(opal_list_t *daemons, orte_jobid_t job, opal_list_t *attrs);
|
||||
|
@ -40,6 +40,9 @@
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
|
||||
#include "opal/install_dirs.h"
|
||||
#include "opal/class/opal_list.h"
|
||||
@ -69,6 +72,7 @@
|
||||
#include "orte/mca/smr/smr.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/params.h"
|
||||
|
||||
#include "orte/mca/pls/base/pls_private.h"
|
||||
#include "pls_bproc.h"
|
||||
@ -513,6 +517,14 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
|
||||
/* setup the daemon environment */
|
||||
orte_pls_bproc_setup_env(envp);
|
||||
|
||||
/* direct the daemons to drop contact files so the local procs
|
||||
* can learn how to contact them - this is used for routing
|
||||
* OOB messaging
|
||||
*/
|
||||
var = mca_base_param_environ_variable("odls","base","drop_contact_file");
|
||||
opal_setenv(var,"1", true, envp);
|
||||
free(var);
|
||||
|
||||
/* daemons calculate their process name using a "stride" of one, so
|
||||
* push that value into their environment */
|
||||
stride = 1;
|
||||
@ -704,7 +716,7 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
|
||||
}
|
||||
rc = ORTE_ERROR;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
orte_pls_bproc_terminate_job(map->job, NULL);
|
||||
orte_pls_bproc_terminate_job(map->job, &orte_abort_timeout, NULL);
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
@ -767,10 +779,10 @@ orte_pls_bproc_node_failed(orte_gpr_notify_message_t *msg)
|
||||
orte_schema.extract_jobid_from_std_trigger_name(&job, msg->target);
|
||||
|
||||
/* terminate all jobs in the job family */
|
||||
orte_pls_bproc_terminate_job(job, NULL);
|
||||
orte_pls_bproc_terminate_job(job, &orte_abort_timeout, NULL);
|
||||
|
||||
/* kill the daemons */
|
||||
orte_pls_bproc_terminate_job(0, NULL);
|
||||
orte_pls_bproc_terminate_job(0, &orte_abort_timeout, NULL);
|
||||
|
||||
/* shouldn't ever get here.. */
|
||||
exit(1);
|
||||
@ -1159,7 +1171,7 @@ cleanup:
|
||||
|
||||
/**
|
||||
* Terminate all processes associated with this job */
|
||||
int orte_pls_bproc_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) {
|
||||
int orte_pls_bproc_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) {
|
||||
pid_t* pids;
|
||||
orte_std_cntr_t i, num_pids;
|
||||
int rc;
|
||||
@ -1189,7 +1201,7 @@ int orte_pls_bproc_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) {
|
||||
/**
|
||||
* Terminate the orteds for a given job
|
||||
*/
|
||||
int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
|
||||
{
|
||||
int rc;
|
||||
opal_list_t daemons;
|
||||
@ -1205,7 +1217,7 @@ int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
}
|
||||
|
||||
/* now tell them to die! */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons, timeout))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
@ -1295,6 +1307,23 @@ int orte_pls_bproc_signal_proc(const orte_process_name_t* proc_name, int32_t sig
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Cancel an operation involving comm to an orted
|
||||
*/
|
||||
int orte_pls_bproc_cancel_operation(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Module cleanup
|
||||
*/
|
||||
|
@ -42,6 +42,9 @@
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#include <sys/bproc.h>
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
|
||||
#include "opal/threads/condition.h"
|
||||
|
||||
@ -72,11 +75,12 @@ int orte_pls_bproc_finalize(void);
|
||||
* Interface
|
||||
*/
|
||||
int orte_pls_bproc_launch(orte_jobid_t);
|
||||
int orte_pls_bproc_terminate_job(orte_jobid_t, opal_list_t*);
|
||||
int orte_pls_bproc_terminate_job(orte_jobid_t, struct timeval *timeout, opal_list_t*);
|
||||
int orte_pls_bproc_terminate_proc(const orte_process_name_t* proc_name);
|
||||
int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid, opal_list_t*);
|
||||
int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t*);
|
||||
int orte_pls_bproc_signal_job(orte_jobid_t, int32_t, opal_list_t*);
|
||||
int orte_pls_bproc_signal_proc(const orte_process_name_t* proc_name, int32_t);
|
||||
int orte_pls_bproc_cancel_operation(void);
|
||||
|
||||
/* Utility routine to get/set process pid */
|
||||
ORTE_DECLSPEC int orte_pls_bproc_set_proc_pid(const orte_process_name_t*, pid_t, int);
|
||||
|
@ -26,6 +26,10 @@
|
||||
#ifdef HAVE_SIGNAL_H
|
||||
#include <signal.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_CNOS_PM_BARRIER
|
||||
#include <catamount/cnos_mpi_os.h>
|
||||
#endif
|
||||
@ -38,12 +42,13 @@
|
||||
|
||||
|
||||
static int orte_pls_cnos_launch_job(orte_jobid_t jobid);
|
||||
static int orte_pls_cnos_terminate_job(orte_jobid_t jobid, opal_list_t *attrs);
|
||||
static int orte_pls_cnos_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs);
|
||||
static int orte_pls_cnos_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
|
||||
static int orte_pls_cnos_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
|
||||
static int orte_pls_cnos_terminate_proc(const orte_process_name_t* proc_name);
|
||||
static int orte_pls_cnos_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs);
|
||||
static int orte_pls_cnos_signal_proc(const orte_process_name_t* proc_name, int32_t signal);
|
||||
static int orte_pls_cnos_finalize(void);
|
||||
static int orte_pls_cnos_cancel_operation(void);
|
||||
|
||||
|
||||
orte_pls_base_module_t orte_pls_cnos_module = {
|
||||
@ -53,6 +58,7 @@ orte_pls_base_module_t orte_pls_cnos_module = {
|
||||
orte_pls_cnos_terminate_proc,
|
||||
orte_pls_cnos_signal_job,
|
||||
orte_pls_cnos_signal_proc,
|
||||
orte_pls_cnos_cancel_operation,
|
||||
orte_pls_cnos_finalize
|
||||
};
|
||||
|
||||
@ -68,7 +74,7 @@ static int orte_pls_cnos_launch_job(orte_jobid_t jobid)
|
||||
extern int killrank(rank_t RANK, int SIG);
|
||||
#endif
|
||||
|
||||
static int orte_pls_cnos_terminate_job(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
static int orte_pls_cnos_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
|
||||
{
|
||||
orte_jobid_t my_jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
|
||||
@ -85,7 +91,7 @@ static int orte_pls_cnos_terminate_job(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
}
|
||||
|
||||
|
||||
static int orte_pls_cnos_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
static int orte_pls_cnos_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
|
||||
{
|
||||
orte_jobid_t my_jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
|
||||
@ -131,6 +137,11 @@ static int orte_pls_cnos_signal_proc(const orte_process_name_t* proc_name, int32
|
||||
return ORTE_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
int orte_pls_rsh_cancel_operation(void)
|
||||
{
|
||||
return ORTE_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
static int orte_pls_cnos_finalize(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
|
@ -90,6 +90,11 @@
|
||||
#define ORTE_PLS_GRIDENGINE_EXPORT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
|
||||
#include "orte/mca/pls/pls.h"
|
||||
#include "opal/mca/mca.h"
|
||||
|
||||
@ -113,11 +118,12 @@ int orte_pls_gridengine_finalize(void);
|
||||
* Interface
|
||||
*/
|
||||
int orte_pls_gridengine_launch_job(orte_jobid_t);
|
||||
int orte_pls_gridengine_terminate_job(orte_jobid_t, opal_list_t *attrs);
|
||||
int orte_pls_gridengine_terminate_orteds(orte_jobid_t, opal_list_t *attrs);
|
||||
int orte_pls_gridengine_terminate_job(orte_jobid_t, struct timeval *timeout, opal_list_t *attrs);
|
||||
int orte_pls_gridengine_terminate_orteds(orte_jobid_t, struct timeval *timeout, opal_list_t *attrs);
|
||||
int orte_pls_gridengine_terminate_proc(const orte_process_name_t*);
|
||||
int orte_pls_gridengine_signal_job(orte_jobid_t, int32_t, opal_list_t *attrs);
|
||||
int orte_pls_gridengine_signal_proc(const orte_process_name_t*, int32_t);
|
||||
int orte_pls_gridengine_cancel_operation(void);
|
||||
|
||||
/**
|
||||
* PLS Component
|
||||
|
@ -100,6 +100,7 @@ orte_pls_base_module_t orte_pls_gridengine_module = {
|
||||
orte_pls_gridengine_terminate_proc,
|
||||
orte_pls_gridengine_signal_job,
|
||||
orte_pls_gridengine_signal_proc,
|
||||
orte_pls_gridengine_cancel_operation,
|
||||
orte_pls_gridengine_finalize
|
||||
};
|
||||
|
||||
@ -774,7 +775,7 @@ static int update_slot_keyval(orte_ras_node_t* ras_node, int* slot_cnt)
|
||||
/**
|
||||
* Query the registry for all nodes participating in the job
|
||||
*/
|
||||
int orte_pls_gridengine_terminate_job(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
int orte_pls_gridengine_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
|
||||
{
|
||||
int rc;
|
||||
opal_list_t daemons;
|
||||
@ -788,7 +789,7 @@ int orte_pls_gridengine_terminate_job(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
}
|
||||
|
||||
/* order them to kill their local procs for this job */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid, timeout))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
@ -809,7 +810,7 @@ int orte_pls_gridengine_terminate_proc(const orte_process_name_t* proc)
|
||||
/**
|
||||
* Terminate the orteds for a given job
|
||||
*/
|
||||
int orte_pls_gridengine_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
int orte_pls_gridengine_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
|
||||
{
|
||||
int rc;
|
||||
opal_list_t daemons;
|
||||
@ -823,7 +824,7 @@ int orte_pls_gridengine_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
}
|
||||
|
||||
/* now tell them to die! */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons, timeout))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
@ -872,6 +873,21 @@ int orte_pls_gridengine_signal_proc(const orte_process_name_t* proc, int32_t sig
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
/**
|
||||
* Cancel an operation involving comm to an orted
|
||||
*/
|
||||
int orte_pls_gridengine_cancel_operation(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Finalize
|
||||
*/
|
||||
|
@ -202,12 +202,12 @@ typedef int (*orte_pls_base_module_launch_job_fn_t)(orte_jobid_t);
|
||||
* Terminate any processes launched for the respective jobid by
|
||||
* this component.
|
||||
*/
|
||||
typedef int (*orte_pls_base_module_terminate_job_fn_t)(orte_jobid_t, opal_list_t *attrs);
|
||||
typedef int (*orte_pls_base_module_terminate_job_fn_t)(orte_jobid_t, struct timeval *timeout, opal_list_t *attrs);
|
||||
|
||||
/**
|
||||
* Terminate the daemons associated with this jobid
|
||||
*/
|
||||
typedef int (*orte_pls_base_module_terminate_orteds_fn_t)(orte_jobid_t, opal_list_t *attrs);
|
||||
typedef int (*orte_pls_base_module_terminate_orteds_fn_t)(orte_jobid_t, struct timeval *timeout, opal_list_t *attrs);
|
||||
|
||||
/**
|
||||
* Terminate a specific process.
|
||||
@ -225,6 +225,11 @@ typedef int (*orte_pls_base_module_signal_job_fn_t)(orte_jobid_t, int32_t, opal_
|
||||
*/
|
||||
typedef int (*orte_pls_base_module_signal_proc_fn_t)(const orte_process_name_t*, int32_t);
|
||||
|
||||
/**
|
||||
* Cancel an ongoing operation involving communication to the orteds
|
||||
*/
|
||||
typedef int (*orte_pls_base_module_cancel_operation_fn_t)(void);
|
||||
|
||||
/**
|
||||
* Cleanup all resources held by the module
|
||||
*/
|
||||
@ -240,6 +245,7 @@ struct orte_pls_base_module_1_3_0_t {
|
||||
orte_pls_base_module_terminate_proc_fn_t terminate_proc;
|
||||
orte_pls_base_module_signal_job_fn_t signal_job;
|
||||
orte_pls_base_module_signal_proc_fn_t signal_proc;
|
||||
orte_pls_base_module_cancel_operation_fn_t cancel_operation;
|
||||
orte_pls_base_module_finalize_fn_t finalize;
|
||||
};
|
||||
|
||||
|
@ -30,6 +30,9 @@
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/util/argv.h"
|
||||
@ -60,12 +63,13 @@ extern char **environ;
|
||||
* Local functions
|
||||
*/
|
||||
static int pls_poe_launch_job(orte_jobid_t jobid);
|
||||
static int pls_poe_terminate_job(orte_jobid_t jobid, opal_list_t *attrs);
|
||||
static int pls_poe_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs);
|
||||
static int pls_poe_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
|
||||
static int pls_poe_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
|
||||
static int pls_poe_terminate_proc(const orte_process_name_t *name);
|
||||
static int pls_poe_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs);
|
||||
static int pls_poe_signal_proc(const orte_process_name_t *name, int32_t signal);
|
||||
static int pls_poe_finalize(void);
|
||||
static int pls_poe_cancel_operation(void);
|
||||
|
||||
orte_pls_base_module_t orte_pls_poe_module = {
|
||||
pls_poe_launch_job,
|
||||
@ -74,6 +78,7 @@ orte_pls_base_module_t orte_pls_poe_module = {
|
||||
pls_poe_terminate_proc,
|
||||
pls_poe_signal_job,
|
||||
pls_poe_signal_proc,
|
||||
pls_poe_cancel_operation,
|
||||
pls_poe_finalize
|
||||
};
|
||||
|
||||
@ -602,7 +607,7 @@ static int pls_poe_launch_job(orte_jobid_t jobid)
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
static int pls_poe_terminate_job(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
/*
 * Terminate the processes of a job - not implemented here.
 *
 * None of the arguments is examined; the function always returns
 * ORTE_ERR_NOT_IMPLEMENTED.
 */
static int pls_poe_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
{
    /* parameters are intentionally unused */
    (void)jobid;
    (void)timeout;
    (void)attrs;

    return ORTE_ERR_NOT_IMPLEMENTED;
}
|
||||
@ -613,7 +618,7 @@ static int pls_poe_terminate_proc(const orte_process_name_t *name)
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
static int pls_poe_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
/*
 * Terminate the daemons of a job - not implemented here.
 *
 * None of the arguments is examined; the function always returns
 * ORTE_ERR_NOT_IMPLEMENTED.
 */
static int pls_poe_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
{
    /* parameters are intentionally unused */
    (void)jobid;
    (void)timeout;
    (void)attrs;

    return ORTE_ERR_NOT_IMPLEMENTED;
}
|
||||
|
@ -25,6 +25,10 @@
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
|
||||
#include "opal/threads/condition.h"
|
||||
#include "opal/mca/mca.h"
|
||||
#include "orte/mca/pls/pls.h"
|
||||
@ -49,11 +53,12 @@ int orte_pls_process_finalize(void);
|
||||
* Interface
|
||||
*/
|
||||
int orte_pls_process_launch(orte_jobid_t);
|
||||
int orte_pls_process_terminate_job(orte_jobid_t, opal_list_t*);
|
||||
int orte_pls_process_terminate_orteds(orte_jobid_t, opal_list_t*);
|
||||
int orte_pls_process_terminate_job(orte_jobid_t, struct timeval *timeout, opal_list_t*);
|
||||
int orte_pls_process_terminate_orteds(orte_jobid_t, struct timeval *timeout, opal_list_t*);
|
||||
int orte_pls_process_terminate_proc(const orte_process_name_t* proc_name);
|
||||
int orte_pls_process_signal_job(orte_jobid_t, int32_t, opal_list_t*);
|
||||
int orte_pls_process_signal_proc(const orte_process_name_t* proc_name, int32_t);
|
||||
int orte_pls_process_cancel_operation(void);
|
||||
|
||||
/**
|
||||
* PLS Component
|
||||
|
@ -115,6 +115,7 @@ orte_pls_base_module_t orte_pls_process_module = {
|
||||
orte_pls_process_terminate_proc,
|
||||
orte_pls_process_signal_job,
|
||||
orte_pls_process_signal_proc,
|
||||
orte_pls_process_cancel_operation,
|
||||
orte_pls_process_finalize
|
||||
};
|
||||
|
||||
@ -1029,7 +1030,7 @@ cleanup:
|
||||
/**
|
||||
* Terminate all processes for a given job
|
||||
*/
|
||||
int orte_pls_process_terminate_job(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
int orte_pls_process_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
|
||||
{
|
||||
int rc;
|
||||
opal_list_t daemons;
|
||||
@ -1045,7 +1046,7 @@ int orte_pls_process_terminate_job(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
}
|
||||
|
||||
/* order them to kill their local procs for this job */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid, timeout))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
@ -1061,7 +1062,7 @@ CLEANUP:
|
||||
/**
|
||||
* Terminate the orteds for a given job
|
||||
*/
|
||||
int orte_pls_process_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
int orte_pls_process_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
|
||||
{
|
||||
int rc;
|
||||
opal_list_t daemons;
|
||||
@ -1077,7 +1078,7 @@ int orte_pls_process_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
}
|
||||
|
||||
/* now tell them to die! */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons, timeout))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
@ -1134,6 +1135,23 @@ int orte_pls_process_signal_proc(const orte_process_name_t* proc, int32_t signal
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
/**
|
||||
* Cancel an operation involving comm to an orted
|
||||
*/
|
||||
int orte_pls_process_cancel_operation(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
int orte_pls_process_finalize(void)
|
||||
{
|
||||
int rc;
|
||||
|
@ -26,6 +26,10 @@
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/orte_types.h"
|
||||
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/trace.h"
|
||||
|
||||
@ -110,13 +114,14 @@ int orte_pls_proxy_launch(orte_jobid_t job)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_pls_proxy_terminate_job(orte_jobid_t job, opal_list_t *attrs)
|
||||
int orte_pls_proxy_terminate_job(orte_jobid_t job, struct timeval *timeout, opal_list_t *attrs)
|
||||
{
|
||||
orte_buffer_t* cmd;
|
||||
orte_buffer_t* answer;
|
||||
orte_pls_cmd_flag_t command, ret_cmd;
|
||||
orte_std_cntr_t count;
|
||||
int rc;
|
||||
int32_t timefield;
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
@ -146,6 +151,20 @@ int orte_pls_proxy_terminate_job(orte_jobid_t job, opal_list_t *attrs)
|
||||
return rc;
|
||||
}
|
||||
|
||||
timefield = timeout->tv_sec;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &timefield, 1, ORTE_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(cmd);
|
||||
return rc;
|
||||
}
|
||||
|
||||
timefield = timeout->tv_usec;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &timefield, 1, ORTE_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(cmd);
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (0 > orte_rml.send_buffer(orte_pls_proxy_replica, cmd, ORTE_RML_TAG_PLS, 0)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_RELEASE(cmd);
|
||||
@ -182,13 +201,14 @@ int orte_pls_proxy_terminate_job(orte_jobid_t job, opal_list_t *attrs)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_pls_proxy_terminate_orteds(orte_jobid_t job, opal_list_t *attrs)
|
||||
int orte_pls_proxy_terminate_orteds(orte_jobid_t job, struct timeval *timeout, opal_list_t *attrs)
|
||||
{
|
||||
orte_buffer_t* cmd;
|
||||
orte_buffer_t* answer;
|
||||
orte_pls_cmd_flag_t command, ret_cmd;
|
||||
orte_std_cntr_t count;
|
||||
int rc;
|
||||
int32_t timefield;
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
@ -218,6 +238,20 @@ int orte_pls_proxy_terminate_orteds(orte_jobid_t job, opal_list_t *attrs)
|
||||
return rc;
|
||||
}
|
||||
|
||||
timefield = timeout->tv_sec;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &timefield, 1, ORTE_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(cmd);
|
||||
return rc;
|
||||
}
|
||||
|
||||
timefield = timeout->tv_usec;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &timefield, 1, ORTE_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(cmd);
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (0 > orte_rml.send_buffer(orte_pls_proxy_replica, cmd, ORTE_RML_TAG_PLS, 0)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_RELEASE(cmd);
|
||||
@ -471,5 +505,63 @@ int orte_pls_proxy_signal_proc(const orte_process_name_t* name, int32_t signal)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
int orte_pls_proxy_cancel_operation(void)
|
||||
{
|
||||
orte_buffer_t* cmd;
|
||||
orte_buffer_t* answer;
|
||||
orte_pls_cmd_flag_t command, ret_cmd;
|
||||
orte_std_cntr_t count;
|
||||
int rc;
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
command = ORTE_PLS_CANCEL_OPERATION_CMD;
|
||||
|
||||
cmd = OBJ_NEW(orte_buffer_t);
|
||||
if (cmd == NULL) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &command, 1, ORTE_PLS_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(cmd);
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (0 > orte_rml.send_buffer(orte_pls_proxy_replica, cmd, ORTE_RML_TAG_PLS, 0)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_RELEASE(cmd);
|
||||
return ORTE_ERR_COMM_FAILURE;
|
||||
}
|
||||
OBJ_RELEASE(cmd);
|
||||
|
||||
answer = OBJ_NEW(orte_buffer_t);
|
||||
if(answer == NULL) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
if (0 > orte_rml.recv_buffer(orte_pls_proxy_replica, answer, ORTE_RML_TAG_PLS)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_RELEASE(answer);
|
||||
return ORTE_ERR_COMM_FAILURE;
|
||||
}
|
||||
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &ret_cmd, &count, ORTE_PLS_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(answer);
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (ret_cmd != command) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_RELEASE(answer);
|
||||
return ORTE_ERR_COMM_FAILURE;
|
||||
}
|
||||
|
||||
OBJ_RELEASE(answer);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -22,6 +22,10 @@
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
|
||||
#include "orte/mca/pls/pls.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
@ -48,11 +52,12 @@ int orte_pls_proxy_finalize(void);
|
||||
* proxy function prototypes
|
||||
*/
|
||||
int orte_pls_proxy_launch(orte_jobid_t job);
|
||||
int orte_pls_proxy_terminate_job(orte_jobid_t job, opal_list_t *attrs);
|
||||
int orte_pls_proxy_terminate_orteds(orte_jobid_t job, opal_list_t *attrs);
|
||||
int orte_pls_proxy_terminate_job(orte_jobid_t job, struct timeval *timeout, opal_list_t *attrs);
|
||||
int orte_pls_proxy_terminate_orteds(orte_jobid_t job, struct timeval *timeout, opal_list_t *attrs);
|
||||
int orte_pls_proxy_terminate_proc(const orte_process_name_t* name);
|
||||
int orte_pls_proxy_signal_job(orte_jobid_t job, int32_t signal, opal_list_t *attrs);
|
||||
int orte_pls_proxy_signal_proc(const orte_process_name_t* name, int32_t signal);
|
||||
int orte_pls_proxy_cancel_operation(void);
|
||||
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
|
@ -65,6 +65,7 @@ static orte_pls_base_module_t orte_pls_proxy_module = {
|
||||
orte_pls_proxy_terminate_proc,
|
||||
orte_pls_proxy_signal_job,
|
||||
orte_pls_proxy_signal_proc,
|
||||
orte_pls_proxy_cancel_operation,
|
||||
orte_pls_proxy_finalize
|
||||
};
|
||||
|
||||
|
@ -25,8 +25,13 @@
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
|
||||
#include "opal/threads/condition.h"
|
||||
#include "opal/mca/mca.h"
|
||||
|
||||
#include "orte/mca/pls/pls.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
@ -49,11 +54,12 @@ int orte_pls_rsh_finalize(void);
|
||||
* Interface
|
||||
*/
|
||||
int orte_pls_rsh_launch(orte_jobid_t);
|
||||
int orte_pls_rsh_terminate_job(orte_jobid_t, opal_list_t*);
|
||||
int orte_pls_rsh_terminate_orteds(orte_jobid_t, opal_list_t*);
|
||||
int orte_pls_rsh_terminate_job(orte_jobid_t, struct timeval *timeout, opal_list_t*);
|
||||
int orte_pls_rsh_terminate_orteds(orte_jobid_t, struct timeval *timeout, opal_list_t*);
|
||||
int orte_pls_rsh_terminate_proc(const orte_process_name_t* proc_name);
|
||||
int orte_pls_rsh_signal_job(orte_jobid_t, int32_t, opal_list_t*);
|
||||
int orte_pls_rsh_signal_proc(const orte_process_name_t* proc_name, int32_t);
|
||||
int orte_pls_rsh_cancel_operation(void);
|
||||
|
||||
/**
|
||||
* PLS Component
|
||||
|
@ -107,6 +107,7 @@ orte_pls_base_module_t orte_pls_rsh_module = {
|
||||
orte_pls_rsh_terminate_proc,
|
||||
orte_pls_rsh_signal_job,
|
||||
orte_pls_rsh_signal_proc,
|
||||
orte_pls_rsh_cancel_operation,
|
||||
orte_pls_rsh_finalize
|
||||
};
|
||||
|
||||
@ -943,6 +944,9 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
||||
argv[local_exec_index_end] = NULL;
|
||||
}
|
||||
|
||||
/* tell the daemon to setup its own process session/group */
|
||||
opal_argv_append(&argc, &argv, "--set-sid");
|
||||
|
||||
/* Finally, chdir($HOME) because we're making the
|
||||
assumption that this is what will happen on
|
||||
remote nodes (via rsh/ssh). This allows a user
|
||||
@ -1128,7 +1132,7 @@ cleanup:
|
||||
/**
|
||||
* Terminate all processes for a given job
|
||||
*/
|
||||
int orte_pls_rsh_terminate_job(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
int orte_pls_rsh_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
|
||||
{
|
||||
int rc;
|
||||
opal_list_t daemons;
|
||||
@ -1144,7 +1148,7 @@ int orte_pls_rsh_terminate_job(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
}
|
||||
|
||||
/* order them to kill their local procs for this job */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid, timeout))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
@ -1160,7 +1164,7 @@ CLEANUP:
|
||||
/**
|
||||
* Terminate the orteds for a given job
|
||||
*/
|
||||
int orte_pls_rsh_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
int orte_pls_rsh_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
|
||||
{
|
||||
int rc;
|
||||
opal_list_t daemons;
|
||||
@ -1176,7 +1180,7 @@ int orte_pls_rsh_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
}
|
||||
|
||||
/* now tell them to die! */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons, timeout))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
@ -1233,6 +1237,23 @@ int orte_pls_rsh_signal_proc(const orte_process_name_t* proc, int32_t signal)
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
/**
|
||||
* Cancel an operation involving comm to an orted
|
||||
*/
|
||||
int orte_pls_rsh_cancel_operation(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
int orte_pls_rsh_finalize(void)
|
||||
{
|
||||
int rc;
|
||||
|
@ -74,12 +74,13 @@
|
||||
* Local functions
|
||||
*/
|
||||
static int pls_slurm_launch_job(orte_jobid_t jobid);
|
||||
static int pls_slurm_terminate_job(orte_jobid_t jobid, opal_list_t *attrs);
|
||||
static int pls_slurm_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs);
|
||||
static int pls_slurm_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
|
||||
static int pls_slurm_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
|
||||
static int pls_slurm_terminate_proc(const orte_process_name_t *name);
|
||||
static int pls_slurm_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs);
|
||||
static int pls_slurm_signal_proc(const orte_process_name_t *name, int32_t signal);
|
||||
static int pls_slurm_finalize(void);
|
||||
static int pls_slurm_cancel_operation(void);
|
||||
|
||||
static int pls_slurm_start_proc(int argc, char **argv, char **env,
|
||||
char *prefix);
|
||||
@ -95,6 +96,7 @@ orte_pls_base_module_1_3_0_t orte_pls_slurm_module = {
|
||||
pls_slurm_terminate_proc,
|
||||
pls_slurm_signal_job,
|
||||
pls_slurm_signal_proc,
|
||||
pls_slurm_cancel_operation,
|
||||
pls_slurm_finalize
|
||||
};
|
||||
|
||||
@ -443,7 +445,7 @@ cleanup:
|
||||
}
|
||||
|
||||
|
||||
static int pls_slurm_terminate_job(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
static int pls_slurm_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
|
||||
{
|
||||
int rc;
|
||||
opal_list_t daemons;
|
||||
@ -457,7 +459,7 @@ static int pls_slurm_terminate_job(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
}
|
||||
|
||||
/* order them to kill their local procs for this job */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid, timeout))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
@ -474,7 +476,7 @@ CLEANUP:
|
||||
/**
|
||||
* Terminate the orteds for a given job
|
||||
*/
|
||||
static int pls_slurm_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
static int pls_slurm_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
|
||||
{
|
||||
int rc;
|
||||
opal_list_t daemons;
|
||||
@ -488,7 +490,7 @@ static int pls_slurm_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
}
|
||||
|
||||
/* order them to go away */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons, timeout))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
@ -534,6 +536,21 @@ static int pls_slurm_signal_proc(const orte_process_name_t *name, int32_t signal
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Cancel an operation involving comm to an orted
|
||||
*/
|
||||
int pls_slurm_cancel_operation(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
static int pls_slurm_finalize(void)
|
||||
{
|
||||
int rc;
|
||||
|
@ -79,11 +79,12 @@
|
||||
* Local functions
|
||||
*/
|
||||
static int pls_tm_launch_job(orte_jobid_t jobid);
|
||||
static int pls_tm_terminate_job(orte_jobid_t jobid, opal_list_t *attrs);
|
||||
static int pls_tm_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs);
|
||||
static int pls_tm_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
|
||||
static int pls_tm_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
|
||||
static int pls_tm_terminate_proc(const orte_process_name_t *name);
|
||||
static int pls_tm_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs);
|
||||
static int pls_tm_signal_proc(const orte_process_name_t *name, int32_t signal);
|
||||
static int pls_tm_cancel_operation(void);
|
||||
static int pls_tm_finalize(void);
|
||||
|
||||
static int pls_tm_connect(void);
|
||||
@ -559,7 +560,7 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
|
||||
}
|
||||
|
||||
|
||||
static int pls_tm_terminate_job(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
static int pls_tm_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
|
||||
{
|
||||
int rc;
|
||||
opal_list_t daemons;
|
||||
@ -573,7 +574,7 @@ static int pls_tm_terminate_job(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
}
|
||||
|
||||
/* order them to kill their local procs for this job */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid, timeout))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
@ -590,7 +591,7 @@ CLEANUP:
|
||||
/**
|
||||
* Terminate the orteds for a given job
|
||||
*/
|
||||
int pls_tm_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
int pls_tm_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
|
||||
{
|
||||
int rc;
|
||||
opal_list_t daemons;
|
||||
@ -604,7 +605,7 @@ int pls_tm_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
}
|
||||
|
||||
/* now tell them to die! */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons, timeout))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
@ -661,6 +662,21 @@ static int pls_tm_signal_proc(const orte_process_name_t *name, int32_t signal)
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Cancel an operation involving comm to an orted
|
||||
*/
|
||||
static int pls_tm_cancel_operation(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Free stuff
|
||||
*/
|
||||
|
@ -40,6 +40,9 @@
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
|
||||
#include "opal/event/event.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
@ -86,6 +89,7 @@ orte_pls_base_module_t orte_pls_xcpu_module = {
|
||||
orte_pls_xcpu_terminate_proc,
|
||||
orte_pls_xcpu_signal_job,
|
||||
orte_pls_xcpu_signal_proc,
|
||||
orte_pls_xcpu_cancel_operation,
|
||||
orte_pls_xcpu_finalize
|
||||
};
|
||||
|
||||
@ -357,7 +361,7 @@ error:
|
||||
return rc;
|
||||
}
|
||||
|
||||
int orte_pls_xcpu_terminate_job(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
int orte_pls_xcpu_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
|
||||
{
|
||||
int i, rc;
|
||||
orte_job_map_t *map;
|
||||
@ -378,7 +382,7 @@ int orte_pls_xcpu_terminate_job(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_pls_xcpu_terminate_orteds(orte_jobid_t jobid, opal_list_t * attrs)
|
||||
int orte_pls_xcpu_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t * attrs)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
@ -424,6 +428,14 @@ int orte_pls_xcpu_signal_proc(const orte_process_name_t* proc_name, int32_t sig)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Cancel an operation involving comm to an orted
|
||||
*/
|
||||
int orte_pls_xcpu_cancel_operation(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_pls_xcpu_finalize(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
|
@ -32,6 +32,11 @@
|
||||
#define orte_pls_xcpu_H_
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
|
||||
#include "orte/class/orte_pointer_array.h"
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/mca/pls/base/base.h"
|
||||
@ -57,11 +62,12 @@ orte_pls_base_module_t* orte_pls_xcpu_init(int *priority); /* in component file
|
||||
* Interface
|
||||
*/
|
||||
int orte_pls_xcpu_launch_job(orte_jobid_t);
|
||||
int orte_pls_xcpu_terminate_job(orte_jobid_t, opal_list_t *);
|
||||
int orte_pls_xcpu_terminate_orteds(orte_jobid_t jobid, opal_list_t * attrs);
|
||||
int orte_pls_xcpu_terminate_job(orte_jobid_t, struct timeval *timeout, opal_list_t *);
|
||||
int orte_pls_xcpu_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t * attrs);
|
||||
int orte_pls_xcpu_terminate_proc(const orte_process_name_t* proc_name);
|
||||
int orte_pls_xcpu_signal_job(orte_jobid_t jobid, int32_t sig, opal_list_t*);
|
||||
int orte_pls_xcpu_signal_proc(const orte_process_name_t* proc_name, int32_t sig);
|
||||
int orte_pls_xcpu_cancel_operation(void);
|
||||
int orte_pls_xcpu_finalize(void);
|
||||
|
||||
void orte_pls_xcpu_close_sessions(void);
|
||||
|
@ -27,6 +27,9 @@
|
||||
#import <sys/stat.h>
|
||||
#import <sys/wait.h>
|
||||
#import <fcntl.h>
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
|
||||
#import "orte/orte_constants.h"
|
||||
#import "opal/util/argv.h"
|
||||
@ -45,14 +48,14 @@
|
||||
#import "pls_xgrid.h"
|
||||
|
||||
int orte_pls_xgrid_launch(orte_jobid_t jobid);
|
||||
int orte_pls_xgrid_terminate_job(orte_jobid_t jobid, opal_list_t *attrs);
|
||||
int orte_pls_xgrid_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs);
|
||||
int orte_pls_xgrid_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
|
||||
int orte_pls_xgrid_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
|
||||
int orte_pls_xgrid_terminate_proc(const orte_process_name_t* proc);
|
||||
int orte_pls_xgrid_signal_job(orte_jobid_t job, int32_t signal, opal_list_t *attrs);
|
||||
int orte_pls_xgrid_signal_proc(const orte_process_name_t* proc_name, int32_t signal);
|
||||
int orte_pls_xgrid_cancel_operation(void);
|
||||
int orte_pls_xgrid_finalize(void);
|
||||
|
||||
|
||||
orte_pls_base_module_1_3_0_t orte_pls_xgrid_module = {
|
||||
orte_pls_xgrid_launch,
|
||||
orte_pls_xgrid_terminate_job,
|
||||
@ -60,6 +63,7 @@ orte_pls_base_module_1_3_0_t orte_pls_xgrid_module = {
|
||||
orte_pls_xgrid_terminate_proc,
|
||||
orte_pls_xgrid_signal_job,
|
||||
orte_pls_xgrid_signal_proc,
|
||||
orte_pls_xgrid_cancel_operation,
|
||||
orte_pls_xgrid_finalize
|
||||
};
|
||||
|
||||
@ -81,7 +85,7 @@ orte_pls_xgrid_launch(orte_jobid_t jobid)
|
||||
* Terminate all processes for a given job
|
||||
*/
|
||||
int
|
||||
orte_pls_xgrid_terminate_job(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
orte_pls_xgrid_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
|
||||
{
|
||||
int rc;
|
||||
opal_list_t daemons;
|
||||
@ -95,7 +99,7 @@ orte_pls_xgrid_terminate_job(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
}
|
||||
|
||||
/* order them to kill their local procs for this job */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid, timeout))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
@ -118,7 +122,7 @@ CLEANUP:
|
||||
* Terminate the orteds for a given job
|
||||
*/
|
||||
int
|
||||
orte_pls_xgrid_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
orte_pls_xgrid_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
|
||||
{
|
||||
int rc;
|
||||
opal_list_t daemons;
|
||||
@ -132,7 +136,7 @@ orte_pls_xgrid_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs)
|
||||
}
|
||||
|
||||
/* now tell them to die! */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons, timeout))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
@ -195,6 +199,19 @@ orte_pls_xgrid_signal_proc(const orte_process_name_t* proc, int32_t signal)
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
orte_pls_xgrid_cancel_operation(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
orte_pls_xgrid_finalize(void)
|
||||
{
|
||||
|
@ -33,9 +33,6 @@
|
||||
* @retval ORTE_ERROR Upon failure.
|
||||
*/
|
||||
|
||||
/* globals used by RTE */
|
||||
int orte_debug_flag=(int)false;
|
||||
|
||||
int orte_init(bool infrastructure)
|
||||
{
|
||||
int rc;
|
||||
|
@ -133,18 +133,6 @@ int orte_init_stage1(bool infrastructure)
|
||||
|
||||
/***** ERROR LOGGING NOW AVAILABLE *****/
|
||||
|
||||
/* check for debug flag */
|
||||
if (0 > (ret = mca_base_param_register_int("orte", "debug", NULL, NULL, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "mca_base_param_register_int";
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = mca_base_param_lookup_int(ret, &orte_debug_flag))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "mca_base_param_lookup_int";
|
||||
goto error;
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize the event library
|
||||
*/
|
||||
|
@ -20,14 +20,27 @@
|
||||
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/params.h"
|
||||
|
||||
/* globals used by RTE */
|
||||
int orte_debug_flag;
|
||||
struct timeval orte_abort_timeout;
|
||||
|
||||
int orte_register_params(bool infrastructure)
|
||||
{
|
||||
int value;
|
||||
|
||||
mca_base_param_reg_int_name("orte", "debug",
|
||||
"Top-level ORTE debug switch",
|
||||
false, false, (int)false, NULL);
|
||||
false, false, (int)false, &value);
|
||||
orte_debug_flag = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
mca_base_param_reg_int_name("orte_debug", "daemons_file",
|
||||
"Whether want stdout/stderr of daemons to go to a file or not",
|
||||
@ -51,8 +64,14 @@ int orte_register_params(bool infrastructure)
|
||||
"Sequence of user-level debuggers to search for in orterun",
|
||||
false, false, "totalview @mpirun@ -a @mpirun_args@ : fxp @mpirun@ -a @mpirun_args@", NULL);
|
||||
|
||||
|
||||
mca_base_param_reg_int_name("orte", "abort_timeout",
|
||||
"Time to wait [in seconds] before giving up on aborting an ORTE operation",
|
||||
false, false, 10, &value);
|
||||
orte_abort_timeout.tv_sec = value;
|
||||
orte_abort_timeout.tv_usec = 0;
|
||||
|
||||
/* All done */
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
49
orte/runtime/params.h
Обычный файл
49
orte/runtime/params.h
Обычный файл
@ -0,0 +1,49 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* Global params for OpenRTE
|
||||
*/
|
||||
#ifndef ORTE_RUNTIME_PARAM_H
|
||||
#define ORTE_RUNTIME_PARAM_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* globals used by RTE - instanced in orte_params.c */
|
||||
|
||||
ORTE_DECLSPEC extern int orte_debug_flag;
|
||||
|
||||
ORTE_DECLSPEC extern struct timeval orte_abort_timeout;
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* ORTE_RUNTIME_PARAM_H */
|
@ -69,6 +69,7 @@
|
||||
#include "orte/mca/pls/pls.h"
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/params.h"
|
||||
|
||||
#include "orte/tools/orted/orted.h"
|
||||
|
||||
@ -126,6 +127,10 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
|
||||
&orted_globals.bootproxy, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Run as boot proxy for <job-id>" },
|
||||
|
||||
{ NULL, NULL, NULL, '\0', NULL, "set-sid", 0,
|
||||
&orted_globals.set_sid, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Direct the orted to separate from the current session"},
|
||||
|
||||
{ NULL, NULL, NULL, '\0', NULL, "name", 1,
|
||||
&orted_globals.name, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Set the orte process name"},
|
||||
@ -206,6 +211,9 @@ int main(int argc, char *argv[])
|
||||
/* save the environment for use when launching application processes */
|
||||
orted_globals.saved_environ = opal_argv_copy(environ);
|
||||
|
||||
/* setup mca param system */
|
||||
mca_base_param_init();
|
||||
|
||||
/* setup to check common command line options that just report and die */
|
||||
cmd_line = OBJ_NEW(opal_cmd_line_t);
|
||||
opal_cmd_line_create(cmd_line, orte_cmd_line_opts);
|
||||
@ -229,6 +237,11 @@ int main(int argc, char *argv[])
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* see if we were directed to separate from current session */
|
||||
if (orted_globals.set_sid) {
|
||||
setsid();
|
||||
}
|
||||
|
||||
/* see if they want us to spin until they can connect a debugger to us */
|
||||
i=0;
|
||||
while (orted_globals.spin) {
|
||||
@ -741,7 +754,7 @@ static void halt_vm(void)
|
||||
/* terminate the vm - this will also wake us up so we can exit */
|
||||
OBJ_CONSTRUCT(&attrs, opal_list_t);
|
||||
orte_rmgr.add_attribute(&attrs, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);
|
||||
ret = orte_pls.terminate_orteds(0, &attrs);
|
||||
ret = orte_pls.terminate_orteds(0, &orte_abort_timeout, &attrs);
|
||||
while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item);
|
||||
OBJ_DESTRUCT(&attrs);
|
||||
|
||||
|
@ -35,6 +35,7 @@ typedef struct {
|
||||
bool debug;
|
||||
bool debug_daemons;
|
||||
bool debug_daemons_file;
|
||||
bool set_sid;
|
||||
char* ns_nds;
|
||||
char* name;
|
||||
char* vpid_start;
|
||||
|
@ -68,10 +68,12 @@
|
||||
#include "orte/mca/schema/base/base.h"
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/params.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
|
||||
#include "orte/tools/orteprobe/orteprobe.h"
|
||||
|
||||
#if 0
|
||||
orteprobe_globals_t orteprobe_globals;
|
||||
|
||||
/*
|
||||
@ -127,6 +129,7 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
|
||||
{ NULL, NULL, NULL, '\0', NULL, NULL, 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
|
||||
};
|
||||
#endif
|
||||
|
||||
#if !defined(__WINDOWS__)
|
||||
extern char **environ;
|
||||
@ -134,6 +137,7 @@ extern char **environ;
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
#if 0
|
||||
int ret = 0, ortedargc;
|
||||
opal_cmd_line_t *cmd_line = NULL;
|
||||
char *contact_path = NULL, *orted=NULL;
|
||||
@ -151,6 +155,10 @@ int main(int argc, char *argv[])
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Setup MCA params */
|
||||
mca_base_param_init();
|
||||
orte_register_params(false);
|
||||
|
||||
/* setup to check common command line options that just report and die */
|
||||
memset(&orteprobe_globals, 0, sizeof(orteprobe_globals));
|
||||
cmd_line = OBJ_NEW(opal_cmd_line_t);
|
||||
@ -464,4 +472,6 @@ int main(int argc, char *argv[])
|
||||
fprintf(stderr, "orteprobe: system appears to not support remote probes\n");
|
||||
exit(1);
|
||||
#endif
|
||||
#endif
|
||||
exit(1);
|
||||
}
|
||||
|
@ -96,6 +96,7 @@ WARNING: %s encountered an abnormal exit.
|
||||
This means that %s exited before it received notification that all
|
||||
started processes had terminated. You should double check and ensure
|
||||
that there are no runaway processes still executing.
|
||||
#
|
||||
[orterun:sigint-while-processing]
|
||||
WARNING: %s is in the process of killing a job, but has detected an
|
||||
interruption (probably control-C).
|
||||
@ -103,6 +104,13 @@ interruption (probably control-C).
|
||||
It is dangerous to interrupt %s while it is killing a job (proper
|
||||
termination may not be guaranteed). Hit control-C again within 1
|
||||
second if you really want to kill %s immediately.
|
||||
#
|
||||
[orterun:forced-end-failed]
|
||||
WARNING: %s was ordered to kill a job (probably with control-C), but
|
||||
was unable to successfully complete that order (returned error %s).
|
||||
You should double check and ensure that there are no runaway processes
|
||||
still executing.
|
||||
#
|
||||
[orterun:empty-prefix]
|
||||
A prefix was supplied to %s that only contained slashes.
|
||||
|
||||
@ -136,5 +144,5 @@ Things to check:
|
||||
- Ensure that any required licenses are available to run the debugger
|
||||
#
|
||||
[orterun:daemon-die]
|
||||
%s was unable to cleanly terminate the daemons for this job. Returned value %d instead of ORTE_SUCCESS.
|
||||
%s was unable to cleanly terminate the daemons for this job. Returned value %s instead of ORTE_SUCCESS.
|
||||
|
||||
|
@ -39,6 +39,9 @@
|
||||
#ifdef HAVE_LIBGEN_H
|
||||
#include <libgen.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
|
||||
#include "opal/event/event.h"
|
||||
#include "opal/install_dirs.h"
|
||||
@ -71,6 +74,7 @@
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/params.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
|
||||
#include "orterun.h"
|
||||
@ -268,7 +272,7 @@ opal_cmd_line_init_t cmd_line_init[] = {
|
||||
&orterun_globals.do_not_launch, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Perform all necessary operations to prepare to launch the application, but do not actually launch it" },
|
||||
|
||||
{ NULL, NULL, NULL, '\0', "reuse-daemons", "reuse-daemons", 0,
|
||||
{ "pls", "base", "reuse_daemons", '\0', "reuse-daemons", "reuse-daemons", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"If set, reuse daemons to launch dynamically spawned processes"},
|
||||
|
||||
@ -470,9 +474,9 @@ int orterun(int argc, char *argv[])
|
||||
*/
|
||||
OBJ_CONSTRUCT(&attributes, opal_list_t);
|
||||
orte_rmgr.add_attribute(&attributes, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);
|
||||
if (ORTE_SUCCESS != (ret = orte_pls.terminate_orteds(jobid, &attributes))) {
|
||||
opal_show_help("help-orterun.txt", "orterun:daemon-die", false,
|
||||
orterun_basename, NULL, NULL, ret);
|
||||
if (ORTE_SUCCESS != (ret = orte_pls.terminate_orteds(jobid, &orte_abort_timeout, &attributes))) {
|
||||
opal_show_help("help-orterun.txt", "orterun:daemon-die", true,
|
||||
orterun_basename, ORTE_ERROR_NAME(ret));
|
||||
}
|
||||
while (NULL != (item = opal_list_remove_first(&attributes))) OBJ_RELEASE(item);
|
||||
OBJ_DESTRUCT(&attributes);
|
||||
@ -686,9 +690,6 @@ static void exit_callback(int fd, short event, void *arg)
|
||||
{
|
||||
OPAL_TRACE(1);
|
||||
|
||||
opal_show_help("help-orterun.txt", "orterun:abnormal-exit",
|
||||
true, orterun_basename, orterun_basename);
|
||||
|
||||
/* Remove the TERM and INT signal handlers */
|
||||
opal_signal_del(&term_handler);
|
||||
opal_signal_del(&int_handler);
|
||||
@ -720,62 +721,71 @@ typedef enum {
|
||||
static void abort_signal_callback(int fd, short flags, void *arg)
|
||||
{
|
||||
int ret;
|
||||
struct timeval tv = { 1, 0 };
|
||||
opal_event_t* event;
|
||||
opal_list_t attrs;
|
||||
opal_list_item_t *item;
|
||||
static abort_signal_state_t state;
|
||||
static abort_signal_state_t state=ABORT_SIGNAL_FIRST;
|
||||
static struct timeval invoked, now;
|
||||
double a, b;
|
||||
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
|
||||
/* If this whole process has already completed, then bail */
|
||||
switch (state) {
|
||||
case ABORT_SIGNAL_FIRST:
|
||||
/* This is the first time through */
|
||||
state = ABORT_SIGNAL_PROCESSING;
|
||||
break;
|
||||
|
||||
case ABORT_SIGNAL_WARNED:
|
||||
gettimeofday(&now, NULL);
|
||||
a = invoked.tv_sec * 1000000 + invoked.tv_usec;
|
||||
b = now.tv_sec * 1000000 + invoked.tv_usec;
|
||||
if (b - a <= 1000000) {
|
||||
/* We are in an event handler; exit_callback() will delete
|
||||
the handler that is currently running (which is a Bad
|
||||
Thing), so we can't call it directly. Instead, we have
|
||||
to exit this handler and set up a call to exit_callback()
|
||||
after this. */
|
||||
if (NULL != (event = (opal_event_t*)
|
||||
malloc(sizeof(opal_event_t)))) {
|
||||
opal_evtimer_set(event, exit_callback, NULL);
|
||||
now.tv_sec = 0;
|
||||
now.tv_usec = 0;
|
||||
opal_evtimer_add(event, &now);
|
||||
state = ABORT_SIGNAL_DONE;
|
||||
}
|
||||
return;
|
||||
}
|
||||
/* Otherwise fall through to PROCESSING and warn again */
|
||||
|
||||
case ABORT_SIGNAL_PROCESSING:
|
||||
opal_show_help("help-orterun.txt", "orterun:sigint-while-processing",
|
||||
true, orterun_basename, orterun_basename,
|
||||
orterun_basename);
|
||||
gettimeofday(&invoked, NULL);
|
||||
state = ABORT_SIGNAL_WARNED;
|
||||
return;
|
||||
|
||||
case ABORT_SIGNAL_DONE:
|
||||
/* Nothing to do -- return */
|
||||
return;
|
||||
case ABORT_SIGNAL_FIRST:
|
||||
/* This is the first time through */
|
||||
state = ABORT_SIGNAL_PROCESSING;
|
||||
break;
|
||||
|
||||
case ABORT_SIGNAL_WARNED:
|
||||
gettimeofday(&now, NULL);
|
||||
a = invoked.tv_sec * 1000000 + invoked.tv_usec;
|
||||
b = now.tv_sec * 1000000 + invoked.tv_usec;
|
||||
if (b - a <= 1000000) {
|
||||
/* tell the pls to cancel the terminate request -
|
||||
* obviously, something is wrong at this point
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_pls.cancel_operation())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
/* give the user the warning about manual cleanup */
|
||||
opal_show_help("help-orterun.txt", "orterun:abnormal-exit",
|
||||
true, orterun_basename, orterun_basename);
|
||||
|
||||
/* We are in an event handler; exit_callback() will delete
|
||||
the handler that is currently running (which is a Bad
|
||||
Thing), so we can't call it directly. Instead, we have
|
||||
to exit this handler and set up a call to exit_callback()
|
||||
after this. */
|
||||
if (NULL != (event = (opal_event_t*)
|
||||
malloc(sizeof(opal_event_t)))) {
|
||||
opal_evtimer_set(event, exit_callback, NULL);
|
||||
now.tv_sec = 0;
|
||||
now.tv_usec = 0;
|
||||
opal_evtimer_add(event, &now);
|
||||
state = ABORT_SIGNAL_DONE;
|
||||
}
|
||||
return;
|
||||
}
|
||||
/* Otherwise fall through to PROCESSING and warn again */
|
||||
|
||||
case ABORT_SIGNAL_PROCESSING:
|
||||
opal_show_help("help-orterun.txt", "orterun:sigint-while-processing",
|
||||
true, orterun_basename, orterun_basename,
|
||||
orterun_basename);
|
||||
gettimeofday(&invoked, NULL);
|
||||
state = ABORT_SIGNAL_WARNED;
|
||||
return;
|
||||
|
||||
case ABORT_SIGNAL_DONE:
|
||||
/* Nothing to do -- return */
|
||||
return;
|
||||
}
|
||||
|
||||
if (!orterun_globals.quiet){
|
||||
fprintf(stderr, "%s: killing job...\n\n", orterun_basename);
|
||||
}
|
||||
|
||||
|
||||
/* terminate the job - this will also wake up orterun so
|
||||
* it can kill all the orteds. Be sure to kill all the job's
|
||||
* descendants, if any, so nothing is left hanging
|
||||
@ -783,24 +793,20 @@ static void abort_signal_callback(int fd, short flags, void *arg)
|
||||
if (jobid != ORTE_JOBID_INVALID) {
|
||||
OBJ_CONSTRUCT(&attrs, opal_list_t);
|
||||
orte_rmgr.add_attribute(&attrs, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);
|
||||
ret = orte_pls.terminate_job(jobid, &attrs);
|
||||
ret = orte_pls.terminate_job(jobid, &orte_abort_timeout, &attrs);
|
||||
while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item);
|
||||
OBJ_DESTRUCT(&attrs);
|
||||
if (ORTE_SUCCESS != ret) {
|
||||
opal_show_help("help-orterun.txt", "orterun:forced-end-failed",
|
||||
true, orterun_basename, ORTE_ERROR_NAME(ret));
|
||||
jobid = ORTE_JOBID_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
/* setup a delay to give the orteds time to complete their departure */
|
||||
if (NULL != (event = (opal_event_t*)malloc(sizeof(opal_event_t)))) {
|
||||
opal_evtimer_set(event, exit_callback, NULL);
|
||||
opal_evtimer_add(event, &tv);
|
||||
}
|
||||
|
||||
state = ABORT_SIGNAL_DONE;
|
||||
state = ABORT_SIGNAL_DONE;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Pass user signals to the remote application processes
|
||||
*/
|
||||
@ -965,7 +971,7 @@ static int parse_globals(int argc, char* argv[])
|
||||
orterun_globals.by_slot = true;
|
||||
}
|
||||
|
||||
/* If we don't want to wait, we don't want to wait */
|
||||
/* If we don't want to wait, we don't want to wait */
|
||||
|
||||
if (orterun_globals.no_wait_for_job_completion) {
|
||||
wait_for_job_completion = false;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user