From ab5ea611005d1ad721afb71132582ece4617f65d Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Thu, 25 Jan 2007 14:17:44 +0000 Subject: [PATCH] Bring over the rest of the ctrl-c fixes. This commit includes: 1. add a "cancel_operation" API to the pls components that allows orterun to demand that an orted operation (e.g., terminate_job) be immediately cancelled and abandoned. 2. changes the pls orted commands from blocking to non-blocking. This allows us to interrupt those operations should an orted be non-responsive. The change also adds an orte_abort_timeout that limits how long orterun will automatically wait for the orteds to respond - if the terminate command, for example, doesn't see an orted response within that time, then we print out an appropriate error message and just give up. 3. modifies orterun to allow multiple ctrl-c's to simply abort the program even if the orteds have not responded. 4. does some cleanup on the orte-level mca params so that their implementation looks a lot more like that of ompi - makes it easier to maintain. This change also includes the definition of an orte_abort_timeout struct and associated MCA param (can't have too many!) so you can set the time after which orterun gives up on waiting for orteds to respond. This needs more testing before migrating to 1.2. This commit was SVN r13304. 
--- orte/mca/errmgr/hnp/errmgr_hnp.c | 6 +- orte/mca/pls/base/pls_base_orted_cmds.c | 119 +++++++++++++++-- orte/mca/pls/base/pls_base_receive.c | 48 ++++++- orte/mca/pls/base/pls_private.h | 36 ++--- orte/mca/pls/bproc/pls_bproc.c | 41 +++++- orte/mca/pls/bproc/pls_bproc.h | 8 +- orte/mca/pls/cnos/pls_cnos.c | 19 ++- orte/mca/pls/gridengine/pls_gridengine.h | 10 +- .../pls/gridengine/pls_gridengine_module.c | 24 +++- orte/mca/pls/pls.h | 10 +- orte/mca/pls/poe/pls_poe_module.c | 13 +- orte/mca/pls/process/pls_process.h | 9 +- orte/mca/pls/process/pls_process_module.c | 26 +++- orte/mca/pls/proxy/pls_proxy.c | 98 +++++++++++++- orte/mca/pls/proxy/pls_proxy.h | 9 +- orte/mca/pls/proxy/pls_proxy_component.c | 1 + orte/mca/pls/rsh/pls_rsh.h | 10 +- orte/mca/pls/rsh/pls_rsh_module.c | 29 +++- orte/mca/pls/slurm/pls_slurm_module.c | 29 +++- orte/mca/pls/tm/pls_tm_module.c | 28 +++- orte/mca/pls/xcpu/pls_xcpu.c | 16 ++- orte/mca/pls/xcpu/pls_xcpu.h | 10 +- orte/mca/pls/xgrid/src/pls_xgrid_module.m | 31 ++++- orte/runtime/orte_init.c | 3 - orte/runtime/orte_init_stage1.c | 12 -- orte/runtime/orte_params.c | 23 +++- orte/runtime/params.h | 49 +++++++ orte/tools/orted/orted.c | 15 ++- orte/tools/orted/orted.h | 1 + orte/tools/orteprobe/orteprobe.c | 10 ++ orte/tools/orterun/help-orterun.txt | 10 +- orte/tools/orterun/orterun.c | 124 +++++++++--------- 32 files changed, 701 insertions(+), 176 deletions(-) create mode 100644 orte/runtime/params.h diff --git a/orte/mca/errmgr/hnp/errmgr_hnp.c b/orte/mca/errmgr/hnp/errmgr_hnp.c index ab422a05f2..9d5535f919 100644 --- a/orte/mca/errmgr/hnp/errmgr_hnp.c +++ b/orte/mca/errmgr/hnp/errmgr_hnp.c @@ -28,6 +28,7 @@ #include "opal/util/output.h" #include "orte/runtime/runtime.h" +#include "orte/runtime/params.h" #include "orte/mca/ns/ns_types.h" #include "orte/mca/gpr/gpr.h" #include "orte/mca/pls/pls.h" @@ -85,9 +86,8 @@ int orte_errmgr_hnp_proc_aborted(orte_gpr_notify_message_t *msg) /* tell the pls to terminate the job AND ALL ITS 
DESCENDANTS */ OBJ_CONSTRUCT(&attrs, opal_list_t); orte_rmgr.add_attribute(&attrs, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE); - if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job, &attrs))) { + if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job, &orte_abort_timeout, &attrs))) { ORTE_ERROR_LOG(rc); - return rc; } while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item); OBJ_DESTRUCT(&attrs); @@ -152,7 +152,7 @@ int orte_errmgr_hnp_incomplete_start(orte_gpr_notify_message_t *msg) /* tell the pls to terminate the job - just kill this job, not any descendants since * the job is just trying to start */ - if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job, NULL))) { + if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job, &orte_abort_timeout, NULL))) { ORTE_ERROR_LOG(rc); } diff --git a/orte/mca/pls/base/pls_base_orted_cmds.c b/orte/mca/pls/base/pls_base_orted_cmds.c index 0c86cee175..35221c7101 100644 --- a/orte/mca/pls/base/pls_base_orted_cmds.c +++ b/orte/mca/pls/base/pls_base_orted_cmds.c @@ -20,6 +20,11 @@ #include "orte_config.h" #include "orte/orte_constants.h" +#ifdef HAVE_SYS_TIME_H +#include +#endif + +#include "opal/event/event.h" #include "opal/threads/condition.h" #include "opal/util/output.h" #include "opal/util/argv.h" @@ -36,6 +41,25 @@ #include "orte/mca/pls/base/pls_private.h" static orte_std_cntr_t orted_cmd_num_active; +static int completion_status; + +static void orte_pls_base_orted_default_wakeup(int fd, short event, void *arg) +{ + /* protect for threads */ + OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock); + + /* cancel the receive - we didn't get everyone's response in time */ + orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED_ACK); + + /* set the completion status to reflect timeout error */ + completion_status = ORTE_ERR_TIMEOUT; + + /* declare us "done" so we can exit cleanly */ + opal_condition_signal(&orte_pls_base.orted_cmd_cond); + + /* unlock us */ + 
OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock); +} static void orte_pls_base_orted_send_cb(int status, orte_process_name_t* peer, @@ -74,13 +98,34 @@ static void orte_pls_base_cmd_ack(int status, orte_process_name_t* sender, } -int orte_pls_base_orted_exit(opal_list_t *daemons) +int orte_pls_base_orted_cancel_operation(void) +{ + /* protect for threads */ + OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock); + + /* cancel any waiting receive - we don't want to hear it */ + orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED_ACK); + + /* set the completion status to reflect cancellation */ + completion_status = ORTE_ERR_INTERUPTED; + + /* declare us "done" so we can exit cleanly */ + opal_condition_signal(&orte_pls_base.orted_cmd_cond); + + /* unlock us */ + OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock); + + return ORTE_SUCCESS; +} + +int orte_pls_base_orted_exit(opal_list_t *daemons, struct timeval *timeout) { int rc; orte_buffer_t cmd; orte_daemon_cmd_flag_t command=ORTE_DAEMON_EXIT_CMD; opal_list_item_t *item; orte_pls_daemon_info_t *dmn; + opal_event_t* event = NULL; OPAL_TRACE(1); @@ -89,7 +134,8 @@ int orte_pls_base_orted_exit(opal_list_t *daemons) /* pack the command */ if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &command, 1, ORTE_DAEMON_CMD))) { ORTE_ERROR_LOG(rc); - goto CLEANUP; + OBJ_DESTRUCT(&cmd); + return rc; } /* send the commands as fast as we can */ @@ -106,7 +152,8 @@ int orte_pls_base_orted_exit(opal_list_t *daemons) } orted_cmd_num_active++; } - + OBJ_DESTRUCT(&cmd); + /* post the receive for the ack's */ rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED_ACK, ORTE_RML_NON_PERSISTENT, orte_pls_base_cmd_ack, NULL); @@ -115,29 +162,51 @@ int orte_pls_base_orted_exit(opal_list_t *daemons) return rc; } + /* define the default completion status */ + completion_status = ORTE_SUCCESS; + /* wait for all commands to have been ack'd */ OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock); if (orted_cmd_num_active > 0) { + 
/* setup a delay to give the orteds time to complete their departure - wake us up if they + * don't exit by the prescribed time + */ + if (NULL != timeout && /* only do this if the user gave us a time to wait */ + NULL != (event = (opal_event_t*)malloc(sizeof(opal_event_t)))) { + opal_evtimer_set(event, orte_pls_base_orted_default_wakeup, NULL); + opal_evtimer_add(event, timeout); + } + + /* now go to sleep until woken up */ opal_condition_wait(&orte_pls_base.orted_cmd_cond, &orte_pls_base.orted_cmd_lock); } OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock); -CLEANUP: - OBJ_DESTRUCT(&cmd); + /* log an error if one occurred */ + if (ORTE_SUCCESS != completion_status) { + ORTE_ERROR_LOG(completion_status); + } + + /* if started, kill the timer event so it doesn't hit us later */ + if (NULL != event) { + opal_evtimer_del(event); + free(event); + } /* we're done! */ - return ORTE_SUCCESS; + return completion_status; } -int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job) +int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job, struct timeval *timeout) { int rc; orte_buffer_t cmd; orte_daemon_cmd_flag_t command=ORTE_DAEMON_KILL_LOCAL_PROCS; opal_list_item_t *item; orte_pls_daemon_info_t *dmn; - + opal_event_t* event = NULL; + OPAL_TRACE(1); OBJ_CONSTRUCT(&cmd, orte_buffer_t); @@ -145,13 +214,15 @@ int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job) /* pack the command */ if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &command, 1, ORTE_DAEMON_CMD))) { ORTE_ERROR_LOG(rc); - goto CLEANUP; + OBJ_DESTRUCT(&cmd); + return rc; } /* pack the jobid */ if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &job, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); - goto CLEANUP; + OBJ_DESTRUCT(&cmd); + return rc; } /* send the commands as fast as we can */ @@ -169,6 +240,7 @@ int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job) orted_cmd_num_active++; } + OBJ_DESTRUCT(&cmd); /* post the receive 
for the ack's */ rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED_ACK, @@ -178,19 +250,38 @@ int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job) return rc; } + /* define the default completion status */ + completion_status = ORTE_SUCCESS; + /* wait for all commands to have been received */ OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock); if (orted_cmd_num_active > 0) { + /* setup a delay to give the orteds time to complete their departure - wake us up if they + * don't exit by the prescribed time + */ + if (NULL != timeout && /* only do this if the user gave us a time to wait */ + NULL != (event = (opal_event_t*)malloc(sizeof(opal_event_t)))) { + opal_evtimer_set(event, orte_pls_base_orted_default_wakeup, NULL); + opal_evtimer_add(event, timeout); + } + /* now go to sleep until woken up */ opal_condition_wait(&orte_pls_base.orted_cmd_cond, &orte_pls_base.orted_cmd_lock); } OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock); - -CLEANUP: - OBJ_DESTRUCT(&cmd); + /* log an error if one occurred */ + if (ORTE_SUCCESS != completion_status) { + ORTE_ERROR_LOG(completion_status); + } + + /* if started, kill the timer event so it doesn't hit us later */ + if (NULL != event) { + opal_evtimer_del(event); + free(event); + } /* we're done! 
*/ - return ORTE_SUCCESS; + return completion_status; } diff --git a/orte/mca/pls/base/pls_base_receive.c b/orte/mca/pls/base/pls_base_receive.c index 810233564b..78a1acf6cb 100644 --- a/orte/mca/pls/base/pls_base_receive.c +++ b/orte/mca/pls/base/pls_base_receive.c @@ -99,6 +99,8 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender, int32_t signal; opal_list_t attrs; opal_list_item_t *item; + struct timeval timeout; + int32_t secs, microsecs; int rc; count = 1; @@ -127,12 +129,14 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender, break; case ORTE_PLS_TERMINATE_JOB_CMD: + /* get the jobid to be terminated */ count = 1; if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &job, &count, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); goto SEND_ANSWER; } + /* get any attributes */ OBJ_CONSTRUCT(&attrs, opal_list_t); count = 1; if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &attrs, &count, ORTE_ATTR_LIST))) { @@ -140,22 +144,39 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender, goto SEND_ANSWER; } + /* get the timeout - packed as two separate int32's */ + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &secs, &count, ORTE_INT32))) { + ORTE_ERROR_LOG(rc); + goto SEND_ANSWER; + } + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &microsecs, &count, ORTE_INT32))) { + ORTE_ERROR_LOG(rc); + goto SEND_ANSWER; + } + timeout.tv_sec = secs; + timeout.tv_usec = microsecs; - if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job, &attrs))) { + /* issue the command */ + if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job, &timeout, &attrs))) { ORTE_ERROR_LOG(rc); } + /* cleanup attribute list */ while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item); OBJ_DESTRUCT(&attrs); break; case ORTE_PLS_TERMINATE_ORTEDS_CMD: + /* get the jobid whose daemons are to be terminated */ count = 1; if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &job, &count, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); goto SEND_ANSWER; } + /* 
get any attributes */ OBJ_CONSTRUCT(&attrs, opal_list_t); count = 1; if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &attrs, &count, ORTE_ATTR_LIST))) { @@ -163,10 +184,26 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender, goto SEND_ANSWER; } - if (ORTE_SUCCESS != (rc = orte_pls.terminate_orteds(job, &attrs))) { + /* get the timeout - packed as two separate int32's */ + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &secs, &count, ORTE_INT32))) { + ORTE_ERROR_LOG(rc); + goto SEND_ANSWER; + } + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &microsecs, &count, ORTE_INT32))) { + ORTE_ERROR_LOG(rc); + goto SEND_ANSWER; + } + timeout.tv_sec = secs; + timeout.tv_usec = microsecs; + + /* issue the command */ + if (ORTE_SUCCESS != (rc = orte_pls.terminate_orteds(job, &timeout, &attrs))) { ORTE_ERROR_LOG(rc); } + /* cleanup attribute list */ while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item); OBJ_DESTRUCT(&attrs); break; @@ -229,6 +266,13 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender, } break; + case ORTE_PLS_CANCEL_OPERATION_CMD: + /* issue the command */ + if (ORTE_SUCCESS != (rc = orte_pls.cancel_operation())) { + ORTE_ERROR_LOG(rc); + } + break; + default: ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS); } diff --git a/orte/mca/pls/base/pls_private.h b/orte/mca/pls/base/pls_private.h index 9362692090..d1f89d7f62 100644 --- a/orte/mca/pls/base/pls_private.h +++ b/orte/mca/pls/base/pls_private.h @@ -26,6 +26,10 @@ */ #include "orte_config.h" +#ifdef HAVE_SYS_TIME_H +#include <sys/time.h> +#endif + #include "opal/class/opal_list.h" #include "orte/dss/dss_types.h" @@ -41,18 +45,19 @@ extern "C" { #endif - /* - * pls proxy commands - */ - typedef uint8_t orte_pls_cmd_flag_t; - #define ORTE_PLS_CMD ORTE_UINT8 - #define ORTE_PLS_LAUNCH_JOB_CMD 1 - #define ORTE_PLS_TERMINATE_JOB_CMD 2 - #define ORTE_PLS_TERMINATE_PROC_CMD 3 - #define ORTE_PLS_SIGNAL_JOB_CMD 4 - #define ORTE_PLS_SIGNAL_PROC_CMD 5 - 
#define ORTE_PLS_TERMINATE_ORTEDS_CMD 6 - +/* + * pls proxy commands + */ +typedef uint8_t orte_pls_cmd_flag_t; +#define ORTE_PLS_CMD ORTE_UINT8 +#define ORTE_PLS_LAUNCH_JOB_CMD 1 +#define ORTE_PLS_TERMINATE_JOB_CMD 2 +#define ORTE_PLS_TERMINATE_PROC_CMD 3 +#define ORTE_PLS_SIGNAL_JOB_CMD 4 +#define ORTE_PLS_SIGNAL_PROC_CMD 5 +#define ORTE_PLS_TERMINATE_ORTEDS_CMD 6 +#define ORTE_PLS_CANCEL_OPERATION_CMD 7 + /* * object for daemon information */ @@ -75,9 +80,10 @@ extern "C" { /** * Utilities for pls components that use proxy daemons */ - ORTE_DECLSPEC int orte_pls_base_orted_exit(opal_list_t *daemons); - ORTE_DECLSPEC int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job); - ORTE_DECLSPEC int orte_pls_base_orted_signal_local_procs(opal_list_t *daemons, int32_t signal); + int orte_pls_base_orted_cancel_operation(void); + int orte_pls_base_orted_exit(opal_list_t *daemons, struct timeval *timeout); + int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job, struct timeval *timeout); + int orte_pls_base_orted_signal_local_procs(opal_list_t *daemons, int32_t signal); int orte_pls_base_orted_add_local_procs(opal_list_t *dmnlist, orte_gpr_notify_data_t *ndat); ORTE_DECLSPEC int orte_pls_base_get_active_daemons(opal_list_t *daemons, orte_jobid_t job, opal_list_t *attrs); diff --git a/orte/mca/pls/bproc/pls_bproc.c b/orte/mca/pls/bproc/pls_bproc.c index 48e2a098e9..8e06175229 100644 --- a/orte/mca/pls/bproc/pls_bproc.c +++ b/orte/mca/pls/bproc/pls_bproc.c @@ -40,6 +40,9 @@ #ifdef HAVE_STRING_H #include #endif /* HAVE_STRING_H */ +#ifdef HAVE_SYS_TIME_H +#include +#endif #include "opal/install_dirs.h" #include "opal/class/opal_list.h" @@ -69,6 +72,7 @@ #include "orte/mca/smr/smr.h" #include "orte/runtime/orte_wait.h" #include "orte/runtime/runtime.h" +#include "orte/runtime/params.h" #include "orte/mca/pls/base/pls_private.h" #include "pls_bproc.h" @@ -513,6 +517,14 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t 
*map, char ***envp) { /* setup the daemon environment */ orte_pls_bproc_setup_env(envp); + /* direct the daemons to drop contact files so the local procs + * can learn how to contact them - this is used for routing + * OOB messaging + */ + var = mca_base_param_environ_variable("odls","base","drop_contact_file"); + opal_setenv(var,"1", true, envp); + free(var); + /* daemons calculate their process name using a "stride" of one, so * push that value into their environment */ stride = 1; @@ -704,7 +716,7 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) { } rc = ORTE_ERROR; ORTE_ERROR_LOG(rc); - orte_pls_bproc_terminate_job(map->job, NULL); + orte_pls_bproc_terminate_job(map->job, &orte_abort_timeout, NULL); goto cleanup; } } @@ -767,10 +779,10 @@ orte_pls_bproc_node_failed(orte_gpr_notify_message_t *msg) orte_schema.extract_jobid_from_std_trigger_name(&job, msg->target); /* terminate all jobs in the in the job family */ - orte_pls_bproc_terminate_job(job, NULL); + orte_pls_bproc_terminate_job(job, &orte_abort_timeout, NULL); /* kill the daemons */ - orte_pls_bproc_terminate_job(0, NULL); + orte_pls_bproc_terminate_job(0, &orte_abort_timeout, NULL); /* shouldn't ever get here.. 
*/ exit(1); @@ -1159,7 +1171,7 @@ cleanup: /** * Terminate all processes associated with this job */ -int orte_pls_bproc_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) { +int orte_pls_bproc_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { pid_t* pids; orte_std_cntr_t i, num_pids; int rc; @@ -1189,7 +1201,7 @@ int orte_pls_bproc_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) { /** * Terminate the orteds for a given job */ -int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) +int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -1205,7 +1217,7 @@ int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) } /* now tell them to die! */ - if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons, timeout))) { ORTE_ERROR_LOG(rc); } @@ -1295,6 +1307,23 @@ int orte_pls_bproc_signal_proc(const orte_process_name_t* proc_name, int32_t sig return ORTE_SUCCESS; } +/** + * Cancel an operation involving comm to an orted + */ +int orte_pls_bproc_cancel_operation(void) +{ + int rc; + + OPAL_TRACE(1); + + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) { + ORTE_ERROR_LOG(rc); + } + + return rc; +} + + /** * Module cleanup */ diff --git a/orte/mca/pls/bproc/pls_bproc.h b/orte/mca/pls/bproc/pls_bproc.h index 3207912f85..86c170b929 100644 --- a/orte/mca/pls/bproc/pls_bproc.h +++ b/orte/mca/pls/bproc/pls_bproc.h @@ -42,6 +42,9 @@ #include "orte/orte_constants.h" #include +#ifdef HAVE_SYS_TIME_H +#include +#endif #include "opal/threads/condition.h" @@ -72,11 +75,12 @@ int orte_pls_bproc_finalize(void); * Interface */ int orte_pls_bproc_launch(orte_jobid_t); -int orte_pls_bproc_terminate_job(orte_jobid_t, opal_list_t*); +int orte_pls_bproc_terminate_job(orte_jobid_t, struct timeval *timeout, opal_list_t*); int 
orte_pls_bproc_terminate_proc(const orte_process_name_t* proc_name); -int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid, opal_list_t*); +int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t*); int orte_pls_bproc_signal_job(orte_jobid_t, int32_t, opal_list_t*); int orte_pls_bproc_signal_proc(const orte_process_name_t* proc_name, int32_t); +int orte_pls_bproc_cancel_operation(void); /* Utility routine to get/set process pid */ ORTE_DECLSPEC int orte_pls_bproc_set_proc_pid(const orte_process_name_t*, pid_t, int); diff --git a/orte/mca/pls/cnos/pls_cnos.c b/orte/mca/pls/cnos/pls_cnos.c index e53129acdf..c4e053bd00 100644 --- a/orte/mca/pls/cnos/pls_cnos.c +++ b/orte/mca/pls/cnos/pls_cnos.c @@ -26,6 +26,10 @@ #ifdef HAVE_SIGNAL_H #include #endif +#ifdef HAVE_SYS_TIME_H +#include +#endif + #ifdef HAVE_CNOS_PM_BARRIER #include #endif @@ -38,12 +42,13 @@ static int orte_pls_cnos_launch_job(orte_jobid_t jobid); -static int orte_pls_cnos_terminate_job(orte_jobid_t jobid, opal_list_t *attrs); -static int orte_pls_cnos_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs); +static int orte_pls_cnos_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs); +static int orte_pls_cnos_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs); static int orte_pls_cnos_terminate_proc(const orte_process_name_t* proc_name); static int orte_pls_cnos_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs); static int orte_pls_cnos_signal_proc(const orte_process_name_t* proc_name, int32_t signal); static int orte_pls_cnos_finalize(void); +static int orte_pls_cnos_cancel_operation(void); orte_pls_base_module_t orte_pls_cnos_module = { @@ -53,6 +58,7 @@ orte_pls_base_module_t orte_pls_cnos_module = { orte_pls_cnos_terminate_proc, orte_pls_cnos_signal_job, orte_pls_cnos_signal_proc, + orte_pls_cnos_cancel_operation, orte_pls_cnos_finalize }; @@ -68,7 +74,7 @@ static int 
orte_pls_cnos_launch_job(orte_jobid_t jobid) extern int killrank(rank_t RANK, int SIG); #endif -static int orte_pls_cnos_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) +static int orte_pls_cnos_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { orte_jobid_t my_jobid = ORTE_PROC_MY_NAME->jobid; @@ -85,7 +91,7 @@ static int orte_pls_cnos_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) } -static int orte_pls_cnos_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) +static int orte_pls_cnos_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { orte_jobid_t my_jobid = ORTE_PROC_MY_NAME->jobid; @@ -131,6 +137,11 @@ static int orte_pls_cnos_signal_proc(const orte_process_name_t* proc_name, int32 return ORTE_ERR_NOT_SUPPORTED; } +int orte_pls_rsh_cancel_operation(void) +{ + return ORTE_ERR_NOT_SUPPORTED; +} + static int orte_pls_cnos_finalize(void) { return ORTE_SUCCESS; diff --git a/orte/mca/pls/gridengine/pls_gridengine.h b/orte/mca/pls/gridengine/pls_gridengine.h index e141261809..34397ec5a3 100644 --- a/orte/mca/pls/gridengine/pls_gridengine.h +++ b/orte/mca/pls/gridengine/pls_gridengine.h @@ -90,6 +90,11 @@ #define ORTE_PLS_GRIDENGINE_EXPORT_H #include "orte_config.h" + +#ifdef HAVE_SYS_TIME_H +#include +#endif + #include "orte/mca/pls/pls.h" #include "opal/mca/mca.h" @@ -113,11 +118,12 @@ int orte_pls_gridengine_finalize(void); * Interface */ int orte_pls_gridengine_launch_job(orte_jobid_t); -int orte_pls_gridengine_terminate_job(orte_jobid_t, opal_list_t *attrs); -int orte_pls_gridengine_terminate_orteds(orte_jobid_t, opal_list_t *attrs); +int orte_pls_gridengine_terminate_job(orte_jobid_t, struct timeval *timeout, opal_list_t *attrs); +int orte_pls_gridengine_terminate_orteds(orte_jobid_t, struct timeval *timeout, opal_list_t *attrs); int orte_pls_gridengine_terminate_proc(const orte_process_name_t*); int orte_pls_gridengine_signal_job(orte_jobid_t, int32_t, opal_list_t *attrs); int 
orte_pls_gridengine_signal_proc(const orte_process_name_t*, int32_t); +int orte_pls_gridengine_cancel_operation(void); /** * PLS Component diff --git a/orte/mca/pls/gridengine/pls_gridengine_module.c b/orte/mca/pls/gridengine/pls_gridengine_module.c index bf9d569bd9..1c9006b70c 100644 --- a/orte/mca/pls/gridengine/pls_gridengine_module.c +++ b/orte/mca/pls/gridengine/pls_gridengine_module.c @@ -100,6 +100,7 @@ orte_pls_base_module_t orte_pls_gridengine_module = { orte_pls_gridengine_terminate_proc, orte_pls_gridengine_signal_job, orte_pls_gridengine_signal_proc, + orte_pls_gridengine_cancel_operation, orte_pls_gridengine_finalize }; @@ -774,7 +775,7 @@ static int update_slot_keyval(orte_ras_node_t* ras_node, int* slot_cnt) /** * Query the registry for all nodes participating in the job */ -int orte_pls_gridengine_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) +int orte_pls_gridengine_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -788,7 +789,7 @@ int orte_pls_gridengine_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) } /* order them to kill their local procs for this job */ - if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid, timeout))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } @@ -809,7 +810,7 @@ int orte_pls_gridengine_terminate_proc(const orte_process_name_t* proc) /** * Terminate the orteds for a given job */ -int orte_pls_gridengine_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) +int orte_pls_gridengine_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -823,7 +824,7 @@ int orte_pls_gridengine_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) } /* now tell them to die! 
*/ - if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons, timeout))) { ORTE_ERROR_LOG(rc); } @@ -872,6 +873,21 @@ int orte_pls_gridengine_signal_proc(const orte_process_name_t* proc, int32_t sig return ORTE_ERR_NOT_IMPLEMENTED; } +/** + * Cancel an operation involving comm to an orted + */ +int orte_pls_gridengine_cancel_operation(void) +{ + int rc; + + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) { + ORTE_ERROR_LOG(rc); + } + + return rc; +} + + /** * Finalize */ diff --git a/orte/mca/pls/pls.h b/orte/mca/pls/pls.h index 2968ad7f49..7f5bf77d7f 100644 --- a/orte/mca/pls/pls.h +++ b/orte/mca/pls/pls.h @@ -202,12 +202,12 @@ typedef int (*orte_pls_base_module_launch_job_fn_t)(orte_jobid_t); * Terminate any processes launched for the respective jobid by * this component. */ -typedef int (*orte_pls_base_module_terminate_job_fn_t)(orte_jobid_t, opal_list_t *attrs); +typedef int (*orte_pls_base_module_terminate_job_fn_t)(orte_jobid_t, struct timeval *timeout, opal_list_t *attrs); /** * Terminate the daemons associated with this jobid */ -typedef int (*orte_pls_base_module_terminate_orteds_fn_t)(orte_jobid_t, opal_list_t *attrs); +typedef int (*orte_pls_base_module_terminate_orteds_fn_t)(orte_jobid_t, struct timeval *timeout, opal_list_t *attrs); /** * Terminate a specific process. 
@@ -225,6 +225,11 @@ typedef int (*orte_pls_base_module_signal_job_fn_t)(orte_jobid_t, int32_t, opal_ */ typedef int (*orte_pls_base_module_signal_proc_fn_t)(const orte_process_name_t*, int32_t); +/** + * Cancel an ongoing operation involving communication to the orteds + */ +typedef int (*orte_pls_base_module_cancel_operation_fn_t)(void); + /** * Cleanup all resources held by the module */ @@ -240,6 +245,7 @@ struct orte_pls_base_module_1_3_0_t { orte_pls_base_module_terminate_proc_fn_t terminate_proc; orte_pls_base_module_signal_job_fn_t signal_job; orte_pls_base_module_signal_proc_fn_t signal_proc; + orte_pls_base_module_cancel_operation_fn_t cancel_operation; orte_pls_base_module_finalize_fn_t finalize; }; diff --git a/orte/mca/pls/poe/pls_poe_module.c b/orte/mca/pls/poe/pls_poe_module.c index 590fba993e..be015e8071 100644 --- a/orte/mca/pls/poe/pls_poe_module.c +++ b/orte/mca/pls/poe/pls_poe_module.c @@ -30,6 +30,9 @@ #ifdef HAVE_UNISTD_H #include #endif +#ifdef HAVE_SYS_TIME_H +#include +#endif #include "opal/mca/base/mca_base_param.h" #include "opal/util/argv.h" @@ -60,12 +63,13 @@ extern char **environ; * Local functions */ static int pls_poe_launch_job(orte_jobid_t jobid); -static int pls_poe_terminate_job(orte_jobid_t jobid, opal_list_t *attrs); -static int pls_poe_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs); +static int pls_poe_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs); +static int pls_poe_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs); static int pls_poe_terminate_proc(const orte_process_name_t *name); static int pls_poe_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs); static int pls_poe_signal_proc(const orte_process_name_t *name, int32_t signal); static int pls_poe_finalize(void); +static int pls_poe_cancel_operation(void); orte_pls_base_module_t orte_pls_poe_module = { pls_poe_launch_job, @@ -74,6 +78,7 @@ orte_pls_base_module_t orte_pls_poe_module 
= { pls_poe_terminate_proc, pls_poe_signal_job, pls_poe_signal_proc, + pls_poe_cancel_operation, pls_poe_finalize }; @@ -602,7 +607,7 @@ static int pls_poe_launch_job(orte_jobid_t jobid) return ORTE_ERR_NOT_IMPLEMENTED; } -static int pls_poe_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) +static int pls_poe_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { return ORTE_ERR_NOT_IMPLEMENTED; } @@ -613,7 +618,7 @@ static int pls_poe_terminate_proc(const orte_process_name_t *name) return ORTE_ERR_NOT_IMPLEMENTED; } -static int pls_poe_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) +static int pls_poe_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { return ORTE_ERR_NOT_IMPLEMENTED; } diff --git a/orte/mca/pls/process/pls_process.h b/orte/mca/pls/process/pls_process.h index 5fa64968f3..4124fd5a68 100644 --- a/orte/mca/pls/process/pls_process.h +++ b/orte/mca/pls/process/pls_process.h @@ -25,6 +25,10 @@ #include "orte_config.h" +#ifdef HAVE_SYS_TIME_H +#include +#endif + #include "opal/threads/condition.h" #include "opal/mca/mca.h" #include "orte/mca/pls/pls.h" @@ -49,11 +53,12 @@ int orte_pls_process_finalize(void); * Interface */ int orte_pls_process_launch(orte_jobid_t); -int orte_pls_process_terminate_job(orte_jobid_t, opal_list_t*); -int orte_pls_process_terminate_orteds(orte_jobid_t, opal_list_t*); +int orte_pls_process_terminate_job(orte_jobid_t, struct timeval *timeout, opal_list_t*); +int orte_pls_process_terminate_orteds(orte_jobid_t, struct timeval *timeout, opal_list_t*); int orte_pls_process_terminate_proc(const orte_process_name_t* proc_name); int orte_pls_process_signal_job(orte_jobid_t, int32_t, opal_list_t*); int orte_pls_process_signal_proc(const orte_process_name_t* proc_name, int32_t); +int orte_pls_process_cancel_operation(void); /** * PLS Component diff --git a/orte/mca/pls/process/pls_process_module.c b/orte/mca/pls/process/pls_process_module.c index 
ea942b2f58..586d7c4825 100644 --- a/orte/mca/pls/process/pls_process_module.c +++ b/orte/mca/pls/process/pls_process_module.c @@ -115,6 +115,7 @@ orte_pls_base_module_t orte_pls_process_module = { orte_pls_process_terminate_proc, orte_pls_process_signal_job, orte_pls_process_signal_proc, + orte_pls_process_cancel_operation, orte_pls_process_finalize }; @@ -1029,7 +1030,7 @@ cleanup: /** * Terminate all processes for a given job */ -int orte_pls_process_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) +int orte_pls_process_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -1045,7 +1046,7 @@ int orte_pls_process_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) } /* order them to kill their local procs for this job */ - if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid, timeout))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } @@ -1061,7 +1062,7 @@ CLEANUP: /** * Terminate the orteds for a given job */ -int orte_pls_process_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) +int orte_pls_process_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -1077,7 +1078,7 @@ int orte_pls_process_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) } /* now tell them to die! 
*/ - if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons, timeout))) { ORTE_ERROR_LOG(rc); } @@ -1134,6 +1135,23 @@ int orte_pls_process_signal_proc(const orte_process_name_t* proc, int32_t signal return ORTE_ERR_NOT_IMPLEMENTED; } +/** + * Cancel an operation involving comm to an orted + */ +int orte_pls_process_cancel_operation(void) +{ + int rc; + + OPAL_TRACE(1); + + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) { + ORTE_ERROR_LOG(rc); + } + + return rc; +} + + int orte_pls_process_finalize(void) { int rc; diff --git a/orte/mca/pls/proxy/pls_proxy.c b/orte/mca/pls/proxy/pls_proxy.c index bdc6603fef..3ccd733408 100644 --- a/orte/mca/pls/proxy/pls_proxy.c +++ b/orte/mca/pls/proxy/pls_proxy.c @@ -26,6 +26,10 @@ #include "orte/orte_constants.h" #include "orte/orte_types.h" +#ifdef HAVE_SYS_TIME_H +#include +#endif + #include "opal/util/output.h" #include "opal/util/trace.h" @@ -110,13 +114,14 @@ int orte_pls_proxy_launch(orte_jobid_t job) return ORTE_SUCCESS; } -int orte_pls_proxy_terminate_job(orte_jobid_t job, opal_list_t *attrs) +int orte_pls_proxy_terminate_job(orte_jobid_t job, struct timeval *timeout, opal_list_t *attrs) { orte_buffer_t* cmd; orte_buffer_t* answer; orte_pls_cmd_flag_t command, ret_cmd; orte_std_cntr_t count; int rc; + int32_t timefield; OPAL_TRACE(1); @@ -146,6 +151,20 @@ int orte_pls_proxy_terminate_job(orte_jobid_t job, opal_list_t *attrs) return rc; } + timefield = timeout->tv_sec; + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &timefield, 1, ORTE_INT32))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + + timefield = timeout->tv_usec; + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &timefield, 1, ORTE_INT32))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + if (0 > orte_rml.send_buffer(orte_pls_proxy_replica, cmd, ORTE_RML_TAG_PLS, 0)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); OBJ_RELEASE(cmd); @@ -182,13 +201,14 @@ 
int orte_pls_proxy_terminate_job(orte_jobid_t job, opal_list_t *attrs) return ORTE_SUCCESS; } -int orte_pls_proxy_terminate_orteds(orte_jobid_t job, opal_list_t *attrs) +int orte_pls_proxy_terminate_orteds(orte_jobid_t job, struct timeval *timeout, opal_list_t *attrs) { orte_buffer_t* cmd; orte_buffer_t* answer; orte_pls_cmd_flag_t command, ret_cmd; orte_std_cntr_t count; int rc; + int32_t timefield; OPAL_TRACE(1); @@ -218,6 +238,20 @@ int orte_pls_proxy_terminate_orteds(orte_jobid_t job, opal_list_t *attrs) return rc; } + timefield = timeout->tv_sec; + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &timefield, 1, ORTE_INT32))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + + timefield = timeout->tv_usec; + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &timefield, 1, ORTE_INT32))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + if (0 > orte_rml.send_buffer(orte_pls_proxy_replica, cmd, ORTE_RML_TAG_PLS, 0)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); OBJ_RELEASE(cmd); @@ -471,5 +505,63 @@ int orte_pls_proxy_signal_proc(const orte_process_name_t* name, int32_t signal) return ORTE_SUCCESS; } - +int orte_pls_proxy_cancel_operation(void) +{ + orte_buffer_t* cmd; + orte_buffer_t* answer; + orte_pls_cmd_flag_t command, ret_cmd; + orte_std_cntr_t count; + int rc; + + OPAL_TRACE(1); + + command = ORTE_PLS_CANCEL_OPERATION_CMD; + + cmd = OBJ_NEW(orte_buffer_t); + if (cmd == NULL) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &command, 1, ORTE_PLS_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + + if (0 > orte_rml.send_buffer(orte_pls_proxy_replica, cmd, ORTE_RML_TAG_PLS, 0)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(cmd); + return ORTE_ERR_COMM_FAILURE; + } + OBJ_RELEASE(cmd); + + answer = OBJ_NEW(orte_buffer_t); + if(answer == NULL) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + if (0 > 
orte_rml.recv_buffer(orte_pls_proxy_replica, answer, ORTE_RML_TAG_PLS)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(answer); + return ORTE_ERR_COMM_FAILURE; + } + + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &ret_cmd, &count, ORTE_PLS_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(answer); + return rc; + } + + if (ret_cmd != command) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(answer); + return ORTE_ERR_COMM_FAILURE; + } + + OBJ_RELEASE(answer); + return ORTE_SUCCESS; +} diff --git a/orte/mca/pls/proxy/pls_proxy.h b/orte/mca/pls/proxy/pls_proxy.h index 10db57fa4f..7445620327 100644 --- a/orte/mca/pls/proxy/pls_proxy.h +++ b/orte/mca/pls/proxy/pls_proxy.h @@ -22,6 +22,10 @@ #include "orte_config.h" +#ifdef HAVE_SYS_TIME_H +#include <sys/time.h> +#endif + #include "orte/mca/pls/pls.h" #if defined(c_plusplus) || defined(__cplusplus) @@ -48,11 +52,12 @@ int orte_pls_proxy_finalize(void); * proxy function prototypes */ int orte_pls_proxy_launch(orte_jobid_t job); -int orte_pls_proxy_terminate_job(orte_jobid_t job, opal_list_t *attrs); -int orte_pls_proxy_terminate_orteds(orte_jobid_t job, opal_list_t *attrs); +int orte_pls_proxy_terminate_job(orte_jobid_t job, struct timeval *timeout, opal_list_t *attrs); +int orte_pls_proxy_terminate_orteds(orte_jobid_t job, struct timeval *timeout, opal_list_t *attrs); int orte_pls_proxy_terminate_proc(const orte_process_name_t* name); int orte_pls_proxy_signal_job(orte_jobid_t job, int32_t signal, opal_list_t *attrs); int orte_pls_proxy_signal_proc(const orte_process_name_t* name, int32_t signal); +int orte_pls_proxy_cancel_operation(void); #if defined(c_plusplus) || defined(__cplusplus) diff --git a/orte/mca/pls/proxy/pls_proxy_component.c b/orte/mca/pls/proxy/pls_proxy_component.c index 583423432a..f32633ee5c 100644 --- a/orte/mca/pls/proxy/pls_proxy_component.c +++ b/orte/mca/pls/proxy/pls_proxy_component.c @@ -65,6 +65,7 @@ static orte_pls_base_module_t orte_pls_proxy_module = {
orte_pls_proxy_terminate_proc, orte_pls_proxy_signal_job, orte_pls_proxy_signal_proc, + orte_pls_proxy_cancel_operation, orte_pls_proxy_finalize }; diff --git a/orte/mca/pls/rsh/pls_rsh.h b/orte/mca/pls/rsh/pls_rsh.h index 554161d9e2..ece91faad5 100644 --- a/orte/mca/pls/rsh/pls_rsh.h +++ b/orte/mca/pls/rsh/pls_rsh.h @@ -25,8 +25,13 @@ #include "orte_config.h" +#ifdef HAVE_SYS_TIME_H +#include <sys/time.h> +#endif + #include "opal/threads/condition.h" #include "opal/mca/mca.h" + #include "orte/mca/pls/pls.h" #if defined(c_plusplus) || defined(__cplusplus) @@ -49,11 +54,12 @@ int orte_pls_rsh_finalize(void); * Interface */ int orte_pls_rsh_launch(orte_jobid_t); -int orte_pls_rsh_terminate_job(orte_jobid_t, opal_list_t*); -int orte_pls_rsh_terminate_orteds(orte_jobid_t, opal_list_t*); +int orte_pls_rsh_terminate_job(orte_jobid_t, struct timeval *timeout, opal_list_t*); +int orte_pls_rsh_terminate_orteds(orte_jobid_t, struct timeval *timeout, opal_list_t*); int orte_pls_rsh_terminate_proc(const orte_process_name_t* proc_name); int orte_pls_rsh_signal_job(orte_jobid_t, int32_t, opal_list_t*); int orte_pls_rsh_signal_proc(const orte_process_name_t* proc_name, int32_t); +int orte_pls_rsh_cancel_operation(void); /** * PLS Component diff --git a/orte/mca/pls/rsh/pls_rsh_module.c b/orte/mca/pls/rsh/pls_rsh_module.c index 5e3518ecef..1c779e4ca1 100644 --- a/orte/mca/pls/rsh/pls_rsh_module.c +++ b/orte/mca/pls/rsh/pls_rsh_module.c @@ -107,6 +107,7 @@ orte_pls_base_module_t orte_pls_rsh_module = { orte_pls_rsh_terminate_proc, orte_pls_rsh_signal_job, orte_pls_rsh_signal_proc, + orte_pls_rsh_cancel_operation, orte_pls_rsh_finalize }; @@ -943,6 +944,9 @@ int orte_pls_rsh_launch(orte_jobid_t jobid) argv[local_exec_index_end] = NULL; } + /* tell the daemon to setup its own process session/group */ + opal_argv_append(&argc, &argv, "--set-sid"); + /* Finally, chdir($HOME) because we're making the assumption that this is what will happen on remote nodes (via rsh/ssh).
This allows a user @@ -1128,7 +1132,7 @@ cleanup: /** * Terminate all processes for a given job */ -int orte_pls_rsh_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) +int orte_pls_rsh_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -1144,7 +1148,7 @@ int orte_pls_rsh_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) } /* order them to kill their local procs for this job */ - if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid, timeout))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } @@ -1160,7 +1164,7 @@ CLEANUP: /** * Terminate the orteds for a given job */ -int orte_pls_rsh_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) +int orte_pls_rsh_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -1176,7 +1180,7 @@ int orte_pls_rsh_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) } /* now tell them to die! 
*/ - if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons, timeout))) { ORTE_ERROR_LOG(rc); } @@ -1233,6 +1237,23 @@ int orte_pls_rsh_signal_proc(const orte_process_name_t* proc, int32_t signal) return ORTE_ERR_NOT_IMPLEMENTED; } +/** + * Cancel an operation involving comm to an orted + */ +int orte_pls_rsh_cancel_operation(void) +{ + int rc; + + OPAL_TRACE(1); + + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) { + ORTE_ERROR_LOG(rc); + } + + return rc; +} + + int orte_pls_rsh_finalize(void) { int rc; diff --git a/orte/mca/pls/slurm/pls_slurm_module.c b/orte/mca/pls/slurm/pls_slurm_module.c index 16997ac5ba..01e66aa1c5 100644 --- a/orte/mca/pls/slurm/pls_slurm_module.c +++ b/orte/mca/pls/slurm/pls_slurm_module.c @@ -74,12 +74,13 @@ * Local functions */ static int pls_slurm_launch_job(orte_jobid_t jobid); -static int pls_slurm_terminate_job(orte_jobid_t jobid, opal_list_t *attrs); -static int pls_slurm_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs); +static int pls_slurm_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs); +static int pls_slurm_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs); static int pls_slurm_terminate_proc(const orte_process_name_t *name); static int pls_slurm_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs); static int pls_slurm_signal_proc(const orte_process_name_t *name, int32_t signal); static int pls_slurm_finalize(void); +static int pls_slurm_cancel_operation(void); static int pls_slurm_start_proc(int argc, char **argv, char **env, char *prefix); @@ -95,6 +96,7 @@ orte_pls_base_module_1_3_0_t orte_pls_slurm_module = { pls_slurm_terminate_proc, pls_slurm_signal_job, pls_slurm_signal_proc, + pls_slurm_cancel_operation, pls_slurm_finalize }; @@ -443,7 +445,7 @@ cleanup: } -static int pls_slurm_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) +static int 
pls_slurm_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -457,7 +459,7 @@ static int pls_slurm_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) } /* order them to kill their local procs for this job */ - if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid, timeout))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } @@ -474,7 +476,7 @@ CLEANUP: /** * Terminate the orteds for a given job */ -static int pls_slurm_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) +static int pls_slurm_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -488,7 +490,7 @@ static int pls_slurm_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) } /* order them to go away */ - if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons, timeout))) { ORTE_ERROR_LOG(rc); } @@ -534,6 +536,21 @@ static int pls_slurm_signal_proc(const orte_process_name_t *name, int32_t signal } +/** + * Cancel an operation involving comm to an orted + */ +int pls_slurm_cancel_operation(void) +{ + int rc; + + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) { + ORTE_ERROR_LOG(rc); + } + + return rc; +} + + static int pls_slurm_finalize(void) { int rc; diff --git a/orte/mca/pls/tm/pls_tm_module.c b/orte/mca/pls/tm/pls_tm_module.c index cd2ae4b849..d9973f3731 100644 --- a/orte/mca/pls/tm/pls_tm_module.c +++ b/orte/mca/pls/tm/pls_tm_module.c @@ -79,11 +79,12 @@ * Local functions */ static int pls_tm_launch_job(orte_jobid_t jobid); -static int pls_tm_terminate_job(orte_jobid_t jobid, opal_list_t *attrs); -static int pls_tm_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs); +static int pls_tm_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs); +static int 
pls_tm_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs); static int pls_tm_terminate_proc(const orte_process_name_t *name); static int pls_tm_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs); static int pls_tm_signal_proc(const orte_process_name_t *name, int32_t signal); +static int pls_tm_cancel_operation(void); static int pls_tm_finalize(void); static int pls_tm_connect(void); @@ -559,7 +560,7 @@ static int pls_tm_launch_job(orte_jobid_t jobid) } -static int pls_tm_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) +static int pls_tm_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -573,7 +574,7 @@ static int pls_tm_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) } /* order them to kill their local procs for this job */ - if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid, timeout))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } @@ -590,7 +591,7 @@ CLEANUP: /** * Terminate the orteds for a given job */ -int pls_tm_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) +int pls_tm_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -604,7 +605,7 @@ int pls_tm_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) } /* now tell them to die! 
*/ - if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons, timeout))) { ORTE_ERROR_LOG(rc); } @@ -661,6 +662,21 @@ static int pls_tm_signal_proc(const orte_process_name_t *name, int32_t signal) } +/** + * Cancel an operation involving comm to an orted + */ +static int pls_tm_cancel_operation(void) +{ + int rc; + + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) { + ORTE_ERROR_LOG(rc); + } + + return rc; +} + + /* * Free stuff */ diff --git a/orte/mca/pls/xcpu/pls_xcpu.c b/orte/mca/pls/xcpu/pls_xcpu.c index 1ba57b2046..bdda5cd7b9 100644 --- a/orte/mca/pls/xcpu/pls_xcpu.c +++ b/orte/mca/pls/xcpu/pls_xcpu.c @@ -40,6 +40,9 @@ #ifdef HAVE_STRING_H #include <string.h> #endif /* HAVE_STRING_H */ +#ifdef HAVE_SYS_TIME_H +#include <sys/time.h> +#endif #include "opal/event/event.h" #include "opal/mca/base/mca_base_param.h" @@ -86,6 +89,7 @@ orte_pls_base_module_t orte_pls_xcpu_module = { orte_pls_xcpu_terminate_proc, orte_pls_xcpu_signal_job, orte_pls_xcpu_signal_proc, + orte_pls_xcpu_cancel_operation, orte_pls_xcpu_finalize }; @@ -357,7 +361,7 @@ error: return rc; } -int orte_pls_xcpu_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) +int orte_pls_xcpu_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { int i, rc; orte_job_map_t *map; @@ -378,7 +382,7 @@ int orte_pls_xcpu_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) return ORTE_SUCCESS; } -int orte_pls_xcpu_terminate_orteds(orte_jobid_t jobid, opal_list_t * attrs) +int orte_pls_xcpu_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t * attrs) { return ORTE_SUCCESS; } @@ -424,6 +428,14 @@ int orte_pls_xcpu_signal_proc(const orte_process_name_t* proc_name, int32_t sig) return ORTE_SUCCESS; } +/** + * Cancel an operation involving comm to an orted + */ +int orte_pls_xcpu_cancel_operation(void) +{ + return ORTE_SUCCESS; +} + int orte_pls_xcpu_finalize(void) { return ORTE_SUCCESS; diff --git
a/orte/mca/pls/xcpu/pls_xcpu.h b/orte/mca/pls/xcpu/pls_xcpu.h index e61968d0b7..11b03b439e 100644 --- a/orte/mca/pls/xcpu/pls_xcpu.h +++ b/orte/mca/pls/xcpu/pls_xcpu.h @@ -32,6 +32,11 @@ #define orte_pls_xcpu_H_ #include "orte_config.h" + +#ifdef HAVE_SYS_TIME_H +#include <sys/time.h> +#endif + #include "orte/class/orte_pointer_array.h" #include "orte/orte_constants.h" #include "orte/mca/pls/base/base.h" @@ -57,11 +62,12 @@ orte_pls_base_module_t* orte_pls_xcpu_init(int *priority); /* in component file * Interface */ int orte_pls_xcpu_launch_job(orte_jobid_t); -int orte_pls_xcpu_terminate_job(orte_jobid_t, opal_list_t *); - int orte_pls_xcpu_terminate_orteds(orte_jobid_t jobid, opal_list_t * attrs); +int orte_pls_xcpu_terminate_job(orte_jobid_t, struct timeval *timeout, opal_list_t *); +int orte_pls_xcpu_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t * attrs); int orte_pls_xcpu_terminate_proc(const orte_process_name_t* proc_name); int orte_pls_xcpu_signal_job(orte_jobid_t jobid, int32_t sig, opal_list_t*); int orte_pls_xcpu_signal_proc(const orte_process_name_t* proc_name, int32_t sig); +int orte_pls_xcpu_cancel_operation(void); int orte_pls_xcpu_finalize(void); void orte_pls_xcpu_close_sessions(void); diff --git a/orte/mca/pls/xgrid/src/pls_xgrid_module.m b/orte/mca/pls/xgrid/src/pls_xgrid_module.m index 50bfdf48c5..de407845d2 100644 --- a/orte/mca/pls/xgrid/src/pls_xgrid_module.m +++ b/orte/mca/pls/xgrid/src/pls_xgrid_module.m @@ -27,6 +27,9 @@ #import #import #import +#ifdef HAVE_SYS_TIME_H +#include <sys/time.h> +#endif #import "orte/orte_constants.h" #import "opal/util/argv.h" @@ -45,14 +48,14 @@ #import "pls_xgrid.h" int orte_pls_xgrid_launch(orte_jobid_t jobid); -int orte_pls_xgrid_terminate_job(orte_jobid_t jobid, opal_list_t *attrs); -int orte_pls_xgrid_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs); +int orte_pls_xgrid_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs); +int
orte_pls_xgrid_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs); int orte_pls_xgrid_terminate_proc(const orte_process_name_t* proc); int orte_pls_xgrid_signal_job(orte_jobid_t job, int32_t signal, opal_list_t *attrs); int orte_pls_xgrid_signal_proc(const orte_process_name_t* proc_name, int32_t signal); +int orte_pls_xgrid_cancel_operation(void); int orte_pls_xgrid_finalize(void); - orte_pls_base_module_1_3_0_t orte_pls_xgrid_module = { orte_pls_xgrid_launch, orte_pls_xgrid_terminate_job, @@ -60,6 +63,7 @@ orte_pls_base_module_1_3_0_t orte_pls_xgrid_module = { orte_pls_xgrid_terminate_proc, orte_pls_xgrid_signal_job, orte_pls_xgrid_signal_proc, + orte_pls_xgrid_cancel_operation, orte_pls_xgrid_finalize }; @@ -81,7 +85,7 @@ orte_pls_xgrid_launch(orte_jobid_t jobid) * Terminate all processes for a given job */ int -orte_pls_xgrid_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) +orte_pls_xgrid_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -95,7 +99,7 @@ orte_pls_xgrid_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) } /* order them to kill their local procs for this job */ - if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid, timeout))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } @@ -118,7 +122,7 @@ CLEANUP: * Terminate the orteds for a given job */ int -orte_pls_xgrid_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) +orte_pls_xgrid_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -132,7 +136,7 @@ orte_pls_xgrid_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) } /* now tell them to die! 
*/ - if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons, timeout))) { ORTE_ERROR_LOG(rc); } @@ -195,6 +199,19 @@ orte_pls_xgrid_signal_proc(const orte_process_name_t* proc, int32_t signal) } +int +orte_pls_xgrid_cancel_operation(void) +{ + int rc; + + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) { + ORTE_ERROR_LOG(rc); + } + + return rc; +} + + int orte_pls_xgrid_finalize(void) { diff --git a/orte/runtime/orte_init.c b/orte/runtime/orte_init.c index 4b33b1a89f..4b13226288 100644 --- a/orte/runtime/orte_init.c +++ b/orte/runtime/orte_init.c @@ -33,9 +33,6 @@ * @retval ORTE_ERROR Upon failure. */ -/* globals used by RTE */ -int orte_debug_flag=(int)false; - int orte_init(bool infrastructure) { int rc; diff --git a/orte/runtime/orte_init_stage1.c b/orte/runtime/orte_init_stage1.c index b71956fd63..b958516351 100644 --- a/orte/runtime/orte_init_stage1.c +++ b/orte/runtime/orte_init_stage1.c @@ -133,18 +133,6 @@ int orte_init_stage1(bool infrastructure) /***** ERROR LOGGING NOW AVAILABLE *****/ - /* check for debug flag */ - if (0 > (ret = mca_base_param_register_int("orte", "debug", NULL, NULL, 0))) { - ORTE_ERROR_LOG(ret); - error = "mca_base_param_register_int"; - goto error; - } - if (ORTE_SUCCESS != (ret = mca_base_param_lookup_int(ret, &orte_debug_flag))) { - ORTE_ERROR_LOG(ret); - error = "mca_base_param_lookup_int"; - goto error; - } - /* * Initialize the event library */ diff --git a/orte/runtime/orte_params.c b/orte/runtime/orte_params.c index 3dc0e56ec2..e2673fdeaa 100644 --- a/orte/runtime/orte_params.c +++ b/orte/runtime/orte_params.c @@ -20,14 +20,27 @@ #include "orte/orte_constants.h" +#ifdef HAVE_SYS_TIME_H +#include <sys/time.h> +#endif + #include "opal/mca/base/mca_base_param.h" + #include "orte/runtime/runtime.h" +#include "orte/runtime/params.h" + +/* globals used by RTE */ +int orte_debug_flag; +struct timeval orte_abort_timeout; int orte_register_params(bool
infrastructure) { + int value; + mca_base_param_reg_int_name("orte", "debug", "Top-level ORTE debug switch", - false, false, (int)false, NULL); + false, false, (int)false, &value); + orte_debug_flag = OPAL_INT_TO_BOOL(value); mca_base_param_reg_int_name("orte_debug", "daemons_file", "Whether want stdout/stderr of daemons to go to a file or not", @@ -51,8 +64,14 @@ int orte_register_params(bool infrastructure) "Sequence of user-level debuggers to search for in orterun", false, false, "totalview @mpirun@ -a @mpirun_args@ : fxp @mpirun@ -a @mpirun_args@", NULL); + + mca_base_param_reg_int_name("orte", "abort_timeout", + "Time to wait [in seconds] before giving up on aborting an ORTE operation", + false, false, 10, &value); + orte_abort_timeout.tv_sec = value; + orte_abort_timeout.tv_usec = 0; + /* All done */ - return ORTE_SUCCESS; } diff --git a/orte/runtime/params.h b/orte/runtime/params.h new file mode 100644 index 0000000000..610e06452f --- /dev/null +++ b/orte/runtime/params.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * + * Global params for OpenRTE + */ +#ifndef ORTE_RUNTIME_PARAM_H +#define ORTE_RUNTIME_PARAM_H + +#include "orte_config.h" + +#ifdef HAVE_SYS_TIME_H +#include <sys/time.h> +#endif + + +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif + +/* globals used by RTE - instanced in orte_params.c */ + +ORTE_DECLSPEC extern int orte_debug_flag; + +ORTE_DECLSPEC extern struct timeval orte_abort_timeout; + +#if defined(c_plusplus) || defined(__cplusplus) +} +#endif + +#endif /* ORTE_RUNTIME_PARAM_H */ diff --git a/orte/tools/orted/orted.c b/orte/tools/orted/orted.c index 233bc6768a..f430fa3797 100644 --- a/orte/tools/orted/orted.c +++ b/orte/tools/orted/orted.c @@ -69,6 +69,7 @@ #include "orte/mca/pls/pls.h" #include "orte/runtime/runtime.h" +#include "orte/runtime/params.h" #include "orte/tools/orted/orted.h" @@ -126,6 +127,10 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = { &orted_globals.bootproxy, OPAL_CMD_LINE_TYPE_INT, "Run as boot proxy for " }, + { NULL, NULL, NULL, '\0', NULL, "set-sid", 0, + &orted_globals.set_sid, OPAL_CMD_LINE_TYPE_BOOL, + "Direct the orted to separate from the current session"}, + { NULL, NULL, NULL, '\0', NULL, "name", 1, &orted_globals.name, OPAL_CMD_LINE_TYPE_STRING, "Set the orte process name"}, @@ -206,6 +211,9 @@ int main(int argc, char *argv[]) /* save the environment for use when launching application processes */ orted_globals.saved_environ = opal_argv_copy(environ); + /* setup mca param system */ + mca_base_param_init(); + /* setup to check common command line options that just report and die */ cmd_line = OBJ_NEW(opal_cmd_line_t); opal_cmd_line_create(cmd_line, orte_cmd_line_opts); @@ -229,6 +237,11 @@ int main(int argc, char *argv[]) return 1; } + /* see if we were directed to separate from current session */ + if (orted_globals.set_sid) { + setsid(); + } + /* see if they want us to spin until they can connect a debugger to us */
i=0; while (orted_globals.spin) { @@ -741,7 +754,7 @@ static void halt_vm(void) /* terminate the vm - this will also wake us up so we can exit */ OBJ_CONSTRUCT(&attrs, opal_list_t); orte_rmgr.add_attribute(&attrs, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE); - ret = orte_pls.terminate_orteds(0, &attrs); + ret = orte_pls.terminate_orteds(0, &orte_abort_timeout, &attrs); while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item); OBJ_DESTRUCT(&attrs); diff --git a/orte/tools/orted/orted.h b/orte/tools/orted/orted.h index c2c558cc08..f93411a0ed 100644 --- a/orte/tools/orted/orted.h +++ b/orte/tools/orted/orted.h @@ -35,6 +35,7 @@ typedef struct { bool debug; bool debug_daemons; bool debug_daemons_file; + bool set_sid; char* ns_nds; char* name; char* vpid_start; diff --git a/orte/tools/orteprobe/orteprobe.c b/orte/tools/orteprobe/orteprobe.c index 920af1bf92..8214b5a138 100644 --- a/orte/tools/orteprobe/orteprobe.c +++ b/orte/tools/orteprobe/orteprobe.c @@ -68,10 +68,12 @@ #include "orte/mca/schema/base/base.h" #include "orte/runtime/runtime.h" +#include "orte/runtime/params.h" #include "orte/runtime/orte_wait.h" #include "orte/tools/orteprobe/orteprobe.h" +#if 0 orteprobe_globals_t orteprobe_globals; /* @@ -127,6 +129,7 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = { { NULL, NULL, NULL, '\0', NULL, NULL, 0, NULL, OPAL_CMD_LINE_TYPE_NULL, NULL } }; +#endif #if !defined(__WINDOWS__) extern char **environ; @@ -134,6 +137,7 @@ extern char **environ; int main(int argc, char *argv[]) { +#if 0 int ret = 0, ortedargc; opal_cmd_line_t *cmd_line = NULL; char *contact_path = NULL, *orted=NULL; @@ -151,6 +155,10 @@ int main(int argc, char *argv[]) return ret; } + /* Setup MCA params */ + mca_base_param_init(); + orte_register_params(false); + /* setup to check common command line options that just report and die */ memset(&orteprobe_globals, 0, sizeof(orteprobe_globals)); cmd_line = OBJ_NEW(opal_cmd_line_t); @@ -464,4 +472,6 @@ int 
main(int argc, char *argv[]) fprintf(stderr, "orteprobe: system appears to not support remote probes\n"); exit(1); #endif +#endif + exit(1); } diff --git a/orte/tools/orterun/help-orterun.txt b/orte/tools/orterun/help-orterun.txt index 94ebfbd37a..9dde4d516d 100644 --- a/orte/tools/orterun/help-orterun.txt +++ b/orte/tools/orterun/help-orterun.txt @@ -96,6 +96,7 @@ WARNING: %s encountered an abnormal exit. This means that %s exited before it received notification that all started processes had terminated. You should double check and ensure that there are no runaway processes still executing. +# [orterun:sigint-while-processing] WARNING: %s is in the process of killing a job, but has detected an interruption (probably control-C). @@ -103,6 +104,13 @@ interruption (probably control-C). It is dangerous to interrupt %s while it is killing a job (proper termination may not be guaranteed). Hit control-C again within 1 second if you really want to kill %s immediately. +# +[orterun:forced-end-failed] +WARNING: %s was ordered to kill a job (probably with control-C), but +was unable to successfully complete that order (returned error %s). +You should double check and ensure that there are no runaway processes +still executing. +# [orterun:empty-prefix] A prefix was supplied to %s that only contained slashes. @@ -136,5 +144,5 @@ Things to check: - Ensure that any required licenses are available to run the debugger # [orterun:daemon-die] -%s was unable to cleanly terminate the daemons for this job. Returned value %d instead of ORTE_SUCCESS. +%s was unable to cleanly terminate the daemons for this job. Returned value %s instead of ORTE_SUCCESS. 
diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index dd81c06bcf..9d8d55e625 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -39,6 +39,9 @@ #ifdef HAVE_LIBGEN_H #include <libgen.h> #endif +#ifdef HAVE_SYS_TIME_H +#include <sys/time.h> +#endif #include "opal/event/event.h" #include "opal/install_dirs.h" @@ -71,6 +74,7 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/runtime/runtime.h" +#include "orte/runtime/params.h" #include "orte/runtime/orte_wait.h" #include "orterun.h" @@ -268,7 +272,7 @@ opal_cmd_line_init_t cmd_line_init[] = { &orterun_globals.do_not_launch, OPAL_CMD_LINE_TYPE_BOOL, "Perform all necessary operations to prepare to launch the application, but do not actually launch it" }, - { NULL, NULL, NULL, '\0', "reuse-daemons", "reuse-daemons", 0, + { "pls", "base", "reuse_daemons", '\0', "reuse-daemons", "reuse-daemons", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, "If set, reuse daemons to launch dynamically spawned processes"}, @@ -470,9 +474,9 @@ int orterun(int argc, char *argv[]) */ OBJ_CONSTRUCT(&attributes, opal_list_t); orte_rmgr.add_attribute(&attributes, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE); - if (ORTE_SUCCESS != (ret = orte_pls.terminate_orteds(jobid, &attributes))) { - opal_show_help("help-orterun.txt", "orterun:daemon-die", false, - orterun_basename, NULL, NULL, ret); + if (ORTE_SUCCESS != (ret = orte_pls.terminate_orteds(jobid, &orte_abort_timeout, &attributes))) { + opal_show_help("help-orterun.txt", "orterun:daemon-die", true, + orterun_basename, ORTE_ERROR_NAME(ret)); } while (NULL != (item = opal_list_remove_first(&attributes))) OBJ_RELEASE(item); OBJ_DESTRUCT(&attributes); @@ -686,9 +690,6 @@ static void exit_callback(int fd, short event, void *arg) { OPAL_TRACE(1); - opal_show_help("help-orterun.txt", "orterun:abnormal-exit", - true, orterun_basename, orterun_basename); - /* Remove the TERM and INT signal handlers */ opal_signal_del(&term_handler);
opal_signal_del(&int_handler); @@ -720,62 +721,71 @@ typedef enum { static void abort_signal_callback(int fd, short flags, void *arg) { int ret; - struct timeval tv = { 1, 0 }; opal_event_t* event; opal_list_t attrs; opal_list_item_t *item; - static abort_signal_state_t state; + static abort_signal_state_t state=ABORT_SIGNAL_FIRST; static struct timeval invoked, now; double a, b; - + OPAL_TRACE(1); - + /* If this whole process has already completed, then bail */ switch (state) { - case ABORT_SIGNAL_FIRST: - /* This is the first time through */ - state = ABORT_SIGNAL_PROCESSING; - break; - - case ABORT_SIGNAL_WARNED: - gettimeofday(&now, NULL); - a = invoked.tv_sec * 1000000 + invoked.tv_usec; - b = now.tv_sec * 1000000 + invoked.tv_usec; - if (b - a <= 1000000) { - /* We are in an event handler; exit_callback() will delete - the handler that is currently running (which is a Bad - Thing), so we can't call it directly. Instead, we have - to exit this handler and setup to call exit_handler() - after this. 
*/ - if (NULL != (event = (opal_event_t*) - malloc(sizeof(opal_event_t)))) { - opal_evtimer_set(event, exit_callback, NULL); - now.tv_sec = 0; - now.tv_usec = 0; - opal_evtimer_add(event, &now); - state = ABORT_SIGNAL_DONE; - } - return; - } - /* Otherwise fall through to PROCESSING and warn again */ - - case ABORT_SIGNAL_PROCESSING: - opal_show_help("help-orterun.txt", "orterun:sigint-while-processing", - true, orterun_basename, orterun_basename, - orterun_basename); - gettimeofday(&invoked, NULL); - state = ABORT_SIGNAL_WARNED; - return; - - case ABORT_SIGNAL_DONE: - /* Nothing to do -- return */ - return; + case ABORT_SIGNAL_FIRST: + /* This is the first time through */ + state = ABORT_SIGNAL_PROCESSING; + break; + + case ABORT_SIGNAL_WARNED: + gettimeofday(&now, NULL); + a = invoked.tv_sec * 1000000 + invoked.tv_usec; + b = now.tv_sec * 1000000 + invoked.tv_usec; + if (b - a <= 1000000) { + /* tell the pls to cancel the terminate request - + * obviously, something is wrong at this point + */ + if (ORTE_SUCCESS != (ret = orte_pls.cancel_operation())) { + ORTE_ERROR_LOG(ret); + } + /* give the user the warning about manual cleanup */ + opal_show_help("help-orterun.txt", "orterun:abnormal-exit", + true, orterun_basename, orterun_basename); + + /* We are in an event handler; exit_callback() will delete + the handler that is currently running (which is a Bad + Thing), so we can't call it directly. Instead, we have + to exit this handler and setup to call exit_handler() + after this. 
*/ + if (NULL != (event = (opal_event_t*) + malloc(sizeof(opal_event_t)))) { + opal_evtimer_set(event, exit_callback, NULL); + now.tv_sec = 0; + now.tv_usec = 0; + opal_evtimer_add(event, &now); + state = ABORT_SIGNAL_DONE; + } + return; + } + /* Otherwise fall through to PROCESSING and warn again */ + + case ABORT_SIGNAL_PROCESSING: + opal_show_help("help-orterun.txt", "orterun:sigint-while-processing", + true, orterun_basename, orterun_basename, + orterun_basename); + gettimeofday(&invoked, NULL); + state = ABORT_SIGNAL_WARNED; + return; + + case ABORT_SIGNAL_DONE: + /* Nothing to do -- return */ + return; } if (!orterun_globals.quiet){ fprintf(stderr, "%s: killing job...\n\n", orterun_basename); } - + /* terminate the job - this will also wakeup orterun so * it can kill all the orteds. Be sure to kill all the job's * descendants, if any, so nothing is left hanging @@ -783,24 +793,20 @@ static void abort_signal_callback(int fd, short flags, void *arg) if (jobid != ORTE_JOBID_INVALID) { OBJ_CONSTRUCT(&attrs, opal_list_t); orte_rmgr.add_attribute(&attrs, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE); - ret = orte_pls.terminate_job(jobid, &attrs); + ret = orte_pls.terminate_job(jobid, &orte_abort_timeout, &attrs); while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item); OBJ_DESTRUCT(&attrs); if (ORTE_SUCCESS != ret) { + opal_show_help("help-orterun.txt", "orterun:forced-end-failed", + true, orterun_basename, ORTE_ERROR_NAME(ret)); jobid = ORTE_JOBID_INVALID; } } - /* setup a delay to give the orteds time to complete their departure */ - if (NULL != (event = (opal_event_t*)malloc(sizeof(opal_event_t)))) { - opal_evtimer_set(event, exit_callback, NULL); - opal_evtimer_add(event, &tv); - } - + state = ABORT_SIGNAL_DONE; state = ABORT_SIGNAL_DONE; } - /** * Pass user signals to the remote application processes */ @@ -965,7 +971,7 @@ static int parse_globals(int argc, char* argv[]) orterun_globals.by_slot = true; } - /* If we 
don't want to wait, we don't want to wait */ + /* If we don't want to wait, we don't want to wait */ if (orterun_globals.no_wait_for_job_completion) { wait_for_job_completion = false;