diff --git a/orte/mca/errmgr/hnp/errmgr_hnp.c b/orte/mca/errmgr/hnp/errmgr_hnp.c index ab422a05f2..9d5535f919 100644 --- a/orte/mca/errmgr/hnp/errmgr_hnp.c +++ b/orte/mca/errmgr/hnp/errmgr_hnp.c @@ -28,6 +28,7 @@ #include "opal/util/output.h" #include "orte/runtime/runtime.h" +#include "orte/runtime/params.h" #include "orte/mca/ns/ns_types.h" #include "orte/mca/gpr/gpr.h" #include "orte/mca/pls/pls.h" @@ -85,9 +86,8 @@ int orte_errmgr_hnp_proc_aborted(orte_gpr_notify_message_t *msg) /* tell the pls to terminate the job AND ALL ITS DESCENDANTS */ OBJ_CONSTRUCT(&attrs, opal_list_t); orte_rmgr.add_attribute(&attrs, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE); - if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job, &attrs))) { + if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job, &orte_abort_timeout, &attrs))) { ORTE_ERROR_LOG(rc); - return rc; } while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item); OBJ_DESTRUCT(&attrs); @@ -152,7 +152,7 @@ int orte_errmgr_hnp_incomplete_start(orte_gpr_notify_message_t *msg) /* tell the pls to terminate the job - just kill this job, not any descendants since * the job is just trying to start */ - if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job, NULL))) { + if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job, &orte_abort_timeout, NULL))) { ORTE_ERROR_LOG(rc); } diff --git a/orte/mca/pls/base/pls_base_orted_cmds.c b/orte/mca/pls/base/pls_base_orted_cmds.c index 0c86cee175..35221c7101 100644 --- a/orte/mca/pls/base/pls_base_orted_cmds.c +++ b/orte/mca/pls/base/pls_base_orted_cmds.c @@ -20,6 +20,11 @@ #include "orte_config.h" #include "orte/orte_constants.h" +#ifdef HAVE_SYS_TIME_H +#include +#endif + +#include "opal/event/event.h" #include "opal/threads/condition.h" #include "opal/util/output.h" #include "opal/util/argv.h" @@ -36,6 +41,25 @@ #include "orte/mca/pls/base/pls_private.h" static orte_std_cntr_t orted_cmd_num_active; +static int completion_status; + +static void orte_pls_base_orted_default_wakeup(int fd, short event, void *arg) +{ + /* protect for threads */ + OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock); + + /* cancel the receive - we didn't get everyone's response in time */ + orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED_ACK); + + /* set the completion status to reflect timeout error */ + completion_status = ORTE_ERR_TIMEOUT; + + /* declare us "done" so we can exit cleanly */ + opal_condition_signal(&orte_pls_base.orted_cmd_cond); + + /* unlock us */ + OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock); +} static void orte_pls_base_orted_send_cb(int status, orte_process_name_t* peer, @@ -74,13 +98,34 @@ static void orte_pls_base_cmd_ack(int status, orte_process_name_t* sender, } -int orte_pls_base_orted_exit(opal_list_t *daemons) +int orte_pls_base_orted_cancel_operation(void) +{ + /* protect for threads */ + OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock); + + /* cancel any waiting receive - we don't want to hear it */ + orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED_ACK); + + /* set the completion status to reflect cancellation */ + completion_status = ORTE_ERR_INTERUPTED; + + /* declare us "done" so we can exit cleanly */ + opal_condition_signal(&orte_pls_base.orted_cmd_cond); + + /* unlock us */ + OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock); + + return ORTE_SUCCESS; +} + +int orte_pls_base_orted_exit(opal_list_t *daemons, struct timeval *timeout) { int rc; orte_buffer_t cmd; orte_daemon_cmd_flag_t command=ORTE_DAEMON_EXIT_CMD; opal_list_item_t *item; orte_pls_daemon_info_t *dmn; + opal_event_t* event = NULL; OPAL_TRACE(1); @@ -89,7 +134,8 @@ int orte_pls_base_orted_exit(opal_list_t *daemons) /* pack the command */ if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &command, 1, ORTE_DAEMON_CMD))) { ORTE_ERROR_LOG(rc); - goto CLEANUP; + OBJ_DESTRUCT(&cmd); + return rc; } /* send the commands as fast as we can */ @@ -106,7 +152,8 @@ int orte_pls_base_orted_exit(opal_list_t *daemons) } orted_cmd_num_active++; } - + OBJ_DESTRUCT(&cmd); + /* post the receive for the ack's */ rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED_ACK, ORTE_RML_NON_PERSISTENT, orte_pls_base_cmd_ack, NULL); @@ -115,29 +162,51 @@ int orte_pls_base_orted_exit(opal_list_t *daemons) return rc; } + /* define the default completion status */ + completion_status = ORTE_SUCCESS; + /* wait for all commands to have been ack'd */ OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock); if (orted_cmd_num_active > 0) { + /* setup a delay to give the orteds time to complete their departure - wake us up if they + * don't exit by the prescribed time + */ + if (NULL != timeout && /* only do this if the user gave us a time to wait */ + NULL != (event = (opal_event_t*)malloc(sizeof(opal_event_t)))) { + opal_evtimer_set(event, orte_pls_base_orted_default_wakeup, NULL); + opal_evtimer_add(event, timeout); + } + + /* now go to sleep until woken up */ opal_condition_wait(&orte_pls_base.orted_cmd_cond, &orte_pls_base.orted_cmd_lock); } OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock); -CLEANUP: - OBJ_DESTRUCT(&cmd); + /* log an error if one occurred */ + if (ORTE_SUCCESS != completion_status) { + ORTE_ERROR_LOG(completion_status); + } + + /* if started, kill the timer event so it doesn't hit us later */ + if (NULL != event) { + opal_evtimer_del(event); + free(event); + } /* we're done! */ - return ORTE_SUCCESS; + return completion_status; } -int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job) +int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job, struct timeval *timeout) { int rc; orte_buffer_t cmd; orte_daemon_cmd_flag_t command=ORTE_DAEMON_KILL_LOCAL_PROCS; opal_list_item_t *item; orte_pls_daemon_info_t *dmn; - + opal_event_t* event = NULL; + OPAL_TRACE(1); OBJ_CONSTRUCT(&cmd, orte_buffer_t); @@ -145,13 +214,15 @@ int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job) /* pack the command */ if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &command, 1, ORTE_DAEMON_CMD))) { ORTE_ERROR_LOG(rc); - goto CLEANUP; + OBJ_DESTRUCT(&cmd); + return rc; } /* pack the jobid */ if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &job, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); - goto CLEANUP; + OBJ_DESTRUCT(&cmd); + return rc; } /* send the commands as fast as we can */ @@ -169,6 +240,7 @@ int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job) orted_cmd_num_active++; } + OBJ_DESTRUCT(&cmd); /* post the receive for the ack's */ rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED_ACK, @@ -178,19 +250,38 @@ int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job) return rc; } + /* define the default completion status */ + completion_status = ORTE_SUCCESS; + /* wait for all commands to have been received */ OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock); if (orted_cmd_num_active > 0) { + /* setup a delay to give the orteds time to complete their departure - wake us up if they + * don't exit by the prescribed time + */ + if (NULL != timeout && /* only do this if the user gave us a time to wait */ + NULL != (event = (opal_event_t*)malloc(sizeof(opal_event_t)))) { + opal_evtimer_set(event, orte_pls_base_orted_default_wakeup, NULL); + opal_evtimer_add(event, timeout); + } + /* now go to sleep until woken up */ opal_condition_wait(&orte_pls_base.orted_cmd_cond, &orte_pls_base.orted_cmd_lock); } OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock); - -CLEANUP: - OBJ_DESTRUCT(&cmd); + /* log an error if one occurred */ + if (ORTE_SUCCESS != completion_status) { + ORTE_ERROR_LOG(completion_status); + } + + /* if started, kill the timer event so it doesn't hit us later */ + if (NULL != event) { + opal_evtimer_del(event); + free(event); + } /* we're done! */ - return ORTE_SUCCESS; + return completion_status; } diff --git a/orte/mca/pls/base/pls_base_receive.c b/orte/mca/pls/base/pls_base_receive.c index 810233564b..78a1acf6cb 100644 --- a/orte/mca/pls/base/pls_base_receive.c +++ b/orte/mca/pls/base/pls_base_receive.c @@ -99,6 +99,8 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender, int32_t signal; opal_list_t attrs; opal_list_item_t *item; + struct timeval timeout; + int32_t secs, microsecs; int rc; count = 1; @@ -127,12 +129,14 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender, break; case ORTE_PLS_TERMINATE_JOB_CMD: + /* get the jobid to be terminated */ count = 1; if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &job, &count, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); goto SEND_ANSWER; } + /* get any attributes */ OBJ_CONSTRUCT(&attrs, opal_list_t); count = 1; if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &attrs, &count, ORTE_ATTR_LIST))) { @@ -140,22 +144,39 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender, goto SEND_ANSWER; } + /* get the timeout - packed as two separate int32's */ + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &secs, &count, ORTE_INT32))) { + ORTE_ERROR_LOG(rc); + goto SEND_ANSWER; + } + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, µsecs, &count, ORTE_INT32))) { + ORTE_ERROR_LOG(rc); + goto SEND_ANSWER; + } + timeout.tv_sec = secs; + timeout.tv_usec = microsecs; - if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job, &attrs))) { + /* issue the command */ + if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job, &timeout, &attrs))) { ORTE_ERROR_LOG(rc); } + /* cleanup attribute list */ while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item); OBJ_DESTRUCT(&attrs); break; case ORTE_PLS_TERMINATE_ORTEDS_CMD: + /* get the jobid whose daemons are to be terminated */ count = 1; if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &job, &count, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); goto SEND_ANSWER; } + /* get any attributes */ OBJ_CONSTRUCT(&attrs, opal_list_t); count = 1; if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &attrs, &count, ORTE_ATTR_LIST))) { @@ -163,10 +184,26 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender, goto SEND_ANSWER; } - if (ORTE_SUCCESS != (rc = orte_pls.terminate_orteds(job, &attrs))) { + /* get the timeout - packed as two separate int32's */ + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &secs, &count, ORTE_INT32))) { + ORTE_ERROR_LOG(rc); + goto SEND_ANSWER; + } + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, µsecs, &count, ORTE_INT32))) { + ORTE_ERROR_LOG(rc); + goto SEND_ANSWER; + } + timeout.tv_sec = secs; + timeout.tv_usec = microsecs; + + /* issue the command */ + if (ORTE_SUCCESS != (rc = orte_pls.terminate_orteds(job, &timeout, &attrs))) { ORTE_ERROR_LOG(rc); } + /* cleanup attribute list */ while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item); OBJ_DESTRUCT(&attrs); break; @@ -229,6 +266,13 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender, } break; + case ORTE_PLS_CANCEL_OPERATION_CMD: + /* issue the command */ + if (ORTE_SUCCESS != (rc = orte_pls.cancel_operation())) { + ORTE_ERROR_LOG(rc); + } + break; + default: ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS); } diff --git a/orte/mca/pls/base/pls_private.h b/orte/mca/pls/base/pls_private.h index 9362692090..d1f89d7f62 100644 --- a/orte/mca/pls/base/pls_private.h +++ b/orte/mca/pls/base/pls_private.h @@ -26,6 +26,10 @@ */ #include "orte_config.h" +#ifdef HAVE_SYS_TIME_H +#include +#endif + #include "opal/class/opal_list.h" #include "orte/dss/dss_types.h" @@ -41,18 +45,19 @@ extern "C" { #endif - /* - * pls proxy commands - */ - typedef uint8_t orte_pls_cmd_flag_t; - #define ORTE_PLS_CMD ORTE_UINT8 - #define ORTE_PLS_LAUNCH_JOB_CMD 1 - #define ORTE_PLS_TERMINATE_JOB_CMD 2 - #define ORTE_PLS_TERMINATE_PROC_CMD 3 - #define ORTE_PLS_SIGNAL_JOB_CMD 4 - #define ORTE_PLS_SIGNAL_PROC_CMD 5 - #define ORTE_PLS_TERMINATE_ORTEDS_CMD 6 - +/* + * pls proxy commands + */ +typedef uint8_t orte_pls_cmd_flag_t; +#define ORTE_PLS_CMD ORTE_UINT8 +#define ORTE_PLS_LAUNCH_JOB_CMD 1 +#define ORTE_PLS_TERMINATE_JOB_CMD 2 +#define ORTE_PLS_TERMINATE_PROC_CMD 3 +#define ORTE_PLS_SIGNAL_JOB_CMD 4 +#define ORTE_PLS_SIGNAL_PROC_CMD 5 +#define ORTE_PLS_TERMINATE_ORTEDS_CMD 6 +#define ORTE_PLS_CANCEL_OPERATION_CMD 7 + /* * object for daemon information */ @@ -75,9 +80,10 @@ extern "C" { /** * Utilities for pls components that use proxy daemons */ - ORTE_DECLSPEC int orte_pls_base_orted_exit(opal_list_t *daemons); - ORTE_DECLSPEC int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job); - ORTE_DECLSPEC int orte_pls_base_orted_signal_local_procs(opal_list_t *daemons, int32_t signal); + int orte_pls_base_orted_cancel_operation(void); + int orte_pls_base_orted_exit(opal_list_t *daemons, struct timeval *timeout); + int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job, struct timeval *timeout); + int orte_pls_base_orted_signal_local_procs(opal_list_t *daemons, int32_t signal); int orte_pls_base_orted_add_local_procs(opal_list_t *dmnlist, orte_gpr_notify_data_t *ndat); ORTE_DECLSPEC int orte_pls_base_get_active_daemons(opal_list_t *daemons, orte_jobid_t job, opal_list_t *attrs); diff --git a/orte/mca/pls/bproc/pls_bproc.c b/orte/mca/pls/bproc/pls_bproc.c index 48e2a098e9..8e06175229 100644 --- a/orte/mca/pls/bproc/pls_bproc.c +++ b/orte/mca/pls/bproc/pls_bproc.c @@ -40,6 +40,9 @@ #ifdef HAVE_STRING_H #include #endif /* HAVE_STRING_H */ +#ifdef HAVE_SYS_TIME_H +#include +#endif #include "opal/install_dirs.h" #include "opal/class/opal_list.h" @@ -69,6 +72,7 @@ #include "orte/mca/smr/smr.h" #include "orte/runtime/orte_wait.h" #include "orte/runtime/runtime.h" +#include "orte/runtime/params.h" #include "orte/mca/pls/base/pls_private.h" #include "pls_bproc.h" @@ -513,6 +517,14 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) { /* setup the daemon environment */ orte_pls_bproc_setup_env(envp); + /* direct the daemons to drop contact files so the local procs + * can learn how to contact them - this is used for routing + * OOB messaging + */ + var = mca_base_param_environ_variable("odls","base","drop_contact_file"); + opal_setenv(var,"1", true, envp); + free(var); + /* daemons calculate their process name using a "stride" of one, so * push that value into their environment */ stride = 1; @@ -704,7 +716,7 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) { } rc = ORTE_ERROR; ORTE_ERROR_LOG(rc); - orte_pls_bproc_terminate_job(map->job, NULL); + orte_pls_bproc_terminate_job(map->job, &orte_abort_timeout, NULL); goto cleanup; } } @@ -767,10 +779,10 @@ orte_pls_bproc_node_failed(orte_gpr_notify_message_t *msg) orte_schema.extract_jobid_from_std_trigger_name(&job, msg->target); /* terminate all jobs in the in the job family */ - orte_pls_bproc_terminate_job(job, NULL); + orte_pls_bproc_terminate_job(job, &orte_abort_timeout, NULL); /* kill the daemons */ - orte_pls_bproc_terminate_job(0, NULL); + orte_pls_bproc_terminate_job(0, &orte_abort_timeout, NULL); /* shouldn't ever get here.. */ exit(1); @@ -1159,7 +1171,7 @@ cleanup: /** * Terminate all processes associated with this job */ -int orte_pls_bproc_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) { +int orte_pls_bproc_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { pid_t* pids; orte_std_cntr_t i, num_pids; int rc; @@ -1189,7 +1201,7 @@ int orte_pls_bproc_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) { /** * Terminate the orteds for a given job */ -int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) +int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -1205,7 +1217,7 @@ int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) } /* now tell them to die! */ - if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons, timeout))) { ORTE_ERROR_LOG(rc); } @@ -1295,6 +1307,23 @@ int orte_pls_bproc_signal_proc(const orte_process_name_t* proc_name, int32_t sig return ORTE_SUCCESS; } +/** + * Cancel an operation involving comm to an orted + */ +int orte_pls_bproc_cancel_operation(void) +{ + int rc; + + OPAL_TRACE(1); + + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) { + ORTE_ERROR_LOG(rc); + } + + return rc; +} + + /** * Module cleanup */ diff --git a/orte/mca/pls/bproc/pls_bproc.h b/orte/mca/pls/bproc/pls_bproc.h index 3207912f85..86c170b929 100644 --- a/orte/mca/pls/bproc/pls_bproc.h +++ b/orte/mca/pls/bproc/pls_bproc.h @@ -42,6 +42,9 @@ #include "orte/orte_constants.h" #include +#ifdef HAVE_SYS_TIME_H +#include +#endif #include "opal/threads/condition.h" @@ -72,11 +75,12 @@ int orte_pls_bproc_finalize(void); * Interface */ int orte_pls_bproc_launch(orte_jobid_t); -int orte_pls_bproc_terminate_job(orte_jobid_t, opal_list_t*); +int orte_pls_bproc_terminate_job(orte_jobid_t, struct timeval *timeout, opal_list_t*); int orte_pls_bproc_terminate_proc(const orte_process_name_t* proc_name); -int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid, opal_list_t*); +int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t*); int orte_pls_bproc_signal_job(orte_jobid_t, int32_t, opal_list_t*); int orte_pls_bproc_signal_proc(const orte_process_name_t* proc_name, int32_t); +int orte_pls_bproc_cancel_operation(void); /* Utility routine to get/set process pid */ ORTE_DECLSPEC int orte_pls_bproc_set_proc_pid(const orte_process_name_t*, pid_t, int); diff --git a/orte/mca/pls/cnos/pls_cnos.c b/orte/mca/pls/cnos/pls_cnos.c index e53129acdf..c4e053bd00 100644 --- a/orte/mca/pls/cnos/pls_cnos.c +++ b/orte/mca/pls/cnos/pls_cnos.c @@ -26,6 +26,10 @@ #ifdef HAVE_SIGNAL_H #include #endif +#ifdef HAVE_SYS_TIME_H +#include +#endif + #ifdef HAVE_CNOS_PM_BARRIER #include #endif @@ -38,12 +42,13 @@ static int orte_pls_cnos_launch_job(orte_jobid_t jobid); -static int orte_pls_cnos_terminate_job(orte_jobid_t jobid, opal_list_t *attrs); -static int orte_pls_cnos_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs); +static int orte_pls_cnos_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs); +static int orte_pls_cnos_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs); static int orte_pls_cnos_terminate_proc(const orte_process_name_t* proc_name); static int orte_pls_cnos_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs); static int orte_pls_cnos_signal_proc(const orte_process_name_t* proc_name, int32_t signal); static int orte_pls_cnos_finalize(void); +static int orte_pls_cnos_cancel_operation(void); orte_pls_base_module_t orte_pls_cnos_module = { @@ -53,6 +58,7 @@ orte_pls_base_module_t orte_pls_cnos_module = { orte_pls_cnos_terminate_proc, orte_pls_cnos_signal_job, orte_pls_cnos_signal_proc, + orte_pls_cnos_cancel_operation, orte_pls_cnos_finalize }; @@ -68,7 +74,7 @@ static int orte_pls_cnos_launch_job(orte_jobid_t jobid) extern int killrank(rank_t RANK, int SIG); #endif -static int orte_pls_cnos_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) +static int orte_pls_cnos_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { orte_jobid_t my_jobid = ORTE_PROC_MY_NAME->jobid; @@ -85,7 +91,7 @@ static int orte_pls_cnos_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) } -static int orte_pls_cnos_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) +static int orte_pls_cnos_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { orte_jobid_t my_jobid = ORTE_PROC_MY_NAME->jobid; @@ -131,6 +137,11 @@ static int orte_pls_cnos_signal_proc(const orte_process_name_t* proc_name, int32 return ORTE_ERR_NOT_SUPPORTED; } +int orte_pls_rsh_cancel_operation(void) +{ + return ORTE_ERR_NOT_SUPPORTED; +} + static int orte_pls_cnos_finalize(void) { return ORTE_SUCCESS; diff --git a/orte/mca/pls/gridengine/pls_gridengine.h b/orte/mca/pls/gridengine/pls_gridengine.h index e141261809..34397ec5a3 100644 --- a/orte/mca/pls/gridengine/pls_gridengine.h +++ b/orte/mca/pls/gridengine/pls_gridengine.h @@ -90,6 +90,11 @@ #define ORTE_PLS_GRIDENGINE_EXPORT_H #include "orte_config.h" + +#ifdef HAVE_SYS_TIME_H +#include +#endif + #include "orte/mca/pls/pls.h" #include "opal/mca/mca.h" @@ -113,11 +118,12 @@ int orte_pls_gridengine_finalize(void); * Interface */ int orte_pls_gridengine_launch_job(orte_jobid_t); -int orte_pls_gridengine_terminate_job(orte_jobid_t, opal_list_t *attrs); -int orte_pls_gridengine_terminate_orteds(orte_jobid_t, opal_list_t *attrs); +int orte_pls_gridengine_terminate_job(orte_jobid_t, struct timeval *timeout, opal_list_t *attrs); +int orte_pls_gridengine_terminate_orteds(orte_jobid_t, struct timeval *timeout, opal_list_t *attrs); int orte_pls_gridengine_terminate_proc(const orte_process_name_t*); int orte_pls_gridengine_signal_job(orte_jobid_t, int32_t, opal_list_t *attrs); int orte_pls_gridengine_signal_proc(const orte_process_name_t*, int32_t); +int orte_pls_gridengine_cancel_operation(void); /** * PLS Component diff --git a/orte/mca/pls/gridengine/pls_gridengine_module.c b/orte/mca/pls/gridengine/pls_gridengine_module.c index bf9d569bd9..1c9006b70c 100644 --- a/orte/mca/pls/gridengine/pls_gridengine_module.c +++ b/orte/mca/pls/gridengine/pls_gridengine_module.c @@ -100,6 +100,7 @@ orte_pls_base_module_t orte_pls_gridengine_module = { orte_pls_gridengine_terminate_proc, orte_pls_gridengine_signal_job, orte_pls_gridengine_signal_proc, + orte_pls_gridengine_cancel_operation, orte_pls_gridengine_finalize }; @@ -774,7 +775,7 @@ static int update_slot_keyval(orte_ras_node_t* ras_node, int* slot_cnt) /** * Query the registry for all nodes participating in the job */ -int orte_pls_gridengine_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) +int orte_pls_gridengine_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -788,7 +789,7 @@ int orte_pls_gridengine_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) } /* order them to kill their local procs for this job */ - if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid, timeout))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } @@ -809,7 +810,7 @@ int orte_pls_gridengine_terminate_proc(const orte_process_name_t* proc) /** * Terminate the orteds for a given job */ -int orte_pls_gridengine_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) +int orte_pls_gridengine_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -823,7 +824,7 @@ int orte_pls_gridengine_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) } /* now tell them to die! */ - if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons, timeout))) { ORTE_ERROR_LOG(rc); } @@ -872,6 +873,21 @@ int orte_pls_gridengine_signal_proc(const orte_process_name_t* proc, int32_t sig return ORTE_ERR_NOT_IMPLEMENTED; } +/** + * Cancel an operation involving comm to an orted + */ +int orte_pls_gridengine_cancel_operation(void) +{ + int rc; + + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) { + ORTE_ERROR_LOG(rc); + } + + return rc; +} + + /** * Finalize */ diff --git a/orte/mca/pls/pls.h b/orte/mca/pls/pls.h index 2968ad7f49..7f5bf77d7f 100644 --- a/orte/mca/pls/pls.h +++ b/orte/mca/pls/pls.h @@ -202,12 +202,12 @@ typedef int (*orte_pls_base_module_launch_job_fn_t)(orte_jobid_t); * Terminate any processes launched for the respective jobid by * this component. */ -typedef int (*orte_pls_base_module_terminate_job_fn_t)(orte_jobid_t, opal_list_t *attrs); +typedef int (*orte_pls_base_module_terminate_job_fn_t)(orte_jobid_t, struct timeval *timeout, opal_list_t *attrs); /** * Terminate the daemons associated with this jobid */ -typedef int (*orte_pls_base_module_terminate_orteds_fn_t)(orte_jobid_t, opal_list_t *attrs); +typedef int (*orte_pls_base_module_terminate_orteds_fn_t)(orte_jobid_t, struct timeval *timeout, opal_list_t *attrs); /** * Terminate a specific process. @@ -225,6 +225,11 @@ typedef int (*orte_pls_base_module_signal_job_fn_t)(orte_jobid_t, int32_t, opal_ */ typedef int (*orte_pls_base_module_signal_proc_fn_t)(const orte_process_name_t*, int32_t); +/** + * Cancel an ongoing operation involving communication to the orteds + */ +typedef int (*orte_pls_base_module_cancel_operation_fn_t)(void); + /** * Cleanup all resources held by the module */ @@ -240,6 +245,7 @@ struct orte_pls_base_module_1_3_0_t { orte_pls_base_module_terminate_proc_fn_t terminate_proc; orte_pls_base_module_signal_job_fn_t signal_job; orte_pls_base_module_signal_proc_fn_t signal_proc; + orte_pls_base_module_cancel_operation_fn_t cancel_operation; orte_pls_base_module_finalize_fn_t finalize; }; diff --git a/orte/mca/pls/poe/pls_poe_module.c b/orte/mca/pls/poe/pls_poe_module.c index 590fba993e..be015e8071 100644 --- a/orte/mca/pls/poe/pls_poe_module.c +++ b/orte/mca/pls/poe/pls_poe_module.c @@ -30,6 +30,9 @@ #ifdef HAVE_UNISTD_H #include #endif +#ifdef HAVE_SYS_TIME_H +#include +#endif #include "opal/mca/base/mca_base_param.h" #include "opal/util/argv.h" @@ -60,12 +63,13 @@ extern char **environ; * Local functions */ static int pls_poe_launch_job(orte_jobid_t jobid); -static int pls_poe_terminate_job(orte_jobid_t jobid, opal_list_t *attrs); -static int pls_poe_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs); +static int pls_poe_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs); +static int pls_poe_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs); static int pls_poe_terminate_proc(const orte_process_name_t *name); static int pls_poe_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs); static int pls_poe_signal_proc(const orte_process_name_t *name, int32_t signal); static int pls_poe_finalize(void); +static int pls_poe_cancel_operation(void); orte_pls_base_module_t orte_pls_poe_module = { pls_poe_launch_job, @@ -74,6 +78,7 @@ orte_pls_base_module_t orte_pls_poe_module = { pls_poe_terminate_proc, pls_poe_signal_job, pls_poe_signal_proc, + pls_poe_cancel_operation, pls_poe_finalize }; @@ -602,7 +607,7 @@ static int pls_poe_launch_job(orte_jobid_t jobid) return ORTE_ERR_NOT_IMPLEMENTED; } -static int pls_poe_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) +static int pls_poe_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { return ORTE_ERR_NOT_IMPLEMENTED; } @@ -613,7 +618,7 @@ static int pls_poe_terminate_proc(const orte_process_name_t *name) return ORTE_ERR_NOT_IMPLEMENTED; } -static int pls_poe_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) +static int pls_poe_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { return ORTE_ERR_NOT_IMPLEMENTED; } diff --git a/orte/mca/pls/process/pls_process.h b/orte/mca/pls/process/pls_process.h index 5fa64968f3..4124fd5a68 100644 --- a/orte/mca/pls/process/pls_process.h +++ b/orte/mca/pls/process/pls_process.h @@ -25,6 +25,10 @@ #include "orte_config.h" +#ifdef HAVE_SYS_TIME_H +#include +#endif + #include "opal/threads/condition.h" #include "opal/mca/mca.h" #include "orte/mca/pls/pls.h" @@ -49,11 +53,12 @@ int orte_pls_process_finalize(void); * Interface */ int orte_pls_process_launch(orte_jobid_t); -int orte_pls_process_terminate_job(orte_jobid_t, opal_list_t*); -int orte_pls_process_terminate_orteds(orte_jobid_t, opal_list_t*); +int orte_pls_process_terminate_job(orte_jobid_t, struct timeval *timeout, opal_list_t*); +int orte_pls_process_terminate_orteds(orte_jobid_t, struct timeval *timeout, opal_list_t*); int orte_pls_process_terminate_proc(const orte_process_name_t* proc_name); int orte_pls_process_signal_job(orte_jobid_t, int32_t, opal_list_t*); int orte_pls_process_signal_proc(const orte_process_name_t* proc_name, int32_t); +int orte_pls_process_cancel_operation(void); /** * PLS Component diff --git a/orte/mca/pls/process/pls_process_module.c b/orte/mca/pls/process/pls_process_module.c index ea942b2f58..586d7c4825 100644 --- a/orte/mca/pls/process/pls_process_module.c +++ b/orte/mca/pls/process/pls_process_module.c @@ -115,6 +115,7 @@ orte_pls_base_module_t orte_pls_process_module = { orte_pls_process_terminate_proc, orte_pls_process_signal_job, orte_pls_process_signal_proc, + orte_pls_process_cancel_operation, orte_pls_process_finalize }; @@ -1029,7 +1030,7 @@ cleanup: /** * Terminate all processes for a given job */ -int orte_pls_process_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) +int orte_pls_process_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -1045,7 +1046,7 @@ int orte_pls_process_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) } /* order them to kill their local procs for this job */ - if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid, timeout))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } @@ -1061,7 +1062,7 @@ CLEANUP: /** * Terminate the orteds for a given job */ -int orte_pls_process_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) +int orte_pls_process_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -1077,7 +1078,7 @@ int orte_pls_process_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) } /* now tell them to die! */ - if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons, timeout))) { ORTE_ERROR_LOG(rc); } @@ -1134,6 +1135,23 @@ int orte_pls_process_signal_proc(const orte_process_name_t* proc, int32_t signal return ORTE_ERR_NOT_IMPLEMENTED; } +/** + * Cancel an operation involving comm to an orted + */ +int orte_pls_process_cancel_operation(void) +{ + int rc; + + OPAL_TRACE(1); + + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) { + ORTE_ERROR_LOG(rc); + } + + return rc; +} + + int orte_pls_process_finalize(void) { int rc; diff --git a/orte/mca/pls/proxy/pls_proxy.c b/orte/mca/pls/proxy/pls_proxy.c index bdc6603fef..3ccd733408 100644 --- a/orte/mca/pls/proxy/pls_proxy.c +++ b/orte/mca/pls/proxy/pls_proxy.c @@ -26,6 +26,10 @@ #include "orte/orte_constants.h" #include "orte/orte_types.h" +#ifdef HAVE_SYS_TIME_H +#include +#endif + #include "opal/util/output.h" #include "opal/util/trace.h" @@ -110,13 +114,14 @@ int orte_pls_proxy_launch(orte_jobid_t job) return ORTE_SUCCESS; } -int orte_pls_proxy_terminate_job(orte_jobid_t job, opal_list_t *attrs) +int orte_pls_proxy_terminate_job(orte_jobid_t job, struct timeval *timeout, opal_list_t *attrs) { orte_buffer_t* cmd; orte_buffer_t* answer; orte_pls_cmd_flag_t command, ret_cmd; orte_std_cntr_t count; int rc; + int32_t timefield; OPAL_TRACE(1); @@ -146,6 +151,20 @@ int orte_pls_proxy_terminate_job(orte_jobid_t job, opal_list_t *attrs) return rc; } + timefield = timeout->tv_sec; + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &timefield, 1, ORTE_INT32))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + + timefield = timeout->tv_usec; + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &timefield, 1, ORTE_INT32))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + if (0 > orte_rml.send_buffer(orte_pls_proxy_replica, cmd, ORTE_RML_TAG_PLS, 0)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); OBJ_RELEASE(cmd); @@ -182,13 +201,14 @@ int orte_pls_proxy_terminate_job(orte_jobid_t job, opal_list_t *attrs) return ORTE_SUCCESS; } -int orte_pls_proxy_terminate_orteds(orte_jobid_t job, opal_list_t *attrs) +int orte_pls_proxy_terminate_orteds(orte_jobid_t job, struct timeval *timeout, opal_list_t *attrs) { orte_buffer_t* cmd; orte_buffer_t* answer; orte_pls_cmd_flag_t command, ret_cmd; orte_std_cntr_t count; int rc; + int32_t timefield; OPAL_TRACE(1); @@ -218,6 +238,20 @@ int orte_pls_proxy_terminate_orteds(orte_jobid_t job, opal_list_t *attrs) return rc; } + timefield = timeout->tv_sec; + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &timefield, 1, ORTE_INT32))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + + timefield = timeout->tv_usec; + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &timefield, 1, ORTE_INT32))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + if (0 > orte_rml.send_buffer(orte_pls_proxy_replica, cmd, ORTE_RML_TAG_PLS, 0)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); OBJ_RELEASE(cmd); @@ -471,5 +505,63 @@ int orte_pls_proxy_signal_proc(const orte_process_name_t* name, int32_t signal) return ORTE_SUCCESS; } - +int orte_pls_proxy_cancel_operation(void) +{ + orte_buffer_t* cmd; + orte_buffer_t* answer; + orte_pls_cmd_flag_t command, ret_cmd; + orte_std_cntr_t count; + int rc; + + OPAL_TRACE(1); + + command = ORTE_PLS_CANCEL_OPERATION_CMD; + + cmd = OBJ_NEW(orte_buffer_t); + if (cmd == NULL) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &command, 1, ORTE_PLS_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + + if (0 > orte_rml.send_buffer(orte_pls_proxy_replica, cmd, ORTE_RML_TAG_PLS, 0)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(cmd); + return ORTE_ERR_COMM_FAILURE; + } + OBJ_RELEASE(cmd); + + answer = OBJ_NEW(orte_buffer_t); + if(answer == NULL) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + if (0 > orte_rml.recv_buffer(orte_pls_proxy_replica, answer, ORTE_RML_TAG_PLS)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(answer); + return ORTE_ERR_COMM_FAILURE; + } + + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &ret_cmd, &count, ORTE_PLS_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(answer); + return rc; + } + + if (ret_cmd != command) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(answer); + return ORTE_ERR_COMM_FAILURE; + } + + OBJ_RELEASE(answer); + return ORTE_SUCCESS; +} diff --git a/orte/mca/pls/proxy/pls_proxy.h b/orte/mca/pls/proxy/pls_proxy.h index 10db57fa4f..7445620327 100644 --- a/orte/mca/pls/proxy/pls_proxy.h +++ b/orte/mca/pls/proxy/pls_proxy.h @@ -22,6 +22,10 @@ #include "orte_config.h" +#ifdef HAVE_SYS_TIME_H +#include +#endif + #include "orte/mca/pls/pls.h" #if defined(c_plusplus) || defined(__cplusplus) @@ -48,11 +52,12 @@ int orte_pls_proxy_finalize(void); * proxy function prototypes */ int orte_pls_proxy_launch(orte_jobid_t job); -int orte_pls_proxy_terminate_job(orte_jobid_t job, opal_list_t *attrs); -int orte_pls_proxy_terminate_orteds(orte_jobid_t job, opal_list_t *attrs); +int orte_pls_proxy_terminate_job(orte_jobid_t job, struct timeval *timeout, opal_list_t *attrs); +int orte_pls_proxy_terminate_orteds(orte_jobid_t job, struct timeval *timeout, opal_list_t *attrs); int orte_pls_proxy_terminate_proc(const orte_process_name_t* name); int orte_pls_proxy_signal_job(orte_jobid_t job, int32_t signal, opal_list_t *attrs); int orte_pls_proxy_signal_proc(const orte_process_name_t* name, int32_t signal); +int orte_pls_proxy_cancel_operation(void); #if defined(c_plusplus) || defined(__cplusplus) diff --git a/orte/mca/pls/proxy/pls_proxy_component.c b/orte/mca/pls/proxy/pls_proxy_component.c index 583423432a..f32633ee5c 100644 --- a/orte/mca/pls/proxy/pls_proxy_component.c +++ b/orte/mca/pls/proxy/pls_proxy_component.c @@ -65,6 +65,7 @@ static orte_pls_base_module_t orte_pls_proxy_module = { orte_pls_proxy_terminate_proc, orte_pls_proxy_signal_job, orte_pls_proxy_signal_proc, + orte_pls_proxy_cancel_operation, orte_pls_proxy_finalize }; diff --git a/orte/mca/pls/rsh/pls_rsh.h b/orte/mca/pls/rsh/pls_rsh.h index 554161d9e2..ece91faad5 100644 --- a/orte/mca/pls/rsh/pls_rsh.h +++ b/orte/mca/pls/rsh/pls_rsh.h @@ -25,8 +25,13 @@ #include "orte_config.h" +#ifdef HAVE_SYS_TIME_H +#include +#endif + #include "opal/threads/condition.h" #include "opal/mca/mca.h" + #include "orte/mca/pls/pls.h" #if defined(c_plusplus) || defined(__cplusplus) @@ -49,11 +54,12 @@ int orte_pls_rsh_finalize(void); * Interface */ int orte_pls_rsh_launch(orte_jobid_t); -int orte_pls_rsh_terminate_job(orte_jobid_t, opal_list_t*); -int orte_pls_rsh_terminate_orteds(orte_jobid_t, opal_list_t*); +int orte_pls_rsh_terminate_job(orte_jobid_t, struct timeval *timeout, opal_list_t*); +int orte_pls_rsh_terminate_orteds(orte_jobid_t, struct timeval *timeout, opal_list_t*); int orte_pls_rsh_terminate_proc(const orte_process_name_t* proc_name); int orte_pls_rsh_signal_job(orte_jobid_t, int32_t, opal_list_t*); int orte_pls_rsh_signal_proc(const orte_process_name_t* proc_name, int32_t); +int orte_pls_rsh_cancel_operation(void); /** * PLS Component diff --git a/orte/mca/pls/rsh/pls_rsh_module.c b/orte/mca/pls/rsh/pls_rsh_module.c index 5e3518ecef..1c779e4ca1 100644 --- a/orte/mca/pls/rsh/pls_rsh_module.c +++ b/orte/mca/pls/rsh/pls_rsh_module.c @@ -107,6 +107,7 @@ orte_pls_base_module_t orte_pls_rsh_module = { orte_pls_rsh_terminate_proc, orte_pls_rsh_signal_job, orte_pls_rsh_signal_proc, + orte_pls_rsh_cancel_operation, orte_pls_rsh_finalize }; @@ -943,6 +944,9 @@ int orte_pls_rsh_launch(orte_jobid_t jobid) argv[local_exec_index_end] = NULL; } + /* tell the daemon to setup its own process session/group */ + opal_argv_append(&argc, &argv, "--set-sid"); + /* Finally, chdir($HOME) because we're making the assumption that this is what will happen on remote nodes (via rsh/ssh). This allows a user @@ -1128,7 +1132,7 @@ cleanup: /** * Terminate all processes for a given job */ -int orte_pls_rsh_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) +int orte_pls_rsh_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -1144,7 +1148,7 @@ int orte_pls_rsh_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) } /* order them to kill their local procs for this job */ - if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid, timeout))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } @@ -1160,7 +1164,7 @@ CLEANUP: /** * Terminate the orteds for a given job */ -int orte_pls_rsh_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) +int orte_pls_rsh_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -1176,7 +1180,7 @@ int orte_pls_rsh_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) } /* now tell them to die! */ - if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons, timeout))) { ORTE_ERROR_LOG(rc); } @@ -1233,6 +1237,23 @@ int orte_pls_rsh_signal_proc(const orte_process_name_t* proc, int32_t signal) return ORTE_ERR_NOT_IMPLEMENTED; } +/** + * Cancel an operation involving comm to an orted + */ +int orte_pls_rsh_cancel_operation(void) +{ + int rc; + + OPAL_TRACE(1); + + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) { + ORTE_ERROR_LOG(rc); + } + + return rc; +} + + int orte_pls_rsh_finalize(void) { int rc; diff --git a/orte/mca/pls/slurm/pls_slurm_module.c b/orte/mca/pls/slurm/pls_slurm_module.c index 16997ac5ba..01e66aa1c5 100644 --- a/orte/mca/pls/slurm/pls_slurm_module.c +++ b/orte/mca/pls/slurm/pls_slurm_module.c @@ -74,12 +74,13 @@ * Local functions */ static int pls_slurm_launch_job(orte_jobid_t jobid); -static int pls_slurm_terminate_job(orte_jobid_t jobid, opal_list_t *attrs); -static int pls_slurm_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs); +static int pls_slurm_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs); +static int pls_slurm_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs); static int pls_slurm_terminate_proc(const orte_process_name_t *name); static int pls_slurm_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs); static int pls_slurm_signal_proc(const orte_process_name_t *name, int32_t signal); static int pls_slurm_finalize(void); +static int pls_slurm_cancel_operation(void); static int pls_slurm_start_proc(int argc, char **argv, char **env, char *prefix); @@ -95,6 +96,7 @@ orte_pls_base_module_1_3_0_t orte_pls_slurm_module = { pls_slurm_terminate_proc, pls_slurm_signal_job, pls_slurm_signal_proc, + pls_slurm_cancel_operation, pls_slurm_finalize }; @@ -443,7 +445,7 @@ cleanup: } -static int pls_slurm_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) +static int pls_slurm_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -457,7 +459,7 @@ static int pls_slurm_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) } /* order them to kill their local procs for this job */ - if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid, timeout))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } @@ -474,7 +476,7 @@ CLEANUP: /** * Terminate the orteds for a given job */ -static int pls_slurm_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) +static int pls_slurm_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -488,7 +490,7 @@ static int pls_slurm_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) } /* order them to go away */ - if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons, timeout))) { ORTE_ERROR_LOG(rc); } @@ -534,6 +536,21 @@ static int pls_slurm_signal_proc(const orte_process_name_t *name, int32_t signal } +/** + * Cancel an operation involving comm to an orted + */ +int pls_slurm_cancel_operation(void) +{ + int rc; + + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) { + ORTE_ERROR_LOG(rc); + } + + return rc; +} + + static int pls_slurm_finalize(void) { int rc; diff --git a/orte/mca/pls/tm/pls_tm_module.c b/orte/mca/pls/tm/pls_tm_module.c index cd2ae4b849..d9973f3731 100644 --- a/orte/mca/pls/tm/pls_tm_module.c +++ b/orte/mca/pls/tm/pls_tm_module.c @@ -79,11 +79,12 @@ * Local functions */ static int pls_tm_launch_job(orte_jobid_t jobid); -static int pls_tm_terminate_job(orte_jobid_t jobid, opal_list_t *attrs); -static int pls_tm_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs); +static int pls_tm_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs); +static int pls_tm_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs); static int pls_tm_terminate_proc(const orte_process_name_t *name); static int pls_tm_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs); static int pls_tm_signal_proc(const orte_process_name_t *name, int32_t signal); +static int pls_tm_cancel_operation(void); static int pls_tm_finalize(void); static int pls_tm_connect(void); @@ -559,7 +560,7 @@ static int pls_tm_launch_job(orte_jobid_t jobid) } -static int pls_tm_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) +static int pls_tm_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -573,7 +574,7 @@ static int pls_tm_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) } /* order them to kill their local procs for this job */ - if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid, timeout))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } @@ -590,7 +591,7 @@ CLEANUP: /** * Terminate the orteds for a given job */ -int pls_tm_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) +int pls_tm_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -604,7 +605,7 @@ int pls_tm_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) } /* now tell them to die! */ - if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons, timeout))) { ORTE_ERROR_LOG(rc); } @@ -661,6 +662,21 @@ static int pls_tm_signal_proc(const orte_process_name_t *name, int32_t signal) } +/** + * Cancel an operation involving comm to an orted + */ +static int pls_tm_cancel_operation(void) +{ + int rc; + + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) { + ORTE_ERROR_LOG(rc); + } + + return rc; +} + + /* * Free stuff */ diff --git a/orte/mca/pls/xcpu/pls_xcpu.c b/orte/mca/pls/xcpu/pls_xcpu.c index 1ba57b2046..bdda5cd7b9 100644 --- a/orte/mca/pls/xcpu/pls_xcpu.c +++ b/orte/mca/pls/xcpu/pls_xcpu.c @@ -40,6 +40,9 @@ #ifdef HAVE_STRING_H #include #endif /* HAVE_STRING_H */ +#ifdef HAVE_SYS_TIME_H +#include +#endif #include "opal/event/event.h" #include "opal/mca/base/mca_base_param.h" @@ -86,6 +89,7 @@ orte_pls_base_module_t orte_pls_xcpu_module = { orte_pls_xcpu_terminate_proc, orte_pls_xcpu_signal_job, orte_pls_xcpu_signal_proc, + orte_pls_xcpu_cancel_operation, orte_pls_xcpu_finalize }; @@ -357,7 +361,7 @@ error: return rc; } -int orte_pls_xcpu_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) +int orte_pls_xcpu_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { int i, rc; orte_job_map_t *map; @@ -378,7 +382,7 @@ int orte_pls_xcpu_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) return ORTE_SUCCESS; } -int orte_pls_xcpu_terminate_orteds(orte_jobid_t jobid, opal_list_t * attrs) +int orte_pls_xcpu_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t * attrs) { return ORTE_SUCCESS; } @@ -424,6 +428,14 @@ int orte_pls_xcpu_signal_proc(const orte_process_name_t* proc_name, int32_t sig) return ORTE_SUCCESS; } +/** + * Cancel an operation involving comm to an orted + */ +int orte_pls_xcpu_cancel_operation(void) +{ + return ORTE_SUCCESS; +} + int orte_pls_xcpu_finalize(void) { return ORTE_SUCCESS; diff --git a/orte/mca/pls/xcpu/pls_xcpu.h b/orte/mca/pls/xcpu/pls_xcpu.h index e61968d0b7..11b03b439e 100644 --- a/orte/mca/pls/xcpu/pls_xcpu.h +++ b/orte/mca/pls/xcpu/pls_xcpu.h @@ -32,6 +32,11 @@ #define orte_pls_xcpu_H_ #include "orte_config.h" + +#ifdef HAVE_SYS_TIME_H +#include +#endif + #include "orte/class/orte_pointer_array.h" #include "orte/orte_constants.h" #include "orte/mca/pls/base/base.h" @@ -57,11 +62,12 @@ orte_pls_base_module_t* orte_pls_xcpu_init(int *priority); /* in component file * Interface */ int orte_pls_xcpu_launch_job(orte_jobid_t); -int orte_pls_xcpu_terminate_job(orte_jobid_t, opal_list_t *); - int orte_pls_xcpu_terminate_orteds(orte_jobid_t jobid, opal_list_t * attrs); +int orte_pls_xcpu_terminate_job(orte_jobid_t, struct timeval *timeout, opal_list_t *); +int orte_pls_xcpu_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t * attrs); int orte_pls_xcpu_terminate_proc(const orte_process_name_t* proc_name); int orte_pls_xcpu_signal_job(orte_jobid_t jobid, int32_t sig, opal_list_t*); int orte_pls_xcpu_signal_proc(const orte_process_name_t* proc_name, int32_t sig); +int orte_pls_xcpu_cancel_operation(void); int orte_pls_xcpu_finalize(void); void orte_pls_xcpu_close_sessions(void); diff --git a/orte/mca/pls/xgrid/src/pls_xgrid_module.m b/orte/mca/pls/xgrid/src/pls_xgrid_module.m index 50bfdf48c5..de407845d2 100644 --- a/orte/mca/pls/xgrid/src/pls_xgrid_module.m +++ b/orte/mca/pls/xgrid/src/pls_xgrid_module.m @@ -27,6 +27,9 @@ #import #import #import +#ifdef HAVE_SYS_TIME_H +#include +#endif #import "orte/orte_constants.h" #import "opal/util/argv.h" @@ -45,14 +48,14 @@ #import "pls_xgrid.h" int orte_pls_xgrid_launch(orte_jobid_t jobid); -int orte_pls_xgrid_terminate_job(orte_jobid_t jobid, opal_list_t *attrs); -int orte_pls_xgrid_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs); +int orte_pls_xgrid_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs); +int orte_pls_xgrid_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs); int orte_pls_xgrid_terminate_proc(const orte_process_name_t* proc); int orte_pls_xgrid_signal_job(orte_jobid_t job, int32_t signal, opal_list_t *attrs); int orte_pls_xgrid_signal_proc(const orte_process_name_t* proc_name, int32_t signal); +int orte_pls_xgrid_cancel_operation(void); int orte_pls_xgrid_finalize(void); - orte_pls_base_module_1_3_0_t orte_pls_xgrid_module = { orte_pls_xgrid_launch, orte_pls_xgrid_terminate_job, @@ -60,6 +63,7 @@ orte_pls_base_module_1_3_0_t orte_pls_xgrid_module = { orte_pls_xgrid_terminate_proc, orte_pls_xgrid_signal_job, orte_pls_xgrid_signal_proc, + orte_pls_xgrid_cancel_operation, orte_pls_xgrid_finalize }; @@ -81,7 +85,7 @@ orte_pls_xgrid_launch(orte_jobid_t jobid) * Terminate all processes for a given job */ int -orte_pls_xgrid_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) +orte_pls_xgrid_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -95,7 +99,7 @@ orte_pls_xgrid_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) } /* order them to kill their local procs for this job */ - if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid, timeout))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } @@ -118,7 +122,7 @@ CLEANUP: * Terminate the orteds for a given job */ int -orte_pls_xgrid_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) +orte_pls_xgrid_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -132,7 +136,7 @@ orte_pls_xgrid_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) } /* now tell them to die! */ - if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons, timeout))) { ORTE_ERROR_LOG(rc); } @@ -195,6 +199,19 @@ orte_pls_xgrid_signal_proc(const orte_process_name_t* proc, int32_t signal) } +int +orte_pls_xgrid_cancel_operation(void) +{ + int rc; + + if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) { + ORTE_ERROR_LOG(rc); + } + + return rc; +} + + int orte_pls_xgrid_finalize(void) { diff --git a/orte/runtime/orte_init.c b/orte/runtime/orte_init.c index 4b33b1a89f..4b13226288 100644 --- a/orte/runtime/orte_init.c +++ b/orte/runtime/orte_init.c @@ -33,9 +33,6 @@ * @retval ORTE_ERROR Upon failure. */ -/* globals used by RTE */ -int orte_debug_flag=(int)false; - int orte_init(bool infrastructure) { int rc; diff --git a/orte/runtime/orte_init_stage1.c b/orte/runtime/orte_init_stage1.c index b71956fd63..b958516351 100644 --- a/orte/runtime/orte_init_stage1.c +++ b/orte/runtime/orte_init_stage1.c @@ -133,18 +133,6 @@ int orte_init_stage1(bool infrastructure) /***** ERROR LOGGING NOW AVAILABLE *****/ - /* check for debug flag */ - if (0 > (ret = mca_base_param_register_int("orte", "debug", NULL, NULL, 0))) { - ORTE_ERROR_LOG(ret); - error = "mca_base_param_register_int"; - goto error; - } - if (ORTE_SUCCESS != (ret = mca_base_param_lookup_int(ret, &orte_debug_flag))) { - ORTE_ERROR_LOG(ret); - error = "mca_base_param_lookup_int"; - goto error; - } - /* * Initialize the event library */ diff --git a/orte/runtime/orte_params.c b/orte/runtime/orte_params.c index 3dc0e56ec2..e2673fdeaa 100644 --- a/orte/runtime/orte_params.c +++ b/orte/runtime/orte_params.c @@ -20,14 +20,27 @@ #include "orte/orte_constants.h" +#ifdef HAVE_SYS_TIME_H +#include +#endif + #include "opal/mca/base/mca_base_param.h" + #include "orte/runtime/runtime.h" +#include "orte/runtime/params.h" + +/* globals used by RTE */ +int orte_debug_flag; +struct timeval orte_abort_timeout; int orte_register_params(bool infrastructure) { + int value; + mca_base_param_reg_int_name("orte", "debug", "Top-level ORTE debug switch", - false, false, (int)false, NULL); + false, false, (int)false, &value); + orte_debug_flag = OPAL_INT_TO_BOOL(value); mca_base_param_reg_int_name("orte_debug", "daemons_file", "Whether want stdout/stderr of daemons to go to a file or not", @@ -51,8 +64,14 @@ int orte_register_params(bool infrastructure) "Sequence of user-level debuggers to search for in orterun", false, false, "totalview @mpirun@ -a @mpirun_args@ : fxp @mpirun@ -a @mpirun_args@", NULL); + + mca_base_param_reg_int_name("orte", "abort_timeout", + "Time to wait [in seconds] before giving up on aborting an ORTE operation", + false, false, 10, &value); + orte_abort_timeout.tv_sec = value; + orte_abort_timeout.tv_usec = 0; + /* All done */ - return ORTE_SUCCESS; } diff --git a/orte/runtime/params.h b/orte/runtime/params.h new file mode 100644 index 0000000000..610e06452f --- /dev/null +++ b/orte/runtime/params.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * + * Global params for OpenRTE + */ +#ifndef ORTE_RUNTIME_PARAM_H +#define ORTE_RUNTIME_PARAM_H + +#include "orte_config.h" + +#ifdef HAVE_SYS_TIME_H +#include +#endif + + +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif + +/* globals used by RTE - instanced in orte_params.c */ + +ORTE_DECLSPEC extern int orte_debug_flag; + +ORTE_DECLSPEC extern struct timeval orte_abort_timeout; + +#if defined(c_plusplus) || defined(__cplusplus) +} +#endif + +#endif /* ORTE_RUNTIME_PARAM_H */ diff --git a/orte/tools/orted/orted.c b/orte/tools/orted/orted.c index 233bc6768a..f430fa3797 100644 --- a/orte/tools/orted/orted.c +++ b/orte/tools/orted/orted.c @@ -69,6 +69,7 @@ #include "orte/mca/pls/pls.h" #include "orte/runtime/runtime.h" +#include "orte/runtime/params.h" #include "orte/tools/orted/orted.h" @@ -126,6 +127,10 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = { &orted_globals.bootproxy, OPAL_CMD_LINE_TYPE_INT, "Run as boot proxy for " }, + { NULL, NULL, NULL, '\0', NULL, "set-sid", 0, + &orted_globals.set_sid, OPAL_CMD_LINE_TYPE_BOOL, + "Direct the orted to separate from the current session"}, + { NULL, NULL, NULL, '\0', NULL, "name", 1, &orted_globals.name, OPAL_CMD_LINE_TYPE_STRING, "Set the orte process name"}, @@ -206,6 +211,9 @@ int main(int argc, char *argv[]) /* save the environment for use when launching application processes */ orted_globals.saved_environ = opal_argv_copy(environ); + /* setup mca param system */ + mca_base_param_init(); + /* setup to check common command line options that just report and die */ cmd_line = OBJ_NEW(opal_cmd_line_t); opal_cmd_line_create(cmd_line, orte_cmd_line_opts); @@ -229,6 +237,11 @@ int main(int argc, char *argv[]) return 1; } + /* see if we were directed to separate from current session */ + if (orted_globals.set_sid) { + setsid(); + } + /* see if they want us to spin until they can connect a debugger to us */ i=0; while (orted_globals.spin) { @@ -741,7 +754,7 @@ static void halt_vm(void) /* terminate the vm - this will also wake us up so we can exit */ OBJ_CONSTRUCT(&attrs, opal_list_t); orte_rmgr.add_attribute(&attrs, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE); - ret = orte_pls.terminate_orteds(0, &attrs); + ret = orte_pls.terminate_orteds(0, &orte_abort_timeout, &attrs); while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item); OBJ_DESTRUCT(&attrs); diff --git a/orte/tools/orted/orted.h b/orte/tools/orted/orted.h index c2c558cc08..f93411a0ed 100644 --- a/orte/tools/orted/orted.h +++ b/orte/tools/orted/orted.h @@ -35,6 +35,7 @@ typedef struct { bool debug; bool debug_daemons; bool debug_daemons_file; + bool set_sid; char* ns_nds; char* name; char* vpid_start; diff --git a/orte/tools/orteprobe/orteprobe.c b/orte/tools/orteprobe/orteprobe.c index 920af1bf92..8214b5a138 100644 --- a/orte/tools/orteprobe/orteprobe.c +++ b/orte/tools/orteprobe/orteprobe.c @@ -68,10 +68,12 @@ #include "orte/mca/schema/base/base.h" #include "orte/runtime/runtime.h" +#include "orte/runtime/params.h" #include "orte/runtime/orte_wait.h" #include "orte/tools/orteprobe/orteprobe.h" +#if 0 orteprobe_globals_t orteprobe_globals; /* @@ -127,6 +129,7 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = { { NULL, NULL, NULL, '\0', NULL, NULL, 0, NULL, OPAL_CMD_LINE_TYPE_NULL, NULL } }; +#endif #if !defined(__WINDOWS__) extern char **environ; @@ -134,6 +137,7 @@ extern char **environ; int main(int argc, char *argv[]) { +#if 0 int ret = 0, ortedargc; opal_cmd_line_t *cmd_line = NULL; char *contact_path = NULL, *orted=NULL; @@ -151,6 +155,10 @@ int main(int argc, char *argv[]) return ret; } + /* Setup MCA params */ + mca_base_param_init(); + orte_register_params(false); + /* setup to check common command line options that just report and die */ memset(&orteprobe_globals, 0, sizeof(orteprobe_globals)); cmd_line = OBJ_NEW(opal_cmd_line_t); @@ -464,4 +472,6 @@ int main(int argc, char *argv[]) fprintf(stderr, "orteprobe: system appears to not support remote probes\n"); exit(1); #endif +#endif + exit(1); } diff --git a/orte/tools/orterun/help-orterun.txt b/orte/tools/orterun/help-orterun.txt index 94ebfbd37a..9dde4d516d 100644 --- a/orte/tools/orterun/help-orterun.txt +++ b/orte/tools/orterun/help-orterun.txt @@ -96,6 +96,7 @@ WARNING: %s encountered an abnormal exit. This means that %s exited before it received notification that all started processes had terminated. You should double check and ensure that there are no runaway processes still executing. +# [orterun:sigint-while-processing] WARNING: %s is in the process of killing a job, but has detected an interruption (probably control-C). @@ -103,6 +104,13 @@ interruption (probably control-C). It is dangerous to interrupt %s while it is killing a job (proper termination may not be guaranteed). Hit control-C again within 1 second if you really want to kill %s immediately. +# +[orterun:forced-end-failed] +WARNING: %s was ordered to kill a job (probably with control-C), but +was unable to successfully complete that order (returned error %s). +You should double check and ensure that there are no runaway processes +still executing. +# [orterun:empty-prefix] A prefix was supplied to %s that only contained slashes. @@ -136,5 +144,5 @@ Things to check: - Ensure that any required licenses are available to run the debugger # [orterun:daemon-die] -%s was unable to cleanly terminate the daemons for this job. Returned value %d instead of ORTE_SUCCESS. +%s was unable to cleanly terminate the daemons for this job. Returned value %s instead of ORTE_SUCCESS. diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index dd81c06bcf..9d8d55e625 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -39,6 +39,9 @@ #ifdef HAVE_LIBGEN_H #include #endif +#ifdef HAVE_SYS_TIME_H +#include +#endif #include "opal/event/event.h" #include "opal/install_dirs.h" @@ -71,6 +74,7 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/runtime/runtime.h" +#include "orte/runtime/params.h" #include "orte/runtime/orte_wait.h" #include "orterun.h" @@ -268,7 +272,7 @@ opal_cmd_line_init_t cmd_line_init[] = { &orterun_globals.do_not_launch, OPAL_CMD_LINE_TYPE_BOOL, "Perform all necessary operations to prepare to launch the application, but do not actually launch it" }, - { NULL, NULL, NULL, '\0', "reuse-daemons", "reuse-daemons", 0, + { "pls", "base", "reuse_daemons", '\0', "reuse-daemons", "reuse-daemons", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, "If set, reuse daemons to launch dynamically spawned processes"}, @@ -470,9 +474,9 @@ int orterun(int argc, char *argv[]) */ OBJ_CONSTRUCT(&attributes, opal_list_t); orte_rmgr.add_attribute(&attributes, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE); - if (ORTE_SUCCESS != (ret = orte_pls.terminate_orteds(jobid, &attributes))) { - opal_show_help("help-orterun.txt", "orterun:daemon-die", false, - orterun_basename, NULL, NULL, ret); + if (ORTE_SUCCESS != (ret = orte_pls.terminate_orteds(jobid, &orte_abort_timeout, &attributes))) { + opal_show_help("help-orterun.txt", "orterun:daemon-die", true, + orterun_basename, ORTE_ERROR_NAME(ret)); } while (NULL != (item = opal_list_remove_first(&attributes))) OBJ_RELEASE(item); OBJ_DESTRUCT(&attributes); @@ -686,9 +690,6 @@ static void exit_callback(int fd, short event, void *arg) { OPAL_TRACE(1); - opal_show_help("help-orterun.txt", "orterun:abnormal-exit", - true, orterun_basename, orterun_basename); - /* Remove the TERM and INT signal handlers */ opal_signal_del(&term_handler); opal_signal_del(&int_handler); @@ -720,62 +721,71 @@ typedef enum { static void abort_signal_callback(int fd, short flags, void *arg) { int ret; - struct timeval tv = { 1, 0 }; opal_event_t* event; opal_list_t attrs; opal_list_item_t *item; - static abort_signal_state_t state; + static abort_signal_state_t state=ABORT_SIGNAL_FIRST; static struct timeval invoked, now; double a, b; - + OPAL_TRACE(1); - + /* If this whole process has already completed, then bail */ switch (state) { - case ABORT_SIGNAL_FIRST: - /* This is the first time through */ - state = ABORT_SIGNAL_PROCESSING; - break; - - case ABORT_SIGNAL_WARNED: - gettimeofday(&now, NULL); - a = invoked.tv_sec * 1000000 + invoked.tv_usec; - b = now.tv_sec * 1000000 + invoked.tv_usec; - if (b - a <= 1000000) { - /* We are in an event handler; exit_callback() will delete - the handler that is currently running (which is a Bad - Thing), so we can't call it directly. Instead, we have - to exit this handler and setup to call exit_handler() - after this. */ - if (NULL != (event = (opal_event_t*) - malloc(sizeof(opal_event_t)))) { - opal_evtimer_set(event, exit_callback, NULL); - now.tv_sec = 0; - now.tv_usec = 0; - opal_evtimer_add(event, &now); - state = ABORT_SIGNAL_DONE; - } - return; - } - /* Otherwise fall through to PROCESSING and warn again */ - - case ABORT_SIGNAL_PROCESSING: - opal_show_help("help-orterun.txt", "orterun:sigint-while-processing", - true, orterun_basename, orterun_basename, - orterun_basename); - gettimeofday(&invoked, NULL); - state = ABORT_SIGNAL_WARNED; - return; - - case ABORT_SIGNAL_DONE: - /* Nothing to do -- return */ - return; + case ABORT_SIGNAL_FIRST: + /* This is the first time through */ + state = ABORT_SIGNAL_PROCESSING; + break; + + case ABORT_SIGNAL_WARNED: + gettimeofday(&now, NULL); + a = invoked.tv_sec * 1000000 + invoked.tv_usec; + b = now.tv_sec * 1000000 + invoked.tv_usec; + if (b - a <= 1000000) { + /* tell the pls to cancel the terminate request - + * obviously, something is wrong at this point + */ + if (ORTE_SUCCESS != (ret = orte_pls.cancel_operation())) { + ORTE_ERROR_LOG(ret); + } + /* give the user the warning about manual cleanup */ + opal_show_help("help-orterun.txt", "orterun:abnormal-exit", + true, orterun_basename, orterun_basename); + + /* We are in an event handler; exit_callback() will delete + the handler that is currently running (which is a Bad + Thing), so we can't call it directly. Instead, we have + to exit this handler and setup to call exit_handler() + after this. */ + if (NULL != (event = (opal_event_t*) + malloc(sizeof(opal_event_t)))) { + opal_evtimer_set(event, exit_callback, NULL); + now.tv_sec = 0; + now.tv_usec = 0; + opal_evtimer_add(event, &now); + state = ABORT_SIGNAL_DONE; + } + return; + } + /* Otherwise fall through to PROCESSING and warn again */ + + case ABORT_SIGNAL_PROCESSING: + opal_show_help("help-orterun.txt", "orterun:sigint-while-processing", + true, orterun_basename, orterun_basename, + orterun_basename); + gettimeofday(&invoked, NULL); + state = ABORT_SIGNAL_WARNED; + return; + + case ABORT_SIGNAL_DONE: + /* Nothing to do -- return */ + return; } if (!orterun_globals.quiet){ fprintf(stderr, "%s: killing job...\n\n", orterun_basename); } - + /* terminate the job - this will also wakeup orterun so * it can kill all the orteds. Be sure to kill all the job's * descendants, if any, so nothing is left hanging @@ -783,24 +793,20 @@ static void abort_signal_callback(int fd, short flags, void *arg) if (jobid != ORTE_JOBID_INVALID) { OBJ_CONSTRUCT(&attrs, opal_list_t); orte_rmgr.add_attribute(&attrs, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE); - ret = orte_pls.terminate_job(jobid, &attrs); + ret = orte_pls.terminate_job(jobid, &orte_abort_timeout, &attrs); while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item); OBJ_DESTRUCT(&attrs); if (ORTE_SUCCESS != ret) { + opal_show_help("help-orterun.txt", "orterun:forced-end-failed", + true, orterun_basename, ORTE_ERROR_NAME(ret)); jobid = ORTE_JOBID_INVALID; } } - /* setup a delay to give the orteds time to complete their departure */ - if (NULL != (event = (opal_event_t*)malloc(sizeof(opal_event_t)))) { - opal_evtimer_set(event, exit_callback, NULL); - opal_evtimer_add(event, &tv); - } - + state = ABORT_SIGNAL_DONE; state = ABORT_SIGNAL_DONE; } - /** * Pass user signals to the remote application processes */ @@ -965,7 +971,7 @@ static int parse_globals(int argc, char* argv[]) orterun_globals.by_slot = true; } - /* If we don't want to wait, we don't want to wait */ + /* If we don't want to wait, we don't want to wait */ if (orterun_globals.no_wait_for_job_completion) { wait_for_job_completion = false;