diff --git a/orte/mca/odls/odls_types.h b/orte/mca/odls/odls_types.h index 520a7eb413..0a3d719686 100644 --- a/orte/mca/odls/odls_types.h +++ b/orte/mca/odls/odls_types.h @@ -43,6 +43,7 @@ typedef uint8_t orte_daemon_cmd_flag_t; #define ORTE_DAEMON_ADD_LOCAL_PROCS (orte_daemon_cmd_flag_t) 6 #define ORTE_DAEMON_HEARTBEAT_CMD (orte_daemon_cmd_flag_t) 7 #define ORTE_DAEMON_EXIT_CMD (orte_daemon_cmd_flag_t) 8 +#define ORTE_DAEMON_HALT_VM_CMD (orte_daemon_cmd_flag_t) 9 #if defined(c_plusplus) || defined(__cplusplus) diff --git a/orte/tools/orteboot/orteboot.c b/orte/tools/orteboot/orteboot.c index a5f53775ed..5452490af3 100644 --- a/orte/tools/orteboot/orteboot.c +++ b/orte/tools/orteboot/orteboot.c @@ -320,212 +320,10 @@ int main(int argc, char *argv[]) free(tmp); } - /* detach from controlling terminal - * otherwise, remain attached so output can get to the user - */ - if(orteboot_globals.debug == false) { - opal_daemon_init(NULL); - } - - /* Intialize our Open RTE environment */ - /* Set the flag telling orte_init that I am NOT a - * singleton, but am "infrastructure" - prevents setting - * up incorrect infrastructure that only a singleton would - * require - */ - if (ORTE_SUCCESS != (rc = orte_init(true))) { - opal_show_help("help-orteboot.txt", "orteboot:init-failure", true, - "orte_init()", rc); - return rc; - } - - /** setup callbacks for abort signals */ - opal_signal_set(&term_handler, SIGTERM, - abort_signal_callback, &term_handler); - opal_signal_add(&term_handler, NULL); - opal_signal_set(&int_handler, SIGINT, - abort_signal_callback, &int_handler); - opal_signal_add(&int_handler, NULL); - - /* issue the non-blocking receive */ - rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON, ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL); - if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* Prep to start the virtual machine */ - /* construct the list of attributes */ - OBJ_CONSTRUCT(&attributes, opal_list_t); - - orte_rmgr.add_attribute(&attributes, ORTE_RMAPS_PERNODE, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_NO_OVERRIDE); - - /* Create the app - in this case, that's just a no_op to get the daemons launched */ - app = OBJ_NEW(orte_app_context_t); - if (NULL == app) { - opal_show_help("help-orteboot.txt", "orteboot:call-failed", - true, orteboot_basename, "system", "malloc returned NULL", errno); - exit(1); - } - - - /* Spawn the job */ - - rc = orte_rmgr.spawn_job(&app, 1, &jobid, 0, NULL, NULL, 0, &attributes); - if (ORTE_SUCCESS != rc) { - /* JMS show_help */ - opal_output(0, "%s: spawn failed with errno=%d\n", orteboot_basename, rc); - } - OBJ_DESTRUCT(&attributes); + /* just do a fork/exec of orted --seed --persistent and then exit */ - /* setup and enter the event monitor */ - OPAL_THREAD_LOCK(&orteboot_globals.lock); - - while (false == orteboot_globals.exit) { - opal_condition_wait(&orteboot_globals.cond, &orteboot_globals.lock); - } - - OPAL_THREAD_UNLOCK(&orteboot_globals.lock); - - orte_finalize(); - free(orteboot_basename); + free(orteboot_basename); return rc; } -static void exit_callback(int fd, short event, void *arg) -{ - /* Remove the TERM and INT signal handlers */ - opal_signal_del(&term_handler); - opal_signal_del(&int_handler); - - /* Trigger the normal exit conditions */ - orteboot_globals.exit = true; - opal_condition_signal(&orteboot_globals.cond); -} - -static void abort_signal_callback(int fd, short flags, void *arg) -{ - int ret; - struct timeval tv = { 1, 0 }; - opal_event_t* event; - opal_list_t attrs; - opal_list_item_t *item; - - static int signalled = 0; - - OPAL_TRACE(1); - - if (0 != signalled++) { - return; - } - - fprintf(stderr, "%s: killing job...\n\n", orteboot_basename); - - /* terminate the vm - this will also wake us up so we can exit */ - OBJ_CONSTRUCT(&attrs, opal_list_t); - orte_rmgr.add_attribute(&attrs, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE); - ret = orte_pls.terminate_orteds(0, &attrs); - while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item); - OBJ_DESTRUCT(&attrs); - - /* setup a delay to give the orteds time to complete their departure */ - if (NULL != (event = (opal_event_t*)malloc(sizeof(opal_event_t)))) { - opal_evtimer_set(event, exit_callback, NULL); - opal_evtimer_add(event, &tv); - } -} - -static void orte_daemon_recv(int status, orte_process_name_t* sender, - orte_buffer_t *buffer, orte_rml_tag_t tag, - void* cbdata) -{ - orte_buffer_t *answer; - orte_daemon_cmd_flag_t command; - int ret; - orte_std_cntr_t n; - char *contact_info; - - OPAL_TRACE(1); - - OPAL_THREAD_LOCK(&orteboot_globals.lock); - - if (orteboot_globals.debug) { - opal_output(0, "orteboot: received message from [%ld,%ld,%ld]", ORTE_NAME_ARGS(sender)); - } - - answer = OBJ_NEW(orte_buffer_t); - if (NULL == answer) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - goto DONE; - } - - n = 1; - if (ORTE_SUCCESS != (ret = orte_dss.unpack(buffer, &command, &n, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(ret); - goto CLEANUP; - } - - /**** EXIT COMMAND ****/ - if (ORTE_DAEMON_EXIT_CMD == command) { - if (orteboot_globals.debug) { - opal_output(0, "orteboot: received exit"); - } - - orteboot_globals.exit = true; - opal_condition_signal(&orteboot_globals.cond); - goto CLEANUP; - - /**** CONTACT QUERY COMMAND ****/ - } else if (ORTE_DAEMON_CONTACT_QUERY_CMD == command) { - /* send back contact info */ - contact_info = orte_rml.get_uri(); - - if (NULL == contact_info) { - ORTE_ERROR_LOG(ORTE_ERROR); - goto CLEANUP; - } - - if (ORTE_SUCCESS != (ret = orte_dss.pack(answer, &contact_info, 1, ORTE_STRING))) { - ORTE_ERROR_LOG(ret); - goto CLEANUP; - } - - if (0 > orte_rml.send_buffer(sender, answer, tag, 0)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - } - - goto CLEANUP; - - /**** HOSTFILE COMMAND ****/ - } else if (ORTE_DAEMON_HOSTFILE_CMD == command) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED); - goto CLEANUP; - - /**** SCRIPTFILE COMMAND ****/ - } else if (ORTE_DAEMON_SCRIPTFILE_CMD == command) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED); - goto CLEANUP; - - /**** HEARTBEAT COMMAND ****/ - } else if (ORTE_DAEMON_HEARTBEAT_CMD == command) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED); - goto CLEANUP; - } - -CLEANUP: - OBJ_RELEASE(answer); - -DONE: - OPAL_THREAD_UNLOCK(&orteboot_globals.lock); - - /* reissue the non-blocking receive */ - ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON, ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL); - if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) { - ORTE_ERROR_LOG(ret); - } - - return; -} - - diff --git a/orte/tools/orted/orted.c b/orte/tools/orted/orted.c index 3ed5c3ab7b..233bc6768a 100644 --- a/orte/tools/orted/orted.c +++ b/orte/tools/orted/orted.c @@ -66,6 +66,7 @@ #include "orte/mca/rmgr/rmgr.h" #include "orte/mca/rmgr/base/base.h" #include "orte/mca/odls/odls.h" +#include "orte/mca/pls/pls.h" #include "orte/runtime/runtime.h" @@ -721,6 +722,36 @@ static void orte_daemon_recv_pls(int status, orte_process_name_t* sender, return; } +static void exit_callback(int fd, short event, void *arg) +{ + /* Trigger the normal exit conditions */ + orted_globals.exit_condition = true; + opal_condition_signal(&orted_globals.condition); + OPAL_THREAD_UNLOCK(&orted_globals.mutex); +} + +static void halt_vm(void) +{ + int ret; + struct timeval tv = { 1, 0 }; + opal_event_t* event; + opal_list_t attrs; + opal_list_item_t *item; + + /* terminate the vm - this will also wake us up so we can exit */ + OBJ_CONSTRUCT(&attrs, opal_list_t); + orte_rmgr.add_attribute(&attrs, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE); + ret = orte_pls.terminate_orteds(0, &attrs); + while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item); + OBJ_DESTRUCT(&attrs); + + /* setup a delay to give the orteds time to complete their departure */ + if (NULL != (event = (opal_event_t*)malloc(sizeof(opal_event_t)))) { + opal_evtimer_set(event, exit_callback, NULL); + opal_evtimer_add(event, &tv); + } +} + static void orte_daemon_recv(int status, orte_process_name_t* sender, orte_buffer_t *buffer, orte_rml_tag_t tag, void* cbdata) @@ -741,72 +772,84 @@ static void orte_daemon_recv(int status, orte_process_name_t* sender, ORTE_NAME_ARGS(sender)); } + n = 1; + if (ORTE_SUCCESS != (ret = orte_dss.unpack(buffer, &command, &n, ORTE_DAEMON_CMD))) { + ORTE_ERROR_LOG(ret); + OPAL_THREAD_UNLOCK(&orted_globals.mutex); + return; + } + answer = OBJ_NEW(orte_buffer_t); if (NULL == answer) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); goto DONE; } - n = 1; - if (ORTE_SUCCESS != (ret = orte_dss.unpack(buffer, &command, &n, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(ret); - goto CLEANUP; - } - - /**** EXIT COMMAND ****/ - if (ORTE_DAEMON_EXIT_CMD == command) { - if (orted_globals.debug_daemons) { - opal_output(0, "[%lu,%lu,%lu] orted_recv: received exit", - ORTE_NAME_ARGS(orte_process_info.my_name)); - } - - orted_globals.exit_condition = true; - opal_condition_signal(&orted_globals.condition); - - goto CLEANUP; - + switch(command) { + /**** EXIT COMMAND ****/ + case ORTE_DAEMON_EXIT_CMD: + if (orted_globals.debug_daemons) { + opal_output(0, "[%lu,%lu,%lu] orted_recv: received exit", + ORTE_NAME_ARGS(orte_process_info.my_name)); + } + + orted_globals.exit_condition = true; + opal_condition_signal(&orted_globals.condition); + break; + + /**** HALT VM COMMAND ****/ + case ORTE_DAEMON_HALT_VM_CMD: + if (orted_globals.debug_daemons) { + opal_output(0, "[%lu,%lu,%lu] orted_recv: received halt vm", + ORTE_NAME_ARGS(orte_process_info.my_name)); + } + halt_vm(); + break; + /**** CONTACT QUERY COMMAND ****/ - } else if (ORTE_DAEMON_CONTACT_QUERY_CMD == command) { - /* send back contact info */ - contact_info = orte_rml.get_uri(); - - if (NULL == contact_info) { - ORTE_ERROR_LOG(ORTE_ERROR); - goto CLEANUP; - } - - if (ORTE_SUCCESS != (ret = orte_dss.pack(answer, &contact_info, 1, ORTE_STRING))) { - ORTE_ERROR_LOG(ret); - goto CLEANUP; - } - - if (0 > orte_rml.send_buffer(sender, answer, tag, 0)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - } - - goto CLEANUP; + case ORTE_DAEMON_CONTACT_QUERY_CMD: + /* send back contact info */ + contact_info = orte_rml.get_uri(); + + if (NULL == contact_info) { + ORTE_ERROR_LOG(ORTE_ERROR); + goto CLEANUP; + } + + if (ORTE_SUCCESS != (ret = orte_dss.pack(answer, &contact_info, 1, ORTE_STRING))) { + ORTE_ERROR_LOG(ret); + goto CLEANUP; + } + + if (0 > orte_rml.send_buffer(sender, answer, tag, 0)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + } + break; /**** HOSTFILE COMMAND ****/ - } else if (ORTE_DAEMON_HOSTFILE_CMD == command) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED); - goto CLEANUP; + case ORTE_DAEMON_HOSTFILE_CMD: + ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED); + break; /**** SCRIPTFILE COMMAND ****/ - } else if (ORTE_DAEMON_SCRIPTFILE_CMD == command) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED); - goto CLEANUP; + case ORTE_DAEMON_SCRIPTFILE_CMD: + ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED); + break; /**** HEARTBEAT COMMAND ****/ - } else if (ORTE_DAEMON_HEARTBEAT_CMD == command) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED); - goto CLEANUP; + case ORTE_DAEMON_HEARTBEAT_CMD: + ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED); + break; + + default: + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); } CLEANUP: - OBJ_RELEASE(answer); + OBJ_RELEASE(answer); DONE: - OPAL_THREAD_UNLOCK(&orted_globals.mutex); + OPAL_THREAD_UNLOCK(&orted_globals.mutex); /* reissue the non-blocking receive */ ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON, ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL); @@ -816,4 +859,3 @@ DONE: return; } - diff --git a/orte/tools/ortehalt/ortehalt.c b/orte/tools/ortehalt/ortehalt.c index 0f17a54005..f7e8e8f0be 100644 --- a/orte/tools/ortehalt/ortehalt.c +++ b/orte/tools/ortehalt/ortehalt.c @@ -44,29 +44,19 @@ #include "opal/event/event.h" #include "opal/install_dirs.h" #include "opal/mca/base/base.h" -#include "opal/threads/condition.h" -#include "opal/util/argv.h" #include "opal/util/basename.h" #include "opal/util/cmd_line.h" #include "opal/util/opal_environ.h" #include "opal/util/output.h" #include "opal/util/show_help.h" -#include "opal/util/trace.h" #include "opal/version.h" +#include "opal/threads/mutex.h" +#include "opal/threads/condition.h" -#include "orte/class/orte_pointer_array.h" -#include "orte/util/proc_info.h" -#include "orte/util/sys_info.h" -#include "orte/util/universe_setup_file_io.h" - -#include "orte/mca/ns/ns.h" -#include "orte/mca/gpr/gpr.h" -#include "orte/mca/pls/pls.h" -#include "orte/mca/rmaps/rmaps_types.h" -#include "orte/mca/rmgr/rmgr.h" -#include "orte/mca/schema/schema.h" -#include "orte/mca/smr/smr.h" +#include "orte/dss/dss.h" +#include "orte/mca/rml/rml.h" #include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/odls/odls_types.h" #include "orte/runtime/runtime.h" #include "orte/runtime/orte_wait.h" @@ -74,7 +64,7 @@ static char *ortehalt_basename = NULL; /* - * setup globals for catching orterun command line options + * setup globals for catching ortehalt command line options */ struct globals_t { bool help; @@ -101,9 +91,6 @@ opal_cmd_line_init_t cmd_line_init[] = { { NULL, NULL, NULL, 'v', NULL, "verbose", 0, &ortehalt_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL, "Be verbose" }, - { NULL, NULL, NULL, 'q', NULL, "quiet", 0, - &ortehalt_globals.quiet, OPAL_CMD_LINE_TYPE_BOOL, - "Suppress helpful messages" }, /* OpenRTE arguments */ { "orte", "debug", NULL, 'd', NULL, "debug-devel", 0, @@ -116,7 +103,7 @@ opal_cmd_line_init_t cmd_line_init[] = { { NULL, NULL, NULL, '\0', NULL, "tmpdir", 1, &orte_process_info.tmpdir_base, OPAL_CMD_LINE_TYPE_STRING, - "Set the root for the session directory tree for orterun ONLY" }, + "Set the root for the session directory tree for ortehalt ONLY" }, /* End of list */ { NULL, NULL, NULL, '\0', NULL, NULL, 0, @@ -129,6 +116,8 @@ extern char** environ; int main(int argc, char *argv[]) { + orte_buffer_t *cmd; + orte_daemon_cmd_flag_t command; int rc; int id, iparam; @@ -165,13 +154,39 @@ int main(int argc, char *argv[]) * require */ if (ORTE_SUCCESS != (rc = orte_init(true))) { - opal_show_help("help-orterun.txt", "orterun:init-failure", true, + opal_show_help("help-ortehalt.txt", "ortehalt:init-failure", true, "orte_init()", rc); return rc; } + cmd = OBJ_NEW(orte_buffer_t); + if (NULL == cmd) { + opal_show_help("help-ortehalt.txt", "ortehalt:init-failure", true, + "orte_init()", rc); + return ORTE_ERROR; + } + + command = ORTE_DAEMON_HALT_VM_CMD; + + rc = orte_dss.pack(cmd, &command, 1, ORTE_DAEMON_CMD); + if ( ORTE_SUCCESS != rc ) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + + rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, cmd, ORTE_RML_TAG_DAEMON, 0); + if ( 0 > rc ) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(cmd); + return ORTE_ERR_COMM_FAILURE; + } + + OBJ_RELEASE(cmd); + orte_finalize(); free(ortehalt_basename); return rc; } +