1
1

Bring ortehalt to a preliminary capability. It will correctly order a persistent daemon to exit cleanly. It now needs to be interfaced to orterun, and a few things need to be cleaned up here and there.

This commit was SVN r12626.
Этот коммит содержится в:
Ralph Castain 2006-11-18 04:47:51 +00:00
родитель ea1e0d34c8
Коммит 33affed09c
4 изменённых файлов: 130 добавлений и 274 удалений

Просмотреть файл

@ -43,6 +43,7 @@ typedef uint8_t orte_daemon_cmd_flag_t;
#define ORTE_DAEMON_ADD_LOCAL_PROCS (orte_daemon_cmd_flag_t) 6
/* Heartbeat: the daemon receive handlers currently only log ORTE_ERR_NOT_IMPLEMENTED for this command. */
#define ORTE_DAEMON_HEARTBEAT_CMD (orte_daemon_cmd_flag_t) 7
/* Exit: the receiver sets its exit flag and signals its condition variable. */
#define ORTE_DAEMON_EXIT_CMD (orte_daemon_cmd_flag_t) 8
/* Halt VM: the receiving orted terminates the other orteds and then schedules its own exit. */
#define ORTE_DAEMON_HALT_VM_CMD (orte_daemon_cmd_flag_t) 9
#if defined(c_plusplus) || defined(__cplusplus)

Просмотреть файл

@ -320,212 +320,10 @@ int main(int argc, char *argv[])
free(tmp);
}
/* detach from controlling terminal
* otherwise, remain attached so output can get to the user
*/
if(orteboot_globals.debug == false) {
opal_daemon_init(NULL);
}
/* Intialize our Open RTE environment */
/* Set the flag telling orte_init that I am NOT a
* singleton, but am "infrastructure" - prevents setting
* up incorrect infrastructure that only a singleton would
* require
*/
if (ORTE_SUCCESS != (rc = orte_init(true))) {
opal_show_help("help-orteboot.txt", "orteboot:init-failure", true,
"orte_init()", rc);
return rc;
}
/** setup callbacks for abort signals */
opal_signal_set(&term_handler, SIGTERM,
abort_signal_callback, &term_handler);
opal_signal_add(&term_handler, NULL);
opal_signal_set(&int_handler, SIGINT,
abort_signal_callback, &int_handler);
opal_signal_add(&int_handler, NULL);
/* issue the non-blocking receive */
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON, ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL);
if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* Prep to start the virtual machine */
/* construct the list of attributes */
OBJ_CONSTRUCT(&attributes, opal_list_t);
orte_rmgr.add_attribute(&attributes, ORTE_RMAPS_PERNODE, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_NO_OVERRIDE);
/* Create the app - in this case, that's just a no_op to get the daemons launched */
app = OBJ_NEW(orte_app_context_t);
if (NULL == app) {
opal_show_help("help-orteboot.txt", "orteboot:call-failed",
true, orteboot_basename, "system", "malloc returned NULL", errno);
exit(1);
}
/* Spawn the job */
rc = orte_rmgr.spawn_job(&app, 1, &jobid, 0, NULL, NULL, 0, &attributes);
if (ORTE_SUCCESS != rc) {
/* JMS show_help */
opal_output(0, "%s: spawn failed with errno=%d\n", orteboot_basename, rc);
}
OBJ_DESTRUCT(&attributes);
/* just do a fork/exec of orted --seed --persistent and then exit */
/* setup and enter the event monitor */
OPAL_THREAD_LOCK(&orteboot_globals.lock);
while (false == orteboot_globals.exit) {
opal_condition_wait(&orteboot_globals.cond, &orteboot_globals.lock);
}
OPAL_THREAD_UNLOCK(&orteboot_globals.lock);
orte_finalize();
free(orteboot_basename);
free(orteboot_basename);
return rc;
}
/*
 * Timer callback that lets orteboot leave its wait loop.
 *
 * The fd, event and arg parameters are part of the event callback
 * signature and are not used here.
 */
static void exit_callback(int fd, short event, void *arg)
{
    /* the abort has been handled - stop watching for SIGTERM and SIGINT */
    opal_signal_del(&term_handler);
    opal_signal_del(&int_handler);

    /* raise the exit flag and wake whoever is waiting on the condition */
    orteboot_globals.exit = true;
    opal_condition_signal(&orteboot_globals.cond);
}
static void abort_signal_callback(int fd, short flags, void *arg)
{
int ret;
struct timeval tv = { 1, 0 };
opal_event_t* event;
opal_list_t attrs;
opal_list_item_t *item;
static int signalled = 0;
OPAL_TRACE(1);
if (0 != signalled++) {
return;
}
fprintf(stderr, "%s: killing job...\n\n", orteboot_basename);
/* terminate the vm - this will also wake us up so we can exit */
OBJ_CONSTRUCT(&attrs, opal_list_t);
orte_rmgr.add_attribute(&attrs, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);
ret = orte_pls.terminate_orteds(0, &attrs);
while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item);
OBJ_DESTRUCT(&attrs);
/* setup a delay to give the orteds time to complete their departure */
if (NULL != (event = (opal_event_t*)malloc(sizeof(opal_event_t)))) {
opal_evtimer_set(event, exit_callback, NULL);
opal_evtimer_add(event, &tv);
}
}
/*
 * Non-blocking receive callback for messages arriving on
 * ORTE_RML_TAG_DAEMON.
 *
 * Unpacks one daemon command from the incoming buffer and acts on it:
 *   - EXIT:          raise the exit flag and signal the condition
 *   - CONTACT QUERY: send our contact URI back to the sender
 *   - HOSTFILE, SCRIPTFILE, HEARTBEAT: logged as not implemented
 *   - anything else: logged as a bad parameter
 *
 * The receive is posted as non-persistent, so it is reissued before
 * returning on every path.
 *
 * status - unused
 * sender - name of the process that sent the message
 * buffer - message payload holding the command flag
 * tag    - tag the message arrived on; reused for the reply
 * cbdata - unused
 */
static void orte_daemon_recv(int status, orte_process_name_t* sender,
                             orte_buffer_t *buffer, orte_rml_tag_t tag,
                             void* cbdata)
{
    orte_buffer_t *answer;
    orte_daemon_cmd_flag_t command;
    int ret;
    orte_std_cntr_t n;
    char *contact_info;

    OPAL_TRACE(1);

    OPAL_THREAD_LOCK(&orteboot_globals.lock);

    if (orteboot_globals.debug) {
        opal_output(0, "orteboot: received message from [%ld,%ld,%ld]", ORTE_NAME_ARGS(sender));
    }

    answer = OBJ_NEW(orte_buffer_t);
    if (NULL == answer) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        goto DONE;
    }

    n = 1;
    if (ORTE_SUCCESS != (ret = orte_dss.unpack(buffer, &command, &n, ORTE_DAEMON_CMD))) {
        ORTE_ERROR_LOG(ret);
        goto CLEANUP;
    }

    switch (command) {
    /**** EXIT COMMAND ****/
    case ORTE_DAEMON_EXIT_CMD:
        if (orteboot_globals.debug) {
            opal_output(0, "orteboot: received exit");
        }
        orteboot_globals.exit = true;
        opal_condition_signal(&orteboot_globals.cond);
        break;

    /**** CONTACT QUERY COMMAND ****/
    case ORTE_DAEMON_CONTACT_QUERY_CMD:
        /* send back contact info */
        contact_info = orte_rml.get_uri();
        if (NULL == contact_info) {
            ORTE_ERROR_LOG(ORTE_ERROR);
            break;
        }
        if (ORTE_SUCCESS != (ret = orte_dss.pack(answer, &contact_info, 1, ORTE_STRING))) {
            ORTE_ERROR_LOG(ret);
        } else if (0 > orte_rml.send_buffer(sender, answer, tag, 0)) {
            ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        }
        /* the URI string is owned by the caller of get_uri(); release
         * it on every path now that the buffer no longer needs it
         */
        free(contact_info);
        break;

    /**** HOSTFILE COMMAND ****/
    case ORTE_DAEMON_HOSTFILE_CMD:
        ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
        break;

    /**** SCRIPTFILE COMMAND ****/
    case ORTE_DAEMON_SCRIPTFILE_CMD:
        ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
        break;

    /**** HEARTBEAT COMMAND ****/
    case ORTE_DAEMON_HEARTBEAT_CMD:
        ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
        break;

    /**** UNRECOGNIZED COMMAND ****/
    default:
        /* do not drop unknown commands silently */
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        break;
    }

CLEANUP:
    OBJ_RELEASE(answer);

DONE:
    OPAL_THREAD_UNLOCK(&orteboot_globals.lock);

    /* reissue the non-blocking receive */
    ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON, ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL);
    if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) {
        ORTE_ERROR_LOG(ret);
    }

    return;
}

Просмотреть файл

@ -66,6 +66,7 @@
#include "orte/mca/rmgr/rmgr.h"
#include "orte/mca/rmgr/base/base.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/pls/pls.h"
#include "orte/runtime/runtime.h"
@ -721,6 +722,36 @@ static void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
return;
}
/*
 * Timer callback armed by halt_vm(): raises the daemon's exit
 * condition and wakes the waiter.
 *
 * The fd, event and arg parameters are part of the event callback
 * signature and are not used here.
 */
static void exit_callback(int fd, short event, void *arg)
{
    /* Trigger the normal exit conditions */
    orted_globals.exit_condition = true;
    opal_condition_signal(&orted_globals.condition);

    /* NOTE(review): this releases orted_globals.mutex although the
     * function itself never acquires it - confirm that the context
     * in which the timer fires actually holds the mutex
     */
    OPAL_THREAD_UNLOCK(&orted_globals.mutex);
}
static void halt_vm(void)
{
int ret;
struct timeval tv = { 1, 0 };
opal_event_t* event;
opal_list_t attrs;
opal_list_item_t *item;
/* terminate the vm - this will also wake us up so we can exit */
OBJ_CONSTRUCT(&attrs, opal_list_t);
orte_rmgr.add_attribute(&attrs, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);
ret = orte_pls.terminate_orteds(0, &attrs);
while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item);
OBJ_DESTRUCT(&attrs);
/* setup a delay to give the orteds time to complete their departure */
if (NULL != (event = (opal_event_t*)malloc(sizeof(opal_event_t)))) {
opal_evtimer_set(event, exit_callback, NULL);
opal_evtimer_add(event, &tv);
}
}
static void orte_daemon_recv(int status, orte_process_name_t* sender,
orte_buffer_t *buffer, orte_rml_tag_t tag,
void* cbdata)
@ -741,72 +772,84 @@ static void orte_daemon_recv(int status, orte_process_name_t* sender,
ORTE_NAME_ARGS(sender));
}
n = 1;
if (ORTE_SUCCESS != (ret = orte_dss.unpack(buffer, &command, &n, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(ret);
OPAL_THREAD_UNLOCK(&orted_globals.mutex);
return;
}
answer = OBJ_NEW(orte_buffer_t);
if (NULL == answer) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
goto DONE;
}
n = 1;
if (ORTE_SUCCESS != (ret = orte_dss.unpack(buffer, &command, &n, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(ret);
goto CLEANUP;
}
/**** EXIT COMMAND ****/
if (ORTE_DAEMON_EXIT_CMD == command) {
if (orted_globals.debug_daemons) {
opal_output(0, "[%lu,%lu,%lu] orted_recv: received exit",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
orted_globals.exit_condition = true;
opal_condition_signal(&orted_globals.condition);
goto CLEANUP;
switch(command) {
/**** EXIT COMMAND ****/
case ORTE_DAEMON_EXIT_CMD:
if (orted_globals.debug_daemons) {
opal_output(0, "[%lu,%lu,%lu] orted_recv: received exit",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
orted_globals.exit_condition = true;
opal_condition_signal(&orted_globals.condition);
break;
/**** HALT VM COMMAND ****/
case ORTE_DAEMON_HALT_VM_CMD:
if (orted_globals.debug_daemons) {
opal_output(0, "[%lu,%lu,%lu] orted_recv: received halt vm",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
halt_vm();
break;
/**** CONTACT QUERY COMMAND ****/
} else if (ORTE_DAEMON_CONTACT_QUERY_CMD == command) {
/* send back contact info */
contact_info = orte_rml.get_uri();
if (NULL == contact_info) {
ORTE_ERROR_LOG(ORTE_ERROR);
goto CLEANUP;
}
if (ORTE_SUCCESS != (ret = orte_dss.pack(answer, &contact_info, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(ret);
goto CLEANUP;
}
if (0 > orte_rml.send_buffer(sender, answer, tag, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
}
goto CLEANUP;
case ORTE_DAEMON_CONTACT_QUERY_CMD:
/* send back contact info */
contact_info = orte_rml.get_uri();
if (NULL == contact_info) {
ORTE_ERROR_LOG(ORTE_ERROR);
goto CLEANUP;
}
if (ORTE_SUCCESS != (ret = orte_dss.pack(answer, &contact_info, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(ret);
goto CLEANUP;
}
if (0 > orte_rml.send_buffer(sender, answer, tag, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
}
break;
/**** HOSTFILE COMMAND ****/
} else if (ORTE_DAEMON_HOSTFILE_CMD == command) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
goto CLEANUP;
case ORTE_DAEMON_HOSTFILE_CMD:
ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
break;
/**** SCRIPTFILE COMMAND ****/
} else if (ORTE_DAEMON_SCRIPTFILE_CMD == command) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
goto CLEANUP;
case ORTE_DAEMON_SCRIPTFILE_CMD:
ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
break;
/**** HEARTBEAT COMMAND ****/
} else if (ORTE_DAEMON_HEARTBEAT_CMD == command) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
goto CLEANUP;
case ORTE_DAEMON_HEARTBEAT_CMD:
ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
break;
default:
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
}
CLEANUP:
OBJ_RELEASE(answer);
OBJ_RELEASE(answer);
DONE:
OPAL_THREAD_UNLOCK(&orted_globals.mutex);
OPAL_THREAD_UNLOCK(&orted_globals.mutex);
/* reissue the non-blocking receive */
ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON, ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL);
@ -816,4 +859,3 @@ DONE:
return;
}

Просмотреть файл

@ -44,29 +44,19 @@
#include "opal/event/event.h"
#include "opal/install_dirs.h"
#include "opal/mca/base/base.h"
#include "opal/threads/condition.h"
#include "opal/util/argv.h"
#include "opal/util/basename.h"
#include "opal/util/cmd_line.h"
#include "opal/util/opal_environ.h"
#include "opal/util/output.h"
#include "opal/util/show_help.h"
#include "opal/util/trace.h"
#include "opal/version.h"
#include "opal/threads/mutex.h"
#include "opal/threads/condition.h"
#include "orte/class/orte_pointer_array.h"
#include "orte/util/proc_info.h"
#include "orte/util/sys_info.h"
#include "orte/util/universe_setup_file_io.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/pls/pls.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/mca/rmgr/rmgr.h"
#include "orte/mca/schema/schema.h"
#include "orte/mca/smr/smr.h"
#include "orte/dss/dss.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_wait.h"
@ -74,7 +64,7 @@
static char *ortehalt_basename = NULL;
/*
* setup globals for catching orterun command line options
* setup globals for catching ortehalt command line options
*/
struct globals_t {
bool help;
@ -101,9 +91,6 @@ opal_cmd_line_init_t cmd_line_init[] = {
{ NULL, NULL, NULL, 'v', NULL, "verbose", 0,
&ortehalt_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
"Be verbose" },
{ NULL, NULL, NULL, 'q', NULL, "quiet", 0,
&ortehalt_globals.quiet, OPAL_CMD_LINE_TYPE_BOOL,
"Suppress helpful messages" },
/* OpenRTE arguments */
{ "orte", "debug", NULL, 'd', NULL, "debug-devel", 0,
@ -116,7 +103,7 @@ opal_cmd_line_init_t cmd_line_init[] = {
{ NULL, NULL, NULL, '\0', NULL, "tmpdir", 1,
&orte_process_info.tmpdir_base, OPAL_CMD_LINE_TYPE_STRING,
"Set the root for the session directory tree for orterun ONLY" },
"Set the root for the session directory tree for ortehalt ONLY" },
/* End of list */
{ NULL, NULL, NULL, '\0', NULL, NULL, 0,
@ -129,6 +116,8 @@ extern char** environ;
int main(int argc, char *argv[])
{
orte_buffer_t *cmd;
orte_daemon_cmd_flag_t command;
int rc;
int id, iparam;
@ -165,13 +154,39 @@ int main(int argc, char *argv[])
* require
*/
if (ORTE_SUCCESS != (rc = orte_init(true))) {
opal_show_help("help-orterun.txt", "orterun:init-failure", true,
opal_show_help("help-ortehalt.txt", "ortehalt:init-failure", true,
"orte_init()", rc);
return rc;
}
cmd = OBJ_NEW(orte_buffer_t);
if (NULL == cmd) {
opal_show_help("help-ortehalt.txt", "ortehalt:init-failure", true,
"orte_init()", rc);
return ORTE_ERROR;
}
command = ORTE_DAEMON_HALT_VM_CMD;
rc = orte_dss.pack(cmd, &command, 1, ORTE_DAEMON_CMD);
if ( ORTE_SUCCESS != rc ) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(cmd);
return rc;
}
rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, cmd, ORTE_RML_TAG_DAEMON, 0);
if ( 0 > rc ) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(cmd);
return ORTE_ERR_COMM_FAILURE;
}
OBJ_RELEASE(cmd);
orte_finalize();
free(ortehalt_basename);
return rc;
}