From a0ffeb205a093e5018f5e36500dc6800da786bcd Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 11 Sep 2012 20:35:46 +0000 Subject: [PATCH] Add an orted component for staged operations and rename the staged component to "staged_hnp". This commit was SVN r27305. --- orte/mca/state/orted/state_orted.c | 45 ++- orte/mca/state/staged_hnp/Makefile.am | 37 ++ .../help-state-staged-hnp.txt} | 0 .../state_staged_hnp.c} | 18 +- .../state_staged_hnp.h} | 10 +- .../state_staged_hnp_component.c} | 37 +- .../{staged => staged_orted}/Makefile.am | 22 +- .../state/staged_orted/state_staged_orted.c | 342 ++++++++++++++++++ .../state/staged_orted/state_staged_orted.h | 36 ++ .../state_staged_orted_component.c | 91 +++++ .../data_type_support/orte_dt_print_fns.c | 5 +- 11 files changed, 572 insertions(+), 71 deletions(-) create mode 100644 orte/mca/state/staged_hnp/Makefile.am rename orte/mca/state/{staged/help-state-staged.txt => staged_hnp/help-state-staged-hnp.txt} (100%) rename orte/mca/state/{staged/state_staged.c => staged_hnp/state_staged_hnp.c} (95%) rename orte/mca/state/{staged/state_staged.h => staged_hnp/state_staged_hnp.h} (75%) rename orte/mca/state/{staged/state_staged_component.c => staged_hnp/state_staged_hnp_component.c} (58%) rename orte/mca/state/{staged => staged_orted}/Makefile.am (53%) create mode 100644 orte/mca/state/staged_orted/state_staged_orted.c create mode 100644 orte/mca/state/staged_orted/state_staged_orted.h create mode 100644 orte/mca/state/staged_orted/state_staged_orted_component.c diff --git a/orte/mca/state/orted/state_orted.c b/orte/mca/state/orted/state_orted.c index 071051fece..eaa4491800 100644 --- a/orte/mca/state/orted/state_orted.c +++ b/orte/mca/state/orted/state_orted.c @@ -38,24 +38,21 @@ static int init(void); static int finalize(void); /****************** - * ORTED module - just uses base functions after - * initializing the proc state machine. Job state - * machine is unused by ortedlication procs at this - * time. 
+ * ORTED module ******************/ orte_state_base_module_t orte_state_orted_module = { - init, - finalize, - orte_state_base_activate_job_state, - orte_state_base_add_job_state, - orte_state_base_set_job_state_callback, - orte_state_base_set_job_state_priority, - orte_state_base_remove_job_state, - orte_state_base_activate_proc_state, - orte_state_base_add_proc_state, - orte_state_base_set_proc_state_callback, - orte_state_base_set_proc_state_priority, - orte_state_base_remove_proc_state + init, + finalize, + orte_state_base_activate_job_state, + orte_state_base_add_job_state, + orte_state_base_set_job_state_callback, + orte_state_base_set_job_state_priority, + orte_state_base_remove_job_state, + orte_state_base_activate_proc_state, + orte_state_base_add_proc_state, + orte_state_base_set_proc_state_callback, + orte_state_base_set_proc_state_priority, + orte_state_base_remove_proc_state }; /* Local functions */ @@ -73,16 +70,16 @@ static orte_state_cbfunc_t job_callbacks[] = { }; static orte_proc_state_t proc_states[] = { - ORTE_PROC_STATE_RUNNING, - ORTE_PROC_STATE_REGISTERED, - ORTE_PROC_STATE_IOF_COMPLETE, - ORTE_PROC_STATE_WAITPID_FIRED + ORTE_PROC_STATE_RUNNING, + ORTE_PROC_STATE_REGISTERED, + ORTE_PROC_STATE_IOF_COMPLETE, + ORTE_PROC_STATE_WAITPID_FIRED }; static orte_state_cbfunc_t proc_callbacks[] = { - track_procs, - track_procs, - track_procs, - track_procs + track_procs, + track_procs, + track_procs, + track_procs }; /************************ diff --git a/orte/mca/state/staged_hnp/Makefile.am b/orte/mca/state/staged_hnp/Makefile.am new file mode 100644 index 0000000000..8250ef1936 --- /dev/null +++ b/orte/mca/state/staged_hnp/Makefile.am @@ -0,0 +1,37 @@ +# +# Copyright (c) 2012 Los Alamos National Security, LLC. +# All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +dist_pkgdata_DATA = help-state-staged-hnp.txt + +sources = \ + state_staged_hnp.h \ + state_staged_hnp_component.c \ + state_staged_hnp.c + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). + +if MCA_BUILD_orte_state_staged_hnp_DSO +component_noinst = +component_install = mca_state_staged_hnp.la +else +component_noinst = libmca_state_staged_hnp.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_state_staged_hnp_la_SOURCES = $(sources) +mca_state_staged_hnp_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_state_staged_hnp_la_SOURCES =$(sources) +libmca_state_staged_hnp_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/state/staged/help-state-staged.txt b/orte/mca/state/staged_hnp/help-state-staged-hnp.txt similarity index 100% rename from orte/mca/state/staged/help-state-staged.txt rename to orte/mca/state/staged_hnp/help-state-staged-hnp.txt diff --git a/orte/mca/state/staged/state_staged.c b/orte/mca/state/staged_hnp/state_staged_hnp.c similarity index 95% rename from orte/mca/state/staged/state_staged.c rename to orte/mca/state/staged_hnp/state_staged_hnp.c index 459778fb2f..b39782ee26 100644 --- a/orte/mca/state/staged/state_staged.c +++ b/orte/mca/state/staged_hnp/state_staged_hnp.c @@ -34,7 +34,7 @@ #include "orte/mca/state/state.h" #include "orte/mca/state/base/base.h" #include "orte/mca/state/base/state_private.h" -#include "state_staged.h" +#include "state_staged_hnp.h" /* * Module functions: Global @@ -45,7 +45,7 @@ static int finalize(void); /****************** * STAGED module ******************/ -orte_state_base_module_t orte_state_staged_module = { +orte_state_base_module_t orte_state_staged_hnp_module = { init, finalize, orte_state_base_activate_job_state, @@ -107,7 +107,7 @@ static 
orte_state_cbfunc_t launch_callbacks[] = { orte_quit }; -/* staged execution requires that we start as many +/* staged_hnp execution requires that we start as many * procs initially as we have resources - if we have * adequate resources, then we behave just like the * default HNP module. If we don't, then we will have @@ -218,8 +218,8 @@ static void setup_job_complete(int fd, short args, void *cbdata) continue; } if (app->num_procs <= 0) { - /* must specify -np for staged execution */ - orte_show_help("help-state-staged.txt", "no-np", true); + /* must specify -np for staged_hnp execution */ + orte_show_help("help-state-staged-hnp.txt", "no-np", true); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SILENT_ABORT); OBJ_RELEASE(caddy); return; @@ -248,7 +248,7 @@ static void setup_job_complete(int fd, short args, void *cbdata) } } - /* set the job map to use the staged mapper */ + /* set the job map to use the staged_hnp mapper */ if (NULL == jdata->map) { jdata->map = OBJ_NEW(orte_job_map_t); jdata->map->req_mapper = strdup("staged"); @@ -282,7 +282,7 @@ static void cleanup_node(orte_proc_t *proc) } } OPAL_OUTPUT_VERBOSE((5, orte_state_base_output, - "%s state:staged:track_procs node %s has %d slots, %d slots inuse", + "%s state:staged_hnp:track_procs node %s has %d slots, %d slots inuse", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, (int)node->slots, (int)node->slots_inuse)); } @@ -296,7 +296,7 @@ static void track_procs(int fd, short args, void *cbdata) orte_proc_t *pdata; OPAL_OUTPUT_VERBOSE((5, orte_state_base_output, - "%s state:staged:track_procs called for proc %s state %s", + "%s state:staged_hnp:track_procs called for proc %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); @@ -316,7 +316,7 @@ static void track_procs(int fd, short args, void *cbdata) if (ORTE_PROC_STATE_REGISTERED == state) { if (pdata->mpi_proc && !jdata->gang_launched) { /* we can't support this - issue an error and abort */ - 
orte_show_help("help-state-staged.txt", "mpi-procs-not-supported", true); + orte_show_help("help-state-staged-hnp.txt", "mpi-procs-not-supported", true); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SILENT_ABORT); } /* update the proc state */ diff --git a/orte/mca/state/staged/state_staged.h b/orte/mca/state/staged_hnp/state_staged_hnp.h similarity index 75% rename from orte/mca/state/staged/state_staged.h rename to orte/mca/state/staged_hnp/state_staged_hnp.h index ab2cc2107b..185299c4dc 100644 --- a/orte/mca/state/staged/state_staged.h +++ b/orte/mca/state/staged_hnp/state_staged_hnp.h @@ -14,8 +14,8 @@ * */ -#ifndef MCA_STATE_STAGED_EXPORT_H -#define MCA_STATE_STAGED_EXPORT_H +#ifndef MCA_STATE_STAGED_HNP_EXPORT_H +#define MCA_STATE_STAGED_HNP_EXPORT_H #include "orte_config.h" @@ -27,10 +27,10 @@ BEGIN_C_DECLS * Local Component structures */ -ORTE_MODULE_DECLSPEC extern orte_state_base_component_t mca_state_staged_component; +ORTE_MODULE_DECLSPEC extern orte_state_base_component_t mca_state_staged_hnp_component; -ORTE_DECLSPEC extern orte_state_base_module_t orte_state_staged_module; +ORTE_DECLSPEC extern orte_state_base_module_t orte_state_staged_hnp_module; END_C_DECLS -#endif /* MCA_STATE_STAGED_EXPORT_H */ +#endif /* MCA_STATE_STAGED_HNP_EXPORT_H */ diff --git a/orte/mca/state/staged/state_staged_component.c b/orte/mca/state/staged_hnp/state_staged_hnp_component.c similarity index 58% rename from orte/mca/state/staged/state_staged_component.c rename to orte/mca/state/staged_hnp/state_staged_hnp_component.c index ba9522a5b6..eb005ca211 100644 --- a/orte/mca/state/staged/state_staged_component.c +++ b/orte/mca/state/staged_hnp/state_staged_hnp_component.c @@ -14,26 +14,26 @@ #include "orte/mca/state/state.h" #include "orte/mca/state/base/base.h" -#include "state_staged.h" +#include "state_staged_hnp.h" /* * Public string for version number */ -const char *orte_state_staged_component_version_string = - "ORTE STATE staged MCA component version " ORTE_VERSION; 
+const char *orte_state_staged_hnp_component_version_string = + "ORTE STATE staged_hnp MCA component version " ORTE_VERSION; /* * Local functionality */ -static int state_staged_open(void); -static int state_staged_close(void); -static int state_staged_component_query(mca_base_module_t **module, int *priority); +static int state_staged_hnp_open(void); +static int state_staged_hnp_close(void); +static int state_staged_hnp_component_query(mca_base_module_t **module, int *priority); /* * Instantiate the public struct with all of our public information * and pointer to our public functions in it */ -orte_state_base_component_t mca_state_staged_component = +orte_state_base_component_t mca_state_staged_hnp_component = { /* Handle the general mca_component_t struct containing * meta information about the component @@ -41,15 +41,15 @@ orte_state_base_component_t mca_state_staged_component = { ORTE_STATE_BASE_VERSION_1_0_0, /* Component name and version */ - "staged", + "staged_hnp", ORTE_MAJOR_VERSION, ORTE_MINOR_VERSION, ORTE_RELEASE_VERSION, /* Component open and close functions */ - state_staged_open, - state_staged_close, - state_staged_component_query + state_staged_hnp_open, + state_staged_hnp_close, + state_staged_hnp_component_query }, { /* The component is checkpoint ready */ @@ -59,29 +59,28 @@ orte_state_base_component_t mca_state_staged_component = static bool select_me = false; -static int state_staged_open(void) +static int state_staged_hnp_open(void) { int tmp; - mca_base_component_t *c=&mca_state_staged_component.base_version; - mca_base_param_reg_int(c, "select", - "Use this component", - false, false, (int)false, &tmp); + mca_base_param_reg_int_name("state", "staged_select", + "Use this component", + false, false, (int)false, &tmp); select_me = OPAL_INT_TO_BOOL(tmp); return ORTE_SUCCESS; } -static int state_staged_close(void) +static int state_staged_hnp_close(void) { return ORTE_SUCCESS; } -static int state_staged_component_query(mca_base_module_t 
**module, int *priority) +static int state_staged_hnp_component_query(mca_base_module_t **module, int *priority) { if (ORTE_PROC_IS_HNP && select_me) { *priority = 1000; - *module = (mca_base_module_t *)&orte_state_staged_module; + *module = (mca_base_module_t *)&orte_state_staged_hnp_module; return ORTE_SUCCESS; } diff --git a/orte/mca/state/staged/Makefile.am b/orte/mca/state/staged_orted/Makefile.am similarity index 53% rename from orte/mca/state/staged/Makefile.am rename to orte/mca/state/staged_orted/Makefile.am index ff02ed135d..654c92034b 100644 --- a/orte/mca/state/staged/Makefile.am +++ b/orte/mca/state/staged_orted/Makefile.am @@ -8,30 +8,28 @@ # $HEADER$ # -dist_pkgdata_DATA = help-state-staged.txt - sources = \ - state_staged.h \ - state_staged_component.c \ - state_staged.c + state_staged_orted.h \ + state_staged_orted_component.c \ + state_staged_orted.c # Make the output library in this directory, and name it either # mca__.la (for DSO builds) or libmca__.la # (for static builds). 
-if MCA_BUILD_orte_state_staged_DSO +if MCA_BUILD_orte_state_staged_orted_DSO component_noinst = -component_install = mca_state_staged.la +component_install = mca_state_staged_orted.la else -component_noinst = libmca_state_staged.la +component_noinst = libmca_state_staged_orted.la component_install = endif mcacomponentdir = $(pkglibdir) mcacomponent_LTLIBRARIES = $(component_install) -mca_state_staged_la_SOURCES = $(sources) -mca_state_staged_la_LDFLAGS = -module -avoid-version +mca_state_staged_orted_la_SOURCES = $(sources) +mca_state_staged_orted_la_LDFLAGS = -module -avoid-version noinst_LTLIBRARIES = $(component_noinst) -libmca_state_staged_la_SOURCES =$(sources) -libmca_state_staged_la_LDFLAGS = -module -avoid-version +libmca_state_staged_orted_la_SOURCES =$(sources) +libmca_state_staged_orted_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/state/staged_orted/state_staged_orted.c b/orte/mca/state/staged_orted/state_staged_orted.c new file mode 100644 index 0000000000..43e58b9a2a --- /dev/null +++ b/orte/mca/state/staged_orted/state_staged_orted.c @@ -0,0 +1,342 @@ +/* + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "orte_config.h"
+
+#include <sys/types.h>  /* NOTE(review): header name was lost in extraction; presumed <sys/types.h> - confirm */
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif  /* HAVE_UNISTD_H */
+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif
+
+#include "opal/util/output.h"
+
+#include "orte/mca/errmgr/errmgr.h"
+#include "orte/mca/iof/iof.h"
+#include "orte/mca/rml/rml.h"
+#include "orte/util/session_dir.h"
+#include "orte/runtime/orte_quit.h"
+
+#include "orte/mca/state/state.h"
+#include "orte/mca/state/base/base.h"
+#include "orte/mca/state/base/state_private.h"
+#include "state_staged_orted.h"
+
+/*
+ * Module functions: Global
+ */
+static int init(void);
+static int finalize(void);
+
+/******************
+ * STAGED_ORTED module - local init/finalize, base functions for the rest
+ ******************/
+orte_state_base_module_t orte_state_staged_orted_module = {
+    init,
+    finalize,
+    orte_state_base_activate_job_state,
+    orte_state_base_add_job_state,
+    orte_state_base_set_job_state_callback,
+    orte_state_base_set_job_state_priority,
+    orte_state_base_remove_job_state,
+    orte_state_base_activate_proc_state,
+    orte_state_base_add_proc_state,
+    orte_state_base_set_proc_state_callback,
+    orte_state_base_set_proc_state_priority,
+    orte_state_base_remove_proc_state
+};
+
+/* Local functions */
+static void track_jobs(int fd, short argc, void *cbdata);
+static void track_procs(int fd, short argc, void *cbdata);
+static int pack_state_update(opal_buffer_t *buf,
+                             orte_job_t *jdata,
+                             orte_proc_t *proc);
+
+/* defined default state machines - job_states[i] is handled by job_callbacks[i] */
+static orte_job_state_t job_states[] = {
+    ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE,
+};
+static orte_state_cbfunc_t job_callbacks[] = {
+    track_jobs
+};
+
+static orte_proc_state_t proc_states[] = {
+    ORTE_PROC_STATE_RUNNING,
+    ORTE_PROC_STATE_REGISTERED,
+    ORTE_PROC_STATE_IOF_COMPLETE,
+    ORTE_PROC_STATE_WAITPID_FIRED
+};
+static orte_state_cbfunc_t proc_callbacks[] = {
+    track_procs,
+    track_procs,
+    track_procs,
+    track_procs
+};
+
+/************************
+ * API Definitions
+
************************/
+/* Build the job and proc state machines for this module.
+ *
+ * Registers every entry of job_states[]/proc_states[] with its
+ * matching callback, plus orte_quit for the FORCED_EXIT and
+ * DAEMONS_TERMINATED job states. Registration failures are logged
+ * but do not abort initialization; ORTE_SUCCESS is always returned.
+ */
+static int init(void)
+{
+    int num_states, i, rc;
+
+    /* setup the state machine */
+    OBJ_CONSTRUCT(&orte_job_states, opal_list_t);
+    OBJ_CONSTRUCT(&orte_proc_states, opal_list_t);
+
+    num_states = sizeof(job_states) / sizeof(orte_job_state_t);
+    for (i=0; i < num_states; i++) {
+        if (ORTE_SUCCESS != (rc = orte_state.add_job_state(job_states[i],
+                                                           job_callbacks[i],
+                                                           ORTE_SYS_PRI))) {
+            ORTE_ERROR_LOG(rc);
+        }
+    }
+    /* add a default error response */
+    if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_FORCED_EXIT,
+                                                       orte_quit, ORTE_ERROR_PRI))) {
+        ORTE_ERROR_LOG(rc);
+    }
+    /* add a state for when we are ordered to terminate */
+    if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_DAEMONS_TERMINATED,
+                                                       orte_quit, ORTE_ERROR_PRI))) {
+        ORTE_ERROR_LOG(rc);
+    }
+    if (5 < opal_output_get_verbosity(orte_state_base_output)) {
+        orte_state_base_print_job_state_machine();
+    }
+
+    /* populate the proc state machine to allow us to
+     * track proc lifecycle changes
+     */
+    num_states = sizeof(proc_states) / sizeof(orte_proc_state_t);
+    for (i=0; i < num_states; i++) {
+        if (ORTE_SUCCESS != (rc = orte_state.add_proc_state(proc_states[i],
+                                                            proc_callbacks[i],
+                                                            ORTE_SYS_PRI))) {
+            ORTE_ERROR_LOG(rc);
+        }
+    }
+    if (5 < opal_output_get_verbosity(orte_state_base_output)) {
+        orte_state_base_print_proc_state_machine();
+    }
+    return ORTE_SUCCESS;
+}
+
+/* Tear down both state machines: release every registered state
+ * and destruct the two lists constructed in init().
+ */
+static int finalize(void)
+{
+    opal_list_item_t *item;
+
+    /* cleanup the state machines */
+    while (NULL != (item = opal_list_remove_first(&orte_job_states))) {
+        OBJ_RELEASE(item);
+    }
+    OBJ_DESTRUCT(&orte_job_states);
+    while (NULL != (item = opal_list_remove_first(&orte_proc_states))) {
+        OBJ_RELEASE(item);
+    }
+    OBJ_DESTRUCT(&orte_proc_states);
+
+    return ORTE_SUCCESS;
+}
+
+/* Job-state callback: nothing to do here beyond releasing the caddy. */
+static void track_jobs(int fd, short argc, void *cbdata)
+{
+    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
+
+    /* ignore this */
+    OBJ_RELEASE(caddy);
+}
+
+/* Mark a local proc as terminated and send the HNP a state update.
+ *
+ * Called once both the IOF-complete and waitpid events have been
+ * recorded for the proc. If building or sending the update fails,
+ * the buffer is released here and nothing is sent: a half-packed
+ * update is not forwarded.
+ */
+static void report_termination(orte_job_t *jdata,
+                               orte_proc_t *pdata,
+                               orte_process_name_t *proc)
+{
+    opal_buffer_t *alert;
+    orte_plm_cmd_flag_t cmd = ORTE_PLM_UPDATE_PROC_STATE;
+    int rc;
+
+    /* the proc has terminated */
+    pdata->alive = false;
+    pdata->state = ORTE_PROC_STATE_TERMINATED;
+    /* Clean up the session directory as if we were the process
+     * itself.  This covers the case where the process died abnormally
+     * and didn't cleanup its own session directory.
+     */
+    orte_session_dir_finalize(proc);
+
+    /* alert the HNP */
+    alert = OBJ_NEW(opal_buffer_t);
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
+        ORTE_ERROR_LOG(rc);
+        OBJ_RELEASE(alert);
+        return;
+    }
+    /* pack the info */
+    if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata, pdata))) {
+        ORTE_ERROR_LOG(rc);
+        OBJ_RELEASE(alert);
+        return;
+    }
+    /* send it */
+    OPAL_OUTPUT_VERBOSE((5, orte_state_base_output,
+                         "%s SENDING TERMINATION UPDATE FOR PROC %s",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         ORTE_NAME_PRINT(&pdata->name)));
+    if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
+                                          ORTE_RML_TAG_PLM, 0,
+                                          orte_rml_send_callback, NULL))) {
+        ORTE_ERROR_LOG(rc);
+        /* NOTE(review): presumably orte_rml_send_callback releases the
+         * buffer once a send completes; when the send is refused up
+         * front nothing else will, so release it here - confirm
+         */
+        OBJ_RELEASE(alert);
+    }
+}
+
+/* Proc-state callback for RUNNING, REGISTERED, IOF_COMPLETE and
+ * WAITPID_FIRED.
+ *
+ * cbdata is the orte_state_caddy_t carrying the proc name and the
+ * state being activated; it is always released before returning.
+ * A proc is reported to the HNP as terminated only after BOTH the
+ * IOF-complete and waitpid events have been seen, in either order.
+ */
+static void track_procs(int fd, short argc, void *cbdata)
+{
+    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
+    orte_process_name_t *proc = &caddy->name;
+    orte_proc_state_t state = caddy->proc_state;
+    orte_job_t *jdata;
+    orte_proc_t *pdata;
+
+    OPAL_OUTPUT_VERBOSE((5, orte_state_base_output,
+                         "%s state:staged_orted:track_procs called for proc %s state %s",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         ORTE_NAME_PRINT(proc),
+                         orte_proc_state_to_str(state)));
+
+    /* get the job object for this proc */
+    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
+        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+        goto cleanup;
+    }
+    /* the slot for this vpid may be empty - every case below
+     * dereferences pdata, so bail out rather than crash
+     */
+    pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);
+    if (NULL == pdata) {
+        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+        goto cleanup;
+    }
+
+    switch (state) {
+    case ORTE_PROC_STATE_RUNNING:
+        /* update the proc state */
+        pdata->state = state;
+        jdata->num_launched++;
+        /* we don't really care - nothing further to do */
+        break;
+
+    case ORTE_PROC_STATE_REGISTERED:
+        /* update the proc state */
+        pdata->state = state;
+        /* if this proc registered as an MPI proc, and
+         * MPI is not allowed, then that is an error
+         */
+        if (!jdata->gang_launched && pdata->mpi_proc) {
+            /* TODO: not yet implemented */
+            /* abort the proc */
+            /* notify the HNP of the error */
+        }
+        break;
+
+    case ORTE_PROC_STATE_IOF_COMPLETE:
+        /* do NOT update the proc state as this can hit
+         * while we are still trying to notify the HNP of
+         * successful launch for short-lived procs
+         */
+        pdata->iof_complete = true;
+        if (pdata->waitpid_recvd) {
+            report_termination(jdata, pdata, proc);
+        }
+        /* Release the stdin IOF file descriptor for this child, if one
+         * was defined. File descriptors for the other IOF channels - stdout,
+         * stderr, and stddiag - were released when their associated pipes
+         * were cleared and closed due to termination of the process
+         * Do this after we handle termination in case the IOF needs
+         * to check to see if all procs from the job are actually terminated.
+         * This runs even when the termination report could not be sent,
+         * so the descriptor is not left open on that error path.
+         */
+        if (NULL != orte_iof.close) {
+            orte_iof.close(proc, ORTE_IOF_STDIN);
+        }
+        break;
+
+    case ORTE_PROC_STATE_WAITPID_FIRED:
+        /* do NOT update the proc state as this can hit
+         * while we are still trying to notify the HNP of
+         * successful launch for short-lived procs
+         */
+        pdata->waitpid_recvd = true;
+        if (pdata->iof_complete) {
+            report_termination(jdata, pdata, proc);
+        }
+        break;
+
+    default:
+        /* ignore */
+        break;
+    }
+
+ cleanup:
+    OBJ_RELEASE(caddy);
+}
+
+/* Append one proc's status record to an update buffer.
+ *
+ * Packs, in order: jobid, vpid, pid, state, exit code, and finally an
+ * ORTE_VPID_INVALID terminator. Returns ORTE_SUCCESS, or the first
+ * pack error (already logged); on error the buffer holds a partial
+ * record.
+ */
+static int pack_state_update(opal_buffer_t *alert,
+                             orte_job_t *jdata,
+                             orte_proc_t *child)
+{
+    int rc;
+    orte_vpid_t null=ORTE_VPID_INVALID;
+
+    /* pack the jobid */
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &jdata->jobid, 1, ORTE_JOBID))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+    /* pack the child's vpid */
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &(child->name.vpid), 1, ORTE_VPID))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+    /* pack the pid */
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->pid, 1, OPAL_PID))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+    /* pack its state */
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->state, 1, ORTE_PROC_STATE))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+    /* pack its exit code */
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->exit_code, 1, ORTE_EXIT_CODE))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+
+    /* flag that this job is complete so the receiver can know */
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+
+    return ORTE_SUCCESS;
+}
diff --git a/orte/mca/state/staged_orted/state_staged_orted.h
b/orte/mca/state/staged_orted/state_staged_orted.h
new file mode 100644
index 0000000000..66555f9e23
--- /dev/null
+++ b/orte/mca/state/staged_orted/state_staged_orted.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2012 Los Alamos National Security, LLC.
+ * All rights reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+/**
+ * @file
+ * Declarations of the staged_orted state component and its module.
+ */
+
+#ifndef MCA_STATE_STAGED_ORTED_EXPORT_H
+#define MCA_STATE_STAGED_ORTED_EXPORT_H
+
+#include "orte_config.h"
+
+#include "orte/mca/state/state.h"
+
+BEGIN_C_DECLS
+
+/*
+ * Local Component structures
+ */
+
+ORTE_MODULE_DECLSPEC extern orte_state_base_component_t mca_state_staged_orted_component;
+
+ORTE_DECLSPEC extern orte_state_base_module_t orte_state_staged_orted_module;
+
+END_C_DECLS
+
+#endif /* MCA_STATE_STAGED_ORTED_EXPORT_H */
diff --git a/orte/mca/state/staged_orted/state_staged_orted_component.c b/orte/mca/state/staged_orted/state_staged_orted_component.c
new file mode 100644
index 0000000000..e20235f534
--- /dev/null
+++ b/orte/mca/state/staged_orted/state_staged_orted_component.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2012 Los Alamos National Security, LLC.
+ * All rights reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "orte_config.h"
+#include "opal/util/output.h"
+
+#include "orte/mca/state/state.h"
+#include "orte/mca/state/base/base.h"
+#include "state_staged_orted.h"
+
+/*
+ * Public string for version number
+ */
+const char *orte_state_staged_orted_component_version_string =
+    "ORTE STATE staged_orted MCA component version " ORTE_VERSION;
+
+/*
+ * Local functionality: the open, close and query entry points wired into the component struct below
+ */
+static int state_staged_orted_open(void);
+static int state_staged_orted_close(void);
+static int state_staged_orted_component_query(mca_base_module_t **module, int *priority);
+
+/*
+ * Instantiate the public struct with all of our public information
+ * and pointer to our public functions in it
+ */
+orte_state_base_component_t mca_state_staged_orted_component =
+{
+    /* Handle the general mca_component_t struct containing
+     * meta information about the component
+     */
+    {
+        ORTE_STATE_BASE_VERSION_1_0_0,
+        /* Component name and version */
+        "staged_orted",
+        ORTE_MAJOR_VERSION,
+        ORTE_MINOR_VERSION,
+        ORTE_RELEASE_VERSION,
+
+        /* Component open and close functions */
+        state_staged_orted_open,
+        state_staged_orted_close,
+        state_staged_orted_component_query
+    },
+    {
+        /* The component is checkpoint ready */
+        MCA_BASE_METADATA_PARAM_CHECKPOINT
+    },
+};
+
+static bool select_me = false;  /* set in state_staged_orted_open(); gates selection in the query function */
+
+static int state_staged_orted_open(void)
+{
+    int tmp;  /* receives the value of the registered parameter */
+    /* register the "staged_select" int parameter under "state" (default false) and cache it in select_me */
+    mca_base_param_reg_int_name("state", "staged_select",
+                                "Use this component",
+                                false, false, (int)false, &tmp);
+    select_me = OPAL_INT_TO_BOOL(tmp);
+
+    return ORTE_SUCCESS;
+}
+
+static int state_staged_orted_close(void)
+{
+    return ORTE_SUCCESS;  /* nothing to release */
+}
+
+static int state_staged_orted_component_query(mca_base_module_t **module, int *priority)
+{
+    if (ORTE_PROC_IS_DAEMON && select_me) {
+        /* set our priority high */
+        *priority = 1000;
+        *module = (mca_base_module_t *)&orte_state_staged_orted_module;
+        return ORTE_SUCCESS;
+    }
+
+    *priority = -1;  /* not a daemon, or the select parameter is unset: decline */
+    *module = NULL;
+    return ORTE_ERROR;
+}
diff --git a/orte/runtime/data_type_support/orte_dt_print_fns.c b/orte/runtime/data_type_support/orte_dt_print_fns.c
index 970af353ef..690bec972a 100644
--- a/orte/runtime/data_type_support/orte_dt_print_fns.c
+++ b/orte/runtime/data_type_support/orte_dt_print_fns.c
@@ -217,12 +217,13 @@ int orte_dt_print_job(char **output, char *prefix, orte_job_t *src, opal_data_ty
         asprintf(&pfx2, "%s", prefix);
     }
 
-    asprintf(&tmp, "\n%sData for job: %s\tRecovery: %s(%s)\n%s\tNum apps: %ld\tControls: %0x\tStdin target: %s\tState: %s\tAbort: %s", pfx2,
+    asprintf(&tmp, "\n%sData for job: %s\tRecovery: %s(%s)\n%s\tNum apps: %ld\tControls: %0x\tMPI allowed: %s\tStdin target: %s\tState: %s\tAbort: %s", pfx2,
              ORTE_JOBID_PRINT(src->jobid),
              (src->enable_recovery) ? "ENABLED" : "DISABLED",
              (src->recovery_defined) ? "DEFINED" : "DEFAULT",
              pfx2,
-             (long)src->num_apps, src->controls, ORTE_VPID_PRINT(src->stdin_target),
+             (long)src->num_apps, src->controls,
+             src->gang_launched ? "YES" : "NO", ORTE_VPID_PRINT(src->stdin_target),
              orte_job_state_to_str(src->state), src->abort ? "True" : "False");
     asprintf(&pfx, "%s\t", pfx2);
     free(pfx2);