Add an orted component for staged operations and rename the staged component to "staged_hnp".
This commit was SVN r27305.
Этот коммит содержится в:
родитель
387f657fc2
Коммит
a0ffeb205a
@ -38,24 +38,21 @@ static int init(void);
|
||||
static int finalize(void);
|
||||
|
||||
/******************
|
||||
* ORTED module - just uses base functions after
|
||||
* initializing the proc state machine. Job state
|
||||
* machine is unused by ortedlication procs at this
|
||||
* time.
|
||||
* ORTED module
|
||||
******************/
|
||||
orte_state_base_module_t orte_state_orted_module = {
|
||||
init,
|
||||
finalize,
|
||||
orte_state_base_activate_job_state,
|
||||
orte_state_base_add_job_state,
|
||||
orte_state_base_set_job_state_callback,
|
||||
orte_state_base_set_job_state_priority,
|
||||
orte_state_base_remove_job_state,
|
||||
orte_state_base_activate_proc_state,
|
||||
orte_state_base_add_proc_state,
|
||||
orte_state_base_set_proc_state_callback,
|
||||
orte_state_base_set_proc_state_priority,
|
||||
orte_state_base_remove_proc_state
|
||||
init,
|
||||
finalize,
|
||||
orte_state_base_activate_job_state,
|
||||
orte_state_base_add_job_state,
|
||||
orte_state_base_set_job_state_callback,
|
||||
orte_state_base_set_job_state_priority,
|
||||
orte_state_base_remove_job_state,
|
||||
orte_state_base_activate_proc_state,
|
||||
orte_state_base_add_proc_state,
|
||||
orte_state_base_set_proc_state_callback,
|
||||
orte_state_base_set_proc_state_priority,
|
||||
orte_state_base_remove_proc_state
|
||||
};
|
||||
|
||||
/* Local functions */
|
||||
@ -73,16 +70,16 @@ static orte_state_cbfunc_t job_callbacks[] = {
|
||||
};
|
||||
|
||||
static orte_proc_state_t proc_states[] = {
|
||||
ORTE_PROC_STATE_RUNNING,
|
||||
ORTE_PROC_STATE_REGISTERED,
|
||||
ORTE_PROC_STATE_IOF_COMPLETE,
|
||||
ORTE_PROC_STATE_WAITPID_FIRED
|
||||
ORTE_PROC_STATE_RUNNING,
|
||||
ORTE_PROC_STATE_REGISTERED,
|
||||
ORTE_PROC_STATE_IOF_COMPLETE,
|
||||
ORTE_PROC_STATE_WAITPID_FIRED
|
||||
};
|
||||
static orte_state_cbfunc_t proc_callbacks[] = {
|
||||
track_procs,
|
||||
track_procs,
|
||||
track_procs,
|
||||
track_procs
|
||||
track_procs,
|
||||
track_procs,
|
||||
track_procs,
|
||||
track_procs
|
||||
};
|
||||
|
||||
/************************
|
||||
|
37
orte/mca/state/staged_hnp/Makefile.am
Обычный файл
37
orte/mca/state/staged_hnp/Makefile.am
Обычный файл
@ -0,0 +1,37 @@
|
||||
#
|
||||
# Copyright (c) 2012 Los Alamos National Security, LLC.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_pkgdata_DATA = help-state-staged-hnp.txt
|
||||
|
||||
sources = \
|
||||
state_staged_hnp.h \
|
||||
state_staged_hnp_component.c \
|
||||
state_staged_hnp.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if MCA_BUILD_orte_state_staged_hnp_DSO
|
||||
component_noinst =
|
||||
component_install = mca_state_staged_hnp.la
|
||||
else
|
||||
component_noinst = libmca_state_staged_hnp.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(pkglibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_state_staged_hnp_la_SOURCES = $(sources)
|
||||
mca_state_staged_hnp_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_state_staged_hnp_la_SOURCES =$(sources)
|
||||
libmca_state_staged_hnp_la_LDFLAGS = -module -avoid-version
|
@ -34,7 +34,7 @@
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/mca/state/base/base.h"
|
||||
#include "orte/mca/state/base/state_private.h"
|
||||
#include "state_staged.h"
|
||||
#include "state_staged_hnp.h"
|
||||
|
||||
/*
|
||||
* Module functions: Global
|
||||
@ -45,7 +45,7 @@ static int finalize(void);
|
||||
/******************
|
||||
* STAGED module
|
||||
******************/
|
||||
orte_state_base_module_t orte_state_staged_module = {
|
||||
orte_state_base_module_t orte_state_staged_hnp_module = {
|
||||
init,
|
||||
finalize,
|
||||
orte_state_base_activate_job_state,
|
||||
@ -107,7 +107,7 @@ static orte_state_cbfunc_t launch_callbacks[] = {
|
||||
orte_quit
|
||||
};
|
||||
|
||||
/* staged execution requires that we start as many
|
||||
/* staged_hnp execution requires that we start as many
|
||||
* procs initially as we have resources - if we have
|
||||
* adequate resources, then we behave just like the
|
||||
* default HNP module. If we don't, then we will have
|
||||
@ -218,8 +218,8 @@ static void setup_job_complete(int fd, short args, void *cbdata)
|
||||
continue;
|
||||
}
|
||||
if (app->num_procs <= 0) {
|
||||
/* must specify -np for staged execution */
|
||||
orte_show_help("help-state-staged.txt", "no-np", true);
|
||||
/* must specify -np for staged_hnp execution */
|
||||
orte_show_help("help-state-staged-hnp.txt", "no-np", true);
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SILENT_ABORT);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
@ -248,7 +248,7 @@ static void setup_job_complete(int fd, short args, void *cbdata)
|
||||
}
|
||||
}
|
||||
|
||||
/* set the job map to use the staged mapper */
|
||||
/* set the job map to use the staged_hnp mapper */
|
||||
if (NULL == jdata->map) {
|
||||
jdata->map = OBJ_NEW(orte_job_map_t);
|
||||
jdata->map->req_mapper = strdup("staged");
|
||||
@ -282,7 +282,7 @@ static void cleanup_node(orte_proc_t *proc)
|
||||
}
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_state_base_output,
|
||||
"%s state:staged:track_procs node %s has %d slots, %d slots inuse",
|
||||
"%s state:staged_hnp:track_procs node %s has %d slots, %d slots inuse",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name,
|
||||
(int)node->slots, (int)node->slots_inuse));
|
||||
}
|
||||
@ -296,7 +296,7 @@ static void track_procs(int fd, short args, void *cbdata)
|
||||
orte_proc_t *pdata;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_state_base_output,
|
||||
"%s state:staged:track_procs called for proc %s state %s",
|
||||
"%s state:staged_hnp:track_procs called for proc %s state %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc),
|
||||
orte_proc_state_to_str(state)));
|
||||
@ -316,7 +316,7 @@ static void track_procs(int fd, short args, void *cbdata)
|
||||
if (ORTE_PROC_STATE_REGISTERED == state) {
|
||||
if (pdata->mpi_proc && !jdata->gang_launched) {
|
||||
/* we can't support this - issue an error and abort */
|
||||
orte_show_help("help-state-staged.txt", "mpi-procs-not-supported", true);
|
||||
orte_show_help("help-state-staged-hnp.txt", "mpi-procs-not-supported", true);
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SILENT_ABORT);
|
||||
}
|
||||
/* update the proc state */
|
@ -14,8 +14,8 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef MCA_STATE_STAGED_EXPORT_H
|
||||
#define MCA_STATE_STAGED_EXPORT_H
|
||||
#ifndef MCA_STATE_STAGED_HNP_EXPORT_H
|
||||
#define MCA_STATE_STAGED_HNP_EXPORT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
@ -27,10 +27,10 @@ BEGIN_C_DECLS
|
||||
* Local Component structures
|
||||
*/
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_state_base_component_t mca_state_staged_component;
|
||||
ORTE_MODULE_DECLSPEC extern orte_state_base_component_t mca_state_staged_hnp_component;
|
||||
|
||||
ORTE_DECLSPEC extern orte_state_base_module_t orte_state_staged_module;
|
||||
ORTE_DECLSPEC extern orte_state_base_module_t orte_state_staged_hnp_module;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* MCA_STATE_STAGED_EXPORT_H */
|
||||
#endif /* MCA_STATE_STAGED_HNP_EXPORT_H */
|
@ -14,26 +14,26 @@
|
||||
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/mca/state/base/base.h"
|
||||
#include "state_staged.h"
|
||||
#include "state_staged_hnp.h"
|
||||
|
||||
/*
|
||||
* Public string for version number
|
||||
*/
|
||||
const char *orte_state_staged_component_version_string =
|
||||
"ORTE STATE staged MCA component version " ORTE_VERSION;
|
||||
const char *orte_state_staged_hnp_component_version_string =
|
||||
"ORTE STATE staged_hnp MCA component version " ORTE_VERSION;
|
||||
|
||||
/*
|
||||
* Local functionality
|
||||
*/
|
||||
static int state_staged_open(void);
|
||||
static int state_staged_close(void);
|
||||
static int state_staged_component_query(mca_base_module_t **module, int *priority);
|
||||
static int state_staged_hnp_open(void);
|
||||
static int state_staged_hnp_close(void);
|
||||
static int state_staged_hnp_component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
* and pointer to our public functions in it
|
||||
*/
|
||||
orte_state_base_component_t mca_state_staged_component =
|
||||
orte_state_base_component_t mca_state_staged_hnp_component =
|
||||
{
|
||||
/* Handle the general mca_component_t struct containing
|
||||
* meta information about the component
|
||||
@ -41,15 +41,15 @@ orte_state_base_component_t mca_state_staged_component =
|
||||
{
|
||||
ORTE_STATE_BASE_VERSION_1_0_0,
|
||||
/* Component name and version */
|
||||
"staged",
|
||||
"staged_hnp",
|
||||
ORTE_MAJOR_VERSION,
|
||||
ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION,
|
||||
|
||||
/* Component open and close functions */
|
||||
state_staged_open,
|
||||
state_staged_close,
|
||||
state_staged_component_query
|
||||
state_staged_hnp_open,
|
||||
state_staged_hnp_close,
|
||||
state_staged_hnp_component_query
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
@ -59,29 +59,28 @@ orte_state_base_component_t mca_state_staged_component =
|
||||
|
||||
static bool select_me = false;
|
||||
|
||||
static int state_staged_open(void)
|
||||
static int state_staged_hnp_open(void)
|
||||
{
|
||||
int tmp;
|
||||
mca_base_component_t *c=&mca_state_staged_component.base_version;
|
||||
|
||||
mca_base_param_reg_int(c, "select",
|
||||
"Use this component",
|
||||
false, false, (int)false, &tmp);
|
||||
mca_base_param_reg_int_name("state", "staged_select",
|
||||
"Use this component",
|
||||
false, false, (int)false, &tmp);
|
||||
select_me = OPAL_INT_TO_BOOL(tmp);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int state_staged_close(void)
|
||||
static int state_staged_hnp_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int state_staged_component_query(mca_base_module_t **module, int *priority)
|
||||
static int state_staged_hnp_component_query(mca_base_module_t **module, int *priority)
|
||||
{
|
||||
if (ORTE_PROC_IS_HNP && select_me) {
|
||||
*priority = 1000;
|
||||
*module = (mca_base_module_t *)&orte_state_staged_module;
|
||||
*module = (mca_base_module_t *)&orte_state_staged_hnp_module;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -8,30 +8,28 @@
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_pkgdata_DATA = help-state-staged.txt
|
||||
|
||||
sources = \
|
||||
state_staged.h \
|
||||
state_staged_component.c \
|
||||
state_staged.c
|
||||
state_staged_orted.h \
|
||||
state_staged_orted_component.c \
|
||||
state_staged_orted.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if MCA_BUILD_orte_state_staged_DSO
|
||||
if MCA_BUILD_orte_state_staged_orted_DSO
|
||||
component_noinst =
|
||||
component_install = mca_state_staged.la
|
||||
component_install = mca_state_staged_orted.la
|
||||
else
|
||||
component_noinst = libmca_state_staged.la
|
||||
component_noinst = libmca_state_staged_orted.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(pkglibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_state_staged_la_SOURCES = $(sources)
|
||||
mca_state_staged_la_LDFLAGS = -module -avoid-version
|
||||
mca_state_staged_orted_la_SOURCES = $(sources)
|
||||
mca_state_staged_orted_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_state_staged_la_SOURCES =$(sources)
|
||||
libmca_state_staged_la_LDFLAGS = -module -avoid-version
|
||||
libmca_state_staged_orted_la_SOURCES =$(sources)
|
||||
libmca_state_staged_orted_la_LDFLAGS = -module -avoid-version
|
342
orte/mca/state/staged_orted/state_staged_orted.c
Обычный файл
342
orte/mca/state/staged_orted/state_staged_orted.c
Обычный файл
@ -0,0 +1,342 @@
|
||||
/*
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <sys/types.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/iof/iof.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/mca/state/base/base.h"
|
||||
#include "orte/mca/state/base/state_private.h"
|
||||
#include "state_staged_orted.h"
|
||||
|
||||
/*
|
||||
* Module functions: Global
|
||||
*/
|
||||
static int init(void);
|
||||
static int finalize(void);
|
||||
|
||||
/******************
|
||||
* STAGED_ORTED module
|
||||
******************/
|
||||
/* Module function table for the staged_orted state component.
 * init and finalize are the static functions defined later in this
 * file; every remaining slot is filled with the corresponding
 * orte_state_base_* implementation. Entries are positional, so their
 * order must match the field order of orte_state_base_module_t.
 */
orte_state_base_module_t orte_state_staged_orted_module = {
|
||||
init,
|
||||
finalize,
|
||||
orte_state_base_activate_job_state,
|
||||
orte_state_base_add_job_state,
|
||||
orte_state_base_set_job_state_callback,
|
||||
orte_state_base_set_job_state_priority,
|
||||
orte_state_base_remove_job_state,
|
||||
orte_state_base_activate_proc_state,
|
||||
orte_state_base_add_proc_state,
|
||||
orte_state_base_set_proc_state_callback,
|
||||
orte_state_base_set_proc_state_priority,
|
||||
orte_state_base_remove_proc_state
|
||||
};
|
||||
|
||||
/* Local functions */
|
||||
static void track_jobs(int fd, short argc, void *cbdata);
|
||||
static void track_procs(int fd, short argc, void *cbdata);
|
||||
static int pack_state_update(opal_buffer_t *buf,
|
||||
orte_job_t *jdata,
|
||||
orte_proc_t *proc);
|
||||
|
||||
/* defined default state machines */
|
||||
/* Job states registered by init(). job_states[i] is registered with
 * job_callbacks[i], so the two arrays must stay the same length and
 * in the same order.
 */
static orte_job_state_t job_states[] = {
|
||||
ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE,
|
||||
};
|
||||
/* Callback for each entry of job_states, index for index. */
static orte_state_cbfunc_t job_callbacks[] = {
|
||||
track_jobs
|
||||
};
|
||||
|
||||
/* Proc states registered by init(). proc_states[i] is registered with
 * proc_callbacks[i]; all four are routed to track_procs, which
 * switches on the state it is called for.
 */
static orte_proc_state_t proc_states[] = {
|
||||
ORTE_PROC_STATE_RUNNING,
|
||||
ORTE_PROC_STATE_REGISTERED,
|
||||
ORTE_PROC_STATE_IOF_COMPLETE,
|
||||
ORTE_PROC_STATE_WAITPID_FIRED
|
||||
};
|
||||
/* Callback for each entry of proc_states, index for index. */
static orte_state_cbfunc_t proc_callbacks[] = {
|
||||
track_procs,
|
||||
track_procs,
|
||||
track_procs,
|
||||
track_procs
|
||||
};
|
||||
|
||||
/************************
|
||||
* API Definitions
|
||||
************************/
|
||||
static int init(void)
|
||||
{
|
||||
int num_states, i, rc;
|
||||
|
||||
/* setup the state machine */
|
||||
OBJ_CONSTRUCT(&orte_job_states, opal_list_t);
|
||||
OBJ_CONSTRUCT(&orte_proc_states, opal_list_t);
|
||||
|
||||
num_states = sizeof(job_states) / sizeof(orte_job_state_t);
|
||||
for (i=0; i < num_states; i++) {
|
||||
if (ORTE_SUCCESS != (rc = orte_state.add_job_state(job_states[i],
|
||||
job_callbacks[i],
|
||||
ORTE_SYS_PRI))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
/* add a default error response */
|
||||
if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_FORCED_EXIT,
|
||||
orte_quit, ORTE_ERROR_PRI))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
/* add a state for when we are ordered to terminate */
|
||||
if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_DAEMONS_TERMINATED,
|
||||
orte_quit, ORTE_ERROR_PRI))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
if (5 < opal_output_get_verbosity(orte_state_base_output)) {
|
||||
orte_state_base_print_job_state_machine();
|
||||
}
|
||||
|
||||
/* populate the proc state machine to allow us to
|
||||
* track proc lifecycle changes
|
||||
*/
|
||||
num_states = sizeof(proc_states) / sizeof(orte_proc_state_t);
|
||||
for (i=0; i < num_states; i++) {
|
||||
if (ORTE_SUCCESS != (rc = orte_state.add_proc_state(proc_states[i],
|
||||
proc_callbacks[i],
|
||||
ORTE_SYS_PRI))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
if (5 < opal_output_get_verbosity(orte_state_base_output)) {
|
||||
orte_state_base_print_proc_state_machine();
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int finalize(void)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
|
||||
/* cleanup the state machines */
|
||||
while (NULL != (item = opal_list_remove_first(&orte_job_states))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&orte_job_states);
|
||||
while (NULL != (item = opal_list_remove_first(&orte_proc_states))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&orte_proc_states);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static void track_jobs(int fd, short argc, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
|
||||
/* ignore this */
|
||||
OBJ_RELEASE(caddy);
|
||||
}
|
||||
|
||||
static void track_procs(int fd, short argc, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
orte_process_name_t *proc = &caddy->name;
|
||||
orte_proc_state_t state = caddy->proc_state;
|
||||
orte_job_t *jdata;
|
||||
orte_proc_t *pdata;
|
||||
opal_buffer_t *alert;
|
||||
int rc;
|
||||
orte_plm_cmd_flag_t cmd;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_state_base_output,
|
||||
"%s state:staged_orted:track_procs called for proc %s state %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc),
|
||||
orte_proc_state_to_str(state)));
|
||||
|
||||
/* get the job object for this proc */
|
||||
if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
goto cleanup;
|
||||
}
|
||||
pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);
|
||||
|
||||
switch (state) {
|
||||
case ORTE_PROC_STATE_RUNNING:
|
||||
/* update the proc state */
|
||||
pdata->state = state;
|
||||
jdata->num_launched++;
|
||||
/* we don't really care - nothing further to do */
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_REGISTERED:
|
||||
/* update the proc state */
|
||||
pdata->state = state;
|
||||
/* if this proc registered as an MPI proc, and
|
||||
* MPI is not allowed, then that is an error
|
||||
*/
|
||||
if (!jdata->gang_launched && pdata->mpi_proc) {
|
||||
/* abort the proc */
|
||||
/* notify the HNP of the error */
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_IOF_COMPLETE:
|
||||
/* do NOT update the proc state as this can hit
|
||||
* while we are still trying to notify the HNP of
|
||||
* successful launch for short-lived procs
|
||||
*/
|
||||
pdata->iof_complete = true;
|
||||
if (pdata->waitpid_recvd) {
|
||||
/* the proc has terminated */
|
||||
pdata->alive = false;
|
||||
pdata->state = ORTE_PROC_STATE_TERMINATED;
|
||||
/* Clean up the session directory as if we were the process
|
||||
* itself. This covers the case where the process died abnormally
|
||||
* and didn't cleanup its own session directory.
|
||||
*/
|
||||
orte_session_dir_finalize(proc);
|
||||
/* alert the HNP */
|
||||
cmd = ORTE_PLM_UPDATE_PROC_STATE;
|
||||
alert = OBJ_NEW(opal_buffer_t);
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
/* pack the info */
|
||||
if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata, pdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
/* send it */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_state_base_output,
|
||||
"%s SENDING TERMINATION UPDATE FOR PROC %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&pdata->name)));
|
||||
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
|
||||
ORTE_RML_TAG_PLM, 0,
|
||||
orte_rml_send_callback, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
/* Release the stdin IOF file descriptor for this child, if one
|
||||
* was defined. File descriptors for the other IOF channels - stdout,
|
||||
* stderr, and stddiag - were released when their associated pipes
|
||||
* were cleared and closed due to termination of the process
|
||||
* Do this after we handle termination in case the IOF needs
|
||||
* to check to see if all procs from the job are actually terminated
|
||||
*/
|
||||
if (NULL != orte_iof.close) {
|
||||
orte_iof.close(proc, ORTE_IOF_STDIN);
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_WAITPID_FIRED:
|
||||
/* do NOT update the proc state as this can hit
|
||||
* while we are still trying to notify the HNP of
|
||||
* successful launch for short-lived procs
|
||||
*/
|
||||
pdata->waitpid_recvd = true;
|
||||
if (pdata->iof_complete) {
|
||||
/* the proc has terminated */
|
||||
pdata->alive = false;
|
||||
pdata->state = ORTE_PROC_STATE_TERMINATED;
|
||||
/* Clean up the session directory as if we were the process
|
||||
* itself. This covers the case where the process died abnormally
|
||||
* and didn't cleanup its own session directory.
|
||||
*/
|
||||
orte_session_dir_finalize(proc);
|
||||
/* alert the HNP */
|
||||
cmd = ORTE_PLM_UPDATE_PROC_STATE;
|
||||
alert = OBJ_NEW(opal_buffer_t);
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
/* pack the info */
|
||||
if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata, pdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
/* send it */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_state_base_output,
|
||||
"%s SENDING TERMINATION UPDATE FOR PROC %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&pdata->name)));
|
||||
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
|
||||
ORTE_RML_TAG_PLM, 0,
|
||||
orte_rml_send_callback, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
/* ignore */
|
||||
break;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
OBJ_RELEASE(caddy);
|
||||
}
|
||||
|
||||
static int pack_state_update(opal_buffer_t *alert,
|
||||
orte_job_t *jdata,
|
||||
orte_proc_t *child)
|
||||
{
|
||||
int rc;
|
||||
orte_vpid_t null=ORTE_VPID_INVALID;
|
||||
|
||||
/* pack the jobid */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &jdata->jobid, 1, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* pack the child's vpid */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &(child->name.vpid), 1, ORTE_VPID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* pack the pid */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->pid, 1, OPAL_PID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* pack its state */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->state, 1, ORTE_PROC_STATE))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* pack its exit code */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->exit_code, 1, ORTE_EXIT_CODE))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* flag that this job is complete so the receiver can know */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
36
orte/mca/state/staged_orted/state_staged_orted.h
Обычный файл
36
orte/mca/state/staged_orted/state_staged_orted.h
Обычный файл
@ -0,0 +1,36 @@
|
||||
/*
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef MCA_STATE_STAGED_ORTED_EXPORT_H
|
||||
#define MCA_STATE_STAGED_ORTED_EXPORT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/state/state.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* Local Component structures
|
||||
*/
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_state_base_component_t mca_state_staged_orted_component;
|
||||
|
||||
ORTE_DECLSPEC extern orte_state_base_module_t orte_state_staged_orted_module;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* MCA_STATE_STAGED_ORTED_EXPORT_H */
|
91
orte/mca/state/staged_orted/state_staged_orted_component.c
Обычный файл
91
orte/mca/state/staged_orted/state_staged_orted_component.c
Обычный файл
@ -0,0 +1,91 @@
|
||||
/*
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/mca/state/base/base.h"
|
||||
#include "state_staged_orted.h"
|
||||
|
||||
/*
|
||||
* Public string for version number
|
||||
*/
|
||||
const char *orte_state_staged_orted_component_version_string =
|
||||
"ORTE STATE staged_orted MCA component version " ORTE_VERSION;
|
||||
|
||||
/*
|
||||
* Local functionality
|
||||
*/
|
||||
static int state_staged_orted_open(void);
|
||||
static int state_staged_orted_close(void);
|
||||
static int state_staged_orted_component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
* and pointer to our public functions in it
|
||||
*/
|
||||
/* Public component descriptor for "staged_orted". The initializer is
 * positional: the first nested brace holds the base version data
 * (framework version macro, component name, ORTE version numbers and
 * the open/close/query functions defined in this file), the second
 * holds the component metadata.
 */
orte_state_base_component_t mca_state_staged_orted_component =
|
||||
{
|
||||
/* Handle the general mca_component_t struct containing
|
||||
* meta information about the component
|
||||
*/
|
||||
{
|
||||
ORTE_STATE_BASE_VERSION_1_0_0,
|
||||
/* Component name and version */
|
||||
"staged_orted",
|
||||
ORTE_MAJOR_VERSION,
|
||||
ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION,
|
||||
|
||||
/* Component open and close functions */
|
||||
state_staged_orted_open,
|
||||
|
||||
state_staged_orted_close,
|
||||
|
||||
/* Query function: decides whether this component offers its module */
state_staged_orted_component_query
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
};
|
||||
|
||||
static bool select_me = false;
|
||||
|
||||
static int state_staged_orted_open(void)
|
||||
{
|
||||
int tmp;
|
||||
|
||||
mca_base_param_reg_int_name("state", "staged_select",
|
||||
"Use this component",
|
||||
false, false, (int)false, &tmp);
|
||||
select_me = OPAL_INT_TO_BOOL(tmp);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int state_staged_orted_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Component query function.
 *
 * Offers orte_state_staged_orted_module at priority 1000 when this
 * process is a daemon and select_me was set by the open function.
 * Otherwise sets *priority to -1, *module to NULL and returns
 * ORTE_ERROR.
 *
 * module   - out: module to use, or NULL
 * priority - out: selection priority
 */
static int state_staged_orted_component_query(mca_base_module_t **module, int *priority)
{
    /* not a daemon, or not asked for: decline */
    if (!(ORTE_PROC_IS_DAEMON && select_me)) {
        *priority = -1;
        *module = NULL;
        return ORTE_ERROR;
    }

    /* set our priority high */
    *priority = 1000;
    *module = (mca_base_module_t *)&orte_state_staged_orted_module;
    return ORTE_SUCCESS;
}
|
@ -217,12 +217,13 @@ int orte_dt_print_job(char **output, char *prefix, orte_job_t *src, opal_data_ty
|
||||
asprintf(&pfx2, "%s", prefix);
|
||||
}
|
||||
|
||||
asprintf(&tmp, "\n%sData for job: %s\tRecovery: %s(%s)\n%s\tNum apps: %ld\tControls: %0x\tStdin target: %s\tState: %s\tAbort: %s", pfx2,
|
||||
asprintf(&tmp, "\n%sData for job: %s\tRecovery: %s(%s)\n%s\tNum apps: %ld\tControls: %0x\tMPI allowed: %s\tStdin target: %s\tState: %s\tAbort: %s", pfx2,
|
||||
ORTE_JOBID_PRINT(src->jobid),
|
||||
(src->enable_recovery) ? "ENABLED" : "DISABLED",
|
||||
(src->recovery_defined) ? "DEFINED" : "DEFAULT",
|
||||
pfx2,
|
||||
(long)src->num_apps, src->controls, ORTE_VPID_PRINT(src->stdin_target),
|
||||
(long)src->num_apps, src->controls,
|
||||
src->gang_launched ? "YES" : "NO", ORTE_VPID_PRINT(src->stdin_target),
|
||||
orte_job_state_to_str(src->state), src->abort ? "True" : "False");
|
||||
asprintf(&pfx, "%s\t", pfx2);
|
||||
free(pfx2);
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user