Add an orted component for staged operations and rename the staged component to "staged_hnp".
This commit was SVN r27305.
Этот коммит содержится в:
родитель
387f657fc2
Коммит
a0ffeb205a
@ -38,24 +38,21 @@ static int init(void);
|
|||||||
static int finalize(void);
|
static int finalize(void);
|
||||||
|
|
||||||
/******************
|
/******************
|
||||||
* ORTED module - just uses base functions after
|
* ORTED module
|
||||||
* initializing the proc state machine. Job state
|
|
||||||
* machine is unused by ortedlication procs at this
|
|
||||||
* time.
|
|
||||||
******************/
|
******************/
|
||||||
orte_state_base_module_t orte_state_orted_module = {
|
orte_state_base_module_t orte_state_orted_module = {
|
||||||
init,
|
init,
|
||||||
finalize,
|
finalize,
|
||||||
orte_state_base_activate_job_state,
|
orte_state_base_activate_job_state,
|
||||||
orte_state_base_add_job_state,
|
orte_state_base_add_job_state,
|
||||||
orte_state_base_set_job_state_callback,
|
orte_state_base_set_job_state_callback,
|
||||||
orte_state_base_set_job_state_priority,
|
orte_state_base_set_job_state_priority,
|
||||||
orte_state_base_remove_job_state,
|
orte_state_base_remove_job_state,
|
||||||
orte_state_base_activate_proc_state,
|
orte_state_base_activate_proc_state,
|
||||||
orte_state_base_add_proc_state,
|
orte_state_base_add_proc_state,
|
||||||
orte_state_base_set_proc_state_callback,
|
orte_state_base_set_proc_state_callback,
|
||||||
orte_state_base_set_proc_state_priority,
|
orte_state_base_set_proc_state_priority,
|
||||||
orte_state_base_remove_proc_state
|
orte_state_base_remove_proc_state
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Local functions */
|
/* Local functions */
|
||||||
@ -73,16 +70,16 @@ static orte_state_cbfunc_t job_callbacks[] = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
static orte_proc_state_t proc_states[] = {
|
static orte_proc_state_t proc_states[] = {
|
||||||
ORTE_PROC_STATE_RUNNING,
|
ORTE_PROC_STATE_RUNNING,
|
||||||
ORTE_PROC_STATE_REGISTERED,
|
ORTE_PROC_STATE_REGISTERED,
|
||||||
ORTE_PROC_STATE_IOF_COMPLETE,
|
ORTE_PROC_STATE_IOF_COMPLETE,
|
||||||
ORTE_PROC_STATE_WAITPID_FIRED
|
ORTE_PROC_STATE_WAITPID_FIRED
|
||||||
};
|
};
|
||||||
static orte_state_cbfunc_t proc_callbacks[] = {
|
static orte_state_cbfunc_t proc_callbacks[] = {
|
||||||
track_procs,
|
track_procs,
|
||||||
track_procs,
|
track_procs,
|
||||||
track_procs,
|
track_procs,
|
||||||
track_procs
|
track_procs
|
||||||
};
|
};
|
||||||
|
|
||||||
/************************
|
/************************
|
||||||
|
37
orte/mca/state/staged_hnp/Makefile.am
Обычный файл
37
orte/mca/state/staged_hnp/Makefile.am
Обычный файл
@ -0,0 +1,37 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2012 Los Alamos National Security, LLC.
|
||||||
|
# All rights reserved.
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
|
||||||
|
dist_pkgdata_DATA = help-state-staged-hnp.txt
|
||||||
|
|
||||||
|
sources = \
|
||||||
|
state_staged_hnp.h \
|
||||||
|
state_staged_hnp_component.c \
|
||||||
|
state_staged_hnp.c
|
||||||
|
|
||||||
|
# Make the output library in this directory, and name it either
|
||||||
|
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||||
|
# (for static builds).
|
||||||
|
|
||||||
|
if MCA_BUILD_orte_state_staged_hnp_DSO
|
||||||
|
component_noinst =
|
||||||
|
component_install = mca_state_staged_hnp.la
|
||||||
|
else
|
||||||
|
component_noinst = libmca_state_staged_hnp.la
|
||||||
|
component_install =
|
||||||
|
endif
|
||||||
|
|
||||||
|
mcacomponentdir = $(pkglibdir)
|
||||||
|
mcacomponent_LTLIBRARIES = $(component_install)
|
||||||
|
mca_state_staged_hnp_la_SOURCES = $(sources)
|
||||||
|
mca_state_staged_hnp_la_LDFLAGS = -module -avoid-version
|
||||||
|
|
||||||
|
noinst_LTLIBRARIES = $(component_noinst)
|
||||||
|
libmca_state_staged_hnp_la_SOURCES =$(sources)
|
||||||
|
libmca_state_staged_hnp_la_LDFLAGS = -module -avoid-version
|
@ -34,7 +34,7 @@
|
|||||||
#include "orte/mca/state/state.h"
|
#include "orte/mca/state/state.h"
|
||||||
#include "orte/mca/state/base/base.h"
|
#include "orte/mca/state/base/base.h"
|
||||||
#include "orte/mca/state/base/state_private.h"
|
#include "orte/mca/state/base/state_private.h"
|
||||||
#include "state_staged.h"
|
#include "state_staged_hnp.h"
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Module functions: Global
|
* Module functions: Global
|
||||||
@ -45,7 +45,7 @@ static int finalize(void);
|
|||||||
/******************
|
/******************
|
||||||
* STAGED module
|
* STAGED module
|
||||||
******************/
|
******************/
|
||||||
orte_state_base_module_t orte_state_staged_module = {
|
orte_state_base_module_t orte_state_staged_hnp_module = {
|
||||||
init,
|
init,
|
||||||
finalize,
|
finalize,
|
||||||
orte_state_base_activate_job_state,
|
orte_state_base_activate_job_state,
|
||||||
@ -107,7 +107,7 @@ static orte_state_cbfunc_t launch_callbacks[] = {
|
|||||||
orte_quit
|
orte_quit
|
||||||
};
|
};
|
||||||
|
|
||||||
/* staged execution requires that we start as many
|
/* staged_hnp execution requires that we start as many
|
||||||
* procs initially as we have resources - if we have
|
* procs initially as we have resources - if we have
|
||||||
* adequate resources, then we behave just like the
|
* adequate resources, then we behave just like the
|
||||||
* default HNP module. If we don't, then we will have
|
* default HNP module. If we don't, then we will have
|
||||||
@ -218,8 +218,8 @@ static void setup_job_complete(int fd, short args, void *cbdata)
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (app->num_procs <= 0) {
|
if (app->num_procs <= 0) {
|
||||||
/* must specify -np for staged execution */
|
/* must specify -np for staged_hnp execution */
|
||||||
orte_show_help("help-state-staged.txt", "no-np", true);
|
orte_show_help("help-state-staged-hnp.txt", "no-np", true);
|
||||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SILENT_ABORT);
|
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SILENT_ABORT);
|
||||||
OBJ_RELEASE(caddy);
|
OBJ_RELEASE(caddy);
|
||||||
return;
|
return;
|
||||||
@ -248,7 +248,7 @@ static void setup_job_complete(int fd, short args, void *cbdata)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* set the job map to use the staged mapper */
|
/* set the job map to use the staged_hnp mapper */
|
||||||
if (NULL == jdata->map) {
|
if (NULL == jdata->map) {
|
||||||
jdata->map = OBJ_NEW(orte_job_map_t);
|
jdata->map = OBJ_NEW(orte_job_map_t);
|
||||||
jdata->map->req_mapper = strdup("staged");
|
jdata->map->req_mapper = strdup("staged");
|
||||||
@ -282,7 +282,7 @@ static void cleanup_node(orte_proc_t *proc)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_state_base_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_state_base_output,
|
||||||
"%s state:staged:track_procs node %s has %d slots, %d slots inuse",
|
"%s state:staged_hnp:track_procs node %s has %d slots, %d slots inuse",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name,
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name,
|
||||||
(int)node->slots, (int)node->slots_inuse));
|
(int)node->slots, (int)node->slots_inuse));
|
||||||
}
|
}
|
||||||
@ -296,7 +296,7 @@ static void track_procs(int fd, short args, void *cbdata)
|
|||||||
orte_proc_t *pdata;
|
orte_proc_t *pdata;
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_state_base_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_state_base_output,
|
||||||
"%s state:staged:track_procs called for proc %s state %s",
|
"%s state:staged_hnp:track_procs called for proc %s state %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(proc),
|
ORTE_NAME_PRINT(proc),
|
||||||
orte_proc_state_to_str(state)));
|
orte_proc_state_to_str(state)));
|
||||||
@ -316,7 +316,7 @@ static void track_procs(int fd, short args, void *cbdata)
|
|||||||
if (ORTE_PROC_STATE_REGISTERED == state) {
|
if (ORTE_PROC_STATE_REGISTERED == state) {
|
||||||
if (pdata->mpi_proc && !jdata->gang_launched) {
|
if (pdata->mpi_proc && !jdata->gang_launched) {
|
||||||
/* we can't support this - issue an error and abort */
|
/* we can't support this - issue an error and abort */
|
||||||
orte_show_help("help-state-staged.txt", "mpi-procs-not-supported", true);
|
orte_show_help("help-state-staged-hnp.txt", "mpi-procs-not-supported", true);
|
||||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SILENT_ABORT);
|
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SILENT_ABORT);
|
||||||
}
|
}
|
||||||
/* update the proc state */
|
/* update the proc state */
|
@ -14,8 +14,8 @@
|
|||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef MCA_STATE_STAGED_EXPORT_H
|
#ifndef MCA_STATE_STAGED_HNP_EXPORT_H
|
||||||
#define MCA_STATE_STAGED_EXPORT_H
|
#define MCA_STATE_STAGED_HNP_EXPORT_H
|
||||||
|
|
||||||
#include "orte_config.h"
|
#include "orte_config.h"
|
||||||
|
|
||||||
@ -27,10 +27,10 @@ BEGIN_C_DECLS
|
|||||||
* Local Component structures
|
* Local Component structures
|
||||||
*/
|
*/
|
||||||
|
|
||||||
ORTE_MODULE_DECLSPEC extern orte_state_base_component_t mca_state_staged_component;
|
ORTE_MODULE_DECLSPEC extern orte_state_base_component_t mca_state_staged_hnp_component;
|
||||||
|
|
||||||
ORTE_DECLSPEC extern orte_state_base_module_t orte_state_staged_module;
|
ORTE_DECLSPEC extern orte_state_base_module_t orte_state_staged_hnp_module;
|
||||||
|
|
||||||
END_C_DECLS
|
END_C_DECLS
|
||||||
|
|
||||||
#endif /* MCA_STATE_STAGED_EXPORT_H */
|
#endif /* MCA_STATE_STAGED_HNP_EXPORT_H */
|
@ -14,26 +14,26 @@
|
|||||||
|
|
||||||
#include "orte/mca/state/state.h"
|
#include "orte/mca/state/state.h"
|
||||||
#include "orte/mca/state/base/base.h"
|
#include "orte/mca/state/base/base.h"
|
||||||
#include "state_staged.h"
|
#include "state_staged_hnp.h"
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Public string for version number
|
* Public string for version number
|
||||||
*/
|
*/
|
||||||
const char *orte_state_staged_component_version_string =
|
const char *orte_state_staged_hnp_component_version_string =
|
||||||
"ORTE STATE staged MCA component version " ORTE_VERSION;
|
"ORTE STATE staged_hnp MCA component version " ORTE_VERSION;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Local functionality
|
* Local functionality
|
||||||
*/
|
*/
|
||||||
static int state_staged_open(void);
|
static int state_staged_hnp_open(void);
|
||||||
static int state_staged_close(void);
|
static int state_staged_hnp_close(void);
|
||||||
static int state_staged_component_query(mca_base_module_t **module, int *priority);
|
static int state_staged_hnp_component_query(mca_base_module_t **module, int *priority);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Instantiate the public struct with all of our public information
|
* Instantiate the public struct with all of our public information
|
||||||
* and pointer to our public functions in it
|
* and pointer to our public functions in it
|
||||||
*/
|
*/
|
||||||
orte_state_base_component_t mca_state_staged_component =
|
orte_state_base_component_t mca_state_staged_hnp_component =
|
||||||
{
|
{
|
||||||
/* Handle the general mca_component_t struct containing
|
/* Handle the general mca_component_t struct containing
|
||||||
* meta information about the component
|
* meta information about the component
|
||||||
@ -41,15 +41,15 @@ orte_state_base_component_t mca_state_staged_component =
|
|||||||
{
|
{
|
||||||
ORTE_STATE_BASE_VERSION_1_0_0,
|
ORTE_STATE_BASE_VERSION_1_0_0,
|
||||||
/* Component name and version */
|
/* Component name and version */
|
||||||
"staged",
|
"staged_hnp",
|
||||||
ORTE_MAJOR_VERSION,
|
ORTE_MAJOR_VERSION,
|
||||||
ORTE_MINOR_VERSION,
|
ORTE_MINOR_VERSION,
|
||||||
ORTE_RELEASE_VERSION,
|
ORTE_RELEASE_VERSION,
|
||||||
|
|
||||||
/* Component open and close functions */
|
/* Component open and close functions */
|
||||||
state_staged_open,
|
state_staged_hnp_open,
|
||||||
state_staged_close,
|
state_staged_hnp_close,
|
||||||
state_staged_component_query
|
state_staged_hnp_component_query
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
/* The component is checkpoint ready */
|
/* The component is checkpoint ready */
|
||||||
@ -59,29 +59,28 @@ orte_state_base_component_t mca_state_staged_component =
|
|||||||
|
|
||||||
static bool select_me = false;
|
static bool select_me = false;
|
||||||
|
|
||||||
static int state_staged_open(void)
|
static int state_staged_hnp_open(void)
|
||||||
{
|
{
|
||||||
int tmp;
|
int tmp;
|
||||||
mca_base_component_t *c=&mca_state_staged_component.base_version;
|
|
||||||
|
|
||||||
mca_base_param_reg_int(c, "select",
|
mca_base_param_reg_int_name("state", "staged_select",
|
||||||
"Use this component",
|
"Use this component",
|
||||||
false, false, (int)false, &tmp);
|
false, false, (int)false, &tmp);
|
||||||
select_me = OPAL_INT_TO_BOOL(tmp);
|
select_me = OPAL_INT_TO_BOOL(tmp);
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int state_staged_close(void)
|
static int state_staged_hnp_close(void)
|
||||||
{
|
{
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int state_staged_component_query(mca_base_module_t **module, int *priority)
|
static int state_staged_hnp_component_query(mca_base_module_t **module, int *priority)
|
||||||
{
|
{
|
||||||
if (ORTE_PROC_IS_HNP && select_me) {
|
if (ORTE_PROC_IS_HNP && select_me) {
|
||||||
*priority = 1000;
|
*priority = 1000;
|
||||||
*module = (mca_base_module_t *)&orte_state_staged_module;
|
*module = (mca_base_module_t *)&orte_state_staged_hnp_module;
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
@ -8,30 +8,28 @@
|
|||||||
# $HEADER$
|
# $HEADER$
|
||||||
#
|
#
|
||||||
|
|
||||||
dist_pkgdata_DATA = help-state-staged.txt
|
|
||||||
|
|
||||||
sources = \
|
sources = \
|
||||||
state_staged.h \
|
state_staged_orted.h \
|
||||||
state_staged_component.c \
|
state_staged_orted_component.c \
|
||||||
state_staged.c
|
state_staged_orted.c
|
||||||
|
|
||||||
# Make the output library in this directory, and name it either
|
# Make the output library in this directory, and name it either
|
||||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||||
# (for static builds).
|
# (for static builds).
|
||||||
|
|
||||||
if MCA_BUILD_orte_state_staged_DSO
|
if MCA_BUILD_orte_state_staged_orted_DSO
|
||||||
component_noinst =
|
component_noinst =
|
||||||
component_install = mca_state_staged.la
|
component_install = mca_state_staged_orted.la
|
||||||
else
|
else
|
||||||
component_noinst = libmca_state_staged.la
|
component_noinst = libmca_state_staged_orted.la
|
||||||
component_install =
|
component_install =
|
||||||
endif
|
endif
|
||||||
|
|
||||||
mcacomponentdir = $(pkglibdir)
|
mcacomponentdir = $(pkglibdir)
|
||||||
mcacomponent_LTLIBRARIES = $(component_install)
|
mcacomponent_LTLIBRARIES = $(component_install)
|
||||||
mca_state_staged_la_SOURCES = $(sources)
|
mca_state_staged_orted_la_SOURCES = $(sources)
|
||||||
mca_state_staged_la_LDFLAGS = -module -avoid-version
|
mca_state_staged_orted_la_LDFLAGS = -module -avoid-version
|
||||||
|
|
||||||
noinst_LTLIBRARIES = $(component_noinst)
|
noinst_LTLIBRARIES = $(component_noinst)
|
||||||
libmca_state_staged_la_SOURCES =$(sources)
|
libmca_state_staged_orted_la_SOURCES =$(sources)
|
||||||
libmca_state_staged_la_LDFLAGS = -module -avoid-version
|
libmca_state_staged_orted_la_LDFLAGS = -module -avoid-version
|
342
orte/mca/state/staged_orted/state_staged_orted.c
Обычный файл
342
orte/mca/state/staged_orted/state_staged_orted.c
Обычный файл
@ -0,0 +1,342 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||||
|
* All rights reserved.
|
||||||
|
* $COPYRIGHT$
|
||||||
|
*
|
||||||
|
* Additional copyrights may follow
|
||||||
|
*
|
||||||
|
* $HEADER$
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "orte_config.h"
|
||||||
|
|
||||||
|
#include <sys/types.h>
|
||||||
|
#ifdef HAVE_UNISTD_H
|
||||||
|
#include <unistd.h>
|
||||||
|
#endif /* HAVE_UNISTD_H */
|
||||||
|
#ifdef HAVE_STRING_H
|
||||||
|
#include <string.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "opal/util/output.h"
|
||||||
|
|
||||||
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
|
#include "orte/mca/iof/iof.h"
|
||||||
|
#include "orte/mca/rml/rml.h"
|
||||||
|
#include "orte/util/session_dir.h"
|
||||||
|
#include "orte/runtime/orte_quit.h"
|
||||||
|
|
||||||
|
#include "orte/mca/state/state.h"
|
||||||
|
#include "orte/mca/state/base/base.h"
|
||||||
|
#include "orte/mca/state/base/state_private.h"
|
||||||
|
#include "state_staged_orted.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Module functions: Global
|
||||||
|
*/
|
||||||
|
static int init(void);
|
||||||
|
static int finalize(void);
|
||||||
|
|
||||||
|
/******************
|
||||||
|
* STAGED_ORTED module
|
||||||
|
******************/
|
||||||
|
orte_state_base_module_t orte_state_staged_orted_module = {
|
||||||
|
init,
|
||||||
|
finalize,
|
||||||
|
orte_state_base_activate_job_state,
|
||||||
|
orte_state_base_add_job_state,
|
||||||
|
orte_state_base_set_job_state_callback,
|
||||||
|
orte_state_base_set_job_state_priority,
|
||||||
|
orte_state_base_remove_job_state,
|
||||||
|
orte_state_base_activate_proc_state,
|
||||||
|
orte_state_base_add_proc_state,
|
||||||
|
orte_state_base_set_proc_state_callback,
|
||||||
|
orte_state_base_set_proc_state_priority,
|
||||||
|
orte_state_base_remove_proc_state
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Local functions */
|
||||||
|
static void track_jobs(int fd, short argc, void *cbdata);
|
||||||
|
static void track_procs(int fd, short argc, void *cbdata);
|
||||||
|
static int pack_state_update(opal_buffer_t *buf,
|
||||||
|
orte_job_t *jdata,
|
||||||
|
orte_proc_t *proc);
|
||||||
|
|
||||||
|
/* defined default state machines */
|
||||||
|
static orte_job_state_t job_states[] = {
|
||||||
|
ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE,
|
||||||
|
};
|
||||||
|
static orte_state_cbfunc_t job_callbacks[] = {
|
||||||
|
track_jobs
|
||||||
|
};
|
||||||
|
|
||||||
|
static orte_proc_state_t proc_states[] = {
|
||||||
|
ORTE_PROC_STATE_RUNNING,
|
||||||
|
ORTE_PROC_STATE_REGISTERED,
|
||||||
|
ORTE_PROC_STATE_IOF_COMPLETE,
|
||||||
|
ORTE_PROC_STATE_WAITPID_FIRED
|
||||||
|
};
|
||||||
|
static orte_state_cbfunc_t proc_callbacks[] = {
|
||||||
|
track_procs,
|
||||||
|
track_procs,
|
||||||
|
track_procs,
|
||||||
|
track_procs
|
||||||
|
};
|
||||||
|
|
||||||
|
/************************
|
||||||
|
* API Definitions
|
||||||
|
************************/
|
||||||
|
static int init(void)
|
||||||
|
{
|
||||||
|
int num_states, i, rc;
|
||||||
|
|
||||||
|
/* setup the state machine */
|
||||||
|
OBJ_CONSTRUCT(&orte_job_states, opal_list_t);
|
||||||
|
OBJ_CONSTRUCT(&orte_proc_states, opal_list_t);
|
||||||
|
|
||||||
|
num_states = sizeof(job_states) / sizeof(orte_job_state_t);
|
||||||
|
for (i=0; i < num_states; i++) {
|
||||||
|
if (ORTE_SUCCESS != (rc = orte_state.add_job_state(job_states[i],
|
||||||
|
job_callbacks[i],
|
||||||
|
ORTE_SYS_PRI))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* add a default error response */
|
||||||
|
if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_FORCED_EXIT,
|
||||||
|
orte_quit, ORTE_ERROR_PRI))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
}
|
||||||
|
/* add a state for when we are ordered to terminate */
|
||||||
|
if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_DAEMONS_TERMINATED,
|
||||||
|
orte_quit, ORTE_ERROR_PRI))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
}
|
||||||
|
if (5 < opal_output_get_verbosity(orte_state_base_output)) {
|
||||||
|
orte_state_base_print_job_state_machine();
|
||||||
|
}
|
||||||
|
|
||||||
|
/* populate the proc state machine to allow us to
|
||||||
|
* track proc lifecycle changes
|
||||||
|
*/
|
||||||
|
num_states = sizeof(proc_states) / sizeof(orte_proc_state_t);
|
||||||
|
for (i=0; i < num_states; i++) {
|
||||||
|
if (ORTE_SUCCESS != (rc = orte_state.add_proc_state(proc_states[i],
|
||||||
|
proc_callbacks[i],
|
||||||
|
ORTE_SYS_PRI))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (5 < opal_output_get_verbosity(orte_state_base_output)) {
|
||||||
|
orte_state_base_print_proc_state_machine();
|
||||||
|
}
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int finalize(void)
|
||||||
|
{
|
||||||
|
opal_list_item_t *item;
|
||||||
|
|
||||||
|
/* cleanup the state machines */
|
||||||
|
while (NULL != (item = opal_list_remove_first(&orte_job_states))) {
|
||||||
|
OBJ_RELEASE(item);
|
||||||
|
}
|
||||||
|
OBJ_DESTRUCT(&orte_job_states);
|
||||||
|
while (NULL != (item = opal_list_remove_first(&orte_proc_states))) {
|
||||||
|
OBJ_RELEASE(item);
|
||||||
|
}
|
||||||
|
OBJ_DESTRUCT(&orte_proc_states);
|
||||||
|
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void track_jobs(int fd, short argc, void *cbdata)
|
||||||
|
{
|
||||||
|
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||||
|
|
||||||
|
/* ignore this */
|
||||||
|
OBJ_RELEASE(caddy);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void track_procs(int fd, short argc, void *cbdata)
|
||||||
|
{
|
||||||
|
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||||
|
orte_process_name_t *proc = &caddy->name;
|
||||||
|
orte_proc_state_t state = caddy->proc_state;
|
||||||
|
orte_job_t *jdata;
|
||||||
|
orte_proc_t *pdata;
|
||||||
|
opal_buffer_t *alert;
|
||||||
|
int rc;
|
||||||
|
orte_plm_cmd_flag_t cmd;
|
||||||
|
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_state_base_output,
|
||||||
|
"%s state:staged_orted:track_procs called for proc %s state %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(proc),
|
||||||
|
orte_proc_state_to_str(state)));
|
||||||
|
|
||||||
|
/* get the job object for this proc */
|
||||||
|
if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);
|
||||||
|
|
||||||
|
switch (state) {
|
||||||
|
case ORTE_PROC_STATE_RUNNING:
|
||||||
|
/* update the proc state */
|
||||||
|
pdata->state = state;
|
||||||
|
jdata->num_launched++;
|
||||||
|
/* we don't really care - nothing further to do */
|
||||||
|
break;
|
||||||
|
|
||||||
|
case ORTE_PROC_STATE_REGISTERED:
|
||||||
|
/* update the proc state */
|
||||||
|
pdata->state = state;
|
||||||
|
/* if this proc registered as an MPI proc, and
|
||||||
|
* MPI is not allowed, then that is an error
|
||||||
|
*/
|
||||||
|
if (!jdata->gang_launched && pdata->mpi_proc) {
|
||||||
|
/* abort the proc */
|
||||||
|
/* notify the HNP of the error */
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
case ORTE_PROC_STATE_IOF_COMPLETE:
|
||||||
|
/* do NOT update the proc state as this can hit
|
||||||
|
* while we are still trying to notify the HNP of
|
||||||
|
* successful launch for short-lived procs
|
||||||
|
*/
|
||||||
|
pdata->iof_complete = true;
|
||||||
|
if (pdata->waitpid_recvd) {
|
||||||
|
/* the proc has terminated */
|
||||||
|
pdata->alive = false;
|
||||||
|
pdata->state = ORTE_PROC_STATE_TERMINATED;
|
||||||
|
/* Clean up the session directory as if we were the process
|
||||||
|
* itself. This covers the case where the process died abnormally
|
||||||
|
* and didn't cleanup its own session directory.
|
||||||
|
*/
|
||||||
|
orte_session_dir_finalize(proc);
|
||||||
|
/* alert the HNP */
|
||||||
|
cmd = ORTE_PLM_UPDATE_PROC_STATE;
|
||||||
|
alert = OBJ_NEW(opal_buffer_t);
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
/* pack the info */
|
||||||
|
if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata, pdata))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
}
|
||||||
|
/* send it */
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_state_base_output,
|
||||||
|
"%s SENDING TERMINATION UPDATE FOR PROC %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(&pdata->name)));
|
||||||
|
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
|
||||||
|
ORTE_RML_TAG_PLM, 0,
|
||||||
|
orte_rml_send_callback, NULL))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* Release the stdin IOF file descriptor for this child, if one
|
||||||
|
* was defined. File descriptors for the other IOF channels - stdout,
|
||||||
|
* stderr, and stddiag - were released when their associated pipes
|
||||||
|
* were cleared and closed due to termination of the process
|
||||||
|
* Do this after we handle termination in case the IOF needs
|
||||||
|
* to check to see if all procs from the job are actually terminated
|
||||||
|
*/
|
||||||
|
if (NULL != orte_iof.close) {
|
||||||
|
orte_iof.close(proc, ORTE_IOF_STDIN);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
case ORTE_PROC_STATE_WAITPID_FIRED:
|
||||||
|
/* do NOT update the proc state as this can hit
|
||||||
|
* while we are still trying to notify the HNP of
|
||||||
|
* successful launch for short-lived procs
|
||||||
|
*/
|
||||||
|
pdata->waitpid_recvd = true;
|
||||||
|
if (pdata->iof_complete) {
|
||||||
|
/* the proc has terminated */
|
||||||
|
pdata->alive = false;
|
||||||
|
pdata->state = ORTE_PROC_STATE_TERMINATED;
|
||||||
|
/* Clean up the session directory as if we were the process
|
||||||
|
* itself. This covers the case where the process died abnormally
|
||||||
|
* and didn't cleanup its own session directory.
|
||||||
|
*/
|
||||||
|
orte_session_dir_finalize(proc);
|
||||||
|
/* alert the HNP */
|
||||||
|
cmd = ORTE_PLM_UPDATE_PROC_STATE;
|
||||||
|
alert = OBJ_NEW(opal_buffer_t);
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
/* pack the info */
|
||||||
|
if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata, pdata))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
}
|
||||||
|
/* send it */
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_state_base_output,
|
||||||
|
"%s SENDING TERMINATION UPDATE FOR PROC %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(&pdata->name)));
|
||||||
|
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
|
||||||
|
ORTE_RML_TAG_PLM, 0,
|
||||||
|
orte_rml_send_callback, NULL))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
/* ignore */
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
cleanup:
|
||||||
|
OBJ_RELEASE(caddy);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int pack_state_update(opal_buffer_t *alert,
|
||||||
|
orte_job_t *jdata,
|
||||||
|
orte_proc_t *child)
|
||||||
|
{
|
||||||
|
int rc;
|
||||||
|
orte_vpid_t null=ORTE_VPID_INVALID;
|
||||||
|
|
||||||
|
/* pack the jobid */
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &jdata->jobid, 1, ORTE_JOBID))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
/* pack the child's vpid */
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &(child->name.vpid), 1, ORTE_VPID))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
/* pack the pid */
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->pid, 1, OPAL_PID))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
/* pack its state */
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->state, 1, ORTE_PROC_STATE))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
/* pack its exit code */
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->exit_code, 1, ORTE_EXIT_CODE))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* flag that this job is complete so the receiver can know */
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
36
orte/mca/state/staged_orted/state_staged_orted.h
Обычный файл
36
orte/mca/state/staged_orted/state_staged_orted.h
Обычный файл
@ -0,0 +1,36 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2012 Los Alamos National Security, LLC.
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* $COPYRIGHT$
|
||||||
|
*
|
||||||
|
* Additional copyrights may follow
|
||||||
|
*
|
||||||
|
* $HEADER$
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @file
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef MCA_STATE_STAGED_ORTED_EXPORT_H
|
||||||
|
#define MCA_STATE_STAGED_ORTED_EXPORT_H
|
||||||
|
|
||||||
|
#include "orte_config.h"
|
||||||
|
|
||||||
|
#include "orte/mca/state/state.h"
|
||||||
|
|
||||||
|
BEGIN_C_DECLS
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Local Component structures
|
||||||
|
*/
|
||||||
|
|
||||||
|
ORTE_MODULE_DECLSPEC extern orte_state_base_component_t mca_state_staged_orted_component;
|
||||||
|
|
||||||
|
ORTE_DECLSPEC extern orte_state_base_module_t orte_state_staged_orted_module;
|
||||||
|
|
||||||
|
END_C_DECLS
|
||||||
|
|
||||||
|
#endif /* MCA_STATE_STAGED_ORTED_EXPORT_H */
|
91
orte/mca/state/staged_orted/state_staged_orted_component.c
Обычный файл
91
orte/mca/state/staged_orted/state_staged_orted_component.c
Обычный файл
@ -0,0 +1,91 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2012 Los Alamos National Security, LLC.
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* $COPYRIGHT$
|
||||||
|
*
|
||||||
|
* Additional copyrights may follow
|
||||||
|
*
|
||||||
|
* $HEADER$
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "orte_config.h"
|
||||||
|
#include "opal/util/output.h"
|
||||||
|
|
||||||
|
#include "orte/mca/state/state.h"
|
||||||
|
#include "orte/mca/state/base/base.h"
|
||||||
|
#include "state_staged_orted.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Public string for version number
|
||||||
|
*/
|
||||||
|
const char *orte_state_staged_orted_component_version_string =
|
||||||
|
"ORTE STATE staged_orted MCA component version " ORTE_VERSION;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Local functionality
|
||||||
|
*/
|
||||||
|
static int state_staged_orted_open(void);
|
||||||
|
static int state_staged_orted_close(void);
|
||||||
|
static int state_staged_orted_component_query(mca_base_module_t **module, int *priority);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Instantiate the public struct with all of our public information
|
||||||
|
* and pointer to our public functions in it
|
||||||
|
*/
|
||||||
|
orte_state_base_component_t mca_state_staged_orted_component =
|
||||||
|
{
|
||||||
|
/* Handle the general mca_component_t struct containing
|
||||||
|
* meta information about the component
|
||||||
|
*/
|
||||||
|
{
|
||||||
|
ORTE_STATE_BASE_VERSION_1_0_0,
|
||||||
|
/* Component name and version */
|
||||||
|
"staged_orted",
|
||||||
|
ORTE_MAJOR_VERSION,
|
||||||
|
ORTE_MINOR_VERSION,
|
||||||
|
ORTE_RELEASE_VERSION,
|
||||||
|
|
||||||
|
/* Component open and close functions */
|
||||||
|
state_staged_orted_open,
|
||||||
|
state_staged_orted_close,
|
||||||
|
state_staged_orted_component_query
|
||||||
|
},
|
||||||
|
{
|
||||||
|
/* The component is checkpoint ready */
|
||||||
|
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
static bool select_me = false;
|
||||||
|
|
||||||
|
static int state_staged_orted_open(void)
|
||||||
|
{
|
||||||
|
int tmp;
|
||||||
|
|
||||||
|
mca_base_param_reg_int_name("state", "staged_select",
|
||||||
|
"Use this component",
|
||||||
|
false, false, (int)false, &tmp);
|
||||||
|
select_me = OPAL_INT_TO_BOOL(tmp);
|
||||||
|
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int state_staged_orted_close(void)
|
||||||
|
{
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Component query: offer the staged_orted module only when this process
 * is a daemon AND the staged_select parameter was set.
 *
 * module   - out: the module to use, or NULL when declining
 * priority - out: 1000 when offering the module, -1 when declining
 *
 * Returns ORTE_SUCCESS when the module is offered, ORTE_ERROR otherwise.
 */
static int state_staged_orted_component_query(mca_base_module_t **module, int *priority)
{
    /* decline unless both conditions hold */
    if (!(ORTE_PROC_IS_DAEMON) || !select_me) {
        *priority = -1;
        *module = NULL;
        return ORTE_ERROR;
    }

    /* set our priority high */
    *priority = 1000;
    *module = (mca_base_module_t *)&orte_state_staged_orted_module;
    return ORTE_SUCCESS;
}
|
@ -217,12 +217,13 @@ int orte_dt_print_job(char **output, char *prefix, orte_job_t *src, opal_data_ty
|
|||||||
asprintf(&pfx2, "%s", prefix);
|
asprintf(&pfx2, "%s", prefix);
|
||||||
}
|
}
|
||||||
|
|
||||||
asprintf(&tmp, "\n%sData for job: %s\tRecovery: %s(%s)\n%s\tNum apps: %ld\tControls: %0x\tStdin target: %s\tState: %s\tAbort: %s", pfx2,
|
asprintf(&tmp, "\n%sData for job: %s\tRecovery: %s(%s)\n%s\tNum apps: %ld\tControls: %0x\tMPI allowed: %s\tStdin target: %s\tState: %s\tAbort: %s", pfx2,
|
||||||
ORTE_JOBID_PRINT(src->jobid),
|
ORTE_JOBID_PRINT(src->jobid),
|
||||||
(src->enable_recovery) ? "ENABLED" : "DISABLED",
|
(src->enable_recovery) ? "ENABLED" : "DISABLED",
|
||||||
(src->recovery_defined) ? "DEFINED" : "DEFAULT",
|
(src->recovery_defined) ? "DEFINED" : "DEFAULT",
|
||||||
pfx2,
|
pfx2,
|
||||||
(long)src->num_apps, src->controls, ORTE_VPID_PRINT(src->stdin_target),
|
(long)src->num_apps, src->controls,
|
||||||
|
src->gang_launched ? "YES" : "NO", ORTE_VPID_PRINT(src->stdin_target),
|
||||||
orte_job_state_to_str(src->state), src->abort ? "True" : "False");
|
orte_job_state_to_str(src->state), src->abort ? "True" : "False");
|
||||||
asprintf(&pfx, "%s\t", pfx2);
|
asprintf(&pfx, "%s\t", pfx2);
|
||||||
free(pfx2);
|
free(pfx2);
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user