Update the orte/ess framework
Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
родитель
9fb80bd239
Коммит
d1071397ac
@ -10,6 +10,7 @@
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2008-2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2017-2018 Intel, Inc. All rights reserved.
|
||||
# Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
@ -50,4 +51,3 @@ libmca_ess_alps_la_SOURCES =$(sources)
|
||||
libmca_ess_alps_la_CPPFLAGS = $(ess_alps_CPPFLAGS)
|
||||
libmca_ess_alps_la_LDFLAGS = -module -avoid-version $(ess_alps_LDFLAGS)
|
||||
libmca_ess_alps_la_LIBADD = $(ess_alps_LIBS)
|
||||
|
||||
|
@ -45,7 +45,6 @@
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/proc.h"
|
||||
#include "opal/runtime/opal.h"
|
||||
#include "opal/runtime/opal_cr.h"
|
||||
|
||||
#include "orte/mca/rml/base/base.h"
|
||||
#include "orte/mca/routed/base/base.h"
|
||||
@ -58,17 +57,12 @@
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
#include "orte/mca/filem/base/base.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
#include "orte/mca/snapc/base/base.h"
|
||||
#include "orte/mca/sstore/base/base.h"
|
||||
#endif
|
||||
#include "orte/mca/state/base/base.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/show_help.h"
|
||||
|
||||
#include "orte/runtime/orte_cr.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
|
||||
@ -285,44 +279,6 @@ int orte_ess_base_app_setup(bool db_restrict_local)
|
||||
goto error;
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
/*
|
||||
* Setup the SnapC
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_snapc_base_framework, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_snapc_base_open";
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_sstore_base_framework, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_sstore_base_open";
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_snapc_base_select(ORTE_PROC_IS_HNP, ORTE_PROC_IS_APP))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_snapc_base_select";
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_sstore_base_select())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_sstore_base_select";
|
||||
goto error;
|
||||
}
|
||||
/* apps need the OPAL CR stuff */
|
||||
opal_cr_set_enabled(true);
|
||||
#else
|
||||
opal_cr_set_enabled(false);
|
||||
#endif
|
||||
/* Initialize the CR setup
|
||||
* Note: Always do this, even in non-FT builds.
|
||||
* If we don't, some user-level tools may hang.
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_cr_init())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_cr_init";
|
||||
goto error;
|
||||
}
|
||||
/* open the distributed file system */
|
||||
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
@ -344,13 +300,6 @@ int orte_ess_base_app_setup(bool db_restrict_local)
|
||||
|
||||
int orte_ess_base_app_finalize(void)
|
||||
{
|
||||
orte_cr_finalize();
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
(void) mca_base_framework_close(&orte_snapc_base_framework);
|
||||
(void) mca_base_framework_close(&orte_sstore_base_framework);
|
||||
#endif
|
||||
|
||||
/* release the conduits */
|
||||
orte_rml.close_conduit(orte_mgmt_conduit);
|
||||
orte_rml.close_conduit(orte_coll_conduit);
|
||||
@ -414,8 +363,7 @@ void orte_ess_base_app_abort(int status, bool report)
|
||||
* clean environment. Taken from orte_finalize():
|
||||
* - Assume errmgr cleans up child processes before we exit.
|
||||
*/
|
||||
/* CRS cleanup since it may have a named pipe and thread active */
|
||||
orte_cr_finalize();
|
||||
|
||||
/* If we were asked to report this termination, do so.
|
||||
* Since singletons don't start an HNP unless necessary, and
|
||||
* direct-launched procs don't have daemons at all, only send
|
||||
|
@ -38,11 +38,11 @@
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
#include "opal/runtime/opal.h"
|
||||
#include "opal/runtime/opal_cr.h"
|
||||
#include "opal/mca/hwloc/base/base.h"
|
||||
#include "opal/mca/pmix/base/base.h"
|
||||
#include "opal/mca/pstat/base/base.h"
|
||||
#include "opal/util/arch.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/os_path.h"
|
||||
#include "opal/util/proc.h"
|
||||
|
||||
@ -61,10 +61,6 @@
|
||||
#include "orte/mca/regx/base/base.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
#include "orte/mca/snapc/base/base.h"
|
||||
#include "orte/mca/sstore/base/base.h"
|
||||
#endif
|
||||
#include "orte/mca/filem/base/base.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
@ -73,7 +69,6 @@
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/state/base/base.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/runtime/orte_cr.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
@ -626,46 +621,6 @@ int orte_ess_base_orted_setup(void)
|
||||
goto error;
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
/*
|
||||
* Setup the SnapC
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_snapc_base_framework, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_snapc_base_open";
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_sstore_base_framework, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_sstore_base_open";
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_snapc_base_select(!ORTE_PROC_IS_HNP, ORTE_PROC_IS_DAEMON))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_snapc_base_select";
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_sstore_base_select())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_sstore_base_select";
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* For daemons, ORTE doesn't need the OPAL CR stuff */
|
||||
opal_cr_set_enabled(false);
|
||||
#else
|
||||
opal_cr_set_enabled(false);
|
||||
#endif
|
||||
/*
|
||||
* Initialize the CR setup
|
||||
* Note: Always do this, even in non-FT builds.
|
||||
* If we don't, some user-level tools may hang.
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_cr_init())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_cr_init";
|
||||
goto error;
|
||||
}
|
||||
/* setup the DFS framework */
|
||||
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
@ -680,7 +635,7 @@ int orte_ess_base_orted_setup(void)
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
||||
error:
|
||||
error:
|
||||
orte_show_help("help-orte-runtime.txt",
|
||||
"orte_init:startup:internal-failure",
|
||||
true, error, ORTE_ERROR_NAME(ret), ret);
|
||||
|
@ -11,7 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Hochschule Esslingen. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
|
||||
@ -37,25 +37,24 @@
|
||||
#include "opal/mca/event/event.h"
|
||||
#include "opal/mca/pmix/base/base.h"
|
||||
#include "opal/runtime/opal.h"
|
||||
#include "opal/runtime/opal_cr.h"
|
||||
#include "opal/runtime/opal_progress_threads.h"
|
||||
#include "opal/util/arch.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/proc.h"
|
||||
|
||||
#include "orte/mca/iof/base/base.h"
|
||||
#include "orte/mca/oob/base/base.h"
|
||||
#include "orte/mca/plm/base/base.h"
|
||||
#include "orte/mca/rml/base/base.h"
|
||||
#include "orte/mca/rml/base/rml_contact.h"
|
||||
#include "orte/mca/routed/base/base.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/iof/base/base.h"
|
||||
#include "orte/mca/state/base/base.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/util/show_help.h"
|
||||
|
||||
#include "orte/runtime/orte_cr.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
|
||||
@ -96,8 +95,6 @@ int orte_ess_base_tool_setup(opal_list_t *flags)
|
||||
int ret;
|
||||
char *error = NULL;
|
||||
opal_list_t transports;
|
||||
orte_jobid_t jobid;
|
||||
orte_vpid_t vpid;
|
||||
opal_list_t info;
|
||||
opal_value_t *kv, *knext, val;
|
||||
opal_pmix_query_t *q;
|
||||
@ -123,65 +120,17 @@ int orte_ess_base_tool_setup(opal_list_t *flags)
|
||||
}
|
||||
if (NULL == opal_pmix.tool_init) {
|
||||
/* we no longer support non-pmix tools */
|
||||
orte_show_help("help-ess-base.txt",
|
||||
"legacy-tool", true);
|
||||
ret = ORTE_ERR_SILENT;
|
||||
error = "opal_pmix.tool_init";
|
||||
ret = ORTE_ERR_NOT_SUPPORTED;
|
||||
goto error;
|
||||
}
|
||||
/* set the event base for the pmix component code */
|
||||
opal_pmix_base_set_evbase(orte_event_base);
|
||||
|
||||
/* we have to define our name here */
|
||||
if (NULL != orte_ess_base_jobid &&
|
||||
NULL != orte_ess_base_vpid) {
|
||||
opal_output_verbose(2, orte_ess_base_framework.framework_output,
|
||||
"ess:tool:obtaining name from environment");
|
||||
if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_jobid(&jobid, orte_ess_base_jobid))) {
|
||||
return(ret);
|
||||
}
|
||||
ORTE_PROC_MY_NAME->jobid = jobid;
|
||||
if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_vpid(&vpid, orte_ess_base_vpid))) {
|
||||
return(ret);
|
||||
}
|
||||
ORTE_PROC_MY_NAME->vpid = vpid;
|
||||
} else {
|
||||
/* If we are a tool with no name, then define it here */
|
||||
uint16_t jobfam;
|
||||
uint32_t hash32;
|
||||
uint32_t bias;
|
||||
|
||||
opal_output_verbose(2, orte_ess_base_framework.framework_output,
|
||||
"ess:tool:computing name");
|
||||
/* hash the nodename */
|
||||
OPAL_HASH_STR(orte_process_info.nodename, hash32);
|
||||
bias = (uint32_t)orte_process_info.pid;
|
||||
/* fold in the bias */
|
||||
hash32 = hash32 ^ bias;
|
||||
|
||||
/* now compress to 16-bits */
|
||||
jobfam = (uint16_t)(((0x0000ffff & (0xffff0000 & hash32) >> 16)) ^ (0x0000ffff & hash32));
|
||||
|
||||
/* set the name */
|
||||
ORTE_PROC_MY_NAME->jobid = 0xffff0000 & ((uint32_t)jobfam << 16);
|
||||
ORTE_PROC_MY_NAME->vpid = 0;
|
||||
}
|
||||
/* my name is set, xfer it to the OPAL layer */
|
||||
orte_process_info.super.proc_name = *(opal_process_name_t*)ORTE_PROC_MY_NAME;
|
||||
|
||||
/* initialize - PMIx may set our name here if we attach to
|
||||
* a PMIx server */
|
||||
/* initialize */
|
||||
OBJ_CONSTRUCT(&info, opal_list_t);
|
||||
/* pass our name so the PMIx layer can use it */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_TOOL_NSPACE);
|
||||
orte_util_convert_jobid_to_string(&kv->data.string, ORTE_PROC_MY_NAME->jobid);
|
||||
kv->type = OPAL_STRING;
|
||||
opal_list_append(&info, &kv->super);
|
||||
/* ditto for our rank */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_TOOL_RANK);
|
||||
kv->data.name.vpid = ORTE_PROC_MY_NAME->vpid;
|
||||
kv->type = OPAL_VPID;
|
||||
opal_list_append(&info, &kv->super);
|
||||
if (NULL != flags) {
|
||||
/* pass along any directives */
|
||||
OPAL_LIST_FOREACH_SAFE(kv, knext, flags, opal_value_t) {
|
||||
@ -196,9 +145,9 @@ int orte_ess_base_tool_setup(opal_list_t *flags)
|
||||
goto error;
|
||||
}
|
||||
OPAL_LIST_DESTRUCT(&info);
|
||||
/* the PMIx server set our name - record it here */
|
||||
ORTE_PROC_MY_NAME->jobid = OPAL_PROC_MY_NAME.jobid;
|
||||
ORTE_PROC_MY_NAME->vpid = OPAL_PROC_MY_NAME.vpid;
|
||||
|
||||
orte_process_info.super.proc_hostname = strdup(orte_process_info.nodename);
|
||||
orte_process_info.super.proc_flags = OPAL_PROC_ALL_LOCAL;
|
||||
orte_process_info.super.proc_arch = opal_local_arch;
|
||||
@ -294,7 +243,7 @@ int orte_ess_base_tool_setup(opal_list_t *flags)
|
||||
}
|
||||
|
||||
/* setup I/O forwarding system - must come after we init routes */
|
||||
if (NULL != orte_process_info.my_hnp_uri) {
|
||||
if (NULL != orte_process_info.my_hnp_uri && NULL == opal_pmix.server_iof_push) {
|
||||
/* extract the name */
|
||||
if (ORTE_SUCCESS != orte_rml_base_parse_uris(orte_process_info.my_hnp_uri, ORTE_PROC_MY_HNP, NULL)) {
|
||||
orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, orte_process_info.my_hnp_uri);
|
||||
@ -374,13 +323,14 @@ int orte_ess_base_tool_finalize(void)
|
||||
* a very small subset of orte_init - ensure that
|
||||
* I only back those elements out
|
||||
*/
|
||||
if (NULL != orte_process_info.my_hnp_uri) {
|
||||
if (NULL != orte_process_info.my_hnp_uri && NULL == opal_pmix.server_iof_push) {
|
||||
(void) mca_base_framework_close(&orte_iof_base_framework);
|
||||
}
|
||||
(void) mca_base_framework_close(&orte_routed_base_framework);
|
||||
(void) mca_base_framework_close(&orte_rml_base_framework);
|
||||
(void) mca_base_framework_close(&orte_errmgr_base_framework);
|
||||
|
||||
opal_pmix.finalize();
|
||||
(void) mca_base_framework_close(&opal_pmix_base_framework);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
@ -10,7 +10,7 @@
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
# Copyright (c) 2017-2018 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -89,3 +89,9 @@ when OMPI was not configured --with-alps and we weren't able
|
||||
to discover an ALPS installation in the usual places.
|
||||
|
||||
Please configure as appropriate and try again.
|
||||
#
|
||||
[legacy-tool]
|
||||
We no longer support non-PMIx-based tools, and require a
|
||||
minimum level of PMIx v2.0.
|
||||
|
||||
Please update the tool and/or the PMIx version you are using.
|
||||
|
1
orte/mca/ess/env/Makefile.am
поставляемый
1
orte/mca/ess/env/Makefile.am
поставляемый
@ -11,6 +11,7 @@
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
# Copyright (c) 2017-2018 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
|
210
orte/mca/ess/env/ess_env_module.c
поставляемый
210
orte/mca/ess/env/ess_env_module.c
поставляемый
@ -56,9 +56,6 @@
|
||||
#include "orte/mca/plm/base/base.h"
|
||||
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
#include "orte/mca/snapc/base/base.h"
|
||||
#endif
|
||||
#include "orte/mca/filem/base/base.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
@ -68,7 +65,6 @@
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "orte/runtime/orte_cr.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/mca/ess/base/base.h"
|
||||
#include "orte/mca/ess/env/ess_env.h"
|
||||
@ -78,19 +74,11 @@ static int env_set_name(void);
|
||||
static int rte_init(void);
|
||||
static int rte_finalize(void);
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
static int rte_ft_event(int state);
|
||||
#endif
|
||||
|
||||
orte_ess_base_module_t orte_ess_env_module = {
|
||||
rte_init,
|
||||
rte_finalize,
|
||||
orte_ess_base_app_abort,
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
rte_ft_event
|
||||
#else
|
||||
NULL
|
||||
#endif
|
||||
};
|
||||
|
||||
static int rte_init(void)
|
||||
@ -175,201 +163,3 @@ static int env_set_name(void)
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
static int rte_ft_event(int state)
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
orte_proc_type_t svtype;
|
||||
|
||||
/******** Checkpoint Prep ********/
|
||||
if(OPAL_CRS_CHECKPOINT == state) {
|
||||
/*
|
||||
* Notify SnapC
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_CHECKPOINT))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Notify Routed
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_CHECKPOINT))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Notify RML -> OOB
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CHECKPOINT))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
/******** Continue Recovery ********/
|
||||
else if (OPAL_CRS_CONTINUE == state ) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output,
|
||||
"ess:env ft_event(%2d) - %s is Continuing",
|
||||
state, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/*
|
||||
* Notify RML -> OOB
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CONTINUE))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Notify Routed
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_CONTINUE))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Notify SnapC
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_CONTINUE))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (opal_cr_continue_like_restart) {
|
||||
/*
|
||||
* Barrier to make all processes have been successfully restarted before
|
||||
* we try to remove some restart only files.
|
||||
*/
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
if( orte_cr_flush_restart_files ) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output,
|
||||
"ess:env ft_event(%2d): %s "
|
||||
"Cleanup restart files...",
|
||||
state, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
opal_crs_base_cleanup_flush();
|
||||
}
|
||||
}
|
||||
}
|
||||
/******** Restart Recovery ********/
|
||||
else if (OPAL_CRS_RESTART == state ) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output,
|
||||
"ess:env ft_event(%2d) - %s is Restarting",
|
||||
state, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/*
|
||||
* This should follow the ess init() function
|
||||
*/
|
||||
|
||||
/*
|
||||
* - Reset Contact information
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = env_set_name() ) ) {
|
||||
exit_status = ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Notify RML -> OOB
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_RESTART))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Restart the routed framework
|
||||
* JJH: Lie to the finalize function so it does not try to contact the daemon.
|
||||
*/
|
||||
svtype = orte_process_info.proc_type;
|
||||
orte_process_info.proc_type = ORTE_PROC_TOOL;
|
||||
if (ORTE_SUCCESS != (ret = orte_routed.finalize()) ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
orte_process_info.proc_type = svtype;
|
||||
if (ORTE_SUCCESS != (ret = orte_routed.initialize()) ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Restart the PLM - Does nothing at the moment, but included for completeness
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_plm.finalize())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (ret = orte_plm.init())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* RML - Enable communications
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Notify Routed
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_RESTART))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Barrier to make all processes have been successfully restarted before
|
||||
* we try to remove some restart only files.
|
||||
*/
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
if( orte_cr_flush_restart_files ) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output,
|
||||
"ess:env ft_event(%2d): %s "
|
||||
"Cleanup restart files...",
|
||||
state, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
opal_crs_base_cleanup_flush();
|
||||
}
|
||||
|
||||
/*
|
||||
* Session directory re-init
|
||||
*/
|
||||
if (orte_create_session_dirs) {
|
||||
if (ORTE_SUCCESS != (ret = orte_session_dir(true,
|
||||
orte_process_info.tmpdir_base,
|
||||
orte_process_info.nodename,
|
||||
NULL, /* Batch ID -- Not used */
|
||||
ORTE_PROC_MY_NAME))) {
|
||||
exit_status = ret;
|
||||
}
|
||||
|
||||
opal_output_set_output_file_info(orte_process_info.proc_session_dir,
|
||||
"output-", NULL, NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Notify SnapC
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_RESTART))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
else if (OPAL_CRS_TERM == state ) {
|
||||
/* Nothing */
|
||||
}
|
||||
else {
|
||||
/* Error state = Nothing */
|
||||
}
|
||||
|
||||
return exit_status;
|
||||
}
|
||||
#endif
|
||||
|
@ -42,13 +42,13 @@
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
#include "opal/runtime/opal.h"
|
||||
#include "opal/runtime/opal_cr.h"
|
||||
|
||||
#include "opal/util/arch.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/if.h"
|
||||
#include "opal/util/os_path.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/malloc.h"
|
||||
#include "opal/util/basename.h"
|
||||
#include "opal/util/fd.h"
|
||||
@ -72,10 +72,6 @@
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/odls/base/base.h"
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
#include "orte/mca/snapc/base/base.h"
|
||||
#include "orte/mca/sstore/base/base.h"
|
||||
#endif
|
||||
#include "orte/mca/filem/base/base.h"
|
||||
#include "orte/mca/state/base/base.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
@ -95,7 +91,6 @@
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
#include "orte/runtime/orte_cr.h"
|
||||
#include "orte/runtime/orte_locks.h"
|
||||
|
||||
#include "orte/mca/ess/ess.h"
|
||||
@ -150,6 +145,7 @@ static int rte_init(void)
|
||||
orte_topology_t *t;
|
||||
opal_list_t transports;
|
||||
orte_ess_base_signal_t *sig;
|
||||
opal_value_t val;
|
||||
|
||||
/* run the prolog */
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
|
||||
@ -473,6 +469,22 @@ static int rte_init(void)
|
||||
proc->pid = orte_process_info.pid;
|
||||
orte_oob_base_get_addr(&proc->rml_uri);
|
||||
orte_process_info.my_hnp_uri = strdup(proc->rml_uri);
|
||||
/* store it in the local PMIx repo for later retrieval */
|
||||
OBJ_CONSTRUCT(&val, opal_value_t);
|
||||
val.key = OPAL_PMIX_PROC_URI;
|
||||
val.type = OPAL_STRING;
|
||||
val.data.string = proc->rml_uri;
|
||||
if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_NAME, &val))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
val.key = NULL;
|
||||
val.data.string = NULL;
|
||||
OBJ_DESTRUCT(&val);
|
||||
error = "store uri";
|
||||
goto error;
|
||||
}
|
||||
val.key = NULL;
|
||||
val.data.string = NULL;
|
||||
OBJ_DESTRUCT(&val);
|
||||
/* we are also officially a daemon, so better update that field too */
|
||||
orte_process_info.my_daemon_uri = strdup(proc->rml_uri);
|
||||
proc->state = ORTE_PROC_STATE_RUNNING;
|
||||
@ -684,46 +696,7 @@ static int rte_init(void)
|
||||
error = "orte_filem_base_select";
|
||||
goto error;
|
||||
}
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
/*
|
||||
* Setup the SnapC
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_snapc_base_framework, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_snapc_base_open";
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_sstore_base_framework, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_sstore_base_open";
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_snapc_base_select(ORTE_PROC_IS_HNP, ORTE_PROC_IS_APP))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_snapc_base_select";
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_sstore_base_select())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_sstore_base_select";
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* For HNP, ORTE doesn't need the OPAL CR stuff */
|
||||
opal_cr_set_enabled(false);
|
||||
#else
|
||||
opal_cr_set_enabled(false);
|
||||
#endif
|
||||
/*
|
||||
* Initialize the CR setup
|
||||
* Note: Always do this, even in non-FT builds.
|
||||
* If we don't, some user-level tools may hang.
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_cr_init())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_cr_init";
|
||||
goto error;
|
||||
}
|
||||
/* setup the dfs framework */
|
||||
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
@ -773,7 +746,7 @@ static int rte_init(void)
|
||||
opal_progress_set_yield_when_idle(false);
|
||||
return ORTE_SUCCESS;
|
||||
|
||||
error:
|
||||
error:
|
||||
if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
|
||||
orte_show_help("help-orte-runtime.txt",
|
||||
"orte_init:startup:internal-failure",
|
||||
@ -898,8 +871,6 @@ static void rte_abort(int status, bool report)
|
||||
* - Assume errmgr cleans up child processes before we exit.
|
||||
*/
|
||||
|
||||
/* CRS cleanup since it may have a named pipe and thread active */
|
||||
orte_cr_finalize();
|
||||
/* ensure we scrub the session directory tree */
|
||||
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
||||
/* - Clean out the global structures
|
||||
|
@ -11,6 +11,7 @@
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
# Copyright (c) 2017-2018 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
|
@ -2,7 +2,7 @@
|
||||
# Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2013 Los Alamos National Security, LLC.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2014 Intel, Inc. All rights reserved
|
||||
# Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
|
||||
# Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
|
@ -10,6 +10,7 @@
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
# Copyright (c) 2017-2018 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
|
@ -10,6 +10,7 @@
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
# Copyright (c) 2017-2018 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
|
@ -10,6 +10,7 @@
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
# Copyright (c) 2017-2018 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
|
@ -11,6 +11,7 @@
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
# Copyright (c) 2017-2018 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
|
@ -9,7 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2018 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -40,7 +40,6 @@
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/runtime/orte_cr.h"
|
||||
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/mca/ess/base/base.h"
|
||||
@ -125,7 +124,6 @@ static int rte_init(void)
|
||||
opal_list_append(&flags, &val->super);
|
||||
}
|
||||
|
||||
|
||||
/* do the standard tool init */
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(&flags))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
@ -137,7 +135,7 @@ static int rte_init(void)
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
||||
error:
|
||||
error:
|
||||
if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
|
||||
orte_show_help("help-orte-runtime.txt",
|
||||
"orte_init:startup:internal-failure",
|
||||
@ -176,9 +174,6 @@ static void rte_abort(int status, bool report)
|
||||
* - Assume errmgr cleans up child processes before we exit.
|
||||
*/
|
||||
|
||||
/* CRS cleanup since it may have a named pipe and thread active */
|
||||
orte_cr_finalize();
|
||||
|
||||
/* - Clean out the global structures
|
||||
* (not really necessary, but good practice)
|
||||
*/
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user