1
1
Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Ralph Castain 2018-01-25 08:43:44 -08:00
родитель 9fb80bd239
Коммит d1071397ac
15 изменённых файлов: 50 добавлений и 429 удалений

Просмотреть файл

@ -10,6 +10,7 @@
# Copyright (c) 2004-2005 The Regents of the University of California. # Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved. # All rights reserved.
# Copyright (c) 2008-2010 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2008-2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2017-2018 Intel, Inc. All rights reserved.
# Copyright (c) 2017 IBM Corporation. All rights reserved. # Copyright (c) 2017 IBM Corporation. All rights reserved.
# $COPYRIGHT$ # $COPYRIGHT$
# #
@ -50,4 +51,3 @@ libmca_ess_alps_la_SOURCES =$(sources)
libmca_ess_alps_la_CPPFLAGS = $(ess_alps_CPPFLAGS) libmca_ess_alps_la_CPPFLAGS = $(ess_alps_CPPFLAGS)
libmca_ess_alps_la_LDFLAGS = -module -avoid-version $(ess_alps_LDFLAGS) libmca_ess_alps_la_LDFLAGS = -module -avoid-version $(ess_alps_LDFLAGS)
libmca_ess_alps_la_LIBADD = $(ess_alps_LIBS) libmca_ess_alps_la_LIBADD = $(ess_alps_LIBS)

Просмотреть файл

@ -45,7 +45,6 @@
#include "opal/util/output.h" #include "opal/util/output.h"
#include "opal/util/proc.h" #include "opal/util/proc.h"
#include "opal/runtime/opal.h" #include "opal/runtime/opal.h"
#include "opal/runtime/opal_cr.h"
#include "orte/mca/rml/base/base.h" #include "orte/mca/rml/base/base.h"
#include "orte/mca/routed/base/base.h" #include "orte/mca/routed/base/base.h"
@ -58,17 +57,12 @@
#include "orte/mca/odls/odls_types.h" #include "orte/mca/odls/odls_types.h"
#include "orte/mca/filem/base/base.h" #include "orte/mca/filem/base/base.h"
#include "orte/mca/errmgr/base/base.h" #include "orte/mca/errmgr/base/base.h"
#if OPAL_ENABLE_FT_CR == 1
#include "orte/mca/snapc/base/base.h"
#include "orte/mca/sstore/base/base.h"
#endif
#include "orte/mca/state/base/base.h" #include "orte/mca/state/base/base.h"
#include "orte/util/proc_info.h" #include "orte/util/proc_info.h"
#include "orte/util/session_dir.h" #include "orte/util/session_dir.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
#include "orte/util/show_help.h" #include "orte/util/show_help.h"
#include "orte/runtime/orte_cr.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
@ -285,44 +279,6 @@ int orte_ess_base_app_setup(bool db_restrict_local)
goto error; goto error;
} }
#if OPAL_ENABLE_FT_CR == 1
/*
* Setup the SnapC
*/
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_snapc_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_snapc_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_sstore_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_sstore_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_snapc_base_select(ORTE_PROC_IS_HNP, ORTE_PROC_IS_APP))) {
ORTE_ERROR_LOG(ret);
error = "orte_snapc_base_select";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_sstore_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_sstore_base_select";
goto error;
}
/* apps need the OPAL CR stuff */
opal_cr_set_enabled(true);
#else
opal_cr_set_enabled(false);
#endif
/* Initialize the CR setup
* Note: Always do this, even in non-FT builds.
* If we don't some user level tools may hang.
*/
if (ORTE_SUCCESS != (ret = orte_cr_init())) {
ORTE_ERROR_LOG(ret);
error = "orte_cr_init";
goto error;
}
/* open the distributed file system */ /* open the distributed file system */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) { if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) {
ORTE_ERROR_LOG(ret); ORTE_ERROR_LOG(ret);
@ -344,13 +300,6 @@ int orte_ess_base_app_setup(bool db_restrict_local)
int orte_ess_base_app_finalize(void) int orte_ess_base_app_finalize(void)
{ {
orte_cr_finalize();
#if OPAL_ENABLE_FT_CR == 1
(void) mca_base_framework_close(&orte_snapc_base_framework);
(void) mca_base_framework_close(&orte_sstore_base_framework);
#endif
/* release the conduits */ /* release the conduits */
orte_rml.close_conduit(orte_mgmt_conduit); orte_rml.close_conduit(orte_mgmt_conduit);
orte_rml.close_conduit(orte_coll_conduit); orte_rml.close_conduit(orte_coll_conduit);
@ -414,8 +363,7 @@ void orte_ess_base_app_abort(int status, bool report)
* clean environment. Taken from orte_finalize(): * clean environment. Taken from orte_finalize():
* - Assume errmgr cleans up child processes before we exit. * - Assume errmgr cleans up child processes before we exit.
*/ */
/* CRS cleanup since it may have a named pipe and thread active */
orte_cr_finalize();
/* If we were asked to report this termination, do so. /* If we were asked to report this termination, do so.
* Since singletons don't start an HNP unless necessary, and * Since singletons don't start an HNP unless necessary, and
* direct-launched procs don't have daemons at all, only send * direct-launched procs don't have daemons at all, only send

Просмотреть файл

@ -38,11 +38,11 @@
#include "opal/dss/dss.h" #include "opal/dss/dss.h"
#include "opal/mca/event/event.h" #include "opal/mca/event/event.h"
#include "opal/runtime/opal.h" #include "opal/runtime/opal.h"
#include "opal/runtime/opal_cr.h"
#include "opal/mca/hwloc/base/base.h" #include "opal/mca/hwloc/base/base.h"
#include "opal/mca/pmix/base/base.h" #include "opal/mca/pmix/base/base.h"
#include "opal/mca/pstat/base/base.h" #include "opal/mca/pstat/base/base.h"
#include "opal/util/arch.h" #include "opal/util/arch.h"
#include "opal/util/opal_environ.h"
#include "opal/util/os_path.h" #include "opal/util/os_path.h"
#include "opal/util/proc.h" #include "opal/util/proc.h"
@ -61,10 +61,6 @@
#include "orte/mca/regx/base/base.h" #include "orte/mca/regx/base/base.h"
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/base/base.h" #include "orte/mca/rmaps/base/base.h"
#if OPAL_ENABLE_FT_CR == 1
#include "orte/mca/snapc/base/base.h"
#include "orte/mca/sstore/base/base.h"
#endif
#include "orte/mca/filem/base/base.h" #include "orte/mca/filem/base/base.h"
#include "orte/util/proc_info.h" #include "orte/util/proc_info.h"
#include "orte/util/session_dir.h" #include "orte/util/session_dir.h"
@ -73,7 +69,6 @@
#include "orte/mca/errmgr/base/base.h" #include "orte/mca/errmgr/base/base.h"
#include "orte/mca/state/base/base.h" #include "orte/mca/state/base/base.h"
#include "orte/mca/state/state.h" #include "orte/mca/state/state.h"
#include "orte/runtime/orte_cr.h"
#include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_quit.h" #include "orte/runtime/orte_quit.h"
@ -626,46 +621,6 @@ int orte_ess_base_orted_setup(void)
goto error; goto error;
} }
#if OPAL_ENABLE_FT_CR == 1
/*
* Setup the SnapC
*/
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_snapc_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_snapc_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_sstore_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_sstore_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_snapc_base_select(!ORTE_PROC_IS_HNP, ORTE_PROC_IS_DAEMON))) {
ORTE_ERROR_LOG(ret);
error = "orte_snapc_base_select";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_sstore_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_sstore_base_select";
goto error;
}
/* For daemons, ORTE doesn't need the OPAL CR stuff */
opal_cr_set_enabled(false);
#else
opal_cr_set_enabled(false);
#endif
/*
* Initialize the CR setup
* Note: Always do this, even in non-FT builds.
* If we don't some user level tools may hang.
*/
if (ORTE_SUCCESS != (ret = orte_cr_init())) {
ORTE_ERROR_LOG(ret);
error = "orte_cr_init";
goto error;
}
/* setup the DFS framework */ /* setup the DFS framework */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) { if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) {
ORTE_ERROR_LOG(ret); ORTE_ERROR_LOG(ret);

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved. * All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
* All rights reserved. * All rights reserved.
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2013-2018 Intel, Inc. All rights reserved.
* Copyright (c) 2014 Hochschule Esslingen. All rights reserved. * Copyright (c) 2014 Hochschule Esslingen. All rights reserved.
* *
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
@ -37,25 +37,24 @@
#include "opal/mca/event/event.h" #include "opal/mca/event/event.h"
#include "opal/mca/pmix/base/base.h" #include "opal/mca/pmix/base/base.h"
#include "opal/runtime/opal.h" #include "opal/runtime/opal.h"
#include "opal/runtime/opal_cr.h"
#include "opal/runtime/opal_progress_threads.h" #include "opal/runtime/opal_progress_threads.h"
#include "opal/util/arch.h" #include "opal/util/arch.h"
#include "opal/util/opal_environ.h"
#include "opal/util/argv.h" #include "opal/util/argv.h"
#include "opal/util/proc.h" #include "opal/util/proc.h"
#include "orte/mca/iof/base/base.h"
#include "orte/mca/oob/base/base.h" #include "orte/mca/oob/base/base.h"
#include "orte/mca/plm/base/base.h" #include "orte/mca/plm/base/base.h"
#include "orte/mca/rml/base/base.h" #include "orte/mca/rml/base/base.h"
#include "orte/mca/rml/base/rml_contact.h" #include "orte/mca/rml/base/rml_contact.h"
#include "orte/mca/routed/base/base.h" #include "orte/mca/routed/base/base.h"
#include "orte/mca/errmgr/base/base.h" #include "orte/mca/errmgr/base/base.h"
#include "orte/mca/iof/base/base.h"
#include "orte/mca/state/base/base.h" #include "orte/mca/state/base/base.h"
#include "orte/util/proc_info.h" #include "orte/util/proc_info.h"
#include "orte/util/session_dir.h" #include "orte/util/session_dir.h"
#include "orte/util/show_help.h" #include "orte/util/show_help.h"
#include "orte/runtime/orte_cr.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
@ -96,8 +95,6 @@ int orte_ess_base_tool_setup(opal_list_t *flags)
int ret; int ret;
char *error = NULL; char *error = NULL;
opal_list_t transports; opal_list_t transports;
orte_jobid_t jobid;
orte_vpid_t vpid;
opal_list_t info; opal_list_t info;
opal_value_t *kv, *knext, val; opal_value_t *kv, *knext, val;
opal_pmix_query_t *q; opal_pmix_query_t *q;
@ -123,65 +120,17 @@ int orte_ess_base_tool_setup(opal_list_t *flags)
} }
if (NULL == opal_pmix.tool_init) { if (NULL == opal_pmix.tool_init) {
/* we no longer support non-pmix tools */ /* we no longer support non-pmix tools */
orte_show_help("help-ess-base.txt",
"legacy-tool", true);
ret = ORTE_ERR_SILENT;
error = "opal_pmix.tool_init"; error = "opal_pmix.tool_init";
ret = ORTE_ERR_NOT_SUPPORTED;
goto error; goto error;
} }
/* set the event base for the pmix component code */ /* set the event base for the pmix component code */
opal_pmix_base_set_evbase(orte_event_base); opal_pmix_base_set_evbase(orte_event_base);
/* we have to define our name here */ /* initialize */
if (NULL != orte_ess_base_jobid &&
NULL != orte_ess_base_vpid) {
opal_output_verbose(2, orte_ess_base_framework.framework_output,
"ess:tool:obtaining name from environment");
if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_jobid(&jobid, orte_ess_base_jobid))) {
return(ret);
}
ORTE_PROC_MY_NAME->jobid = jobid;
if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_vpid(&vpid, orte_ess_base_vpid))) {
return(ret);
}
ORTE_PROC_MY_NAME->vpid = vpid;
} else {
/* If we are a tool with no name, then define it here */
uint16_t jobfam;
uint32_t hash32;
uint32_t bias;
opal_output_verbose(2, orte_ess_base_framework.framework_output,
"ess:tool:computing name");
/* hash the nodename */
OPAL_HASH_STR(orte_process_info.nodename, hash32);
bias = (uint32_t)orte_process_info.pid;
/* fold in the bias */
hash32 = hash32 ^ bias;
/* now compress to 16-bits */
jobfam = (uint16_t)(((0x0000ffff & (0xffff0000 & hash32) >> 16)) ^ (0x0000ffff & hash32));
/* set the name */
ORTE_PROC_MY_NAME->jobid = 0xffff0000 & ((uint32_t)jobfam << 16);
ORTE_PROC_MY_NAME->vpid = 0;
}
/* my name is set, xfer it to the OPAL layer */
orte_process_info.super.proc_name = *(opal_process_name_t*)ORTE_PROC_MY_NAME;
/* initialize - PMIx may set our name here if we attach to
* a PMIx server */
OBJ_CONSTRUCT(&info, opal_list_t); OBJ_CONSTRUCT(&info, opal_list_t);
/* pass our name so the PMIx layer can use it */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_TOOL_NSPACE);
orte_util_convert_jobid_to_string(&kv->data.string, ORTE_PROC_MY_NAME->jobid);
kv->type = OPAL_STRING;
opal_list_append(&info, &kv->super);
/* ditto for our rank */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_TOOL_RANK);
kv->data.name.vpid = ORTE_PROC_MY_NAME->vpid;
kv->type = OPAL_VPID;
opal_list_append(&info, &kv->super);
if (NULL != flags) { if (NULL != flags) {
/* pass along any directives */ /* pass along any directives */
OPAL_LIST_FOREACH_SAFE(kv, knext, flags, opal_value_t) { OPAL_LIST_FOREACH_SAFE(kv, knext, flags, opal_value_t) {
@ -196,9 +145,9 @@ int orte_ess_base_tool_setup(opal_list_t *flags)
goto error; goto error;
} }
OPAL_LIST_DESTRUCT(&info); OPAL_LIST_DESTRUCT(&info);
/* the PMIx server set our name - record it here */
ORTE_PROC_MY_NAME->jobid = OPAL_PROC_MY_NAME.jobid; ORTE_PROC_MY_NAME->jobid = OPAL_PROC_MY_NAME.jobid;
ORTE_PROC_MY_NAME->vpid = OPAL_PROC_MY_NAME.vpid; ORTE_PROC_MY_NAME->vpid = OPAL_PROC_MY_NAME.vpid;
orte_process_info.super.proc_hostname = strdup(orte_process_info.nodename); orte_process_info.super.proc_hostname = strdup(orte_process_info.nodename);
orte_process_info.super.proc_flags = OPAL_PROC_ALL_LOCAL; orte_process_info.super.proc_flags = OPAL_PROC_ALL_LOCAL;
orte_process_info.super.proc_arch = opal_local_arch; orte_process_info.super.proc_arch = opal_local_arch;
@ -294,7 +243,7 @@ int orte_ess_base_tool_setup(opal_list_t *flags)
} }
/* setup I/O forwarding system - must come after we init routes */ /* setup I/O forwarding system - must come after we init routes */
if (NULL != orte_process_info.my_hnp_uri) { if (NULL != orte_process_info.my_hnp_uri && NULL == opal_pmix.server_iof_push) {
/* extract the name */ /* extract the name */
if (ORTE_SUCCESS != orte_rml_base_parse_uris(orte_process_info.my_hnp_uri, ORTE_PROC_MY_HNP, NULL)) { if (ORTE_SUCCESS != orte_rml_base_parse_uris(orte_process_info.my_hnp_uri, ORTE_PROC_MY_HNP, NULL)) {
orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, orte_process_info.my_hnp_uri); orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, orte_process_info.my_hnp_uri);
@ -374,13 +323,14 @@ int orte_ess_base_tool_finalize(void)
* a very small subset of orte_init - ensure that * a very small subset of orte_init - ensure that
* I only back those elements out * I only back those elements out
*/ */
if (NULL != orte_process_info.my_hnp_uri) { if (NULL != orte_process_info.my_hnp_uri && NULL == opal_pmix.server_iof_push) {
(void) mca_base_framework_close(&orte_iof_base_framework); (void) mca_base_framework_close(&orte_iof_base_framework);
} }
(void) mca_base_framework_close(&orte_routed_base_framework); (void) mca_base_framework_close(&orte_routed_base_framework);
(void) mca_base_framework_close(&orte_rml_base_framework); (void) mca_base_framework_close(&orte_rml_base_framework);
(void) mca_base_framework_close(&orte_errmgr_base_framework); (void) mca_base_framework_close(&orte_errmgr_base_framework);
opal_pmix.finalize();
(void) mca_base_framework_close(&opal_pmix_base_framework); (void) mca_base_framework_close(&opal_pmix_base_framework);
return ORTE_SUCCESS; return ORTE_SUCCESS;

Просмотреть файл

@ -10,7 +10,7 @@
# University of Stuttgart. All rights reserved. # University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California. # Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved. # All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved. # Copyright (c) 2017-2018 Intel, Inc. All rights reserved.
# $COPYRIGHT$ # $COPYRIGHT$
# #
# Additional copyrights may follow # Additional copyrights may follow
@ -89,3 +89,9 @@ when OMPI was not configured --with-alps and we weren't able
to discover an ALPS installation in the usual places. to discover an ALPS installation in the usual places.
Please configure as appropriate and try again. Please configure as appropriate and try again.
#
[legacy-tool]
We no longer support non-PMIx-based tools, and require a
minimum level of PMIx v2.0.
Please update the tool and/or the PMIx version you are using.

1
orte/mca/ess/env/Makefile.am поставляемый
Просмотреть файл

@ -11,6 +11,7 @@
# All rights reserved. # All rights reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2017 IBM Corporation. All rights reserved. # Copyright (c) 2017 IBM Corporation. All rights reserved.
# Copyright (c) 2017-2018 Intel, Inc. All rights reserved.
# $COPYRIGHT$ # $COPYRIGHT$
# #
# Additional copyrights may follow # Additional copyrights may follow

210
orte/mca/ess/env/ess_env_module.c поставляемый
Просмотреть файл

@ -56,9 +56,6 @@
#include "orte/mca/plm/base/base.h" #include "orte/mca/plm/base/base.h"
#include "orte/mca/rmaps/base/base.h" #include "orte/mca/rmaps/base/base.h"
#if OPAL_ENABLE_FT_CR == 1
#include "orte/mca/snapc/base/base.h"
#endif
#include "orte/mca/filem/base/base.h" #include "orte/mca/filem/base/base.h"
#include "orte/util/proc_info.h" #include "orte/util/proc_info.h"
#include "orte/util/session_dir.h" #include "orte/util/session_dir.h"
@ -68,7 +65,6 @@
#include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_cr.h"
#include "orte/mca/ess/ess.h" #include "orte/mca/ess/ess.h"
#include "orte/mca/ess/base/base.h" #include "orte/mca/ess/base/base.h"
#include "orte/mca/ess/env/ess_env.h" #include "orte/mca/ess/env/ess_env.h"
@ -78,19 +74,11 @@ static int env_set_name(void);
static int rte_init(void); static int rte_init(void);
static int rte_finalize(void); static int rte_finalize(void);
#if OPAL_ENABLE_FT_CR == 1
static int rte_ft_event(int state);
#endif
orte_ess_base_module_t orte_ess_env_module = { orte_ess_base_module_t orte_ess_env_module = {
rte_init, rte_init,
rte_finalize, rte_finalize,
orte_ess_base_app_abort, orte_ess_base_app_abort,
#if OPAL_ENABLE_FT_CR == 1
rte_ft_event
#else
NULL NULL
#endif
}; };
static int rte_init(void) static int rte_init(void)
@ -175,201 +163,3 @@ static int env_set_name(void)
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
#if OPAL_ENABLE_FT_CR == 1
/*
 * Shared tail of the "continue like restart" and "restart" paths.
 *
 * Runs a fence so that all processes have been restarted before any
 * restart-only files are removed, then performs that removal when
 * orte_cr_flush_restart_files is set.
 *
 * state is only used for the verbose diagnostic.
 */
static void env_ft_fence_and_flush(int state)
{
    opal_pmix.fence(NULL, 0);

    if (!orte_cr_flush_restart_files) {
        return;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output,
                         "ess:env ft_event(%2d): %s "
                         "Cleanup restart files...",
                         state, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    opal_crs_base_cleanup_flush();
}

/*
 * Fault-tolerance event handler of the env ESS module.
 *
 * Forwards the checkpoint/restart state to the SnapC, routed and RML
 * frameworks in the order each phase requires, and on restart
 * re-establishes the process name, routing, PLM, communications and
 * the session directory.
 *
 * state: one of the OPAL_CRS_* values; OPAL_CRS_TERM and any value not
 *        handled below are accepted and ignored.
 *
 * Returns ORTE_SUCCESS when every step succeeds.  A failing framework
 * call is logged and its error code is returned immediately.  During
 * restart, a failure of env_set_name() or orte_session_dir() does not
 * stop processing; the most recent such code is returned at the end
 * unless a later step fails first.
 */
static int rte_ft_event(int state)
{
    int rc;
    int result = ORTE_SUCCESS;
    orte_proc_type_t saved_type;

    /******** Checkpoint Prep ********/
    if (OPAL_CRS_CHECKPOINT == state) {
        /* SnapC first ... */
        rc = orte_snapc.ft_event(OPAL_CRS_CHECKPOINT);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* ... then routed ... */
        rc = orte_routed.ft_event(OPAL_CRS_CHECKPOINT);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* ... and finally RML -> OOB */
        rc = orte_rml.ft_event(OPAL_CRS_CHECKPOINT);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        return result;
    }

    /******** Continue Recovery ********/
    if (OPAL_CRS_CONTINUE == state) {
        OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output,
                             "ess:env ft_event(%2d) - %s is Continuing",
                             state, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

        /* Reverse of the checkpoint order: RML -> OOB, routed, SnapC */
        rc = orte_rml.ft_event(OPAL_CRS_CONTINUE);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        rc = orte_routed.ft_event(OPAL_CRS_CONTINUE);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        rc = orte_snapc.ft_event(OPAL_CRS_CONTINUE);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        if (opal_cr_continue_like_restart) {
            env_ft_fence_and_flush(state);
        }

        return result;
    }

    /* OPAL_CRS_TERM and error/unknown states: nothing to do */
    if (OPAL_CRS_RESTART != state) {
        return result;
    }

    /******** Restart Recovery ********/
    OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output,
                         "ess:env ft_event(%2d) - %s is Restarting",
                         state, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /*
     * The steps below are meant to follow the ess init() function.
     * Start by resetting the contact information; a failure is
     * remembered but does not stop the restart.
     */
    rc = env_set_name();
    if (ORTE_SUCCESS != rc) {
        result = rc;
    }

    /* Notify RML -> OOB */
    rc = orte_rml.ft_event(OPAL_CRS_RESTART);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /*
     * Restart the routed framework.  The process type is temporarily
     * switched to ORTE_PROC_TOOL so that finalize does not try to
     * contact the daemon.
     * NOTE(review): if finalize fails, the early return leaves
     * proc_type set to ORTE_PROC_TOOL -- confirm this is intended.
     */
    saved_type = orte_process_info.proc_type;
    orte_process_info.proc_type = ORTE_PROC_TOOL;
    rc = orte_routed.finalize();
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    orte_process_info.proc_type = saved_type;

    rc = orte_routed.initialize();
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* Restart the PLM - does nothing at the moment, kept for completeness */
    rc = orte_plm.finalize();
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    rc = orte_plm.init();
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* RML - enable communications */
    rc = orte_rml.enable_comm();
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* Notify routed */
    rc = orte_routed.ft_event(OPAL_CRS_RESTART);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* Fence, then drop the restart-only files if requested */
    env_ft_fence_and_flush(state);

    /* Session directory re-init; a failure is remembered, not fatal here */
    if (orte_create_session_dirs) {
        rc = orte_session_dir(true,
                              orte_process_info.tmpdir_base,
                              orte_process_info.nodename,
                              NULL, /* Batch ID -- Not used */
                              ORTE_PROC_MY_NAME);
        if (ORTE_SUCCESS != rc) {
            result = rc;
        }

        opal_output_set_output_file_info(orte_process_info.proc_session_dir,
                                         "output-", NULL, NULL);
    }

    /* Notify SnapC last */
    rc = orte_snapc.ft_event(OPAL_CRS_RESTART);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    return result;
}
#endif

Просмотреть файл

@ -42,13 +42,13 @@
#include "opal/class/opal_list.h" #include "opal/class/opal_list.h"
#include "opal/mca/event/event.h" #include "opal/mca/event/event.h"
#include "opal/runtime/opal.h" #include "opal/runtime/opal.h"
#include "opal/runtime/opal_cr.h"
#include "opal/util/arch.h" #include "opal/util/arch.h"
#include "opal/util/argv.h" #include "opal/util/argv.h"
#include "opal/util/if.h" #include "opal/util/if.h"
#include "opal/util/os_path.h" #include "opal/util/os_path.h"
#include "opal/util/output.h" #include "opal/util/output.h"
#include "opal/util/opal_environ.h"
#include "opal/util/malloc.h" #include "opal/util/malloc.h"
#include "opal/util/basename.h" #include "opal/util/basename.h"
#include "opal/util/fd.h" #include "opal/util/fd.h"
@ -72,10 +72,6 @@
#include "orte/mca/plm/plm.h" #include "orte/mca/plm/plm.h"
#include "orte/mca/odls/base/base.h" #include "orte/mca/odls/base/base.h"
#include "orte/mca/rmaps/base/base.h" #include "orte/mca/rmaps/base/base.h"
#if OPAL_ENABLE_FT_CR == 1
#include "orte/mca/snapc/base/base.h"
#include "orte/mca/sstore/base/base.h"
#endif
#include "orte/mca/filem/base/base.h" #include "orte/mca/filem/base/base.h"
#include "orte/mca/state/base/base.h" #include "orte/mca/state/base/base.h"
#include "orte/mca/state/state.h" #include "orte/mca/state/state.h"
@ -95,7 +91,6 @@
#include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_quit.h" #include "orte/runtime/orte_quit.h"
#include "orte/runtime/orte_cr.h"
#include "orte/runtime/orte_locks.h" #include "orte/runtime/orte_locks.h"
#include "orte/mca/ess/ess.h" #include "orte/mca/ess/ess.h"
@ -150,6 +145,7 @@ static int rte_init(void)
orte_topology_t *t; orte_topology_t *t;
opal_list_t transports; opal_list_t transports;
orte_ess_base_signal_t *sig; orte_ess_base_signal_t *sig;
opal_value_t val;
/* run the prolog */ /* run the prolog */
if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
@ -473,6 +469,22 @@ static int rte_init(void)
proc->pid = orte_process_info.pid; proc->pid = orte_process_info.pid;
orte_oob_base_get_addr(&proc->rml_uri); orte_oob_base_get_addr(&proc->rml_uri);
orte_process_info.my_hnp_uri = strdup(proc->rml_uri); orte_process_info.my_hnp_uri = strdup(proc->rml_uri);
/* store it in the local PMIx repo for later retrieval */
OBJ_CONSTRUCT(&val, opal_value_t);
val.key = OPAL_PMIX_PROC_URI;
val.type = OPAL_STRING;
val.data.string = proc->rml_uri;
if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_NAME, &val))) {
ORTE_ERROR_LOG(ret);
val.key = NULL;
val.data.string = NULL;
OBJ_DESTRUCT(&val);
error = "store uri";
goto error;
}
val.key = NULL;
val.data.string = NULL;
OBJ_DESTRUCT(&val);
/* we are also officially a daemon, so better update that field too */ /* we are also officially a daemon, so better update that field too */
orte_process_info.my_daemon_uri = strdup(proc->rml_uri); orte_process_info.my_daemon_uri = strdup(proc->rml_uri);
proc->state = ORTE_PROC_STATE_RUNNING; proc->state = ORTE_PROC_STATE_RUNNING;
@ -684,46 +696,7 @@ static int rte_init(void)
error = "orte_filem_base_select"; error = "orte_filem_base_select";
goto error; goto error;
} }
#if OPAL_ENABLE_FT_CR == 1
/*
* Setup the SnapC
*/
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_snapc_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_snapc_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_sstore_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_sstore_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_snapc_base_select(ORTE_PROC_IS_HNP, ORTE_PROC_IS_APP))) {
ORTE_ERROR_LOG(ret);
error = "orte_snapc_base_select";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_sstore_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_sstore_base_select";
goto error;
}
/* For HNP, ORTE doesn't need the OPAL CR stuff */
opal_cr_set_enabled(false);
#else
opal_cr_set_enabled(false);
#endif
/*
* Initialize the CR setup
* Note: Always do this, even in non-FT builds.
* If we don't some user level tools may hang.
*/
if (ORTE_SUCCESS != (ret = orte_cr_init())) {
ORTE_ERROR_LOG(ret);
error = "orte_cr_init";
goto error;
}
/* setup the dfs framework */ /* setup the dfs framework */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) { if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) {
ORTE_ERROR_LOG(ret); ORTE_ERROR_LOG(ret);
@ -898,8 +871,6 @@ static void rte_abort(int status, bool report)
* - Assume errmgr cleans up child processes before we exit. * - Assume errmgr cleans up child processes before we exit.
*/ */
/* CRS cleanup since it may have a named pipe and thread active */
orte_cr_finalize();
/* ensure we scrub the session directory tree */ /* ensure we scrub the session directory tree */
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
/* - Clean out the global structures /* - Clean out the global structures

Просмотреть файл

@ -11,6 +11,7 @@
# All rights reserved. # All rights reserved.
# Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2017 IBM Corporation. All rights reserved. # Copyright (c) 2017 IBM Corporation. All rights reserved.
# Copyright (c) 2017-2018 Intel, Inc. All rights reserved.
# $COPYRIGHT$ # $COPYRIGHT$
# #
# Additional copyrights may follow # Additional copyrights may follow

Просмотреть файл

@ -2,7 +2,7 @@
# Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2013 Los Alamos National Security, LLC. # Copyright (c) 2013 Los Alamos National Security, LLC.
# All rights reserved. # All rights reserved.
# Copyright (c) 2014 Intel, Inc. All rights reserved # Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
# Copyright (c) 2017 IBM Corporation. All rights reserved. # Copyright (c) 2017 IBM Corporation. All rights reserved.
# $COPYRIGHT$ # $COPYRIGHT$
# #

Просмотреть файл

@ -10,6 +10,7 @@
# Copyright (c) 2004-2005 The Regents of the University of California. # Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved. # All rights reserved.
# Copyright (c) 2017 IBM Corporation. All rights reserved. # Copyright (c) 2017 IBM Corporation. All rights reserved.
# Copyright (c) 2017-2018 Intel, Inc. All rights reserved.
# $COPYRIGHT$ # $COPYRIGHT$
# #
# Additional copyrights may follow # Additional copyrights may follow

Просмотреть файл

@ -10,6 +10,7 @@
# Copyright (c) 2004-2005 The Regents of the University of California. # Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved. # All rights reserved.
# Copyright (c) 2017 IBM Corporation. All rights reserved. # Copyright (c) 2017 IBM Corporation. All rights reserved.
# Copyright (c) 2017-2018 Intel, Inc. All rights reserved.
# $COPYRIGHT$ # $COPYRIGHT$
# #
# Additional copyrights may follow # Additional copyrights may follow

Просмотреть файл

@ -10,6 +10,7 @@
# Copyright (c) 2004-2005 The Regents of the University of California. # Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved. # All rights reserved.
# Copyright (c) 2017 IBM Corporation. All rights reserved. # Copyright (c) 2017 IBM Corporation. All rights reserved.
# Copyright (c) 2017-2018 Intel, Inc. All rights reserved.
# $COPYRIGHT$ # $COPYRIGHT$
# #
# Additional copyrights may follow # Additional copyrights may follow

Просмотреть файл

@ -11,6 +11,7 @@
# All rights reserved. # All rights reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2017 IBM Corporation. All rights reserved. # Copyright (c) 2017 IBM Corporation. All rights reserved.
# Copyright (c) 2017-2018 Intel, Inc. All rights reserved.
# $COPYRIGHT$ # $COPYRIGHT$
# #
# Additional copyrights may follow # Additional copyrights may follow

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved. * Copyright (c) 2015-2018 Intel, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -40,7 +40,6 @@
#include "orte/mca/plm/plm.h" #include "orte/mca/plm/plm.h"
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/util/proc_info.h" #include "orte/util/proc_info.h"
#include "orte/runtime/orte_cr.h"
#include "orte/mca/ess/ess.h" #include "orte/mca/ess/ess.h"
#include "orte/mca/ess/base/base.h" #include "orte/mca/ess/base/base.h"
@ -125,7 +124,6 @@ static int rte_init(void)
opal_list_append(&flags, &val->super); opal_list_append(&flags, &val->super);
} }
/* do the standard tool init */ /* do the standard tool init */
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(&flags))) { if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(&flags))) {
ORTE_ERROR_LOG(ret); ORTE_ERROR_LOG(ret);
@ -176,9 +174,6 @@ static void rte_abort(int status, bool report)
* - Assume errmgr cleans up child processes before we exit. * - Assume errmgr cleans up child processes before we exit.
*/ */
/* CRS cleanup since it may have a named pipe and thread active */
orte_cr_finalize();
/* - Clean out the global structures /* - Clean out the global structures
* (not really necessary, but good practice) * (not really necessary, but good practice)
*/ */