
assumptions in the FT restart code for the ORTE layer. This fixes those problems by having the RML completely shut down and restart the OOB framework (instead of just the module, as before). This makes the restart path much easier to manage and more maintainable as the OOB changes in the future. The SDS now does communication as part of its startup procedure, so we need to make sure we restart the RML before the SDS so that it can communicate properly. OOB base [close|open] used a static bool to determine whether they had been called previously. I needed to expose this boolean so that I can close() and then open() the OOB base in the restart procedure. The functionality has not changed; we just now have the ability to open/close the framework as many times as we need to, as long as we always call them in that order. (So, as before, calling open twice in a row is not allowed; a second open() is only allowed after an intervening close(), i.e. open(), close(), then open() again.) Things seem to be working now. This commit was SVN r14515.
200 строки
5.2 KiB
C
200 строки
5.2 KiB
C
/*
|
|
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "orte_config.h"
|
|
#include "orte/orte_constants.h"
|
|
#include "opal/util/output.h"
|
|
#include "opal/mca/base/base.h"
|
|
#include "opal/mca/base/mca_base_param.h"
|
|
#include "orte/mca/rml/base/base.h"
|
|
#include "orte/mca/oob/oob.h"
|
|
#include "orte/mca/oob/base/base.h"
|
|
#include "rml_oob.h"
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
|
|
/* File-local component hooks; their definitions appear later in this file. */
static int orte_rml_oob_open(void);
static int orte_rml_oob_close(void);
static orte_rml_module_t *orte_rml_oob_init(int *priority);
|
|
|
|
|
|
/**
|
|
* component definition
|
|
*/
|
|
orte_rml_component_t mca_rml_oob_component = {
|
|
/* First, the mca_base_component_t struct containing meta
|
|
information about the component itself */
|
|
|
|
{
|
|
/* Indicate that we are a rml v1.0.0 component (which also
|
|
implies a specific MCA version) */
|
|
|
|
ORTE_RML_BASE_VERSION_1_0_0,
|
|
|
|
"oob", /* MCA component name */
|
|
ORTE_MAJOR_VERSION, /* MCA component major version */
|
|
ORTE_MINOR_VERSION, /* MCA component minor version */
|
|
ORTE_RELEASE_VERSION, /* MCA component release version */
|
|
orte_rml_oob_open, /* component open */
|
|
orte_rml_oob_close, /* component close */
|
|
},
|
|
|
|
/* Next the MCA v1.0.0 component meta data */
|
|
{
|
|
/* The component is checkpoint ready */
|
|
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
|
},
|
|
orte_rml_oob_init
|
|
};
|
|
|
|
orte_rml_module_t orte_rml_oob_module = {
|
|
mca_oob_base_module_init,
|
|
NULL,
|
|
(orte_rml_module_get_uri_fn_t)mca_oob_get_contact_info,
|
|
(orte_rml_module_set_uri_fn_t)mca_oob_set_contact_info,
|
|
(orte_rml_module_parse_uris_fn_t)mca_oob_parse_contact_info,
|
|
(orte_rml_module_ping_fn_t)mca_oob_ping,
|
|
(orte_rml_module_send_fn_t)mca_oob_send,
|
|
(orte_rml_module_send_nb_fn_t)mca_oob_send_nb,
|
|
(orte_rml_module_send_buffer_fn_t)mca_oob_send_packed,
|
|
(orte_rml_module_send_buffer_nb_fn_t)mca_oob_send_packed_nb,
|
|
(orte_rml_module_recv_fn_t)mca_oob_recv,
|
|
(orte_rml_module_recv_nb_fn_t)mca_oob_recv_nb,
|
|
(orte_rml_module_recv_buffer_fn_t)mca_oob_recv_packed,
|
|
(orte_rml_module_recv_buffer_nb_fn_t)mca_oob_recv_packed_nb,
|
|
(orte_rml_module_recv_cancel_fn_t)mca_oob_recv_cancel,
|
|
(orte_rml_module_xcast_fn_t)mca_oob_xcast,
|
|
(orte_rml_module_exception_fn_t)mca_oob_add_exception_handler,
|
|
(orte_rml_module_exception_fn_t)mca_oob_del_exception_handler,
|
|
(orte_rml_module_ft_event_fn_t)orte_rml_oob_ft_event
|
|
};
|
|
|
|
|
|
static orte_rml_module_t* orte_rml_oob_init(int* priority)
|
|
{
|
|
if(mca_oob_base_init() != ORTE_SUCCESS)
|
|
return NULL;
|
|
*priority = 1;
|
|
return &orte_rml_oob_module;
|
|
}
|
|
|
|
|
|
/*
|
|
* initialize the underlying oob infrastructure so that all the
|
|
* pointers in the RML struct can be valid.
|
|
*/
|
|
static int
|
|
orte_rml_oob_open(void)
|
|
{
|
|
int rc;
|
|
|
|
if (ORTE_SUCCESS != (rc = mca_oob_base_open())) {
|
|
ORTE_ERROR_LOG(rc);
|
|
return rc;
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
|
|
/*
|
|
* shut down the OOB, since we started it.
|
|
*/
|
|
static int
|
|
orte_rml_oob_close(void)
|
|
{
|
|
int rc;
|
|
|
|
|
|
if (ORTE_SUCCESS != (rc = mca_oob_base_close())) {
|
|
return rc;
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
int orte_rml_oob_ft_event(int state) {
|
|
int exit_status = ORTE_SUCCESS;
|
|
int ret;
|
|
|
|
if(OPAL_CRS_CHECKPOINT == state) {
|
|
;
|
|
}
|
|
else if(OPAL_CRS_CONTINUE == state) {
|
|
;
|
|
}
|
|
else if(OPAL_CRS_RESTART == state) {
|
|
;
|
|
}
|
|
else if(OPAL_CRS_TERM == state ) {
|
|
;
|
|
}
|
|
else {
|
|
;
|
|
}
|
|
|
|
if( ORTE_SUCCESS != (ret = mca_oob.oob_ft_event(state)) ) {
|
|
ORTE_ERROR_LOG(ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
|
|
if(OPAL_CRS_CHECKPOINT == state) {
|
|
;
|
|
}
|
|
else if(OPAL_CRS_CONTINUE == state) {
|
|
;
|
|
}
|
|
else if(OPAL_CRS_RESTART == state) {
|
|
if( ORTE_SUCCESS != (ret = mca_oob_base_close())) {
|
|
ORTE_ERROR_LOG(ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
if( ORTE_SUCCESS != (ret = mca_oob_base_open())) {
|
|
ORTE_ERROR_LOG(ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
if( ORTE_SUCCESS != (ret = mca_oob_base_init())) {
|
|
ORTE_ERROR_LOG(ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
if(NULL != orte_process_info.ns_replica_uri) {
|
|
mca_oob_set_contact_info(orte_process_info.ns_replica_uri);
|
|
}
|
|
|
|
if(NULL != orte_process_info.gpr_replica_uri) {
|
|
mca_oob_set_contact_info(orte_process_info.gpr_replica_uri);
|
|
}
|
|
}
|
|
else if(OPAL_CRS_TERM == state ) {
|
|
;
|
|
}
|
|
else {
|
|
;
|
|
}
|
|
|
|
cleanup:
|
|
return exit_status;
|
|
}
|