1
1

It seems that the recent changes in the SDS and OOB exposed some invalid

assumptions in the FT restart code for the ORTE layer.

This fixes those problems by having the RML completely shut down and
restart the OOB framework (instead of just the module, as before).
This makes the restart path much easier to manage and to maintain as
the OOB changes in the future.

The SDS now does communication as part of its startup procedure, so
we need to make sure we restart the RML before the SDS so that it can
communicate properly.

The OOB base [close|open] functions each used a static bool to determine
whether they had been called previously. I needed to expose this boolean so
that I can close() then open() the OOB base in the restart procedure.
The functionality has not changed; we just now have the ability to
open/close the framework as many times as we need to, as long as we
always call them in that order. (So calling open twice in a row is still not
allowed, as before; it is only allowed if you open(), close(), then open() again.)

Things seem to be working now.

This commit was SVN r14515.
Этот коммит содержится в:
Josh Hursey 2007-04-25 19:51:52 +00:00
родитель 4b8bb70afb
Коммит 596062d34b
7 изменённых файлов: 129 добавлений и 64 удалений

Просмотреть файл

@ -49,7 +49,12 @@ extern "C" {
ORTE_DECLSPEC extern bool orte_oob_base_timing;
ORTE_DECLSPEC extern bool orte_oob_xcast_timing;
ORTE_DECLSPEC extern int orte_oob_xcast_mode;
/*
* Flag indicating if this framework has been opened
*/
ORTE_DECLSPEC extern bool orte_oob_base_already_opened;
/*
* OOB API
*/

Просмотреть файл

@ -31,13 +31,12 @@
int mca_oob_base_close(void)
{
opal_list_item_t* item;
static bool already_closed = false;
/* Sanity check. This may be able to be removed when the rml/oob
interface is re-worked (the current infrastructure may invoke
this function twice: once as a standalone, and once via the rml
oob component). */
if (already_closed) {
if (!orte_oob_base_already_opened) {
return ORTE_SUCCESS;
}
@ -66,7 +65,8 @@ int mca_oob_base_close(void)
free(mca_oob_base_exclude);
/* All done */
already_closed = true;
orte_oob_base_already_opened = false;
return ORTE_SUCCESS;
}

Просмотреть файл

@ -50,6 +50,8 @@ bool orte_oob_base_timing;
bool orte_oob_xcast_timing;
int orte_oob_xcast_mode;
bool orte_oob_base_already_opened = false;
/**
* Function for finding and opening either all MCA components, or the one
* that was specifically requested via a MCA parameter.
@ -58,13 +60,12 @@ int mca_oob_base_open(void)
{
int param, value;
char *mode;
static bool already_opened = false;
/* Sanity check. This may be able to be removed when the rml/oob
interface is re-worked (the current infrastructure may invoke
this function twice: once as a standalone, and once via the rml
oob component). */
if (already_opened) {
if (orte_oob_base_already_opened) {
return ORTE_SUCCESS;
}
@ -127,7 +128,8 @@ int mca_oob_base_open(void)
}
/* All done */
already_opened = true;
orte_oob_base_already_opened = true;
return ORTE_SUCCESS;
}

Просмотреть файл

@ -1682,55 +1682,11 @@ int mca_oob_tcp_ft_event(int state) {
OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
}
else if(OPAL_CRS_RESTART == state) {
/* Module cleanup */
/*
* Resume event processing
*/
opal_event_enable();
OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
mca_oob_tcp_fini();
OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock);
/* Clean up bad peer info */
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_subscriptions);
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peer_list);
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peers);
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peer_names);
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peer_free);
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_events);
/* do subset of mca_oob_tcp_component_open() */
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_subscriptions, opal_list_t);
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_peer_list, opal_list_t);
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_peers, opal_hash_table_t);
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_peer_names, opal_hash_table_t);
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_peer_free, opal_free_list_t);
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_events, opal_list_t);
mca_oob_tcp_component.tcp_shutdown = false;
mca_oob_tcp_component.tcp_listen_sd = -1;
mca_oob_tcp_component.tcp_match_count = 0;
mca_oob_tcp_component.tcp_last_copy_time = 0;
/* Do subset of mca_oob_tcp_component_init() */
opal_hash_table_init(&mca_oob_tcp_component.tcp_peers, 128);
opal_hash_table_init(&mca_oob_tcp_component.tcp_peer_names, 128);
opal_free_list_init(&mca_oob_tcp_component.tcp_peer_free,
sizeof(mca_oob_tcp_peer_t),
OBJ_CLASS(mca_oob_tcp_peer_t),
8, /* initial number */
mca_oob_tcp_component.tcp_peer_limit, /* maximum number */
8); /* increment to grow by */
/* Reset seed contact information */
if(NULL != orte_process_info.ns_replica_uri) {
mca_oob_set_contact_info(orte_process_info.ns_replica_uri);
}
if(NULL != orte_process_info.gpr_replica_uri) {
mca_oob_set_contact_info(orte_process_info.gpr_replica_uri);
}
mca_oob_tcp_init();
}
else if(OPAL_CRS_TERM == state ) {
;

Просмотреть файл

@ -456,12 +456,47 @@ int orte_rml_ftrm_ft_event(int state)
opal_output_verbose(20, rml_ftrm_output_handle,
"orte_rml_ftrm: ft_event()");
if(OPAL_CRS_CHECKPOINT == state) {
;
}
else if(OPAL_CRS_CONTINUE == state) {
;
}
else if(OPAL_CRS_RESTART == state) {
;
}
else if(OPAL_CRS_TERM == state ) {
;
}
else {
;
}
/*
* The wrapped component is responsible for calling the OOB modules
*/
if( NULL != wrapped_module.ft_event ) {
if( ORTE_SUCCESS != (ret = wrapped_module.ft_event(state))) {
return ret;
}
}
if(OPAL_CRS_CHECKPOINT == state) {
;
}
else if(OPAL_CRS_CONTINUE == state) {
;
}
else if(OPAL_CRS_RESTART == state) {
;
}
else if(OPAL_CRS_TERM == state ) {
;
}
else {
;
}
return ORTE_SUCCESS;
}

Просмотреть файл

@ -128,5 +128,72 @@ orte_rml_oob_close(void)
}
/**
 * Fault-tolerance event handler for the RML "oob" component.
 *
 * The event is first forwarded to the active OOB module. On a restart
 * event the whole OOB framework is then closed, reopened and
 * re-initialized, and the replica contact information is handed to the
 * OOB again.
 *
 * @param state  Checkpoint/restart state being signalled. This function
 *               compares it against OPAL_CRS_CHECKPOINT, OPAL_CRS_CONTINUE,
 *               OPAL_CRS_RESTART and OPAL_CRS_TERM; only OPAL_CRS_RESTART
 *               triggers extra work here.
 *
 * @return ORTE_SUCCESS on success; otherwise the first non-success code
 *         returned by the OOB module's ft_event or by the OOB base
 *         close/open/init calls (the error is also logged).
 */
int orte_rml_oob_ft_event(int state) {
    int exit_status = ORTE_SUCCESS;
    int ret;

    /* No RML-level work is needed before the OOB module sees the event. */

    /* Let the active OOB module react to the event first. */
    if( ORTE_SUCCESS != (ret = mca_oob.oob_ft_event(state)) ) {
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Checkpoint, continue, term and unknown states need nothing further.
     * On restart, rebuild the entire OOB framework rather than only the
     * module.
     */
    if(OPAL_CRS_RESTART == state) {
        /* close() must precede open(): the base tracks its opened state
         * in orte_oob_base_already_opened and ignores a repeated open. */
        if( ORTE_SUCCESS != (ret = mca_oob_base_close())) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }

        if( ORTE_SUCCESS != (ret = mca_oob_base_open())) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }

        if( ORTE_SUCCESS != (ret = mca_oob_base_init())) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }

        /* Re-register the replica contact info with the fresh OOB.
         * NOTE(review): return values are not checked here, as in the
         * original code - confirm this is intended to be best-effort. */
        if(NULL != orte_process_info.ns_replica_uri) {
            mca_oob_set_contact_info(orte_process_info.ns_replica_uri);
        }

        if(NULL != orte_process_info.gpr_replica_uri) {
            mca_oob_set_contact_info(orte_process_info.gpr_replica_uri);
        }
    }

 cleanup:
    return exit_status;
}

Просмотреть файл

@ -422,6 +422,14 @@ static int orte_cr_coord_post_restart(void) {
exit_status = ret;
}
/*
* Notify RML & OOB
*/
if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_RESTART))) {
exit_status = ret;
goto cleanup;
}
/*
* Startup Discovery Service:
* - Connect to the universe
@ -491,14 +499,6 @@ static int orte_cr_coord_post_restart(void) {
exit_status = ret;
}
/*
* Notify RML & OOB
*/
if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_RESTART))) {
exit_status = ret;
goto cleanup;
}
/*
* Notify NS
*/