1
1

Fix a checkpoint/restart bug that causes a restarted application to occasionally throw a SIGSEGV or SIGPIPE due to invalid socket descriptors.

The problem was caused by a bad ordering between the restart of the ORTE-level TCP connections (in the OOB, the out-of-band communication layer) and the Open MPI-level TCP connections (BTLs). Before this commit, ORTE would shut down and restart the OOB completely before the OMPI level restarted its TCP connections. As a result, a socket descriptor used by the OMPI level at checkpoint could be assigned to the ORTE level on restart. The OMPI level had no knowledge that the socket descriptor it was previously using had been recycled, so it closed it on restart. This broke the ORTE level, because its newly created socket descriptor was closed without its knowledge.

The fix is to have the OMPI level shut down its TCP connections, allow the ORTE level to restart, and then allow the OMPI level to restart its connections. This seems obvious, and I'm surprised that this bug has not cropped up sooner. I'm confident that this specific problem has been fixed with this commit.

Thanks to Eric Roman and Tamer El Sayed for their help in identifying this problem, and for their patience while I was fixing it.

 * Add a new state {{{OPAL_CRS_RESTART_PRE}}}. This state identifies when we are on the down slope of the INC (finalize-like), which is useful when you want to close, but not reopen, a component set for fear of interfering with a lower level.
 * Use this new state in OMPI level coordination. Here we want to make sure to play well with both the OMPI/BTL/TCP and ORTE/OOB/TCP components.
 * Update ft_event functions in PML and BML to handle the new restart state.
 * Add an additional flag to the error output in OOB/TCP so we can see what the socket descriptor was on failure as this can be helpful in debugging.

This commit was SVN r18276.
Этот коммит содержится в:
Josh Hursey 2008-04-24 17:54:22 +00:00
родитель 3ccac4f803
Коммит 2c736873bb
6 изменённых файлов: 92 добавлений и 52 удалений

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
@ -40,11 +40,13 @@
#include "bml_r2.h"
#include "bml_r2_ft.h"
int mca_bml_r2_ft_event(int state) {
int mca_bml_r2_ft_event(int state)
{
ompi_proc_t** procs = NULL;
size_t num_procs;
size_t btl_idx;
int ret, p;
int loc_state;
if(OPAL_CRS_CHECKPOINT == state) {
/* Do nothing for now */
@ -52,7 +54,10 @@ int mca_bml_r2_ft_event(int state) {
else if(OPAL_CRS_CONTINUE == state) {
/* Since nothing in Checkpoint, we are fine here */
}
else if(OPAL_CRS_RESTART == state) {
else if(OPAL_CRS_RESTART_PRE == state ) {
/* Nothing here */
}
else if(OPAL_CRS_RESTART == state ) {
procs = ompi_proc_all(&num_procs);
if(NULL == procs) {
return OMPI_ERR_OUT_OF_RESOURCE;
@ -65,6 +70,20 @@ int mca_bml_r2_ft_event(int state) {
;
}
/* Never call the ft_event functions attached to the BTLs on the second
* pass of RESTART since on the first pass they were unloaded and therefore
* no longer exist.
*/
if( OPAL_CRS_RESTART != state ) {
/* Since we only ever call into the BTLs once during the first restart
* pass, just lie to them on this pass for a bit of local clarity.
*/
if( OPAL_CRS_RESTART_PRE == state ) {
loc_state = OPAL_CRS_RESTART;
} else {
loc_state = state;
}
/*
* Call ft_event in:
* - BTL modules
@ -81,7 +100,7 @@ int mca_bml_r2_ft_event(int state) {
opal_output_verbose(10, ompi_cr_output,
"bml:r2: ft_event: Notify the %s BTL.\n",
(mca_bml_r2.btl_modules[btl_idx])->btl_component->btl_version.mca_component_name);
if(OMPI_SUCCESS != (ret = (mca_bml_r2.btl_modules[btl_idx])->btl_ft_event(state) ) ) {
if(OMPI_SUCCESS != (ret = (mca_bml_r2.btl_modules[btl_idx])->btl_ft_event(loc_state) ) ) {
continue;
}
}
@ -94,11 +113,12 @@ int mca_bml_r2_ft_event(int state) {
opal_output_verbose(10, ompi_cr_output,
"bml:r2: ft_event: Notify the %s MPool.\n",
(mca_bml_r2.btl_modules[btl_idx])->btl_mpool->mpool_component->mpool_version.mca_component_name);
if(OMPI_SUCCESS != (ret = (mca_bml_r2.btl_modules[btl_idx])->btl_mpool->mpool_ft_event(state) ) ) {
if(OMPI_SUCCESS != (ret = (mca_bml_r2.btl_modules[btl_idx])->btl_mpool->mpool_ft_event(loc_state) ) ) {
continue;
}
}
}
}
if(OPAL_CRS_CHECKPOINT == state) {
;
@ -106,8 +126,7 @@ int mca_bml_r2_ft_event(int state) {
else if(OPAL_CRS_CONTINUE == state) {
;
}
else if(OPAL_CRS_RESTART == state) {
else if(OPAL_CRS_RESTART_PRE == state ) {
mca_bml_r2.num_btl_modules = 0;
mca_bml_r2.num_btl_progress = 0;
@ -136,6 +155,8 @@ int mca_bml_r2_ft_event(int state) {
return ret;
}
}
else if(OPAL_CRS_RESTART == state ) {
/*
* Re-open the BTL framework to get the full list of components.
*/

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
@ -465,7 +465,10 @@ int mca_pml_ob1_ft_event( int state )
else if(OPAL_CRS_CONTINUE == state) {
;
}
else if(OPAL_CRS_RESTART == state) {
else if(OPAL_CRS_RESTART_PRE == state ) {
/* Nothing here */
}
else if(OPAL_CRS_RESTART == state ) {
/*
* Get a list of processes
*/
@ -519,7 +522,10 @@ int mca_pml_ob1_ft_event( int state )
else if(OPAL_CRS_CONTINUE == state) {
;
}
else if(OPAL_CRS_RESTART == state) {
else if(OPAL_CRS_RESTART_PRE == state ) {
/* Nothing here */
}
else if(OPAL_CRS_RESTART == state ) {
/*
* Exchange the modex information once again.
* BTLs will have republished their modex information.

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
@ -294,14 +294,25 @@ static int ompi_cr_coord_pre_ckpt(void) {
}
static int ompi_cr_coord_pre_restart(void) {
/*
* Can not really do much until ORTE is up and running,
* so defer action until the post_restart function.
*/
int ret, exit_status = OMPI_SUCCESS;
opal_output_verbose(10, ompi_cr_output,
"ompi_cr: coord_pre_restart: ompi_cr_coord_pre_restart()");
return OMPI_SUCCESS;
/*
* Notify PML
* - Will notify BML and BTL's
* - The intention here is to have the PML shutdown all the old components
* and handles. On the second pass (once ORTE is restarted) we can
* reconnect processes.
*/
if( ORTE_SUCCESS != (ret = mca_pml.pml_ft_event(OPAL_CRS_RESTART_PRE))) {
exit_status = ret;
goto cleanup;
}
cleanup:
return exit_status;
}
static int ompi_cr_coord_pre_continue(void) {

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -60,7 +60,8 @@ extern "C" {
*/
enum opal_crs_state_type_t {
OPAL_CRS_CHECKPOINT,
OPAL_CRS_RESTART,
OPAL_CRS_RESTART_PRE,
OPAL_CRS_RESTART, /* RESTART_POST */
OPAL_CRS_CONTINUE,
OPAL_CRS_TERM,
OPAL_CRS_RUNNING,

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -810,7 +810,7 @@ static void opal_cr_sigpipe_debug_signal_handler (int signo)
return;
}
opal_output_verbose(10, opal_cr_output,
opal_output(0,
"opal_cr: sigpipe_debug: Debug SIGPIPE [%d]: PID (%d)\n",
signo, getpid());
while(sleeper == 1 ) {

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
@ -273,11 +273,12 @@ bool mca_oob_tcp_msg_send_handler(mca_oob_tcp_msg_t* msg, struct mca_oob_tcp_pee
else if (opal_socket_errno == EAGAIN || opal_socket_errno == EWOULDBLOCK)
return false;
else {
opal_output(0, "%s-%s mca_oob_tcp_msg_send_handler: writev failed: %s (%d)",
opal_output(0, "%s-%s mca_oob_tcp_msg_send_handler: writev failed: %s (%d) [sd = %d]",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&(peer->peer_name)),
strerror(opal_socket_errno),
opal_socket_errno);
opal_socket_errno,
peer->peer_sd);
mca_oob_tcp_peer_close(peer);
msg->msg_rc = ORTE_ERR_CONNECTION_FAILED;
return true;