Fix C/R functionality with the new libevent. This fixes the case where the restarted process cannot be checkpointed or finalized.
Short Version:
--------------
The event engine needs to be flushed on restart so that it does not use old/stale file descriptors.

Long Version:
-------------
The problem was that the restarted process was waiting for the socket to the local daemon to finish establishing during the 'sync' operation. The core problem was that the daemon was sending a header of 36 bytes, but the restarted process only received 35 bytes of the message, so the restarted process became stuck waiting for the last byte to arrive.

After many hours of digging, I figured out that the event engine was using the same file descriptor for its evsig_cb functionality (to signal itself when a signal arrives). So when the daemon wrote into the new fd, the event engine was stealing the first byte (*shakes fist at event engine*) before the recv() could be posted.

The solution is to use the event_reinit() function on restart to re-establish the now-stale file descriptors in the event engine. This seems to have fixed the problem.

A few other minor things:
-------------------------
* Add a check to make sure the event engine is balanced in its init/finalize.
* Add the opal_event_base_close() to the BLCR restart exec function (still not 100% sure it is needed, but there it is).

This commit was SVN r24296.
Этот коммит содержится в:
родитель
e4d13d338f
Коммит
66af515061
@ -512,7 +512,7 @@ int opal_crs_blcr_restart(opal_crs_base_snapshot_t *base_snapshot, bool spawn_ch
|
||||
* along very well.
|
||||
*/
|
||||
opal_progress_finalize();
|
||||
/* opal_event.finalize(); JJH: Is something like this still needed? */
|
||||
opal_event_base_close();
|
||||
|
||||
if (!spawn_child) {
|
||||
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
|
||||
|
@ -20,6 +20,8 @@
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
extern int opal_event_base_inited;
|
||||
|
||||
/**
|
||||
* Initialize the event MCA framework
|
||||
*
|
||||
|
@ -19,6 +19,8 @@ int opal_event_base_close(void)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
|
||||
opal_event_base_inited--;
|
||||
|
||||
/* release the event base */
|
||||
opal_event_base_finalize(opal_event_base);
|
||||
|
||||
|
@ -34,11 +34,16 @@
|
||||
int opal_event_base_output = -1;
|
||||
opal_list_t opal_event_components;
|
||||
opal_event_base_t *opal_event_base=NULL;
|
||||
int opal_event_base_inited = 0;
|
||||
|
||||
int opal_event_base_open(void)
|
||||
{
|
||||
int value, rc = OPAL_SUCCESS;
|
||||
|
||||
|
||||
if( opal_event_base_inited++ < 0 ) {
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/* Debugging / verbose output */
|
||||
mca_base_param_reg_int_name("event", "base_verbose",
|
||||
"Verbosity level of the event framework",
|
||||
|
@ -114,6 +114,8 @@ OPAL_DECLSPEC void opal_event_base_finalize(opal_event_base_t *base);
|
||||
|
||||
OPAL_DECLSPEC int opal_event_init(void);
|
||||
|
||||
OPAL_DECLSPEC int opal_event_reinit(opal_event_base_t *base);
|
||||
|
||||
/* thread support APIs */
|
||||
#if OPAL_EVENT_HAVE_THREAD_SUPPORT
|
||||
#ifdef WIN32
|
||||
|
@ -289,3 +289,8 @@ int opal_event_init(void)
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * Re-initialize the event engine state behind an OPAL event base.
 *
 * Thin wrapper: forwards the wrapped base (evbase->base) to event_reinit()
 * and returns that call's result unchanged.
 *
 * @param evbase  OPAL event base to re-initialize. It is dereferenced
 *                without a NULL check, so the caller must pass a valid,
 *                opened base.
 *
 * @return Whatever event_reinit() returns.
 *         NOTE(review): presumably 0 on success and -1 on failure, as in
 *         libevent -- confirm against the bundled libevent version.
 */
int opal_event_reinit(opal_event_base_t *evbase)
|
||||
{
|
||||
return event_reinit(evbase->base);
|
||||
}
|
||||
|
@ -770,6 +770,13 @@ int opal_cr_coord(int state)
|
||||
else if (OPAL_CRS_RESTART == state ) {
|
||||
/* Do Restart Phase work */
|
||||
|
||||
/*
|
||||
* Re-initialize the event engine
|
||||
* Otherwise it may/will use stale file descriptors which will disrupt
|
||||
* the intended users of the soon-to-be newly assigned file descriptors.
|
||||
*/
|
||||
opal_event_reinit(opal_event_base);
|
||||
|
||||
/*
|
||||
* Flush if() functionality, since it caches system specific info.
|
||||
*/
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user