
Short Version:
--------------
Event engine needs to be flushed so it does not use old/stale file descriptors.

Long Version:
-------------
The problem was that the restarted process was waiting for the socket to the local daemon to finish establishing during the 'sync' operation. The core problem was that the daemon was sending a header of 36 bytes, but the restarted process only received 35 bytes of the message. So the restarted process became stuck waiting for the last byte to arrive.

After many hours of digging, I figured out that the event engine was using the same file descriptor for its evsig_cb functionality (to signal itself when a signal arrives). So when the daemon wrote into the new fd, the event engine was stealing the first byte (*shakes fist at event engine*) before the recv() could be posted.

The solution is to use the event_reinit() function on restart to re-establish the now-stale file descriptors in the event engine. This seems to have fixed the problem.

A few other minor things:
-------------------------
* Add a check to make sure the event engine is balanced in its init/finalize
* Add the opal_event_base_close() to the BLCR restart exec function (still not 100% sure it is needed, but there it is).

This commit was SVN r24296.
87 строки
2.1 KiB
C
87 строки
2.1 KiB
C
/*
|
|
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
|
|
#include "opal_config.h"
|
|
|
|
#include "opal/constants.h"
|
|
#include "opal/util/output.h"
|
|
#include "opal/mca/mca.h"
|
|
#include "opal/mca/base/base.h"
|
|
#include "opal/mca/base/mca_base_param.h"
|
|
|
|
#include "opal/mca/event/event.h"
|
|
#include "opal/mca/event/base/base.h"
|
|
|
|
|
|
/*
|
|
* The following file was created by configure. It contains extern
|
|
* statements and the definition of an array of pointers to each
|
|
* component's public mca_base_component_t struct.
|
|
*/
|
|
#include "opal/mca/event/base/static-components.h"
|
|
|
|
|
|
/*
|
|
* Globals
|
|
*/
|
|
/* Output stream id for this framework; opal_event_base_open() sets it
 * to the result of opal_output_open() when the "base_verbose"
 * parameter is non-zero, and to -1 otherwise. */
int opal_event_base_output = -1;
|
|
/* List handed to mca_base_components_open() by opal_event_base_open()
 * to receive the framework's components. */
opal_list_t opal_event_components;
|
|
/* Event base obtained from opal_event_base_create() in
 * opal_event_base_open(); NULL until then. */
opal_event_base_t *opal_event_base=NULL;
|
|
/* Counter bumped by opal_event_base_open().
 * NOTE(review): presumably decremented by the matching close routine
 * to keep init/finalize balanced -- confirm against that function. */
int opal_event_base_inited = 0;
|
|
|
|
int opal_event_base_open(void)
|
|
{
|
|
int value, rc = OPAL_SUCCESS;
|
|
|
|
if( opal_event_base_inited++ < 0 ) {
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
/* Debugging / verbose output */
|
|
mca_base_param_reg_int_name("event", "base_verbose",
|
|
"Verbosity level of the event framework",
|
|
false, false,
|
|
0, &value);
|
|
if (0 != value) {
|
|
opal_event_base_output = opal_output_open(NULL);
|
|
} else {
|
|
opal_event_base_output = -1;
|
|
}
|
|
|
|
/* to support tools such as ompi_info, add the components
|
|
* to a list
|
|
*/
|
|
OBJ_CONSTRUCT(&opal_event_components, opal_list_t);
|
|
if (OPAL_SUCCESS !=
|
|
mca_base_components_open("event", opal_event_base_output,
|
|
mca_event_base_static_components,
|
|
&opal_event_components, true)) {
|
|
return OPAL_ERROR;
|
|
}
|
|
|
|
/* init the lib */
|
|
if (OPAL_SUCCESS != (rc = opal_event_init())) {
|
|
return rc;
|
|
}
|
|
|
|
/* Declare our intent to use threads. If event library internal
|
|
* thread support was not enabled during configuration, this
|
|
* function defines to no-op
|
|
*/
|
|
opal_event_use_threads();
|
|
|
|
/* get our event base */
|
|
if (NULL == (opal_event_base = opal_event_base_create())) {
|
|
rc = OPAL_ERROR;
|
|
}
|
|
|
|
return rc;
|
|
}
|