Fix a problem in the plm "failed to start" code observed by Jeff. When we were unable to launch on a specific node because it didn't exist or was down, the system would hang and/or segv. The hang occurred because we were "firing" the orted exit trigger before its timer event had been defined - thus "locking" that one-shot trigger and preventing it from firing when we were actually ready to use it.
The segv was caused by the fact that we don't really know which daemon failed to start (at least, in most cases), so we didn't set a pointer to the aborted proc object. All we really wanted, though, was to ensure that mpirun returned a non-zero exit status, so the fix was to simply return the default error status. This commit was SVN r19754.
Этот коммит содержится в:
родитель
f0fe8ddb59
Коммит
48c3de1865
@ -383,7 +383,7 @@ CLEANUP:
|
||||
ORTE_NAME_PRINT(&mev->sender)));
|
||||
|
||||
if (orted_failed_launch) {
|
||||
orte_errmgr.incomplete_start(ORTE_PROC_MY_NAME->jobid, jdatorted->aborted_proc->exit_code);
|
||||
orte_errmgr.incomplete_start(ORTE_PROC_MY_NAME->jobid, ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
} else {
|
||||
orted_num_callback++;
|
||||
}
|
||||
|
@ -284,6 +284,7 @@ static void orte_plm_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
|
||||
unsigned long deltat;
|
||||
orte_std_cntr_t cnt=1;
|
||||
uint8_t flag;
|
||||
orte_job_t *jdata;
|
||||
|
||||
if (! WIFEXITED(status) || ! WEXITSTATUS(status) == 0) { /* if abnormal exit */
|
||||
/* if we are not the HNP, send a message to the HNP alerting it
|
||||
@ -305,12 +306,16 @@ static void orte_plm_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
|
||||
OBJ_DESTRUCT(&buf);
|
||||
} else {
|
||||
orte_proc_t *daemon=(orte_proc_t*)cbdata;
|
||||
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s daemon %d failed with status %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(int)daemon->name.vpid, WEXITSTATUS(status)));
|
||||
/* note that this daemon failed */
|
||||
daemon->state = ORTE_PROC_STATE_FAILED_TO_START;
|
||||
/* increment the #daemons terminated so we will exit properly */
|
||||
jdata->num_terminated++;
|
||||
/* report that the daemon has failed so we can exit */
|
||||
orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, pid, status, ORTE_JOB_STATE_FAILED_TO_START);
|
||||
}
|
||||
|
@ -384,7 +384,7 @@ int orte_daemon(int argc, char *argv[])
|
||||
* fd as orterun so that orte_comm can wake either of us up
|
||||
* since we share that code
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_wait_event(&orted_exit_event, &orte_exit, shutdown_callback))) {
|
||||
if (ORTE_SUCCESS != (ret = orte_wait_event(&orted_exit_event, &orte_exit, "orted_shutdown", shutdown_callback))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
@ -182,15 +182,21 @@ static OBJ_CLASS_INSTANCE(registered_cb_item_t, opal_list_item_t, NULL, NULL);
|
||||
static void
|
||||
trigger_event_constructor(orte_trigger_event_t *trig)
|
||||
{
|
||||
trig->name = NULL;
|
||||
trig->channel = -1;
|
||||
opal_atomic_init(&trig->lock, OPAL_ATOMIC_UNLOCKED);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
trigger_event_destructor(orte_trigger_event_t *trig)
|
||||
{
|
||||
if (NULL != trig->name) {
|
||||
free(trig->name);
|
||||
}
|
||||
}
|
||||
OBJ_CLASS_INSTANCE(orte_trigger_event_t,
|
||||
opal_object_t,
|
||||
trigger_event_constructor,
|
||||
NULL);
|
||||
trigger_event_destructor);
|
||||
|
||||
/*********************************************************************
|
||||
*
|
||||
@ -473,6 +479,7 @@ orte_wait_cb_enable()
|
||||
|
||||
|
||||
int orte_wait_event(opal_event_t **event, orte_trigger_event_t *trig,
|
||||
char *trigger_name,
|
||||
void (*cbfunc)(int, short, void*))
|
||||
{
|
||||
int p[2];
|
||||
@ -482,6 +489,9 @@ int orte_wait_event(opal_event_t **event, orte_trigger_event_t *trig,
|
||||
return ORTE_ERR_SYS_LIMITS_PIPES;
|
||||
}
|
||||
|
||||
/* save the trigger name */
|
||||
trig->name = strdup(trigger_name);
|
||||
|
||||
/* create the event */
|
||||
*event = (opal_event_t*)malloc(sizeof(opal_event_t));
|
||||
|
||||
@ -503,6 +513,19 @@ void orte_trigger_event(orte_trigger_event_t *trig)
|
||||
{
|
||||
int data=1;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
|
||||
"%s calling %s trigger",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
trig->name));
|
||||
|
||||
/* if the trigger isn't setup yet, just return */
|
||||
if (trig->channel < 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* if we already fired it, don't do it again - this automatically
|
||||
* records that we did fire it
|
||||
*/
|
||||
if (!opal_atomic_trylock(&trig->lock)) { /* returns 1 if already locked */
|
||||
return;
|
||||
}
|
||||
@ -800,16 +823,24 @@ static void opal_process_handle_destruct( opal_object_t* obj )
|
||||
static OBJ_CLASS_INSTANCE( opal_process_handle_t, opal_list_item_t,
|
||||
opal_process_handle_construct, opal_process_handle_destruct );
|
||||
|
||||
static void trigger_event_constructor(orte_trigger_event_t *trig)
|
||||
static void
|
||||
trigger_event_constructor(orte_trigger_event_t *trig)
|
||||
{
|
||||
trig->name = NULL;
|
||||
trig->channel = -1;
|
||||
opal_atomic_init(&trig->lock, OPAL_ATOMIC_UNLOCKED);
|
||||
}
|
||||
|
||||
static void
|
||||
trigger_event_destructor(orte_trigger_event_t *trig)
|
||||
{
|
||||
if (NULL != trig->name) {
|
||||
free(trig->name);
|
||||
}
|
||||
}
|
||||
OBJ_CLASS_INSTANCE(orte_trigger_event_t,
|
||||
opal_object_t,
|
||||
trigger_event_constructor,
|
||||
NULL);
|
||||
trigger_event_destructor);
|
||||
|
||||
/*********************************************************************
|
||||
*
|
||||
@ -852,6 +883,16 @@ void orte_trigger_event(orte_trigger_event_t *trig)
|
||||
{
|
||||
int data=1;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
|
||||
"%s calling %s trigger",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
trig->name));
|
||||
|
||||
/* if the trigger isn't setup yet, just return */
|
||||
if (trig->channel < 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (!opal_atomic_trylock(&trig->lock)) { /* returns 1 if already locked */
|
||||
return;
|
||||
}
|
||||
@ -1056,6 +1097,7 @@ orte_wait_cb_enable(void)
|
||||
|
||||
|
||||
int orte_wait_event(opal_event_t **event, orte_trigger_event_t *trig,
|
||||
char *trigger_name,
|
||||
void (*cbfunc)(int, short, void*))
|
||||
{
|
||||
int p[2];
|
||||
@ -1064,6 +1106,9 @@ int orte_wait_event(opal_event_t **event, orte_trigger_event_t *trig,
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* save the trigger name */
|
||||
trig->name = strdup(trigger_name);
|
||||
|
||||
/* create the event */
|
||||
*event = (opal_event_t*)malloc(sizeof(opal_event_t));
|
||||
|
||||
@ -1166,6 +1211,7 @@ void orte_trigger_event(orte_trigger_event_t *trig)
|
||||
|
||||
int
|
||||
orte_wait_event(opal_event_t **event, int *trig,
|
||||
char *trigger_name,
|
||||
void (*cbfunc)(int, short, void*))
|
||||
{
|
||||
return ORTE_ERR_NOT_SUPPORTED;
|
||||
|
@ -49,6 +49,7 @@ BEGIN_C_DECLS
|
||||
|
||||
typedef struct {
|
||||
opal_object_t super;
|
||||
char *name;
|
||||
int channel;
|
||||
opal_atomic_lock_t lock;
|
||||
} orte_trigger_event_t;
|
||||
@ -118,6 +119,7 @@ ORTE_DECLSPEC int orte_wait_cb_enable(void);
|
||||
*/
|
||||
ORTE_DECLSPEC int orte_wait_event(opal_event_t **event,
|
||||
orte_trigger_event_t *trig,
|
||||
char *trigger_name,
|
||||
void (*cbfunc)(int, short, void*));
|
||||
|
||||
/**
|
||||
@ -251,25 +253,26 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_message_event_t);
|
||||
* that the computed wait time doesn't exceed the desired max
|
||||
* wait
|
||||
*/
|
||||
#define ORTE_DETECT_TIMEOUT(event, n, deltat, maxwait, cbfunc) \
|
||||
do { \
|
||||
struct timeval now; \
|
||||
opal_event_t *tmp; \
|
||||
int timeout; \
|
||||
tmp = (opal_event_t*)malloc(sizeof(opal_event_t)); \
|
||||
opal_evtimer_set(tmp, (cbfunc), NULL); \
|
||||
timeout = (deltat) * (n); \
|
||||
if ((maxwait) > 0 && timeout > (maxwait)) { \
|
||||
timeout = (maxwait); \
|
||||
} \
|
||||
now.tv_sec = timeout/1000000; \
|
||||
now.tv_usec = timeout%1000000; \
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_debug_output, \
|
||||
"defining timeout: %ld sec %ld usec", \
|
||||
(long)now.tv_sec, (long)now.tv_usec)); \
|
||||
opal_evtimer_add(tmp, &now); \
|
||||
*(event) = tmp; \
|
||||
}while(0); \
|
||||
#define ORTE_DETECT_TIMEOUT(event, n, deltat, maxwait, cbfunc) \
|
||||
do { \
|
||||
struct timeval now; \
|
||||
opal_event_t *tmp; \
|
||||
int timeout; \
|
||||
tmp = (opal_event_t*)malloc(sizeof(opal_event_t)); \
|
||||
opal_evtimer_set(tmp, (cbfunc), NULL); \
|
||||
timeout = (deltat) * (n); \
|
||||
if ((maxwait) > 0 && timeout > (maxwait)) { \
|
||||
timeout = (maxwait); \
|
||||
} \
|
||||
now.tv_sec = timeout/1000000; \
|
||||
now.tv_usec = timeout%1000000; \
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_debug_output, \
|
||||
"defining timeout: %ld sec %ld usec at %s:%d", \
|
||||
(long)now.tv_sec, (long)now.tv_usec, \
|
||||
__FILE__, __LINE__)); \
|
||||
opal_evtimer_add(tmp, &now); \
|
||||
*(event) = tmp; \
|
||||
}while(0); \
|
||||
|
||||
|
||||
/**
|
||||
@ -277,19 +280,20 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_message_event_t);
|
||||
* wakeup to do something, and then go back to sleep again. Setting
|
||||
* a timer allows us to do this
|
||||
*/
|
||||
#define ORTE_TIMER_EVENT(time, cbfunc) \
|
||||
do { \
|
||||
struct timeval now; \
|
||||
opal_event_t *tmp; \
|
||||
tmp = (opal_event_t*)malloc(sizeof(opal_event_t)); \
|
||||
opal_evtimer_set(tmp, (cbfunc), tmp); \
|
||||
now.tv_sec = (time); \
|
||||
now.tv_usec = 0; \
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_debug_output, \
|
||||
"defining timer event: %ld sec", \
|
||||
(long)now.tv_sec)); \
|
||||
opal_evtimer_add(tmp, &now); \
|
||||
}while(0); \
|
||||
#define ORTE_TIMER_EVENT(time, cbfunc) \
|
||||
do { \
|
||||
struct timeval now; \
|
||||
opal_event_t *tmp; \
|
||||
tmp = (opal_event_t*)malloc(sizeof(opal_event_t)); \
|
||||
opal_evtimer_set(tmp, (cbfunc), tmp); \
|
||||
now.tv_sec = (time); \
|
||||
now.tv_usec = 0; \
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_debug_output, \
|
||||
"defining timer event: %ld sec at %s:%d", \
|
||||
(long)now.tv_sec, \
|
||||
__FILE__, __LINE__)); \
|
||||
opal_evtimer_add(tmp, &now); \
|
||||
}while(0); \
|
||||
|
||||
|
||||
/**
|
||||
|
@ -604,7 +604,7 @@ int orterun(int argc, char *argv[])
|
||||
* same exit fd as the daemon does so that orted_comm
|
||||
* can cause either of us to exit since we share that code
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_wait_event(&orterun_event, &orte_exit, job_completed))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_wait_event(&orterun_event, &orte_exit, "job_complete", job_completed))) {
|
||||
orte_show_help("help-orterun.txt", "orterun:event-def-failed", true,
|
||||
orterun_basename, ORTE_ERROR_NAME(rc));
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
@ -683,7 +683,7 @@ static void job_completed(int trigpipe, short event, void *arg)
|
||||
* trigger when the orteds are gone and tell the orteds that it is
|
||||
* okay to finalize and exit, we are done with them.
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_wait_event(&orteds_exit_event, &orteds_exit, terminated))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_wait_event(&orteds_exit_event, &orteds_exit, "orted_exit", terminated))) {
|
||||
orte_show_help("help-orterun.txt", "orterun:event-def-failed", true,
|
||||
orterun_basename, ORTE_ERROR_NAME(rc));
|
||||
goto DONE;
|
||||
@ -691,12 +691,10 @@ static void job_completed(int trigpipe, short event, void *arg)
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_plm.terminate_orteds())) {
|
||||
/* since we know that the sends didn't completely go out,
|
||||
* we know that the prior event will never fire. Delete it
|
||||
* for completeness, and replace it with a timeout so
|
||||
* we know that the prior event will never fire. Add a timeout so
|
||||
* that those daemons that can respond have a chance to do
|
||||
* so
|
||||
*/
|
||||
opal_event_del(orteds_exit_event);
|
||||
/* get the orted job data object */
|
||||
if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
|
||||
/* we are totally hozed */
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user