Fixes trac:1160. There is still another, unresolved problem in the OOB, but we
wanted to commit this fix now to get wider testing.

This commit was SVN r16445.

The following Trac ticket was referenced above:
Ticket 1160 --> https://svn.open-mpi.org/trac/ompi/ticket/1160
Этот коммит содержится в:
родитель
f16a42947a
Коммит
423f23eb6a
@ -9,6 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -21,6 +22,7 @@
|
||||
|
||||
#include "opal/threads/condition.h"
|
||||
#include "opal/threads/mutex.h"
|
||||
#include "opal/event/event.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/oob/oob.h"
|
||||
#include "orte/dss/dss_types.h"
|
||||
@ -34,6 +36,8 @@ struct orte_rml_oob_module_t {
|
||||
opal_mutex_t exceptions_lock;
|
||||
opal_list_t queued_routing_messages;
|
||||
opal_mutex_t queued_lock;
|
||||
opal_event_t *timer_event;
|
||||
struct timeval timeout;
|
||||
};
|
||||
typedef struct orte_rml_oob_module_t orte_rml_oob_module_t;
|
||||
|
||||
|
@ -9,6 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -40,7 +41,7 @@ static void rml_oob_recv_route_callback(int status,
|
||||
int count,
|
||||
orte_rml_tag_t tag,
|
||||
void *cbdata);
|
||||
|
||||
static void rml_oob_queued_progress(int fd, short event, void *arg);
|
||||
|
||||
/**
|
||||
* component definition
|
||||
@ -138,6 +139,15 @@ rml_oob_init(int* priority)
|
||||
OBJ_CONSTRUCT(&orte_rml_oob_module.exceptions_lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&orte_rml_oob_module.queued_routing_messages, opal_list_t);
|
||||
OBJ_CONSTRUCT(&orte_rml_oob_module.queued_lock, opal_mutex_t);
|
||||
/* Set default timeout for queued messages to be 1/2 second */
|
||||
orte_rml_oob_module.timeout.tv_sec = 0;
|
||||
orte_rml_oob_module.timeout.tv_usec = 500000;
|
||||
orte_rml_oob_module.timer_event = malloc(sizeof(opal_event_t));
|
||||
if (NULL == orte_rml_oob_module.timer_event) {
|
||||
return NULL;
|
||||
}
|
||||
opal_evtimer_set(orte_rml_oob_module.timer_event, rml_oob_queued_progress,
|
||||
NULL);
|
||||
|
||||
orte_rml_oob_module.active_oob = &mca_oob;
|
||||
orte_rml_oob_module.active_oob->oob_exception_callback =
|
||||
@ -315,22 +325,18 @@ rml_oob_recv_route_queued_send_callback(int status,
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
rml_oob_queued_progress(void)
|
||||
static void
|
||||
rml_oob_queued_progress(int fd, short event, void *arg)
|
||||
{
|
||||
orte_rml_oob_queued_msg_t *qmsg;
|
||||
orte_rml_oob_msg_header_t *hdr;
|
||||
int real_tag;
|
||||
int ret;
|
||||
orte_process_name_t next, origin;
|
||||
int count = 0;
|
||||
|
||||
while (true) {
|
||||
OPAL_THREAD_LOCK(&orte_rml_oob_module.queued_lock);
|
||||
qmsg = (orte_rml_oob_queued_msg_t*) opal_list_remove_first(&orte_rml_oob_module.queued_routing_messages);
|
||||
if (0 == opal_list_get_size(&orte_rml_oob_module.queued_routing_messages)) {
|
||||
opal_progress_unregister(rml_oob_queued_progress);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&orte_rml_oob_module.queued_lock);
|
||||
if (NULL == qmsg) break;
|
||||
|
||||
@ -385,7 +391,8 @@ rml_oob_queued_progress(void)
|
||||
opal_list_append(&orte_rml_oob_module.queued_routing_messages,
|
||||
&qmsg->super);
|
||||
if (1 == opal_list_get_size(&orte_rml_oob_module.queued_routing_messages)) {
|
||||
opal_progress_register(rml_oob_queued_progress);
|
||||
opal_evtimer_add(orte_rml_oob_module.timer_event,
|
||||
&orte_rml_oob_module.timeout);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&orte_rml_oob_module.queued_lock);
|
||||
} else {
|
||||
@ -398,11 +405,7 @@ rml_oob_queued_progress(void)
|
||||
abort();
|
||||
}
|
||||
}
|
||||
|
||||
count++;
|
||||
}
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
static void
|
||||
@ -495,7 +498,8 @@ rml_oob_recv_route_callback(int status,
|
||||
opal_list_append(&orte_rml_oob_module.queued_routing_messages,
|
||||
&qmsg->super);
|
||||
if (1 == opal_list_get_size(&orte_rml_oob_module.queued_routing_messages)) {
|
||||
opal_progress_register(rml_oob_queued_progress);
|
||||
opal_evtimer_add(orte_rml_oob_module.timer_event,
|
||||
&orte_rml_oob_module.timeout);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&orte_rml_oob_module.queued_lock);
|
||||
} else {
|
||||
|
@ -502,6 +502,16 @@ int orte_daemon(int argc, char *argv[])
|
||||
require OOB messages for wireup, etc.). */
|
||||
opal_progress_set_yield_when_idle(false);
|
||||
|
||||
/* Change the default behavior of libevent such that we want to
|
||||
continually block rather than blocking for the default timeout
|
||||
and then looping around the progress engine again. There
|
||||
should be nothing in the orted that cannot block in libevent
|
||||
until "something" happens (i.e., there's no need to keep
|
||||
cycling through progress because the only things that should
|
||||
happen will happen in libevent). This is a minor optimization,
|
||||
but what the heck... :-) */
|
||||
opal_progress_set_event_flag(OPAL_EVLOOP_ONCE);
|
||||
|
||||
/* if requested, obtain and report a new process name and my uri to the indicated pipe */
|
||||
if (orted_globals.uri_pipe > 0) {
|
||||
orte_process_name_t name;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user