Some cleanup to remove calls to opal_progress when running with orte progress threads, and to ensure that all orte-related events are in the orte event base.
This commit was SVN r26591.
Этот коммит содержится в:
родитель
75e66ad51e
Коммит
269cb2b8d9
@ -58,7 +58,7 @@
|
|||||||
/* Local static variables */
|
/* Local static variables */
|
||||||
static opal_mutex_t ompi_dpm_port_mutex;
|
static opal_mutex_t ompi_dpm_port_mutex;
|
||||||
static orte_rml_tag_t next_tag;
|
static orte_rml_tag_t next_tag;
|
||||||
static bool recv_completed;
|
static bool waiting_for_recv = false;
|
||||||
static opal_buffer_t *cabuf=NULL;
|
static opal_buffer_t *cabuf=NULL;
|
||||||
static orte_process_name_t carport;
|
static orte_process_name_t carport;
|
||||||
|
|
||||||
@ -205,14 +205,12 @@ static int connect_accept ( ompi_communicator_t *comm, int root,
|
|||||||
/* send the request - doesn't have to include any data */
|
/* send the request - doesn't have to include any data */
|
||||||
rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, nbuf, ORTE_RML_TAG_COLL_ID_REQ, 0, rml_cbfunc, NULL);
|
rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, nbuf, ORTE_RML_TAG_COLL_ID_REQ, 0, rml_cbfunc, NULL);
|
||||||
/* wait for the id */
|
/* wait for the id */
|
||||||
recv_completed = false;
|
waiting_for_recv = true;
|
||||||
cabuf = OBJ_NEW(opal_buffer_t);
|
cabuf = OBJ_NEW(opal_buffer_t);
|
||||||
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_COLL_ID,
|
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_COLL_ID,
|
||||||
ORTE_RML_NON_PERSISTENT, recv_cb, NULL);
|
ORTE_RML_NON_PERSISTENT, recv_cb, NULL);
|
||||||
/* wait for response */
|
/* wait for response */
|
||||||
while (!recv_completed) {
|
ORTE_WAIT_FOR_COMPLETION(waiting_for_recv);
|
||||||
opal_progress();
|
|
||||||
}
|
|
||||||
i=1;
|
i=1;
|
||||||
if (OPAL_SUCCESS != (rc = opal_dss.unpack(cabuf, &id, &i, ORTE_GRPCOMM_COLL_ID_T))) {
|
if (OPAL_SUCCESS != (rc = opal_dss.unpack(cabuf, &id, &i, ORTE_GRPCOMM_COLL_ID_T))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
@ -232,14 +230,12 @@ static int connect_accept ( ompi_communicator_t *comm, int root,
|
|||||||
rc = orte_rml.send_buffer_nb(&port, nbuf, tag, 0, rml_cbfunc, NULL);
|
rc = orte_rml.send_buffer_nb(&port, nbuf, tag, 0, rml_cbfunc, NULL);
|
||||||
} else {
|
} else {
|
||||||
/* wait to recv the collective id */
|
/* wait to recv the collective id */
|
||||||
recv_completed = false;
|
waiting_for_recv = true;
|
||||||
cabuf = OBJ_NEW(opal_buffer_t);
|
cabuf = OBJ_NEW(opal_buffer_t);
|
||||||
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, tag,
|
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, tag,
|
||||||
ORTE_RML_NON_PERSISTENT, recv_cb, NULL);
|
ORTE_RML_NON_PERSISTENT, recv_cb, NULL);
|
||||||
/* wait for response */
|
/* wait for response */
|
||||||
while (!recv_completed) {
|
ORTE_WAIT_FOR_COMPLETION(waiting_for_recv);
|
||||||
opal_progress();
|
|
||||||
}
|
|
||||||
i=1;
|
i=1;
|
||||||
if (OPAL_SUCCESS != (rc = opal_dss.unpack(cabuf, &id, &i, ORTE_GRPCOMM_COLL_ID_T))) {
|
if (OPAL_SUCCESS != (rc = opal_dss.unpack(cabuf, &id, &i, ORTE_GRPCOMM_COLL_ID_T))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
@ -316,13 +312,11 @@ static int connect_accept ( ompi_communicator_t *comm, int root,
|
|||||||
OPAL_OUTPUT_VERBOSE((3, ompi_dpm_base_output,
|
OPAL_OUTPUT_VERBOSE((3, ompi_dpm_base_output,
|
||||||
"%s dpm:orte:connect_accept waiting for response",
|
"%s dpm:orte:connect_accept waiting for response",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
recv_completed = false;
|
waiting_for_recv = true;
|
||||||
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, tag,
|
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, tag,
|
||||||
ORTE_RML_NON_PERSISTENT, recv_cb, NULL);
|
ORTE_RML_NON_PERSISTENT, recv_cb, NULL);
|
||||||
/* wait for response */
|
/* wait for response */
|
||||||
while (!recv_completed) {
|
ORTE_WAIT_FOR_COMPLETION(waiting_for_recv);
|
||||||
opal_progress();
|
|
||||||
}
|
|
||||||
OPAL_OUTPUT_VERBOSE((3, ompi_dpm_base_output,
|
OPAL_OUTPUT_VERBOSE((3, ompi_dpm_base_output,
|
||||||
"%s dpm:orte:connect_accept got data from %s",
|
"%s dpm:orte:connect_accept got data from %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -333,13 +327,11 @@ static int connect_accept ( ompi_communicator_t *comm, int root,
|
|||||||
"%s dpm:orte:connect_accept recving first",
|
"%s dpm:orte:connect_accept recving first",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
/* setup to recv */
|
/* setup to recv */
|
||||||
recv_completed = false;
|
waiting_for_recv = true;
|
||||||
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, tag,
|
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, tag,
|
||||||
ORTE_RML_NON_PERSISTENT, recv_cb, NULL);
|
ORTE_RML_NON_PERSISTENT, recv_cb, NULL);
|
||||||
/* wait for response */
|
/* wait for response */
|
||||||
while (!recv_completed) {
|
ORTE_WAIT_FOR_COMPLETION(waiting_for_recv);
|
||||||
opal_progress();
|
|
||||||
}
|
|
||||||
/* now send our info */
|
/* now send our info */
|
||||||
OPAL_OUTPUT_VERBOSE((3, ompi_dpm_base_output,
|
OPAL_OUTPUT_VERBOSE((3, ompi_dpm_base_output,
|
||||||
"%s dpm:orte:connect_accept sending info to %s",
|
"%s dpm:orte:connect_accept sending info to %s",
|
||||||
@ -1634,6 +1626,6 @@ static void recv_cb(int status, orte_process_name_t* sender,
|
|||||||
carport.vpid = sender->vpid;
|
carport.vpid = sender->vpid;
|
||||||
|
|
||||||
/* flag complete */
|
/* flag complete */
|
||||||
recv_completed = true;
|
waiting_for_recv = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -37,7 +37,6 @@
|
|||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
#include "opal/runtime/opal.h"
|
#include "opal/runtime/opal.h"
|
||||||
#include "opal/runtime/opal_cr.h"
|
#include "opal/runtime/opal_cr.h"
|
||||||
#include "opal/runtime/opal_progress.h"
|
|
||||||
|
|
||||||
#include "orte/mca/rml/base/base.h"
|
#include "orte/mca/rml/base/base.h"
|
||||||
#include "orte/mca/routed/base/base.h"
|
#include "orte/mca/routed/base/base.h"
|
||||||
@ -272,9 +271,7 @@ int orte_ess_base_app_setup(void)
|
|||||||
error = "orte barrier";
|
error = "orte barrier";
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
while (coll.active) {
|
ORTE_WAIT_FOR_COMPLETION(coll.active);
|
||||||
opal_progress(); /* block in progress pending events */
|
|
||||||
}
|
|
||||||
OBJ_DESTRUCT(&coll);
|
OBJ_DESTRUCT(&coll);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -339,14 +336,14 @@ int orte_ess_base_app_finalize(void)
|
|||||||
* to prevent the abort file from being created. This allows the
|
* to prevent the abort file from being created. This allows the
|
||||||
* session directory tree to cleanly be eliminated.
|
* session directory tree to cleanly be eliminated.
|
||||||
*/
|
*/
|
||||||
static bool sync_recvd;
|
static bool sync_waiting = false;
|
||||||
|
|
||||||
static void report_sync(int status, orte_process_name_t* sender,
|
static void report_sync(int status, orte_process_name_t* sender,
|
||||||
opal_buffer_t *buffer,
|
opal_buffer_t *buffer,
|
||||||
orte_rml_tag_t tag, void *cbdata)
|
orte_rml_tag_t tag, void *cbdata)
|
||||||
{
|
{
|
||||||
/* flag as complete */
|
/* flag as complete */
|
||||||
sync_recvd = true;
|
sync_waiting = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
void orte_ess_base_app_abort(int status, bool report)
|
void orte_ess_base_app_abort(int status, bool report)
|
||||||
@ -379,14 +376,12 @@ void orte_ess_base_app_abort(int status, bool report)
|
|||||||
* gets serviced by the event library on the orted prior to the
|
* gets serviced by the event library on the orted prior to the
|
||||||
* process exiting
|
* process exiting
|
||||||
*/
|
*/
|
||||||
sync_recvd = false;
|
sync_waiting = true;
|
||||||
if (ORTE_SUCCESS != orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ABORT,
|
if (ORTE_SUCCESS != orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ABORT,
|
||||||
ORTE_RML_NON_PERSISTENT, report_sync, NULL)) {
|
ORTE_RML_NON_PERSISTENT, report_sync, NULL)) {
|
||||||
exit(status);
|
exit(status);
|
||||||
}
|
}
|
||||||
while (!sync_recvd) {
|
ORTE_WAIT_FOR_COMPLETION(sync_waiting);
|
||||||
opal_progress();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* - Clean out the global structures
|
/* - Clean out the global structures
|
||||||
|
@ -636,6 +636,7 @@ static int rte_init(void)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if !ORTE_ENABLE_PROGRESS_THREADS
|
||||||
/* We actually do *not* want an HNP to voluntarily yield() the
|
/* We actually do *not* want an HNP to voluntarily yield() the
|
||||||
processor more than necessary. Orterun already blocks when
|
processor more than necessary. Orterun already blocks when
|
||||||
it is doing nothing, so it doesn't use any more CPU cycles than
|
it is doing nothing, so it doesn't use any more CPU cycles than
|
||||||
@ -655,6 +656,7 @@ static int rte_init(void)
|
|||||||
problematic in some scenarios (e.g., COMM_SPAWN, BTL's that
|
problematic in some scenarios (e.g., COMM_SPAWN, BTL's that
|
||||||
require OOB messages for wireup, etc.). */
|
require OOB messages for wireup, etc.). */
|
||||||
opal_progress_set_yield_when_idle(false);
|
opal_progress_set_yield_when_idle(false);
|
||||||
|
#endif
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
|
|
||||||
|
@ -30,6 +30,12 @@
|
|||||||
#ifdef HAVE_UNISTD_H
|
#ifdef HAVE_UNISTD_H
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#endif /* HAVE_UNISTD_H */
|
#endif /* HAVE_UNISTD_H */
|
||||||
|
#if HAVE_TIME_H
|
||||||
|
#include <time.h>
|
||||||
|
#endif
|
||||||
|
#if HAVE_SYS_TIME_H
|
||||||
|
#include <sys/time.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#include "opal/mca/mca.h"
|
#include "opal/mca/mca.h"
|
||||||
#include "opal/mca/base/base.h"
|
#include "opal/mca/base/base.h"
|
||||||
@ -279,7 +285,17 @@ int orte_filem_rsh_module_finalize(void)
|
|||||||
*/
|
*/
|
||||||
if( orte_filem_base_is_active ) {
|
if( orte_filem_base_is_active ) {
|
||||||
while(0 < opal_list_get_size(&work_pool_active) ) {
|
while(0 < opal_list_get_size(&work_pool_active) ) {
|
||||||
|
#if ORTE_ENABLE_PROGRESS_THREADS
|
||||||
|
{
|
||||||
|
/* provide a very short quiet period so we
|
||||||
|
* don't hammer the cpu while we wait
|
||||||
|
*/
|
||||||
|
struct timespec tp = {0, 100};
|
||||||
|
nanosleep(&tp, NULL);
|
||||||
|
}
|
||||||
|
#else
|
||||||
opal_progress();
|
opal_progress();
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -50,8 +50,10 @@ static bool event_started = false;
|
|||||||
void mca_oob_ud_event_start_monitor (mca_oob_ud_device_t *device)
|
void mca_oob_ud_event_start_monitor (mca_oob_ud_device_t *device)
|
||||||
{
|
{
|
||||||
if (!event_started) {
|
if (!event_started) {
|
||||||
|
#if !ORTE_ENABLE_PROGRESS_THREADS
|
||||||
opal_progress_event_users_increment ();
|
opal_progress_event_users_increment ();
|
||||||
opal_event_set (opal_event_base, &device->event, device->ib_channel->fd,
|
#endif
|
||||||
|
opal_event_set (orte_event_base, &device->event, device->ib_channel->fd,
|
||||||
OPAL_EV_READ, mca_oob_ud_event_dispatch, (void *) device);
|
OPAL_EV_READ, mca_oob_ud_event_dispatch, (void *) device);
|
||||||
opal_event_add (&device->event, NULL);
|
opal_event_add (&device->event, NULL);
|
||||||
event_started = true;
|
event_started = true;
|
||||||
@ -61,7 +63,9 @@ void mca_oob_ud_event_start_monitor (mca_oob_ud_device_t *device)
|
|||||||
void mca_oob_ud_event_stop_monitor (mca_oob_ud_device_t *device)
|
void mca_oob_ud_event_stop_monitor (mca_oob_ud_device_t *device)
|
||||||
{
|
{
|
||||||
if (event_started) {
|
if (event_started) {
|
||||||
|
#if !ORTE_ENABLE_PROGRESS_THREADS
|
||||||
opal_progress_event_users_decrement ();
|
opal_progress_event_users_decrement ();
|
||||||
|
#endif
|
||||||
opal_event_del (&device->event);
|
opal_event_del (&device->event);
|
||||||
mca_oob_ud_stop_events (device);
|
mca_oob_ud_stop_events (device);
|
||||||
event_started = false;
|
event_started = false;
|
||||||
@ -385,7 +389,7 @@ void mca_oob_ud_event_queue_completed (mca_oob_ud_req_t *req)
|
|||||||
mca_oob_ud_req_append_to_list (req, &mca_oob_ud_component.ud_event_queued_reqs);
|
mca_oob_ud_req_append_to_list (req, &mca_oob_ud_component.ud_event_queued_reqs);
|
||||||
|
|
||||||
if (!opal_event_evtimer_pending (&mca_oob_ud_component.ud_complete_event, &now)) {
|
if (!opal_event_evtimer_pending (&mca_oob_ud_component.ud_complete_event, &now)) {
|
||||||
opal_event_evtimer_set (opal_event_base, &mca_oob_ud_component.ud_complete_event,
|
opal_event_evtimer_set (orte_event_base, &mca_oob_ud_component.ud_complete_event,
|
||||||
mca_oob_ud_complete_dispatch, NULL);
|
mca_oob_ud_complete_dispatch, NULL);
|
||||||
opal_event_add (&mca_oob_ud_component.ud_complete_event, &now);
|
opal_event_add (&mca_oob_ud_component.ud_complete_event, &now);
|
||||||
}
|
}
|
||||||
|
@ -363,7 +363,7 @@ void mca_oob_ud_peer_start_timer (mca_oob_ud_peer_t *peer)
|
|||||||
if (!peer->peer_timer.active && opal_list_get_size (&peer->peer_flying_messages)) {
|
if (!peer->peer_timer.active && opal_list_get_size (&peer->peer_flying_messages)) {
|
||||||
peer->peer_timer.active = true;
|
peer->peer_timer.active = true;
|
||||||
|
|
||||||
opal_event_evtimer_set (opal_event_base, &peer->peer_timer.event,
|
opal_event_evtimer_set (orte_event_base, &peer->peer_timer.event,
|
||||||
mca_oob_ud_peer_msg_timeout, (void *) peer);
|
mca_oob_ud_peer_msg_timeout, (void *) peer);
|
||||||
opal_event_evtimer_add (&peer->peer_timer.event, &peer->peer_timer.value);
|
opal_event_evtimer_add (&peer->peer_timer.event, &peer->peer_timer.value);
|
||||||
}
|
}
|
||||||
|
@ -71,7 +71,7 @@ static void mca_oob_ud_req_destruct (mca_oob_ud_req_t *req)
|
|||||||
void mca_oob_ud_req_timer_set (mca_oob_ud_req_t *req, const struct timeval *timeout,
|
void mca_oob_ud_req_timer_set (mca_oob_ud_req_t *req, const struct timeval *timeout,
|
||||||
int max_tries, void (*cb)(evutil_socket_t, short, void *))
|
int max_tries, void (*cb)(evutil_socket_t, short, void *))
|
||||||
{
|
{
|
||||||
opal_event_evtimer_set (opal_event_base, &req->timer.event, cb, (void *) req);
|
opal_event_evtimer_set (orte_event_base, &req->timer.event, cb, (void *) req);
|
||||||
req->timer.value.tv_sec = timeout->tv_sec;
|
req->timer.value.tv_sec = timeout->tv_sec;
|
||||||
req->timer.value.tv_usec = timeout->tv_usec;
|
req->timer.value.tv_usec = timeout->tv_usec;
|
||||||
opal_event_evtimer_add (&req->timer.event, &req->timer.value);
|
opal_event_evtimer_add (&req->timer.event, &req->timer.value);
|
||||||
|
@ -33,7 +33,6 @@
|
|||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
|
|
||||||
#include "opal/util/argv.h"
|
#include "opal/util/argv.h"
|
||||||
#include "opal/runtime/opal_progress.h"
|
|
||||||
#include "opal/class/opal_pointer_array.h"
|
#include "opal/class/opal_pointer_array.h"
|
||||||
#include "opal/dss/dss.h"
|
#include "opal/dss/dss.h"
|
||||||
#include "opal/mca/base/mca_base_param.h"
|
#include "opal/mca/base/mca_base_param.h"
|
||||||
|
@ -63,7 +63,6 @@
|
|||||||
#include "opal/util/opal_environ.h"
|
#include "opal/util/opal_environ.h"
|
||||||
#include "opal/util/basename.h"
|
#include "opal/util/basename.h"
|
||||||
#include "opal/mca/base/mca_base_param.h"
|
#include "opal/mca/base/mca_base_param.h"
|
||||||
#include "opal/runtime/opal_progress.h"
|
|
||||||
|
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
@ -544,7 +543,7 @@ static int plm_tm_connect(void)
|
|||||||
{
|
{
|
||||||
int ret;
|
int ret;
|
||||||
struct tm_roots tm_root;
|
struct tm_roots tm_root;
|
||||||
int count, progress;
|
int count;
|
||||||
|
|
||||||
/* try a couple times to connect - might get busy signals every
|
/* try a couple times to connect - might get busy signals every
|
||||||
now and then */
|
now and then */
|
||||||
@ -554,12 +553,27 @@ static int plm_tm_connect(void)
|
|||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (progress = 0 ; progress < 10 ; ++progress) {
|
#if ORTE_ENABLE_PROGRESS_THREADS
|
||||||
opal_progress();
|
{
|
||||||
|
/* provide a very short quiet period so we
|
||||||
|
* don't hammer the cpu while we wait
|
||||||
|
*/
|
||||||
|
struct timespec tp = {0, 100};
|
||||||
|
nanosleep(&tp, NULL);
|
||||||
#if HAVE_SCHED_YIELD
|
#if HAVE_SCHED_YIELD
|
||||||
sched_yield();
|
sched_yield();
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
{
|
||||||
|
int progress;
|
||||||
|
for (progress = 0 ; progress < 10 ; ++progress) {
|
||||||
|
opal_progress();
|
||||||
|
#if HAVE_SCHED_YIELD
|
||||||
|
sched_yield();
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return ORTE_ERR_RESOURCE_BUSY;
|
return ORTE_ERR_RESOURCE_BUSY;
|
||||||
|
@ -23,21 +23,14 @@
|
|||||||
#include "orte/constants.h"
|
#include "orte/constants.h"
|
||||||
#include "orte/types.h"
|
#include "orte/types.h"
|
||||||
|
|
||||||
#if HAVE_TIME_H
|
|
||||||
#include <time.h>
|
|
||||||
#endif
|
|
||||||
#if HAVE_SYS_TIME_H
|
|
||||||
#include <sys/time.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include "opal/dss/dss.h"
|
#include "opal/dss/dss.h"
|
||||||
#include "opal/runtime/opal_progress.h"
|
|
||||||
|
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/mca/ess/ess.h"
|
#include "orte/mca/ess/ess.h"
|
||||||
#include "orte/mca/odls/odls_types.h"
|
#include "orte/mca/odls/odls_types.h"
|
||||||
#include "orte/mca/rml/rml.h"
|
#include "orte/mca/rml/rml.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
|
#include "orte/runtime/orte_wait.h"
|
||||||
|
|
||||||
#include "orte/mca/routed/base/base.h"
|
#include "orte/mca/routed/base/base.h"
|
||||||
|
|
||||||
@ -233,7 +226,7 @@ void orte_routed_base_coll_peers(orte_grpcomm_collective_t *coll,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static bool sync_recvd;
|
static bool sync_waiting = false;
|
||||||
|
|
||||||
static void report_sync(int status, orte_process_name_t* sender,
|
static void report_sync(int status, orte_process_name_t* sender,
|
||||||
opal_buffer_t *buffer,
|
opal_buffer_t *buffer,
|
||||||
@ -242,7 +235,7 @@ static void report_sync(int status, orte_process_name_t* sender,
|
|||||||
/* just copy the payload to the sync_buf */
|
/* just copy the payload to the sync_buf */
|
||||||
opal_dss.copy_payload(orte_process_info.sync_buf, buffer);
|
opal_dss.copy_payload(orte_process_info.sync_buf, buffer);
|
||||||
/* flag as complete */
|
/* flag as complete */
|
||||||
sync_recvd = true;
|
sync_waiting = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
int orte_routed_base_register_sync(bool setup)
|
int orte_routed_base_register_sync(bool setup)
|
||||||
@ -304,7 +297,7 @@ int orte_routed_base_register_sync(bool setup)
|
|||||||
OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output,
|
||||||
"%s registering sync waiting for ack",
|
"%s registering sync waiting for ack",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
sync_recvd = false;
|
sync_waiting = true;
|
||||||
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_SYNC,
|
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_SYNC,
|
||||||
ORTE_RML_NON_PERSISTENT, report_sync, NULL);
|
ORTE_RML_NON_PERSISTENT, report_sync, NULL);
|
||||||
if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) {
|
if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) {
|
||||||
@ -313,19 +306,7 @@ int orte_routed_base_register_sync(bool setup)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* it is okay to block here as we are -not- in an event */
|
/* it is okay to block here as we are -not- in an event */
|
||||||
while (!sync_recvd) {
|
ORTE_WAIT_FOR_COMPLETION(sync_waiting);
|
||||||
#if !ORTE_ENABLE_PROGRESS_THREADS
|
|
||||||
opal_progress();
|
|
||||||
#else
|
|
||||||
{
|
|
||||||
/* provide a very short quiet period so we
|
|
||||||
* don't hammer the cpu while we wait
|
|
||||||
*/
|
|
||||||
struct timespec tp = {0, 100};
|
|
||||||
nanosleep(&tp, NULL);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output,
|
||||||
"%s registering sync ack recvd",
|
"%s registering sync ack recvd",
|
||||||
|
@ -1,10 +1,8 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2007-2011 Los Alamos National Security, LLC.
|
|
||||||
* All rights reserved.
|
|
||||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||||
* of Tennessee Research Foundation. All rights
|
* of Tennessee Research Foundation. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2011 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
@ -21,7 +19,6 @@
|
|||||||
#include "opal/dss/dss.h"
|
#include "opal/dss/dss.h"
|
||||||
#include "opal/class/opal_pointer_array.h"
|
#include "opal/class/opal_pointer_array.h"
|
||||||
#include "opal/class/opal_bitmap.h"
|
#include "opal/class/opal_bitmap.h"
|
||||||
#include "opal/runtime/opal_progress.h"
|
|
||||||
#include "opal/util/bit_ops.h"
|
#include "opal/util/bit_ops.h"
|
||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
|
|
||||||
@ -87,7 +84,7 @@ static orte_process_name_t *lifeline=NULL;
|
|||||||
static orte_process_name_t local_lifeline;
|
static orte_process_name_t local_lifeline;
|
||||||
static int num_children;
|
static int num_children;
|
||||||
static opal_list_t my_children;
|
static opal_list_t my_children;
|
||||||
static bool ack_recvd;
|
static bool ack_waiting = false;
|
||||||
static bool hnp_direct=true;
|
static bool hnp_direct=true;
|
||||||
|
|
||||||
static int init(void)
|
static int init(void)
|
||||||
@ -450,7 +447,7 @@ static void recv_ack(int status, orte_process_name_t* sender,
|
|||||||
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
||||||
void* cbdata)
|
void* cbdata)
|
||||||
{
|
{
|
||||||
ack_recvd = true;
|
ack_waiting = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -617,13 +614,11 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
|
|||||||
/* wait right here until the HNP acks the update to ensure that
|
/* wait right here until the HNP acks the update to ensure that
|
||||||
* any subsequent messaging can succeed
|
* any subsequent messaging can succeed
|
||||||
*/
|
*/
|
||||||
ack_recvd = false;
|
ack_waiting = true;
|
||||||
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_UPDATE_ROUTE_ACK,
|
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_UPDATE_ROUTE_ACK,
|
||||||
ORTE_RML_NON_PERSISTENT, recv_ack, NULL);
|
ORTE_RML_NON_PERSISTENT, recv_ack, NULL);
|
||||||
|
|
||||||
while (!ack_recvd) {
|
ORTE_WAIT_FOR_COMPLETION(ack_waiting);
|
||||||
opal_progress();
|
|
||||||
}
|
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
|
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
|
||||||
"%s routed_binomial_init_routes: ack recvd",
|
"%s routed_binomial_init_routes: ack recvd",
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2007-2011 Los Alamos National Security, LLC.
|
* Copyright (c) 2007-2012 Los Alamos National Security, LLC.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
|
* Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
|
||||||
* University Research and Technology
|
* University Research and Technology
|
||||||
@ -22,7 +22,6 @@
|
|||||||
#include "opal/dss/dss.h"
|
#include "opal/dss/dss.h"
|
||||||
#include "opal/class/opal_hash_table.h"
|
#include "opal/class/opal_hash_table.h"
|
||||||
#include "opal/class/opal_bitmap.h"
|
#include "opal/class/opal_bitmap.h"
|
||||||
#include "opal/runtime/opal_progress.h"
|
|
||||||
#include "opal/util/bit_ops.h"
|
#include "opal/util/bit_ops.h"
|
||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
|
|
||||||
@ -85,7 +84,7 @@ orte_routed_module_t orte_routed_cm_module = {
|
|||||||
/* local globals */
|
/* local globals */
|
||||||
static orte_process_name_t *lifeline=NULL;
|
static orte_process_name_t *lifeline=NULL;
|
||||||
static orte_process_name_t local_lifeline;
|
static orte_process_name_t local_lifeline;
|
||||||
static bool ack_recvd;
|
static bool ack_waiting = false;
|
||||||
|
|
||||||
|
|
||||||
static int init(void)
|
static int init(void)
|
||||||
@ -411,7 +410,7 @@ static void recv_ack(int status, orte_process_name_t* sender,
|
|||||||
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
||||||
void* cbdata)
|
void* cbdata)
|
||||||
{
|
{
|
||||||
ack_recvd = true;
|
ack_waiting = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -627,13 +626,11 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
|
|||||||
/* wait right here until the HNP acks the update to ensure that
|
/* wait right here until the HNP acks the update to ensure that
|
||||||
* any subsequent messaging can succeed
|
* any subsequent messaging can succeed
|
||||||
*/
|
*/
|
||||||
ack_recvd = false;
|
ack_waiting = true;
|
||||||
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_UPDATE_ROUTE_ACK,
|
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_UPDATE_ROUTE_ACK,
|
||||||
ORTE_RML_NON_PERSISTENT, recv_ack, NULL);
|
ORTE_RML_NON_PERSISTENT, recv_ack, NULL);
|
||||||
|
|
||||||
while (!ack_recvd) {
|
ORTE_WAIT_FOR_COMPLETION(ack_waiting);
|
||||||
opal_progress();
|
|
||||||
}
|
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
|
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
|
||||||
"%s routed_cm_init_routes: ack recvd",
|
"%s routed_cm_init_routes: ack recvd",
|
||||||
|
@ -83,7 +83,7 @@ orte_routed_module_t orte_routed_debruijn_module = {
|
|||||||
static orte_process_name_t *lifeline=NULL;
|
static orte_process_name_t *lifeline=NULL;
|
||||||
static orte_process_name_t local_lifeline;
|
static orte_process_name_t local_lifeline;
|
||||||
static opal_list_t my_children;
|
static opal_list_t my_children;
|
||||||
static bool ack_recvd;
|
static bool ack_waiting = false;
|
||||||
static bool hnp_direct=true;
|
static bool hnp_direct=true;
|
||||||
static int log_nranks;
|
static int log_nranks;
|
||||||
static int log_npeers;
|
static int log_npeers;
|
||||||
@ -433,7 +433,7 @@ static void recv_ack(int status, orte_process_name_t* sender,
|
|||||||
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
||||||
void* cbdata)
|
void* cbdata)
|
||||||
{
|
{
|
||||||
ack_recvd = true;
|
ack_waiting = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -599,7 +599,7 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
|
|||||||
/* wait right here until the HNP acks the update to ensure that
|
/* wait right here until the HNP acks the update to ensure that
|
||||||
* any subsequent messaging can succeed
|
* any subsequent messaging can succeed
|
||||||
*/
|
*/
|
||||||
ack_recvd = false;
|
ack_waiting = true;
|
||||||
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_UPDATE_ROUTE_ACK,
|
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_UPDATE_ROUTE_ACK,
|
||||||
ORTE_RML_NON_PERSISTENT, recv_ack, NULL);
|
ORTE_RML_NON_PERSISTENT, recv_ack, NULL);
|
||||||
|
|
||||||
|
@ -81,7 +81,14 @@ const char orte_version_string[] = ORTE_IDENT_STRING;
|
|||||||
#if !ORTE_DISABLE_FULL_SUPPORT && ORTE_ENABLE_PROGRESS_THREADS
|
#if !ORTE_DISABLE_FULL_SUPPORT && ORTE_ENABLE_PROGRESS_THREADS
|
||||||
static void ignore_callback(int fd, short args, void *cbdata)
|
static void ignore_callback(int fd, short args, void *cbdata)
|
||||||
{
|
{
|
||||||
/* nothing to do here */
|
if (NULL == cbdata) {
|
||||||
|
/* nothing to do here */
|
||||||
|
} else {
|
||||||
|
opal_event_t *ev = (opal_event_t*)cbdata;
|
||||||
|
struct timeval tv = {1, 0};
|
||||||
|
opal_output(0, "TIMER FIRED");
|
||||||
|
opal_event_evtimer_add(ev, &tv);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -151,6 +158,16 @@ int orte_init(int* pargc, char*** pargv, orte_proc_type_t flags)
|
|||||||
*/
|
*/
|
||||||
opal_event_set(orte_event_base, &orte_finalize_event, -1, OPAL_EV_WRITE, ignore_callback, NULL);
|
opal_event_set(orte_event_base, &orte_finalize_event, -1, OPAL_EV_WRITE, ignore_callback, NULL);
|
||||||
opal_event_set_priority(&orte_finalize_event, ORTE_ERROR_PRI);
|
opal_event_set_priority(&orte_finalize_event, ORTE_ERROR_PRI);
|
||||||
|
{
|
||||||
|
/* seems strange, but wake us up once a second just so we can check for new events */
|
||||||
|
opal_event_t *ev;
|
||||||
|
struct timeval tv = {1,0};
|
||||||
|
ev = opal_event_alloc();
|
||||||
|
opal_event_evtimer_set(orte_event_base,
|
||||||
|
ev, ignore_callback, ev);
|
||||||
|
opal_event_set_priority(ev, ORTE_INFO_PRI);
|
||||||
|
opal_event_evtimer_add(ev, &tv);
|
||||||
|
}
|
||||||
/* construct the thread object */
|
/* construct the thread object */
|
||||||
OBJ_CONSTRUCT(&orte_progress_thread, opal_thread_t);
|
OBJ_CONSTRUCT(&orte_progress_thread, opal_thread_t);
|
||||||
/* fork off a thread to progress it */
|
/* fork off a thread to progress it */
|
||||||
|
@ -34,7 +34,10 @@
|
|||||||
#ifdef HAVE_SYS_TYPES_H
|
#ifdef HAVE_SYS_TYPES_H
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAVE_SYS_TIME_H
|
#if HAVE_TIME_H
|
||||||
|
#include <time.h>
|
||||||
|
#endif
|
||||||
|
#if HAVE_SYS_TIME_H
|
||||||
#include <sys/time.h>
|
#include <sys/time.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -42,6 +45,7 @@
|
|||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
#include "opal/sys/atomic.h"
|
#include "opal/sys/atomic.h"
|
||||||
#include "opal/mca/event/event.h"
|
#include "opal/mca/event/event.h"
|
||||||
|
#include "opal/runtime/opal_progress.h"
|
||||||
|
|
||||||
#include "orte/types.h"
|
#include "orte/types.h"
|
||||||
#include "orte/mca/rml/rml_types.h"
|
#include "orte/mca/rml/rml_types.h"
|
||||||
@ -107,6 +111,33 @@ typedef struct {
|
|||||||
} orte_timer_t;
|
} orte_timer_t;
|
||||||
OBJ_CLASS_DECLARATION(orte_timer_t);
|
OBJ_CLASS_DECLARATION(orte_timer_t);
|
||||||
|
|
||||||
|
/* In a few places, we need to barrier until something happens
|
||||||
|
* that changes a flag to indicate we can release - e.g., waiting
|
||||||
|
* for a specific message to arrive. If no progress thread is running,
|
||||||
|
* we cycle across opal_progress - however, if a progress thread
|
||||||
|
* is active, then we need to just nanosleep to avoid cross-thread
|
||||||
|
* confusion
|
||||||
|
*/
|
||||||
|
#if ORTE_ENABLE_PROGRESS_THREADS
|
||||||
|
#define ORTE_WAIT_FOR_COMPLETION(flg) \
|
||||||
|
do { \
|
||||||
|
while ((flg)) { \
|
||||||
|
/* provide a very short quiet period so we \
|
||||||
|
* don't hammer the cpu while we wait \
|
||||||
|
*/ \
|
||||||
|
struct timespec tp = {0, 100}; \
|
||||||
|
nanosleep(&tp, NULL); \
|
||||||
|
} \
|
||||||
|
}while(0);
|
||||||
|
#else
|
||||||
|
#define ORTE_WAIT_FOR_COMPLETION(flg) \
|
||||||
|
do { \
|
||||||
|
while ((flg)) { \
|
||||||
|
opal_progress(); \
|
||||||
|
} \
|
||||||
|
}while(0);
|
||||||
|
#endif
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* In a number of places within the code, we want to setup a timer
|
* In a number of places within the code, we want to setup a timer
|
||||||
* to detect when some procedure failed to complete. For example,
|
* to detect when some procedure failed to complete. For example,
|
||||||
|
@ -784,16 +784,6 @@ int orterun(int argc, char *argv[])
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Change the default behavior of libevent such that we want to
|
|
||||||
continually block rather than blocking for the default timeout
|
|
||||||
and then looping around the progress engine again. There
|
|
||||||
should be nothing in the orted that cannot block in libevent
|
|
||||||
until "something" happens (i.e., there's no need to keep
|
|
||||||
cycling through progress because the only things that should
|
|
||||||
happen will happen in libevent). This is a minor optimization,
|
|
||||||
but what the heck... :-) */
|
|
||||||
opal_progress_set_event_flag(OPAL_EVLOOP_ONCE);
|
|
||||||
|
|
||||||
/* If we have a prefix, then modify the PATH and
|
/* If we have a prefix, then modify the PATH and
|
||||||
LD_LIBRARY_PATH environment variables in our copy. This
|
LD_LIBRARY_PATH environment variables in our copy. This
|
||||||
will ensure that any locally-spawned children will
|
will ensure that any locally-spawned children will
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user