1
1
1. Fix in the oshmem scoll component: basic algorithms should
   call the basic collectives, since their implementation is
   incompatible with the others (fca, hcoll).

2. Set the OPAL_EVLOOP_ONCE flag ON for libevent in the case
   of yoda spml. Otherwise a deadlock is possible in the
   atomic_basic_lock call.

fixed by Val, Igor, reviewed by Miked

cmr=v1.7.5:reviewer=ompi-rm1.7

This commit was SVN r30762.
Этот коммит содержится в:
Mike Dubman 2014-02-18 15:07:03 +00:00
родитель 69aba904ed
Коммит 982149d8c8
5 изменённых файлов: 74 добавлений и 53 удалений

Просмотреть файл

@ -19,6 +19,17 @@
BEGIN_C_DECLS BEGIN_C_DECLS
/* These functions (BARRIER_FUNC, BCAST_FUNC) may be called from any basic algorithm.
* In the case of shmem, the implementation of broadcast doesn't require
* each process to know the message size (only the root needs to know it).
* This differs from other implementations, so it may cause problems if
* BCAST_FUNC is a callback to another implementation (e.g., fca, hcoll).
* So we replace the callback (group->g_scoll.scoll_[func])
* with the corresponding basic function. */
#define BARRIER_FUNC mca_scoll_basic_barrier
#define BCAST_FUNC mca_scoll_basic_broadcast
/* Globally exported variables */ /* Globally exported variables */
OSHMEM_MODULE_DECLSPEC extern mca_scoll_base_component_1_0_0_t OSHMEM_MODULE_DECLSPEC extern mca_scoll_base_component_1_0_0_t

Просмотреть файл

@ -151,7 +151,7 @@ static int _algorithm_central_counter(struct oshmem_group_t *group,
/* Wait for operation completion to set needed size */ /* Wait for operation completion to set needed size */
if (rc == OSHMEM_SUCCESS) { if (rc == OSHMEM_SUCCESS) {
SCOLL_VERBOSE(14, "[#%d] Wait for operation completion", group->my_pe); SCOLL_VERBOSE(14, "[#%d] Wait for operation completion", group->my_pe);
rc = group->g_scoll.scoll_barrier(group, rc = BARRIER_FUNC(group,
(pSync + 1), (pSync + 1),
SCOLL_DEFAULT_ALG); SCOLL_DEFAULT_ALG);
} }

Просмотреть файл

@ -187,7 +187,7 @@ static int _algorithm_f_central_counter(struct oshmem_group_t *group,
SCOLL_VERBOSE(14, SCOLL_VERBOSE(14,
"[#%d] Broadcast from the root #%d", "[#%d] Broadcast from the root #%d",
group->my_pe, PE_root); group->my_pe, PE_root);
rc = group->g_scoll.scoll_broadcast(group, rc = BCAST_FUNC(group,
PE_root, PE_root,
target, target,
target, target,
@ -297,7 +297,7 @@ static int _algorithm_f_tournament(struct oshmem_group_t *group,
SCOLL_VERBOSE(14, SCOLL_VERBOSE(14,
"[#%d] Broadcast from the root #%d", "[#%d] Broadcast from the root #%d",
group->my_pe, PE_root); group->my_pe, PE_root);
rc = group->g_scoll.scoll_broadcast(group, rc = BCAST_FUNC(group,
PE_root, PE_root,
target, target,
target, target,
@ -612,7 +612,8 @@ static int _algorithm_central_collector(struct oshmem_group_t *group,
SCOLL_VERBOSE(14, SCOLL_VERBOSE(14,
"[#%d] Broadcast from the root #%d", "[#%d] Broadcast from the root #%d",
group->my_pe, PE_root); group->my_pe, PE_root);
rc = group->g_scoll.scoll_broadcast(group,
rc = BCAST_FUNC(group,
PE_root, PE_root,
target, target,
target, target,

Просмотреть файл

@ -231,7 +231,7 @@ static int _algorithm_central_counter(struct oshmem_group_t *group,
SCOLL_VERBOSE(14, SCOLL_VERBOSE(14,
"[#%d] Broadcast from the root #%d", "[#%d] Broadcast from the root #%d",
group->my_pe, PE_root); group->my_pe, PE_root);
rc = group->g_scoll.scoll_broadcast(group, rc = BCAST_FUNC(group,
PE_root, PE_root,
target, target,
target, target,
@ -349,7 +349,7 @@ static int _algorithm_tournament(struct oshmem_group_t *group,
SCOLL_VERBOSE(14, SCOLL_VERBOSE(14,
"[#%d] Broadcast from the root #%d", "[#%d] Broadcast from the root #%d",
group->my_pe, PE_root); group->my_pe, PE_root);
rc = group->g_scoll.scoll_broadcast(group, rc = BCAST_FUNC(group,
PE_root, PE_root,
target, target,
target, target,
@ -628,7 +628,7 @@ static int _algorithm_linear(struct oshmem_group_t *group,
SCOLL_VERBOSE(14, SCOLL_VERBOSE(14,
"[#%d] Broadcast from the root #%d", "[#%d] Broadcast from the root #%d",
group->my_pe, root_pe); group->my_pe, root_pe);
rc = group->g_scoll.scoll_broadcast(group, rc = BCAST_FUNC(group,
root_pe, root_pe,
target, target,
target, target,
@ -796,7 +796,7 @@ static int _algorithm_log(struct oshmem_group_t *group,
SCOLL_VERBOSE(14, SCOLL_VERBOSE(14,
"[#%d] Broadcast from the root #%d", "[#%d] Broadcast from the root #%d",
rank, root_pe); rank, root_pe);
rc = group->g_scoll.scoll_broadcast(group, rc = BCAST_FUNC(group,
root_pe, root_pe,
target, target,
target, target,

Просмотреть файл

@ -917,6 +917,15 @@ int mca_spml_yoda_enable(bool enable)
mca_spml_yoda.enabled = true; mca_spml_yoda.enabled = true;
/* The following line resolves the issue with BTL tcp and SPML yoda. In this case the
* atomic_basic_lock(root_rank) function may behave like a DoS attack on root_rank, since
* all the processes will do shmem_int_get from root_rank. These calls go through
* bml active messaging and will trigger replays in libevent on the root rank. If the flag
* OPAL_EVLOOP_ONCE is not set, libevent will keep progressing the constantly
* incoming events, thus causing root_rank to get stuck in the libevent loop.
*/
opal_progress_set_event_flag(OPAL_EVLOOP_NONBLOCK | OPAL_EVLOOP_ONCE);
#if OSHMEM_WAIT_COMPLETION_DEBUG == 1 #if OSHMEM_WAIT_COMPLETION_DEBUG == 1
condition_dbg_init(); condition_dbg_init();
#endif #endif