1. Fix in the oshmem scoll component: basic algorithms should
   call the basic collectives, since their implementation is
   incompatible with the others (fca, hcoll).

2. Set the OPAL_EVLOOP_ONCE flag ON for libevent in the case
   of the yoda spml. Otherwise a deadlock is possible in the
   atomic_basic_lock call.

Fixed by Val and Igor, reviewed by Miked.

cmr=v1.7.5:reviewer=ompi-rm1.7

This commit was SVN r30762.
This commit is contained in:
Mike Dubman 2014-02-18 15:07:03 +00:00
parent 69aba904ed
commit 982149d8c8
5 changed files with 74 additions and 53 deletions

View file

@@ -19,6 +19,17 @@
 BEGIN_C_DECLS
+/* These functions (BARRIER_FUNC, BCAST_FUNC) may be called from any basic
+ * algorithm. In the case of shmem, the implementation of broadcast doesn't
+ * require each process to know the message size (only the root has to).
+ * This differs from other implementations, so it may cause problems if
+ * BCAST_FUNC is a callback into another implementation (e.g., fca, hcoll).
+ * So we replace the callback (group->g_scoll.scoll_[func])
+ * with the corresponding basic function. */
+#define BARRIER_FUNC mca_scoll_basic_barrier
+#define BCAST_FUNC mca_scoll_basic_broadcast
 /* Globally exported variables */
 OSHMEM_MODULE_DECLSPEC extern mca_scoll_base_component_1_0_0_t
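
For context, a minimal runnable sketch of the dispatch problem this change
avoids. Everything below is a hypothetical illustration rather than OSHMEM
code; only the idea of a per-group callback table (group->g_scoll) is taken
from the diff above:

#include <stdio.h>
#include <stddef.h>

typedef int (*bcast_fn)(int root, size_t nlong);

/* "basic"-style semantics: only the root's size argument matters. */
static int basic_bcast(int root, size_t nlong)
{
    (void) nlong;
    printf("basic bcast from root %d\n", root);
    return 0;
}

/* "fca/hcoll"-style semantics: every PE must pass the real size. */
static int strict_bcast(int root, size_t nlong)
{
    (void) root;
    if (0 == nlong) { /* a non-root PE that doesn't know the size */
        printf("strict bcast: size mismatch on non-root PE\n");
        return -1;
    }
    return 0;
}

struct group_ops { bcast_fn bcast; }; /* analogue of group->g_scoll */

int main(void)
{
    struct group_ops g = { strict_bcast }; /* group bound to another module */

    g.bcast(0, 0);     /* old path: the callback breaks on non-root PEs */
    basic_bcast(0, 0); /* new path (BCAST_FUNC): only the root needs the size */
    return 0;
}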

View file

@@ -151,9 +151,9 @@ static int _algorithm_central_counter(struct oshmem_group_t *group,
     /* Wait for operation completion to set needed size */
     if (rc == OSHMEM_SUCCESS) {
         SCOLL_VERBOSE(14, "[#%d] Wait for operation completion", group->my_pe);
-        rc = group->g_scoll.scoll_barrier(group,
-                                          (pSync + 1),
-                                          SCOLL_DEFAULT_ALG);
+        rc = BARRIER_FUNC(group,
+                          (pSync + 1),
+                          SCOLL_DEFAULT_ALG);
     }
     return rc;

View file

@@ -187,13 +187,13 @@ static int _algorithm_f_central_counter(struct oshmem_group_t *group,
         SCOLL_VERBOSE(14,
                       "[#%d] Broadcast from the root #%d",
                       group->my_pe, PE_root);
-        rc = group->g_scoll.scoll_broadcast(group,
-                                            PE_root,
-                                            target,
-                                            target,
-                                            group->proc_count * nlong,
-                                            (pSync + 1),
-                                            SCOLL_DEFAULT_ALG);
+        rc = BCAST_FUNC(group,
+                        PE_root,
+                        target,
+                        target,
+                        group->proc_count * nlong,
+                        (pSync + 1),
+                        SCOLL_DEFAULT_ALG);
     }
     SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]);
@@ -297,13 +297,13 @@ static int _algorithm_f_tournament(struct oshmem_group_t *group,
         SCOLL_VERBOSE(14,
                       "[#%d] Broadcast from the root #%d",
                       group->my_pe, PE_root);
-        rc = group->g_scoll.scoll_broadcast(group,
-                                            PE_root,
-                                            target,
-                                            target,
-                                            group->proc_count * nlong,
-                                            (pSync + 1),
-                                            SCOLL_DEFAULT_ALG);
+        rc = BCAST_FUNC(group,
+                        PE_root,
+                        target,
+                        target,
+                        group->proc_count * nlong,
+                        (pSync + 1),
+                        SCOLL_DEFAULT_ALG);
     }
     SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]);
@@ -612,13 +612,14 @@ static int _algorithm_central_collector(struct oshmem_group_t *group,
         SCOLL_VERBOSE(14,
                       "[#%d] Broadcast from the root #%d",
                       group->my_pe, PE_root);
-        rc = group->g_scoll.scoll_broadcast(group,
-                                            PE_root,
-                                            target,
-                                            target,
-                                            offset,
-                                            (pSync + 1),
-                                            SCOLL_DEFAULT_ALG);
+        rc = BCAST_FUNC(group,
+                        PE_root,
+                        target,
+                        target,
+                        offset,
+                        (pSync + 1),
+                        SCOLL_DEFAULT_ALG);
     }
     return rc;

View file

@@ -231,13 +231,13 @@ static int _algorithm_central_counter(struct oshmem_group_t *group,
         SCOLL_VERBOSE(14,
                       "[#%d] Broadcast from the root #%d",
                       group->my_pe, PE_root);
-        rc = group->g_scoll.scoll_broadcast(group,
-                                            PE_root,
-                                            target,
-                                            target,
-                                            nlong,
-                                            (pSync + 1),
-                                            SCOLL_DEFAULT_ALG);
+        rc = BCAST_FUNC(group,
+                        PE_root,
+                        target,
+                        target,
+                        nlong,
+                        (pSync + 1),
+                        SCOLL_DEFAULT_ALG);
     }
     return rc;
@@ -349,13 +349,13 @@ static int _algorithm_tournament(struct oshmem_group_t *group,
         SCOLL_VERBOSE(14,
                       "[#%d] Broadcast from the root #%d",
                       group->my_pe, PE_root);
-        rc = group->g_scoll.scoll_broadcast(group,
-                                            PE_root,
-                                            target,
-                                            target,
-                                            nlong,
-                                            (pSync + 1),
-                                            SCOLL_DEFAULT_ALG);
+        rc = BCAST_FUNC(group,
+                        PE_root,
+                        target,
+                        target,
+                        nlong,
+                        (pSync + 1),
+                        SCOLL_DEFAULT_ALG);
     }
     free(target_cur);
@@ -628,13 +628,13 @@ static int _algorithm_linear(struct oshmem_group_t *group,
         SCOLL_VERBOSE(14,
                       "[#%d] Broadcast from the root #%d",
                       group->my_pe, root_pe);
-        rc = group->g_scoll.scoll_broadcast(group,
-                                            root_pe,
-                                            target,
-                                            target,
-                                            nlong,
-                                            (pSync + 1),
-                                            SCOLL_DEFAULT_ALG);
+        rc = BCAST_FUNC(group,
+                        root_pe,
+                        target,
+                        target,
+                        nlong,
+                        (pSync + 1),
+                        SCOLL_DEFAULT_ALG);
     }
     /* All done */
@@ -796,13 +796,13 @@ static int _algorithm_log(struct oshmem_group_t *group,
         SCOLL_VERBOSE(14,
                       "[#%d] Broadcast from the root #%d",
                       rank, root_pe);
-        rc = group->g_scoll.scoll_broadcast(group,
-                                            root_pe,
-                                            target,
-                                            target,
-                                            nlong,
-                                            (pSync + 1),
-                                            SCOLL_DEFAULT_ALG);
+        rc = BCAST_FUNC(group,
+                        root_pe,
+                        target,
+                        target,
+                        nlong,
+                        (pSync + 1),
+                        SCOLL_DEFAULT_ALG);
     }
     /* All done */

View file

@@ -418,7 +418,7 @@ mca_spml_mkey_t *mca_spml_yoda_register(void* addr,
         SPML_VERBOSE(5,
                      "rank %d btl %s address 0x%p len %llu shmid 0x%X|0x%X",
-                     oshmem_proc_local_proc->proc_name.vpid, btl_type2str(ybtl->btl_type),
+                     oshmem_proc_local_proc->proc_name.vpid, btl_type2str(ybtl->btl_type),
                      mkeys[i].va_base, (unsigned long long)size, MEMHEAP_SHM_GET_TYPE(shmid), MEMHEAP_SHM_GET_ID(shmid));
     }
     OBJ_DESTRUCT(&convertor);
@@ -917,6 +917,15 @@ int mca_spml_yoda_enable(bool enable)
     mca_spml_yoda.enabled = true;
+    /* The following line resolves an issue seen with BTL tcp and SPML yoda. In that case
+     * the atomic_basic_lock(root_rank) function may behave like a DoS attack on root_rank,
+     * since all the processes do shmem_int_get from root_rank. These calls go through
+     * bml active messaging and trigger replies in libevent on the root rank. If the
+     * OPAL_EVLOOP_ONCE flag is not set, libevent continuously progresses the stream of
+     * incoming events, causing root_rank to get stuck in the libevent loop.
+     */
+    opal_progress_set_event_flag(OPAL_EVLOOP_NONBLOCK | OPAL_EVLOOP_ONCE);
 #if OSHMEM_WAIT_COMPLETION_DEBUG == 1
     condition_dbg_init();
 #endif
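
For reference, a minimal sketch of what these two flags mean at the libevent
level. event_base_loop() and the EVLOOP_* flags are the real libevent API;
the wrapper function below is only an illustration of a single progress
step, not OPAL's actual implementation:

#include <event2/event.h>

/* Illustrative progress step (hypothetical helper, not OPAL code). */
static void progress_step(struct event_base *base)
{
    /* EVLOOP_NONBLOCK: poll for ready events without sleeping.
     * EVLOOP_ONCE: run the callbacks that are active now, then return.
     * Without EVLOOP_ONCE, libevent keeps iterating for as long as new
     * callbacks keep becoming active, so a root rank flooded with
     * shmem_int_get traffic never gets out of this call. */
    event_base_loop(base, EVLOOP_NONBLOCK | EVLOOP_ONCE);
}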