OSHMEM: added processing of zero-length collectives
- according to the OpenSHMEM 1.4 specification, Annex C, shmem collectives
must handle calls where the number of elements is zero regardless of the
pointer value
- added zero-count processing: it just calls a barrier to
sync ranks
Signed-off-by: Sergey Oblomov <sergeyo@mellanox.com>
(cherry picked from commit 9de128afaf
)
Этот коммит содержится в:
родитель
7fc0841791
Коммит
dea9cf6b63
@ -61,6 +61,7 @@ int mca_scoll_basic_alltoall(struct oshmem_group_t *group,
|
||||
return OSHMEM_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
if (nelems) {
|
||||
if ((sst == 1) && (dst == 1)) {
|
||||
rc = a2a_alg_simple(group, target, source, nelems, element_size);
|
||||
} else {
|
||||
@ -71,6 +72,7 @@ int mca_scoll_basic_alltoall(struct oshmem_group_t *group,
|
||||
if (rc != OSHMEM_SUCCESS) {
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
||||
/* quiet is needed because scoll level barrier does not
|
||||
* guarantee put completion
|
||||
|
@ -131,7 +131,7 @@ static int _algorithm_central_counter(struct oshmem_group_t *group,
|
||||
group->my_pe, pSync[0], PE_root);
|
||||
|
||||
/* Check if this PE is the root */
|
||||
if (PE_root == group->my_pe) {
|
||||
if ((PE_root == group->my_pe) && nlong) {
|
||||
int pe_cur = 0;
|
||||
|
||||
SCOLL_VERBOSE(14,
|
||||
@ -192,6 +192,16 @@ static int _algorithm_binomial_tree(struct oshmem_group_t *group,
|
||||
"[#%d] pSync[0] = %ld root = #%d",
|
||||
group->my_pe, pSync[0], PE_root);
|
||||
|
||||
if (OPAL_UNLIKELY(!nlong)) {
|
||||
SCOLL_VERBOSE(14, "[#%d] Wait for operation completion", group->my_pe);
|
||||
/* wait until root finishes sending data */
|
||||
rc = BARRIER_FUNC(group,
|
||||
(pSync + 1),
|
||||
SCOLL_DEFAULT_ALG);
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
vrank = (my_id + group->proc_count - root_id) % group->proc_count;
|
||||
hibit = opal_hibit(vrank, dim);
|
||||
|
||||
|
@ -66,7 +66,7 @@ int mca_scoll_basic_collect(struct oshmem_group_t *group,
|
||||
if ((rc == OSHMEM_SUCCESS) && oshmem_proc_group_is_member(group)) {
|
||||
int i = 0;
|
||||
|
||||
if (nlong_type) {
|
||||
if (nlong_type && nlong) {
|
||||
alg = (alg == SCOLL_DEFAULT_ALG ?
|
||||
mca_scoll_basic_param_collect_algorithm : alg);
|
||||
switch (alg) {
|
||||
@ -156,7 +156,7 @@ static int _algorithm_f_central_counter(struct oshmem_group_t *group,
|
||||
group->my_pe);
|
||||
SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]);
|
||||
|
||||
if (PE_root == group->my_pe) {
|
||||
if ((PE_root == group->my_pe) && nlong) {
|
||||
int pe_cur = 0;
|
||||
|
||||
memcpy((void*) ((unsigned char*) target + 0 * nlong),
|
||||
@ -543,7 +543,7 @@ static int _algorithm_central_collector(struct oshmem_group_t *group,
|
||||
/* Set own data size */
|
||||
pSync[0] = (nlong ? (long)nlong : SHMEM_SYNC_READY);
|
||||
|
||||
if (PE_root == group->my_pe) {
|
||||
if ((PE_root == group->my_pe) && nlong) {
|
||||
long value = 0;
|
||||
int pe_cur = 0;
|
||||
long wait_pe_count = 0;
|
||||
|
@ -79,8 +79,9 @@ int mca_scoll_basic_reduce(struct oshmem_group_t *group,
|
||||
int i = 0;
|
||||
|
||||
if (pSync) {
|
||||
alg = (alg == SCOLL_DEFAULT_ALG ?
|
||||
mca_scoll_basic_param_reduce_algorithm : alg);
|
||||
alg = (nlong ? (alg == SCOLL_DEFAULT_ALG ?
|
||||
mca_scoll_basic_param_reduce_algorithm : alg) :
|
||||
SCOLL_ALG_REDUCE_CENTRAL_COUNTER );
|
||||
switch (alg) {
|
||||
case SCOLL_ALG_REDUCE_CENTRAL_COUNTER:
|
||||
{
|
||||
@ -185,7 +186,7 @@ static int _algorithm_central_counter(struct oshmem_group_t *group,
|
||||
|
||||
SCOLL_VERBOSE(12, "[#%d] Reduce algorithm: Central Counter", group->my_pe);
|
||||
|
||||
if (PE_root == group->my_pe) {
|
||||
if ((PE_root == group->my_pe) && nlong) {
|
||||
int pe_cur = 0;
|
||||
void *target_cur = NULL;
|
||||
|
||||
|
@ -61,7 +61,7 @@ int mca_scoll_mpi_broadcast(struct oshmem_group_t *group,
|
||||
* and considering this contradiction, we cast size_t to int here
|
||||
* if the value does not exceed INT_MAX, and fall back to the previous module otherwise. */
|
||||
#ifdef INCOMPATIBLE_SHMEM_OMPI_COLL_APIS
|
||||
if (INT_MAX < nlong) {
|
||||
if ((INT_MAX < nlong) || !nlong) {
|
||||
MPI_COLL_VERBOSE(20,"RUNNING FALLBACK BCAST");
|
||||
PREVIOUS_SCOLL_FN(mpi_module, broadcast, group,
|
||||
PE_root,
|
||||
@ -104,7 +104,7 @@ int mca_scoll_mpi_collect(struct oshmem_group_t *group,
|
||||
void *sbuf, *rbuf;
|
||||
MPI_COLL_VERBOSE(20,"RUNNING MPI ALLGATHER");
|
||||
mpi_module = (mca_scoll_mpi_module_t *) group->g_scoll.scoll_collect_module;
|
||||
if (nlong_type == true) {
|
||||
if ((nlong_type == true) && nlong) {
|
||||
sbuf = (void *) source;
|
||||
rbuf = target;
|
||||
stype = &ompi_mpi_char.dt;
|
||||
@ -184,7 +184,7 @@ int mca_scoll_mpi_reduce(struct oshmem_group_t *group,
|
||||
* and considering this contradiction, we cast size_t to int here
|
||||
* if the value does not exceed INT_MAX, and fall back to the previous module otherwise. */
|
||||
#ifdef INCOMPATIBLE_SHMEM_OMPI_COLL_APIS
|
||||
if (INT_MAX < count) {
|
||||
if ((INT_MAX < count) || !nlong) {
|
||||
MPI_COLL_VERBOSE(20,"RUNNING FALLBACK REDUCE");
|
||||
PREVIOUS_SCOLL_FN(mpi_module, reduce, group,
|
||||
op,
|
||||
|
@ -200,6 +200,13 @@ OSHMEM_DECLSPEC int oshmem_shmem_register_params(void);
|
||||
RUNTIME_CHECK_ERROR("Required address %p is not in symmetric space\n", ((void*)x)); \
|
||||
oshmem_shmem_abort(-1); \
|
||||
}
|
||||
/* Check that the address is in symmetric space; the check is skipped when the size is zero */
|
||||
#define RUNTIME_CHECK_ADDR_SIZE(x,s) \
|
||||
if (OPAL_UNLIKELY((s) && !MCA_MEMHEAP_CALL(is_symmetric_addr((x))))) \
|
||||
{ \
|
||||
RUNTIME_CHECK_ERROR("Required address %p is not in symmetric space\n", ((void*)x)); \
|
||||
oshmem_shmem_abort(-1); \
|
||||
}
|
||||
#define RUNTIME_CHECK_WITH_MEMHEAP_SIZE(x) \
|
||||
if (OPAL_UNLIKELY((long)(x) > MCA_MEMHEAP_CALL(size))) \
|
||||
{ \
|
||||
@ -212,6 +219,7 @@ OSHMEM_DECLSPEC int oshmem_shmem_register_params(void);
|
||||
#define RUNTIME_CHECK_INIT()
|
||||
#define RUNTIME_CHECK_PE(x)
|
||||
#define RUNTIME_CHECK_ADDR(x)
|
||||
#define RUNTIME_CHECK_ADDR_SIZE(x,s)
|
||||
#define RUNTIME_CHECK_WITH_MEMHEAP_SIZE(x)
|
||||
|
||||
#endif /* OSHMEM_PARAM_CHECK */
|
||||
|
@ -40,8 +40,8 @@ static void _shmem_alltoall(void *target,
|
||||
long *pSync) \
|
||||
{ \
|
||||
RUNTIME_CHECK_INIT(); \
|
||||
RUNTIME_CHECK_ADDR(target); \
|
||||
RUNTIME_CHECK_ADDR(source); \
|
||||
RUNTIME_CHECK_ADDR_SIZE(target, nelems); \
|
||||
RUNTIME_CHECK_ADDR_SIZE(source, nelems); \
|
||||
\
|
||||
_shmem_alltoall(target, source, 1, 1, nelems, element_size, \
|
||||
PE_start, logPE_stride, PE_size, \
|
||||
@ -59,8 +59,8 @@ static void _shmem_alltoall(void *target,
|
||||
long *pSync) \
|
||||
{ \
|
||||
RUNTIME_CHECK_INIT(); \
|
||||
RUNTIME_CHECK_ADDR(target); \
|
||||
RUNTIME_CHECK_ADDR(source); \
|
||||
RUNTIME_CHECK_ADDR_SIZE(target, nelems); \
|
||||
RUNTIME_CHECK_ADDR_SIZE(source, nelems); \
|
||||
\
|
||||
_shmem_alltoall(target, source, dst, sst, nelems, element_size, \
|
||||
PE_start, logPE_stride, PE_size, \
|
||||
|
@ -40,8 +40,8 @@ static void _shmem_broadcast(void *target,
|
||||
long *pSync) \
|
||||
{ \
|
||||
RUNTIME_CHECK_INIT(); \
|
||||
RUNTIME_CHECK_ADDR(target); \
|
||||
RUNTIME_CHECK_ADDR(source); \
|
||||
RUNTIME_CHECK_ADDR_SIZE(target, nelems); \
|
||||
RUNTIME_CHECK_ADDR_SIZE(source, nelems); \
|
||||
\
|
||||
_shmem_broadcast( target, source, nelems * element_size, \
|
||||
PE_root, PE_start, logPE_stride, PE_size, \
|
||||
|
@ -39,8 +39,8 @@ static void _shmem_collect(void *target,
|
||||
long *pSync) \
|
||||
{ \
|
||||
RUNTIME_CHECK_INIT(); \
|
||||
RUNTIME_CHECK_ADDR(target); \
|
||||
RUNTIME_CHECK_ADDR(source); \
|
||||
RUNTIME_CHECK_ADDR_SIZE(target, nelems); \
|
||||
RUNTIME_CHECK_ADDR_SIZE(source, nelems); \
|
||||
\
|
||||
_shmem_collect( target, source, nelems * element_size, \
|
||||
PE_start, logPE_stride, PE_size, \
|
||||
|
@ -40,8 +40,8 @@
|
||||
oshmem_group_t* group = NULL; \
|
||||
\
|
||||
RUNTIME_CHECK_INIT(); \
|
||||
RUNTIME_CHECK_ADDR(target); \
|
||||
RUNTIME_CHECK_ADDR(source); \
|
||||
RUNTIME_CHECK_ADDR_SIZE(target, nreduce); \
|
||||
RUNTIME_CHECK_ADDR_SIZE(source, nreduce); \
|
||||
\
|
||||
{ \
|
||||
group = oshmem_proc_group_create_nofail(PE_start, 1<<logPE_stride, PE_size); \
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user