1
1

oshmem/scoll: fix shmem_collect32/64 for zero-size length

Fixes scoll_basic failures with shmem_verifier, caused by recent changes
in handling of zero-size collectives.

- Check for zero-size length only for fixed size collect (shmem_fcollect),
  but not for variable-size collect (shmem_collect)
- Add 'nlong_type' parameter to the internal broadcast function, to indicate
  whether the 'nlong' parameter is valid on non-root PEs, since it's
  used by the shmem_collect algorithm. Before this change, some components
  assumed 'nlong' is always valid on non-root PEs (scoll_mpi) while
  others assumed it is not (scoll_basic).
- In scoll_basic, if nlong_type==false, do not exit if nlong==0, since
  this parameter may not be the same on all PEs.
- In scoll_mpi, fall back to scoll_basic if nlong_type==false, since MPI
  requires the 'count' argument of MPI_Bcast to be valid on all ranks.

Signed-off-by: Yossi Itigin <yosefe@mellanox.com>
Этот коммит содержится в:
Yossi Itigin 2019-01-01 19:50:28 +02:00
родитель e54496bf2a
Коммит 939162ed33
12 изменённых файлов: 45 добавлений и 22 удалений

Просмотреть файл

@ -77,6 +77,7 @@ static int scoll_null_broadcast(struct oshmem_group_t *group,
const void *source,
size_t nlong,
long *pSync,
bool nlong_type,
int alg)
{
if (oshmem_proc_group_is_member(group)) {

Просмотреть файл

@ -61,6 +61,7 @@ int mca_scoll_basic_broadcast(struct oshmem_group_t *group,
const void *source,
size_t nlong,
long *pSync,
bool nlong_type,
int alg);
int mca_scoll_basic_collect(struct oshmem_group_t *group,
void *target,

Просмотреть файл

@ -41,6 +41,7 @@ int mca_scoll_basic_broadcast(struct oshmem_group_t *group,
const void *source,
size_t nlong,
long *pSync,
bool nlong_type,
int alg)
{
int rc = OSHMEM_SUCCESS;
@ -56,7 +57,7 @@ int mca_scoll_basic_broadcast(struct oshmem_group_t *group,
int i = 0;
/* Do nothing on zero-length request */
if (OPAL_UNLIKELY(!nlong)) {
if (OPAL_UNLIKELY(nlong_type && !nlong)) {
return OSHMEM_SUCCESS;
}

Просмотреть файл

@ -66,12 +66,13 @@ int mca_scoll_basic_collect(struct oshmem_group_t *group,
if ((rc == OSHMEM_SUCCESS) && oshmem_proc_group_is_member(group)) {
int i = 0;
/* Do nothing on zero-length request */
if (OPAL_UNLIKELY(!nlong)) {
return OPAL_SUCCESS;
}
if (nlong_type) {
/* Do nothing on zero-length request */
if (OPAL_UNLIKELY(!nlong)) {
return OPAL_SUCCESS;
}
alg = (alg == SCOLL_DEFAULT_ALG ?
mca_scoll_basic_param_collect_algorithm : alg);
switch (alg) {
@ -198,6 +199,7 @@ static int _algorithm_f_central_counter(struct oshmem_group_t *group,
target,
group->proc_count * nlong,
(pSync + 1),
true,
SCOLL_DEFAULT_ALG);
}
@ -308,6 +310,7 @@ static int _algorithm_f_tournament(struct oshmem_group_t *group,
target,
group->proc_count * nlong,
(pSync + 1),
true,
SCOLL_DEFAULT_ALG);
}
@ -629,6 +632,7 @@ static int _algorithm_central_collector(struct oshmem_group_t *group,
target,
offset,
(pSync + 1),
false,
SCOLL_DEFAULT_ALG);
}

Просмотреть файл

@ -242,6 +242,7 @@ static int _algorithm_central_counter(struct oshmem_group_t *group,
target,
nlong,
(pSync + 1),
true,
SCOLL_DEFAULT_ALG);
}
@ -360,6 +361,7 @@ static int _algorithm_tournament(struct oshmem_group_t *group,
target,
nlong,
(pSync + 1),
true,
SCOLL_DEFAULT_ALG);
}
@ -639,6 +641,7 @@ static int _algorithm_linear(struct oshmem_group_t *group,
target,
nlong,
(pSync + 1),
true,
SCOLL_DEFAULT_ALG);
}
@ -807,6 +810,7 @@ static int _algorithm_log(struct oshmem_group_t *group,
target,
nlong,
(pSync + 1),
true,
SCOLL_DEFAULT_ALG);
}

Просмотреть файл

@ -115,6 +115,7 @@ int mca_scoll_fca_broadcast(struct oshmem_group_t *group,
const void *source,
size_t nlong,
long *pSync,
bool nlong_type,
int algorithm_type);
int mca_scoll_fca_collect(struct oshmem_group_t *group,
void *target,

Просмотреть файл

@ -50,6 +50,7 @@ int mca_scoll_fca_broadcast(struct oshmem_group_t *group,
const void *source,
size_t nlong,
long *pSync,
bool nlong_type,
int alg)
{
mca_scoll_fca_module_t *fca_module =

Просмотреть файл

@ -90,6 +90,7 @@ int mca_scoll_mpi_broadcast(struct oshmem_group_t *group,
const void *source,
size_t nlong,
long *pSync,
bool nlong_type,
int alg);
int mca_scoll_mpi_collect(struct oshmem_group_t *group,

Просмотреть файл

@ -38,6 +38,7 @@ int mca_scoll_mpi_broadcast(struct oshmem_group_t *group,
const void *source,
size_t nlong,
long *pSync,
bool nlong_type,
int alg)
{
mca_scoll_mpi_module_t *mpi_module;
@ -54,20 +55,14 @@ int mca_scoll_mpi_broadcast(struct oshmem_group_t *group,
}
dtype = &ompi_mpi_char.dt;
root = oshmem_proc_group_find_id(group, PE_root);
/* Do nothing on zero-length request */
if (OPAL_UNLIKELY(!nlong)) {
return OSHMEM_SUCCESS;
}
/* Open SHMEM specification has the following constraints (page 85):
* "If using C/C++, nelems must be of type integer. If you are using Fortran, it must be a
* default integer value". And also the Fortran signature says "INTEGER".
* Since ompi coll components don't support size_t at the moment,
* and considering this contradiction, we cast size_t to int here
* if the value does not exceed INT_MAX and fall back to the previous module otherwise. */
if (OPAL_UNLIKELY(!nlong_type || (INT_MAX < nlong))) {
#ifdef INCOMPATIBLE_SHMEM_OMPI_COLL_APIS
if (INT_MAX < nlong) {
MPI_COLL_VERBOSE(20,"RUNNING FALLBACK BCAST");
PREVIOUS_SCOLL_FN(mpi_module, broadcast, group,
PE_root,
@ -75,13 +70,21 @@ int mca_scoll_mpi_broadcast(struct oshmem_group_t *group,
source,
nlong,
pSync,
nlong_type,
SCOLL_DEFAULT_ALG);
return rc;
}
rc = mpi_module->comm->c_coll->coll_bcast(buf, (int)nlong, dtype, root, mpi_module->comm, mpi_module->comm->c_coll->coll_bcast_module);
#else
rc = mpi_module->comm->c_coll->coll_bcast(buf, nlong, dtype, root, mpi_module->comm, mpi_module->comm->c_coll->coll_bcast_module);
MPI_COLL_ERROR(20, "variable broadcast length, or exceeds INT_MAX: %zu", nlong);
return OSHMEM_ERR_NOT_SUPPORTED;
#endif
}
/* Do nothing on zero-length request */
if (OPAL_UNLIKELY(!nlong)) {
return OSHMEM_SUCCESS;
}
rc = mpi_module->comm->c_coll->coll_bcast(buf, nlong, dtype, root, mpi_module->comm, mpi_module->comm->c_coll->coll_bcast_module);
if (OMPI_SUCCESS != rc){
MPI_COLL_VERBOSE(20,"RUNNING FALLBACK BCAST");
PREVIOUS_SCOLL_FN(mpi_module, broadcast, group,
@ -90,6 +93,7 @@ int mca_scoll_mpi_broadcast(struct oshmem_group_t *group,
source,
nlong,
pSync,
nlong_type,
SCOLL_DEFAULT_ALG);
}
return rc;
@ -111,12 +115,13 @@ int mca_scoll_mpi_collect(struct oshmem_group_t *group,
MPI_COLL_VERBOSE(20,"RUNNING MPI ALLGATHER");
mpi_module = (mca_scoll_mpi_module_t *) group->g_scoll.scoll_collect_module;
/* Do nothing on zero-length request */
if (OPAL_UNLIKELY(!nlong)) {
return OSHMEM_SUCCESS;
}
if (nlong_type == true) {
/* Do nothing on zero-length request */
if (OPAL_UNLIKELY(!nlong)) {
return OSHMEM_SUCCESS;
}
sbuf = (void *) source;
rbuf = target;
stype = &ompi_mpi_char.dt;

Просмотреть файл

@ -122,6 +122,7 @@ typedef int (*mca_scoll_base_module_broadcast_fn_t)(struct oshmem_group_t *group
const void *source,
size_t nlong,
long *pSync,
bool nlong_type,
int alg);
typedef int (*mca_scoll_base_module_collect_fn_t)(struct oshmem_group_t *group,
void *target,

Просмотреть файл

@ -78,6 +78,7 @@ static void _shmem_broadcast(void *target,
source,
nbytes,
pSync,
true,
SCOLL_DEFAULT_ALG);
out:
oshmem_proc_group_destroy(group);

Просмотреть файл

@ -93,7 +93,9 @@ SHMEM_GENERATE_FORTRAN_BINDINGS_SUB (void,
FPTR_2_VOID_PTR(target), \
FPTR_2_VOID_PTR(source), \
OMPI_FINT_2_INT(*nlong) * op->dt_size, \
FPTR_2_VOID_PTR(pSync), SCOLL_DEFAULT_ALG );\
FPTR_2_VOID_PTR(pSync), \
true, \
SCOLL_DEFAULT_ALG );\
out: \
oshmem_proc_group_destroy(group);\
RUNTIME_CHECK_RC(rc); \