oshmem/scoll: fix shmem_collect32/64 for zero-size length
Fixes scoll_basic failures with shmem_verifier, caused by recent changes in handling of zero-size collectives. - Check for zero-size length only for fixed size collect (shmem_fcollect), but not for variable-size collect (shmem_collect) - Add 'nlong_type' parameter to internal broadcast function, to indicate whether the 'nlong' parameter is valid on non-root PEs, since it's used by shmem_collect algorithm. Before this change, some components assumed it's true (scoll_mpi) while others assumed it's false (scoll_basic). - In scoll_basic, if nlong_type==false, do not exit if nlong==0, since this parameter may not be the same on all PEs. - In scoll_mpi, fallback to scoll_basic if nlong_type==false, since MPI requires the 'count' argument of MPI_Bcast to be valid on all ranks. Signed-off-by: Yossi Itigin <yosefe@mellanox.com>
Этот коммит содержится в:
родитель
e54496bf2a
Коммит
939162ed33
@ -77,6 +77,7 @@ static int scoll_null_broadcast(struct oshmem_group_t *group,
|
||||
const void *source,
|
||||
size_t nlong,
|
||||
long *pSync,
|
||||
bool nlong_type,
|
||||
int alg)
|
||||
{
|
||||
if (oshmem_proc_group_is_member(group)) {
|
||||
|
@ -61,6 +61,7 @@ int mca_scoll_basic_broadcast(struct oshmem_group_t *group,
|
||||
const void *source,
|
||||
size_t nlong,
|
||||
long *pSync,
|
||||
bool nlong_type,
|
||||
int alg);
|
||||
int mca_scoll_basic_collect(struct oshmem_group_t *group,
|
||||
void *target,
|
||||
|
@ -41,6 +41,7 @@ int mca_scoll_basic_broadcast(struct oshmem_group_t *group,
|
||||
const void *source,
|
||||
size_t nlong,
|
||||
long *pSync,
|
||||
bool nlong_type,
|
||||
int alg)
|
||||
{
|
||||
int rc = OSHMEM_SUCCESS;
|
||||
@ -56,7 +57,7 @@ int mca_scoll_basic_broadcast(struct oshmem_group_t *group,
|
||||
int i = 0;
|
||||
|
||||
/* Do nothing on zero-length request */
|
||||
if (OPAL_UNLIKELY(!nlong)) {
|
||||
if (OPAL_UNLIKELY(nlong_type && !nlong)) {
|
||||
return OSHMEM_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -66,12 +66,13 @@ int mca_scoll_basic_collect(struct oshmem_group_t *group,
|
||||
if ((rc == OSHMEM_SUCCESS) && oshmem_proc_group_is_member(group)) {
|
||||
int i = 0;
|
||||
|
||||
/* Do nothing on zero-length request */
|
||||
if (OPAL_UNLIKELY(!nlong)) {
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
if (nlong_type) {
|
||||
|
||||
/* Do nothing on zero-length request */
|
||||
if (OPAL_UNLIKELY(!nlong)) {
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
alg = (alg == SCOLL_DEFAULT_ALG ?
|
||||
mca_scoll_basic_param_collect_algorithm : alg);
|
||||
switch (alg) {
|
||||
@ -198,6 +199,7 @@ static int _algorithm_f_central_counter(struct oshmem_group_t *group,
|
||||
target,
|
||||
group->proc_count * nlong,
|
||||
(pSync + 1),
|
||||
true,
|
||||
SCOLL_DEFAULT_ALG);
|
||||
}
|
||||
|
||||
@ -308,6 +310,7 @@ static int _algorithm_f_tournament(struct oshmem_group_t *group,
|
||||
target,
|
||||
group->proc_count * nlong,
|
||||
(pSync + 1),
|
||||
true,
|
||||
SCOLL_DEFAULT_ALG);
|
||||
}
|
||||
|
||||
@ -629,6 +632,7 @@ static int _algorithm_central_collector(struct oshmem_group_t *group,
|
||||
target,
|
||||
offset,
|
||||
(pSync + 1),
|
||||
false,
|
||||
SCOLL_DEFAULT_ALG);
|
||||
}
|
||||
|
||||
|
@ -242,6 +242,7 @@ static int _algorithm_central_counter(struct oshmem_group_t *group,
|
||||
target,
|
||||
nlong,
|
||||
(pSync + 1),
|
||||
true,
|
||||
SCOLL_DEFAULT_ALG);
|
||||
}
|
||||
|
||||
@ -360,6 +361,7 @@ static int _algorithm_tournament(struct oshmem_group_t *group,
|
||||
target,
|
||||
nlong,
|
||||
(pSync + 1),
|
||||
true,
|
||||
SCOLL_DEFAULT_ALG);
|
||||
}
|
||||
|
||||
@ -639,6 +641,7 @@ static int _algorithm_linear(struct oshmem_group_t *group,
|
||||
target,
|
||||
nlong,
|
||||
(pSync + 1),
|
||||
true,
|
||||
SCOLL_DEFAULT_ALG);
|
||||
}
|
||||
|
||||
@ -807,6 +810,7 @@ static int _algorithm_log(struct oshmem_group_t *group,
|
||||
target,
|
||||
nlong,
|
||||
(pSync + 1),
|
||||
true,
|
||||
SCOLL_DEFAULT_ALG);
|
||||
}
|
||||
|
||||
|
@ -115,6 +115,7 @@ int mca_scoll_fca_broadcast(struct oshmem_group_t *group,
|
||||
const void *source,
|
||||
size_t nlong,
|
||||
long *pSync,
|
||||
bool nlong_type,
|
||||
int algorithm_type);
|
||||
int mca_scoll_fca_collect(struct oshmem_group_t *group,
|
||||
void *target,
|
||||
|
@ -50,6 +50,7 @@ int mca_scoll_fca_broadcast(struct oshmem_group_t *group,
|
||||
const void *source,
|
||||
size_t nlong,
|
||||
long *pSync,
|
||||
bool nlong_type,
|
||||
int alg)
|
||||
{
|
||||
mca_scoll_fca_module_t *fca_module =
|
||||
|
@ -90,6 +90,7 @@ int mca_scoll_mpi_broadcast(struct oshmem_group_t *group,
|
||||
const void *source,
|
||||
size_t nlong,
|
||||
long *pSync,
|
||||
bool nlong_type,
|
||||
int alg);
|
||||
|
||||
int mca_scoll_mpi_collect(struct oshmem_group_t *group,
|
||||
|
@ -38,6 +38,7 @@ int mca_scoll_mpi_broadcast(struct oshmem_group_t *group,
|
||||
const void *source,
|
||||
size_t nlong,
|
||||
long *pSync,
|
||||
bool nlong_type,
|
||||
int alg)
|
||||
{
|
||||
mca_scoll_mpi_module_t *mpi_module;
|
||||
@ -54,20 +55,14 @@ int mca_scoll_mpi_broadcast(struct oshmem_group_t *group,
|
||||
}
|
||||
dtype = &ompi_mpi_char.dt;
|
||||
root = oshmem_proc_group_find_id(group, PE_root);
|
||||
|
||||
/* Do nothing on zero-length request */
|
||||
if (OPAL_UNLIKELY(!nlong)) {
|
||||
return OSHMEM_SUCCESS;
|
||||
}
|
||||
|
||||
/* Open SHMEM specification has the following constrains (page 85):
|
||||
* "If using C/C++, nelems must be of type integer. If you are using Fortran, it must be a
|
||||
* default integer value". And also fortran signature says "INTEGER".
|
||||
* Since ompi coll components doesn't support size_t at the moment,
|
||||
* and considering this contradiction, we cast size_t to int here
|
||||
* in case if the value is less than INT_MAX and fallback to previous module otherwise. */
|
||||
if (OPAL_UNLIKELY(!nlong_type || (INT_MAX < nlong))) {
|
||||
#ifdef INCOMPATIBLE_SHMEM_OMPI_COLL_APIS
|
||||
if (INT_MAX < nlong) {
|
||||
MPI_COLL_VERBOSE(20,"RUNNING FALLBACK BCAST");
|
||||
PREVIOUS_SCOLL_FN(mpi_module, broadcast, group,
|
||||
PE_root,
|
||||
@ -75,13 +70,21 @@ int mca_scoll_mpi_broadcast(struct oshmem_group_t *group,
|
||||
source,
|
||||
nlong,
|
||||
pSync,
|
||||
nlong_type,
|
||||
SCOLL_DEFAULT_ALG);
|
||||
return rc;
|
||||
}
|
||||
rc = mpi_module->comm->c_coll->coll_bcast(buf, (int)nlong, dtype, root, mpi_module->comm, mpi_module->comm->c_coll->coll_bcast_module);
|
||||
#else
|
||||
rc = mpi_module->comm->c_coll->coll_bcast(buf, nlong, dtype, root, mpi_module->comm, mpi_module->comm->c_coll->coll_bcast_module);
|
||||
MPI_COLL_ERROR(20, "variable broadcast length, or exceeds INT_MAX: %zu", nlong);
|
||||
return OSHMEM_ERR_NOT_SUPPORTED;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Do nothing on zero-length request */
|
||||
if (OPAL_UNLIKELY(!nlong)) {
|
||||
return OSHMEM_SUCCESS;
|
||||
}
|
||||
|
||||
rc = mpi_module->comm->c_coll->coll_bcast(buf, nlong, dtype, root, mpi_module->comm, mpi_module->comm->c_coll->coll_bcast_module);
|
||||
if (OMPI_SUCCESS != rc){
|
||||
MPI_COLL_VERBOSE(20,"RUNNING FALLBACK BCAST");
|
||||
PREVIOUS_SCOLL_FN(mpi_module, broadcast, group,
|
||||
@ -90,6 +93,7 @@ int mca_scoll_mpi_broadcast(struct oshmem_group_t *group,
|
||||
source,
|
||||
nlong,
|
||||
pSync,
|
||||
nlong_type,
|
||||
SCOLL_DEFAULT_ALG);
|
||||
}
|
||||
return rc;
|
||||
@ -111,12 +115,13 @@ int mca_scoll_mpi_collect(struct oshmem_group_t *group,
|
||||
MPI_COLL_VERBOSE(20,"RUNNING MPI ALLGATHER");
|
||||
mpi_module = (mca_scoll_mpi_module_t *) group->g_scoll.scoll_collect_module;
|
||||
|
||||
/* Do nothing on zero-length request */
|
||||
if (OPAL_UNLIKELY(!nlong)) {
|
||||
return OSHMEM_SUCCESS;
|
||||
}
|
||||
|
||||
if (nlong_type == true) {
|
||||
|
||||
/* Do nothing on zero-length request */
|
||||
if (OPAL_UNLIKELY(!nlong)) {
|
||||
return OSHMEM_SUCCESS;
|
||||
}
|
||||
|
||||
sbuf = (void *) source;
|
||||
rbuf = target;
|
||||
stype = &ompi_mpi_char.dt;
|
||||
|
@ -122,6 +122,7 @@ typedef int (*mca_scoll_base_module_broadcast_fn_t)(struct oshmem_group_t *group
|
||||
const void *source,
|
||||
size_t nlong,
|
||||
long *pSync,
|
||||
bool nlong_type,
|
||||
int alg);
|
||||
typedef int (*mca_scoll_base_module_collect_fn_t)(struct oshmem_group_t *group,
|
||||
void *target,
|
||||
|
@ -78,6 +78,7 @@ static void _shmem_broadcast(void *target,
|
||||
source,
|
||||
nbytes,
|
||||
pSync,
|
||||
true,
|
||||
SCOLL_DEFAULT_ALG);
|
||||
out:
|
||||
oshmem_proc_group_destroy(group);
|
||||
|
@ -93,7 +93,9 @@ SHMEM_GENERATE_FORTRAN_BINDINGS_SUB (void,
|
||||
FPTR_2_VOID_PTR(target), \
|
||||
FPTR_2_VOID_PTR(source), \
|
||||
OMPI_FINT_2_INT(*nlong) * op->dt_size, \
|
||||
FPTR_2_VOID_PTR(pSync), SCOLL_DEFAULT_ALG );\
|
||||
FPTR_2_VOID_PTR(pSync), \
|
||||
true, \
|
||||
SCOLL_DEFAULT_ALG );\
|
||||
out: \
|
||||
oshmem_proc_group_destroy(group);\
|
||||
RUNTIME_CHECK_RC(rc); \
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user