1
1

OSHMEM: scoll fix corner cases

- fix segv
- proper enable/disable and prio handling

fixed by Elena, reviewed by Igor/Mike

cmr=v1.7.5:reviewer=ompi-rm1.7

This commit was SVN r30962.
Этот коммит содержится в:
Mike Dubman 2014-03-07 12:31:29 +00:00
родитель b51733c456
Коммит c784aab7d8
4 изменённых файлов: 91 добавлений и 58 удалений

Просмотреть файл

@@ -17,22 +17,24 @@ int mca_scoll_fca_barrier(struct oshmem_group_t *group, long *pSync, int alg)
{
mca_scoll_fca_module_t *fca_module =
(mca_scoll_fca_module_t *) group->g_scoll.scoll_barrier_module;
int ret;
int rc;
FCA_VERBOSE(5, "Using FCA Barrier");
ret = fca_do_barrier(fca_module->fca_comm);
if (ret < 0) {
if (ret == -EUSESHMEM) {
rc = fca_do_barrier(fca_module->fca_comm);
if (rc < 0) {
if (rc == -EUSESHMEM) {
FCA_VERBOSE(5, "FCA Barrier failed, using original barrier");
goto orig_barrier;
}
FCA_ERROR("Barrier failed: %s", fca_strerror(ret));
FCA_ERROR("Barrier failed: %s", fca_strerror(rc));
return OSHMEM_ERROR;
}
return OSHMEM_SUCCESS;
orig_barrier: return fca_module->previous_barrier(group,
pSync,
SCOLL_DEFAULT_ALG);
orig_barrier:
PREVIOUS_SCOLL_FN(fca_module, barrier, group,
pSync,
SCOLL_DEFAULT_ALG);
return rc;
}
int mca_scoll_fca_broadcast(struct oshmem_group_t *group,
@@ -46,7 +48,7 @@ int mca_scoll_fca_broadcast(struct oshmem_group_t *group,
mca_scoll_fca_module_t *fca_module =
(mca_scoll_fca_module_t *) group->g_scoll.scoll_broadcast_module;
fca_bcast_spec_t spec;
int ret;
int rc;
FCA_VERBOSE(5, "rank %i, DOING FCA BCAST\n", group->my_pe);
spec.root = oshmem_proc_group_find_id(group, PE_root);
@@ -61,23 +63,25 @@ int mca_scoll_fca_broadcast(struct oshmem_group_t *group,
spec.size);
goto orig_bcast;
}
ret = fca_do_bcast(fca_module->fca_comm, &spec);
if (ret < 0) {
if (ret == -EUSESHMEM) {
rc = fca_do_bcast(fca_module->fca_comm, &spec);
if (rc < 0) {
if (rc == -EUSESHMEM) {
FCA_VERBOSE(5, "FCA Broadcast failed, using original Broadcast");
goto orig_bcast;
}
FCA_ERROR("Bcast failed: %s", fca_strerror(ret));
FCA_ERROR("Bcast failed: %s", fca_strerror(rc));
return OSHMEM_ERROR;
}
return OSHMEM_SUCCESS;
orig_bcast: return fca_module->previous_broadcast(group,
PE_root,
target,
source,
nlong,
pSync,
SCOLL_DEFAULT_ALG);
orig_bcast:
PREVIOUS_SCOLL_FN(fca_module, broadcast, group,
PE_root,
target,
source,
nlong,
pSync,
SCOLL_DEFAULT_ALG);
return rc;
}
int mca_scoll_fca_collect(struct oshmem_group_t *group,
@@ -88,6 +92,7 @@ int mca_scoll_fca_collect(struct oshmem_group_t *group,
bool nlong_type,
int alg)
{
int rc, i;
mca_scoll_fca_module_t *fca_module =
(mca_scoll_fca_module_t *) group->g_scoll.scoll_collect_module;
@@ -97,24 +102,22 @@ int mca_scoll_fca_collect(struct oshmem_group_t *group,
#if OSHMEM_FCA_ALLGATHER
if (nlong_type == true) {
fca_gather_spec_t spec = {0,};
int ret;
spec.size = (int)nlong;
spec.sbuf = (void *)source;
spec.rbuf = target;
ret = fca_do_allgather(fca_module->fca_comm, &spec);
if (ret < 0) {
if (ret == -EUSESHMEM) {
rc = fca_do_allgather(fca_module->fca_comm, &spec);
if (rc < 0) {
if (rc == -EUSESHMEM) {
FCA_VERBOSE(5,"FCA Fcollect(allgather) failed, using original Fcollect");
goto orig_collect;
}
FCA_ERROR("Fcollect(allgather) failed: %s", fca_strerror(ret));
FCA_ERROR("Fcollect(allgather) failed: %s", fca_strerror(rc));
return OSHMEM_ERROR;
}
return OSHMEM_SUCCESS;
}
else
{
int i, ret;
size_t *sendcounts = (size_t *)malloc(group->proc_count*sizeof(size_t));
mca_scoll_fca_collect(group,sendcounts,(void *)&nlong,sizeof(size_t),pSync,true,SCOLL_DEFAULT_ALG);
fca_gatherv_spec_t spec;
@@ -130,13 +133,13 @@ int mca_scoll_fca_collect(struct oshmem_group_t *group,
for (i=1; i<group->proc_count; i++) {
spec.displs[i] = spec.displs[i-1]+spec.recvsizes[i-1];
}
ret = fca_do_allgatherv(fca_module->fca_comm, &spec);
if (ret < 0) {
if (ret == -EUSESHMEM) {
rc = fca_do_allgatherv(fca_module->fca_comm, &spec);
if (rc < 0) {
if (rc == -EUSESHMEM) {
FCA_VERBOSE(5,"FCA Collect(allgatherv) failed, using original Collect");
goto orig_collect;
}
FCA_ERROR("Collect(allgatherv) failed: %s", fca_strerror(ret));
FCA_ERROR("Collect(allgatherv) failed: %s", fca_strerror(rc));
return OSHMEM_ERROR;
}
free(sendcounts);
@@ -144,13 +147,14 @@ int mca_scoll_fca_collect(struct oshmem_group_t *group,
}
orig_collect:
#endif
return fca_module->previous_collect(group,
PREVIOUS_SCOLL_FN(fca_module, collect, group,
target,
source,
nlong,
pSync,
nlong_type,
SCOLL_DEFAULT_ALG);
return rc;
}
#define FCA_DTYPE_8_SIGNED 1
@@ -245,7 +249,7 @@ int mca_scoll_fca_reduce(struct oshmem_group_t *group,
(mca_scoll_fca_module_t *) group->g_scoll.scoll_reduce_module;
int fca_dtype;
int fca_op;
int ret;
int rc;
fca_reduce_spec_t spec;
FCA_VERBOSE(5, "rank %i, DOING FCA_REDUCE\n", group->my_pe);
@@ -266,23 +270,25 @@ int mca_scoll_fca_reduce(struct oshmem_group_t *group,
spec.dtype = (enum fca_reduce_dtype_t) fca_dtype;
spec.op = (enum fca_reduce_op_t) fca_op;
spec.length = (int) (nlong / op->dt_size);
ret = fca_do_all_reduce(fca_module->fca_comm, &spec);
if (ret < 0) {
if (ret == -EUSESHMEM) {
rc = fca_do_all_reduce(fca_module->fca_comm, &spec);
if (rc < 0) {
if (rc == -EUSESHMEM) {
FCA_VERBOSE(5,
"FCA Reduce(allreduce) failed, using original Reduce");
goto orig_reduce;
}
FCA_ERROR("Reduce (allreduce) failed: %s", fca_strerror(ret));
FCA_ERROR("Reduce (allreduce) failed: %s", fca_strerror(rc));
return OSHMEM_ERROR;
}
return OSHMEM_SUCCESS;
orig_reduce: return fca_module->previous_reduce(group,
op,
target,
source,
nlong,
pSync,
pWrk,
SCOLL_DEFAULT_ALG);
orig_reduce:
PREVIOUS_SCOLL_FN(fca_module, reduce, group,
op,
target,
source,
nlong,
pSync,
pWrk,
SCOLL_DEFAULT_ALG);
return rc;
}

Просмотреть файл

@@ -54,7 +54,7 @@ mca_scoll_mpi_component_t mca_scoll_mpi_component = {
mca_scoll_mpi_init_query,
mca_scoll_mpi_comm_query,
},
60, /* priority */
77, /* priority */
0, /* verbose level */
0, /* mpi_enable */
2 /*mpi_np */
@@ -130,26 +130,26 @@ static int mpi_register(void)
CHECK(reg_int("priority",NULL,
"Priority of the mpi coll component",
90,
mca_scoll_mpi_component.mpi_priority,
&mca_scoll_mpi_component.mpi_priority,
0));
CHECK(reg_int("verbose", NULL,
"Verbose level of the mpi coll component",
0,
mca_scoll_mpi_component.mpi_verbose,
&mca_scoll_mpi_component.mpi_verbose,
0));
CHECK(reg_int("enable",NULL,
"[1|0|] Enable/Disable MPI scoll component",
1 /*enable by default*/,
mca_scoll_mpi_component.mpi_enable,
&mca_scoll_mpi_component.mpi_enable,
0));
CHECK(reg_int("np",NULL,
"Minimal number of processes in the communicator"
" for the corresponding mpi context to be created (default: 32)",
2 /*enable by default*/,
" for the corresponding mpi context to be created",
mca_scoll_mpi_component.mpi_np,
&mca_scoll_mpi_component.mpi_np,
0));

Просмотреть файл

@@ -24,7 +24,9 @@ int mca_scoll_mpi_barrier(struct oshmem_group_t *group, long *pSync, int alg)
rc = mpi_module->comm->c_coll.coll_barrier(mpi_module->comm, mpi_module->comm->c_coll.coll_barrier_module);
if (OMPI_SUCCESS != rc){
MPI_COLL_VERBOSE(20,"RUNNING FALLBACK BARRIER");
rc = mpi_module->previous_barrier(group, pSync, SCOLL_DEFAULT_ALG);
PREVIOUS_SCOLL_FN(mpi_module, barrier, group,
pSync,
SCOLL_DEFAULT_ALG);
}
return rc;
}
@@ -60,7 +62,7 @@ int mca_scoll_mpi_broadcast(struct oshmem_group_t *group,
#ifdef INCOMPATIBLE_SHMEM_OMPI_COLL_APIS
if (INT_MAX < nlong) {
MPI_COLL_VERBOSE(20,"RUNNING FALLBACK BCAST");
rc = mpi_module->previous_broadcast(group,
PREVIOUS_SCOLL_FN(mpi_module, broadcast, group,
PE_root,
target,
source,
@@ -75,7 +77,7 @@
#endif
if (OMPI_SUCCESS != rc){
MPI_COLL_VERBOSE(20,"RUNNING FALLBACK BCAST");
rc = mpi_module->previous_broadcast(group,
PREVIOUS_SCOLL_FN(mpi_module, broadcast, group,
PE_root,
target,
source,
@@ -115,7 +117,13 @@ int mca_scoll_mpi_collect(struct oshmem_group_t *group,
#ifdef INCOMPATIBLE_SHMEM_OMPI_COLL_APIS
if (INT_MAX < nlong) {
MPI_COLL_VERBOSE(20,"RUNNING FALLBACK COLLECT");
rc = mpi_module->previous_collect(group, target, source, nlong, pSync, nlong_type, SCOLL_DEFAULT_ALG);
PREVIOUS_SCOLL_FN(mpi_module, collect, group,
target,
source,
nlong,
pSync,
nlong_type,
SCOLL_DEFAULT_ALG);
return rc;
}
rc = mpi_module->comm->c_coll.coll_allgather(sbuf, (int)nlong, stype, rbuf, (int)nlong, rtype, mpi_module->comm, mpi_module->comm->c_coll.coll_allgather_module);
@@ -124,11 +132,23 @@ int mca_scoll_mpi_collect(struct oshmem_group_t *group,
#endif
if (OMPI_SUCCESS != rc){
MPI_COLL_VERBOSE(20,"RUNNING FALLBACK FCOLLECT");
rc = mpi_module->previous_collect(group, target, source, nlong, pSync, nlong_type, SCOLL_DEFAULT_ALG);
PREVIOUS_SCOLL_FN(mpi_module, collect, group,
target,
source,
nlong,
pSync,
nlong_type,
SCOLL_DEFAULT_ALG);
}
} else {
MPI_COLL_VERBOSE(20,"RUNNING FALLBACK COLLECT");
rc = mpi_module->previous_collect(group, target, source, nlong, pSync, nlong_type, SCOLL_DEFAULT_ALG);
PREVIOUS_SCOLL_FN(mpi_module, collect, group,
target,
source,
nlong,
pSync,
nlong_type,
SCOLL_DEFAULT_ALG);
}
return rc;
}
@@ -165,7 +185,7 @@ int mca_scoll_mpi_reduce(struct oshmem_group_t *group,
#ifdef INCOMPATIBLE_SHMEM_OMPI_COLL_APIS
if (INT_MAX < count) {
MPI_COLL_VERBOSE(20,"RUNNING FALLBACK REDUCE");
rc = mpi_module->previous_reduce(group,
PREVIOUS_SCOLL_FN(mpi_module, reduce, group,
op,
target,
source,
@@ -181,7 +201,7 @@
#endif
if (OMPI_SUCCESS != rc){
MPI_COLL_VERBOSE(20,"RUNNING FALLBACK REDUCE");
rc = mpi_module->previous_reduce(group,
PREVIOUS_SCOLL_FN(mpi_module, reduce, group,
op,
target,
source,

Просмотреть файл

@@ -183,6 +183,13 @@ struct mca_scoll_base_group_scoll_t {
mca_scoll_base_module_1_0_0_t *scoll_reduce_module;
};
typedef struct mca_scoll_base_group_scoll_t mca_scoll_base_group_scoll_t;
/*
 * Invoke the fallback ("previous") implementation of collective __api on
 * this group, assigning its return value to a variable named `rc` that
 * MUST already be declared in the calling scope (the macro writes to it
 * directly — see the call sites in the scoll fca/mpi modules above).
 *
 * While the previous function runs, the group's scoll_<api>_module pointer
 * is temporarily swapped to the previous module and restored afterwards,
 * so that any dispatch through g_scoll during the fallback resolves to the
 * module that matches the function being executed.
 * NOTE(review): this swap appears to be the segv fix referenced in the
 * commit message — confirm against the pre-change crash.
 *
 * NOTE(review): `module` and `group` are expanded without parentheses, so
 * callers must pass plain identifiers/lvalues (all current call sites do).
 */
#define PREVIOUS_SCOLL_FN(module, __api, group, ...) do { \
group->g_scoll.scoll_ ## __api ## _module = (mca_scoll_base_module_1_0_0_t*) module->previous_ ## __api ## _module; \
rc = module->previous_ ## __api (group, __VA_ARGS__); \
group->g_scoll.scoll_ ## __api ## _module = (mca_scoll_base_module_1_0_0_t*) module; \
} while(0)
END_C_DECLS
#endif /* OSHMEM_MCA_SCOLL_H */