OSHMEM: scoll fix corner cases
- fix segv
- proper enable/disable and prio handling

Fixed by Elena, reviewed by Igor/Mike.

cmr=v1.7.5:reviewer=ompi-rm1.7

This commit was SVN r30962.
Этот коммит содержится в:
родитель
b51733c456
Коммит
c784aab7d8
@ -17,22 +17,24 @@ int mca_scoll_fca_barrier(struct oshmem_group_t *group, long *pSync, int alg)
|
||||
{
|
||||
mca_scoll_fca_module_t *fca_module =
|
||||
(mca_scoll_fca_module_t *) group->g_scoll.scoll_barrier_module;
|
||||
int ret;
|
||||
int rc;
|
||||
|
||||
FCA_VERBOSE(5, "Using FCA Barrier");
|
||||
ret = fca_do_barrier(fca_module->fca_comm);
|
||||
if (ret < 0) {
|
||||
if (ret == -EUSESHMEM) {
|
||||
rc = fca_do_barrier(fca_module->fca_comm);
|
||||
if (rc < 0) {
|
||||
if (rc == -EUSESHMEM) {
|
||||
FCA_VERBOSE(5, "FCA Barrier failed, using original barrier");
|
||||
goto orig_barrier;
|
||||
}
|
||||
FCA_ERROR("Barrier failed: %s", fca_strerror(ret));
|
||||
FCA_ERROR("Barrier failed: %s", fca_strerror(rc));
|
||||
return OSHMEM_ERROR;
|
||||
}
|
||||
return OSHMEM_SUCCESS;
|
||||
orig_barrier: return fca_module->previous_barrier(group,
|
||||
pSync,
|
||||
SCOLL_DEFAULT_ALG);
|
||||
orig_barrier:
|
||||
PREVIOUS_SCOLL_FN(fca_module, barrier, group,
|
||||
pSync,
|
||||
SCOLL_DEFAULT_ALG);
|
||||
return rc;
|
||||
}
|
||||
|
||||
int mca_scoll_fca_broadcast(struct oshmem_group_t *group,
|
||||
@ -46,7 +48,7 @@ int mca_scoll_fca_broadcast(struct oshmem_group_t *group,
|
||||
mca_scoll_fca_module_t *fca_module =
|
||||
(mca_scoll_fca_module_t *) group->g_scoll.scoll_broadcast_module;
|
||||
fca_bcast_spec_t spec;
|
||||
int ret;
|
||||
int rc;
|
||||
|
||||
FCA_VERBOSE(5, "rank %i, DOING FCA BCAST\n", group->my_pe);
|
||||
spec.root = oshmem_proc_group_find_id(group, PE_root);
|
||||
@ -61,23 +63,25 @@ int mca_scoll_fca_broadcast(struct oshmem_group_t *group,
|
||||
spec.size);
|
||||
goto orig_bcast;
|
||||
}
|
||||
ret = fca_do_bcast(fca_module->fca_comm, &spec);
|
||||
if (ret < 0) {
|
||||
if (ret == -EUSESHMEM) {
|
||||
rc = fca_do_bcast(fca_module->fca_comm, &spec);
|
||||
if (rc < 0) {
|
||||
if (rc == -EUSESHMEM) {
|
||||
FCA_VERBOSE(5, "FCA Broadcast failed, using original Broadcast");
|
||||
goto orig_bcast;
|
||||
}
|
||||
FCA_ERROR("Bcast failed: %s", fca_strerror(ret));
|
||||
FCA_ERROR("Bcast failed: %s", fca_strerror(rc));
|
||||
return OSHMEM_ERROR;
|
||||
}
|
||||
return OSHMEM_SUCCESS;
|
||||
orig_bcast: return fca_module->previous_broadcast(group,
|
||||
PE_root,
|
||||
target,
|
||||
source,
|
||||
nlong,
|
||||
pSync,
|
||||
SCOLL_DEFAULT_ALG);
|
||||
orig_bcast:
|
||||
PREVIOUS_SCOLL_FN(fca_module, broadcast, group,
|
||||
PE_root,
|
||||
target,
|
||||
source,
|
||||
nlong,
|
||||
pSync,
|
||||
SCOLL_DEFAULT_ALG);
|
||||
return rc;
|
||||
}
|
||||
|
||||
int mca_scoll_fca_collect(struct oshmem_group_t *group,
|
||||
@ -88,6 +92,7 @@ int mca_scoll_fca_collect(struct oshmem_group_t *group,
|
||||
bool nlong_type,
|
||||
int alg)
|
||||
{
|
||||
int rc, i;
|
||||
mca_scoll_fca_module_t *fca_module =
|
||||
(mca_scoll_fca_module_t *) group->g_scoll.scoll_collect_module;
|
||||
|
||||
@ -97,24 +102,22 @@ int mca_scoll_fca_collect(struct oshmem_group_t *group,
|
||||
#if OSHMEM_FCA_ALLGATHER
|
||||
if (nlong_type == true) {
|
||||
fca_gather_spec_t spec = {0,};
|
||||
int ret;
|
||||
spec.size = (int)nlong;
|
||||
spec.sbuf = (void *)source;
|
||||
spec.rbuf = target;
|
||||
ret = fca_do_allgather(fca_module->fca_comm, &spec);
|
||||
if (ret < 0) {
|
||||
if (ret == -EUSESHMEM) {
|
||||
rc = fca_do_allgather(fca_module->fca_comm, &spec);
|
||||
if (rc < 0) {
|
||||
if (rc == -EUSESHMEM) {
|
||||
FCA_VERBOSE(5,"FCA Fcollect(allgather) failed, using original Fcollect");
|
||||
goto orig_collect;
|
||||
}
|
||||
FCA_ERROR("Fcollect(allgather) failed: %s", fca_strerror(ret));
|
||||
FCA_ERROR("Fcollect(allgather) failed: %s", fca_strerror(rc));
|
||||
return OSHMEM_ERROR;
|
||||
}
|
||||
return OSHMEM_SUCCESS;
|
||||
}
|
||||
else
|
||||
{
|
||||
int i, ret;
|
||||
size_t *sendcounts = (size_t *)malloc(group->proc_count*sizeof(size_t));
|
||||
mca_scoll_fca_collect(group,sendcounts,(void *)&nlong,sizeof(size_t),pSync,true,SCOLL_DEFAULT_ALG);
|
||||
fca_gatherv_spec_t spec;
|
||||
@ -130,13 +133,13 @@ int mca_scoll_fca_collect(struct oshmem_group_t *group,
|
||||
for (i=1; i<group->proc_count; i++) {
|
||||
spec.displs[i] = spec.displs[i-1]+spec.recvsizes[i-1];
|
||||
}
|
||||
ret = fca_do_allgatherv(fca_module->fca_comm, &spec);
|
||||
if (ret < 0) {
|
||||
if (ret == -EUSESHMEM) {
|
||||
rc = fca_do_allgatherv(fca_module->fca_comm, &spec);
|
||||
if (rc < 0) {
|
||||
if (rc == -EUSESHMEM) {
|
||||
FCA_VERBOSE(5,"FCA Collect(allgatherv) failed, using original Collect");
|
||||
goto orig_collect;
|
||||
}
|
||||
FCA_ERROR("Collect(allgatherv) failed: %s", fca_strerror(ret));
|
||||
FCA_ERROR("Collect(allgatherv) failed: %s", fca_strerror(rc));
|
||||
return OSHMEM_ERROR;
|
||||
}
|
||||
free(sendcounts);
|
||||
@ -144,13 +147,14 @@ int mca_scoll_fca_collect(struct oshmem_group_t *group,
|
||||
}
|
||||
orig_collect:
|
||||
#endif
|
||||
return fca_module->previous_collect(group,
|
||||
PREVIOUS_SCOLL_FN(fca_module, collect, group,
|
||||
target,
|
||||
source,
|
||||
nlong,
|
||||
pSync,
|
||||
nlong_type,
|
||||
SCOLL_DEFAULT_ALG);
|
||||
return rc;
|
||||
}
|
||||
|
||||
#define FCA_DTYPE_8_SIGNED 1
|
||||
@ -245,7 +249,7 @@ int mca_scoll_fca_reduce(struct oshmem_group_t *group,
|
||||
(mca_scoll_fca_module_t *) group->g_scoll.scoll_reduce_module;
|
||||
int fca_dtype;
|
||||
int fca_op;
|
||||
int ret;
|
||||
int rc;
|
||||
fca_reduce_spec_t spec;
|
||||
|
||||
FCA_VERBOSE(5, "rank %i, DOING FCA_REDUCE\n", group->my_pe);
|
||||
@ -266,23 +270,25 @@ int mca_scoll_fca_reduce(struct oshmem_group_t *group,
|
||||
spec.dtype = (enum fca_reduce_dtype_t) fca_dtype;
|
||||
spec.op = (enum fca_reduce_op_t) fca_op;
|
||||
spec.length = (int) (nlong / op->dt_size);
|
||||
ret = fca_do_all_reduce(fca_module->fca_comm, &spec);
|
||||
if (ret < 0) {
|
||||
if (ret == -EUSESHMEM) {
|
||||
rc = fca_do_all_reduce(fca_module->fca_comm, &spec);
|
||||
if (rc < 0) {
|
||||
if (rc == -EUSESHMEM) {
|
||||
FCA_VERBOSE(5,
|
||||
"FCA Reduce(allreduce) failed, using original Reduce");
|
||||
goto orig_reduce;
|
||||
}
|
||||
FCA_ERROR("Reduce (allreduce) failed: %s", fca_strerror(ret));
|
||||
FCA_ERROR("Reduce (allreduce) failed: %s", fca_strerror(rc));
|
||||
return OSHMEM_ERROR;
|
||||
}
|
||||
return OSHMEM_SUCCESS;
|
||||
orig_reduce: return fca_module->previous_reduce(group,
|
||||
op,
|
||||
target,
|
||||
source,
|
||||
nlong,
|
||||
pSync,
|
||||
pWrk,
|
||||
SCOLL_DEFAULT_ALG);
|
||||
orig_reduce:
|
||||
PREVIOUS_SCOLL_FN(fca_module, reduce, group,
|
||||
op,
|
||||
target,
|
||||
source,
|
||||
nlong,
|
||||
pSync,
|
||||
pWrk,
|
||||
SCOLL_DEFAULT_ALG);
|
||||
return rc;
|
||||
}
|
||||
|
@ -54,7 +54,7 @@ mca_scoll_mpi_component_t mca_scoll_mpi_component = {
|
||||
mca_scoll_mpi_init_query,
|
||||
mca_scoll_mpi_comm_query,
|
||||
},
|
||||
60, /* priority */
|
||||
77, /* priority */
|
||||
0, /* verbose level */
|
||||
0, /* mpi_enable */
|
||||
2 /*mpi_np */
|
||||
@ -130,26 +130,26 @@ static int mpi_register(void)
|
||||
|
||||
CHECK(reg_int("priority",NULL,
|
||||
"Priority of the mpi coll component",
|
||||
90,
|
||||
mca_scoll_mpi_component.mpi_priority,
|
||||
&mca_scoll_mpi_component.mpi_priority,
|
||||
0));
|
||||
|
||||
CHECK(reg_int("verbose", NULL,
|
||||
"Verbose level of the mpi coll component",
|
||||
0,
|
||||
mca_scoll_mpi_component.mpi_verbose,
|
||||
&mca_scoll_mpi_component.mpi_verbose,
|
||||
0));
|
||||
|
||||
CHECK(reg_int("enable",NULL,
|
||||
"[1|0|] Enable/Disable MPI scoll component",
|
||||
1 /*enable by default*/,
|
||||
mca_scoll_mpi_component.mpi_enable,
|
||||
&mca_scoll_mpi_component.mpi_enable,
|
||||
0));
|
||||
|
||||
CHECK(reg_int("np",NULL,
|
||||
"Minimal number of processes in the communicator"
|
||||
" for the corresponding mpi context to be created (default: 32)",
|
||||
2 /*enable by default*/,
|
||||
" for the corresponding mpi context to be created",
|
||||
mca_scoll_mpi_component.mpi_np,
|
||||
&mca_scoll_mpi_component.mpi_np,
|
||||
0));
|
||||
|
||||
|
@ -24,7 +24,9 @@ int mca_scoll_mpi_barrier(struct oshmem_group_t *group, long *pSync, int alg)
|
||||
rc = mpi_module->comm->c_coll.coll_barrier(mpi_module->comm, mpi_module->comm->c_coll.coll_barrier_module);
|
||||
if (OMPI_SUCCESS != rc){
|
||||
MPI_COLL_VERBOSE(20,"RUNNING FALLBACK BARRIER");
|
||||
rc = mpi_module->previous_barrier(group, pSync, SCOLL_DEFAULT_ALG);
|
||||
PREVIOUS_SCOLL_FN(mpi_module, barrier, group,
|
||||
pSync,
|
||||
SCOLL_DEFAULT_ALG);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
@ -60,7 +62,7 @@ int mca_scoll_mpi_broadcast(struct oshmem_group_t *group,
|
||||
#ifdef INCOMPATIBLE_SHMEM_OMPI_COLL_APIS
|
||||
if (INT_MAX < nlong) {
|
||||
MPI_COLL_VERBOSE(20,"RUNNING FALLBACK BCAST");
|
||||
rc = mpi_module->previous_broadcast(group,
|
||||
PREVIOUS_SCOLL_FN(mpi_module, broadcast, group,
|
||||
PE_root,
|
||||
target,
|
||||
source,
|
||||
@ -75,7 +77,7 @@ int mca_scoll_mpi_broadcast(struct oshmem_group_t *group,
|
||||
#endif
|
||||
if (OMPI_SUCCESS != rc){
|
||||
MPI_COLL_VERBOSE(20,"RUNNING FALLBACK BCAST");
|
||||
rc = mpi_module->previous_broadcast(group,
|
||||
PREVIOUS_SCOLL_FN(mpi_module, broadcast, group,
|
||||
PE_root,
|
||||
target,
|
||||
source,
|
||||
@ -115,7 +117,13 @@ int mca_scoll_mpi_collect(struct oshmem_group_t *group,
|
||||
#ifdef INCOMPATIBLE_SHMEM_OMPI_COLL_APIS
|
||||
if (INT_MAX < nlong) {
|
||||
MPI_COLL_VERBOSE(20,"RUNNING FALLBACK COLLECT");
|
||||
rc = mpi_module->previous_collect(group, target, source, nlong, pSync, nlong_type, SCOLL_DEFAULT_ALG);
|
||||
PREVIOUS_SCOLL_FN(mpi_module, collect, group,
|
||||
target,
|
||||
source,
|
||||
nlong,
|
||||
pSync,
|
||||
nlong_type,
|
||||
SCOLL_DEFAULT_ALG);
|
||||
return rc;
|
||||
}
|
||||
rc = mpi_module->comm->c_coll.coll_allgather(sbuf, (int)nlong, stype, rbuf, (int)nlong, rtype, mpi_module->comm, mpi_module->comm->c_coll.coll_allgather_module);
|
||||
@ -124,11 +132,23 @@ int mca_scoll_mpi_collect(struct oshmem_group_t *group,
|
||||
#endif
|
||||
if (OMPI_SUCCESS != rc){
|
||||
MPI_COLL_VERBOSE(20,"RUNNING FALLBACK FCOLLECT");
|
||||
rc = mpi_module->previous_collect(group, target, source, nlong, pSync, nlong_type, SCOLL_DEFAULT_ALG);
|
||||
PREVIOUS_SCOLL_FN(mpi_module, collect, group,
|
||||
target,
|
||||
source,
|
||||
nlong,
|
||||
pSync,
|
||||
nlong_type,
|
||||
SCOLL_DEFAULT_ALG);
|
||||
}
|
||||
} else {
|
||||
MPI_COLL_VERBOSE(20,"RUNNING FALLBACK COLLECT");
|
||||
rc = mpi_module->previous_collect(group, target, source, nlong, pSync, nlong_type, SCOLL_DEFAULT_ALG);
|
||||
PREVIOUS_SCOLL_FN(mpi_module, collect, group,
|
||||
target,
|
||||
source,
|
||||
nlong,
|
||||
pSync,
|
||||
nlong_type,
|
||||
SCOLL_DEFAULT_ALG);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
@ -165,7 +185,7 @@ int mca_scoll_mpi_reduce(struct oshmem_group_t *group,
|
||||
#ifdef INCOMPATIBLE_SHMEM_OMPI_COLL_APIS
|
||||
if (INT_MAX < count) {
|
||||
MPI_COLL_VERBOSE(20,"RUNNING FALLBACK REDUCE");
|
||||
rc = mpi_module->previous_reduce(group,
|
||||
PREVIOUS_SCOLL_FN(mpi_module, reduce, group,
|
||||
op,
|
||||
target,
|
||||
source,
|
||||
@ -181,7 +201,7 @@ int mca_scoll_mpi_reduce(struct oshmem_group_t *group,
|
||||
#endif
|
||||
if (OMPI_SUCCESS != rc){
|
||||
MPI_COLL_VERBOSE(20,"RUNNING FALLBACK REDUCE");
|
||||
rc = mpi_module->previous_reduce(group,
|
||||
PREVIOUS_SCOLL_FN(mpi_module, reduce, group,
|
||||
op,
|
||||
target,
|
||||
source,
|
||||
|
@ -183,6 +183,13 @@ struct mca_scoll_base_group_scoll_t {
|
||||
mca_scoll_base_module_1_0_0_t *scoll_reduce_module;
|
||||
};
|
||||
typedef struct mca_scoll_base_group_scoll_t mca_scoll_base_group_scoll_t;
|
||||
|
||||
/*
 * PREVIOUS_SCOLL_FN(module, __api, group, ...)
 *
 * Calls the previous_<__api> function stored in `module`, passing `group`
 * followed by the variadic arguments.
 *
 * Before the call, group->g_scoll.scoll_<__api>_module is set to
 * module->previous_<__api>_module; after the call it is set to `module`.
 * NOTE(review): presumably this lets the previous implementation look up
 * its own module through the group -- confirm against the callees.
 *
 * The call's result is assigned to a variable named `rc`. The macro does
 * not declare `rc`; it must already exist in the scope where the macro is
 * expanded.
 *
 * `module` and `group` are expanded more than once and are not
 * parenthesized, so pass plain identifiers without side effects.
 */
#define PREVIOUS_SCOLL_FN(module, __api, group, ...) do { \
|
||||
group->g_scoll.scoll_ ## __api ## _module = (mca_scoll_base_module_1_0_0_t*) module->previous_ ## __api ## _module; \
|
||||
rc = module->previous_ ## __api (group, __VA_ARGS__); \
|
||||
group->g_scoll.scoll_ ## __api ## _module = (mca_scoll_base_module_1_0_0_t*) module; \
|
||||
} while(0)
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* OSHMEM_MCA_SCOLL_H */
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user