coll/libnbc: Work around for non-uniform data types in ibcast
* If (legal) non-uniform data type signatures are used in ibcast then the chosen algorithm may fail on the request, and worst case it could produce wrong answers. * Add an MCA parameter that, by default, protects the user from this scenario. If the user really wants to use it then they have to 'opt-in' by setting the following parameter to false: - `-mca coll_libnbc_ibcast_skip_dt_decision f` * Once the following Issues are resolved then this parameter can be removed. - https://github.com/open-mpi/ompi/issues/2256 - https://github.com/open-mpi/ompi/issues/1763 Signed-off-by: Joshua Hursey <jhursey@us.ibm.com>
Этот коммит содержится в:
родитель
6074c2a2a9
Коммит
350ef67fe0
@ -15,6 +15,7 @@
|
|||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2014-2015 Research Organization for Information Science
|
* Copyright (c) 2014-2015 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
|
* Copyright (c) 2016 IBM Corporation. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -67,6 +68,8 @@ BEGIN_C_DECLS
|
|||||||
/* number of implemented collective functions */
|
/* number of implemented collective functions */
|
||||||
#define NBC_NUM_COLL 17
|
#define NBC_NUM_COLL 17
|
||||||
|
|
||||||
|
extern bool libnbc_ibcast_skip_dt_decision;
|
||||||
|
|
||||||
struct ompi_coll_libnbc_component_t {
|
struct ompi_coll_libnbc_component_t {
|
||||||
mca_coll_base_component_2_0_0_t super;
|
mca_coll_base_component_2_0_0_t super;
|
||||||
opal_free_list_t requests;
|
opal_free_list_t requests;
|
||||||
|
@ -39,6 +39,7 @@ const char *mca_coll_libnbc_component_version_string =
|
|||||||
|
|
||||||
|
|
||||||
static int libnbc_priority = 10;
|
static int libnbc_priority = 10;
|
||||||
|
bool libnbc_ibcast_skip_dt_decision = true;
|
||||||
|
|
||||||
|
|
||||||
static int libnbc_open(void);
|
static int libnbc_open(void);
|
||||||
@ -131,6 +132,27 @@ libnbc_register(void)
|
|||||||
MCA_BASE_VAR_SCOPE_READONLY,
|
MCA_BASE_VAR_SCOPE_READONLY,
|
||||||
&libnbc_priority);
|
&libnbc_priority);
|
||||||
|
|
||||||
|
/* ibcast decision function can make the wrong decision if a legal
|
||||||
|
* non-uniform data type signature is used. This has resulted in the
|
||||||
|
* collective operation failing, and possibly producing wrong answers.
|
||||||
|
* We are investigating a fix for this problem, but it is taking a while.
|
||||||
|
* https://github.com/open-mpi/ompi/issues/2256
|
||||||
|
* https://github.com/open-mpi/ompi/issues/1763
|
||||||
|
* As a result we are adding an MCA parameter to make a conservative
|
||||||
|
* decision to avoid this issue. If the user knows that their application
|
||||||
|
* does not use data types in this way, then they can set this parameter
|
||||||
|
* to get the old behavior. Once the issue is truely fixed, then this
|
||||||
|
* parameter can be removed.
|
||||||
|
*/
|
||||||
|
libnbc_ibcast_skip_dt_decision = true;
|
||||||
|
(void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version,
|
||||||
|
"ibcast_skip_dt_decision",
|
||||||
|
"In ibcast only use size of communicator to choose algorithm, exclude data type signature. Set to 'false' to use data type signature in decision. WARNING: If you set this to 'false' then your application should not use non-uniform data type signatures in calls to ibcast.",
|
||||||
|
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||||
|
OPAL_INFO_LVL_9,
|
||||||
|
MCA_BASE_VAR_SCOPE_READONLY,
|
||||||
|
&libnbc_ibcast_skip_dt_decision);
|
||||||
|
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
|
* Copyright (c) 2016 IBM Corporation. All rights reserved.
|
||||||
*
|
*
|
||||||
* Author(s): Torsten Hoefler <htor@cs.indiana.edu>
|
* Author(s): Torsten Hoefler <htor@cs.indiana.edu>
|
||||||
*
|
*
|
||||||
@ -65,16 +66,26 @@ int ompi_coll_libnbc_ibcast(void *buffer, int count, MPI_Datatype datatype, int
|
|||||||
|
|
||||||
segsize = 16384;
|
segsize = 16384;
|
||||||
/* algorithm selection */
|
/* algorithm selection */
|
||||||
if (p <= 4) {
|
if( libnbc_ibcast_skip_dt_decision ) {
|
||||||
alg = NBC_BCAST_LINEAR;
|
if (p <= 4) {
|
||||||
} else if (size * count < 65536) {
|
alg = NBC_BCAST_LINEAR;
|
||||||
alg = NBC_BCAST_BINOMIAL;
|
}
|
||||||
} else if (size * count < 524288) {
|
else {
|
||||||
alg = NBC_BCAST_CHAIN;
|
alg = NBC_BCAST_BINOMIAL;
|
||||||
segsize = 8192;
|
}
|
||||||
} else {
|
}
|
||||||
alg = NBC_BCAST_CHAIN;
|
else {
|
||||||
segsize = 32768;
|
if (p <= 4) {
|
||||||
|
alg = NBC_BCAST_LINEAR;
|
||||||
|
} else if (size * count < 65536) {
|
||||||
|
alg = NBC_BCAST_BINOMIAL;
|
||||||
|
} else if (size * count < 524288) {
|
||||||
|
alg = NBC_BCAST_CHAIN;
|
||||||
|
segsize = 8192;
|
||||||
|
} else {
|
||||||
|
alg = NBC_BCAST_CHAIN;
|
||||||
|
segsize = 32768;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef NBC_CACHE_SCHEDULE
|
#ifdef NBC_CACHE_SCHEDULE
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user