Fix the calls to ibv_fork_init and remove btl_openib_want_fork_support.
In order to have an effect, ibv_fork_init should be called at the beginning of the verbs initialization flow - before the calls to the ibv_create_qp and ibv_create_cq verbs. These functions are called from the oob/ud code, and by the time the other verbs components (btl openib, pml yalla, ...) call ibv_fork_init, it is too late. This commit forces the call to ibv_fork_init (if it is requested) right at the beginning of all the components that use verbs (ibv_fork_init() can safely be called multiple times). This commit also removes the btl_openib_want_fork_support mca parameter and adds a new mca parameter instead - opal_verbs_want_fork_support. Through this new parameter, fork support may be requested for ALL components. The default value for this parameter is set to 1. Before this commit, setting btl_openib_want_fork_support to 1 did not actually provide fork support for the openib btl, because by the time openib called ibv_fork_init, the ibv_create_* calls in oob/ud had already been made and therefore ibv_fork_init failed.
Этот коммит содержится в:
родитель
0ac2f08460
Коммит
e4c4e7df5e
@ -278,8 +278,6 @@ struct mca_btl_openib_component_t {
|
||||
unsigned int cq_poll_progress;
|
||||
unsigned int cq_poll_batch;
|
||||
unsigned int eager_rdma_poll_ratio;
|
||||
/** Whether we want fork support or not */
|
||||
int want_fork_support;
|
||||
int rdma_qp;
|
||||
int credits_qp; /* qp used for software flow control */
|
||||
bool cpc_explicitly_defined;
|
||||
|
@ -2634,23 +2634,10 @@ btl_openib_component_init(int *num_btl_modules,
|
||||
goto no_btls;
|
||||
}
|
||||
|
||||
/* If we want fork support, try to enable it */
|
||||
#ifdef HAVE_IBV_FORK_INIT
|
||||
if (0 != mca_btl_openib_component.want_fork_support) {
|
||||
if (0 != ibv_fork_init()) {
|
||||
/* If the want_fork_support MCA parameter is >0, then the
|
||||
user was specifically asking for fork support and we
|
||||
couldn't provide it. So print an error and deactivate
|
||||
this BTL. */
|
||||
if (mca_btl_openib_component.want_fork_support > 0) {
|
||||
opal_show_help("help-mpi-btl-openib.txt",
|
||||
"ibv_fork_init fail", true,
|
||||
opal_process_info.nodename);
|
||||
goto no_btls;
|
||||
}
|
||||
}
|
||||
/* If fork support is requested, try to enable it */
|
||||
if (OPAL_SUCCESS != (ret = opal_common_verbs_fork_test())) {
|
||||
goto no_btls;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Parse the include and exclude lists, checking for errors */
|
||||
mca_btl_openib_component.if_include_list =
|
||||
|
@ -270,11 +270,6 @@ int btl_openib_register_mca_params(void)
|
||||
MCA_BTL_OPENIB_CQ_POLL_BATCH_DEFAULT, &mca_btl_openib_component.cq_poll_batch,
|
||||
REGINT_GE_ONE));
|
||||
|
||||
CHECK(reg_int("want_fork_support", NULL,
|
||||
"Whether fork support is desired or not "
|
||||
"(negative = try to enable fork support, but continue even if it is not available, 0 = do not enable fork support, positive = try to enable fork support and fail if it is not available)",
|
||||
0, &mca_btl_openib_component.want_fork_support, 0));
|
||||
|
||||
asprintf(&str, "%s/mca-btl-openib-device-params.ini",
|
||||
opal_install_dirs.opaldatadir);
|
||||
if (NULL == str) {
|
||||
|
@ -164,6 +164,20 @@ opal_common_verbs_find_max_inline(struct ibv_device *device,
|
||||
*/
|
||||
OPAL_DECLSPEC int opal_common_verbs_qp_test(struct ibv_context *device_context,
|
||||
int flags);
|
||||
/*
|
||||
* ibv_fork_init testing - if fork support is requested then ibv_fork_init
|
||||
* should be called right at the beginning of the verbs initialization flow, before ibv_create_* call.
|
||||
*
|
||||
* Known limitations:
|
||||
* If ibv_fork_init is called after ibv_create_* functions - it will have no effect.
|
||||
* OMPI initializes verbs many times during initialization in the following verbs components:
|
||||
* oob/ud, btl/openib, mtl/mxm, pml/yalla, oshmem/ikrit, oshmem/yoda, ompi/mca/coll/{fca,hcoll}
|
||||
*
|
||||
* So, ibv_fork_init should be called once, at the beginning of the init flow of every verbs component,
|
||||
* to properly request fork support.
|
||||
*
|
||||
*/
|
||||
int opal_common_verbs_fork_test(void);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
|
@ -35,6 +35,9 @@ const char *ibv_get_sysfs_path(void);
|
||||
#endif
|
||||
|
||||
#include "common_verbs.h"
|
||||
#include "opal/runtime/opal_params.h"
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/util/proc.h"
|
||||
|
||||
/***********************************************************************/
|
||||
|
||||
@ -61,3 +64,31 @@ bool opal_common_verbs_check_basics(void)
|
||||
return true;
|
||||
}
|
||||
|
||||
int opal_common_verbs_fork_test(void)
|
||||
{
|
||||
/* Make sure that ibv_fork_init is called before the calls to other memory registering verbs,
|
||||
* which will be called after this function */
|
||||
#ifdef HAVE_IBV_FORK_INIT
|
||||
if (0 != opal_verbs_want_fork_support) {
|
||||
/* Check if fork support is requested by the user */
|
||||
if (0 != ibv_fork_init()) {
|
||||
/* If the opal_want_fork_support MCA parameter is >0 but
|
||||
* the call to ibv_fork_init() failed, then return an error code.
|
||||
*/
|
||||
if (opal_verbs_want_fork_support > 0) {
|
||||
opal_show_help("help-opal-common-verbs.txt",
|
||||
"ibv_fork_init fail", true,
|
||||
opal_proc_local_get()->proc_hostname, errno,
|
||||
strerror(errno));
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
} else {
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
} else {
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
#endif
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -46,3 +46,9 @@ specified:
|
||||
These entities will be ignored. You can disable this warning by
|
||||
setting the ompi_common_verbs_warn_nonexistent_if MCA parameter to 0.
|
||||
#
|
||||
[ibv_fork_init fail]
|
||||
Fork support was requested but the library call ibv_fork_init() failed.
|
||||
|
||||
Hostname: %s
|
||||
Error (%d): %s
|
||||
#
|
||||
|
@ -275,6 +275,16 @@ int opal_register_params(void)
|
||||
return ret;
|
||||
}
|
||||
|
||||
opal_verbs_want_fork_support = 1;
|
||||
ret = mca_base_var_register("opal", "opal", NULL, "verbs_want_fork_support",
|
||||
"Whether fork support is desired or not "
|
||||
"(negative = try to enable fork support, but continue even "
|
||||
"if it is not available, 0 = do not enable fork support, "
|
||||
"positive = try to enable fork support and fail if it is not available)",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
||||
OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ,
|
||||
&opal_verbs_want_fork_support);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -30,6 +30,8 @@ extern char *opal_signal_string;
|
||||
extern char *opal_net_private_ipv4;
|
||||
extern char *opal_set_max_sys_limits;
|
||||
|
||||
/* NOTE(review): unlike the neighbouring declarations, this one carries no
 * `extern`.  If this file is a header (the surrounding `extern` lines
 * suggest so), every translation unit that includes it gets its own
 * tentative definition of this variable, which fails to link when common
 * symbols are disabled (-fno-common).  Confirm, and if so declare it
 * `extern` here and define it exactly once in a .c file. */
int opal_verbs_want_fork_support;
|
||||
|
||||
#if OPAL_ENABLE_TIMING
|
||||
extern char *opal_timing_sync_file;
|
||||
extern char *opal_timing_output;
|
||||
@ -38,6 +40,7 @@ extern bool opal_timing_overhead;
|
||||
|
||||
OPAL_DECLSPEC extern int opal_initialized;
|
||||
OPAL_DECLSPEC extern bool opal_built_with_cuda_support;
|
||||
|
||||
/**
|
||||
* * Whether we want to enable CUDA GPU buffer send and receive support.
|
||||
* */
|
||||
|
@ -55,7 +55,8 @@ mcacomponentdir = $(ortelibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_oob_ud_la_SOURCES = $(sources)
|
||||
mca_oob_ud_la_LDFLAGS = -module -avoid-version $(orte_oob_ud_LDFLAGS)
|
||||
mca_oob_ud_la_LIBADD = $(orte_oob_ud_LIBS)
|
||||
mca_oob_ud_la_LIBADD = $(orte_oob_ud_LIBS) \
|
||||
$(OPAL_TOP_BUILDDIR)/opal/mca/common/verbs/lib@OPAL_LIB_PREFIX@mca_common_verbs.la
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_oob_ud_la_SOURCES = $(sources)
|
||||
|
@ -24,6 +24,8 @@
|
||||
|
||||
#include "oob_ud_component.h"
|
||||
|
||||
#include "opal/mca/common/verbs/common_verbs.h"
|
||||
|
||||
static int mca_oob_ud_component_open (void);
|
||||
static int mca_oob_ud_component_close (void);
|
||||
static int mca_oob_ud_component_register (void);
|
||||
@ -217,6 +219,16 @@ static inline int mca_oob_ud_device_setup (mca_oob_ud_device_t *device,
|
||||
"%s oob:ud:device_setup attempting to setup ib device %p",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) ib_device);
|
||||
|
||||
|
||||
/* If fork support is requested, try to enable it */
|
||||
rc = opal_common_verbs_fork_test();
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
||||
"%s oob:ud:device_setup failed in ibv_fork_init. errno = %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
device->ib_context = ibv_open_device (ib_device);
|
||||
if (NULL == device->ib_context) {
|
||||
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
||||
|
@ -30,7 +30,8 @@ mcacomponentdir = $(oshmemlibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_sshmem_verbs_la_SOURCES = $(sources)
|
||||
mca_sshmem_verbs_la_LDFLAGS = -module -avoid-version $(oshmem_verbs_LDFLAGS)
|
||||
mca_sshmem_verbs_la_LIBADD = $(oshmem_verbs_LIBS)
|
||||
mca_sshmem_verbs_la_LIBADD = $(oshmem_verbs_LIBS) \
|
||||
$(OPAL_TOP_BUILDDIR)/opal/mca/common/verbs/lib@OPAL_LIB_PREFIX@mca_common_verbs.la
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_sshmem_verbs_la_SOURCES =$(sources)
|
||||
|
@ -19,6 +19,7 @@
|
||||
|
||||
#include "opal/constants.h"
|
||||
#include "opal/util/sys_limits.h"
|
||||
#include "opal/mca/common/verbs/common_verbs.h"
|
||||
|
||||
#include "oshmem/mca/sshmem/sshmem.h"
|
||||
#include "oshmem/mca/sshmem/base/base.h"
|
||||
@ -100,6 +101,11 @@ verbs_runtime_query(mca_base_module_t **module,
|
||||
*priority = 0;
|
||||
*module = NULL;
|
||||
|
||||
/* If fork support is requested, try to enable it */
|
||||
if (OSHMEM_SUCCESS != (rc = opal_common_verbs_fork_test())) {
|
||||
return OSHMEM_ERROR;
|
||||
}
|
||||
|
||||
memset(device, 0, sizeof(*device));
|
||||
|
||||
#ifdef HAVE_IBV_GET_DEVICE_LIST
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user