diff --git a/opal/mca/btl/vader/btl_vader.h b/opal/mca/btl/vader/btl_vader.h index 028c7a3816..eab5f5a87d 100644 --- a/opal/mca/btl/vader/btl_vader.h +++ b/opal/mca/btl/vader/btl_vader.h @@ -84,7 +84,12 @@ union vader_modex_t { void *segment_base; } xpmem; #endif - opal_shmem_ds_t seg_ds; + struct vader_modex_other_t { + ino_t user_ns_id; + int seg_ds_size; + /* seg_ds needs to be the last element */ + opal_shmem_ds_t seg_ds; + } other; }; /** @@ -270,6 +275,8 @@ int mca_btl_vader_get_knem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); #endif +ino_t mca_btl_vader_get_user_ns_id(void); + int mca_btl_vader_get_sc_emu (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, diff --git a/opal/mca/btl/vader/btl_vader_component.c b/opal/mca/btl/vader/btl_vader_component.c index 98ac462b71..44de7e004c 100644 --- a/opal/mca/btl/vader/btl_vader_component.c +++ b/opal/mca/btl/vader/btl_vader_component.c @@ -41,6 +41,10 @@ #include "btl_vader_fbox.h" #include "btl_vader_xpmem.h" +#ifdef HAVE_SYS_STAT_H +#include +#endif + #include #include @@ -350,6 +354,25 @@ static int mca_btl_vader_component_close(void) return OPAL_SUCCESS; } +/* + * mca_btl_vader_parse_proc_ns_user() tries to get the user namespace ID + * of the current process. + * Returns the ID of the user namespace. In the case of an error '0' is returned. + */ +ino_t mca_btl_vader_get_user_ns_id(void) +{ + struct stat buf; + + if (0 > stat("/proc/self/ns/user", &buf)) { + /* + * Something went wrong, probably an old kernel that does not support namespaces + * simply assume all processes are in the same user namespace and return 0 + */ + return 0; + } + + return buf.st_ino; +} static int mca_btl_base_vader_modex_send (void) { union vader_modex_t modex; @@ -363,8 +386,16 @@ static int mca_btl_base_vader_modex_send (void) modex_size = sizeof (modex.xpmem); } else { #endif - modex_size = opal_shmem_sizeof_shmem_ds (&mca_btl_vader_component.seg_ds); - memmove (&modex.seg_ds, &mca_btl_vader_component.seg_ds, modex_size); + modex.other.seg_ds_size = opal_shmem_sizeof_shmem_ds (&mca_btl_vader_component.seg_ds); + memmove (&modex.other.seg_ds, &mca_btl_vader_component.seg_ds, modex.other.seg_ds_size); + modex.other.user_ns_id = mca_btl_vader_get_user_ns_id(); + /* + * If modex.other.user_ns_id is '0' something did not work out + * during user namespace detection. Assuming there are no + * namespaces available it will return '0' for all processes and + * the check later will see '0' everywhere and not disable CMA. + */ + modex_size = sizeof (modex.other); #if OPAL_BTL_VADER_HAVE_XPMEM } diff --git a/opal/mca/btl/vader/btl_vader_module.c b/opal/mca/btl/vader/btl_vader_module.c index a1cd167e22..e54c02b569 100644 --- a/opal/mca/btl/vader/btl_vader_module.c +++ b/opal/mca/btl/vader/btl_vader_module.c @@ -27,6 +27,7 @@ */ #include "opal_config.h" +#include "opal/util/show_help.h" #include "btl_vader.h" #include "btl_vader_endpoint.h" @@ -79,6 +80,28 @@ mca_btl_vader_t mca_btl_vader = { } }; +/* + * Exit function copied from btl_usnic_util.c + * + * The following comment tells Coverity that this function does not return. + * See https://scan.coverity.com/tune. + */ + +/* coverity[+kill] */ +static void vader_btl_exit(mca_btl_vader_t *btl) +{ + if (NULL != btl && NULL != btl->error_cb) { + btl->error_cb(&btl->super, MCA_BTL_ERROR_FLAGS_FATAL, + (opal_proc_t*) opal_proc_local_get(), + "The vader BTL is aborting the MPI job (via PML error callback)."); + } + + /* If the PML error callback returns (or if there wasn't one), just exit. Shrug. */ + fprintf(stderr, "*** The Open MPI vader BTL is aborting the MPI job (via exit(3)).\n"); + fflush(stderr); + exit(1); +} + static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n) { mca_btl_vader_component_t *component = &mca_btl_vader_component; @@ -173,6 +196,7 @@ static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n) static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct opal_proc_t *proc, int remote_rank) { mca_btl_vader_component_t *component = &mca_btl_vader_component; union vader_modex_t *modex; + ino_t my_user_ns_id; size_t msg_size; int rc; @@ -197,17 +221,58 @@ static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct opal_ } else { #endif /* store a copy of the segment information for detach */ - ep->segment_data.other.seg_ds = malloc (msg_size); + ep->segment_data.other.seg_ds = malloc (modex->other.seg_ds_size); if (NULL == ep->segment_data.other.seg_ds) { return OPAL_ERR_OUT_OF_RESOURCE; } - memcpy (ep->segment_data.other.seg_ds, &modex->seg_ds, msg_size); + memcpy (ep->segment_data.other.seg_ds, &modex->other.seg_ds, modex->other.seg_ds_size); ep->segment_base = opal_shmem_segment_attach (ep->segment_data.other.seg_ds); if (NULL == ep->segment_base) { return OPAL_ERROR; } + + if (MCA_BTL_VADER_CMA == mca_btl_vader_component.single_copy_mechanism) { + my_user_ns_id = mca_btl_vader_get_user_ns_id(); + if (my_user_ns_id != modex->other.user_ns_id) { + mca_base_var_source_t source; + int vari; + rc = mca_base_var_find_by_name("btl_vader_single_copy_mechanism", &vari); + if (OPAL_ERROR == rc) { + return OPAL_ERROR; + } + rc = mca_base_var_get_value(vari, NULL, &source, NULL); + if (OPAL_ERROR == rc) { + return OPAL_ERROR; + } + /* + * CMA is not possible as different user namespaces are in use. + * Currently the kernel does not allow * process_vm_{read,write}v() + * for processes running in different user namespaces even if + * all involved user IDs are mapped to the same user ID. + * + * Fallback to MCA_BTL_VADER_EMUL. + */ + if (MCA_BASE_VAR_SOURCE_DEFAULT != source) { + /* If CMA has been explicitly selected we want to error out */ + opal_show_help("help-btl-vader.txt", "cma-different-user-namespace-error", + true, opal_process_info.nodename); + vader_btl_exit(&mca_btl_vader); + } + /* + * If CMA has been selected because it is the default or + * some fallback, this falls back even further. + */ + opal_show_help("help-btl-vader.txt", "cma-different-user-namespace-warning", + true, opal_process_info.nodename); + mca_btl_vader_component.single_copy_mechanism = MCA_BTL_VADER_EMUL; + mca_btl_vader.super.btl_get = mca_btl_vader_get_sc_emu; + mca_btl_vader.super.btl_put = mca_btl_vader_put_sc_emu; + mca_btl_vader.super.btl_put_limit = mca_btl_vader.super.btl_max_send_size - sizeof (mca_btl_vader_sc_emu_hdr_t); + mca_btl_vader.super.btl_get_limit = mca_btl_vader.super.btl_max_send_size - sizeof (mca_btl_vader_sc_emu_hdr_t); + } + } #if OPAL_BTL_VADER_HAVE_XPMEM } #endif diff --git a/opal/mca/btl/vader/help-btl-vader.txt b/opal/mca/btl/vader/help-btl-vader.txt index 9d87267564..ea87559d45 100644 --- a/opal/mca/btl/vader/help-btl-vader.txt +++ b/opal/mca/btl/vader/help-btl-vader.txt @@ -121,6 +121,25 @@ WARNING: Linux kernel CMA support was requested via the btl_vader_single_copy_mechanism MCA variable, but CMA support is not available due to restrictive ptrace settings. +The vader shared memory BTL will fall back on another single-copy +mechanism if one is available. This may result in lower performance. + + Local host: %s +# +[cma-different-user-namespace-error] +ERROR: Linux kernel CMA support was requested via the +btl_vader_single_copy_mechanism MCA variable, but CMA support is +not available due to different user namespaces. + +Your MPI job will abort now. Please select another value for +btl_vader_single_copy_mechanism. + + Local host: %s +# +[cma-different-user-namespace-warning] +WARNING: The default btl_vader_single_copy_mechanism CMA is +not available due to different user namespaces. + The vader shared memory BTL will fall back on another single-copy mechanism if one is available. This may result in lower performance.