From fc68d8a90fe86284e9dc730f878b55c0412f01d2 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 25 Jul 2019 12:32:33 +0200 Subject: [PATCH] Do not use CMA in user namespaces Trying to run processes via mpirun in Podman containers has shown that the CMA btl_vader_single_copy_mechanism does not work when user namespaces are involved. Creating containers with Podman requires at least user namespaces to be able to do unprivileged mounts in a container. Even if running the container with user namespace user ID mappings which result in the same user ID on the inside and outside of all involved containers, the check in the kernel to allow ptrace (and thus process_vm_{read,write}v()) fails if the same IDs are not in the same user namespace. One workaround is to specify '--mca btl_vader_single_copy_mechanism none' and this commit adds code to automatically skip CMA if user namespaces are detected and fall back to MCA_BTL_VADER_EMUL. Signed-off-by: Adrian Reber --- opal/mca/btl/vader/btl_vader.h | 9 +++- opal/mca/btl/vader/btl_vader_component.c | 35 +++++++++++- opal/mca/btl/vader/btl_vader_module.c | 69 +++++++++++++++++++++++- opal/mca/btl/vader/help-btl-vader.txt | 19 +++++++ 4 files changed, 127 insertions(+), 5 deletions(-) diff --git a/opal/mca/btl/vader/btl_vader.h b/opal/mca/btl/vader/btl_vader.h index 028c7a3816..eab5f5a87d 100644 --- a/opal/mca/btl/vader/btl_vader.h +++ b/opal/mca/btl/vader/btl_vader.h @@ -84,7 +84,12 @@ union vader_modex_t { void *segment_base; } xpmem; #endif - opal_shmem_ds_t seg_ds; + struct vader_modex_other_t { + ino_t user_ns_id; + int seg_ds_size; + /* seg_ds needs to be the last element */ + opal_shmem_ds_t seg_ds; + } other; }; /** @@ -270,6 +275,8 @@ int mca_btl_vader_get_knem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); #endif +ino_t mca_btl_vader_get_user_ns_id(void); + int mca_btl_vader_get_sc_emu (mca_btl_base_module_t *btl, 
mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, diff --git a/opal/mca/btl/vader/btl_vader_component.c b/opal/mca/btl/vader/btl_vader_component.c index 3d49fd3471..bbee937fea 100644 --- a/opal/mca/btl/vader/btl_vader_component.c +++ b/opal/mca/btl/vader/btl_vader_component.c @@ -41,6 +41,10 @@ #include "btl_vader_fbox.h" #include "btl_vader_xpmem.h" +#ifdef HAVE_SYS_STAT_H +#include <sys/stat.h> +#endif + #include #include @@ -350,6 +354,25 @@ static int mca_btl_vader_component_close(void) return OPAL_SUCCESS; } +/* + * mca_btl_vader_get_user_ns_id() tries to get the user namespace ID + * of the current process. + * Returns the ID of the user namespace. In the case of an error '0' is returned. + */ +ino_t mca_btl_vader_get_user_ns_id(void) +{ + struct stat buf; + + if (0 > stat("/proc/self/ns/user", &buf)) { + /* + * Something went wrong, probably an old kernel that does not support namespaces; + * simply assume all processes are in the same user namespace and return 0. + */ + return 0; + } + + return buf.st_ino; +} static int mca_btl_base_vader_modex_send (void) { union vader_modex_t modex; @@ -363,8 +386,16 @@ static int mca_btl_base_vader_modex_send (void) modex_size = sizeof (modex.xpmem); } else { #endif - modex_size = opal_shmem_sizeof_shmem_ds (&mca_btl_vader_component.seg_ds); - memmove (&modex.seg_ds, &mca_btl_vader_component.seg_ds, modex_size); + modex.other.seg_ds_size = opal_shmem_sizeof_shmem_ds (&mca_btl_vader_component.seg_ds); + memmove (&modex.other.seg_ds, &mca_btl_vader_component.seg_ds, modex.other.seg_ds_size); + modex.other.user_ns_id = mca_btl_vader_get_user_ns_id(); + /* + * If modex.other.user_ns_id is '0' something did not work out + * during user namespace detection. 
Assuming there are no + * namespaces available it will return '0' for all processes and + * the check later will see '0' everywhere and not disable CMA. + */ + modex_size = sizeof (modex.other); #if OPAL_BTL_VADER_HAVE_XPMEM } diff --git a/opal/mca/btl/vader/btl_vader_module.c b/opal/mca/btl/vader/btl_vader_module.c index a1cd167e22..e54c02b569 100644 --- a/opal/mca/btl/vader/btl_vader_module.c +++ b/opal/mca/btl/vader/btl_vader_module.c @@ -27,6 +27,7 @@ */ #include "opal_config.h" +#include "opal/util/show_help.h" #include "btl_vader.h" #include "btl_vader_endpoint.h" @@ -79,6 +80,28 @@ mca_btl_vader_t mca_btl_vader = { } }; +/* + * Exit function copied from btl_usnic_util.c + * + * The following comment tells Coverity that this function does not return. + * See https://scan.coverity.com/tune. + */ + +/* coverity[+kill] */ +static void vader_btl_exit(mca_btl_vader_t *btl) +{ + if (NULL != btl && NULL != btl->error_cb) { + btl->error_cb(&btl->super, MCA_BTL_ERROR_FLAGS_FATAL, + (opal_proc_t*) opal_proc_local_get(), + "The vader BTL is aborting the MPI job (via PML error callback)."); + } + + /* If the PML error callback returns (or if there wasn't one), just exit. Shrug. 
*/ + fprintf(stderr, "*** The Open MPI vader BTL is aborting the MPI job (via exit(3)).\n"); + fflush(stderr); + exit(1); +} + static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n) { mca_btl_vader_component_t *component = &mca_btl_vader_component; @@ -173,6 +196,7 @@ static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n) static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct opal_proc_t *proc, int remote_rank) { mca_btl_vader_component_t *component = &mca_btl_vader_component; union vader_modex_t *modex; + ino_t my_user_ns_id; size_t msg_size; int rc; @@ -197,17 +221,58 @@ static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct opal_ } else { #endif /* store a copy of the segment information for detach */ - ep->segment_data.other.seg_ds = malloc (msg_size); + ep->segment_data.other.seg_ds = malloc (modex->other.seg_ds_size); if (NULL == ep->segment_data.other.seg_ds) { return OPAL_ERR_OUT_OF_RESOURCE; } - memcpy (ep->segment_data.other.seg_ds, &modex->seg_ds, msg_size); + memcpy (ep->segment_data.other.seg_ds, &modex->other.seg_ds, modex->other.seg_ds_size); ep->segment_base = opal_shmem_segment_attach (ep->segment_data.other.seg_ds); if (NULL == ep->segment_base) { return OPAL_ERROR; } + + if (MCA_BTL_VADER_CMA == mca_btl_vader_component.single_copy_mechanism) { + my_user_ns_id = mca_btl_vader_get_user_ns_id(); + if (my_user_ns_id != modex->other.user_ns_id) { + mca_base_var_source_t source; + int vari; + rc = mca_base_var_find_by_name("btl_vader_single_copy_mechanism", &vari); + if (OPAL_ERROR == rc) { + return OPAL_ERROR; + } + rc = mca_base_var_get_value(vari, NULL, &source, NULL); + if (OPAL_ERROR == rc) { + return OPAL_ERROR; + } + /* + * CMA is not possible as different user namespaces are in use. + * Currently the kernel does not allow process_vm_{read,write}v() + * for processes running in different user namespaces even if + * all involved user IDs are mapped to the same user ID. 
+ * + * Fallback to MCA_BTL_VADER_EMUL. + */ + if (MCA_BASE_VAR_SOURCE_DEFAULT != source) { + /* If CMA has been explicitly selected we want to error out */ + opal_show_help("help-btl-vader.txt", "cma-different-user-namespace-error", + true, opal_process_info.nodename); + vader_btl_exit(&mca_btl_vader); + } + /* + * If CMA has been selected because it is the default or + * some fallback, this falls back even further. + */ + opal_show_help("help-btl-vader.txt", "cma-different-user-namespace-warning", + true, opal_process_info.nodename); + mca_btl_vader_component.single_copy_mechanism = MCA_BTL_VADER_EMUL; + mca_btl_vader.super.btl_get = mca_btl_vader_get_sc_emu; + mca_btl_vader.super.btl_put = mca_btl_vader_put_sc_emu; + mca_btl_vader.super.btl_put_limit = mca_btl_vader.super.btl_max_send_size - sizeof (mca_btl_vader_sc_emu_hdr_t); + mca_btl_vader.super.btl_get_limit = mca_btl_vader.super.btl_max_send_size - sizeof (mca_btl_vader_sc_emu_hdr_t); + } + } #if OPAL_BTL_VADER_HAVE_XPMEM } #endif diff --git a/opal/mca/btl/vader/help-btl-vader.txt b/opal/mca/btl/vader/help-btl-vader.txt index 9d87267564..ea87559d45 100644 --- a/opal/mca/btl/vader/help-btl-vader.txt +++ b/opal/mca/btl/vader/help-btl-vader.txt @@ -121,6 +121,25 @@ WARNING: Linux kernel CMA support was requested via the btl_vader_single_copy_mechanism MCA variable, but CMA support is not available due to restrictive ptrace settings. +The vader shared memory BTL will fall back on another single-copy +mechanism if one is available. This may result in lower performance. + + Local host: %s +# +[cma-different-user-namespace-error] +ERROR: Linux kernel CMA support was requested via the +btl_vader_single_copy_mechanism MCA variable, but CMA support is +not available due to different user namespaces. + +Your MPI job will abort now. Please select another value for +btl_vader_single_copy_mechanism. 
+ + Local host: %s +# +[cma-different-user-namespace-warning] +WARNING: The default btl_vader_single_copy_mechanism CMA is +not available due to different user namespaces. + The vader shared memory BTL will fall back on another single-copy mechanism if one is available. This may result in lower performance.