1
1

btl/uct: bug fixes and general improvements

This commit updates the uct btl to change the transports parameter
into a priority list. The dc_mlx5, rc_mlx5, and ud transports are added
to the priority list. This will give better out of the box performance
for multi-threaded codes because the *_mlx5 transports can avoid the
mlx5 lock inside libmlx5_rdmav2.

This commit also fixes a number of leaks and a possible deadlock when
using RDMA.

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
(cherry picked from commit 39be6ec15c)
Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
Этот коммит содержится в:
Nathan Hjelm 2018-10-03 09:26:26 -06:00
родитель d18ea98263
Коммит 1153082a0f
8 изменённых файлов: 148 добавлений и 91 удалений

Просмотреть файл

@ -106,9 +106,6 @@ struct mca_btl_uct_module_t {
/** large registered frags for packing non-contiguous data */
opal_free_list_t max_frags;
/** RDMA completions */
opal_free_list_t rdma_completions;
/** frags that were waiting on connections that are now ready to send */
opal_list_t pending_frags;
};

Просмотреть файл

@ -104,8 +104,10 @@ int mca_btl_uct_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end
rc = OPAL_SUCCESS;
} else if (UCS_OK == ucs_status) {
rc = 1;
mca_btl_uct_uct_completion_release (comp);
} else {
rc = OPAL_ERR_OUT_OF_RESOURCE;
mca_btl_uct_uct_completion_release (comp);
}
uct_rkey_release (&rkey);
@ -176,8 +178,10 @@ int mca_btl_uct_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_e
rc = OPAL_SUCCESS;
} else if (UCS_OK == ucs_status) {
rc = 1;
mca_btl_uct_uct_completion_release (comp);
} else {
rc = OPAL_ERR_OUT_OF_RESOURCE;
mca_btl_uct_uct_completion_release (comp);
}
uct_rkey_release (&rkey);

Просмотреть файл

@ -28,6 +28,9 @@
#include "opal/mca/btl/base/base.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/util/argv.h"
#include "opal/memoryhooks/memory.h"
#include "opal/mca/memory/base/base.h"
#include <ucm/api/ucm.h>
#include <string.h>
@ -47,13 +50,13 @@ static int mca_btl_uct_component_register(void)
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_uct_component.memory_domains);
mca_btl_uct_component.allowed_transports = "any";
mca_btl_uct_component.allowed_transports = "dc_mlx5,rc_mlx5,ud,any";
(void) mca_base_component_var_register(&mca_btl_uct_component.super.btl_version,
"transports", "Comma-delimited list of transports of the form to use."
" The list of transports available can be queried using ucx_info. Special"
"values: any (any available) (default: any)", MCA_BASE_VAR_TYPE_STRING,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_uct_component.allowed_transports);
"transports", "Comma-delimited list of transports to use sorted by increasing "
"priority. The list of transports available can be queried using ucx_info. Special"
"values: any (any available) (default: dc_mlx5,rc_mlx5,ud,any)",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_uct_component.allowed_transports);
mca_btl_uct_component.num_contexts_per_module = 0;
(void) mca_base_component_var_register(&mca_btl_uct_component.super.btl_version,
@ -93,6 +96,11 @@ static int mca_btl_uct_component_register(void)
&module->super);
}
/**
 * @brief Memory release hook that forwards a released region to UCM
 *
 * Registered with opal_mem_hooks_register_release() in component open when
 * disable_ucx_memory_hooks is set. It passes the region on to ucm_vm_munmap().
 *
 * @param[in] buf         start of the region being released
 * @param[in] length      length of the region in bytes
 * @param[in] cbdata      callback data (unused)
 * @param[in] from_alloc  unused by this callback
 */
static void mca_btl_uct_mem_release_cb(void *buf, size_t length, void *cbdata, bool from_alloc)
{
ucm_vm_munmap(buf, length);
}
static int mca_btl_uct_component_open(void)
{
if (0 == mca_btl_uct_component.num_contexts_per_module) {
@ -112,6 +120,15 @@ static int mca_btl_uct_component_open(void)
}
}
if (mca_btl_uct_component.num_contexts_per_module > MCA_BTL_UCT_MAX_WORKERS) {
mca_btl_uct_component.num_contexts_per_module = MCA_BTL_UCT_MAX_WORKERS;
}
if (mca_btl_uct_component.disable_ucx_memory_hooks) {
ucm_set_external_event(UCM_EVENT_VM_UNMAPPED);
opal_mem_hooks_register_release(mca_btl_uct_mem_release_cb, NULL);
}
return OPAL_SUCCESS;
}
@ -121,6 +138,10 @@ static int mca_btl_uct_component_open(void)
*/
static int mca_btl_uct_component_close(void)
{
/* the release hook is only registered (in component open) when
 * disable_ucx_memory_hooks is set, so only unregister it in that case */
if (mca_btl_uct_component.disable_ucx_memory_hooks) {
opal_mem_hooks_unregister_release (mca_btl_uct_mem_release_cb);
}
/* this function always reports success */
return OPAL_SUCCESS;
}
@ -247,7 +268,6 @@ static mca_btl_uct_module_t *mca_btl_uct_alloc_module (const char *md_name, mca_
OBJ_CONSTRUCT(&module->short_frags, opal_free_list_t);
OBJ_CONSTRUCT(&module->eager_frags, opal_free_list_t);
OBJ_CONSTRUCT(&module->max_frags, opal_free_list_t);
OBJ_CONSTRUCT(&module->rdma_completions, opal_free_list_t);
OBJ_CONSTRUCT(&module->pending_frags, opal_list_t);
OBJ_CONSTRUCT(&module->lock, opal_mutex_t);

Просмотреть файл

@ -23,7 +23,7 @@
* @param[in] tl btl uct tl pointer
* @param[in] context_id identifier for this context (0..MCA_BTL_UCT_MAX_WORKERS-1)
*/
mca_btl_uct_device_context_t *mca_btl_uct_context_create (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl, int context_id);
mca_btl_uct_device_context_t *mca_btl_uct_context_create (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl, int context_id, bool enable_progress);
/**
* @brief Destroy a device context and release all resources
@ -91,7 +91,7 @@ mca_btl_uct_module_get_tl_context_specific (mca_btl_uct_module_t *module, mca_bt
if (OPAL_UNLIKELY(NULL == context)) {
mca_btl_uct_device_context_t *new_context;
new_context = mca_btl_uct_context_create (module, tl, context_id);
new_context = mca_btl_uct_context_create (module, tl, context_id, true);
if (!opal_atomic_compare_exchange_strong_ptr (&tl->uct_dev_contexts[context_id], &context, new_context)) {
mca_btl_uct_context_destroy (new_context);
} else {

Просмотреть файл

@ -31,15 +31,6 @@
#include "btl_uct_endpoint.h"
#include "btl_uct_am.h"
#include "opal/memoryhooks/memory.h"
#include "opal/mca/memory/base/base.h"
#include <ucm/api/ucm.h>
static void mca_btl_uct_mem_release_cb(void *buf, size_t length, void *cbdata, bool from_alloc)
{
ucm_vm_munmap(buf, length);
}
struct mca_btl_base_endpoint_t *mca_btl_uct_get_ep (struct mca_btl_base_module_t *module, opal_proc_t *proc)
{
mca_btl_uct_module_t *uct_module = (mca_btl_uct_module_t *) module;
@ -111,18 +102,6 @@ static int mca_btl_uct_add_procs (mca_btl_base_module_t *btl,
NULL, 0, uct_module->rcache, NULL, NULL);
}
if (rdma_tl) {
rc = opal_free_list_init (&uct_module->rdma_completions, sizeof (mca_btl_uct_uct_completion_t),
opal_cache_line_size, OBJ_CLASS(mca_btl_uct_uct_completion_t),
0, opal_cache_line_size, 0, 4096, 128, NULL, 0, NULL, NULL,
NULL);
}
if (mca_btl_uct_component.disable_ucx_memory_hooks) {
ucm_set_external_event(UCM_EVENT_VM_UNMAPPED);
opal_mem_hooks_register_release(mca_btl_uct_mem_release_cb, NULL);
}
uct_module->initialized = true;
}
@ -288,10 +267,6 @@ int mca_btl_uct_finalize (mca_btl_base_module_t* btl)
mca_btl_uct_endpoint_t *endpoint;
uint64_t key;
if (mca_btl_uct_component.disable_ucx_memory_hooks) {
opal_mem_hooks_unregister_release (mca_btl_uct_mem_release_cb);
}
/* clean up any leftover endpoints */
OPAL_HASH_TABLE_FOREACH(key, uint64, endpoint, &uct_module->id_to_endpoint) {
OBJ_RELEASE(endpoint);
@ -300,7 +275,6 @@ int mca_btl_uct_finalize (mca_btl_base_module_t* btl)
OBJ_DESTRUCT(&uct_module->short_frags);
OBJ_DESTRUCT(&uct_module->eager_frags);
OBJ_DESTRUCT(&uct_module->max_frags);
OBJ_DESTRUCT(&uct_module->rdma_completions);
OBJ_DESTRUCT(&uct_module->pending_frags);
OBJ_DESTRUCT(&uct_module->lock);

Просмотреть файл

@ -30,13 +30,14 @@ static void mca_btl_uct_uct_completion_construct (mca_btl_uct_uct_completion_t *
OBJ_CLASS_INSTANCE(mca_btl_uct_uct_completion_t, opal_free_list_item_t, mca_btl_uct_uct_completion_construct, NULL);
mca_btl_uct_uct_completion_t *
mca_btl_uct_uct_completion_alloc (mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint,
void *local_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_uct_device_context_t *dev_context, mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata)
{
mca_btl_uct_uct_completion_t *comp = (mca_btl_uct_uct_completion_t *) opal_free_list_get (&uct_btl->rdma_completions);
mca_btl_uct_uct_completion_t *comp = (mca_btl_uct_uct_completion_t *) opal_free_list_get (&dev_context->rdma_completions);
if (OPAL_LIKELY(NULL != comp)) {
comp->uct_comp.count = 1;
comp->btl = &uct_btl->super;
@ -55,8 +56,7 @@ mca_btl_uct_uct_completion_alloc (mca_btl_uct_module_t *uct_btl, mca_btl_base_en
void mca_btl_uct_uct_completion_release (mca_btl_uct_uct_completion_t *comp)
{
if (comp) {
mca_btl_uct_module_t *uct_btl = (mca_btl_uct_module_t *) comp->btl;
opal_free_list_return (&uct_btl->rdma_completions, &comp->super);
opal_free_list_return (&comp->dev_context->rdma_completions, &comp->super);
}
}
@ -122,6 +122,8 @@ int mca_btl_uct_get (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi
mca_btl_uct_uct_completion_release (comp);
} else if (UCS_INPROGRESS == ucs_status) {
ucs_status = UCS_OK;
} else {
mca_btl_uct_uct_completion_release (comp);
}
BTL_VERBOSE(("get issued. status = %d", ucs_status));
@ -157,6 +159,8 @@ int mca_btl_uct_put (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi
ucs_status_t ucs_status;
uct_rkey_bundle_t rkey;
uct_ep_h ep_handle;
bool use_short = false;
bool use_bcopy = false;
int rc;
BTL_VERBOSE(("performing put operation. local address: %p, length: %lu", local_address, (unsigned long) size));
@ -177,12 +181,19 @@ int mca_btl_uct_put (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi
mca_btl_uct_context_lock (context);
/* determine what UCT protocol should be used */
if (size <= uct_btl->super.btl_put_local_registration_threshold) {
use_short = size <= uct_btl->rdma_tl->uct_iface_attr.cap.put.max_short;
use_bcopy = !use_short;
}
do {
if (size <= uct_btl->rdma_tl->uct_iface_attr.cap.put.max_short) {
if (use_short) {
ucs_status = uct_ep_put_short (ep_handle, local_address, size, remote_address, rkey.rkey);
} else if (size <= uct_btl->super.btl_put_local_registration_threshold) {
} else if (use_bcopy) {
ssize_t tmp = uct_ep_put_bcopy (ep_handle, mca_btl_uct_put_pack,
&(mca_btl_uct_put_pack_args_t) {.local_address = local_address, .size = size},
&(mca_btl_uct_put_pack_args_t) {.local_address = local_address,
.size = size},
remote_address, rkey.rkey);
ucs_status = (tmp == (ssize_t) size) ? UCS_OK : UCS_ERR_NO_RESOURCE;
} else {

Просмотреть файл

@ -237,7 +237,20 @@ static int mca_btl_uct_setup_connection_tl (mca_btl_uct_module_t *module)
return UCS_OK == ucs_status ? OPAL_SUCCESS : OPAL_ERROR;
}
mca_btl_uct_device_context_t *mca_btl_uct_context_create (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl, int context_id)
/**
 * @brief Enable send/receive progress on a device context's UCT interface
 *
 * Does nothing if the context's progress_enabled flag is already set, so
 * repeated calls on the same context enable progress only once. The flag
 * itself is read and written here without any locking.
 *
 * @param[in] context  device context whose uct_iface should have progress enabled
 */
static void mca_btl_uct_context_enable_progress (mca_btl_uct_device_context_t *context)
{
if (!context->progress_enabled) {
/* request the thread-safe progress flag only when the UCT headers declare it */
#if HAVE_DECL_UCT_PROGRESS_THREAD_SAFE
uct_iface_progress_enable (context->uct_iface, UCT_PROGRESS_THREAD_SAFE | UCT_PROGRESS_SEND |
UCT_PROGRESS_RECV);
#else
uct_iface_progress_enable (context->uct_iface, UCT_PROGRESS_SEND | UCT_PROGRESS_RECV);
#endif
/* remember that progress is on so later calls are no-ops */
context->progress_enabled = true;
}
}
mca_btl_uct_device_context_t *mca_btl_uct_context_create (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl, int context_id, bool enable_progress)
{
uct_iface_params_t iface_params = {.rndv_cb = NULL, .eager_cb = NULL, .stats_root = NULL,
.rx_headroom = 0, .open_mode = UCT_IFACE_OPEN_MODE_DEVICE,
@ -245,6 +258,7 @@ mca_btl_uct_device_context_t *mca_btl_uct_context_create (mca_btl_uct_module_t *
.dev_name = tl->uct_dev_name}}};
mca_btl_uct_device_context_t *context;
ucs_status_t ucs_status;
int rc;
context = calloc (1, sizeof (*context));
if (OPAL_UNLIKELY(NULL == context)) {
@ -255,44 +269,47 @@ mca_btl_uct_device_context_t *mca_btl_uct_context_create (mca_btl_uct_module_t *
context->uct_btl = module;
OBJ_CONSTRUCT(&context->completion_fifo, opal_fifo_t);
OBJ_CONSTRUCT(&context->mutex, opal_recursive_mutex_t);
OBJ_CONSTRUCT(&context->rdma_completions, opal_free_list_t);
do {
/* apparently (in contradiction to the spec) UCT is *not* thread safe. because we have to
* use our own locks just go ahead and use UCS_THREAD_MODE_SINGLE. if they ever fix their
* api then change this back to UCS_THREAD_MODE_MULTI and remove the locks around the
* various UCT calls. */
ucs_status = uct_worker_create (module->ucs_async, UCS_THREAD_MODE_SINGLE, &context->uct_worker);
if (UCS_OK != ucs_status) {
BTL_VERBOSE(("could not create a UCT worker"));
mca_btl_uct_context_destroy (context);
context = NULL;
break;
}
rc = opal_free_list_init (&context->rdma_completions, sizeof (mca_btl_uct_uct_completion_t),
opal_cache_line_size, OBJ_CLASS(mca_btl_uct_uct_completion_t),
0, opal_cache_line_size, 0, 4096, 128, NULL, 0, NULL, NULL,
NULL);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
mca_btl_uct_context_destroy (context);
return NULL;
}
ucs_status = uct_iface_open (tl->uct_md->uct_md, context->uct_worker, &iface_params,
tl->uct_tl_config, &context->uct_iface);
if (UCS_OK != ucs_status) {
BTL_VERBOSE(("could not open UCT interface. error code: %d", ucs_status));
mca_btl_uct_context_destroy (context);
context = NULL;
break;
}
/* apparently (in contradiction to the spec) UCT is *not* thread safe. because we have to
* use our own locks just go ahead and use UCS_THREAD_MODE_SINGLE. if they ever fix their
* api then change this back to UCS_THREAD_MODE_MULTI and remove the locks around the
* various UCT calls. */
ucs_status = uct_worker_create (module->ucs_async, UCS_THREAD_MODE_SERIALIZED, &context->uct_worker);
if (OPAL_UNLIKELY(UCS_OK != ucs_status)) {
BTL_VERBOSE(("could not create a UCT worker"));
mca_btl_uct_context_destroy (context);
return NULL;
}
BTL_VERBOSE(("enabling progress for tl %p context id %d", (void *) tl, context_id));
ucs_status = uct_iface_open (tl->uct_md->uct_md, context->uct_worker, &iface_params,
tl->uct_tl_config, &context->uct_iface);
if (OPAL_UNLIKELY(UCS_OK != ucs_status)) {
BTL_VERBOSE(("could not open UCT interface. error code: %d", ucs_status));
mca_btl_uct_context_destroy (context);
return NULL;
}
#if HAVE_DECL_UCT_PROGRESS_THREAD_SAFE
uct_iface_progress_enable (context->uct_iface, UCT_PROGRESS_THREAD_SAFE | UCT_PROGRESS_SEND |
UCT_PROGRESS_RECV);
#else
uct_iface_progress_enable (context->uct_iface, UCT_PROGRESS_SEND | UCT_PROGRESS_RECV);
#endif
BTL_VERBOSE(("enabling progress for tl %p context id %d", (void *) tl, context_id));
if (context_id > 0 && tl == module->am_tl) {
BTL_VERBOSE(("installing AM handler for tl %p context id %d", (void *) tl, context_id));
uct_iface_set_am_handler (context->uct_iface, MCA_BTL_UCT_FRAG, mca_btl_uct_am_handler,
context, UCT_CB_FLAG_SYNC);
}
} while (0);
if (enable_progress) {
mca_btl_uct_context_enable_progress (context);
}
if (context_id > 0 && tl == module->am_tl) {
BTL_VERBOSE(("installing AM handler for tl %p context id %d", (void *) tl, context_id));
uct_iface_set_am_handler (context->uct_iface, MCA_BTL_UCT_FRAG, mca_btl_uct_am_handler,
context, UCT_CB_FLAG_SYNC);
}
return context;
}
@ -310,6 +327,7 @@ void mca_btl_uct_context_destroy (mca_btl_uct_device_context_t *context)
}
OBJ_DESTRUCT(&context->completion_fifo);
OBJ_DESTRUCT(&context->rdma_completions);
free (context);
}
@ -347,7 +365,7 @@ static mca_btl_uct_tl_t *mca_btl_uct_create_tl (mca_btl_uct_module_t *module, mc
(void) uct_md_iface_config_read (md->uct_md, tl_desc->tl_name, NULL, NULL, &tl->uct_tl_config);
/* always create a 0 context (needed to query) */
tl->uct_dev_contexts[0] = mca_btl_uct_context_create (module, tl, 0);
tl->uct_dev_contexts[0] = mca_btl_uct_context_create (module, tl, 0, false);
if (NULL == tl->uct_dev_contexts[0]) {
BTL_VERBOSE(("could not create a uct device context"));
OBJ_RELEASE(tl);
@ -385,12 +403,8 @@ static void mca_btl_uct_set_tl_rdma (mca_btl_uct_module_t *module, mca_btl_uct_t
module->super.btl_put_limit = tl->uct_iface_attr.cap.put.max_zcopy;
module->super.btl_put_alignment = 0;
/* no registration needed when using short put */
if (tl->uct_iface_attr.cap.put.max_bcopy > tl->uct_iface_attr.cap.put.max_short) {
module->super.btl_put_local_registration_threshold = tl->uct_iface_attr.cap.put.max_bcopy;
} else {
module->super.btl_put_local_registration_threshold = tl->uct_iface_attr.cap.put.max_short;
}
/* no registration needed when using short/bcopy put */
module->super.btl_put_local_registration_threshold = tl->uct_iface_attr.cap.put.max_bcopy;
module->rdma_tl = tl;
OBJ_RETAIN(tl);
@ -478,6 +492,11 @@ static int mca_btl_uct_evaluate_tl (mca_btl_uct_module_t *module, mca_btl_uct_tl
module->super.btl_latency = 1;
}
if (tl == module->rdma_tl || tl == module->am_tl || tl == module->conn_tl) {
/* make sure progress is enabled on the default context now that we know this TL will be used */
mca_btl_uct_context_enable_progress (tl->uct_dev_contexts[0]);
}
return OPAL_SUCCESS;
}
@ -487,6 +506,7 @@ int mca_btl_uct_query_tls (mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, u
mca_btl_uct_tl_t *tl;
opal_list_t tl_list;
char **tl_filter;
int any_priority = 0;
OBJ_CONSTRUCT(&tl_list, opal_list_t);
@ -499,23 +519,46 @@ int mca_btl_uct_query_tls (mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, u
free (tl_filter[0]);
tl_filter[0] = tmp;
include = false;
} else if (0 == strcmp (tl_filter[0], "any")) {
any = true;
}
/* check for the any keyword */
for (unsigned j = 0 ; tl_filter[j] ; ++j) {
if (0 == strcmp (tl_filter[j], "any")) {
any_priority = j;
any = true;
break;
}
}
if (any && !include) {
opal_argv_free (tl_filter);
return OPAL_ERR_NOT_AVAILABLE;
}
for (unsigned i = 0 ; i < tl_count ; ++i) {
bool try_tl = any;
int priority = 0;
int priority = any_priority;
for (unsigned j = 0 ; tl_filter[j] && !try_tl ; ++j) {
try_tl = (0 == strcmp (tl_filter[j], tl_descs[i].tl_name)) == include;
priority = j;
for (unsigned j = 0 ; tl_filter[j] ; ++j) {
if (0 == strcmp (tl_filter[j], tl_descs[i].tl_name)) {
try_tl = include;
priority = j;
break;
}
}
BTL_VERBOSE(("tl filter: tl_name = %s, use = %d, priority = %d", tl_descs[i].tl_name, try_tl, priority));
if (!try_tl) {
continue;
}
if (0 == strcmp (tl_descs[i].tl_name, "ud")) {
/* ud looks like any normal transport but we do not want to use it for anything other
* than connection management so ensure it gets evaluated last */
priority = INT_MAX;
}
tl = mca_btl_uct_create_tl (module, md, tl_descs + i, priority);
if (tl) {
@ -523,6 +566,8 @@ int mca_btl_uct_query_tls (mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, u
}
}
opal_argv_free (tl_filter);
if (0 == opal_list_get_size (&tl_list)) {
BTL_VERBOSE(("no suitable tls match filter: %s", mca_btl_uct_component.allowed_transports));
OBJ_DESTRUCT(&tl_list);

Просмотреть файл

@ -141,9 +141,15 @@ struct mca_btl_uct_device_context_t {
/** UCT interface handle */
uct_iface_h uct_iface;
/** RDMA completions */
opal_free_list_t rdma_completions;
/** complete fragments and rdma operations. this fifo is used to avoid making
* callbacks while holding the device lock. */
opal_fifo_t completion_fifo;
/** progress is enabled on this context */
bool progress_enabled;
};
typedef struct mca_btl_uct_device_context_t mca_btl_uct_device_context_t;