Merge pull request #6154 from hjelmn/uct_updates
btl/uct: fix some issues when using UCX over ugni
This commit is contained in:
Commit
9007819c0b
@ -15,6 +15,8 @@
|
||||
* Copyright (c) 2018 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
|
||||
* Copyright (c) 2018 Triad National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -53,7 +55,7 @@ static int mca_btl_uct_component_register(void)
|
||||
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
|
||||
&mca_btl_uct_component.memory_domains);
|
||||
|
||||
mca_btl_uct_component.allowed_transports = "dc_mlx5,rc_mlx5,ud,any";
|
||||
mca_btl_uct_component.allowed_transports = "dc_mlx5,rc_mlx5,ud,ugni_rdma,ugni_smsg,any";
|
||||
(void) mca_base_component_var_register(&mca_btl_uct_component.super.btl_version,
|
||||
"transports", "Comma-delimited list of transports to use sorted by increasing "
|
||||
"priority. The list of transports available can be queried using ucx_info. Special"
|
||||
|
@ -2,6 +2,8 @@
|
||||
/*
|
||||
* Copyright (c) 2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2018 Triad National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -137,11 +139,26 @@ static void mca_btl_uct_connection_ep_destruct (mca_btl_uct_connection_ep_t *ep)
|
||||
OBJ_CLASS_INSTANCE(mca_btl_uct_connection_ep_t, opal_object_t, mca_btl_uct_connection_ep_construct,
|
||||
mca_btl_uct_connection_ep_destruct);
|
||||
|
||||
/**
 * Completion object used to wait for a flush of the connection endpoint.
 *
 * mca_btl_uct_endpoint_send_conn_req() creates one of these on its stack,
 * hands &completion.super to uct_ep_flush(), and then progresses the
 * connection context until the complete flag is set.
 */
struct mca_btl_uct_conn_completion_t {
|
||||
/* embedded UCT completion; must remain the first member because the flush
 * callback casts its uct_completion_t pointer back to this structure */
uct_completion_t super;
|
||||
/* set to true by mca_btl_uct_endpoint_flush_complete(); polled by the waiter */
volatile bool complete;
|
||||
};
|
||||
typedef struct mca_btl_uct_conn_completion_t mca_btl_uct_conn_completion_t;
|
||||
|
||||
/**
 * Completion callback installed in mca_btl_uct_conn_completion_t.super.func.
 *
 * @param self    pointer to the super member of a mca_btl_uct_conn_completion_t
 * @param status  status passed in by the caller of the callback; not examined here
 *
 * Emits a verbose message and marks the enclosing completion object as done.
 */
static void mca_btl_uct_endpoint_flush_complete (uct_completion_t *self, ucs_status_t status)
|
||||
{
|
||||
/* valid because super is the first member of mca_btl_uct_conn_completion_t */
mca_btl_uct_conn_completion_t *completion = (mca_btl_uct_conn_completion_t *) self;
|
||||
BTL_VERBOSE(("connection flush complete"));
|
||||
/* releases the polling loop that is waiting on this flag */
completion->complete = true;
|
||||
}
|
||||
|
||||
static int mca_btl_uct_endpoint_send_conn_req (mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint,
|
||||
mca_btl_uct_device_context_t *conn_tl_context,
|
||||
mca_btl_uct_conn_req_t *request, size_t request_length)
|
||||
{
|
||||
mca_btl_uct_connection_ep_t *conn_ep = endpoint->conn_ep;
|
||||
mca_btl_uct_conn_completion_t completion = {.super = {.count = 1, .func = mca_btl_uct_endpoint_flush_complete},
|
||||
.complete = false};
|
||||
ucs_status_t ucs_status;
|
||||
|
||||
BTL_VERBOSE(("sending connection request to peer. context id: %d, type: %d, length: %" PRIsize_t,
|
||||
@ -170,10 +187,18 @@ static int mca_btl_uct_endpoint_send_conn_req (mca_btl_uct_module_t *uct_btl, mc
|
||||
} while (1);
|
||||
|
||||
/* for now we just wait for the connection request to complete before continuing */
|
||||
do {
|
||||
ucs_status = uct_ep_flush (conn_ep->uct_ep, 0, NULL);
|
||||
mca_btl_uct_context_progress (conn_tl_context);
|
||||
} while (UCS_INPROGRESS == ucs_status);
|
||||
ucs_status = uct_ep_flush (conn_ep->uct_ep, 0, &completion.super);
|
||||
if (UCS_OK != ucs_status && UCS_INPROGRESS != ucs_status) {
|
||||
/* NTH: I don't know if this path is needed. For some networks we must use a completion. */
|
||||
do {
|
||||
ucs_status = uct_ep_flush (conn_ep->uct_ep, 0, NULL);
|
||||
mca_btl_uct_context_progress (conn_tl_context);
|
||||
} while (UCS_INPROGRESS == ucs_status);
|
||||
} else {
|
||||
do {
|
||||
mca_btl_uct_context_progress (conn_tl_context);
|
||||
} while (!completion.complete);
|
||||
}
|
||||
|
||||
opal_mutex_lock (&endpoint->ep_lock);
|
||||
|
||||
@ -284,8 +309,8 @@ int mca_btl_uct_endpoint_connect (mca_btl_uct_module_t *uct_btl, mca_btl_uct_end
|
||||
void *ep_addr, int tl_index)
|
||||
{
|
||||
mca_btl_uct_tl_endpoint_t *tl_endpoint = endpoint->uct_eps[context_id] + tl_index;
|
||||
mca_btl_uct_device_context_t *tl_context = mca_btl_uct_module_get_rdma_context_specific (uct_btl, context_id);
|
||||
mca_btl_uct_tl_t *tl = (tl_index == uct_btl->rdma_tl->tl_index) ? uct_btl->rdma_tl : uct_btl->am_tl;
|
||||
mca_btl_uct_device_context_t *tl_context = mca_btl_uct_module_get_tl_context_specific (uct_btl, tl, context_id);
|
||||
uint8_t *rdma_tl_data = NULL, *conn_tl_data = NULL, *am_tl_data = NULL, *tl_data;
|
||||
mca_btl_uct_connection_ep_t *conn_ep = NULL;
|
||||
mca_btl_uct_modex_t *modex;
|
||||
|
@ -4,6 +4,8 @@
|
||||
* reserved.
|
||||
* Copyright (c) 2018 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2018 Triad National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -26,7 +28,7 @@
|
||||
* @brief Convert UCT capabilities to BTL flags
|
||||
*/
|
||||
static uint64_t mca_btl_uct_cap_to_btl_flag[][2] = {
|
||||
{UCT_IFACE_FLAG_AM_ZCOPY, MCA_BTL_FLAGS_SEND},
|
||||
{UCT_IFACE_FLAG_AM_SHORT, MCA_BTL_FLAGS_SEND},
|
||||
{UCT_IFACE_FLAG_PUT_ZCOPY, MCA_BTL_FLAGS_PUT},
|
||||
{UCT_IFACE_FLAG_GET_ZCOPY, MCA_BTL_FLAGS_GET},
|
||||
{0,0},
|
||||
|
Loading…
x
Reference in new issue
Block a user