1
1

Merge pull request #6154 from hjelmn/uct_updates

btl/uct: fix some issues when using UCX over ugni
Этот коммит содержится в:
Nathan Hjelm 2018-12-06 10:03:59 -07:00 коммит произвёл GitHub
родитель a21602d993 e07a64c52d
Коммит 9007819c0b
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
3 изменённых файлов: 36 добавлений и 7 удалений

Просмотреть файл

@ -15,6 +15,8 @@
* Copyright (c) 2018 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
* Copyright (c) 2018 Triad National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -53,7 +55,7 @@ static int mca_btl_uct_component_register(void)
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_uct_component.memory_domains);
mca_btl_uct_component.allowed_transports = "dc_mlx5,rc_mlx5,ud,any";
mca_btl_uct_component.allowed_transports = "dc_mlx5,rc_mlx5,ud,ugni_rdma,ugni_smsg,any";
(void) mca_base_component_var_register(&mca_btl_uct_component.super.btl_version,
"transports", "Comma-delimited list of transports to use sorted by increasing "
"priority. The list of transports available can be queried using ucx_info. Special"

Просмотреть файл

@ -2,6 +2,8 @@
/*
* Copyright (c) 2018 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2018 Triad National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -137,11 +139,26 @@ static void mca_btl_uct_connection_ep_destruct (mca_btl_uct_connection_ep_t *ep)
OBJ_CLASS_INSTANCE(mca_btl_uct_connection_ep_t, opal_object_t, mca_btl_uct_connection_ep_construct,
mca_btl_uct_connection_ep_destruct);
/**
 * Completion object handed to UCT when flushing the connection endpoint.
 */
typedef struct mca_btl_uct_conn_completion_t {
    /** base UCT completion; must stay the first member because the flush
     * callback casts the uct_completion_t pointer back to this type */
    uct_completion_t super;
    /** set to true by the flush callback; the sender polls this flag while
     * progressing the context (volatile so each poll re-reads it) */
    volatile bool complete;
} mca_btl_uct_conn_completion_t;
/**
 * @brief Completion callback for the connection endpoint flush
 *
 * @param[in] self    UCT completion embedded at the start of a
 *                    mca_btl_uct_conn_completion_t
 * @param[in] status  status reported by UCT (not examined here)
 *
 * Marks the enclosing connection completion as complete so the code waiting
 * on the flush can stop progressing the context.
 */
static void mca_btl_uct_endpoint_flush_complete (uct_completion_t *self, ucs_status_t status)
{
    (void) status;

    BTL_VERBOSE(("connection flush complete"));

    /* self is the first member of the connection completion, so the cast
     * recovers the enclosing object */
    ((mca_btl_uct_conn_completion_t *) self)->complete = true;
}
static int mca_btl_uct_endpoint_send_conn_req (mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint,
mca_btl_uct_device_context_t *conn_tl_context,
mca_btl_uct_conn_req_t *request, size_t request_length)
{
mca_btl_uct_connection_ep_t *conn_ep = endpoint->conn_ep;
mca_btl_uct_conn_completion_t completion = {.super = {.count = 1, .func = mca_btl_uct_endpoint_flush_complete},
.complete = false};
ucs_status_t ucs_status;
BTL_VERBOSE(("sending connection request to peer. context id: %d, type: %d, length: %" PRIsize_t,
@ -170,10 +187,18 @@ static int mca_btl_uct_endpoint_send_conn_req (mca_btl_uct_module_t *uct_btl, mc
} while (1);
/* for now we just wait for the connection request to complete before continuing */
do {
ucs_status = uct_ep_flush (conn_ep->uct_ep, 0, NULL);
mca_btl_uct_context_progress (conn_tl_context);
} while (UCS_INPROGRESS == ucs_status);
ucs_status = uct_ep_flush (conn_ep->uct_ep, 0, &completion.super);
if (UCS_OK != ucs_status && UCS_INPROGRESS != ucs_status) {
/* NTH: I don't know if this path is needed. For some networks we must use a completion. */
do {
ucs_status = uct_ep_flush (conn_ep->uct_ep, 0, NULL);
mca_btl_uct_context_progress (conn_tl_context);
} while (UCS_INPROGRESS == ucs_status);
} else {
do {
mca_btl_uct_context_progress (conn_tl_context);
} while (!completion.complete);
}
opal_mutex_lock (&endpoint->ep_lock);
@ -284,8 +309,8 @@ int mca_btl_uct_endpoint_connect (mca_btl_uct_module_t *uct_btl, mca_btl_uct_end
void *ep_addr, int tl_index)
{
mca_btl_uct_tl_endpoint_t *tl_endpoint = endpoint->uct_eps[context_id] + tl_index;
mca_btl_uct_device_context_t *tl_context = mca_btl_uct_module_get_rdma_context_specific (uct_btl, context_id);
mca_btl_uct_tl_t *tl = (tl_index == uct_btl->rdma_tl->tl_index) ? uct_btl->rdma_tl : uct_btl->am_tl;
mca_btl_uct_device_context_t *tl_context = mca_btl_uct_module_get_tl_context_specific (uct_btl, tl, context_id);
uint8_t *rdma_tl_data = NULL, *conn_tl_data = NULL, *am_tl_data = NULL, *tl_data;
mca_btl_uct_connection_ep_t *conn_ep = NULL;
mca_btl_uct_modex_t *modex;

Просмотреть файл

@ -4,6 +4,8 @@
* reserved.
* Copyright (c) 2018 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2018 Triad National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -26,7 +28,7 @@
* @brief Convert UCT capabilities to BTL flags
*/
static uint64_t mca_btl_uct_cap_to_btl_flag[][2] = {
{UCT_IFACE_FLAG_AM_ZCOPY, MCA_BTL_FLAGS_SEND},
{UCT_IFACE_FLAG_AM_SHORT, MCA_BTL_FLAGS_SEND},
{UCT_IFACE_FLAG_PUT_ZCOPY, MCA_BTL_FLAGS_PUT},
{UCT_IFACE_FLAG_GET_ZCOPY, MCA_BTL_FLAGS_GET},
{0,0},