
Merge pull request #6395 from brminich/topic/ucx_net_waddr_4.0.x

PML/UCX: Use net worker address for remote peers - v4.0.x
Howard Pritchard 2019-02-21 20:29:47 -07:00 committed by GitHub
parent e82523fade 1c514948f6
commit 7aeb65579b
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 66 additions and 5 deletions
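With a UCX release that supports worker address flags, the UCX PML now publishes two worker addresses through the modex instead of a single global one: a network-only address (UCP_WORKER_ADDRESS_FLAG_NET_ONLY) at OPAL_PMIX_REMOTE scope for peers on other nodes, and the full worker address at OPAL_PMIX_LOCAL scope for peers on the same node. Since intra-node transports are unusable across nodes, dropping them from the remote address shrinks the connection data exchanged during wire-up. A new configure check detects the UCX attribute and falls back to the old single-address behavior on older installations.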

config/ompi_check_ucx.m4

@@ -120,6 +120,10 @@ AC_DEFUN([OMPI_CHECK_UCX],[
                     UCP_ATOMIC_FETCH_OP_FXOR],
                    [], [],
                    [#include <ucp/api/ucp.h>])
 
+    AC_CHECK_DECLS([UCP_WORKER_ATTR_FIELD_ADDRESS_FLAGS],
+                   [AC_DEFINE([HAVE_UCP_WORKER_ADDRESS_FLAGS], [1],
+                              [have worker address attribute])], [],
+                   [#include <ucp/api/ucp.h>])
     CPPFLAGS=$old_CPPFLAGS
     OPAL_SUMMARY_ADD([[Transports]],[[Open UCX]],[$1],[$ompi_check_ucx_happy])])])
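The new check compiles a tiny test against ucp.h; if UCP_WORKER_ATTR_FIELD_ADDRESS_FLAGS is declared, configure defines HAVE_UCP_WORKER_ADDRESS_FLAGS, which pml_ucx.c uses below to select between the two address-exchange paths. Roughly, the probe that autoconf generates for AC_CHECK_DECLS looks like this (a sketch; the real conftest.c is machine-generated):

/* Sketch of the configure-time probe: if this compiles, the declaration
 * exists and configure emits "#define HAVE_UCP_WORKER_ADDRESS_FLAGS 1"
 * into the generated config header. */
#include <ucp/api/ucp.h>

int main(void)
{
#ifndef UCP_WORKER_ATTR_FIELD_ADDRESS_FLAGS
    /* Force a reference so an undeclared identifier fails the build. */
    (void) UCP_WORKER_ATTR_FIELD_ADDRESS_FLAGS;
#endif
    return 0;
}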

ompi/mca/pml/ucx/pml_ucx.c

@@ -82,11 +82,46 @@ mca_pml_ucx_module_t ompi_pml_ucx = {
 #define PML_UCX_REQ_ALLOCA() \
     ((char *)alloca(ompi_pml_ucx.request_size) + ompi_pml_ucx.request_size);
 
+#if HAVE_UCP_WORKER_ADDRESS_FLAGS
+static int mca_pml_ucx_send_worker_address_type(int addr_flags, int modex_scope)
+{
+    ucs_status_t status;
+    ucp_worker_attr_t attrs;
+    int rc;
+
+    attrs.field_mask    = UCP_WORKER_ATTR_FIELD_ADDRESS |
+                          UCP_WORKER_ATTR_FIELD_ADDRESS_FLAGS;
+    attrs.address_flags = addr_flags;
+
+    status = ucp_worker_query(ompi_pml_ucx.ucp_worker, &attrs);
+    if (UCS_OK != status) {
+        PML_UCX_ERROR("Failed to query UCP worker address");
+        return OMPI_ERROR;
+    }
+
+    OPAL_MODEX_SEND(rc, modex_scope, &mca_pml_ucx_component.pmlm_version,
+                    (void*)attrs.address, attrs.address_length);
+
+    ucp_worker_release_address(ompi_pml_ucx.ucp_worker, attrs.address);
+
+    if (OMPI_SUCCESS != rc) {
+        return OMPI_ERROR;
+    }
+
+    PML_UCX_VERBOSE(2, "Pack %s worker address, size %ld",
+                    (modex_scope == OPAL_PMIX_LOCAL) ? "local" : "remote",
+                    attrs.address_length);
+
+    return OMPI_SUCCESS;
+}
+#endif
+
 static int mca_pml_ucx_send_worker_address(void)
 {
-    ucp_address_t *address;
     ucs_status_t status;
+
+#if !HAVE_UCP_WORKER_ADDRESS_FLAGS
+    ucp_address_t *address;
     size_t addrlen;
     int rc;
@@ -96,16 +131,35 @@ static int mca_pml_ucx_send_worker_address(void)
         return OMPI_ERROR;
     }
 
+    PML_UCX_VERBOSE(2, "Pack worker address, size %ld", addrlen);
+
     OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL,
                     &mca_pml_ucx_component.pmlm_version, (void*)address, addrlen);
-    if (OMPI_SUCCESS != rc) {
-        PML_UCX_ERROR("Open MPI couldn't distribute EP connection details");
-        return OMPI_ERROR;
-    }
 
     ucp_worker_release_address(ompi_pml_ucx.ucp_worker, address);
+
+    if (OMPI_SUCCESS != rc) {
+        goto err;
+    }
+#else
+    /* Pack just network device addresses for remote node peers */
+    status = mca_pml_ucx_send_worker_address_type(UCP_WORKER_ADDRESS_FLAG_NET_ONLY,
+                                                  OPAL_PMIX_REMOTE);
+    if (UCS_OK != status) {
+        goto err;
+    }
+
+    status = mca_pml_ucx_send_worker_address_type(0, OPAL_PMIX_LOCAL);
+    if (UCS_OK != status) {
+        goto err;
+    }
+#endif
 
     return OMPI_SUCCESS;
+
+err:
+    PML_UCX_ERROR("Open MPI couldn't distribute EP connection details");
+    return OMPI_ERROR;
 }
 
 static int mca_pml_ucx_recv_worker_address(ompi_proc_t *proc,
@@ -121,6 +175,9 @@ static int mca_pml_ucx_recv_worker_address(ompi_proc_t *proc,
         PML_UCX_ERROR("Failed to receive UCX worker address: %s (%d)",
                       opal_strerror(ret), ret);
     }
+
+    PML_UCX_VERBOSE(2, "Got proc %d address, size %ld",
+                    proc->super.proc_name.vpid, *addrlen_p);
     return ret;
 }