MTL MXM: re-apply commit r27987, this time under the correct user.
r27987 - MTL MXM: ver. 2.0 interface changes. This commit was SVN r28026. The following SVN revision numbers were found above: r27987 --> open-mpi/ompi@2735658d81
Этот коммит содержится в:
родитель
aa5e436479
Коммит
21b170b43b
@ -98,14 +98,14 @@ static int ompi_mtl_mxm_get_ep_address(ompi_mtl_mxm_ep_conn_info_t *ep_info, mxm
|
|||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
static int ompi_mtl_mxm_get_ep_address(ompi_mtl_mxm_ep_conn_info_t *ep_info,
|
static int ompi_mtl_mxm_get_ep_address(ompi_mtl_mxm_ep_conn_info_t *ep_info, int dest_rank,
|
||||||
mxm_domain_id_t domain)
|
mxm_domain_id_t domain)
|
||||||
{
|
{
|
||||||
size_t addrlen;
|
size_t addrlen;
|
||||||
mxm_error_t err;
|
mxm_error_t err;
|
||||||
|
|
||||||
addrlen = sizeof(ep_info->dest_addr[domain]);
|
addrlen = sizeof(ep_info->dest_addr[domain]);
|
||||||
err = mxm_ep_address(ompi_mtl_mxm.ep, domain,
|
err = mxm_ep_address(ompi_mtl_mxm.ep, domain, dest_rank,
|
||||||
(struct sockaddr *) &ep_info->dest_addr[domain], &addrlen);
|
(struct sockaddr *) &ep_info->dest_addr[domain], &addrlen);
|
||||||
if (MXM_OK == err) {
|
if (MXM_OK == err) {
|
||||||
ep_info->domain_bitmap |= MXM_BIT(domain);
|
ep_info->domain_bitmap |= MXM_BIT(domain);
|
||||||
@ -123,7 +123,11 @@ static int ompi_mtl_mxm_get_ep_address(ompi_mtl_mxm_ep_conn_info_t *ep_info,
|
|||||||
#define max(a,b) ((a)>(b)?(a):(b))
|
#define max(a,b) ((a)>(b)?(a):(b))
|
||||||
|
|
||||||
static mxm_error_t ompi_mtl_mxm_create_ep(mxm_h ctx, mxm_ep_h *ep, unsigned ptl_bitmap, int lr,
|
static mxm_error_t ompi_mtl_mxm_create_ep(mxm_h ctx, mxm_ep_h *ep, unsigned ptl_bitmap, int lr,
|
||||||
uint32_t jobid, uint64_t mxlr, int nlps) {
|
uint32_t jobid, uint64_t mxlr, int nlps
|
||||||
|
#if MXM_API >= MXM_VERSION(2, 0)
|
||||||
|
, int totps
|
||||||
|
#endif
|
||||||
|
) {
|
||||||
mxm_error_t err;
|
mxm_error_t err;
|
||||||
|
|
||||||
#if MXM_API < MXM_VERSION(1,5)
|
#if MXM_API < MXM_VERSION(1,5)
|
||||||
@ -169,10 +173,15 @@ static mxm_error_t ompi_mtl_mxm_create_ep(mxm_h ctx, mxm_ep_h *ep, unsigned ptl_
|
|||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if MXM_API >= MXM_VERSION(2, 0)
|
||||||
|
err = mxm_ep_create(ctx, ep_opts, ep, totps);
|
||||||
|
#else
|
||||||
ep_opts->job_id = jobid;
|
ep_opts->job_id = jobid;
|
||||||
ep_opts->local_rank = lr;
|
ep_opts->local_rank = lr;
|
||||||
ep_opts->num_local_procs = nlps;
|
ep_opts->num_local_procs = nlps;
|
||||||
err = mxm_ep_create(ctx, ep_opts, ep);
|
err = mxm_ep_create(ctx, ep_opts, ep);
|
||||||
|
#endif
|
||||||
|
|
||||||
mxm_config_free(ep_opts);
|
mxm_config_free(ep_opts);
|
||||||
#endif
|
#endif
|
||||||
return err;
|
return err;
|
||||||
@ -190,9 +199,92 @@ static void ompi_mtl_mxm_set_conn_req(mxm_conn_req_t *conn_req, ompi_mtl_mxm_ep_
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if MXM_API >= MXM_VERSION(2,0)
|
||||||
|
#define MTL_MXM_MODEX_MAX_SIZE ((size_t)0x60)
|
||||||
|
/*
 * Publish this endpoint's connection info through the modex, once per
 * destination rank.
 *
 * For every dest in [0, totps):
 *   - ep_info->domain_bitmap is cleared and the SELF, SHM and IB domain
 *     addresses are queried into *ep_info via ompi_mtl_mxm_get_ep_address();
 *   - the raw bytes of *ep_info are sent in portions of at most
 *     MTL_MXM_MODEX_MAX_SIZE bytes, each under the key
 *     "<component>-<dest>-<portion index>" (e.g. mtl.mxm.1.5-1-18).
 *     Some modex back ends limit the size of a single value (the original
 *     comment names ess/pmi), hence the chunking.
 *
 * @param ep_info  scratch connection-info structure; overwritten on each
 *                 iteration and used as the send buffer.
 * @param totps    number of destination ranks to publish an address for.
 *
 * @return OMPI_SUCCESS on success;
 *         OMPI_ERR_OUT_OF_RESOURCE if the component string or the key
 *         buffer cannot be obtained;
 *         OMPI_ERROR if an address query or a modex send fails.
 *
 * Both heap strings are released on every exit path through the single
 * cleanup label at the bottom.
 */
static int ompi_mtl_mxm_send_ep_address(ompi_mtl_mxm_ep_conn_info_t *ep_info, int totps)
{
    mca_mtl_base_component_2_0_0_t *cm = &mca_mtl_mxm_component.super;
    char *mxm_version = mca_base_component_to_string(&cm->mtl_version);
    char *modex_key = NULL;
    size_t modex_key_len;
    int ret = OMPI_ERROR;
    int rc, dest;

    /* strlen(NULL) is undefined behavior; bail out before touching the string. */
    if (NULL == mxm_version) {
        MXM_ERROR("Cannot allocate memory.");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* Rough approximation of the key length: mxm_version-dest_rank-portion_num */
    modex_key_len = strlen(mxm_version) + 8 * sizeof(int) + 8 * sizeof(size_t) + 2;
    modex_key = malloc(modex_key_len);
    if (NULL == modex_key) {
        MXM_ERROR("Cannot allocate memory.");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto out;
    }

    for (dest = 0; dest < totps; ++dest) {
        int modex_name_id = 0;
        size_t modex_buf_size = sizeof(*ep_info);
        unsigned char *modex_buf_ptr = (unsigned char *) ep_info;

        /*
         * Get address for each PTL on this endpoint, and share it with other ranks.
         */
        ep_info->domain_bitmap = 0;

        rc = ompi_mtl_mxm_get_ep_address(ep_info, dest, MXM_DOMAIN_SELF);
        if (OMPI_SUCCESS != rc) {
            MXM_ERROR("Failed to get endpoint address: for domain SELF dest %d.", dest);
            goto out;
        }

        rc = ompi_mtl_mxm_get_ep_address(ep_info, dest, MXM_DOMAIN_SHM);
        if (OMPI_SUCCESS != rc) {
            MXM_ERROR("Failed to get endpoint address: for domain SHM dest %d.", dest);
            goto out;
        }

        rc = ompi_mtl_mxm_get_ep_address(ep_info, dest, MXM_DOMAIN_IB);
        if (OMPI_SUCCESS != rc) {
            MXM_ERROR("Failed to get endpoint address: for domain IB dest %d.", dest);
            goto out;
        }

        while (modex_buf_size > 0) {
            /* Send at most MTL_MXM_MODEX_MAX_SIZE bytes per modex entry. */
            size_t modex_cur_size = modex_buf_size < MTL_MXM_MODEX_MAX_SIZE ?
                                    modex_buf_size : MTL_MXM_MODEX_MAX_SIZE;

            /* Modex key looks as mtl.mxm.1.5-1-18 where mtl.mxm.1.5 is the component,
               1 is a destination rank and 18 is a portion index */
            snprintf(modex_key, modex_key_len, "%s-%d-%d", mxm_version, dest, modex_name_id);

            rc = ompi_modex_send_string((const char *) modex_key, modex_buf_ptr, modex_cur_size);
            if (OMPI_SUCCESS != rc) {
                MXM_ERROR("Open MPI couldn't distribute EP connection details");
                goto out;
            }

            ++modex_name_id;
            modex_buf_ptr += modex_cur_size;
            modex_buf_size -= modex_cur_size;
        }
    }

    ret = OMPI_SUCCESS;

out:
    /* free(NULL) is a no-op, so this is safe even if modex_key was never allocated. */
    free(modex_key);
    free(mxm_version);

    return ret;
}
|
||||||
|
#endif
|
||||||
|
|
||||||
int ompi_mtl_mxm_module_init(void)
|
int ompi_mtl_mxm_module_init(void)
|
||||||
{
|
{
|
||||||
|
|
||||||
ompi_mtl_mxm_ep_conn_info_t ep_info;
|
ompi_mtl_mxm_ep_conn_info_t ep_info;
|
||||||
mxm_error_t err;
|
mxm_error_t err;
|
||||||
uint32_t jobid;
|
uint32_t jobid;
|
||||||
@ -248,7 +340,11 @@ int ompi_mtl_mxm_module_init(void)
|
|||||||
|
|
||||||
/* Open MXM endpoint */
|
/* Open MXM endpoint */
|
||||||
err = ompi_mtl_mxm_create_ep(ompi_mtl_mxm.mxm_context, &ompi_mtl_mxm.ep,
|
err = ompi_mtl_mxm_create_ep(ompi_mtl_mxm.mxm_context, &ompi_mtl_mxm.ep,
|
||||||
ptl_bitmap, lr, jobid, mxlr, nlps);
|
ptl_bitmap, lr, jobid, mxlr, nlps
|
||||||
|
#if MXM_API >= MXM_VERSION(2, 0)
|
||||||
|
, totps
|
||||||
|
#endif
|
||||||
|
);
|
||||||
|
|
||||||
if (MXM_OK != err) {
|
if (MXM_OK != err) {
|
||||||
ompi_show_help("help-mtl-mxm.txt", "unable to create endpoint", true,
|
ompi_show_help("help-mtl-mxm.txt", "unable to create endpoint", true,
|
||||||
@ -272,19 +368,7 @@ int ompi_mtl_mxm_module_init(void)
|
|||||||
OMPI_SUCCESS != ompi_mtl_mxm_get_ep_address(&ep_info, MXM_PTL_SHM)) {
|
OMPI_SUCCESS != ompi_mtl_mxm_get_ep_address(&ep_info, MXM_PTL_SHM)) {
|
||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
ep_info.domain_bitmap = 0;
|
|
||||||
if (OMPI_SUCCESS != ompi_mtl_mxm_get_ep_address(&ep_info, MXM_DOMAIN_SELF)) {
|
|
||||||
return OMPI_ERROR;
|
|
||||||
}
|
|
||||||
if (OMPI_SUCCESS != ompi_mtl_mxm_get_ep_address(&ep_info, MXM_DOMAIN_SHM)) {
|
|
||||||
return OMPI_ERROR;
|
|
||||||
}
|
|
||||||
if (OMPI_SUCCESS != ompi_mtl_mxm_get_ep_address(&ep_info, MXM_DOMAIN_IB)) {
|
|
||||||
return OMPI_ERROR;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* send information using modex (in some case there is limitation on data size for example ess/pmi)
|
* send information using modex (in some case there is limitation on data size for example ess/pmi)
|
||||||
* set size of data sent for once
|
* set size of data sent for once
|
||||||
@ -303,10 +387,10 @@ int ompi_mtl_mxm_module_init(void)
|
|||||||
sprintf(modex_name, "%s-%d", modex_component_name, modex_name_id);
|
sprintf(modex_name, "%s-%d", modex_component_name, modex_name_id);
|
||||||
|
|
||||||
if (OMPI_SUCCESS != ompi_modex_send_string((const char *)modex_name, modex_buf_ptr, modex_cur_size)) {
|
if (OMPI_SUCCESS != ompi_modex_send_string((const char *)modex_name, modex_buf_ptr, modex_cur_size)) {
|
||||||
MXM_ERROR("Open MPI couldn't distribute EP connection details");
|
MXM_ERROR("Open MPI couldn't distribute EP connection details");
|
||||||
free(modex_component_name);
|
free(modex_component_name);
|
||||||
free(modex_name);
|
free(modex_name);
|
||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
}
|
}
|
||||||
modex_name_id++;
|
modex_name_id++;
|
||||||
modex_buf_ptr += modex_cur_size;
|
modex_buf_ptr += modex_cur_size;
|
||||||
@ -316,7 +400,17 @@ int ompi_mtl_mxm_module_init(void)
|
|||||||
free(modex_component_name);
|
free(modex_component_name);
|
||||||
free(modex_name);
|
free(modex_name);
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
{
|
||||||
|
int rc;
|
||||||
|
rc = ompi_mtl_mxm_send_ep_address(&ep_info, totps);
|
||||||
|
if (OMPI_SUCCESS != rc) {
|
||||||
|
MXM_ERROR("Modex session failed.");
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
/* Register the MXM progress function */
|
/* Register the MXM progress function */
|
||||||
opal_progress_register(ompi_mtl_mxm_progress);
|
opal_progress_register(ompi_mtl_mxm_progress);
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
@ -363,12 +457,20 @@ int ompi_mtl_mxm_add_procs(struct mca_mtl_base_module_t *mtl, size_t nprocs,
|
|||||||
size_t modex_buf_size = sizeof(ompi_mtl_mxm_ep_conn_info_t);
|
size_t modex_buf_size = sizeof(ompi_mtl_mxm_ep_conn_info_t);
|
||||||
size_t modex_cur_size = 0;
|
size_t modex_cur_size = 0;
|
||||||
char *modex_component_name = mca_base_component_to_string(&mca_mtl_mxm_component.super.mtl_version);
|
char *modex_component_name = mca_base_component_to_string(&mca_mtl_mxm_component.super.mtl_version);
|
||||||
char *modex_name = malloc(strlen(modex_component_name) + 5);
|
|
||||||
int modex_name_id = 0;
|
int modex_name_id = 0;
|
||||||
|
#if MXM_API < MXM_VERSION(2,0)
|
||||||
|
char *modex_name = malloc(strlen(modex_component_name) + 5);
|
||||||
|
#else
|
||||||
|
char *modex_name = malloc(strlen(modex_component_name) + 8 * sizeof(int) + 8 * sizeof(size_t) + 2);
|
||||||
|
#endif
|
||||||
|
|
||||||
while (modex_buf_size > 0) {
|
while (modex_buf_size > 0) {
|
||||||
/* modex name looks as mtl.mxm.1.5-18 where mtl.mxm.1.5 is the component and 18 is portion index */
|
/* modex name looks as mtl.mxm.1.5-18 where mtl.mxm.1.5 is the component and 18 is portion index */
|
||||||
|
#if MXM_API < MXM_VERSION(2,0)
|
||||||
sprintf(modex_name, "%s-%d", modex_component_name, modex_name_id);
|
sprintf(modex_name, "%s-%d", modex_component_name, modex_name_id);
|
||||||
|
#else
|
||||||
|
sprintf(modex_name, "%s-%d-%d", modex_component_name, ompi_process_info.my_name.vpid, modex_name_id);
|
||||||
|
#endif
|
||||||
|
|
||||||
if (OMPI_SUCCESS != ompi_modex_recv_string((const char *)modex_name, procs[i], (void**)&modex_buf_ptr, &modex_cur_size)) {
|
if (OMPI_SUCCESS != ompi_modex_recv_string((const char *)modex_name, procs[i], (void**)&modex_buf_ptr, &modex_cur_size)) {
|
||||||
MXM_ERROR("Open MPI couldn't distribute EP connection details");
|
MXM_ERROR("Open MPI couldn't distribute EP connection details");
|
||||||
|
@ -17,7 +17,15 @@ int ompi_mtl_mxm_cancel(struct mca_mtl_base_module_t* mtl,
|
|||||||
mca_mtl_mxm_request_t *mtl_mxm_request = (mca_mtl_mxm_request_t*) mtl_request;
|
mca_mtl_mxm_request_t *mtl_mxm_request = (mca_mtl_mxm_request_t*) mtl_request;
|
||||||
mxm_error_t err;
|
mxm_error_t err;
|
||||||
|
|
||||||
|
#if MXM_API >= MXM_VERSION(2,0)
|
||||||
|
if (mtl_mxm_request->is_send) {
|
||||||
|
err = mxm_req_cancel_send(&mtl_mxm_request->mxm.send);
|
||||||
|
} else {
|
||||||
|
err = mxm_req_cancel_recv(&mtl_mxm_request->mxm.recv);
|
||||||
|
}
|
||||||
|
#else
|
||||||
err = mxm_req_cancel(&mtl_mxm_request->mxm.base);
|
err = mxm_req_cancel(&mtl_mxm_request->mxm.base);
|
||||||
|
#endif
|
||||||
if ((err != MXM_OK) && (err != MXM_ERR_NO_PROGRESS)) {
|
if ((err != MXM_OK) && (err != MXM_ERR_NO_PROGRESS)) {
|
||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
}
|
}
|
||||||
|
@ -50,6 +50,10 @@ static inline __opal_attribute_always_inline__ int
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if MXM_API >= MXM_VERSION(2,0)
|
||||||
|
mtl_mxm_request->is_send = 0;
|
||||||
|
#endif
|
||||||
|
|
||||||
mxm_recv_req->base.state = MXM_REQ_NEW;
|
mxm_recv_req->base.state = MXM_REQ_NEW;
|
||||||
|
|
||||||
mxm_recv_req->base.flags = 0;
|
mxm_recv_req->base.flags = 0;
|
||||||
|
@ -17,10 +17,13 @@
|
|||||||
struct mca_mtl_mxm_request_t {
|
struct mca_mtl_mxm_request_t {
|
||||||
struct mca_mtl_request_t super;
|
struct mca_mtl_request_t super;
|
||||||
union {
|
union {
|
||||||
mxm_req_base_t base;
|
mxm_req_base_t base;
|
||||||
mxm_send_req_t send;
|
mxm_send_req_t send;
|
||||||
mxm_recv_req_t recv;
|
mxm_recv_req_t recv;
|
||||||
} mxm;
|
} mxm;
|
||||||
|
#if MXM_API >= MXM_VERSION(2,0)
|
||||||
|
int is_send;
|
||||||
|
#endif
|
||||||
/* mxm_segment_t mxm_segment[1]; */
|
/* mxm_segment_t mxm_segment[1]; */
|
||||||
void *buf;
|
void *buf;
|
||||||
size_t length;
|
size_t length;
|
||||||
|
@ -162,6 +162,9 @@ int ompi_mtl_mxm_isend(struct mca_mtl_base_module_t* mtl,
|
|||||||
}
|
}
|
||||||
|
|
||||||
mxm_send_req = &mtl_mxm_request->mxm.send;
|
mxm_send_req = &mtl_mxm_request->mxm.send;
|
||||||
|
#if MXM_API >= MXM_VERSION(2,0)
|
||||||
|
mtl_mxm_request->is_send = 1;
|
||||||
|
#endif
|
||||||
|
|
||||||
/* prepare a send request embedded in the MTL request */
|
/* prepare a send request embedded in the MTL request */
|
||||||
mxm_send_req->base.state = MXM_REQ_NEW;
|
mxm_send_req->base.state = MXM_REQ_NEW;
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user