MTL MXM: push commit r27987 back, now with right user.
r27987 - MTL MXM: ver. 2.0 interface changes. This commit was SVN r28026. The following SVN revision numbers were found above: r27987 --> open-mpi/ompi@2735658d81
Этот коммит содержится в:
родитель
aa5e436479
Коммит
21b170b43b
@ -98,14 +98,14 @@ static int ompi_mtl_mxm_get_ep_address(ompi_mtl_mxm_ep_conn_info_t *ep_info, mxm
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
#else
|
||||
static int ompi_mtl_mxm_get_ep_address(ompi_mtl_mxm_ep_conn_info_t *ep_info,
|
||||
static int ompi_mtl_mxm_get_ep_address(ompi_mtl_mxm_ep_conn_info_t *ep_info, int dest_rank,
|
||||
mxm_domain_id_t domain)
|
||||
{
|
||||
size_t addrlen;
|
||||
mxm_error_t err;
|
||||
|
||||
addrlen = sizeof(ep_info->dest_addr[domain]);
|
||||
err = mxm_ep_address(ompi_mtl_mxm.ep, domain,
|
||||
err = mxm_ep_address(ompi_mtl_mxm.ep, domain, dest_rank,
|
||||
(struct sockaddr *) &ep_info->dest_addr[domain], &addrlen);
|
||||
if (MXM_OK == err) {
|
||||
ep_info->domain_bitmap |= MXM_BIT(domain);
|
||||
@ -123,7 +123,11 @@ static int ompi_mtl_mxm_get_ep_address(ompi_mtl_mxm_ep_conn_info_t *ep_info,
|
||||
#define max(a,b) ((a)>(b)?(a):(b))
|
||||
|
||||
static mxm_error_t ompi_mtl_mxm_create_ep(mxm_h ctx, mxm_ep_h *ep, unsigned ptl_bitmap, int lr,
|
||||
uint32_t jobid, uint64_t mxlr, int nlps) {
|
||||
uint32_t jobid, uint64_t mxlr, int nlps
|
||||
#if MXM_API >= MXM_VERSION(2, 0)
|
||||
, int totps
|
||||
#endif
|
||||
) {
|
||||
mxm_error_t err;
|
||||
|
||||
#if MXM_API < MXM_VERSION(1,5)
|
||||
@ -169,10 +173,15 @@ static mxm_error_t ompi_mtl_mxm_create_ep(mxm_h ctx, mxm_ep_h *ep, unsigned ptl_
|
||||
return err;
|
||||
}
|
||||
|
||||
#if MXM_API >= MXM_VERSION(2, 0)
|
||||
err = mxm_ep_create(ctx, ep_opts, ep, totps);
|
||||
#else
|
||||
ep_opts->job_id = jobid;
|
||||
ep_opts->local_rank = lr;
|
||||
ep_opts->num_local_procs = nlps;
|
||||
err = mxm_ep_create(ctx, ep_opts, ep);
|
||||
#endif
|
||||
|
||||
mxm_config_free(ep_opts);
|
||||
#endif
|
||||
return err;
|
||||
@ -190,9 +199,92 @@ static void ompi_mtl_mxm_set_conn_req(mxm_conn_req_t *conn_req, ompi_mtl_mxm_ep_
|
||||
}
|
||||
#endif
|
||||
|
||||
#if MXM_API >= MXM_VERSION(2,0)
|
||||
#define MTL_MXM_MODEX_MAX_SIZE ((size_t)0x60)
|
||||
static int ompi_mtl_mxm_send_ep_address(ompi_mtl_mxm_ep_conn_info_t *ep_info, int totps)
|
||||
{
|
||||
int rc, dest;
|
||||
|
||||
mca_mtl_base_component_2_0_0_t *cm = &mca_mtl_mxm_component.super;
|
||||
char *modex_key, *mxm_version = mca_base_component_to_string(&cm->mtl_version);
|
||||
|
||||
/* Rough approximation of the next string length: mxm_version-dest_rank-portion_num */
|
||||
modex_key = malloc(strlen(mxm_version) + 8 * sizeof(int) + 8 * sizeof(size_t) + 2);
|
||||
if (NULL == modex_key) {
|
||||
MXM_ERROR("Cannot allocate memory.");
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Send information using modex (in some case there is limitation on data size for example ess/pmi)
|
||||
* set size of data sent for once
|
||||
*/
|
||||
for (dest = 0; dest < totps; ++dest) {
|
||||
/*
|
||||
* Get address for each PTL on this endpoint, and share it with other ranks.
|
||||
*/
|
||||
int modex_name_id = 0;
|
||||
|
||||
size_t modex_cur_size, modex_buf_size = sizeof(*ep_info);
|
||||
unsigned char *modex_buf_ptr = (unsigned char *) ep_info;
|
||||
|
||||
modex_cur_size = modex_buf_size < MTL_MXM_MODEX_MAX_SIZE ?
|
||||
modex_buf_size : MTL_MXM_MODEX_MAX_SIZE;
|
||||
|
||||
ep_info->domain_bitmap = 0;
|
||||
|
||||
rc = ompi_mtl_mxm_get_ep_address(ep_info, dest, MXM_DOMAIN_SELF);
|
||||
if (OMPI_SUCCESS != rc) {
|
||||
MXM_ERROR("Failed to get endpoint address: for domain SELF dest %d.", dest);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
rc = ompi_mtl_mxm_get_ep_address(ep_info, dest, MXM_DOMAIN_SHM);
|
||||
if (OMPI_SUCCESS != rc) {
|
||||
MXM_ERROR("Failed to get endpoint address: for domain SHM dest %d.", dest);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
rc = ompi_mtl_mxm_get_ep_address(ep_info, dest, MXM_DOMAIN_IB);
|
||||
if (OMPI_SUCCESS != rc) {
|
||||
MXM_ERROR("Failed to get endpoint address: for domain IB dest %d.", dest);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
while (modex_buf_size) {
|
||||
/* Modex key looks as mtl.mxm.1.5-1-18 where mtl.mxm.1.5 is the component,
|
||||
1 is a destination rank and 18 is a portion index */
|
||||
sprintf(modex_key, "%s-%d-%d", mxm_version, dest, modex_name_id);
|
||||
|
||||
rc = ompi_modex_send_string((const char *) modex_key, modex_buf_ptr, modex_cur_size);
|
||||
if (OMPI_SUCCESS != rc) {
|
||||
MXM_ERROR("Open MPI couldn't distribute EP connection details");
|
||||
|
||||
free(modex_key);
|
||||
free(mxm_version);
|
||||
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
++modex_name_id;
|
||||
|
||||
modex_buf_ptr += modex_cur_size;
|
||||
modex_buf_size -= modex_cur_size;
|
||||
|
||||
modex_cur_size = modex_buf_size < MTL_MXM_MODEX_MAX_SIZE ?
|
||||
modex_buf_size : MTL_MXM_MODEX_MAX_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
free(modex_key);
|
||||
free(mxm_version);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
#endif
|
||||
|
||||
int ompi_mtl_mxm_module_init(void)
|
||||
{
|
||||
|
||||
ompi_mtl_mxm_ep_conn_info_t ep_info;
|
||||
mxm_error_t err;
|
||||
uint32_t jobid;
|
||||
@ -248,7 +340,11 @@ int ompi_mtl_mxm_module_init(void)
|
||||
|
||||
/* Open MXM endpoint */
|
||||
err = ompi_mtl_mxm_create_ep(ompi_mtl_mxm.mxm_context, &ompi_mtl_mxm.ep,
|
||||
ptl_bitmap, lr, jobid, mxlr, nlps);
|
||||
ptl_bitmap, lr, jobid, mxlr, nlps
|
||||
#if MXM_API >= MXM_VERSION(2, 0)
|
||||
, totps
|
||||
#endif
|
||||
);
|
||||
|
||||
if (MXM_OK != err) {
|
||||
ompi_show_help("help-mtl-mxm.txt", "unable to create endpoint", true,
|
||||
@ -272,18 +368,6 @@ int ompi_mtl_mxm_module_init(void)
|
||||
OMPI_SUCCESS != ompi_mtl_mxm_get_ep_address(&ep_info, MXM_PTL_SHM)) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
#else
|
||||
ep_info.domain_bitmap = 0;
|
||||
if (OMPI_SUCCESS != ompi_mtl_mxm_get_ep_address(&ep_info, MXM_DOMAIN_SELF)) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
if (OMPI_SUCCESS != ompi_mtl_mxm_get_ep_address(&ep_info, MXM_DOMAIN_SHM)) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
if (OMPI_SUCCESS != ompi_mtl_mxm_get_ep_address(&ep_info, MXM_DOMAIN_IB)) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* send information using modex (in some case there is limitation on data size for example ess/pmi)
|
||||
@ -316,6 +400,16 @@ int ompi_mtl_mxm_module_init(void)
|
||||
free(modex_component_name);
|
||||
free(modex_name);
|
||||
}
|
||||
#else
|
||||
{
|
||||
int rc;
|
||||
rc = ompi_mtl_mxm_send_ep_address(&ep_info, totps);
|
||||
if (OMPI_SUCCESS != rc) {
|
||||
MXM_ERROR("Modex session failed.");
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Register the MXM progress function */
|
||||
opal_progress_register(ompi_mtl_mxm_progress);
|
||||
@ -363,12 +457,20 @@ int ompi_mtl_mxm_add_procs(struct mca_mtl_base_module_t *mtl, size_t nprocs,
|
||||
size_t modex_buf_size = sizeof(ompi_mtl_mxm_ep_conn_info_t);
|
||||
size_t modex_cur_size = 0;
|
||||
char *modex_component_name = mca_base_component_to_string(&mca_mtl_mxm_component.super.mtl_version);
|
||||
char *modex_name = malloc(strlen(modex_component_name) + 5);
|
||||
int modex_name_id = 0;
|
||||
#if MXM_API < MXM_VERSION(2,0)
|
||||
char *modex_name = malloc(strlen(modex_component_name) + 5);
|
||||
#else
|
||||
char *modex_name = malloc(strlen(modex_component_name) + 8 * sizeof(int) + 8 * sizeof(size_t) + 2);
|
||||
#endif
|
||||
|
||||
while (modex_buf_size > 0) {
|
||||
/* modex name looks as mtl.mxm.1.5-18 where mtl.mxm.1.5 is the component and 18 is portion index */
|
||||
#if MXM_API < MXM_VERSION(2,0)
|
||||
sprintf(modex_name, "%s-%d", modex_component_name, modex_name_id);
|
||||
#else
|
||||
sprintf(modex_name, "%s-%d-%d", modex_component_name, ompi_process_info.my_name.vpid, modex_name_id);
|
||||
#endif
|
||||
|
||||
if (OMPI_SUCCESS != ompi_modex_recv_string((const char *)modex_name, procs[i], (void**)&modex_buf_ptr, &modex_cur_size)) {
|
||||
MXM_ERROR("Open MPI couldn't distribute EP connection details");
|
||||
|
@ -17,7 +17,15 @@ int ompi_mtl_mxm_cancel(struct mca_mtl_base_module_t* mtl,
|
||||
mca_mtl_mxm_request_t *mtl_mxm_request = (mca_mtl_mxm_request_t*) mtl_request;
|
||||
mxm_error_t err;
|
||||
|
||||
#if MXM_API >= MXM_VERSION(2,0)
|
||||
if (mtl_mxm_request->is_send) {
|
||||
err = mxm_req_cancel_send(&mtl_mxm_request->mxm.send);
|
||||
} else {
|
||||
err = mxm_req_cancel_recv(&mtl_mxm_request->mxm.recv);
|
||||
}
|
||||
#else
|
||||
err = mxm_req_cancel(&mtl_mxm_request->mxm.base);
|
||||
#endif
|
||||
if ((err != MXM_OK) && (err != MXM_ERR_NO_PROGRESS)) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
@ -50,6 +50,10 @@ static inline __opal_attribute_always_inline__ int
|
||||
return ret;
|
||||
}
|
||||
|
||||
#if MXM_API >= MXM_VERSION(2,0)
|
||||
mtl_mxm_request->is_send = 0;
|
||||
#endif
|
||||
|
||||
mxm_recv_req->base.state = MXM_REQ_NEW;
|
||||
|
||||
mxm_recv_req->base.flags = 0;
|
||||
|
@ -21,6 +21,9 @@ struct mca_mtl_mxm_request_t {
|
||||
mxm_send_req_t send;
|
||||
mxm_recv_req_t recv;
|
||||
} mxm;
|
||||
#if MXM_API >= MXM_VERSION(2,0)
|
||||
int is_send;
|
||||
#endif
|
||||
/* mxm_segment_t mxm_segment[1]; */
|
||||
void *buf;
|
||||
size_t length;
|
||||
|
@ -162,6 +162,9 @@ int ompi_mtl_mxm_isend(struct mca_mtl_base_module_t* mtl,
|
||||
}
|
||||
|
||||
mxm_send_req = &mtl_mxm_request->mxm.send;
|
||||
#if MXM_API >= MXM_VERSION(2,0)
|
||||
mtl_mxm_request->is_send = 1;
|
||||
#endif
|
||||
|
||||
/* prepare a send request embedded in the MTL request */
|
||||
mxm_send_req->base.state = MXM_REQ_NEW;
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user