* Add support for using modex to exchange NID/PID pairs when using Portals4.
Rather than trying to support a bunch of lightweight runtime environments, as I did with the Portals3 code, always use the "modex" to exchange process identifiers, and hack the grpcomm component for the SHMEM implementation so that it returns the right NID/PID for a remote process by "magic". This commit was SVN r24733.
Этот коммит содержится в:
родитель
81b6c50daa
Коммит
beb1bc70b2
@ -20,16 +20,18 @@
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include <portals4.h>
|
||||
#include <portals4_runtime.h>
|
||||
|
||||
#include "ompi/mca/mtl/mtl.h"
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "ompi/runtime/ompi_module_exchange.h"
|
||||
|
||||
#include "mtl_portals4.h"
|
||||
#include "mtl_portals4_endpoint.h"
|
||||
#include "mtl_portals4_request.h"
|
||||
#include "mtl_portals4_recv_short.h"
|
||||
|
||||
extern mca_mtl_base_component_2_0_0_t mca_mtl_portals4_component;
|
||||
|
||||
mca_mtl_portals4_module_t ompi_mtl_portals4 = {
|
||||
{
|
||||
8191, /* max cid - 2^13 - 1 */
|
||||
@ -61,7 +63,6 @@ ompi_mtl_portals4_add_procs(struct mca_mtl_base_module_t *mtl,
|
||||
ptl_md_t md;
|
||||
ptl_me_t me;
|
||||
size_t i;
|
||||
struct runtime_proc_t *ptlprocs;
|
||||
int nptlprocs;
|
||||
ptl_pt_index_t pt;
|
||||
|
||||
@ -169,19 +170,10 @@ ompi_mtl_portals4_add_procs(struct mca_mtl_base_module_t *mtl,
|
||||
opal_progress_register(ompi_mtl_portals4_progress);
|
||||
|
||||
/* Get the list of ptl_process_id_t from the runtime and copy into structure */
|
||||
nptlprocs = runtime_get_nidpid_map(&ptlprocs);
|
||||
if ((size_t)nptlprocs != nprocs) {
|
||||
PtlMEUnlink(ompi_mtl_portals4.long_overflow_me_h);
|
||||
PtlMDRelease(ompi_mtl_portals4.zero_md_h);
|
||||
PtlPTFree(ompi_mtl_portals4.ni_h, PTL_READ_TABLE_ID);
|
||||
PtlPTFree(ompi_mtl_portals4.ni_h, PTL_SEND_TABLE_ID);
|
||||
PtlEQFree(ompi_mtl_portals4.eq_h);
|
||||
opal_output(ompi_mtl_base_output,
|
||||
"%s:%d: nptlprocs != nprocs: %d\n",
|
||||
__FILE__, __LINE__, ret);
|
||||
return OMPI_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
for (i = 0 ; i < nprocs ; ++i) {
|
||||
ptl_process_t *id;
|
||||
size_t size;
|
||||
|
||||
mtl_peer_data[i] = malloc(sizeof(struct mca_mtl_base_endpoint_t));
|
||||
if (NULL == mtl_peer_data[i]) {
|
||||
PtlMEUnlink(ompi_mtl_portals4.long_overflow_me_h);
|
||||
@ -195,8 +187,21 @@ ompi_mtl_portals4_add_procs(struct mca_mtl_base_module_t *mtl,
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
mtl_peer_data[i]->ptl_proc.phys.nid = ptlprocs[i].nid;
|
||||
mtl_peer_data[i]->ptl_proc.phys.pid = ptlprocs[i].pid;
|
||||
ret = ompi_modex_recv(&mca_mtl_portals4_component.mtl_version,
|
||||
procs[i], (void**) &id, &size);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
opal_output(ompi_mtl_base_output,
|
||||
"%s:%d: ompi_modex_recv failed: %d\n",
|
||||
__FILE__, __LINE__, ret);
|
||||
return ret;
|
||||
} else if (sizeof(ptl_process_t) != size) {
|
||||
opal_output(ompi_mtl_base_output,
|
||||
"%s:%d: ompi_modex_recv failed: %d\n",
|
||||
__FILE__, __LINE__, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
mtl_peer_data[i]->ptl_proc = *id;
|
||||
}
|
||||
|
||||
ompi_mtl_portals4.send_count = malloc(nptlprocs * sizeof(uint64_t));
|
||||
|
@ -22,6 +22,7 @@
|
||||
#include "opal/mca/event/event.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "ompi/runtime/ompi_module_exchange.h"
|
||||
|
||||
#include "mtl_portals4.h"
|
||||
#include "mtl_portals4_request.h"
|
||||
@ -61,7 +62,7 @@ mca_mtl_base_component_2_0_0_t mca_mtl_portals4_component = {
|
||||
static int
|
||||
ompi_mtl_portals4_component_open(void)
|
||||
{
|
||||
int tmp;
|
||||
int tmp, ret;
|
||||
|
||||
ompi_mtl_portals4.base.mtl_request_size =
|
||||
sizeof(ompi_mtl_portals4_request_t) -
|
||||
@ -101,9 +102,11 @@ ompi_mtl_portals4_component_open(void)
|
||||
1024,
|
||||
&ompi_mtl_portals4.queue_size);
|
||||
|
||||
ompi_mtl_portals4.protocol = eager;
|
||||
ompi_mtl_portals4.ni_h = PTL_INVALID_HANDLE;
|
||||
|
||||
return ompi_mtl_portals4_get_error(PtlInit());
|
||||
ret = PtlInit();
|
||||
return ompi_mtl_portals4_get_error(ret);
|
||||
}
|
||||
|
||||
|
||||
@ -118,7 +121,10 @@ static mca_mtl_base_module_t*
|
||||
ompi_mtl_portals4_component_init(bool enable_progress_threads,
|
||||
bool enable_mpi_threads)
|
||||
{
|
||||
if (PTL_OK != PtlNIInit(PTL_IFACE_DEFAULT,
|
||||
ptl_process_t id;
|
||||
int ret;
|
||||
|
||||
ret = PtlNIInit(PTL_IFACE_DEFAULT,
|
||||
PTL_NI_PHYSICAL | PTL_NI_MATCHING,
|
||||
PTL_PID_ANY,
|
||||
NULL,
|
||||
@ -126,11 +132,30 @@ ompi_mtl_portals4_component_init(bool enable_progress_threads,
|
||||
0,
|
||||
NULL,
|
||||
NULL,
|
||||
&ompi_mtl_portals4.ni_h)) {
|
||||
&ompi_mtl_portals4.ni_h);
|
||||
if (PTL_OK != ret) {
|
||||
opal_output(ompi_mtl_base_output,
|
||||
"%s:%d: PtlNIInit failed: %d\n",
|
||||
__FILE__, __LINE__, ret);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ompi_mtl_portals4.protocol = rndv;
|
||||
ret = PtlGetId(ompi_mtl_portals4.ni_h, &id);
|
||||
if (PTL_OK != ret) {
|
||||
opal_output(ompi_mtl_base_output,
|
||||
"%s:%d: PtlGetId failed: %d\n",
|
||||
__FILE__, __LINE__, ret);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ret = ompi_modex_send(&mca_mtl_portals4_component.mtl_version,
|
||||
&id, sizeof(id));
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
opal_output(ompi_mtl_base_output,
|
||||
"%s:%d: ompi_modex_send failed: %d\n",
|
||||
__FILE__, __LINE__, ret);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return &ompi_mtl_portals4.base;
|
||||
}
|
||||
@ -169,9 +194,6 @@ ompi_mtl_portals4_get_error(int ptl_error)
|
||||
case PTL_LIST_TOO_LONG:
|
||||
ret = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
break;
|
||||
case PTL_NI_NOT_LOGICAL:
|
||||
ret = OMPI_ERR_FATAL;
|
||||
break;
|
||||
case PTL_NO_INIT:
|
||||
ret = OMPI_ERR_FATAL;
|
||||
break;
|
||||
|
@ -77,6 +77,10 @@ orte_grpcomm_base_module_t orte_grpcomm_portals4_shmem_module = {
|
||||
purge_proc_attrs
|
||||
};
|
||||
|
||||
static int nprocs;
|
||||
static struct runtime_proc_t *map;
|
||||
static int is_logical;
|
||||
|
||||
/**
|
||||
* Init the module
|
||||
*/
|
||||
@ -148,6 +152,13 @@ static int set_proc_attr(const char *attr_name,
|
||||
const void *data,
|
||||
size_t size)
|
||||
{
|
||||
/* special case for Portals MTL modex */
|
||||
if (0 == strncmp(attr_name, "mtl.portals4", strlen("mtl.portals4"))) {
|
||||
if (size != sizeof(ptl_process_t)) {
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
@ -155,6 +166,23 @@ static int get_proc_attr(const orte_process_name_t proc,
|
||||
const char * attribute_name, void **val,
|
||||
size_t *size)
|
||||
{
|
||||
ptl_process_t *id;
|
||||
|
||||
/* special case for Portals MTL modex */
|
||||
if (0 == strncmp(attribute_name, "mtl.portals4", strlen("mtl.portals4"))) {
|
||||
id = malloc(sizeof(ptl_process_t));
|
||||
|
||||
/* proc name and nid / pid match somewhat in shmem code */
|
||||
id->phys.nid = 0;
|
||||
id->phys.pid = proc.vpid;
|
||||
|
||||
*val = id;
|
||||
*size = sizeof(ptl_process_t);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user