1
1

* Add support for using modex to exchange NID/PID pairs when using Portals4.

Rather than trying to support a bunch of lightweight environments, as I did
  with the Portals3 code, always use the "modex", and hack the grpcomm for
  the SHMEM implementation so that it returns the right nid/pid for a remote
  process by "magic" (it synthesizes nid 0 and pid = the process's vpid
  instead of exchanging any data).

This commit was SVN r24733.
Этот коммит содержится в:
Brian Barrett 2011-05-25 22:10:27 +00:00
родитель 81b6c50daa
Коммит beb1bc70b2
3 изменённых файлов: 86 добавлений и 31 удалений

Просмотреть файл

@ -20,16 +20,18 @@
#include "ompi_config.h"
#include <portals4.h>
#include <portals4_runtime.h>
#include "ompi/mca/mtl/mtl.h"
#include "opal/class/opal_list.h"
#include "ompi/runtime/ompi_module_exchange.h"
#include "mtl_portals4.h"
#include "mtl_portals4_endpoint.h"
#include "mtl_portals4_request.h"
#include "mtl_portals4_recv_short.h"
extern mca_mtl_base_component_2_0_0_t mca_mtl_portals4_component;
mca_mtl_portals4_module_t ompi_mtl_portals4 = {
{
8191, /* max cid - 2^13 - 1 */
@ -61,7 +63,6 @@ ompi_mtl_portals4_add_procs(struct mca_mtl_base_module_t *mtl,
ptl_md_t md;
ptl_me_t me;
size_t i;
struct runtime_proc_t *ptlprocs;
int nptlprocs;
ptl_pt_index_t pt;
@ -169,19 +170,10 @@ ompi_mtl_portals4_add_procs(struct mca_mtl_base_module_t *mtl,
opal_progress_register(ompi_mtl_portals4_progress);
/* Get the list of ptl_process_id_t from the runtime and copy into structure */
nptlprocs = runtime_get_nidpid_map(&ptlprocs);
if ((size_t)nptlprocs != nprocs) {
PtlMEUnlink(ompi_mtl_portals4.long_overflow_me_h);
PtlMDRelease(ompi_mtl_portals4.zero_md_h);
PtlPTFree(ompi_mtl_portals4.ni_h, PTL_READ_TABLE_ID);
PtlPTFree(ompi_mtl_portals4.ni_h, PTL_SEND_TABLE_ID);
PtlEQFree(ompi_mtl_portals4.eq_h);
opal_output(ompi_mtl_base_output,
"%s:%d: nptlprocs != nprocs: %d\n",
__FILE__, __LINE__, ret);
return OMPI_ERR_NOT_SUPPORTED;
}
for (i = 0 ; i < nprocs ; ++i) {
ptl_process_t *id;
size_t size;
mtl_peer_data[i] = malloc(sizeof(struct mca_mtl_base_endpoint_t));
if (NULL == mtl_peer_data[i]) {
PtlMEUnlink(ompi_mtl_portals4.long_overflow_me_h);
@ -194,9 +186,22 @@ ompi_mtl_portals4_add_procs(struct mca_mtl_base_module_t *mtl,
__FILE__, __LINE__, ret);
return OMPI_ERROR;
}
ret = ompi_modex_recv(&mca_mtl_portals4_component.mtl_version,
procs[i], (void**) &id, &size);
if (OMPI_SUCCESS != ret) {
opal_output(ompi_mtl_base_output,
"%s:%d: ompi_modex_recv failed: %d\n",
__FILE__, __LINE__, ret);
return ret;
} else if (sizeof(ptl_process_t) != size) {
opal_output(ompi_mtl_base_output,
"%s:%d: ompi_modex_recv failed: %d\n",
__FILE__, __LINE__, ret);
return ret;
}
mtl_peer_data[i]->ptl_proc.phys.nid = ptlprocs[i].nid;
mtl_peer_data[i]->ptl_proc.phys.pid = ptlprocs[i].pid;
mtl_peer_data[i]->ptl_proc = *id;
}
ompi_mtl_portals4.send_count = malloc(nptlprocs * sizeof(uint64_t));

Просмотреть файл

@ -22,6 +22,7 @@
#include "opal/mca/event/event.h"
#include "opal/util/output.h"
#include "opal/mca/base/mca_base_param.h"
#include "ompi/runtime/ompi_module_exchange.h"
#include "mtl_portals4.h"
#include "mtl_portals4_request.h"
@ -61,7 +62,7 @@ mca_mtl_base_component_2_0_0_t mca_mtl_portals4_component = {
static int
ompi_mtl_portals4_component_open(void)
{
int tmp;
int tmp, ret;
ompi_mtl_portals4.base.mtl_request_size =
sizeof(ompi_mtl_portals4_request_t) -
@ -101,9 +102,11 @@ ompi_mtl_portals4_component_open(void)
1024,
&ompi_mtl_portals4.queue_size);
ompi_mtl_portals4.protocol = eager;
ompi_mtl_portals4.ni_h = PTL_INVALID_HANDLE;
return ompi_mtl_portals4_get_error(PtlInit());
ret = PtlInit();
return ompi_mtl_portals4_get_error(ret);
}
@ -118,19 +121,41 @@ static mca_mtl_base_module_t*
ompi_mtl_portals4_component_init(bool enable_progress_threads,
bool enable_mpi_threads)
{
/*
 * Component init: bring up the Portals4 network interface and publish
 * this process's Portals id through the modex.
 *
 * Returns &ompi_mtl_portals4.base on success, or NULL if PtlNIInit,
 * PtlGetId or ompi_modex_send fails (each failure is logged first).
 * Neither enable_progress_threads nor enable_mpi_threads is read here.
 */
/*
 * NOTE(review): the next nine lines (PtlNIInit wrapped directly in an
 * "if") look like the pre-change form of the call that is repeated just
 * below with its result stored in ret.  This view carries no +/- diff
 * markers, so confirm against the repository before treating both
 * forms as live code.
 */
if (PTL_OK != PtlNIInit(PTL_IFACE_DEFAULT,
PTL_NI_PHYSICAL | PTL_NI_MATCHING,
PTL_PID_ANY,
NULL,
NULL,
0,
NULL,
NULL,
&ompi_mtl_portals4.ni_h)) {
ptl_process_t id;
int ret;
/* Open the default interface as a physical, matching NI with any pid;
   the resulting handle is stored in ompi_mtl_portals4.ni_h. */
ret = PtlNIInit(PTL_IFACE_DEFAULT,
PTL_NI_PHYSICAL | PTL_NI_MATCHING,
PTL_PID_ANY,
NULL,
NULL,
0,
NULL,
NULL,
&ompi_mtl_portals4.ni_h);
if (PTL_OK != ret) {
opal_output(ompi_mtl_base_output,
"%s:%d: PtlNIInit failed: %d\n",
__FILE__, __LINE__, ret);
return NULL;
}
/* Select the rndv protocol for this module. */
ompi_mtl_portals4.protocol = rndv;
/* Ask Portals for this process's own id on the NI just opened. */
ret = PtlGetId(ompi_mtl_portals4.ni_h, &id);
if (PTL_OK != ret) {
opal_output(ompi_mtl_base_output,
"%s:%d: PtlGetId failed: %d\n",
__FILE__, __LINE__, ret);
return NULL;
}
/* Publish the id under this component's version key so that peers can
   fetch it with ompi_modex_recv. */
ret = ompi_modex_send(&mca_mtl_portals4_component.mtl_version,
&id, sizeof(id));
if (OMPI_SUCCESS != ret) {
opal_output(ompi_mtl_base_output,
"%s:%d: ompi_modex_send failed: %d\n",
__FILE__, __LINE__, ret);
return NULL;
}
return &ompi_mtl_portals4.base;
}
@ -169,9 +194,6 @@ ompi_mtl_portals4_get_error(int ptl_error)
case PTL_LIST_TOO_LONG:
ret = OMPI_ERR_OUT_OF_RESOURCE;
break;
case PTL_NI_NOT_LOGICAL:
ret = OMPI_ERR_FATAL;
break;
case PTL_NO_INIT:
ret = OMPI_ERR_FATAL;
break;

Просмотреть файл

@ -77,6 +77,10 @@ orte_grpcomm_base_module_t orte_grpcomm_portals4_shmem_module = {
purge_proc_attrs
};
/* File-scope state for the Portals4 SHMEM grpcomm module. */
/* NOTE(review): presumably the number of entries in map -- uses are not
   visible in this hunk; confirm. */
static int nprocs;
/* NOTE(review): presumably the nid/pid table obtained from the Portals
   runtime (same element type as used with runtime_get_nidpid_map
   elsewhere) -- confirm where it is filled. */
static struct runtime_proc_t *map;
/* NOTE(review): meaning not visible in this hunk -- confirm before use. */
static int is_logical;
/**
* Init the module
*/
@ -148,6 +152,13 @@ static int set_proc_attr(const char *attr_name,
                         const void *data,
                         size_t size)
{
    /*
     * Accept a modex attribute from the local process.
     *
     * Nothing is stored: the call only validates.  An attribute whose
     * name begins with "mtl.portals4" must be exactly one ptl_process_t
     * in size, otherwise ORTE_ERR_NOT_IMPLEMENTED is returned.  Every
     * other case returns ORTE_SUCCESS.  The data pointer is not read.
     */
    static const char portals_key[] = "mtl.portals4";
    const int is_portals_modex =
        (0 == strncmp(attr_name, portals_key, sizeof(portals_key) - 1));

    /* special case for Portals MTL modex: only the size is checked */
    if (is_portals_modex && sizeof(ptl_process_t) != size) {
        return ORTE_ERR_NOT_IMPLEMENTED;
    }

    return ORTE_SUCCESS;
}
@ -155,6 +166,23 @@ static int get_proc_attr(const orte_process_name_t proc,
                         const char * attribute_name, void **val,
                         size_t *size)
{
    /*
     * Modex lookup for a peer process.
     *
     * Only attributes whose name begins with "mtl.portals4" are answered.
     * For those, a ptl_process_t is heap-allocated, filled with nid 0 and
     * pid equal to proc.vpid, and handed back through *val, with its size
     * stored in *size; ORTE_SUCCESS is returned.  If the allocation
     * fails, ORTE_ERR_OUT_OF_RESOURCE is returned and *val / *size are
     * left untouched.  Any other attribute name yields
     * ORTE_ERR_NOT_IMPLEMENTED.
     */
    ptl_process_t *id;

    /* special case for Portals MTL modex */
    if (0 == strncmp(attribute_name, "mtl.portals4", strlen("mtl.portals4"))) {
        id = malloc(sizeof(*id));
        if (NULL == id) {
            /* never write through a failed allocation */
            return ORTE_ERR_OUT_OF_RESOURCE;
        }

        /* proc name and nid / pid match somewhat in shmem code */
        id->phys.nid = 0;
        id->phys.pid = proc.vpid;

        *val = id;
        *size = sizeof(ptl_process_t);
        return ORTE_SUCCESS;
    }

    return ORTE_ERR_NOT_IMPLEMENTED;
}