Modify the base collective algorithms to take an array of arbitrary vpids instead of assuming everything is ordered in a particular way. Modify the hier grpcomm module to support arbitrary mappings.
This commit was SVN r20599.
Этот коммит содержится в:
родитель
6151f7b60c
Коммит
8359477387
@ -86,7 +86,7 @@ ORTE_DECLSPEC void orte_grpcomm_base_coll_recv(int status, orte_process_name_t*
|
||||
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
||||
void* cbdata);
|
||||
ORTE_DECLSPEC int orte_grpcomm_base_allgather(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_entries,
|
||||
orte_jobid_t jobid, orte_vpid_t np, orte_vpid_t step);
|
||||
orte_jobid_t jobid, orte_vpid_t np, orte_vpid_t *vpids);
|
||||
|
||||
|
||||
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
||||
|
@ -53,11 +53,11 @@
|
||||
|
||||
/**** AVAILABLE ALGORITHMS ****/
|
||||
static int twoproc(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_entries,
|
||||
orte_jobid_t jobid, orte_vpid_t step);
|
||||
orte_jobid_t jobid, orte_vpid_t *vpids);
|
||||
static int bruck(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_entries,
|
||||
orte_jobid_t jobid, orte_vpid_t np, orte_vpid_t step);
|
||||
orte_jobid_t jobid, orte_vpid_t np, orte_vpid_t *vpids);
|
||||
static int recursivedoubling(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_entries,
|
||||
orte_jobid_t jobid, orte_vpid_t np, orte_vpid_t step);
|
||||
orte_jobid_t jobid, orte_vpid_t np, orte_vpid_t *vpids);
|
||||
|
||||
/**** LOCAL VARIABLES USED IN COLLECTIVES ****/
|
||||
static int num_recvd;
|
||||
@ -105,15 +105,15 @@ void orte_grpcomm_base_coll_recv(int status, orte_process_name_t* sender,
|
||||
* Switchyard for selecting the collective algorithm to use
|
||||
*/
|
||||
int orte_grpcomm_base_allgather(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_entries,
|
||||
orte_jobid_t jobid, orte_vpid_t np, orte_vpid_t step)
|
||||
orte_jobid_t jobid, orte_vpid_t np, orte_vpid_t *vpids)
|
||||
{
|
||||
bool has_one;
|
||||
orte_vpid_t n;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
|
||||
"%s grpcomm:coll:allgather called with %d entries np %d step %d",
|
||||
"%s grpcomm:coll:allgather called with %d entries np %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
num_entries, (int)np, (int)step));
|
||||
num_entries, (int)np));
|
||||
|
||||
/* if we only have one proc participating, just copy the data across and return */
|
||||
if (1 == np) {
|
||||
@ -123,7 +123,7 @@ int orte_grpcomm_base_allgather(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf,
|
||||
|
||||
if (2 == np) {
|
||||
/* only two procs in collective */
|
||||
return twoproc(sendbuf, recvbuf, num_entries, jobid, step);
|
||||
return twoproc(sendbuf, recvbuf, num_entries, jobid, vpids);
|
||||
}
|
||||
|
||||
/* if we have power of 2 participants, use recursive doubling - otherwise,
|
||||
@ -134,14 +134,14 @@ int orte_grpcomm_base_allgather(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf,
|
||||
for ( ; n > 0; n >>= 1) {
|
||||
if (n & 0x1) {
|
||||
if (has_one) {
|
||||
return bruck(sendbuf, recvbuf, num_entries, jobid, np, step);
|
||||
return bruck(sendbuf, recvbuf, num_entries, jobid, np, vpids);
|
||||
}
|
||||
has_one = true;
|
||||
}
|
||||
}
|
||||
|
||||
/* must be power of two! */
|
||||
return recursivedoubling(sendbuf, recvbuf, num_entries, jobid, np, step);
|
||||
return recursivedoubling(sendbuf, recvbuf, num_entries, jobid, np, vpids);
|
||||
}
|
||||
|
||||
|
||||
@ -152,7 +152,7 @@ int orte_grpcomm_base_allgather(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf,
|
||||
* Zero adds its data to message, sends result back to one
|
||||
*/
|
||||
static int twoproc(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_entries,
|
||||
orte_jobid_t jobid, orte_vpid_t step)
|
||||
orte_jobid_t jobid, orte_vpid_t *vpids)
|
||||
{
|
||||
orte_process_name_t peer;
|
||||
int32_t num_remote, cnt;
|
||||
@ -165,9 +165,9 @@ static int twoproc(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_e
|
||||
"%s grpcomm:coll:two-proc algo employed",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
if (0 == ORTE_PROC_MY_NAME->vpid) {
|
||||
if (vpids[0] == ORTE_PROC_MY_NAME->vpid) {
|
||||
/* I send first */
|
||||
peer.vpid = step;
|
||||
peer.vpid = vpids[1];
|
||||
/* setup a temp buffer so I can inform the other side as to the
|
||||
* number of entries in my buffer
|
||||
*/
|
||||
@ -222,7 +222,7 @@ static int twoproc(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_e
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||
opal_dss.pack(&buf, &num_entries, 1, OPAL_INT32);
|
||||
opal_dss.copy_payload(&buf, sendbuf);
|
||||
peer.vpid = 0;
|
||||
peer.vpid = vpids[0];
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
|
||||
"%s grpcomm:coll:two-proc sending to %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -264,9 +264,9 @@ static int twoproc(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_e
|
||||
* ompi/mca/coll/tuned/coll_tuned_allgather.c
|
||||
*/
|
||||
static int bruck(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_entries,
|
||||
orte_jobid_t jobid, orte_vpid_t np, orte_vpid_t step)
|
||||
orte_jobid_t jobid, orte_vpid_t np, orte_vpid_t *vpids)
|
||||
{
|
||||
orte_vpid_t rank, distance, stp;
|
||||
orte_vpid_t rank, distance, nv;
|
||||
orte_process_name_t peer;
|
||||
int32_t num_remote, total_entries, cnt;
|
||||
opal_buffer_t collection, buf;
|
||||
@ -292,16 +292,28 @@ static int bruck(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_ent
|
||||
- sends message containing all data collected so far to rank r - distance
|
||||
- receives message containing all data collected so far from rank (r + distance)
|
||||
*/
|
||||
/* find my position in the group of participants - it always starts at rank=0. This
|
||||
/* find my position in the group of participants. This
|
||||
* value is the "rank" we will use in the algo
|
||||
*/
|
||||
rank = (ORTE_PROC_MY_NAME->vpid) / step;
|
||||
rank = ORTE_VPID_INVALID;
|
||||
for (nv=0; nv < np; nv++) {
|
||||
if (vpids[nv] == ORTE_PROC_MY_NAME->vpid) {
|
||||
rank = nv;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* check for bozo case */
|
||||
if (ORTE_VPID_INVALID == rank) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
for (distance = 1; distance < np; distance <<= 1) {
|
||||
|
||||
/* first send my current contents */
|
||||
stp = (rank - distance + np) % np;
|
||||
peer.vpid = (stp * step);
|
||||
nv = (rank - distance + np) % np;
|
||||
peer.vpid = vpids[nv];
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||
opal_dss.pack(&buf, &total_entries, 1, OPAL_INT32);
|
||||
opal_dss.copy_payload(&buf, &collection);
|
||||
@ -317,8 +329,8 @@ static int bruck(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_ent
|
||||
|
||||
/* now setup to recv from my other partner */
|
||||
num_recvd = 0;
|
||||
stp = (rank + distance) % np;
|
||||
peer.vpid = (stp * step);
|
||||
nv = (rank + distance) % np;
|
||||
peer.vpid = vpids[nv];
|
||||
OBJ_CONSTRUCT(&bucket, opal_buffer_t);
|
||||
if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(&peer,
|
||||
ORTE_RML_TAG_DAEMON_COLLECTIVE,
|
||||
@ -367,9 +379,9 @@ static int bruck(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_ent
|
||||
* ompi/mca/coll/tuned/coll_tuned_allgather.c
|
||||
*/
|
||||
static int recursivedoubling(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_entries,
|
||||
orte_jobid_t jobid, orte_vpid_t np, orte_vpid_t step)
|
||||
orte_jobid_t jobid, orte_vpid_t np, orte_vpid_t *vpids)
|
||||
{
|
||||
orte_vpid_t rank, distance, stp;
|
||||
orte_vpid_t rank, distance, nv;
|
||||
int32_t num_remote, total_entries, cnt;
|
||||
opal_buffer_t collection, buf;
|
||||
orte_process_name_t peer;
|
||||
@ -393,16 +405,28 @@ static int recursivedoubling(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int
|
||||
At every step i, rank r:
|
||||
- exchanges message containing all data collected so far with rank peer = (r ^ 2^i).
|
||||
*/
|
||||
/* find my position in the group of participants - it always starts at rank=0. This
|
||||
/* find my position in the group of participants. This
|
||||
* value is the "rank" we will use in the algo
|
||||
*/
|
||||
rank = (ORTE_PROC_MY_NAME->vpid) / step;
|
||||
|
||||
rank = ORTE_VPID_INVALID;
|
||||
for (nv=0; nv < np; nv++) {
|
||||
if (vpids[nv] == ORTE_PROC_MY_NAME->vpid) {
|
||||
rank = nv;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* check for bozo case */
|
||||
if (ORTE_VPID_INVALID == rank) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
for (distance = 0x1; distance < np; distance<<=1) {
|
||||
|
||||
/* first send my current contents */
|
||||
stp = rank ^ distance;
|
||||
peer.vpid = (stp * step);
|
||||
nv = rank ^ distance;
|
||||
peer.vpid = vpids[nv];
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||
opal_dss.pack(&buf, &total_entries, 1, OPAL_INT32);
|
||||
opal_dss.copy_payload(&buf, &collection);
|
||||
|
@ -38,18 +38,11 @@ int orte_grpcomm_hier_open(void);
|
||||
int orte_grpcomm_hier_close(void);
|
||||
int orte_grpcomm_hier_component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
/* Hier component */
|
||||
typedef struct {
|
||||
orte_grpcomm_base_component_t super;
|
||||
orte_vpid_t num_nodes;
|
||||
orte_vpid_t step;
|
||||
} orte_grpcomm_hier_component_t;
|
||||
|
||||
/*
|
||||
* Grpcomm interfaces
|
||||
*/
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_grpcomm_hier_component_t mca_grpcomm_hier_component;
|
||||
ORTE_MODULE_DECLSPEC extern orte_grpcomm_base_component_t mca_grpcomm_hier_component;
|
||||
extern orte_grpcomm_base_module_t orte_grpcomm_hier_module;
|
||||
|
||||
END_C_DECLS
|
||||
|
@ -41,9 +41,8 @@
|
||||
/*
|
||||
* Struct of function pointers that need to be initialized
|
||||
*/
|
||||
orte_grpcomm_hier_component_t mca_grpcomm_hier_component = {
|
||||
orte_grpcomm_base_component_t mca_grpcomm_hier_component = {
|
||||
{
|
||||
{
|
||||
ORTE_GRPCOMM_BASE_VERSION_2_0_0,
|
||||
|
||||
"hier", /* MCA module name */
|
||||
@ -53,11 +52,10 @@ orte_grpcomm_hier_component_t mca_grpcomm_hier_component = {
|
||||
orte_grpcomm_hier_open, /* module open */
|
||||
orte_grpcomm_hier_close, /* module close */
|
||||
orte_grpcomm_hier_component_query /* module query */
|
||||
},
|
||||
{
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@ -74,30 +72,8 @@ int orte_grpcomm_hier_close(void)
|
||||
|
||||
int orte_grpcomm_hier_component_query(mca_base_module_t **module, int *priority)
|
||||
{
|
||||
mca_base_component_t *c = &mca_grpcomm_hier_component.super.base_version;
|
||||
int tmp;
|
||||
|
||||
/* check for required params */
|
||||
mca_base_param_reg_int(c, "num_nodes",
|
||||
"How many nodes are in the job (must be > 0)",
|
||||
false, false, -1, &tmp);
|
||||
if (tmp < 0) {
|
||||
*module = NULL;
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
mca_grpcomm_hier_component.num_nodes = tmp;
|
||||
|
||||
mca_base_param_reg_int(c, "step",
|
||||
"Step in local_rank=0 vpids between nodes (must be > 0)",
|
||||
false, false, -1, &tmp);
|
||||
if (tmp < 0) {
|
||||
*module = NULL;
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
mca_grpcomm_hier_component.step = tmp;
|
||||
|
||||
/* we need to be selected */
|
||||
*priority = 100;
|
||||
/* only selected upon request */
|
||||
*priority = 0;
|
||||
*module = (mca_base_module_t *)&orte_grpcomm_hier_module;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -83,6 +83,8 @@ static opal_list_t my_local_peers;
|
||||
static orte_process_name_t my_local_rank_zero_proc;
|
||||
static int num_local_peers;
|
||||
static bool coll_initialized = false;
|
||||
static orte_vpid_t *my_coll_peers=NULL;
|
||||
static int cpeers=0;
|
||||
|
||||
/**
|
||||
* Initialize the module
|
||||
@ -114,6 +116,10 @@ static void finalize(void)
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&my_local_peers);
|
||||
|
||||
if (NULL != my_coll_peers) {
|
||||
free(my_coll_peers);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@ -293,7 +299,7 @@ static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
|
||||
opal_list_item_t *item;
|
||||
orte_namelist_t *nm;
|
||||
opal_buffer_t final_buf;
|
||||
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
|
||||
"%s grpcomm:hier entering allgather",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
@ -301,21 +307,31 @@ static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
|
||||
/* have I initialized my local info? */
|
||||
if (!coll_initialized) {
|
||||
orte_process_name_t proc;
|
||||
orte_vpid_t v, vmax;
|
||||
orte_vpid_t v;
|
||||
|
||||
/* no - cycle through the procs to find those that are local */
|
||||
/* get my local rank so I can locally cache it */
|
||||
my_local_rank = orte_ess.get_local_rank(ORTE_PROC_MY_NAME);
|
||||
|
||||
/* if I am local_rank=0 for this node and job, then setup
|
||||
* my array of local_rank=0 peers
|
||||
*/
|
||||
if (0 == my_local_rank) {
|
||||
/* we need one entry/node in this job */
|
||||
my_coll_peers = (orte_vpid_t*)malloc(orte_process_info.num_nodes * sizeof(orte_vpid_t));
|
||||
cpeers = 0;
|
||||
}
|
||||
|
||||
/* cycle through the procs to create a list of those that are local to me */
|
||||
proc.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
vmax = ORTE_VPID_MAX;
|
||||
my_local_rank = 0;
|
||||
num_local_peers = 0; /* don't count myself */
|
||||
|
||||
for (v=0; v < orte_process_info.num_procs; v++) {
|
||||
/* ignore if this is me */
|
||||
if (v == ORTE_PROC_MY_NAME->vpid) {
|
||||
continue;
|
||||
}
|
||||
proc.vpid = v;
|
||||
if (!OPAL_PROC_ON_LOCAL_NODE(orte_ess.proc_get_locality(&proc))) {
|
||||
/* is this proc local_rank=0 on its node? */
|
||||
if (0 == my_local_rank && 0 == orte_ess.get_local_rank(&proc)) {
|
||||
my_coll_peers[cpeers++] = v;
|
||||
}
|
||||
/* if this is me, or this proc isn't on our node, ignore it */
|
||||
if (v == ORTE_PROC_MY_NAME->vpid ||
|
||||
!OPAL_PROC_ON_LOCAL_NODE(orte_ess.proc_get_locality(&proc))) {
|
||||
continue;
|
||||
}
|
||||
/* add this proc to our list of local peers */
|
||||
@ -323,23 +339,16 @@ static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
|
||||
nm->name.jobid = proc.jobid;
|
||||
nm->name.vpid = proc.vpid;
|
||||
opal_list_append(&my_local_peers, &nm->item);
|
||||
/* keep count */
|
||||
num_local_peers++;
|
||||
/* is this our locally lowest rank? */
|
||||
if (v < vmax) {
|
||||
vmax = v;
|
||||
}
|
||||
/* is this rank lower than mine? */
|
||||
if (v < ORTE_PROC_MY_NAME->vpid) {
|
||||
my_local_rank++;
|
||||
/* if I am not local_rank=0, is this one? */
|
||||
if (0 != my_local_rank &&
|
||||
0 == orte_ess.get_local_rank(&proc)) {
|
||||
my_local_rank_zero_proc.jobid = proc.jobid;
|
||||
my_local_rank_zero_proc.vpid = proc.vpid;
|
||||
}
|
||||
}
|
||||
|
||||
/* if I am not the local_rank=0 proc, record who is */
|
||||
if (0 != my_local_rank) {
|
||||
my_local_rank_zero_proc.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
my_local_rank_zero_proc.vpid = vmax;
|
||||
}
|
||||
|
||||
/* compute the number of local peers */
|
||||
num_local_peers = opal_list_get_size(&my_local_peers);
|
||||
|
||||
/* flag that I have initialized things */
|
||||
coll_initialized = true;
|
||||
@ -405,8 +414,7 @@ static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
|
||||
OBJ_CONSTRUCT(&final_buf, opal_buffer_t);
|
||||
if (ORTE_SUCCESS != (rc = orte_grpcomm_base_allgather(&allgather_buf, rbuf, num_local_peers + 1,
|
||||
ORTE_PROC_MY_NAME->jobid,
|
||||
mca_grpcomm_hier_component.num_nodes,
|
||||
mca_grpcomm_hier_component.step))) {
|
||||
cpeers, my_coll_peers))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&allgather_buf);
|
||||
OBJ_DESTRUCT(&final_buf);
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user