1
1

Clean up the nidmap lookup functions and add some comments explaining how we handle the nid, job, and pmap arrays. This fixes a problem where we have less-than-full participation in a comm_spawn, causing holes to exist in the pmap array.

Update the slave spawn tests to properly indicate participation as being solely MPI_COMM_SELF.

This commit was SVN r20961.
Этот коммит содержится в:
Ralph Castain 2009-04-09 02:48:33 +00:00
родитель 7d0c6b68dc
Коммит 9c2f17eb01
5 изменённых файлов: 54 добавлений и 24 удалений

Просмотреть файл

@ -226,9 +226,17 @@ int orte_grpcomm_base_full_modex(opal_list_t *procs, bool modex_db)
ORTE_JOBID_PRINT(proc_name.jobid)));
jmap = OBJ_NEW(orte_jmap_t);
jmap->job = proc_name.jobid;
/* unfortunately, job objects cannot be stored
* by index number as the jobid is a constructed
* value. So we have to just add it to the end
* of the array
*/
opal_pointer_array_add(&orte_jobmap, jmap);
jmap->num_procs = 1;
/* have to add the pidmap entry too */
/* have to add the pidmap entry too, but this
* can be done at the specific site corresponding
* to the proc's vpid
*/
pmap = OBJ_NEW(orte_pmap_t);
pmap->node = nid->index;
pmap->local_rank = local_rank;
@ -246,7 +254,12 @@ int orte_grpcomm_base_full_modex(opal_list_t *procs, bool modex_db)
pmap->node = nid->index;
pmap->local_rank = local_rank;
pmap->node_rank = node_rank;
/* this can be done at the specific site corresponding
* to the proc's vpid
*/
opal_pointer_array_set_item(&jmap->pmap, proc_name.vpid, pmap);
/* account for the proc entry in the jmap */
jmap->num_procs++;
}
}

Просмотреть файл

@ -62,7 +62,7 @@ int main(int argc, char* argv[])
pid = getpid();
printf("Cell_spawn [pid %ld] about to spawn!\n", (long)pid);
if (MPI_SUCCESS != (rc = MPI_Comm_spawn(app, MPI_ARGV_NULL, 1, info,
0, MPI_COMM_WORLD, &child, MPI_ERRCODES_IGNORE))) {
0, MPI_COMM_SELF, &child, MPI_ERRCODES_IGNORE))) {
printf("Cell slave failed to spawn\n");
return rc;
}

Просмотреть файл

@ -8,6 +8,8 @@ int main(int argc, char *argv[])
int i = 8, j;
int self;
int do_barrier = 0;
int k;
double pi;
if (getenv("DO_BARRIER")) {
do_barrier = 1;
@ -16,6 +18,11 @@ int main(int argc, char *argv[])
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &self);
while (1) {
#if 0
for (k=0; k < (7-self)*1000; k++) {
pi = 3.14159 * 18.0 / 35.3;
}
#endif
MPI_Reduce(&i, &j, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
if (do_barrier) {
MPI_Barrier(MPI_COMM_WORLD);

Просмотреть файл

@ -46,7 +46,7 @@ int main(int argc, char* argv[])
pid = getpid();
printf("Slave_spawn [pid %ld] about to spawn!\n", (long)pid);
if (MPI_SUCCESS != (rc = MPI_Comm_spawn(app, MPI_ARGV_NULL, 1, info,
0, MPI_COMM_WORLD, &child, MPI_ERRCODES_IGNORE))) {
0, MPI_COMM_SELF, &child, MPI_ERRCODES_IGNORE))) {
printf("Slave failed to spawn\n");
return rc;
}

Просмотреть файл

@ -827,7 +827,11 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr)
OBJ_CONSTRUCT(&buf, opal_buffer_t);
jobs = (orte_job_t**)orte_job_data->addr;
/* for each job... */
/* unfortunately, job objects cannot be stored
* by index number as the jobid is a constructed
* value. So we have no choice but to cycle through
* the job pointer array and look at each entry
*/
for (j=1; j < orte_job_data->size; j++) {
/* the job array is no longer left-justified and may
* have holes in it as we recover resources at job
@ -920,7 +924,11 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
/* cycle through the buffer */
while (ORTE_SUCCESS == (rc = opal_dss.unpack(&buf, &jobid, &n, ORTE_JOBID))) {
jobs = (orte_jmap_t**)orte_jobmap.addr;
/* is this job already in the map? */
/* unfortunately, job objects cannot be stored
* by index number as the jobid is a constructed
* value. So we have no choice but to cycle through
* the jobmap pointer array and look for this entry
*/
already_present = false;
for (j=0; j < orte_jobmap.size && NULL != jobs[j]; j++) {
if (jobid == jobs[j]->job) {
@ -965,7 +973,11 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
/* if we don't already have this data, store it */
if (!already_present) {
/* create and add an entry for the job */
/* unfortunately, job objects cannot be stored
* by index number as the jobid is a constructed
* value. So we have to just add it to the end
* of the array
*/
jmap = OBJ_NEW(orte_jmap_t);
jmap->job = jobid;
jmap->num_procs = num_procs;
@ -982,6 +994,9 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
pmap->node = nodes[i];
pmap->local_rank = local_rank[i];
pmap->node_rank = node_rank[i];
/* add the pidmap entry at the specific site corresponding
* to the proc's vpid
*/
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(&jmap->pmap, i, pmap))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
@ -1012,6 +1027,12 @@ orte_jmap_t* orte_util_lookup_jmap(orte_jobid_t job)
int i;
orte_jmap_t **jmaps;
/* unfortunately, job objects cannot be stored
* by index number as the jobid is a constructed
* value. So we have no choice but to cycle through
* the jobmap pointer array and look for the entry
* we want
*/
jmaps = (orte_jmap_t**)orte_jobmap.addr;
for (i=0; i < orte_jobmap.size && NULL != jmaps[i]; i++) {
OPAL_OUTPUT_VERBOSE((10, orte_debug_output,
@ -1034,19 +1055,11 @@ orte_pmap_t* orte_util_lookup_pmap(orte_process_name_t *proc)
if (NULL == (jmap = orte_util_lookup_jmap(proc->jobid))) {
return NULL;
}
if (proc->vpid >= jmap->num_procs) {
return NULL;
}
/* is this index in range? */
if (jmap->pmap.size <= (int)proc->vpid) {
return NULL;
}
/* now that we know the vpid is within range, we can safely
* retrieve the value
/* the get_item function will check the array index range,
* so we can just access it here
*/
return (orte_pmap_t*)jmap->pmap.addr[proc->vpid];
return opal_pointer_array_get_item(&jmap->pmap, proc->vpid);
}
/* the daemon's vpid does not necessarily correlate
@ -1095,12 +1108,9 @@ orte_nid_t* orte_util_lookup_nid(orte_process_name_t *proc)
return NULL;
}
if (pmap->node < 0 || orte_nidmap.size <= pmap->node) {
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
return NULL;
}
nids = (orte_nid_t**)orte_nidmap.addr;
return nids[pmap->node];
/* the get_item function will check the array index range,
* so we can just access it here
*/
return opal_pointer_array_get_item(&orte_nidmap, pmap->node);
}