1
1

Fix comm_spawn yet again...add another test

This commit was SVN r25579.
This commit is contained in:
Ralph Castain 2011-12-06 20:15:40 +00:00
parent 90b7f2a7bf
commit 15facc4ba6
5 changed files with 101 additions and 3 deletions

View file

@ -487,6 +487,7 @@ orte_node_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list,
} else {
item = opal_list_get_first(node_list);
}
nd1 = NULL;
while (item != cur_node_item) {
nd1 = (orte_node_t*)item;
if (nd1->slots_inuse < nd1->slots_alloc) {
@ -514,7 +515,8 @@ orte_node_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list,
* that is minimally overloaded if it is better than
* what we already have
*/
if ((nd1->slots_inuse - nd1->slots_alloc) < (node->slots_inuse - node->slots_alloc)) {
if (NULL != nd1 &&
(nd1->slots_inuse - nd1->slots_alloc) < (node->slots_inuse - node->slots_alloc)) {
cur_node_item = (opal_list_item_t*)ndmin;
}
}

View file

@ -74,6 +74,9 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
item != opal_list_get_end(node_list);
item = opal_list_get_next(item)) {
node = (orte_node_t*)item;
opal_output_verbose(2, orte_rmaps_base.rmaps_output,
"mca:rmaps:rr:slot working node %s",
node->name);
#if OPAL_HAVE_HWLOC
/* get the root object as we are not assigning
* locale except at the node level
@ -83,6 +86,9 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
}
#endif
if (node->slots_alloc == node->slots_inuse) {
opal_output_verbose(2, orte_rmaps_base.rmaps_output,
"mca:rmaps:rr:slot working node %s is full - skipping",
node->name);
continue;
}
num_procs_to_assign = node->slots_alloc - node->slots_inuse;
@ -112,6 +118,10 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
return ORTE_SUCCESS;
}
opal_output_verbose(2, orte_rmaps_base.rmaps_output,
"mca:rmaps:rr:slot job %s is oversubscribed - performing second pass",
ORTE_JOBID_PRINT(jdata->jobid));
/* second pass: if we haven't mapped everyone yet, it is
* because we are oversubscribed. Figure out how many procs
* to add
@ -133,6 +143,9 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
item != opal_list_get_end(node_list);
item = opal_list_get_next(item)) {
node = (orte_node_t*)item;
opal_output_verbose(2, orte_rmaps_base.rmaps_output,
"mca:rmaps:rr:slot working node %s",
node->name);
#if OPAL_HAVE_HWLOC
/* get the root object as we are not assigning
* locale except at the node level
@ -141,6 +154,16 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
obj = hwloc_get_root_obj(node->topology);
}
#endif
/* add this node to the map - do it only once */
if (!node->mapped) {
if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
ORTE_ERROR_LOG(rc);
return rc;
}
node->mapped = true;
OBJ_RETAIN(node); /* maintain accounting on object */
++(jdata->map->num_nodes);
}
if (add_one) {
if (0 == nxtra_nodes) {
--extra_procs_to_assign;
@ -150,6 +173,9 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
}
}
num_procs_to_assign = (node->slots_alloc - node->slots_inuse) + extra_procs_to_assign;
opal_output_verbose(2, orte_rmaps_base.rmaps_output,
"mca:rmaps:rr:slot adding up to %d procs to node %s",
num_procs_to_assign, node->name);
for (i=0; i < num_procs_to_assign && nprocs_mapped < app->num_procs; i++) {
if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) {
return ORTE_ERR_OUT_OF_RESOURCE;

View file

@ -1,4 +1,4 @@
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave_spawn slave cell_spawn reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster hello++ hellof90 early_abort debugger singleton_client_server intercomm_create
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave_spawn slave cell_spawn reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster hello++ hellof90 early_abort debugger singleton_client_server intercomm_create spawn_tree
all: $(PROGS)

View file

@ -54,4 +54,6 @@ EXTRA_DIST += \
test/mpi/spawn_multiple.c \
test/mpi/ziatest.c \
test/mpi/ziaprobe.c \
test/mpi/singleton_client_server.c
test/mpi/singleton_client_server.c \
test/mpi/spawn_tree.c

68
orte/test/mpi/spawn_tree.c Normal file
View file

@ -0,0 +1,68 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <unistd.h>

#include <mpi.h>
/*
 * spawn_tree <#levels>
 *
 * Stress test for MPI_Comm_spawn: builds a binary tree of singleton
 * processes.  The root (launched directly) takes logical rank 0; every
 * other process is spawned by its parent and receives its logical tree
 * rank over the spawn intercommunicator.  A process at tree depth
 * curr_level < level spawns two children and sends each its rank.
 */
int main(int argc, char ** argv){
    int rank, child_rank;
    char hostname[256];                /* sized generously; see NUL-termination note below */
    MPI_Comm parent, intercomm1, intercomm2;
    int spawn_err;                     /* per-child error code filled in by MPI_Comm_spawn */
    int level, curr_level;

    if (argc < 2) {
        fprintf(stderr, "Usage: spawn_tree <#levels>\n");
        exit(1);
    }
    level = atoi(argv[1]);

    MPI_Init(&argc, &argv);

    MPI_Comm_get_parent(&parent);
    if (parent == MPI_COMM_NULL) {
        /* no parent: we are the root of the tree */
        rank = 0;
    } else {
        /* spawned child: parent sends us our logical tree rank */
        MPI_Recv(&rank, 1, MPI_INT, 0, 0, parent, MPI_STATUS_IGNORE);
    }

    /* depth in the tree: rank 0 -> 0, ranks 1-2 -> 1, ranks 3-6 -> 2, ... */
    curr_level = (int) log2(rank + 1);

    printf(" --> rank: %d and curr_level: %d\n", rank, curr_level);

    /* Node propagation: interior nodes spawn two children each */
    if (curr_level < level) {
        /* 2^(curr_level+1) - 1 + 2*(rank - 2^curr_level - 1) = 2*rank + 1 */
        child_rank = 2 * rank + 1;
        printf("(%d) Before create rank %d\n", rank, child_rank);
        MPI_Comm_spawn(argv[0], &argv[1], 1, MPI_INFO_NULL, 0,
                       MPI_COMM_SELF, &intercomm1, &spawn_err);
        printf("(%d) After create rank %d\n", rank, child_rank);
        MPI_Send(&child_rank, 1, MPI_INT, 0, 0, intercomm1);

        child_rank = child_rank + 1;
        printf("(%d) Before create rank %d\n", rank, child_rank);
        MPI_Comm_spawn(argv[0], &argv[1], 1, MPI_INFO_NULL, 0,
                       MPI_COMM_SELF, &intercomm2, &spawn_err);
        printf("(%d) After create rank %d\n", rank, child_rank);
        MPI_Send(&child_rank, 1, MPI_INT, 0, 0, intercomm2);
    }

    /* POSIX gethostname() does not guarantee NUL-termination on
     * truncation, so terminate the buffer explicitly */
    gethostname(hostname, sizeof hostname - 1);
    hostname[sizeof hostname - 1] = '\0';
    printf("(%d) in %s\n", rank, hostname);

    MPI_Finalize();
    return 0;
}