Fix comm_spawn yet again...add another test
This commit was SVN r25579.
This commit is contained in:
parent 90b7f2a7bf
commit 15facc4ba6
@@ -487,6 +487,7 @@ orte_node_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list,
     } else {
         item = opal_list_get_first(node_list);
     }
+    nd1 = NULL;
     while (item != cur_node_item) {
         nd1 = (orte_node_t*)item;
         if (nd1->slots_inuse < nd1->slots_alloc) {
@@ -514,7 +515,8 @@ orte_node_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list,
      * that is minimally overloaded if it is better than
      * what we already have
      */
-    if ((nd1->slots_inuse - nd1->slots_alloc) < (node->slots_inuse - node->slots_alloc)) {
+    if (NULL != nd1 &&
+        (nd1->slots_inuse - nd1->slots_alloc) < (node->slots_inuse - node->slots_alloc)) {
         cur_node_item = (opal_list_item_t*)ndmin;
     }
 }
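The crash this hunk fixes comes from nd1 being assigned only inside the while loop: if the loop body never runs, the later overload comparison dereferences an uninitialized pointer. A minimal, self-contained sketch of the hazard and the guard, using simplified stand-in types rather than the real ORTE structures:

    #include <stddef.h>
    #include <stdio.h>

    struct node { int slots_inuse; int slots_alloc; };

    /* Returns the least-overloaded candidate seen in [start, stop), or NULL if
     * the scan never ran -- callers must check for NULL before dereferencing. */
    static struct node *least_overloaded(struct node *list, size_t start, size_t stop)
    {
        struct node *best = NULL;            /* initialize before the loop */
        for (size_t i = start; i < stop; i++) {
            if (NULL == best ||
                (list[i].slots_inuse - list[i].slots_alloc) <
                (best->slots_inuse - best->slots_alloc)) {
                best = &list[i];
            }
        }
        return best;
    }

    int main(void)
    {
        struct node nodes[] = { {2, 4}, {5, 4}, {1, 4} };
        struct node *pick = least_overloaded(nodes, 0, 0);  /* empty scan */
        if (NULL != pick) {                                  /* guard, as in the patch */
            printf("overload = %d\n", pick->slots_inuse - pick->slots_alloc);
        } else {
            printf("no candidate\n");
        }
        return 0;
    }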
@@ -74,6 +74,9 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
          item != opal_list_get_end(node_list);
          item = opal_list_get_next(item)) {
         node = (orte_node_t*)item;
+        opal_output_verbose(2, orte_rmaps_base.rmaps_output,
+                            "mca:rmaps:rr:slot working node %s",
+                            node->name);
 #if OPAL_HAVE_HWLOC
         /* get the root object as we are not assigning
          * locale except at the node level
@@ -83,6 +86,9 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
         }
 #endif
         if (node->slots_alloc == node->slots_inuse) {
+            opal_output_verbose(2, orte_rmaps_base.rmaps_output,
+                                "mca:rmaps:rr:slot working node %s is full - skipping",
+                                node->name);
             continue;
         }
         num_procs_to_assign = node->slots_alloc - node->slots_inuse;
@@ -112,6 +118,10 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
         return ORTE_SUCCESS;
     }

+    opal_output_verbose(2, orte_rmaps_base.rmaps_output,
+                        "mca:rmaps:rr:slot job %s is oversubscribed - performing second pass",
+                        ORTE_JOBID_PRINT(jdata->jobid));
+
     /* second pass: if we haven't mapped everyone yet, it is
      * because we are oversubscribed. Figure out how many procs
      * to add
@@ -133,6 +143,9 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
          item != opal_list_get_end(node_list);
          item = opal_list_get_next(item)) {
         node = (orte_node_t*)item;
+        opal_output_verbose(2, orte_rmaps_base.rmaps_output,
+                            "mca:rmaps:rr:slot working node %s",
+                            node->name);
 #if OPAL_HAVE_HWLOC
         /* get the root object as we are not assigning
          * locale except at the node level
@@ -141,6 +154,16 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
             obj = hwloc_get_root_obj(node->topology);
         }
 #endif
+        /* add this node to the map - do it only once */
+        if (!node->mapped) {
+            if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
+                ORTE_ERROR_LOG(rc);
+                return rc;
+            }
+            node->mapped = true;
+            OBJ_RETAIN(node);  /* maintain accounting on object */
+            ++(jdata->map->num_nodes);
+        }
         if (add_one) {
             if (0 == nxtra_nodes) {
                 --extra_procs_to_assign;
@@ -150,6 +173,9 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
             }
         }
         num_procs_to_assign = (node->slots_alloc - node->slots_inuse) + extra_procs_to_assign;
+        opal_output_verbose(2, orte_rmaps_base.rmaps_output,
+                            "mca:rmaps:rr:slot adding up to %d procs to node %s",
+                            num_procs_to_assign, node->name);
         for (i=0; i < num_procs_to_assign && nprocs_mapped < app->num_procs; i++) {
             if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) {
                 return ORTE_ERR_OUT_OF_RESOURCE;
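The rmaps_rr_byslot hunks add verbose tracing and make sure each node is pushed into the job map exactly once during the oversubscription pass. As a rough illustration of the two-pass by-slot idea (fill free slots first, then spread the remainder round-robin), here is a self-contained sketch with simplified stand-ins for orte_node_t and the map; it is not the component's actual code:

    #include <stdbool.h>
    #include <stdio.h>

    #define NNODES 3

    struct node { const char *name; int slots_alloc; int slots_inuse; bool mapped; };

    int main(void)
    {
        struct node nodes[NNODES] = { {"n0", 2, 0, false}, {"n1", 2, 0, false}, {"n2", 2, 0, false} };
        int num_procs = 10, nprocs_mapped = 0;

        /* first pass: fill free slots only */
        for (int n = 0; n < NNODES && nprocs_mapped < num_procs; n++) {
            if (nodes[n].slots_inuse == nodes[n].slots_alloc) continue;   /* full - skip */
            int avail = nodes[n].slots_alloc - nodes[n].slots_inuse;
            for (int i = 0; i < avail && nprocs_mapped < num_procs; i++) {
                nodes[n].slots_inuse++; nprocs_mapped++;
                nodes[n].mapped = true;             /* node enters the map */
            }
        }

        /* second pass: oversubscribe by spreading the remainder round-robin */
        int extra = (num_procs - nprocs_mapped + NNODES - 1) / NNODES;
        for (int n = 0; n < NNODES && nprocs_mapped < num_procs; n++) {
            if (!nodes[n].mapped) nodes[n].mapped = true;   /* add to map only once */
            for (int i = 0; i < extra && nprocs_mapped < num_procs; i++) {
                nodes[n].slots_inuse++; nprocs_mapped++;
            }
        }

        for (int n = 0; n < NNODES; n++)
            printf("%s: %d procs (%d slots)\n", nodes[n].name, nodes[n].slots_inuse, nodes[n].slots_alloc);
        return 0;
    }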
@@ -1,4 +1,4 @@
-PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave_spawn slave cell_spawn reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster hello++ hellof90 early_abort debugger singleton_client_server intercomm_create
+PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave_spawn slave cell_spawn reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster hello++ hellof90 early_abort debugger singleton_client_server intercomm_create spawn_tree

 all: $(PROGS)
@@ -54,4 +54,6 @@ EXTRA_DIST += \
         test/mpi/spawn_multiple.c \
         test/mpi/ziatest.c \
         test/mpi/ziaprobe.c \
-        test/mpi/singleton_client_server.c
+        test/mpi/singleton_client_server.c \
+        test/mpi/spawn_tree.c
orte/test/mpi/spawn_tree.c (new file, 68 lines)
@@ -0,0 +1,68 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <mpi.h>
+
+int main(int argc, char ** argv){
+
+    int i;
+    int rank, size, child_rank;
+    char nomehost[20];
+    MPI_Comm parent, intercomm1, intercomm2;
+    int erro;
+    int level, curr_level;
+
+    if (argc < 2) {
+        fprintf(stderr, "Usage: spawn_tree <#levels>\n");
+        exit(1);
+    }
+    level = atoi(argv[1]);
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_get_parent(&parent);
+
+    if(parent == MPI_COMM_NULL){
+        rank=0;
+    }
+    else{
+        MPI_Recv(&rank, 1, MPI_INT, 0, 0, parent, MPI_STATUS_IGNORE);
+    }
+
+    curr_level = (int) log2(rank+1);
+
+    printf(" --> rank: %d and curr_level: %d\n", rank, curr_level);
+
+    // Node propagation
+    if(curr_level < level){
+        // 2^(curr_level+1) - 1 + 2*(rank - 2^curr_level - 1) = 2*rank + 1
+        child_rank = 2*rank + 1;
+        printf("(%d) Before create rank %d\n", rank, child_rank);
+        MPI_Comm_spawn(argv[0], &argv[1], 1, MPI_INFO_NULL, 0,
+                       MPI_COMM_SELF, &intercomm1, &erro);
+        printf("(%d) After create rank %d\n", rank, child_rank);
+
+        MPI_Send(&child_rank, 1, MPI_INT, 0, 0, intercomm1);
+
+        //sleep(1);
+
+        child_rank = child_rank + 1;
+        printf("(%d) Before create rank %d\n", rank, child_rank);
+        MPI_Comm_spawn(argv[0], &argv[1], 1, MPI_INFO_NULL, 0,
+                       MPI_COMM_SELF, &intercomm2, &erro);
+        printf("(%d) After create rank %d\n", rank, child_rank);
+
+        MPI_Send(&child_rank, 1, MPI_INT, 0, 0, intercomm2);
+
+    }
+
+    gethostname(nomehost, 20);
+    printf("(%d) in %s\n", rank, nomehost);
+
+    MPI_Finalize();
+    return(0);
+
+}
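spawn_tree builds a binary tree of processes: each process derives its level as log2(rank+1) and, while still below the requested depth, spawns two children one at a time over MPI_COMM_SELF, sending them the ranks 2*rank+1 and 2*rank+2, so a run to depth L ends with 2^(L+1)-1 processes. Assuming the test has been built in orte/test/mpi, a run might look like:

    mpirun -np 1 ./spawn_tree 2

which should end up with seven processes (ranks 0 through 6), each printing its rank, its level, and the host it landed on.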