diff --git a/ompi/mca/dpm/orte/dpm_orte.c b/ompi/mca/dpm/orte/dpm_orte.c index e5ee3aa898..1fd309c02e 100644 --- a/ompi/mca/dpm/orte/dpm_orte.c +++ b/ompi/mca/dpm/orte/dpm_orte.c @@ -35,6 +35,9 @@ #include "orte/mca/plm/plm.h" #include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml_types.h" +#include "orte/mca/rmaps/rmaps.h" +#include "orte/mca/rmaps/rmaps_types.h" +#include "orte/mca/rmaps/base/base.h" #include "orte/mca/rml/base/rml_contact.h" #include "orte/mca/routed/routed.h" #include "orte/util/name_fns.h" @@ -506,6 +509,7 @@ static int spawn(int count, char **array_of_commands, orte_job_t *jdata; orte_app_context_t *app; bool local_spawn, non_mpi; + bool local_bynode = false; /* parse the info object */ /* check potentially for: @@ -665,6 +669,32 @@ static int spawn(int count, char **array_of_commands, jdata->controls |= ORTE_JOB_CONTROL_LOCAL_SLAVE; } + /* check for 'map_bynode' */ + ompi_info_get_bool(array_of_info[i], "map_bynode", &local_bynode, &flag); + if ( flag ) { + jdata->map = OBJ_NEW(orte_job_map_t); + if (NULL == jdata->map) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + /* load it with the system defaults */ + jdata->map->policy = orte_default_mapping_policy; + jdata->map->npernode = orte_rmaps_base.npernode; + jdata->map->nperboard = orte_rmaps_base.nperboard; + jdata->map->npersocket = orte_rmaps_base.npersocket; + jdata->map->cpus_per_rank = orte_rmaps_base.cpus_per_rank; + jdata->map->stride = orte_rmaps_base.stride; + jdata->map->oversubscribe = orte_rmaps_base.oversubscribe; + jdata->map->display_map = orte_rmaps_base.display_map; + + if( local_bynode ) { + jdata->map->policy = ORTE_MAPPING_BYNODE; + } + else { + jdata->map->policy = ORTE_MAPPING_BYSLOT; + } + } + /* check for 'preload_binary' */ ompi_info_get_bool(array_of_info[i], "ompi_preload_binary", &local_spawn, &flag); if ( flag ) { diff --git a/ompi/mpi/man/man3/MPI_Comm_spawn.3in b/ompi/mpi/man/man3/MPI_Comm_spawn.3in index ca4eae4c0e..9da8d1e84d 100644 --- a/ompi/mpi/man/man3/MPI_Comm_spawn.3in +++ b/ompi/mpi/man/man3/MPI_Comm_spawn.3in @@ -152,6 +152,10 @@ ompi_param char * Pass an OMPI MCA parameter to the chil If that parameter already exists in the environment, the value will be overwritten by the provided value. +map_bynode bool If set to true, the processes are mapped bynode. + If set to false, the processes are mapped byslot. + By default, mapping is determined by the default + mapping policy set when the job was started. .fi \fIbool\fP info keys are actually strings but are evaluated as diff --git a/ompi/mpi/man/man3/MPI_Comm_spawn_multiple.3in b/ompi/mpi/man/man3/MPI_Comm_spawn_multiple.3in index 5465944475..18849df3bd 100644 --- a/ompi/mpi/man/man3/MPI_Comm_spawn_multiple.3in +++ b/ompi/mpi/man/man3/MPI_Comm_spawn_multiple.3in @@ -163,6 +163,10 @@ ompi_param char * Pass an OMPI MCA parameter to the chil If that parameter already exists in the environment, the value will be overwritten by the provided value. +map_bynode bool If set to true, the processes are mapped bynode. + If set to false, the processes are mapped byslot. + By default, mapping is determined by the default + mapping policy set when the job was started. .fi .sp diff --git a/orte/mca/plm/base/plm_base_receive.c b/orte/mca/plm/base/plm_base_receive.c index 244bd416b9..a158fc273b 100644 --- a/orte/mca/plm/base/plm_base_receive.c +++ b/orte/mca/plm/base/plm_base_receive.c @@ -232,14 +232,18 @@ static void process_msg(int fd, short event, void *data) goto ANSWER_LAUNCH; } - /* find the sender's node in the job map */ - if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(parent->procs, msgpkt->sender.vpid))) { - /* set the bookmark so the child starts from that place - this means - * that the first child process could be co-located with the proc - * that called comm_spawn, assuming slots remain on that node. Otherwise, - * the procs will start on the next available node - */ - jdata->bookmark = proc->node; + if( NULL == parent->bookmark ) { + /* find the sender's node in the job map */ + if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(parent->procs, msgpkt->sender.vpid))) { + /* set the bookmark so the child starts from that place - this means + * that the first child process could be co-located with the proc + * that called comm_spawn, assuming slots remain on that node. Otherwise, + * the procs will start on the next available node + */ + jdata->bookmark = proc->node; + } + } else { + jdata->bookmark = parent->bookmark; } /* launch it */ diff --git a/orte/mca/rmaps/base/rmaps_base_common_mappers.c b/orte/mca/rmaps/base/rmaps_base_common_mappers.c index 0e5d33c0df..58ca7960fe 100644 --- a/orte/mca/rmaps/base/rmaps_base_common_mappers.c +++ b/orte/mca/rmaps/base/rmaps_base_common_mappers.c @@ -342,5 +342,8 @@ int orte_rmaps_base_map_bynode(orte_job_t *jdata, orte_app_context_t *app, cur_node_item = next; } + /* save the bookmark */ + jdata->bookmark = (orte_node_t*)cur_node_item; + return ORTE_SUCCESS; }