
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here:

https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement

The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric, and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps some initial curiosity-driven experimentation.

In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions:

1. I have at long last bowed my head in submission to the system admins of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than there are allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior.

2. Both cpus/rank and stride have been removed. The latter was originally demanded by users who didn't understand the purpose behind it - and I agreed to drop it, as the users who requested it are no longer using it. The former was removed temporarily, pending implementation.

3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - support for them may return someday, provided someone can demonstrate a reason to do so.

As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes.

This commit was SVN r25476.
This commit is contained in:
Ralph Castain 2011-11-15 03:40:11 +00:00
parent c8e105bd8c
commit 6310361532
96 changed files with 6979 additions and 5558 deletions

View File

@@ -3,7 +3,7 @@ enable_multicast=no
 enable_dlopen=no
 enable_pty_support=no
 with_blcr=no
-with_openib=yes
+with_openib=no
 with_memory_manager=no
 enable_mem_debug=yes
 enable_mem_profile=no

View File

@@ -62,6 +62,7 @@
 mca_component_show_load_errors = 0
 mpi_param_check = 0
 orte_abort_timeout = 10
+hwloc_base_mem_bind_failure_action = silent
 ## Protect the shared file systems
@@ -72,22 +73,13 @@ oob_tcp_disable_family = IPv6
 #oob_tcp_connect_timeout=600
 ## Define the MPI interconnects
-btl = sm,openib,self
+btl = sm,tcp,self
-#mpi_leave_pinned = 1
 ## Setup shared memory
 btl_sm_free_list_max = 768
-## Setup OpenIB
-btl_openib_want_fork_support = 0
-btl_openib_cpc_include = oob
-#btl_openib_receive_queues = P,128,256,64,32,32:S,2048,1024,128,32:S,12288,1024,128,32:S,65536,1024,128,32
 ## Setup TCP
 btl_tcp_if_include = ib0
 ## Configure the PML
 pml_ob1_use_early_completion = 0
-## Enable cpu affinity
-opal_paffinity_alone = 1

View File

@@ -58,6 +58,8 @@ enum {
     OMPI_ERR_DATA_OVERWRITE_ATTEMPT = OPAL_ERR_DATA_OVERWRITE_ATTEMPT,
     OMPI_ERR_BUFFER = OPAL_ERR_BUFFER,
+    OMPI_ERR_SILENT = OPAL_ERR_SILENT,
     OMPI_ERR_REQUEST = OMPI_ERR_BASE - 1
 };

View File

@@ -568,7 +568,7 @@ static int spawn(int count, char **array_of_commands,
     char stdin_target[OPAL_PATH_MAX];
     char params[OPAL_PATH_MAX];
     char mapper[OPAL_PATH_MAX];
-    int nperxxx;
+    int npernode;
     char slot_list[OPAL_PATH_MAX];
     orte_job_t *jdata;
@@ -735,7 +735,7 @@ static int spawn(int count, char **array_of_commands,
         }
         /* check for 'mapper' */
-        ompi_info_get (array_of_info[i], "mapper", sizeof(mapper) - 1, mapper, &flag);
+        ompi_info_get(array_of_info[i], "mapper", sizeof(mapper) - 1, mapper, &flag);
         if ( flag ) {
             if (NULL == jdata->map) {
                 jdata->map = OBJ_NEW(orte_job_map_t);
@@ -743,20 +743,27 @@ static int spawn(int count, char **array_of_commands,
                     ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                     return ORTE_ERR_OUT_OF_RESOURCE;
                 }
-                /* load it with the system defaults */
-                jdata->map->policy = orte_default_mapping_policy;
-                jdata->map->cpus_per_rank = orte_rmaps_base.cpus_per_rank;
-                jdata->map->stride = orte_rmaps_base.stride;
-                jdata->map->oversubscribe = orte_rmaps_base.oversubscribe;
-                jdata->map->display_map = orte_rmaps_base.display_map;
             }
             jdata->map->req_mapper = strdup(mapper);
         }
-        /* check for 'npernode' */
+        /* check for 'display_map' */
+        ompi_info_get_bool(array_of_info[i], "display_map", &local_spawn, &flag);
+        if ( flag ) {
+            if (NULL == jdata->map) {
+                jdata->map = OBJ_NEW(orte_job_map_t);
+                if (NULL == jdata->map) {
+                    ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+                    return ORTE_ERR_OUT_OF_RESOURCE;
+                }
+            }
+            jdata->map->display_map = true;
+        }
+        /* check for 'npernode' and 'ppr' */
         ompi_info_get (array_of_info[i], "npernode", sizeof(slot_list) - 1, slot_list, &flag);
         if ( flag ) {
-            if (ORTE_SUCCESS != ompi_info_value_to_int(slot_list, &nperxxx)) {
+            if (ORTE_SUCCESS != ompi_info_value_to_int(slot_list, &npernode)) {
                 ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
                 return ORTE_ERR_BAD_PARAM;
             }
@@ -766,18 +773,14 @@ static int spawn(int count, char **array_of_commands,
                     ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                     return ORTE_ERR_OUT_OF_RESOURCE;
                 }
-                /* load it with the system defaults */
-                jdata->map->policy = orte_default_mapping_policy;
-                jdata->map->cpus_per_rank = orte_rmaps_base.cpus_per_rank;
-                jdata->map->stride = orte_rmaps_base.stride;
-                jdata->map->oversubscribe = orte_rmaps_base.oversubscribe;
-                jdata->map->display_map = orte_rmaps_base.display_map;
             }
-            jdata->map->npernode = nperxxx;
+            if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
+                return OMPI_ERROR;
+            }
+            jdata->map->mapping |= ORTE_MAPPING_PPR;
+            asprintf(&(jdata->map->ppr), "%d:n", npernode);
         }
-        /* check for 'map_bynode' */
-        ompi_info_get_bool(array_of_info[i], "map_bynode", &local_bynode, &flag);
+        ompi_info_get (array_of_info[i], "pernode", sizeof(slot_list) - 1, slot_list, &flag);
         if ( flag ) {
             if (NULL == jdata->map) {
                 jdata->map = OBJ_NEW(orte_job_map_t);
@@ -785,20 +788,438 @@ static int spawn(int count, char **array_of_commands,
                     ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                     return ORTE_ERR_OUT_OF_RESOURCE;
                 }
-                /* load it with the system defaults */
-                jdata->map->policy = orte_default_mapping_policy;
-                jdata->map->cpus_per_rank = orte_rmaps_base.cpus_per_rank;
-                jdata->map->stride = orte_rmaps_base.stride;
-                jdata->map->oversubscribe = orte_rmaps_base.oversubscribe;
-                jdata->map->display_map = orte_rmaps_base.display_map;
             }
-            if( local_bynode ) {
-                jdata->map->policy = ORTE_MAPPING_BYNODE;
-            }
-            else {
-                jdata->map->policy = ORTE_MAPPING_BYSLOT;
+            if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
+                return OMPI_ERROR;
             }
+            jdata->map->mapping |= ORTE_MAPPING_PPR;
+            jdata->map->ppr = strdup("1:n");
         }
ompi_info_get (array_of_info[i], "ppr", sizeof(slot_list) - 1, slot_list, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
return OMPI_ERROR;
}
jdata->map->mapping |= ORTE_MAPPING_PPR;
jdata->map->ppr = strdup(slot_list);
}
/* check for 'map_byxxx' */
ompi_info_get_bool(array_of_info[i], "map_by_node", &local_bynode, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
return OMPI_ERROR;
}
jdata->map->mapping |= ORTE_MAPPING_BYNODE;
}
#if OPAL_HAVE_HWLOC
ompi_info_get_bool(array_of_info[i], "map_by_board", &local_bynode, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
return OMPI_ERROR;
}
jdata->map->mapping |= ORTE_MAPPING_BYBOARD;
}
ompi_info_get_bool(array_of_info[i], "map_by_numa", &local_bynode, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
return OMPI_ERROR;
}
jdata->map->mapping |= ORTE_MAPPING_BYNUMA;
}
ompi_info_get_bool(array_of_info[i], "map_by_socket", &local_bynode, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
return OMPI_ERROR;
}
jdata->map->mapping |= ORTE_MAPPING_BYSOCKET;
}
ompi_info_get_bool(array_of_info[i], "map_by_l3cache", &local_bynode, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
return OMPI_ERROR;
}
jdata->map->mapping |= ORTE_MAPPING_BYL3CACHE;
}
ompi_info_get_bool(array_of_info[i], "map_by_l2cache", &local_bynode, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
return OMPI_ERROR;
}
jdata->map->mapping |= ORTE_MAPPING_BYL2CACHE;
}
ompi_info_get_bool(array_of_info[i], "map_by_l1cache", &local_bynode, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
return OMPI_ERROR;
}
jdata->map->mapping |= ORTE_MAPPING_BYL1CACHE;
}
ompi_info_get_bool(array_of_info[i], "map_by_core", &local_bynode, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
return OMPI_ERROR;
}
jdata->map->mapping |= ORTE_MAPPING_BYCORE;
}
ompi_info_get_bool(array_of_info[i], "map_by_hwthread", &local_bynode, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
return OMPI_ERROR;
}
jdata->map->mapping |= ORTE_MAPPING_BYHWTHREAD;
}
#endif
/* check for 'rank_byxxx' */
ompi_info_get_bool(array_of_info[i], "rank_by_node", &local_bynode, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
if (0 != jdata->map->ranking) {
return OMPI_ERROR;
}
jdata->map->ranking = ORTE_RANK_BY_NODE;
}
#if OPAL_HAVE_HWLOC
ompi_info_get_bool(array_of_info[i], "rank_by_board", &local_bynode, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
if (0 != jdata->map->ranking) {
return OMPI_ERROR;
}
jdata->map->ranking = ORTE_RANK_BY_BOARD;
}
ompi_info_get_bool(array_of_info[i], "rank_by_numa", &local_bynode, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
if (0 != jdata->map->ranking) {
return OMPI_ERROR;
}
jdata->map->ranking = ORTE_RANK_BY_NUMA;
}
ompi_info_get_bool(array_of_info[i], "rank_by_socket", &local_bynode, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
if (0 != jdata->map->ranking) {
return OMPI_ERROR;
}
jdata->map->ranking = ORTE_RANK_BY_SOCKET;
}
ompi_info_get_bool(array_of_info[i], "rank_by_l3cache", &local_bynode, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
if (0 != jdata->map->ranking) {
return OMPI_ERROR;
}
jdata->map->ranking = ORTE_RANK_BY_L3CACHE;
}
ompi_info_get_bool(array_of_info[i], "rank_by_l2cache", &local_bynode, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
if (0 != jdata->map->ranking) {
return OMPI_ERROR;
}
jdata->map->ranking = ORTE_RANK_BY_L2CACHE;
}
ompi_info_get_bool(array_of_info[i], "rank_by_l1cache", &local_bynode, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
if (0 != jdata->map->ranking) {
return OMPI_ERROR;
}
jdata->map->ranking = ORTE_RANK_BY_L1CACHE;
}
ompi_info_get_bool(array_of_info[i], "rank_by_core", &local_bynode, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
if (0 != jdata->map->ranking) {
return OMPI_ERROR;
}
jdata->map->ranking = ORTE_RANK_BY_CORE;
}
ompi_info_get_bool(array_of_info[i], "rank_by_hwthread", &local_bynode, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
if (0 != jdata->map->ranking) {
return OMPI_ERROR;
}
jdata->map->ranking = ORTE_RANK_BY_HWTHREAD;
}
/* check for 'bind_toxxx' */
ompi_info_get_bool(array_of_info[i], "bind_if_supported", &local_bynode, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
jdata->map->binding |= OPAL_BIND_IF_SUPPORTED;
}
ompi_info_get_bool(array_of_info[i], "bind_overload_allowed", &local_bynode, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
jdata->map->binding |= OPAL_BIND_ALLOW_OVERLOAD;
}
ompi_info_get_bool(array_of_info[i], "bind_to_none", &local_bynode, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
return OMPI_ERROR;
}
jdata->map->binding |= OPAL_BIND_TO_NONE;
}
ompi_info_get_bool(array_of_info[i], "bind_to_board", &local_bynode, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
return OMPI_ERROR;
}
jdata->map->binding |= OPAL_BIND_TO_BOARD;
}
ompi_info_get_bool(array_of_info[i], "bind_to_numa", &local_bynode, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
return OMPI_ERROR;
}
jdata->map->binding |= OPAL_BIND_TO_NUMA;
}
ompi_info_get_bool(array_of_info[i], "bind_to_socket", &local_bynode, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
return OMPI_ERROR;
}
jdata->map->binding |= OPAL_BIND_TO_SOCKET;
}
ompi_info_get_bool(array_of_info[i], "bind_to_l3cache", &local_bynode, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
return OMPI_ERROR;
}
jdata->map->binding |= OPAL_BIND_TO_L3CACHE;
}
ompi_info_get_bool(array_of_info[i], "bind_to_l2cache", &local_bynode, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
return OMPI_ERROR;
}
jdata->map->binding |= OPAL_BIND_TO_L2CACHE;
}
ompi_info_get_bool(array_of_info[i], "bind_to_l1cache", &local_bynode, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
return OMPI_ERROR;
}
jdata->map->binding |= OPAL_BIND_TO_L1CACHE;
}
ompi_info_get_bool(array_of_info[i], "bind_to_core", &local_bynode, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
return OMPI_ERROR;
}
jdata->map->binding |= OPAL_BIND_TO_CORE;
}
ompi_info_get_bool(array_of_info[i], "bind_to_hwthread", &local_bynode, &flag);
if ( flag ) {
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
if (NULL == jdata->map) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
return OMPI_ERROR;
}
jdata->map->binding |= OPAL_BIND_TO_HWTHREAD;
}
#endif
         /* check for 'preload_binary' */
         ompi_info_get_bool(array_of_info[i], "ompi_preload_binary", &local_spawn, &flag);

View File

@@ -287,14 +287,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
     int param, value;
     struct timeval ompistart, ompistop;
     char *event_val = NULL;
-    opal_paffinity_base_cpu_set_t mask;
-    bool proc_bound;
-#if 0
-    /* see comment below about sched_yield */
-    int num_processors;
-#endif
     bool orte_setup = false;
-    bool paffinity_enabled = false;
     /* bitflag of the thread level support provided. To be used
      * for the modex in order to work in heterogeneous environments. */
@@ -371,6 +364,18 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
         gettimeofday(&ompistart, NULL);
     }
+#if OPAL_HAVE_HWLOC
+    /* if hwloc is available but didn't get setup for some
+     * reason, do so now
+     */
+    if (NULL == opal_hwloc_topology) {
+        if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
+            error = "Topology init";
+            goto error;
+        }
+    }
+#endif
     /* Register errhandler callback with orte errmgr */
     if (NULL != orte_errmgr.set_fault_callback) {
         orte_errmgr.set_fault_callback(ompi_errhandler_runtime_callback);
@@ -412,17 +417,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
         goto error;
     }
-#if OPAL_HAVE_HWLOC
-    /* If orte_init() didn't fill in opal_hwloc_topology, then we need
-       to go fill it in ourselves. */
-    if (NULL == opal_hwloc_topology) {
-        if (0 != hwloc_topology_init(&opal_hwloc_topology) ||
-            0 != hwloc_topology_load(opal_hwloc_topology)) {
-            return OPAL_ERR_NOT_SUPPORTED;
-        }
-    }
-#endif
     /* Once we've joined the RTE, see if any MCA parameters were
        passed to the MPI level */
@@ -442,106 +436,217 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
     }
 #endif
-    /* if it hasn't already been done, setup process affinity.
-     * First check to see if a slot list was
-     * specified. If so, use it. If no slot list was specified,
-     * that's not an error -- just fall through and try the next
-     * paffinity scheme.
-     */
-    ret = opal_paffinity_base_get(&mask);
-    if (OPAL_SUCCESS == ret) {
-        /* paffinity is supported - check for binding */
-        OPAL_PAFFINITY_PROCESS_IS_BOUND(mask, &proc_bound);
-        if (proc_bound || opal_paffinity_base_bound) {
-            /* someone external set it - indicate it is set
-             * so that we know
-             */
-            paffinity_enabled = true;
-        } else {
-            /* the system is capable of doing processor affinity, but it
-             * has not yet been set - see if a slot_list was given
-             */
-            if (NULL != opal_paffinity_base_slot_list) {
-                /* It's an error if multiple paffinity schemes were specified */
-                if (opal_paffinity_alone) {
-                    ret = OMPI_ERR_BAD_PARAM;
-                    error = "Multiple processor affinity schemes specified (can only specify one)";
-                    goto error;
-                }
-                ret = opal_paffinity_base_slot_list_set((long)ORTE_PROC_MY_NAME->vpid, opal_paffinity_base_slot_list, &mask);
-                if (OPAL_SUCCESS != ret && OPAL_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret)) {
-                    error = "opal_paffinity_base_slot_list_set() returned an error";
-                    goto error;
-                }
-#if !ORTE_DISABLE_FULL_SUPPORT
-                /* print out a warning if result is no-op, if not suppressed */
-                OPAL_PAFFINITY_PROCESS_IS_BOUND(mask, &proc_bound);
-                if (!proc_bound && orte_odls_base.warn_if_not_bound) {
-                    orte_show_help("help-orte-odls-base.txt",
-                                   "orte-odls-base:warn-not-bound",
-                                   true, "slot-list",
-                                   "Request resulted in binding to all available processors",
-                                   orte_process_info.nodename,
-                                   "bind-to-slot-list", opal_paffinity_base_slot_list, argv[0]);
-                }
-#endif
-                paffinity_enabled = true;
-            } else if (opal_paffinity_alone) {
-                /* no slot_list, but they asked for paffinity */
-                int phys_cpu;
-                orte_node_rank_t nrank;
-                if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME))) {
-                    /* this is okay - we probably were direct-launched, which means
-                     * we won't get our node rank until the modex. So just ignore
-                     */
-                    goto MOVEON;
+#if OPAL_HAVE_HWLOC
+    {
+        hwloc_obj_t node, obj;
+        hwloc_cpuset_t cpus, nodeset;
+        bool paffinity_enabled=false;
+        orte_node_rank_t nrank;
+        hwloc_obj_type_t target;
+        unsigned cache_level;
+        struct hwloc_topology_support *support;
+        /* see if we were bound when launched */
+        if (NULL == getenv("OMPI_MCA_opal_bound_at_launch")) {
+            /* we were not bound at launch */
+            if (NULL != opal_hwloc_topology) {
+                support = (struct hwloc_topology_support*)hwloc_topology_get_support(opal_hwloc_topology);
+                /* get our node object */
+                node = hwloc_get_root_obj(opal_hwloc_topology);
+                nodeset = hwloc_bitmap_alloc();
+                hwloc_bitmap_and(nodeset, node->online_cpuset, node->allowed_cpuset);
+                /* get our cpuset */
+                cpus = hwloc_bitmap_alloc();
+                hwloc_get_cpubind(opal_hwloc_topology, cpus, HWLOC_CPUBIND_PROCESS);
+                /* we are bound if the two cpusets are not equal */
+                if (0 != hwloc_bitmap_compare(cpus, nodeset)) {
+                    /* someone external set it - indicate it is set
+                     * so that we know
+                     */
+                    paffinity_enabled = true;
hwloc_bitmap_free(nodeset);
hwloc_bitmap_free(cpus);
} else if (support->cpubind->set_thisproc_cpubind &&
OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy) &&
OPAL_BIND_TO_NONE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
/* the system is capable of doing processor affinity, but it
* has not yet been set - see if a slot_list was given
*/
hwloc_bitmap_zero(cpus);
if (OPAL_BIND_TO_CPUSET == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
if (ORTE_SUCCESS != (ret = opal_hwloc_base_slot_list_parse(opal_hwloc_base_slot_list,
opal_hwloc_topology, cpus))) {
error = "Setting processor affinity failed";
hwloc_bitmap_free(nodeset);
hwloc_bitmap_free(cpus);
goto error;
}
if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
error = "Setting processor affinity failed";
hwloc_bitmap_free(nodeset);
hwloc_bitmap_free(cpus);
goto error;
}
/* try to find a level and index for this location */
opal_hwloc_base_get_level_and_index(cpus, &orte_process_info.bind_level, &orte_process_info.bind_idx);
/* cleanup */
hwloc_bitmap_free(nodeset);
hwloc_bitmap_free(cpus);
paffinity_enabled = true;
} else {
/* cleanup */
hwloc_bitmap_free(nodeset);
hwloc_bitmap_free(cpus);
/* get the node rank */
if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME))) {
/* this is not an error - could be due to being
* direct launched - so just ignore and leave
* us unbound
*/
goto MOVEON;
}
/* if the binding policy is hwthread, then we bind to the nrank-th
* hwthread on this node
*/
if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_PU,
0, nrank, OPAL_HWLOC_LOGICAL))) {
ret = OMPI_ERR_NOT_FOUND;
error = "Getting hwthread object";
goto error;
}
cpus = hwloc_bitmap_alloc();
hwloc_bitmap_and(cpus, obj->online_cpuset, obj->allowed_cpuset);
if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
ret = OMPI_ERROR;
error = "Setting processor affinity failed";
hwloc_bitmap_free(cpus);
goto error;
}
hwloc_bitmap_free(cpus);
orte_process_info.bind_level = OPAL_HWLOC_L1CACHE_LEVEL;
orte_process_info.bind_idx = nrank;
} else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
/* if the binding policy is core, then we bind to the nrank-th
* core on this node
*/
if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_CORE,
0, nrank, OPAL_HWLOC_LOGICAL))) {
ret = OMPI_ERR_NOT_FOUND;
error = "Getting core object";
goto error;
}
cpus = hwloc_bitmap_alloc();
hwloc_bitmap_and(cpus, obj->online_cpuset, obj->allowed_cpuset);
if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
error = "Setting processor affinity failed";
hwloc_bitmap_free(cpus);
ret = OMPI_ERROR;
goto error;
}
hwloc_bitmap_free(cpus);
orte_process_info.bind_level = OPAL_HWLOC_CORE_LEVEL;
orte_process_info.bind_idx = nrank;
} else {
/* for all higher binding policies, we bind to the specified
* object that the nrank-th core belongs to
*/
if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_CORE,
0, nrank, OPAL_HWLOC_LOGICAL))) {
ret = OMPI_ERR_NOT_FOUND;
error = "Getting core object";
goto error;
}
if (OPAL_BIND_TO_L1CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
target = HWLOC_OBJ_CACHE;
cache_level = 1;
orte_process_info.bind_level = OPAL_HWLOC_L1CACHE_LEVEL;
} else if (OPAL_BIND_TO_L2CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
target = HWLOC_OBJ_CACHE;
cache_level = 2;
orte_process_info.bind_level = OPAL_HWLOC_L2CACHE_LEVEL;
} else if (OPAL_BIND_TO_L3CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
target = HWLOC_OBJ_CACHE;
cache_level = 3;
orte_process_info.bind_level = OPAL_HWLOC_L3CACHE_LEVEL;
} else if (OPAL_BIND_TO_SOCKET == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
target = HWLOC_OBJ_SOCKET;
orte_process_info.bind_level = OPAL_HWLOC_SOCKET_LEVEL;
} else if (OPAL_BIND_TO_NUMA == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
target = HWLOC_OBJ_NODE;
orte_process_info.bind_level = OPAL_HWLOC_NUMA_LEVEL;
} else {
ret = OMPI_ERR_NOT_FOUND;
error = "Binding policy not known";
goto error;
}
for (obj = obj->parent; NULL != obj; obj = obj->parent) {
if (target == obj->type) {
if (HWLOC_OBJ_CACHE == target && cache_level != obj->attr->cache.depth) {
continue;
}
/* this is the place! */
cpus = hwloc_bitmap_alloc();
hwloc_bitmap_and(cpus, obj->online_cpuset, obj->allowed_cpuset);
if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
ret = OMPI_ERROR;
error = "Setting processor affinity failed";
hwloc_bitmap_free(cpus);
goto error;
}
hwloc_bitmap_free(cpus);
orte_process_info.bind_idx = opal_hwloc_base_get_obj_idx(opal_hwloc_topology,
obj, OPAL_HWLOC_LOGICAL);
paffinity_enabled = true;
break;
}
}
if (!paffinity_enabled) {
ret = OMPI_ERROR;
error = "Setting processor affinity failed";
goto error;
}
}
paffinity_enabled = true;
}
         }
-                OPAL_PAFFINITY_CPU_ZERO(mask);
-                ret = opal_paffinity_base_get_physical_processor_id(nrank, &phys_cpu);
-                if (OPAL_SUCCESS != ret) {
-                    error = "Could not get physical processor id - cannot set processor affinity";
-                    goto error;
+        /* If we were able to set processor affinity, try setting up
+           memory affinity */
+        if (!opal_maffinity_setup && paffinity_enabled) {
+            if (OPAL_SUCCESS == opal_maffinity_base_open() &&
+                OPAL_SUCCESS == opal_maffinity_base_select()) {
+                opal_maffinity_setup = true;
+            }
         }
-                OPAL_PAFFINITY_CPU_SET(phys_cpu, mask);
-                ret = opal_paffinity_base_set(mask);
-                if (OPAL_SUCCESS != ret) {
-                    error = "Setting processor affinity failed";
-                    goto error;
-                }
-#if !ORTE_DISABLE_FULL_SUPPORT
-                /* print out a warning if result is no-op, if not suppressed */
-                OPAL_PAFFINITY_PROCESS_IS_BOUND(mask, &proc_bound);
-                if (!proc_bound && orte_odls_base.warn_if_not_bound) {
-                    orte_show_help("help-orte-odls-base.txt",
-                                   "orte-odls-base:warn-not-bound",
-                                   true, "cpu",
-                                   "Request resulted in binding to all available processors",
-                                   orte_process_info.nodename,
-                                   "[opal|mpi]_paffinity_alone set non-zero", "n/a", argv[0]);
-                }
-#endif
-                paffinity_enabled = true;
             }
         }
     }
 MOVEON:
-#if OPAL_HAVE_HWLOC
     /* get or update our local cpuset - it will get used multiple
      * times, so it's more efficient to keep a global copy
      */
     opal_hwloc_base_get_local_cpuset();
-#endif
-    /* If we were able to set processor affinity, try setting up
-       memory affinity */
-    if (!opal_maffinity_setup && paffinity_enabled) {
-        if (OPAL_SUCCESS == opal_maffinity_base_open() &&
-            OPAL_SUCCESS == opal_maffinity_base_select()) {
-            opal_maffinity_setup = true;
+    /* report bindings, if requested */
+    if (opal_hwloc_report_bindings) {
+        char bindings[64];
+        hwloc_obj_t root;
+        hwloc_cpuset_t cpus;
+        /* get the root object for this node */
+        root = hwloc_get_root_obj(opal_hwloc_topology);
+        cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, root);
+        if (0 == hwloc_bitmap_compare(cpus, opal_hwloc_my_cpuset)) {
+            opal_output(0, "%s is not bound",
+                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
+        } else {
+            hwloc_bitmap_list_snprintf(bindings, 64, opal_hwloc_my_cpuset);
+            opal_output(0, "%s is bound to cpus %s",
+                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                        bindings);
         }
     }
+#endif
/* initialize datatypes. This step should be done early as it will /* initialize datatypes. This step should be done early as it will
* create the local convertor and local arch used in the proc * create the local convertor and local arch used in the proc
* init. * init.
@@ -649,7 +754,7 @@ MOVEON:
     if (OMPI_SUCCESS !=
         (ret = ompi_osc_base_find_available(OMPI_ENABLE_PROGRESS_THREADS,
                                             OMPI_ENABLE_THREAD_MULTIPLE))) {
         error = "ompi_osc_base_find_available() failed";
         goto error;
     }
@@ -801,16 +906,16 @@
      * Dump all MCA parameters if requested
      */
     if (ompi_mpi_show_mca_params) {
        ompi_show_all_mca_params(ompi_mpi_comm_world.comm.c_my_rank,
                                 nprocs,
                                 orte_process_info.nodename);
     }
     /* Do we need to wait for a debugger? */
     ompi_wait_for_debugger();
     /* check for timing request - get stop time and report elapsed
        time if so, then start the clock again */
     if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
         gettimeofday(&ompistop, NULL);
         opal_output(0, "ompi_mpi_init[%ld]: time from modex to first barrier %ld usec",

View File

@@ -68,7 +68,8 @@ enum {
     OPAL_ERR_INVALID_PHYS_CPU = (OPAL_ERR_BASE - 39),
     OPAL_ERR_MULTIPLE_AFFINITIES = (OPAL_ERR_BASE - 40),
     OPAL_ERR_SLOT_LIST_RANGE = (OPAL_ERR_BASE - 41),
-    OPAL_ERR_NETWORK_NOT_PARSEABLE = (OPAL_ERR_BASE - 42)
+    OPAL_ERR_NETWORK_NOT_PARSEABLE = (OPAL_ERR_BASE - 42),
+    OPAL_ERR_SILENT = (OPAL_ERR_BASE - 43)
 };
 #define OPAL_ERR_MAX (OPAL_ERR_BASE - 100)

View File

@@ -9,6 +9,8 @@
 EXTRA_DIST = base/.windows
+dist_pkgdata_DATA = base/help-opal-hwloc-base.txt
 headers += \
         base/base.h

View File

@@ -77,7 +77,111 @@ OPAL_DECLSPEC extern bool opal_hwloc_base_inited;
 OPAL_DECLSPEC extern bool opal_hwloc_topology_inited;
 #if OPAL_HAVE_HWLOC
+OPAL_DECLSPEC extern char *opal_hwloc_base_slot_list;
 OPAL_DECLSPEC extern char *opal_hwloc_base_cpu_set;
OPAL_DECLSPEC extern hwloc_cpuset_t opal_hwloc_base_given_cpus;
/**
* Report a bind failure using the normal mechanisms if a component
* fails to bind memory -- according to the value of the
* hwloc_base_bind_failure_action MCA parameter.
*/
OPAL_DECLSPEC int opal_hwloc_base_report_bind_failure(const char *file,
int line,
const char *msg,
int rc);
OPAL_DECLSPEC opal_paffinity_locality_t opal_hwloc_base_get_relative_locality(hwloc_topology_t topo,
opal_hwloc_level_t level1,
unsigned int peer1,
opal_hwloc_level_t level2,
unsigned int peer2);
OPAL_DECLSPEC void opal_hwloc_base_get_local_cpuset(void);
/**
* Enum for what memory allocation policy we want for user allocations.
* MAP = memory allocation policy.
*/
typedef enum {
OPAL_HWLOC_BASE_MAP_NONE,
OPAL_HWLOC_BASE_MAP_LOCAL_ONLY
} opal_hwloc_base_map_t;
/**
* Global reflecting the MAP (set by MCA param).
*/
OPAL_DECLSPEC extern opal_hwloc_base_map_t opal_hwloc_base_map;
/**
* Enum for what to do if the hwloc framework tries to bind memory
* and fails. BFA = bind failure action.
*/
typedef enum {
OPAL_HWLOC_BASE_MBFA_SILENT,
OPAL_HWLOC_BASE_MBFA_WARN,
OPAL_HWLOC_BASE_MBFA_ERROR
} opal_hwloc_base_mbfa_t;
/**
* Global reflecting the BFA (set by MCA param).
*/
OPAL_DECLSPEC extern opal_hwloc_base_mbfa_t opal_hwloc_base_mbfa;
/* some critical helper functions */
OPAL_DECLSPEC int opal_hwloc_base_filter_cpus(hwloc_topology_t topo);
OPAL_DECLSPEC int opal_hwloc_base_get_topology(void);
OPAL_DECLSPEC void opal_hwloc_base_free_topology(hwloc_topology_t topo);
OPAL_DECLSPEC hwloc_cpuset_t opal_hwloc_base_get_available_cpus(hwloc_topology_t topo,
hwloc_obj_t obj);
OPAL_DECLSPEC unsigned int opal_hwloc_base_get_nbobjs_by_type(hwloc_topology_t topo,
hwloc_obj_type_t target,
unsigned cache_level,
opal_hwloc_resource_type_t rtype);
OPAL_DECLSPEC hwloc_obj_t opal_hwloc_base_get_obj_by_type(hwloc_topology_t topo,
hwloc_obj_type_t target,
unsigned cache_level,
unsigned int instance,
opal_hwloc_resource_type_t rtype);
OPAL_DECLSPEC unsigned int opal_hwloc_base_get_obj_idx(hwloc_topology_t topo,
hwloc_obj_t obj,
opal_hwloc_resource_type_t rtype);
OPAL_DECLSPEC void opal_hwloc_base_get_level_and_index(hwloc_cpuset_t cpus,
opal_hwloc_level_t *bind_level,
unsigned int *bind_idx);
OPAL_DECLSPEC unsigned int opal_hwloc_base_get_npus(hwloc_topology_t topo,
hwloc_obj_t target);
OPAL_DECLSPEC char* opal_hwloc_base_print_binding(opal_binding_policy_t binding);
OPAL_DECLSPEC char* opal_hwloc_base_print_locality(opal_paffinity_locality_t locality);
OPAL_DECLSPEC char* opal_hwloc_base_print_level(opal_hwloc_level_t level);
/**
* Provide a utility to parse a slot list against the local
* logical cpus, and produce a cpuset for the described binding
*/
OPAL_DECLSPEC int opal_hwloc_base_slot_list_parse(const char *slot_str,
hwloc_topology_t topo,
hwloc_cpuset_t cpumask);
/**
* Report a bind failure using the normal mechanisms if a component
* fails to bind memory -- according to the value of the
* hwloc_base_bind_failure_action MCA parameter.
*/
OPAL_DECLSPEC int opal_hwloc_base_report_bind_failure(const char *file,
int line,
const char *msg,
int rc);
/**
* This function sets the process-wide memory affinity policy
* according to opal_hwloc_base_map and opal_hwloc_base_mbfa. It needs
* to be a separate, standalone function (as opposed to being done
* during opal_hwloc_base_open()) because opal_hwloc_topology is not
* loaded by opal_hwloc_base_open(). Hence, an upper layer needs to
* invoke this function after opal_hwloc_topology has been loaded.
*/
OPAL_DECLSPEC int opal_hwloc_base_set_process_membind_policy(void);
 /* datatype support */
 OPAL_DECLSPEC int opal_hwloc_pack(opal_buffer_t *buffer, const void *src,
@@ -100,80 +204,8 @@ OPAL_DECLSPEC int opal_hwloc_size(size_t *size,
                                   opal_data_type_t type);
 OPAL_DECLSPEC void opal_hwloc_release(opal_dss_value_t *value);
/**
* Report a bind failure using the normal mechanisms if a component
* fails to bind memory -- according to the value of the
* hwloc_base_bind_failure_action MCA parameter.
*/
OPAL_DECLSPEC int opal_hwloc_base_report_bind_failure(const char *file,
int line,
const char *msg,
int rc);
OPAL_DECLSPEC opal_paffinity_locality_t opal_hwloc_base_get_relative_locality(hwloc_topology_t topo,
hwloc_cpuset_t peer1,
hwloc_cpuset_t peer2);
OPAL_DECLSPEC void opal_hwloc_base_get_local_cpuset(void);
/* some critical helper functions */
OPAL_DECLSPEC int opal_hwloc_base_filter_cpus(hwloc_topology_t topo);
OPAL_DECLSPEC int opal_hwloc_base_get_topology(void);
OPAL_DECLSPEC void opal_hwloc_base_free_topology(hwloc_topology_t topo);
OPAL_DECLSPEC hwloc_cpuset_t opal_hwloc_base_get_available_cpus(hwloc_topology_t topo,
hwloc_obj_t obj);
OPAL_DECLSPEC unsigned int opal_hwloc_base_get_nbobjs_by_type(hwloc_topology_t topo,
hwloc_obj_type_t target,
unsigned cache_level,
opal_hwloc_resource_type_t rtype);
OPAL_DECLSPEC hwloc_obj_t opal_hwloc_base_get_obj_by_type(hwloc_topology_t topo,
hwloc_obj_type_t target,
unsigned cache_level,
unsigned int instance,
opal_hwloc_resource_type_t rtype);
OPAL_DECLSPEC unsigned int opal_hwloc_base_get_npus(hwloc_topology_t topo,
hwloc_obj_t target);
 #endif
/**
* Enum for what memory allocation policy we want for user allocations.
* MAP = memory allocation policy.
*/
typedef enum {
OPAL_HWLOC_BASE_MAP_NONE,
OPAL_HWLOC_BASE_MAP_LOCAL_ONLY
} opal_hwloc_base_map_t;
/**
* Global reflecting the MAP (set by MCA param).
*/
OPAL_DECLSPEC extern opal_hwloc_base_map_t opal_hwloc_base_map;
/**
* Enum for what to do if the hwloc framework tries to bind memory
* and fails. BFA = bind failure action.
*/
typedef enum {
OPAL_HWLOC_BASE_MBFA_WARN,
OPAL_HWLOC_BASE_MBFA_ERROR
} opal_hwloc_base_mbfa_t;
/**
* Global reflecting the BFA (set by MCA param).
*/
OPAL_DECLSPEC extern opal_hwloc_base_mbfa_t opal_hwloc_base_mbfa;
/**
* This function sets the process-wide memory affinity policy
* according to opal_hwloc_base_map and opal_hwloc_base_mbfa. It needs
* to be a separate, standalone function (as opposed to being done
* during opal_hwloc_base_open()) because opal_hwloc_topology is not
* loaded by opal_hwloc_base_open(). Hence, an upper layer needs to
* invoke this function after opal_hwloc_topology has been loaded.
*/
OPAL_DECLSPEC int opal_hwloc_base_set_process_membind_policy(void);
 END_C_DECLS
 #endif /* OPAL_HWLOC_BASE_H */
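To make the intent of the helper declarations above concrete, here is a hypothetical sketch that binds the calling process to its nrank-th core, mirroring what ompi_mpi_init does elsewhere in this commit. The include paths and the assumption that opal_hwloc_topology has already been loaded (e.g. via opal_hwloc_base_get_topology()) are mine, not part of the change.

/* hypothetical sketch - not part of this commit */
#include "opal_config.h"
#include "opal/constants.h"
#include "opal/mca/hwloc/hwloc.h"
#include "opal/mca/hwloc/base/base.h"

static int bind_to_nth_core(unsigned int nrank)
{
    hwloc_obj_t core;
    hwloc_cpuset_t cpus;
    unsigned int ncores;

    /* count the logical cores known to the loaded topology */
    ncores = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology,
                                                HWLOC_OBJ_CORE, 0,
                                                OPAL_HWLOC_LOGICAL);
    if (0 == ncores || ncores <= nrank) {
        return OPAL_ERR_NOT_FOUND;
    }

    /* fetch the nrank-th core and the cpus available under it */
    core = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology,
                                           HWLOC_OBJ_CORE, 0,
                                           nrank, OPAL_HWLOC_LOGICAL);
    if (NULL == core) {
        return OPAL_ERR_NOT_FOUND;
    }
    cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, core);

    /* apply the binding */
    if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
        return OPAL_ERROR;
    }
    return OPAL_SUCCESS;
}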

View File

@@ -39,3 +39,25 @@ message will only be reported at most once per process.
 File: %s:%d
 Message: %s
 Severity: %s
#
[unrecognized-policy]
The specified %s policy is not recognized:
Policy: %s
Please check for a typo or ensure that the option is a supported
one.
#
[logical-cpu-not-found]
A specified logical processor does not exist in this topology:
Cpu set given: %s
#
[redefining-policy]
Conflicting directives for binding policy are causing the policy
to be redefined:
New policy: %s
Prior policy: %s
Please check that only one policy is defined.

View File

@@ -12,6 +12,7 @@
 #include "opal/constants.h"
 #include "opal/dss/dss.h"
+#include "opal/util/argv.h"
 #include "opal/util/output.h"
 #include "opal/util/show_help.h"
 #include "opal/mca/mca.h"
@@ -39,7 +40,13 @@ bool opal_hwloc_base_inited = false;
 #if OPAL_HAVE_HWLOC
 hwloc_topology_t opal_hwloc_topology=NULL;
 hwloc_cpuset_t opal_hwloc_my_cpuset=NULL;
+hwloc_cpuset_t opal_hwloc_base_given_cpus=NULL;
+opal_hwloc_base_map_t opal_hwloc_base_map = OPAL_HWLOC_BASE_MAP_NONE;
+opal_hwloc_base_mbfa_t opal_hwloc_base_mbfa = OPAL_HWLOC_BASE_MBFA_WARN;
+opal_binding_policy_t opal_hwloc_binding_policy=0;
+char *opal_hwloc_base_slot_list=NULL;
 char *opal_hwloc_base_cpu_set=NULL;
+bool opal_hwloc_report_bindings=false;
 hwloc_obj_type_t opal_hwloc_levels[] = {
     HWLOC_OBJ_MACHINE,
     HWLOC_OBJ_NODE,
@@ -50,10 +57,8 @@ hwloc_obj_type_t opal_hwloc_levels[] = {
     HWLOC_OBJ_CORE,
     HWLOC_OBJ_PU
 };
+bool opal_hwloc_use_hwthreads_as_cpus = false;
 #endif
-opal_hwloc_base_map_t opal_hwloc_base_map = OPAL_HWLOC_BASE_MAP_NONE;
-opal_hwloc_base_mbfa_t opal_hwloc_base_mbfa = OPAL_HWLOC_BASE_MBFA_ERROR;
 int opal_hwloc_base_open(void)
@@ -65,10 +70,11 @@ int opal_hwloc_base_open(void)
 #if OPAL_HAVE_HWLOC
     {
-        int value;
+        int value, i;
         opal_data_type_t tmp;
         char *str_value;
+        char **tmpvals, **quals;
         /* Debugging / verbose output */
         mca_base_param_reg_int_name("hwloc", "base_verbose",
                                     "Verbosity level of the hwloc framework",
@@ -102,12 +108,16 @@ int opal_hwloc_base_open(void)
             gethostname(hostname, sizeof(hostname));
             opal_show_help("help-opal-hwloc-base.txt", "invalid policy",
                            true, hostname, getpid(), str_value);
+            free(str_value);
             return OPAL_ERR_BAD_PARAM;
         }
         free(str_value);
         /* hwloc_base_bind_failure_action */
         switch (opal_hwloc_base_mbfa) {
+        case OPAL_HWLOC_BASE_MBFA_SILENT:
+            str_value = "silent";
+            break;
         case OPAL_HWLOC_BASE_MBFA_WARN:
             str_value = "warn";
             break;
@@ -116,9 +126,11 @@ int opal_hwloc_base_open(void)
             break;
         }
         mca_base_param_reg_string_name("hwloc", "base_mem_bind_failure_action",
-                                       "What Open MPI will do if it explicitly tries to bind memory to a specific NUMA location, and fails. Note that this is a different case than the general allocation policy described by hwloc_base_alloc_policy. A value of \"warn\" means that Open MPI will warn the first time this happens, but allow the job to continue (possibly with degraded performance). A value of \"error\" means that Open MPI will abort the job if this happens.",
+                                       "What Open MPI will do if it explicitly tries to bind memory to a specific NUMA location, and fails. Note that this is a different case than the general allocation policy described by hwloc_base_alloc_policy. A value of \"silent\" means that Open MPI will proceed without comment. A value of \"warn\" means that Open MPI will warn the first time this happens, but allow the job to continue (possibly with degraded performance). A value of \"error\" means that Open MPI will abort the job if this happens.",
                                        false, false, str_value, &str_value);
-        if (strcasecmp(str_value, "warn") == 0) {
+        if (strcasecmp(str_value, "silent") == 0) {
+            opal_hwloc_base_mbfa = OPAL_HWLOC_BASE_MBFA_SILENT;
+        } else if (strcasecmp(str_value, "warn") == 0) {
             opal_hwloc_base_mbfa = OPAL_HWLOC_BASE_MBFA_WARN;
         } else if (strcasecmp(str_value, "error") == 0) {
             opal_hwloc_base_mbfa = OPAL_HWLOC_BASE_MBFA_ERROR;
@@ -127,14 +139,123 @@ int opal_hwloc_base_open(void)
             gethostname(hostname, sizeof(hostname));
             opal_show_help("help-opal-hwloc-base.txt", "invalid error action",
                            true, hostname, getpid(), str_value);
+            free(str_value);
             return OPAL_ERR_BAD_PARAM;
         }
         free(str_value);
/* binding specification */
mca_base_param_reg_string_name("hwloc", "base_binding_policy",
"Policy for binding processes [none (default) | hwthread | core | l1cache | l2cache | l3cache | socket | numa | board] (supported qualifiers: overload-allowed,if-supported)",
false, false, NULL, &str_value);
if (NULL == str_value) {
opal_hwloc_binding_policy = OPAL_BIND_TO_NONE;
/* mark that no binding policy was specified */
opal_hwloc_binding_policy &= ~OPAL_BIND_GIVEN;
} else if (0 == strncasecmp(str_value, "none", strlen("none"))) {
opal_hwloc_binding_policy = OPAL_BIND_TO_NONE;
opal_hwloc_binding_policy |= OPAL_BIND_GIVEN;
} else {
opal_hwloc_binding_policy |= OPAL_BIND_GIVEN;
tmpvals = opal_argv_split(str_value, ':');
if (1 < opal_argv_count(tmpvals)) {
quals = opal_argv_split(tmpvals[1], ',');
for (i=0; NULL != quals[i]; i++) {
if (0 == strcasecmp(quals[i], "if-supported")) {
opal_hwloc_binding_policy |= OPAL_BIND_IF_SUPPORTED;
} else if (0 == strcasecmp(quals[i], "overload-allowed")) {
opal_hwloc_binding_policy |= OPAL_BIND_ALLOW_OVERLOAD;
} else {
/* unknown option */
opal_output(0, "Unknown qualifier to orte_process_binding: %s", str_value);
return OPAL_ERR_BAD_PARAM;
}
}
opal_argv_free(quals);
}
if (0 == strcasecmp(tmpvals[0], "hwthread")) {
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_HWTHREAD);
} else if (0 == strcasecmp(tmpvals[0], "core")) {
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CORE);
} else if (0 == strcasecmp(tmpvals[0], "l1cache")) {
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_L1CACHE);
} else if (0 == strcasecmp(tmpvals[0], "l2cache")) {
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_L2CACHE);
} else if (0 == strcasecmp(tmpvals[0], "l3cache")) {
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_L3CACHE);
} else if (0 == strcasecmp(tmpvals[0], "socket")) {
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_SOCKET);
} else if (0 == strcasecmp(tmpvals[0], "numa")) {
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_NUMA);
} else if (0 == strcasecmp(tmpvals[0], "board")) {
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_BOARD);
} else {
opal_show_help("help-opal-hwloc-base.txt", "unrecognized-policy", true, "binding", str_value);
opal_argv_free(tmpvals);
free(str_value);
return OPAL_ERR_BAD_PARAM;
}
opal_argv_free(tmpvals);
}
free(str_value);
/* backward compatibility */
mca_base_param_reg_int_name("hwloc", "base_bind_to_core",
"Bind processes to cores",
false, false, (int)false, &value);
if (value) {
/* set binding policy to core - error if something else already set */
if (OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy) &&
OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy) != OPAL_BIND_TO_CORE) {
/* error - cannot redefine the default ranking policy */
opal_show_help("help-opal-hwloc-base.txt", "redefining-policy", true,
"core", opal_hwloc_base_print_binding(opal_hwloc_binding_policy));
return OPAL_ERR_SILENT;
}
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CORE);
opal_hwloc_binding_policy |= OPAL_BIND_GIVEN;
}
mca_base_param_reg_int_name("hwloc", "base_bind_to_socket",
"Bind processes to sockets",
false, false, (int)false, &value);
if (value) {
/* set binding policy to socket - error if something else already set */
if (OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy) &&
OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy) != OPAL_BIND_TO_SOCKET) {
/* error - cannot redefine the default ranking policy */
opal_show_help("help-opal-hwloc-base.txt", "redefining-policy", true,
"socket", opal_hwloc_base_print_binding(opal_hwloc_binding_policy));
return OPAL_ERR_SILENT;
}
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_SOCKET);
opal_hwloc_binding_policy |= OPAL_BIND_GIVEN;
}
mca_base_param_reg_int_name("hwloc", "base_report_bindings",
"Report bindings to stderr",
false, false, (int)false, &value);
opal_hwloc_report_bindings = OPAL_INT_TO_BOOL(value);
/* did the user provide a slot list? */
tmp = mca_base_param_reg_string_name("hwloc", "base_slot_list",
"List of processor IDs to bind processes to [default=NULL]",
false, false, NULL, &opal_hwloc_base_slot_list);
if (NULL != opal_hwloc_base_slot_list) {
/* if we already were given a policy, then this is an error */
if (OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) {
opal_show_help("help-opal-hwloc-base.txt", "redefining-policy", true,
"socket", opal_hwloc_base_print_binding(opal_hwloc_binding_policy));
return OPAL_ERR_SILENT;
}
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CPUSET);
opal_hwloc_binding_policy |= OPAL_BIND_GIVEN;
}
         /* cpu allocation specification */
         mca_base_param_reg_string_name("hwloc", "base_cpu_set",
                                        "Comma-separated list of ranges specifying logical cpus allocated to this job [default: none]",
                                        false, false, NULL, &opal_hwloc_base_cpu_set);
         /* to support tools such as ompi_info, add the components
          * to a list
@@ -147,6 +268,12 @@ int opal_hwloc_base_open(void)
             return OPAL_ERROR;
         }
+        /* declare hwthreads as independent cpus */
+        mca_base_param_reg_int_name("hwloc", "base_use_hwthreads_as_cpus",
+                                    "Use hardware threads as independent cpus",
+                                    false, false, (int)false, &value);
+        opal_hwloc_use_hwthreads_as_cpus = OPAL_INT_TO_BOOL(value);
         /* declare the hwloc data types */
         tmp = OPAL_HWLOC_TOPO;
         if (OPAL_SUCCESS != (value = opal_dss.register_type(opal_hwloc_pack,
@@ -171,6 +298,7 @@ static void obj_data_const(opal_hwloc_obj_data_t *ptr)
 {
     ptr->available = NULL;
     ptr->npus = 0;
+    ptr->idx = UINT_MAX;
 }
 static void obj_data_dest(opal_hwloc_obj_data_t *ptr)
 {

View File

@@ -34,7 +34,7 @@ int opal_hwloc_base_set_process_membind_policy(void)
     if (NULL == opal_hwloc_topology) {
         return OPAL_ERR_BAD_PARAM;
     }
     /* Set the default memory allocation policy according to MCA
        param */
     switch (opal_hwloc_base_map) {
@@ -54,10 +54,20 @@ int opal_hwloc_base_set_process_membind_policy(void)
         if (NULL == cpuset) {
             rc = OPAL_ERR_OUT_OF_RESOURCE;
         } else {
+            int e;
             hwloc_get_cpubind(opal_hwloc_topology, cpuset, 0);
             rc = hwloc_set_membind(opal_hwloc_topology,
-                                   cpuset, HWLOC_MEMBIND_BIND, flags);
+                                   cpuset, policy, flags);
+            e = errno;
             hwloc_bitmap_free(cpuset);
+            /* See if hwloc was able to do it. If hwloc failed due to
+               ENOSYS, but the base_map == NONE, then it's not really an
+               error. */
+            if (0 != rc && ENOSYS == e &&
+                OPAL_HWLOC_BASE_MAP_NONE == opal_hwloc_base_map) {
+                rc = 0;
+            }
         }
     return (0 == rc) ? OPAL_SUCCESS : OPAL_ERROR;

The diff for this file was not shown because it is too large.

View File

@@ -24,6 +24,9 @@
 #include <stdarg.h>
 #endif
+#include "opal/class/opal_list.h"
+#include "opal/class/opal_value_array.h"
 #include "opal/mca/mca.h"
 #include "opal/mca/base/base.h"
@@ -62,17 +65,13 @@ typedef struct opal_hwloc_base_component_2_0_0_t opal_hwloc_component_t;
 END_C_DECLS
-/* include implementation to call */
-#if OPAL_HAVE_HWLOC
-#include MCA_hwloc_IMPLEMENTATION_HEADER
 /* Define a hierarchical level value that
  * helps resolve the hwloc behavior of
  * treating caches as a single type of
- * entity
+ * entity - must always be available
  */
-typedef enum {
-    OPAL_HWLOC_NODE_LEVEL=1,
+typedef enum uint8_t {
+    OPAL_HWLOC_NODE_LEVEL=0,
     OPAL_HWLOC_NUMA_LEVEL,
     OPAL_HWLOC_SOCKET_LEVEL,
     OPAL_HWLOC_L3CACHE_LEVEL,
@@ -81,6 +80,12 @@ typedef enum {
     OPAL_HWLOC_CORE_LEVEL,
     OPAL_HWLOC_HWTHREAD_LEVEL
 } opal_hwloc_level_t;
+#define OPAL_HWLOC_LEVEL_T OPAL_UINT8
+/* include implementation to call */
+#if OPAL_HAVE_HWLOC
+#include MCA_hwloc_IMPLEMENTATION_HEADER
 /* define type of processor info requested */
 typedef uint8_t opal_hwloc_resource_type_t;
@@ -93,6 +98,7 @@ typedef struct {
     opal_object_t super;
     hwloc_cpuset_t available;
     unsigned int npus;
+    unsigned int idx;
 } opal_hwloc_obj_data_t;
 OBJ_CLASS_DECLARATION(opal_hwloc_obj_data_t);
@@ -112,9 +118,46 @@ typedef struct {
 } opal_hwloc_topo_data_t;
 OBJ_CLASS_DECLARATION(opal_hwloc_topo_data_t);
/* define binding policies */
typedef uint16_t opal_binding_policy_t;
#define OPAL_BINDING_POLICY OPAL_UINT16
/* binding directives */
#define OPAL_BIND_IF_SUPPORTED 0x1000
#define OPAL_BIND_ALLOW_OVERLOAD 0x2000
#define OPAL_BIND_GIVEN 0x4000
/* binding policies */
#define OPAL_BIND_TO_NONE 1
#define OPAL_BIND_TO_BOARD 2
#define OPAL_BIND_TO_NUMA 3
#define OPAL_BIND_TO_SOCKET 4
#define OPAL_BIND_TO_L3CACHE 5
#define OPAL_BIND_TO_L2CACHE 6
#define OPAL_BIND_TO_L1CACHE 7
#define OPAL_BIND_TO_CORE 8
#define OPAL_BIND_TO_HWTHREAD 9
#define OPAL_BIND_TO_CPUSET 10
#define OPAL_GET_BINDING_POLICY(pol) \
((pol) & 0x0fff)
#define OPAL_SET_BINDING_POLICY(target, pol) \
(target) = (pol) | ((target) & 0xf000)
/* check if policy is set */
#define OPAL_BINDING_POLICY_IS_SET(pol) \
((pol) & 0x4000)
/* macro to detect if binding was qualified */
#define OPAL_BINDING_REQUIRED(n) \
(!(OPAL_BIND_IF_SUPPORTED & (n)))
/* macro to detect if binding is forced */
#define OPAL_BIND_OVERLOAD_ALLOWED(n) \
(OPAL_BIND_ALLOW_OVERLOAD & (n))
/* some global values */
OPAL_DECLSPEC extern hwloc_topology_t opal_hwloc_topology; OPAL_DECLSPEC extern hwloc_topology_t opal_hwloc_topology;
OPAL_DECLSPEC extern opal_binding_policy_t opal_hwloc_binding_policy;
OPAL_DECLSPEC extern hwloc_cpuset_t opal_hwloc_my_cpuset; OPAL_DECLSPEC extern hwloc_cpuset_t opal_hwloc_my_cpuset;
OPAL_DECLSPEC extern bool opal_hwloc_report_bindings;
OPAL_DECLSPEC extern hwloc_obj_type_t opal_hwloc_levels[]; OPAL_DECLSPEC extern hwloc_obj_type_t opal_hwloc_levels[];
OPAL_DECLSPEC extern bool opal_hwloc_use_hwthreads_as_cpus;
#endif #endif
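A quick illustration of how that 16-bit field is meant to be used: the low 12 bits carry the binding policy itself, the high 4 bits carry the qualifier flags, and OPAL_SET_BINDING_POLICY deliberately preserves those flags. The snippet below is only a sketch for readers, not part of the commit; it repeats the macro definitions from the header above so it compiles on its own, and the way OPAL_BIND_GIVEN is OR'd in here is just one plausible usage.

#include <assert.h>
#include <stdint.h>

typedef uint16_t opal_binding_policy_t;
/* repeated from the header above */
#define OPAL_BIND_IF_SUPPORTED    0x1000
#define OPAL_BIND_ALLOW_OVERLOAD  0x2000
#define OPAL_BIND_GIVEN           0x4000
#define OPAL_BIND_TO_CORE         8
#define OPAL_GET_BINDING_POLICY(pol)         ((pol) & 0x0fff)
#define OPAL_SET_BINDING_POLICY(target, pol) (target) = (pol) | ((target) & 0xf000)
#define OPAL_BINDING_POLICY_IS_SET(pol)      ((pol) & 0x4000)
#define OPAL_BINDING_REQUIRED(n)             (!(OPAL_BIND_IF_SUPPORTED & (n)))
#define OPAL_BIND_OVERLOAD_ALLOWED(n)        (OPAL_BIND_ALLOW_OVERLOAD & (n))

int main(void)
{
    opal_binding_policy_t binding = 0;

    /* e.g. the user asked to bind to core, qualified with "if supported" */
    binding |= OPAL_BIND_IF_SUPPORTED | OPAL_BIND_GIVEN;
    OPAL_SET_BINDING_POLICY(binding, OPAL_BIND_TO_CORE);

    assert(OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(binding));
    assert(OPAL_BINDING_POLICY_IS_SET(binding));     /* a policy was explicitly given */
    assert(!OPAL_BINDING_REQUIRED(binding));         /* "if supported" => not mandatory */
    assert(!OPAL_BIND_OVERLOAD_ALLOWED(binding));    /* overload was not requested */
    return 0;
}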


@ -1,6 +1,7 @@
# Copyright © 2009-2010 INRIA. All rights reserved. # Copyright © 2009-2010 INRIA. All rights reserved.
# Copyright © 2009-2010 Université Bordeaux 1 # Copyright © 2009-2010 Université Bordeaux 1
# Copyright © 2009-2011 Cisco Systems, Inc. All rights reserved. # Copyright © 2009-2011 Cisco Systems, Inc. All rights reserved.
# Copyright © 2011 Oracle and/or its affiliates. All rights reserved.
# See COPYING in top-level directory. # See COPYING in top-level directory.
# Only install the headers if we're in standalone mode (meaning: # Only install the headers if we're in standalone mode (meaning:
@ -33,6 +34,11 @@ include_hwloc_HEADERS += \
hwloc/linux-libnuma.h hwloc/linux-libnuma.h
endif HWLOC_HAVE_LINUX endif HWLOC_HAVE_LINUX
if HWLOC_HAVE_SOLARIS
include_hwloc_HEADERS += \
private/solaris-chiptype.h
endif HWLOC_HAVE_SOLARIS
if HWLOC_HAVE_SCHED_SETAFFINITY if HWLOC_HAVE_SCHED_SETAFFINITY
include_hwloc_HEADERS += hwloc/glibc-sched.h include_hwloc_HEADERS += hwloc/glibc-sched.h
endif HWLOC_HAVE_SCHED_SETAFFINITY endif HWLOC_HAVE_SCHED_SETAFFINITY


@ -0,0 +1,46 @@
/*
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/* SPARC Chip Modes. */
#define MODE_UNKNOWN 0
#define MODE_SPITFIRE 1
#define MODE_BLACKBIRD 2
#define MODE_CHEETAH 3
#define MODE_SPARC64_VI 4
#define MODE_T1 5
#define MODE_T2 6
#define MODE_SPARC64_VII 7
#define MODE_ROCK 8
/* SPARC Chip Implementations. */
#define IMPL_SPARC64_VI 0x6
#define IMPL_SPARC64_VII 0x7
#define IMPL_SPITFIRE 0x10
#define IMPL_BLACKBIRD 0x11
#define IMPL_SABRE 0x12
#define IMPL_HUMMINGBIRD 0x13
#define IMPL_CHEETAH 0x14
#define IMPL_CHEETAHPLUS 0x15
#define IMPL_JALAPENO 0x16
#define IMPL_JAGUAR 0x18
#define IMPL_PANTHER 0x19
#define IMPL_NIAGARA 0x23
#define IMPL_NIAGARA_2 0x24
#define IMPL_ROCK 0x25
/* Default Mfg, Cache, Speed settings */
#define TI_MANUFACTURER 0x17
#define TWO_MEG_CACHE 2097152
#define SPITFIRE_SPEED 142943750
char* hwloc_solaris_get_chip_type(void);
char* hwloc_solaris_get_chip_model(void);


@ -1,6 +1,7 @@
# Copyright © 2009-2010 INRIA. All rights reserved. # Copyright © 2009-2010 INRIA. All rights reserved.
# Copyright © 2009-2010 Université Bordeaux 1 # Copyright © 2009-2010 Université Bordeaux 1
# Copyright © 2009-2010 Cisco Systems, Inc. All rights reserved. # Copyright © 2009-2010 Cisco Systems, Inc. All rights reserved.
# Copyright © 2011 Oracle and/or its affiliates. All rights reserved.
# See COPYING in top-level directory. # See COPYING in top-level directory.
AM_CFLAGS = $(HWLOC_CFLAGS) AM_CFLAGS = $(HWLOC_CFLAGS)
@ -35,6 +36,8 @@ ldflags =
if HWLOC_HAVE_SOLARIS if HWLOC_HAVE_SOLARIS
sources += topology-solaris.c sources += topology-solaris.c
sources += topology-solaris-chiptype.c
ldflags += -lpicl
endif HWLOC_HAVE_SOLARIS endif HWLOC_HAVE_SOLARIS
if HWLOC_HAVE_LINUX if HWLOC_HAVE_LINUX


@ -0,0 +1,321 @@
/*
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include <private/solaris-chiptype.h>
#include <stdlib.h>
#include <strings.h>
#include <sys/systeminfo.h>
#include <picl.h>
/*****************************************************************************
Order of this list is important for the assign_value and
assign_string_value routines
*****************************************************************************/
static const char* items[] = {
"clock-frequency",
"cpu-mhz",
"ecache-size",
"l2-cache-size",
"sectored-l2-cache-size",
"implementation#",
"manufacturer#",
"compatible",
"ProcessorType",
"vendor-id",
"brand-string"
};
#define NUM_ITEMS (sizeof(items) / sizeof(items[0]))
/*****************************************************************************
SPARC strings for chip modes and implementation
*****************************************************************************/
static const char* sparc_modes[] = {
"UNKNOWN",
"SPITFIRE",
"BLACKBIRD",
"CHEETAH",
"SPARC64_VI",
"T1",
"T2",
"SPARC64_VII",
"ROCK"
};
/*****************************************************************************
Default values are for Unknown so we can build up from there.
*****************************************************************************/
static long dss_chip_mode = MODE_UNKNOWN;
static long dss_chip_impl = IMPL_SPITFIRE;
static long dss_chip_cache = TWO_MEG_CACHE;
static long dss_chip_manufacturer = TI_MANUFACTURER;
static long long dss_chip_speed = SPITFIRE_SPEED;
static char dss_chip_type[PICL_PROPNAMELEN_MAX];
static char dss_chip_model[PICL_PROPNAMELEN_MAX];
static int called_cpu_probe = 0;
/*****************************************************************************
Assigns values based on the value of index. For this reason, the order of
the items array is important.
*****************************************************************************/
static void assign_value(int index, long long val) {
if (index == 0) { /* clock-frequency */
dss_chip_speed = val;
}
if (index == 1) { /* cpu-mhz */
dss_chip_speed = val * 1000000; /* Scale since value was in MHz */
}
else if ((index >= 2) && (index <= 4)) {
/* ecache-size, l2-cache-size, sectored-l2-cache-size */
dss_chip_cache = val;
}
else if (index == 5) {
/* implementation# T1, T2, and Rock do not have this, see RFE 6615268 */
dss_chip_impl = val;
if (dss_chip_impl == IMPL_SPITFIRE) {
dss_chip_mode = 1;
}
else if ((dss_chip_impl >= IMPL_BLACKBIRD) &&
(dss_chip_impl <= IMPL_HUMMINGBIRD)) {
dss_chip_mode = 2;
}
else if ((dss_chip_impl >= IMPL_CHEETAH) &&
(dss_chip_impl <= IMPL_PANTHER)) {
dss_chip_mode = 3;
}
else if (dss_chip_impl == IMPL_SPARC64_VI) {
dss_chip_mode = 4;
}
else if (dss_chip_impl == IMPL_NIAGARA) {
dss_chip_mode = 5;
}
else if (dss_chip_impl == IMPL_NIAGARA_2) {
dss_chip_mode = 6;
}
else if (dss_chip_impl == IMPL_SPARC64_VII) {
dss_chip_mode = 7;
}
else if (dss_chip_impl == IMPL_ROCK) {
dss_chip_mode = 8;
}
}
else if (index == 6) { /* manufacturer# */
dss_chip_manufacturer = val;
}
}
/*****************************************************************************
Assigns values based on the value of index. For this reason, the order of
the items array is important.
*****************************************************************************/
static void assign_string_value(int index, char* string_val) {
if (index == 7) { /* compatible */
if (strncasecmp(string_val, "FJSV,SPARC64-VI",
PICL_PROPNAMELEN_MAX) == 0) {
dss_chip_mode = 4;
}
else if (strncasecmp(string_val, "SUNW,UltraSPARC-T1",
PICL_PROPNAMELEN_MAX) == 0) {
dss_chip_mode = 5;
}
else if (strncasecmp(string_val, "SUNW,UltraSPARC-T2",
PICL_PROPNAMELEN_MAX) == 0) {
dss_chip_mode = 6;
}
else if (strncasecmp(string_val, "FJSV,SPARC64-VII",
PICL_PROPNAMELEN_MAX) == 0) {
dss_chip_mode = 7;
}
else if (strncasecmp(string_val, "SUNW,Rock",
PICL_PROPNAMELEN_MAX) == 0) {
dss_chip_mode = 8;
}
} else if (index == 8) { /* ProcessorType */
strncpy(&dss_chip_type[0], string_val, PICL_PROPNAMELEN_MAX);
} else if (index == 10) { /* brand-string */
strncpy(&dss_chip_model[0], string_val, PICL_PROPNAMELEN_MAX);
}
}
/*****************************************************************************
Gets called by probe_cpu. Cycles through the table values until we find
what we are looking for.
*****************************************************************************/
static int search_table(int index, picl_prophdl_t table_hdl) {
picl_prophdl_t col_hdl;
picl_prophdl_t row_hdl;
picl_propinfo_t p_info;
int val;
char string_val[PICL_PROPNAMELEN_MAX];
for (val = picl_get_next_by_col(table_hdl, &row_hdl); val != PICL_ENDOFLIST;
val = picl_get_next_by_col(row_hdl, &row_hdl)) {
if (val == PICL_SUCCESS) {
for (col_hdl = row_hdl; val != PICL_ENDOFLIST;
val = picl_get_next_by_row(col_hdl, &col_hdl)) {
if (val == PICL_SUCCESS) {
val = picl_get_propinfo(col_hdl, &p_info);
if (val == PICL_SUCCESS) {
if (p_info.type == PICL_PTYPE_CHARSTRING) {
val = picl_get_propval(col_hdl, &string_val, sizeof(string_val));
if (val == PICL_SUCCESS) {
assign_string_value(index, string_val);
}
}
}
}
}
}
}
}
/*****************************************************************************
Gets called by picl_walk_tree_by_class. Then it cycles through the properties
until we find what we are looking for. Once we are done, we return
PICL_WALK_TERMINATE to stop picl_walk_tree_by_class from traversing the tree.
Note that PICL_PTYPE_UNSIGNED_INT and PICL_PTYPE_INT can either be 4-bytes
or 8-bytes.
*****************************************************************************/
static int probe_cpu(picl_nodehdl_t node_hdl, void* dummy_arg) {
picl_prophdl_t p_hdl;
picl_prophdl_t table_hdl;
picl_propinfo_t p_info;
long long long_long_val;
unsigned int uint_val;
int index;
int int_val;
int val;
char string_val[PICL_PROPNAMELEN_MAX];
val = picl_get_first_prop(node_hdl, &p_hdl);
while (val == PICL_SUCCESS) {
called_cpu_probe = 1;
val = picl_get_propinfo(p_hdl, &p_info);
if (val == PICL_SUCCESS) {
for (index = 0; index < NUM_ITEMS; index++) {
if (strcasecmp(p_info.name, items[index]) == 0) {
if (p_info.type == PICL_PTYPE_UNSIGNED_INT) {
if (p_info.size == sizeof(uint_val)) {
val = picl_get_propval(p_hdl, &uint_val, sizeof(uint_val));
if (val == PICL_SUCCESS) {
long_long_val = uint_val;
assign_value(index, long_long_val);
}
}
else if (p_info.size == sizeof(long_long_val)) {
val = picl_get_propval(p_hdl, &long_long_val,
sizeof(long_long_val));
if (val == PICL_SUCCESS) {
assign_value(index, long_long_val);
}
}
}
else if (p_info.type == PICL_PTYPE_INT) {
if (p_info.size == sizeof(int_val)) {
val = picl_get_propval(p_hdl, &int_val, sizeof(int_val));
if (val == PICL_SUCCESS) {
long_long_val = int_val;
assign_value(index, long_long_val);
}
}
else if (p_info.size == sizeof(long_long_val)) {
val = picl_get_propval(p_hdl, &long_long_val,
sizeof(long_long_val));
if (val == PICL_SUCCESS) {
assign_value(index, long_long_val);
}
}
}
else if (p_info.type == PICL_PTYPE_CHARSTRING) {
val = picl_get_propval(p_hdl, &string_val, sizeof(string_val));
if (val == PICL_SUCCESS) {
assign_string_value(index, string_val);
}
}
else if (p_info.type == PICL_PTYPE_TABLE) {
val = picl_get_propval(p_hdl, &table_hdl, p_info.size);
if (val == PICL_SUCCESS) {
search_table(index, table_hdl);
}
}
break;
} else if (index == NUM_ITEMS-1) {
if (p_info.type == PICL_PTYPE_CHARSTRING) {
val = picl_get_propval(p_hdl, &string_val, sizeof(string_val));
if (val == PICL_SUCCESS) {
}
}
}
}
}
val = picl_get_next_prop(p_hdl, &p_hdl);
}
return PICL_WALK_TERMINATE;
}
/*****************************************************************************
Initializes, gets the root, then walks the picl tree looking for information
Currently, the "core" class is only needed for OPL systems
*****************************************************************************/
char* hwloc_solaris_get_chip_type(void) {
picl_nodehdl_t root;
int val;
static char chip_type[PICL_PROPNAMELEN_MAX];
val = picl_initialize();
if (val != PICL_SUCCESS) { /* Can't initialize session with PICL daemon */
return(NULL);
}
val = picl_get_root(&root);
if (val != PICL_SUCCESS) { /* Failed to get root node of the PICL tree */
return(NULL);
}
val = picl_walk_tree_by_class(root, "cpu", (void *)NULL, probe_cpu);
val = picl_walk_tree_by_class(root, "core", (void *)NULL, probe_cpu);
picl_shutdown();
if (called_cpu_probe) {
strncpy(chip_type, dss_chip_type, PICL_PROPNAMELEN_MAX);
} else {
/* no picl information on machine available */
sysinfo(SI_HW_PROVIDER, chip_type, PICL_PROPNAMELEN_MAX);
}
return(chip_type);
}
/*****************************************************************************
Initializes, gets the root, then walks the picl tree looking for information
Currently, the "core" class is only needed for OPL systems
*****************************************************************************/
char *hwloc_solaris_get_chip_model(void) {
if (called_cpu_probe) {
if (dss_chip_mode != MODE_UNKNOWN) { /* SPARC chip */
strncpy(dss_chip_model, sparc_modes[dss_chip_mode],
PICL_PROPNAMELEN_MAX);
}
} else {
/* no picl information on machine available */
sysinfo(SI_PLATFORM, dss_chip_model, PICL_PROPNAMELEN_MAX);
}
return(dss_chip_model);
}


@ -3,6 +3,7 @@
* Copyright © 2009-2011 INRIA. All rights reserved. * Copyright © 2009-2011 INRIA. All rights reserved.
* Copyright © 2009-2011 Université Bordeaux 1 * Copyright © 2009-2011 Université Bordeaux 1
* Copyright © 2011 Cisco Systems, Inc. All rights reserved. * Copyright © 2011 Cisco Systems, Inc. All rights reserved.
* Copyright © 2011 Oracle and/or its affiliates. All rights reserved.
* See COPYING in top-level directory. * See COPYING in top-level directory.
*/ */
@ -10,6 +11,7 @@
#include <hwloc.h> #include <hwloc.h>
#include <private/private.h> #include <private/private.h>
#include <private/debug.h> #include <private/debug.h>
#include <private/solaris-chiptype.h>
#include <stdio.h> #include <stdio.h>
#include <errno.h> #include <errno.h>
@@ -607,9 +609,22 @@ hwloc_look_kstat(struct hwloc_topology *topology)
      * pkg_core_id for the core ID (not unique). They are not useful to us
      * however. */
   }
-  if (look_chips)
-    hwloc_setup_level(procid_max, numsockets, osphysids, proc_physids, topology, HWLOC_OBJ_SOCKET);
+  if (look_chips) {
+    /* Set up the Socket object inline instead of using hwloc_setup_level
+     * so we can add the CPUVendor and CPUModel info objects.
+     */
+    struct hwloc_obj *obj;
+    unsigned j;
+    for (j = 0; j < numsockets; j++) {
+      obj = hwloc_alloc_setup_object(HWLOC_OBJ_SOCKET, osphysids[j]);
+      hwloc_object_cpuset_from_array(obj, j, proc_physids, procid_max);
+      hwloc_debug_2args_bitmap("%s %d has cpuset %s\n",
+                               hwloc_obj_type_string(HWLOC_OBJ_SOCKET),
+                               j, obj->cpuset);
+      hwloc_insert_object_by_cpuset(topology, obj);
+    }
+    hwloc_debug("%s", "\n");
+  }
if (look_cores) if (look_cores)
hwloc_setup_level(procid_max, numcores, oscoreids, proc_coreids, topology, HWLOC_OBJ_CORE); hwloc_setup_level(procid_max, numcores, oscoreids, proc_coreids, topology, HWLOC_OBJ_CORE);
@@ -627,17 +642,30 @@ void
 hwloc_look_solaris(struct hwloc_topology *topology)
 {
   unsigned nbprocs = hwloc_fallback_nbprocessors (topology);
+  char *CPUType;
+  char *CPUModel;
 #ifdef HAVE_LIBLGRP
   hwloc_look_lgrp(topology);
 #endif /* HAVE_LIBLGRP */
 #ifdef HAVE_LIBKSTAT
   nbprocs = 0;
-  if (hwloc_look_kstat(topology))
-    return;
+  if (hwloc_look_kstat(topology)) {
+    /* Set CPU Type and Model for machine. */
+    CPUType = hwloc_solaris_get_chip_type();
+    CPUModel = hwloc_solaris_get_chip_model();
+    hwloc_add_object_info(topology->levels[0][0], "CPUType", CPUType);
+    hwloc_add_object_info(topology->levels[0][0], "CPUModel", CPUModel);
+    return;
+  }
 #endif /* HAVE_LIBKSTAT */
   hwloc_setup_pu_level(topology, nbprocs);
   hwloc_add_object_info(topology->levels[0][0], "Backend", "Solaris");
+  /* Set CPU Type and Model for machine. */
+  CPUType = hwloc_solaris_get_chip_type();
+  CPUModel = hwloc_solaris_get_chip_model();
+  hwloc_add_object_info(topology->levels[0][0], "CPUType", CPUType);
+  hwloc_add_object_info(topology->levels[0][0], "CPUModel", CPUModel);
 }
void void


@ -205,6 +205,9 @@ opal_err2str(int errnum, const char **errmsg)
case OPAL_ERR_NETWORK_NOT_PARSEABLE: case OPAL_ERR_NETWORK_NOT_PARSEABLE:
retval = "Provided network specification is not parseable"; retval = "Provided network specification is not parseable";
break; break;
case OPAL_ERR_SILENT:
retval = NULL;
break;
default: default:
retval = NULL; retval = NULL;
} }


@ -74,7 +74,8 @@ enum {
ORTE_ERR_INVALID_PHYS_CPU = OPAL_ERR_INVALID_PHYS_CPU, ORTE_ERR_INVALID_PHYS_CPU = OPAL_ERR_INVALID_PHYS_CPU,
ORTE_ERR_MULTIPLE_AFFINITIES = OPAL_ERR_MULTIPLE_AFFINITIES, ORTE_ERR_MULTIPLE_AFFINITIES = OPAL_ERR_MULTIPLE_AFFINITIES,
ORTE_ERR_SLOT_LIST_RANGE = OPAL_ERR_SLOT_LIST_RANGE, ORTE_ERR_SLOT_LIST_RANGE = OPAL_ERR_SLOT_LIST_RANGE,
ORTE_ERR_SILENT = OPAL_ERR_SILENT,
/* error codes specific to ORTE - don't forget to update /* error codes specific to ORTE - don't forget to update
orte/util/error_strings.c when adding new error codes!! orte/util/error_strings.c when adding new error codes!!
Otherwise, the error reporting system will potentially crash, Otherwise, the error reporting system will potentially crash,
@ -95,35 +96,34 @@ enum {
ORTE_ERR_INDETERMINATE_STATE_INFO = (ORTE_ERR_BASE - 13), ORTE_ERR_INDETERMINATE_STATE_INFO = (ORTE_ERR_BASE - 13),
ORTE_ERR_NODE_FULLY_USED = (ORTE_ERR_BASE - 14), ORTE_ERR_NODE_FULLY_USED = (ORTE_ERR_BASE - 14),
ORTE_ERR_INVALID_NUM_PROCS = (ORTE_ERR_BASE - 15), ORTE_ERR_INVALID_NUM_PROCS = (ORTE_ERR_BASE - 15),
ORTE_ERR_SILENT = (ORTE_ERR_BASE - 16), ORTE_ERR_ADDRESSEE_UNKNOWN = (ORTE_ERR_BASE - 16),
ORTE_ERR_ADDRESSEE_UNKNOWN = (ORTE_ERR_BASE - 17), ORTE_ERR_SYS_LIMITS_PIPES = (ORTE_ERR_BASE - 17),
ORTE_ERR_SYS_LIMITS_PIPES = (ORTE_ERR_BASE - 18), ORTE_ERR_PIPE_SETUP_FAILURE = (ORTE_ERR_BASE - 18),
ORTE_ERR_PIPE_SETUP_FAILURE = (ORTE_ERR_BASE - 19), ORTE_ERR_SYS_LIMITS_CHILDREN = (ORTE_ERR_BASE - 19),
ORTE_ERR_SYS_LIMITS_CHILDREN = (ORTE_ERR_BASE - 20), ORTE_ERR_FAILED_GET_TERM_ATTRS = (ORTE_ERR_BASE - 20),
ORTE_ERR_FAILED_GET_TERM_ATTRS = (ORTE_ERR_BASE - 21), ORTE_ERR_WDIR_NOT_FOUND = (ORTE_ERR_BASE - 21),
ORTE_ERR_WDIR_NOT_FOUND = (ORTE_ERR_BASE - 22), ORTE_ERR_EXE_NOT_FOUND = (ORTE_ERR_BASE - 22),
ORTE_ERR_EXE_NOT_FOUND = (ORTE_ERR_BASE - 23), ORTE_ERR_PIPE_READ_FAILURE = (ORTE_ERR_BASE - 23),
ORTE_ERR_PIPE_READ_FAILURE = (ORTE_ERR_BASE - 24), ORTE_ERR_EXE_NOT_ACCESSIBLE = (ORTE_ERR_BASE - 24),
ORTE_ERR_EXE_NOT_ACCESSIBLE = (ORTE_ERR_BASE - 25), ORTE_ERR_FAILED_TO_START = (ORTE_ERR_BASE - 25),
ORTE_ERR_FAILED_TO_START = (ORTE_ERR_BASE - 26), ORTE_ERR_FILE_NOT_EXECUTABLE = (ORTE_ERR_BASE - 26),
ORTE_ERR_FILE_NOT_EXECUTABLE = (ORTE_ERR_BASE - 27), ORTE_ERR_HNP_COULD_NOT_START = (ORTE_ERR_BASE - 27),
ORTE_ERR_HNP_COULD_NOT_START = (ORTE_ERR_BASE - 28), ORTE_ERR_SYS_LIMITS_SOCKETS = (ORTE_ERR_BASE - 28),
ORTE_ERR_SYS_LIMITS_SOCKETS = (ORTE_ERR_BASE - 29), ORTE_ERR_SOCKET_NOT_AVAILABLE = (ORTE_ERR_BASE - 29),
ORTE_ERR_SOCKET_NOT_AVAILABLE = (ORTE_ERR_BASE - 30), ORTE_ERR_SYSTEM_WILL_BOOTSTRAP = (ORTE_ERR_BASE - 30),
ORTE_ERR_SYSTEM_WILL_BOOTSTRAP = (ORTE_ERR_BASE - 31), ORTE_ERR_RESTART_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 31),
ORTE_ERR_RESTART_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 32), ORTE_ERR_INVALID_NODE_RANK = (ORTE_ERR_BASE - 32),
ORTE_ERR_INVALID_NODE_RANK = (ORTE_ERR_BASE - 33), ORTE_ERR_INVALID_LOCAL_RANK = (ORTE_ERR_BASE - 33),
ORTE_ERR_INVALID_LOCAL_RANK = (ORTE_ERR_BASE - 34), ORTE_ERR_UNRECOVERABLE = (ORTE_ERR_BASE - 34),
ORTE_ERR_UNRECOVERABLE = (ORTE_ERR_BASE - 35), ORTE_ERR_MEM_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 35),
ORTE_ERR_MEM_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 36), ORTE_ERR_HEARTBEAT_LOST = (ORTE_ERR_BASE - 36),
ORTE_ERR_HEARTBEAT_LOST = (ORTE_ERR_BASE - 37), ORTE_ERR_PROC_STALLED = (ORTE_ERR_BASE - 37),
ORTE_ERR_PROC_STALLED = (ORTE_ERR_BASE - 38), ORTE_ERR_NO_APP_SPECIFIED = (ORTE_ERR_BASE - 38),
ORTE_ERR_NO_APP_SPECIFIED = (ORTE_ERR_BASE - 39), ORTE_ERR_NO_EXE_SPECIFIED = (ORTE_ERR_BASE - 39),
ORTE_ERR_NO_EXE_SPECIFIED = (ORTE_ERR_BASE - 40), ORTE_ERR_COMM_DISABLED = (ORTE_ERR_BASE - 40),
ORTE_ERR_COMM_DISABLED = (ORTE_ERR_BASE - 41), ORTE_ERR_FAILED_TO_MAP = (ORTE_ERR_BASE - 41),
ORTE_ERR_FAILED_TO_MAP = (ORTE_ERR_BASE - 42), ORTE_ERR_TAKE_NEXT_OPTION = (ORTE_ERR_BASE - 42),
ORTE_ERR_TAKE_NEXT_OPTION = (ORTE_ERR_BASE - 43), ORTE_ERR_SENSOR_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 43)
ORTE_ERR_SENSOR_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 44)
}; };
#define ORTE_ERR_MAX (ORTE_ERR_BASE - 100) #define ORTE_ERR_MAX (ORTE_ERR_BASE - 100)


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2010-2011 Los Alamos National Security, LLC. * Copyright (c) 2010-2011 Los Alamos National Security, LLC.
* All rights reserved. * All rights reserved.
@@ -318,12 +318,12 @@ static void attach_debugger(int fd, short event, void *arg)
     build_debugger_args(app);
     opal_pointer_array_add(jdata->apps, app);
     jdata->num_apps = 1;
-    /* setup the mapping policy to bynode so we get one
+    /* setup the mapping policy to pernode so we get one
      * daemon on each node
      */
     jdata->map = OBJ_NEW(orte_job_map_t);
-    jdata->map->policy = ORTE_MAPPING_BYNODE;
-    jdata->map->npernode = 1;
+    jdata->map->mapping = ORTE_MAPPING_PPR;
+    jdata->map->ppr = strdup("1:n");
     /* now go ahead and spawn this job */
     if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
         ORTE_ERROR_LOG(rc);


@ -282,14 +282,14 @@ static int modex(opal_list_t *procs)
{ {
int rc; int rc;
opal_buffer_t buf, rbuf; opal_buffer_t buf, rbuf;
char *locale=NULL;
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
"%s grpcomm:bad: modex entered", "%s grpcomm:bad: modex entered",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
     if (NULL == procs) {
-        /* The modex will be realized in the background by the daemons. The processes will
+        /* This is a modex across our peers at startup. The modex will be realized in the
+         * background by the daemons. The processes will
          * only be informed when all data has been collected from all processes. The get_attr
          * will realize the blocking, it will not return until the data has been received.
          */
@ -308,47 +308,6 @@ static int modex(opal_list_t *procs)
goto cleanup; goto cleanup;
} }
#if OPAL_HAVE_HWLOC
{
if (NULL != opal_hwloc_topology) {
/* our cpuset should already be known, but check for safety */
if (NULL == opal_hwloc_my_cpuset) {
opal_hwloc_base_get_local_cpuset();
}
/* convert to a string */
hwloc_bitmap_list_asprintf(&locale, opal_hwloc_my_cpuset);
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:bad LOCALE %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), locale));
/* pack it */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &locale, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
free(locale);
goto cleanup;
}
free(locale);
} else {
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:bad NO TOPO - ADDING PLACEHOLDER",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* pack a placeholder */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &locale, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
}
}
#else
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:bad NO HWLOC - ADDING PLACEHOLDER",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* pack a placeholder */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &locale, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
#endif
/* pack the entries we have received */ /* pack the entries we have received */
if (ORTE_SUCCESS != (rc = orte_grpcomm_base_pack_modex_entries(&buf))) { if (ORTE_SUCCESS != (rc = orte_grpcomm_base_pack_modex_entries(&buf))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
@ -375,6 +334,9 @@ static int modex(opal_list_t *procs)
return rc; return rc;
} else { } else {
/* this is a modex across a specified list of procs, usually during
* a connect/accept.
*/
if (ORTE_SUCCESS != (rc = orte_grpcomm_base_full_modex(procs))) { if (ORTE_SUCCESS != (rc = orte_grpcomm_base_full_modex(procs))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
} }


@ -90,7 +90,6 @@ ORTE_DECLSPEC int orte_grpcomm_base_set_proc_attr(const char *attr_name,
ORTE_DECLSPEC int orte_grpcomm_base_get_proc_attr(const orte_process_name_t proc, ORTE_DECLSPEC int orte_grpcomm_base_get_proc_attr(const orte_process_name_t proc,
const char * attribute_name, void **val, const char * attribute_name, void **val,
size_t *size); size_t *size);
ORTE_DECLSPEC int orte_grpcomm_base_peer_modex(void);
ORTE_DECLSPEC int orte_grpcomm_base_modex_unpack( opal_buffer_t* rbuf); ORTE_DECLSPEC int orte_grpcomm_base_modex_unpack( opal_buffer_t* rbuf);
ORTE_DECLSPEC int orte_grpcomm_base_full_modex(opal_list_t *procs); ORTE_DECLSPEC int orte_grpcomm_base_full_modex(opal_list_t *procs);
ORTE_DECLSPEC int orte_grpcomm_base_purge_proc_attrs(void); ORTE_DECLSPEC int orte_grpcomm_base_purge_proc_attrs(void);


@ -61,7 +61,6 @@ int orte_grpcomm_base_full_modex(opal_list_t *procs)
orte_pmap_t *pmap; orte_pmap_t *pmap;
orte_vpid_t daemon; orte_vpid_t daemon;
char *hostname; char *hostname;
char *locale=NULL;
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
"%s grpcomm:base:full:modex: performing modex", "%s grpcomm:base:full:modex: performing modex",
@@ -104,42 +103,12 @@ int orte_grpcomm_base_full_modex(opal_list_t *procs)
     }
 #if OPAL_HAVE_HWLOC
-    {
-        /* get and pack our cpuset so other procs can determine our locality */
-        if (NULL != opal_hwloc_topology) {
-            /* our cpuset should already be known, but check for safety */
-            if (NULL == opal_hwloc_my_cpuset) {
-                opal_hwloc_base_get_local_cpuset();
-            }
-            /* convert to a string */
-            hwloc_bitmap_list_asprintf(&locale, opal_hwloc_my_cpuset);
-            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
-                                 "%s grpcomm:base:modex LOCALE %s",
-                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), locale));
-            /* pack it */
-            if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &locale, 1, OPAL_STRING))) {
-                ORTE_ERROR_LOG(rc);
-                free(locale);
-                goto cleanup;
-            }
-            free(locale);
-        } else {
-            /* pack a placeholder */
-            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
-                                 "%s grpcomm:base:modex NO TOPO - ADDING PLACEHOLDER",
-                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
-            if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &locale, 1, OPAL_STRING))) {
-                ORTE_ERROR_LOG(rc);
-                goto cleanup;
-            }
-        }
-    }
-#else
-    /* pack a placeholder */
-    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
-                         "%s grpcomm:base:modex NO HWLOC - ADDING PLACEHOLDER",
-                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
-    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &locale, 1, OPAL_STRING))) {
+    /* pack our binding info so other procs can determine our locality */
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &orte_process_info.bind_level, 1, OPAL_HWLOC_LEVEL_T))) {
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &orte_process_info.bind_idx, 1, OPAL_UINT))) {
         ORTE_ERROR_LOG(rc);
         goto cleanup;
     }
@@ -155,12 +124,20 @@ int orte_grpcomm_base_full_modex(opal_list_t *procs)
                          "%s grpcomm:base:full:modex: executing allgather",
                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
-    /* exchange the buffer with the list of peers */
-    if (ORTE_SUCCESS != (rc = orte_grpcomm.allgather_list(procs, &buf, &rbuf))) {
-        ORTE_ERROR_LOG(rc);
-        goto cleanup;
+    if (NULL == procs) {
+        /* exchange the buffer with my peers */
+        if (ORTE_SUCCESS != (rc = orte_grpcomm.allgather(&buf, &rbuf))) {
+            ORTE_ERROR_LOG(rc);
+            goto cleanup;
+        }
+    } else {
+        /* exchange the buffer with the list of peers */
+        if (ORTE_SUCCESS != (rc = orte_grpcomm.allgather_list(procs, &buf, &rbuf))) {
+            ORTE_ERROR_LOG(rc);
+            goto cleanup;
+        }
     }
     OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
                          "%s grpcomm:base:full:modex: processing modex info",
                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@ -234,7 +211,8 @@ int orte_grpcomm_base_full_modex(opal_list_t *procs)
/* node wasn't found - let's add it */ /* node wasn't found - let's add it */
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:full:modex no nidmap entry for node %s", "%s grpcomm:base:full:modex no nidmap entry for node %s",
-                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostname));
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                             (NULL == hostname) ? "NULL" : hostname));
nid = OBJ_NEW(orte_nid_t); nid = OBJ_NEW(orte_nid_t);
nid->name = strdup(hostname); nid->name = strdup(hostname);
nid->daemon = daemon; nid->daemon = daemon;
@ -287,19 +265,63 @@ int orte_grpcomm_base_full_modex(opal_list_t *procs)
} }
} }
/* unpack the locality info */ #if OPAL_HAVE_HWLOC
cnt = 1; {
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &locale, &cnt, OPAL_STRING))) { opal_hwloc_level_t bind_level;
ORTE_ERROR_LOG(rc); unsigned int bind_idx;
goto cleanup;
}
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
"%s grpcomm:base:modex setting proc %s locale %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc_name),
(NULL == locale) ? "NULL" : locale));
/* store on the pmap */ /* unpack the locality info */
cnt = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &bind_level, &cnt, OPAL_HWLOC_LEVEL_T))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
cnt = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &bind_idx, &cnt, OPAL_UINT))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
"%s grpcomm:base:modex setting proc %s level %s idx %u",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc_name),
opal_hwloc_base_print_level(bind_level), bind_idx));
/* store on the pmap */
if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &proc_name, ORTE_PROC_MY_NAME)) {
/* if this data is from myself, then set locality to all */
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:modex setting proc %s locale ALL",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc_name)));
pmap->locality = OPAL_PROC_ALL_LOCAL;
} else if (daemon != ORTE_PROC_MY_DAEMON->vpid) {
/* this is on a different node, then mark as non-local */
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:modex setting proc %s locale NONLOCAL",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc_name)));
pmap->locality = OPAL_PROC_NON_LOCAL;
} else if (OPAL_HWLOC_NODE_LEVEL == orte_process_info.bind_level ||
OPAL_HWLOC_NODE_LEVEL == bind_level) {
/* one or both of us is not bound, so all we can say is we are on the
* same node
*/
pmap->locality = OPAL_PROC_ON_NODE;
} else {
/* determine relative location on our node */
pmap->locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
orte_process_info.bind_level,
orte_process_info.bind_idx,
bind_level, bind_idx);
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:modex setting proc %s locale %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc_name),
opal_hwloc_base_print_locality(pmap->locality)));
}
}
#else
if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &proc_name, ORTE_PROC_MY_NAME)) { if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &proc_name, ORTE_PROC_MY_NAME)) {
/* if this data is from myself, then set locality to all */ /* if this data is from myself, then set locality to all */
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
@ -314,38 +336,11 @@ int orte_grpcomm_base_full_modex(opal_list_t *procs)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc_name))); ORTE_NAME_PRINT(&proc_name)));
pmap->locality = OPAL_PROC_NON_LOCAL; pmap->locality = OPAL_PROC_NON_LOCAL;
} else if (NULL == locale || 0 == strlen(locale)){
/* if we share a node, but we don't know anything more, then
* mark us as on the node as this is all we know
*/
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:modex setting proc %s locale NODE",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc_name)));
pmap->locality = OPAL_PROC_ON_NODE;
} else { } else {
#if OPAL_HAVE_HWLOC /* must be on our node */
/* convert the locale to a cpuset */ pmap->locality = OPAL_PROC_ON_NODE;
if (NULL == orte_grpcomm_base.working_cpuset) {
orte_grpcomm_base.working_cpuset = hwloc_bitmap_alloc();
}
if (0 != hwloc_bitmap_list_sscanf(orte_grpcomm_base.working_cpuset, locale)) {
/* got a bad locale */
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
goto cleanup;
}
/* determine relative location on our node */
pmap->locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
opal_hwloc_my_cpuset,
orte_grpcomm_base.working_cpuset);
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:modex setting proc %s locale %04x",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc_name), pmap->locality));
#endif
} }
#endif
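The net effect of the #if/#else block above is a simple classification: my own data is local at every level, a proc hosted by another daemon is non-local, an unbound peer (or an unbound me) is only known to share the node, and otherwise the two (bind_level, bind_idx) pairs are compared to compute the finer-grained locality. Below is a small illustrative sketch of just that decision order; it is not part of the commit, locality_t and classify_peer are invented names, and the real code stores OPAL_PROC_* flags on the pmap and calls opal_hwloc_base_get_relative_locality() in the last case.

#include <stdio.h>

typedef enum { LOCALITY_ALL, LOCALITY_NON_LOCAL, LOCALITY_ON_NODE, LOCALITY_FINER } locality_t;

static locality_t classify_peer(int peer_is_me, int same_daemon,
                                int my_bind_level, int peer_bind_level)
{
    if (peer_is_me) {
        return LOCALITY_ALL;        /* my own data: local at every level */
    }
    if (!same_daemon) {
        return LOCALITY_NON_LOCAL;  /* hosted by a different daemon => different node */
    }
    if (0 == my_bind_level || 0 == peer_bind_level) {
        /* 0 == OPAL_HWLOC_NODE_LEVEL: one of us is unbound, so all we
         * can claim is that we share the node */
        return LOCALITY_ON_NODE;
    }
    /* both bound: compare bind_idx at the two levels to find the shared
     * caches/sockets/cores (the relative-locality step) */
    return LOCALITY_FINER;
}

int main(void)
{
    /* same daemon, but the peer is unbound: only "on node" can be claimed */
    printf("%d\n", classify_peer(0, 1, 5, 0));
    return 0;
}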
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:full:modex: adding modex entry for proc %s", "%s grpcomm:base:full:modex: adding modex entry for proc %s",
@ -373,7 +368,6 @@ int orte_grpcomm_base_modex_unpack( opal_buffer_t* rbuf)
int rc=ORTE_SUCCESS; int rc=ORTE_SUCCESS;
orte_vpid_t daemon; orte_vpid_t daemon;
orte_pmap_t *pmap; orte_pmap_t *pmap;
char *locale;
/* process the results */ /* process the results */
/* extract the number of procs that put data in the buffer */ /* extract the number of procs that put data in the buffer */
@ -402,9 +396,7 @@ int orte_grpcomm_base_modex_unpack( opal_buffer_t* rbuf)
goto cleanup; goto cleanup;
} }
/* SINCE THIS IS AMONGST PEERS, THERE IS NO NEED TO UPDATE THE NIDMAP/PIDMAP /* SINCE THIS IS AMONGST PEERS, THERE IS NO NEED TO UPDATE THE NIDMAP/PIDMAP */
* ITSELF, EXCEPT FOR LOCALITY INFO
*/
if (ORTE_VPID_INVALID == (daemon = orte_ess.proc_get_daemon(&proc_name))) { if (ORTE_VPID_INVALID == (daemon = orte_ess.proc_get_daemon(&proc_name))) {
/* clear problem */ /* clear problem */
@ -420,65 +412,6 @@ int orte_grpcomm_base_modex_unpack( opal_buffer_t* rbuf)
goto cleanup; goto cleanup;
} }
/* unpack the locality info */
cnt = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &locale, &cnt, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:modex:unpack received proc %s locale %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc_name),
(NULL == locale) ? "NULL" : locale));
if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &proc_name, ORTE_PROC_MY_NAME)) {
/* if this data is from myself, then set locality to all */
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:modex:unpack setting proc %s locale ALL",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc_name)));
pmap->locality = OPAL_PROC_ALL_LOCAL;
} else if (daemon != ORTE_PROC_MY_DAEMON->vpid) {
/* this is on a different node, then mark as non-local */
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:modex:unpack setting proc %s locale NONLOCAL",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc_name)));
pmap->locality = OPAL_PROC_NON_LOCAL;
} else if (NULL == locale || 0 == strlen(locale)){
/* if we share a node, but we don't know anything more, then
* mark us as on the node as this is all we know
*/
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:modex:unpack setting proc %s locale NODE",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc_name)));
pmap->locality = OPAL_PROC_ON_NODE;
} else {
#if OPAL_HAVE_HWLOC
/* convert the locale to a cpuset */
if (NULL == orte_grpcomm_base.working_cpuset) {
orte_grpcomm_base.working_cpuset = hwloc_bitmap_alloc();
}
if (0 != hwloc_bitmap_list_sscanf(orte_grpcomm_base.working_cpuset, locale)) {
/* got a bad locale */
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
goto cleanup;
}
/* determine relative location on our node */
pmap->locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
opal_hwloc_my_cpuset,
orte_grpcomm_base.working_cpuset);
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:modex:unpack setting proc %s locale %04x",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc_name), pmap->locality));
#endif
}
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:modex:unpack: adding modex entry for proc %s", "%s grpcomm:base:modex:unpack: adding modex entry for proc %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -495,99 +428,6 @@ int orte_grpcomm_base_modex_unpack( opal_buffer_t* rbuf)
return rc; return rc;
} }
int orte_grpcomm_base_peer_modex(void)
{
opal_buffer_t buf, rbuf;
int rc = ORTE_SUCCESS;
char *locale=NULL;
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
"%s grpcomm:base:peer:modex: performing modex",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* setup the buffer that will actually be sent */
OBJ_CONSTRUCT(&buf, opal_buffer_t);
OBJ_CONSTRUCT(&rbuf, opal_buffer_t);
/* put our process name in the buffer so it can be unpacked later */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
#if OPAL_HAVE_HWLOC
{
if (NULL != opal_hwloc_topology) {
/* our cpuset should already be known, but check for safety */
if (NULL == opal_hwloc_my_cpuset) {
opal_hwloc_base_get_local_cpuset();
}
/* convert to a string */
hwloc_bitmap_list_asprintf(&locale, opal_hwloc_my_cpuset);
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:peer:modex LOCALE %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), locale));
/* pack it */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &locale, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
free(locale);
goto cleanup;
}
free(locale);
} else {
/* pack a placeholder */
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:peer:modex NO TOPO - ADDING PLACEHOLDER",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &locale, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
}
}
#else
/* pack a placeholder */
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:peer:modex NO HWLOC - ADDING PLACEHOLDER",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &locale, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
#endif
/* pack the entries we have received */
if (ORTE_SUCCESS != (rc = orte_grpcomm_base_pack_modex_entries(&buf))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
"%s grpcomm:base:peer:modex: executing allgather",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* exchange the buffer with my peers */
if (ORTE_SUCCESS != (rc = orte_grpcomm.allgather(&buf, &rbuf))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
"%s grpcomm:base:peer:modex: processing modex info",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
if (ORTE_SUCCESS != (rc = orte_grpcomm_base_modex_unpack(&rbuf)) ) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
cleanup:
OBJ_DESTRUCT(&buf);
OBJ_DESTRUCT(&rbuf);
return rc;
}
/** /**
* MODEX DATABASE DESIGN * MODEX DATABASE DESIGN
* *


@@ -68,11 +68,8 @@ int orte_grpcomm_hier_close(void)
 int orte_grpcomm_hier_component_query(mca_base_module_t **module, int *priority)
 {
-    if (ORTE_PROC_IS_MPI) {
-        *priority = 1000;
-    } else {
-        *priority = 0;
-    }
+    /* only select if directed */
+    *priority = 0;
     *module = (mca_base_module_t *)&orte_grpcomm_hier_module;
     return ORTE_SUCCESS;
 }


@ -54,7 +54,6 @@ static int xcast(orte_jobid_t job,
orte_rml_tag_t tag); orte_rml_tag_t tag);
static int hier_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf); static int hier_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf);
static int hier_barrier(void); static int hier_barrier(void);
static int modex(opal_list_t *procs);
/* Module def */ /* Module def */
orte_grpcomm_base_module_t orte_grpcomm_hier_module = { orte_grpcomm_base_module_t orte_grpcomm_hier_module = {
@ -66,7 +65,7 @@ orte_grpcomm_base_module_t orte_grpcomm_hier_module = {
hier_barrier, hier_barrier,
orte_grpcomm_base_set_proc_attr, orte_grpcomm_base_set_proc_attr,
orte_grpcomm_base_get_proc_attr, orte_grpcomm_base_get_proc_attr,
-    modex,
+    orte_grpcomm_base_full_modex,
orte_grpcomm_base_purge_proc_attrs orte_grpcomm_base_purge_proc_attrs
}; };
@ -421,35 +420,3 @@ static int hier_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
/*** MODEX SECTION ***/
static int modex(opal_list_t *procs)
{
int rc;
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
"%s grpcomm:hier: modex entered",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* if we were given a list of procs to modex with, then this is happening
* as part of a connect/accept operation
*/
if (NULL != procs) {
if (ORTE_SUCCESS != (rc = orte_grpcomm_base_full_modex(procs))) {
ORTE_ERROR_LOG(rc);
}
} else {
/* otherwise, we are doing this across our peers */
if (ORTE_SUCCESS != (rc = orte_grpcomm_base_peer_modex())) {
ORTE_ERROR_LOG(rc);
}
}
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
"%s grpcomm:hier: modex completed",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
return rc;
}


@ -344,43 +344,25 @@ static int modex(opal_list_t *procs)
free(rml_uri); free(rml_uri);
#if OPAL_HAVE_HWLOC #if OPAL_HAVE_HWLOC
{ if (ORTE_SUCCESS != (rc = setup_key(ORTE_PROC_MY_NAME, "BIND_LEVEL"))) {
char *locale; ORTE_ERROR_LOG(rc);
return rc;
/* provide the locality info */ }
if (NULL != opal_hwloc_topology) { snprintf(val, 64, "%u", (unsigned int)orte_process_info.bind_level);
/* our cpuset should already be known, but check for safety */ rc = kvs_put(pmi_kvs_key, val);
if (NULL == opal_hwloc_my_cpuset) { if (PMI_SUCCESS != rc) {
opal_hwloc_base_get_local_cpuset(); ORTE_PMI_ERROR(rc, "PMI_KVS_Put");
} return ORTE_ERROR;
/* convert to a string */ }
hwloc_bitmap_list_asprintf(&locale, opal_hwloc_my_cpuset); if (ORTE_SUCCESS != (rc = setup_key(ORTE_PROC_MY_NAME, "BIND_IDX"))) {
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, ORTE_ERROR_LOG(rc);
"%s grpcomm:pmi LOCALE %s", return rc;
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), locale)); }
/* NTH: some characters are not allowed in pmi2 land - not sure snprintf(val, 64, "%u", orte_process_info.bind_idx);
* if hwloc would use them, but just to be safe we need to encode rc = kvs_put(pmi_kvs_key, val);
*/ if (PMI_SUCCESS != rc) {
if (ORTE_SUCCESS != (rc = pmi_encode(locale, strlen(locale)))) { ORTE_PMI_ERROR(rc, "PMI_KVS_Put");
ORTE_ERROR_LOG(rc); return ORTE_ERROR;
free(locale);
return rc;
}
/* get the key */
if (ORTE_SUCCESS != (rc = setup_key(ORTE_PROC_MY_NAME, "HWLOC"))) {
ORTE_ERROR_LOG(rc);
free(locale);
return rc;
}
/* encoding puts the encoded value in pmi_attr_val */
rc = kvs_put(pmi_kvs_key, pmi_attr_val);
if (PMI_SUCCESS != rc) {
ORTE_PMI_ERROR(rc, "PMI_KVS_Put");
free(locale);
return ORTE_ERROR;
}
free(locale);
}
} }
#endif #endif
@ -527,10 +509,11 @@ static int modex(opal_list_t *procs)
(unsigned int)pmap->node_rank)); (unsigned int)pmap->node_rank));
#if OPAL_HAVE_HWLOC #if OPAL_HAVE_HWLOC
{ {
char *locale; opal_hwloc_level_t bind_level;
unsigned int bind_idx;
/* get the proc's locality info, if available */ /* get the proc's locality info, if available */
if (ORTE_SUCCESS != (rc = setup_key(&name, "HWLOC"))) { if (ORTE_SUCCESS != (rc = setup_key(&name, "BIND_LEVEL"))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return rc; return rc;
} }
@ -561,30 +544,28 @@ static int modex(opal_list_t *procs)
ORTE_NAME_PRINT(&name))); ORTE_NAME_PRINT(&name)));
pmap->locality = OPAL_PROC_ON_NODE; pmap->locality = OPAL_PROC_ON_NODE;
} else { } else {
/* we encoded to protect against pmi2 restrictions */ bind_level = strtol(pmi_attr_val, NULL, 10);
locale = pmi_decode(&len); if (ORTE_SUCCESS != (rc = setup_key(&name, "BIND_IDX"))) {
if (NULL == locale) { ORTE_ERROR_LOG(rc);
return ORTE_ERROR; return rc;
} }
/* convert the locale to a cpuset */ rc = kvs_get(pmi_kvs_key, pmi_attr_val, pmi_vallen_max);
if (NULL == orte_grpcomm_base.working_cpuset) { if (PMI_SUCCESS != rc) {
orte_grpcomm_base.working_cpuset = hwloc_bitmap_alloc(); /* all we know is we share a node */
pmap->locality = OPAL_PROC_ON_NODE;
} else {
bind_idx = strtol(pmi_attr_val, NULL, 10);
/* determine relative location on our node */
pmap->locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
orte_process_info.bind_level,
orte_process_info.bind_idx,
bind_level, bind_idx);
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
"%s grpcommpmi setting proc %s locale %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&name),
opal_hwloc_base_print_locality(pmap->locality)));
} }
if (0 != hwloc_bitmap_list_sscanf(orte_grpcomm_base.working_cpuset, locale)) {
/* got a bad locale */
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
free(locale);
return ORTE_ERR_VALUE_OUT_OF_BOUNDS;
}
free(locale);
/* determine relative location on our node */
pmap->locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
opal_hwloc_my_cpuset,
orte_grpcomm_base.working_cpuset);
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
"%s grpcommpmi setting proc %s locale %04x",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&name), pmap->locality));
} }
} }
} }


@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -53,8 +54,6 @@ typedef struct orte_odls_base_t {
opal_list_t available_components; opal_list_t available_components;
/** selected component */ /** selected component */
orte_odls_base_component_t selected_component; orte_odls_base_component_t selected_component;
/* warn if binding no-op */
bool warn_if_not_bound;
} orte_odls_base_t; } orte_odls_base_t;
/** /**


@ -13,6 +13,7 @@
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2011 Los Alamos National Security, LLC. * Copyright (c) 2011 Los Alamos National Security, LLC.
* All rights reserved. * All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -47,7 +48,7 @@
#include "opal/util/path.h" #include "opal/util/path.h"
#include "opal/util/sys_limits.h" #include "opal/util/sys_limits.h"
#include "opal/dss/dss.h" #include "opal/dss/dss.h"
#include "opal/mca/paffinity/base/base.h" #include "opal/mca/hwloc/hwloc.h"
#include "opal/mca/shmem/base/base.h" #include "opal/mca/shmem/base/base.h"
#include "opal/mca/pstat/pstat.h" #include "opal/mca/pstat/pstat.h"
@ -85,8 +86,6 @@
#include "orte/mca/odls/base/base.h" #include "orte/mca/odls/base/base.h"
#include "orte/mca/odls/base/odls_private.h" #include "orte/mca/odls/base/odls_private.h"
static bool override_oversubscribed = false;
/* IT IS CRITICAL THAT ANY CHANGE IN THE ORDER OF THE INFO PACKED IN /* IT IS CRITICAL THAT ANY CHANGE IN THE ORDER OF THE INFO PACKED IN
* THIS FUNCTION BE REFLECTED IN THE CONSTRUCT_CHILD_LIST PARSER BELOW * THIS FUNCTION BE REFLECTED IN THE CONSTRUCT_CHILD_LIST PARSER BELOW
*/ */
@ -99,10 +98,8 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
orte_job_map_t *map=NULL; orte_job_map_t *map=NULL;
opal_buffer_t *wireup; opal_buffer_t *wireup;
opal_byte_object_t bo, *boptr; opal_byte_object_t bo, *boptr;
-    int32_t numbytes, *restarts;
+    int32_t numbytes;
int8_t flag; int8_t flag;
orte_app_idx_t *app_idx;
orte_vpid_t i;
int j; int j;
orte_daemon_cmd_flag_t command; orte_daemon_cmd_flag_t command;
orte_app_context_t *app; orte_app_context_t *app;
@ -265,30 +262,14 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
return rc; return rc;
} }
-    /* pack the oversubscribe override flag */
-    if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->oversubscribe_override, 1, OPAL_BOOL))) {
+#if OPAL_HAVE_HWLOC
+    /* pack the binding policy so the daemon knows if binding is required */
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->map->binding, 1, OPAL_BINDING_POLICY))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return rc; return rc;
} }
#endif
/* pack the map & binding policy for this job */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &map->policy, 1, ORTE_MAPPING_POLICY))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the cpus_per_rank for this job */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &map->cpus_per_rank, 1, OPAL_INT16))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the stride for this job */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &map->stride, 1, OPAL_INT16))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the control flags for this job */ /* pack the control flags for this job */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->controls, 1, ORTE_JOB_CONTROL))) { if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->controls, 1, ORTE_JOB_CONTROL))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
@ -337,52 +318,17 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
/* release the data since it has now been copied into our buffer */ /* release the data since it has now been copied into our buffer */
free(bo.bytes); free(bo.bytes);
/* transfer and pack the app_idx and restart arrays for this job */ /* pack the procs for this job */
app_idx = (orte_app_idx_t*)malloc(jdata->num_procs * sizeof(orte_app_idx_t)); for (j=0; j < jdata->procs->size; j++) {
restarts = (int32_t*)malloc(jdata->num_procs * sizeof(int32_t));
for (j=0, i=0; i < jdata->num_procs && j < jdata->procs->size; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
continue; continue;
} }
app_idx[i] = proc->app_idx; if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &proc, 1, ORTE_PROC))) {
restarts[i++] = proc->restarts;
}
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, app_idx, jdata->num_procs, ORTE_APP_IDX))) {
ORTE_ERROR_LOG(rc);
return rc;
}
free(app_idx);
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, restarts, jdata->num_procs, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
free(restarts);
/* are there cpu_list strings? */
if (jdata->map->cpu_lists) {
flag = (int8_t)true;
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &flag, 1, OPAL_INT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
for (j=0, i=0; i < jdata->num_procs && j < jdata->procs->size; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
continue;
}
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &proc->slot_list, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
i++;
}
} else {
flag = (int8_t)false;
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &flag, 1, OPAL_INT8))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return rc; return rc;
} }
} }
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
@ -474,18 +420,15 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
orte_vpid_t j, host_daemon; orte_vpid_t j, host_daemon;
orte_odls_child_t *child; orte_odls_child_t *child;
orte_std_cntr_t cnt; orte_std_cntr_t cnt;
orte_process_name_t proc;
orte_odls_job_t *jobdat=NULL; orte_odls_job_t *jobdat=NULL;
opal_byte_object_t *bo; opal_byte_object_t *bo;
opal_list_item_t *item; opal_list_item_t *item;
int8_t flag; int8_t flag;
orte_app_idx_t *app_idx=NULL;
int32_t *restarts=NULL;
char **slot_str=NULL;
orte_jobid_t debugger; orte_jobid_t debugger;
bool add_child; bool add_child;
orte_ns_cmp_bitmask_t mask; orte_ns_cmp_bitmask_t mask;
orte_app_context_t *app; orte_app_context_t *app;
orte_proc_t *pptr;
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:constructing child list", "%s odls:constructing child list",
@ -614,30 +557,14 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
goto REPORT_ERROR; goto REPORT_ERROR;
} }
-    /* unpack the override oversubscribed flag */
+#if OPAL_HAVE_HWLOC
+    /* unpack the binding policy */
     cnt=1;
-    if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &override_oversubscribed, &cnt, OPAL_BOOL))) {
+    if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->binding, &cnt, OPAL_BINDING_POLICY))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
/* unpack the mapping policy for the job */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->policy, &cnt, ORTE_MAPPING_POLICY))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
/* unpack the cpus/rank for the job */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->cpus_per_rank, &cnt, OPAL_INT16))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
/* unpack the stride for the job */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->stride, &cnt, OPAL_INT16))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
goto REPORT_ERROR; goto REPORT_ERROR;
} }
#endif
/* unpack the control flags for the job */ /* unpack the control flags for the job */
cnt=1; cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->controls, &cnt, ORTE_JOB_CONTROL))) { if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->controls, &cnt, ORTE_JOB_CONTROL))) {
@ -693,53 +620,21 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
goto REPORT_ERROR; goto REPORT_ERROR;
} }
/* allocate memory for app_idx */ /* unpack the procs */
app_idx = (orte_app_idx_t*)malloc(jobdat->num_procs * sizeof(orte_app_idx_t));
/* unpack app_idx in one shot */
cnt=jobdat->num_procs;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, app_idx, &cnt, ORTE_APP_IDX))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
/* allocate memory for restarts */
restarts = (int32_t*)malloc(jobdat->num_procs * sizeof(int32_t));
/* unpack restarts in one shot */
cnt=jobdat->num_procs;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, restarts, &cnt, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
/* unpack flag to indicate if slot_strings are present */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
if (flag) {
/* allocate space */
slot_str = (char**)malloc(jobdat->num_procs * sizeof(char*));
for (j=0; j < jobdat->num_procs; j++) {
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &slot_str[j], &cnt, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
}
}
/* cycle through the procs and find mine */
proc.jobid = jobdat->jobid;
for (j=0; j < jobdat->num_procs; j++) { for (j=0; j < jobdat->num_procs; j++) {
proc.vpid = j; cnt=1;
ORTE_EPOCH_SET(proc.epoch,orte_ess.proc_get_epoch(&proc)); if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &pptr, &cnt, ORTE_PROC))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
/* see if it is one of mine */
ORTE_EPOCH_SET(proc.epoch,orte_ess.proc_get_epoch(&pptr->name));
/* get the vpid of the daemon that is to host this proc */ /* get the vpid of the daemon that is to host this proc */
OPAL_OUTPUT_VERBOSE((20, orte_odls_globals.output, OPAL_OUTPUT_VERBOSE((20, orte_odls_globals.output,
"%s odls:constructing child list - looking for daemon for proc %s", "%s odls:constructing child list - looking for daemon for proc %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc))); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name)));
if (ORTE_VPID_INVALID == (host_daemon = orte_ess.proc_get_daemon(&proc))) { if (ORTE_VPID_INVALID == (host_daemon = orte_ess.proc_get_daemon(&pptr->name))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
rc = ORTE_ERR_NOT_FOUND; rc = ORTE_ERR_NOT_FOUND;
goto REPORT_ERROR; goto REPORT_ERROR;
@ -747,7 +642,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
OPAL_OUTPUT_VERBOSE((20, orte_odls_globals.output, OPAL_OUTPUT_VERBOSE((20, orte_odls_globals.output,
"%s odls:constructing child list - checking proc %s on daemon %s", "%s odls:constructing child list - checking proc %s on daemon %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name),
ORTE_VPID_PRINT(host_daemon))); ORTE_VPID_PRINT(host_daemon)));
/* does this proc belong to us? */ /* does this proc belong to us? */
@ -755,7 +650,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
OPAL_OUTPUT_VERBOSE((10, orte_odls_globals.output, OPAL_OUTPUT_VERBOSE((10, orte_odls_globals.output,
"%s odls:constructing child list - found proc %s for me!", "%s odls:constructing child list - found proc %s for me!",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc))); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name)));
add_child = true; add_child = true;
/* if this job is restarting procs, then we need to treat things /* if this job is restarting procs, then we need to treat things
@ -773,17 +668,17 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
mask = ORTE_NS_CMP_ALL; mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == if (OPAL_EQUAL ==
orte_util_compare_name_fields(mask, child->name, &proc)) { orte_util_compare_name_fields(mask, child->name, &pptr->name)) {
/* do not duplicate this child on the list! */ /* do not duplicate this child on the list! */
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"proc %s is on list and is %s", "proc %s is on list and is %s",
ORTE_NAME_PRINT(&proc), ORTE_NAME_PRINT(&pptr->name),
(child->alive) ? "ALIVE" : "DEAD")); (child->alive) ? "ALIVE" : "DEAD"));
add_child = false; add_child = false;
child->restarts = restarts[j]; child->restarts = pptr->restarts;
child->do_not_barrier = true; child->do_not_barrier = true;
/* mark that this app_context is being used on this node */ /* mark that this app_context is being used on this node */
app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, app_idx[j]); app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, pptr->app_idx);
app->used_on_node = true; app->used_on_node = true;
break; break;
} }
@ -794,27 +689,29 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
if (add_child) { if (add_child) {
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"adding proc %s to my local list", "adding proc %s to my local list",
ORTE_NAME_PRINT(&proc))); ORTE_NAME_PRINT(&pptr->name)));
/* keep tabs of the number of local procs */ /* keep tabs of the number of local procs */
jobdat->num_local_procs++; jobdat->num_local_procs++;
/* add this proc to our child list */ /* add this proc to our child list */
child = OBJ_NEW(orte_odls_child_t); child = OBJ_NEW(orte_odls_child_t);
/* copy the name to preserve it */ /* copy the name to preserve it */
if (ORTE_SUCCESS != (rc = opal_dss.copy((void**)&child->name, &proc, ORTE_NAME))) { if (ORTE_SUCCESS != (rc = opal_dss.copy((void**)&child->name, &pptr->name, ORTE_NAME))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
goto REPORT_ERROR; goto REPORT_ERROR;
} }
child->app_idx = app_idx[j]; /* save the index into the app_context objects */ child->app_idx = pptr->app_idx; /* save the index into the app_context objects */
child->restarts = restarts[j]; child->restarts = pptr->restarts;
/* if the job is in restart mode, the child must not barrier when launched */ /* if the job is in restart mode, the child must not barrier when launched */
if (ORTE_JOB_STATE_RESTART == jobdat->state) { if (ORTE_JOB_STATE_RESTART == jobdat->state) {
child->do_not_barrier = true; child->do_not_barrier = true;
} }
if (NULL != slot_str && NULL != slot_str[j]) { #if OPAL_HAVE_HWLOC
child->slot_list = strdup(slot_str[j]); if (NULL != pptr->cpu_bitmap) {
child->cpu_bitmap = strdup(pptr->cpu_bitmap);
} }
#endif
/* mark that this app_context is being used on this node */ /* mark that this app_context is being used on this node */
app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, app_idx[j]); app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, pptr->app_idx);
app->used_on_node = true; app->used_on_node = true;
/* protect operation on the global list of children */ /* protect operation on the global list of children */
OPAL_THREAD_LOCK(&orte_odls_globals.mutex); OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
@ -823,6 +720,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
} }
} }
OBJ_RELEASE(pptr);
} }
/* flag that the launch msg has been processed so daemon collectives can proceed */ /* flag that the launch msg has been processed so daemon collectives can proceed */
@ -832,22 +730,6 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
OPAL_THREAD_UNLOCK(&jobdat->lock); OPAL_THREAD_UNLOCK(&jobdat->lock);
done: done:
if (NULL != app_idx) {
free(app_idx);
app_idx = NULL;
}
if (NULL != restarts) {
free(restarts);
restarts = NULL;
}
if (NULL != slot_str) {
for (j=0; j < jobdat->num_procs; j++) {
free(slot_str[j]);
}
free(slot_str);
slot_str = NULL;
}
return ORTE_SUCCESS; return ORTE_SUCCESS;
REPORT_ERROR: REPORT_ERROR:
@ -860,24 +742,6 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
orte_errmgr.update_state(*job, ORTE_JOB_STATE_NEVER_LAUNCHED, orte_errmgr.update_state(*job, ORTE_JOB_STATE_NEVER_LAUNCHED,
NULL, ORTE_PROC_STATE_UNDEF, 0, rc); NULL, ORTE_PROC_STATE_UNDEF, 0, rc);
if (NULL != app_idx) {
free(app_idx);
app_idx = NULL;
}
if (NULL != restarts) {
free(restarts);
restarts = NULL;
}
if (NULL != slot_str && NULL != jobdat) {
for (j=0; j < jobdat->num_procs; j++) {
if (NULL != slot_str[j]) {
free(slot_str[j]);
}
}
free(slot_str);
slot_str = NULL;
}
return rc; return rc;
} }
@ -900,15 +764,15 @@ static int odls_base_default_setup_fork(orte_app_context_t *context,
} }
/* special case handling for --prefix: this is somewhat icky, /* special case handling for --prefix: this is somewhat icky,
but at least some users do this. :-\ It is possible that but at least some users do this. :-\ It is possible that
when using --prefix, the user will also "-x PATH" and/or when using --prefix, the user will also "-x PATH" and/or
"-x LD_LIBRARY_PATH", which would therefore clobber the "-x LD_LIBRARY_PATH", which would therefore clobber the
work that was done in the prior pls to ensure that we have work that was done in the prior pls to ensure that we have
the prefix at the beginning of the PATH and the prefix at the beginning of the PATH and
LD_LIBRARY_PATH. So examine the context->env and see if we LD_LIBRARY_PATH. So examine the context->env and see if we
find PATH or LD_LIBRARY_PATH. If found, that means the find PATH or LD_LIBRARY_PATH. If found, that means the
prior work was clobbered, and we need to re-prefix those prior work was clobbered, and we need to re-prefix those
variables. */ variables. */
for (i = 0; NULL != context->prefix_dir && NULL != context->env && NULL != context->env[i]; ++i) { for (i = 0; NULL != context->prefix_dir && NULL != context->env && NULL != context->env[i]; ++i) {
char *newenv; char *newenv;
@ -979,19 +843,42 @@ static int odls_base_default_setup_fork(orte_app_context_t *context,
free(param); free(param);
free(param2); free(param2);
/* pass a param telling the child what type and model of cpu we are on, #if OPAL_HAVE_HWLOC
* if we know it {
*/ /* pass a param telling the child what type and model of cpu we are on,
if (NULL != orte_local_cpu_type) { * if we know it. If hwloc has the value, use what it knows. Otherwise,
param = mca_base_param_environ_variable("orte","cpu","type"); * see if we were explicitly given it and use that value.
opal_setenv(param, orte_local_cpu_type, true, environ_copy); */
free(param); hwloc_obj_t obj;
} char *htmp;
if (NULL != orte_local_cpu_model) { if (NULL != opal_hwloc_topology) {
param = mca_base_param_environ_variable("orte","cpu","model"); obj = hwloc_get_root_obj(opal_hwloc_topology);
opal_setenv(param, orte_local_cpu_model, true, environ_copy); if (NULL != (htmp = hwloc_obj_get_info_by_name(obj, "CPUType")) ||
free(param); NULL != (htmp = orte_local_cpu_type)) {
param = mca_base_param_environ_variable("orte","cpu","type");
opal_setenv(param, htmp, true, environ_copy);
free(param);
}
if (NULL != (htmp = hwloc_obj_get_info_by_name(obj, "CPUModel")) ||
NULL != (htmp = orte_local_cpu_model)) {
param = mca_base_param_environ_variable("orte","cpu","model");
opal_setenv(param, htmp, true, environ_copy);
free(param);
}
} else {
if (NULL != orte_local_cpu_type) {
param = mca_base_param_environ_variable("orte","cpu","type");
opal_setenv(param, orte_local_cpu_type, true, environ_copy);
free(param);
}
if (NULL != orte_local_cpu_model) {
param = mca_base_param_environ_variable("orte","cpu","model");
opal_setenv(param, orte_local_cpu_model, true, environ_copy);
free(param);
}
}
} }
#endif
/* get shmem's best component name so we can provide a hint to the shmem /* get shmem's best component name so we can provide a hint to the shmem
* framework. the idea here is to have someone figure out what component to * framework. the idea here is to have someone figure out what component to
@ -1277,7 +1164,6 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
opal_list_item_t *item; opal_list_item_t *item;
orte_app_context_t *app, *dbg; orte_app_context_t *app, *dbg;
orte_odls_child_t *child=NULL; orte_odls_child_t *child=NULL;
int num_processors;
bool oversubscribed; bool oversubscribed;
int rc=ORTE_SUCCESS; int rc=ORTE_SUCCESS;
bool launch_failed=true; bool launch_failed=true;
@ -1386,48 +1272,6 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
orte_sstore.wait_all_deps(); orte_sstore.wait_all_deps();
#endif #endif
/* if the mapper says we are oversubscribed, then we trust it - unless
* it told us -not- to!
*/
if (oversubscribed && !override_oversubscribed) {
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:launch mapper declares this node oversubscribed",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
} else {
/* if the mapper thinks we are not oversubscribed, then we
* do a final smoke test by checking against the #processors. This
* is done solely in case the mapper had incorrect knowledge of
* the #local processors
*/
/* compute the number of local procs alive or about to be launched
* as part of this job
*/
total_num_local_procs = compute_num_procs_alive(job) + jobdat->num_local_procs;
/* get the number of local processors */
if (ORTE_SUCCESS != (rc = opal_paffinity_base_get_processor_info(&num_processors))) {
/* if we cannot find the number of local processors, we have no choice
* but to default to conservative settings
*/
oversubscribed = true;
} else {
if (total_num_local_procs > num_processors) {
/* if the #procs > #processors, declare us oversubscribed. This
* covers the case where the user didn't tell us anything about the
* number of available slots, so we defaulted to a value of 1
*/
oversubscribed = true;
} else {
/* otherwise, declare us to not be oversubscribed so we can be aggressive */
oversubscribed = false;
}
}
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:launch found %d processors for %d children and locally set oversubscribed to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(ORTE_SUCCESS == rc) ? num_processors : -1, (int)opal_list_get_size(&orte_local_children),
oversubscribed ? "true" : "false"));
}
/* setup to report the proc state to the HNP */ /* setup to report the proc state to the HNP */
OBJ_CONSTRUCT(&alert, opal_buffer_t); OBJ_CONSTRUCT(&alert, opal_buffer_t);
@ -2016,7 +1860,11 @@ void orte_odls_base_setup_singleton_jobdat(orte_jobid_t jobid)
opal_buffer_t buffer; opal_buffer_t buffer;
opal_byte_object_t *bo; opal_byte_object_t *bo;
int rc; int rc;
#if OPAL_HAVE_HWLOC
opal_hwloc_level_t bind_level;
unsigned int bind_idx;
#endif
/* create a job tracking object for it */ /* create a job tracking object for it */
jobdat = OBJ_NEW(orte_odls_job_t); jobdat = OBJ_NEW(orte_odls_job_t);
jobdat->jobid = jobid; jobdat->jobid = jobid;
@ -2028,12 +1876,20 @@ void orte_odls_base_setup_singleton_jobdat(orte_jobid_t jobid)
opal_dss.pack(&buffer, &jobid, 1, ORTE_JOBID); /* jobid */ opal_dss.pack(&buffer, &jobid, 1, ORTE_JOBID); /* jobid */
vpid1 = 1; vpid1 = 1;
opal_dss.pack(&buffer, &vpid1, 1, ORTE_VPID); /* num_procs */ opal_dss.pack(&buffer, &vpid1, 1, ORTE_VPID); /* num_procs */
#if OPAL_HAVE_HWLOC
bind_level = OPAL_HWLOC_NODE_LEVEL;
opal_dss.pack(&buffer, &bind_level, 1, OPAL_HWLOC_LEVEL_T); /* num_procs */
#endif
one32 = 0; one32 = 0;
opal_dss.pack(&buffer, &one32, 1, OPAL_INT32); /* node index */ opal_dss.pack(&buffer, &one32, 1, OPAL_INT32); /* node index */
lrank = 0; lrank = 0;
opal_dss.pack(&buffer, &lrank, 1, ORTE_LOCAL_RANK); /* local rank */ opal_dss.pack(&buffer, &lrank, 1, ORTE_LOCAL_RANK); /* local rank */
nrank = 0; nrank = 0;
opal_dss.pack(&buffer, &nrank, 1, ORTE_NODE_RANK); /* node rank */ opal_dss.pack(&buffer, &nrank, 1, ORTE_NODE_RANK); /* node rank */
#if OPAL_HAVE_HWLOC
bind_idx = 0;
opal_dss.pack(&buffer, &bind_idx, 1, OPAL_UINT);
#endif
/* setup a byte object and unload the packed data to it */ /* setup a byte object and unload the packed data to it */
bo = (opal_byte_object_t*)malloc(sizeof(opal_byte_object_t)); bo = (opal_byte_object_t*)malloc(sizeof(opal_byte_object_t));
opal_dss.unload(&buffer, (void**)&bo->bytes, &bo->size); opal_dss.unload(&buffer, (void**)&bo->bytes, &bo->size);
@ -2189,14 +2045,14 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc,
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:sync sending byte object", "%s odls:sync sending byte object",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
opal_dss.pack(&buffer, &orte_odls_globals.dmap, 1, OPAL_BYTE_OBJECT);
opal_dss.pack(&buffer, &jobdat->pmap, 1, OPAL_BYTE_OBJECT);
#if OPAL_HAVE_HWLOC #if OPAL_HAVE_HWLOC
/* send the local topology so the individual apps /* send the local topology so the individual apps
* don't hammer the system to collect it themselves * don't hammer the system to collect it themselves
*/ */
opal_dss.pack(&buffer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO); opal_dss.pack(&buffer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO);
#endif #endif
opal_dss.pack(&buffer, &orte_odls_globals.dmap, 1, OPAL_BYTE_OBJECT);
opal_dss.pack(&buffer, &jobdat->pmap, 1, OPAL_BYTE_OBJECT);
} }
} }
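
For reference, the hunk above replaces the orte_local_cpu_type/orte_local_cpu_model fallbacks with a lookup on the hwloc root object. A minimal standalone sketch of that query pattern (not part of this commit; it assumes only a stock hwloc installation):

#include <stdio.h>
#include <hwloc.h>

int main(void)
{
    hwloc_topology_t topo;
    hwloc_obj_t root;
    const char *cpu_type, *cpu_model;

    if (0 != hwloc_topology_init(&topo) || 0 != hwloc_topology_load(topo)) {
        fprintf(stderr, "failed to load hwloc topology\n");
        return 1;
    }

    /* the root (machine) object carries string info pairs such as
     * "CPUType" and "CPUModel" on platforms where hwloc can detect them */
    root = hwloc_get_root_obj(topo);
    cpu_type  = hwloc_obj_get_info_by_name(root, "CPUType");
    cpu_model = hwloc_obj_get_info_by_name(root, "CPUModel");

    printf("CPUType:  %s\n", (NULL != cpu_type)  ? cpu_type  : "(not reported)");
    printf("CPUModel: %s\n", (NULL != cpu_model) ? cpu_model : "(not reported)");

    hwloc_topology_destroy(topo);
    return 0;
}

In the commit itself, whichever value is found is exported to the child's environment through the orte cpu type/model MCA parameters, falling back to the orte_local_cpu_* globals when hwloc has no value.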

View File

@ -10,6 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2010-2011 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2010-2011 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -31,7 +32,6 @@
#include "opal/mca/base/base.h" #include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h" #include "opal/mca/base/mca_base_param.h"
#include "opal/mca/hwloc/hwloc.h" #include "opal/mca/hwloc/hwloc.h"
#include "opal/mca/paffinity/base/base.h"
#include "opal/util/output.h" #include "opal/util/output.h"
#include "opal/util/path.h" #include "opal/util/path.h"
#include "opal/util/argv.h" #include "opal/util/argv.h"
@ -89,7 +89,7 @@ orte_odls_globals_t orte_odls_globals;
int orte_odls_base_open(void) int orte_odls_base_open(void)
{ {
char **ranks=NULL, *tmp; char **ranks=NULL, *tmp;
int i, rank, sock, core; int i, rank;
orte_namelist_t *nm; orte_namelist_t *nm;
bool xterm_hold; bool xterm_hold;
@ -101,13 +101,6 @@ int orte_odls_base_open(void)
"Time to wait for a process to die after issuing a kill signal to it", "Time to wait for a process to die after issuing a kill signal to it",
false, false, 1, &orte_odls_globals.timeout_before_sigkill); false, false, 1, &orte_odls_globals.timeout_before_sigkill);
mca_base_param_reg_int_name("odls", "warn_if_not_bound",
"If nonzero, issue a warning if the program asked "
"for a binding that results in a no-op (ex: "
"bind-to-socket on a single socket node)",
false, false, 1, &i);
orte_odls_base.warn_if_not_bound = OPAL_INT_TO_BOOL(i);
/* initialize the global list of local children and job data */ /* initialize the global list of local children and job data */
OBJ_CONSTRUCT(&orte_local_children, opal_list_t); OBJ_CONSTRUCT(&orte_local_children, opal_list_t);
OBJ_CONSTRUCT(&orte_local_children_lock, opal_mutex_t); OBJ_CONSTRUCT(&orte_local_children_lock, opal_mutex_t);
@ -125,43 +118,6 @@ int orte_odls_base_open(void)
orte_odls_globals.debugger = NULL; orte_odls_globals.debugger = NULL;
orte_odls_globals.debugger_launched = false; orte_odls_globals.debugger_launched = false;
/* get any external processor bindings */
OPAL_PAFFINITY_CPU_ZERO(orte_odls_globals.my_cores);
orte_odls_globals.bound = false;
orte_odls_globals.num_processors = 0;
OBJ_CONSTRUCT(&orte_odls_globals.sockets, opal_bitmap_t);
opal_bitmap_init(&orte_odls_globals.sockets, 16);
/* default the number of sockets to those found during startup */
orte_odls_globals.num_sockets = orte_default_num_sockets_per_board;
/* see if paffinity is supported */
if (ORTE_SUCCESS == opal_paffinity_base_get(&orte_odls_globals.my_cores)) {
/* get the number of local processors */
opal_paffinity_base_get_processor_info(&orte_odls_globals.num_processors);
/* determine if we are bound */
OPAL_PAFFINITY_PROCESS_IS_BOUND(orte_odls_globals.my_cores, &orte_odls_globals.bound);
/* if we are bound, determine the number of sockets - and which ones - that are available to us */
if (orte_odls_globals.bound) {
for (i=0; i < orte_odls_globals.num_processors; i++) {
if (OPAL_PAFFINITY_CPU_ISSET(i, orte_odls_globals.my_cores)) {
opal_paffinity_base_get_map_to_socket_core(i, &sock, &core);
opal_bitmap_set_bit(&orte_odls_globals.sockets, sock);
}
}
/* determine how many sockets we have available to us */
orte_odls_globals.num_sockets = 0;
for (i=0; i < opal_bitmap_size(&orte_odls_globals.sockets); i++) {
if (opal_bitmap_is_set_bit(&orte_odls_globals.sockets, i)) {
orte_odls_globals.num_sockets++;
}
}
if (orte_report_bindings) {
orte_show_help("help-odls-base.txt",
"orte-odls-base:show-bindings",
false, orte_odls_globals.my_cores.bitmask[0]);
}
}
}
/* check if the user requested that we display output in xterms */ /* check if the user requested that we display output in xterms */
if (NULL != orte_xterm) { if (NULL != orte_xterm) {
/* construct a list of ranks to be displayed */ /* construct a list of ranks to be displayed */
@ -250,13 +206,15 @@ static void orte_odls_child_constructor(orte_odls_child_t *ptr)
ptr->init_recvd = false; ptr->init_recvd = false;
ptr->fini_recvd = false; ptr->fini_recvd = false;
ptr->rml_uri = NULL; ptr->rml_uri = NULL;
ptr->slot_list = NULL;
ptr->waitpid_recvd = false; ptr->waitpid_recvd = false;
ptr->iof_complete = false; ptr->iof_complete = false;
ptr->do_not_barrier = false; ptr->do_not_barrier = false;
ptr->notified = false; ptr->notified = false;
OBJ_CONSTRUCT(&ptr->stats, opal_ring_buffer_t); OBJ_CONSTRUCT(&ptr->stats, opal_ring_buffer_t);
opal_ring_buffer_init(&ptr->stats, orte_stat_history_size); opal_ring_buffer_init(&ptr->stats, orte_stat_history_size);
#if OPAL_HAVE_HWLOC
ptr->cpu_bitmap = NULL;
#endif
} }
static void orte_odls_child_destructor(orte_odls_child_t *ptr) static void orte_odls_child_destructor(orte_odls_child_t *ptr)
{ {
@ -264,12 +222,16 @@ static void orte_odls_child_destructor(orte_odls_child_t *ptr)
if (NULL != ptr->name) free(ptr->name); if (NULL != ptr->name) free(ptr->name);
if (NULL != ptr->rml_uri) free(ptr->rml_uri); if (NULL != ptr->rml_uri) free(ptr->rml_uri);
if (NULL != ptr->slot_list) free(ptr->slot_list);
while (NULL != (st = (opal_pstats_t*)opal_ring_buffer_pop(&ptr->stats))) { while (NULL != (st = (opal_pstats_t*)opal_ring_buffer_pop(&ptr->stats))) {
OBJ_RELEASE(st); OBJ_RELEASE(st);
} }
OBJ_DESTRUCT(&ptr->stats); OBJ_DESTRUCT(&ptr->stats);
#if OPAL_HAVE_HWLOC
if (NULL != ptr->cpu_bitmap) {
free(ptr->cpu_bitmap);
}
#endif
} }
OBJ_CLASS_INSTANCE(orte_odls_child_t, OBJ_CLASS_INSTANCE(orte_odls_child_t,
opal_list_item_t, opal_list_item_t,
@ -288,7 +250,9 @@ static void orte_odls_job_constructor(orte_odls_job_t *ptr)
OBJ_CONSTRUCT(&ptr->apps, opal_pointer_array_t); OBJ_CONSTRUCT(&ptr->apps, opal_pointer_array_t);
opal_pointer_array_init(&ptr->apps, 2, INT_MAX, 2); opal_pointer_array_init(&ptr->apps, 2, INT_MAX, 2);
ptr->num_apps = 0; ptr->num_apps = 0;
ptr->policy = 0; #if OPAL_HAVE_HWLOC
ptr->binding = 0;
#endif
ptr->cpus_per_rank = 1; ptr->cpus_per_rank = 1;
ptr->stride = 1; ptr->stride = 1;
ptr->controls = 0; ptr->controls = 0;

View File

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -33,7 +34,6 @@
#include "opal/threads/mutex.h" #include "opal/threads/mutex.h"
#include "opal/threads/condition.h" #include "opal/threads/condition.h"
#include "opal/dss/dss_types.h" #include "opal/dss/dss_types.h"
#include "opal/mca/paffinity/paffinity.h"
#include "orte/mca/grpcomm/grpcomm_types.h" #include "orte/mca/grpcomm/grpcomm_types.h"
#include "orte/mca/rml/rml_types.h" #include "orte/mca/rml/rml_types.h"
@ -66,18 +66,6 @@ typedef struct {
opal_list_t xterm_ranks; opal_list_t xterm_ranks;
/* the xterm cmd to be used */ /* the xterm cmd to be used */
char **xtermcmd; char **xtermcmd;
/* any externally provided bindings */
opal_paffinity_base_cpu_set_t my_cores;
/* flag whether or not we are bound */
bool bound;
/* local number of processors */
int num_processors;
/* map of locally available sockets
* as determined by external bindings
*/
opal_bitmap_t sockets;
/* number of sockets available to us */
int num_sockets;
} orte_odls_globals_t; } orte_odls_globals_t;
ORTE_DECLSPEC extern orte_odls_globals_t orte_odls_globals; ORTE_DECLSPEC extern orte_odls_globals_t orte_odls_globals;

View File

@ -11,7 +11,7 @@
# Copyright (c) 2004-2005 The Regents of the University of California. # Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved. # All rights reserved.
# Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. # Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$ # $COPYRIGHT$
# #
# Additional copyrights may follow # Additional copyrights may follow
@ -94,3 +94,24 @@ Your job may behave unpredictably after this, or abort.
Application name: %s Application name: %s
Function: %s Function: %s
Location: %s:%d Location: %s:%d
#
[memory not bound]
WARNING: Open MPI tried to bind a process but failed. This is a
warning only; your job will continue, though performance may
be degraded.
Local host: %s
Application name: %s
Error message: %s
Location: %s:%d
#
[memory binding error]
Open MPI tried to bind memory for a new process but something went
wrong. The process was killed without launching the target
application. Your job will now abort.
Local host: %s
Application name: %s
Error message: %s
Location: %s:%d
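
For reference, these two new messages back a warn-versus-abort choice the launcher makes when memory binding fails (the corresponding membind handling appears later in this diff). A minimal sketch of that decision, using hypothetical names (mbfa_t, handle_membind_failure, and the example host/app strings) rather than the real MCA plumbing:

#include <stdio.h>
#include <stdlib.h>

/* hypothetical stand-in for the memory-bind-failure-action setting */
typedef enum { MBFA_SILENT, MBFA_WARN, MBFA_ERROR } mbfa_t;

static void handle_membind_failure(mbfa_t action, const char *host,
                                   const char *app, const char *errmsg)
{
    switch (action) {
    case MBFA_SILENT:
        break;                                  /* say nothing, keep launching */
    case MBFA_WARN:
        fprintf(stderr, "[memory not bound] %s %s: %s\n",
                host, app, errmsg);             /* warn; the job continues */
        break;
    case MBFA_ERROR:
        fprintf(stderr, "[memory binding error] %s %s: %s\n",
                host, app, errmsg);             /* abort the launch */
        exit(1);
    }
}

int main(void)
{
    handle_membind_failure(MBFA_WARN, "node01", "a.out", "membind unsupported");
    return 0;
}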

View File

@ -11,7 +11,7 @@
* All rights reserved. * All rights reserved.
* Copyright (c) 2007-2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2007-2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2007 Evergrid, Inc. All rights reserved. * Copyright (c) 2007 Evergrid, Inc. All rights reserved.
* Copyright (c) 2008-2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010 IBM Corporation. All rights reserved. * Copyright (c) 2010 IBM Corporation. All rights reserved.
* *
* $COPYRIGHT$ * $COPYRIGHT$
@ -105,11 +105,11 @@
#include <sys/select.h> #include <sys/select.h>
#endif #endif
#include "opal/mca/hwloc/hwloc.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/mca/maffinity/base/base.h" #include "opal/mca/maffinity/base/base.h"
#include "opal/mca/paffinity/base/base.h"
#include "opal/class/opal_pointer_array.h" #include "opal/class/opal_pointer_array.h"
#include "opal/util/opal_environ.h" #include "opal/util/opal_environ.h"
#include "opal/util/opal_sos.h"
#include "opal/util/show_help.h" #include "opal/util/show_help.h"
#include "opal/util/fd.h" #include "opal/util/fd.h"
@ -379,745 +379,6 @@ static void send_error_show_help(int fd, int exit_status,
exit(exit_status); exit(exit_status);
} }
/*
* Bind the process to a specific slot list
*/
static int bind_to_slot_list(orte_app_context_t* context,
orte_odls_child_t *child,
orte_odls_job_t *jobdat,
bool *bound, int pipe_fd)
{
int rc;
opal_paffinity_base_cpu_set_t mask;
char *msg = NULL;
*bound = false;
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
"%s odls:default:fork binding child %s to slot_list %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name),
child->slot_list));
if (opal_paffinity_alone) {
send_error_show_help(pipe_fd, 1,
"help-orte-odls-default.txt",
"slot list and paffinity_alone",
orte_process_info.nodename, context->app);
/* Does not return */
}
if (orte_report_bindings) {
opal_output(0, "%s odls:default:fork binding child %s to slot_list %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name), child->slot_list);
}
rc = opal_paffinity_base_slot_list_set((long)child->name->vpid,
child->slot_list, &mask);
if (ORTE_SUCCESS != rc) {
if (ORTE_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
/* OS doesn't support providing topology information */
send_error_show_help(pipe_fd, 1, "help-orte-odls-default.txt",
"binding not supported",
orte_process_info.nodename, context->app);
/* Does not return */
}
asprintf(&msg, "opal_paffinity_base_slot_list_set() returned \"%s\"",
opal_strerror(OPAL_SOS_GET_ERROR_CODE(rc)));
if (NULL == msg) {
msg = "opal_paffinity_base_slot_list_set() returned failure";
}
send_error_show_help(pipe_fd, 1, "help-orte-odls-default.txt",
"binding generic error",
orte_process_info.nodename, context->app, msg,
__FILE__, __LINE__);
/* Does not return */
}
/* if we didn't wind up bound, then generate a warning unless
suppressed */
OPAL_PAFFINITY_PROCESS_IS_BOUND((mask), bound);
if (!bound && orte_odls_base.warn_if_not_bound) {
send_warn_show_help(pipe_fd, "help-orte-odls-base.txt",
"warn not bound", "slot list"
"Request resulted in binding to all available processors",
orte_process_info.nodename, context->app,
"bind to slot list", child->slot_list);
}
return ORTE_SUCCESS;
}
/*
* This function always prints a message: it may be a warning or an
* error.
*
* If binding is not required for this process, then print a simple
* warning message and return an error code. If binding *is*
* required, then send an error message up the pipe to the parent and
* exit.
*/
static int bind_failed_msg(const char *msg, orte_mapping_policy_t policy,
int return_code_if_warning,
int pipe_fd, const char *app_name,
const char *filename, int line_num)
{
/* If binding is not required, then send a warning up the pipe and
then return an error code. */
if (ORTE_BINDING_NOT_REQUIRED(policy)) {
send_warn_show_help(pipe_fd,
"help-orte-odls-default.txt", "not bound",
orte_process_info.nodename, app_name, msg,
filename, line_num);
return return_code_if_warning;
}
/* If binding is required, send an error up the pipe (which exits
-- it doesn't return). */
send_error_show_help(pipe_fd, 1, "help-orte-odls-default.txt",
"binding generic error",
orte_process_info.nodename, app_name, msg,
filename, line_num);
/* Does not return */
}
/*
* Similar to bind_failed_msg(), but if binding is not required, do
* not output a message (just return an error code). If binding is
* required, handling is the same as for bind_failed_msg().
*/
static int bind_failed(const char *msg, orte_mapping_policy_t policy,
int return_code_if_warning,
int pipe_fd, const char *app_name,
const char *filename, int line_num)
{
if (ORTE_BINDING_NOT_REQUIRED(policy)) {
return return_code_if_warning;
}
/* This won't return, but use "return" statement here so that the
compiler won't complain. */
return bind_failed_msg(msg, policy, 0, pipe_fd, app_name,
filename, line_num);
}
/*
* Bind the process to a core
*/
static int bind_to_core(orte_app_context_t* context,
orte_odls_child_t *child,
orte_odls_job_t *jobdat,
bool *bound, int pipe_fd)
{
bool flag;
int i, rc;
char *tmp, *msg;
int16_t n;
orte_node_rank_t nrank, lrank;
opal_paffinity_base_cpu_set_t mask;
int target_socket, npersocket, logical_skt;
int logical_cpu, phys_core, phys_cpu, ncpu;
*bound = false;
/* we want to bind this proc to a specific core, or multiple cores
if the cpus_per_rank is > 0 */
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:default:fork binding child %s to core(s) cpus/rank %d stride %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name),
(int)jobdat->cpus_per_rank, (int)jobdat->stride));
/* get the node rank */
if (ORTE_NODE_RANK_INVALID ==
(nrank = orte_ess.get_node_rank(child->name))) {
send_error_show_help(pipe_fd, 1, "help-orte-odls-default.txt",
"binding generic error",
orte_process_info.nodename, context->app,
"ess.get_node_rank returned NODE_RANK_INVALID",
__FILE__, __LINE__);
/* Does not return */
}
/* get the local rank */
if (ORTE_LOCAL_RANK_INVALID ==
(lrank = orte_ess.get_local_rank(child->name))) {
send_error_show_help(pipe_fd, 1, "help-orte-odls-default.txt",
"binding generic error",
orte_process_info.nodename, context->app,
"ess.get_local_rank returned LOCAL_RANK_INVALID",
__FILE__, __LINE__);
/* Does not return */
}
/* init the mask */
OPAL_PAFFINITY_CPU_ZERO(mask);
if (ORTE_MAPPING_NPERXXX & jobdat->policy) {
/* we need to balance the children from this job
across the available sockets */
npersocket = jobdat->num_local_procs / orte_odls_globals.num_sockets;
/* determine the socket to use based on those available */
if (npersocket < 2) {
/* if we only have 1/sock, or we have less procs than
sockets, then just put it on the lrank socket */
logical_skt = lrank;
} else if (ORTE_MAPPING_BYSOCKET & jobdat->policy) {
logical_skt = lrank % npersocket;
} else {
logical_skt = lrank / npersocket;
}
if (orte_odls_globals.bound) {
/* if we are already bound (by some other entity), use
this as an index into our available sockets */
for (n = target_socket = 0;
n < logical_skt &&
target_socket < opal_bitmap_size(&orte_odls_globals.sockets);
target_socket++) {
if (opal_bitmap_is_set_bit(&orte_odls_globals.sockets,
target_socket)) {
n++;
}
}
/* Did we have enough sockets? */
if (n < logical_skt) {
return bind_failed_msg("not enough processor sockets available",
jobdat->policy,
ORTE_ERR_NOT_FOUND,
pipe_fd, context->app,
__FILE__, __LINE__);
}
} else {
rc = opal_paffinity_base_get_physical_socket_id(logical_skt,
&target_socket);
if (ORTE_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
return bind_failed_msg("OS does not provide processor topology info (physical socket ID)",
jobdat->policy,
ORTE_ERR_NOT_FOUND,
pipe_fd, context->app,
__FILE__, __LINE__);
}
}
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
"%s odls:default:fork child %s local rank %d npersocket %d logical socket %d target socket %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(child->name), lrank,
npersocket, logical_skt, target_socket));
/* set the starting point */
logical_cpu = (lrank % npersocket) * jobdat->cpus_per_rank;
/* bind to this socket */
goto bind_socket;
} else if (ORTE_MAPPING_BYSOCKET & jobdat->policy) {
/* this corresponds to a mapping policy where
* local rank 0 goes on socket 0, and local
* rank 1 goes on socket 1, etc. - round robin
* until all ranks are mapped
*
* NOTE: we already know our number of sockets
* from when we initialized
*/
rc = opal_paffinity_base_get_physical_socket_id(lrank % orte_odls_globals.num_sockets, &target_socket);
if (OPAL_SUCCESS != rc) {
/* This may be a small memory leak, but this child is
exiting soon anyway; keep the logic simple by not
worrying about the small leak. */
asprintf(&msg, "opal_paffinity_base_get_physical_socket_id(%d) returned \"%s\"",
(lrank % orte_odls_globals.num_sockets),
opal_strerror(OPAL_SOS_GET_ERROR_CODE(rc)));
if (NULL == msg) {
msg = "opal_paffinity_base_get_physical_socket_id() failed";
}
if (OPAL_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
msg = "OS does not provide processor topology information (physical socket ID)";
}
return bind_failed(msg, jobdat->policy, ORTE_ERR_NOT_SUPPORTED,
pipe_fd, context->app, __FILE__, __LINE__);
}
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
"bysocket lrank %d numsocks %d logical socket %d target socket %d", (int)lrank,
(int)orte_odls_globals.num_sockets,
(int)(lrank % orte_odls_globals.num_sockets),
target_socket));
/* my starting core within this socket has to be
offset by cpus_per_rank */
logical_cpu = (lrank / orte_odls_globals.num_sockets) * jobdat->cpus_per_rank;
bind_socket:
/* cycle across the cpus_per_rank */
for (n=0; n < jobdat->cpus_per_rank; n++) {
/* get the physical core within this target socket */
rc = opal_paffinity_base_get_physical_core_id(target_socket, logical_cpu, &phys_core);
if (OPAL_SUCCESS != rc) {
/* Seem comment above about "This may be a small
memory leak" */
asprintf(&msg, "opal_paffinity_base_get_physical_core_id(%d, %d) returned \"%s\"",
target_socket, logical_cpu,
opal_strerror(OPAL_SOS_GET_ERROR_CODE(rc)));
if (NULL == msg) {
msg = "opal_paffinity_base_get_physical_core_id() failed";
}
if (OPAL_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
msg = "OS does not provide processor topology information (physical core ID)";
}
return bind_failed(msg, jobdat->policy, ORTE_ERR_NOT_SUPPORTED,
pipe_fd, context->app, __FILE__, __LINE__);
}
/* map this to a physical cpu on this node */
if (ORTE_SUCCESS != (rc = opal_paffinity_base_get_map_to_processor_id(target_socket, phys_core, &phys_cpu))) {
/* Seem comment above about "This may be a small
memory leak" */
asprintf(&msg, "opal_paffinity_base_get_map_to_processor_id(%d, %d) returned \"%s\"",
target_socket, phys_core,
opal_strerror(OPAL_SOS_GET_ERROR_CODE(rc)));
if (NULL == msg) {
msg = "opal_paffinity_base_get_map_to_processor_id() failed";
}
if (OPAL_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
msg = "OS does not provide processor topology information (map socket,core->ID)";
}
return bind_failed(msg, jobdat->policy, ORTE_ERR_NOT_SUPPORTED,
pipe_fd, context->app, __FILE__, __LINE__);
}
/* are we bound? */
if (orte_odls_globals.bound) {
/* see if this physical cpu is available to us */
if (!OPAL_PAFFINITY_CPU_ISSET(phys_cpu, orte_odls_globals.my_cores)) {
/* no it isn't - skip it */
continue;
}
}
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
"%s odls:default:fork mapping phys socket %d core %d to phys_cpu %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
target_socket, phys_core, phys_cpu));
OPAL_PAFFINITY_CPU_SET(phys_cpu, mask);
/* increment logical cpu */
logical_cpu += jobdat->stride;
}
if (orte_report_bindings) {
tmp = opal_paffinity_base_print_binding(mask);
opal_output(0, "%s odls:default:fork binding child %s to socket %d cpus %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name), target_socket, tmp);
free(tmp);
}
} else {
/* my starting core has to be offset by cpus_per_rank */
logical_cpu = nrank * jobdat->cpus_per_rank;
for (n=0; n < jobdat->cpus_per_rank; n++) {
/* are we bound? */
if (orte_odls_globals.bound) {
/* if we are bound, then use the logical_cpu as an
index against our available cores */
ncpu = 0;
for (i = 0; i < OPAL_PAFFINITY_BITMASK_CPU_MAX &&
ncpu <= logical_cpu; i++) {
if (OPAL_PAFFINITY_CPU_ISSET(i,
orte_odls_globals.my_cores)) {
ncpu++;
phys_cpu = i;
}
}
/* if we don't have enough processors, that is an
error */
if (ncpu <= logical_cpu) {
if (ORTE_BINDING_NOT_REQUIRED(jobdat->policy)) {
return ORTE_ERR_NOT_SUPPORTED;
}
send_error_show_help(pipe_fd, 1,
"help-orte-odls-default.txt",
"binding generic error",
orte_process_info.nodename,
context->app,
"not enough logical processors",
__FILE__, __LINE__);
/* Does not return */
}
} else {
/* if we are not bound, then all processors are
available to us, so index into the node's array to
get the physical cpu */
rc = opal_paffinity_base_get_physical_processor_id(logical_cpu,
&phys_cpu);
if (OPAL_SUCCESS != rc) {
/* No processor to bind to */
/* Seem comment above about "This may be a small
memory leak" */
asprintf(&msg, "opal_paffinity_base_get_physical_processor_id(%d) returned \"%s\"",
logical_cpu,
opal_strerror(OPAL_SOS_GET_ERROR_CODE(rc)));
if (NULL == msg) {
msg = "opal_paffinity_base_get_physical_processor_id() failed";
}
if (OPAL_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
msg = "OS does not provide processor topology information (physical processor ID)";
}
return bind_failed(msg, jobdat->policy,
ORTE_ERR_NOT_SUPPORTED,
pipe_fd, context->app,
__FILE__, __LINE__);
}
}
OPAL_PAFFINITY_CPU_SET(phys_cpu, mask);
/* increment logical cpu */
logical_cpu += jobdat->stride;
}
if (orte_report_bindings) {
tmp = opal_paffinity_base_print_binding(mask);
opal_output(0, "%s odls:default:fork binding child %s to cpus %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name), tmp);
free(tmp);
}
}
/* Bind me! */
if (ORTE_SUCCESS != (rc = opal_paffinity_base_set(mask))) {
/* Seem comment above about "This may be a small memory
leak" */
asprintf(&msg, "opal_paffinity_base_set returned \"%s\"",
opal_strerror(OPAL_SOS_GET_ERROR_CODE(rc)));
if (NULL == msg) {
msg = "opal_paffinity_base_set() failed";
}
return bind_failed(msg,
jobdat->policy,
OPAL_SOS_GET_ERROR_CODE(rc),
pipe_fd, context->app, __FILE__, __LINE__);
}
*bound = true;
/* If the above work resulted in binding to everything (i.e.,
effectively not binding), warn -- unless the warning is
suppressed. */
OPAL_PAFFINITY_PROCESS_IS_BOUND(mask, &flag);
if (!flag && orte_odls_base.warn_if_not_bound) {
send_warn_show_help(pipe_fd,
"help-orte-odls-default.txt",
"bound to everything",
orte_process_info.nodename, context->app,
__FILE__, __LINE__);
} else if (orte_report_bindings) {
tmp = opal_paffinity_base_print_binding(mask);
opal_output(0, "%s odls:default:fork binding child %s to socket %d cpus %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name), target_socket, tmp);
free(tmp);
}
return ORTE_SUCCESS;
}
static int bind_to_socket(orte_app_context_t* context,
orte_odls_child_t *child,
orte_odls_job_t *jobdat,
bool *bound, int pipe_fd)
{
bool flag;
int i, rc;
char *tmp, *msg;
int16_t n;
orte_node_rank_t lrank;
opal_paffinity_base_cpu_set_t mask;
int target_socket, npersocket, logical_skt;
int logical_cpu, phys_core, phys_cpu, ncpu;
*bound = false;
/* bind this proc to a socket */
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:default:fork binding child %s to socket",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name)));
/* layout this process across the sockets based on
* the provided mapping policy
*/
if (ORTE_LOCAL_RANK_INVALID == (lrank = orte_ess.get_local_rank(child->name))) {
send_error_show_help(pipe_fd, 1, "help-orte-odls-default.txt",
"binding generic error",
orte_process_info.nodename, context->app,
"ess.get_local_rank returned NODE_RANK_INVALID",
__FILE__, __LINE__);
/* Does not return */
}
if (ORTE_MAPPING_NPERXXX & jobdat->policy) {
/* we need to balance the children from this job
across the available sockets */
npersocket = jobdat->num_local_procs / orte_odls_globals.num_sockets;
/* determine the socket to use based on those available */
if (npersocket < 2) {
/* if we only have 1/sock, or we have less
* procs than sockets, then just put it on the
* lrank socket
*/
logical_skt = lrank;
} else if (ORTE_MAPPING_BYSOCKET & jobdat->policy) {
logical_skt = lrank % npersocket;
} else {
logical_skt = lrank / npersocket;
}
if (orte_odls_globals.bound) {
/* if we are bound, use this as an index into
our available sockets */
for (target_socket=0, n = 0; target_socket < opal_bitmap_size(&orte_odls_globals.sockets) && n < logical_skt; target_socket++) {
if (opal_bitmap_is_set_bit(&orte_odls_globals.sockets, target_socket)) {
n++;
}
}
/* if we don't have enough sockets, that is an error */
if (n < logical_skt) {
return bind_failed_msg("not enough processor sockets available",
jobdat->policy,
ORTE_ERR_NOT_FOUND,
pipe_fd, context->app,
__FILE__, __LINE__);
}
} else {
rc = opal_paffinity_base_get_physical_socket_id(logical_skt, &target_socket);
if (ORTE_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
/* OS doesn't support providing topology
information */
return bind_failed_msg("OS does not provide processor topology info (physical socket ID)",
jobdat->policy,
ORTE_ERR_NOT_FOUND,
pipe_fd, context->app,
__FILE__, __LINE__);
}
}
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
"%s odls:default:fork child %s local rank %d npersocket %d logical socket %d target socket %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(child->name), lrank,
npersocket, logical_skt, target_socket));
} else if (ORTE_MAPPING_BYSOCKET & jobdat->policy) {
/* this corresponds to a mapping policy where
* local rank 0 goes on socket 0, and local
* rank 1 goes on socket 1, etc. - round robin
* until all ranks are mapped
*
* NOTE: we already know our number of sockets
* from when we initialized
*/
rc = opal_paffinity_base_get_physical_socket_id(lrank % orte_odls_globals.num_sockets, &target_socket);
if (ORTE_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
/* OS does not support providing topology
information */
return bind_failed_msg("OS does not provide processor topology info(physical socket ID)",
jobdat->policy,
ORTE_ERR_NOT_FOUND,
pipe_fd, context->app,
__FILE__, __LINE__);
}
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
"bysocket lrank %d numsocks %d logical socket %d target socket %d", (int)lrank,
(int)orte_odls_globals.num_sockets,
(int)(lrank % orte_odls_globals.num_sockets),
target_socket));
} else {
/* use a byslot-like policy where local rank 0 goes on
* socket 0, and local rank 1 goes on socket 0, etc.
* following round-robin until all ranks mapped
*/
if (orte_odls_globals.bound) {
/* if we are bound, then we compute the
* logical socket id based on the number of
* available cores in each socket so that each
* rank gets its own core, adjusting for the
* cpus_per_task
*/
/* Find the lrank available core, accounting
for cpus_per_task */
logical_cpu = lrank * jobdat->cpus_per_rank;
/* use the logical_cpu as an index against our
available cores */
ncpu = 0;
for (i=0; i < orte_odls_globals.num_processors && ncpu <= logical_cpu; i++) {
if (OPAL_PAFFINITY_CPU_ISSET(i, orte_odls_globals.my_cores)) {
ncpu++;
phys_cpu = i;
}
}
/* if we don't have enough processors, that is
an error */
if (ncpu < logical_cpu) {
send_error_show_help(pipe_fd, 1,
"help-orte-odls-default.txt",
"binding generic error",
orte_process_info.nodename,
context->app,
"not enough logical processors",
__FILE__, __LINE__);
/* Does not return */
}
/* get the physical socket of that cpu */
if (ORTE_SUCCESS != (rc = opal_paffinity_base_get_map_to_socket_core(phys_cpu, &target_socket, &phys_core))) {
/* Seem comment above about "This may be a small
memory leak" */
asprintf(&msg, "opal_paffinity_base_get_map_to_socket_core(%d) returned \"%s\"",
phys_cpu, opal_strerror(OPAL_SOS_GET_ERROR_CODE(rc)));
if (NULL == msg) {
msg = "opal_paffinity_base_get_map_to_socket_core() failed";
}
if (OPAL_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
msg = "OS does not provide processor topology information (map socket,core->ID)";
}
return bind_failed(msg, jobdat->policy,
ORTE_ERR_NOT_SUPPORTED,
pipe_fd, context->app,
__FILE__, __LINE__);
}
} else {
/* if we are not bound, then just use all sockets */
if (1 == orte_odls_globals.num_sockets) {
/* if we only have one socket, then just
put it there */
rc = opal_paffinity_base_get_physical_socket_id(0, &target_socket);
if (ORTE_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
/* OS doesn't support providing
topology information */
return bind_failed_msg("OS does not provide processor topology info (physical socket ID)",
jobdat->policy,
ORTE_ERR_NOT_FOUND,
pipe_fd, context->app,
__FILE__, __LINE__);
}
} else {
/* compute the logical socket,
compensating for the number of
cpus_per_rank */
logical_skt = lrank / (orte_default_num_cores_per_socket / jobdat->cpus_per_rank);
/* wrap that around the number of sockets
so we round-robin */
logical_skt = logical_skt % orte_odls_globals.num_sockets;
/* now get the target physical socket */
rc = opal_paffinity_base_get_physical_socket_id(logical_skt, &target_socket);
if (ORTE_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
/* OS doesn't support providing
topology information */
return bind_failed_msg("OS does not provide processor topology info (physical socket ID)",
jobdat->policy,
ORTE_ERR_NOT_FOUND,
pipe_fd, context->app,
__FILE__, __LINE__);
}
}
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
"byslot lrank %d socket %d", (int)lrank, target_socket));
}
}
OPAL_PAFFINITY_CPU_ZERO(mask);
for (n=0; n < orte_default_num_cores_per_socket; n++) {
/* get the physical core within this target socket */
rc = opal_paffinity_base_get_physical_core_id(target_socket, n, &phys_core);
if (OPAL_SUCCESS != rc) {
/* Seem comment above about "This may be a small memory
leak" */
asprintf(&msg, "opal_paffinity_base_get_physical_core_id(%d, %d) returned \"%s\"",
target_socket, n,
opal_strerror(OPAL_SOS_GET_ERROR_CODE(rc)));
if (NULL == msg) {
msg = "opal_paffinity_base_get_physical_core_id() failed";
}
if (OPAL_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
msg = "OS does not provide processor topology information (physical core ID)";
}
return bind_failed(msg, jobdat->policy,
ORTE_ERR_NOT_SUPPORTED,
pipe_fd, context->app,
__FILE__, __LINE__);
}
/* map this to a physical cpu on this node */
if (ORTE_SUCCESS != (rc = opal_paffinity_base_get_map_to_processor_id(target_socket, phys_core, &phys_cpu))) {
/* Seem comment above about "This may be a small memory
leak" */
asprintf(&msg, "opal_paffinity_base_get_map_to_processor_id(%d, %d) returned \"%s\"",
target_socket, phys_core,
opal_strerror(OPAL_SOS_GET_ERROR_CODE(rc)));
if (NULL == msg) {
msg = "opal_paffinity_base_get_map_to_processor_id()";
}
if (OPAL_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
msg = "OS does not provide processor topology information (map socket,core->ID)";
}
return bind_failed(msg, jobdat->policy,
ORTE_ERR_NOT_SUPPORTED,
pipe_fd, context->app,
__FILE__, __LINE__);
}
/* are we bound? */
if (orte_odls_globals.bound) {
/* see if this physical cpu is available to us */
if (!OPAL_PAFFINITY_CPU_ISSET(phys_cpu, orte_odls_globals.my_cores)) {
/* no it isn't - skip it */
continue;
}
}
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
"%s odls:default:fork mapping phys socket %d core %d to phys_cpu %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
target_socket, phys_core, phys_cpu));
OPAL_PAFFINITY_CPU_SET(phys_cpu, mask);
}
/* Bind me! */
if (ORTE_SUCCESS != (rc = opal_paffinity_base_set(mask))) {
/* Seem comment above about "This may be a small memory
leak" */
asprintf(&msg, "opal_paffinity_base_set() returned \"%s\"",
opal_strerror(OPAL_SOS_GET_ERROR_CODE(rc)));
if (NULL == msg) {
msg = "opal_paffinity_base_set() failed";
}
return bind_failed(msg,
jobdat->policy,
OPAL_SOS_GET_ERROR_CODE(rc),
pipe_fd, context->app, __FILE__, __LINE__);
}
*bound = true;
/* If the above work resulted in binding to everything (i.e.,
effectively not binding), warn -- unless the warning is
suppressed. */
OPAL_PAFFINITY_PROCESS_IS_BOUND(mask, &flag);
if (!flag && orte_odls_base.warn_if_not_bound) {
send_warn_show_help(pipe_fd,
"help-orte-odls-default.txt",
"bound to everything",
orte_process_info.nodename, context->app,
__FILE__, __LINE__);
} else if (orte_report_bindings) {
tmp = opal_paffinity_base_print_binding(mask);
opal_output(0, "%s odls:default:fork binding child %s to socket %d cpus %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name), target_socket, tmp);
free(tmp);
}
return ORTE_SUCCESS;
}
static int bind_to_board(orte_app_context_t* context,
orte_odls_child_t *child,
orte_odls_job_t *jobdat,
bool *bound, int pipe_fd)
{
/* Not currently supported until multi-board paffinity enabled.
But this is not an error -- for now. */
*bound = false;
if (orte_odls_base.warn_if_not_bound) {
send_warn_show_help(pipe_fd, "help-orte-odls-base.txt",
"warn not bound", "board",
"Not currently supported by Open MPI",
orte_process_info.nodename, context->app,
"Bind to board", "");
}
return ORTE_ERR_NOT_SUPPORTED;
}
static int do_child(orte_app_context_t* context, static int do_child(orte_app_context_t* context,
orte_odls_child_t *child, orte_odls_child_t *child,
char **environ_copy, char **environ_copy,
@ -1127,10 +388,11 @@ static int do_child(orte_app_context_t* context,
int i; int i;
sigset_t sigs; sigset_t sigs;
long fd, fdmax = sysconf(_SC_OPEN_MAX); long fd, fdmax = sysconf(_SC_OPEN_MAX);
bool paffinity_enabled = false; #if OPAL_HAVE_HWLOC
char *param, *tmp; int rc;
opal_paffinity_base_cpu_set_t mask; char *param, *msg;
#endif
if (orte_forward_job_control) { if (orte_forward_job_control) {
/* Set a new process group for this child, so that a /* Set a new process group for this child, so that a
SIGSTOP can be sent to it without being sent to the SIGSTOP can be sent to it without being sent to the
@ -1164,31 +426,117 @@ static int do_child(orte_app_context_t* context,
orte_process_info.nodename, context->app); orte_process_info.nodename, context->app);
/* Does not return */ /* Does not return */
} }
/* Setup process affinity. Not for the meek. */ #if OPAL_HAVE_HWLOC
{
if (NULL != child->slot_list) { hwloc_cpuset_t cpuset;
bind_to_slot_list(context, child, jobdat,
&paffinity_enabled, write_fd); /* Set process affinity, if given */
} else if (ORTE_BIND_TO_CORE & jobdat->policy) { if (NULL != child->cpu_bitmap) {
bind_to_core(context, child, jobdat, /* convert the list to a cpu bitmap */
&paffinity_enabled, write_fd); cpuset = hwloc_bitmap_alloc();
} else if (ORTE_BIND_TO_SOCKET & jobdat->policy) { if (0 != (rc = hwloc_bitmap_list_sscanf(cpuset, child->cpu_bitmap))) {
bind_to_socket(context, child, jobdat, /* See comment above about "This may be a small memory leak" */
&paffinity_enabled, write_fd); asprintf(&msg, "hwloc_bitmap_sscanf returned \"%s\" for the string \"%s\"",
} else if (ORTE_BIND_TO_BOARD & jobdat->policy) { opal_strerror(rc), child->cpu_bitmap);
bind_to_board(context, child, jobdat, if (NULL == msg) {
&paffinity_enabled, write_fd); msg = "failed to convert bitmap list to hwloc bitmap";
} }
if (OPAL_BINDING_REQUIRED(jobdat->binding)) {
/* If we were able to set processor affinity, then also /* If binding is required, send an error up the pipe (which exits
setup memory affinity. */ -- it doesn't return). */
if (paffinity_enabled) { send_error_show_help(write_fd, 1, "help-orte-odls-default.txt",
if (OPAL_SUCCESS == opal_maffinity_base_open() && "binding generic error",
OPAL_SUCCESS == opal_maffinity_base_select()) { orte_process_info.nodename,
context->app, msg,
__FILE__, __LINE__);
} else {
send_warn_show_help(write_fd,
"help-orte-odls-default.txt", "not bound",
orte_process_info.nodename, context->app, msg,
__FILE__, __LINE__);
goto PROCEED;
}
}
/* bind as specified */
if (opal_hwloc_report_bindings) {
opal_output(0, "%s odls:default binding child %s to cpus %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name), child->cpu_bitmap);
}
rc = hwloc_set_cpubind(opal_hwloc_topology, cpuset, 0);
if (rc < 0) {
char *tmp = NULL;
if (errno == ENOSYS) {
msg = "hwloc indicates cpu binding not supported";
} else if (errno == EXDEV) {
msg = "hwloc indicates cpu binding cannot be enforced";
} else {
asprintf(&msg, "hwloc_set_cpubind returned \"%s\" for bitmap \"%s\"",
opal_strerror(rc), child->cpu_bitmap);
}
if (OPAL_BINDING_REQUIRED(jobdat->binding)) {
/* If binding is required, send an error up the pipe (which exits
-- it doesn't return). */
send_error_show_help(write_fd, 1, "help-orte-odls-default.txt",
"binding generic error",
orte_process_info.nodename, context->app, msg,
__FILE__, __LINE__);
} else {
send_warn_show_help(write_fd,
"help-orte-odls-default.txt", "not bound",
orte_process_info.nodename, context->app, msg,
__FILE__, __LINE__);
if (NULL != tmp) {
free(tmp);
free(msg);
}
goto PROCEED;
}
if (NULL != tmp) {
free(tmp);
free(msg);
}
}
/* set memory affinity policy */
if (ORTE_SUCCESS != opal_hwloc_base_set_process_membind_policy()) {
if (errno == ENOSYS) {
msg = "hwloc indicates memory binding not supported";
} else if (errno == EXDEV) {
msg = "hwloc indicates memory binding cannot be enforced";
} else {
msg = "failed to bind memory";
}
if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
/* If binding is required, send an error up the pipe (which exits
-- it doesn't return). */
send_error_show_help(write_fd, 1, "help-orte-odls-default.txt",
"memory binding error",
orte_process_info.nodename, context->app, msg,
__FILE__, __LINE__);
} else {
send_warn_show_help(write_fd,
"help-orte-odls-default.txt", "memory not bound",
orte_process_info.nodename, context->app, msg,
__FILE__, __LINE__);
goto PROCEED;
}
}
opal_maffinity_setup = true; opal_maffinity_setup = true;
/* Set an info MCA param that tells
the launched processes that it was bound by us (e.g., so that
MPI_INIT doesn't try to bind itself) */
param = mca_base_param_environ_variable("opal","bound","at_launch");
opal_setenv(param, "1", true, &environ_copy);
free(param);
/* ...and provide a nice string representation of what we
bound to */
param = mca_base_param_environ_variable("opal","base","applied_binding");
opal_setenv(param, child->cpu_bitmap, true, &environ_copy);
} }
} }
#endif
} else if (!(ORTE_JOB_CONTROL_FORWARD_OUTPUT & jobdat->controls)) { } else if (!(ORTE_JOB_CONTROL_FORWARD_OUTPUT & jobdat->controls)) {
/* tie stdin/out/err/internal to /dev/null */ /* tie stdin/out/err/internal to /dev/null */
int fdnull; int fdnull;
@ -1205,26 +553,10 @@ static int do_child(orte_app_context_t* context,
} }
close(fdnull); close(fdnull);
} }
-    /* If we are able to bind, then set an info MCA param that tells
-       the launched processes that it was bound by us (e.g., so that
-       MPI_INIT doesn't try to bind itself) */
-    if (paffinity_enabled) {
-        param = mca_base_param_environ_variable("paffinity","base","bound");
-        opal_setenv(param, "1", true, &environ_copy);
-        free(param);
-        /* ...and provide a nice string representation of what we
-           bound to */
-        if (OPAL_SUCCESS == opal_paffinity_base_get(&mask)) {
-            tmp = opal_paffinity_base_print_binding(mask);
-            if (NULL != tmp) {
-                param = mca_base_param_environ_variable("paffinity","base","applied_binding");
-                opal_setenv(param, tmp, true, &environ_copy);
-                free(tmp);
-            }
-        }
-    }
+#if OPAL_HAVE_HWLOC
+ PROCEED:
+#endif
    /* close all file descriptors w/ exception of stdin/stdout/stderr,
       the pipe used for the IOF INTERNAL messages, and the pipe up to
       the parent. */
@ -1310,7 +642,7 @@ static int do_parent(orte_app_context_t* context,
        rc = opal_fd_read(read_fd, sizeof(msg), &msg);
        /* If the pipe closed, then the child successfully launched */
-        if (OPAL_ERR_TIMEOUT == OPAL_SOS_GET_ERROR_CODE(rc)) {
+        if (OPAL_ERR_TIMEOUT == rc) {
            break;
        }
@ -1393,7 +725,7 @@ static int do_parent(orte_app_context_t* context,
            child->alive = false;
        }
        close(read_fd);
-        return ORTE_SUCCESS;
+        return ORTE_ERR_FAILED_TO_START;
    }
}
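The changes above keep the same error-handling split as before: a binding failure is fatal only when binding was required, otherwise the daemon warns and lets the child run unbound, and a successful bind is advertised to the child through an environment MCA parameter. As a rough illustration of that flow outside of ORTE, here is a minimal standalone sketch using the public hwloc API; the function name, environment variable, and return-code handling are invented for the example and are not part of this commit.

/* Illustrative sketch only - not the ODLS implementation.  Shows the
 * "bind if you can, fail only if binding was required" flow using the
 * public hwloc API.  The function name and env variable are invented. */
#include <hwloc.h>
#include <stdio.h>
#include <stdlib.h>

static int apply_binding(const char *cpu_list, int binding_required)
{
    hwloc_topology_t topo;
    hwloc_bitmap_t cpuset;
    int rc = 0;

    hwloc_topology_init(&topo);
    hwloc_topology_load(topo);

    cpuset = hwloc_bitmap_alloc();
    hwloc_bitmap_list_sscanf(cpuset, cpu_list);      /* e.g. "0,2-3" */

    if (0 != hwloc_set_cpubind(topo, cpuset, HWLOC_CPUBIND_PROCESS)) {
        if (binding_required) {
            fprintf(stderr, "required binding to \"%s\" failed\n", cpu_list);
            rc = -1;                                 /* caller aborts the launch */
        } else {
            fprintf(stderr, "binding to \"%s\" failed - continuing unbound\n", cpu_list);
        }
    } else {
        /* let the child know it was bound at launch so it won't rebind itself */
        setenv("EXAMPLE_BOUND_AT_LAUNCH", "1", 1);
    }

    hwloc_bitmap_free(cpuset);
    hwloc_topology_destroy(topo);
    return rc;
}

int main(void)
{
    return apply_binding("0", 0) < 0 ? 1 : 0;
}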

View File

@ -9,6 +9,7 @@
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
+ * Copyright (c) 2011      Cisco Systems, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -33,9 +34,11 @@
#include "opal/dss/dss_types.h"
#include "opal/threads/mutex.h"
#include "opal/threads/condition.h"
+#include "opal/mca/hwloc/hwloc.h"
#include "orte/mca/plm/plm_types.h"
#include "orte/mca/grpcomm/grpcomm_types.h"
+#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/runtime/orte_globals.h"
BEGIN_C_DECLS
@ -107,7 +110,9 @@ typedef struct {
    bool init_recvd;           /* process called orte_init */
    bool fini_recvd;           /* process called orte_finalize */
    char *rml_uri;             /* contact info for this child */
-    char *slot_list;           /* list of slots for this child */
+#if OPAL_HAVE_HWLOC
+    char *cpu_bitmap;          /* binding pattern for this child */
+#endif
    bool waitpid_recvd;        /* waitpid has detected proc termination */
    bool iof_complete;         /* IOF has noted proc terminating all channels */
    struct timeval starttime;  /* when the proc was started - for timing purposes only */
@ -133,7 +138,9 @@ typedef struct orte_odls_job_t {
    bool launch_msg_processed;     /* launch msg has been fully processed */
    opal_pointer_array_t apps;     /* app_contexts for this job */
    orte_app_idx_t num_apps;       /* number of app_contexts */
-    orte_mapping_policy_t policy;  /* mapping policy */
+#if OPAL_HAVE_HWLOC
+    opal_binding_policy_t binding; /* binding policy */
+#endif
    int16_t cpus_per_rank;         /* number of cpus/rank */
    int16_t stride;                /* step size between cores of multi-core/rank procs */
    orte_job_controls_t controls;  /* control flags for job */

View File

@ -9,7 +9,7 @@
 * University of Stuttgart. All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
- * Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved.
 * Copyright (c) 2009      Institut National de Recherche en Informatique
 *                         et Automatique. All rights reserved.
 * $COPYRIGHT$
@ -81,41 +81,44 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid)));
-    /* if the job is not being restarted or hasn't already been given a jobid, prep it */
-    if (ORTE_JOB_STATE_RESTART != jdata->state && ORTE_JOBID_INVALID == jdata->jobid) {
-        /* get a jobid for it */
-        if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(jdata))) {
-            ORTE_ERROR_LOG(rc);
-            return rc;
-        }
-        /* store it on the global job data pool */
-        ljob = ORTE_LOCAL_JOBID(jdata->jobid);
-        opal_pointer_array_set_item(orte_job_data, ljob, jdata);
-        /* set the job state */
-        jdata->state = ORTE_JOB_STATE_INIT;
-        /* if job recovery is not defined, set it to default */
-        if (!jdata->recovery_defined) {
-            /* set to system default */
-            jdata->enable_recovery = orte_enable_recovery;
-        }
-        /* if app recovery is not defined, set apps to defaults */
-        for (i=0; i < jdata->apps->size; i++) {
-            if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
-                continue;
-            }
-            if (!app->recovery_defined) {
-                app->max_restarts = orte_max_restarts;
-            }
-        }
-    }
-    /* get the allocation */
-    if (ORTE_SUCCESS != (rc = orte_ras.allocate(jdata))) {
-        ORTE_ERROR_LOG(rc);
-        return rc;
-    }
+    /* if this is the daemon job, we don't perform certain functions */
+    if (jdata->jobid != ORTE_PROC_MY_NAME->jobid) {
+        /* if the job is not being restarted or hasn't already been given a jobid, prep it */
+        if (ORTE_JOB_STATE_RESTART != jdata->state && ORTE_JOBID_INVALID == jdata->jobid) {
+            /* get a jobid for it */
+            if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(jdata))) {
+                ORTE_ERROR_LOG(rc);
+                return rc;
+            }
+            /* store it on the global job data pool */
+            ljob = ORTE_LOCAL_JOBID(jdata->jobid);
+            opal_pointer_array_set_item(orte_job_data, ljob, jdata);
+            /* set the job state */
+            jdata->state = ORTE_JOB_STATE_INIT;
+            /* if job recovery is not defined, set it to default */
+            if (!jdata->recovery_defined) {
+                /* set to system default */
+                jdata->enable_recovery = orte_enable_recovery;
+            }
+            /* if app recovery is not defined, set apps to defaults */
+            for (i=0; i < jdata->apps->size; i++) {
+                if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
+                    continue;
+                }
+                if (!app->recovery_defined) {
+                    app->max_restarts = orte_max_restarts;
+                }
+            }
+        }
+        /* get the allocation */
+        if (ORTE_SUCCESS != (rc = orte_ras.allocate(jdata))) {
+            ORTE_ERROR_LOG(rc);
+            return rc;
+        }
+    }
    if (ORTE_SUCCESS != (rc = orte_rmaps.map_job(jdata))) {
@ -123,64 +126,6 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
        return rc;
    }
#if 0
/* RHC: Please leave this code here - it is needed for
* rare debugging that doesn't merit a separate debug-flag,
* but is a pain to have to replicate when needed
*/
{
char *crud;
orte_odls_job_t *jobdat;
crud = orte_regex_encode_maps(jdata);
opal_output(0, "maps regex: %s", (NULL == crud) ? "NULL" : crud);
if (NULL == crud) {
orte_never_launched = true;
ORTE_UPDATE_EXIT_STATUS(0);
orte_jobs_complete();
return ORTE_ERROR;
}
orte_util_nidmap_init(NULL);
orte_regex_decode_maps(crud, &jobdat);
free(crud);
/* print-out the map */
orte_nidmap_dump();
orte_jobmap_dump();
/* printout the jobdat */
opal_output(orte_clean_output, "**** DUMP OF JOBDAT %s (%d nodes %d procs) ***",
ORTE_JOBID_PRINT(jobdat->jobid), (int)jobdat->num_nodes, (int)(jobdat->num_procs));
opal_output(orte_clean_output, "\tNum slots: %d\tControl: %x\tStdin: %d",
(int)jobdat->total_slots_alloc, jobdat->controls, (int)jobdat->stdin_target);
opal_output(orte_clean_output, "\tApp: %s", jobdat->apps[0]->app);
opal_output(orte_clean_output, "\tCwd: %s", jobdat->apps[0]->cwd);
crud = opal_argv_join(jobdat->apps[0]->argv, ',');
opal_output(orte_clean_output, "\tArgv: %s", crud);
free(crud);
crud = opal_argv_join(jobdat->apps[0]->env, ',');
opal_output(orte_clean_output, "\tEnv: %s", crud);
free(crud);
orte_never_launched = true;
ORTE_UPDATE_EXIT_STATUS(0);
orte_jobs_complete();
return ORTE_ERROR;
}
{
opal_byte_object_t bo;
/* construct a nodemap */
if (ORTE_SUCCESS != (rc = orte_util_encode_nodemap(&bo))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_util_decode_nodemap(&bo))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* print-out the map */
orte_nidmap_dump();
}
#endif
    /* if we don't want to launch, now is the time to leave */
    if (orte_do_not_launch) {
        orte_never_launched = true;
@ -192,7 +137,8 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
    /* quick sanity check - is the stdin target within range
     * of the job?
     */
-    if (ORTE_VPID_WILDCARD != jdata->stdin_target &&
+    if (jdata->jobid != ORTE_PROC_MY_NAME->jobid &&
+        ORTE_VPID_WILDCARD != jdata->stdin_target &&
        ORTE_VPID_INVALID != jdata->stdin_target &&
        jdata->num_procs <= jdata->stdin_target) {
        /* this request cannot be met */
@ -551,6 +497,9 @@ static void process_orted_launch_report(int fd, short event, void *data)
        OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                             "%s RECEIVED TOPOLOGY FROM NODE %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodename));
+        if (10 < opal_output_get_verbosity(orte_plm_globals.output)) {
+            opal_dss.dump(0, topo, OPAL_HWLOC_TOPO);
+        }
        /* do we already have this topology from some other node? */
        found = false;
        for (i=0; i < orte_node_topologies->size; i++) {
@ -573,6 +522,7 @@ static void process_orted_launch_report(int fd, short event, void *data)
            OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                 "%s NEW TOPOLOGY - ADDING",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            opal_pointer_array_add(orte_node_topologies, topo);
            node->topology = topo;
        }
@ -776,12 +726,14 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
    if (orted_spin_flag) {
        opal_argv_append(argc, argv, "--spin");
    }
-    if (orte_report_bindings) {
+#if OPAL_HAVE_HWLOC
+    if (opal_hwloc_report_bindings) {
        opal_argv_append(argc, argv, "--report-bindings");
    }
    if (orte_hetero_nodes) {
        opal_argv_append(argc, argv, "--hetero-nodes");
    }
+#endif
    if ((int)ORTE_VPID_INVALID != orted_debug_failure) {
        opal_argv_append(argc, argv, "--debug-failure");

View File

@ -203,7 +203,6 @@ int orte_plm_base_local_slave_launch(orte_job_t *jdata)
    orte_node_t *node;
    char *nodename;
    char *exec_path;
-    bool flag;
    orte_app_context_t *app;
    int rc;
    pid_t pid;
@ -221,7 +220,7 @@ int orte_plm_base_local_slave_launch(orte_job_t *jdata)
    /* identify the target host - can only be one! */
    OBJ_CONSTRUCT(&hosts, opal_list_t);
-    if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&hosts, &flag, app->dash_host))) {
+    if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&hosts, app->dash_host))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&hosts);
        return rc;

View File

@ -31,5 +31,8 @@ please check that none of the following exist:
  MCA param file: orte_allocation_required = 1
  Environment: OMPI_MCA_orte_allocation_required=1
  Cmd line: -mca orte_allocation_required 1
#
[ras-sim:mismatch]
The number of topology files and the list of number of nodes
must match - i.e., a number of nodes must be given for each
topology.

View File

@ -29,11 +29,11 @@
#include "opal/mca/base/base.h" #include "opal/mca/base/base.h"
#include "opal/class/opal_list.h" #include "opal/class/opal_list.h"
#include "opal/util/output.h" #include "opal/util/output.h"
#include "opal/util/opal_sos.h" #include "opal/dss/dss.h"
#include "orte/util/show_help.h" #include "orte/util/show_help.h"
#include "opal/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/base/base.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
@ -91,7 +91,6 @@ int orte_ras_base_allocate(orte_job_t *jdata)
    opal_list_t nodes;
    orte_node_t *node;
    orte_std_cntr_t i;
-    bool override_oversubscribed;
    orte_app_context_t *app;
    OPAL_OUTPUT_VERBOSE((5, orte_ras_base.ras_output,
@ -139,7 +138,7 @@ int orte_ras_base_allocate(orte_job_t *jdata)
    if (NULL != orte_ras_base.active_module) {
        /* read the allocation */
        if (ORTE_SUCCESS != (rc = orte_ras_base.active_module->allocate(&nodes))) {
-            if (ORTE_ERR_SYSTEM_WILL_BOOTSTRAP == OPAL_SOS_GET_ERROR_CODE(rc)) {
+            if (ORTE_ERR_SYSTEM_WILL_BOOTSTRAP == rc) {
                /* this module indicates that nodes will be discovered
                 * on a bootstrap basis, so all we do here is add our
                 * own node to the list
@ -162,6 +161,10 @@ int orte_ras_base_allocate(orte_job_t *jdata)
            return rc;
        }
        OBJ_DESTRUCT(&nodes);
+        /* default to no-oversubscribe-allowed for managed systems */
+        if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
+            ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
+        }
        goto DISPLAY;
    } else if (orte_allocation_required) {
        /* if nothing was found, and an allocation is
@ -197,7 +200,6 @@ int orte_ras_base_allocate(orte_job_t *jdata)
        /* a default hostfile was provided - parse it */
        if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes,
-                                                              &override_oversubscribed,
                                                               orte_default_hostfile))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&nodes);
@ -214,8 +216,6 @@ int orte_ras_base_allocate(orte_job_t *jdata)
        if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
            ORTE_ERROR_LOG(rc);
        }
-        /* update the jdata object with override_oversubscribed flag */
-        jdata->oversubscribe_override = override_oversubscribed;
        /* cleanup */
        OBJ_DESTRUCT(&nodes);
        goto DISPLAY;
@ -246,7 +246,6 @@ int orte_ras_base_allocate(orte_job_t *jdata)
            /* hostfile was specified - parse it and add it to the list */
            if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes,
-                                                                  &override_oversubscribed,
                                                                   app->hostfile))) {
                ORTE_ERROR_LOG(rc);
                OBJ_DESTRUCT(&nodes);
@ -265,8 +264,6 @@ int orte_ras_base_allocate(orte_job_t *jdata)
            if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
                ORTE_ERROR_LOG(rc);
            }
-            /* update the jdata object with override_oversubscribed flag */
-            jdata->oversubscribe_override = override_oversubscribed;
            /* cleanup */
            OBJ_DESTRUCT(&nodes);
            goto DISPLAY;
@ -300,7 +297,6 @@ int orte_ras_base_allocate(orte_job_t *jdata)
            }
            if (NULL != app->dash_host) {
                if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&nodes,
-                                                                        &override_oversubscribed,
                                                                         app->dash_host))) {
                    ORTE_ERROR_LOG(rc);
                    OBJ_DESTRUCT(&nodes);
@ -319,8 +315,6 @@ int orte_ras_base_allocate(orte_job_t *jdata)
            if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
                ORTE_ERROR_LOG(rc);
            }
-            /* update the jdata object with override_oversubscribed flag */
-            jdata->oversubscribe_override = override_oversubscribed;
            /* cleanup */
            OBJ_DESTRUCT(&nodes);
            goto DISPLAY;
@ -336,7 +330,6 @@ int orte_ras_base_allocate(orte_job_t *jdata)
    if (NULL != orte_rankfile) {
        /* check the rankfile for node information */
        if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes,
-                                                              &override_oversubscribed,
                                                               orte_rankfile))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&nodes);
@ -353,8 +346,6 @@ int orte_ras_base_allocate(orte_job_t *jdata)
        if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
            ORTE_ERROR_LOG(rc);
        }
-        /* update the jdata object with override_oversubscribed flag */
-        jdata->oversubscribe_override = false;
        /* cleanup */
        OBJ_DESTRUCT(&nodes);
        goto DISPLAY;
@ -383,8 +374,6 @@ addlocal:
    node->slots_inuse = 0;
    node->slots_max = 0;
    node->slots = 1;
-    /* indicate that we don't know anything about over_subscribing */
-    jdata->oversubscribe_override = true;
    opal_list_append(&nodes, &node->super);
    /* store the results in the global resource pool - this removes the
@ -416,7 +405,6 @@ int orte_ras_base_add_hosts(orte_job_t *jdata)
{
    int rc;
    opal_list_t nodes;
-    bool override_oversubscribed;
    int i;
    orte_app_context_t *app;
@ -448,7 +436,6 @@ int orte_ras_base_add_hosts(orte_job_t *jdata)
            /* hostfile was specified - parse it and add it to the list */
            if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes,
-                                                                  &override_oversubscribed,
                                                                   app->add_hostfile))) {
                ORTE_ERROR_LOG(rc);
                OBJ_DESTRUCT(&nodes);
@ -472,7 +459,6 @@ int orte_ras_base_add_hosts(orte_job_t *jdata)
            }
            if (NULL != app->add_host) {
                if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&nodes,
-                                                                        &override_oversubscribed,
                                                                         app->add_host))) {
                    ORTE_ERROR_LOG(rc);
                    OBJ_DESTRUCT(&nodes);
@ -489,8 +475,6 @@ int orte_ras_base_add_hosts(orte_job_t *jdata)
            if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
                ORTE_ERROR_LOG(rc);
            }
-            /* update the jdata object with override_oversubscribed flag */
-            jdata->oversubscribe_override = override_oversubscribed;
            /* cleanup */
            OBJ_DESTRUCT(&nodes);
        }

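The hunk above that sets ORTE_MAPPING_NO_OVERSUBSCRIBE only when ORTE_MAPPING_SUBSCRIBE_GIVEN is absent is the usual "apply a default unless the user already spoke" bit-flag pattern. A small self-contained sketch of that pattern follows; the EX_* flag values and the function name are made up for illustration and are not the real ORTE macros.

/* Illustrative sketch only - EX_* values are invented stand-ins for the
 * real ORTE_MAPPING_* macros used in the hunk above. */
#include <stdint.h>
#include <stdio.h>

#define EX_SUBSCRIBE_GIVEN   0x0100   /* user expressed an oversubscribe preference */
#define EX_NO_OVERSUBSCRIBE  0x0200   /* forbid oversubscription */

static void apply_managed_default(uint16_t *mapping)
{
    /* impose the managed-system default only if the user said nothing */
    if (!(*mapping & EX_SUBSCRIBE_GIVEN)) {
        *mapping |= EX_NO_OVERSUBSCRIBE;
    }
}

int main(void)
{
    uint16_t mapping = 0;   /* RM-provided allocation, user gave no directive */
    apply_managed_default(&mapping);
    printf("no-oversubscribe: %s\n", (mapping & EX_NO_OVERSUBSCRIBE) ? "yes" : "no");
    return 0;
}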
View File

@ -65,8 +65,6 @@ orte_ras_sim_component_t mca_ras_simulator_component = {
static int ras_sim_open(void)
{
-    int tmp;
    mca_base_param_reg_int(&mca_ras_simulator_component.super.base_version,
                           "slots",
                           "Number of slots on each node to simulate",
@ -76,24 +74,28 @@ static int ras_sim_open(void)
"Number of max slots on each node to simulate", "Number of max slots on each node to simulate",
false, false, 0, &mca_ras_simulator_component.slots_max); false, false, 0, &mca_ras_simulator_component.slots_max);
#if OPAL_HAVE_HWLOC #if OPAL_HAVE_HWLOC
mca_base_param_reg_string(&mca_ras_simulator_component.super.base_version, {
"num_nodes", int tmp;
"Comma-separated list of number of nodes to simulate for each topology",
false, false, NULL, &mca_ras_simulator_component.num_nodes); mca_base_param_reg_string(&mca_ras_simulator_component.super.base_version,
mca_base_param_reg_string(&mca_ras_simulator_component.super.base_version, "num_nodes",
"topo_files", "Comma-separated list of number of nodes to simulate for each topology",
"Comma-separated list of files containing xml topology descriptions for simulated nodes", false, false, NULL, &mca_ras_simulator_component.num_nodes);
false, false, NULL, &mca_ras_simulator_component.topofiles); mca_base_param_reg_string(&mca_ras_simulator_component.super.base_version,
mca_base_param_reg_int(&mca_ras_simulator_component.super.base_version, "topo_files",
"have_cpubind", "Comma-separated list of files containing xml topology descriptions for simulated nodes",
"Topology supports binding to cpus", false, false, NULL, &mca_ras_simulator_component.topofiles);
false, false, (int)true, &tmp); mca_base_param_reg_int(&mca_ras_simulator_component.super.base_version,
mca_ras_simulator_component.have_cpubind = OPAL_INT_TO_BOOL(tmp); "have_cpubind",
mca_base_param_reg_int(&mca_ras_simulator_component.super.base_version, "Topology supports binding to cpus",
"have_membind", false, false, (int)true, &tmp);
"Topology supports binding to memory", mca_ras_simulator_component.have_cpubind = OPAL_INT_TO_BOOL(tmp);
false, false, (int)true, &tmp); mca_base_param_reg_int(&mca_ras_simulator_component.super.base_version,
mca_ras_simulator_component.have_membind = OPAL_INT_TO_BOOL(tmp); "have_membind",
"Topology supports binding to memory",
false, false, (int)true, &tmp);
mca_ras_simulator_component.have_membind = OPAL_INT_TO_BOOL(tmp);
}
#else #else
mca_base_param_reg_string(&mca_ras_simulator_component.super.base_version, mca_base_param_reg_string(&mca_ras_simulator_component.super.base_version,
"num_nodes", "num_nodes",

View File

@ -36,5 +36,12 @@ libmca_rmaps_la_SOURCES += \
        base/rmaps_base_map_job.c \
        base/rmaps_base_get_job_map.c \
        base/rmaps_base_support_fns.c \
-        base/rmaps_base_common_mappers.c
+        base/rmaps_base_ranking.c \
+        base/rmaps_base_print_fns.c
+
+if OPAL_HAVE_HWLOC
+libmca_rmaps_la_SOURCES += \
+        base/rmaps_base_binding.c
+endif
endif

View File

@ -9,6 +9,7 @@
 * University of Stuttgart. All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
+ * Copyright (c) 2011      Cisco Systems, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -58,18 +59,17 @@ typedef struct {
    opal_list_t available_components;
    /* list of selected modules */
    opal_list_t selected_modules;
-    /** whether or not we allow oversubscription of nodes */
-    bool oversubscribe;
+    /* default ppr */
+    char *ppr;
    /* cpus per rank */
    int cpus_per_rank;
-    /* stride */
-    int stride;
-    /* do not allow use of the localhost */
-    bool no_use_local;
    /* display the map after it is computed */
    bool display_map;
    /* slot list, if provided by user */
    char *slot_list;
+    /* default mapping directives */
+    orte_mapping_policy_t mapping;
+    orte_ranking_policy_t ranking;
} orte_rmaps_base_t;
/**
@ -99,11 +99,19 @@ ORTE_DECLSPEC int orte_rmaps_base_get_vpid_range(orte_jobid_t jobid,
ORTE_DECLSPEC int orte_rmaps_base_set_vpid_range(orte_jobid_t jobid,
                                                 orte_vpid_t start, orte_vpid_t range);
+/* pretty-print functions */
+ORTE_DECLSPEC char* orte_rmaps_base_print_mapping(orte_mapping_policy_t mapping);
+ORTE_DECLSPEC char* orte_rmaps_base_print_ranking(orte_ranking_policy_t ranking);
/**
 * Close down the rmaps framework
 */
ORTE_DECLSPEC int orte_rmaps_base_close(void);
+#if OPAL_HAVE_HWLOC
+ORTE_DECLSPEC int orte_rmaps_base_prep_topology(hwloc_topology_t topo);
+#endif
#endif /* ORTE_DISABLE_FULL_SUPPORT */
END_C_DECLS

View File

@ -10,6 +10,7 @@
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
#                         All rights reserved.
+# Copyright (c) 2011      Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -93,4 +94,94 @@ Your job failed to map. Either no mapper was available, or none
of the available mappers was able to perform the requested
mapping operation. This can happen if you request a map type
(e.g., loadbalance) and the corresponding mapper was not built.
#
[unrecognized-policy]
The specified %s policy is not recognized:
Policy: %s
Please check for a typo or ensure that the option is a supported
one.
#
[redefining-policy]
Conflicting directives for %s policy are causing the policy
to be redefined:
New policy: %s
Prior policy: %s
Please check that only one policy is defined.
#
[rmaps:binding-target-not-found]
A request was made to bind to %s, but an appropriate target could not
be found on node %s.
#
[rmaps:binding-overload]
A request was made to bind to that would result in binding more
processes than cpus on a resource:
Bind to: %s
Node: %s
#processes: %d
#cpus: %d
You can override this protection by adding the "overload-allowed"
option to your binding directive.
#
[rmaps:no-topology]
A request was made for nperxxx that requires knowledge of
a remote node's topology. However, no topology info is
available for the following node:
Node: %s
The job cannot be executed under this condition. Please either
remove the nperxxx directive and specify the number of processes
to use, or investigate the lack of topology info.
#
[rmaps:no-available-cpus]
While computing bindings, we found no available cpus on
the following node:
Node: %s
Please check your allocation.
#
[rmaps:cpubind-not-supported]
A request was made to bind a process, but at least one node does NOT
support binding processes to cpus.
Node: %s
#
[rmaps:membind-not-supported]
WARNING: a request was made to bind a process. While the system
supports binding the process itself, at least one node does NOT
support binding memory to the process location.
Node: %s
This is a warning only; your job will continue, though performance may
be degraded.
#
[rmaps:membind-not-supported-fatal]
A request was made to bind a process. While the system
supports binding the process itself, at least one node does NOT
support binding memory to the process location.
Node: %s
The provided memory binding policy requires that we abort the
job at this time.
#
[rmaps:no-bindable-objects]
No bindable objects of the specified type were available
on at least one node:
Node: %s
Target: %s
#
[rmaps:unknown-binding-level]
Unknown binding level:
Target: %s
Cache level: %u

688
orte/mca/rmaps/base/rmaps_base_binding.c Normal file
View File

@ -0,0 +1,688 @@
/*
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <string.h>
#include "opal/util/if.h"
#include "opal/util/output.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/threads/tsd.h"
#include "orte/types.h"
#include "orte/util/show_help.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/hostfile/hostfile.h"
#include "orte/util/dash_host/dash_host.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/runtime/data_type_support/orte_dt_support.h"
#include "orte/mca/rmaps/base/rmaps_private.h"
#include "orte/mca/rmaps/base/base.h"
static bool membind_warned=false;
static int bind_upwards(orte_job_t *jdata,
hwloc_obj_type_t target,
unsigned cache_level)
{
/* traverse the hwloc topology tree on each node upwards
* until we find an object of type target - and then bind
* the process to that target
*/
int i, j;
orte_job_map_t *map;
orte_node_t *node;
orte_proc_t *proc;
hwloc_obj_t obj;
hwloc_cpuset_t cpus;
unsigned int idx, ncpus, nobjs, nsave, *nbound=NULL;
struct hwloc_topology_support *support;
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: bind upwards for job %s with bindings %s",
ORTE_JOBID_PRINT(jdata->jobid),
opal_hwloc_base_print_binding(jdata->map->binding));
/* initialize */
map = jdata->map;
for (i=0; i < map->nodes->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
continue;
}
/* check if topology supports cpubind - if not, then we cannot bind */
support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
if (!support->cpubind->set_thisproc_cpubind) {
if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy)) {
/* we are not required to bind, so ignore this */
continue;
}
orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name);
if (NULL != nbound) {
free(nbound);
}
return ORTE_ERR_SILENT;
}
/* check if topology supports membind */
if (!support->membind->set_thisproc_membind) {
if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
membind_warned = true;
} else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name);
if (NULL != nbound) {
free(nbound);
}
return ORTE_ERR_SILENT;
}
}
/* get the number of objects of this type on this node */
nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target,
cache_level, OPAL_HWLOC_AVAILABLE);
if (0 == nobjs) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-bindable-objects", true,
node->name, hwloc_obj_type_string(target));
return ORTE_ERR_SILENT;
}
/* setup the array */
if (NULL == nbound) {
nbound = (unsigned int*)malloc(nobjs * sizeof(int));
nsave = nobjs;
} else if (nsave < nobjs) {
nbound = (unsigned int*)realloc(nbound, nobjs * sizeof(int));
}
memset(nbound, 0, nobjs * sizeof(int));
/* cycle thru the procs */
for (j=0; j < node->procs->size; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
}
/* ignore procs from other jobs */
if (proc->name.jobid != jdata->jobid) {
continue;
}
/* ignore procs that have already been bound - should
* never happen, but safer
*/
if (NULL != proc->cpu_bitmap) {
continue;
}
/* bozo check */
if (NULL == proc->locale) {
opal_output(0, "BIND UPWARDS: LOCALE FOR PROC %s IS NULL", ORTE_NAME_PRINT(&proc->name));
return ORTE_ERR_SILENT;
}
/* starting at the locale, move up thru the parents
* to find the target object type
*/
for (obj = proc->locale->parent; NULL != obj; obj = obj->parent) {
opal_output(0, "%s bind:upward target %s type %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
hwloc_obj_type_string(target),
hwloc_obj_type_string(obj->type));
if (target == obj->type) {
if (HWLOC_OBJ_CACHE == target && cache_level != obj->attr->cache.depth) {
continue;
}
/* get its index */
if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, obj, OPAL_HWLOC_AVAILABLE))) {
free(nbound);
return ORTE_ERR_SILENT;
}
/* track the number bound */
++nbound[idx];
/* get the number of cpus under this location */
if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, obj))) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
free(nbound);
return ORTE_ERR_SILENT;
}
/* error out if adding a proc would cause overload and that wasn't allowed */
if (ncpus < nbound[idx] &&
!OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
opal_hwloc_base_print_binding(map->binding), node->name,
nbound[idx], ncpus);
free(nbound);
return ORTE_ERR_SILENT;
}
/* bind it here */
proc->bind_idx = idx;
cpus = opal_hwloc_base_get_available_cpus(node->topology, obj);
hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, cpus);
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"%s BOUND PROC %s TO %s[%s:%u] on node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name),
proc->cpu_bitmap,
hwloc_obj_type_string(target),
idx, node->name);
break;
}
}
if (NULL == proc->cpu_bitmap && OPAL_BINDING_REQUIRED(jdata->map->binding)) {
/* didn't find anyone to bind to - this is an error
* unless the user specified if-supported
*/
orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-target-not-found", true,
opal_hwloc_base_print_binding(map->binding), node->name);
free(nbound);
return ORTE_ERR_SILENT;
}
}
}
if (NULL != nbound) {
free(nbound);
}
return ORTE_SUCCESS;
}
static int bind_downwards(orte_job_t *jdata,
hwloc_obj_type_t target,
unsigned cache_level)
{
int i, j;
orte_job_map_t *map;
orte_node_t *node;
orte_proc_t *proc;
hwloc_obj_t obj;
hwloc_cpuset_t cpus;
unsigned int n, idx, minval, ncpus, nobjs, nsave, *nbound=NULL;
struct hwloc_topology_support *support;
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: bind downward for job %s with bindings %s",
ORTE_JOBID_PRINT(jdata->jobid),
opal_hwloc_base_print_binding(jdata->map->binding));
/* initialize */
map = jdata->map;
for (i=0; i < map->nodes->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
continue;
}
/* check if topology supports cpubind - if not, then we cannot bind */
support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
if (!support->cpubind->set_thisproc_cpubind) {
if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy)) {
/* we are not required to bind, so ignore this */
continue;
}
orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name);
if (NULL != nbound) {
free(nbound);
}
return ORTE_ERR_SILENT;
}
/* check if topology supports membind */
if (!support->membind->set_thisproc_membind) {
if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
membind_warned = true;
} else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name);
if (NULL != nbound) {
free(nbound);
}
return ORTE_ERR_SILENT;
}
}
/* get the number of objects of this type on this node */
nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target,
cache_level, OPAL_HWLOC_AVAILABLE);
if (0 == nobjs) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-bindable-objects", true,
node->name, hwloc_obj_type_string(target));
return ORTE_ERR_SILENT;
}
/* setup the array */
if (NULL == nbound) {
nbound = (unsigned int*)malloc(nobjs * sizeof(int));
nsave = nobjs;
} else if (nsave < nobjs) {
nbound = (unsigned int*)realloc(nbound, nobjs * sizeof(int));
}
memset(nbound, 0, nobjs * sizeof(int));
/* cycle thru the procs */
for (j=0; j < node->procs->size; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
}
/* ignore procs from other jobs */
if (proc->name.jobid != jdata->jobid) {
continue;
}
/* ignore procs that have already been bound - should
* never happen, but safer
*/
if (NULL != proc->cpu_bitmap) {
continue;
}
/* cycle across the target objects and select the one with
* minimum usage
*/
minval = UINT_MAX;
idx = 0;
for (n=0; n < nobjs; n++) {
if (nbound[n] < minval) {
minval = nbound[n];
idx = n;
}
}
/* track the number bound */
++nbound[idx];
/* get the number of cpus under this location */
obj = opal_hwloc_base_get_obj_by_type(node->topology, target, cache_level,
idx, OPAL_HWLOC_AVAILABLE);
if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, obj))) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
free(nbound);
return ORTE_ERR_SILENT;
}
/* error out if adding a proc would cause overload and that wasn't allowed */
if (ncpus < nbound[idx] &&
!OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
opal_hwloc_base_print_binding(map->binding), node->name,
nbound[idx], ncpus);
free(nbound);
return ORTE_ERR_SILENT;
}
/* bind the proc here */
proc->bind_idx = idx;
cpus = opal_hwloc_base_get_available_cpus(node->topology, obj);
hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, cpus);
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"%s BOUND PROC %s TO %s[%s:%u] on node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name),
proc->cpu_bitmap, hwloc_obj_type_string(obj->type),
idx, node->name);
}
}
if (NULL != nbound) {
free(nbound);
}
return ORTE_SUCCESS;
}
static int bind_in_place(orte_job_t *jdata,
hwloc_obj_type_t target,
unsigned cache_level)
{
/* traverse the hwloc topology tree on each node downwards
* until we find an unused object of type target - and then bind
* the process to that target
*/
int i, j;
orte_job_map_t *map;
orte_node_t *node;
orte_proc_t *proc;
hwloc_cpuset_t cpus;
unsigned int idx, ncpus, nobjs, nsave, *nbound=NULL;
struct hwloc_topology_support *support;
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: bind in place for job %s with bindings %s",
ORTE_JOBID_PRINT(jdata->jobid),
opal_hwloc_base_print_binding(jdata->map->binding));
/* initialize */
map = jdata->map;
for (i=0; i < map->nodes->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
continue;
}
/* check if topology supports cpubind - if not, then we cannot bind */
support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
if (!support->cpubind->set_thisproc_cpubind) {
if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy)) {
/* we are not required to bind, so ignore this */
continue;
}
orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name);
if (NULL != nbound) {
free(nbound);
}
return ORTE_ERR_SILENT;
}
/* check if topology supports membind */
if (!support->membind->set_thisproc_membind) {
if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
membind_warned = true;
} else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name);
if (NULL != nbound) {
free(nbound);
}
return ORTE_ERR_SILENT;
}
}
/* get the number of objects of this type on this node */
nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target,
cache_level, OPAL_HWLOC_AVAILABLE);
if (0 == nobjs) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-bindable-objects", true,
node->name, hwloc_obj_type_string(target));
return ORTE_ERR_SILENT;
}
/* setup the array */
if (NULL == nbound) {
nbound = (unsigned int*)malloc(nobjs * sizeof(int));
nsave = nobjs;
} else if (nsave < nobjs) {
nbound = (unsigned int*)realloc(nbound, nobjs * sizeof(int));
}
memset(nbound, 0, nobjs * sizeof(int));
/* cycle thru the procs */
for (j=0; j < node->procs->size; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
}
/* ignore procs from other jobs */
if (proc->name.jobid != jdata->jobid) {
continue;
}
/* ignore procs that have already been bound - should
* never happen, but safer
*/
if (NULL != proc->cpu_bitmap) {
continue;
}
/* get the index of this location */
if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, proc->locale, OPAL_HWLOC_AVAILABLE))) {
free(nbound);
return ORTE_ERR_SILENT;
}
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"BINDING PROC %s TO %s NUMBER %u",
ORTE_NAME_PRINT(&proc->name),
hwloc_obj_type_string(proc->locale->type), idx);
/* get the number of cpus under this location */
if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, proc->locale))) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
free(nbound);
return ORTE_ERR_SILENT;
}
/* track number bound */
++nbound[idx];
/* error out if adding a proc would cause overload and that wasn't allowed */
if (ncpus < nbound[idx] &&
!OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
opal_hwloc_base_print_binding(map->binding), node->name,
nbound[idx], ncpus);
free(nbound);
return ORTE_ERR_SILENT;
}
/* bind the proc here */
proc->bind_idx = idx;
cpus = opal_hwloc_base_get_available_cpus(node->topology, proc->locale);
hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, cpus);
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"%s BOUND PROC %s TO %s[%s:%u] on node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name),
proc->cpu_bitmap,
hwloc_obj_type_string(proc->locale->type),
idx, node->name);
}
}
if (NULL != nbound) {
free(nbound);
}
return ORTE_SUCCESS;
}
int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
{
if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding) ||
OPAL_BIND_TO_NONE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
/* no binding requested */
return ORTE_SUCCESS;
}
if (OPAL_BIND_TO_BOARD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
/* doesn't do anything at this time */
return ORTE_SUCCESS;
}
/* binding requested */
/* if the job was mapped by the corresponding target, then
* there is nothing more to do - the launch message creator
* will see that the binding object is NULL and will simply
* use the locale as the place to bind the proc
*
* otherwise, we have to bind either up or down the hwloc
* tree. If we are binding upwards (e.g., mapped to hwthread
* but binding to core), then we just climb the tree to find
* the first matching object.
*
* if we are binding downwards (e.g., mapped to node and bind
* to core), then we have to do a round-robin assigment of
* procs to the resources below.
*/
if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
int rc;
/* record the level for locality purposes */
jdata->map->bind_level = OPAL_HWLOC_HWTHREAD_LEVEL;
if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: bindings for job %s - hwthread to hwthread",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_PU, 0))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
/* HW threads are at the bottom, so all other bindings are upwards */
if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_PU, 0))) {
ORTE_ERROR_LOG(rc);
}
return rc;
} else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
int rc;
/* record the level for locality purposes */
jdata->map->bind_level = OPAL_HWLOC_CORE_LEVEL;
if (ORTE_MAPPING_BYCORE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: bindings for job %s - core to core",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_CORE, 0))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
/* if the mapping policy used is less than bycore, then it is a
* downward binding - i.e., the locale is above the binding location.
* for example, if we map-to-socket and bind-to-core, then we compare
* the mapping value of ORTE_MAPPING_BYCORE to ORTE_MAPPING_BYSOCKET.
* In this case, BYCORE > BYSOCKET, so we know that the locale is
* above the desired binding level (sockets are at a higher level than
* the desired core binding level), and we will have to bind downwards
*/
if (ORTE_MAPPING_BYCORE > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CORE, 0))) {
ORTE_ERROR_LOG(rc);
}
} else {
if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_CORE, 0))) {
ORTE_ERROR_LOG(rc);
}
}
return rc;
} else if (OPAL_BIND_TO_L1CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
int rc;
/* record the level for locality purposes */
jdata->map->bind_level = OPAL_HWLOC_L1CACHE_LEVEL;
if (ORTE_MAPPING_BYL1CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: bindings for job %s - L1cache to L1cache",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_CACHE, 1))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
/* if the mapping policy is less than l1cache, then it is a
* downward binding
*/
if (ORTE_MAPPING_BYL1CACHE > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 1))) {
ORTE_ERROR_LOG(rc);
}
} else {
if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_CACHE, 1))) {
ORTE_ERROR_LOG(rc);
}
}
return rc;
} else if (OPAL_BIND_TO_L2CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
int rc;
/* record the level for locality purposes */
jdata->map->bind_level = OPAL_HWLOC_L2CACHE_LEVEL;
if (ORTE_MAPPING_BYL2CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: bindings for job %s - L2cache to L2cache",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_CACHE, 2))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
/* if the mapping policy is less than l2cache, then it is a
* downward binding
*/
if (ORTE_MAPPING_BYL2CACHE > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 2))) {
ORTE_ERROR_LOG(rc);
}
} else {
if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_CACHE, 2))) {
ORTE_ERROR_LOG(rc);
}
}
return rc;
} else if (OPAL_BIND_TO_L3CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
int rc;
/* record the level for locality purposes */
jdata->map->bind_level = OPAL_HWLOC_L3CACHE_LEVEL;
if (ORTE_MAPPING_BYL3CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: bindings for job %s - L3cache to L3cache",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_CACHE, 3))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
/* if the mapping policy is less than l3cache, then it is a
* downward binding
*/
if (ORTE_MAPPING_BYL3CACHE > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 3))) {
ORTE_ERROR_LOG(rc);
}
} else {
if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_CACHE, 3))) {
ORTE_ERROR_LOG(rc);
}
}
return rc;
} else if (OPAL_BIND_TO_SOCKET == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
int rc;
/* record the level for locality purposes */
jdata->map->bind_level = OPAL_HWLOC_SOCKET_LEVEL;
if (ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: bindings for job %s - socket to socket",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_SOCKET, 0))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
/* if the mapping policy is less than bysocket, then it is a
* downward binding
*/
if (ORTE_MAPPING_BYSOCKET > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_SOCKET, 0))) {
ORTE_ERROR_LOG(rc);
}
} else {
if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_SOCKET, 0))) {
ORTE_ERROR_LOG(rc);
}
}
return rc;
} else if (OPAL_BIND_TO_NUMA == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
int rc;
/* record the level for locality purposes */
jdata->map->bind_level = OPAL_HWLOC_NUMA_LEVEL;
if (ORTE_MAPPING_BYNUMA == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: bindings for job %s - numa to numa",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_NODE, 0))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
/* if the mapping policy is less than numa, then it is a
* downward binding
*/
if (ORTE_MAPPING_BYNUMA > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_NODE, 0))) {
ORTE_ERROR_LOG(rc);
}
} else {
if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_NODE, 0))) {
ORTE_ERROR_LOG(rc);
}
}
return rc;
} else {
ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED);
return ORTE_ERR_NOT_SUPPORTED;
}
return ORTE_SUCCESS;
}
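For readers unfamiliar with the downward-binding case handled above (the mapping object sits higher in the topology than the binding target), the core of bind_downwards is a round-robin over the target objects that always picks the least-loaded one and refuses to exceed the cpu count unless overloading was explicitly allowed. A stripped-down sketch of just that bookkeeping, with made-up sizes and no hwloc calls, looks like this:

/* Illustrative sketch of the least-loaded round-robin with an overload
 * guard; object counts and cpu counts here are invented. */
#include <stdio.h>

#define NOBJS  4   /* e.g. cores under the mapped socket */
#define NCPUS  1   /* cpus available under each object */

static int pick_least_loaded(const int *nbound, int nobjs)
{
    int idx = 0;
    for (int n = 1; n < nobjs; n++) {
        if (nbound[n] < nbound[idx]) {
            idx = n;
        }
    }
    return idx;
}

int main(void)
{
    int nbound[NOBJS] = {0};
    int overload_allowed = 0;

    for (int proc = 0; proc < 6; proc++) {
        int idx = pick_least_loaded(nbound, NOBJS);
        if (nbound[idx] + 1 > NCPUS && !overload_allowed) {
            /* mirrors the rmaps:binding-overload error path */
            printf("proc %d would overload object %d - error out\n", proc, idx);
            return 1;
        }
        nbound[idx]++;
        printf("proc %d -> object %d\n", proc, idx);
    }
    return 0;
}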

View File

@ -1,356 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <string.h>
#include "opal/util/if.h"
#include "opal/util/output.h"
#include "opal/util/opal_sos.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/util/show_help.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/hostfile/hostfile.h"
#include "orte/util/dash_host/dash_host.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/base/rmaps_private.h"
#include "orte/mca/rmaps/base/base.h"
/*
* determine the proper starting point for the next mapping operation
*/
opal_list_item_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list, orte_job_t *jdata)
{
opal_list_item_t *item, *cur_node_item;
orte_node_t *node, *nd1, *ndmin;
int overload;
/* if a bookmark exists from some prior mapping, set us to start there */
if (NULL != jdata->bookmark) {
cur_node_item = NULL;
/* find this node on the list */
for (item = opal_list_get_first(node_list);
item != opal_list_get_end(node_list);
item = opal_list_get_next(item)) {
node = (orte_node_t*)item;
if (node->index == jdata->bookmark->index) {
cur_node_item = item;
break;
}
}
/* see if we found it - if not, just start at the beginning */
if (NULL == cur_node_item) {
cur_node_item = opal_list_get_first(node_list);
}
} else {
/* if no bookmark, then just start at the beginning of the list */
cur_node_item = opal_list_get_first(node_list);
}
/* is this node fully subscribed? If so, then the first
* proc we assign will oversubscribe it, so let's look
* for another candidate
*/
node = (orte_node_t*)cur_node_item;
ndmin = node;
overload = ndmin->slots_inuse - ndmin->slots_alloc;
if (node->slots_inuse >= node->slots_alloc) {
/* work down the list - is there another node that
* would not be oversubscribed?
*/
if (cur_node_item != opal_list_get_last(node_list)) {
item = opal_list_get_next(cur_node_item);
} else {
item = opal_list_get_first(node_list);
}
while (item != cur_node_item) {
nd1 = (orte_node_t*)item;
if (nd1->slots_inuse < nd1->slots_alloc) {
/* this node is not oversubscribed! use it! */
return item;
}
/* this one was also oversubscribed, keep track of the
* node that has the least usage - if we can't
* find anyone who isn't fully utilized, we will
* start with the least used node
*/
if (overload >= (nd1->slots_inuse - nd1->slots_alloc)) {
ndmin = nd1;
overload = ndmin->slots_inuse - ndmin->slots_alloc;
}
if (item == opal_list_get_last(node_list)) {
item = opal_list_get_first(node_list);
} else {
item= opal_list_get_next(item);
}
}
/* if we get here, then we cycled all the way around the
* list without finding a better answer - just use the node
* that is minimally overloaded
*/
cur_node_item = (opal_list_item_t*)ndmin;
}
return cur_node_item;
}
/*
* Query the registry for all nodes allocated to a specified app_context
*/
int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
opal_list_t *node_list, orte_vpid_t num_procs,
opal_list_item_t *cur_node_item)
{
int rc=ORTE_SUCCESS;
int i;
orte_node_t *node;
orte_proc_t *proc;
opal_list_item_t *next;
orte_vpid_t num_alloc = 0;
orte_vpid_t start;
int num_procs_to_assign, num_possible_procs;
/* This loop continues until all procs have been mapped or we run
out of resources. We determine that we have "run out of
resources" when either all nodes have slots_max processes mapped to them,
(thus there are no free slots for a process to be mapped), OR all nodes
have reached their soft limit and the user directed us to "no oversubscribe".
If we still have processes that haven't been mapped yet, then it's an
"out of resources" error. */
start = jdata->num_procs;
while ( num_alloc < num_procs) {
/** see if any nodes remain unused and available. We need to do this check
* each time since we may remove nodes from the list (as they become fully
* used) as we cycle through the loop */
if(0 >= opal_list_get_size(node_list) ) {
/* Everything is at max usage! :( */
orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:alloc-error",
true, num_procs, app->app);
return ORTE_ERR_SILENT;
}
/* Save the next node we can use before claiming slots, since
* we may need to prune the nodes list removing overused nodes.
* Wrap around to beginning if we are at the end of the list */
if (opal_list_get_end(node_list) == opal_list_get_next(cur_node_item)) {
next = opal_list_get_first(node_list);
}
else {
next = opal_list_get_next(cur_node_item);
}
/** declare a shorter name for convenience in the code below */
node = (orte_node_t*) cur_node_item;
/* If we have available slots on this node, claim all of them
* If node_slots == 0, assume 1 slot for that node.
* JJH - is this assumption fully justified?
*
* If we are now oversubscribing the nodes, then we still take:
* (a) if the node has not been used yet, we take a full node_slots
* (b) if some of the slots are in-use, then we take the number of
* remaining slots before hitting the soft limit (node_slots)
* (c) if we are at or above the soft limit, we take a full node_slots
* unless we are loadbalancing, in which case we only take one
*
* Note: if node_slots is zero, then we always just take 1 slot
*
* We continue this process until either everything is done,
* or all nodes have hit their hard limit. This algorithm ensures we
* fully utilize each node before oversubscribing, and preserves the ratio
* of processes between the nodes thereafter (e.g., if one node has twice as
* many processes as another before oversubscribing, it will continue
* to do so after oversubscribing).
*/
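/* Worked example (illustrative, not part of the original code, and assuming
 * oversubscription is permitted): two nodes with 4 and 2 allocated slots and
 * 12 procs to place. The first cycle fills the soft limits (4 + 2 = 6 procs);
 * each later cycle again takes a full node_slots from each node, so the final
 * layout is 8 procs on the 4-slot node and 4 on the 2-slot node - the
 * original 2:1 ratio between the nodes is preserved.
 */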
if (node->slots_inuse >= node->slots_alloc || 0 == node->slots_inuse) {
if (0 == node->slots_alloc) {
num_procs_to_assign = 1;
} else {
/* 'num_possible_procs' defines the number of ranks */
num_possible_procs = node->slots_alloc;
if (0 == num_possible_procs) {
num_procs_to_assign = 1;
} else {
num_procs_to_assign = num_possible_procs;
}
}
} else {
/* 'num_possible_procs' defines the number of ranks remaining on the node.
 * Each rank occupies one slot, and each slot may represent more than one
 * cpu, depending on the cpus-per-task setting
*/
num_possible_procs = (node->slots_alloc - node->slots_inuse);
if (0 == num_possible_procs) {
num_procs_to_assign = 1;
} else {
num_procs_to_assign = num_possible_procs;
}
}
/* check if we are in npernode mode - if so, then set the num_slots_to_take
* to the num_per_node
*/
if (0 < jdata->map->npernode) {
num_procs_to_assign = jdata->map->npernode;
}
for( i = 0; i < num_procs_to_assign; ++i) {
proc = NULL;
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
jdata->map->cpus_per_rank, app->idx,
node_list, jdata->map->oversubscribe,
true, &proc))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
* really isn't an error - we just need to break from the loop
* since the node is fully used up. For now, just don't report
* an error
*/
if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc)) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* assign the vpid */
proc->name.vpid = start++;
/* Update the number of procs allocated */
++num_alloc;
/** if all the procs have been mapped, we return */
if (num_alloc == num_procs) {
goto complete;
}
/* if we have fully used up this node, then break from the loop */
if (ORTE_ERR_NODE_FULLY_USED == OPAL_SOS_GET_ERROR_CODE(rc)) {
break;
}
}
/* we move on to the next node in all cases EXCEPT if we came
* out of the loop without having taken a full bite AND the
* node is NOT max'd out
*
*/
if (i < (num_procs_to_assign-1) &&
ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc)) {
continue;
}
cur_node_item = next;
}
complete:
/* save the bookmark */
jdata->bookmark = (orte_node_t*)cur_node_item;
return ORTE_SUCCESS;
}
int orte_rmaps_base_map_bynode(orte_job_t *jdata, orte_app_context_t *app,
opal_list_t *node_list, orte_vpid_t num_procs,
opal_list_item_t *cur_node_item)
{
int rc = ORTE_SUCCESS;
opal_list_item_t *next;
orte_node_t *node;
orte_proc_t *proc;
orte_vpid_t num_alloc=0;
orte_vpid_t start;
/* This loop continues until all procs have been mapped or we run
out of resources. We determine that we have "run out of
resources" when all nodes have slots_max processes mapped to them,
thus there are no free slots for a process to be mapped, or we have
hit the soft limit on all nodes and are in a "no oversubscribe" state.
If we still have processes that haven't been mapped yet, then it's an
"out of resources" error.
In this scenario, we rely on the claim_slot function to handle the
oversubscribed case. The claim_slot function will leave a node on the
list until it either reaches slots_max OR reaches the
soft limit and the "no_oversubscribe" flag has been set - at which point,
the node will be removed to prevent any more processes from being mapped to
it. Since we are taking one slot from each node as we cycle through the
list, oversubscription is automatically taken care of via this logic.
*/
start = jdata->num_procs;
while (num_alloc < num_procs) {
/** see if any nodes remain unused and available. We need to do this check
* each time since we may remove nodes from the list (as they become fully
* used) as we cycle through the loop */
if(0 >= opal_list_get_size(node_list) ) {
/* No more nodes to allocate :( */
orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:alloc-error",
true, num_procs, app->app);
return ORTE_ERR_SILENT;
}
/* Save the next node we can use before claiming slots, since
* we may need to prune the nodes list removing overused nodes.
* Wrap around to beginning if we are at the end of the list */
if (opal_list_get_end(node_list) == opal_list_get_next(cur_node_item)) {
next = opal_list_get_first(node_list);
}
else {
next = opal_list_get_next(cur_node_item);
}
/* Allocate a slot on this node */
node = (orte_node_t*) cur_node_item;
proc = NULL;
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, jdata->map->cpus_per_rank, app->idx,
node_list, jdata->map->oversubscribe, true, &proc))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
* really isn't an error - we just need to break from the loop
* since the node is fully used up. For now, just don't report
* an error
*/
if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc)) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* assign the vpid */
proc->name.vpid = start++;
/* Update the number of procs allocated */
++num_alloc;
cur_node_item = next;
}
/* save the bookmark */
jdata->bookmark = (orte_node_t*)cur_node_item;
return ORTE_SUCCESS;
}
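/* Illustrative comparison of the two mappers above (example values only,
 * not from the original source): with 2 nodes of 4 slots each and 6 procs,
 *   map_byslot -> node0: ranks 0-3,   node1: ranks 4-5
 *   map_bynode -> node0: ranks 0,2,4  node1: ranks 1,3,5
 */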
@@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -24,6 +25,7 @@
#include "opal/mca/mca.h"
#include "opal/util/output.h"
#include "opal/mca/base/base.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
@@ -51,6 +53,10 @@ int orte_rmaps_base_map_job(orte_job_t *jdata)
* DO SO, AND ALL PLM COMMANDS ARE RELAYED TO HNP
*/
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: mapping job %s",
ORTE_JOBID_PRINT(jdata->jobid));
/* NOTE: CHECK FOR JDATA->MAP == NULL. IF IT IS, THEN USE
* THE VALUES THAT WERE READ BY THE LOCAL MCA PARAMS. THE
* PLM PROXY WILL SEND A JOB-OBJECT THAT WILL INCLUDE ANY
@@ -71,10 +77,15 @@ int orte_rmaps_base_map_job(orte_job_t *jdata)
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* load it with the system defaults */
map->mapping = orte_rmaps_base.mapping;
map->ranking = orte_rmaps_base.ranking;
#if OPAL_HAVE_HWLOC
map->binding = opal_hwloc_binding_policy;
#endif
if (NULL != orte_rmaps_base.ppr) {
map->ppr = strdup(orte_rmaps_base.ppr);
}
map->cpus_per_rank = orte_rmaps_base.cpus_per_rank;
map->display_map = orte_rmaps_base.display_map;
/* assign the map object to this job */
jdata->map = map;
@@ -82,60 +93,174 @@ int orte_rmaps_base_map_job(orte_job_t *jdata)
if (!jdata->map->display_map) {
jdata->map->display_map = orte_rmaps_base.display_map;
}
/* set the default mapping policy IFF it wasn't provided */
if (!ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
ORTE_SET_MAPPING_POLICY(jdata->map->mapping, orte_rmaps_base.mapping);
}
if (!ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping));
}
/* ditto for rank and bind policies */
if (!ORTE_RANKING_POLICY_IS_SET(jdata->map->ranking)) {
ORTE_SET_RANKING_POLICY(jdata->map->ranking, orte_rmaps_base.ranking);
}
#if OPAL_HAVE_HWLOC
if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
jdata->map->binding = opal_hwloc_binding_policy;
}
#endif
}
/* if the job is the daemon job, then we are just mapping daemons and
 * not apps in preparation to launch a virtual machine
 */
if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: mapping daemons");
if (ORTE_SUCCESS != (rc = orte_rmaps_base_setup_virtual_machine(jdata))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
/* cycle thru the available mappers until one agrees to map
* the job
*/
did_map = false;
for (item = opal_list_get_first(&orte_rmaps_base.selected_modules);
item != opal_list_get_end(&orte_rmaps_base.selected_modules);
item = opal_list_get_next(item)) {
mod = (orte_rmaps_base_selected_module_t*)item;
if (ORTE_SUCCESS == (rc = mod->module->map_job(jdata))) {
did_map = true;
break;
}
/* mappers return "next option" if they didn't attempt to
* map the job. anything else is a true error.
*/
if (ORTE_ERR_TAKE_NEXT_OPTION != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* if we get here without doing the map, or with zero procs in
 * the map, then that's an error
 */
if (!did_map || 0 == jdata->num_procs) {
orte_show_help("help-orte-rmaps-base.txt", "failed-map", true);
return ORTE_ERR_FAILED_TO_MAP;
}
/* compute and save local ranks */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
#if OPAL_HAVE_HWLOC
/* compute and save bindings */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
#endif
/* if we wanted to display the map, now is the time to do it - ignore
* daemon job
*/
if (jdata->map->display_map) {
char *output;
int i, j;
orte_node_t *node;
orte_proc_t *proc;
if (orte_display_diffable_output) {
/* intended solely to test mapping methods, this output
* can become quite long when testing at scale. Rather
* than enduring all the malloc/free's required to
* create an arbitrary-length string, custom-generate
* the output a line at a time here
*/
/* display just the procs in a diffable format */
opal_output(orte_clean_output, "<map>");
fflush(stderr);
/* loop through nodes */
for (i=0; i < jdata->map->nodes->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) {
continue;
}
opal_output(orte_clean_output, "\t<host name=%s>", (NULL == node->name) ? "UNKNOWN" : node->name);
fflush(stderr);
for (j=0; j < node->procs->size; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
}
#if OPAL_HAVE_HWLOC
{
char locale[64];
if (NULL != proc->locale) {
hwloc_bitmap_list_snprintf(locale, 64, proc->locale->cpuset);
}
opal_output(orte_clean_output, "\t\t<process rank=%s app_idx=%ld local_rank=%lu node_rank=%lu locale=%s binding=%s[%s:%u]>",
ORTE_VPID_PRINT(proc->name.vpid), (long)proc->app_idx,
(unsigned long)proc->local_rank,
(unsigned long)proc->node_rank, locale,
(NULL == proc->cpu_bitmap) ? "NULL" : proc->cpu_bitmap,
opal_hwloc_base_print_level(jdata->map->bind_level), proc->bind_idx);
}
#else
opal_output(orte_clean_output, "\t\t<process rank=%s app_idx=%ld local_rank=%lu node_rank=%lu>",
ORTE_VPID_PRINT(proc->name.vpid), (long)proc->app_idx,
(unsigned long)proc->local_rank,
(unsigned long)proc->node_rank);
#endif
fflush(stderr);
}
opal_output(orte_clean_output, "\t</host>");
fflush(stderr);
}
#if OPAL_HAVE_HWLOC
{
opal_paffinity_locality_t locality;
orte_proc_t *p0;
/* test locality - for the first node, print the locality of each proc relative to the first one */
node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, 0);
p0 = (orte_proc_t*)opal_pointer_array_get_item(node->procs, 0);
opal_output(orte_clean_output, "\t<locality>");
for (j=1; j < node->procs->size; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
}
locality = opal_hwloc_base_get_relative_locality(node->topology,
jdata->map->bind_level,
p0->bind_idx,
jdata->map->bind_level,
proc->bind_idx);
opal_output(orte_clean_output, "\t\t<bind_level=%s rank=%s bind_idx=%u rank=%s bind_idx=%u locality=%s>",
opal_hwloc_base_print_level(jdata->map->bind_level),
ORTE_VPID_PRINT(p0->name.vpid),
p0->bind_idx, ORTE_VPID_PRINT(proc->name.vpid),
proc->bind_idx, opal_hwloc_base_print_locality(locality));
}
opal_output(orte_clean_output, "\t</locality>\n</map>");
fflush(stderr);
}
#else
opal_output(orte_clean_output, "\n</map>");
fflush(stderr);
#endif
} else {
opal_dss.print(&output, NULL, jdata->map, ORTE_JOB_MAP);
if (orte_xml_output) {
fprintf(orte_xml_fp, "%s\n", output);
fflush(orte_xml_fp);
} else {
opal_output(orte_clean_output, "%s", output);
}
free(output);
}
}
return ORTE_SUCCESS;
@@ -30,7 +30,6 @@
#include "opal/util/output.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/show_help.h"
@@ -82,36 +81,231 @@ orte_rmaps_t orte_rmaps = {
*/
int orte_rmaps_base_open(void)
{
int param, value, i;
char *policy;
bool btmp;
orte_mapping_policy_t tmp=0;
orte_ranking_policy_t rtmp=0;
char **ck, **ck2;
size_t len;
/* init the globals */
OBJ_CONSTRUCT(&orte_rmaps_base.selected_modules, opal_list_t);
orte_rmaps_base.ppr = NULL;
orte_rmaps_base.cpus_per_rank = 1;
orte_rmaps_base.display_map = false;
orte_rmaps_base.slot_list = NULL;
orte_rmaps_base.mapping = 0;
orte_rmaps_base.ranking = 0;
/* Debugging / verbose output. Always have stream open, with
verbose set by the mca open system... */
orte_rmaps_base.rmaps_output = opal_output_open(NULL);
/* define default mapping policy */
param = mca_base_param_reg_string_name("rmaps", "base_mapping_policy",
#if OPAL_HAVE_HWLOC
"Mapping Policy [slot (default) | hwthread | core | l1cache | l2cache | l3cache | socket | numa | board | node | seq], with allowed modifiers :SPAN,OVERSUBSCRIBE,NOOVERSUBSCRIBE",
#else
"Mapping Policy [slot (default) | node], with allowed modifiers :SPAN,OVERSUBSCRIBE,NOOVERSUBSCRIBE",
#endif
false, false, NULL, &policy);
mca_base_param_reg_syn_name(param, "rmaps", "base_schedule_policy", true);
if (NULL == policy) {
ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYSLOT);
ORTE_UNSET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
} else {
ck = opal_argv_split(policy, ':');
if (2 < opal_argv_count(ck)) {
/* incorrect format */
orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy", true, "mapping", policy);
opal_argv_free(ck);
return ORTE_ERR_SILENT;
}
if (2 == opal_argv_count(ck)) {
ck2 = opal_argv_split(ck[1], ',');
for (i=0; NULL != ck2[i]; i++) {
if (0 == strncasecmp(ck2[i], "span", strlen(ck2[i]))) {
orte_rmaps_base.mapping |= ORTE_MAPPING_SPAN;
} else if (0 == strncasecmp(ck2[i], "oversubscribe", strlen(ck2[i]))) {
if (ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
/* error - cannot redefine the default mapping policy */
orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
"oversubscribe", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
return ORTE_ERR_SILENT;
}
ORTE_UNSET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_SUBSCRIBE_GIVEN);
} else if (0 == strncasecmp(ck2[i], "nooversubscribe", strlen(ck2[i]))) {
if (ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
/* error - cannot redefine the default mapping policy */
orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
"nooversubscribe", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
return ORTE_ERR_SILENT;
}
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_SUBSCRIBE_GIVEN);
} else {
/* unrecognized modifier */
orte_show_help("help-orte-rmaps-base.txt", "unrecognized-modifier", true, "mapping", ck2[i]);
opal_argv_free(ck);
opal_argv_free(ck2);
return ORTE_ERR_SILENT;
}
}
opal_argv_free(ck2);
}
len = strlen(ck[0]);
if (0 == strncasecmp(ck[0], "slot", len)) {
tmp = ORTE_MAPPING_BYSLOT;
} else if (0 == strncasecmp(ck[0], "node", len)) {
tmp = ORTE_MAPPING_BYNODE;
#if OPAL_HAVE_HWLOC
} else if (0 == strncasecmp(ck[0], "core", len)) {
tmp = ORTE_MAPPING_BYCORE;
} else if (0 == strncasecmp(ck[0], "l1cache", len)) {
tmp = ORTE_MAPPING_BYL1CACHE;
} else if (0 == strncasecmp(ck[0], "l2cache", len)) {
tmp = ORTE_MAPPING_BYL2CACHE;
} else if (0 == strncasecmp(ck[0], "l3cache", len)) {
tmp = ORTE_MAPPING_BYL3CACHE;
} else if (0 == strncasecmp(ck[0], "socket", len)) {
tmp = ORTE_MAPPING_BYSOCKET;
} else if (0 == strncasecmp(ck[0], "numa", len)) {
tmp = ORTE_MAPPING_BYNUMA;
} else if (0 == strncasecmp(ck[0], "board", len)) {
tmp = ORTE_MAPPING_BYBOARD;
} else if (0 == strncasecmp(ck[0], "hwthread", len)) {
tmp = ORTE_MAPPING_BYHWTHREAD;
/* if we are mapping processes to individual hwthreads, then
* we need to treat those hwthreads as separate cpus
*/
opal_hwloc_use_hwthreads_as_cpus = true;
#endif
} else {
orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy", true, "mapping", policy);
opal_argv_free(ck);
return ORTE_ERR_SILENT;
}
ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, tmp);
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
opal_argv_free(ck);
}
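/* Examples of how the policy string above parses (illustrative only, not
 * part of the original commit):
 *   "socket"               -> ORTE_MAPPING_BYSOCKET
 *   "core:span"            -> ORTE_MAPPING_BYCORE  plus ORTE_MAPPING_SPAN
 *   "slot:nooversubscribe" -> ORTE_MAPPING_BYSLOT  plus ORTE_MAPPING_NO_OVERSUBSCRIBE
 * In every case ORTE_MAPPING_GIVEN is also set so that later code knows the
 * user made an explicit choice.
 */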
/* define default ranking policy */
param = mca_base_param_reg_string_name("rmaps", "base_ranking_policy",
#if OPAL_HAVE_HWLOC
"Ranking Policy [slot (default) | hwthread | core | l1cache | l2cache | l3cache | socket | numa | board | node], with modifier :SPAN or :FILL",
#else
"Ranking Policy [slot (default) | node]",
#endif
false, false, NULL, &policy);
if (NULL == policy) {
ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, ORTE_RANK_BY_SLOT);
} else {
ck = opal_argv_split(policy, ':');
if (2 < opal_argv_count(ck)) {
/* incorrect format */
orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy", true, "ranking", policy);
opal_argv_free(ck);
return ORTE_ERR_SILENT;
}
if (2 == opal_argv_count(ck)) {
if (0 == strncasecmp(ck[1], "span", strlen(ck[1]))) {
orte_rmaps_base.ranking |= ORTE_RANKING_SPAN;
} else if (0 == strncasecmp(ck[1], "fill", strlen(ck[1]))) {
orte_rmaps_base.ranking |= ORTE_RANKING_FILL;
} else {
/* unrecognized modifier */
orte_show_help("help-orte-rmaps-base.txt", "unrecognized-modifier", true, "ranking", ck[1]);
opal_argv_free(ck);
return ORTE_ERR_SILENT;
}
}
len = strlen(ck[0]);
if (0 == strncasecmp(ck[0], "slot", len)) {
rtmp = ORTE_RANK_BY_SLOT;
} else if (0 == strncasecmp(ck[0], "node", len)) {
rtmp = ORTE_RANK_BY_NODE;
#if OPAL_HAVE_HWLOC
} else if (0 == strncasecmp(ck[0], "hwthread", len)) {
rtmp = ORTE_RANK_BY_HWTHREAD;
} else if (0 == strncasecmp(ck[0], "core", len)) {
rtmp = ORTE_RANK_BY_CORE;
} else if (0 == strncasecmp(ck[0], "l1cache", len)) {
rtmp = ORTE_RANK_BY_L1CACHE;
} else if (0 == strncasecmp(ck[0], "l2cache", len)) {
rtmp = ORTE_RANK_BY_L2CACHE;
} else if (0 == strncasecmp(ck[0], "l3cache", len)) {
rtmp = ORTE_RANK_BY_L3CACHE;
} else if (0 == strncasecmp(ck[0], "socket", len)) {
rtmp = ORTE_RANK_BY_SOCKET;
} else if (0 == strncasecmp(ck[0], "numa", len)) {
rtmp = ORTE_RANK_BY_NUMA;
} else if (0 == strncasecmp(ck[0], "board", len)) {
rtmp = ORTE_RANK_BY_BOARD;
#endif
} else {
orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy", true, "ranking", policy);
return ORTE_ERR_SILENT;
}
ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, rtmp);
ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, ORTE_RANKING_GIVEN);
}
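/* Illustrative ranking examples (not part of the original commit):
 *   "core"        -> ORTE_RANK_BY_CORE
 *   "socket:fill" -> ORTE_RANK_BY_SOCKET with ORTE_RANKING_FILL
 *   "node:span"   -> ORTE_RANK_BY_NODE   with ORTE_RANKING_SPAN
 */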
/* backward compatibility */
mca_base_param_reg_int_name("rmaps", "base_byslot",
"Whether to map and rank processes round-robin by slot",
false, false, (int)false, &value);
if (value) {
/* set mapping policy to byslot - error if something else already set */
if ((ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) &&
ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) != ORTE_MAPPING_BYSLOT) {
/* error - cannot redefine the default mapping policy */
orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
"byslot", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
return ORTE_ERR_SILENT;
}
ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYSLOT);
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
/* set ranking policy to byslot - error if something else already set */
if ((ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(orte_rmaps_base.ranking)) &&
ORTE_GET_RANKING_POLICY(orte_rmaps_base.ranking) != ORTE_RANK_BY_SLOT) {
/* error - cannot redefine the default ranking policy */
orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "ranking",
"byslot", orte_rmaps_base_print_ranking(orte_rmaps_base.ranking));
return ORTE_ERR_SILENT;
}
ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, ORTE_RANK_BY_SLOT);
ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, ORTE_RANKING_GIVEN);
}
mca_base_param_reg_int_name("rmaps", "base_bynode",
"Whether to map and rank processes round-robin by node",
false, false, (int)false, &value);
if (value) {
/* set mapping policy to bynode - error if something else already set */
if ((ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) &&
ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) != ORTE_MAPPING_BYNODE) {
orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
"bynode", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
return ORTE_ERR_SILENT;
}
ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYNODE);
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
/* set ranking policy to bynode - error if something else already set */
if ((ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(orte_rmaps_base.ranking)) &&
ORTE_GET_RANKING_POLICY(orte_rmaps_base.ranking) != ORTE_RANK_BY_NODE) {
/* error - cannot redefine the default ranking policy */
orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "ranking",
"bynode", orte_rmaps_base_print_ranking(orte_rmaps_base.ranking));
return ORTE_ERR_SILENT;
}
ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, ORTE_RANK_BY_NODE);
ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, ORTE_RANKING_GIVEN);
}
#if 0
/* #cpus/rank to use */
param = mca_base_param_reg_int_name("rmaps", "base_cpus_per_proc",
"Number of cpus to use for each rank [1-2**15 (default=1)]",
@@ -119,38 +313,21 @@ int orte_rmaps_base_open(void)
mca_base_param_reg_syn_name(param, "rmaps", "base_cpus_per_rank", false);
mca_base_param_lookup_int(param, &value);
orte_rmaps_base.cpus_per_rank = value;
/* if the cpus/rank > 1, then we have to bind to cores UNLESS the binding has
 * already been set to something else
 */
if (1 < orte_rmaps_base.cpus_per_rank &&
    !OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) {
opal_hwloc_binding_policy |= OPAL_BIND_TO_CORE;
}
#endif
/* Should we schedule on the local node or not? */
mca_base_param_reg_int_name("rmaps", "base_no_schedule_local",
"If false, allow scheduling MPI applications on the same node as mpirun (default). If true, do not schedule any MPI applications on the same node as mpirun",
false, false, (int)false, &value);
if (value) {
orte_rmaps_base.mapping |= ORTE_MAPPING_NO_USE_LOCAL;
}
/* Should we oversubscribe or not? */
@@ -159,11 +336,33 @@ int orte_rmaps_base_open(void)
"If true, then do not allow oversubscription of nodes - mpirun will return an error if there aren't enough nodes to launch all processes without oversubscribing",
false, false, (int)false, &value);
if (value) {
if ((ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) &&
    !(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
/* error - cannot redefine the default mapping policy */
orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
"no-oversubscribe", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
return ORTE_ERR_SILENT;
}
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_SUBSCRIBE_GIVEN);
}
/** force oversubscription permission */
mca_base_param_reg_int_name("rmaps", "base_oversubscribe",
"If true, then =allow oversubscription of nodes",
false, false, (int)false, &value);
if (value) {
if ((ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) &&
(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
/* error - cannot redefine the default mapping policy */
orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
"oversubscribe", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
return ORTE_ERR_SILENT;
}
ORTE_UNSET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_SUBSCRIBE_GIVEN);
}
/* should we display the map after determining it? */
mca_base_param_reg_int_name("rmaps", "base_display_map",
"Whether to display the process map after it is computed",
@@ -200,11 +399,18 @@ int orte_rmaps_base_open(void)
mca_base_components_open("rmaps", orte_rmaps_base.rmaps_output,
mca_rmaps_base_static_components,
&orte_rmaps_base.available_components, true)) {
return ORTE_ERROR;
}
/* check to see if any component indicated a problem */
if (ORTE_MAPPING_CONFLICTED & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
/* the component would have already reported the error, so
* tell the rest of the chain to shut up
*/
return ORTE_ERR_SILENT;
}
/* All done */
return ORTE_SUCCESS;
}
orte/mca/rmaps/base/rmaps_base_print_fns.c (new file, 222 added lines)
@@ -0,0 +1,222 @@
/*
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <string.h>
#include "opal/util/if.h"
#include "opal/util/output.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/threads/tsd.h"
#include "orte/types.h"
#include "orte/util/show_help.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/hostfile/hostfile.h"
#include "orte/util/dash_host/dash_host.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/runtime/data_type_support/orte_dt_support.h"
#include "orte/mca/rmaps/base/rmaps_private.h"
#include "orte/mca/rmaps/base/base.h"
#define ORTE_RMAPS_PRINT_MAX_SIZE 50
#define ORTE_RMAPS_PRINT_NUM_BUFS 16
static bool fns_init=false;
static opal_tsd_key_t print_tsd_key;
static char* orte_rmaps_print_null = "NULL";
typedef struct {
char *buffers[ORTE_RMAPS_PRINT_NUM_BUFS];
int cntr;
} orte_rmaps_print_buffers_t;
static void buffer_cleanup(void *value)
{
int i;
orte_rmaps_print_buffers_t *ptr;
if (NULL != value) {
ptr = (orte_rmaps_print_buffers_t*)value;
for (i=0; i < ORTE_RMAPS_PRINT_NUM_BUFS; i++) {
free(ptr->buffers[i]);
}
}
}
static orte_rmaps_print_buffers_t *get_print_buffer(void)
{
orte_rmaps_print_buffers_t *ptr;
int ret, i;
if (!fns_init) {
/* setup the print_args function */
if (ORTE_SUCCESS != (ret = opal_tsd_key_create(&print_tsd_key, buffer_cleanup))) {
ORTE_ERROR_LOG(ret);
return NULL;
}
fns_init = true;
}
ret = opal_tsd_getspecific(print_tsd_key, (void**)&ptr);
if (OPAL_SUCCESS != ret) return NULL;
if (NULL == ptr) {
ptr = (orte_rmaps_print_buffers_t*)malloc(sizeof(orte_rmaps_print_buffers_t));
for (i=0; i < ORTE_RMAPS_PRINT_NUM_BUFS; i++) {
ptr->buffers[i] = (char *) malloc((ORTE_RMAPS_PRINT_MAX_SIZE+1) * sizeof(char));
}
ptr->cntr = 0;
ret = opal_tsd_setspecific(print_tsd_key, (void*)ptr);
}
return (orte_rmaps_print_buffers_t*) ptr;
}
char* orte_rmaps_base_print_mapping(orte_mapping_policy_t mapping)
{
char *ret, *map, *mymap, *tmp;
orte_rmaps_print_buffers_t *ptr;
if (ORTE_MAPPING_CONFLICTED & ORTE_GET_MAPPING_DIRECTIVE(mapping)) {
return "CONFLICTED";
}
ptr = get_print_buffer();
if (NULL == ptr) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return orte_rmaps_print_null;
}
/* cycle around the ring */
if (ORTE_RMAPS_PRINT_NUM_BUFS == ptr->cntr) {
ptr->cntr = 0;
}
switch(ORTE_GET_MAPPING_POLICY(mapping)) {
case ORTE_MAPPING_BYNODE:
map = "BYNODE";
break;
case ORTE_MAPPING_BYBOARD:
map = "BYBOARD";
break;
case ORTE_MAPPING_BYNUMA:
map = "BYNUMA";
break;
case ORTE_MAPPING_BYSOCKET:
map = "BYSOCKET";
break;
case ORTE_MAPPING_BYL3CACHE:
map = "BYL3CACHE";
break;
case ORTE_MAPPING_BYL2CACHE:
map = "BYL2CACHE";
break;
case ORTE_MAPPING_BYL1CACHE:
map = "BYL1CACHE";
break;
case ORTE_MAPPING_BYCORE:
map = "BYCORE";
break;
case ORTE_MAPPING_BYHWTHREAD:
map = "BYHWTHREAD";
break;
case ORTE_MAPPING_BYSLOT:
map = "BYSLOT";
break;
case ORTE_MAPPING_SEQ:
map = "SEQUENTIAL";
break;
case ORTE_MAPPING_BYUSER:
map = "BYUSER";
break;
default:
if (ORTE_MAPPING_PPR & ORTE_GET_MAPPING_DIRECTIVE(mapping)) {
map = "PPR";
} else {
map = "UNKNOWN";
}
}
if (0 != strcmp(map, "PPR") && (ORTE_MAPPING_PPR & ORTE_GET_MAPPING_DIRECTIVE(mapping))) {
asprintf(&mymap, "%s[PPR]:", map);
} else {
asprintf(&mymap, "%s:", map);
}
if (ORTE_MAPPING_NO_USE_LOCAL & ORTE_GET_MAPPING_DIRECTIVE(mapping)) {
asprintf(&tmp, "%sNO_USE_LOCAL,", mymap);
free(mymap);
mymap = tmp;
}
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(mapping)) {
asprintf(&tmp, "%sNOOVERSUBSCRIBE,", mymap);
free(mymap);
mymap = tmp;
} else if (ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(mapping)) {
asprintf(&tmp, "%sOVERSUBSCRIBE,", mymap);
free(mymap);
mymap = tmp;
}
if (ORTE_MAPPING_SPAN & ORTE_GET_MAPPING_DIRECTIVE(mapping)) {
asprintf(&tmp, "%sSPAN,", mymap);
free(mymap);
mymap = tmp;
}
/* remove the trailing mark */
mymap[strlen(mymap)-1] = '\0';
snprintf(ptr->buffers[ptr->cntr], ORTE_RMAPS_PRINT_MAX_SIZE, "%s", mymap);
free(mymap);
ret = ptr->buffers[ptr->cntr];
ptr->cntr++;
return ret;
}
char* orte_rmaps_base_print_ranking(orte_ranking_policy_t ranking)
{
switch(ORTE_GET_RANKING_POLICY(ranking)) {
case ORTE_RANK_BY_NODE:
return "NODE";
case ORTE_RANK_BY_BOARD:
return "BOARD";
case ORTE_RANK_BY_NUMA:
return "NUMA";
case ORTE_RANK_BY_SOCKET:
return "SOCKET";
case ORTE_RANK_BY_CORE:
return "CORE";
case ORTE_RANK_BY_HWTHREAD:
return "HWTHREAD";
case ORTE_RANK_BY_SLOT:
return "SLOT";
default:
return "UNKNOWN";
}
}
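/* Typical (hypothetical) use of the helpers above when emitting verbose
 * diagnostics - the call shown is an assumed usage for illustration, not
 * code from this commit:
 *
 *   opal_output_verbose(5, orte_rmaps_base.rmaps_output,
 *                       "mapping %s ranking %s",
 *                       orte_rmaps_base_print_mapping(jdata->map->mapping),
 *                       orte_rmaps_base_print_ranking(jdata->map->ranking));
 *
 * Because the mapping printer hands back one of several thread-local ring
 * buffers, it can safely be called more than once in a single argument list.
 */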
orte/mca/rmaps/base/rmaps_base_ranking.c (new file, 737 added lines)
@@ -0,0 +1,737 @@
/*
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <string.h>
#include "opal/class/opal_pointer_array.h"
#include "opal/util/if.h"
#include "opal/util/output.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/threads/tsd.h"
#include "orte/types.h"
#include "orte/util/show_help.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/hostfile/hostfile.h"
#include "orte/util/dash_host/dash_host.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/runtime/data_type_support/orte_dt_support.h"
#include "orte/mca/rmaps/base/rmaps_private.h"
#include "orte/mca/rmaps/base/base.h"
#if OPAL_HAVE_HWLOC
static int rank_span(orte_job_t *jdata,
hwloc_obj_type_t target,
unsigned cache_level)
{
orte_job_map_t *map;
hwloc_obj_t obj;
int num_objs, i, j, n, rc;
orte_vpid_t num_ranked=0;
orte_node_t *node;
orte_proc_t *proc;
orte_vpid_t vpid;
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rank_span: for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
/* if the ranking is spanned, then we perform the
* ranking as if it was one big node - i.e., we
* rank one proc on each object, step to the next object
* moving across all the nodes, then wrap around to the
* first object on the first node.
*
* Node 0 Node 1
* Obj 0 Obj 1 Obj 0 Obj 1
* 0 4 1 5 2 6 3 7
* 8 12 9 13 10 14 11 15
*/
/* In the interest of getting this committed in finite time,
* just loop across the nodes and objects until all procs
* are mapped
*/
map = jdata->map;
vpid = 0;
while (vpid < jdata->num_procs) {
for (n=0; n < map->nodes->size && vpid < jdata->num_procs; n++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) {
continue;
}
/* get the number of objects - only consider those we can actually use */
num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target,
cache_level, OPAL_HWLOC_AVAILABLE);
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rank_span: found %d objects on node %s with %d procs",
num_objs, node->name, (int)node->num_procs);
/* for each object */
for (i=0; i < num_objs && vpid < jdata->num_procs; i++) {
obj = opal_hwloc_base_get_obj_by_type(node->topology, target,
cache_level, i, OPAL_HWLOC_AVAILABLE);
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rank_span: working object %d", i);
/* cycle thru the procs on this node */
for (j=0; j < node->procs->size && vpid < jdata->num_procs; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
}
/* ignore procs from other jobs */
if (proc->name.jobid != jdata->jobid) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rank_span skipping proc %s - from another job, num_ranked %d",
ORTE_NAME_PRINT(&proc->name), num_ranked);
continue;
}
/* ignore procs that are already assigned */
if (ORTE_VPID_INVALID != proc->name.vpid) {
continue;
}
/* protect against bozo case */
if (NULL == proc->locale) {
ORTE_ERROR_LOG(ORTE_ERROR);
return ORTE_ERROR;
}
/* ignore procs not on this object */
if (!hwloc_bitmap_intersects(obj->cpuset, proc->locale->cpuset)) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rank_span: proc at position %d is not on object %d",
j, i);
continue;
}
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rank_span: assigning vpid %s", ORTE_VPID_PRINT(vpid));
proc->name.vpid = vpid++;
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
/* If there is an invalid epoch here, it's because it doesn't exist yet. */
if (0 == ORTE_EPOCH_CMP(ORTE_EPOCH_INVALID,proc->name.epoch)) {
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
}
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* move to next object */
break;
}
}
}
}
return ORTE_SUCCESS;
}
static int rank_fill(orte_job_t *jdata,
hwloc_obj_type_t target,
unsigned cache_level)
{
orte_job_map_t *map;
hwloc_obj_t obj;
int num_objs, i, j, n, rc;
orte_vpid_t num_ranked=0;
orte_node_t *node;
orte_proc_t *proc;
orte_vpid_t vpid;
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rank_fill: for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
/* if the ranking is fill, then we rank all the procs
* within a given object before moving on to the next
*
* Node 0 Node 1
* Obj 0 Obj 1 Obj 0 Obj 1
* 0 1 4 5 8 9 12 13
* 2 3 6 7 10 11 14 15
*/
map = jdata->map;
vpid = 0;
for (n=0; n < map->nodes->size && vpid < jdata->num_procs; n++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) {
continue;
}
/* get the number of objects - only consider those we can actually use */
num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target,
cache_level, OPAL_HWLOC_AVAILABLE);
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rank_fill: found %d objects on node %s with %d procs",
num_objs, node->name, (int)node->num_procs);
/* for each object */
for (i=0; i < num_objs && vpid < jdata->num_procs; i++) {
obj = opal_hwloc_base_get_obj_by_type(node->topology, target,
cache_level, i, OPAL_HWLOC_AVAILABLE);
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rank_fill: working object %d", i);
/* cycle thru the procs on this node */
for (j=0; j < node->procs->size && vpid < jdata->num_procs; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
}
/* ignore procs from other jobs */
if (proc->name.jobid != jdata->jobid) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rank_fill skipping proc %s - from another job, num_ranked %d",
ORTE_NAME_PRINT(&proc->name), num_ranked);
continue;
}
/* ignore procs that are already assigned */
if (ORTE_VPID_INVALID != proc->name.vpid) {
continue;
}
/* protect against bozo case */
if (NULL == proc->locale) {
ORTE_ERROR_LOG(ORTE_ERROR);
return ORTE_ERROR;
}
/* ignore procs not on this object */
if (!hwloc_bitmap_intersects(obj->cpuset, proc->locale->cpuset)) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rank_fill: proc at position %d is not on object %d",
j, i);
continue;
}
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rank_fill: assigning vpid %s", ORTE_VPID_PRINT(vpid));
proc->name.vpid = vpid++;
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
/* If there is an invalid epoch here, it's because it doesn't exist yet. */
if (0 == ORTE_EPOCH_CMP(ORTE_EPOCH_INVALID,proc->name.epoch)) {
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
}
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
}
}
return ORTE_SUCCESS;
}
static int rank_by(orte_job_t *jdata,
hwloc_obj_type_t target,
unsigned cache_level)
{
orte_job_map_t *map;
hwloc_obj_t obj;
int num_objs, i, j, n;
orte_vpid_t num_ranked=0;
orte_node_t *node;
orte_proc_t *proc;
orte_vpid_t vpid;
opal_pointer_array_t objs;
bool all_done;
if (ORTE_RANKING_SPAN & ORTE_GET_RANKING_DIRECTIVE(jdata->map->ranking)) {
return rank_span(jdata, target, cache_level);
} else if (ORTE_RANKING_FILL & ORTE_GET_RANKING_DIRECTIVE(jdata->map->ranking)) {
return rank_fill(jdata, target, cache_level);
}
/* if ranking is not spanned or filled, then we
* default to assign ranks sequentially across
* target objects within a node until that node
* is fully ranked, and then move on to the next
* node
*
* Node 0 Node 1
* Obj 0 Obj 1 Obj 0 Obj 1
* 0 2 1 3 8 10 9 11
* 4 6 5 7 12 14 13 15
*/
/* setup the pointer array */
OBJ_CONSTRUCT(&objs, opal_pointer_array_t);
opal_pointer_array_init(&objs, 2, INT_MAX, 2);
map = jdata->map;
vpid = 0;
for (n=0; n < map->nodes->size && vpid < jdata->num_procs; n++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) {
continue;
}
/* get the number of objects - only consider those we can actually use */
num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target,
cache_level, OPAL_HWLOC_AVAILABLE);
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rank_by: found %d objects on node %s with %d procs",
num_objs, node->name, (int)node->num_procs);
/* collect all the objects */
for (i=0; i < num_objs; i++) {
obj = opal_hwloc_base_get_obj_by_type(node->topology, target,
cache_level, i, OPAL_HWLOC_AVAILABLE);
opal_pointer_array_set_item(&objs, i, obj);
}
/* cycle across the objects, assigning a proc to each one,
* until all procs have been assigned - unfortunately, since
* more than this job may be mapped onto a node, the number
* of procs on the node can't be used to tell us when we
* are done. Instead, we have to just keep going until all
* procs are ranked - which means we have to make one extra
* pass thru the loop
*
* Perhaps someday someone will come up with a more efficient
* algorithm, but this works for now.
*/
all_done = false;
while (!all_done && vpid < jdata->num_procs) {
all_done = true;
/* cycle across the objects */
for (i=0; i < num_objs && vpid < jdata->num_procs; i++) {
obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i);
/* find the next proc on this object */
for (j=0; j < node->procs->size && vpid < jdata->num_procs; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
}
/* ignore procs from other jobs */
if (proc->name.jobid != jdata->jobid) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rank_by skipping proc %s - from another job, num_ranked %d",
ORTE_NAME_PRINT(&proc->name), num_ranked);
continue;
}
/* ignore procs that are already ranked */
if (ORTE_VPID_INVALID != proc->name.vpid) {
continue;
}
/* ignore procs on other objects */
if (!hwloc_bitmap_intersects(obj->cpuset, proc->locale->cpuset)) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rank_by: proc at position %d is not on object %d",
j, i);
continue;
}
proc->name.vpid = vpid++;
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rank_by: assigned rank %s", ORTE_VPID_PRINT(proc->name.vpid));
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
/* If there is an invalid epoch here, it's because it doesn't exist yet. */
if (0 == ORTE_EPOCH_CMP(ORTE_EPOCH_INVALID,proc->name.epoch)) {
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
}
/* flag that one was mapped */
all_done = false;
/* move to next object */
break;
}
}
}
}
/* cleanup */
OBJ_DESTRUCT(&objs);
return ORTE_SUCCESS;
}
#endif
int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
{
orte_job_map_t *map;
orte_vpid_t vpid, cnt;
int i, j;
orte_node_t *node;
orte_proc_t *proc, *ptr;
int rc;
map = jdata->map;
if (ORTE_RANK_BY_NODE == ORTE_GET_RANKING_POLICY(map->ranking) ||
ORTE_RANK_BY_BOARD == ORTE_GET_RANKING_POLICY(map->ranking)) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:base: computing vpids by node for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
/* assign the ranks round-robin across nodes - only one board/node
* at this time, so they are equivalent
*/
cnt=0;
vpid=0;
while (cnt < jdata->num_procs) {
for (i=0; i < map->nodes->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
continue;
}
for (j=0; j < node->procs->size; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
}
/* ignore procs from other jobs */
if (proc->name.jobid != jdata->jobid) {
continue;
}
if (ORTE_VPID_INVALID != proc->name.vpid) {
/* vpid was already assigned, probably by the
* round-robin mapper. Some mappers require that
* we insert the proc into the jdata->procs
* array, while others will have already done it - so check and
* do the operation if required
*/
if (NULL == opal_pointer_array_get_item(jdata->procs, proc->name.vpid)) {
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* if we added it to the array, then account for
* it in our loop - otherwise don't as we would be
* double counting
*/
cnt++;
}
continue;
}
/* find next available vpid */
while (NULL != (ptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, vpid)) &&
ORTE_VPID_INVALID != ptr->name.vpid) {
vpid++;
}
proc->name.vpid = vpid++;
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
/* insert the proc into the jdata->procs array - can't already
* be there as the only way to this point in the code is for the
* vpid to have been INVALID
*/
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
ORTE_ERROR_LOG(rc);
return rc;
}
cnt++;
break; /* move on to next node */
}
}
}
return ORTE_SUCCESS;
}
if (ORTE_RANK_BY_SLOT == ORTE_GET_RANKING_POLICY(map->ranking)) {
/* assign the ranks sequentially */
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:base: computing vpids by slot for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
vpid = 0;
for (i=0; i < map->nodes->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
continue;
}
for (j=0; j < node->procs->size; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
}
/* ignore procs from other jobs */
if (proc->name.jobid != jdata->jobid) {
continue;
}
if (ORTE_VPID_INVALID == proc->name.vpid) {
/* find the next available vpid */
while (NULL != (ptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, vpid)) &&
ORTE_VPID_INVALID != ptr->name.vpid) {
vpid++;
}
proc->name.vpid = vpid++;
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
/* If there is an invalid epoch here, it's because it doesn't exist yet. */
if (0 == ORTE_EPOCH_CMP(ORTE_EPOCH_INVALID,proc->name.epoch)) {
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
}
}
/* some mappers require that we insert the proc into the jdata->procs
* array, while others will have already done it - so check and
* do the operation if required
*/
if (NULL == opal_pointer_array_get_item(jdata->procs, proc->name.vpid)) {
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
}
}
return ORTE_SUCCESS;
}
#if OPAL_HAVE_HWLOC
if (ORTE_RANK_BY_NUMA == ORTE_GET_RANKING_POLICY(map->ranking)) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: computing ranks by NUMA for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_NODE, 0))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
if (ORTE_RANK_BY_SOCKET == ORTE_GET_RANKING_POLICY(map->ranking)) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: computing ranks by socket for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_SOCKET, 0))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
if (ORTE_RANK_BY_L3CACHE == ORTE_GET_RANKING_POLICY(map->ranking)) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: computing ranks by L3cache for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_CACHE, 3))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
if (ORTE_RANK_BY_L2CACHE == ORTE_GET_RANKING_POLICY(map->ranking)) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: computing ranks by L2cache for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_CACHE, 2))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
if (ORTE_RANK_BY_L1CACHE == ORTE_GET_RANKING_POLICY(map->ranking)) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: computing ranks by L1cache for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_CACHE, 1))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
if (ORTE_RANK_BY_CORE == ORTE_GET_RANKING_POLICY(map->ranking)) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: computing ranks by core for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_CORE, 0))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
if (ORTE_RANK_BY_HWTHREAD == ORTE_GET_RANKING_POLICY(map->ranking)) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: computing ranks by hwthread for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_PU, 0))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
#endif
return ORTE_ERR_NOT_IMPLEMENTED;
}
int orte_rmaps_base_compute_local_ranks(orte_job_t *jdata)
{
orte_std_cntr_t i;
int j, k;
orte_node_t *node;
orte_proc_t *proc, *psave, *psave2;
orte_vpid_t minv, minv2;
orte_local_rank_t local_rank;
orte_job_map_t *map;
orte_app_context_t *app;
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s rmaps:base:compute_usage",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* point to map */
map = jdata->map;
/* for each node in the map... */
for (i=0; i < map->nodes->size; i++) {
/* cycle through the array of procs on this node, setting
* local and node ranks, until we
* have done so for all procs on nodes in this map
*/
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
continue;
}
/* init search values */
local_rank = 0;
/* the proc map may have holes in it, so cycle
* all the way through and avoid the holes
*/
for (k=0; k < node->procs->size; k++) {
/* if this proc is NULL, skip it */
if (NULL == opal_pointer_array_get_item(node->procs, k)) {
continue;
}
minv = ORTE_VPID_MAX;
minv2 = ORTE_VPID_MAX;
psave = NULL;
psave2 = NULL;
/* find the minimum vpid proc */
for (j=0; j < node->procs->size; j++) {
/* if this proc is NULL, skip it */
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
}
/* only look at procs for this job when
* determining local rank
*/
if (proc->name.jobid == jdata->jobid &&
ORTE_LOCAL_RANK_INVALID == proc->local_rank &&
proc->name.vpid < minv) {
minv = proc->name.vpid;
psave = proc;
}
/* no matter what job...still have to handle node_rank */
if (ORTE_NODE_RANK_INVALID == proc->node_rank &&
proc->name.vpid < minv2) {
minv2 = proc->name.vpid;
psave2 = proc;
}
}
if (NULL == psave && NULL == psave2) {
/* we must have processed them all for this node! */
break;
}
if (NULL != psave) {
psave->local_rank = local_rank;
++local_rank;
}
if (NULL != psave2) {
psave2->node_rank = node->next_node_rank;
node->next_node_rank++;
}
}
}
/* compute app_rank */
for (i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
k=0;
/* loop thru all procs in job to find those from this app_context */
for (j=0; j < jdata->procs->size; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
continue;
}
if (proc->app_idx != app->idx) {
continue;
}
proc->app_rank = k++;
}
}
return ORTE_SUCCESS;
}
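/* Quick illustration of the three rank flavors computed above (example
 * values only, not from the original source): if node0 hosts vpids 0, 1 and 4
 * from this job plus one proc from another job, then local_rank runs 0, 1, 2
 * over this job's procs in vpid order, node_rank counts every proc on the
 * node regardless of job, and app_rank numbers the procs of each app_context
 * consecutively across the whole job.
 */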
/* when we restart a process on a different node, we have to
* ensure that the node and local ranks assigned to the proc
* don't overlap with any pre-existing proc on that node. If
* we don't, then it would be possible for procs to conflict
* when opening static ports, should that be enabled.
*/
void orte_rmaps_base_update_local_ranks(orte_job_t *jdata, orte_node_t *oldnode,
orte_node_t *newnode, orte_proc_t *newproc)
{
int k;
orte_node_rank_t node_rank;
orte_local_rank_t local_rank;
orte_proc_t *proc;
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s rmaps:base:update_usage",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* if the node hasn't changed, then we can just use the
* pre-defined values
*/
if (oldnode == newnode) {
return;
}
/* if the node has changed, then search the new node for the
* lowest unused local and node rank
*/
node_rank = 0;
retry_nr:
for (k=0; k < newnode->procs->size; k++) {
/* if this proc is NULL, skip it */
if (NULL == (proc = (orte_proc_t *) opal_pointer_array_get_item(newnode->procs, k))) {
continue;
}
if (node_rank == proc->node_rank) {
node_rank++;
goto retry_nr;
}
}
newproc->node_rank = node_rank;
local_rank = 0;
retry_lr:
for (k=0; k < newnode->procs->size; k++) {
/* if this proc is NULL, skip it */
if (NULL == (proc = (orte_proc_t *) opal_pointer_array_get_item(newnode->procs, k))) {
continue;
}
/* ignore procs from other jobs */
if (proc->name.jobid != jdata->jobid) {
continue;
}
if (local_rank == proc->local_rank) {
local_rank++;
goto retry_lr;
}
}
newproc->local_rank = local_rank;
}
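
The search above restarts from the top of the proc array every time it hits a rank that is already taken. The same idea, reduced to a self-contained sketch (the function name and the plain array here are illustrative stand-ins, not ORTE API):

/* Illustration only: find the lowest non-negative integer that does not
 * appear in `used` - the same scan update_local_ranks performs over the
 * procs already placed on the new node.
 */
#include <stdbool.h>

static int lowest_unused_rank(const int *used, int nused)
{
    int candidate = 0;
    bool collided;
    do {
        collided = false;
        for (int k = 0; k < nused; k++) {
            if (used[k] == candidate) {
                candidate++;      /* taken - bump and rescan from the start */
                collided = true;
                break;
            }
        }
    } while (collided);
    return candidate;
}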

View file

@ -9,6 +9,7 @@
 * University of Stuttgart. All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 * All rights reserved.
+* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -30,7 +31,8 @@
 #include "opal/mca/mca.h"
 #include "opal/mca/base/base.h"
 #include "opal/mca/base/mca_base_param.h"
-#include "orte/mca/ess/ess.h"
+#include "opal/mca/hwloc/base/base.h"
+#include "opal/threads/tsd.h"
 #include "orte/types.h"
 #include "orte/util/show_help.h"
@ -39,6 +41,7 @@
 #include "orte/util/hostfile/hostfile.h"
 #include "orte/util/dash_host/dash_host.h"
 #include "orte/mca/errmgr/errmgr.h"
+#include "orte/mca/ess/ess.h"
 #include "orte/runtime/data_type_support/orte_dt_support.h"
 #include "orte/mca/rmaps/base/rmaps_private.h"
@ -60,13 +63,14 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
     *total_num_slots = 0;
     /* if the hnp was allocated, include it unless flagged not to */
-    if (orte_hnp_is_allocated) {
+    if (orte_hnp_is_allocated && !(policy & ORTE_MAPPING_NO_USE_LOCAL)) {
         if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0))) {
             if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
                 /* clear this for future use, but don't include it */
                 node->state = ORTE_NODE_STATE_UP;
             } else if (ORTE_NODE_STATE_NOT_INCLUDED != node->state) {
                 OBJ_RETAIN(node);
+                node->mapped = false;
                 opal_list_append(allocated_nodes, &node->super);
             }
         }
@ -92,6 +96,7 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
              * destructed along the way
              */
             OBJ_RETAIN(node);
+            node->mapped = false;
             opal_list_append(allocated_nodes, &node->super);
         }
     }
@ -264,32 +269,6 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
     }
 #endif
/* If the "no local" option was set, then remove the local node
* from the list
*/
if (policy & ORTE_MAPPING_NO_USE_LOCAL) {
/* we don't need to check through the entire list as
* the head node - if it is on the list at all - will
* always be in the first position
*/
item = opal_list_get_first(allocated_nodes);
node = (orte_node_t*)item;
/* need to check ifislocal because the name in the
* hostfile may not have been FQDN, while name returned
* by gethostname may have been (or vice versa)
*/
if (opal_ifislocal(node->name)) {
opal_list_remove_item(allocated_nodes, item);
OBJ_RELEASE(item); /* "un-retain" it */
}
/** if we aren't mapping daemons, check that anything is left! */
if (NULL != app && 0 == opal_list_get_size(allocated_nodes)) {
orte_show_help("help-orte-rmaps-base.txt",
"orte-rmaps-base:nolocal-no-available-resources", true);
return ORTE_ERR_SILENT;
}
}
     /* if the app is NULL, then we are mapping daemons - so remove
      * all nodes that already have a daemon on them
      *
@ -340,7 +319,12 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
             opal_list_remove_item(allocated_nodes, item);
             OBJ_RELEASE(item);  /* "un-retain" it */
         } else { /** otherwise, add the slots for our job to the total */
-            num_slots += node->slots_alloc;
+            if (0 == node->slots_alloc) {
+                /* always allocate at least one */
+                num_slots++;
+            } else {
+                num_slots += node->slots_alloc;
+            }
         }
         /** go on to next item */
@ -359,682 +343,39 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
     return ORTE_SUCCESS;
 }
int orte_rmaps_base_add_proc_to_map(orte_job_map_t *map, orte_node_t *node,
bool oversubscribed, orte_proc_t *proc)
{
orte_std_cntr_t i;
orte_node_t *node_from_map;
int rc;
/* see if this node has already been assigned to the map - if
* not, then add the pointer to the pointer array
*/
for (i=0; i < map->nodes->size; i++) {
if (NULL == (node_from_map = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
continue;
}
if (node_from_map->index == node->index) {
/* we have this node in the array */
goto PROCESS;
}
}
/* if we get here, then this node isn't already in the map - add it */
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s rmaps:base: adding node %s to map",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == node->name) ? "NULL" : node->name));
if (ORTE_SUCCESS > (rc = opal_pointer_array_add(map->nodes, (void*)node))) {
ORTE_ERROR_LOG(rc);
return rc;
}
OBJ_RETAIN(node); /* maintain accounting on object */
++map->num_nodes;
PROCESS:
/* add the proc to this node's local processes - it is assumed
* that the proc isn't already there as this would be an error
* in the mapper
*/
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s rmaps:base: mapping proc for job %s to node %s whose daemon is %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(proc->name.jobid),
(NULL == node->name) ? "NULL" : node->name,
(NULL == node->daemon) ? "NULL" : ORTE_NAME_PRINT(&(node->daemon->name))));
if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* retain the proc struct so that we correctly track its release */
OBJ_RETAIN(proc);
++node->num_procs;
/* update the oversubscribed state of the node */
node->oversubscribed = oversubscribed;
return ORTE_SUCCESS;
}
/*
* Claim a slot for a specified job on a node
*/
int orte_rmaps_base_claim_slot(orte_job_t *jdata,
orte_node_t *current_node,
int32_t cpus_per_rank,
orte_std_cntr_t app_idx,
opal_list_t *nodes,
bool oversubscribe,
bool remove_from_list,
orte_proc_t **returnproc)
{
orte_proc_t *proc;
bool oversub;
int rc;
/* if we were given a proc, just use it */
if (NULL != returnproc && NULL != *returnproc) {
proc = *returnproc;
} else {
/* create mapped_proc object */
proc = OBJ_NEW(orte_proc_t);
if (NULL == proc) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* set the jobid */
proc->name.jobid = jdata->jobid;
/* flag the proc as ready for launch */
proc->state = ORTE_PROC_STATE_INIT;
/* we do not set the vpid here - this will be done
* during a second phase
*/
/* We do set the epoch here since they all start with the same value. */
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
proc->app_idx = app_idx;
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s rmaps:base:claim_slot: created new proc %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name)));
/* provide returned proc, if requested */
if (NULL != returnproc) {
*returnproc = proc;
}
}
OBJ_RETAIN(current_node); /* maintain accounting on object */
proc->node = current_node;
proc->nodename = current_node->name;
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s rmaps:base:claim_slot mapping proc in job %s to node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid), current_node->name));
/* Be sure to demarcate the slots for this proc as claimed from the node */
current_node->slots_inuse += 1;
/* see if this node is oversubscribed now */
if (current_node->slots_inuse > current_node->slots) {
oversub = true;
} else {
oversub = false;
}
/* assign the proc to the node and ensure the node is on the map */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_add_proc_to_map(jdata->map, current_node,
oversub, proc))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(proc);
return rc;
}
/* If this node has reached its max number of allocatable slots OR it has
* reached the soft limit AND we are in a "no oversubscribe" state, then
* we need to return a flag telling the mapper this is the case so it
* can move on to the next node
*/
if ((0 != current_node->slots_max &&
current_node->slots_inuse >= current_node->slots_max) ||
(!oversubscribe && current_node->slots_inuse >= current_node->slots)) {
/* see if we are supposed to remove the node from the list - some
* mappers want us to do so to avoid any chance of continuing to
* add procs to it
*/
if (NULL != nodes && remove_from_list) {
opal_list_remove_item(nodes, (opal_list_item_t*)current_node);
/* release it - it was retained when we started, so this
* just ensures the instance counter is correctly updated
*/
OBJ_RELEASE(current_node);
}
/* now return the proper code so the caller knows this node
* is fully used
*/
return ORTE_ERR_NODE_FULLY_USED;
}
return ORTE_SUCCESS;
}
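
The "fully used" test above combines the hard slots_max limit with the soft slots limit. A minimal sketch of that predicate, using hypothetical plain-int stand-ins for the orte_node_t fields (illustration only, not ORTE code):

#include <stdbool.h>

/* Node is full if the hard max is reached, or if the soft slot count is
 * reached and oversubscription is not allowed.
 */
static bool node_fully_used(int slots_inuse, int slots, int slots_max,
                            bool oversubscribe_allowed)
{
    if (0 != slots_max && slots_inuse >= slots_max) {
        return true;                 /* hard limit reached */
    }
    if (!oversubscribe_allowed && slots_inuse >= slots) {
        return true;                 /* soft limit, no oversubscription */
    }
    return false;
}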
int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
{
orte_job_map_t *map;
orte_vpid_t vpid, cnt;
int i, j;
orte_node_t *node;
orte_proc_t *proc, *ptr;
int rc;
map = jdata->map;
if (ORTE_MAPPING_BYSLOT & map->policy ||
ORTE_MAPPING_BYSOCKET & map->policy ||
ORTE_MAPPING_BYBOARD & map->policy) {
/* assign the ranks sequentially */
vpid = 0;
for (i=0; i < map->nodes->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
continue;
}
for (j=0; j < node->procs->size; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
}
/* ignore procs from other jobs */
if (proc->name.jobid != jdata->jobid) {
continue;
}
if (ORTE_VPID_INVALID == proc->name.vpid) {
/* find the next available vpid */
while (NULL != (ptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, vpid)) &&
ORTE_VPID_INVALID != ptr->name.vpid) {
vpid++;
}
proc->name.vpid = vpid++;
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
/* If there is an invalid epoch here, it's because it doesn't exist yet. */
if (0 == ORTE_EPOCH_CMP(ORTE_EPOCH_INVALID,proc->name.epoch)) {
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
}
}
/* some mappers require that we insert the proc into the jdata->procs
* array, while others will have already done it - so check and
* do the operation if required
*/
if (NULL == opal_pointer_array_get_item(jdata->procs, proc->name.vpid)) {
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
}
}
return ORTE_SUCCESS;
}
if (ORTE_MAPPING_BYNODE & map->policy) {
/* assign the ranks round-robin across nodes */
cnt=0;
vpid=0;
while (cnt < jdata->num_procs) {
for (i=0; i < map->nodes->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
continue;
}
for (j=0; j < node->procs->size; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
}
/* ignore procs from other jobs */
if (proc->name.jobid != jdata->jobid) {
continue;
}
if (ORTE_VPID_INVALID != proc->name.vpid) {
/* vpid was already assigned, probably by the
* round-robin mapper. Some mappers require that
* we insert the proc into the jdata->procs
* array, while others will have already done it - so check and
* do the operation if required
*/
if (NULL == opal_pointer_array_get_item(jdata->procs, proc->name.vpid)) {
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* if we added it to the array, then account for
* it in our loop - otherwise don't as we would be
* double counting
*/
cnt++;
}
continue;
}
/* find next available vpid */
while (NULL != (ptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, vpid)) &&
ORTE_VPID_INVALID != ptr->name.vpid) {
vpid++;
}
proc->name.vpid = vpid++;
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
/* insert the proc into the jdata->procs array - can't already
* be there as the only way to this point in the code is for the
* vpid to have been INVALID
*/
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
ORTE_ERROR_LOG(rc);
return rc;
}
cnt++;
break; /* move on to next node */
}
}
}
return ORTE_SUCCESS;
}
return ORTE_ERR_NOT_IMPLEMENTED;
}
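
As a worked illustration of the two ranking policies handled above (assumed layout: two nodes with two slots each and four procs; this is a standalone sketch, not ORTE code):

#include <stdio.h>

/* byslot fills a node before moving on; bynode deals ranks round-robin
 * across the nodes.
 */
int main(void)
{
    int nodes = 2, slots = 2, np = nodes * slots;
    printf("byslot:  ");
    for (int v = 0; v < np; v++) {
        printf("rank %d -> node %d  ", v, v / slots);   /* 0,1 on node 0; 2,3 on node 1 */
    }
    printf("\nbynode:  ");
    for (int v = 0; v < np; v++) {
        printf("rank %d -> node %d  ", v, v % nodes);   /* 0,2 on node 0; 1,3 on node 1 */
    }
    printf("\n");
    return 0;
}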
int orte_rmaps_base_compute_local_ranks(orte_job_t *jdata)
{
orte_std_cntr_t i;
int j, k;
orte_node_t *node;
orte_proc_t *proc, *psave, *psave2;
orte_vpid_t minv, minv2;
orte_local_rank_t local_rank;
orte_job_map_t *map;
orte_app_context_t *app;
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s rmaps:base:compute_usage",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* point to map */
map = jdata->map;
/* for each node in the map... */
for (i=0; i < map->nodes->size; i++) {
/* cycle through the array of procs on this node, setting
* local and node ranks, until we
* have done so for all procs on nodes in this map
*/
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
continue;
}
/* init search values */
local_rank = 0;
/* the proc map may have holes in it, so cycle
* all the way through and avoid the holes
*/
for (k=0; k < node->procs->size; k++) {
/* if this proc is NULL, skip it */
if (NULL == opal_pointer_array_get_item(node->procs, k)) {
continue;
}
minv = ORTE_VPID_MAX;
minv2 = ORTE_VPID_MAX;
psave = NULL;
psave2 = NULL;
/* find the minimum vpid proc */
for (j=0; j < node->procs->size; j++) {
/* if this proc is NULL, skip it */
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
}
/* only look at procs for this job when
* determining local rank
*/
if (proc->name.jobid == jdata->jobid &&
ORTE_LOCAL_RANK_INVALID == proc->local_rank &&
proc->name.vpid < minv) {
minv = proc->name.vpid;
psave = proc;
}
/* no matter what job...still have to handle node_rank */
if (ORTE_NODE_RANK_INVALID == proc->node_rank &&
proc->name.vpid < minv2) {
minv2 = proc->name.vpid;
psave2 = proc;
}
}
if (NULL == psave && NULL == psave2) {
/* we must have processed them all for this node! */
break;
}
if (NULL != psave) {
psave->local_rank = local_rank;
++local_rank;
}
if (NULL != psave2) {
psave2->node_rank = node->next_node_rank;
node->next_node_rank++;
}
}
}
/* compute app_rank */
for (i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
k=0;
/* loop thru all procs in job to find those from this app_context */
for (j=0; j < jdata->procs->size; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
continue;
}
if (proc->app_idx != app->idx) {
continue;
}
proc->app_rank = k++;
}
}
return ORTE_SUCCESS;
}
/* when we restart a process on a different node, we have to
* ensure that the node and local ranks assigned to the proc
* don't overlap with any pre-existing proc on that node. If
* we don't, then it would be possible for procs to conflict
* when opening static ports, should that be enabled.
*/
void orte_rmaps_base_update_local_ranks(orte_job_t *jdata, orte_node_t *oldnode,
orte_node_t *newnode, orte_proc_t *newproc)
{
int k;
orte_node_rank_t node_rank;
orte_local_rank_t local_rank;
orte_proc_t *proc;
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s rmaps:base:update_usage",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* if the node hasn't changed, then we can just use the
* pre-defined values
*/
if (oldnode == newnode) {
return;
}
/* if the node has changed, then search the new node for the
* lowest unused local and node rank
*/
node_rank = 0;
retry_nr:
for (k=0; k < newnode->procs->size; k++) {
/* if this proc is NULL, skip it */
if (NULL == (proc = (orte_proc_t *) opal_pointer_array_get_item(newnode->procs, k))) {
continue;
}
if (node_rank == proc->node_rank) {
node_rank++;
goto retry_nr;
}
}
newproc->node_rank = node_rank;
local_rank = 0;
retry_lr:
for (k=0; k < newnode->procs->size; k++) {
/* if this proc is NULL, skip it */
if (NULL == (proc = (orte_proc_t *) opal_pointer_array_get_item(newnode->procs, k))) {
continue;
}
/* ignore procs from other jobs */
if (proc->name.jobid != jdata->jobid) {
continue;
}
if (local_rank == proc->local_rank) {
local_rank++;
goto retry_lr;
}
}
newproc->local_rank = local_rank;
}
int orte_rmaps_base_define_daemons(orte_job_t *jdata)
{
orte_job_map_t *map;
orte_node_t *node;
orte_proc_t *proc;
orte_job_t *daemons;
int i;
int rc;
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s rmaps:base:define_daemons",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
if (ORTE_MAPPING_USE_VM & jdata->map->policy) {
/* nothing for us to do - all daemons are
* defined by definition!
*/
return ORTE_SUCCESS;
}
/* get the daemon job data struct */
if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_HNP->jobid))) {
/* bad news */
ORTE_ERROR_LOG(ORTE_ERR_FATAL);
return ORTE_ERR_FATAL;
}
/* initialize the #new daemons */
map = jdata->map;
map->num_new_daemons = 0;
/* go through the nodes in the map, checking each one's daemon name
*/
for (i=0; i < map->nodes->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
continue;
}
if (NULL == node->daemon) {
/* we haven't defined one for it
* yet, so do so now and indicate it is to be launched
*/
proc = OBJ_NEW(orte_proc_t);
if (NULL == proc) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
proc->name.jobid = ORTE_PROC_MY_HNP->jobid;
if (ORTE_VPID_MAX-1 <= daemons->num_procs) {
/* no more daemons available */
orte_show_help("help-orte-rmaps-base.txt", "out-of-vpids", true);
OBJ_RELEASE(proc);
return ORTE_ERR_OUT_OF_RESOURCE;
}
proc->name.vpid = daemons->num_procs; /* take the next available vpid */
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
proc->node = node;
proc->nodename = node->name;
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s rmaps:base:define_daemons add new daemon %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name)));
/* add the daemon to the daemon job object */
if (0 > (rc = opal_pointer_array_add(daemons->procs, (void*)proc))) {
ORTE_ERROR_LOG(rc);
return rc;
}
++daemons->num_procs;
/* point the node to the daemon */
node->daemon = proc;
OBJ_RETAIN(proc); /* maintain accounting */
/* track number of daemons to be launched */
++map->num_new_daemons;
/* and their starting vpid */
if (ORTE_VPID_INVALID == map->daemon_vpid_start) {
map->daemon_vpid_start = proc->name.vpid;
}
}
/*
* If we are launching on a node where there used to be a daemon, but
* it had previously failed, try to relaunch it. (Daemon Recovery) Do
* this ONLY if there are procs mapped to that daemon!
*/
else if (node->daemon->state > ORTE_PROC_STATE_UNTERMINATED) {
/* If no processes are to be launched on this node, then exclude it */
if( 0 >= node->num_procs ) {
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s rmaps:base:define_daemons Skipping the Recovery of daemon %s [0x%x] Launched: %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&node->daemon->name),
node->daemon->state,
(node->daemon_launched ? "T" : "F")
));
/* since this daemon exists but is not needed, then flag it
* as "launched" to avoid relaunching it for no reason
*/
node->daemon_launched = true;
continue;
}
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s rmaps:base:define_daemons RECOVERING daemon %s [0x%x] Launched: %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&node->daemon->name),
node->daemon->state,
(node->daemon_launched ? "T" : "F")
));
/* flag that the daemon is no longer launched */
node->daemon_launched = false;
/* set the state to indicate launch is in progress */
node->daemon->state = ORTE_PROC_STATE_RESTART;
free(node->daemon->rml_uri);
node->daemon->rml_uri = NULL;
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s rmaps:base:define_daemons add new daemon %s (Recovering old daemon)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&node->daemon->name)));
/* track number of daemons to be launched */
++map->num_new_daemons;
}
else {
/* this daemon was previously defined - flag it */
node->daemon_launched = true;
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s rmaps:base:define_daemons existing daemon %s already launched",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&node->daemon->name)));
}
}
return ORTE_SUCCESS;
}
 int orte_rmaps_base_setup_virtual_machine(orte_job_t *jdata)
 {
-    orte_job_t *jdat;
     orte_node_t *node;
     orte_proc_t *proc;
     orte_job_map_t *map;
-    opal_list_t node_list;
-    opal_list_item_t *item;
-    orte_app_context_t *app;
-    orte_std_cntr_t num_slots;
-    int rc, i, n;
-    bool ignored;
+    int rc, i;
-    /* get the daemon app if provided - may include -host or hostfile
-     * info about available nodes
-     */
-    app = (orte_app_context_t *) opal_pointer_array_get_item(jdata->apps, 0);
     map = jdata->map;
-    /* get the list of all available nodes that do not already
-     * have a daemon on them
+    /* cycle thru all available nodes and find those that do not already
+     * have a daemon on them - no need to include our own as we are
+     * obviously already here!
      */
-    OBJ_CONSTRUCT(&node_list, opal_list_t);
-    if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots,
-                                                               app, map->policy))) {
-        ORTE_ERROR_LOG(rc);
-        OBJ_DESTRUCT(&node_list);
-        return rc;
-    }
-    /* check all other known jobs to see if they have something to
-     * add to the allocation - we won't have seen these and the
-     * daemon job won't have any in its app
-     */
-    for (i=0; i < orte_job_data->size; i++) {
-        if (NULL == (jdat = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) {
-            continue;
-        }
-        for (n=0; n < jdat->apps->size; n++) {
-            if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdat->apps, n))) {
-                continue;
-            }
-            if (NULL != app->hostfile) {
-                /* hostfile was specified - parse it and add it to the list. The
-                 * function automatically ignores duplicates
-                 */
-                if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&node_list,
-                                                                       &ignored,
-                                                                       app->hostfile))) {
-                    ORTE_ERROR_LOG(rc);
-                    OBJ_DESTRUCT(&node_list);
-                    return rc;
-                }
-            }
-            if (NULL != app->dash_host) {
-                /* parse and add to list, ignoring duplicates */
-                if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&node_list,
-                                                                        &ignored,
-                                                                        app->dash_host))) {
-                    ORTE_ERROR_LOG(rc);
-                    OBJ_DESTRUCT(&node_list);
-                    return rc;
-                }
-            }
-        }
-    }
-    /* add all these nodes to the map */
-    while (NULL != (item = opal_list_remove_first(&node_list))) {
-        node = (orte_node_t*)item;
-        /* if this is my node, ignore it - we are already here */
-        if (0 == strcmp(node->name, orte_process_info.nodename)) {
+    for (i=1; i < orte_node_pool->size; i++) {
+        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
+            continue;
+        }
+        /* if this node already has a daemon, skip it */
+        if (NULL != node->daemon) {
             continue;
         }
+        /* add the node to the map */
         opal_pointer_array_add(map->nodes, (void*)node);
         ++(map->num_nodes);
-        /* if this node already has a daemon, release that object
-         * to maintain bookkeeping
-         */
-        if (NULL != node->daemon) {
-            OBJ_RELEASE(node->daemon);
-        }
+        /* maintain accounting */
+        OBJ_RETAIN(node);
         /* create a new daemon object for this node */
         proc = OBJ_NEW(orte_proc_t);
         if (NULL == proc) {
             ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
             return ORTE_ERR_OUT_OF_RESOURCE;
         }
-        proc->name.jobid = ORTE_PROC_MY_HNP->jobid;
+        proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
         if (ORTE_VPID_MAX-1 <= jdata->num_procs) {
             /* no more daemons available */
             orte_show_help("help-orte-rmaps-base.txt", "out-of-vpids", true);
@ -1044,6 +385,8 @@ int orte_rmaps_base_setup_virtual_machine(orte_job_t *jdata)
         proc->name.vpid = jdata->num_procs;  /* take the next available vpid */
         ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
         ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
+        /* point the proc to the node and maintain accounting */
+        OBJ_RETAIN(node);
         proc->node = node;
         proc->nodename = node->name;
         OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
@ -1051,7 +394,7 @@ int orte_rmaps_base_setup_virtual_machine(orte_job_t *jdata)
                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                              ORTE_NAME_PRINT(&proc->name)));
         /* add the daemon to the daemon job object */
-        if (0 > (rc = opal_pointer_array_add(jdata->procs, (void*)proc))) {
+        if (0 > (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, (void*)proc))) {
             ORTE_ERROR_LOG(rc);
             return rc;
         }
@ -1066,7 +409,6 @@ int orte_rmaps_base_setup_virtual_machine(orte_job_t *jdata)
             map->daemon_vpid_start = proc->name.vpid;
         }
     }
-    OBJ_DESTRUCT(&node_list);
     return ORTE_SUCCESS;
 }

View file

@ -9,6 +9,7 @@
 * University of Stuttgart. All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 * All rights reserved.
+* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -48,16 +49,6 @@ ORTE_DECLSPEC orte_job_map_t* orte_rmaps_base_get_job_map(orte_jobid_t job);
 /* LOCAL FUNCTIONS for use by RMAPS components */
-/*
- * Function to add a mapped_proc entry to a map
- * Scans list of nodes on map to see if the specified one already
- * exists - if so, just add this entry to that node's list of
- * procs. If not, then add new node entry and put this proc
- * on its list.
- */
-int orte_rmaps_base_add_proc_to_map(orte_job_map_t *map, orte_node_t *node,
-                                    bool oversubscribed, orte_proc_t *proc);
 ORTE_DECLSPEC int orte_rmaps_base_get_target_nodes(opal_list_t* node_list,
                                                    orte_std_cntr_t *total_num_slots,
                                                    orte_app_context_t *app,
@ -70,39 +61,19 @@ ORTE_DECLSPEC int orte_rmaps_base_get_mapped_targets(opal_list_t *mapped_node_li
                                                      opal_list_t *master_node_list,
                                                      orte_std_cntr_t *total_num_slots);
-ORTE_DECLSPEC int orte_rmaps_base_claim_slot(orte_job_t *jdata,
-                                             orte_node_t *current_node,
-                                             int32_t stride,
-                                             orte_std_cntr_t app_idx,
-                                             opal_list_t *nodes,
-                                             bool oversubscribe,
-                                             bool remove_from_list,
-                                             orte_proc_t **returnproc);
 ORTE_DECLSPEC int orte_rmaps_base_compute_vpids(orte_job_t *jdata);
 ORTE_DECLSPEC int orte_rmaps_base_compute_local_ranks(orte_job_t *jdata);
+ORTE_DECLSPEC int orte_rmaps_base_compute_bindings(orte_job_t *jdata);
 ORTE_DECLSPEC void orte_rmaps_base_update_local_ranks(orte_job_t *jdata, orte_node_t *oldnode,
                                                       orte_node_t *newnode, orte_proc_t *newproc);
 ORTE_DECLSPEC int orte_rmaps_base_rearrange_map(orte_app_context_t *app, orte_job_map_t *map, opal_list_t *procs);
-ORTE_DECLSPEC int orte_rmaps_base_define_daemons(orte_job_t *jdata);
 ORTE_DECLSPEC int orte_rmaps_base_setup_virtual_machine(orte_job_t *jdata);
-ORTE_DECLSPEC opal_list_item_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list, orte_job_t *jdata);
-ORTE_DECLSPEC int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
-                                             opal_list_t *node_list, orte_vpid_t num_procs,
-                                             opal_list_item_t *cur_node_item);
-ORTE_DECLSPEC int orte_rmaps_base_map_bynode(orte_job_t *jdata, orte_app_context_t *app,
-                                             opal_list_t *node_list, orte_vpid_t num_procs,
-                                             opal_list_item_t *cur_node_item);
 END_C_DECLS
 #endif

View file

@ -1,46 +0,0 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
dist_pkgdata_DATA = help-orte-rmaps-lb.txt
sources = \
rmaps_lb.c \
rmaps_lb.h \
rmaps_lb_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_rmaps_load_balance_DSO
component_noinst =
component_install = mca_rmaps_load_balance.la
else
component_noinst = libmca_rmaps_load_balance.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_rmaps_load_balance_la_SOURCES = $(sources)
mca_rmaps_load_balance_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_rmaps_load_balance_la_SOURCES =$(sources)
libmca_rmaps_load_balance_la_LDFLAGS = -module -avoid-version

View file

@ -1,53 +0,0 @@
# -*- text -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for Open RTE's orterun.
#
[orte-rmaps-rr:alloc-error]
There are not enough slots available in the system to satisfy the %d slots
that were requested by the application:
%s
Either request fewer slots for your application, or make more slots available
for use.
[orte-rmaps-rr:multi-apps-and-zero-np]
RMAPS found multiple applications to be launched, with
at least one that failed to specify the number of processes to execute.
When specifying multiple applications, you must specify how many processes
of each to launch via the -np argument.
[orte-rmaps-rr:per-node-and-too-many-procs]
There are not enough nodes in your allocation to satisfy your request to launch
%d processes on a per-node basis - only %d nodes were available.
Either request fewer processes, or obtain a larger allocation.
[orte-rmaps-rr:n-per-node-and-too-many-procs]
There are not enough nodes in your allocation to satisfy your request to launch
%d processes on a %d per-node basis - only %d nodes with a total of %d slots were available.
Either request fewer processes, or obtain a larger allocation.
[orte-rmaps-rr:n-per-node-and-not-enough-slots]
There are not enough slots on the nodes in your allocation to satisfy your request to launch on a %d process-per-node basis - only %d slots/node were available.
Either request fewer processes/node, or obtain a larger allocation.
[orte-rmaps-rr:no-np-and-user-map]
You have specified a rank-to-node/slot mapping, but failed to provide
the number of processes to be executed. For some reason, this information
could not be obtained from the mapping you provided, so we cannot continue
with executing the specified application.

View file

@ -1,544 +0,0 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/opal_sos.h"
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/base/rmaps_private.h"
#include "orte/mca/rmaps/base/base.h"
#include "rmaps_lb.h"
static int switchyard(orte_job_t *jdata);
orte_rmaps_base_module_t orte_rmaps_load_balance_module = {
switchyard
};
/* Local functions */
static int npernode(orte_job_t *jdata);
static int nperboard(orte_job_t *jdata);
static int npersocket(orte_job_t *jdata);
static int loadbalance(orte_job_t *jdata);
static int switchyard(orte_job_t *jdata)
{
int rc;
mca_base_component_t *c = &mca_rmaps_load_balance_component.super.base_version;
/* only handle initial launch of loadbalanced
* or NPERxxx jobs - allow restarting of failed apps
*/
if (ORTE_JOB_STATE_INIT != jdata->state) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:lb: job %s not in initial state - loadbalance cannot map",
ORTE_JOBID_PRINT(jdata->jobid));
return ORTE_ERR_TAKE_NEXT_OPTION;
}
if (NULL != jdata->map->req_mapper &&
0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
/* a mapper has been specified, and it isn't me */
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:lb: job %s not using loadbalance mapper",
ORTE_JOBID_PRINT(jdata->jobid));
return ORTE_ERR_TAKE_NEXT_OPTION;
}
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:loadbalance: mapping job %s",
ORTE_JOBID_PRINT(jdata->jobid));
/* flag that I did the mapping */
if (NULL != jdata->map->last_mapper) {
free(jdata->map->last_mapper);
}
jdata->map->last_mapper = strdup(c->mca_component_name);
if (0 < mca_rmaps_load_balance_component.npernode ||
0 < jdata->map->npernode) {
rc = npernode(jdata);
} else if (0 < mca_rmaps_load_balance_component.nperboard ||
0 < jdata->map->nperboard) {
rc = nperboard(jdata);
} else if (0 < mca_rmaps_load_balance_component.npersocket ||
0 < jdata->map->npersocket) {
rc = npersocket(jdata);
} else {
rc = loadbalance(jdata);
}
if (ORTE_SUCCESS != rc) {
return rc;
}
/* compute and save local ranks */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* define the daemons that we will use for this job */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(jdata))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
/* place specified #procs on each node, up to the specified total
* number of procs (if one was given).
*/
static int npernode(orte_job_t *jdata)
{
orte_app_context_t *app;
int j, rc=ORTE_SUCCESS;
opal_list_t node_list;
opal_list_item_t *item;
orte_std_cntr_t num_slots;
orte_node_t *node;
int np, nprocs;
int num_nodes;
/* setup the node list */
OBJ_CONSTRUCT(&node_list, opal_list_t);
/* can only have one app_context here */
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
/* use the number of procs if one was given */
if (0 < app->num_procs) {
np = app->num_procs;
} else {
np = INT_MAX;
}
/* for each app_context, we have to get the list of nodes that it can
* use since that can now be modified with a hostfile and/or -host
* option
*/
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
jdata->map->policy))) {
ORTE_ERROR_LOG(rc);
goto error;
}
/* loop through the list of nodes */
num_nodes = opal_list_get_size(&node_list);
nprocs = 0;
while (NULL != (item = opal_list_remove_first(&node_list))) {
node = (orte_node_t*)item;
/* put the specified number of procs on each node */
for (j=0; j < mca_rmaps_load_balance_component.npernode && nprocs < np; j++) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
jdata->map->cpus_per_rank, app->idx,
&node_list, jdata->map->oversubscribe,
false, NULL))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have
* more procs to place, then that is an error
*/
if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc) ||
j < mca_rmaps_load_balance_component.npernode-1) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(node);
goto error;
}
}
nprocs++;
}
OBJ_RELEASE(node);
}
/* if the user requested a specific number of procs and
* the total number of procs we were able to assign
* doesn't equal the number requested, then we have a
* problem
*/
if (0 < app->num_procs && nprocs < app->num_procs) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:too-many-procs", true,
app->app, app->num_procs,
"number of nodes", num_nodes,
"npernode", mca_rmaps_load_balance_component.npernode);
return ORTE_ERR_SILENT;
}
/* update the number of procs in the job */
jdata->num_procs += nprocs;
/* compute vpids and add proc objects to the job - this has to be
* done after each app_context is mapped in order to keep the
* vpids contiguous within an app_context
*/
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
error:
while (NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&node_list);
return rc;
}
static int nperboard(orte_job_t *jdata)
{
orte_app_context_t *app;
int j, k, rc=ORTE_SUCCESS;
opal_list_t node_list;
opal_list_item_t *item;
orte_std_cntr_t num_slots;
orte_node_t *node;
int np, nprocs;
int num_boards=orte_default_num_boards;
/* setup the node list */
OBJ_CONSTRUCT(&node_list, opal_list_t);
/* can only have one app_context here */
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
/* use the number of procs if one was given */
if (0 < app->num_procs) {
np = app->num_procs;
} else {
np = INT_MAX;
}
/* for each app_context, we have to get the list of nodes that it can
* use since that can now be modified with a hostfile and/or -host
* option
*/
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
jdata->map->policy))) {
ORTE_ERROR_LOG(rc);
goto error;
}
/* loop through the list of nodes */
nprocs = 0;
while (NULL != (item = opal_list_remove_first(&node_list))) {
node = (orte_node_t*)item;
num_boards = node->boards;
/* loop through the number of boards in this node */
for (k=0; k < node->boards && nprocs < np; k++) {
/* put the specified number of procs on each board */
for (j=0; j < mca_rmaps_load_balance_component.nperboard && nprocs < np; j++) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
jdata->map->cpus_per_rank, app->idx,
&node_list, jdata->map->oversubscribe,
false, NULL))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have
* more procs to place, then that is an error
*/
if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc) ||
j < mca_rmaps_load_balance_component.nperboard-1) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(node);
goto error;
}
}
nprocs++;
}
}
OBJ_RELEASE(node);
}
/* if the user requested a specific number of procs and
* the total number of procs we were able to assign
* doesn't equal the number requested, then we have a
* problem
*/
if (0 < app->num_procs && nprocs < app->num_procs) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:too-many-procs", true,
app->app, app->num_procs,
"number of boards", num_boards,
"nperboard", mca_rmaps_load_balance_component.nperboard);
return ORTE_ERR_SILENT;
}
/* update the number of procs in the job */
jdata->num_procs += nprocs;
/* compute vpids and add proc objects to the job - this has to be
* done after each app_context is mapped in order to keep the
* vpids contiguous within an app_context
*/
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
error:
while (NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&node_list);
return rc;
}
static int npersocket(orte_job_t *jdata)
{
orte_app_context_t *app;
int j, k, n, rc=ORTE_SUCCESS;
opal_list_t node_list;
opal_list_item_t *item;
orte_std_cntr_t num_slots;
orte_node_t *node;
int np, nprocs;
int num_sockets=orte_default_num_sockets_per_board;
/* setup the node list */
OBJ_CONSTRUCT(&node_list, opal_list_t);
/* can only have one app_context here */
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
/* use the number of procs if one was given */
if (0 < app->num_procs) {
np = app->num_procs;
} else {
np = INT_MAX;
}
/* for each app_context, we have to get the list of nodes that it can
* use since that can now be modified with a hostfile and/or -host
* option
*/
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
jdata->map->policy))) {
ORTE_ERROR_LOG(rc);
goto error;
}
/* loop through the list of nodes */
nprocs = 0;
while (NULL != (item = opal_list_remove_first(&node_list))) {
node = (orte_node_t*)item;
num_sockets = node->sockets_per_board;
/* loop through the number of boards in this node */
for (k=0; k < node->boards && nprocs < np; k++) {
/* loop through the number of sockets/board */
for (n=0; n < node->sockets_per_board && nprocs < np; n++) {
/* put the specified number of procs on each socket */
for (j=0; j < mca_rmaps_load_balance_component.npersocket && nprocs < np; j++) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
jdata->map->cpus_per_rank, app->idx,
&node_list, jdata->map->oversubscribe,
false, NULL))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have
* more procs to place, then that is an error
*/
if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc) ||
j < mca_rmaps_load_balance_component.npersocket-1) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(node);
goto error;
}
}
/* track the number of procs */
nprocs++;
}
}
}
OBJ_RELEASE(node);
}
/* if the user requested a specific number of procs and
* the total number of procs we were able to assign
* doesn't equal the number requested, then we have a
* problem
*/
if (0 < app->num_procs && nprocs < app->num_procs) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:too-many-procs", true,
app->app, app->num_procs,
"number of sockets", num_sockets,
"npersocket", mca_rmaps_load_balance_component.npersocket);
return ORTE_ERR_SILENT;
}
/* update the number of procs in the job */
jdata->num_procs += nprocs;
/* compute vpids and add proc objects to the job - this has to be
* done after each app_context is mapped in order to keep the
* vpids contiguous within an app_context
*/
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
error:
while (NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&node_list);
return rc;
}
/*
* Create a load balanced mapping for the job by assigning a constant #procs/node, with
* leftovers being spread one/node starting from the first node.
*/
static int loadbalance(orte_job_t *jdata)
{
orte_app_context_t *app;
int i, j;
opal_list_t node_list;
orte_std_cntr_t num_nodes, num_slots;
int rc=ORTE_SUCCESS, np, nprocs;
int ppn = 0;
opal_list_item_t *item, *start;
orte_node_t *node;
/* setup */
OBJ_CONSTRUCT(&node_list, opal_list_t);
/* compute total #procs we are going to add and the total number of nodes available */
for(i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
/* get the nodes and #slots available for this app_context */
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
jdata->map->policy))) {
ORTE_ERROR_LOG(rc);
goto error;
}
if (0 < app->num_procs) {
np = app->num_procs;
} else {
/* set the num_procs to the #slots */
np = num_slots;
}
num_nodes = opal_list_get_size(&node_list);
/* compute the base ppn */
ppn = np / num_nodes;
/* if a bookmark exists from some prior mapping, set us to start there */
start = orte_rmaps_base_get_starting_point(&node_list, jdata);
/* loop through the list of nodes until we either assign all the procs
* or return to the starting point
*/
item = start;
nprocs = 0;
do {
node = (orte_node_t*)item;
/* put the specified number of procs on each node */
for (j=0; j < ppn; j++) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
jdata->map->cpus_per_rank, app->idx,
&node_list, jdata->map->oversubscribe,
false, NULL))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have
* more procs to place, then that is an error
*/
if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc) ||
j < ppn-1) {
ORTE_ERROR_LOG(rc);
goto error;
}
}
nprocs++;
}
/* move to next node */
if (opal_list_get_end(&node_list) == opal_list_get_next(item)) {
item = opal_list_get_first(&node_list);
}
else {
item = opal_list_get_next(item);
}
} while (item != start && nprocs < np);
/* save the bookmark */
jdata->bookmark = node;
/* if we haven't assigned all the procs, then loop through the list
* again, assigning 1 per node until all are assigned
*/
item = start;
while (nprocs < np) {
node = (orte_node_t*)item;
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
jdata->map->cpus_per_rank, app->idx,
&node_list, jdata->map->oversubscribe,
false, NULL))) {
/* if the code is not ORTE_ERR_NODE_FULLY_USED, then that is an error */
if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc)) {
ORTE_ERROR_LOG(rc);
goto error;
}
}
nprocs++;
/* move to next node */
if (opal_list_get_end(&node_list) == opal_list_get_next(item)) {
item = opal_list_get_first(&node_list);
}
else {
item = opal_list_get_next(item);
}
}
/* save the bookmark */
jdata->bookmark = node;
/* cleanup */
while (NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
}
/* if the user requested a specific number of procs and
* the total number of procs we were able to assign
* doesn't equal the number requested, then we have a
* problem
*/
if (0 < app->num_procs && nprocs < app->num_procs) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:too-many-procs", true,
app->app, app->num_procs,
"number of slots", nprocs,
"number of nodes", num_nodes);
return ORTE_ERR_SILENT;
}
/* update the number of procs in the job */
jdata->num_procs += nprocs;
/* compute vpids and add proc objects to the job - this has to be
* done after each app_context is mapped in order to keep the
* vpids contiguous within an app_context
*/
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
error:
while(NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&node_list);
return rc;
}
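
For reference, the balancing arithmetic this mapper relied on: a base of np/num_nodes procs per node, with the remainder spread one per node from the start of the scan (the routine above actually resumes from its bookmark). A standalone sketch with assumed example values, not ORTE code:

#include <stdio.h>

/* Example: np=10 over 4 nodes -> ppn=2, leftover=2 -> counts {3,3,2,2},
 * assuming the leftover scan starts at node 0.
 */
int main(void)
{
    int np = 10, num_nodes = 4;
    int ppn = np / num_nodes;        /* base count per node */
    int leftover = np % num_nodes;   /* extras spread one per node */
    for (int n = 0; n < num_nodes; n++) {
        printf("node %d gets %d procs\n", n, ppn + (n < leftover ? 1 : 0));
    }
    return 0;
}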

View file

@ -1,45 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Resource Mapping
*/
#ifndef ORTE_RMAPS_LB_H
#define ORTE_RMAPS_LB_H
#include "orte_config.h"
#include "orte/mca/rmaps/rmaps.h"
BEGIN_C_DECLS
struct orte_rmaps_lb_component_t {
orte_rmaps_base_component_t super;
int npernode;
int nperboard;
int npersocket;
};
typedef struct orte_rmaps_lb_component_t orte_rmaps_lb_component_t;
ORTE_MODULE_DECLSPEC extern orte_rmaps_lb_component_t mca_rmaps_load_balance_component;
extern orte_rmaps_base_module_t orte_rmaps_load_balance_module;
END_C_DECLS
#endif

View file

@ -1,143 +0,0 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/paffinity/paffinity.h"
#include "orte/mca/rmaps/base/base.h"
#include "rmaps_lb.h"
/*
* Local functions
*/
static int orte_rmaps_lb_open(void);
static int orte_rmaps_lb_close(void);
static int orte_rmaps_lb_query(mca_base_module_t **module, int *priority);
static int my_priority;
orte_rmaps_lb_component_t mca_rmaps_load_balance_component = {
{
{
ORTE_RMAPS_BASE_VERSION_2_0_0,
"load_balance", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_rmaps_lb_open, /* component open */
orte_rmaps_lb_close, /* component close */
orte_rmaps_lb_query /* component query */
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
}
};
/**
* component open/close/init function
*/
static int orte_rmaps_lb_open(void)
{
mca_base_component_t *c = &mca_rmaps_load_balance_component.super.base_version;
int value, tmp;
/* initialize */
mca_rmaps_load_balance_component.npernode = 0;
mca_rmaps_load_balance_component.nperboard = 0;
mca_rmaps_load_balance_component.npersocket = 0;
mca_base_param_reg_int(c, "priority",
"Priority of the loadbalance rmaps component",
false, false, 80,
&my_priority);
/* check for procs/xxx directives */
tmp = mca_base_param_reg_int(c, "pernode",
"Launch one ppn as directed",
false, false, (int)false, NULL);
mca_base_param_reg_syn_name(tmp, "rmaps", "base_pernode", false);
mca_base_param_lookup_int(tmp, &value);
if (value) {
mca_rmaps_load_balance_component.npernode = 1;
}
/* #procs/node */
tmp = mca_base_param_reg_int(c, "n_pernode",
"Launch n procs/node",
false, false, mca_rmaps_load_balance_component.npernode, NULL);
mca_base_param_reg_syn_name(tmp, "rmaps", "base_n_pernode", false);
mca_base_param_lookup_int(tmp, &mca_rmaps_load_balance_component.npernode);
/* #procs/board */
tmp = mca_base_param_reg_int(c, "n_perboard",
"Launch n procs/board",
false, false, -1, NULL);
mca_base_param_reg_syn_name(tmp, "rmaps", "base_n_perboard", false);
mca_base_param_lookup_int(tmp, &mca_rmaps_load_balance_component.nperboard);
if (0 < mca_rmaps_load_balance_component.nperboard) {
ORTE_ADD_MAPPING_POLICY(ORTE_MAPPING_NPERXXX);
}
/* #procs/socket */
tmp = mca_base_param_reg_int(c, "n_persocket",
"Launch n procs/socket",
false, false, -1, NULL);
mca_base_param_reg_syn_name(tmp, "rmaps", "base_n_persocket", false);
mca_base_param_lookup_int(tmp, &mca_rmaps_load_balance_component.npersocket);
if (0 < mca_rmaps_load_balance_component.npersocket) {
ORTE_ADD_MAPPING_POLICY(ORTE_MAPPING_NPERXXX);
/* force bind to socket if not overridden by user */
ORTE_XSET_BINDING_POLICY(ORTE_BIND_TO_SOCKET);
}
return ORTE_SUCCESS;
}
static int orte_rmaps_lb_query(mca_base_module_t **module, int *priority)
{
/* after rr, unless lb values are set */
if (0 < mca_rmaps_load_balance_component.npernode ||
0 < mca_rmaps_load_balance_component.nperboard ||
0 < mca_rmaps_load_balance_component.npersocket) {
my_priority = 10000;
}
*priority = my_priority;
*module = (mca_base_module_t *)&orte_rmaps_load_balance_module;
return ORTE_SUCCESS;
}
/**
* Close all subsystems.
*/
static int orte_rmaps_lb_close(void)
{
return ORTE_SUCCESS;
}

View file

@ -1,26 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_orte_rmaps_ppr_CONFIG([action-if-found], [action-if-not-found])
# -------------------------------------------------------------------------
AC_DEFUN([MCA_orte_rmaps_ppr_CONFIG],[
AC_REQUIRE([MCA_opal_hwloc_CONFIG_REQUIRE])
AC_CONFIG_FILES([orte/mca/rmaps/ppr/Makefile])
# All we check for is whether $OPAL_HAVE_HWLOC is 1.
# See big comment in opal/mca/hwloc/configure.m4.
AC_MSG_CHECKING([if hwloc is enabled])
AS_IF([test $OPAL_HAVE_HWLOC -eq 1],
[AC_MSG_RESULT([yes])
$1],
[AC_MSG_RESULT([no])
$2])
])dnl

View file

@ -1,23 +1,12 @@
 # -*- text -*-
 #
-# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
-#                         University Research and Technology
-#                         Corporation. All rights reserved.
-# Copyright (c) 2004-2005 The University of Tennessee and The University
-#                         of Tennessee Research Foundation. All rights
-#                         reserved.
-# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
-#                         University of Stuttgart. All rights reserved.
-# Copyright (c) 2004-2005 The Regents of the University of California.
-#                         All rights reserved.
+# Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
 # $COPYRIGHT$
 #
 # Additional copyrights may follow
 #
 # $HEADER$
 #
-# This is the US/English general help file for Open RTE's orterun.
-#
 #
 [invalid-ppr]
 An invalid value was given for the number of processes

View file

@ -29,37 +29,48 @@
#include "orte/mca/rmaps/base/base.h" #include "orte/mca/rmaps/base/base.h"
#include "rmaps_ppr.h" #include "rmaps_ppr.h"
-static int ppr(orte_job_t *jdata);
+static int ppr_mapper(orte_job_t *jdata);
 orte_rmaps_base_module_t orte_rmaps_ppr_module = {
-    ppr
+    ppr_mapper
 };
-static orte_proc_t* setup_proc(orte_job_t *jdata, orte_node_t *node,
+static orte_proc_t* setup_proc(orte_job_t *jdata,
+                               orte_node_t *node,
                                orte_app_idx_t idx);
+#if OPAL_HAVE_HWLOC
 static void prune(orte_jobid_t jobid,
                   orte_app_idx_t app_idx,
                   orte_node_t *node,
                   opal_hwloc_level_t *level,
                   orte_vpid_t *nmapped);
+#endif
-static int ppr(orte_job_t *jdata)
+static int ppr[OPAL_HWLOC_HWTHREAD_LEVEL+1];
+
+static int ppr_mapper(orte_job_t *jdata)
 {
-    int rc, local_limit, j;
-    orte_rmaps_ppr_component_t *c = &mca_rmaps_ppr_component;
+    int rc, j, n;
+    mca_base_component_t *c=&mca_rmaps_ppr_component.base_version;
     orte_node_t *node;
     orte_proc_t *proc;
     orte_app_context_t *app;
     orte_vpid_t total_procs, nprocs_mapped;
+    opal_hwloc_level_t level, start=OPAL_HWLOC_NODE_LEVEL;
+#if OPAL_HAVE_HWLOC
     hwloc_obj_t obj;
     hwloc_obj_type_t lowest;
-    opal_hwloc_level_t level;
-    unsigned cache_level;
+    unsigned cache_level=0;
+    unsigned int nobjs, i;
+#endif
     opal_list_t node_list;
     opal_list_item_t *item;
     orte_std_cntr_t num_slots;
-    unsigned int nobjs, i;
     orte_app_idx_t idx;
+    char **ppr_req, **ck;
+    size_t len;
+    bool pruning_reqd = false;
/* only handle initial launch of loadbalanced /* only handle initial launch of loadbalanced
* or NPERxxx jobs - allow restarting of failed apps * or NPERxxx jobs - allow restarting of failed apps
@ -71,37 +82,138 @@ static int ppr(orte_job_t *jdata)
return ORTE_ERR_TAKE_NEXT_OPTION; return ORTE_ERR_TAKE_NEXT_OPTION;
} }
if (NULL != jdata->map->req_mapper && if (NULL != jdata->map->req_mapper &&
0 != strcasecmp(jdata->map->req_mapper, c->super.base_version.mca_component_name)) { 0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
/* a mapper has been specified, and it isn't me */ /* a mapper has been specified, and it isn't me */
opal_output_verbose(5, orte_rmaps_base.rmaps_output, opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:ppr: job %s not using ppr mapper", "mca:rmaps:ppr: job %s not using ppr mapper",
ORTE_JOBID_PRINT(jdata->jobid)); ORTE_JOBID_PRINT(jdata->jobid));
return ORTE_ERR_TAKE_NEXT_OPTION; return ORTE_ERR_TAKE_NEXT_OPTION;
} }
if (NULL == jdata->map->ppr ||
!(ORTE_MAPPING_PPR & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping))) {
/* not for us */
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:ppr: job %s not using ppr mapper",
ORTE_JOBID_PRINT(jdata->jobid));
return ORTE_ERR_TAKE_NEXT_OPTION;
}
opal_output_verbose(5, orte_rmaps_base.rmaps_output, opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:ppr: mapping job %s", "mca:rmaps:ppr: mapping job %s with ppr %s",
ORTE_JOBID_PRINT(jdata->jobid)); ORTE_JOBID_PRINT(jdata->jobid), jdata->map->ppr);
/* flag that I did the mapping */ /* flag that I did the mapping */
if (NULL != jdata->map->last_mapper) { if (NULL != jdata->map->last_mapper) {
free(jdata->map->last_mapper); free(jdata->map->last_mapper);
} }
jdata->map->last_mapper = strdup(c->super.base_version.mca_component_name); jdata->map->last_mapper = strdup(c->mca_component_name);
/* initialize */
memset(ppr, 0, OPAL_HWLOC_HWTHREAD_LEVEL * sizeof(opal_hwloc_level_t));
/* parse option */
n=0;
ppr_req = opal_argv_split(jdata->map->ppr, ',');
for (j=0; NULL != ppr_req[j]; j++) {
/* split on the colon */
ck = opal_argv_split(ppr_req[j], ':');
if (2 != opal_argv_count(ck)) {
/* must provide a specification */
orte_show_help("help-orte-rmaps-ppr.txt", "invalid-ppr", true, jdata->map->ppr);
opal_argv_free(ppr_req);
opal_argv_free(ck);
return ORTE_ERR_SILENT;
}
len = strlen(ck[1]);
if (0 == strncasecmp(ck[1], "node", len)) {
ppr[OPAL_HWLOC_NODE_LEVEL] = strtol(ck[0], NULL, 10);
ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYNODE);
start = OPAL_HWLOC_NODE_LEVEL;
n++;
#if OPAL_HAVE_HWLOC
} else if (0 == strncasecmp(ck[1], "hwthread", len) ||
0 == strncasecmp(ck[1], "thread", len)) {
ppr[OPAL_HWLOC_HWTHREAD_LEVEL] = strtol(ck[0], NULL, 10);
start = OPAL_HWLOC_HWTHREAD_LEVEL;
ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYHWTHREAD);
n++;
} else if (0 == strncasecmp(ck[1], "core", len)) {
ppr[OPAL_HWLOC_CORE_LEVEL] = strtol(ck[0], NULL, 10);
if (start < OPAL_HWLOC_CORE_LEVEL) {
start = OPAL_HWLOC_CORE_LEVEL;
ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYCORE);
}
n++;
} else if (0 == strncasecmp(ck[1], "socket", len) ||
0 == strncasecmp(ck[1], "skt", len)) {
ppr[OPAL_HWLOC_SOCKET_LEVEL] = strtol(ck[0], NULL, 10);
if (start < OPAL_HWLOC_SOCKET_LEVEL) {
start = OPAL_HWLOC_SOCKET_LEVEL;
ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSOCKET);
}
n++;
} else if (0 == strncasecmp(ck[1], "l1cache", len)) {
ppr[OPAL_HWLOC_L1CACHE_LEVEL] = strtol(ck[0], NULL, 10);
if (start < OPAL_HWLOC_L1CACHE_LEVEL) {
start = OPAL_HWLOC_L1CACHE_LEVEL;
cache_level = 1;
ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL1CACHE);
}
n++;
} else if (0 == strncasecmp(ck[1], "l2cache", len)) {
ppr[OPAL_HWLOC_L2CACHE_LEVEL] = strtol(ck[0], NULL, 10);
if (start < OPAL_HWLOC_L2CACHE_LEVEL) {
start = OPAL_HWLOC_L2CACHE_LEVEL;
cache_level = 2;
ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL2CACHE);
}
n++;
} else if (0 == strncasecmp(ck[1], "l3cache", len)) {
ppr[OPAL_HWLOC_L3CACHE_LEVEL] = strtol(ck[0], NULL, 10);
if (start < OPAL_HWLOC_L3CACHE_LEVEL) {
start = OPAL_HWLOC_L3CACHE_LEVEL;
cache_level = 3;
ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL3CACHE);
}
n++;
} else if (0 == strncasecmp(ck[1], "numa", len)) {
ppr[OPAL_HWLOC_NUMA_LEVEL] = strtol(ck[0], NULL, 10);
if (start < OPAL_HWLOC_NUMA_LEVEL) {
start = OPAL_HWLOC_NUMA_LEVEL;
ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYNUMA);
}
n++;
#endif
} else {
/* unknown spec */
orte_show_help("help-orte-rmaps-ppr.txt", "unrecognized-ppr-option", true, ck[1], jdata->map->ppr);
opal_argv_free(ppr_req);
opal_argv_free(ck);
return ORTE_ERR_SILENT;
}
opal_argv_free(ck);
}
opal_argv_free(ppr_req);
/* if nothing was given, that's an error */
if (0 == n) {
opal_output(0, "NOTHING GIVEN");
return ORTE_ERR_SILENT;
}
/* if more than one level was specified, then pruning will be reqd */
if (1 < n) {
pruning_reqd = true;
}
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:ppr: job %s assigned policy %s",
ORTE_JOBID_PRINT(jdata->jobid),
orte_rmaps_base_print_mapping(jdata->map->mapping));
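For illustration only, here is a self-contained sketch of the kind of parsing the loop above performs on a pattern such as "2:socket,8:core". It uses plain strtok_r/strtol instead of the opal_argv helpers, and it simply prints each count/resource pair rather than setting mapping directives:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* parse "count:resource" pairs separated by commas, e.g. "2:socket,8:core" */
    static void parse_ppr(const char *spec)
    {
        char *copy = strdup(spec), *save = NULL;
        for (char *tok = strtok_r(copy, ",", &save); tok != NULL;
             tok = strtok_r(NULL, ",", &save)) {
            char *colon = strchr(tok, ':');
            if (NULL == colon) {
                /* must provide a "count:resource" specification */
                fprintf(stderr, "invalid ppr element: %s\n", tok);
                continue;
            }
            *colon = '\0';
            long count = strtol(tok, NULL, 10);
            printf("place %ld proc(s) per %s\n", count, colon + 1);
        }
        free(copy);
    }

    int main(void)
    {
        parse_ppr("2:socket,8:core");
        return 0;
    }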
     /* convenience */
-    local_limit = mca_rmaps_ppr_component.ppr[mca_rmaps_ppr_component.start];
-    level = mca_rmaps_ppr_component.start;
-
-    /* find the lowest level that was defined in the ppr */
-    lowest = opal_hwloc_levels[mca_rmaps_ppr_component.start];
-    if (OPAL_HWLOC_L3CACHE_LEVEL == mca_rmaps_ppr_component.start) {
-        cache_level = 3;
-    } else if (OPAL_HWLOC_L2CACHE_LEVEL == mca_rmaps_ppr_component.start) {
-        cache_level = 2;
-    } else if (OPAL_HWLOC_L1CACHE_LEVEL == mca_rmaps_ppr_component.start) {
-        cache_level = 1;
-    }
+    level = start;
+#if OPAL_HAVE_HWLOC
+    lowest = opal_hwloc_levels[start];
+#endif
for (idx=0; idx < (orte_app_idx_t)jdata->apps->size; idx++) { for (idx=0; idx < (orte_app_idx_t)jdata->apps->size; idx++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx))) { if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx))) {
@ -121,7 +233,7 @@ static int ppr(orte_job_t *jdata)
/* get the available nodes */ /* get the available nodes */
OBJ_CONSTRUCT(&node_list, opal_list_t); OBJ_CONSTRUCT(&node_list, opal_list_t);
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app, if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
jdata->map->policy))) { jdata->map->mapping))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
goto error; goto error;
} }
@ -129,6 +241,7 @@ static int ppr(orte_job_t *jdata)
/* cycle across the nodes */ /* cycle across the nodes */
nprocs_mapped = 0; nprocs_mapped = 0;
while (NULL != (node = (orte_node_t*)opal_list_remove_first(&node_list))) { while (NULL != (node = (orte_node_t*)opal_list_remove_first(&node_list))) {
#if OPAL_HAVE_HWLOC
/* bozo check */ /* bozo check */
if (NULL == node->topology) { if (NULL == node->topology) {
orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing", orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing",
@ -136,6 +249,7 @@ static int ppr(orte_job_t *jdata)
rc = ORTE_ERR_SILENT; rc = ORTE_ERR_SILENT;
goto error; goto error;
} }
#endif
/* add the node to the map */ /* add the node to the map */
if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) { if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
@ -146,15 +260,21 @@ static int ppr(orte_job_t *jdata)
/* if we are mapping solely at the node level, just put /* if we are mapping solely at the node level, just put
* that many procs on this node * that many procs on this node
*/ */
-        if (HWLOC_OBJ_MACHINE == lowest) {
-            for (j=0; j < local_limit && nprocs_mapped < total_procs; j++) {
+        if (OPAL_HWLOC_NODE_LEVEL == start) {
+#if OPAL_HAVE_HWLOC
+            obj = hwloc_get_root_obj(node->topology);
+#endif
+            for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) {
if (NULL == (proc = setup_proc(jdata, node, idx))) { if (NULL == (proc = setup_proc(jdata, node, idx))) {
rc = ORTE_ERR_OUT_OF_RESOURCE; rc = ORTE_ERR_OUT_OF_RESOURCE;
goto error; goto error;
} }
nprocs_mapped++; nprocs_mapped++;
#if OPAL_HAVE_HWLOC
proc->locale = obj; proc->locale = obj;
#endif
} }
#if OPAL_HAVE_HWLOC
} else { } else {
/* get the number of lowest resources on this node */ /* get the number of lowest resources on this node */
nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology, nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology,
@ -168,7 +288,7 @@ static int ppr(orte_job_t *jdata)
obj = opal_hwloc_base_get_obj_by_type(node->topology, obj = opal_hwloc_base_get_obj_by_type(node->topology,
lowest, cache_level, lowest, cache_level,
i, OPAL_HWLOC_AVAILABLE); i, OPAL_HWLOC_AVAILABLE);
for (j=0; j < local_limit && nprocs_mapped < total_procs; j++) { for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) {
if (NULL == (proc = setup_proc(jdata, node, idx))) { if (NULL == (proc = setup_proc(jdata, node, idx))) {
rc = ORTE_ERR_OUT_OF_RESOURCE; rc = ORTE_ERR_OUT_OF_RESOURCE;
goto error; goto error;
@ -178,7 +298,7 @@ static int ppr(orte_job_t *jdata)
} }
} }
if (mca_rmaps_ppr_component.pruning_reqd) { if (pruning_reqd) {
/* go up the ladder and prune the procs according to /* go up the ladder and prune the procs according to
* the specification, adjusting the count of procs on the * the specification, adjusting the count of procs on the
* node as we go * node as we go
@ -186,6 +306,7 @@ static int ppr(orte_job_t *jdata)
level--; level--;
prune(jdata->jobid, idx, node, &level, &nprocs_mapped); prune(jdata->jobid, idx, node, &level, &nprocs_mapped);
} }
#endif
} }
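The loops above place ppr[start] processes on each object of the lowest specified level, node by node, until the job's total process count is reached. A toy standalone model of that placement arithmetic, with all counts invented for the example:

    #include <stdio.h>

    /* toy model: place "per_obj" procs on each of "nobjs" objects on every node,
     * stopping once "total" procs have been mapped (all names are illustrative) */
    int main(void)
    {
        int total = 10, per_obj = 2, nobjs = 2, nnodes = 4, mapped = 0;
        for (int node = 0; node < nnodes && mapped < total; node++) {
            for (int obj = 0; obj < nobjs && mapped < total; obj++) {
                for (int j = 0; j < per_obj && mapped < total; j++, mapped++) {
                    printf("rank %d -> node %d, object %d\n", mapped, node, obj);
                }
            }
        }
        return 0;
    }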
/* set the total slots used to the number of procs placed /* set the total slots used to the number of procs placed
@ -197,12 +318,18 @@ static int ppr(orte_job_t *jdata)
* we have violated the total slot specification - regardless, * we have violated the total slot specification - regardless,
* if slots_max was given, we are not allowed to violate it! * if slots_max was given, we are not allowed to violate it!
*/ */
-        if ((!(jdata->map->oversubscribe) && node->slots < node->slots_inuse) ||
+        if ((node->slots < node->slots_inuse) ||
             (0 < node->slots_max && node->slots_max < node->slots_inuse)) {
-            orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
-                           true, node->num_procs, app->app);
-            rc = ORTE_ERR_SILENT;
-            goto error;
+            if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
+                orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
+                               true, node->num_procs, app->app);
+                rc = ORTE_ERR_SILENT;
+                goto error;
+            }
+            /* flag the node as oversubscribed so that sched-yield gets
+             * properly set
+             */
+            node->oversubscribed = true;
         }
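The oversubscription handling above only aborts when the NO_OVERSUBSCRIBE directive is in force; otherwise it just flags the node so sched_yield can be enabled later. A simplified standalone sketch of that decision, with invented field names rather than the real orte_node_t layout:

    #include <stdbool.h>
    #include <stdio.h>

    /* sketch of the oversubscription decision above, with made-up field names */
    struct toy_node { int slots, slots_inuse; bool oversubscribed; };

    /* returns false if the mapping must be rejected */
    static bool check_oversubscribe(struct toy_node *n, bool allow_oversubscribe)
    {
        if (n->slots_inuse > n->slots) {
            if (!allow_oversubscribe) {
                fprintf(stderr, "alloc error: %d procs > %d slots\n",
                        n->slots_inuse, n->slots);
                return false;
            }
            /* allowed, but remember it so sched_yield can be enabled later */
            n->oversubscribed = true;
        }
        return true;
    }

    int main(void)
    {
        struct toy_node n = { .slots = 4, .slots_inuse = 6, .oversubscribed = false };
        printf("accepted: %d, oversubscribed: %d\n",
               check_oversubscribe(&n, true), n.oversubscribed);
        return 0;
    }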
/* update the number of procs in the job and the app */ /* update the number of procs in the job and the app */
@ -216,10 +343,10 @@ static int ppr(orte_job_t *jdata)
break; break;
} }
} }
if (nprocs_mapped < total_procs) { if (ORTE_VPID_MAX != total_procs && nprocs_mapped < total_procs) {
/* couldn't map them all */ /* couldn't map them all */
orte_show_help("help-orte-rmaps-ppr.txt", "ppr-too-many-procs", orte_show_help("help-orte-rmaps-ppr.txt", "ppr-too-many-procs",
true, app->app, app->num_procs, mca_rmaps_ppr_component.given_ppr); true, app->app, app->num_procs, jdata->map->ppr);
rc = ORTE_ERR_SILENT; rc = ORTE_ERR_SILENT;
goto error; goto error;
} }
@ -231,17 +358,6 @@ static int ppr(orte_job_t *jdata)
} }
/* compute and save local ranks */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
ORTE_ERROR_LOG(rc);
goto error;
}
/* define the daemons that we will use for this job */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(jdata))) {
ORTE_ERROR_LOG(rc);
}
error: error:
while (NULL != (item = opal_list_remove_first(&node_list))) { while (NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item); OBJ_RELEASE(item);
@ -250,6 +366,7 @@ static int ppr(orte_job_t *jdata)
return rc; return rc;
} }
#if OPAL_HAVE_HWLOC
static hwloc_obj_t find_split(hwloc_topology_t topo, hwloc_obj_t obj) static hwloc_obj_t find_split(hwloc_topology_t topo, hwloc_obj_t obj)
{ {
unsigned k; unsigned k;
@ -296,7 +413,7 @@ static void prune(orte_jobid_t jobid,
/* convenience */ /* convenience */
lvl = opal_hwloc_levels[ll]; lvl = opal_hwloc_levels[ll];
limit = mca_rmaps_ppr_component.ppr[ll]; limit = ppr[ll];
if (0 == limit) { if (0 == limit) {
/* no limit at this level, so move up if necessary */ /* no limit at this level, so move up if necessary */
@ -440,6 +557,7 @@ static void prune(orte_jobid_t jobid,
error: error:
opal_output(0, "INFINITE LOOP"); opal_output(0, "INFINITE LOOP");
} }
#endif
static orte_proc_t* setup_proc(orte_job_t *jdata, static orte_proc_t* setup_proc(orte_job_t *jdata,
orte_node_t *node, orte_node_t *node,


@ -18,17 +18,7 @@
BEGIN_C_DECLS BEGIN_C_DECLS
-struct orte_rmaps_ppr_component_t {
-    orte_rmaps_base_component_t super;
-    char *given_ppr;
-    bool selected;
-    bool pruning_reqd;
-    int ppr[OPAL_HWLOC_HWTHREAD_LEVEL];
-    opal_hwloc_level_t start;
-};
-typedef struct orte_rmaps_ppr_component_t orte_rmaps_ppr_component_t;
-
-ORTE_MODULE_DECLSPEC extern orte_rmaps_ppr_component_t mca_rmaps_ppr_component;
+ORTE_MODULE_DECLSPEC extern orte_rmaps_base_component_t mca_rmaps_ppr_component;
extern orte_rmaps_base_module_t orte_rmaps_ppr_module; extern orte_rmaps_base_module_t orte_rmaps_ppr_module;


@ -26,23 +26,21 @@ static int orte_rmaps_ppr_open(void);
static int orte_rmaps_ppr_close(void); static int orte_rmaps_ppr_close(void);
static int orte_rmaps_ppr_query(mca_base_module_t **module, int *priority); static int orte_rmaps_ppr_query(mca_base_module_t **module, int *priority);
orte_rmaps_ppr_component_t mca_rmaps_ppr_component = { orte_rmaps_base_component_t mca_rmaps_ppr_component = {
{ {
{ ORTE_RMAPS_BASE_VERSION_2_0_0,
ORTE_RMAPS_BASE_VERSION_2_0_0,
"ppr", /* MCA component name */ "ppr", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */ ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */ ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */ ORTE_RELEASE_VERSION, /* MCA component release version */
orte_rmaps_ppr_open, /* component open */ orte_rmaps_ppr_open, /* component open */
orte_rmaps_ppr_close, /* component close */ orte_rmaps_ppr_close, /* component close */
orte_rmaps_ppr_query /* component query */ orte_rmaps_ppr_query /* component query */
}, },
{ {
/* The component is checkpoint ready */ /* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT MCA_BASE_METADATA_PARAM_CHECKPOINT
}
} }
}; };
@ -52,101 +50,104 @@ orte_rmaps_ppr_component_t mca_rmaps_ppr_component = {
*/ */
static int orte_rmaps_ppr_open(void) static int orte_rmaps_ppr_open(void)
{ {
char **ppr, *ctmp, **ck; int tmp, value;
int i, n; mca_base_component_t *c=&mca_rmaps_ppr_component.base_version;
size_t value;
opal_hwloc_level_t start=OPAL_HWLOC_NODE_LEVEL;
/* initialize */ /* check for pernode, npernode, and npersocket directives - reqd for backward compatibility */
mca_rmaps_ppr_component.selected = false; tmp = mca_base_param_reg_int(c, "pernode",
mca_rmaps_ppr_component.pruning_reqd = false; "Launch one ppn as directed",
memset(mca_rmaps_ppr_component.ppr, 0, OPAL_HWLOC_HWTHREAD_LEVEL * sizeof(opal_hwloc_level_t)); false, false, (int)false, NULL);
n=0; mca_base_param_reg_syn_name(tmp, "rmaps", "base_pernode", false);
mca_base_param_lookup_int(tmp, &value);
mca_base_param_reg_string(&mca_rmaps_ppr_component.super.base_version, if (value) {
"pattern", if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
"Comma-separate list of number of processes on a given resource type [default: none]", /* if a non-default mapping is already specified, then we
false, false, NULL, &mca_rmaps_ppr_component.given_ppr); * have an error
ctmp = mca_rmaps_ppr_component.given_ppr; */
if (NULL != ctmp) { orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
ppr = opal_argv_split(ctmp, ','); "PERNODE", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_CONFLICTED);
/* check validity of mppr spec */ return ORTE_ERR_SILENT;
for (i=0; NULL != ppr[i]; i++) {
/* split on the colon */
ck = opal_argv_split(ppr[i], ':');
if (2 != opal_argv_count(ck)) {
/* must provide a specification */
orte_show_help("help-orte-rmaps-ppr.txt", "invalid-ppr", true, ctmp);
opal_argv_free(ppr);
opal_argv_free(ck);
free(ctmp);
return ORTE_ERR_SILENT;
}
value = strlen(ck[1]);
if (0 == strncasecmp(ck[1], "hwthread", value) ||
0 == strncasecmp(ck[1], "thread", value)) {
mca_rmaps_ppr_component.ppr[OPAL_HWLOC_HWTHREAD_LEVEL] = strtol(ck[0], NULL, 10);
start = OPAL_HWLOC_HWTHREAD_LEVEL;
n++;
} else if (0 == strncasecmp(ck[1], "core", value)) {
mca_rmaps_ppr_component.ppr[OPAL_HWLOC_CORE_LEVEL] = strtol(ck[0], NULL, 10);
if (start < OPAL_HWLOC_CORE_LEVEL) {
start = OPAL_HWLOC_CORE_LEVEL;
}
n++;
} else if (0 == strncasecmp(ck[1], "socket", value) ||
0 == strncasecmp(ck[1], "skt", value)) {
mca_rmaps_ppr_component.ppr[OPAL_HWLOC_SOCKET_LEVEL] = strtol(ck[0], NULL, 10);
if (start < OPAL_HWLOC_SOCKET_LEVEL) {
start = OPAL_HWLOC_SOCKET_LEVEL;
}
n++;
} else if (0 == strncasecmp(ck[1], "l1cache", value)) {
mca_rmaps_ppr_component.ppr[OPAL_HWLOC_L1CACHE_LEVEL] = strtol(ck[0], NULL, 10);
if (start < OPAL_HWLOC_L1CACHE_LEVEL) {
start = OPAL_HWLOC_L1CACHE_LEVEL;
}
n++;
} else if (0 == strncasecmp(ck[1], "l2cache", value)) {
mca_rmaps_ppr_component.ppr[OPAL_HWLOC_L2CACHE_LEVEL] = strtol(ck[0], NULL, 10);
if (start < OPAL_HWLOC_L2CACHE_LEVEL) {
start = OPAL_HWLOC_L2CACHE_LEVEL;
}
n++;
} else if (0 == strncasecmp(ck[1], "l3cache", value)) {
mca_rmaps_ppr_component.ppr[OPAL_HWLOC_L3CACHE_LEVEL] = strtol(ck[0], NULL, 10);
if (start < OPAL_HWLOC_L3CACHE_LEVEL) {
start = OPAL_HWLOC_L3CACHE_LEVEL;
}
n++;
} else if (0 == strncasecmp(ck[1], "numa", value)) {
mca_rmaps_ppr_component.ppr[OPAL_HWLOC_NUMA_LEVEL] = strtol(ck[0], NULL, 10);
if (start < OPAL_HWLOC_NUMA_LEVEL) {
start = OPAL_HWLOC_NUMA_LEVEL;
}
n++;
} else if (0 == strncasecmp(ck[1], "node", value)) {
mca_rmaps_ppr_component.ppr[OPAL_HWLOC_NODE_LEVEL] = strtol(ck[0], NULL, 10);
n++;
} else {
/* unknown spec */
orte_show_help("help-orte-rmaps-ppr.txt", "unrecognized-ppr-option", true, ck[1], ctmp);
opal_argv_free(ppr);
opal_argv_free(ck);
free(ctmp);
return ORTE_ERR_SILENT;
}
opal_argv_free(ck);
} }
opal_argv_free(ppr); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_PPR);
mca_rmaps_ppr_component.selected = true; ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYNODE);
mca_rmaps_ppr_component.start = start; orte_rmaps_base.ppr = strdup("1:node");
/* if more than one level was specified, then pruning will be reqd */ ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
if (1 < n) { }
mca_rmaps_ppr_component.pruning_reqd = true;
tmp = mca_base_param_reg_int(c, "n_pernode",
"Launch n procs/node",
false, false, (int)false, NULL);
mca_base_param_reg_syn_name(tmp, "rmaps", "base_n_pernode", false);
mca_base_param_lookup_int(tmp, &value);
if (value) {
if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
/* if a non-default mapping is already specified, then we
* have an error
*/
orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
"NPERNODE", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_CONFLICTED);
return ORTE_ERR_SILENT;
}
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_PPR);
ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYNODE);
asprintf(&orte_rmaps_base.ppr, "%d:node", value);
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
}
#if OPAL_HAVE_HWLOC
{
char *ppr;
tmp = mca_base_param_reg_int(c, "n_persocket",
"Launch n procs/socket",
false, false, (int)false, NULL);
mca_base_param_reg_syn_name(tmp, "rmaps", "base_n_persocket", false);
mca_base_param_lookup_int(tmp, &value);
if (value) {
if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
/* if a non-default mapping is already specified, then we
* have an error
*/
orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
"NPERSOCKET", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_CONFLICTED);
return ORTE_ERR_SILENT;
}
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_PPR);
ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYSOCKET);
/* this implies binding to the sockets, unless otherwise directed */
if (!OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) {
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_SOCKET);
opal_hwloc_binding_policy |= OPAL_BIND_GIVEN;
}
asprintf(&orte_rmaps_base.ppr, "%d:socket", value);
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
}
mca_base_param_reg_string(c, "pattern",
"Comma-separated list of number of processes on a given resource type [default: none]",
false, false, NULL, &ppr);
if (NULL != ppr) {
if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
/* if a non-default mapping is already specified, then we
* have an error
*/
orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
"PPR", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_CONFLICTED);
return ORTE_ERR_SILENT;
}
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_PPR);
/* since we don't know what pattern was given, leave the policy undefined
* for now - we will assign it when we analyze the pattern later
*/
orte_rmaps_base.ppr = ppr;
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
} }
} }
#endif
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
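The backward-compatibility parameters registered above all follow the same pattern: refuse to override a mapping policy the user already set (the GIVEN flag), otherwise set the PPR directive and synthesize a ppr string. The bit-flag handling can be illustrated with a small standalone sketch; the macro names and flag values below are invented stand-ins, not ORTE's actual definitions:

    #include <stdio.h>
    #include <stdint.h>

    /* illustrative stand-ins for the mapping directive bit flags */
    #define MAP_GIVEN       0x1000u   /* a policy was explicitly set */
    #define MAP_PPR         0x2000u
    #define MAP_POLICY_MASK 0x00ffu
    #define MAP_BYNODE      0x0001u

    int main(void)
    {
        uint32_t mapping = 0;

        /* first request: set ppr + bynode and mark the policy as user-given */
        if (mapping & MAP_GIVEN) {
            fprintf(stderr, "redefining-policy: mapping already set\n");
            return 1;
        }
        mapping |= MAP_PPR;
        mapping = (mapping & ~MAP_POLICY_MASK) | MAP_BYNODE;
        mapping |= MAP_GIVEN;

        /* a second, conflicting request would now be rejected */
        printf("conflict on second request: %s\n",
               (mapping & MAP_GIVEN) ? "yes" : "no");
        return 0;
    }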
@ -154,16 +155,9 @@ static int orte_rmaps_ppr_open(void)
static int orte_rmaps_ppr_query(mca_base_module_t **module, int *priority) static int orte_rmaps_ppr_query(mca_base_module_t **module, int *priority)
{ {
-    if (mca_rmaps_ppr_component.selected) {
-        *priority = 1000;
-        *module = (mca_base_module_t *)&orte_rmaps_ppr_module;
-        return ORTE_SUCCESS;
-    }
-
-    /* cannot run without ppr spec */
-    *priority = 0;
-    *module = NULL;
-    return ORTE_ERROR;
+    *priority = 90;
+    *module = (mca_base_module_t *)&orte_rmaps_ppr_module;
+    return ORTE_SUCCESS;
 }
/** /**
@ -172,9 +166,6 @@ static int orte_rmaps_ppr_query(mca_base_module_t **module, int *priority)
static int orte_rmaps_ppr_close(void) static int orte_rmaps_ppr_close(void)
{ {
-    if (NULL != mca_rmaps_ppr_component.given_ppr) {
-        free(mca_rmaps_ppr_component.given_ppr);
-    }
     return ORTE_SUCCESS;
} }


@ -1,5 +1,6 @@
# Copyright (c) 2004-2005 The Regents of the University of California. # Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved. # All rights reserved.
# Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$ # $COPYRIGHT$
# #
# Additional copyrights may follow # Additional copyrights may follow
@ -9,17 +10,24 @@
# This is the US/English general help file for rankfile utilities. #
# #
# Voltaire # Voltaire
[no-hwloc]
A slot_list containing detailed location info was given, but
hwloc support is not available:
Rank: %d
Slot list: %s
Unfortunately, hwloc support is required for this action.
Please reconfigure OMPI for hwloc if binding to specified
cpus is desired.
 [no-rankfile]
 Open RTE was unable to open the rankfile:
     %s
 Check to make sure the path and filename are correct.
-usage: mpirun mca rmaps_rankfile_path rankfile ./app
-all unspecified by rankfile ranks are assigned using
-byslot or bynode policy.
-example: cat hosfile
+usage: mpirun -mca rmaps_rankfile_path rankfile ./app
+example: cat hostfile
host1 host1
host2 host2
host3 host3
@ -89,24 +97,12 @@ at least one that failed to specify the number of processes to execute.
When specifying multiple applications, you must specify how many processes When specifying multiple applications, you must specify how many processes
of each to launch via the -np argument. of each to launch via the -np argument.
# #
-[orte-rmaps-rf:per-node-and-too-many-procs]
-There are not enough nodes in your allocation to satisfy your request to launch
-%d processes on a per-node basis - only %d nodes were available.
-
-Either request fewer processes, or obtain a larger allocation.
-#
-[orte-rmaps-rf:n-per-node-and-too-many-procs]
-There are not enough nodes in your allocation to satisfy your request to launch
-%d processes on a %d per-node basis - only %d nodes with a total of %d slots
-were available.
-
-Either request fewer processes, or obtain a larger allocation.
-#
-[orte-rmaps-rf:n-per-node-and-not-enough-slots]
-There are not enough slots on the nodes in your allocation to satisfy your
-request to launch on a %d process-per-node basis - only %d slots/node were
-available.
-
-Either request fewer processes/node, or obtain a larger allocation.
+[missing-rank]
+A rank is missing its location specification:
+
+  Rank:      %d
+  Rank file: %s
+
+All processes must have their location specified in the rank file. Either
+add an entry to the file, or provide a default slot_list to use for
+any unspecified ranks.
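Given the new [missing-rank] message, every rank launched must now appear in the rank file unless a default slot_list is supplied. Purely as an illustration (the exact slot syntax depends on the local topology and is not taken from this commit), a rank file might look like:

    rank 0=host1 slot=0:0
    rank 1=host1 slot=0:1
    rank 2=host2 slot=1:0-2
    rank 3=+n2   slot=2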


@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
* *
* Copyright (c) 2008 Voltaire. All rights reserved * Copyright (c) 2008 Voltaire. All rights reserved
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
@ -36,8 +36,8 @@
#include "opal/mca/base/mca_base_param.h" #include "opal/mca/base/mca_base_param.h"
#include "opal/util/argv.h" #include "opal/util/argv.h"
#include "opal/util/if.h" #include "opal/util/if.h"
#include "opal/util/opal_sos.h"
#include "opal/class/opal_pointer_array.h" #include "opal/class/opal_pointer_array.h"
#include "opal/mca/hwloc/base/base.h"
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h" #include "orte/mca/ess/ess.h"
@ -58,224 +58,8 @@ char *orte_rmaps_rank_file_slot_list;
/* /*
* Local variable * Local variable
*/ */
static opal_list_item_t *cur_node_item = NULL;
static opal_pointer_array_t rankmap; static opal_pointer_array_t rankmap;
/*
* Create a default mapping for the application, mapping rank by rank_file and
* by node.
*/
static int map_app_by_node(orte_app_context_t* app,
orte_job_t* jdata,
orte_vpid_t vpid_start,
opal_list_t* nodes )
{
int rc = ORTE_SUCCESS;
opal_list_item_t *next;
orte_node_t *node;
orte_std_cntr_t num_alloc = 0;
orte_proc_t *proc;
/* This loop continues until all procs have been mapped or we run
out of resources. We determine that we have "run out of
resources" when all nodes have slots_max processes mapped to them,
thus there are no free slots for a process to be mapped, or we have
hit the soft limit on all nodes and are in a "no oversubscribe" state.
If we still have processes that haven't been mapped yet, then it's an
"out of resources" error.
In this scenario, we rely on the claim_slot function to handle the
oversubscribed case. The claim_slot function will leave a node on the
list until it either reaches slots_max OR reaches the
soft limit and the "no_oversubscribe" flag has been set - at which point,
the node will be removed to prevent any more processes from being mapped to
it. Since we are taking one slot from each node as we cycle through, the
list, oversubscription is automatically taken care of via this logic.
*/
while (num_alloc < app->num_procs) {
if (NULL != opal_pointer_array_get_item(&rankmap, vpid_start+num_alloc)) {
/* this rank was already mapped */
++num_alloc;
continue;
}
/** see if any nodes remain unused and available. We need to do this check
* each time since we may remove nodes from the list (as they become fully
* used) as we cycle through the loop */
if(0 >= opal_list_get_size(nodes) ) {
/* No more nodes to allocate :( */
orte_show_help("help-rmaps_rank_file.txt", "orte-rmaps-rf:alloc-error",
true, app->num_procs, app->app);
return ORTE_ERR_SILENT;
}
/* Save the next node we can use before claiming slots, since
* we may need to prune the nodes list removing overused nodes.
* Wrap around to beginning if we are at the end of the list */
if (opal_list_get_end(nodes) == opal_list_get_next(cur_node_item)) {
next = opal_list_get_first(nodes);
}
else {
next = opal_list_get_next(cur_node_item);
}
/* Allocate a slot on this node */
node = (orte_node_t*) cur_node_item;
/* grab the slot - have a new proc object created */
proc = NULL;
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, 1, app->idx,
nodes, jdata->map->oversubscribe, true, &proc))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
* really isn't an error - we just need to break from the loop
* since the node is fully used up. For now, just don't report
* an error
*/
if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc)) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
if (NULL != mca_rmaps_rank_file_component.slot_list) {
proc->slot_list = strdup(mca_rmaps_rank_file_component.slot_list);
}
++num_alloc;
cur_node_item = next;
}
return ORTE_SUCCESS;
}
/*
* Create a default mapping for the application, scheduling ranks byr rank_file
* and by slot.
*/
static int map_app_by_slot(orte_app_context_t* app,
orte_job_t* jdata,
orte_vpid_t vpid_start,
opal_list_t* nodes )
{
int rc = ORTE_SUCCESS;
orte_std_cntr_t i, num_slots_to_take, num_alloc = 0;
orte_node_t *node;
opal_list_item_t *next;
orte_proc_t *proc;
/* This loop continues until all procs have been mapped or we run
out of resources. We determine that we have "run out of
resources" when either all nodes have slots_max processes mapped to them,
(thus there are no free slots for a process to be mapped), OR all nodes
have reached their soft limit and the user directed us to "no oversubscribe".
If we still have processes that haven't been mapped yet, then it's an
"out of resources" error. */
while ( num_alloc < app->num_procs) {
/** see if any nodes remain unused and available. We need to do this check
* each time since we may remove nodes from the list (as they become fully
* used) as we cycle through the loop */
if(0 >= opal_list_get_size(nodes) ) {
/* Everything is at max usage! :( */
orte_show_help("help-rmaps_rank_file.txt", "orte-rmaps-rf:alloc-error",
true, app->num_procs, app->app);
return ORTE_ERR_SILENT;
}
/* Save the next node we can use before claiming slots, since
* we may need to prune the nodes list removing overused nodes.
* Wrap around to beginning if we are at the end of the list */
if (opal_list_get_end(nodes) == opal_list_get_next(cur_node_item)) {
next = opal_list_get_first(nodes);
} else {
next = opal_list_get_next(cur_node_item);
}
/** declare a shorter name for convenience in the code below */
node = (orte_node_t*) cur_node_item;
/* If we have available slots on this node, claim all of them
* If node_slots == 0, assume 1 slot for that node.
* JJH - is this assumption fully justified?
*
* If we are now oversubscribing the nodes, then we still take:
* (a) if the node has not been used yet, we take a full node_slots
* (b) if some of the slots are in-use, then we take the number of
* remaining slots before hitting the soft limit (node_slots)
* (c) if we are at or above the soft limit, we take a full node_slots
*
* Note: if node_slots is zero, then we always just take 1 slot
*
* We continue this process until either everything is done,
* or all nodes have hit their hard limit. This algorithm ensures we
* fully utilize each node before oversubscribing, and preserves the ratio
* of processes between the nodes thereafter (e.g., if one node has twice as
* many processes as another before oversubscribing, it will continue
* to do so after oversubscribing).
*/
if (0 == node->slots_inuse ||
node->slots_inuse >= node->slots_alloc) {
num_slots_to_take = (node->slots_alloc == 0) ? 1 : node->slots_alloc;
} else {
num_slots_to_take = node->slots_alloc - node->slots_inuse;
}
/* check if we are in npernode mode - if so, then set the num_slots_to_take
* to the num_per_node
*/
if (0 < jdata->map->npernode) {
num_slots_to_take = jdata->map->npernode;
}
i = 0;
while (num_alloc < app->num_procs && i < num_slots_to_take) {
if (NULL != opal_pointer_array_get_item(&rankmap, vpid_start+num_alloc)) {
/* this rank was already mapped */
++num_alloc;
continue;
}
/* grab the slot - have a new proc object created */
proc = NULL;
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, 1, app->idx,
nodes, jdata->map->oversubscribe, true, &proc))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
* really isn't an error - we just need to break from the loop
* since the node is fully used up. For now, just don't report
* an error
*/
if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc)) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
if (NULL != mca_rmaps_rank_file_component.slot_list) {
proc->slot_list = strdup(mca_rmaps_rank_file_component.slot_list);
}
/* Update the rank */
++num_alloc;
/* track #slots taken */
i++;
/** if all the procs have been mapped OR we have fully used up this node, then
* break from the loop
*/
if(num_alloc == app->num_procs ||
ORTE_ERR_NODE_FULLY_USED == OPAL_SOS_GET_ERROR_CODE(rc)) {
break;
}
}
/* we move on to the next node in all cases EXCEPT if we came
* out of the loop without having taken a full bite AND the
* node is NOT max'd out
*
*/
if (i < (num_slots_to_take-1) &&
ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc)) {
continue;
}
cur_node_item = next;
}
return ORTE_SUCCESS;
}
/* /*
* Create a rank_file mapping for the job. * Create a rank_file mapping for the job.
*/ */
@ -284,17 +68,17 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
orte_job_map_t *map; orte_job_map_t *map;
orte_app_context_t *app=NULL; orte_app_context_t *app=NULL;
orte_std_cntr_t i, k; orte_std_cntr_t i, k;
orte_vpid_t total_procs;
opal_list_t node_list; opal_list_t node_list;
opal_list_item_t *item; opal_list_item_t *item;
orte_node_t *node, *nd, *root_node; orte_node_t *node, *nd, *root_node;
orte_vpid_t rank, vpid_start; orte_vpid_t rank, vpid_start;
orte_std_cntr_t num_nodes, num_slots; orte_std_cntr_t num_nodes, num_slots;
orte_rmaps_rank_file_map_t *rfmap; orte_rmaps_rank_file_map_t *rfmap;
orte_std_cntr_t slots_per_node, relative_index, tmp_cnt; orte_std_cntr_t relative_index, tmp_cnt;
int rc; int rc;
orte_proc_t *proc; orte_proc_t *proc;
mca_base_component_t *c = &mca_rmaps_rank_file_component.super.base_version; mca_base_component_t *c = &mca_rmaps_rank_file_component.super.base_version;
char *slots;
/* only handle initial launch of rf job */ /* only handle initial launch of rf job */
if (ORTE_JOB_STATE_INIT != jdata->state) { if (ORTE_JOB_STATE_INIT != jdata->state) {
@ -311,7 +95,10 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
ORTE_JOBID_PRINT(jdata->jobid)); ORTE_JOBID_PRINT(jdata->jobid));
return ORTE_ERR_TAKE_NEXT_OPTION; return ORTE_ERR_TAKE_NEXT_OPTION;
} }
if (ORTE_MAPPING_BYUSER != ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping)) {
/* NOT FOR US */
return ORTE_ERR_TAKE_NEXT_OPTION;
}
opal_output_verbose(5, orte_rmaps_base.rmaps_output, opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rank_file: mapping job %s", "mca:rmaps:rank_file: mapping job %s",
ORTE_JOBID_PRINT(jdata->jobid)); ORTE_JOBID_PRINT(jdata->jobid));
@ -335,7 +122,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
/* if the number of processes wasn't specified, then we know there can be only /* if the number of processes wasn't specified, then we know there can be only
* one app_context allowed in the launch, and that we are to launch it across * one app_context allowed in the launch, and that we are to launch it across
* all available slots. We'll double-check the single app_context rule first * all available slots.
*/ */
if (0 == app->num_procs && 1 < jdata->num_apps) { if (0 == app->num_procs && 1 < jdata->num_apps) {
orte_show_help("help-rmaps_rank_file.txt", "orte-rmaps-rf:multi-apps-and-zero-np", orte_show_help("help-rmaps_rank_file.txt", "orte-rmaps-rf:multi-apps-and-zero-np",
@ -344,24 +131,11 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
goto error; goto error;
} }
/* likewise, we only support pernode options for a single app_context */
if (0 < map->npernode && 1 < jdata->num_apps) {
orte_show_help("help-rmaps_rank_file.txt", "orte-rmaps-rf:multi-apps-and-zero-np",
true, jdata->num_apps, NULL);
rc = ORTE_ERR_SILENT;
goto error;
}
/* END SANITY CHECKS */ /* END SANITY CHECKS */
/* flag the map as containing cpu_lists */
map->cpu_lists = true;
/* start at the beginning... */ /* start at the beginning... */
vpid_start = 0; vpid_start = 0;
jdata->num_procs = 0; jdata->num_procs = 0;
total_procs = 0;
OBJ_CONSTRUCT(&node_list, opal_list_t); OBJ_CONSTRUCT(&node_list, opal_list_t);
OBJ_CONSTRUCT(&rankmap, opal_pointer_array_t); OBJ_CONSTRUCT(&rankmap, opal_pointer_array_t);
@ -384,70 +158,45 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
* option * option
*/ */
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app, if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
map->policy))) { map->mapping))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
goto error; goto error;
} }
num_nodes = (orte_std_cntr_t)opal_list_get_size(&node_list); num_nodes = (orte_std_cntr_t)opal_list_get_size(&node_list);
/* we already checked for sanity, so these are okay to just do here */ /* we already checked for sanity, so it's okay to just do here */
if (map->npernode == 1) { if (0 == app->num_procs) {
/* there are three use-cases that we need to deal with:
* (a) if -np was not provided, then we just use the number of nodes
* (b) if -np was provided AND #procs > #nodes, then error out
* (c) if -np was provided AND #procs <= #nodes, then launch
* the specified #procs one/node. In this case, we just
* leave app->num_procs alone
*/
if (0 == app->num_procs) {
app->num_procs = num_nodes;
} else if (app->num_procs > num_nodes) {
orte_show_help("help-rmaps_rank_file.txt", "orte-rmaps-rf:per-node-and-too-many-procs",
true, app->num_procs, num_nodes, NULL);
rc = ORTE_ERR_SILENT;
goto error;
}
} else if (map->npernode > 1) {
/* first, let's check to see if there are enough slots/node to
* meet the request - error out if not
*/
slots_per_node = num_slots / num_nodes;
if (map->npernode > slots_per_node) {
orte_show_help("help-rmaps_rank_file.txt", "orte-rmaps-rf:n-per-node-and-not-enough-slots",
true, map->npernode, slots_per_node, NULL);
rc = ORTE_ERR_SILENT;
goto error;
}
/* there are three use-cases that we need to deal with:
* (a) if -np was not provided, then we just use the n/node * #nodes
* (b) if -np was provided AND #procs > (n/node * #nodes), then error out
* (c) if -np was provided AND #procs <= (n/node * #nodes), then launch
* the specified #procs n/node. In this case, we just
* leave app->num_procs alone
*/
if (0 == app->num_procs) {
/* set the num_procs to equal the specified num/node * the number of nodes */
app->num_procs = map->npernode * num_nodes;
} else if (app->num_procs > (map->npernode * num_nodes)) {
orte_show_help("help-rmaps_rank_file.txt", "orte-rmaps-rf:n-per-node-and-too-many-procs",
true, app->num_procs, map->npernode, num_nodes, num_slots, NULL);
rc = ORTE_ERR_SILENT;
goto error;
}
} else if (0 == app->num_procs) {
/** set the num_procs to equal the number of slots on these mapped nodes */ /** set the num_procs to equal the number of slots on these mapped nodes */
app->num_procs = num_slots; app->num_procs = num_slots;
} }
/* keep track of the total #procs in this job */
total_procs += app->num_procs;
for (k=0; k < app->num_procs; k++) { for (k=0; k < app->num_procs; k++) {
rank = vpid_start + k; rank = vpid_start + k;
/* get the rankfile entry for this rank */ /* get the rankfile entry for this rank */
if (NULL == (rfmap = (orte_rmaps_rank_file_map_t*)opal_pointer_array_get_item(&rankmap, rank))) { if (NULL == (rfmap = (orte_rmaps_rank_file_map_t*)opal_pointer_array_get_item(&rankmap, rank))) {
/* no entry for this rank */ #if OPAL_HAVE_HWLOC
continue; /* no entry for this rank - if a default slot_list was given,
* then use it instead
*/
if (NULL != opal_hwloc_base_slot_list) {
slots = opal_hwloc_base_slot_list;
} else {
#endif
/* all ranks must be specified */
orte_show_help("help-rmaps_rank_file.txt", "missing-rank", true, rank, orte_rankfile);
rc = ORTE_ERR_SILENT;
goto error;
#if OPAL_HAVE_HWLOC
}
} else {
if (0 == strlen(rfmap->slot_list)) {
/* rank was specified but no slot list given - that's an error */
orte_show_help("help-rmaps_rank_file.txt","no-slot-list", true, rank, rfmap->node_name);
rc = ORTE_ERR_SILENT;
goto error;
}
slots = rfmap->slot_list;
#endif
} }
/* find the node where this proc was assigned */ /* find the node where this proc was assigned */
@ -460,51 +209,110 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
0 == strcmp(nd->name, rfmap->node_name)) { 0 == strcmp(nd->name, rfmap->node_name)) {
node = nd; node = nd;
break; break;
} else if (NULL != rfmap->node_name && } else if (NULL != rfmap->node_name &&
(('+' == rfmap->node_name[0]) && (('+' == rfmap->node_name[0]) &&
(('n' == rfmap->node_name[1]) || (('n' == rfmap->node_name[1]) ||
('N' == rfmap->node_name[1])))) { ('N' == rfmap->node_name[1])))) {
relative_index=atoi(strtok(rfmap->node_name,"+n")); relative_index=atoi(strtok(rfmap->node_name,"+n"));
if ( relative_index >= (int)opal_list_get_size (&node_list) || ( 0 > relative_index)){ if ( relative_index >= (int)opal_list_get_size (&node_list) || ( 0 > relative_index)){
orte_show_help("help-rmaps_rank_file.txt","bad-index", true,rfmap->node_name); orte_show_help("help-rmaps_rank_file.txt","bad-index", true,rfmap->node_name);
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
return ORTE_ERR_BAD_PARAM; return ORTE_ERR_BAD_PARAM;
} }
root_node = (orte_node_t*) opal_list_get_first(&node_list); root_node = (orte_node_t*) opal_list_get_first(&node_list);
for(tmp_cnt=0; tmp_cnt<relative_index; tmp_cnt++) { for(tmp_cnt=0; tmp_cnt<relative_index; tmp_cnt++) {
root_node = (orte_node_t*) opal_list_get_next(root_node); root_node = (orte_node_t*) opal_list_get_next(root_node);
} }
node = root_node; node = root_node;
break; break;
} }
} }
if (NULL == node) { if (NULL == node) {
orte_show_help("help-rmaps_rank_file.txt","bad-host", true, rfmap->node_name); orte_show_help("help-rmaps_rank_file.txt","bad-host", true, rfmap->node_name);
return ORTE_ERR_SILENT; rc = ORTE_ERR_SILENT;
goto error;
} }
if (0 == strlen(rfmap->slot_list)) { /* ensure the node is in the map */
/* rank was specified but no slot list given - that's an error */ if (!node->mapped) {
orte_show_help("help-rmaps_rank_file.txt","no-slot-list", true, rank, rfmap->node_name); OBJ_RETAIN(node);
return ORTE_ERR_SILENT; opal_pointer_array_add(map->nodes, node);
} node->mapped = true;
proc = NULL;
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, 1, app->idx,
&node_list, jdata->map->oversubscribe, true, &proc))) {
if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc)) {
/* if this is a true error and not the node just being
* full, then report the error and abort
*/
ORTE_ERROR_LOG(rc);
return rc;
}
} }
proc = OBJ_NEW(orte_proc_t);
/* set the jobid */
proc->name.jobid = jdata->jobid;
proc->name.vpid = rank; proc->name.vpid = rank;
/* Either init or update the epoch. */ ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); /* flag the proc as ready for launch */
proc->state = ORTE_PROC_STATE_INIT;
proc->app_idx = i;
proc->slot_list = strdup(rfmap->slot_list); OBJ_RETAIN(node); /* maintain accounting on object */
proc->node = node;
proc->nodename = node->name;
node->num_procs++;
if ((node->slots < node->slots_inuse) ||
(0 < node->slots_max && node->slots_max < node->slots_inuse)) {
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
true, node->num_procs, app->app);
rc = ORTE_ERR_SILENT;
goto error;
}
/* flag the node as oversubscribed so that sched-yield gets
* properly set
*/
node->oversubscribed = true;
}
if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(proc);
return rc;
}
/* retain the proc struct so that we correctly track its release */
OBJ_RETAIN(proc);
#if OPAL_HAVE_HWLOC
if (NULL != slots) {
/* setup the bitmap */
hwloc_cpuset_t bitmap;
if (NULL == node->topology) {
/* not allowed - for rank-file, we must have
* the topology info
*/
orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-topology", true, node->name);
rc = ORTE_ERR_SILENT;
goto error;
}
bitmap = hwloc_bitmap_alloc();
/* parse the slot_list to find the socket and core */
if (ORTE_SUCCESS != (rc = opal_hwloc_base_slot_list_parse(slots, node->topology, bitmap))) {
ORTE_ERROR_LOG(rc);
goto error;
}
/* note that we cannot set the proc locale to any specific object
* as the slot list may have assigned it to more than one - so
* leave that field NULL
*/
/* set the proc to the specified map */
hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, bitmap);
/* cleanup */
hwloc_bitmap_free(bitmap);
}
#else
/* if we don't have hwloc, then all the rank_file can contain
* is the node assignment - it cannot contain any directives
* for socket, cores, etc. as we cannot honor them
*/
if (NULL != slots) {
orte_show_help("help-rmaps_rank_file.txt", "no-hwloc", true, rank, slots);
rc = ORTE_ERR_SILENT;
goto error;
}
#endif
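With hwloc available, the slot list is parsed into an hwloc bitmap and stored on the proc as a list string. Assuming hwloc is installed (compile with -lhwloc), a minimal standalone example of building and printing such a bitmap, independent of the slot-list parser itself:

    #include <stdio.h>
    #include <stdlib.h>
    #include <hwloc.h>

    /* build a cpuset by hand and render it the way proc->cpu_bitmap is stored */
    int main(void)
    {
        hwloc_bitmap_t bitmap = hwloc_bitmap_alloc();
        char *cpu_list = NULL;

        /* pretend the slot list resolved to PUs 0, 1 and 4 */
        hwloc_bitmap_set(bitmap, 0);
        hwloc_bitmap_set(bitmap, 1);
        hwloc_bitmap_set(bitmap, 4);

        hwloc_bitmap_list_asprintf(&cpu_list, bitmap);
        printf("cpu_bitmap = %s\n", cpu_list);   /* e.g. "0-1,4" */

        free(cpu_list);
        hwloc_bitmap_free(bitmap);
        return 0;
    }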
/* insert the proc into the proper place */ /* insert the proc into the proper place */
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs,
proc->name.vpid, proc))) { proc->name.vpid, proc))) {
@ -518,7 +326,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
/* cleanup the node list - it can differ from one app_context /* cleanup the node list - it can differ from one app_context
* to another, so we have to get it every time * to another, so we have to get it every time
*/ */
while(NULL != (item = opal_list_remove_first(&node_list))) { while (NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item); OBJ_RELEASE(item);
} }
OBJ_DESTRUCT(&node_list); OBJ_DESTRUCT(&node_list);
@ -526,94 +334,6 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
} }
OBJ_DESTRUCT(&node_list); OBJ_DESTRUCT(&node_list);
/* did we map all the procs, or did the user's rankfile not contain
* a specification for every rank?
*/
if (jdata->num_procs < total_procs) {
/* we need to map the remainder of the procs according to the
* mapping policy
*/
vpid_start = 0;
for(i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
/* for each app_context, we have to get the list of nodes that it can
* use since that can now be modified with a hostfile and/or -host
* option
*/
OBJ_CONSTRUCT(&node_list, opal_list_t);
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
map->policy))) {
ORTE_ERROR_LOG(rc);
goto error;
}
/* if a bookmark exists from some prior mapping, set us to start there */
if (NULL != jdata->bookmark) {
cur_node_item = NULL;
/* find this node on the list */
for (item = opal_list_get_first(&node_list);
item != opal_list_get_end(&node_list);
item = opal_list_get_next(item)) {
node = (orte_node_t*)item;
if (node->index == jdata->bookmark->index) {
cur_node_item = item;
break;
}
}
/* see if we found it - if not, just start at the beginning */
if (NULL == cur_node_item) {
cur_node_item = opal_list_get_first(&node_list);
}
} else {
/* if no bookmark, then just start at the beginning of the list */
cur_node_item = opal_list_get_first(&node_list);
}
if (map->policy & ORTE_MAPPING_BYNODE) {
rc = map_app_by_node(app, jdata, vpid_start, &node_list);
} else {
rc = map_app_by_slot(app, jdata, vpid_start, &node_list);
}
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto error;
}
vpid_start += app->num_procs;
/* cleanup the node list - it can differ from one app_context
* to another, so we have to get it every time
*/
while(NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&node_list);
}
/* save the bookmark */
jdata->bookmark = (orte_node_t*)cur_node_item;
}
/* update the job's number of procs */
jdata->num_procs = total_procs;
/* compute vpids and add proc objects to the job */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* compute and save convenience values */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* define the daemons that we will use for this job */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* cleanup the rankmap */ /* cleanup the rankmap */
for (i=0; i < rankmap.size; i++) { for (i=0; i < rankmap.size; i++) {
if (NULL != (rfmap = opal_pointer_array_get_item(&rankmap, i))) { if (NULL != (rfmap = opal_pointer_array_get_item(&rankmap, i))) {
@ -621,9 +341,9 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
} }
} }
OBJ_DESTRUCT(&rankmap); OBJ_DESTRUCT(&rankmap);
return ORTE_SUCCESS; return rc;
error: error:
while(NULL != (item = opal_list_remove_first(&node_list))) { while(NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item); OBJ_RELEASE(item);
} }


@ -11,6 +11,7 @@
* All rights reserved. * All rights reserved.
* Copyright (c) 2008 Voltaire. All rights reserved * Copyright (c) 2008 Voltaire. All rights reserved
* *
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -30,7 +31,6 @@
#include "orte_config.h" #include "orte_config.h"
#include "opal/class/opal_object.h" #include "opal/class/opal_object.h"
#include "opal/mca/paffinity/paffinity.h"
#include "orte/mca/rmaps/rmaps.h" #include "orte/mca/rmaps/rmaps.h"


@ -11,6 +11,7 @@
* All rights reserved. * All rights reserved.
* Copyright (c) 2008 Voltaire. All rights reserved * Copyright (c) 2008 Voltaire. All rights reserved
* *
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -25,10 +26,11 @@
#include <string.h> #include <string.h>
#endif #endif
#include "orte/mca/ras/ras_types.h"
#include "opal/mca/base/base.h" #include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h" #include "opal/mca/base/mca_base_param.h"
#include "opal/mca/hwloc/base/base.h"
#include "orte/util/show_help.h"
#include "orte/mca/rmaps/base/base.h" #include "orte/mca/rmaps/base/base.h"
#include "orte/mca/rmaps/base/rmaps_private.h" #include "orte/mca/rmaps/base/rmaps_private.h"
@ -82,17 +84,29 @@ static int orte_rmaps_rank_file_open(void)
false, false, 0, false, false, 0,
&my_priority); &my_priority);
/* did the user provide a slot list? */ tmp = mca_base_param_reg_string(c, "path",
tmp = mca_base_param_reg_string(c, "slot_list", "Name of the rankfile to be used for mapping processes (relative or absolute path)",
"List of processor IDs to bind MPI processes to (e.g., used in conjunction with rank files) [default=NULL]", false, false, NULL, NULL);
false, false, NULL, NULL); mca_base_param_reg_syn_name(tmp, "orte", "rankfile", false);
mca_base_param_reg_syn_name(tmp, "rmaps", "base_slot_list", false); mca_base_param_lookup_string(tmp, &orte_rankfile);
mca_base_param_lookup_string(tmp, &mca_rmaps_rank_file_component.slot_list);
/* ensure we flag mapping by user */ /* ensure we flag mapping by user */
if (NULL != mca_rmaps_rank_file_component.slot_list || #if OPAL_HAVE_HWLOC
NULL != orte_rankfile) { if (NULL != opal_hwloc_base_slot_list || NULL != orte_rankfile) {
ORTE_ADD_MAPPING_POLICY(ORTE_MAPPING_BYUSER); #else
if (NULL != orte_rankfile) {
#endif
if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
/* if a non-default mapping is already specified, then we
* have an error
*/
orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
"RANK_FILE", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_CONFLICTED);
return ORTE_ERR_SILENT;
}
ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYUSER);
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
/* make us first */ /* make us first */
my_priority = 10000; my_priority = 10000;
} }


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana * Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
@ -59,9 +59,9 @@ static int map_to_ftgrps(orte_job_t *jdata);
static int orte_rmaps_resilient_map(orte_job_t *jdata) static int orte_rmaps_resilient_map(orte_job_t *jdata)
{ {
orte_app_context_t *app; orte_app_context_t *app;
int i; int i, j;
int rc = ORTE_SUCCESS; int rc = ORTE_SUCCESS;
orte_node_t *nd=NULL, *oldnode, *node; orte_node_t *nd=NULL, *oldnode, *node, *nptr;
orte_rmaps_res_ftgrp_t *target = NULL; orte_rmaps_res_ftgrp_t *target = NULL;
orte_proc_t *proc; orte_proc_t *proc;
orte_vpid_t totprocs; orte_vpid_t totprocs;
@ -69,6 +69,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
orte_std_cntr_t num_slots; orte_std_cntr_t num_slots;
opal_list_item_t *item; opal_list_item_t *item;
mca_base_component_t *c = &mca_rmaps_resilient_component.super.base_version; mca_base_component_t *c = &mca_rmaps_resilient_component.super.base_version;
bool found;
if (ORTE_JOB_STATE_INIT == jdata->state) { if (ORTE_JOB_STATE_INIT == jdata->state) {
if (NULL != jdata->map->req_mapper && if (NULL != jdata->map->req_mapper &&
@ -172,7 +173,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list,
&num_slots, &num_slots,
app, app,
jdata->map->policy))) { jdata->map->mapping))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
while (NULL != (item = opal_list_remove_first(&node_list))) { while (NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item); OBJ_RELEASE(item);
@ -231,25 +232,31 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
} }
} }
} }
-            /*
-             * Put the process on the found node (add it if not already in the map)
-             */
-            if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata,
-                                                                 nd,
-                                                                 jdata->map->cpus_per_rank,
-                                                                 proc->app_idx,
-                                                                 NULL,
-                                                                 jdata->map->oversubscribe,
-                                                                 false,
-                                                                 &proc))) {
-                /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
-                 * really isn't an error
-                 */
-                if (ORTE_ERR_NODE_FULLY_USED != rc) {
-                    ORTE_ERROR_LOG(rc);
-                    goto error;
-                }
-            }
+            /* add node to map if necessary - nothing we can do here
+             * but search for it
+             */
+            found = false;
+            for (j=0; j < jdata->map->nodes->size; j++) {
+                if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, j))) {
+                    continue;
+                }
+                if (nptr == nd) {
+                    found = true;
+                    break;
+                }
+            }
+            if (!found) {
+                OBJ_RETAIN(nd);
+                opal_pointer_array_add(jdata->map->nodes, nd);
+                nd->mapped = true;
+            }
+            OBJ_RETAIN(nd);  /* maintain accounting on object */
+            proc->node = nd;
+            proc->nodename = nd->name;
+            nd->num_procs++;
+            opal_pointer_array_add(nd->procs, (void*)proc);
+            /* retain the proc struct so that we correctly track its release */
+            OBJ_RETAIN(proc);
/* flag the proc state as non-launched so we'll know to launch it */ /* flag the proc state as non-launched so we'll know to launch it */
proc->state = ORTE_PROC_STATE_INIT; proc->state = ORTE_PROC_STATE_INIT;
@ -259,11 +266,6 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
*/ */
orte_rmaps_base_update_local_ranks(jdata, oldnode, nd, proc); orte_rmaps_base_update_local_ranks(jdata, oldnode, nd, proc);
} }
/* define the daemons that we will use for this job */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
error: error:
return rc; return rc;
@ -474,7 +476,7 @@ static int get_new_node(orte_proc_t *proc,
if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list,
&num_slots, &num_slots,
app, app,
-                                                               map->policy))) {
+                                                               map->mapping))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
goto release; goto release;
} }
@ -716,7 +718,7 @@ static int map_to_ftgrps(orte_job_t *jdata)
*/ */
OBJ_CONSTRUCT(&node_list, opal_list_t); OBJ_CONSTRUCT(&node_list, opal_list_t);
if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app, if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
-                                                               map->policy))) {
+                                                               map->mapping))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return rc; return rc;
} }
@ -813,18 +815,36 @@ static int map_to_ftgrps(orte_job_t *jdata)
"%s rmaps:resilient: placing proc into fault group %d node %s", "%s rmaps:resilient: placing proc into fault group %d node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == target) ? -1 : target->ftgrp, nd->name)); (NULL == target) ? -1 : target->ftgrp, nd->name));
-                /* put proc on that node */
-                proc=NULL;
-                if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, jdata->map->cpus_per_rank, app->idx,
-                                                                     &node_list, jdata->map->oversubscribe, false, &proc))) {
-                    /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
-                     * really isn't an error
-                     */
-                    if (ORTE_ERR_NODE_FULLY_USED != rc) {
-                        ORTE_ERROR_LOG(rc);
-                        return rc;
-                    }
-                }
+                /* if the node isn't in the map, add it */
+                if (!nd->mapped) {
+                    OBJ_RETAIN(node);
+                    opal_pointer_array_add(map->nodes, nd);
+                    nd->mapped = true;
+                }
proc = OBJ_NEW(orte_proc_t);
/* set the jobid */
proc->name.jobid = jdata->jobid;
proc->app_idx = app->idx;
OBJ_RETAIN(node); /* maintain accounting on object */
proc->node = nd;
proc->nodename = nd->name;
nd->num_procs++;
if ((nd->slots < nd->slots_inuse) ||
(0 < nd->slots_max && nd->slots_max < nd->slots_inuse)) {
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
true, nd->num_procs, app->app);
return ORTE_ERR_SILENT;
}
/* flag the node as oversubscribed so that sched-yield gets
* properly set
*/
nd->oversubscribed = true;
}
opal_pointer_array_add(nd->procs, (void*)proc);
/* retain the proc struct so that we correctly track its release */
OBJ_RETAIN(proc);
/* flag the proc as ready for launch */ /* flag the proc as ready for launch */
proc->state = ORTE_PROC_STATE_INIT; proc->state = ORTE_PROC_STATE_INIT;
@ -864,11 +884,5 @@ static int map_to_ftgrps(orte_job_t *jdata)
return rc; return rc;
} }
/* define the daemons that we will use for this job */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }

View file

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -110,13 +111,6 @@ typedef struct orte_rmaps_base_component_2_0_0_t orte_rmaps_base_component_2_0_0
typedef orte_rmaps_base_component_2_0_0_t orte_rmaps_base_component_t; typedef orte_rmaps_base_component_2_0_0_t orte_rmaps_base_component_t;
/**
* Macro for use in components that are of type rmaps
*/
#define ORTE_RMAPS_BASE_VERSION_2_0_0 \
MCA_BASE_VERSION_2_0_0, \
"rmaps", 2, 0, 0
END_C_DECLS END_C_DECLS
#endif #endif

View file

@ -8,6 +8,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -24,6 +25,7 @@
#include "orte/constants.h" #include "orte/constants.h"
#include "opal/class/opal_pointer_array.h" #include "opal/class/opal_pointer_array.h"
#include "opal/mca/hwloc/hwloc.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
@ -33,6 +35,11 @@
BEGIN_C_DECLS BEGIN_C_DECLS
typedef uint16_t orte_mapping_policy_t;
#define ORTE_MAPPING_POLICY OPAL_UINT16
typedef uint16_t orte_ranking_policy_t;
#define ORTE_RANKING_POLICY OPAL_UINT16
/* /*
* Structure that represents the mapping of a job to an * Structure that represents the mapping of a job to an
* allocated set of resources. * allocated set of resources.
@ -42,16 +49,16 @@ struct orte_job_map_t {
/* user-specified mapping params */ /* user-specified mapping params */
char *req_mapper; /* requested mapper */ char *req_mapper; /* requested mapper */
char *last_mapper; /* last mapper used */ char *last_mapper; /* last mapper used */
-    orte_mapping_policy_t policy;
-    int npernode;
-    int nperboard;
-    int npersocket;
+    orte_mapping_policy_t mapping;
+    orte_ranking_policy_t ranking;
+#if OPAL_HAVE_HWLOC
+    opal_binding_policy_t binding;
+    opal_hwloc_level_t bind_level;
+#endif
+    /* mapping options */
+    char *ppr;
     int16_t cpus_per_rank;
-    int16_t stride;
-    /* are we allowed to oversubscribe the nodes in this job */
-    bool oversubscribe;
     bool display_map;
-    bool cpu_lists;
/* *** */ /* *** */
/* number of new daemons required to be launched /* number of new daemons required to be launched
* to support this job map * to support this job map
@ -69,6 +76,87 @@ struct orte_job_map_t {
typedef struct orte_job_map_t orte_job_map_t; typedef struct orte_job_map_t orte_job_map_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_job_map_t); ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_job_map_t);
/**
* Macro for use in components that are of type rmaps
*/
#define ORTE_RMAPS_BASE_VERSION_2_0_0 \
MCA_BASE_VERSION_2_0_0, \
"rmaps", 2, 0, 0
/* define map-related directives */
#define ORTE_MAPPING_NO_USE_LOCAL 0x0100
#define ORTE_MAPPING_NO_OVERSUBSCRIBE 0x0200
#define ORTE_MAPPING_SUBSCRIBE_GIVEN 0x0400
#define ORTE_MAPPING_SPAN 0x0800
#define ORTE_MAPPING_PPR 0x1000
/* an error flag */
#define ORTE_MAPPING_CONFLICTED 0x2000
#define ORTE_MAPPING_GIVEN 0x4000
#define ORTE_SET_MAPPING_DIRECTIVE(target, pol) \
(target) |= (pol)
#define ORTE_UNSET_MAPPING_DIRECTIVE(target, pol) \
(target) &= ~(pol)
#define ORTE_GET_MAPPING_DIRECTIVE(pol) \
((pol) & 0xff00)
/* round-robin policies */
#define ORTE_MAPPING_BYSLOT 1
#define ORTE_MAPPING_BYNODE 2
#define ORTE_MAPPING_BYBOARD 3
#define ORTE_MAPPING_BYNUMA 4
#define ORTE_MAPPING_BYSOCKET 5
#define ORTE_MAPPING_BYL3CACHE 6
#define ORTE_MAPPING_BYL2CACHE 7
#define ORTE_MAPPING_BYL1CACHE 8
#define ORTE_MAPPING_BYCORE 9
#define ORTE_MAPPING_BYHWTHREAD 10
/* convenience - declare anything <= 15 to be round-robin*/
#define ORTE_MAPPING_RR 0x000f
/* sequential policy */
#define ORTE_MAPPING_SEQ 20
/* rank file and other user-defined mapping */
#define ORTE_MAPPING_BYUSER 22
/* macro to separate out the mapping policy
* from the directives
*/
#define ORTE_GET_MAPPING_POLICY(pol) \
((pol) & 0x00ff)
/* macro to determine if mapping policy is set */
#define ORTE_MAPPING_POLICY_IS_SET(pol) \
((pol) & 0x00ff)
#define ORTE_SET_MAPPING_POLICY(target, pol) \
(target) = (pol) | ((target) & 0xff00)
/* define ranking directives */
#define ORTE_RANKING_SPAN 0x1000
#define ORTE_RANKING_FILL 0x2000
#define ORTE_RANKING_GIVEN 0x4000
#define ORTE_SET_RANKING_DIRECTIVE(target, pol) \
(target) |= (pol)
#define ORTE_UNSET_RANKING_DIRECTIVE(target, pol) \
(target) &= ~(pol)
#define ORTE_GET_RANKING_DIRECTIVE(pol) \
((pol) & 0xf000)
/* define ranking policies */
#define ORTE_RANK_BY_NODE 1
#define ORTE_RANK_BY_BOARD 2
#define ORTE_RANK_BY_NUMA 3
#define ORTE_RANK_BY_SOCKET 4
#define ORTE_RANK_BY_L3CACHE 5
#define ORTE_RANK_BY_L2CACHE 6
#define ORTE_RANK_BY_L1CACHE 7
#define ORTE_RANK_BY_CORE 8
#define ORTE_RANK_BY_HWTHREAD 9
#define ORTE_RANK_BY_SLOT 10
#define ORTE_GET_RANKING_POLICY(pol) \
((pol) & 0x0fff)
/* macro to determine if ranking policy is set */
#define ORTE_RANKING_POLICY_IS_SET(pol) \
((pol) & 0x0fff)
#define ORTE_SET_RANKING_POLICY(target, pol) \
(target) = (pol) | ((target) & 0xf000)
END_C_DECLS END_C_DECLS
#endif #endif

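The new mapping and ranking fields each pack two kinds of information into a single uint16_t: the low byte carries the selected policy and the high byte carries directive flags, exactly as laid out by the masks above. Below is a minimal standalone sketch of how those accessors interact; the macro bodies are copied verbatim from the definitions above, while the chosen values, the main() wrapper, and the printf are illustrative assumptions only.

#include <stdint.h>
#include <stdio.h>

/* local copies of the definitions above, so the sketch is self-contained */
typedef uint16_t orte_mapping_policy_t;

#define ORTE_MAPPING_NO_OVERSUBSCRIBE 0x0200
#define ORTE_MAPPING_GIVEN            0x4000
#define ORTE_MAPPING_BYSOCKET         5

#define ORTE_SET_MAPPING_DIRECTIVE(target, pol)  (target) |= (pol)
#define ORTE_GET_MAPPING_DIRECTIVE(pol)          ((pol) & 0xff00)
#define ORTE_SET_MAPPING_POLICY(target, pol)     (target) = (pol) | ((target) & 0xff00)
#define ORTE_GET_MAPPING_POLICY(pol)             ((pol) & 0x00ff)

int main(void)
{
    orte_mapping_policy_t mapping = 0;

    /* low byte: the round-robin policy; high byte: directive flags */
    ORTE_SET_MAPPING_POLICY(mapping, ORTE_MAPPING_BYSOCKET);
    ORTE_SET_MAPPING_DIRECTIVE(mapping, ORTE_MAPPING_GIVEN | ORTE_MAPPING_NO_OVERSUBSCRIBE);

    /* prints: policy=5 directives=0x4200 */
    printf("policy=%u directives=0x%04x\n",
           (unsigned)ORTE_GET_MAPPING_POLICY(mapping),
           (unsigned)ORTE_GET_MAPPING_DIRECTIVE(mapping));
    return 0;
}

Because the two halves never overlap, a directive such as ORTE_MAPPING_NO_OVERSUBSCRIBE can be toggled without disturbing the chosen policy, which is what the mappers below rely on when they test ORTE_GET_MAPPING_DIRECTIVE().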
View file

@ -24,7 +24,8 @@ dist_pkgdata_DATA = help-orte-rmaps-rr.txt
sources = \ sources = \
rmaps_rr.c \ rmaps_rr.c \
rmaps_rr.h \ rmaps_rr.h \
rmaps_rr_component.c rmaps_rr_component.c \
rmaps_rr_mappers.c
# Make the output library in this directory, and name it either # Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la # mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la

View file

@ -51,3 +51,11 @@ You have specified a rank-to-node/slot mapping, but failed to provide
the number of processes to be executed. For some reason, this information the number of processes to be executed. For some reason, this information
could not be obtained from the mapping you provided, so we cannot continue could not be obtained from the mapping you provided, so we cannot continue
with executing the specified application. with executing the specified application.
#
[orte-rmaps-rr:not-enough-objs]
There are not enough resources on the available nodes
to meet the requested mapping.
Application: %s
Number of procs: %d
Number of resources: %d

View file

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -40,6 +40,7 @@
#include "orte/mca/rmaps/base/base.h" #include "orte/mca/rmaps/base/base.h"
#include "rmaps_rr.h" #include "rmaps_rr.h"
static orte_node_t* get_starting_point(opal_list_t *node_list, orte_job_t *jdata);
/* /*
* Create a round-robin mapping for the job. * Create a round-robin mapping for the job.
@ -52,7 +53,6 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
opal_list_item_t *item; opal_list_item_t *item;
orte_std_cntr_t num_nodes, num_slots; orte_std_cntr_t num_nodes, num_slots;
int rc; int rc;
opal_list_item_t *cur_node_item;
mca_base_component_t *c = &mca_rmaps_round_robin_component.base_version; mca_base_component_t *c = &mca_rmaps_round_robin_component.base_version;
/* this mapper can only handle initial launch /* this mapper can only handle initial launch
@ -74,9 +74,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
ORTE_JOBID_PRINT(jdata->jobid)); ORTE_JOBID_PRINT(jdata->jobid));
return ORTE_ERR_TAKE_NEXT_OPTION; return ORTE_ERR_TAKE_NEXT_OPTION;
} }
-    if (0 < jdata->map->npernode ||
-        0 < jdata->map->nperboard ||
-        0 < jdata->map->npersocket) {
+    if (ORTE_MAPPING_RR < ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
/* I don't know how to do these - defer */ /* I don't know how to do these - defer */
opal_output_verbose(5, orte_rmaps_base.rmaps_output, opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rr: job %s not using rr mapper", "mca:rmaps:rr: job %s not using rr mapper",
@ -122,14 +120,14 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
* option * option
*/ */
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app, if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
-                                                              jdata->map->policy))) {
+                                                              jdata->map->mapping))) {
        ORTE_ERROR_LOG(rc);
        goto error;
    }
    num_nodes = (orte_std_cntr_t)opal_list_get_size(&node_list);

    /* if a bookmark exists from some prior mapping, set us to start there */
-    cur_node_item = orte_rmaps_base_get_starting_point(&node_list, jdata);
+    jdata->bookmark = get_starting_point(&node_list, jdata);
if (0 == app->num_procs) { if (0 == app->num_procs) {
/* set the num_procs to equal the number of slots on these mapped nodes */ /* set the num_procs to equal the number of slots on these mapped nodes */
@ -137,12 +135,42 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
} }
    /* Make assignments */
-    if (jdata->map->policy & ORTE_MAPPING_BYNODE) {
-        rc = orte_rmaps_base_map_bynode(jdata, app, &node_list,
-                                        app->num_procs, cur_node_item);
+    if (ORTE_MAPPING_BYNODE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
+        rc = orte_rmaps_rr_bynode(jdata, app, &node_list, num_slots,
+                                  app->num_procs);
+    } else if (ORTE_MAPPING_BYSLOT == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
+        rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
+                                  app->num_procs);
+#if OPAL_HAVE_HWLOC
+    } else if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
+        rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
+                                 app->num_procs, HWLOC_OBJ_PU, 0);
+    } else if (ORTE_MAPPING_BYCORE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
+        rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
+                                 app->num_procs, HWLOC_OBJ_CORE, 0);
+    } else if (ORTE_MAPPING_BYL1CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
+        rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
+                                 app->num_procs, HWLOC_OBJ_CACHE, 1);
+    } else if (ORTE_MAPPING_BYL2CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
+        rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
+                                 app->num_procs, HWLOC_OBJ_CACHE, 2);
+    } else if (ORTE_MAPPING_BYL3CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
+        rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
+                                 app->num_procs, HWLOC_OBJ_CACHE, 3);
+    } else if (ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
+        rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
+                                 app->num_procs, HWLOC_OBJ_SOCKET, 0);
+    } else if (ORTE_MAPPING_BYNUMA == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
+        rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
+                                 app->num_procs, HWLOC_OBJ_NODE, 0);
+#endif
    } else {
-        rc = orte_rmaps_base_map_byslot(jdata, app, &node_list,
-                                        app->num_procs, cur_node_item);
+        /* unrecognized mapping directive */
+        orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy",
+                       true, "mapping",
+                       orte_rmaps_base_print_mapping(jdata->map->mapping));
+        rc = ORTE_ERR_SILENT;
+        goto error;
    }
if (ORTE_SUCCESS != rc) { if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
@ -155,28 +183,19 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
        /* cleanup the node list - it can differ from one app_context
         * to another, so we have to get it every time
         */
-        while(NULL != (item = opal_list_remove_first(&node_list))) {
+        while (NULL != (item = opal_list_remove_first(&node_list))) {
            OBJ_RELEASE(item);
        }
        OBJ_DESTRUCT(&node_list);
-    }
-
-    /* compute vpids and add proc objects to the job */
-    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
-        ORTE_ERROR_LOG(rc);
-        return rc;
-    }
-
-    /* compute and save local ranks */
-    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
-        ORTE_ERROR_LOG(rc);
-        return rc;
-    }
-
-    /* define the daemons that we will use for this job */
-    if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(jdata))) {
-        ORTE_ERROR_LOG(rc);
-        return rc;
-    }
+
+        /* compute vpids and add proc objects to the job - do this after
+         * each app_context so that the ranks within each context are
+         * contiguous
+         */
+        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
+            ORTE_ERROR_LOG(rc);
+            return rc;
+        }
+    }
return ORTE_SUCCESS; return ORTE_SUCCESS;
@ -190,6 +209,85 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
return rc; return rc;
} }
/*
* determine the proper starting point for the next mapping operation
*/
static orte_node_t* get_starting_point(opal_list_t *node_list, orte_job_t *jdata)
{
opal_list_item_t *item, *cur_node_item;
orte_node_t *node, *nd1, *ndmin;
int overload;
/* if a bookmark exists from some prior mapping, set us to start there */
if (NULL != jdata->bookmark) {
cur_node_item = NULL;
/* find this node on the list */
for (item = opal_list_get_first(node_list);
item != opal_list_get_end(node_list);
item = opal_list_get_next(item)) {
node = (orte_node_t*)item;
if (node->index == jdata->bookmark->index) {
cur_node_item = item;
break;
}
}
/* see if we found it - if not, just start at the beginning */
if (NULL == cur_node_item) {
cur_node_item = opal_list_get_first(node_list);
}
} else {
/* if no bookmark, then just start at the beginning of the list */
cur_node_item = opal_list_get_first(node_list);
}
/* is this node fully subscribed? If so, then the first
* proc we assign will oversubscribe it, so let's look
* for another candidate
*/
node = (orte_node_t*)cur_node_item;
ndmin = node;
overload = ndmin->slots_inuse - ndmin->slots_alloc;
if (node->slots_inuse >= node->slots_alloc) {
/* work down the list - is there another node that
* would not be oversubscribed?
*/
if (cur_node_item != opal_list_get_last(node_list)) {
item = opal_list_get_next(cur_node_item);
} else {
item = opal_list_get_first(node_list);
}
while (item != cur_node_item) {
nd1 = (orte_node_t*)item;
if (nd1->slots_inuse < nd1->slots_alloc) {
/* this node is not oversubscribed! use it! */
return (orte_node_t*)item;
}
/* this one was also oversubscribed, keep track of the
* node that has the least usage - if we can't
* find anyone who isn't fully utilized, we will
* start with the least used node
*/
if (overload >= (nd1->slots_inuse - nd1->slots_alloc)) {
ndmin = nd1;
overload = ndmin->slots_inuse - ndmin->slots_alloc;
}
if (item == opal_list_get_last(node_list)) {
item = opal_list_get_first(node_list);
} else {
item= opal_list_get_next(item);
}
}
/* if we get here, then we cycled all the way around the
* list without finding a better answer - just use the node
* that is minimally overloaded
*/
cur_node_item = (opal_list_item_t*)ndmin;
}
return (orte_node_t*)cur_node_item;
}
orte_rmaps_base_module_t orte_rmaps_round_robin_module = { orte_rmaps_base_module_t orte_rmaps_round_robin_module = {
orte_rmaps_rr_map orte_rmaps_rr_map
}; };

View file

@ -24,6 +24,10 @@
#define ORTE_RMAPS_RR_H #define ORTE_RMAPS_RR_H
#include "orte_config.h" #include "orte_config.h"
#include "opal/mca/hwloc/hwloc.h"
#include "opal/class/opal_list.h"
#include "orte/mca/rmaps/rmaps.h" #include "orte/mca/rmaps/rmaps.h"
BEGIN_C_DECLS BEGIN_C_DECLS
@ -31,6 +35,24 @@ BEGIN_C_DECLS
ORTE_MODULE_DECLSPEC extern orte_rmaps_base_component_t mca_rmaps_round_robin_component; ORTE_MODULE_DECLSPEC extern orte_rmaps_base_component_t mca_rmaps_round_robin_component;
extern orte_rmaps_base_module_t orte_rmaps_round_robin_module; extern orte_rmaps_base_module_t orte_rmaps_round_robin_module;
ORTE_MODULE_DECLSPEC int orte_rmaps_rr_bynode(orte_job_t *jdata,
orte_app_context_t *app,
opal_list_t *node_list,
orte_std_cntr_t num_slots,
orte_vpid_t nprocs);
ORTE_MODULE_DECLSPEC int orte_rmaps_rr_byslot(orte_job_t *jdata,
orte_app_context_t *app,
opal_list_t *node_list,
orte_std_cntr_t num_slots,
orte_vpid_t nprocs);
#if OPAL_HAVE_HWLOC
ORTE_MODULE_DECLSPEC int orte_rmaps_rr_byobj(orte_job_t *jdata, orte_app_context_t *app,
opal_list_t *node_list,
orte_std_cntr_t num_slots,
orte_vpid_t num_procs,
hwloc_obj_type_t target, unsigned cache_level);
#endif
END_C_DECLS END_C_DECLS

View file

@ -63,7 +63,7 @@ static int orte_rmaps_round_robin_open(void)
mca_base_param_reg_int(c, "priority", mca_base_param_reg_int(c, "priority",
"Priority of the rr rmaps component", "Priority of the rr rmaps component",
-                           false, false, 100,
+                           false, false, 10,
&my_priority); &my_priority);
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }

orte/mca/rmaps/round_robin/rmaps_rr_mappers.c (new file, 712 added lines)
View file

@ -0,0 +1,712 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <string.h>
#include "opal/util/output.h"
#include "opal/mca/hwloc/base/base.h"
#include "orte/util/show_help.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/base/rmaps_private.h"
#include "orte/mca/rmaps/base/base.h"
#include "rmaps_rr.h"
static orte_proc_t* setup_proc(orte_job_t *jdata,
orte_node_t *node,
orte_app_idx_t idx);
int orte_rmaps_rr_byslot(orte_job_t *jdata,
orte_app_context_t *app,
opal_list_t *node_list,
orte_std_cntr_t num_slots,
orte_vpid_t num_procs)
{
int rc, i, nprocs_mapped;
orte_node_t *node;
orte_proc_t *proc;
opal_list_item_t *item;
int num_procs_to_assign, extra_procs_to_assign=0, nxtra_nodes=0;
#if OPAL_HAVE_HWLOC
hwloc_obj_t obj=NULL;
#endif
float balance;
bool add_one=false;
bool oversubscribed = false;
opal_output_verbose(2, orte_rmaps_base.rmaps_output,
"mca:rmaps:rr: mapping by slot for job %s slots %d num_procs %lu",
ORTE_JOBID_PRINT(jdata->jobid), (int)num_slots, (unsigned long)num_procs);
/* check to see if we can map all the procs */
if (num_slots < app->num_procs) {
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
true, app->num_procs, app->app);
return ORTE_ERR_SILENT;
}
oversubscribed = true;
/* compute how many extra procs to put on each node */
balance = (float)(app->num_procs - num_slots) / (float)opal_list_get_size(node_list);
extra_procs_to_assign = (int)balance;
if (0 < (balance - (float)extra_procs_to_assign)) {
/* compute how many nodes need an extra proc */
nxtra_nodes = app->num_procs - num_slots - (extra_procs_to_assign * opal_list_get_size(node_list));
/* add one so that we add an extra proc to the first nodes
* until all procs are mapped
*/
extra_procs_to_assign++;
/* flag that we added one */
add_one = true;
}
}
/* map the number of procs to each node until we
* map all specified procs
*/
nprocs_mapped = 0;
while (NULL != (item = opal_list_remove_first(node_list))) {
node = (orte_node_t*)item;
#if OPAL_HAVE_HWLOC
/* get the root object as we are not assigning
* locale except at the node level
*/
if (NULL != node->topology) {
obj = hwloc_get_root_obj(node->topology);
}
#endif
if (add_one) {
if (0 == nxtra_nodes) {
--extra_procs_to_assign;
add_one = false;
} else {
--nxtra_nodes;
}
}
if (oversubscribed) {
/* flag the node as oversubscribed so that sched-yield gets
* properly set
*/
node->oversubscribed = true;
}
if (0 == node->slots_alloc) {
num_procs_to_assign = 1 + extra_procs_to_assign;
} else {
num_procs_to_assign = node->slots_alloc + extra_procs_to_assign;
}
for (i=0; i < num_procs_to_assign && nprocs_mapped < app->num_procs; i++) {
if (0 == i) {
/* add this node to the map - do it only once */
if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
ORTE_ERROR_LOG(rc);
return rc;
}
OBJ_RETAIN(node); /* maintain accounting on object */
++(jdata->map->num_nodes);
}
if (NULL == (proc = setup_proc(jdata, node, app->idx))) {
return ORTE_ERR_OUT_OF_RESOURCE;
}
nprocs_mapped++;
#if OPAL_HAVE_HWLOC
proc->locale = obj;
#endif
}
jdata->bookmark = node;
/* release the node - the object will persist */
OBJ_RELEASE(node);
}
return ORTE_SUCCESS;
}
int orte_rmaps_rr_bynode(orte_job_t *jdata,
orte_app_context_t *app,
opal_list_t *node_list,
orte_std_cntr_t num_slots,
orte_vpid_t num_procs)
{
int j, nprocs_mapped, lag, delta;
orte_node_t *node;
orte_proc_t *proc;
opal_list_item_t *item;
int num_procs_to_assign, navg, idx;
int extra_procs_to_assign=0, nxtra_nodes=0;
#if OPAL_HAVE_HWLOC
hwloc_obj_t obj=NULL;
#endif
float balance;
bool add_one=false;
bool oversubscribed=false;
opal_output_verbose(2, orte_rmaps_base.rmaps_output,
"mca:rmaps:rr: mapping by node for job %s slots %d num_procs %lu",
ORTE_JOBID_PRINT(jdata->jobid),
(int)num_slots, (unsigned long)num_procs);
/* quick check to see if we can map all the procs */
if (num_slots < app->num_procs) {
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
true, app->num_procs, app->app);
return ORTE_ERR_SILENT;
}
oversubscribed = true;
}
/* divide the procs evenly across all nodes - this is the
* average we have to maintain as we go, but we adjust
* the number on each node to reflect its available slots.
* Obviously, if all nodes have the same number of slots,
* then the avg is what we get on each node - this is
* the most common situation.
*/
navg = app->num_procs / opal_list_get_size(node_list);
if (0 == navg) {
/* if there are less procs than nodes, we have to
* place at least one/node
*/
navg = 1;
}
/* compute how many extra procs to put on each node */
balance = (float)(app->num_procs - (navg * opal_list_get_size(node_list))) / (float)opal_list_get_size(node_list);
extra_procs_to_assign = (int)balance;
if (0 < (balance - (float)extra_procs_to_assign)) {
/* compute how many nodes need an extra proc */
nxtra_nodes = app->num_procs - ((navg + extra_procs_to_assign) * opal_list_get_size(node_list));
/* add one so that we add an extra proc to the first nodes
* until all procs are mapped
*/
extra_procs_to_assign++;
/* flag that we added one */
add_one = true;
}
opal_output_verbose(2, orte_rmaps_base.rmaps_output,
"mca:rmaps:rr: mapping by node navg %d extra_procs %d extra_nodes %d",
navg, extra_procs_to_assign, nxtra_nodes);
nprocs_mapped = 0;
lag = 0;
while (NULL != (item = opal_list_remove_first(node_list))) {
node = (orte_node_t*)item;
#if OPAL_HAVE_HWLOC
/* get the root object as we are not assigning
* locale except at the node level
*/
if (NULL != node->topology) {
obj = hwloc_get_root_obj(node->topology);
}
#endif
/* add this node to the map */
if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
ORTE_ERROR_LOG(idx);
return idx;
}
OBJ_RETAIN(node); /* maintain accounting on object */
++(jdata->map->num_nodes);
/* compute the number of procs to go on this node */
if (add_one) {
if (0 == nxtra_nodes) {
--extra_procs_to_assign;
add_one = false;
} else {
--nxtra_nodes;
}
}
if (oversubscribed) {
/* everybody just takes their share */
num_procs_to_assign = navg + extra_procs_to_assign;
/* flag the node as oversubscribed so that sched-yield gets
* properly set
*/
node->oversubscribed = true;
} else {
/* if we are not oversubscribed, then there are enough
* slots to handle all the procs. However, not every
* node will have the same number of slots, so we
* have to track how many procs to "shift" elsewhere
* to make up the difference
*/
if (0 == node->slots_alloc) {
/* if there are no extras to take, then we can
* safely remove this node as we don't need it
*/
if (0 == extra_procs_to_assign) {
opal_pointer_array_set_item(jdata->map->nodes, idx, NULL);
OBJ_RELEASE(node);
--(jdata->map->num_nodes);
/* update how many we are lagging behind */
lag += navg;
continue;
}
/* everybody has to take at least the extras */
num_procs_to_assign = extra_procs_to_assign;
/* update how many we are lagging behind */
lag += navg;
} else {
/* if slots_alloc < avg, then take all */
if (node->slots_alloc < navg) {
num_procs_to_assign = node->slots_alloc + extra_procs_to_assign;
/* update how many we are lagging behind */
lag += navg - node->slots_alloc;
} else {
/* take the avg plus as much of the "lag" as we can */
delta = 0;
if (0 < lag) {
delta = node->slots_alloc - navg;
if (lag < delta) {
delta = lag;
}
lag -= delta;
}
num_procs_to_assign = navg + delta + extra_procs_to_assign;
}
}
}
for (j=0; j < num_procs_to_assign && nprocs_mapped < app->num_procs; j++) {
if (NULL == (proc = setup_proc(jdata, node, app->idx))) {
return ORTE_ERR_OUT_OF_RESOURCE;
}
nprocs_mapped++;
#if OPAL_HAVE_HWLOC
proc->locale = obj;
#endif
}
jdata->bookmark = node;
/* maintain acctg */
OBJ_RELEASE(node);
if (nprocs_mapped == app->num_procs) {
/* we are done */
break;
}
}
return ORTE_SUCCESS;
}
#if OPAL_HAVE_HWLOC
static int byobj_span(orte_job_t *jdata,
orte_app_context_t *app,
opal_list_t *node_list,
orte_std_cntr_t num_slots,
orte_vpid_t num_procs,
hwloc_obj_type_t target, unsigned cache_level);
/* mapping by hwloc object looks a lot like mapping by node,
* but has the added complication of possibly having different
* numbers of objects on each node
*/
int orte_rmaps_rr_byobj(orte_job_t *jdata,
orte_app_context_t *app,
opal_list_t *node_list,
orte_std_cntr_t num_slots,
orte_vpid_t num_procs,
hwloc_obj_type_t target, unsigned cache_level)
{
int i, j, nprocs_mapped;
orte_node_t *node;
orte_proc_t *proc;
opal_list_item_t *item;
int num_procs_to_assign, nperobj, nprocs, nxtra_objs=0;
int extra_procs_to_assign=0, nxtra_nodes=0, idx;
hwloc_obj_t obj=NULL;
unsigned int nobjs;
float balance;
bool add_one=false;
bool oversubscribed = false;
/* there are two modes for mapping by object: span and not-span. The
* span mode essentially operates as if there was just a single
* "super-node" in the system - i.e., it balances the load across
* all objects of the indicated type regardless of their location.
* In essence, it acts as if we placed one proc on each object, cycling
* across all objects on all nodes, and then wrapped around to place
* another proc on each object, doing so until all procs were placed.
*
* In contrast, the non-span mode operates similar to byslot mapping.
* All slots on each node are filled, assigning each proc to an object
* on that node in a balanced fashion, and then the mapper moves on
* to the next node. Thus, procs tend to be "front loaded" onto the
* list of nodes, as opposed to being "load balanced" in the span mode
*/
if (ORTE_MAPPING_SPAN & jdata->map->mapping) {
return byobj_span(jdata, app, node_list, num_slots,
num_procs, target, cache_level);
}
opal_output_verbose(2, orte_rmaps_base.rmaps_output,
"mca:rmaps:rr: mapping no-span by %s for job %s slots %d num_procs %lu",
hwloc_obj_type_string(target),
ORTE_JOBID_PRINT(jdata->jobid),
(int)num_slots, (unsigned long)num_procs);
/* quick check to see if we can map all the procs - can't
* do more because we don't know how many total objects exist
* across all the nodes
*/
if (num_slots < app->num_procs) {
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
true, app->num_procs, app->app);
return ORTE_ERR_SILENT;
}
oversubscribed = true;
/* compute how many extra procs to put on each node */
balance = (float)(app->num_procs - num_slots) / (float)opal_list_get_size(node_list);
extra_procs_to_assign = (int)balance;
if (0 < (balance - (float)extra_procs_to_assign)) {
/* compute how many nodes need an extra proc */
nxtra_nodes = app->num_procs - num_slots - (extra_procs_to_assign * opal_list_get_size(node_list));
/* add one so that we add an extra proc to the first nodes
* until all procs are mapped
*/
extra_procs_to_assign++;
/* flag that we added one */
add_one = true;
}
}
opal_output_verbose(2, orte_rmaps_base.rmaps_output,
"mca:rmaps:rr: mapping no-span by %s extra_procs %d extra_nodes %d",
hwloc_obj_type_string(target),
extra_procs_to_assign, nxtra_nodes);
nprocs_mapped = 0;
while (NULL != (item = opal_list_remove_first(node_list))) {
node = (orte_node_t*)item;
/* bozo check */
if (NULL == node->topology) {
orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing",
true, node->name);
return ORTE_ERR_SILENT;
}
/* add this node to the map */
if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
ORTE_ERROR_LOG(idx);
return idx;
}
OBJ_RETAIN(node); /* maintain accounting on object */
++(jdata->map->num_nodes);
if (oversubscribed) {
/* flag the node as oversubscribed so that sched-yield gets
* properly set
*/
node->oversubscribed = true;
}
/* compute the number of procs to go on this node */
if (add_one) {
if (0 == nxtra_nodes) {
--extra_procs_to_assign;
add_one = false;
} else {
--nxtra_nodes;
}
}
if (0 == node->slots_alloc) {
/* everybody takes at least the extras */
num_procs_to_assign = extra_procs_to_assign;
} else {
num_procs_to_assign = node->slots_alloc + extra_procs_to_assign;
}
/* get the number of objects of this type on this node */
nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target, cache_level, OPAL_HWLOC_AVAILABLE);
opal_output_verbose(2, orte_rmaps_base.rmaps_output,
"mca:rmaps:rr:byobj: found %d objs on node %s", nobjs, node->name);
/* compute the number of procs to go on each object */
nperobj = num_procs_to_assign / nobjs;
opal_output_verbose(2, orte_rmaps_base.rmaps_output,
"mca:rmaps:rr:byobj: placing %d procs on each object", nperobj);
if ((int)(nperobj * nobjs) < num_procs_to_assign) {
/* compute how many objs need an extra proc */
nxtra_objs = num_procs_to_assign - nperobj * nobjs;
opal_output_verbose(2, orte_rmaps_base.rmaps_output,
"mca:rmaps:rr:byobj: adding 1 extra proc to the first %d objects, if needed", nxtra_objs);
}
/* loop through the number of objects */
for (i=0; i < (int)nobjs && nprocs_mapped < (int)app->num_procs; i++) {
/* get the hwloc object */
if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology, target, cache_level, i, OPAL_HWLOC_AVAILABLE))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
/* map the reqd number of procs */
if (0 < nxtra_objs) {
nprocs = nperobj + 1;
--nxtra_objs;
} else {
nprocs = nperobj;
}
for (j=0; j < nprocs && nprocs_mapped < app->num_procs; j++) {
if (NULL == (proc = setup_proc(jdata, node, app->idx))) {
return ORTE_ERR_OUT_OF_RESOURCE;
}
nprocs_mapped++;
proc->locale = obj;
}
}
jdata->bookmark = node;
/* maintain acctg */
OBJ_RELEASE(node);
if (nprocs_mapped == app->num_procs) {
/* we are done */
break;
}
}
return ORTE_SUCCESS;
}
static int byobj_span(orte_job_t *jdata,
orte_app_context_t *app,
opal_list_t *node_list,
orte_std_cntr_t num_slots,
orte_vpid_t num_procs,
hwloc_obj_type_t target, unsigned cache_level)
{
int i, j, nprocs_mapped, lag, delta, navg;
orte_node_t *node;
orte_proc_t *proc;
opal_list_item_t *item;
int num_procs_to_assign, nperobj, nprocs, nxtra_objs=0;
int extra_procs_to_assign=0, nxtra_nodes=0, idx;
hwloc_obj_t obj=NULL;
unsigned int nobjs;
float balance;
bool add_one=false;
bool oversubscribed=false;
opal_output_verbose(2, orte_rmaps_base.rmaps_output,
"mca:rmaps:rr: mapping span by %s for job %s slots %d num_procs %lu",
hwloc_obj_type_string(target),
ORTE_JOBID_PRINT(jdata->jobid),
(int)num_slots, (unsigned long)num_procs);
/* quick check to see if we can map all the procs - can't
* do more because we don't know how many total objects exist
* across all the nodes
*/
if (num_slots < app->num_procs) {
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
true, app->num_procs, app->app);
return ORTE_ERR_SILENT;
}
oversubscribed = true;
}
/* divide the procs evenly across all nodes - this is the
* average we have to maintain as we go, but we adjust
* the number on each node to reflect its available slots.
* Obviously, if all nodes have the same number of slots,
* then the avg is what we get on each node - this is
* the most common situation.
*/
navg = app->num_procs / opal_list_get_size(node_list);
if (0 == navg) {
/* if there are less procs than nodes, we have to
* place at least one/node
*/
navg = 1;
}
/* compute how many extra procs to put on each node */
balance = (float)(app->num_procs - (navg * opal_list_get_size(node_list))) / (float)opal_list_get_size(node_list);
extra_procs_to_assign = (int)balance;
if (0 < (balance - (float)extra_procs_to_assign)) {
/* compute how many nodes need an extra proc */
nxtra_nodes = app->num_procs - ((navg + extra_procs_to_assign) * opal_list_get_size(node_list));
/* add one so that we add an extra proc to the first nodes
* until all procs are mapped
*/
extra_procs_to_assign++;
/* flag that we added one */
add_one = true;
}
opal_output_verbose(2, orte_rmaps_base.rmaps_output,
"mca:rmaps:rr: mapping by %s navg %d extra_procs %d extra_nodes %d",
hwloc_obj_type_string(target),
navg, extra_procs_to_assign, nxtra_nodes);
nprocs_mapped = 0;
lag = 0;
while (NULL != (item = opal_list_remove_first(node_list))) {
node = (orte_node_t*)item;
/* bozo check */
if (NULL == node->topology) {
orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing",
true, node->name);
return ORTE_ERR_SILENT;
}
/* add this node to the map */
if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
ORTE_ERROR_LOG(idx);
return idx;
}
OBJ_RETAIN(node); /* maintain accounting on object */
++(jdata->map->num_nodes);
/* compute the number of procs to go on this node */
if (add_one) {
if (0 == nxtra_nodes) {
--extra_procs_to_assign;
add_one = false;
} else {
--nxtra_nodes;
}
}
if (oversubscribed) {
/* everybody just takes their share */
num_procs_to_assign = navg + extra_procs_to_assign;
} else {
/* if we are not oversubscribed, then there are enough
* slots to handle all the procs. However, not every
* node will have the same number of slots, so we
* have to track how many procs to "shift" elsewhere
* to make up the difference
*/
if (0 == node->slots_alloc) {
/* if there are no extras to take, then we can
* safely remove this node as we don't need it
*/
if (0 == extra_procs_to_assign) {
opal_pointer_array_set_item(jdata->map->nodes, idx, NULL);
OBJ_RELEASE(node);
--(jdata->map->num_nodes);
/* update how many we are lagging behind */
lag += navg;
continue;
}
/* everybody has to take at least the extras */
num_procs_to_assign = extra_procs_to_assign;
/* update how many we are lagging behind */
lag += navg;
} else {
/* if slots_alloc < avg, then take all */
if (node->slots_alloc < navg) {
num_procs_to_assign = node->slots_alloc + extra_procs_to_assign;
/* update how many we are lagging behind */
lag += navg - node->slots_alloc;
} else {
/* take the avg plus as much of the "lag" as we can */
delta = 0;
if (0 < lag) {
delta = node->slots_alloc - navg;
if (lag < delta) {
delta = lag;
}
lag -= delta;
}
num_procs_to_assign = navg + delta + extra_procs_to_assign;
}
}
}
/* get the number of objects of this type on this node */
nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target, cache_level, OPAL_HWLOC_AVAILABLE);
opal_output_verbose(2, orte_rmaps_base.rmaps_output,
"mca:rmaps:rr:byobj: found %d objs on node %s", nobjs, node->name);
/* compute the number of procs to go on each object */
nperobj = num_procs_to_assign / nobjs;
opal_output_verbose(2, orte_rmaps_base.rmaps_output,
"mca:rmaps:rr:byobj: placing %d procs on each object", nperobj);
if ((int)(nperobj * nobjs) < num_procs_to_assign) {
/* compute how many objs need an extra proc */
nxtra_objs = num_procs_to_assign - nperobj * nobjs;
opal_output_verbose(2, orte_rmaps_base.rmaps_output,
"mca:rmaps:rr:byobj: adding 1 extra proc to the first %d objects, if needed", nxtra_objs);
}
/* loop through the number of objects */
for (i=0; i < (int)nobjs && nprocs_mapped < (int)app->num_procs; i++) {
/* get the hwloc object */
if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology, target, cache_level, i, OPAL_HWLOC_AVAILABLE))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
/* map the reqd number of procs */
if (0 < nxtra_objs) {
nprocs = nperobj + 1;
--nxtra_objs;
} else {
nprocs = nperobj;
}
for (j=0; j < nprocs && nprocs_mapped < app->num_procs; j++) {
if (NULL == (proc = setup_proc(jdata, node, app->idx))) {
return ORTE_ERR_OUT_OF_RESOURCE;
}
nprocs_mapped++;
proc->locale = obj;
}
}
jdata->bookmark = node;
/* maintain acctg */
OBJ_RELEASE(node);
if (nprocs_mapped == app->num_procs) {
/* we are done */
break;
}
}
return ORTE_SUCCESS;
}
#endif
static orte_proc_t* setup_proc(orte_job_t *jdata,
orte_node_t *node,
orte_app_idx_t idx)
{
orte_proc_t *proc;
int rc;
proc = OBJ_NEW(orte_proc_t);
/* set the jobid */
proc->name.jobid = jdata->jobid;
/* we do not set the vpid here - this will be done
* during a second phase, but we do set the epoch here
* since they all start with the same value.
*/
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
/* flag the proc as ready for launch */
proc->state = ORTE_PROC_STATE_INIT;
proc->app_idx = idx;
OBJ_RETAIN(node); /* maintain accounting on object */
proc->node = node;
proc->nodename = node->name;
node->num_procs++;
if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(proc);
return NULL;
}
/* retain the proc struct so that we correctly track its release */
OBJ_RETAIN(proc);
return proc;
}

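When the byslot and byobj mappers above detect that more procs were requested than there are slots, they spread the surplus as evenly as possible: a float balance yields the base number of extra procs per node, and nxtra_nodes counts how many of the leading nodes absorb one more. The following standalone sketch walks through that arithmetic for an assumed allocation of 3 nodes with 2 slots each and 10 requested procs; the node and slot counts are hypothetical, and the computation mirrors orte_rmaps_rr_byslot.

#include <stdio.h>

int main(void)
{
    int num_nodes = 3;                         /* nodes in the allocation (hypothetical) */
    int slots_alloc = 2;                       /* slots per node (hypothetical)          */
    int num_slots = num_nodes * slots_alloc;   /* 6                                      */
    int num_procs = 10;                        /* procs requested by the app             */
    int extra_procs_to_assign = 0, nxtra_nodes = 0;
    int add_one = 0, nprocs_mapped = 0;
    float balance;

    if (num_slots < num_procs) {
        /* same arithmetic as the oversubscribed branch of orte_rmaps_rr_byslot */
        balance = (float)(num_procs - num_slots) / (float)num_nodes;   /* 1.33 */
        extra_procs_to_assign = (int)balance;                          /* 1    */
        if (0 < (balance - (float)extra_procs_to_assign)) {
            nxtra_nodes = num_procs - num_slots
                          - (extra_procs_to_assign * num_nodes);       /* 1    */
            extra_procs_to_assign++;                                   /* 2    */
            add_one = 1;
        }
    }

    /* walk the nodes the way the mapper does: the first nxtra_nodes nodes
     * keep the extra proc, the rest fall back to the base surplus
     */
    for (int n = 0; n < num_nodes; n++) {
        if (add_one) {
            if (0 == nxtra_nodes) {
                --extra_procs_to_assign;
                add_one = 0;
            } else {
                --nxtra_nodes;
            }
        }
        int assign = slots_alloc + extra_procs_to_assign;
        nprocs_mapped += assign;
        printf("node %d gets %d procs\n", n, assign);   /* 4, 3, 3 */
    }
    printf("total mapped: %d\n", nprocs_mapped);        /* 10 */
    return 0;
}

The same surplus calculation appears in orte_rmaps_rr_byobj before the per-object distribution; the bynode and span variants start from a per-node average (navg) instead and use the lag/delta bookkeeping to absorb nodes with fewer slots than the average.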
View file

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -31,7 +31,7 @@
#include "opal/mca/base/mca_base_param.h" #include "opal/mca/base/mca_base_param.h"
#include "opal/util/if.h" #include "opal/util/if.h"
#include "opal/util/opal_sos.h" #include "opal/mca/hwloc/hwloc.h"
#include "orte/util/show_help.h" #include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
@ -72,7 +72,7 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
opal_list_t *node_list=NULL; opal_list_t *node_list=NULL;
orte_proc_t *proc; orte_proc_t *proc;
mca_base_component_t *c = &mca_rmaps_seq_component.base_version; mca_base_component_t *c = &mca_rmaps_seq_component.base_version;
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
"%s rmaps:seq mapping job %s", "%s rmaps:seq mapping job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -96,9 +96,7 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
ORTE_JOBID_PRINT(jdata->jobid)); ORTE_JOBID_PRINT(jdata->jobid));
return ORTE_ERR_TAKE_NEXT_OPTION; return ORTE_ERR_TAKE_NEXT_OPTION;
} }
-    if (0 < jdata->map->npernode ||
-        0 < jdata->map->nperboard ||
-        0 < jdata->map->npersocket) {
+    if (ORTE_MAPPING_SEQ != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
/* I don't know how to do these - defer */ /* I don't know how to do these - defer */
opal_output_verbose(5, orte_rmaps_base.rmaps_output, opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:seq: job %s not using seq mapper", "mca:rmaps:seq: job %s not using seq mapper",
@ -168,7 +166,7 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
} }
/* check for nolocal and remove the head node, if required */ /* check for nolocal and remove the head node, if required */
-    if (map->policy & ORTE_MAPPING_NO_USE_LOCAL) {
+    if (map->mapping & ORTE_MAPPING_NO_USE_LOCAL) {
for (item = opal_list_get_first(node_list); for (item = opal_list_get_first(node_list);
item != opal_list_get_end(node_list); item != opal_list_get_end(node_list);
item = opal_list_get_next(item) ) { item = opal_list_get_next(item) ) {
@ -218,24 +216,54 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
rc = ORTE_ERR_SILENT; rc = ORTE_ERR_SILENT;
goto error; goto error;
} }
/* ensure the node is in the map */
if (!node->mapped) {
OBJ_RETAIN(node);
opal_pointer_array_add(map->nodes, node);
node->mapped = true;
}
proc = OBJ_NEW(orte_proc_t);
/* set the jobid */
proc->name.jobid = jdata->jobid;
proc->name.vpid = vpid++;
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
/* flag the proc as ready for launch */
proc->state = ORTE_PROC_STATE_INIT;
proc->app_idx = i;
-            /* assign proc to this node - do NOT allow claim_slot to remove
-             * an oversubscribed node from the list!
-             */
-            proc = NULL;
-            if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
-                                                                 jdata->map->cpus_per_rank, app->idx,
-                                                                 node_list,
-                                                                 jdata->map->oversubscribe,
-                                                                 false, &proc))) {
-                if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc)) {
-                    ORTE_ERROR_LOG(rc);
-                    goto error;
-                }
-            }
-            /* assign the vpid */
-            proc->name.vpid = vpid++;
-            ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
+            OBJ_RETAIN(node);  /* maintain accounting on object */
+            proc->node = node;
+            proc->nodename = node->name;
+            node->num_procs++;
+            if ((node->slots < node->slots_inuse) ||
+                (0 < node->slots_max && node->slots_max < node->slots_inuse)) {
+                if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
+                    orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
+                                   true, node->num_procs, app->app);
+                    rc = ORTE_ERR_SILENT;
+                    goto error;
+                }
+                /* flag the node as oversubscribed so that sched-yield gets
+                 * properly set
+                 */
+                node->oversubscribed = true;
+            }
+            if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) {
+                ORTE_ERROR_LOG(rc);
+                OBJ_RELEASE(proc);
+                return rc;
+            }
+            /* retain the proc struct so that we correctly track its release */
+            OBJ_RETAIN(proc);
+#if OPAL_HAVE_HWLOC
+            /* assign the locale - okay for the topo to be null as
+             * it just means it wasn't returned
+             */
+            if (NULL != node->topology) {
+                proc->locale = hwloc_get_root_obj(node->topology);
+            }
+#endif
/* add to the jdata proc array */ /* add to the jdata proc array */
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
@ -260,21 +288,9 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
} }
} }
/* compute and save local ranks */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* define the daemons that we will use for this job */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
return ORTE_SUCCESS; return ORTE_SUCCESS;
error: error:
if (NULL != default_node_list) { if (NULL != default_node_list) {
while (NULL != (item = opal_list_remove_first(default_node_list))) { while (NULL != (item = opal_list_remove_first(default_node_list))) {
OBJ_RELEASE(item); OBJ_RELEASE(item);

View file

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights * Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2009 Institut National de Recherche en Informatique * Copyright (c) 2009 Institut National de Recherche en Informatique
@ -195,9 +195,11 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
NULL, OPAL_CMD_LINE_TYPE_STRING, NULL, OPAL_CMD_LINE_TYPE_STRING,
"Regular expression defining nodes in system" }, "Regular expression defining nodes in system" },
#if OPAL_HAVE_HWLOC
{ "orte", "hetero", "nodes", '\0', NULL, "hetero-nodes", 0, { "orte", "hetero", "nodes", '\0', NULL, "hetero-nodes", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL, NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Nodes in cluster may differ in topology, so send the topology back from each node [Default = false]" }, "Nodes in cluster may differ in topology, so send the topology back from each node [Default = false]" },
#endif
/* End of list */ /* End of list */
{ NULL, NULL, NULL, '\0', NULL, NULL, 0, { NULL, NULL, NULL, '\0', NULL, NULL, 0,

View file

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -311,9 +312,14 @@ int orte_dt_copy_map(orte_job_map_t **dest, orte_job_map_t *src, opal_data_type_
} }
/* copy data into it */ /* copy data into it */
-    (*dest)->policy = src->policy;
-    (*dest)->npernode = src->npernode;
-    (*dest)->oversubscribe = src->oversubscribe;
+    (*dest)->mapping = src->mapping;
+    (*dest)->ranking = src->ranking;
+#if OPAL_HAVE_HWLOC
+    (*dest)->binding = src->binding;
+#endif
+    if (NULL != src->ppr) {
+        (*dest)->ppr = strdup(src->ppr);
+    }
(*dest)->display_map = src->display_map; (*dest)->display_map = src->display_map;
(*dest)->num_new_daemons = src->num_new_daemons; (*dest)->num_new_daemons = src->num_new_daemons;
(*dest)->daemon_vpid_start = src->daemon_vpid_start; (*dest)->daemon_vpid_start = src->daemon_vpid_start;

View file

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -22,7 +23,9 @@
#include <sys/types.h> #include <sys/types.h>
#include "opal/util/argv.h" #include "opal/util/argv.h"
#include "opal/util/opal_sos.h" #include "opal/dss/dss.h"
#include "opal/dss/dss_internal.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/class/opal_pointer_array.h" #include "opal/class/opal_pointer_array.h"
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
@ -442,15 +445,6 @@ int orte_dt_pack_node(opal_buffer_t *buffer, const void *src,
return rc; return rc;
} }
/* do not pack the local board, socket, and core info */
/* pack the cpu set info */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(nodes[i]->cpu_set)), 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* do not pack the username */ /* do not pack the username */
} }
return ORTE_SUCCESS; return ORTE_SUCCESS;
@ -477,13 +471,6 @@ int orte_dt_pack_proc(opal_buffer_t *buffer, const void *src,
return rc; return rc;
} }
/* pack the pid */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(procs[i]->pid)), 1, OPAL_PID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the local rank */ /* pack the local rank */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(procs[i]->local_rank)), 1, ORTE_LOCAL_RANK))) { (void*)(&(procs[i]->local_rank)), 1, ORTE_LOCAL_RANK))) {
@ -498,6 +485,14 @@ int orte_dt_pack_proc(opal_buffer_t *buffer, const void *src,
return rc; return rc;
} }
#if OPAL_HAVE_HWLOC
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&procs[i]->cpu_bitmap), 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
#endif
/* pack the state */ /* pack the state */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(procs[i]->state)), 1, ORTE_PROC_STATE))) { (void*)(&(procs[i]->state)), 1, ORTE_PROC_STATE))) {
@ -512,13 +507,6 @@ int orte_dt_pack_proc(opal_buffer_t *buffer, const void *src,
return rc; return rc;
} }
/* pack the name of the node where this proc is executing */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(procs[i]->nodename)), 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the number of restarts */ /* pack the number of restarts */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)&(procs[i]->restarts), 1, OPAL_INT32))) { (void*)&(procs[i]->restarts), 1, OPAL_INT32))) {
@ -906,26 +894,23 @@ int orte_dt_pack_map(opal_buffer_t *buffer, const void *src,
return rc; return rc;
} }
-        /* pack the mapper used */
-        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->last_mapper), 1, OPAL_STRING))) {
-            ORTE_ERROR_LOG(rc);
-            return rc;
-        }
-
-        /* pack the policy used to generate it */
-        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->policy), 1, ORTE_MAPPING_POLICY))) {
-            ORTE_ERROR_LOG(rc);
-            return rc;
-        }
-
-        /* pack the #procs/node */
-        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->npernode), 1, ORTE_STD_CNTR))) {
-            ORTE_ERROR_LOG(rc);
-            return rc;
-        }
-
-        /* pack the oversubscribe flag */
-        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->oversubscribe), 1, OPAL_BOOL))) {
-            ORTE_ERROR_LOG(rc);
-            return rc;
-        }
+        /* pack the policies */
+        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->mapping), 1, ORTE_MAPPING_POLICY))) {
+            ORTE_ERROR_LOG(rc);
+            return rc;
+        }
+        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->ranking), 1, ORTE_RANKING_POLICY))) {
+            ORTE_ERROR_LOG(rc);
+            return rc;
+        }
+#if OPAL_HAVE_HWLOC
+        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->binding), 1, OPAL_BINDING_POLICY))) {
+            ORTE_ERROR_LOG(rc);
+            return rc;
+        }
+#endif
+        /* pack any ppr */
+        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->ppr), 1, OPAL_STRING))) {
+            ORTE_ERROR_LOG(rc);
+            return rc;
+        }
} }
@ -935,24 +920,6 @@ int orte_dt_pack_map(opal_buffer_t *buffer, const void *src,
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return rc; return rc;
} }
/* pack the number of new daemons */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->num_new_daemons), 1, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the daemon starting vpid */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->daemon_vpid_start), 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the number of nodes */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->num_nodes), 1, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
return rc;
}
} }
return ORTE_SUCCESS; return ORTE_SUCCESS;

View file

@ -10,6 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -23,7 +24,7 @@
#include <sys/types.h> #include <sys/types.h>
#include "opal/util/argv.h" #include "opal/util/argv.h"
#include "opal/mca/hwloc/hwloc.h" #include "opal/mca/hwloc/base/base.h"
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/base/base.h" #include "orte/mca/rmaps/base/base.h"
@ -279,9 +280,9 @@ int orte_dt_print_job(char **output, char *prefix, orte_job_t *src, opal_data_ty
tmp = tmp3; tmp = tmp3;
} }
asprintf(&tmp2, "%s\n%s\tNum launched: %ld\tNum reported: %ld\n%s\tNum terminated: %ld\tOversubscribe override?: %s", asprintf(&tmp2, "%s\n%s\tNum launched: %ld\tNum reported: %ld\tNum terminated: %ld",
tmp, pfx, (long)src->num_launched, (long)src->num_reported, pfx, tmp, pfx, (long)src->num_launched, (long)src->num_reported,
(long)src->num_terminated, src->oversubscribe_override ? "True" : "False"); (long)src->num_terminated);
free(tmp); free(tmp);
tmp = tmp2; tmp = tmp2;
@ -376,11 +377,6 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
} }
} }
asprintf(&tmp2, "%s\n%s\tNum boards: %ld\tNum sockets/board: %ld\tNum cores/socket: %ld", tmp, pfx2,
(long)src->boards, (long)src->sockets_per_board, (long)src->cores_per_socket);
free(tmp);
tmp = tmp2;
if (NULL == src->daemon) { if (NULL == src->daemon) {
asprintf(&tmp2, "%s\n%s\tDaemon: %s\tDaemon launched: %s", tmp, pfx2, asprintf(&tmp2, "%s\n%s\tDaemon: %s\tDaemon launched: %s", tmp, pfx2,
"Not defined", src->daemon_launched ? "True" : "False"); "Not defined", src->daemon_launched ? "True" : "False");
@ -397,9 +393,8 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
free(tmp); free(tmp);
tmp = tmp2; tmp = tmp2;
asprintf(&tmp2, "%s\n%s\tNum slots allocated: %ld\tMax slots: %ld:\tCpu set: %s", tmp, pfx2, asprintf(&tmp2, "%s\n%s\tNum slots allocated: %ld\tMax slots: %ld", tmp, pfx2,
(long)src->slots_alloc, (long)src->slots_max, (long)src->slots_alloc, (long)src->slots_max);
(NULL == src->cpu_set) ? "NULL" : src->cpu_set);
free(tmp); free(tmp);
tmp = tmp2; tmp = tmp2;
@ -462,7 +457,6 @@ PRINT_PROCS:
int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_type_t type) int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_type_t type)
{ {
char *tmp, *tmp2, *pfx2; char *tmp, *tmp2, *pfx2;
char *locale=NULL;
/* set default result */ /* set default result */
*output = NULL; *output = NULL;
@ -474,23 +468,6 @@ int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_
asprintf(&pfx2, "%s", prefix); asprintf(&pfx2, "%s", prefix);
} }
if (orte_display_diffable_output) {
/* print only the parts important to testing
* mapping operations
*/
#if OPAL_HAVE_HWLOC
if (NULL != src->locale) {
hwloc_bitmap_list_asprintf(&locale, src->locale->cpuset);
}
#endif
asprintf(output, "%s<process rank=%s app_idx=%ld local_rank=%lu node_rank=%lu locale=%s>",
pfx2, ORTE_VPID_PRINT(src->name.vpid), (long)src->app_idx,
(unsigned long)src->local_rank,
(unsigned long)src->node_rank,
(NULL == locale) ? "UNKNOWN" : locale);
return ORTE_SUCCESS;
}
if (orte_xml_output) { if (orte_xml_output) {
/* need to create the output in XML format */ /* need to create the output in XML format */
if (0 == src->pid) { if (0 == src->pid) {
@ -541,14 +518,24 @@ int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_
tmp = tmp2; tmp = tmp2;
#if OPAL_HAVE_HWLOC #if OPAL_HAVE_HWLOC
if (NULL != src->locale) { {
hwloc_bitmap_list_asprintf(&locale, src->locale->cpuset); char *locale=NULL;
}
#endif
asprintf(&tmp2, "%s\n%s\tState: %s\tRestarts: %d\tApp_context: %ld\tLocale: %s\tSlot list: %s", tmp, pfx2, if (NULL != src->locale) {
orte_proc_state_to_str(src->state), src->restarts, (long)src->app_idx, hwloc_bitmap_list_asprintf(&locale, src->locale->cpuset);
(NULL == locale) ? "UNKNOWN" : locale, (NULL == src->slot_list) ? "NULL" : src->slot_list); }
asprintf(&tmp2, "%s\n%s\tState: %s\tRestarts: %d\tApp_context: %ld\tLocale: %s\tBinding: %s[%u]", tmp, pfx2,
orte_proc_state_to_str(src->state), src->restarts, (long)src->app_idx,
(NULL == locale) ? "UNKNOWN" : locale,
(NULL == src->cpu_bitmap) ? "NULL" : src->cpu_bitmap, src->bind_idx);
if (NULL != locale) {
free(locale);
}
}
#else
asprintf(&tmp2, "%s\n%s\tState: %s\tRestarts: %d\tApp_context: %ld", tmp, pfx2,
orte_proc_state_to_str(src->state), src->restarts, (long)src->app_idx);
#endif
free(tmp); free(tmp);
/* set the return */ /* set the return */
@ -662,38 +649,6 @@ int orte_dt_print_map(char **output, char *prefix, orte_job_map_t *src, opal_dat
asprintf(&pfx2, "%s", prefix); asprintf(&pfx2, "%s", prefix);
} }
if (orte_display_diffable_output) {
/* display just the procs in a diffable format */
asprintf(&tmp, "<map>\n");
/* loop through nodes */
for (i=0; i < src->nodes->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(src->nodes, i))) {
continue;
}
asprintf(&tmp2, "%s\n\t<host name=%s>", tmp, (NULL == node->name) ? "UNKNOWN" : node->name);
free(tmp);
tmp = tmp2;
for (j=0; j < node->procs->size; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
}
orte_dt_print_proc(&tmp2, "\t\t", proc, ORTE_PROC);
asprintf(&tmp3, "%s\n%s", tmp, tmp2);
free(tmp2);
free(tmp);
tmp = tmp3;
}
asprintf(&tmp2, "%s\n\t</host>", tmp);
free(tmp);
tmp = tmp2;
}
asprintf(&tmp2, "%s\n</map>\n", tmp);
free(tmp);
free(pfx2);
*output = tmp2;
return ORTE_SUCCESS;
}
if (orte_xml_output) { if (orte_xml_output) {
/* need to create the output in XML format */ /* need to create the output in XML format */
asprintf(&tmp, "<map>\n"); asprintf(&tmp, "<map>\n");
@ -733,13 +688,25 @@ int orte_dt_print_map(char **output, char *prefix, orte_job_map_t *src, opal_dat
asprintf(&pfx, "%s\t", pfx2); asprintf(&pfx, "%s\t", pfx2);
if (orte_devel_level_output) { if (orte_devel_level_output) {
asprintf(&tmp, "\n%sMapper requested: %s\tLast mapper: %s\tMapping policy: %04x\n%s\tNpernode: %ld\tOversubscribe allowed: %s\tCPU Lists: %s", #if OPAL_HAVE_HWLOC
asprintf(&tmp, "\n%sMapper requested: %s Last mapper: %s Mapping policy: %s Ranking policy: %s Binding policy: %s[%s] Cpu set: %s PPR: %s",
pfx2, (NULL == src->req_mapper) ? "NULL" : src->req_mapper, pfx2, (NULL == src->req_mapper) ? "NULL" : src->req_mapper,
(NULL == src->last_mapper) ? "NULL" : src->last_mapper, (NULL == src->last_mapper) ? "NULL" : src->last_mapper,
src->policy, pfx2, (long)src->npernode, orte_rmaps_base_print_mapping(src->mapping),
(src->oversubscribe) ? "TRUE" : "FALSE", orte_rmaps_base_print_ranking(src->ranking),
(src->cpu_lists) ? "TRUE" : "FALSE"); opal_hwloc_base_print_binding(src->binding),
opal_hwloc_base_print_level(src->bind_level),
(NULL == opal_hwloc_base_cpu_set) ? "NULL" : opal_hwloc_base_cpu_set,
(NULL == src->ppr) ? "NULL" : src->ppr);
#else
asprintf(&tmp, "\n%sMapper requested: %s Last mapper: %s Mapping policy: %s Ranking policy: %s PPR: %s",
pfx2, (NULL == src->req_mapper) ? "NULL" : src->req_mapper,
(NULL == src->last_mapper) ? "NULL" : src->last_mapper,
orte_rmaps_base_print_mapping(src->mapping),
orte_rmaps_base_print_ranking(src->ranking),
(NULL == src->ppr) ? "NULL" : src->ppr);
#endif
if (ORTE_VPID_INVALID == src->daemon_vpid_start) { if (ORTE_VPID_INVALID == src->daemon_vpid_start) {
asprintf(&tmp2, "%s\n%sNum new daemons: %ld\tNew daemon starting vpid INVALID\n%sNum nodes: %ld", asprintf(&tmp2, "%s\n%sNum new daemons: %ld\tNew daemon starting vpid INVALID\n%sNum nodes: %ld",
tmp, pfx, (long)src->num_new_daemons, pfx, (long)src->num_nodes); tmp, pfx, (long)src->num_new_daemons, pfx, (long)src->num_nodes);
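(Editor's illustration, not part of the diff: these print routines are normally reached through the DSS print interface rather than called directly. A hypothetical helper for dumping a job's map, assuming an HNP-side environment where jdata->map has been populated by the mappers.)

    #include <stdlib.h>
    #include "opal/dss/dss.h"
    #include "opal/util/output.h"
    #include "orte/runtime/orte_globals.h"

    static void show_map(orte_job_t *jdata)
    {
        char *output = NULL;

        /* dispatches to orte_dt_print_map(), which now reports the
         * mapping/ranking/binding policies instead of npernode et al. */
        if (OPAL_SUCCESS == opal_dss.print(&output, NULL, jdata->map, ORTE_JOB_MAP)) {
            opal_output(0, "%s", output);
            free(output);
        }
    }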

View file

@ -7,6 +7,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -196,10 +197,6 @@ int orte_dt_size_proc(size_t *size, orte_proc_t *src, opal_data_type_t type)
/* if src is NULL, then that's all we wanted */ /* if src is NULL, then that's all we wanted */
if (NULL == src) return ORTE_SUCCESS; if (NULL == src) return ORTE_SUCCESS;
if (NULL != src->slot_list) {
*size += strlen(src->slot_list);
}
#if OPAL_ENABLE_FT_CR == 1 #if OPAL_ENABLE_FT_CR == 1
if (NULL != src->ckpt_snapshot_ref) { if (NULL != src->ckpt_snapshot_ref) {
*size += strlen(src->ckpt_snapshot_ref); *size += strlen(src->ckpt_snapshot_ref);

View file

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -21,10 +22,11 @@
#include <sys/types.h> #include <sys/types.h>
#include "orte/mca/errmgr/errmgr.h"
#include "opal/dss/dss.h" #include "opal/dss/dss.h"
#include "opal/dss/dss_internal.h" #include "opal/dss/dss_internal.h"
#include "opal/util/opal_sos.h" #include "opal/mca/hwloc/hwloc.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/runtime/data_type_support/orte_dt_support.h" #include "orte/runtime/data_type_support/orte_dt_support.h"
/* /*
@ -474,16 +476,6 @@ int orte_dt_unpack_node(opal_buffer_t *buffer, void *dest,
return rc; return rc;
} }
/* do not unpack the board, socket, and core info */
/* unpack the cpu set */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
&(nodes[i]->cpu_set), &n, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* do not unpack the username */ /* do not unpack the username */
} }
return ORTE_SUCCESS; return ORTE_SUCCESS;
@ -518,14 +510,6 @@ int orte_dt_unpack_proc(opal_buffer_t *buffer, void *dest,
return rc; return rc;
} }
/* unpack the pid */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
(&(procs[i]->pid)), &n, OPAL_PID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the local rank */ /* unpack the local rank */
n = 1; n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
@ -534,7 +518,7 @@ int orte_dt_unpack_proc(opal_buffer_t *buffer, void *dest,
return rc; return rc;
} }
/* unpack the local rank */ /* unpack the node rank */
n = 1; n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
(&(procs[i]->node_rank)), &n, ORTE_NODE_RANK))) { (&(procs[i]->node_rank)), &n, ORTE_NODE_RANK))) {
@ -542,6 +526,16 @@ int orte_dt_unpack_proc(opal_buffer_t *buffer, void *dest,
return rc; return rc;
} }
#if OPAL_HAVE_HWLOC
/* unpack the binding pattern */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
(void*)(&(procs[i]->cpu_bitmap)), &n, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
#endif
/* unpack the state */ /* unpack the state */
n = 1; n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
@ -558,13 +552,6 @@ int orte_dt_unpack_proc(opal_buffer_t *buffer, void *dest,
return rc; return rc;
} }
/* unpack the name of the node where this proc is executing */
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
(void*)(&(procs[i]->nodename)), &n, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the number of restarts */ /* unpack the number of restarts */
n = 1; n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
@ -965,7 +952,7 @@ int orte_dt_unpack_job_state(opal_buffer_t *buffer, void *dest,
* sending a map - hence, we do not pack that field, so don't unpack it here * sending a map - hence, we do not pack that field, so don't unpack it here
*/ */
int orte_dt_unpack_map(opal_buffer_t *buffer, void *dest, int orte_dt_unpack_map(opal_buffer_t *buffer, void *dest,
int32_t *num_vals, opal_data_type_t type) int32_t *num_vals, opal_data_type_t type)
{ {
int rc; int rc;
int32_t i, n; int32_t i, n;
@ -990,34 +977,31 @@ int orte_dt_unpack_map(opal_buffer_t *buffer, void *dest,
return rc; return rc;
} }
/* unpack the mapper used */ /* unpack the policies */
n = 1; n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
&(maps[i]->last_mapper), &n, OPAL_STRING))) { &(maps[i]->mapping), &n, ORTE_MAPPING_POLICY))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return rc; return rc;
} }
/* unpack the policy */
n = 1; n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
&(maps[i]->policy), &n, ORTE_MAPPING_POLICY))) { &(maps[i]->ranking), &n, ORTE_RANKING_POLICY))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return rc; return rc;
} }
#if OPAL_HAVE_HWLOC
/* unpack the #procs/node */
n = 1; n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
&(maps[i]->npernode), &n, ORTE_STD_CNTR))) { &(maps[i]->binding), &n, OPAL_BINDING_POLICY))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return rc; return rc;
} }
#endif
/* unpack the oversubscribe flag */ /* unpack the ppr */
n = 1; n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
&(maps[i]->oversubscribe), &n, OPAL_BOOL))) { &(maps[i]->ppr), &n, OPAL_STRING))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return rc; return rc;
} }
@ -1029,28 +1013,6 @@ int orte_dt_unpack_map(opal_buffer_t *buffer, void *dest,
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return rc; return rc;
} }
/* unpack the number of daemons to be created */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &(maps[i]->num_new_daemons), &n, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the starting vpid of the new daemons */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &(maps[i]->daemon_vpid_start), &n, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the number of nodes */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
&(maps[i]->num_nodes), &n, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
return rc;
}
} }
return ORTE_SUCCESS; return ORTE_SUCCESS;

View file

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
@ -27,7 +27,7 @@
#endif #endif
#include "opal/mca/base/mca_base_param.h" #include "opal/mca/base/mca_base_param.h"
#include "opal/mca/paffinity/paffinity.h" #include "opal/mca/hwloc/hwloc.h"
#include "opal/util/argv.h" #include "opal/util/argv.h"
#include "opal/util/output.h" #include "opal/util/output.h"
#include "opal/util/opal_sos.h" #include "opal/util/opal_sos.h"
@ -146,24 +146,15 @@ bool orte_assume_same_shell = true;
/* report launch progress */ /* report launch progress */
bool orte_report_launch_progress = false; bool orte_report_launch_progress = false;
/* cluster hardware info */
uint8_t orte_default_num_boards;
uint8_t orte_default_num_sockets_per_board;
uint8_t orte_default_num_cores_per_socket;
/* allocation specification */ /* allocation specification */
char *orte_default_cpu_set;
char *orte_default_hostfile = NULL; char *orte_default_hostfile = NULL;
char *orte_rankfile; char *orte_rankfile = NULL;
#ifdef __WINDOWS__ #ifdef __WINDOWS__
char *orte_ccp_headnode; char *orte_ccp_headnode;
#endif #endif
int orte_num_allocated_nodes = 0; int orte_num_allocated_nodes = 0;
char *orte_node_regex = NULL; char *orte_node_regex = NULL;
/* default rank assigment and binding policy */
orte_mapping_policy_t orte_default_mapping_policy = 0;
/* tool communication controls */ /* tool communication controls */
bool orte_report_events = false; bool orte_report_events = false;
char *orte_report_events_uri = NULL; char *orte_report_events_uri = NULL;
@ -705,7 +696,6 @@ static void orte_job_construct(orte_job_t* job)
job->map = NULL; job->map = NULL;
job->bookmark = NULL; job->bookmark = NULL;
job->oversubscribe_override = false;
job->state = ORTE_JOB_STATE_UNDEF; job->state = ORTE_JOB_STATE_UNDEF;
job->num_launched = 0; job->num_launched = 0;
@ -839,15 +829,6 @@ static void orte_node_construct(orte_node_t* node)
node->slots_alloc = 0; node->slots_alloc = 0;
node->slots_max = 0; node->slots_max = 0;
node->boards = orte_default_num_boards;
node->sockets_per_board = orte_default_num_sockets_per_board;
node->cores_per_socket = orte_default_num_cores_per_socket;
if (NULL != orte_default_cpu_set) {
node->cpu_set = strdup(orte_default_cpu_set);
} else {
node->cpu_set = NULL;
}
node->username = NULL; node->username = NULL;
#if OPAL_HAVE_HWLOC #if OPAL_HAVE_HWLOC
@ -862,6 +843,7 @@ static void orte_node_destruct(orte_node_t* node)
{ {
int i; int i;
opal_node_stats_t *stats; opal_node_stats_t *stats;
orte_proc_t *proc;
if (NULL != node->name) { if (NULL != node->name) {
free(node->name); free(node->name);
@ -880,18 +862,15 @@ static void orte_node_destruct(orte_node_t* node)
} }
for (i=0; i < node->procs->size; i++) { for (i=0; i < node->procs->size; i++) {
if (NULL != node->procs->addr[i]) { if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
((orte_proc_t*)(node->procs->addr[i]))->node = NULL; opal_pointer_array_set_item(node->procs, i, NULL);
OBJ_RELEASE(node->procs->addr[i]); OBJ_RELEASE(proc);
node->procs->addr[i] = NULL;
} }
} }
OBJ_RELEASE(node->procs); OBJ_RELEASE(node->procs);
if (NULL != node->cpu_set) { /* we release the topology elsewhere */
free(node->cpu_set);
node->cpu_set = NULL;
}
if (NULL != node->username) { if (NULL != node->username) {
free(node->username); free(node->username);
node->username = NULL; node->username = NULL;
@ -925,8 +904,9 @@ static void orte_proc_construct(orte_proc_t* proc)
proc->app_idx = 0; proc->app_idx = 0;
#if OPAL_HAVE_HWLOC #if OPAL_HAVE_HWLOC
proc->locale = NULL; proc->locale = NULL;
proc->bind_idx = 0;
proc->cpu_bitmap = NULL;
#endif #endif
proc->slot_list = NULL;
proc->node = NULL; proc->node = NULL;
proc->prior_node = NULL; proc->prior_node = NULL;
proc->nodename = NULL; proc->nodename = NULL;
@ -957,11 +937,11 @@ static void orte_proc_destruct(orte_proc_t* proc)
* associated node object - the node object * associated node object - the node object
* will free it * will free it
*/ */
#if OPAL_HAVE_HWLOC
if (NULL != proc->slot_list) { if (NULL != proc->cpu_bitmap) {
free(proc->slot_list); free(proc->cpu_bitmap);
proc->slot_list = NULL;
} }
#endif
if (NULL != proc->node) { if (NULL != proc->node) {
OBJ_RELEASE(proc->node); OBJ_RELEASE(proc->node);
@ -1000,21 +980,14 @@ static void orte_nid_construct(orte_nid_t *ptr)
ptr->name = NULL; ptr->name = NULL;
ptr->daemon = ORTE_VPID_INVALID; ptr->daemon = ORTE_VPID_INVALID;
ptr->oversubscribed = false; ptr->oversubscribed = false;
OBJ_CONSTRUCT(&ptr->sysinfo, opal_list_t);
} }
static void orte_nid_destruct(orte_nid_t *ptr) static void orte_nid_destruct(orte_nid_t *ptr)
{ {
opal_list_item_t *item;
if (NULL != ptr->name) { if (NULL != ptr->name) {
free(ptr->name); free(ptr->name);
ptr->name = NULL; ptr->name = NULL;
} }
while (NULL != (item = opal_list_remove_first(&ptr->sysinfo))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&ptr->sysinfo);
} }
OBJ_CLASS_INSTANCE(orte_nid_t, OBJ_CLASS_INSTANCE(orte_nid_t,
@ -1039,6 +1012,9 @@ static void orte_jmap_construct(orte_jmap_t *ptr)
{ {
ptr->job = ORTE_JOBID_INVALID; ptr->job = ORTE_JOBID_INVALID;
ptr->num_procs = 0; ptr->num_procs = 0;
#if OPAL_HAVE_HWLOC
ptr->bind_level = OPAL_HWLOC_NODE_LEVEL;
#endif
OBJ_CONSTRUCT(&ptr->pmap, opal_pointer_array_t); OBJ_CONSTRUCT(&ptr->pmap, opal_pointer_array_t);
opal_pointer_array_init(&ptr->pmap, opal_pointer_array_init(&ptr->pmap,
ORTE_GLOBAL_ARRAY_BLOCK_SIZE, ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
@ -1048,12 +1024,13 @@ static void orte_jmap_construct(orte_jmap_t *ptr)
static void orte_jmap_destruct(orte_jmap_t *ptr) static void orte_jmap_destruct(orte_jmap_t *ptr)
{ {
orte_pmap_t **pmaps; orte_pmap_t *pmap;
int i; int i;
pmaps = (orte_pmap_t**)ptr->pmap.addr; for (i=0; i < ptr->pmap.size; i++) {
for (i=0; i < ptr->pmap.size && NULL != pmaps[i]; i++) { if (NULL != (pmap = (orte_pmap_t*)opal_pointer_array_get_item(&ptr->pmap, i))) {
OBJ_RELEASE(pmaps[i]); OBJ_RELEASE(pmap);
}
} }
OBJ_DESTRUCT(&ptr->pmap); OBJ_DESTRUCT(&ptr->pmap);
} }
@ -1064,20 +1041,19 @@ OBJ_CLASS_INSTANCE(orte_jmap_t,
orte_jmap_destruct); orte_jmap_destruct);
static void orte_job_map_construct(orte_job_map_t* map) static void orte_job_map_construct(orte_job_map_t* map)
{ {
map->req_mapper = NULL; map->req_mapper = NULL;
map->last_mapper = NULL; map->last_mapper = NULL;
map->policy = 0; map->mapping = 0;
map->npernode = 0; map->ranking = 0;
map->nperboard = 0; #if OPAL_HAVE_HWLOC
map->npersocket = 0; map->binding = 0;
map->bind_level = OPAL_HWLOC_NODE_LEVEL;
#endif
map->ppr = NULL;
map->cpus_per_rank = 1; map->cpus_per_rank = 1;
map->stride = 1;
map->oversubscribe = true; /* default to allowing oversubscribe */
map->display_map = false; map->display_map = false;
map->cpu_lists = false;
map->num_new_daemons = 0; map->num_new_daemons = 0;
map->daemon_vpid_start = ORTE_VPID_INVALID; map->daemon_vpid_start = ORTE_VPID_INVALID;
map->num_nodes = 0; map->num_nodes = 0;
@ -1091,17 +1067,21 @@ static void orte_job_map_construct(orte_job_map_t* map)
static void orte_job_map_destruct(orte_job_map_t* map) static void orte_job_map_destruct(orte_job_map_t* map)
{ {
orte_std_cntr_t i; orte_std_cntr_t i;
orte_node_t *node;
if (NULL != map->req_mapper) { if (NULL != map->req_mapper) {
free(map->req_mapper); free(map->req_mapper);
} }
if (NULL != map->last_mapper) { if (NULL != map->last_mapper) {
free(map->last_mapper); free(map->last_mapper);
} }
if (NULL != map->ppr) {
free(map->ppr);
}
for (i=0; i < map->nodes->size; i++) { for (i=0; i < map->nodes->size; i++) {
if (NULL != map->nodes->addr[i]) { if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
OBJ_RELEASE(map->nodes->addr[i]); OBJ_RELEASE(node);
map->nodes->addr[i] = NULL; opal_pointer_array_set_item(map->nodes, i, NULL);
} }
} }
OBJ_RELEASE(map->nodes); OBJ_RELEASE(map->nodes);
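(Editor's illustration, not part of the diff: the destructors above stop reaching into ->addr[] and instead go through the pointer-array accessors, clearing each slot before releasing the object. The same pattern in isolation, with an invented helper name.)

    #include "opal/class/opal_pointer_array.h"

    static void release_all_items(opal_pointer_array_t *array)
    {
        int i;
        opal_object_t *item;

        for (i = 0; i < array->size; i++) {
            if (NULL != (item = (opal_object_t*)opal_pointer_array_get_item(array, i))) {
                /* clear the slot first so no stale pointer remains visible,
                 * then drop our reference */
                opal_pointer_array_set_item(array, i, NULL);
                OBJ_RELEASE(item);
            }
        }
    }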

View file

@ -10,7 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2007-2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2007-2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -85,8 +85,6 @@ ORTE_DECLSPEC extern bool orte_in_parallel_debugger;
/* error manager callback function */ /* error manager callback function */
typedef void (*orte_err_cb_fn_t)(orte_process_name_t *proc, orte_proc_state_t state, void *cbdata); typedef void (*orte_err_cb_fn_t)(orte_process_name_t *proc, orte_proc_state_t state, void *cbdata);
typedef uint16_t orte_mapping_policy_t;
ORTE_DECLSPEC extern int orte_exit_status; ORTE_DECLSPEC extern int orte_exit_status;
#if ORTE_DISABLE_FULL_SUPPORT #if ORTE_DISABLE_FULL_SUPPORT
@ -169,7 +167,20 @@ typedef struct orte_app_context_t orte_app_context_t;
} \ } \
} while(0); } while(0);
/* define a set of flags to control the launch of a job */
typedef uint16_t orte_job_controls_t;
#define ORTE_JOB_CONTROL OPAL_UINT16
#define ORTE_JOB_CONTROL_LOCAL_SLAVE 0x0001
#define ORTE_JOB_CONTROL_NON_ORTE_JOB 0x0002
#define ORTE_JOB_CONTROL_DEBUGGER_DAEMON 0x0014
#define ORTE_JOB_CONTROL_FORWARD_OUTPUT 0x0008
#define ORTE_JOB_CONTROL_DO_NOT_MONITOR 0x0010
#define ORTE_JOB_CONTROL_FORWARD_COMM 0x0020
#define ORTE_JOB_CONTROL_CONTINUOUS_OP 0x0040
#define ORTE_JOB_CONTROL_RECOVERABLE 0x0080
#define ORTE_JOB_CONTROL_SPIN_FOR_DEBUG 0x0100
/* global type definitions used by RTE - instanced in orte_globals.c */ /* global type definitions used by RTE - instanced in orte_globals.c */
/************ /************
@ -264,6 +275,8 @@ typedef struct {
orte_node_rank_t next_node_rank; orte_node_rank_t next_node_rank;
/* whether or not we are oversubscribed */ /* whether or not we are oversubscribed */
bool oversubscribed; bool oversubscribed;
/* whether we have been added to the current map */
bool mapped;
/** State of this node */ /** State of this node */
orte_node_state_t state; orte_node_state_t state;
/** A "soft" limit on the number of slots available on the node. /** A "soft" limit on the number of slots available on the node.
@ -290,14 +303,6 @@ typedef struct {
specified limit. For example, if we have two processors, we specified limit. For example, if we have two processors, we
may want to allow up to four processes but no more. */ may want to allow up to four processes but no more. */
orte_std_cntr_t slots_max; orte_std_cntr_t slots_max;
/* number of physical boards in the node - defaults to 1 */
uint8_t boards;
/* number of sockets on each board - defaults to 1 */
uint8_t sockets_per_board;
/* number of cores per socket - defaults to 1 */
uint8_t cores_per_socket;
/* cpus on this node that are assigned for our use */
char *cpu_set;
/** Username on this node, if specified */ /** Username on this node, if specified */
char *username; char *username;
#if OPAL_HAVE_HWLOC #if OPAL_HAVE_HWLOC
@ -309,70 +314,6 @@ typedef struct {
} orte_node_t; } orte_node_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_node_t); ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_node_t);
/* define a set of flags to control the launch of a job */
typedef uint16_t orte_job_controls_t;
#define ORTE_JOB_CONTROL OPAL_UINT16
#define ORTE_JOB_CONTROL_LOCAL_SLAVE 0x0001
#define ORTE_JOB_CONTROL_NON_ORTE_JOB 0x0002
#define ORTE_JOB_CONTROL_DEBUGGER_DAEMON 0x0014
#define ORTE_JOB_CONTROL_FORWARD_OUTPUT 0x0008
#define ORTE_JOB_CONTROL_DO_NOT_MONITOR 0x0010
#define ORTE_JOB_CONTROL_FORWARD_COMM 0x0020
#define ORTE_JOB_CONTROL_CONTINUOUS_OP 0x0040
#define ORTE_JOB_CONTROL_RECOVERABLE 0x0080
#define ORTE_JOB_CONTROL_SPIN_FOR_DEBUG 0x0100
#define ORTE_MAPPING_POLICY OPAL_UINT16
/* put the rank assignment method in the upper 8 bits */
#define ORTE_MAPPING_USE_VM 0x0100
#define ORTE_MAPPING_BYNODE 0x0200
#define ORTE_MAPPING_BYSLOT 0x0400
#define ORTE_MAPPING_BYSOCKET 0x0800
#define ORTE_MAPPING_BYBOARD 0x1000
#define ORTE_MAPPING_NO_USE_LOCAL 0x2000
#define ORTE_MAPPING_NPERXXX 0x4000
#define ORTE_MAPPING_BYUSER 0x8000
/* check if policy is set */
#define ORTE_MAPPING_POLICY_IS_SET(pol) (pol & 0xff00)
/* nice macro for setting these */
#define ORTE_SET_MAPPING_POLICY(pol) \
orte_default_mapping_policy = (orte_default_mapping_policy & 0x00ff) | (pol);
/* macro to detect if some other policy has been set */
#define ORTE_XSET_MAPPING_POLICY(pol) \
do { \
orte_mapping_policy_t tmp; \
tmp = (orte_default_mapping_policy & 0xff00) & ~(pol); \
if (0 == tmp) { \
ORTE_SET_MAPPING_POLICY((pol)); \
} \
} while(0);
/* macro to add another mapping policy */
#define ORTE_ADD_MAPPING_POLICY(pol) \
orte_default_mapping_policy |= (pol);
/* put the binding policy in the lower 8 bits, using the paffinity values */
#define ORTE_BIND_TO_NONE (uint16_t)OPAL_PAFFINITY_DO_NOT_BIND
#define ORTE_BIND_TO_CORE (uint16_t)OPAL_PAFFINITY_BIND_TO_CORE
#define ORTE_BIND_TO_SOCKET (uint16_t)OPAL_PAFFINITY_BIND_TO_SOCKET
#define ORTE_BIND_TO_BOARD (uint16_t)OPAL_PAFFINITY_BIND_TO_BOARD
#define ORTE_BIND_IF_SUPPORTED (uint16_t)OPAL_PAFFINITY_BIND_IF_SUPPORTED
/* nice macro for setting these */
#define ORTE_SET_BINDING_POLICY(pol) \
orte_default_mapping_policy = (orte_default_mapping_policy & 0xff00) | (pol);
/* macro to detect if some other policy has been set */
#define ORTE_XSET_BINDING_POLICY(pol) \
do { \
orte_mapping_policy_t tmp; \
tmp = (orte_default_mapping_policy & 0x00ff) & ~(pol); \
if (0 == tmp) { \
ORTE_SET_BINDING_POLICY((pol)); \
} \
} while(0);
/* macro to detect if binding was qualified */
#define ORTE_BINDING_NOT_REQUIRED(n) \
(ORTE_BIND_IF_SUPPORTED & (n))
typedef struct { typedef struct {
/** Base object so this can be put on a list */ /** Base object so this can be put on a list */
opal_list_item_t super; opal_list_item_t super;
@ -406,11 +347,6 @@ typedef struct {
* indicates the node where we stopped * indicates the node where we stopped
*/ */
orte_node_t *bookmark; orte_node_t *bookmark;
/** Whether or not to override oversubscription based on local
* hardware - used to indicate uncertainty in number of
* actual processors available on this node
*/
bool oversubscribe_override;
/* state of the overall job */ /* state of the overall job */
orte_job_state_t state; orte_job_state_t state;
/* number of procs launched */ /* number of procs launched */
@ -484,9 +420,11 @@ struct orte_proc_t {
#if OPAL_HAVE_HWLOC #if OPAL_HAVE_HWLOC
/* hwloc object to which this process was mapped */ /* hwloc object to which this process was mapped */
hwloc_obj_t locale; hwloc_obj_t locale;
/* where the proc was bound */
unsigned int bind_idx;
/* string representation of cpu bindings */
char *cpu_bitmap;
#endif #endif
/* a cpu list, if specified by the user */
char *slot_list;
/* pointer to the node where this proc is executing */ /* pointer to the node where this proc is executing */
orte_node_t *node; orte_node_t *node;
/* pointer to the node where this proc last executed */ /* pointer to the node where this proc last executed */
@ -533,8 +471,6 @@ typedef struct {
orte_vpid_t daemon; orte_vpid_t daemon;
/* whether or not this node is oversubscribed */ /* whether or not this node is oversubscribed */
bool oversubscribed; bool oversubscribed;
/* list of system info */
opal_list_t sysinfo;
} orte_nid_t; } orte_nid_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_nid_t); ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_nid_t);
@ -559,6 +495,10 @@ typedef struct {
orte_jobid_t job; orte_jobid_t job;
/* number of procs in this job */ /* number of procs in this job */
orte_vpid_t num_procs; orte_vpid_t num_procs;
#if OPAL_HAVE_HWLOC
/* binding level of the job */
opal_hwloc_level_t bind_level;
#endif
/* array of data for procs */ /* array of data for procs */
opal_pointer_array_t pmap; opal_pointer_array_t pmap;
} orte_jmap_t; } orte_jmap_t;
@ -673,13 +613,7 @@ ORTE_DECLSPEC extern bool orte_assume_same_shell;
/* whether or not to report launch progress */ /* whether or not to report launch progress */
ORTE_DECLSPEC extern bool orte_report_launch_progress; ORTE_DECLSPEC extern bool orte_report_launch_progress;
/* cluster hardware info */
ORTE_DECLSPEC extern uint8_t orte_default_num_boards;
ORTE_DECLSPEC extern uint8_t orte_default_num_sockets_per_board;
ORTE_DECLSPEC extern uint8_t orte_default_num_cores_per_socket;
/* allocation specification */ /* allocation specification */
ORTE_DECLSPEC extern char *orte_default_cpu_set;
ORTE_DECLSPEC extern char *orte_default_hostfile; ORTE_DECLSPEC extern char *orte_default_hostfile;
ORTE_DECLSPEC extern char *orte_rankfile; ORTE_DECLSPEC extern char *orte_rankfile;
#ifdef __WINDOWS__ #ifdef __WINDOWS__
@ -688,16 +622,10 @@ ORTE_DECLSPEC extern char *orte_ccp_headnode;
ORTE_DECLSPEC extern int orte_num_allocated_nodes; ORTE_DECLSPEC extern int orte_num_allocated_nodes;
ORTE_DECLSPEC extern char *orte_node_regex; ORTE_DECLSPEC extern char *orte_node_regex;
/* default rank assigment and binding policy */
ORTE_DECLSPEC extern orte_mapping_policy_t orte_default_mapping_policy;
/* tool communication controls */ /* tool communication controls */
ORTE_DECLSPEC extern bool orte_report_events; ORTE_DECLSPEC extern bool orte_report_events;
ORTE_DECLSPEC extern char *orte_report_events_uri; ORTE_DECLSPEC extern char *orte_report_events_uri;
/* report bindings */
ORTE_DECLSPEC extern bool orte_report_bindings;
/* barrier control */ /* barrier control */
ORTE_DECLSPEC extern bool orte_do_not_barrier; ORTE_DECLSPEC extern bool orte_do_not_barrier;

View file

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
@ -28,7 +28,6 @@
#include <stdio.h> #include <stdio.h>
#include "opal/mca/base/mca_base_param.h" #include "opal/mca/base/mca_base_param.h"
#include "opal/mca/paffinity/base/base.h"
#include "opal/util/output.h" #include "opal/util/output.h"
#include "opal/util/argv.h" #include "opal/util/argv.h"
@ -43,8 +42,7 @@ static bool passed_thru = false;
int orte_register_params(void) int orte_register_params(void)
{ {
int value, tmp; int value, tmp;
char *strval, **params; char *strval;
uint16_t binding;
/* only go thru this once - mpirun calls it twice, which causes /* only go thru this once - mpirun calls it twice, which causes
* any error messages to show up twice * any error messages to show up twice
@ -234,12 +232,6 @@ int orte_register_params(void)
mca_base_param_reg_string_name("orte", "default_hostfile", mca_base_param_reg_string_name("orte", "default_hostfile",
"Name of the default hostfile (relative or absolute path)", "Name of the default hostfile (relative or absolute path)",
false, false, NULL, &orte_default_hostfile); false, false, NULL, &orte_default_hostfile);
/* rankfile */
tmp = mca_base_param_reg_string_name("orte", "rankfile",
"Name of the rankfile to be used for mapping processes (relative or absolute path)",
false, false, NULL, NULL);
mca_base_param_reg_syn_name(tmp, "rmaps", "rank_file_path", false);
mca_base_param_lookup_string(tmp, &orte_rankfile);
#ifdef __WINDOWS__ #ifdef __WINDOWS__
mca_base_param_reg_string_name("orte", "ccp_headnode", mca_base_param_reg_string_name("orte", "ccp_headnode",
@ -315,11 +307,14 @@ int orte_register_params(void)
"Indicates that multiple app_contexts are being provided that are a mix of 32/64 bit binaries (default: false)", "Indicates that multiple app_contexts are being provided that are a mix of 32/64 bit binaries (default: false)",
false, false, (int) false, &value); false, false, (int) false, &value);
orte_hetero_apps = OPAL_INT_TO_BOOL(value); orte_hetero_apps = OPAL_INT_TO_BOOL(value);
#if OPAL_HAVE_HWLOC
mca_base_param_reg_int_name("orte", "hetero_nodes", mca_base_param_reg_int_name("orte", "hetero_nodes",
"Nodes in cluster may differ in topology, so send the topology back from each node [Default = false]", "Nodes in cluster may differ in topology, so send the topology back from each node [Default = false]",
false, false, (int) false, &value); false, false, (int) false, &value);
orte_hetero_nodes = OPAL_INT_TO_BOOL(value); orte_hetero_nodes = OPAL_INT_TO_BOOL(value);
#endif
/* allow specification of the launch agent */ /* allow specification of the launch agent */
mca_base_param_reg_string_name("orte", "launch_agent", mca_base_param_reg_string_name("orte", "launch_agent",
"Command used to start processes on remote nodes (default: orted)", "Command used to start processes on remote nodes (default: orted)",
@ -394,71 +389,6 @@ int orte_register_params(void)
"cpu model detected in node", "cpu model detected in node",
true, false, NULL, &orte_local_cpu_model); true, false, NULL, &orte_local_cpu_model);
/* cluster hardware info */
mca_base_param_reg_int_name("orte", "num_boards",
"Number of processor boards/node (1-256) [default: 1]",
false, false, 1, &value);
orte_default_num_boards = (uint8_t)value;
mca_base_param_reg_int_name("orte", "num_sockets",
"Number of sockets/board (1-256)",
false, false, 0, &value);
orte_default_num_sockets_per_board = (uint8_t)value;
mca_base_param_reg_int_name("orte", "num_cores",
"Number of cores/socket (1-256)",
false, false, 0, &value);
orte_default_num_cores_per_socket = (uint8_t)value;
/* cpu allocation specification */
mca_base_param_reg_string_name("orte", "cpu_set",
"Comma-separated list of ranges specifying logical cpus allocated to this job [default: none]",
false, false, NULL, &orte_default_cpu_set);
/* binding specification - this will be overridden by any cmd line directive, and
* ignored unless opal_paffinity_alone is set
*/
mca_base_param_reg_string_name("orte", "process_binding",
"Policy for binding processes [none | core | socket | board] (supported qualifier: if-avail)",
false, false, NULL, &strval);
if (NULL != strval) {
if (0 == strcasecmp(strval, "none")) {
/* no binding */
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_NONE);
} else {
binding = 0;
params = opal_argv_split(strval, ':');
if (1 < opal_argv_count(params)) {
if (0 != strcasecmp(params[1], "if-avail")) {
/* unknown option */
opal_output(0, "Unknown qualifier to orte_process_binding: %s", strval);
return ORTE_ERR_BAD_PARAM;
}
binding = ORTE_BIND_IF_SUPPORTED;
}
if (0 == strcasecmp(params[0], "socket")) {
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_SOCKET | binding);
} else if (0 == strcasecmp(params[0], "board")) {
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_BOARD | binding);
} else if (0 == strcasecmp(params[0], "core")) {
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_CORE | binding);
}
}
}
/* if nothing was set, but opal_paffinity_alone is set, then default
* to bind-to-core
*/
if (opal_paffinity_alone) {
ORTE_XSET_BINDING_POLICY(ORTE_BIND_TO_CORE);
}
/* whether or not to report bindings */
mca_base_param_reg_int_name("orte", "report_bindings",
"Report bindings",
false, false,
(int) false, &value);
orte_report_bindings = OPAL_INT_TO_BOOL(value);
/* tool communication controls */ /* tool communication controls */
mca_base_param_reg_string_name("orte", "report_events", mca_base_param_reg_string_name("orte", "report_events",
"URI to which events are to be reported (default: NULL)", "URI to which events are to be reported (default: NULL)",

View file

@ -10,7 +10,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2006-2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights * Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
@ -274,13 +274,13 @@ static void dump_aborted_procs(void)
break; break;
case ORTE_ERR_MULTIPLE_AFFINITIES: case ORTE_ERR_MULTIPLE_AFFINITIES:
orte_show_help("help-orterun.txt", orte_show_help("help-orterun.txt",
"orterun:multiple-paffinity-schemes", true, proc->slot_list); "orterun:multiple-paffinity-schemes", true, NULL);
break; break;
case ORTE_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED: case ORTE_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED:
orte_show_help("help-orterun.txt", orte_show_help("help-orterun.txt",
"orterun:topo-not-supported", "orterun:topo-not-supported",
true, orte_process_info.nodename, "rankfile containing a slot_list of ", true, orte_process_info.nodename, "rankfile containing a slot_list of ",
proc->slot_list, approc->app); NULL, approc->app);
break; break;
case ORTE_ERR_INVALID_NODE_RANK: case ORTE_ERR_INVALID_NODE_RANK:
orte_show_help("help-orterun.txt", orte_show_help("help-orterun.txt",
@ -326,7 +326,7 @@ static void dump_aborted_procs(void)
case ORTE_ERR_SLOT_LIST_RANGE: case ORTE_ERR_SLOT_LIST_RANGE:
orte_show_help("help-orterun.txt", orte_show_help("help-orterun.txt",
"orterun:invalid-slot-list-range", "orterun:invalid-slot-list-range",
true, node->name, proc->slot_list); true, node->name, NULL);
break; break;
case ORTE_ERR_PIPE_READ_FAILURE: case ORTE_ERR_PIPE_READ_FAILURE:
orte_show_help("help-orterun.txt", "orterun:pipe-read-failure", true, orte_show_help("help-orterun.txt", "orterun:pipe-read-failure", true,

View file

@ -13,7 +13,7 @@ int main(int argc, char* argv[])
pid_t pid; pid_t pid;
pid = getpid(); pid = getpid();
printf("Parent [pid %ld] starting up!\n", (long)pid); printf("[pid %ld] starting up!\n", (long)pid);
MPI_Init(NULL, NULL); MPI_Init(NULL, NULL);
MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_rank(MPI_COMM_WORLD, &rank);
printf("%d completed MPI_Init\n", rank); printf("%d completed MPI_Init\n", rank);

View file

@ -10,7 +10,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2006-2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights * Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
@ -50,7 +50,6 @@
#include "opal/mca/event/event.h" #include "opal/mca/event/event.h"
#include "opal/mca/installdirs/installdirs.h" #include "opal/mca/installdirs/installdirs.h"
#include "opal/mca/base/base.h" #include "opal/mca/base/base.h"
#include "opal/mca/paffinity/base/base.h"
#include "opal/util/argv.h" #include "opal/util/argv.h"
#include "opal/util/output.h" #include "opal/util/output.h"
#include "opal/util/opal_sos.h" #include "opal/util/opal_sos.h"
@ -81,6 +80,7 @@
#include "orte/mca/odls/odls.h" #include "orte/mca/odls/odls.h"
#include "orte/mca/plm/plm.h" #include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/plm_private.h" #include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/ras/ras.h"
#include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h" #include "orte/mca/rml/rml_types.h"
#include "orte/mca/rml/base/rml_contact.h" #include "orte/mca/rml/base/rml_contact.h"
@ -254,37 +254,7 @@ static opal_cmd_line_init_t cmd_line_init[] = {
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL, OPAL_CMD_LINE_TYPE_NULL,
"Export an environment variable, optionally specifying a value (e.g., \"-x foo\" exports the environment variable foo and takes its value from the current environment; \"-x foo=bar\" exports the environment variable name foo and sets its value to \"bar\" in the started processes)" }, "Export an environment variable, optionally specifying a value (e.g., \"-x foo\" exports the environment variable foo and takes its value from the current environment; \"-x foo=bar\" exports the environment variable name foo and sets its value to \"bar\" in the started processes)" },
/* Mapping options */ /* Mapping controls */
{ NULL, NULL, NULL, '\0', "bynode", "bynode", 0,
&orterun_globals.by_node, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to assign processes round-robin by node" },
{ NULL, NULL, NULL, '\0', "byslot", "byslot", 0,
&orterun_globals.by_slot, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to assign processes round-robin by slot (the default)" },
{ NULL, NULL, NULL, '\0', "bycore", "bycore", 0,
&orterun_globals.by_slot, OPAL_CMD_LINE_TYPE_BOOL,
"Alias for byslot" },
{ NULL, NULL, NULL, '\0', "bysocket", "bysocket", 0,
&orterun_globals.by_socket, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to assign processes round-robin by socket" },
{ NULL, NULL, NULL, '\0', "byboard", "byboard", 0,
&orterun_globals.by_slot, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to assign processes round-robin by board (equivalent to bynode if only 1 board/node)" },
{ "rmaps", "base", "pernode", '\0', "pernode", "pernode", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Launch one process per available node on the specified number of nodes [no -np => use all allocated nodes]" },
{ "rmaps", "base", "n_pernode", '\0', "npernode", "npernode", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Launch n processes per node on all allocated nodes" },
{ "rmaps", "base", "slot_list", '\0', "slot-list", "slot-list", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"List of processor IDs to bind MPI processes to (e.g., used in conjunction with rank files)" },
{ "rmaps", "base", "no_oversubscribe", '\0', "nooversubscribe", "nooversubscribe", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Nodes are not to be oversubscribed, even if the system supports such operation"},
{ "rmaps", "base", "loadbalance", '\0', "loadbalance", "loadbalance", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Balance total number of procs across all allocated nodes"},
{ "rmaps", "base", "display_map", '\0', "display-map", "display-map", 0, { "rmaps", "base", "display_map", '\0', "display-map", "display-map", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL, NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Display the process map just before launch"}, "Display the process map just before launch"},
@ -303,39 +273,98 @@ static opal_cmd_line_init_t cmd_line_init[] = {
{ "rmaps", "base", "no_schedule_local", '\0', "nolocal", "nolocal", 0, { "rmaps", "base", "no_schedule_local", '\0', "nolocal", "nolocal", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL, NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Do not run any MPI applications on the local node" }, "Do not run any MPI applications on the local node" },
{ "rmaps", "base", "no_oversubscribe", '\0', "nooversubscribe", "nooversubscribe", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Nodes are not to be oversubscribed, even if the system supports such operation"},
{ "rmaps", "base", "oversubscribe", '\0', "oversubscribe", "oversubscribe", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Nodes are allowed to be oversubscribed, even on a managed system"},
#if 0
{ "rmaps", "base", "cpus_per_rank", '\0', "cpus-per-proc", "cpus-per-proc", 1, { "rmaps", "base", "cpus_per_rank", '\0', "cpus-per-proc", "cpus-per-proc", 1,
NULL, OPAL_CMD_LINE_TYPE_INT, NULL, OPAL_CMD_LINE_TYPE_INT,
"Number of cpus to use for each process [default=1]" }, "Number of cpus to use for each process [default=1]" },
{ "rmaps", "base", "cpus_per_rank", '\0', "cpus-per-rank", "cpus-per-rank", 1, { "rmaps", "base", "cpus_per_rank", '\0', "cpus-per-rank", "cpus-per-rank", 1,
NULL, OPAL_CMD_LINE_TYPE_INT, NULL, OPAL_CMD_LINE_TYPE_INT,
"Synonym for cpus-per-proc" }, "Synonym for cpus-per-proc" },
{ "rmaps", "base", "n_perboard", '\0', "nperboard", "nperboard", 1, #endif
NULL, OPAL_CMD_LINE_TYPE_INT,
"Launch n processes per board on all allocated nodes" }, /* backward compatiblity */
{ "rmaps", "base", "n_persocket", '\0', "npersocket", "npersocket", 1, { "rmaps", "base", "bynode", '\0', "bynode", "bynode", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to map and rank processes round-robin by node" },
{ "rmaps", "base", "byslot", '\0', "byslot", "byslot", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to map and rank processes round-robin by slot" },
/* Nperxxx options that do not require topology and are always
* available - included for backwards compatibility
*/
{ "rmaps", "ppr", "pernode", '\0', "pernode", "pernode", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Launch one process per available node" },
{ "rmaps", "ppr", "n_pernode", '\0', "npernode", "npernode", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Launch n processes per node on all allocated nodes" },
#if OPAL_HAVE_HWLOC
/* declare hardware threads as independent cpus */
{ "hwloc", "base", "use_hwthreads_as_cpus", '\0', "use-hwthread-cpus", "use-hwthread-cpus", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Use hardware threads as independent cpus" },
/* include npersocket for backwards compatibility */
{ "rmaps", "ppr", "n_persocket", '\0', "npersocket", "npersocket", 1,
NULL, OPAL_CMD_LINE_TYPE_INT, NULL, OPAL_CMD_LINE_TYPE_INT,
"Launch n processes per socket on all allocated nodes" }, "Launch n processes per socket on all allocated nodes" },
/* binding options */ /* Mapping options */
{ NULL, NULL, NULL, '\0', "bind-to-none", "bind-to-none", 0, { "rmaps", "base", "mapping_policy", '\0', NULL, "map-by", 1,
&orterun_globals.bind_to_none, OPAL_CMD_LINE_TYPE_BOOL, NULL, OPAL_CMD_LINE_TYPE_STRING,
"Do not bind processes to cores or sockets (default)" }, "Mapping Policy [slot (default) | hwthread | core | socket | numa | board | node]" },
{ NULL, NULL, NULL, '\0', "bind-to-core", "bind-to-core", 0,
&orterun_globals.bind_to_core, OPAL_CMD_LINE_TYPE_BOOL, /* Ranking options */
"Whether to bind processes to specific cores" }, { "rmaps", "base", "ranking_policy", '\0', NULL, "rank-by", 1,
{ NULL, NULL, NULL, '\0', "bind-to-board", "bind-to-board", 0, NULL, OPAL_CMD_LINE_TYPE_STRING,
&orterun_globals.bind_to_board, OPAL_CMD_LINE_TYPE_BOOL, "Ranking Policy [slot (default) | hwthread | core | socket | numa | board | node]" },
"Whether to bind processes to specific boards (meaningless on 1 board/node)" },
{ NULL, NULL, NULL, '\0', "bind-to-socket", "bind-to-socket", 0, /* Binding options */
&orterun_globals.bind_to_socket, OPAL_CMD_LINE_TYPE_BOOL, { "hwloc", "base", "binding_policy", '\0', NULL, "bind-to", 1,
"Whether to bind processes to sockets" }, NULL, OPAL_CMD_LINE_TYPE_STRING,
{ "rmaps", "base", "stride", '\0', "stride", "stride", 1, "Policy for binding processes [none (default) | hwthread | core | socket | numa | board] (supported qualifiers: overload-allowed,if-supported)" },
NULL, OPAL_CMD_LINE_TYPE_INT,
"When binding multiple cores to a rank, the step size to use between cores [default: 1]" }, /* backward compatiblity */
{ "orte", "report", "bindings", '\0', "report-bindings", "report-bindings", 0, { "hwloc", "base", "bind_to_core", '\0', "bind-to-core", "bind-to-core", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Bind processes to cores" },
{ "hwloc", "base", "bind_to_socket", '\0', "bind-to-socket", "bind-to-socket", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Bind processes to sockets" },
{ "hwloc", "base", "report_bindings", '\0', "report-bindings", "report-bindings", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL, NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to report process bindings to stderr" }, "Whether to report process bindings to stderr" },
/* slot list option */
{ "hwloc", "base", "slot_list", '\0', "slot-list", "slot-list", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"List of processor IDs to bind processes to [default=NULL]"},
/* generalized pattern mapping option */
{ "rmaps", "ppr", "pattern", '\0', NULL, "ppr", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Comma-separated list of number of processes on a given resource type [default: none]" },
#else
/* Mapping options */
{ "rmaps", "base", "mapping_policy", '\0', NULL, "map-by", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Mapping Policy [slot (default) | node]" },
/* Ranking options */
{ "rmaps", "base", "ranking_policy", '\0', NULL, "rank-by", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Ranking Policy [slot (default) | node]" },
#endif
/* Allocation options */ /* Allocation options */
{ "ras", "base", "display_alloc", '\0', "display-allocation", "display-allocation", 0, { "ras", "base", "display_alloc", '\0', "display-allocation", "display-allocation", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL, NULL, OPAL_CMD_LINE_TYPE_BOOL,
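(Editor's illustration, not part of the diff: taken together, the entries above replace the old --bynode/--byslot and --bind-to-* flags with three orthogonal directives. A representative invocation would be

    mpirun -np 8 --map-by socket --rank-by core --bind-to core ./a.out

where the accepted policy names are exactly those listed in the help strings: slot, hwthread, core, socket, numa, board, node for mapping and ranking, and none, hwthread, core, socket, numa, board (plus the overload-allowed and if-supported qualifiers) for binding.)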
@ -343,20 +372,14 @@ static opal_cmd_line_init_t cmd_line_init[] = {
{ "ras", "base", "display_devel_alloc", '\0', "display-devel-allocation", "display-devel-allocation", 0, { "ras", "base", "display_devel_alloc", '\0', "display-devel-allocation", "display-devel-allocation", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL, NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Display a detailed list (mostly intended for developers) of the allocation being used by this job"}, "Display a detailed list (mostly intended for developers) of the allocation being used by this job"},
{ "orte", "cpu", "set", '\0', "cpu-set", "cpu-set", 1, #if OPAL_HAVE_HWLOC
{ "hwloc", "base", "cpu_set", '\0', "cpu-set", "cpu-set", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING, NULL, OPAL_CMD_LINE_TYPE_STRING,
"Comma-separated list of ranges specifying logical cpus allocated to this job [default: none]"}, "Comma-separated list of ranges specifying logical cpus allocated to this job [default: none]"},
#endif
/* cluster hardware info */ { NULL, NULL, NULL, 'H', "host", "host", 1,
{ "orte", "num", "boards", '\0', "num-boards", "num-boards", 1, NULL, OPAL_CMD_LINE_TYPE_STRING,
NULL, OPAL_CMD_LINE_TYPE_INT, "List of hosts to invoke processes on" },
"Number of processor boards/node (1-256) [default: 1]"},
{ "orte", "num", "sockets", '\0', "num-sockets", "num-sockets", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Number of sockets/board (1-256) [default: 1]"},
{ "orte", "num", "cores", '\0', "num-cores", "num-cores", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Number of cores/socket (1-256) [default: 1]"},
/* mpiexec-like arguments */ /* mpiexec-like arguments */
{ NULL, NULL, NULL, '\0', "wdir", "wdir", 1, { NULL, NULL, NULL, '\0', "wdir", "wdir", 1,
@ -435,13 +458,11 @@ static opal_cmd_line_init_t cmd_line_init[] = {
NULL, OPAL_CMD_LINE_TYPE_INT, NULL, OPAL_CMD_LINE_TYPE_INT,
"Max number of times to restart a failed process" }, "Max number of times to restart a failed process" },
{ "orte", "vm", "launch", '\0', "vm", "vm", 0, #if OPAL_HAVE_HWLOC
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Launch daemons on all nodes at start to create a virtual machine [Default = false]" },
{ "orte", "hetero", "nodes", '\0', NULL, "hetero-nodes", 0, { "orte", "hetero", "nodes", '\0', NULL, "hetero-nodes", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL, NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Nodes in cluster may differ in topology, so send the topology back from each node [Default = false]" }, "Nodes in cluster may differ in topology, so send the topology back from each node [Default = false]" },
#endif
#if OPAL_ENABLE_CRDEBUG == 1 #if OPAL_ENABLE_CRDEBUG == 1
{ "opal", "cr", "enable_crdebug", '\0', "crdebug", "crdebug", 0, { "opal", "cr", "enable_crdebug", '\0', "crdebug", "crdebug", 0,
@ -477,6 +498,8 @@ int orterun(int argc, char *argv[])
char * tmp_env_var = NULL; char * tmp_env_var = NULL;
orte_debugger_breakpoint_fn_t foo; orte_debugger_breakpoint_fn_t foo;
orte_job_t *daemons; orte_job_t *daemons;
int32_t ljob, i;
orte_app_context_t *app, *dapp;
/* find our basename (the name of the executable) so that we can /* find our basename (the name of the executable) so that we can
use it in pretty-print error messages */ use it in pretty-print error messages */
@ -572,7 +595,9 @@ int orterun(int argc, char *argv[])
*/ */
jdata = OBJ_NEW(orte_job_t); jdata = OBJ_NEW(orte_job_t);
if (NULL == jdata) { if (NULL == jdata) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); /* cannot call ORTE_ERROR_LOG as the errmgr
* hasn't been loaded yet!
*/
return ORTE_ERR_OUT_OF_RESOURCE; return ORTE_ERR_OUT_OF_RESOURCE;
} }
@ -625,7 +650,9 @@ int orterun(int argc, char *argv[])
* require * require
*/ */
if (ORTE_SUCCESS != (rc = orte_init(&argc, &argv, ORTE_PROC_HNP))) { if (ORTE_SUCCESS != (rc = orte_init(&argc, &argv, ORTE_PROC_HNP))) {
ORTE_ERROR_LOG(rc); /* cannot call ORTE_ERROR_LOG as it could be the errmgr
* never got loaded!
*/
return rc; return rc;
} }
/* finalize the OPAL utils. As they are opened again from orte_init->opal_init /* finalize the OPAL utils. As they are opened again from orte_init->opal_init
@ -633,6 +660,9 @@ int orterun(int argc, char *argv[])
*/ */
opal_finalize_util(); opal_finalize_util();
/* get the daemon job object */
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
/* check for request to report uri */ /* check for request to report uri */
if (NULL != orterun_globals.report_uri) { if (NULL != orterun_globals.report_uri) {
FILE *fp; FILE *fp;
@ -678,14 +708,25 @@ int orterun(int argc, char *argv[])
Since there always MUST be at least one app_context, we are safe in Since there always MUST be at least one app_context, we are safe in
doing this. doing this.
*/ */
if (NULL != ((orte_app_context_t*)jdata->apps->addr[0])->prefix_dir) { if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0)) &&
NULL != app->prefix_dir) {
char *oldenv, *newenv, *lib_base, *bin_base; char *oldenv, *newenv, *lib_base, *bin_base;
/* copy the prefix into the daemon job so that any launcher
* can find the orteds when we launch the virtual machine
*/
if (NULL == (dapp = (orte_app_context_t*)opal_pointer_array_get_item(daemons->apps, 0))) {
/* that's an error in the ess */
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
dapp->prefix_dir = strdup(app->prefix_dir);
lib_base = opal_basename(opal_install_dirs.libdir); lib_base = opal_basename(opal_install_dirs.libdir);
bin_base = opal_basename(opal_install_dirs.bindir); bin_base = opal_basename(opal_install_dirs.bindir);
/* Reset PATH */ /* Reset PATH */
newenv = opal_os_path( false, ((orte_app_context_t*)jdata->apps->addr[0])->prefix_dir, bin_base, NULL ); newenv = opal_os_path( false, app->prefix_dir, bin_base, NULL );
oldenv = getenv("PATH"); oldenv = getenv("PATH");
if (NULL != oldenv) { if (NULL != oldenv) {
char *temp; char *temp;
@ -701,7 +742,7 @@ int orterun(int argc, char *argv[])
free(bin_base); free(bin_base);
/* Reset LD_LIBRARY_PATH */ /* Reset LD_LIBRARY_PATH */
newenv = opal_os_path( false, ((orte_app_context_t*)jdata->apps->addr[0])->prefix_dir, lib_base, NULL ); newenv = opal_os_path( false, app->prefix_dir, lib_base, NULL );
oldenv = getenv("LD_LIBRARY_PATH"); oldenv = getenv("LD_LIBRARY_PATH");
if (NULL != oldenv) { if (NULL != oldenv) {
char* temp; char* temp;
@ -783,58 +824,63 @@ int orterun(int argc, char *argv[])
} }
} }
/* if we are launching the vm, now is the time to do so */ /*** LAUNCH THE ORTE VIRTUAL MACHINE ***/
if (orte_vm_launch) {
int32_t ljob, i;
orte_app_context_t *app;
/* we may need to look at the apps for the user's job /* we may need to look at the apps for the user's job
* to get our full list of nodes, so prep the job for * to get our full list of nodes, so prep the job for
* launch. This duplicates some code in orte_plm_base_setup_job * launch - start by getting a jobid for it */
* that won't run if we do this here - eventually, we'll want if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(jdata))) {
* to refactor the plm_base routine to avoid the duplication ORTE_ERROR_LOG(rc);
goto DONE;
}
/* store it on the global job data pool - this is the key
* step required before we launch the daemons. It allows
* the orte_rmaps_base_setup_virtual_machine routine to
* search all apps for any hosts to be used by the vm
*/
ljob = ORTE_LOCAL_JOBID(jdata->jobid);
opal_pointer_array_set_item(orte_job_data, ljob, jdata);
/* set the job state */
jdata->state = ORTE_JOB_STATE_INIT;
/* if job recovery is not defined, set it to default */
if (!jdata->recovery_defined) {
/* set to system default */
jdata->enable_recovery = orte_enable_recovery;
}
/* if app recovery is not defined, set apps to defaults */
for (i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
if (!app->recovery_defined) {
app->max_restarts = orte_max_restarts;
}
}
/* if we don't want to launch, then don't attempt to
* launch the daemons - the user really wants to just
* look at the proposed process map
*/
if (!orte_do_not_launch) {
/* run the allocator on the application job - this allows us to
* pickup any host or hostfile arguments so we get the full
* array of nodes in our allocation
*/ */
/* get a jobid for it */ if (ORTE_SUCCESS != (rc = orte_ras.allocate(jdata))) {
if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(jdata))) {
ORTE_ERROR_LOG(rc);
goto DONE; goto DONE;
} }
/* store it on the global job data pool - this is the key
* step required before we launch the daemons. It allows
* the orte_rmaps_base_setup_virtual_machine routine to
* search all apps for any hosts to be used by the vm
*/
ljob = ORTE_LOCAL_JOBID(jdata->jobid);
opal_pointer_array_set_item(orte_job_data, ljob, jdata);
/* set the job state */
jdata->state = ORTE_JOB_STATE_INIT;
/* if job recovery is not defined, set it to default */
if (!jdata->recovery_defined) {
/* set to system default */
jdata->enable_recovery = orte_enable_recovery;
}
/* if app recovery is not defined, set apps to defaults */
for (i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
if (!app->recovery_defined) {
app->max_restarts = orte_max_restarts;
}
}
/* get the daemon job object */
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
/* launch the daemons */ /* launch the daemons */
if (ORTE_SUCCESS != (rc = orte_plm.spawn(daemons))) { if (ORTE_SUCCESS != (rc = orte_plm.spawn(daemons))) {
fprintf(stderr, "%s: UNABLE TO LAUNCH VIRTUAL MACHINE\n", orte_basename); fprintf(stderr, "%s: UNABLE TO LAUNCH VIRTUAL MACHINE\n", orte_basename);
goto DONE; goto DONE;
} }
/* ensure all future jobs use the VM */
orte_default_mapping_policy |= ORTE_MAPPING_USE_VM;
} }
/*** LAUNCH THE APPLICATION ***/
/* setup for debugging */ /* setup for debugging */
orte_debugger.init_before_spawn(jdata); orte_debugger.init_before_spawn(jdata);
@ -880,13 +926,6 @@ static int init_globals(void)
orterun_globals.help = false; orterun_globals.help = false;
orterun_globals.version = false; orterun_globals.version = false;
orterun_globals.verbose = false; orterun_globals.verbose = false;
orterun_globals.by_node = false;
orterun_globals.by_slot = false;
orterun_globals.by_board = false;
orterun_globals.by_socket = false;
orterun_globals.bind_to_core = false;
orterun_globals.bind_to_board = false;
orterun_globals.bind_to_socket = false;
orterun_globals.debugger = false; orterun_globals.debugger = false;
orterun_globals.num_procs = 0; orterun_globals.num_procs = 0;
if( NULL != orterun_globals.env_val ) if( NULL != orterun_globals.env_val )
@ -982,35 +1021,7 @@ static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line)
run_debugger(orte_basename, cmd_line, argc, argv, orterun_globals.num_procs); run_debugger(orte_basename, cmd_line, argc, argv, orterun_globals.num_procs);
} }
/* extract any rank assignment policy directives */ /* if recovery was disabled on the cmd line, do so */
if (orterun_globals.by_node) {
ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYNODE);
} else if (orterun_globals.by_board) {
ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYBOARD);
} else if (orterun_globals.by_socket) {
ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYSOCKET);
} else if (orterun_globals.by_slot) {
ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYSLOT);
}
/* if nothing was specified, leave it as set by
* mca param
*/
/* extract any binding policy directives */
if (orterun_globals.bind_to_socket) {
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_SOCKET);
} else if (orterun_globals.bind_to_board) {
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_BOARD);
} else if (orterun_globals.bind_to_core) {
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_CORE);
} else if (orterun_globals.bind_to_none) {
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_NONE);
}
/* if nothing was specified, leave it as set
* by mca param
*/
/* if recovery was disabled on the cmd line, do so */
if (orterun_globals.disable_recovery) { if (orterun_globals.disable_recovery) {
orte_enable_recovery = false; orte_enable_recovery = false;
orte_max_restarts = 0; orte_max_restarts = 0;
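
With the orte_vm_launch guard removed above, orterun now always creates a jobid for the user's job, stores it on the global job-data pool, runs the allocator, and spawns the daemon virtual machine before the application itself is mapped and launched, skipping only the daemon spawn when orte_do_not_launch is set. As a rough, compilable sketch of that control flow - the functions below are hypothetical stand-ins written for this page, not ORTE API calls; the real routines are the ones named in the diff (orte_plm_base_create_jobid, orte_ras.allocate, orte_plm.spawn):

    /* Hypothetical sketch of the new orterun startup sequence.
     * Stand-in functions, not ORTE calls.  Build with: cc -o flow flow.c */
    #include <stdio.h>
    #include <stdbool.h>

    static int create_jobid(void)       { puts("assign a jobid to the user's job");        return 0; }
    static int run_allocator(void)      { puts("run allocator: hostfile/-host/RM slots");  return 0; }
    static int spawn_daemon_vm(void)    { puts("spawn orteds on every allocated node");    return 0; }
    static int map_and_launch_app(void) { puts("map, bind, and launch the application");   return 0; }

    int main(void)
    {
        bool do_not_launch = false;   /* mirrors orte_do_not_launch: only show the map */

        if (0 != create_jobid()) return 1;
        /* job/app recovery defaults would be applied here */
        if (!do_not_launch) {
            if (0 != run_allocator())   return 1;  /* picks up -host/-hostfile nodes */
            if (0 != spawn_daemon_vm()) return 1;  /* the VM step is no longer optional */
        }
        return map_and_launch_app();
    }

The real error paths in orterun.c jump to the DONE label instead of returning directly, but the ordering of the steps is the same.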

View file

@@ -9,7 +9,7 @@
  * University of Stuttgart. All rights reserved.
  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
- * Copyright (c) 2007      Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow

@@ -40,14 +40,6 @@ struct orterun_globals_t {
     char *report_pid;
     char *report_uri;
     bool exit;
-    bool by_node;
-    bool by_slot;
-    bool by_board;
-    bool by_socket;
-    bool bind_to_none;
-    bool bind_to_core;
-    bool bind_to_board;
-    bool bind_to_socket;
     bool debugger;
     int num_procs;
     char *env_val;

View file

@@ -42,7 +42,6 @@
  * relative node syntax should generate an immediate error
  */
 int orte_util_add_dash_host_nodes(opal_list_t *nodes,
-                                  bool *override_oversubscribed,
                                   char ** host_argv)
 {
     opal_list_item_t* item;

@@ -129,14 +128,6 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes,
             node->slots_inuse = 0;
             node->slots_max = 0;
             node->slots = 1;
-            /* indicate that ORTE should override any oversubscribed conditions
-             * based on local hardware limits since the user (a) might not have
-             * provided us any info on the #slots for a node, and (b) the user
-             * might have been wrong! If we don't check the number of local physical
-             * processors, then we could be too aggressive on our sched_yield setting
-             * and cause performance problems.
-             */
-            *override_oversubscribed = true;
             opal_list_append(nodes, &node->super);
         }
     }

View file

@@ -30,7 +30,6 @@
 BEGIN_C_DECLS
 
 ORTE_DECLSPEC int orte_util_add_dash_host_nodes(opal_list_t *nodes,
-                                                bool *override_oversubscribed,
                                                 char ** host_argv);
 
 ORTE_DECLSPEC int orte_util_filter_dash_host_nodes(opal_list_t *nodes,

View file

@@ -11,6 +11,7 @@
  *                         All rights reserved.
  * Copyright (c) 2007      Los Alamos National Security, LLC. All rights
  *                         reserved.
+ * Copyright (c) 2011      Cisco Systems, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow

@@ -332,49 +333,6 @@ static int hostfile_parse_line(int token, opal_list_t* updates, opal_list_t* exc
         node->username = hostfile_parse_string();
         break;
 
-    case ORTE_HOSTFILE_BOARDS:
-        rc = hostfile_parse_int();
-        if (rc < 0) {
-            orte_show_help("help-hostfile.txt", "boards",
-                           true,
-                           cur_hostfile_name, rc);
-            OBJ_RELEASE(node);
-            return ORTE_ERROR;
-        }
-        node->boards = rc;
-        break;
-
-    case ORTE_HOSTFILE_SOCKETS_PER_BOARD:
-        rc = hostfile_parse_int();
-        if (rc < 0) {
-            orte_show_help("help-hostfile.txt", "sockets",
-                           true,
-                           cur_hostfile_name, rc);
-            OBJ_RELEASE(node);
-            return ORTE_ERROR;
-        }
-        node->sockets_per_board = rc;
-        break;
-
-    case ORTE_HOSTFILE_CORES_PER_SOCKET:
-        rc = hostfile_parse_int();
-        if (rc < 0) {
-            orte_show_help("help-hostfile.txt", "cores",
-                           true,
-                           cur_hostfile_name, rc);
-            OBJ_RELEASE(node);
-            return ORTE_ERROR;
-        }
-        node->cores_per_socket = rc;
-        break;
-
-    case ORTE_HOSTFILE_CPU_SET:
-        if (NULL != node->cpu_set) {
-            free(node->cpu_set);
-        }
-        node->cpu_set = hostfile_parse_string();
-        break;
-
     case ORTE_HOSTFILE_COUNT:
     case ORTE_HOSTFILE_CPU:
     case ORTE_HOSTFILE_SLOTS:

@@ -516,7 +474,6 @@ unlock:
  */
 int orte_util_add_hostfile_nodes(opal_list_t *nodes,
-                                 bool *override_oversubscribed,
                                  char *hostfile)
 {
     opal_list_t exclude;

@@ -567,15 +524,6 @@ int orte_util_add_hostfile_nodes(opal_list_t *nodes,
         OBJ_RELEASE(item);
     }
 
-    /* indicate that ORTE should override any oversubscribed conditions
-     * based on local hardware limits since the user (a) might not have
-     * provided us any info on the #slots for a node, and (b) the user
-     * might have been wrong! If we don't check the number of local physical
-     * processors, then we could be too aggressive on our sched_yield setting
-     * and cause performance problems.
-     */
-    *override_oversubscribed = true;
 
 cleanup:
     OBJ_DESTRUCT(&exclude);

View file

@@ -31,7 +31,6 @@
 BEGIN_C_DECLS
 
 ORTE_DECLSPEC int orte_util_add_hostfile_nodes(opal_list_t *nodes,
-                                               bool *override_oversubscribed,
                                                char *hostfile);
 
 ORTE_DECLSPEC int orte_util_filter_hostfile_nodes(opal_list_t *nodes,

View file

@@ -10,6 +10,7 @@
  * University of Stuttgart. All rights reserved.
  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
+ * Copyright (c) 2011      Cisco Systems, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow

@@ -75,8 +76,7 @@ extern orte_hostfile_value_t orte_util_hostfile_value;
 #define ORTE_HOSTFILE_BOARDS            17
 #define ORTE_HOSTFILE_SOCKETS_PER_BOARD 18
 #define ORTE_HOSTFILE_CORES_PER_SOCKET  19
-#define ORTE_HOSTFILE_CPU_SET           20
 /* ensure we can handle a rank_file input */
-#define ORTE_HOSTFILE_RANK              21
+#define ORTE_HOSTFILE_RANK              20
 
 #endif

View file

@@ -12,6 +12,7 @@
  * University of Stuttgart. All rights reserved.
  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
+ * Copyright (c) 2011      Cisco Systems, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow

@@ -144,12 +145,6 @@ cores_per_socket { orte_util_hostfile_value.sval = yytext;
 "cores-per-socket" { orte_util_hostfile_value.sval = yytext;
                      return ORTE_HOSTFILE_CORES_PER_SOCKET; }
 
-cpu_set   { orte_util_hostfile_value.sval = yytext;
-            return ORTE_HOSTFILE_CPU_SET; }
-
-"cpu-set" { orte_util_hostfile_value.sval = yytext;
-            return ORTE_HOSTFILE_CPU_SET; }
-
 \+n[0-9]+ { orte_util_hostfile_value.sval = yytext;
             return ORTE_HOSTFILE_RELATIVE; }
 
 \+[eE][\:][0-9]+ { orte_util_hostfile_value.sval = yytext;

View file

@@ -46,7 +46,7 @@
 #include "opal/dss/dss.h"
 #include "opal/runtime/opal.h"
 #include "opal/class/opal_pointer_array.h"
-#include "opal/mca/hwloc/hwloc.h"
+#include "opal/mca/hwloc/base/base.h"
 #include "opal/util/output.h"
 #include "opal/util/argv.h"

@@ -88,6 +88,24 @@ int orte_util_nidmap_init(opal_buffer_t *buffer)
         return ORTE_SUCCESS;
     }
 
+#if OPAL_HAVE_HWLOC
+    {
+        hwloc_topology_t topo;
+
+        /* extract the topology */
+        cnt=1;
+        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &topo, &cnt, OPAL_HWLOC_TOPO))) {
+            ORTE_ERROR_LOG(rc);
+            return rc;
+        }
+        if (NULL == opal_hwloc_topology) {
+            opal_hwloc_topology = topo;
+        } else {
+            hwloc_topology_destroy(topo);
+        }
+    }
+#endif
+
     /* extract the byte object holding the daemonmap */
     cnt=1;
     if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &bo, &cnt, OPAL_BYTE_OBJECT))) {

@@ -113,16 +131,6 @@ int orte_util_nidmap_init(opal_buffer_t *buffer)
         return rc;
     }
     /* the bytes in the object were free'd by the decode */
-
-#if OPAL_HAVE_HWLOC
-    /* extract the topology */
-    if (NULL == opal_hwloc_topology) {
-        cnt=1;
-        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &opal_hwloc_topology, &cnt, OPAL_HWLOC_TOPO))) {
-            ORTE_ERROR_LOG(rc);
-            return rc;
-        }
-    }
-#endif
 
     return ORTE_SUCCESS;
 }

@@ -522,6 +530,9 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr)
     orte_job_t *jdata = NULL;
     int32_t *nodes = NULL;
     int i, j, k, rc = ORTE_SUCCESS;
+#if OPAL_HAVE_HWLOC
+    unsigned int *bind_idx=NULL;
+#endif
 
     /* setup the working buffer */
     OBJ_CONSTRUCT(&buf, opal_buffer_t);

@@ -550,12 +561,21 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr)
             ORTE_ERROR_LOG(rc);
             goto cleanup_and_return;
         }
 
-        /* allocate memory for the nodes, local ranks and node ranks */
+#if OPAL_HAVE_HWLOC
+        /* pack the bind level */
+        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &(jdata->map->bind_level), 1, OPAL_HWLOC_LEVEL_T))) {
+            ORTE_ERROR_LOG(rc);
+            goto cleanup_and_return;
+        }
+#endif
+
+        /* allocate memory for the nodes, local ranks, node ranks, and bind_idx */
         nodes = (int32_t*)malloc(jdata->num_procs * sizeof(int32_t));
         lrank = (orte_local_rank_t*)malloc(jdata->num_procs*sizeof(orte_local_rank_t));
         nrank = (orte_node_rank_t*)malloc(jdata->num_procs*sizeof(orte_node_rank_t));
+#if OPAL_HAVE_HWLOC
+        bind_idx = (unsigned int*)malloc(jdata->num_procs*sizeof(unsigned int));
+#endif
 
         /* transfer and pack the node info in one pack */
         for (i=0, k=0; i < jdata->procs->size; i++) {
             if (NULL == (proc = (orte_proc_t *) opal_pointer_array_get_item(jdata->procs, i))) {

@@ -569,6 +589,9 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr)
             nodes[k] = proc->node->index;
             lrank[k] = proc->local_rank;
             nrank[k] = proc->node_rank;
+#if OPAL_HAVE_HWLOC
+            bind_idx[k] = proc->bind_idx;
+#endif
             ++k;
         }
         if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, nodes, jdata->num_procs, OPAL_INT32))) {

@@ -585,6 +608,13 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr)
             ORTE_ERROR_LOG(rc);
             goto cleanup_and_return;
         }
+#if OPAL_HAVE_HWLOC
+        /* transfer and pack the bind_idx in one pack */
+        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, bind_idx, jdata->num_procs, OPAL_UINT))) {
+            ORTE_ERROR_LOG(rc);
+            goto cleanup_and_return;
+        }
+#endif
     }
 
     /* transfer the payload to the byte object */

@@ -601,6 +631,11 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr)
     if( NULL != nodes ) {
         free(nodes);
     }
+#if OPAL_HAVE_HWLOC
+    if( NULL != bind_idx ) {
+        free(bind_idx);
+    }
+#endif
     OBJ_DESTRUCT(&buf);
 
     return rc;

@@ -612,9 +647,13 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
     orte_jobid_t jobid;
     orte_vpid_t i, num_procs;
     orte_pmap_t *pmap;
-    int32_t *nodes, my_node;
-    orte_local_rank_t *local_rank;
-    orte_node_rank_t *node_rank;
+    int32_t *nodes=NULL, my_node;
+    orte_local_rank_t *local_rank=NULL;
+    orte_node_rank_t *node_rank=NULL;
+#if OPAL_HAVE_HWLOC
+    opal_hwloc_level_t bind_level;
+    unsigned int *bind_idx=NULL;
+#endif
     orte_std_cntr_t n;
     opal_buffer_t buf;
     orte_jmap_t *jmap;

@@ -658,6 +697,17 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
             goto cleanup;
         }
 
+#if OPAL_HAVE_HWLOC
+        /* unpack the binding level */
+        n=1;
+        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &bind_level, &n, OPAL_HWLOC_LEVEL_T))) {
+            ORTE_ERROR_LOG(rc);
+            goto cleanup;
+        }
+        /* set mine */
+        orte_process_info.bind_level = bind_level;
+#endif
+
         /* allocate memory for the node info */
         nodes = (int32_t*)malloc(num_procs * 4);
         /* unpack it in one shot */

@@ -687,6 +737,19 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
             goto cleanup;
         }
 
+#if OPAL_HAVE_HWLOC
+        /* allocate memory for bind_idx */
+        bind_idx = (unsigned int*)malloc(num_procs*sizeof(unsigned int));
+        /* unpack bind_idx in one shot */
+        n=num_procs;
+        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, bind_idx, &n, OPAL_UINT))) {
+            ORTE_ERROR_LOG(rc);
+            goto cleanup;
+        }
+        /* set mine */
+        orte_process_info.bind_idx = bind_idx[ORTE_PROC_MY_NAME->vpid];
+#endif
+
         /* if we already know about this job, we need to check the data to see
          * if something has changed - e.g., a proc that is being restarted somewhere
          * other than where it previously was

@@ -703,34 +766,6 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
             }
             /* now use the opal function to reset the internal pointers */
             opal_pointer_array_remove_all(&jmap->pmap);
-            /* set the size of the storage so we minimize realloc's */
-            if (ORTE_SUCCESS != (rc = opal_pointer_array_set_size(&jmap->pmap, num_procs))) {
-                ORTE_ERROR_LOG(rc);
-                return rc;
-            }
-            /* add in the updated array */
-            for (i=0; i < num_procs; i++) {
-                pmap = OBJ_NEW(orte_pmap_t);
-                /* add the pidmap entry at the specific site corresponding
-                 * to the proc's vpid
-                 */
-                if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(&jmap->pmap, i, pmap))) {
-                    ORTE_ERROR_LOG(rc);
-                    goto cleanup;
-                }
-                /* add/update the data */
-                pmap->node = nodes[i];
-                pmap->local_rank = local_rank[i];
-                pmap->node_rank = node_rank[i];
-                /* set locality - for now, just do node level */
-                if (pmap->node == my_node) {
-                    pmap->locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
-                } else {
-                    pmap->locality = OPAL_PROC_NON_LOCAL;
-                }
-            }
-            /* update the #procs */
-            jmap->num_procs = num_procs;
         } else {
             /* if we don't already have this data, store it
              * unfortunately, job objects cannot be stored

@@ -740,40 +775,67 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
              */
             jmap = OBJ_NEW(orte_jmap_t);
             jmap->job = jobid;
-            jmap->num_procs = num_procs;
             if (0 > (j = opal_pointer_array_add(&orte_jobmap, jmap))) {
                 ORTE_ERROR_LOG(j);
                 rc = j;
                 goto cleanup;
             }
-            /* allocate memory for the procs array */
-            opal_pointer_array_set_size(&jmap->pmap, num_procs);
-            /* xfer the data */
-            for (i=0; i < num_procs; i++) {
-                pmap = OBJ_NEW(orte_pmap_t);
-                pmap->node = nodes[i];
-                pmap->local_rank = local_rank[i];
-                pmap->node_rank = node_rank[i];
-                /* set locality - for now, just do node level */
-                if (pmap->node == my_node) {
-                    pmap->locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
-                } else {
-                    pmap->locality = OPAL_PROC_NON_LOCAL;
-                }
-                /* add the pidmap entry at the specific site corresponding
-                 * to the proc's vpid
-                 */
-                if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(&jmap->pmap, i, pmap))) {
-                    ORTE_ERROR_LOG(rc);
-                    goto cleanup;
-                }
-            }
+        }
+        /* update the binding level and num_procs */
+#if OPAL_HAVE_HWLOC
+        jmap->bind_level = bind_level;
+#endif
+        jmap->num_procs = num_procs;
+        /* set the size of the storage so we minimize realloc's */
+        if (ORTE_SUCCESS != (rc = opal_pointer_array_set_size(&jmap->pmap, num_procs))) {
+            ORTE_ERROR_LOG(rc);
+            return rc;
+        }
+        /* xfer the data */
+        for (i=0; i < num_procs; i++) {
+            pmap = OBJ_NEW(orte_pmap_t);
+            pmap->node = nodes[i];
+            pmap->local_rank = local_rank[i];
+            pmap->node_rank = node_rank[i];
+            /* set locality */
+            if (ORTE_PROC_MY_NAME->vpid == i) {
+                /* this is me */
+                pmap->locality = OPAL_PROC_ALL_LOCAL;
+#if OPAL_HAVE_HWLOC
+            } else if (pmap->node == my_node) {
+                /* we share a node - see what else we share */
+                pmap->locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
+                                                                       orte_process_info.bind_level,
+                                                                       orte_process_info.bind_idx,
+                                                                       jmap->bind_level,
+                                                                       bind_idx[i]);
+#else
+            } else if (pmap->node == my_node) {
+                pmap->locality = OPAL_PROC_ON_NODE;
+#endif
+            } else {
+                pmap->locality = OPAL_PROC_NON_LOCAL;
+            }
+            /* add the pidmap entry at the specific site corresponding
+             * to the proc's vpid
+             */
+            if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(&jmap->pmap, i, pmap))) {
+                ORTE_ERROR_LOG(rc);
+                goto cleanup;
+            }
         }
 
         /* release data */
         free(nodes);
+        nodes = NULL;
         free(local_rank);
+        local_rank = NULL;
         free(node_rank);
+        node_rank = NULL;
+#if OPAL_HAVE_HWLOC
+        free(bind_idx);
+        bind_idx = NULL;
+#endif
         /* setup for next cycle */
         n = 1;
     }

@@ -781,7 +843,21 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
         rc = ORTE_SUCCESS;
     }
 
 cleanup:
+    if (NULL != nodes) {
+        free(nodes);
+    }
+    if (NULL != local_rank) {
+        free(local_rank);
+    }
+    if (NULL != node_rank) {
+        free(node_rank);
+    }
+#if OPAL_HAVE_HWLOC
+    if (NULL != bind_idx) {
+        free(bind_idx);
+    }
+#endif
     OBJ_DESTRUCT(&buf);
     return rc;
 }
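
The decode path above is where each process now learns how it relates to every peer: its own entry is fully local, a peer on a different node is non-local, and a peer on the same node gets a topology-aware answer from opal_hwloc_base_get_relative_locality() using the bind level and bind index that were packed for every rank. A compilable toy model of that three-way decision follows; the flag values and the same-node refinement are illustrative only, not the real OPAL constants or the hwloc-based computation:

    /* Toy model of the locality decision in orte_util_decode_pidmap();
     * flag values and the same-node refinement are illustrative only. */
    #include <stdio.h>

    enum { LOC_NONLOCAL = 0x0, LOC_ON_NODE = 0x1, LOC_ON_SOCKET = 0x2, LOC_ALL_LOCAL = 0xff };

    /* peer description as carried in the pidmap: node id plus bind level/index */
    struct peer { int node; int bind_level; unsigned bind_idx; };

    static int relative_locality(struct peer me, struct peer them, int my_vpid, int their_vpid)
    {
        if (my_vpid == their_vpid) {
            return LOC_ALL_LOCAL;            /* this is me */
        }
        if (me.node != them.node) {
            return LOC_NONLOCAL;             /* different node */
        }
        /* same node: refine using the binding info the HNP packed for us;
         * "same level and index" here simply stands in for sharing that object */
        if (me.bind_level == them.bind_level && me.bind_idx == them.bind_idx) {
            return LOC_ON_NODE | LOC_ON_SOCKET;
        }
        return LOC_ON_NODE;
    }

    int main(void)
    {
        struct peer me   = { .node = 3, .bind_level = 2, .bind_idx = 0 };
        struct peer peer = { .node = 3, .bind_level = 2, .bind_idx = 1 };
        printf("locality = 0x%x\n", (unsigned)relative_locality(me, peer, 0, 1));
        return 0;
    }

Carrying bind_level and bind_idx in the pidmap exists precisely for this refinement step: without them, the best a same-node peer could be tagged is "on node".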

View file

@@ -69,6 +69,10 @@ ORTE_DECLSPEC orte_proc_info_t orte_process_info = {
     /* .sock_stdin = */      NULL,
     /* .sock_stdout = */     NULL,
     /* .sock_stderr = */     NULL,
+#if OPAL_HAVE_HWLOC
+    /* .bind_level = */      OPAL_HWLOC_NODE_LEVEL,
+    /* .bind_idx = */        0,
+#endif
     /* .job_name = */        NULL,
     /* .job_instance = */    NULL,
     /* .executable = */      NULL,

View file

@@ -37,7 +37,9 @@
 #endif
 
 #include "orte/types.h"
 #include "opal/dss/dss_types.h"
+#include "opal/mca/hwloc/hwloc.h"
+
 BEGIN_C_DECLS

@@ -111,6 +113,10 @@ struct orte_proc_info_t {
     char *sock_stdin;   /**< Path name to temp file for stdin. */
     char *sock_stdout;  /**< Path name to temp file for stdout. */
     char *sock_stderr;  /**< Path name to temp file for stderr. */
+#if OPAL_HAVE_HWLOC
+    opal_hwloc_level_t bind_level;
+    unsigned int bind_idx;
+#endif
     /* name/instance info for debug support */
     char *job_name;
     char *job_instance;
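
Because the new binding fields only exist when hwloc support is compiled in, the positional initializer in proc_info.c has to carry a matching pair of entries under the same #if guard as the struct members declared in proc_info.h. A minimal, generic illustration of keeping the two in sync - the names below are invented for the example and are not ORTE symbols:

    /* Generic illustration of an #if-guarded struct kept in sync with its
     * positional initializer; names invented for the example. */
    #include <stdio.h>

    #define HAVE_TOPOLOGY 1

    typedef struct {
        const char *name;
    #if HAVE_TOPOLOGY
        int bind_level;        /* level of the object this proc is bound to */
        unsigned int bind_idx; /* index of that object within its level */
    #endif
        int num_procs;
    } proc_info_t;

    static proc_info_t info = {
        /* .name = */       "example",
    #if HAVE_TOPOLOGY
        /* .bind_level = */ 0,
        /* .bind_idx = */   0,
    #endif
        /* .num_procs = */  1,
    };

    int main(void)
    {
        printf("%s: num_procs=%d\n", info.name, info.num_procs);
        return 0;
    }

If the guard in the initializer ever drifted from the guard in the header, the later positional entries would silently shift onto the wrong fields, which is why the .c and .h hunks above change together.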