At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how it all works is being assembled here:

https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement

The wiki page is incomplete at the moment, but I hope to finish it over the next few days, and I will post updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric, and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond an initial curiosity-motivated experiment.

In a nutshell, this commit revamps the map/rank/bind procedure to take topology information on the compute nodes into account. I have, for the most part, preserved the default behaviors, with three notable exceptions:

1. I have at long last bowed my head in submission to the system admins of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, the default is to allow oversubscription; if you are running off of RM-allocated nodes, the default is to NOT allow it. Flags to override these behaviors are provided, so this changes only the defaults.

2. Both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed, as the users who requested it are no longer using it. The former was removed temporarily, pending reimplementation.

3. VM launch is now the sole method for starting OMPI. It was simply too hard to maintain multiple launch procedures - maybe someday, if someone can demonstrate a reason to do so.

As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes.

This commit was SVN r25476.
This commit is contained in: parent c8e105bd8c, commit 6310361532
@@ -3,7 +3,7 @@ enable_multicast=no
enable_dlopen=no
enable_pty_support=no
with_blcr=no
with_openib=yes
with_openib=no
with_memory_manager=no
enable_mem_debug=yes
enable_mem_profile=no

@@ -62,6 +62,7 @@
mca_component_show_load_errors = 0
mpi_param_check = 0
orte_abort_timeout = 10
hwloc_base_mem_bind_failure_action = silent

## Protect the shared file systems

@@ -72,22 +73,13 @@ oob_tcp_disable_family = IPv6
#oob_tcp_connect_timeout=600

## Define the MPI interconnects
btl = sm,openib,self
#mpi_leave_pinned = 1
btl = sm,tcp,self

## Setup shared memory
btl_sm_free_list_max = 768

## Setup OpenIB
btl_openib_want_fork_support = 0
btl_openib_cpc_include = oob
#btl_openib_receive_queues = P,128,256,64,32,32:S,2048,1024,128,32:S,12288,1024,128,32:S,65536,1024,128,32

## Setup TCP
btl_tcp_if_include = ib0

## Configure the PML
pml_ob1_use_early_completion = 0

## Enable cpu affinity
opal_paffinity_alone = 1
@@ -58,6 +58,8 @@ enum {
    OMPI_ERR_DATA_OVERWRITE_ATTEMPT = OPAL_ERR_DATA_OVERWRITE_ATTEMPT,

    OMPI_ERR_BUFFER = OPAL_ERR_BUFFER,
    OMPI_ERR_SILENT = OPAL_ERR_SILENT,

    OMPI_ERR_REQUEST = OMPI_ERR_BASE - 1
};

@@ -568,7 +568,7 @@ static int spawn(int count, char **array_of_commands,
    char stdin_target[OPAL_PATH_MAX];
    char params[OPAL_PATH_MAX];
    char mapper[OPAL_PATH_MAX];
    int nperxxx;
    int npernode;
    char slot_list[OPAL_PATH_MAX];

    orte_job_t *jdata;
@@ -735,7 +735,7 @@ static int spawn(int count, char **array_of_commands,
    }

    /* check for 'mapper' */
    ompi_info_get (array_of_info[i], "mapper", sizeof(mapper) - 1, mapper, &flag);
    ompi_info_get(array_of_info[i], "mapper", sizeof(mapper) - 1, mapper, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
@@ -743,20 +743,27 @@ static int spawn(int count, char **array_of_commands,
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            /* load it with the system defaults */
            jdata->map->policy = orte_default_mapping_policy;
            jdata->map->cpus_per_rank = orte_rmaps_base.cpus_per_rank;
            jdata->map->stride = orte_rmaps_base.stride;
            jdata->map->oversubscribe = orte_rmaps_base.oversubscribe;
            jdata->map->display_map = orte_rmaps_base.display_map;
        }
        jdata->map->req_mapper = strdup(mapper);
    }

    /* check for 'npernode' */
    /* check for 'display_map' */
    ompi_info_get_bool(array_of_info[i], "display_map", &local_spawn, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        jdata->map->display_map = true;
    }

    /* check for 'npernode' and 'ppr' */
    ompi_info_get (array_of_info[i], "npernode", sizeof(slot_list) - 1, slot_list, &flag);
    if ( flag ) {
        if (ORTE_SUCCESS != ompi_info_value_to_int(slot_list, &nperxxx)) {
        if (ORTE_SUCCESS != ompi_info_value_to_int(slot_list, &npernode)) {
            ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
            return ORTE_ERR_BAD_PARAM;
        }
@@ -766,18 +773,14 @@ static int spawn(int count, char **array_of_commands,
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        /* load it with the system defaults */
        jdata->map->policy = orte_default_mapping_policy;
        jdata->map->cpus_per_rank = orte_rmaps_base.cpus_per_rank;
        jdata->map->stride = orte_rmaps_base.stride;
        jdata->map->oversubscribe = orte_rmaps_base.oversubscribe;
        jdata->map->display_map = orte_rmaps_base.display_map;
        }
        jdata->map->npernode = nperxxx;
        if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
            return OMPI_ERROR;
        }
        jdata->map->mapping |= ORTE_MAPPING_PPR;
        asprintf(&(jdata->map->ppr), "%d:n", npernode);
    }

    /* check for 'map_bynode' */
    ompi_info_get_bool(array_of_info[i], "map_bynode", &local_bynode, &flag);
    ompi_info_get (array_of_info[i], "pernode", sizeof(slot_list) - 1, slot_list, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
@@ -785,20 +788,438 @@ static int spawn(int count, char **array_of_commands,
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            /* load it with the system defaults */
            jdata->map->policy = orte_default_mapping_policy;
            jdata->map->cpus_per_rank = orte_rmaps_base.cpus_per_rank;
            jdata->map->stride = orte_rmaps_base.stride;
            jdata->map->oversubscribe = orte_rmaps_base.oversubscribe;
            jdata->map->display_map = orte_rmaps_base.display_map;
        }
        if( local_bynode ) {
            jdata->map->policy = ORTE_MAPPING_BYNODE;
        }
        else {
            jdata->map->policy = ORTE_MAPPING_BYSLOT;
        if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
            return OMPI_ERROR;
        }
        jdata->map->mapping |= ORTE_MAPPING_PPR;
        jdata->map->ppr = strdup("1:n");
    }
    ompi_info_get (array_of_info[i], "ppr", sizeof(slot_list) - 1, slot_list, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
            return OMPI_ERROR;
        }
        jdata->map->mapping |= ORTE_MAPPING_PPR;
        jdata->map->ppr = strdup(slot_list);
    }

    /* check for 'map_byxxx' */
    ompi_info_get_bool(array_of_info[i], "map_by_node", &local_bynode, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
            return OMPI_ERROR;
        }
        jdata->map->mapping |= ORTE_MAPPING_BYNODE;
    }
#if OPAL_HAVE_HWLOC
    ompi_info_get_bool(array_of_info[i], "map_by_board", &local_bynode, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
            return OMPI_ERROR;
        }
        jdata->map->mapping |= ORTE_MAPPING_BYBOARD;
    }
    ompi_info_get_bool(array_of_info[i], "map_by_numa", &local_bynode, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
            return OMPI_ERROR;
        }
        jdata->map->mapping |= ORTE_MAPPING_BYNUMA;
    }
    ompi_info_get_bool(array_of_info[i], "map_by_socket", &local_bynode, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
            return OMPI_ERROR;
        }
        jdata->map->mapping |= ORTE_MAPPING_BYSOCKET;
    }
    ompi_info_get_bool(array_of_info[i], "map_by_l3cache", &local_bynode, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
            return OMPI_ERROR;
        }
        jdata->map->mapping |= ORTE_MAPPING_BYL3CACHE;
    }
    ompi_info_get_bool(array_of_info[i], "map_by_l2cache", &local_bynode, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
            return OMPI_ERROR;
        }
        jdata->map->mapping |= ORTE_MAPPING_BYL2CACHE;
    }
    ompi_info_get_bool(array_of_info[i], "map_by_l1cache", &local_bynode, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
            return OMPI_ERROR;
        }
        jdata->map->mapping |= ORTE_MAPPING_BYL1CACHE;
    }
    ompi_info_get_bool(array_of_info[i], "map_by_core", &local_bynode, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
            return OMPI_ERROR;
        }
        jdata->map->mapping |= ORTE_MAPPING_BYCORE;
    }
    ompi_info_get_bool(array_of_info[i], "map_by_hwthread", &local_bynode, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
            return OMPI_ERROR;
        }
        jdata->map->mapping |= ORTE_MAPPING_BYHWTHREAD;
    }
#endif

    /* check for 'rank_byxxx' */
    ompi_info_get_bool(array_of_info[i], "rank_by_node", &local_bynode, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        if (0 != jdata->map->ranking) {
            return OMPI_ERROR;
        }
        jdata->map->ranking = ORTE_RANK_BY_NODE;
    }
#if OPAL_HAVE_HWLOC
    ompi_info_get_bool(array_of_info[i], "rank_by_board", &local_bynode, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        if (0 != jdata->map->ranking) {
            return OMPI_ERROR;
        }
        jdata->map->ranking = ORTE_RANK_BY_BOARD;
    }
    ompi_info_get_bool(array_of_info[i], "rank_by_numa", &local_bynode, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        if (0 != jdata->map->ranking) {
            return OMPI_ERROR;
        }
        jdata->map->ranking = ORTE_RANK_BY_NUMA;
    }
    ompi_info_get_bool(array_of_info[i], "rank_by_socket", &local_bynode, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        if (0 != jdata->map->ranking) {
            return OMPI_ERROR;
        }
        jdata->map->ranking = ORTE_RANK_BY_SOCKET;
    }
    ompi_info_get_bool(array_of_info[i], "rank_by_l3cache", &local_bynode, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        if (0 != jdata->map->ranking) {
            return OMPI_ERROR;
        }
        jdata->map->ranking = ORTE_RANK_BY_L3CACHE;
    }
    ompi_info_get_bool(array_of_info[i], "rank_by_l2cache", &local_bynode, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        if (0 != jdata->map->ranking) {
            return OMPI_ERROR;
        }
        jdata->map->ranking = ORTE_RANK_BY_L2CACHE;
    }
    ompi_info_get_bool(array_of_info[i], "rank_by_l1cache", &local_bynode, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        if (0 != jdata->map->ranking) {
            return OMPI_ERROR;
        }
        jdata->map->ranking = ORTE_RANK_BY_L1CACHE;
    }
    ompi_info_get_bool(array_of_info[i], "rank_by_core", &local_bynode, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        if (0 != jdata->map->ranking) {
            return OMPI_ERROR;
        }
        jdata->map->ranking = ORTE_RANK_BY_CORE;
    }
    ompi_info_get_bool(array_of_info[i], "rank_by_hwthread", &local_bynode, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        if (0 != jdata->map->ranking) {
            return OMPI_ERROR;
        }
        jdata->map->ranking = ORTE_RANK_BY_HWTHREAD;
    }

    /* check for 'bind_toxxx' */
    ompi_info_get_bool(array_of_info[i], "bind_if_supported", &local_bynode, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        jdata->map->binding |= OPAL_BIND_IF_SUPPORTED;
    }
    ompi_info_get_bool(array_of_info[i], "bind_overload_allowed", &local_bynode, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        jdata->map->binding |= OPAL_BIND_ALLOW_OVERLOAD;
    }
    ompi_info_get_bool(array_of_info[i], "bind_to_none", &local_bynode, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
            return OMPI_ERROR;
        }
        jdata->map->binding |= OPAL_BIND_TO_NONE;
    }
    ompi_info_get_bool(array_of_info[i], "bind_to_board", &local_bynode, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
            return OMPI_ERROR;
        }
        jdata->map->binding |= OPAL_BIND_TO_BOARD;
    }
    ompi_info_get_bool(array_of_info[i], "bind_to_numa", &local_bynode, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
            return OMPI_ERROR;
        }
        jdata->map->binding |= OPAL_BIND_TO_NUMA;
    }
    ompi_info_get_bool(array_of_info[i], "bind_to_socket", &local_bynode, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
            return OMPI_ERROR;
        }
        jdata->map->binding |= OPAL_BIND_TO_SOCKET;
    }
    ompi_info_get_bool(array_of_info[i], "bind_to_l3cache", &local_bynode, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
            return OMPI_ERROR;
        }
        jdata->map->binding |= OPAL_BIND_TO_L3CACHE;
    }
    ompi_info_get_bool(array_of_info[i], "bind_to_l2cache", &local_bynode, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
            return OMPI_ERROR;
        }
        jdata->map->binding |= OPAL_BIND_TO_L2CACHE;
    }
    ompi_info_get_bool(array_of_info[i], "bind_to_l1cache", &local_bynode, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
            return OMPI_ERROR;
        }
        jdata->map->binding |= OPAL_BIND_TO_L1CACHE;
    }
    ompi_info_get_bool(array_of_info[i], "bind_to_core", &local_bynode, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
            return OMPI_ERROR;
        }
        jdata->map->binding |= OPAL_BIND_TO_CORE;
    }
    ompi_info_get_bool(array_of_info[i], "bind_to_hwthread", &local_bynode, &flag);
    if ( flag ) {
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
            if (NULL == jdata->map) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
        }
        if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
            return OMPI_ERROR;
        }
        jdata->map->binding |= OPAL_BIND_TO_HWTHREAD;
    }
#endif

    /* check for 'preload_binary' */
    ompi_info_get_bool(array_of_info[i], "ompi_preload_binary", &local_spawn, &flag);
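For readers who want to exercise these info keys from an application, here is a minimal sketch of passing them to MPI_Comm_spawn. The keys ("map_by_socket", "bind_to_core", "display_map") are taken verbatim from the spawn() parsing code above; the worker binary name and the "true" value strings are assumptions for illustration only.

/* Minimal sketch: spawn four workers with the new mapping/binding info keys.
 * "./worker" and the "true" value strings are hypothetical. */
#include <mpi.h>

int main(int argc, char **argv)
{
    MPI_Comm intercomm;
    MPI_Info info;

    MPI_Init(&argc, &argv);
    MPI_Info_create(&info);
    MPI_Info_set(info, "map_by_socket", "true");  /* maps to ORTE_MAPPING_BYSOCKET above */
    MPI_Info_set(info, "bind_to_core", "true");   /* maps to OPAL_BIND_TO_CORE above */
    MPI_Info_set(info, "display_map", "true");    /* asks the mapper to print the result */
    MPI_Comm_spawn("./worker", MPI_ARGV_NULL, 4, info, 0,
                   MPI_COMM_SELF, &intercomm, MPI_ERRCODES_IGNORE);
    MPI_Info_free(&info);
    MPI_Finalize();
    return 0;
}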
@@ -287,14 +287,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
    int param, value;
    struct timeval ompistart, ompistop;
    char *event_val = NULL;
    opal_paffinity_base_cpu_set_t mask;
    bool proc_bound;
#if 0
    /* see comment below about sched_yield */
    int num_processors;
#endif
    bool orte_setup = false;
    bool paffinity_enabled = false;

    /* bitflag of the thread level support provided. To be used
     * for the modex in order to work in heterogeneous environments. */
@@ -371,6 +364,18 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
        gettimeofday(&ompistart, NULL);
    }

#if OPAL_HAVE_HWLOC
    /* if hwloc is available but didn't get setup for some
     * reason, do so now
     */
    if (NULL == opal_hwloc_topology) {
        if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
            error = "Topology init";
            goto error;
        }
    }
#endif

    /* Register errhandler callback with orte errmgr */
    if (NULL != orte_errmgr.set_fault_callback) {
        orte_errmgr.set_fault_callback(ompi_errhandler_runtime_callback);
@@ -412,17 +417,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
        goto error;
    }

#if OPAL_HAVE_HWLOC
    /* If orte_init() didn't fill in opal_hwloc_topology, then we need
       to go fill it in ourselves. */
    if (NULL == opal_hwloc_topology) {
        if (0 != hwloc_topology_init(&opal_hwloc_topology) ||
            0 != hwloc_topology_load(opal_hwloc_topology)) {
            return OPAL_ERR_NOT_SUPPORTED;
        }
    }
#endif

    /* Once we've joined the RTE, see if any MCA parameters were
       passed to the MPI level */

@@ -442,106 +436,217 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
    }
#endif

    /* if it hasn't already been done, setup process affinity.
     * First check to see if a slot list was
     * specified. If so, use it. If no slot list was specified,
     * that's not an error -- just fall through and try the next
     * paffinity scheme.
     */
    ret = opal_paffinity_base_get(&mask);
    if (OPAL_SUCCESS == ret) {
        /* paffinity is supported - check for binding */
        OPAL_PAFFINITY_PROCESS_IS_BOUND(mask, &proc_bound);
        if (proc_bound || opal_paffinity_base_bound) {
            /* someone external set it - indicate it is set
             * so that we know
             */
            paffinity_enabled = true;
        } else {
            /* the system is capable of doing processor affinity, but it
             * has not yet been set - see if a slot_list was given
             */
            if (NULL != opal_paffinity_base_slot_list) {
                /* It's an error if multiple paffinity schemes were specified */
                if (opal_paffinity_alone) {
                    ret = OMPI_ERR_BAD_PARAM;
                    error = "Multiple processor affinity schemes specified (can only specify one)";
                    goto error;
                }
                ret = opal_paffinity_base_slot_list_set((long)ORTE_PROC_MY_NAME->vpid, opal_paffinity_base_slot_list, &mask);
                if (OPAL_SUCCESS != ret && OPAL_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret)) {
                    error = "opal_paffinity_base_slot_list_set() returned an error";
                    goto error;
                }
#if !ORTE_DISABLE_FULL_SUPPORT
                /* print out a warning if result is no-op, if not suppressed */
                OPAL_PAFFINITY_PROCESS_IS_BOUND(mask, &proc_bound);
                if (!proc_bound && orte_odls_base.warn_if_not_bound) {
                    orte_show_help("help-orte-odls-base.txt",
                                   "orte-odls-base:warn-not-bound",
                                   true, "slot-list",
                                   "Request resulted in binding to all available processors",
                                   orte_process_info.nodename,
                                   "bind-to-slot-list", opal_paffinity_base_slot_list, argv[0]);
                }
#endif
                paffinity_enabled = true;
            } else if (opal_paffinity_alone) {
                /* no slot_list, but they asked for paffinity */
                int phys_cpu;
                orte_node_rank_t nrank;
                if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME))) {
                    /* this is okay - we probably were direct-launched, which means
                     * we won't get our node rank until the modex. So just ignore
#if OPAL_HAVE_HWLOC
    {
        hwloc_obj_t node, obj;
        hwloc_cpuset_t cpus, nodeset;
        bool paffinity_enabled=false;
        orte_node_rank_t nrank;
        hwloc_obj_type_t target;
        unsigned cache_level;
        struct hwloc_topology_support *support;

        /* see if we were bound when launched */
        if (NULL == getenv("OMPI_MCA_opal_bound_at_launch")) {
            /* we were not bound at launch */
            if (NULL != opal_hwloc_topology) {
                support = (struct hwloc_topology_support*)hwloc_topology_get_support(opal_hwloc_topology);
                /* get our node object */
                node = hwloc_get_root_obj(opal_hwloc_topology);
                nodeset = hwloc_bitmap_alloc();
                hwloc_bitmap_and(nodeset, node->online_cpuset, node->allowed_cpuset);
                /* get our cpuset */
                cpus = hwloc_bitmap_alloc();
                hwloc_get_cpubind(opal_hwloc_topology, cpus, HWLOC_CPUBIND_PROCESS);
                /* we are bound if the two cpusets are not equal */
                if (0 != hwloc_bitmap_compare(cpus, nodeset)) {
                    /* someone external set it - indicate it is set
                     * so that we know
                     */
                    goto MOVEON;
                    paffinity_enabled = true;
                    hwloc_bitmap_free(nodeset);
                    hwloc_bitmap_free(cpus);
                } else if (support->cpubind->set_thisproc_cpubind &&
                           OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy) &&
                           OPAL_BIND_TO_NONE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                    /* the system is capable of doing processor affinity, but it
                     * has not yet been set - see if a slot_list was given
                     */
                    hwloc_bitmap_zero(cpus);
                    if (OPAL_BIND_TO_CPUSET == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                        if (ORTE_SUCCESS != (ret = opal_hwloc_base_slot_list_parse(opal_hwloc_base_slot_list,
                                                                                   opal_hwloc_topology, cpus))) {
                            error = "Setting processor affinity failed";
                            hwloc_bitmap_free(nodeset);
                            hwloc_bitmap_free(cpus);
                            goto error;
                        }
                        if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                            error = "Setting processor affinity failed";
                            hwloc_bitmap_free(nodeset);
                            hwloc_bitmap_free(cpus);
                            goto error;
                        }
                        /* try to find a level and index for this location */
                        opal_hwloc_base_get_level_and_index(cpus, &orte_process_info.bind_level, &orte_process_info.bind_idx);
                        /* cleanup */
                        hwloc_bitmap_free(nodeset);
                        hwloc_bitmap_free(cpus);
                        paffinity_enabled = true;
                    } else {
                        /* cleanup */
                        hwloc_bitmap_free(nodeset);
                        hwloc_bitmap_free(cpus);
                        /* get the node rank */
                        if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME))) {
                            /* this is not an error - could be due to being
                             * direct launched - so just ignore and leave
                             * us unbound
                             */
                            goto MOVEON;
                        }
                        /* if the binding policy is hwthread, then we bind to the nrank-th
                         * hwthread on this node
                         */
                        if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                            if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_PU,
                                                                               0, nrank, OPAL_HWLOC_LOGICAL))) {
                                ret = OMPI_ERR_NOT_FOUND;
                                error = "Getting hwthread object";
                                goto error;
                            }
                            cpus = hwloc_bitmap_alloc();
                            hwloc_bitmap_and(cpus, obj->online_cpuset, obj->allowed_cpuset);
                            if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                                ret = OMPI_ERROR;
                                error = "Setting processor affinity failed";
                                hwloc_bitmap_free(cpus);
                                goto error;
                            }
                            hwloc_bitmap_free(cpus);
                            orte_process_info.bind_level = OPAL_HWLOC_L1CACHE_LEVEL;
                            orte_process_info.bind_idx = nrank;
                        } else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                            /* if the binding policy is core, then we bind to the nrank-th
                             * core on this node
                             */
                            if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_CORE,
                                                                               0, nrank, OPAL_HWLOC_LOGICAL))) {
                                ret = OMPI_ERR_NOT_FOUND;
                                error = "Getting core object";
                                goto error;
                            }
                            cpus = hwloc_bitmap_alloc();
                            hwloc_bitmap_and(cpus, obj->online_cpuset, obj->allowed_cpuset);
                            if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                                error = "Setting processor affinity failed";
                                hwloc_bitmap_free(cpus);
                                ret = OMPI_ERROR;
                                goto error;
                            }
                            hwloc_bitmap_free(cpus);
                            orte_process_info.bind_level = OPAL_HWLOC_CORE_LEVEL;
                            orte_process_info.bind_idx = nrank;
                        } else {
                            /* for all higher binding policies, we bind to the specified
                             * object that the nrank-th core belongs to
                             */
                            if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_CORE,
                                                                               0, nrank, OPAL_HWLOC_LOGICAL))) {
                                ret = OMPI_ERR_NOT_FOUND;
                                error = "Getting core object";
                                goto error;
                            }
                            if (OPAL_BIND_TO_L1CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                                target = HWLOC_OBJ_CACHE;
                                cache_level = 1;
                                orte_process_info.bind_level = OPAL_HWLOC_L1CACHE_LEVEL;
                            } else if (OPAL_BIND_TO_L2CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                                target = HWLOC_OBJ_CACHE;
                                cache_level = 2;
                                orte_process_info.bind_level = OPAL_HWLOC_L2CACHE_LEVEL;
                            } else if (OPAL_BIND_TO_L3CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                                target = HWLOC_OBJ_CACHE;
                                cache_level = 3;
                                orte_process_info.bind_level = OPAL_HWLOC_L3CACHE_LEVEL;
                            } else if (OPAL_BIND_TO_SOCKET == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                                target = HWLOC_OBJ_SOCKET;
                                orte_process_info.bind_level = OPAL_HWLOC_SOCKET_LEVEL;
                            } else if (OPAL_BIND_TO_NUMA == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                                target = HWLOC_OBJ_NODE;
                                orte_process_info.bind_level = OPAL_HWLOC_NUMA_LEVEL;
                            } else {
                                ret = OMPI_ERR_NOT_FOUND;
                                error = "Binding policy not known";
                                goto error;
                            }
                            for (obj = obj->parent; NULL != obj; obj = obj->parent) {
                                if (target == obj->type) {
                                    if (HWLOC_OBJ_CACHE == target && cache_level != obj->attr->cache.depth) {
                                        continue;
                                    }
                                    /* this is the place! */
                                    cpus = hwloc_bitmap_alloc();
                                    hwloc_bitmap_and(cpus, obj->online_cpuset, obj->allowed_cpuset);
                                    if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                                        ret = OMPI_ERROR;
                                        error = "Setting processor affinity failed";
                                        hwloc_bitmap_free(cpus);
                                        goto error;
                                    }
                                    hwloc_bitmap_free(cpus);
                                    orte_process_info.bind_idx = opal_hwloc_base_get_obj_idx(opal_hwloc_topology,
                                                                                             obj, OPAL_HWLOC_LOGICAL);
                                    paffinity_enabled = true;
                                    break;
                                }
                            }
                            if (!paffinity_enabled) {
                                ret = OMPI_ERROR;
                                error = "Setting processor affinity failed";
                                goto error;
                            }
                        }
                        paffinity_enabled = true;
                    }
                }
                OPAL_PAFFINITY_CPU_ZERO(mask);
                ret = opal_paffinity_base_get_physical_processor_id(nrank, &phys_cpu);
                if (OPAL_SUCCESS != ret) {
                    error = "Could not get physical processor id - cannot set processor affinity";
                    goto error;
                /* If we were able to set processor affinity, try setting up
                   memory affinity */
                if (!opal_maffinity_setup && paffinity_enabled) {
                    if (OPAL_SUCCESS == opal_maffinity_base_open() &&
                        OPAL_SUCCESS == opal_maffinity_base_select()) {
                        opal_maffinity_setup = true;
                    }
                }
                OPAL_PAFFINITY_CPU_SET(phys_cpu, mask);
                ret = opal_paffinity_base_set(mask);
                if (OPAL_SUCCESS != ret) {
                    error = "Setting processor affinity failed";
                    goto error;
                }
#if !ORTE_DISABLE_FULL_SUPPORT
                /* print out a warning if result is no-op, if not suppressed */
                OPAL_PAFFINITY_PROCESS_IS_BOUND(mask, &proc_bound);
                if (!proc_bound && orte_odls_base.warn_if_not_bound) {
                    orte_show_help("help-orte-odls-base.txt",
                                   "orte-odls-base:warn-not-bound",
                                   true, "cpu",
                                   "Request resulted in binding to all available processors",
                                   orte_process_info.nodename,
                                   "[opal|mpi]_paffinity_alone set non-zero", "n/a", argv[0]);
                }
#endif
                paffinity_enabled = true;
            }
        }
    }

MOVEON:
#if OPAL_HAVE_HWLOC
    /* get or update our local cpuset - it will get used multiple
     * times, so it's more efficient to keep a global copy
     */
    opal_hwloc_base_get_local_cpuset();
#endif

    /* If we were able to set processor affinity, try setting up
       memory affinity */
    if (!opal_maffinity_setup && paffinity_enabled) {
        if (OPAL_SUCCESS == opal_maffinity_base_open() &&
            OPAL_SUCCESS == opal_maffinity_base_select()) {
            opal_maffinity_setup = true;
            /* report bindings, if requested */
            if (opal_hwloc_report_bindings) {
                char bindings[64];
                hwloc_obj_t root;
                hwloc_cpuset_t cpus;
                /* get the root object for this node */
                root = hwloc_get_root_obj(opal_hwloc_topology);
                cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, root);
                if (0 == hwloc_bitmap_compare(cpus, opal_hwloc_my_cpuset)) {
                    opal_output(0, "%s is not bound",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                } else {
                    hwloc_bitmap_list_snprintf(bindings, 64, opal_hwloc_my_cpuset);
                    opal_output(0, "%s is bound to cpus %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                bindings);
                }
            }

#endif

    /* initialize datatypes. This step should be done early as it will
     * create the local convertor and local arch used in the proc
     * init.
@@ -649,7 +754,7 @@ MOVEON:

    if (OMPI_SUCCESS !=
        (ret = ompi_osc_base_find_available(OMPI_ENABLE_PROGRESS_THREADS,
                                            OMPI_ENABLE_THREAD_MULTIPLE))) {
                                            OMPI_ENABLE_THREAD_MULTIPLE))) {
        error = "ompi_osc_base_find_available() failed";
        goto error;
    }
@@ -801,16 +906,16 @@ MOVEON:
     * Dump all MCA parameters if requested
     */
    if (ompi_mpi_show_mca_params) {
        ompi_show_all_mca_params(ompi_mpi_comm_world.comm.c_my_rank,
                                 nprocs,
                                 orte_process_info.nodename);
        ompi_show_all_mca_params(ompi_mpi_comm_world.comm.c_my_rank,
                                 nprocs,
                                 orte_process_info.nodename);
    }

    /* Do we need to wait for a debugger? */
    ompi_wait_for_debugger();

    /* check for timing request - get stop time and report elapsed
       time if so, then start the clock again */
       time if so, then start the clock again */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time from modex to first barrier %ld usec",
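The binding logic above reduces to a handful of plain hwloc calls. As a standalone illustration, here is a sketch against the hwloc 1.x API of this era (the online_cpuset/allowed_cpuset fields and the call signatures match what the diff uses); the core index stands in for the ORTE node rank, which is an assumption for the example.

/* Sketch: bind the current process to one core, hwloc 1.x era API. */
#include <hwloc.h>
#include <stdio.h>

int main(void)
{
    hwloc_topology_t topo;
    hwloc_obj_t core;
    hwloc_cpuset_t cpus;
    unsigned idx = 0;  /* stand-in for the ORTE node rank */

    hwloc_topology_init(&topo);
    hwloc_topology_load(topo);

    core = hwloc_get_obj_by_type(topo, HWLOC_OBJ_CORE, idx);
    if (NULL == core) {
        fprintf(stderr, "core %u not found\n", idx);
        return 1;
    }
    /* restrict to cpus that are both online and allowed, as the code above does */
    cpus = hwloc_bitmap_alloc();
    hwloc_bitmap_and(cpus, core->online_cpuset, core->allowed_cpuset);
    if (0 > hwloc_set_cpubind(topo, cpus, HWLOC_CPUBIND_PROCESS)) {
        fprintf(stderr, "setting processor affinity failed\n");
    }
    hwloc_bitmap_free(cpus);
    hwloc_topology_destroy(topo);
    return 0;
}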
@@ -68,7 +68,8 @@ enum {
    OPAL_ERR_INVALID_PHYS_CPU = (OPAL_ERR_BASE - 39),
    OPAL_ERR_MULTIPLE_AFFINITIES = (OPAL_ERR_BASE - 40),
    OPAL_ERR_SLOT_LIST_RANGE = (OPAL_ERR_BASE - 41),
    OPAL_ERR_NETWORK_NOT_PARSEABLE = (OPAL_ERR_BASE - 42)
    OPAL_ERR_NETWORK_NOT_PARSEABLE = (OPAL_ERR_BASE - 42),
    OPAL_ERR_SILENT = (OPAL_ERR_BASE - 43)
};

#define OPAL_ERR_MAX (OPAL_ERR_BASE - 100)

@@ -9,6 +9,8 @@

EXTRA_DIST = base/.windows

dist_pkgdata_DATA = base/help-opal-hwloc-base.txt

headers += \
    base/base.h

@@ -77,7 +77,111 @@ OPAL_DECLSPEC extern bool opal_hwloc_base_inited;
OPAL_DECLSPEC extern bool opal_hwloc_topology_inited;

#if OPAL_HAVE_HWLOC
OPAL_DECLSPEC extern char *opal_hwloc_base_slot_list;
OPAL_DECLSPEC extern char *opal_hwloc_base_cpu_set;
OPAL_DECLSPEC extern hwloc_cpuset_t opal_hwloc_base_given_cpus;

/**
 * Report a bind failure using the normal mechanisms if a component
 * fails to bind memory -- according to the value of the
 * hwloc_base_bind_failure_action MCA parameter.
 */
OPAL_DECLSPEC int opal_hwloc_base_report_bind_failure(const char *file,
                                                      int line,
                                                      const char *msg,
                                                      int rc);

OPAL_DECLSPEC opal_paffinity_locality_t opal_hwloc_base_get_relative_locality(hwloc_topology_t topo,
                                                                              opal_hwloc_level_t level1,
                                                                              unsigned int peer1,
                                                                              opal_hwloc_level_t level2,
                                                                              unsigned int peer2);

OPAL_DECLSPEC void opal_hwloc_base_get_local_cpuset(void);

/**
 * Enum for what memory allocation policy we want for user allocations.
 * MAP = memory allocation policy.
 */
typedef enum {
    OPAL_HWLOC_BASE_MAP_NONE,
    OPAL_HWLOC_BASE_MAP_LOCAL_ONLY
} opal_hwloc_base_map_t;

/**
 * Global reflecting the MAP (set by MCA param).
 */
OPAL_DECLSPEC extern opal_hwloc_base_map_t opal_hwloc_base_map;

/**
 * Enum for what to do if the hwloc framework tries to bind memory
 * and fails. BFA = bind failure action.
 */
typedef enum {
    OPAL_HWLOC_BASE_MBFA_SILENT,
    OPAL_HWLOC_BASE_MBFA_WARN,
    OPAL_HWLOC_BASE_MBFA_ERROR
} opal_hwloc_base_mbfa_t;

/**
 * Global reflecting the BFA (set by MCA param).
 */
OPAL_DECLSPEC extern opal_hwloc_base_mbfa_t opal_hwloc_base_mbfa;

/* some critical helper functions */
OPAL_DECLSPEC int opal_hwloc_base_filter_cpus(hwloc_topology_t topo);
OPAL_DECLSPEC int opal_hwloc_base_get_topology(void);
OPAL_DECLSPEC void opal_hwloc_base_free_topology(hwloc_topology_t topo);
OPAL_DECLSPEC hwloc_cpuset_t opal_hwloc_base_get_available_cpus(hwloc_topology_t topo,
                                                                hwloc_obj_t obj);
OPAL_DECLSPEC unsigned int opal_hwloc_base_get_nbobjs_by_type(hwloc_topology_t topo,
                                                              hwloc_obj_type_t target,
                                                              unsigned cache_level,
                                                              opal_hwloc_resource_type_t rtype);
OPAL_DECLSPEC hwloc_obj_t opal_hwloc_base_get_obj_by_type(hwloc_topology_t topo,
                                                          hwloc_obj_type_t target,
                                                          unsigned cache_level,
                                                          unsigned int instance,
                                                          opal_hwloc_resource_type_t rtype);
OPAL_DECLSPEC unsigned int opal_hwloc_base_get_obj_idx(hwloc_topology_t topo,
                                                       hwloc_obj_t obj,
                                                       opal_hwloc_resource_type_t rtype);
OPAL_DECLSPEC void opal_hwloc_base_get_level_and_index(hwloc_cpuset_t cpus,
                                                       opal_hwloc_level_t *bind_level,
                                                       unsigned int *bind_idx);
OPAL_DECLSPEC unsigned int opal_hwloc_base_get_npus(hwloc_topology_t topo,
                                                    hwloc_obj_t target);
OPAL_DECLSPEC char* opal_hwloc_base_print_binding(opal_binding_policy_t binding);
OPAL_DECLSPEC char* opal_hwloc_base_print_locality(opal_paffinity_locality_t locality);
OPAL_DECLSPEC char* opal_hwloc_base_print_level(opal_hwloc_level_t level);

/**
 * Provide a utility to parse a slot list against the local
 * logical cpus, and produce a cpuset for the described binding
 */
OPAL_DECLSPEC int opal_hwloc_base_slot_list_parse(const char *slot_str,
                                                  hwloc_topology_t topo,
                                                  hwloc_cpuset_t cpumask);

/**
 * Report a bind failure using the normal mechanisms if a component
 * fails to bind memory -- according to the value of the
 * hwloc_base_bind_failure_action MCA parameter.
 */
OPAL_DECLSPEC int opal_hwloc_base_report_bind_failure(const char *file,
                                                      int line,
                                                      const char *msg,
                                                      int rc);

/**
 * This function sets the process-wide memory affinity policy
 * according to opal_hwloc_base_map and opal_hwloc_base_mbfa. It needs
 * to be a separate, standalone function (as opposed to being done
 * during opal_hwloc_base_open()) because opal_hwloc_topology is not
 * loaded by opal_hwloc_base_open(). Hence, an upper layer needs to
 * invoke this function after opal_hwloc_topology has been loaded.
 */
OPAL_DECLSPEC int opal_hwloc_base_set_process_membind_policy(void);

/* datatype support */
OPAL_DECLSPEC int opal_hwloc_pack(opal_buffer_t *buffer, const void *src,
@@ -100,80 +204,8 @@ OPAL_DECLSPEC int opal_hwloc_size(size_t *size,
                                  opal_data_type_t type);
OPAL_DECLSPEC void opal_hwloc_release(opal_dss_value_t *value);

/**
 * Report a bind failure using the normal mechanisms if a component
 * fails to bind memory -- according to the value of the
 * hwloc_base_bind_failure_action MCA parameter.
 */
OPAL_DECLSPEC int opal_hwloc_base_report_bind_failure(const char *file,
                                                      int line,
                                                      const char *msg,
                                                      int rc);

OPAL_DECLSPEC opal_paffinity_locality_t opal_hwloc_base_get_relative_locality(hwloc_topology_t topo,
                                                                              hwloc_cpuset_t peer1,
                                                                              hwloc_cpuset_t peer2);

OPAL_DECLSPEC void opal_hwloc_base_get_local_cpuset(void);

/* some critical helper functions */
OPAL_DECLSPEC int opal_hwloc_base_filter_cpus(hwloc_topology_t topo);
OPAL_DECLSPEC int opal_hwloc_base_get_topology(void);
OPAL_DECLSPEC void opal_hwloc_base_free_topology(hwloc_topology_t topo);
OPAL_DECLSPEC hwloc_cpuset_t opal_hwloc_base_get_available_cpus(hwloc_topology_t topo,
                                                                hwloc_obj_t obj);
OPAL_DECLSPEC unsigned int opal_hwloc_base_get_nbobjs_by_type(hwloc_topology_t topo,
                                                              hwloc_obj_type_t target,
                                                              unsigned cache_level,
                                                              opal_hwloc_resource_type_t rtype);
OPAL_DECLSPEC hwloc_obj_t opal_hwloc_base_get_obj_by_type(hwloc_topology_t topo,
                                                          hwloc_obj_type_t target,
                                                          unsigned cache_level,
                                                          unsigned int instance,
                                                          opal_hwloc_resource_type_t rtype);
OPAL_DECLSPEC unsigned int opal_hwloc_base_get_npus(hwloc_topology_t topo,
                                                    hwloc_obj_t target);

#endif

/**
 * Enum for what memory allocation policy we want for user allocations.
 * MAP = memory allocation policy.
 */
typedef enum {
    OPAL_HWLOC_BASE_MAP_NONE,
    OPAL_HWLOC_BASE_MAP_LOCAL_ONLY
} opal_hwloc_base_map_t;

/**
 * Global reflecting the MAP (set by MCA param).
 */
OPAL_DECLSPEC extern opal_hwloc_base_map_t opal_hwloc_base_map;

/**
 * Enum for what to do if the hwloc framework tries to bind memory
 * and fails. BFA = bind failure action.
 */
typedef enum {
    OPAL_HWLOC_BASE_MBFA_WARN,
    OPAL_HWLOC_BASE_MBFA_ERROR
} opal_hwloc_base_mbfa_t;

/**
 * Global reflecting the BFA (set by MCA param).
 */
OPAL_DECLSPEC extern opal_hwloc_base_mbfa_t opal_hwloc_base_mbfa;

/**
 * This function sets the process-wide memory affinity policy
 * according to opal_hwloc_base_map and opal_hwloc_base_mbfa. It needs
 * to be a separate, standalone function (as opposed to being done
 * during opal_hwloc_base_open()) because opal_hwloc_topology is not
 * loaded by opal_hwloc_base_open(). Hence, an upper layer needs to
 * invoke this function after opal_hwloc_topology has been loaded.
 */
OPAL_DECLSPEC int opal_hwloc_base_set_process_membind_policy(void);

END_C_DECLS

#endif /* OPAL_HWLOC_BASE_H */

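To show how these helpers compose, here is a hypothetical walk over the logical cores of a topology built only from the signatures declared above; OPAL_HWLOC_LOGICAL is the logical resource-type constant used elsewhere in this commit, and the header paths are assumptions for the sketch.

/* Hypothetical sketch using only the declarations above. */
#include "opal/mca/hwloc/base/base.h"
#include "opal/util/output.h"

static void show_cores(hwloc_topology_t topo)
{
    unsigned int i, ncores;
    hwloc_obj_t core;

    /* count the logical cores (cache_level is unused for non-cache objects) */
    ncores = opal_hwloc_base_get_nbobjs_by_type(topo, HWLOC_OBJ_CORE,
                                                0, OPAL_HWLOC_LOGICAL);
    for (i = 0; i < ncores; i++) {
        core = opal_hwloc_base_get_obj_by_type(topo, HWLOC_OBJ_CORE,
                                               0, i, OPAL_HWLOC_LOGICAL);
        if (NULL != core) {
            opal_output(0, "core %u: %u pus", i,
                        opal_hwloc_base_get_npus(topo, core));
        }
    }
}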
@@ -39,3 +39,25 @@ message will only be reported at most once per process.
  File: %s:%d
  Message: %s
  Severity: %s
#
[unrecognized-policy]
The specified %s policy is not recognized:

  Policy: %s

Please check for a typo or ensure that the option is a supported
one.
#
[logical-cpu-not-found]
A specified logical processor does not exist in this topology:

  Cpu set given: %s
#
[redefining-policy]
Conflicting directives for binding policy are causing the policy
to be redefined:

  New policy: %s
  Prior policy: %s

Please check that only one policy is defined.

@ -12,6 +12,7 @@
|
||||
|
||||
#include "opal/constants.h"
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/mca/mca.h"
|
||||
@ -39,7 +40,13 @@ bool opal_hwloc_base_inited = false;
|
||||
#if OPAL_HAVE_HWLOC
|
||||
hwloc_topology_t opal_hwloc_topology=NULL;
|
||||
hwloc_cpuset_t opal_hwloc_my_cpuset=NULL;
|
||||
hwloc_cpuset_t opal_hwloc_base_given_cpus=NULL;
|
||||
opal_hwloc_base_map_t opal_hwloc_base_map = OPAL_HWLOC_BASE_MAP_NONE;
|
||||
opal_hwloc_base_mbfa_t opal_hwloc_base_mbfa = OPAL_HWLOC_BASE_MBFA_WARN;
|
||||
opal_binding_policy_t opal_hwloc_binding_policy=0;
|
||||
char *opal_hwloc_base_slot_list=NULL;
|
||||
char *opal_hwloc_base_cpu_set=NULL;
|
||||
bool opal_hwloc_report_bindings=false;
|
||||
hwloc_obj_type_t opal_hwloc_levels[] = {
|
||||
HWLOC_OBJ_MACHINE,
|
||||
HWLOC_OBJ_NODE,
|
||||
@ -50,10 +57,8 @@ hwloc_obj_type_t opal_hwloc_levels[] = {
|
||||
HWLOC_OBJ_CORE,
|
||||
HWLOC_OBJ_PU
|
||||
};
|
||||
|
||||
bool opal_hwloc_use_hwthreads_as_cpus = false;
|
||||
#endif
|
||||
opal_hwloc_base_map_t opal_hwloc_base_map = OPAL_HWLOC_BASE_MAP_NONE;
|
||||
opal_hwloc_base_mbfa_t opal_hwloc_base_mbfa = OPAL_HWLOC_BASE_MBFA_ERROR;
|
||||
|
||||
|
||||
int opal_hwloc_base_open(void)
|
||||
@ -65,10 +70,11 @@ int opal_hwloc_base_open(void)
|
||||
|
||||
#if OPAL_HAVE_HWLOC
|
||||
{
|
||||
int value;
|
||||
int value, i;
|
||||
opal_data_type_t tmp;
|
||||
char *str_value;
|
||||
|
||||
char **tmpvals, **quals;
|
||||
|
||||
/* Debugging / verbose output */
|
||||
mca_base_param_reg_int_name("hwloc", "base_verbose",
|
||||
"Verbosity level of the hwloc framework",
|
||||
@ -102,12 +108,16 @@ int opal_hwloc_base_open(void)
|
||||
gethostname(hostname, sizeof(hostname));
|
||||
opal_show_help("help-opal-hwloc-base.txt", "invalid policy",
|
||||
true, hostname, getpid(), str_value);
|
||||
free(str_value);
|
||||
return OPAL_ERR_BAD_PARAM;
|
||||
}
|
||||
free(str_value);
|
||||
|
||||
/* hwloc_base_bind_failure_action */
|
||||
switch (opal_hwloc_base_mbfa) {
|
||||
case OPAL_HWLOC_BASE_MBFA_SILENT:
|
||||
str_value = "silent";
|
||||
break;
|
||||
case OPAL_HWLOC_BASE_MBFA_WARN:
|
||||
str_value = "warn";
|
||||
break;
|
||||
@ -116,9 +126,11 @@ int opal_hwloc_base_open(void)
|
||||
break;
|
||||
}
|
||||
mca_base_param_reg_string_name("hwloc", "base_mem_bind_failure_action",
|
||||
"What Open MPI will do if it explicitly tries to bind memory to a specific NUMA location, and fails. Note that this is a different case than the general allocation policy described by hwloc_base_alloc_policy. A value of \"warn\" means that Open MPI will warn the first time this happens, but allow the job to continue (possibly with degraded performance). A value of \"error\" means that Open MPI will abort the job if this happens.",
|
||||
"What Open MPI will do if it explicitly tries to bind memory to a specific NUMA location, and fails. Note that this is a different case than the general allocation policy described by hwloc_base_alloc_policy. A value of \"silent\" means that Open MPI will proceed without comment. A value of \"warn\" means that Open MPI will warn the first time this happens, but allow the job to continue (possibly with degraded performance). A value of \"error\" means that Open MPI will abort the job if this happens.",
|
||||
false, false, str_value, &str_value);
|
||||
if (strcasecmp(str_value, "warn") == 0) {
|
||||
if (strcasecmp(str_value, "silent") == 0) {
|
||||
opal_hwloc_base_mbfa = OPAL_HWLOC_BASE_MBFA_SILENT;
|
||||
} else if (strcasecmp(str_value, "warn") == 0) {
|
||||
opal_hwloc_base_mbfa = OPAL_HWLOC_BASE_MBFA_WARN;
|
||||
} else if (strcasecmp(str_value, "error") == 0) {
|
||||
opal_hwloc_base_mbfa = OPAL_HWLOC_BASE_MBFA_ERROR;
|
||||
@ -127,14 +139,123 @@ int opal_hwloc_base_open(void)
|
||||
gethostname(hostname, sizeof(hostname));
|
||||
opal_show_help("help-opal-hwloc-base.txt", "invalid error action",
|
||||
true, hostname, getpid(), str_value);
|
||||
free(str_value);
|
||||
return OPAL_ERR_BAD_PARAM;
|
||||
}
|
||||
free(str_value);
|
||||
|
||||
/* binding specification */
|
||||
mca_base_param_reg_string_name("hwloc", "base_binding_policy",
|
||||
"Policy for binding processes [none (default) | hwthread | core | l1cache | l2cache | l3cache | socket | numa | board] (supported qualifiers: overload-allowed,if-supported)",
|
||||
false, false, NULL, &str_value);
|
||||
if (NULL == str_value) {
|
||||
opal_hwloc_binding_policy = OPAL_BIND_TO_NONE;
|
||||
/* mark that no binding policy was specified */
|
||||
opal_hwloc_binding_policy &= ~OPAL_BIND_GIVEN;
|
||||
} else if (0 == strncasecmp(str_value, "none", strlen("none"))) {
|
||||
opal_hwloc_binding_policy = OPAL_BIND_TO_NONE;
|
||||
opal_hwloc_binding_policy |= OPAL_BIND_GIVEN;
|
||||
} else {
|
||||
opal_hwloc_binding_policy |= OPAL_BIND_GIVEN;
|
||||
tmpvals = opal_argv_split(str_value, ':');
|
||||
if (1 < opal_argv_count(tmpvals)) {
|
||||
quals = opal_argv_split(tmpvals[1], ',');
|
||||
for (i=0; NULL != quals[i]; i++) {
|
||||
if (0 == strcasecmp(quals[i], "if-supported")) {
|
||||
opal_hwloc_binding_policy |= OPAL_BIND_IF_SUPPORTED;
|
||||
} else if (0 == strcasecmp(quals[i], "overload-allowed")) {
|
||||
opal_hwloc_binding_policy |= OPAL_BIND_ALLOW_OVERLOAD;
|
||||
} else {
|
||||
/* unknown option */
|
||||
opal_output(0, "Unknown qualifier to orte_process_binding: %s", str_value);
|
||||
return OPAL_ERR_BAD_PARAM;
|
||||
}
|
||||
}
|
||||
opal_argv_free(quals);
|
||||
}
|
||||
if (0 == strcasecmp(tmpvals[0], "hwthread")) {
|
||||
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_HWTHREAD);
|
||||
} else if (0 == strcasecmp(tmpvals[0], "core")) {
|
||||
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CORE);
|
||||
} else if (0 == strcasecmp(tmpvals[0], "l1cache")) {
|
||||
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_L1CACHE);
|
||||
} else if (0 == strcasecmp(tmpvals[0], "l2cache")) {
|
||||
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_L2CACHE);
|
||||
} else if (0 == strcasecmp(tmpvals[0], "l3cache")) {
|
||||
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_L3CACHE);
|
||||
} else if (0 == strcasecmp(tmpvals[0], "socket")) {
|
||||
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_SOCKET);
|
||||
} else if (0 == strcasecmp(tmpvals[0], "numa")) {
|
||||
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_NUMA);
|
||||
} else if (0 == strcasecmp(tmpvals[0], "board")) {
|
||||
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_BOARD);
|
||||
} else {
|
||||
opal_show_help("help-opal-hwloc-base.txt", "unrecognized-policy", true, "binding", str_value);
|
||||
opal_argv_free(tmpvals);
|
||||
free(str_value);
|
||||
return OPAL_ERR_BAD_PARAM;
|
||||
}
|
||||
opal_argv_free(tmpvals);
|
||||
}
|
||||
free(str_value);
|
||||
|
||||
/* backward compatibility */
|
||||
mca_base_param_reg_int_name("hwloc", "base_bind_to_core",
|
||||
"Bind processes to cores",
|
||||
false, false, (int)false, &value);
|
||||
if (value) {
|
||||
/* set binding policy to core - error if something else already set */
|
||||
if (OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy) &&
|
||||
OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy) != OPAL_BIND_TO_CORE) {
|
||||
/* error - cannot redefine the default ranking policy */
|
||||
opal_show_help("help-opal-hwloc-base.txt", "redefining-policy", true,
|
||||
"core", opal_hwloc_base_print_binding(opal_hwloc_binding_policy));
|
||||
return OPAL_ERR_SILENT;
|
||||
}
|
||||
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CORE);
|
||||
opal_hwloc_binding_policy |= OPAL_BIND_GIVEN;
|
||||
}
|
||||
|
||||
mca_base_param_reg_int_name("hwloc", "base_bind_to_socket",
|
||||
"Bind processes to sockets",
|
||||
false, false, (int)false, &value);
|
||||
if (value) {
|
||||
/* set binding policy to socket - error if something else already set */
|
||||
if (OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy) &&
|
||||
OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy) != OPAL_BIND_TO_SOCKET) {
|
||||
/* error - cannot redefine the default ranking policy */
|
||||
opal_show_help("help-opal-hwloc-base.txt", "redefining-policy", true,
|
||||
"socket", opal_hwloc_base_print_binding(opal_hwloc_binding_policy));
|
||||
return OPAL_ERR_SILENT;
|
||||
}
|
||||
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_SOCKET);
|
||||
opal_hwloc_binding_policy |= OPAL_BIND_GIVEN;
|
||||
}
|
||||
|
||||
mca_base_param_reg_int_name("hwloc", "base_report_bindings",
|
||||
"Report bindings to stderr",
|
||||
false, false, (int)false, &value);
|
||||
opal_hwloc_report_bindings = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
/* did the user provide a slot list? */
|
||||
tmp = mca_base_param_reg_string_name("hwloc", "base_slot_list",
|
||||
"List of processor IDs to bind processes to [default=NULL]",
|
||||
false, false, NULL, &opal_hwloc_base_slot_list);
|
||||
if (NULL != opal_hwloc_base_slot_list) {
|
||||
/* if we already were given a policy, then this is an error */
|
||||
if (OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) {
|
||||
opal_show_help("help-opal-hwloc-base.txt", "redefining-policy", true,
|
||||
"socket", opal_hwloc_base_print_binding(opal_hwloc_binding_policy));
|
||||
return OPAL_ERR_SILENT;
|
||||
}
|
||||
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CPUSET);
|
||||
opal_hwloc_binding_policy |= OPAL_BIND_GIVEN;
|
||||
}
|
||||
|
||||
/* cpu allocation specification */
|
||||
mca_base_param_reg_string_name("hwloc", "base_cpu_set",
|
||||
"Comma-separated list of ranges specifying logical cpus allocated to this job [default: none]",
|
||||
false, false, NULL, &opal_hwloc_base_cpu_set);

    /* to support tools such as ompi_info, add the components
     * to a list
@ -147,6 +268,12 @@ int opal_hwloc_base_open(void)
        return OPAL_ERROR;
    }

    /* declare hwthreads as independent cpus */
    mca_base_param_reg_int_name("hwloc", "base_use_hwthreads_as_cpus",
                                "Use hardware threads as independent cpus",
                                false, false, (int)false, &value);
    opal_hwloc_use_hwthreads_as_cpus = OPAL_INT_TO_BOOL(value);

    /* declare the hwloc data types */
    tmp = OPAL_HWLOC_TOPO;
    if (OPAL_SUCCESS != (value = opal_dss.register_type(opal_hwloc_pack,
@ -171,6 +298,7 @@ static void obj_data_const(opal_hwloc_obj_data_t *ptr)
{
    ptr->available = NULL;
    ptr->npus = 0;
    ptr->idx = UINT_MAX;
}
static void obj_data_dest(opal_hwloc_obj_data_t *ptr)
{
@ -34,7 +34,7 @@ int opal_hwloc_base_set_process_membind_policy(void)
    if (NULL == opal_hwloc_topology) {
        return OPAL_ERR_BAD_PARAM;
    }


    /* Set the default memory allocation policy according to MCA
       param */
    switch (opal_hwloc_base_map) {
@ -54,10 +54,20 @@ int opal_hwloc_base_set_process_membind_policy(void)
    if (NULL == cpuset) {
        rc = OPAL_ERR_OUT_OF_RESOURCE;
    } else {
        int e;
        hwloc_get_cpubind(opal_hwloc_topology, cpuset, 0);
        rc = hwloc_set_membind(opal_hwloc_topology,
                               cpuset, HWLOC_MEMBIND_BIND, flags);
                               cpuset, policy, flags);
        e = errno;
        hwloc_bitmap_free(cpuset);

        /* See if hwloc was able to do it. If hwloc failed due to
           ENOSYS, but the base_map == NONE, then it's not really an
           error. */
        if (0 != rc && ENOSYS == e &&
            OPAL_HWLOC_BASE_MAP_NONE == opal_hwloc_base_map) {
            rc = 0;
        }
    }

    return (0 == rc) ? OPAL_SUCCESS : OPAL_ERROR;
The diff for this file is not shown because of its size.
@ -24,6 +24,9 @@
#include <stdarg.h>
#endif

#include "opal/class/opal_list.h"
#include "opal/class/opal_value_array.h"

#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"

@ -62,17 +65,13 @@ typedef struct opal_hwloc_base_component_2_0_0_t opal_hwloc_component_t;

END_C_DECLS

/* include implementation to call */
#if OPAL_HAVE_HWLOC
#include MCA_hwloc_IMPLEMENTATION_HEADER

/* Define a hierarchical level value that
 * helps resolve the hwloc behavior of
 * treating caches as a single type of
 * entity
 * entity - must always be available
 */
typedef enum {
    OPAL_HWLOC_NODE_LEVEL=1,
typedef enum uint8_t {
    OPAL_HWLOC_NODE_LEVEL=0,
    OPAL_HWLOC_NUMA_LEVEL,
    OPAL_HWLOC_SOCKET_LEVEL,
    OPAL_HWLOC_L3CACHE_LEVEL,
@ -81,6 +80,12 @@ typedef enum {
    OPAL_HWLOC_CORE_LEVEL,
    OPAL_HWLOC_HWTHREAD_LEVEL
} opal_hwloc_level_t;
#define OPAL_HWLOC_LEVEL_T OPAL_UINT8

/* include implementation to call */
#if OPAL_HAVE_HWLOC
#include MCA_hwloc_IMPLEMENTATION_HEADER


/* define type of processor info requested */
typedef uint8_t opal_hwloc_resource_type_t;
@ -93,6 +98,7 @@ typedef struct {
    opal_object_t super;
    hwloc_cpuset_t available;
    unsigned int npus;
    unsigned int idx;
} opal_hwloc_obj_data_t;
OBJ_CLASS_DECLARATION(opal_hwloc_obj_data_t);

@ -112,9 +118,46 @@ typedef struct {
} opal_hwloc_topo_data_t;
OBJ_CLASS_DECLARATION(opal_hwloc_topo_data_t);

/* define binding policies */
typedef uint16_t opal_binding_policy_t;
#define OPAL_BINDING_POLICY OPAL_UINT16

/* binding directives */
#define OPAL_BIND_IF_SUPPORTED   0x1000
#define OPAL_BIND_ALLOW_OVERLOAD 0x2000
#define OPAL_BIND_GIVEN          0x4000
/* binding policies */
#define OPAL_BIND_TO_NONE        1
#define OPAL_BIND_TO_BOARD       2
#define OPAL_BIND_TO_NUMA        3
#define OPAL_BIND_TO_SOCKET      4
#define OPAL_BIND_TO_L3CACHE     5
#define OPAL_BIND_TO_L2CACHE     6
#define OPAL_BIND_TO_L1CACHE     7
#define OPAL_BIND_TO_CORE        8
#define OPAL_BIND_TO_HWTHREAD    9
#define OPAL_BIND_TO_CPUSET      10
#define OPAL_GET_BINDING_POLICY(pol) \
    ((pol) & 0x0fff)
#define OPAL_SET_BINDING_POLICY(target, pol) \
    (target) = (pol) | ((target) & 0xf000)
/* check if policy is set */
#define OPAL_BINDING_POLICY_IS_SET(pol) \
    ((pol) & 0x4000)
/* macro to detect if binding was qualified */
#define OPAL_BINDING_REQUIRED(n) \
    (!(OPAL_BIND_IF_SUPPORTED & (n)))
/* macro to detect if binding is forced */
#define OPAL_BIND_OVERLOAD_ALLOWED(n) \
    (OPAL_BIND_ALLOW_OVERLOAD & (n))

/* some global values */
OPAL_DECLSPEC extern hwloc_topology_t opal_hwloc_topology;
OPAL_DECLSPEC extern opal_binding_policy_t opal_hwloc_binding_policy;
OPAL_DECLSPEC extern hwloc_cpuset_t opal_hwloc_my_cpuset;
OPAL_DECLSPEC extern bool opal_hwloc_report_bindings;
OPAL_DECLSPEC extern hwloc_obj_type_t opal_hwloc_levels[];
OPAL_DECLSPEC extern bool opal_hwloc_use_hwthreads_as_cpus;

#endif
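A quick illustration of how these bitmask macros compose (my own sketch, not part of the commit): the low 12 bits carry the policy value, the high bits carry the IF_SUPPORTED / ALLOW_OVERLOAD / GIVEN qualifiers, and OPAL_BINDING_POLICY_IS_SET simply tests the GIVEN bit.

    #include <assert.h>
    /* assumes the OPAL_BIND_* constants and macros from the header above */
    static void binding_policy_demo(void)
    {
        opal_binding_policy_t pol = 0;
        OPAL_SET_BINDING_POLICY(pol, OPAL_BIND_TO_CORE);   /* low bits = 8 */
        pol |= OPAL_BIND_GIVEN | OPAL_BIND_IF_SUPPORTED;   /* qualifiers in high bits */
        assert(OPAL_BINDING_POLICY_IS_SET(pol));
        assert(OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(pol));
        assert(!OPAL_BINDING_REQUIRED(pol));        /* "if supported" => not mandatory */
        assert(!OPAL_BIND_OVERLOAD_ALLOWED(pol));   /* overload bit not set */
    }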

@ -1,6 +1,7 @@
# Copyright © 2009-2010 INRIA.  All rights reserved.
# Copyright © 2009-2010 Université Bordeaux 1
# Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
# Copyright © 2011 Oracle and/or its affiliates.  All rights reserved.
# See COPYING in top-level directory.

# Only install the headers if we're in standalone mode (meaning:
@ -33,6 +34,11 @@ include_hwloc_HEADERS += \
	hwloc/linux-libnuma.h
endif HWLOC_HAVE_LINUX

if HWLOC_HAVE_SOLARIS
include_hwloc_HEADERS += \
	private/solaris-chiptype.h
endif HWLOC_HAVE_SOLARIS

if HWLOC_HAVE_SCHED_SETAFFINITY
include_hwloc_HEADERS += hwloc/glibc-sched.h
endif HWLOC_HAVE_SCHED_SETAFFINITY
@ -0,0 +1,46 @@
/*
 * Copyright (c) 2009-2010 Oracle and/or its affiliates.  All rights reserved.
 *
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

/* SPARC Chip Modes. */
#define MODE_UNKNOWN            0
#define MODE_SPITFIRE           1
#define MODE_BLACKBIRD          2
#define MODE_CHEETAH            3
#define MODE_SPARC64_VI         4
#define MODE_T1                 5
#define MODE_T2                 6
#define MODE_SPARC64_VII        7
#define MODE_ROCK               8

/* SPARC Chip Implementations. */
#define IMPL_SPARC64_VI         0x6
#define IMPL_SPARC64_VII        0x7
#define IMPL_SPITFIRE           0x10
#define IMPL_BLACKBIRD          0x11
#define IMPL_SABRE              0x12
#define IMPL_HUMMINGBIRD        0x13
#define IMPL_CHEETAH            0x14
#define IMPL_CHEETAHPLUS        0x15
#define IMPL_JALAPENO           0x16
#define IMPL_JAGUAR             0x18
#define IMPL_PANTHER            0x19
#define IMPL_NIAGARA            0x23
#define IMPL_NIAGARA_2          0x24
#define IMPL_ROCK               0x25

/* Default Mfg, Cache, Speed settings */
#define TI_MANUFACTURER         0x17
#define TWO_MEG_CACHE           2097152
#define SPITFIRE_SPEED          142943750

char* hwloc_solaris_get_chip_type(void);
char* hwloc_solaris_get_chip_model(void);
@ -1,6 +1,7 @@
# Copyright © 2009-2010 INRIA.  All rights reserved.
# Copyright © 2009-2010 Université Bordeaux 1
# Copyright © 2009-2010 Cisco Systems, Inc.  All rights reserved.
# Copyright © 2011 Oracle and/or its affiliates.  All rights reserved.
# See COPYING in top-level directory.

AM_CFLAGS = $(HWLOC_CFLAGS)
@ -35,6 +36,8 @@ ldflags =

if HWLOC_HAVE_SOLARIS
sources += topology-solaris.c
sources += topology-solaris-chiptype.c
ldflags += -lpicl
endif HWLOC_HAVE_SOLARIS

if HWLOC_HAVE_LINUX
opal/mca/hwloc/hwloc122ompi/hwloc/src/topology-solaris-chiptype.c (new file, 321 lines)
@ -0,0 +1,321 @@
/*
 * Copyright (c) 2009-2010 Oracle and/or its affiliates.  All rights reserved.
 *
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include <private/solaris-chiptype.h>

#include <stdlib.h>
#include <strings.h>
#include <sys/systeminfo.h>
#include <picl.h>

/*****************************************************************************
   Order of this list is important for the assign_value and
   assign_string_value routines
*****************************************************************************/

static const char* items[] = {
    "clock-frequency",
    "cpu-mhz",
    "ecache-size",
    "l2-cache-size",
    "sectored-l2-cache-size",
    "implementation#",
    "manufacturer#",
    "compatible",
    "ProcessorType",
    "vendor-id",
    "brand-string"
};

#define NUM_ITEMS (sizeof(items) / sizeof(items[0]))

/*****************************************************************************
   SPARC strings for chip modes and implementation
*****************************************************************************/
static const char* sparc_modes[] = {
    "UNKNOWN",
    "SPITFIRE",
    "BLACKBIRD",
    "CHEETAH",
    "SPARC64_VI",
    "T1",
    "T2",
    "SPARC64_VII",
    "ROCK"
};

/*****************************************************************************
   Default values are for Unknown so we can build up from there.
*****************************************************************************/

static long dss_chip_mode = MODE_UNKNOWN;
static long dss_chip_impl = IMPL_SPITFIRE;
static long dss_chip_cache = TWO_MEG_CACHE;
static long dss_chip_manufacturer = TI_MANUFACTURER;
static long long dss_chip_speed = SPITFIRE_SPEED;
static char dss_chip_type[PICL_PROPNAMELEN_MAX];
static char dss_chip_model[PICL_PROPNAMELEN_MAX];
static int called_cpu_probe = 0;

/*****************************************************************************
   Assigns values based on the value of index.  For this reason, the order of
   the items array is important.
*****************************************************************************/
static void assign_value(int index, long long val) {
    if (index == 0) { /* clock-frequency */
        dss_chip_speed = val;
    }
    if (index == 1) { /* cpu-mhz */
        dss_chip_speed = val * 1000000; /* Scale since value was in MHz */
    }
    else if ((index >= 2) && (index <= 4)) {
        /* ecache-size, l2-cache-size, sectored-l2-cache-size */
        dss_chip_cache = val;
    }
    else if (index == 5) {
        /* implementation#  T1, T2, and Rock do not have this, see RFE 6615268 */
        dss_chip_impl = val;
        if (dss_chip_impl == IMPL_SPITFIRE) {
            dss_chip_mode = 1;
        }
        else if ((dss_chip_impl >= IMPL_BLACKBIRD) &&
                 (dss_chip_impl <= IMPL_HUMMINGBIRD)) {
            dss_chip_mode = 2;
        }
        else if ((dss_chip_impl >= IMPL_CHEETAH) &&
                 (dss_chip_impl <= IMPL_PANTHER)) {
            dss_chip_mode = 3;
        }
        else if (dss_chip_impl == IMPL_SPARC64_VI) {
            dss_chip_mode = 4;
        }
        else if (dss_chip_impl == IMPL_NIAGARA) {
            dss_chip_mode = 5;
        }
        else if (dss_chip_impl == IMPL_NIAGARA_2) {
            dss_chip_mode = 6;
        }
        else if (dss_chip_impl == IMPL_SPARC64_VII) {
            dss_chip_mode = 7;
        }
        else if (dss_chip_impl == IMPL_ROCK) {
            dss_chip_mode = 8;
        }
    }
    else if (index == 6) { /* manufacturer# */
        dss_chip_manufacturer = val;
    }
}

/*****************************************************************************
   Assigns values based on the value of index.  For this reason, the order of
   the items array is important.
*****************************************************************************/
static void assign_string_value(int index, char* string_val) {
    if (index == 7) { /* compatible */
        if (strncasecmp(string_val, "FJSV,SPARC64-VI",
                        PICL_PROPNAMELEN_MAX) == 0) {
            dss_chip_mode = 4;
        }
        else if (strncasecmp(string_val, "SUNW,UltraSPARC-T1",
                             PICL_PROPNAMELEN_MAX) == 0) {
            dss_chip_mode = 5;
        }
        else if (strncasecmp(string_val, "SUNW,UltraSPARC-T2",
                             PICL_PROPNAMELEN_MAX) == 0) {
            dss_chip_mode = 6;
        }
        else if (strncasecmp(string_val, "FJSV,SPARC64-VII",
                             PICL_PROPNAMELEN_MAX) == 0) {
            dss_chip_mode = 7;
        }
        else if (strncasecmp(string_val, "SUNW,Rock",
                             PICL_PROPNAMELEN_MAX) == 0) {
            dss_chip_mode = 8;
        }
    } else if (index == 8) { /* ProcessorType */
        strncpy(&dss_chip_type[0], string_val, PICL_PROPNAMELEN_MAX);
    } else if (index == 10) { /* brand-string */
        strncpy(&dss_chip_model[0], string_val, PICL_PROPNAMELEN_MAX);
    }

}

/*****************************************************************************
   Gets called by probe_cpu.  Cycles through the table values until we find
   what we are looking for.
*****************************************************************************/
static int search_table(int index, picl_prophdl_t table_hdl) {

    picl_prophdl_t  col_hdl;
    picl_prophdl_t  row_hdl;
    picl_propinfo_t p_info;
    int             val;
    char            string_val[PICL_PROPNAMELEN_MAX];

    for (val = picl_get_next_by_col(table_hdl, &row_hdl); val != PICL_ENDOFLIST;
         val = picl_get_next_by_col(row_hdl, &row_hdl)) {
        if (val == PICL_SUCCESS) {
            for (col_hdl = row_hdl; val != PICL_ENDOFLIST;
                 val = picl_get_next_by_row(col_hdl, &col_hdl)) {
                if (val == PICL_SUCCESS) {
                    val = picl_get_propinfo(col_hdl, &p_info);
                    if (val == PICL_SUCCESS) {
                        if (p_info.type == PICL_PTYPE_CHARSTRING) {
                            val = picl_get_propval(col_hdl, &string_val, sizeof(string_val));
                            if (val == PICL_SUCCESS) {
                                assign_string_value(index, string_val);
                            }
                        }
                    }
                }
            }
        }
    }
}

/*****************************************************************************
   Gets called by picl_walk_tree_by_class. Then it cycles through the properties
   until we find what we are looking for.  Once we are done, we return
   PICL_WALK_TERMINATE to stop picl_walk_tree_by_class from traversing the tree.

   Note that PICL_PTYPE_UNSIGNED_INT and PICL_PTYPE_INT can either be 4-bytes
   or 8-bytes.
*****************************************************************************/
static int probe_cpu(picl_nodehdl_t node_hdl, void* dummy_arg) {

    picl_prophdl_t  p_hdl;
    picl_prophdl_t  table_hdl;
    picl_propinfo_t p_info;
    long long       long_long_val;
    unsigned int    uint_val;
    int             index;
    int             int_val;
    int             val;
    char            string_val[PICL_PROPNAMELEN_MAX];

    val = picl_get_first_prop(node_hdl, &p_hdl);
    while (val == PICL_SUCCESS) {
        called_cpu_probe = 1;
        val = picl_get_propinfo(p_hdl, &p_info);
        if (val == PICL_SUCCESS) {
            for (index = 0; index < NUM_ITEMS; index++) {
                if (strcasecmp(p_info.name, items[index]) == 0) {
                    if (p_info.type == PICL_PTYPE_UNSIGNED_INT) {
                        if (p_info.size == sizeof(uint_val)) {
                            val = picl_get_propval(p_hdl, &uint_val, sizeof(uint_val));
                            if (val == PICL_SUCCESS) {
                                long_long_val = uint_val;
                                assign_value(index, long_long_val);
                            }
                        }
                        else if (p_info.size == sizeof(long_long_val)) {
                            val = picl_get_propval(p_hdl, &long_long_val,
                                                   sizeof(long_long_val));
                            if (val == PICL_SUCCESS) {
                                assign_value(index, long_long_val);
                            }
                        }
                    }
                    else if (p_info.type == PICL_PTYPE_INT) {
                        if (p_info.size == sizeof(int_val)) {
                            val = picl_get_propval(p_hdl, &int_val, sizeof(int_val));
                            if (val == PICL_SUCCESS) {
                                long_long_val = int_val;
                                assign_value(index, long_long_val);
                            }
                        }
                        else if (p_info.size == sizeof(long_long_val)) {
                            val = picl_get_propval(p_hdl, &long_long_val,
                                                   sizeof(long_long_val));
                            if (val == PICL_SUCCESS) {
                                assign_value(index, long_long_val);
                            }
                        }
                    }
                    else if (p_info.type == PICL_PTYPE_CHARSTRING) {
                        val = picl_get_propval(p_hdl, &string_val, sizeof(string_val));
                        if (val == PICL_SUCCESS) {
                            assign_string_value(index, string_val);
                        }
                    }
                    else if (p_info.type == PICL_PTYPE_TABLE) {
                        val = picl_get_propval(p_hdl, &table_hdl, p_info.size);
                        if (val == PICL_SUCCESS) {
                            search_table(index, table_hdl);
                        }
                    }
                    break;
                } else if (index == NUM_ITEMS-1) {
                    if (p_info.type == PICL_PTYPE_CHARSTRING) {
                        val = picl_get_propval(p_hdl, &string_val, sizeof(string_val));
                        if (val == PICL_SUCCESS) {
                        }
                    }
                }
            }
        }

        val = picl_get_next_prop(p_hdl, &p_hdl);
    }
    return PICL_WALK_TERMINATE;
}


/*****************************************************************************
   Initializes, gets the root, then walks the picl tree looking for information

   Currently, the "core" class is only needed for OPL systems
*****************************************************************************/
char* hwloc_solaris_get_chip_type(void) {
    picl_nodehdl_t root;
    int            val;
    static char    chip_type[PICL_PROPNAMELEN_MAX];

    val = picl_initialize();
    if (val != PICL_SUCCESS) { /* Can't initialize session with PICL daemon */
        return(NULL);
    }
    val = picl_get_root(&root);
    if (val != PICL_SUCCESS) { /* Failed to get root node of the PICL tree */
        return(NULL);
    }
    val = picl_walk_tree_by_class(root, "cpu", (void *)NULL, probe_cpu);
    val = picl_walk_tree_by_class(root, "core", (void *)NULL, probe_cpu);
    picl_shutdown();

    if (called_cpu_probe) {
        strncpy(chip_type, dss_chip_type, PICL_PROPNAMELEN_MAX);
    } else {
        /* no picl information on machine available */
        sysinfo(SI_HW_PROVIDER, chip_type, PICL_PROPNAMELEN_MAX);
    }
    return(chip_type);
}

/*****************************************************************************
   Initializes, gets the root, then walks the picl tree looking for information

   Currently, the "core" class is only needed for OPL systems
*****************************************************************************/
char *hwloc_solaris_get_chip_model(void) {

    if (called_cpu_probe) {
        if (dss_chip_mode != MODE_UNKNOWN) { /* SPARC chip */
            strncpy(dss_chip_model, sparc_modes[dss_chip_mode],
                    PICL_PROPNAMELEN_MAX);
        }
    } else {
        /* no picl information on machine available */
        sysinfo(SI_PLATFORM, dss_chip_model, PICL_PROPNAMELEN_MAX);
    }
    return(dss_chip_model);
}
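For context, a minimal standalone caller (hypothetical, not in the commit) showing how the two public entry points are meant to be used; both fall back to sysinfo() data when the PICL probe never ran, and get_chip_type() can return NULL if the PICL session cannot be set up at all.

    /* hypothetical caller - not part of the commit */
    #include <stdio.h>
    #include <private/solaris-chiptype.h>

    int main(void)
    {
        char *type  = hwloc_solaris_get_chip_type();   /* NULL if PICL cannot be initialized */
        char *model = hwloc_solaris_get_chip_model();
        printf("CPUType=%s CPUModel=%s\n",
               (NULL == type)  ? "unknown" : type,
               (NULL == model) ? "unknown" : model);
        return 0;
    }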

@ -3,6 +3,7 @@
 * Copyright © 2009-2011 INRIA.  All rights reserved.
 * Copyright © 2009-2011 Université Bordeaux 1
 * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
 * Copyright © 2011 Oracle and/or its affiliates.  All rights reserved.
 * See COPYING in top-level directory.
 */

@ -10,6 +11,7 @@
#include <hwloc.h>
#include <private/private.h>
#include <private/debug.h>
#include <private/solaris-chiptype.h>

#include <stdio.h>
#include <errno.h>
@ -607,9 +609,22 @@ hwloc_look_kstat(struct hwloc_topology *topology)
     * pkg_core_id for the core ID (not unique). They are not useful to us
     * however. */
  }

  if (look_chips)
    hwloc_setup_level(procid_max, numsockets, osphysids, proc_physids, topology, HWLOC_OBJ_SOCKET);
  if (look_chips) {
    /* Set up the Socket object inline instead of using hwloc_setup_level
     * so we can add the CPUVendor and CPUModel info objects.
     */
    struct hwloc_obj *obj;
    unsigned j;
    for (j = 0; j < numsockets; j++) {
      obj = hwloc_alloc_setup_object(HWLOC_OBJ_SOCKET, osphysids[j]);
      hwloc_object_cpuset_from_array(obj, j, proc_physids, procid_max);
      hwloc_debug_2args_bitmap("%s %d has cpuset %s\n",
                               hwloc_obj_type_string(HWLOC_OBJ_SOCKET),
                               j, obj->cpuset);
      hwloc_insert_object_by_cpuset(topology, obj);
    }
    hwloc_debug("%s", "\n");
  }

  if (look_cores)
    hwloc_setup_level(procid_max, numcores, oscoreids, proc_coreids, topology, HWLOC_OBJ_CORE);
@ -627,17 +642,30 @@ void
hwloc_look_solaris(struct hwloc_topology *topology)
{
  unsigned nbprocs = hwloc_fallback_nbprocessors (topology);
  char *CPUType;
  char *CPUModel;
#ifdef HAVE_LIBLGRP
  hwloc_look_lgrp(topology);
#endif /* HAVE_LIBLGRP */
#ifdef HAVE_LIBKSTAT
  nbprocs = 0;
  if (hwloc_look_kstat(topology))
    return;
  if (hwloc_look_kstat(topology)) {
    /* Set CPU Type and Model for machine. */
    CPUType = hwloc_solaris_get_chip_type();
    CPUModel = hwloc_solaris_get_chip_model();
    hwloc_add_object_info(topology->levels[0][0], "CPUType", CPUType);
    hwloc_add_object_info(topology->levels[0][0], "CPUModel", CPUModel);
    return;
  }
#endif /* HAVE_LIBKSTAT */
  hwloc_setup_pu_level(topology, nbprocs);

  hwloc_add_object_info(topology->levels[0][0], "Backend", "Solaris");

  /* Set CPU Type and Model for machine. */
  CPUType = hwloc_solaris_get_chip_type();
  CPUModel = hwloc_solaris_get_chip_model();
  hwloc_add_object_info(topology->levels[0][0], "CPUType", CPUType);
  hwloc_add_object_info(topology->levels[0][0], "CPUModel", CPUModel);
}

void
@ -205,6 +205,9 @@ opal_err2str(int errnum, const char **errmsg)
    case OPAL_ERR_NETWORK_NOT_PARSEABLE:
        retval = "Provided network specification is not parseable";
        break;
    case OPAL_ERR_SILENT:
        retval = NULL;
        break;
    default:
        retval = NULL;
    }
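The NULL return for OPAL_ERR_SILENT is the whole mechanism: an error code whose string is NULL produces no message, on the assumption that the callee already issued an opal_show_help() of its own. A sketch of the caller side (the wrapper name here is an assumption, not from the diff):

    /* sketch only - report_error() is hypothetical */
    static void report_error(int errnum)
    {
        const char *msg = NULL;
        opal_err2str(errnum, &msg);
        if (NULL != msg) {
            fprintf(stderr, "error: %s\n", msg);
        }
        /* OPAL_ERR_SILENT leaves msg NULL, so nothing is printed twice */
    }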

@ -74,7 +74,8 @@ enum {
    ORTE_ERR_INVALID_PHYS_CPU = OPAL_ERR_INVALID_PHYS_CPU,
    ORTE_ERR_MULTIPLE_AFFINITIES = OPAL_ERR_MULTIPLE_AFFINITIES,
    ORTE_ERR_SLOT_LIST_RANGE = OPAL_ERR_SLOT_LIST_RANGE,

    ORTE_ERR_SILENT = OPAL_ERR_SILENT,

    /* error codes specific to ORTE - don't forget to update
       orte/util/error_strings.c when adding new error codes!!
       Otherwise, the error reporting system will potentially crash,
@ -95,35 +96,34 @@ enum {
    ORTE_ERR_INDETERMINATE_STATE_INFO = (ORTE_ERR_BASE - 13),
    ORTE_ERR_NODE_FULLY_USED = (ORTE_ERR_BASE - 14),
    ORTE_ERR_INVALID_NUM_PROCS = (ORTE_ERR_BASE - 15),
    ORTE_ERR_SILENT = (ORTE_ERR_BASE - 16),
    ORTE_ERR_ADDRESSEE_UNKNOWN = (ORTE_ERR_BASE - 17),
    ORTE_ERR_SYS_LIMITS_PIPES = (ORTE_ERR_BASE - 18),
    ORTE_ERR_PIPE_SETUP_FAILURE = (ORTE_ERR_BASE - 19),
    ORTE_ERR_SYS_LIMITS_CHILDREN = (ORTE_ERR_BASE - 20),
    ORTE_ERR_FAILED_GET_TERM_ATTRS = (ORTE_ERR_BASE - 21),
    ORTE_ERR_WDIR_NOT_FOUND = (ORTE_ERR_BASE - 22),
    ORTE_ERR_EXE_NOT_FOUND = (ORTE_ERR_BASE - 23),
    ORTE_ERR_PIPE_READ_FAILURE = (ORTE_ERR_BASE - 24),
    ORTE_ERR_EXE_NOT_ACCESSIBLE = (ORTE_ERR_BASE - 25),
    ORTE_ERR_FAILED_TO_START = (ORTE_ERR_BASE - 26),
    ORTE_ERR_FILE_NOT_EXECUTABLE = (ORTE_ERR_BASE - 27),
    ORTE_ERR_HNP_COULD_NOT_START = (ORTE_ERR_BASE - 28),
    ORTE_ERR_SYS_LIMITS_SOCKETS = (ORTE_ERR_BASE - 29),
    ORTE_ERR_SOCKET_NOT_AVAILABLE = (ORTE_ERR_BASE - 30),
    ORTE_ERR_SYSTEM_WILL_BOOTSTRAP = (ORTE_ERR_BASE - 31),
    ORTE_ERR_RESTART_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 32),
    ORTE_ERR_INVALID_NODE_RANK = (ORTE_ERR_BASE - 33),
    ORTE_ERR_INVALID_LOCAL_RANK = (ORTE_ERR_BASE - 34),
    ORTE_ERR_UNRECOVERABLE = (ORTE_ERR_BASE - 35),
    ORTE_ERR_MEM_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 36),
    ORTE_ERR_HEARTBEAT_LOST = (ORTE_ERR_BASE - 37),
    ORTE_ERR_PROC_STALLED = (ORTE_ERR_BASE - 38),
    ORTE_ERR_NO_APP_SPECIFIED = (ORTE_ERR_BASE - 39),
    ORTE_ERR_NO_EXE_SPECIFIED = (ORTE_ERR_BASE - 40),
    ORTE_ERR_COMM_DISABLED = (ORTE_ERR_BASE - 41),
    ORTE_ERR_FAILED_TO_MAP = (ORTE_ERR_BASE - 42),
    ORTE_ERR_TAKE_NEXT_OPTION = (ORTE_ERR_BASE - 43),
    ORTE_ERR_SENSOR_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 44)
    ORTE_ERR_ADDRESSEE_UNKNOWN = (ORTE_ERR_BASE - 16),
    ORTE_ERR_SYS_LIMITS_PIPES = (ORTE_ERR_BASE - 17),
    ORTE_ERR_PIPE_SETUP_FAILURE = (ORTE_ERR_BASE - 18),
    ORTE_ERR_SYS_LIMITS_CHILDREN = (ORTE_ERR_BASE - 19),
    ORTE_ERR_FAILED_GET_TERM_ATTRS = (ORTE_ERR_BASE - 20),
    ORTE_ERR_WDIR_NOT_FOUND = (ORTE_ERR_BASE - 21),
    ORTE_ERR_EXE_NOT_FOUND = (ORTE_ERR_BASE - 22),
    ORTE_ERR_PIPE_READ_FAILURE = (ORTE_ERR_BASE - 23),
    ORTE_ERR_EXE_NOT_ACCESSIBLE = (ORTE_ERR_BASE - 24),
    ORTE_ERR_FAILED_TO_START = (ORTE_ERR_BASE - 25),
    ORTE_ERR_FILE_NOT_EXECUTABLE = (ORTE_ERR_BASE - 26),
    ORTE_ERR_HNP_COULD_NOT_START = (ORTE_ERR_BASE - 27),
    ORTE_ERR_SYS_LIMITS_SOCKETS = (ORTE_ERR_BASE - 28),
    ORTE_ERR_SOCKET_NOT_AVAILABLE = (ORTE_ERR_BASE - 29),
    ORTE_ERR_SYSTEM_WILL_BOOTSTRAP = (ORTE_ERR_BASE - 30),
    ORTE_ERR_RESTART_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 31),
    ORTE_ERR_INVALID_NODE_RANK = (ORTE_ERR_BASE - 32),
    ORTE_ERR_INVALID_LOCAL_RANK = (ORTE_ERR_BASE - 33),
    ORTE_ERR_UNRECOVERABLE = (ORTE_ERR_BASE - 34),
    ORTE_ERR_MEM_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 35),
    ORTE_ERR_HEARTBEAT_LOST = (ORTE_ERR_BASE - 36),
    ORTE_ERR_PROC_STALLED = (ORTE_ERR_BASE - 37),
    ORTE_ERR_NO_APP_SPECIFIED = (ORTE_ERR_BASE - 38),
    ORTE_ERR_NO_EXE_SPECIFIED = (ORTE_ERR_BASE - 39),
    ORTE_ERR_COMM_DISABLED = (ORTE_ERR_BASE - 40),
    ORTE_ERR_FAILED_TO_MAP = (ORTE_ERR_BASE - 41),
    ORTE_ERR_TAKE_NEXT_OPTION = (ORTE_ERR_BASE - 42),
    ORTE_ERR_SENSOR_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 43)
};

#define ORTE_ERR_MAX (ORTE_ERR_BASE - 100)
@ -1,5 +1,5 @@
/*
 * Copyright (c) 2010      Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2010-2011 Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2010      Oracle and/or its affiliates.  All rights reserved.
 * Copyright (c) 2010-2011 Los Alamos National Security, LLC.
 *                         All rights reserved.
@ -318,12 +318,12 @@ static void attach_debugger(int fd, short event, void *arg)
        build_debugger_args(app);
        opal_pointer_array_add(jdata->apps, app);
        jdata->num_apps = 1;
        /* setup the mapping policy to bynode so we get one
        /* setup the mapping policy to pernode so we get one
         * daemon on each node
         */
        jdata->map = OBJ_NEW(orte_job_map_t);
        jdata->map->policy = ORTE_MAPPING_BYNODE;
        jdata->map->npernode = 1;
        jdata->map->mapping = ORTE_MAPPING_PPR;
        jdata->map->ppr = strdup("1:n");
        /* now go ahead and spawn this job */
        if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
            ORTE_ERROR_LOG(rc);
@ -282,14 +282,14 @@ static int modex(opal_list_t *procs)
{
    int rc;
    opal_buffer_t buf, rbuf;
    char *locale=NULL;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
                         "%s grpcomm:bad: modex entered",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    if (NULL == procs) {
        /* The modex will be realized in the background by the daemons. The processes will
        /* This is a modex across our peers at startup. The modex will be realized in the
         * background by the daemons. The processes will
         * only be informed when all data has been collected from all processes. The get_attr
         * will realize the blocking, it will not return until the data has been received.
         */
@ -308,47 +308,6 @@ static int modex(opal_list_t *procs)
            goto cleanup;
        }

#if OPAL_HAVE_HWLOC
        {
            if (NULL != opal_hwloc_topology) {
                /* our cpuset should already be known, but check for safety */
                if (NULL == opal_hwloc_my_cpuset) {
                    opal_hwloc_base_get_local_cpuset();
                }
                /* convert to a string */
                hwloc_bitmap_list_asprintf(&locale, opal_hwloc_my_cpuset);
                OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                                     "%s grpcomm:bad LOCALE %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), locale));
                /* pack it */
                if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &locale, 1, OPAL_STRING))) {
                    ORTE_ERROR_LOG(rc);
                    free(locale);
                    goto cleanup;
                }
                free(locale);
            } else {
                OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                                     "%s grpcomm:bad NO TOPO - ADDING PLACEHOLDER",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                /* pack a placeholder */
                if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &locale, 1, OPAL_STRING))) {
                    ORTE_ERROR_LOG(rc);
                    goto cleanup;
                }
            }
        }
#else
        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:bad NO HWLOC - ADDING PLACEHOLDER",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* pack a placeholder */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &locale, 1, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
#endif

        /* pack the entries we have received */
        if (ORTE_SUCCESS != (rc = orte_grpcomm_base_pack_modex_entries(&buf))) {
            ORTE_ERROR_LOG(rc);
@ -375,6 +334,9 @@ static int modex(opal_list_t *procs)

        return rc;
    } else {
        /* this is a modex across a specified list of procs, usually during
         * a connect/accept.
         */
        if (ORTE_SUCCESS != (rc = orte_grpcomm_base_full_modex(procs))) {
            ORTE_ERROR_LOG(rc);
        }
@ -90,7 +90,6 @@ ORTE_DECLSPEC int orte_grpcomm_base_set_proc_attr(const char *attr_name,
ORTE_DECLSPEC int orte_grpcomm_base_get_proc_attr(const orte_process_name_t proc,
                                                  const char * attribute_name, void **val,
                                                  size_t *size);
ORTE_DECLSPEC int orte_grpcomm_base_peer_modex(void);
ORTE_DECLSPEC int orte_grpcomm_base_modex_unpack( opal_buffer_t* rbuf);
ORTE_DECLSPEC int orte_grpcomm_base_full_modex(opal_list_t *procs);
ORTE_DECLSPEC int orte_grpcomm_base_purge_proc_attrs(void);
@ -61,7 +61,6 @@ int orte_grpcomm_base_full_modex(opal_list_t *procs)
    orte_pmap_t *pmap;
    orte_vpid_t daemon;
    char *hostname;
    char *locale=NULL;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
                         "%s grpcomm:base:full:modex: performing modex",
@ -104,42 +103,12 @@ int orte_grpcomm_base_full_modex(opal_list_t *procs)
    }

#if OPAL_HAVE_HWLOC
    {
        /* get and pack our cpuset so other procs can determine our locality */
        if (NULL != opal_hwloc_topology) {
            /* our cpuset should already be known, but check for safety */
            if (NULL == opal_hwloc_my_cpuset) {
                opal_hwloc_base_get_local_cpuset();
            }
            /* convert to a string */
            hwloc_bitmap_list_asprintf(&locale, opal_hwloc_my_cpuset);
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                                 "%s grpcomm:base:modex LOCALE %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), locale));
            /* pack it */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &locale, 1, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                free(locale);
                goto cleanup;
            }
            free(locale);
        } else {
            /* pack a placeholder */
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                                 "%s grpcomm:base:modex NO TOPO - ADDING PLACEHOLDER",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &locale, 1, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
        }
    /* pack our binding info so other procs can determine our locality */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &orte_process_info.bind_level, 1, OPAL_HWLOC_LEVEL_T))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
#else
    /* pack a placeholder */
    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                         "%s grpcomm:base:modex NO HWLOC - ADDING PLACEHOLDER",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &locale, 1, OPAL_STRING))) {
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &orte_process_info.bind_idx, 1, OPAL_UINT))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
@ -155,12 +124,20 @@ int orte_grpcomm_base_full_modex(opal_list_t *procs)
                         "%s grpcomm:base:full:modex: executing allgather",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* exchange the buffer with the list of peers */
    if (ORTE_SUCCESS != (rc = orte_grpcomm.allgather_list(procs, &buf, &rbuf))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    if (NULL == procs) {
        /* exchange the buffer with my peers */
        if (ORTE_SUCCESS != (rc = orte_grpcomm.allgather(&buf, &rbuf))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
    } else {
        /* exchange the buffer with the list of peers */
        if (ORTE_SUCCESS != (rc = orte_grpcomm.allgather_list(procs, &buf, &rbuf))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
    }


    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
                         "%s grpcomm:base:full:modex: processing modex info",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@ -234,7 +211,8 @@ int orte_grpcomm_base_full_modex(opal_list_t *procs)
            /* node wasn't found - let's add it */
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                                 "%s grpcomm:base:full:modex no nidmap entry for node %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostname));
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 (NULL == hostname) ? "NULL" : hostname));
            nid = OBJ_NEW(orte_nid_t);
            nid->name = strdup(hostname);
            nid->daemon = daemon;
@ -287,19 +265,63 @@ int orte_grpcomm_base_full_modex(opal_list_t *procs)
            }
        }

        /* unpack the locality info */
        cnt = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &locale, &cnt, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
                             "%s grpcomm:base:modex setting proc %s locale %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&proc_name),
                             (NULL == locale) ? "NULL" : locale));
#if OPAL_HAVE_HWLOC
        {
            opal_hwloc_level_t bind_level;
            unsigned int bind_idx;

        /* store on the pmap */
            /* unpack the locality info */
            cnt = 1;
            if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &bind_level, &cnt, OPAL_HWLOC_LEVEL_T))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            cnt = 1;
            if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &bind_idx, &cnt, OPAL_UINT))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
                                 "%s grpcomm:base:modex setting proc %s level %s idx %u",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&proc_name),
                                 opal_hwloc_base_print_level(bind_level), bind_idx));

            /* store on the pmap */
            if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &proc_name, ORTE_PROC_MY_NAME)) {
                /* if this data is from myself, then set locality to all */
                OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                                     "%s grpcomm:base:modex setting proc %s locale ALL",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&proc_name)));
                pmap->locality = OPAL_PROC_ALL_LOCAL;
            } else if (daemon != ORTE_PROC_MY_DAEMON->vpid) {
                /* this is on a different node, then mark as non-local */
                OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                                     "%s grpcomm:base:modex setting proc %s locale NONLOCAL",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&proc_name)));
                pmap->locality = OPAL_PROC_NON_LOCAL;
            } else if (OPAL_HWLOC_NODE_LEVEL == orte_process_info.bind_level ||
                       OPAL_HWLOC_NODE_LEVEL == bind_level) {
                /* one or both of us is not bound, so all we can say is we are on the
                 * same node
                 */
                pmap->locality = OPAL_PROC_ON_NODE;
            } else {
                /* determine relative location on our node */
                pmap->locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
                                                                       orte_process_info.bind_level,
                                                                       orte_process_info.bind_idx,
                                                                       bind_level, bind_idx);
                OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                                     "%s grpcomm:base:modex setting proc %s locale %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&proc_name),
                                     opal_hwloc_base_print_locality(pmap->locality)));
            }
        }
#else
        if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &proc_name, ORTE_PROC_MY_NAME)) {
            /* if this data is from myself, then set locality to all */
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
@ -314,38 +336,11 @@ int orte_grpcomm_base_full_modex(opal_list_t *procs)
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&proc_name)));
            pmap->locality = OPAL_PROC_NON_LOCAL;
        } else if (NULL == locale || 0 == strlen(locale)){
            /* if we share a node, but we don't know anything more, then
             * mark us as on the node as this is all we know
             */
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                                 "%s grpcomm:base:modex setting proc %s locale NODE",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&proc_name)));
            pmap->locality = OPAL_PROC_ON_NODE;
        } else {
#if OPAL_HAVE_HWLOC
            /* convert the locale to a cpuset */
            if (NULL == orte_grpcomm_base.working_cpuset) {
                orte_grpcomm_base.working_cpuset = hwloc_bitmap_alloc();
            }
            if (0 != hwloc_bitmap_list_sscanf(orte_grpcomm_base.working_cpuset, locale)) {
                /* got a bad locale */
                ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
                rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
                goto cleanup;
            }
            /* determine relative location on our node */
            pmap->locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
                                                                   opal_hwloc_my_cpuset,
                                                                   orte_grpcomm_base.working_cpuset);
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                                 "%s grpcomm:base:modex setting proc %s locale %04x",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&proc_name), pmap->locality));
#endif
            /* must be on our node */
            pmap->locality = OPAL_PROC_ON_NODE;
        }

#endif

        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:base:full:modex: adding modex entry for proc %s",
@ -373,7 +368,6 @@ int orte_grpcomm_base_modex_unpack( opal_buffer_t* rbuf)
    int rc=ORTE_SUCCESS;
    orte_vpid_t daemon;
    orte_pmap_t *pmap;
    char *locale;

    /* process the results */
    /* extract the number of procs that put data in the buffer */
@ -402,9 +396,7 @@ int orte_grpcomm_base_modex_unpack( opal_buffer_t* rbuf)
        goto cleanup;
    }

    /* SINCE THIS IS AMONGST PEERS, THERE IS NO NEED TO UPDATE THE NIDMAP/PIDMAP
     * ITSELF, EXCEPT FOR LOCALITY INFO
     */
    /* SINCE THIS IS AMONGST PEERS, THERE IS NO NEED TO UPDATE THE NIDMAP/PIDMAP */

    if (ORTE_VPID_INVALID == (daemon = orte_ess.proc_get_daemon(&proc_name))) {
        /* clear problem */
@ -420,65 +412,6 @@ int orte_grpcomm_base_modex_unpack( opal_buffer_t* rbuf)
        goto cleanup;
    }

    /* unpack the locality info */
    cnt = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &locale, &cnt, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                         "%s grpcomm:base:modex:unpack received proc %s locale %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(&proc_name),
                         (NULL == locale) ? "NULL" : locale));

    if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &proc_name, ORTE_PROC_MY_NAME)) {
        /* if this data is from myself, then set locality to all */
        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:base:modex:unpack setting proc %s locale ALL",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&proc_name)));
        pmap->locality = OPAL_PROC_ALL_LOCAL;
    } else if (daemon != ORTE_PROC_MY_DAEMON->vpid) {
        /* this is on a different node, then mark as non-local */
        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:base:modex:unpack setting proc %s locale NONLOCAL",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&proc_name)));
        pmap->locality = OPAL_PROC_NON_LOCAL;
    } else if (NULL == locale || 0 == strlen(locale)){
        /* if we share a node, but we don't know anything more, then
         * mark us as on the node as this is all we know
         */
        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:base:modex:unpack setting proc %s locale NODE",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&proc_name)));
        pmap->locality = OPAL_PROC_ON_NODE;
    } else {
#if OPAL_HAVE_HWLOC
        /* convert the locale to a cpuset */
        if (NULL == orte_grpcomm_base.working_cpuset) {
            orte_grpcomm_base.working_cpuset = hwloc_bitmap_alloc();
        }
        if (0 != hwloc_bitmap_list_sscanf(orte_grpcomm_base.working_cpuset, locale)) {
            /* got a bad locale */
            ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
            rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
            goto cleanup;
        }
        /* determine relative location on our node */
        pmap->locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
                                                               opal_hwloc_my_cpuset,
                                                               orte_grpcomm_base.working_cpuset);
        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:base:modex:unpack setting proc %s locale %04x",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&proc_name), pmap->locality));
#endif
    }

    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                         "%s grpcomm:base:modex:unpack: adding modex entry for proc %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -495,99 +428,6 @@ int orte_grpcomm_base_modex_unpack( opal_buffer_t* rbuf)
    return rc;
}
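The net effect of this hunk pair is a new wire format: instead of a stringified cpuset, each proc now contributes its (bind_level, bind_idx) pair. A condensed view of the symmetry that pack and unpack must preserve (assembled from the code above, not a new API):

    /* sender side */
    opal_dss.pack(&buf, &orte_process_info.bind_level, 1, OPAL_HWLOC_LEVEL_T);
    opal_dss.pack(&buf, &orte_process_info.bind_idx,   1, OPAL_UINT);

    /* receiver side - order and types must match exactly */
    opal_hwloc_level_t bind_level;
    unsigned int bind_idx;
    cnt = 1;
    opal_dss.unpack(&rbuf, &bind_level, &cnt, OPAL_HWLOC_LEVEL_T);
    cnt = 1;
    opal_dss.unpack(&rbuf, &bind_idx, &cnt, OPAL_UINT);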

int orte_grpcomm_base_peer_modex(void)
{
    opal_buffer_t buf, rbuf;
    int rc = ORTE_SUCCESS;
    char *locale=NULL;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
                         "%s grpcomm:base:peer:modex: performing modex",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* setup the buffer that will actually be sent */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    OBJ_CONSTRUCT(&rbuf, opal_buffer_t);

    /* put our process name in the buffer so it can be unpacked later */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

#if OPAL_HAVE_HWLOC
    {
        if (NULL != opal_hwloc_topology) {
            /* our cpuset should already be known, but check for safety */
            if (NULL == opal_hwloc_my_cpuset) {
                opal_hwloc_base_get_local_cpuset();
            }
            /* convert to a string */
            hwloc_bitmap_list_asprintf(&locale, opal_hwloc_my_cpuset);
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                                 "%s grpcomm:base:peer:modex LOCALE %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), locale));
            /* pack it */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &locale, 1, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                free(locale);
                goto cleanup;
            }
            free(locale);
        } else {
            /* pack a placeholder */
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                                 "%s grpcomm:base:peer:modex NO TOPO - ADDING PLACEHOLDER",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &locale, 1, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
        }
    }
#else
    /* pack a placeholder */
    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                         "%s grpcomm:base:peer:modex NO HWLOC - ADDING PLACEHOLDER",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &locale, 1, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

#endif

    /* pack the entries we have received */
    if (ORTE_SUCCESS != (rc = orte_grpcomm_base_pack_modex_entries(&buf))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
                         "%s grpcomm:base:peer:modex: executing allgather",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* exchange the buffer with my peers */
    if (ORTE_SUCCESS != (rc = orte_grpcomm.allgather(&buf, &rbuf))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
                         "%s grpcomm:base:peer:modex: processing modex info",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    if (ORTE_SUCCESS != (rc = orte_grpcomm_base_modex_unpack(&rbuf)) ) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

 cleanup:
    OBJ_DESTRUCT(&buf);
    OBJ_DESTRUCT(&rbuf);
    return rc;
}

/**
 * MODEX DATABASE DESIGN
 *
@ -68,11 +68,8 @@ int orte_grpcomm_hier_close(void)

int orte_grpcomm_hier_component_query(mca_base_module_t **module, int *priority)
{
    if (ORTE_PROC_IS_MPI) {
        *priority = 1000;
    } else {
        *priority = 0;
    }
    /* only select if directed */
    *priority = 0;
    *module = (mca_base_module_t *)&orte_grpcomm_hier_module;
    return ORTE_SUCCESS;
}
@ -54,7 +54,6 @@ static int xcast(orte_jobid_t job,
                 orte_rml_tag_t tag);
static int hier_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf);
static int hier_barrier(void);
static int modex(opal_list_t *procs);

/* Module def */
orte_grpcomm_base_module_t orte_grpcomm_hier_module = {
@ -66,7 +65,7 @@ orte_grpcomm_base_module_t orte_grpcomm_hier_module = {
    hier_barrier,
    orte_grpcomm_base_set_proc_attr,
    orte_grpcomm_base_get_proc_attr,
    modex,
    orte_grpcomm_base_full_modex,
    orte_grpcomm_base_purge_proc_attrs
};

@ -421,35 +420,3 @@ static int hier_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)

    return ORTE_SUCCESS;
}

/*** MODEX SECTION ***/

static int modex(opal_list_t *procs)
{
    int rc;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
                         "%s grpcomm:hier: modex entered",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* if we were given a list of procs to modex with, then this is happening
     * as part of a connect/accept operation
     */
    if (NULL != procs) {
        if (ORTE_SUCCESS != (rc = orte_grpcomm_base_full_modex(procs))) {
            ORTE_ERROR_LOG(rc);
        }
    } else {

        /* otherwise, we are doing this across our peers */
        if (ORTE_SUCCESS != (rc = orte_grpcomm_base_peer_modex())) {
            ORTE_ERROR_LOG(rc);
        }
    }

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
                         "%s grpcomm:hier: modex completed",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    return rc;
}
@ -344,43 +344,25 @@ static int modex(opal_list_t *procs)
    free(rml_uri);

#if OPAL_HAVE_HWLOC
    {
        char *locale;

        /* provide the locality info */
        if (NULL != opal_hwloc_topology) {
            /* our cpuset should already be known, but check for safety */
            if (NULL == opal_hwloc_my_cpuset) {
                opal_hwloc_base_get_local_cpuset();
            }
            /* convert to a string */
            hwloc_bitmap_list_asprintf(&locale, opal_hwloc_my_cpuset);
            OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
                                 "%s grpcomm:pmi LOCALE %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), locale));
            /* NTH: some characters are not allowed in pmi2 land - not sure
             * if hwloc would use them, but just to be safe we need to encode
             */
            if (ORTE_SUCCESS != (rc = pmi_encode(locale, strlen(locale)))) {
                ORTE_ERROR_LOG(rc);
                free(locale);
                return rc;
            }
            /* get the key */
            if (ORTE_SUCCESS != (rc = setup_key(ORTE_PROC_MY_NAME, "HWLOC"))) {
                ORTE_ERROR_LOG(rc);
                free(locale);
                return rc;
            }
            /* encoding puts the encoded value in pmi_attr_val */
            rc = kvs_put(pmi_kvs_key, pmi_attr_val);
            if (PMI_SUCCESS != rc) {
                ORTE_PMI_ERROR(rc, "PMI_KVS_Put");
                free(locale);
                return ORTE_ERROR;
            }
            free(locale);
        }
    if (ORTE_SUCCESS != (rc = setup_key(ORTE_PROC_MY_NAME, "BIND_LEVEL"))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    snprintf(val, 64, "%u", (unsigned int)orte_process_info.bind_level);
    rc = kvs_put(pmi_kvs_key, val);
    if (PMI_SUCCESS != rc) {
        ORTE_PMI_ERROR(rc, "PMI_KVS_Put");
        return ORTE_ERROR;
    }
    if (ORTE_SUCCESS != (rc = setup_key(ORTE_PROC_MY_NAME, "BIND_IDX"))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    snprintf(val, 64, "%u", orte_process_info.bind_idx);
    rc = kvs_put(pmi_kvs_key, val);
    if (PMI_SUCCESS != rc) {
        ORTE_PMI_ERROR(rc, "PMI_KVS_Put");
        return ORTE_ERROR;
    }
#endif

@ -527,10 +509,11 @@ static int modex(opal_list_t *procs)
                             (unsigned int)pmap->node_rank));
#if OPAL_HAVE_HWLOC
        {
            char *locale;
            opal_hwloc_level_t bind_level;
            unsigned int bind_idx;

            /* get the proc's locality info, if available */
            if (ORTE_SUCCESS != (rc = setup_key(&name, "HWLOC"))) {
            if (ORTE_SUCCESS != (rc = setup_key(&name, "BIND_LEVEL"))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
@ -561,30 +544,28 @@ static int modex(opal_list_t *procs)
                                     ORTE_NAME_PRINT(&name)));
                pmap->locality = OPAL_PROC_ON_NODE;
            } else {
                /* we encoded to protect against pmi2 restrictions */
                locale = pmi_decode(&len);
                if (NULL == locale) {
                    return ORTE_ERROR;
                bind_level = strtol(pmi_attr_val, NULL, 10);
                if (ORTE_SUCCESS != (rc = setup_key(&name, "BIND_IDX"))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
                /* convert the locale to a cpuset */
                if (NULL == orte_grpcomm_base.working_cpuset) {
                    orte_grpcomm_base.working_cpuset = hwloc_bitmap_alloc();
                rc = kvs_get(pmi_kvs_key, pmi_attr_val, pmi_vallen_max);
                if (PMI_SUCCESS != rc) {
                    /* all we know is we share a node */
                    pmap->locality = OPAL_PROC_ON_NODE;
                } else {
                    bind_idx = strtol(pmi_attr_val, NULL, 10);
                    /* determine relative location on our node */
                    pmap->locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
                                                                           orte_process_info.bind_level,
                                                                           orte_process_info.bind_idx,
                                                                           bind_level, bind_idx);
                    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
                                         "%s grpcommpmi setting proc %s locale %s",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                         ORTE_NAME_PRINT(&name),
                                         opal_hwloc_base_print_locality(pmap->locality)));
                }
                if (0 != hwloc_bitmap_list_sscanf(orte_grpcomm_base.working_cpuset, locale)) {
                    /* got a bad locale */
                    ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
                    free(locale);
                    return ORTE_ERR_VALUE_OUT_OF_BOUNDS;
                }
                free(locale);
                /* determine relative location on our node */
                pmap->locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
                                                                       opal_hwloc_my_cpuset,
                                                                       orte_grpcomm_base.working_cpuset);
                OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
                                     "%s grpcommpmi setting proc %s locale %04x",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&name), pmap->locality));
            }
        }
    }
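In the PMI flavor the same two values travel through the key-value space as decimal strings rather than packed buffers; the essential round trip, condensed from the hunks above (setup_key()/kvs_put()/kvs_get() are this file's own helpers), is:

    /* publishing side */
    setup_key(ORTE_PROC_MY_NAME, "BIND_LEVEL");
    snprintf(val, 64, "%u", (unsigned int)orte_process_info.bind_level);
    kvs_put(pmi_kvs_key, val);

    /* reading a peer's value */
    setup_key(&name, "BIND_LEVEL");
    kvs_get(pmi_kvs_key, pmi_attr_val, pmi_vallen_max);
    bind_level = strtol(pmi_attr_val, NULL, 10);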

@ -9,6 +9,7 @@
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2011      Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -53,8 +54,6 @@ typedef struct orte_odls_base_t {
    opal_list_t available_components;
    /** selected component */
    orte_odls_base_component_t selected_component;
    /* warn if binding no-op */
    bool warn_if_not_bound;
} orte_odls_base_t;

/**
@ -13,6 +13,7 @@
|
||||
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -47,7 +48,7 @@
|
||||
#include "opal/util/path.h"
|
||||
#include "opal/util/sys_limits.h"
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/mca/paffinity/base/base.h"
|
||||
#include "opal/mca/hwloc/hwloc.h"
|
||||
#include "opal/mca/shmem/base/base.h"
|
||||
#include "opal/mca/pstat/pstat.h"
|
||||
|
||||
@ -85,8 +86,6 @@
|
||||
#include "orte/mca/odls/base/base.h"
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
|
||||
static bool override_oversubscribed = false;
|
||||
|
||||
/* IT IS CRITICAL THAT ANY CHANGE IN THE ORDER OF THE INFO PACKED IN
|
||||
* THIS FUNCTION BE REFLECTED IN THE CONSTRUCT_CHILD_LIST PARSER BELOW
|
||||
*/
|
||||
@ -99,10 +98,8 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
|
||||
orte_job_map_t *map=NULL;
|
||||
opal_buffer_t *wireup;
|
||||
opal_byte_object_t bo, *boptr;
|
||||
int32_t numbytes, *restarts;
|
||||
int32_t numbytes;
|
||||
int8_t flag;
|
||||
orte_app_idx_t *app_idx;
|
||||
orte_vpid_t i;
|
||||
int j;
|
||||
orte_daemon_cmd_flag_t command;
|
||||
orte_app_context_t *app;
|
||||
@ -265,30 +262,14 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the oversubscribe override flag */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->oversubscribe_override, 1, OPAL_BOOL))) {
|
||||
#if OPAL_HAVE_HWLOC
|
||||
/* pack the binding policy so the daemon knows if binding is required */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->map->binding, 1, OPAL_BINDING_POLICY))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* pack the map & binding policy for this job */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &map->policy, 1, ORTE_MAPPING_POLICY))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the cpus_per_rank for this job */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &map->cpus_per_rank, 1, OPAL_INT16))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the stride for this job */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &map->stride, 1, OPAL_INT16))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the control flags for this job */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->controls, 1, ORTE_JOB_CONTROL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
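
The "IT IS CRITICAL" contract above is positional: every opal_dss.pack() on the HNP side must be mirrored by an opal_dss.unpack() with the same dss type, in the same order, in the daemon-side parser. A minimal sketch of the symmetry (illustration only, not code from this commit; "foo" is a hypothetical field):

    /* sender side, in get_add_procs_data() */
    int16_t foo = 0;
    if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &foo, 1, OPAL_INT16))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* matching receiver side, in construct_child_list() */
    orte_std_cntr_t cnt = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &foo, &cnt, OPAL_INT16))) {
        ORTE_ERROR_LOG(rc);
        goto REPORT_ERROR;
    }
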
@ -337,52 +318,17 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
/* release the data since it has now been copied into our buffer */
free(bo.bytes);

/* transfer and pack the app_idx and restart arrays for this job */
app_idx = (orte_app_idx_t*)malloc(jdata->num_procs * sizeof(orte_app_idx_t));
restarts = (int32_t*)malloc(jdata->num_procs * sizeof(int32_t));
for (j=0, i=0; i < jdata->num_procs && j < jdata->procs->size; j++) {
/* pack the procs for this job */
for (j=0; j < jdata->procs->size; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
continue;
}
app_idx[i] = proc->app_idx;
restarts[i++] = proc->restarts;
}
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, app_idx, jdata->num_procs, ORTE_APP_IDX))) {
ORTE_ERROR_LOG(rc);
return rc;
}
free(app_idx);
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, restarts, jdata->num_procs, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
free(restarts);

/* are there cpu_list strings? */
if (jdata->map->cpu_lists) {
flag = (int8_t)true;
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &flag, 1, OPAL_INT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
for (j=0, i=0; i < jdata->num_procs && j < jdata->procs->size; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
continue;
}
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &proc->slot_list, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
i++;
}
} else {
flag = (int8_t)false;
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &flag, 1, OPAL_INT8))) {
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &proc, 1, ORTE_PROC))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}


return ORTE_SUCCESS;
}

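The hunks above replace the old parallel app_idx/restarts/slot_list arrays with a single pass that ships each orte_proc_t as one ORTE_PROC object. A condensed restatement of the new sender loop (illustration only, extracted from the diff):

    orte_proc_t *proc;
    for (j=0; j < jdata->procs->size; j++) {
        /* skip holes in the sparse pointer array */
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
            continue;
        }
        /* one object carries app_idx, restarts, cpu_bitmap, etc. */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &proc, 1, ORTE_PROC))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    }
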
@ -474,18 +420,15 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
orte_vpid_t j, host_daemon;
orte_odls_child_t *child;
orte_std_cntr_t cnt;
orte_process_name_t proc;
orte_odls_job_t *jobdat=NULL;
opal_byte_object_t *bo;
opal_list_item_t *item;
int8_t flag;
orte_app_idx_t *app_idx=NULL;
int32_t *restarts=NULL;
char **slot_str=NULL;
orte_jobid_t debugger;
bool add_child;
orte_ns_cmp_bitmask_t mask;
orte_app_context_t *app;
orte_proc_t *pptr;

OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:constructing child list",
@ -614,30 +557,14 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
/* unpack the override oversubscribed flag */
#if OPAL_HAVE_HWLOC
/* unpack the binding policy */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &override_oversubscribed, &cnt, OPAL_BOOL))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
/* unpack the mapping policy for the job */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->policy, &cnt, ORTE_MAPPING_POLICY))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
/* unpack the cpus/rank for the job */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->cpus_per_rank, &cnt, OPAL_INT16))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
/* unpack the stride for the job */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->stride, &cnt, OPAL_INT16))) {
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->binding, &cnt, OPAL_BINDING_POLICY))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
#endif
/* unpack the control flags for the job */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->controls, &cnt, ORTE_JOB_CONTROL))) {
@ -693,53 +620,21 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
goto REPORT_ERROR;
}

/* allocate memory for app_idx */
app_idx = (orte_app_idx_t*)malloc(jobdat->num_procs * sizeof(orte_app_idx_t));
/* unpack app_idx in one shot */
cnt=jobdat->num_procs;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, app_idx, &cnt, ORTE_APP_IDX))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}

/* allocate memory for restarts */
restarts = (int32_t*)malloc(jobdat->num_procs * sizeof(int32_t));
/* unpack restarts in one shot */
cnt=jobdat->num_procs;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, restarts, &cnt, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}

/* unpack flag to indicate if slot_strings are present */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}

if (flag) {
/* allocate space */
slot_str = (char**)malloc(jobdat->num_procs * sizeof(char*));
for (j=0; j < jobdat->num_procs; j++) {
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &slot_str[j], &cnt, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
}
}

/* cycle through the procs and find mine */
proc.jobid = jobdat->jobid;
/* unpack the procs */
for (j=0; j < jobdat->num_procs; j++) {
proc.vpid = j;
ORTE_EPOCH_SET(proc.epoch,orte_ess.proc_get_epoch(&proc));
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &pptr, &cnt, ORTE_PROC))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}

/* see if it is one of mine */
ORTE_EPOCH_SET(proc.epoch,orte_ess.proc_get_epoch(&pptr->name));
/* get the vpid of the daemon that is to host this proc */
OPAL_OUTPUT_VERBOSE((20, orte_odls_globals.output,
"%s odls:constructing child list - looking for daemon for proc %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc)));
if (ORTE_VPID_INVALID == (host_daemon = orte_ess.proc_get_daemon(&proc))) {
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name)));
if (ORTE_VPID_INVALID == (host_daemon = orte_ess.proc_get_daemon(&pptr->name))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
rc = ORTE_ERR_NOT_FOUND;
goto REPORT_ERROR;
@ -747,7 +642,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,

OPAL_OUTPUT_VERBOSE((20, orte_odls_globals.output,
"%s odls:constructing child list - checking proc %s on daemon %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc),
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name),
ORTE_VPID_PRINT(host_daemon)));

/* does this proc belong to us? */
@ -755,7 +650,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,

OPAL_OUTPUT_VERBOSE((10, orte_odls_globals.output,
"%s odls:constructing child list - found proc %s for me!",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc)));
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name)));

add_child = true;
/* if this job is restarting procs, then we need to treat things
@ -773,17 +668,17 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
mask = ORTE_NS_CMP_ALL;

if (OPAL_EQUAL ==
orte_util_compare_name_fields(mask, child->name, &proc)) {
orte_util_compare_name_fields(mask, child->name, &pptr->name)) {
/* do not duplicate this child on the list! */
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"proc %s is on list and is %s",
ORTE_NAME_PRINT(&proc),
ORTE_NAME_PRINT(&pptr->name),
(child->alive) ? "ALIVE" : "DEAD"));
add_child = false;
child->restarts = restarts[j];
child->restarts = pptr->restarts;
child->do_not_barrier = true;
/* mark that this app_context is being used on this node */
app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, app_idx[j]);
app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, pptr->app_idx);
app->used_on_node = true;
break;
}
@ -794,27 +689,29 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
if (add_child) {
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"adding proc %s to my local list",
ORTE_NAME_PRINT(&proc)));
ORTE_NAME_PRINT(&pptr->name)));
/* keep tabs of the number of local procs */
jobdat->num_local_procs++;
/* add this proc to our child list */
child = OBJ_NEW(orte_odls_child_t);
/* copy the name to preserve it */
if (ORTE_SUCCESS != (rc = opal_dss.copy((void**)&child->name, &proc, ORTE_NAME))) {
if (ORTE_SUCCESS != (rc = opal_dss.copy((void**)&child->name, &pptr->name, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
child->app_idx = app_idx[j]; /* save the index into the app_context objects */
child->restarts = restarts[j];
child->app_idx = pptr->app_idx; /* save the index into the app_context objects */
child->restarts = pptr->restarts;
/* if the job is in restart mode, the child must not barrier when launched */
if (ORTE_JOB_STATE_RESTART == jobdat->state) {
child->do_not_barrier = true;
}
if (NULL != slot_str && NULL != slot_str[j]) {
child->slot_list = strdup(slot_str[j]);
#if OPAL_HAVE_HWLOC
if (NULL != pptr->cpu_bitmap) {
child->cpu_bitmap = strdup(pptr->cpu_bitmap);
}
#endif
/* mark that this app_context is being used on this node */
app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, app_idx[j]);
app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, pptr->app_idx);
app->used_on_node = true;
/* protect operation on the global list of children */
OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
@ -823,6 +720,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
}
}
OBJ_RELEASE(pptr);
}

/* flag that the launch msg has been processed so daemon collectives can proceed */
@ -832,22 +730,6 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
OPAL_THREAD_UNLOCK(&jobdat->lock);

done:
if (NULL != app_idx) {
free(app_idx);
app_idx = NULL;
}
if (NULL != restarts) {
free(restarts);
restarts = NULL;
}
if (NULL != slot_str) {
for (j=0; j < jobdat->num_procs; j++) {
free(slot_str[j]);
}
free(slot_str);
slot_str = NULL;
}

return ORTE_SUCCESS;

REPORT_ERROR:
@ -860,24 +742,6 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
orte_errmgr.update_state(*job, ORTE_JOB_STATE_NEVER_LAUNCHED,
NULL, ORTE_PROC_STATE_UNDEF, 0, rc);

if (NULL != app_idx) {
free(app_idx);
app_idx = NULL;
}
if (NULL != restarts) {
free(restarts);
restarts = NULL;
}
if (NULL != slot_str && NULL != jobdat) {
for (j=0; j < jobdat->num_procs; j++) {
if (NULL != slot_str[j]) {
free(slot_str[j]);
}
}
free(slot_str);
slot_str = NULL;
}

return rc;
}

@ -900,15 +764,15 @@ static int odls_base_default_setup_fork(orte_app_context_t *context,
}

/* special case handling for --prefix: this is somewhat icky,
but at least some users do this. :-\ It is possible that
when using --prefix, the user will also "-x PATH" and/or
"-x LD_LIBRARY_PATH", which would therefore clobber the
work that was done in the prior pls to ensure that we have
the prefix at the beginning of the PATH and
LD_LIBRARY_PATH. So examine the context->env and see if we
find PATH or LD_LIBRARY_PATH. If found, that means the
prior work was clobbered, and we need to re-prefix those
variables. */
for (i = 0; NULL != context->prefix_dir && NULL != context->env && NULL != context->env[i]; ++i) {
char *newenv;

@ -979,19 +843,42 @@ static int odls_base_default_setup_fork(orte_app_context_t *context,
free(param);
free(param2);

/* pass a param telling the child what type and model of cpu we are on,
* if we know it
*/
if (NULL != orte_local_cpu_type) {
param = mca_base_param_environ_variable("orte","cpu","type");
opal_setenv(param, orte_local_cpu_type, true, environ_copy);
free(param);
}
if (NULL != orte_local_cpu_model) {
param = mca_base_param_environ_variable("orte","cpu","model");
opal_setenv(param, orte_local_cpu_model, true, environ_copy);
free(param);
#if OPAL_HAVE_HWLOC
{
/* pass a param telling the child what type and model of cpu we are on,
* if we know it. If hwloc has the value, use what it knows. Otherwise,
* see if we were explicitly given it and use that value.
*/
hwloc_obj_t obj;
char *htmp;
if (NULL != opal_hwloc_topology) {
obj = hwloc_get_root_obj(opal_hwloc_topology);
if (NULL != (htmp = hwloc_obj_get_info_by_name(obj, "CPUType")) ||
NULL != (htmp = orte_local_cpu_type)) {
param = mca_base_param_environ_variable("orte","cpu","type");
opal_setenv(param, htmp, true, environ_copy);
free(param);
}
if (NULL != (htmp = hwloc_obj_get_info_by_name(obj, "CPUModel")) ||
NULL != (htmp = orte_local_cpu_model)) {
param = mca_base_param_environ_variable("orte","cpu","model");
opal_setenv(param, htmp, true, environ_copy);
free(param);
}
} else {
if (NULL != orte_local_cpu_type) {
param = mca_base_param_environ_variable("orte","cpu","type");
opal_setenv(param, orte_local_cpu_type, true, environ_copy);
free(param);
}
if (NULL != orte_local_cpu_model) {
param = mca_base_param_environ_variable("orte","cpu","model");
opal_setenv(param, orte_local_cpu_model, true, environ_copy);
free(param);
}
}
}
#endif

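The hwloc branch above relies on the info strings hwloc attaches to the root object of a loaded topology. A self-contained sketch of that query (illustration only; whether "CPUType"/"CPUModel" are populated depends on the platform and hwloc version):

    #include <stdio.h>
    #include <hwloc.h>

    int main(void)
    {
        hwloc_topology_t topo;
        hwloc_obj_t root;
        const char *val;

        hwloc_topology_init(&topo);
        hwloc_topology_load(topo);
        root = hwloc_get_root_obj(topo);
        /* info strings are (name,value) pairs; either may be absent */
        if (NULL != (val = hwloc_obj_get_info_by_name(root, "CPUModel"))) {
            printf("CPUModel: %s\n", val);
        }
        hwloc_topology_destroy(topo);
        return 0;
    }
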
/* get shmem's best component name so we can provide a hint to the shmem
* framework. the idea here is to have someone figure out what component to
@ -1277,7 +1164,6 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
opal_list_item_t *item;
orte_app_context_t *app, *dbg;
orte_odls_child_t *child=NULL;
int num_processors;
bool oversubscribed;
int rc=ORTE_SUCCESS;
bool launch_failed=true;
@ -1386,48 +1272,6 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
orte_sstore.wait_all_deps();
#endif

/* if the mapper says we are oversubscribed, then we trust it - unless
* it told us -not- to!
*/
if (oversubscribed && !override_oversubscribed) {
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:launch mapper declares this node oversubscribed",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
} else {
/* if the mapper thinks we are not oversubscribed, then we
* do a final smoke test by checking against the #processors. This
* is done solely in case the mapper had incorrect knowledge of
* the #local processors
*/
/* compute the number of local procs alive or about to be launched
* as part of this job
*/
total_num_local_procs = compute_num_procs_alive(job) + jobdat->num_local_procs;
/* get the number of local processors */
if (ORTE_SUCCESS != (rc = opal_paffinity_base_get_processor_info(&num_processors))) {
/* if we cannot find the number of local processors, we have no choice
* but to default to conservative settings
*/
oversubscribed = true;
} else {
if (total_num_local_procs > num_processors) {
/* if the #procs > #processors, declare us oversubscribed. This
* covers the case where the user didn't tell us anything about the
* number of available slots, so we defaulted to a value of 1
*/
oversubscribed = true;
} else {
/* otherwise, declare us to not be oversubscribed so we can be aggressive */
oversubscribed = false;
}
}
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:launch found %d processors for %d children and locally set oversubscribed to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(ORTE_SUCCESS == rc) ? num_processors : -1, (int)opal_list_get_size(&orte_local_children),
oversubscribed ? "true" : "false"));
}

/* setup to report the proc state to the HNP */
OBJ_CONSTRUCT(&alert, opal_buffer_t);

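The removed smoke test above reduces to a small decision table; a condensed restatement (illustration only, using the same variables as the deleted code):

    /* trust the mapper when it says "oversubscribed" and did not
     * tell us to override; otherwise sanity-check its claim */
    total_num_local_procs = compute_num_procs_alive(job) + jobdat->num_local_procs;
    if (ORTE_SUCCESS != opal_paffinity_base_get_processor_info(&num_processors)) {
        oversubscribed = true;   /* unknown processor count: be conservative */
    } else {
        oversubscribed = (total_num_local_procs > num_processors);
    }
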
@ -2016,7 +1860,11 @@ void orte_odls_base_setup_singleton_jobdat(orte_jobid_t jobid)
opal_buffer_t buffer;
opal_byte_object_t *bo;
int rc;

#if OPAL_HAVE_HWLOC
opal_hwloc_level_t bind_level;
unsigned int bind_idx;
#endif

/* create a job tracking object for it */
jobdat = OBJ_NEW(orte_odls_job_t);
jobdat->jobid = jobid;
@ -2028,12 +1876,20 @@ void orte_odls_base_setup_singleton_jobdat(orte_jobid_t jobid)
opal_dss.pack(&buffer, &jobid, 1, ORTE_JOBID); /* jobid */
vpid1 = 1;
opal_dss.pack(&buffer, &vpid1, 1, ORTE_VPID); /* num_procs */
#if OPAL_HAVE_HWLOC
bind_level = OPAL_HWLOC_NODE_LEVEL;
opal_dss.pack(&buffer, &bind_level, 1, OPAL_HWLOC_LEVEL_T); /* bind_level */
#endif
one32 = 0;
opal_dss.pack(&buffer, &one32, 1, OPAL_INT32); /* node index */
lrank = 0;
opal_dss.pack(&buffer, &lrank, 1, ORTE_LOCAL_RANK); /* local rank */
nrank = 0;
opal_dss.pack(&buffer, &nrank, 1, ORTE_NODE_RANK); /* node rank */
#if OPAL_HAVE_HWLOC
bind_idx = 0;
opal_dss.pack(&buffer, &bind_idx, 1, OPAL_UINT);
#endif
/* setup a byte object and unload the packed data to it */
bo = (opal_byte_object_t*)malloc(sizeof(opal_byte_object_t));
opal_dss.unload(&buffer, (void**)&bo->bytes, &bo->size);
@ -2189,14 +2045,14 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc,
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:sync sending byte object",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
opal_dss.pack(&buffer, &orte_odls_globals.dmap, 1, OPAL_BYTE_OBJECT);
opal_dss.pack(&buffer, &jobdat->pmap, 1, OPAL_BYTE_OBJECT);
#if OPAL_HAVE_HWLOC
/* send the local topology so the individual apps
* don't hammer the system to collect it themselves
*/
opal_dss.pack(&buffer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO);
#endif
opal_dss.pack(&buffer, &orte_odls_globals.dmap, 1, OPAL_BYTE_OBJECT);
opal_dss.pack(&buffer, &jobdat->pmap, 1, OPAL_BYTE_OBJECT);
}
}

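Packing the topology once per node is the point of the OPAL_HWLOC_TOPO dss type above: the daemon serializes its hwloc tree and every local app adopts it instead of probing the hardware itself. A hedged sketch of the receiving side (the unpack mirror is assumed here, not shown in this diff; the buffer name is a placeholder):

    #if OPAL_HAVE_HWLOC
    orte_std_cntr_t cnt = 1;
    /* app side: adopt the daemon's topology instead of loading our own */
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buffer, &opal_hwloc_topology,
                                              &cnt, OPAL_HWLOC_TOPO))) {
        ORTE_ERROR_LOG(rc);
    }
    #endif
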

@ -10,6 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010-2011 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -31,7 +32,6 @@
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/hwloc/hwloc.h"
#include "opal/mca/paffinity/base/base.h"
#include "opal/util/output.h"
#include "opal/util/path.h"
#include "opal/util/argv.h"
@ -89,7 +89,7 @@ orte_odls_globals_t orte_odls_globals;
int orte_odls_base_open(void)
{
char **ranks=NULL, *tmp;
int i, rank, sock, core;
int i, rank;
orte_namelist_t *nm;
bool xterm_hold;

@ -101,13 +101,6 @@ int orte_odls_base_open(void)
"Time to wait for a process to die after issuing a kill signal to it",
false, false, 1, &orte_odls_globals.timeout_before_sigkill);

mca_base_param_reg_int_name("odls", "warn_if_not_bound",
"If nonzero, issue a warning if the program asked "
"for a binding that results in a no-op (ex: "
"bind-to-socket on a single socket node)",
false, false, 1, &i);
orte_odls_base.warn_if_not_bound = OPAL_INT_TO_BOOL(i);

/* initialize the global list of local children and job data */
OBJ_CONSTRUCT(&orte_local_children, opal_list_t);
OBJ_CONSTRUCT(&orte_local_children_lock, opal_mutex_t);
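The removed registration above follows the standard MCA integer-parameter pattern. For reference, a minimal sketch of registering and reading a boolean-style param (the parameter name and "my_flag" variable are placeholders, not real ones):

    int value;
    bool my_flag;
    /* registers "odls_example_flag"; the trailing args are the
     * default value and where to store the result */
    mca_base_param_reg_int_name("odls", "example_flag",
                                "Describe what the flag controls",
                                false, false, 0, &value);
    my_flag = OPAL_INT_TO_BOOL(value);
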
@ -125,43 +118,6 @@ int orte_odls_base_open(void)
orte_odls_globals.debugger = NULL;
orte_odls_globals.debugger_launched = false;

/* get any external processor bindings */
OPAL_PAFFINITY_CPU_ZERO(orte_odls_globals.my_cores);
orte_odls_globals.bound = false;
orte_odls_globals.num_processors = 0;
OBJ_CONSTRUCT(&orte_odls_globals.sockets, opal_bitmap_t);
opal_bitmap_init(&orte_odls_globals.sockets, 16);
/* default the number of sockets to those found during startup */
orte_odls_globals.num_sockets = orte_default_num_sockets_per_board;
/* see if paffinity is supported */
if (ORTE_SUCCESS == opal_paffinity_base_get(&orte_odls_globals.my_cores)) {
/* get the number of local processors */
opal_paffinity_base_get_processor_info(&orte_odls_globals.num_processors);
/* determine if we are bound */
OPAL_PAFFINITY_PROCESS_IS_BOUND(orte_odls_globals.my_cores, &orte_odls_globals.bound);
/* if we are bound, determine the number of sockets - and which ones - that are available to us */
if (orte_odls_globals.bound) {
for (i=0; i < orte_odls_globals.num_processors; i++) {
if (OPAL_PAFFINITY_CPU_ISSET(i, orte_odls_globals.my_cores)) {
opal_paffinity_base_get_map_to_socket_core(i, &sock, &core);
opal_bitmap_set_bit(&orte_odls_globals.sockets, sock);
}
}
/* determine how many sockets we have available to us */
orte_odls_globals.num_sockets = 0;
for (i=0; i < opal_bitmap_size(&orte_odls_globals.sockets); i++) {
if (opal_bitmap_is_set_bit(&orte_odls_globals.sockets, i)) {
orte_odls_globals.num_sockets++;
}
}
if (orte_report_bindings) {
orte_show_help("help-odls-base.txt",
"orte-odls-base:show-bindings",
false, orte_odls_globals.my_cores.bitmask[0]);
}
}
}

/* check if the user requested that we display output in xterms */
if (NULL != orte_xterm) {
/* construct a list of ranks to be displayed */
@ -250,13 +206,15 @@ static void orte_odls_child_constructor(orte_odls_child_t *ptr)
ptr->init_recvd = false;
ptr->fini_recvd = false;
ptr->rml_uri = NULL;
ptr->slot_list = NULL;
ptr->waitpid_recvd = false;
ptr->iof_complete = false;
ptr->do_not_barrier = false;
ptr->notified = false;
OBJ_CONSTRUCT(&ptr->stats, opal_ring_buffer_t);
opal_ring_buffer_init(&ptr->stats, orte_stat_history_size);
#if OPAL_HAVE_HWLOC
ptr->cpu_bitmap = NULL;
#endif
}
static void orte_odls_child_destructor(orte_odls_child_t *ptr)
{
@ -264,12 +222,16 @@ static void orte_odls_child_destructor(orte_odls_child_t *ptr)

if (NULL != ptr->name) free(ptr->name);
if (NULL != ptr->rml_uri) free(ptr->rml_uri);
if (NULL != ptr->slot_list) free(ptr->slot_list);

while (NULL != (st = (opal_pstats_t*)opal_ring_buffer_pop(&ptr->stats))) {
OBJ_RELEASE(st);
}
OBJ_DESTRUCT(&ptr->stats);
#if OPAL_HAVE_HWLOC
if (NULL != ptr->cpu_bitmap) {
free(ptr->cpu_bitmap);
}
#endif
}
OBJ_CLASS_INSTANCE(orte_odls_child_t,
opal_list_item_t,
@ -288,7 +250,9 @@ static void orte_odls_job_constructor(orte_odls_job_t *ptr)
OBJ_CONSTRUCT(&ptr->apps, opal_pointer_array_t);
opal_pointer_array_init(&ptr->apps, 2, INT_MAX, 2);
ptr->num_apps = 0;
ptr->policy = 0;
#if OPAL_HAVE_HWLOC
ptr->binding = 0;
#endif
ptr->cpus_per_rank = 1;
ptr->stride = 1;
ptr->controls = 0;

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -33,7 +34,6 @@
#include "opal/threads/mutex.h"
#include "opal/threads/condition.h"
#include "opal/dss/dss_types.h"
#include "opal/mca/paffinity/paffinity.h"

#include "orte/mca/grpcomm/grpcomm_types.h"
#include "orte/mca/rml/rml_types.h"
@ -66,18 +66,6 @@ typedef struct {
opal_list_t xterm_ranks;
/* the xterm cmd to be used */
char **xtermcmd;
/* any externally provided bindings */
opal_paffinity_base_cpu_set_t my_cores;
/* flag whether or not we are bound */
bool bound;
/* local number of processors */
int num_processors;
/* map of locally available sockets
* as determined by external bindings
*/
opal_bitmap_t sockets;
/* number of sockets available to us */
int num_sockets;
} orte_odls_globals_t;

ORTE_DECLSPEC extern orte_odls_globals_t orte_odls_globals;

@ -11,7 +11,7 @@
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -94,3 +94,24 @@ Your job may behave unpredictably after this, or abort.
Application name: %s
Function: %s
Location: %s:%d
#
[memory not bound]
WARNING: Open MPI tried to bind a process but failed. This is a
warning only; your job will continue, though performance may
be degraded.

Local host: %s
Application name: %s
Error message: %s
Location: %s:%d

#
[memory binding error]
Open MPI tried to bind memory for a new process but something went
wrong. The process was killed without launching the target
application. Your job will now abort.

Local host: %s
Application name: %s
Error message: %s
Location: %s:%d
@ -11,7 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||
* Copyright (c) 2008-2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010 IBM Corporation. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
@ -105,11 +105,11 @@
|
||||
#include <sys/select.h>
|
||||
#endif
|
||||
|
||||
#include "opal/mca/hwloc/hwloc.h"
|
||||
#include "opal/mca/hwloc/base/base.h"
|
||||
#include "opal/mca/maffinity/base/base.h"
|
||||
#include "opal/mca/paffinity/base/base.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/util/fd.h"
|
||||
|
||||
@ -379,745 +379,6 @@ static void send_error_show_help(int fd, int exit_status,
|
||||
exit(exit_status);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Bind the process to a specific slot list
|
||||
*/
|
||||
static int bind_to_slot_list(orte_app_context_t* context,
|
||||
orte_odls_child_t *child,
|
||||
orte_odls_job_t *jobdat,
|
||||
bool *bound, int pipe_fd)
|
||||
{
|
||||
int rc;
|
||||
opal_paffinity_base_cpu_set_t mask;
|
||||
char *msg = NULL;
|
||||
|
||||
*bound = false;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
|
||||
"%s odls:default:fork binding child %s to slot_list %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(child->name),
|
||||
child->slot_list));
|
||||
if (opal_paffinity_alone) {
|
||||
send_error_show_help(pipe_fd, 1,
|
||||
"help-orte-odls-default.txt",
|
||||
"slot list and paffinity_alone",
|
||||
orte_process_info.nodename, context->app);
|
||||
/* Does not return */
|
||||
}
|
||||
if (orte_report_bindings) {
|
||||
opal_output(0, "%s odls:default:fork binding child %s to slot_list %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(child->name), child->slot_list);
|
||||
}
|
||||
rc = opal_paffinity_base_slot_list_set((long)child->name->vpid,
|
||||
child->slot_list, &mask);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
if (ORTE_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
/* OS doesn't support providing topology information */
|
||||
send_error_show_help(pipe_fd, 1, "help-orte-odls-default.txt",
|
||||
"binding not supported",
|
||||
orte_process_info.nodename, context->app);
|
||||
/* Does not return */
|
||||
}
|
||||
asprintf(&msg, "opal_paffinity_base_slot_list_set() returned \"%s\"",
|
||||
opal_strerror(OPAL_SOS_GET_ERROR_CODE(rc)));
|
||||
if (NULL == msg) {
|
||||
msg = "opal_paffinity_base_slot_list_set() returned failure";
|
||||
}
|
||||
send_error_show_help(pipe_fd, 1, "help-orte-odls-default.txt",
|
||||
"binding generic error",
|
||||
orte_process_info.nodename, context->app, msg,
|
||||
__FILE__, __LINE__);
|
||||
/* Does not return */
|
||||
}
|
||||
|
||||
/* if we didn't wind up bound, then generate a warning unless
|
||||
suppressed */
|
||||
OPAL_PAFFINITY_PROCESS_IS_BOUND((mask), bound);
|
||||
if (!bound && orte_odls_base.warn_if_not_bound) {
|
||||
send_warn_show_help(pipe_fd, "help-orte-odls-base.txt",
|
||||
"warn not bound", "slot list"
|
||||
"Request resulted in binding to all available processors",
|
||||
orte_process_info.nodename, context->app,
|
||||
"bind to slot list", child->slot_list);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* This function always prints a message: it may be a warning or an
|
||||
* error.
|
||||
*
|
||||
* If binding is not required for this process, then print a simple
|
||||
* warning message and return an error code. If binding *is*
|
||||
* required, then send an error message up the pipe to the parent and
|
||||
* exit.
|
||||
*/
|
||||
static int bind_failed_msg(const char *msg, orte_mapping_policy_t policy,
|
||||
int return_code_if_warning,
|
||||
int pipe_fd, const char *app_name,
|
||||
const char *filename, int line_num)
|
||||
{
|
||||
/* If binding is not required, then send a warning up the pipe and
|
||||
then return an error code. */
|
||||
if (ORTE_BINDING_NOT_REQUIRED(policy)) {
|
||||
send_warn_show_help(pipe_fd,
|
||||
"help-orte-odls-default.txt", "not bound",
|
||||
orte_process_info.nodename, app_name, msg,
|
||||
filename, line_num);
|
||||
return return_code_if_warning;
|
||||
}
|
||||
|
||||
/* If binding is required, send an error up the pipe (which exits
|
||||
-- it doesn't return). */
|
||||
send_error_show_help(pipe_fd, 1, "help-orte-odls-default.txt",
|
||||
"binding generic error",
|
||||
orte_process_info.nodename, app_name, msg,
|
||||
filename, line_num);
|
||||
/* Does not return */
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Similar to bind_failed_msg(), but if binding is not required, do
|
||||
* not output a message (just return an error code). If binding is
|
||||
* required, handling is the same as for bind_failed_msg().
|
||||
*/
|
||||
static int bind_failed(const char *msg, orte_mapping_policy_t policy,
|
||||
int return_code_if_warning,
|
||||
int pipe_fd, const char *app_name,
|
||||
const char *filename, int line_num)
|
||||
{
|
||||
if (ORTE_BINDING_NOT_REQUIRED(policy)) {
|
||||
return return_code_if_warning;
|
||||
}
|
||||
|
||||
/* This won't return, but use "return" statement here so that the
|
||||
compiler won't complain. */
|
||||
return bind_failed_msg(msg, policy, 0, pipe_fd, app_name,
|
||||
filename, line_num);
|
||||
}
|
||||
|
||||
/*
|
||||
* Bind the process to a core
|
||||
*/
|
||||
static int bind_to_core(orte_app_context_t* context,
|
||||
orte_odls_child_t *child,
|
||||
orte_odls_job_t *jobdat,
|
||||
bool *bound, int pipe_fd)
|
||||
{
|
||||
bool flag;
|
||||
int i, rc;
|
||||
char *tmp, *msg;
|
||||
int16_t n;
|
||||
orte_node_rank_t nrank, lrank;
|
||||
opal_paffinity_base_cpu_set_t mask;
|
||||
int target_socket, npersocket, logical_skt;
|
||||
int logical_cpu, phys_core, phys_cpu, ncpu;
|
||||
|
||||
*bound = false;
|
||||
|
||||
/* we want to bind this proc to a specific core, or multiple cores
|
||||
if the cpus_per_rank is > 0 */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:default:fork binding child %s to core(s) cpus/rank %d stride %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(child->name),
|
||||
(int)jobdat->cpus_per_rank, (int)jobdat->stride));
|
||||
|
||||
/* get the node rank */
|
||||
if (ORTE_NODE_RANK_INVALID ==
|
||||
(nrank = orte_ess.get_node_rank(child->name))) {
|
||||
send_error_show_help(pipe_fd, 1, "help-orte-odls-default.txt",
|
||||
"binding generic error",
|
||||
orte_process_info.nodename, context->app,
|
||||
"ess.get_node_rank returned NODE_RANK_INVALID",
|
||||
__FILE__, __LINE__);
|
||||
/* Does not return */
|
||||
}
|
||||
|
||||
/* get the local rank */
|
||||
if (ORTE_LOCAL_RANK_INVALID ==
|
||||
(lrank = orte_ess.get_local_rank(child->name))) {
|
||||
send_error_show_help(pipe_fd, 1, "help-orte-odls-default.txt",
|
||||
"binding generic error",
|
||||
orte_process_info.nodename, context->app,
|
||||
"ess.get_local_rank returned LOCAL_RANK_INVALID",
|
||||
__FILE__, __LINE__);
|
||||
/* Does not return */
|
||||
}
|
||||
|
||||
/* init the mask */
|
||||
OPAL_PAFFINITY_CPU_ZERO(mask);
|
||||
if (ORTE_MAPPING_NPERXXX & jobdat->policy) {
|
||||
/* we need to balance the children from this job
|
||||
across the available sockets */
|
||||
npersocket = jobdat->num_local_procs / orte_odls_globals.num_sockets;
|
||||
/* determine the socket to use based on those available */
|
||||
if (npersocket < 2) {
|
||||
/* if we only have 1/sock, or we have less procs than
|
||||
sockets, then just put it on the lrank socket */
|
||||
logical_skt = lrank;
|
||||
} else if (ORTE_MAPPING_BYSOCKET & jobdat->policy) {
|
||||
logical_skt = lrank % npersocket;
|
||||
} else {
|
||||
logical_skt = lrank / npersocket;
|
||||
}
|
||||
if (orte_odls_globals.bound) {
|
||||
/* if we are already bound (by some other entity), use
|
||||
this as an index into our available sockets */
|
||||
for (n = target_socket = 0;
|
||||
n < logical_skt &&
|
||||
target_socket < opal_bitmap_size(&orte_odls_globals.sockets);
|
||||
target_socket++) {
|
||||
if (opal_bitmap_is_set_bit(&orte_odls_globals.sockets,
|
||||
target_socket)) {
|
||||
n++;
|
||||
}
|
||||
}
|
||||
/* Did we have enough sockets? */
|
||||
if (n < logical_skt) {
|
||||
return bind_failed_msg("not enough processor sockets available",
|
||||
jobdat->policy,
|
||||
ORTE_ERR_NOT_FOUND,
|
||||
pipe_fd, context->app,
|
||||
__FILE__, __LINE__);
|
||||
}
|
||||
} else {
|
||||
rc = opal_paffinity_base_get_physical_socket_id(logical_skt,
|
||||
&target_socket);
|
||||
if (ORTE_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
return bind_failed_msg("OS does not provide processor topology info (physical socket ID)",
|
||||
jobdat->policy,
|
||||
ORTE_ERR_NOT_FOUND,
|
||||
pipe_fd, context->app,
|
||||
__FILE__, __LINE__);
|
||||
}
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
|
||||
"%s odls:default:fork child %s local rank %d npersocket %d logical socket %d target socket %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(child->name), lrank,
|
||||
npersocket, logical_skt, target_socket));
|
||||
/* set the starting point */
|
||||
logical_cpu = (lrank % npersocket) * jobdat->cpus_per_rank;
|
||||
/* bind to this socket */
|
||||
goto bind_socket;
|
||||
} else if (ORTE_MAPPING_BYSOCKET & jobdat->policy) {
|
||||
/* this corresponds to a mapping policy where
|
||||
* local rank 0 goes on socket 0, and local
|
||||
* rank 1 goes on socket 1, etc. - round robin
|
||||
* until all ranks are mapped
|
||||
*
|
||||
* NOTE: we already know our number of sockets
|
||||
* from when we initialized
|
||||
*/
|
||||
rc = opal_paffinity_base_get_physical_socket_id(lrank % orte_odls_globals.num_sockets, &target_socket);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
/* This may be a small memory leak, but this child is
|
||||
exiting soon anyway; keep the logic simple by not
|
||||
worrying about the small leak. */
|
||||
asprintf(&msg, "opal_paffinity_base_get_physical_socket_id(%d) returned \"%s\"",
|
||||
(lrank % orte_odls_globals.num_sockets),
|
||||
opal_strerror(OPAL_SOS_GET_ERROR_CODE(rc)));
|
||||
if (NULL == msg) {
|
||||
msg = "opal_paffinity_base_get_physical_socket_id() failed";
|
||||
}
|
||||
if (OPAL_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
msg = "OS does not provide processor topology information (physical socket ID)";
|
||||
}
|
||||
return bind_failed(msg, jobdat->policy, ORTE_ERR_NOT_SUPPORTED,
|
||||
pipe_fd, context->app, __FILE__, __LINE__);
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
|
||||
"bysocket lrank %d numsocks %d logical socket %d target socket %d", (int)lrank,
|
||||
(int)orte_odls_globals.num_sockets,
|
||||
(int)(lrank % orte_odls_globals.num_sockets),
|
||||
target_socket));
|
||||
/* my starting core within this socket has to be
|
||||
offset by cpus_per_rank */
|
||||
logical_cpu = (lrank / orte_odls_globals.num_sockets) * jobdat->cpus_per_rank;
|
||||
|
||||
bind_socket:
|
||||
/* cycle across the cpus_per_rank */
|
||||
for (n=0; n < jobdat->cpus_per_rank; n++) {
|
||||
/* get the physical core within this target socket */
|
||||
rc = opal_paffinity_base_get_physical_core_id(target_socket, logical_cpu, &phys_core);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
/* Seem comment above about "This may be a small
|
||||
memory leak" */
|
||||
asprintf(&msg, "opal_paffinity_base_get_physical_core_id(%d, %d) returned \"%s\"",
|
||||
target_socket, logical_cpu,
|
||||
opal_strerror(OPAL_SOS_GET_ERROR_CODE(rc)));
|
||||
if (NULL == msg) {
|
||||
msg = "opal_paffinity_base_get_physical_core_id() failed";
|
||||
}
|
||||
if (OPAL_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
msg = "OS does not provide processor topology information (physical core ID)";
|
||||
}
|
||||
return bind_failed(msg, jobdat->policy, ORTE_ERR_NOT_SUPPORTED,
|
||||
pipe_fd, context->app, __FILE__, __LINE__);
|
||||
}
|
||||
/* map this to a physical cpu on this node */
|
||||
if (ORTE_SUCCESS != (rc = opal_paffinity_base_get_map_to_processor_id(target_socket, phys_core, &phys_cpu))) {
|
||||
/* Seem comment above about "This may be a small
|
||||
memory leak" */
|
||||
asprintf(&msg, "opal_paffinity_base_get_map_to_processor_id(%d, %d) returned \"%s\"",
|
||||
target_socket, phys_core,
|
||||
opal_strerror(OPAL_SOS_GET_ERROR_CODE(rc)));
|
||||
if (NULL == msg) {
|
||||
msg = "opal_paffinity_base_get_map_to_processor_id() failed";
|
||||
}
|
||||
if (OPAL_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
msg = "OS does not provide processor topology information (map socket,core->ID)";
|
||||
}
|
||||
return bind_failed(msg, jobdat->policy, ORTE_ERR_NOT_SUPPORTED,
|
||||
pipe_fd, context->app, __FILE__, __LINE__);
|
||||
}
|
||||
/* are we bound? */
|
||||
if (orte_odls_globals.bound) {
|
||||
/* see if this physical cpu is available to us */
|
||||
if (!OPAL_PAFFINITY_CPU_ISSET(phys_cpu, orte_odls_globals.my_cores)) {
|
||||
/* no it isn't - skip it */
|
||||
continue;
|
||||
}
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
|
||||
"%s odls:default:fork mapping phys socket %d core %d to phys_cpu %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
target_socket, phys_core, phys_cpu));
|
||||
OPAL_PAFFINITY_CPU_SET(phys_cpu, mask);
|
||||
/* increment logical cpu */
|
||||
logical_cpu += jobdat->stride;
|
||||
}
|
||||
if (orte_report_bindings) {
|
||||
tmp = opal_paffinity_base_print_binding(mask);
|
||||
opal_output(0, "%s odls:default:fork binding child %s to socket %d cpus %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(child->name), target_socket, tmp);
|
||||
free(tmp);
|
||||
}
|
||||
} else {
|
||||
/* my starting core has to be offset by cpus_per_rank */
|
||||
logical_cpu = nrank * jobdat->cpus_per_rank;
|
||||
for (n=0; n < jobdat->cpus_per_rank; n++) {
|
||||
/* are we bound? */
|
||||
if (orte_odls_globals.bound) {
|
||||
/* if we are bound, then use the logical_cpu as an
|
||||
index against our available cores */
|
||||
ncpu = 0;
|
||||
for (i = 0; i < OPAL_PAFFINITY_BITMASK_CPU_MAX &&
|
||||
ncpu <= logical_cpu; i++) {
|
||||
if (OPAL_PAFFINITY_CPU_ISSET(i,
|
||||
orte_odls_globals.my_cores)) {
|
||||
ncpu++;
|
||||
phys_cpu = i;
|
||||
}
|
||||
}
|
||||
/* if we don't have enough processors, that is an
|
||||
error */
|
||||
if (ncpu <= logical_cpu) {
|
||||
if (ORTE_BINDING_NOT_REQUIRED(jobdat->policy)) {
|
||||
return ORTE_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
send_error_show_help(pipe_fd, 1,
|
||||
"help-orte-odls-default.txt",
|
||||
"binding generic error",
|
||||
orte_process_info.nodename,
|
||||
context->app,
|
||||
"not enough logical processors",
|
||||
__FILE__, __LINE__);
|
||||
/* Does not return */
|
||||
}
|
||||
} else {
|
||||
/* if we are not bound, then all processors are
|
||||
available to us, so index into the node's array to
|
||||
get the physical cpu */
|
||||
rc = opal_paffinity_base_get_physical_processor_id(logical_cpu,
|
||||
&phys_cpu);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
/* No processor to bind to */
|
||||
/* Seem comment above about "This may be a small
|
||||
memory leak" */
|
||||
asprintf(&msg, "opal_paffinity_base_get_physical_processor_id(%d) returned \"%s\"",
|
||||
logical_cpu,
|
||||
opal_strerror(OPAL_SOS_GET_ERROR_CODE(rc)));
|
||||
if (NULL == msg) {
|
||||
msg = "opal_paffinity_base_get_physical_processor_id() failed";
|
||||
}
|
||||
if (OPAL_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
msg = "OS does not provide processor topology information (physical processor ID)";
|
||||
}
|
||||
return bind_failed(msg, jobdat->policy,
|
||||
ORTE_ERR_NOT_SUPPORTED,
|
||||
pipe_fd, context->app,
|
||||
__FILE__, __LINE__);
|
||||
}
|
||||
}
|
||||
OPAL_PAFFINITY_CPU_SET(phys_cpu, mask);
|
||||
/* increment logical cpu */
|
||||
logical_cpu += jobdat->stride;
|
||||
}
|
||||
if (orte_report_bindings) {
|
||||
tmp = opal_paffinity_base_print_binding(mask);
|
||||
opal_output(0, "%s odls:default:fork binding child %s to cpus %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(child->name), tmp);
|
||||
free(tmp);
|
||||
}
|
||||
}
|
||||
|
||||
/* Bind me! */
|
||||
if (ORTE_SUCCESS != (rc = opal_paffinity_base_set(mask))) {
|
||||
/* Seem comment above about "This may be a small memory
|
||||
leak" */
|
||||
asprintf(&msg, "opal_paffinity_base_set returned \"%s\"",
|
||||
opal_strerror(OPAL_SOS_GET_ERROR_CODE(rc)));
|
||||
if (NULL == msg) {
|
||||
msg = "opal_paffinity_base_set() failed";
|
||||
}
|
||||
return bind_failed(msg,
|
||||
jobdat->policy,
|
||||
OPAL_SOS_GET_ERROR_CODE(rc),
|
||||
pipe_fd, context->app, __FILE__, __LINE__);
|
||||
}
|
||||
*bound = true;
|
||||
|
||||
/* If the above work resulted in binding to everything (i.e.,
|
||||
effectively not binding), warn -- unless the warning is
|
||||
suppressed. */
|
||||
OPAL_PAFFINITY_PROCESS_IS_BOUND(mask, &flag);
|
||||
if (!flag && orte_odls_base.warn_if_not_bound) {
|
||||
send_warn_show_help(pipe_fd,
|
||||
"help-orte-odls-default.txt",
|
||||
"bound to everything",
|
||||
orte_process_info.nodename, context->app,
|
||||
__FILE__, __LINE__);
|
||||
} else if (orte_report_bindings) {
|
||||
tmp = opal_paffinity_base_print_binding(mask);
|
||||
opal_output(0, "%s odls:default:fork binding child %s to socket %d cpus %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(child->name), target_socket, tmp);
|
||||
free(tmp);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static int bind_to_socket(orte_app_context_t* context,
|
||||
orte_odls_child_t *child,
|
||||
orte_odls_job_t *jobdat,
|
||||
bool *bound, int pipe_fd)
|
||||
{
|
||||
bool flag;
|
||||
int i, rc;
|
||||
char *tmp, *msg;
|
||||
int16_t n;
|
||||
orte_node_rank_t lrank;
|
||||
opal_paffinity_base_cpu_set_t mask;
|
||||
int target_socket, npersocket, logical_skt;
|
||||
int logical_cpu, phys_core, phys_cpu, ncpu;
|
||||
|
||||
*bound = false;
|
||||
|
||||
/* bind this proc to a socket */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:default:fork binding child %s to socket",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(child->name)));
|
||||
/* layout this process across the sockets based on
|
||||
* the provided mapping policy
|
||||
*/
|
||||
if (ORTE_LOCAL_RANK_INVALID == (lrank = orte_ess.get_local_rank(child->name))) {
|
||||
send_error_show_help(pipe_fd, 1, "help-orte-odls-default.txt",
|
||||
"binding generic error",
|
||||
orte_process_info.nodename, context->app,
|
||||
"ess.get_local_rank returned NODE_RANK_INVALID",
|
||||
__FILE__, __LINE__);
|
||||
/* Does not return */
|
||||
}
|
||||
if (ORTE_MAPPING_NPERXXX & jobdat->policy) {
|
||||
/* we need to balance the children from this job
|
||||
across the available sockets */
|
||||
npersocket = jobdat->num_local_procs / orte_odls_globals.num_sockets;
|
||||
/* determine the socket to use based on those available */
|
||||
if (npersocket < 2) {
|
||||
/* if we only have 1/sock, or we have less
|
||||
* procs than sockets, then just put it on the
|
||||
* lrank socket
|
||||
*/
|
||||
logical_skt = lrank;
|
||||
} else if (ORTE_MAPPING_BYSOCKET & jobdat->policy) {
|
||||
logical_skt = lrank % npersocket;
|
||||
} else {
|
||||
logical_skt = lrank / npersocket;
|
||||
}
|
||||
if (orte_odls_globals.bound) {
|
||||
/* if we are bound, use this as an index into
|
||||
our available sockets */
|
||||
for (target_socket=0, n = 0; target_socket < opal_bitmap_size(&orte_odls_globals.sockets) && n < logical_skt; target_socket++) {
|
||||
if (opal_bitmap_is_set_bit(&orte_odls_globals.sockets, target_socket)) {
|
||||
n++;
|
||||
}
|
||||
}
|
||||
/* if we don't have enough sockets, that is an error */
|
||||
if (n < logical_skt) {
|
||||
return bind_failed_msg("not enough processor sockets available",
|
||||
jobdat->policy,
|
||||
ORTE_ERR_NOT_FOUND,
|
||||
pipe_fd, context->app,
|
||||
__FILE__, __LINE__);
|
||||
}
|
||||
} else {
|
||||
rc = opal_paffinity_base_get_physical_socket_id(logical_skt, &target_socket);
|
||||
if (ORTE_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
/* OS doesn't support providing topology
|
||||
information */
|
||||
return bind_failed_msg("OS does not provide processor topology info (physical socket ID)",
|
||||
jobdat->policy,
|
||||
ORTE_ERR_NOT_FOUND,
|
||||
pipe_fd, context->app,
|
||||
__FILE__, __LINE__);
|
||||
}
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
|
||||
"%s odls:default:fork child %s local rank %d npersocket %d logical socket %d target socket %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(child->name), lrank,
|
||||
npersocket, logical_skt, target_socket));
|
||||
} else if (ORTE_MAPPING_BYSOCKET & jobdat->policy) {
|
||||
/* this corresponds to a mapping policy where
|
||||
* local rank 0 goes on socket 0, and local
|
||||
* rank 1 goes on socket 1, etc. - round robin
|
||||
* until all ranks are mapped
|
||||
*
|
||||
* NOTE: we already know our number of sockets
|
||||
* from when we initialized
|
||||
*/
|
||||
rc = opal_paffinity_base_get_physical_socket_id(lrank % orte_odls_globals.num_sockets, &target_socket);
|
||||
if (ORTE_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
/* OS does not support providing topology
|
||||
information */
|
||||
return bind_failed_msg("OS does not provide processor topology info(physical socket ID)",
|
||||
jobdat->policy,
|
||||
ORTE_ERR_NOT_FOUND,
|
||||
pipe_fd, context->app,
|
||||
__FILE__, __LINE__);
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
|
||||
"bysocket lrank %d numsocks %d logical socket %d target socket %d", (int)lrank,
|
||||
(int)orte_odls_globals.num_sockets,
|
||||
(int)(lrank % orte_odls_globals.num_sockets),
|
||||
target_socket));
|
||||
} else {
|
||||
/* use a byslot-like policy where local rank 0 goes on
|
||||
* socket 0, and local rank 1 goes on socket 0, etc.
|
||||
* following round-robin until all ranks mapped
|
||||
*/
|
||||
if (orte_odls_globals.bound) {
|
||||
/* if we are bound, then we compute the
|
||||
* logical socket id based on the number of
|
||||
* available cores in each socket so that each
|
||||
* rank gets its own core, adjusting for the
|
||||
* cpus_per_task
|
||||
*/
|
||||
/* Find the lrank available core, accounting
|
||||
for cpus_per_task */
|
||||
logical_cpu = lrank * jobdat->cpus_per_rank;
|
||||
/* use the logical_cpu as an index against our
|
||||
available cores */
|
||||
ncpu = 0;
|
||||
for (i=0; i < orte_odls_globals.num_processors && ncpu <= logical_cpu; i++) {
|
||||
if (OPAL_PAFFINITY_CPU_ISSET(i, orte_odls_globals.my_cores)) {
|
||||
ncpu++;
|
||||
phys_cpu = i;
|
||||
}
|
||||
}
|
||||
/* if we don't have enough processors, that is
|
||||
an error */
|
||||
if (ncpu < logical_cpu) {
|
||||
send_error_show_help(pipe_fd, 1,
|
||||
"help-orte-odls-default.txt",
|
||||
"binding generic error",
|
||||
orte_process_info.nodename,
|
||||
context->app,
|
||||
"not enough logical processors",
|
||||
__FILE__, __LINE__);
|
||||
/* Does not return */
|
||||
}
|
||||
/* get the physical socket of that cpu */
|
||||
if (ORTE_SUCCESS != (rc = opal_paffinity_base_get_map_to_socket_core(phys_cpu, &target_socket, &phys_core))) {
|
||||
/* Seem comment above about "This may be a small
|
||||
memory leak" */
|
||||
asprintf(&msg, "opal_paffinity_base_get_map_to_socket_core(%d) returned \"%s\"",
|
||||
phys_cpu, opal_strerror(OPAL_SOS_GET_ERROR_CODE(rc)));
|
||||
if (NULL == msg) {
|
||||
msg = "opal_paffinity_base_get_map_to_socket_core() failed";
|
||||
}
|
||||
if (OPAL_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
msg = "OS does not provide processor topology information (map socket,core->ID)";
|
||||
}
|
||||
return bind_failed(msg, jobdat->policy,
|
||||
ORTE_ERR_NOT_SUPPORTED,
|
||||
pipe_fd, context->app,
|
||||
__FILE__, __LINE__);
|
||||
}
|
||||
} else {
|
||||
/* if we are not bound, then just use all sockets */
|
||||
if (1 == orte_odls_globals.num_sockets) {
|
||||
/* if we only have one socket, then just
|
||||
put it there */
|
||||
rc = opal_paffinity_base_get_physical_socket_id(0, &target_socket);
|
||||
if (ORTE_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
/* OS doesn't support providing
|
||||
topology information */
|
||||
                    return bind_failed_msg("OS does not provide processor topology info (physical socket ID)",
                                           jobdat->policy,
                                           ORTE_ERR_NOT_FOUND,
                                           pipe_fd, context->app,
                                           __FILE__, __LINE__);
                }
            } else {
                /* compute the logical socket, compensating for the
                 * number of cpus_per_rank */
                logical_skt = lrank / (orte_default_num_cores_per_socket / jobdat->cpus_per_rank);
                /* wrap that around the number of sockets so we round-robin */
                logical_skt = logical_skt % orte_odls_globals.num_sockets;
                /* now get the target physical socket */
                rc = opal_paffinity_base_get_physical_socket_id(logical_skt, &target_socket);
                if (ORTE_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
                    /* OS doesn't support providing topology information */
                    return bind_failed_msg("OS does not provide processor topology info (physical socket ID)",
                                           jobdat->policy,
                                           ORTE_ERR_NOT_FOUND,
                                           pipe_fd, context->app,
                                           __FILE__, __LINE__);
                }
            }
            OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
                                 "byslot lrank %d socket %d", (int)lrank, target_socket));
        }
    }

    OPAL_PAFFINITY_CPU_ZERO(mask);

    for (n=0; n < orte_default_num_cores_per_socket; n++) {
        /* get the physical core within this target socket */
        rc = opal_paffinity_base_get_physical_core_id(target_socket, n, &phys_core);
        if (OPAL_SUCCESS != rc) {
            /* See comment above about "This may be a small memory leak" */
            asprintf(&msg, "opal_paffinity_base_get_physical_core_id(%d, %d) returned \"%s\"",
                     target_socket, n,
                     opal_strerror(OPAL_SOS_GET_ERROR_CODE(rc)));
            if (NULL == msg) {
                msg = "opal_paffinity_base_get_physical_core_id() failed";
            }
            if (OPAL_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
                msg = "OS does not provide processor topology information (physical core ID)";
            }
            return bind_failed(msg, jobdat->policy,
                               ORTE_ERR_NOT_SUPPORTED,
                               pipe_fd, context->app,
                               __FILE__, __LINE__);
        }
        /* map this to a physical cpu on this node */
        if (ORTE_SUCCESS != (rc = opal_paffinity_base_get_map_to_processor_id(target_socket, phys_core, &phys_cpu))) {
            /* See comment above about "This may be a small memory leak" */
            asprintf(&msg, "opal_paffinity_base_get_map_to_processor_id(%d, %d) returned \"%s\"",
                     target_socket, phys_core,
                     opal_strerror(OPAL_SOS_GET_ERROR_CODE(rc)));
            if (NULL == msg) {
                msg = "opal_paffinity_base_get_map_to_processor_id() failed";
            }
            if (OPAL_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
                msg = "OS does not provide processor topology information (map socket,core->ID)";
            }
            return bind_failed(msg, jobdat->policy,
                               ORTE_ERR_NOT_SUPPORTED,
                               pipe_fd, context->app,
                               __FILE__, __LINE__);
        }
        /* are we bound? */
        if (orte_odls_globals.bound) {
            /* see if this physical cpu is available to us */
            if (!OPAL_PAFFINITY_CPU_ISSET(phys_cpu, orte_odls_globals.my_cores)) {
                /* no it isn't - skip it */
                continue;
            }
        }
        OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
                             "%s odls:default:fork mapping phys socket %d core %d to phys_cpu %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             target_socket, phys_core, phys_cpu));
        OPAL_PAFFINITY_CPU_SET(phys_cpu, mask);
    }

    /* Bind me! */
    if (ORTE_SUCCESS != (rc = opal_paffinity_base_set(mask))) {
        /* See comment above about "This may be a small memory leak" */
        asprintf(&msg, "opal_paffinity_base_set() returned \"%s\"",
                 opal_strerror(OPAL_SOS_GET_ERROR_CODE(rc)));
        if (NULL == msg) {
            msg = "opal_paffinity_base_set() failed";
        }
        return bind_failed(msg,
                           jobdat->policy,
                           OPAL_SOS_GET_ERROR_CODE(rc),
                           pipe_fd, context->app, __FILE__, __LINE__);
    }
    *bound = true;

    /* If the above work resulted in binding to everything (i.e.,
       effectively not binding), warn -- unless the warning is
       suppressed. */
    OPAL_PAFFINITY_PROCESS_IS_BOUND(mask, &flag);
    if (!flag && orte_odls_base.warn_if_not_bound) {
        send_warn_show_help(pipe_fd,
                            "help-orte-odls-default.txt",
                            "bound to everything",
                            orte_process_info.nodename, context->app,
                            __FILE__, __LINE__);
    } else if (orte_report_bindings) {
        tmp = opal_paffinity_base_print_binding(mask);
        opal_output(0, "%s odls:default:fork binding child %s to socket %d cpus %s",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    ORTE_NAME_PRINT(child->name), target_socket, tmp);
        free(tmp);
    }

    return ORTE_SUCCESS;
}

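The logical-socket selection above is plain integer arithmetic. As a standalone illustration only (not part of the commit; 2 sockets, 4 cores per socket, and cpus_per_rank = 2 are hypothetical values), the resulting round-robin placement is:

#include <stdio.h>

int main(void)
{
    int num_sockets = 2, cores_per_socket = 4, cpus_per_rank = 2;
    for (int lrank = 0; lrank < 8; lrank++) {
        /* same formula as bind_to_socket: cores_per_socket/cpus_per_rank
         * ranks fit on a socket, then we wrap around */
        int logical_skt = (lrank / (cores_per_socket / cpus_per_rank)) % num_sockets;
        printf("lrank %d -> logical socket %d\n", lrank, logical_skt);
    }
    /* prints: lranks 0,1 -> socket 0; 2,3 -> socket 1; 4,5 -> socket 0; 6,7 -> socket 1 */
    return 0;
}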
static int bind_to_board(orte_app_context_t* context,
                         orte_odls_child_t *child,
                         orte_odls_job_t *jobdat,
                         bool *bound, int pipe_fd)
{
    /* Not currently supported until multi-board paffinity enabled.
       But this is not an error -- for now. */
    *bound = false;
    if (orte_odls_base.warn_if_not_bound) {
        send_warn_show_help(pipe_fd, "help-orte-odls-base.txt",
                            "warn not bound", "board",
                            "Not currently supported by Open MPI",
                            orte_process_info.nodename, context->app,
                            "Bind to board", "");
    }

    return ORTE_ERR_NOT_SUPPORTED;
}


static int do_child(orte_app_context_t* context,
                    orte_odls_child_t *child,
                    char **environ_copy,
@ -1127,10 +388,11 @@ static int do_child(orte_app_context_t* context,
    int i;
    sigset_t sigs;
    long fd, fdmax = sysconf(_SC_OPEN_MAX);
    bool paffinity_enabled = false;
    char *param, *tmp;
    opal_paffinity_base_cpu_set_t mask;

#if OPAL_HAVE_HWLOC
    int rc;
    char *param, *msg;
#endif

    if (orte_forward_job_control) {
        /* Set a new process group for this child, so that a
           SIGSTOP can be sent to it without being sent to the
@ -1164,31 +426,117 @@ static int do_child(orte_app_context_t* context,
                             orte_process_info.nodename, context->app);
        /* Does not return */
    }

    /* Setup process affinity. Not for the meek. */

    if (NULL != child->slot_list) {
        bind_to_slot_list(context, child, jobdat,
                          &paffinity_enabled, write_fd);
    } else if (ORTE_BIND_TO_CORE & jobdat->policy) {
        bind_to_core(context, child, jobdat,
                     &paffinity_enabled, write_fd);
    } else if (ORTE_BIND_TO_SOCKET & jobdat->policy) {
        bind_to_socket(context, child, jobdat,
                       &paffinity_enabled, write_fd);
    } else if (ORTE_BIND_TO_BOARD & jobdat->policy) {
        bind_to_board(context, child, jobdat,
                      &paffinity_enabled, write_fd);
    }

    /* If we were able to set processor affinity, then also
       setup memory affinity. */
    if (paffinity_enabled) {
        if (OPAL_SUCCESS == opal_maffinity_base_open() &&
            OPAL_SUCCESS == opal_maffinity_base_select()) {

#if OPAL_HAVE_HWLOC
    {
        hwloc_cpuset_t cpuset;

        /* Set process affinity, if given */
        if (NULL != child->cpu_bitmap) {
            /* convert the list to a cpu bitmap */
            cpuset = hwloc_bitmap_alloc();
            if (0 != (rc = hwloc_bitmap_list_sscanf(cpuset, child->cpu_bitmap))) {
                /* See comment above about "This may be a small memory leak" */
                asprintf(&msg, "hwloc_bitmap_list_sscanf returned \"%s\" for the string \"%s\"",
                         opal_strerror(rc), child->cpu_bitmap);
                if (NULL == msg) {
                    msg = "failed to convert bitmap list to hwloc bitmap";
                }
                if (OPAL_BINDING_REQUIRED(jobdat->binding)) {
                    /* If binding is required, send an error up the pipe (which exits
                       -- it doesn't return). */
                    send_error_show_help(write_fd, 1, "help-orte-odls-default.txt",
                                         "binding generic error",
                                         orte_process_info.nodename,
                                         context->app, msg,
                                         __FILE__, __LINE__);
                } else {
                    send_warn_show_help(write_fd,
                                        "help-orte-odls-default.txt", "not bound",
                                        orte_process_info.nodename, context->app, msg,
                                        __FILE__, __LINE__);
                    goto PROCEED;
                }
            }
            /* bind as specified */
            if (opal_hwloc_report_bindings) {
                opal_output(0, "%s odls:default binding child %s to cpus %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(child->name), child->cpu_bitmap);
            }
            rc = hwloc_set_cpubind(opal_hwloc_topology, cpuset, 0);
            if (rc < 0) {
                char *tmp = NULL;
                if (errno == ENOSYS) {
                    msg = "hwloc indicates cpu binding not supported";
                } else if (errno == EXDEV) {
                    msg = "hwloc indicates cpu binding cannot be enforced";
                } else {
                    asprintf(&msg, "hwloc_set_cpubind returned \"%s\" for bitmap \"%s\"",
                             opal_strerror(rc), child->cpu_bitmap);
                }
                if (OPAL_BINDING_REQUIRED(jobdat->binding)) {
                    /* If binding is required, send an error up the pipe (which exits
                       -- it doesn't return). */
                    send_error_show_help(write_fd, 1, "help-orte-odls-default.txt",
                                         "binding generic error",
                                         orte_process_info.nodename, context->app, msg,
                                         __FILE__, __LINE__);
                } else {
                    send_warn_show_help(write_fd,
                                        "help-orte-odls-default.txt", "not bound",
                                        orte_process_info.nodename, context->app, msg,
                                        __FILE__, __LINE__);
                    if (NULL != tmp) {
                        free(tmp);
                        free(msg);
                    }
                    goto PROCEED;
                }
                if (NULL != tmp) {
                    free(tmp);
                    free(msg);
                }
            }
            /* set memory affinity policy */
            if (ORTE_SUCCESS != opal_hwloc_base_set_process_membind_policy()) {
                if (errno == ENOSYS) {
                    msg = "hwloc indicates memory binding not supported";
                } else if (errno == EXDEV) {
                    msg = "hwloc indicates memory binding cannot be enforced";
                } else {
                    msg = "failed to bind memory";
                }
                if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
                    /* If binding is required, send an error up the pipe (which exits
                       -- it doesn't return). */
                    send_error_show_help(write_fd, 1, "help-orte-odls-default.txt",
                                         "memory binding error",
                                         orte_process_info.nodename, context->app, msg,
                                         __FILE__, __LINE__);
                } else {
                    send_warn_show_help(write_fd,
                                        "help-orte-odls-default.txt", "memory not bound",
                                        orte_process_info.nodename, context->app, msg,
                                        __FILE__, __LINE__);
                    goto PROCEED;
                }
            }
            opal_maffinity_setup = true;
            /* Set an info MCA param that tells the launched processes
               that it was bound by us (e.g., so that MPI_INIT doesn't
               try to bind itself) */
            param = mca_base_param_environ_variable("opal","bound","at_launch");
            opal_setenv(param, "1", true, &environ_copy);
            free(param);
            /* ...and provide a nice string representation of what we
               bound to */
            param = mca_base_param_environ_variable("opal","base","applied_binding");
            opal_setenv(param, child->cpu_bitmap, true, &environ_copy);
        }
    }
#endif

    } else if (!(ORTE_JOB_CONTROL_FORWARD_OUTPUT & jobdat->controls)) {
        /* tie stdin/out/err/internal to /dev/null */
        int fdnull;
@ -1205,26 +553,10 @@ static int do_child(orte_app_context_t* context,
        }
        close(fdnull);
    }

    /* If we are able to bind, then set an info MCA param that tells
       the launched processes that it was bound by us (e.g., so that
       MPI_INIT doesn't try to bind itself) */
    if (paffinity_enabled) {
        param = mca_base_param_environ_variable("paffinity","base","bound");
        opal_setenv(param, "1", true, &environ_copy);
        free(param);
        /* ...and provide a nice string representation of what we
           bound to */
        if (OPAL_SUCCESS == opal_paffinity_base_get(&mask)) {
            tmp = opal_paffinity_base_print_binding(mask);
            if (NULL != tmp) {
                param = mca_base_param_environ_variable("paffinity","base","applied_binding");
                opal_setenv(param, tmp, true, &environ_copy);
                free(tmp);
            }
        }
    }


#if OPAL_HAVE_HWLOC
 PROCEED:
#endif
    /* close all file descriptors w/ exception of stdin/stdout/stderr,
       the pipe used for the IOF INTERNAL messages, and the pipe up to
       the parent. */
@ -1310,7 +642,7 @@ static int do_parent(orte_app_context_t* context,
        rc = opal_fd_read(read_fd, sizeof(msg), &msg);

        /* If the pipe closed, then the child successfully launched */
        if (OPAL_ERR_TIMEOUT == OPAL_SOS_GET_ERROR_CODE(rc)) {
        if (OPAL_ERR_TIMEOUT == rc) {
            break;
        }

@ -1393,7 +725,7 @@ static int do_parent(orte_app_context_t* context,
            child->alive = false;
        }
        close(read_fd);
        return ORTE_SUCCESS;
        return ORTE_ERR_FAILED_TO_START;
    }
}

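The hwloc calls used in do_child above follow hwloc's stock parse-then-bind sequence. A minimal self-contained sketch against the plain hwloc API (the cpu list "0,2-3" is an arbitrary example, not taken from the code above):

#include <stdio.h>
#include <hwloc.h>

int main(void)
{
    hwloc_topology_t topo;
    hwloc_bitmap_t cpuset = hwloc_bitmap_alloc();

    hwloc_topology_init(&topo);
    hwloc_topology_load(topo);

    /* parse a list-format bitmap, as do_child does with child->cpu_bitmap */
    if (0 != hwloc_bitmap_list_sscanf(cpuset, "0,2-3")) {
        fprintf(stderr, "could not parse cpu list\n");
        return 1;
    }
    /* bind the calling process; the failure modes mirror those handled
     * above (ENOSYS: not supported, EXDEV: cannot be enforced) */
    if (hwloc_set_cpubind(topo, cpuset, 0) < 0) {
        perror("hwloc_set_cpubind");
    }
    hwloc_bitmap_free(cpuset);
    hwloc_topology_destroy(topo);
    return 0;
}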
@ -9,6 +9,7 @@
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2010-2011 Oak Ridge National Labs.  All rights reserved.
 * Copyright (c) 2011      Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -33,9 +34,11 @@
#include "opal/dss/dss_types.h"
#include "opal/threads/mutex.h"
#include "opal/threads/condition.h"
#include "opal/mca/hwloc/hwloc.h"

#include "orte/mca/plm/plm_types.h"
#include "orte/mca/grpcomm/grpcomm_types.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/runtime/orte_globals.h"

BEGIN_C_DECLS
@ -107,7 +110,9 @@ typedef struct {
    bool init_recvd;           /* process called orte_init */
    bool fini_recvd;           /* process called orte_finalize */
    char *rml_uri;             /* contact info for this child */
    char *slot_list;           /* list of slots for this child */
#if OPAL_HAVE_HWLOC
    char *cpu_bitmap;          /* binding pattern for this child */
#endif
    bool waitpid_recvd;        /* waitpid has detected proc termination */
    bool iof_complete;         /* IOF has noted proc terminating all channels */
    struct timeval starttime;  /* when the proc was started - for timing purposes only */
@ -133,7 +138,9 @@ typedef struct orte_odls_job_t {
    bool launch_msg_processed;      /* launch msg has been fully processed */
    opal_pointer_array_t apps;      /* app_contexts for this job */
    orte_app_idx_t num_apps;        /* number of app_contexts */
    orte_mapping_policy_t policy;   /* mapping policy */
#if OPAL_HAVE_HWLOC
    opal_binding_policy_t binding;  /* binding policy */
#endif
    int16_t cpus_per_rank;          /* number of cpus/rank */
    int16_t stride;                 /* step size between cores of multi-core/rank procs */
    orte_job_controls_t controls;   /* control flags for job */

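Because the new cpu_bitmap field only exists when hwloc is built in, any code touching it must carry the same guard. A small sketch of that pattern with a stand-in struct and macro (not the real orte_odls_child_t or OPAL_HAVE_HWLOC):

#include <stdio.h>

#define HAVE_HWLOC 1   /* stand-in for OPAL_HAVE_HWLOC */

typedef struct {
    char *rml_uri;
#if HAVE_HWLOC
    char *cpu_bitmap;   /* binding pattern, present only with hwloc */
#endif
} child_t;

int main(void)
{
    child_t child = { .rml_uri = "tcp://example" };
#if HAVE_HWLOC
    child.cpu_bitmap = "0-3";   /* same guard around every access */
    printf("binding pattern: %s\n", child.cpu_bitmap);
#endif
    return 0;
}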
@ -9,7 +9,7 @@
 * University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2007-2010 Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2007-2011 Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2009      Institut National de Recherche en Informatique
 *                         et Automatique. All rights reserved.
 * $COPYRIGHT$
@ -81,41 +81,44 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid)));

    /* if the job is not being restarted or hasn't already been given a jobid, prep it */
    if (ORTE_JOB_STATE_RESTART != jdata->state && ORTE_JOBID_INVALID == jdata->jobid) {
        /* get a jobid for it */
        if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(jdata))) {
    /* if this is the daemon job, we don't perform certain functions */
    if (jdata->jobid != ORTE_PROC_MY_NAME->jobid) {
        /* if the job is not being restarted or hasn't already been given a jobid, prep it */
        if (ORTE_JOB_STATE_RESTART != jdata->state && ORTE_JOBID_INVALID == jdata->jobid) {
            /* get a jobid for it */
            if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(jdata))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }

            /* store it on the global job data pool */
            ljob = ORTE_LOCAL_JOBID(jdata->jobid);
            opal_pointer_array_set_item(orte_job_data, ljob, jdata);

            /* set the job state */
            jdata->state = ORTE_JOB_STATE_INIT;

            /* if job recovery is not defined, set it to default */
            if (!jdata->recovery_defined) {
                /* set to system default */
                jdata->enable_recovery = orte_enable_recovery;
            }
            /* if app recovery is not defined, set apps to defaults */
            for (i=0; i < jdata->apps->size; i++) {
                if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
                    continue;
                }
                if (!app->recovery_defined) {
                    app->max_restarts = orte_max_restarts;
                }
            }
        }

        /* get the allocation */
        if (ORTE_SUCCESS != (rc = orte_ras.allocate(jdata))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* store it on the global job data pool */
        ljob = ORTE_LOCAL_JOBID(jdata->jobid);
        opal_pointer_array_set_item(orte_job_data, ljob, jdata);

        /* set the job state */
        jdata->state = ORTE_JOB_STATE_INIT;

        /* if job recovery is not defined, set it to default */
        if (!jdata->recovery_defined) {
            /* set to system default */
            jdata->enable_recovery = orte_enable_recovery;
        }
        /* if app recovery is not defined, set apps to defaults */
        for (i=0; i < jdata->apps->size; i++) {
            if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
                continue;
            }
            if (!app->recovery_defined) {
                app->max_restarts = orte_max_restarts;
            }
        }
    }

    /* get the allocation */
    if (ORTE_SUCCESS != (rc = orte_ras.allocate(jdata))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    if (ORTE_SUCCESS != (rc = orte_rmaps.map_job(jdata))) {
@ -123,64 +126,6 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
        return rc;
    }

#if 0
    /* RHC: Please leave this code here - it is needed for
     * rare debugging that doesn't merit a separate debug-flag,
     * but is a pain to have to replicate when needed
     */
    {
        char *crud;
        orte_odls_job_t *jobdat;
        crud = orte_regex_encode_maps(jdata);
        opal_output(0, "maps regex: %s", (NULL == crud) ? "NULL" : crud);
        if (NULL == crud) {
            orte_never_launched = true;
            ORTE_UPDATE_EXIT_STATUS(0);
            orte_jobs_complete();
            return ORTE_ERROR;
        }
        orte_util_nidmap_init(NULL);
        orte_regex_decode_maps(crud, &jobdat);
        free(crud);
        /* print-out the map */
        orte_nidmap_dump();
        orte_jobmap_dump();
        /* printout the jobdat */
        opal_output(orte_clean_output, "**** DUMP OF JOBDAT %s (%d nodes %d procs) ***",
                    ORTE_JOBID_PRINT(jobdat->jobid), (int)jobdat->num_nodes, (int)(jobdat->num_procs));
        opal_output(orte_clean_output, "\tNum slots: %d\tControl: %x\tStdin: %d",
                    (int)jobdat->total_slots_alloc, jobdat->controls, (int)jobdat->stdin_target);
        opal_output(orte_clean_output, "\tApp: %s", jobdat->apps[0]->app);
        opal_output(orte_clean_output, "\tCwd: %s", jobdat->apps[0]->cwd);
        crud = opal_argv_join(jobdat->apps[0]->argv, ',');
        opal_output(orte_clean_output, "\tArgv: %s", crud);
        free(crud);
        crud = opal_argv_join(jobdat->apps[0]->env, ',');
        opal_output(orte_clean_output, "\tEnv: %s", crud);
        free(crud);
        orte_never_launched = true;
        ORTE_UPDATE_EXIT_STATUS(0);
        orte_jobs_complete();
        return ORTE_ERROR;
    }

    {
        opal_byte_object_t bo;

        /* construct a nodemap */
        if (ORTE_SUCCESS != (rc = orte_util_encode_nodemap(&bo))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        if (ORTE_SUCCESS != (rc = orte_util_decode_nodemap(&bo))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /* print-out the map */
        orte_nidmap_dump();
    }
#endif

    /* if we don't want to launch, now is the time to leave */
    if (orte_do_not_launch) {
        orte_never_launched = true;
@ -192,7 +137,8 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
    /* quick sanity check - is the stdin target within range
     * of the job?
     */
    if (ORTE_VPID_WILDCARD != jdata->stdin_target &&
    if (jdata->jobid != ORTE_PROC_MY_NAME->jobid &&
        ORTE_VPID_WILDCARD != jdata->stdin_target &&
        ORTE_VPID_INVALID != jdata->stdin_target &&
        jdata->num_procs <= jdata->stdin_target) {
        /* this request cannot be met */
@ -551,6 +497,9 @@ static void process_orted_launch_report(int fd, short event, void *data)
        OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                             "%s RECEIVED TOPOLOGY FROM NODE %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodename));
        if (10 < opal_output_get_verbosity(orte_plm_globals.output)) {
            opal_dss.dump(0, topo, OPAL_HWLOC_TOPO);
        }
        /* do we already have this topology from some other node? */
        found = false;
        for (i=0; i < orte_node_topologies->size; i++) {
@ -573,6 +522,7 @@ static void process_orted_launch_report(int fd, short event, void *data)
            OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                 "%s NEW TOPOLOGY - ADDING",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

            opal_pointer_array_add(orte_node_topologies, topo);
            node->topology = topo;
        }
@ -776,12 +726,14 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
    if (orted_spin_flag) {
        opal_argv_append(argc, argv, "--spin");
    }
    if (orte_report_bindings) {
#if OPAL_HAVE_HWLOC
    if (opal_hwloc_report_bindings) {
        opal_argv_append(argc, argv, "--report-bindings");
    }
    if (orte_hetero_nodes) {
        opal_argv_append(argc, argv, "--hetero-nodes");
    }
#endif

    if ((int)ORTE_VPID_INVALID != orted_debug_failure) {
        opal_argv_append(argc, argv, "--debug-failure");

@ -203,7 +203,6 @@ int orte_plm_base_local_slave_launch(orte_job_t *jdata)
    orte_node_t *node;
    char *nodename;
    char *exec_path;
    bool flag;
    orte_app_context_t *app;
    int rc;
    pid_t pid;
@ -221,7 +220,7 @@ int orte_plm_base_local_slave_launch(orte_job_t *jdata)

    /* identify the target host - can only be one! */
    OBJ_CONSTRUCT(&hosts, opal_list_t);
    if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&hosts, &flag, app->dash_host))) {
    if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&hosts, app->dash_host))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&hosts);
        return rc;

@ -31,5 +31,8 @@ please check that none of the following exist:
  MCA param file: orte_allocation_required = 1
  Environment: OMPI_MCA_orte_allocation_required=1
  Cmd line: -mca orte_allocation_required 1


#
[ras-sim:mismatch]
The number of topology files and the list of number of nodes
must match - i.e., a number of nodes must be given for each
topology.

@ -29,11 +29,11 @@
#include "opal/mca/base/base.h"
#include "opal/class/opal_list.h"
#include "opal/util/output.h"
#include "opal/util/opal_sos.h"
#include "opal/dss/dss.h"

#include "orte/util/show_help.h"
#include "opal/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/base/base.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
@ -91,7 +91,6 @@ int orte_ras_base_allocate(orte_job_t *jdata)
    opal_list_t nodes;
    orte_node_t *node;
    orte_std_cntr_t i;
    bool override_oversubscribed;
    orte_app_context_t *app;

    OPAL_OUTPUT_VERBOSE((5, orte_ras_base.ras_output,
@ -139,7 +138,7 @@ int orte_ras_base_allocate(orte_job_t *jdata)
    if (NULL != orte_ras_base.active_module) {
        /* read the allocation */
        if (ORTE_SUCCESS != (rc = orte_ras_base.active_module->allocate(&nodes))) {
            if (ORTE_ERR_SYSTEM_WILL_BOOTSTRAP == OPAL_SOS_GET_ERROR_CODE(rc)) {
            if (ORTE_ERR_SYSTEM_WILL_BOOTSTRAP == rc) {
                /* this module indicates that nodes will be discovered
                 * on a bootstrap basis, so all we do here is add our
                 * own node to the list
@ -162,6 +161,10 @@ int orte_ras_base_allocate(orte_job_t *jdata)
                return rc;
            }
            OBJ_DESTRUCT(&nodes);
            /* default to no-oversubscribe-allowed for managed systems */
            if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
                ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
            }
            goto DISPLAY;
        } else if (orte_allocation_required) {
            /* if nothing was found, and an allocation is
@ -197,7 +200,6 @@ int orte_ras_base_allocate(orte_job_t *jdata)

        /* a default hostfile was provided - parse it */
        if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes,
                                                               &override_oversubscribed,
                                                               orte_default_hostfile))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&nodes);
@ -214,8 +216,6 @@ int orte_ras_base_allocate(orte_job_t *jdata)
        if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
            ORTE_ERROR_LOG(rc);
        }
        /* update the jdata object with override_oversubscribed flag */
        jdata->oversubscribe_override = override_oversubscribed;
        /* cleanup */
        OBJ_DESTRUCT(&nodes);
        goto DISPLAY;
@ -246,7 +246,6 @@ int orte_ras_base_allocate(orte_job_t *jdata)

            /* hostfile was specified - parse it and add it to the list */
            if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes,
                                                                   &override_oversubscribed,
                                                                   app->hostfile))) {
                ORTE_ERROR_LOG(rc);
                OBJ_DESTRUCT(&nodes);
@ -265,8 +264,6 @@ int orte_ras_base_allocate(orte_job_t *jdata)
        if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
            ORTE_ERROR_LOG(rc);
        }
        /* update the jdata object with override_oversubscribed flag */
        jdata->oversubscribe_override = override_oversubscribed;
        /* cleanup */
        OBJ_DESTRUCT(&nodes);
        goto DISPLAY;
@ -300,7 +297,6 @@ int orte_ras_base_allocate(orte_job_t *jdata)
            }
            if (NULL != app->dash_host) {
                if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&nodes,
                                                                        &override_oversubscribed,
                                                                        app->dash_host))) {
                    ORTE_ERROR_LOG(rc);
                    OBJ_DESTRUCT(&nodes);
@ -319,8 +315,6 @@ int orte_ras_base_allocate(orte_job_t *jdata)
        if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
            ORTE_ERROR_LOG(rc);
        }
        /* update the jdata object with override_oversubscribed flag */
        jdata->oversubscribe_override = override_oversubscribed;
        /* cleanup */
        OBJ_DESTRUCT(&nodes);
        goto DISPLAY;
@ -336,7 +330,6 @@ int orte_ras_base_allocate(orte_job_t *jdata)
    if (NULL != orte_rankfile) {
        /* check the rankfile for node information */
        if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes,
                                                               &override_oversubscribed,
                                                               orte_rankfile))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&nodes);
@ -353,8 +346,6 @@ int orte_ras_base_allocate(orte_job_t *jdata)
        if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
            ORTE_ERROR_LOG(rc);
        }
        /* update the jdata object with override_oversubscribed flag */
        jdata->oversubscribe_override = false;
        /* cleanup */
        OBJ_DESTRUCT(&nodes);
        goto DISPLAY;
@ -383,8 +374,6 @@ addlocal:
    node->slots_inuse = 0;
    node->slots_max = 0;
    node->slots = 1;
    /* indicate that we don't know anything about over_subscribing */
    jdata->oversubscribe_override = true;
    opal_list_append(&nodes, &node->super);

    /* store the results in the global resource pool - this removes the
@ -416,7 +405,6 @@ int orte_ras_base_add_hosts(orte_job_t *jdata)
{
    int rc;
    opal_list_t nodes;
    bool override_oversubscribed;
    int i;
    orte_app_context_t *app;

@ -448,7 +436,6 @@ int orte_ras_base_add_hosts(orte_job_t *jdata)

            /* hostfile was specified - parse it and add it to the list */
            if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes,
                                                                   &override_oversubscribed,
                                                                   app->add_hostfile))) {
                ORTE_ERROR_LOG(rc);
                OBJ_DESTRUCT(&nodes);
@ -472,7 +459,6 @@ int orte_ras_base_add_hosts(orte_job_t *jdata)
            }
            if (NULL != app->add_host) {
                if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&nodes,
                                                                        &override_oversubscribed,
                                                                        app->add_host))) {
                    ORTE_ERROR_LOG(rc);
                    OBJ_DESTRUCT(&nodes);
@ -489,8 +475,6 @@ int orte_ras_base_add_hosts(orte_job_t *jdata)
    if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
        ORTE_ERROR_LOG(rc);
    }
    /* update the jdata object with override_oversubscribed flag */
    jdata->oversubscribe_override = override_oversubscribed;
    /* cleanup */
    OBJ_DESTRUCT(&nodes);
}

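The new no-oversubscribe default described in the commit message is implemented above as a check-then-set on a bitmask of mapping directives. A minimal sketch of that idiom with stand-in flag values (the real ORTE_MAPPING_* masks live in rmaps_types.h and are not reproduced here):

#include <stdio.h>
#include <stdint.h>

#define SUBSCRIBE_GIVEN   0x1000  /* user stated an oversubscribe preference */
#define NO_OVERSUBSCRIBE  0x2000  /* disallow oversubscription */

int main(void)
{
    uint16_t mapping = 0;   /* no user directive given */

    /* RM-managed allocation: if the user said nothing, default to
     * no-oversubscribe, exactly as the code above does */
    if (!(mapping & SUBSCRIBE_GIVEN)) {
        mapping |= NO_OVERSUBSCRIBE;
    }
    printf("oversubscription %s\n",
           (mapping & NO_OVERSUBSCRIBE) ? "disallowed" : "allowed");
    return 0;
}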
@ -65,8 +65,6 @@ orte_ras_sim_component_t mca_ras_simulator_component = {

static int ras_sim_open(void)
{
    int tmp;

    mca_base_param_reg_int(&mca_ras_simulator_component.super.base_version,
                           "slots",
                           "Number of slots on each node to simulate",
@ -76,24 +74,28 @@ static int ras_sim_open(void)
                           "Number of max slots on each node to simulate",
                           false, false, 0, &mca_ras_simulator_component.slots_max);
#if OPAL_HAVE_HWLOC
    mca_base_param_reg_string(&mca_ras_simulator_component.super.base_version,
                              "num_nodes",
                              "Comma-separated list of number of nodes to simulate for each topology",
                              false, false, NULL, &mca_ras_simulator_component.num_nodes);
    mca_base_param_reg_string(&mca_ras_simulator_component.super.base_version,
                              "topo_files",
                              "Comma-separated list of files containing xml topology descriptions for simulated nodes",
                              false, false, NULL, &mca_ras_simulator_component.topofiles);
    mca_base_param_reg_int(&mca_ras_simulator_component.super.base_version,
                           "have_cpubind",
                           "Topology supports binding to cpus",
                           false, false, (int)true, &tmp);
    mca_ras_simulator_component.have_cpubind = OPAL_INT_TO_BOOL(tmp);
    mca_base_param_reg_int(&mca_ras_simulator_component.super.base_version,
                           "have_membind",
                           "Topology supports binding to memory",
                           false, false, (int)true, &tmp);
    mca_ras_simulator_component.have_membind = OPAL_INT_TO_BOOL(tmp);
    {
        int tmp;

        mca_base_param_reg_string(&mca_ras_simulator_component.super.base_version,
                                  "num_nodes",
                                  "Comma-separated list of number of nodes to simulate for each topology",
                                  false, false, NULL, &mca_ras_simulator_component.num_nodes);
        mca_base_param_reg_string(&mca_ras_simulator_component.super.base_version,
                                  "topo_files",
                                  "Comma-separated list of files containing xml topology descriptions for simulated nodes",
                                  false, false, NULL, &mca_ras_simulator_component.topofiles);
        mca_base_param_reg_int(&mca_ras_simulator_component.super.base_version,
                               "have_cpubind",
                               "Topology supports binding to cpus",
                               false, false, (int)true, &tmp);
        mca_ras_simulator_component.have_cpubind = OPAL_INT_TO_BOOL(tmp);
        mca_base_param_reg_int(&mca_ras_simulator_component.super.base_version,
                               "have_membind",
                               "Topology supports binding to memory",
                               false, false, (int)true, &tmp);
        mca_ras_simulator_component.have_membind = OPAL_INT_TO_BOOL(tmp);
    }
#else
    mca_base_param_reg_string(&mca_ras_simulator_component.super.base_version,
                              "num_nodes",

@ -36,5 +36,12 @@ libmca_rmaps_la_SOURCES += \
        base/rmaps_base_map_job.c \
        base/rmaps_base_get_job_map.c \
        base/rmaps_base_support_fns.c \
        base/rmaps_base_common_mappers.c
        base/rmaps_base_ranking.c \
        base/rmaps_base_print_fns.c

if OPAL_HAVE_HWLOC
libmca_rmaps_la_SOURCES += \
        base/rmaps_base_binding.c
endif

endif

@ -9,6 +9,7 @@
 * University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2011      Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -58,18 +59,17 @@ typedef struct {
    opal_list_t available_components;
    /* list of selected modules */
    opal_list_t selected_modules;
    /** whether or not we allow oversubscription of nodes */
    bool oversubscribe;
    /* default ppr */
    char *ppr;
    /* cpus per rank */
    int cpus_per_rank;
    /* stride */
    int stride;
    /* do not allow use of the localhost */
    bool no_use_local;
    /* display the map after it is computed */
    bool display_map;
    /* slot list, if provided by user */
    char *slot_list;
    /* default mapping directives */
    orte_mapping_policy_t mapping;
    orte_ranking_policy_t ranking;
} orte_rmaps_base_t;

/**
@ -99,11 +99,19 @@ ORTE_DECLSPEC int orte_rmaps_base_get_vpid_range(orte_jobid_t jobid,
ORTE_DECLSPEC int orte_rmaps_base_set_vpid_range(orte_jobid_t jobid,
                                                 orte_vpid_t start, orte_vpid_t range);

/* pretty-print functions */
ORTE_DECLSPEC char* orte_rmaps_base_print_mapping(orte_mapping_policy_t mapping);
ORTE_DECLSPEC char* orte_rmaps_base_print_ranking(orte_ranking_policy_t ranking);

/**
 * Close down the rmaps framework
 */
ORTE_DECLSPEC int orte_rmaps_base_close(void);

#if OPAL_HAVE_HWLOC
ORTE_DECLSPEC int orte_rmaps_base_prep_topology(hwloc_topology_t topo);
#endif

#endif /* ORTE_DISABLE_FULL_SUPPORT */

END_C_DECLS

@ -10,6 +10,7 @@
# University of Stuttgart.  All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
#                         All rights reserved.
# Copyright (c) 2011      Cisco Systems, Inc.  All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -93,4 +94,94 @@ Your job failed to map. Either no mapper was available, or none
of the available mappers was able to perform the requested
mapping operation. This can happen if you request a map type
(e.g., loadbalance) and the corresponding mapper was not built.
#
[unrecognized-policy]
The specified %s policy is not recognized:

  Policy: %s

Please check for a typo or ensure that the option is a supported
one.
#
[redefining-policy]
Conflicting directives for %s policy are causing the policy
to be redefined:

  New policy:   %s
  Prior policy: %s

Please check that only one policy is defined.
#
[rmaps:binding-target-not-found]
A request was made to bind to %s, but an appropriate target could not
be found on node %s.
#
[rmaps:binding-overload]
A request was made to bind that would result in binding more
processes than cpus on a resource:

  Bind to:    %s
  Node:       %s
  #processes: %d
  #cpus:      %d

You can override this protection by adding the "overload-allowed"
option to your binding directive.
#
[rmaps:no-topology]
A request was made for nperxxx that requires knowledge of
a remote node's topology. However, no topology info is
available for the following node:

  Node: %s

The job cannot be executed under this condition. Please either
remove the nperxxx directive and specify the number of processes
to use, or investigate the lack of topology info.
#
[rmaps:no-available-cpus]
While computing bindings, we found no available cpus on
the following node:

  Node: %s

Please check your allocation.
#
[rmaps:cpubind-not-supported]
A request was made to bind a process, but at least one node does NOT
support binding processes to cpus.

  Node: %s
#
[rmaps:membind-not-supported]
WARNING: a request was made to bind a process. While the system
supports binding the process itself, at least one node does NOT
support binding memory to the process location.

  Node: %s

This is a warning only; your job will continue, though performance may
be degraded.
#
[rmaps:membind-not-supported-fatal]
A request was made to bind a process. While the system
supports binding the process itself, at least one node does NOT
support binding memory to the process location.

  Node: %s

The provided memory binding policy requires that we abort the
job at this time.
#
[rmaps:no-bindable-objects]
No bindable objects of the specified type were available
on at least one node:

  Node:   %s
  Target: %s
#
[rmaps:unknown-binding-level]
Unknown binding level:

  Target:      %s
  Cache level: %u

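The new rmaps_base_binding.c below repeatedly assigns each proc to the least-used target object when binding downwards. The selection loop it performs over the per-object usage counts in nbound[] reduces to this standalone sketch (illustration only; pick_least_loaded is a hypothetical name):

#include <stdio.h>
#include <limits.h>

static unsigned pick_least_loaded(const unsigned *nbound, unsigned nobjs)
{
    unsigned minval = UINT_MAX, idx = 0;
    for (unsigned n = 0; n < nobjs; n++) {
        if (nbound[n] < minval) {   /* first minimum wins, as in bind_downwards */
            minval = nbound[n];
            idx = n;
        }
    }
    return idx;
}

int main(void)
{
    unsigned nbound[4] = {2, 1, 1, 3};
    /* picks object 1: the first of the two least-used objects */
    printf("bind next proc to object %u\n", pick_least_loaded(nbound, 4));
    return 0;
}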
orte/mca/rmaps/base/rmaps_base_binding.c (new file, 688 lines)
@ -0,0 +1,688 @@
/*
 * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2011 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2011      Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "orte_config.h"
#include "orte/constants.h"

#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif  /* HAVE_UNISTD_H */
#include <string.h>

#include "opal/util/if.h"
#include "opal/util/output.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/threads/tsd.h"

#include "orte/types.h"
#include "orte/util/show_help.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/hostfile/hostfile.h"
#include "orte/util/dash_host/dash_host.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/runtime/data_type_support/orte_dt_support.h"

#include "orte/mca/rmaps/base/rmaps_private.h"
#include "orte/mca/rmaps/base/base.h"

static bool membind_warned=false;

static int bind_upwards(orte_job_t *jdata,
                        hwloc_obj_type_t target,
                        unsigned cache_level)
{
    /* traverse the hwloc topology tree on each node upwards
     * until we find an object of type target - and then bind
     * the process to that target
     */
    int i, j;
    orte_job_map_t *map;
    orte_node_t *node;
    orte_proc_t *proc;
    hwloc_obj_t obj;
    hwloc_cpuset_t cpus;
    unsigned int idx, ncpus, nobjs, nsave, *nbound=NULL;
    struct hwloc_topology_support *support;

    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps: bind upwards for job %s with bindings %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));
    /* initialize */
    map = jdata->map;

    for (i=0; i < map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }
        /* check if topology supports cpubind - if not, then we cannot bind */
        support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
        if (!support->cpubind->set_thisproc_cpubind) {
            if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy)) {
                /* we are not required to bind, so ignore this */
                continue;
            }
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name);
            if (NULL != nbound) {
                free(nbound);
            }
            return ORTE_ERR_SILENT;
        }
        /* check if topology supports membind */
        if (!support->membind->set_thisproc_membind) {
            if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
                membind_warned = true;
            } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name);
                if (NULL != nbound) {
                    free(nbound);
                }
                return ORTE_ERR_SILENT;
            }
        }

        /* get the number of objects of this type on this node */
        nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target,
                                                   cache_level, OPAL_HWLOC_AVAILABLE);
        if (0 == nobjs) {
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-bindable-objects", true,
                           node->name, hwloc_obj_type_string(target));
            return ORTE_ERR_SILENT;
        }
        /* setup the array */
        if (NULL == nbound) {
            nbound = (unsigned int*)malloc(nobjs * sizeof(int));
            nsave = nobjs;
        } else if (nsave < nobjs) {
            nbound = (unsigned int*)realloc(nbound, nobjs * sizeof(int));
        }
        memset(nbound, 0, nobjs * sizeof(int));

        /* cycle thru the procs */
        for (j=0; j < node->procs->size; j++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                continue;
            }
            /* ignore procs from other jobs */
            if (proc->name.jobid != jdata->jobid) {
                continue;
            }
            /* ignore procs that have already been bound - should
             * never happen, but safer
             */
            if (NULL != proc->cpu_bitmap) {
                continue;
            }
            /* bozo check */
            if (NULL == proc->locale) {
                opal_output(0, "BIND UPWARDS: LOCALE FOR PROC %s IS NULL", ORTE_NAME_PRINT(&proc->name));
                return ORTE_ERR_SILENT;
            }
            /* starting at the locale, move up thru the parents
             * to find the target object type
             */
            for (obj = proc->locale->parent; NULL != obj; obj = obj->parent) {
                opal_output(0, "%s bind:upward target %s type %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            hwloc_obj_type_string(target),
                            hwloc_obj_type_string(obj->type));
                if (target == obj->type) {
                    if (HWLOC_OBJ_CACHE == target && cache_level != obj->attr->cache.depth) {
                        continue;
                    }
                    /* get its index */
                    if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, obj, OPAL_HWLOC_AVAILABLE))) {
                        free(nbound);
                        return ORTE_ERR_SILENT;
                    }
                    /* track the number bound */
                    ++nbound[idx];
                    /* get the number of cpus under this location */
                    if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, obj))) {
                        orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
                        free(nbound);
                        return ORTE_ERR_SILENT;
                    }
                    /* error out if adding a proc would cause overload and that wasn't allowed */
                    if (ncpus < nbound[idx] &&
                        !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
                        orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
                                       opal_hwloc_base_print_binding(map->binding), node->name,
                                       nbound[idx], ncpus);
                        free(nbound);
                        return ORTE_ERR_SILENT;
                    }
                    /* bind it here */
                    proc->bind_idx = idx;
                    cpus = opal_hwloc_base_get_available_cpus(node->topology, obj);
                    hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, cpus);
                    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                        "%s BOUND PROC %s TO %s[%s:%u] on node %s",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                        ORTE_NAME_PRINT(&proc->name),
                                        proc->cpu_bitmap,
                                        hwloc_obj_type_string(target),
                                        idx, node->name);
                    break;
                }
            }
            if (NULL == proc->cpu_bitmap && OPAL_BINDING_REQUIRED(jdata->map->binding)) {
                /* didn't find anyone to bind to - this is an error
                 * unless the user specified if-supported
                 */
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-target-not-found", true,
                               opal_hwloc_base_print_binding(map->binding), node->name);
                free(nbound);
                return ORTE_ERR_SILENT;
            }
        }
    }

    if (NULL != nbound) {
        free(nbound);
    }

    return ORTE_SUCCESS;
}

static int bind_downwards(orte_job_t *jdata,
                          hwloc_obj_type_t target,
                          unsigned cache_level)
{
    int i, j;
    orte_job_map_t *map;
    orte_node_t *node;
    orte_proc_t *proc;
    hwloc_obj_t obj;
    hwloc_cpuset_t cpus;
    unsigned int n, idx, minval, ncpus, nobjs, nsave, *nbound=NULL;
    struct hwloc_topology_support *support;

    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps: bind downward for job %s with bindings %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));
    /* initialize */
    map = jdata->map;

    for (i=0; i < map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }
        /* check if topology supports cpubind - if not, then we cannot bind */
        support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
        if (!support->cpubind->set_thisproc_cpubind) {
            if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy)) {
                /* we are not required to bind, so ignore this */
                continue;
            }
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name);
            if (NULL != nbound) {
                free(nbound);
            }
            return ORTE_ERR_SILENT;
        }
        /* check if topology supports membind */
        if (!support->membind->set_thisproc_membind) {
            if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
                membind_warned = true;
            } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name);
                if (NULL != nbound) {
                    free(nbound);
                }
                return ORTE_ERR_SILENT;
            }
        }

        /* get the number of objects of this type on this node */
        nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target,
                                                   cache_level, OPAL_HWLOC_AVAILABLE);
        if (0 == nobjs) {
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-bindable-objects", true,
                           node->name, hwloc_obj_type_string(target));
            return ORTE_ERR_SILENT;
        }
        /* setup the array */
        if (NULL == nbound) {
            nbound = (unsigned int*)malloc(nobjs * sizeof(int));
            nsave = nobjs;
        } else if (nsave < nobjs) {
            nbound = (unsigned int*)realloc(nbound, nobjs * sizeof(int));
        }
        memset(nbound, 0, nobjs * sizeof(int));

        /* cycle thru the procs */
        for (j=0; j < node->procs->size; j++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                continue;
            }
            /* ignore procs from other jobs */
            if (proc->name.jobid != jdata->jobid) {
                continue;
            }
            /* ignore procs that have already been bound - should
             * never happen, but safer
             */
            if (NULL != proc->cpu_bitmap) {
                continue;
            }
            /* cycle across the target objects and select the one with
             * minimum usage
             */
            minval = UINT_MAX;
            idx = 0;
            for (n=0; n < nobjs; n++) {
                if (nbound[n] < minval) {
                    minval = nbound[n];
                    idx = n;
                }
            }
            /* track the number bound */
            ++nbound[idx];
            /* get the number of cpus under this location */
            obj = opal_hwloc_base_get_obj_by_type(node->topology, target, cache_level,
                                                  idx, OPAL_HWLOC_AVAILABLE);
            if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, obj))) {
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
                free(nbound);
                return ORTE_ERR_SILENT;
            }
            /* error out if adding a proc would cause overload and that wasn't allowed */
            if (ncpus < nbound[idx] &&
                !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
                               opal_hwloc_base_print_binding(map->binding), node->name,
                               nbound[idx], ncpus);
                free(nbound);
                return ORTE_ERR_SILENT;
            }
            /* bind the proc here */
            proc->bind_idx = idx;
            cpus = opal_hwloc_base_get_available_cpus(node->topology, obj);
            hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, cpus);
            opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                "%s BOUND PROC %s TO %s[%s:%u] on node %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&proc->name),
                                proc->cpu_bitmap, hwloc_obj_type_string(obj->type),
                                idx, node->name);
        }
    }

    if (NULL != nbound) {
        free(nbound);
    }

    return ORTE_SUCCESS;
}

static int bind_in_place(orte_job_t *jdata,
                         hwloc_obj_type_t target,
                         unsigned cache_level)
{
    /* traverse the hwloc topology tree on each node downwards
     * until we find an unused object of type target - and then bind
     * the process to that target
     */
    int i, j;
    orte_job_map_t *map;
    orte_node_t *node;
    orte_proc_t *proc;
    hwloc_cpuset_t cpus;
    unsigned int idx, ncpus, nobjs, nsave, *nbound=NULL;
    struct hwloc_topology_support *support;

    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps: bind in place for job %s with bindings %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));
    /* initialize */
    map = jdata->map;

    for (i=0; i < map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }
        /* check if topology supports cpubind - if not, then we cannot bind */
        support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
        if (!support->cpubind->set_thisproc_cpubind) {
            if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy)) {
                /* we are not required to bind, so ignore this */
                continue;
            }
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name);
            if (NULL != nbound) {
                free(nbound);
            }
            return ORTE_ERR_SILENT;
        }
        /* check if topology supports membind */
        if (!support->membind->set_thisproc_membind) {
            if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
                membind_warned = true;
            } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name);
                if (NULL != nbound) {
                    free(nbound);
                }
                return ORTE_ERR_SILENT;
            }
        }

        /* get the number of objects of this type on this node */
        nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target,
                                                   cache_level, OPAL_HWLOC_AVAILABLE);
        if (0 == nobjs) {
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-bindable-objects", true,
                           node->name, hwloc_obj_type_string(target));
            return ORTE_ERR_SILENT;
        }
        /* setup the array */
        if (NULL == nbound) {
            nbound = (unsigned int*)malloc(nobjs * sizeof(int));
            nsave = nobjs;
        } else if (nsave < nobjs) {
            nbound = (unsigned int*)realloc(nbound, nobjs * sizeof(int));
        }
        memset(nbound, 0, nobjs * sizeof(int));
        /* cycle thru the procs */
        for (j=0; j < node->procs->size; j++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                continue;
            }
            /* ignore procs from other jobs */
            if (proc->name.jobid != jdata->jobid) {
                continue;
            }
            /* ignore procs that have already been bound - should
             * never happen, but safer
             */
            if (NULL != proc->cpu_bitmap) {
                continue;
            }
            /* get the index of this location */
            if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, proc->locale, OPAL_HWLOC_AVAILABLE))) {
                free(nbound);
                return ORTE_ERR_SILENT;
            }
            opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                "BINDING PROC %s TO %s NUMBER %u",
                                ORTE_NAME_PRINT(&proc->name),
                                hwloc_obj_type_string(proc->locale->type), idx);
            /* get the number of cpus under this location */
            if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, proc->locale))) {
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
                free(nbound);
                return ORTE_ERR_SILENT;
            }
            /* track number bound */
            ++nbound[idx];
            /* error out if adding a proc would cause overload and that wasn't allowed */
            if (ncpus < nbound[idx] &&
                !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
                               opal_hwloc_base_print_binding(map->binding), node->name,
                               nbound[idx], ncpus);
                free(nbound);
                return ORTE_ERR_SILENT;
            }
            /* bind the proc here */
            proc->bind_idx = idx;
            cpus = opal_hwloc_base_get_available_cpus(node->topology, proc->locale);
            hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, cpus);
            opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                "%s BOUND PROC %s TO %s[%s:%u] on node %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&proc->name),
                                proc->cpu_bitmap,
                                hwloc_obj_type_string(proc->locale->type),
                                idx, node->name);
        }
    }

    if (NULL != nbound) {
        free(nbound);
    }
    return ORTE_SUCCESS;
}

int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
|
||||
{
|
||||
if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding) ||
|
||||
OPAL_BIND_TO_NONE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
|
||||
/* no binding requested */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
if (OPAL_BIND_TO_BOARD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
|
||||
/* doesn't do anything at this time */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* binding requested */
|
||||
/* if the job was mapped by the corresponding target, then
|
||||
* there is nothing more to do - the launch message creator
|
||||
* will see that the binding object is NULL and will simply
|
||||
* use the locale as the place to bind the proc
|
||||
*
|
||||
* otherwise, we have to bind either up or down the hwloc
|
||||
* tree. If we are binding upwards (e.g., mapped to hwthread
|
||||
* but binding to core), then we just climb the tree to find
|
||||
* the first matching object.
|
||||
*
|
||||
* if we are binding downwards (e.g., mapped to node and bind
|
||||
* to core), then we have to do a round-robin assigment of
|
||||
* procs to the resources below.
|
||||
*/
    if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
        int rc;
        /* record the level for locality purposes */
        jdata->map->bind_level = OPAL_HWLOC_HWTHREAD_LEVEL;
        if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                "mca:rmaps: bindings for job %s - hwthread to hwthread",
                                ORTE_JOBID_PRINT(jdata->jobid));
            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_PU, 0))) {
                ORTE_ERROR_LOG(rc);
            }
            return rc;
        }
        /* HW threads are at the bottom, so all other bindings are upwards */
        if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_PU, 0))) {
            ORTE_ERROR_LOG(rc);
        }
        return rc;
    } else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
        int rc;
        /* record the level for locality purposes */
        jdata->map->bind_level = OPAL_HWLOC_CORE_LEVEL;
        if (ORTE_MAPPING_BYCORE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                "mca:rmaps: bindings for job %s - core to core",
                                ORTE_JOBID_PRINT(jdata->jobid));
            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_CORE, 0))) {
                ORTE_ERROR_LOG(rc);
            }
            return rc;
        }
        /* if the mapping policy used is less than bycore, then it is a
         * downward binding - i.e., the locale is above the binding location.
         * for example, if we map-to-socket and bind-to-core, then we compare
         * the mapping value of ORTE_MAPPING_BYCORE to ORTE_MAPPING_BYSOCKET.
         * In this case, BYCORE > BYSOCKET, so we know that the locale is
         * above the desired binding level (sockets are at a higher level than
         * the desired core binding level), and we will have to bind downwards
         */
        if (ORTE_MAPPING_BYCORE > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CORE, 0))) {
                ORTE_ERROR_LOG(rc);
            }
        } else {
            if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_CORE, 0))) {
                ORTE_ERROR_LOG(rc);
            }
        }
        return rc;
    } else if (OPAL_BIND_TO_L1CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
        int rc;
        /* record the level for locality purposes */
        jdata->map->bind_level = OPAL_HWLOC_L1CACHE_LEVEL;
        if (ORTE_MAPPING_BYL1CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                "mca:rmaps: bindings for job %s - L1cache to L1cache",
                                ORTE_JOBID_PRINT(jdata->jobid));
            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_CACHE, 1))) {
                ORTE_ERROR_LOG(rc);
            }
            return rc;
        }
        /* if the mapping policy is less than l1cache, then it is a
         * downward binding
         */
        if (ORTE_MAPPING_BYL1CACHE > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 1))) {
                ORTE_ERROR_LOG(rc);
            }
        } else {
            if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_CACHE, 1))) {
                ORTE_ERROR_LOG(rc);
            }
        }
        return rc;
    } else if (OPAL_BIND_TO_L2CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
        int rc;
        /* record the level for locality purposes */
        jdata->map->bind_level = OPAL_HWLOC_L2CACHE_LEVEL;
        if (ORTE_MAPPING_BYL2CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                "mca:rmaps: bindings for job %s - L2cache to L2cache",
                                ORTE_JOBID_PRINT(jdata->jobid));
            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_CACHE, 2))) {
                ORTE_ERROR_LOG(rc);
            }
            return rc;
        }
        /* if the mapping policy is less than l2cache, then it is a
         * downward binding
         */
        if (ORTE_MAPPING_BYL2CACHE > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 2))) {
                ORTE_ERROR_LOG(rc);
            }
        } else {
            if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_CACHE, 2))) {
                ORTE_ERROR_LOG(rc);
            }
        }
        return rc;
    } else if (OPAL_BIND_TO_L3CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
        int rc;
        /* record the level for locality purposes */
        jdata->map->bind_level = OPAL_HWLOC_L3CACHE_LEVEL;
        if (ORTE_MAPPING_BYL3CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                "mca:rmaps: bindings for job %s - L3cache to L3cache",
                                ORTE_JOBID_PRINT(jdata->jobid));
            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_CACHE, 3))) {
                ORTE_ERROR_LOG(rc);
            }
            return rc;
        }
        /* if the mapping policy is less than l3cache, then it is a
         * downward binding
         */
        if (ORTE_MAPPING_BYL3CACHE > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 3))) {
                ORTE_ERROR_LOG(rc);
            }
        } else {
            if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_CACHE, 3))) {
                ORTE_ERROR_LOG(rc);
            }
        }
        return rc;
    } else if (OPAL_BIND_TO_SOCKET == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
        int rc;
        /* record the level for locality purposes */
        jdata->map->bind_level = OPAL_HWLOC_SOCKET_LEVEL;
        if (ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                "mca:rmaps: bindings for job %s - socket to socket",
                                ORTE_JOBID_PRINT(jdata->jobid));
            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_SOCKET, 0))) {
                ORTE_ERROR_LOG(rc);
            }
            return rc;
        }
        /* if the mapping policy is less than bysocket, then it is a
         * downward binding
         */
        if (ORTE_MAPPING_BYSOCKET > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_SOCKET, 0))) {
                ORTE_ERROR_LOG(rc);
            }
        } else {
            if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_SOCKET, 0))) {
                ORTE_ERROR_LOG(rc);
            }
        }
        return rc;
    } else if (OPAL_BIND_TO_NUMA == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
        int rc;
        /* record the level for locality purposes */
        jdata->map->bind_level = OPAL_HWLOC_NUMA_LEVEL;
        if (ORTE_MAPPING_BYNUMA == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                "mca:rmaps: bindings for job %s - numa to numa",
                                ORTE_JOBID_PRINT(jdata->jobid));
            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_NODE, 0))) {
                ORTE_ERROR_LOG(rc);
            }
            return rc;
        }
        /* if the mapping policy is less than numa, then it is a
         * downward binding
         */
        if (ORTE_MAPPING_BYNUMA > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_NODE, 0))) {
                ORTE_ERROR_LOG(rc);
            }
        } else {
            if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_NODE, 0))) {
                ORTE_ERROR_LOG(rc);
            }
        }
        return rc;
    } else {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED);
        return ORTE_ERR_NOT_SUPPORTED;
    }

    return ORTE_SUCCESS;
}
@ -1,356 +0,0 @@
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2008 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2009      Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "orte_config.h"
#include "orte/constants.h"

#include <string.h>

#include "opal/util/if.h"
#include "opal/util/output.h"
#include "opal/util/opal_sos.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"

#include "orte/util/show_help.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/hostfile/hostfile.h"
#include "orte/util/dash_host/dash_host.h"
#include "orte/mca/errmgr/errmgr.h"

#include "orte/mca/rmaps/base/rmaps_private.h"
#include "orte/mca/rmaps/base/base.h"


/*
 * determine the proper starting point for the next mapping operation
 */
opal_list_item_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list, orte_job_t *jdata)
{
    opal_list_item_t *item, *cur_node_item;
    orte_node_t *node, *nd1, *ndmin;
    int overload;

    /* if a bookmark exists from some prior mapping, set us to start there */
    if (NULL != jdata->bookmark) {
        cur_node_item = NULL;
        /* find this node on the list */
        for (item = opal_list_get_first(node_list);
             item != opal_list_get_end(node_list);
             item = opal_list_get_next(item)) {
            node = (orte_node_t*)item;

            if (node->index == jdata->bookmark->index) {
                cur_node_item = item;
                break;
            }
        }
        /* see if we found it - if not, just start at the beginning */
        if (NULL == cur_node_item) {
            cur_node_item = opal_list_get_first(node_list);
        }
    } else {
        /* if no bookmark, then just start at the beginning of the list */
        cur_node_item = opal_list_get_first(node_list);
    }

    /* is this node fully subscribed? If so, then the first
     * proc we assign will oversubscribe it, so let's look
     * for another candidate
     */
    node = (orte_node_t*)cur_node_item;
    ndmin = node;
    overload = ndmin->slots_inuse - ndmin->slots_alloc;
    if (node->slots_inuse >= node->slots_alloc) {
        /* work down the list - is there another node that
         * would not be oversubscribed?
         */
        if (cur_node_item != opal_list_get_last(node_list)) {
            item = opal_list_get_next(cur_node_item);
        } else {
            item = opal_list_get_first(node_list);
        }
        while (item != cur_node_item) {
            nd1 = (orte_node_t*)item;
            if (nd1->slots_inuse < nd1->slots_alloc) {
                /* this node is not oversubscribed! use it! */
                return item;
            }
            /* this one was also oversubscribed, keep track of the
             * node that has the least usage - if we can't
             * find anyone who isn't fully utilized, we will
             * start with the least used node
             */
            if (overload >= (nd1->slots_inuse - nd1->slots_alloc)) {
                ndmin = nd1;
                overload = ndmin->slots_inuse - ndmin->slots_alloc;
            }
            if (item == opal_list_get_last(node_list)) {
                item = opal_list_get_first(node_list);
            } else {
                item = opal_list_get_next(item);
            }
        }
        /* if we get here, then we cycled all the way around the
         * list without finding a better answer - just use the node
         * that is minimally overloaded
         */
        cur_node_item = (opal_list_item_t*)ndmin;
    }

    return cur_node_item;
}
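/* Worked example (added for clarity, values are hypothetical): given three
 * nodes with (slots_inuse, slots_alloc) of (4,4), (6,4) and (5,4) and the
 * bookmark on the first node, that node is fully subscribed, so we scan the
 * others. None has a free slot, so we compare overload = slots_inuse -
 * slots_alloc (0, 2 and 1 respectively) and return the first node, whose
 * overload of 0 is minimal - the least-loaded place to begin oversubscribing.
 */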

/*
 * Query the registry for all nodes allocated to a specified app_context
 */
int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
                               opal_list_t *node_list, orte_vpid_t num_procs,
                               opal_list_item_t *cur_node_item)
{
    int rc = ORTE_SUCCESS;
    int i;
    orte_node_t *node;
    orte_proc_t *proc;
    opal_list_item_t *next;
    orte_vpid_t num_alloc = 0;
    orte_vpid_t start;
    int num_procs_to_assign, num_possible_procs;

    /* This loop continues until all procs have been mapped or we run
       out of resources. We determine that we have "run out of
       resources" when either all nodes have slots_max processes mapped to them
       (thus there are no free slots for a process to be mapped), OR all nodes
       have reached their soft limit and the user directed us to "no oversubscribe".
       If we still have processes that haven't been mapped yet, then it's an
       "out of resources" error. */

    start = jdata->num_procs;

    while (num_alloc < num_procs) {
        /** see if any nodes remain unused and available. We need to do this check
         * each time since we may remove nodes from the list (as they become fully
         * used) as we cycle through the loop */
        if (0 >= opal_list_get_size(node_list)) {
            /* Everything is at max usage! :( */
            orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:alloc-error",
                           true, num_procs, app->app);
            return ORTE_ERR_SILENT;
        }

        /* Save the next node we can use before claiming slots, since
         * we may need to prune the nodes list removing overused nodes.
         * Wrap around to beginning if we are at the end of the list */
        if (opal_list_get_end(node_list) == opal_list_get_next(cur_node_item)) {
            next = opal_list_get_first(node_list);
        } else {
            next = opal_list_get_next(cur_node_item);
        }

        /** declare a shorter name for convenience in the code below */
        node = (orte_node_t*) cur_node_item;
        /* If we have available slots on this node, claim all of them.
         * If node_slots == 0, assume 1 slot for that node.
         * JJH - is this assumption fully justified?
         *
         * If we are now oversubscribing the nodes, then we still take:
         * (a) if the node has not been used yet, we take a full node_slots
         * (b) if some of the slots are in-use, then we take the number of
         *     remaining slots before hitting the soft limit (node_slots)
         * (c) if we are at or above the soft limit, we take a full node_slots
         *     unless we are loadbalancing, in which case we only take one
         *
         * Note: if node_slots is zero, then we always just take 1 slot
         *
         * We continue this process until either everything is done,
         * or all nodes have hit their hard limit. This algorithm ensures we
         * fully utilize each node before oversubscribing, and preserves the ratio
         * of processes between the nodes thereafter (e.g., if one node has twice as
         * many processes as another before oversubscribing, it will continue
         * to do so after oversubscribing).
         */
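        /* Worked example (added for clarity, values are hypothetical): with
         * two nodes of slots_alloc = 4 where node A already has 2 procs and
         * node B has none, one pass takes the 2 remaining slots on A (case b)
         * and all 4 slots on B (case a); once both are full, later passes
         * take a full 4 per node (case c), preserving the A:B ratio while
         * oversubscribing.
         */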
        if (node->slots_inuse >= node->slots_alloc || 0 == node->slots_inuse) {
            if (0 == node->slots_alloc) {
                num_procs_to_assign = 1;
            } else {
                /* 'num_possible_procs' defines the number of ranks */
                num_possible_procs = node->slots_alloc;
                if (0 == num_possible_procs) {
                    num_procs_to_assign = 1;
                } else {
                    num_procs_to_assign = num_possible_procs;
                }
            }
        } else {
            /* 'num_possible_procs' defines the number of ranks on the node. Each
             * rank occupies one slot. Each slot may represent more than one
             * cpu, depending on the cpus-per-task setting
             */
            num_possible_procs = (node->slots_alloc - node->slots_inuse);
            if (0 == num_possible_procs) {
                num_procs_to_assign = 1;
            } else {
                num_procs_to_assign = num_possible_procs;
            }
        }

        /* check if we are in npernode mode - if so, then set the num_slots_to_take
         * to the num_per_node
         */
        if (0 < jdata->map->npernode) {
            num_procs_to_assign = jdata->map->npernode;
        }

        for (i = 0; i < num_procs_to_assign; ++i) {
            proc = NULL;
            if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
                                                                 jdata->map->cpus_per_rank, app->idx,
                                                                 node_list, jdata->map->oversubscribe,
                                                                 true, &proc))) {
                /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
                 * really isn't an error - we just need to break from the loop
                 * since the node is fully used up. For now, just don't report
                 * an error
                 */
                if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc)) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
            }

            /* assign the vpid */
            proc->name.vpid = start++;

            /* Update the number of procs allocated */
            ++num_alloc;

            /** if all the procs have been mapped, we return */
            if (num_alloc == num_procs) {
                goto complete;
            }

            /* if we have fully used up this node, then break from the loop */
            if (ORTE_ERR_NODE_FULLY_USED == OPAL_SOS_GET_ERROR_CODE(rc)) {
                break;
            }
        }

        /* we move on to the next node in all cases EXCEPT if we came
         * out of the loop without having taken a full bite AND the
         * node is NOT max'd out
         */
        if (i < (num_procs_to_assign-1) &&
            ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc)) {
            continue;
        }
        cur_node_item = next;
    }

complete:
    /* save the bookmark */
    jdata->bookmark = (orte_node_t*)cur_node_item;

    return ORTE_SUCCESS;
}

int orte_rmaps_base_map_bynode(orte_job_t *jdata, orte_app_context_t *app,
                               opal_list_t *node_list, orte_vpid_t num_procs,
                               opal_list_item_t *cur_node_item)
{
    int rc = ORTE_SUCCESS;
    opal_list_item_t *next;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_vpid_t num_alloc = 0;
    orte_vpid_t start;

    /* This loop continues until all procs have been mapped or we run
       out of resources. We determine that we have "run out of
       resources" when all nodes have slots_max processes mapped to them,
       thus there are no free slots for a process to be mapped, or we have
       hit the soft limit on all nodes and are in a "no oversubscribe" state.
       If we still have processes that haven't been mapped yet, then it's an
       "out of resources" error.

       In this scenario, we rely on the claim_slot function to handle the
       oversubscribed case. The claim_slot function will leave a node on the
       list until it either reaches slots_max OR reaches the
       soft limit and the "no_oversubscribe" flag has been set - at which point,
       the node will be removed to prevent any more processes from being mapped to
       it. Since we are taking one slot from each node as we cycle through the
       list, oversubscription is automatically taken care of via this logic.
    */

    start = jdata->num_procs;

    while (num_alloc < num_procs) {
        /** see if any nodes remain unused and available. We need to do this check
         * each time since we may remove nodes from the list (as they become fully
         * used) as we cycle through the loop */
        if (0 >= opal_list_get_size(node_list)) {
            /* No more nodes to allocate :( */
            orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:alloc-error",
                           true, num_procs, app->app);
            return ORTE_ERR_SILENT;
        }

        /* Save the next node we can use before claiming slots, since
         * we may need to prune the nodes list removing overused nodes.
         * Wrap around to beginning if we are at the end of the list */
        if (opal_list_get_end(node_list) == opal_list_get_next(cur_node_item)) {
            next = opal_list_get_first(node_list);
        } else {
            next = opal_list_get_next(cur_node_item);
        }

        /* Allocate a slot on this node */
        node = (orte_node_t*) cur_node_item;
        proc = NULL;
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, jdata->map->cpus_per_rank, app->idx,
                                                             node_list, jdata->map->oversubscribe, true, &proc))) {
            /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
             * really isn't an error - we just need to break from the loop
             * since the node is fully used up. For now, just don't report
             * an error
             */
            if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc)) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
        }

        /* assign the vpid */
        proc->name.vpid = start++;

        /* Update the number of procs allocated */
        ++num_alloc;

        cur_node_item = next;
    }

    /* save the bookmark */
    jdata->bookmark = (orte_node_t*)cur_node_item;

    return ORTE_SUCCESS;
}
@ -9,6 +9,7 @@
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2011      Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -24,6 +25,7 @@
#include "opal/mca/mca.h"
#include "opal/util/output.h"
#include "opal/mca/base/base.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/dss/dss.h"

#include "orte/mca/errmgr/errmgr.h"
@ -51,6 +53,10 @@ int orte_rmaps_base_map_job(orte_job_t *jdata)
     * DO SO, AND ALL PLM COMMANDS ARE RELAYED TO HNP
     */

    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps: mapping job %s",
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* NOTE: CHECK FOR JDATA->MAP == NULL. IF IT IS, THEN USE
     * THE VALUES THAT WERE READ BY THE LOCAL MCA PARAMS. THE
     * PLM PROXY WILL SEND A JOB-OBJECT THAT WILL INCLUDE ANY
@ -71,10 +77,15 @@ int orte_rmaps_base_map_job(orte_job_t *jdata)
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        /* load it with the system defaults */
        map->policy = orte_default_mapping_policy;
        map->mapping = orte_rmaps_base.mapping;
        map->ranking = orte_rmaps_base.ranking;
#if OPAL_HAVE_HWLOC
        map->binding = opal_hwloc_binding_policy;
#endif
        if (NULL != orte_rmaps_base.ppr) {
            map->ppr = strdup(orte_rmaps_base.ppr);
        }
        map->cpus_per_rank = orte_rmaps_base.cpus_per_rank;
        map->stride = orte_rmaps_base.stride;
        map->oversubscribe = orte_rmaps_base.oversubscribe;
        map->display_map = orte_rmaps_base.display_map;
        /* assign the map object to this job */
        jdata->map = map;
@ -82,60 +93,174 @@ int orte_rmaps_base_map_job(orte_job_t *jdata)
        if (!jdata->map->display_map) {
            jdata->map->display_map = orte_rmaps_base.display_map;
        }
        if (!ORTE_MAPPING_POLICY_IS_SET(jdata->map->policy)) {
            jdata->map->policy = jdata->map->policy | orte_default_mapping_policy;
        /* set the default mapping policy IFF it wasn't provided */
        if (!ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
            ORTE_SET_MAPPING_POLICY(jdata->map->mapping, orte_rmaps_base.mapping);
        }
        if (!ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
            ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping));
        }
        /* ditto for rank and bind policies */
        if (!ORTE_RANKING_POLICY_IS_SET(jdata->map->ranking)) {
            ORTE_SET_RANKING_POLICY(jdata->map->ranking, orte_rmaps_base.ranking);
        }
#if OPAL_HAVE_HWLOC
        if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
            jdata->map->binding = opal_hwloc_binding_policy;
        }
#endif
    }

    /* if the job is the daemon job, then we are just mapping daemons and
     * not apps in preparation to launch a virtual machine
     */
    if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps: mapping daemons");
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_setup_virtual_machine(jdata))) {
            ORTE_ERROR_LOG(rc);
        }
        return rc;
    }

    /* cycle thru the available mappers until one agrees to map
     * the job
     */
    did_map = false;
    for (item = opal_list_get_first(&orte_rmaps_base.selected_modules);
         item != opal_list_get_end(&orte_rmaps_base.selected_modules);
         item = opal_list_get_next(item)) {
        mod = (orte_rmaps_base_selected_module_t*)item;
        if (ORTE_SUCCESS == (rc = mod->module->map_job(jdata))) {
            did_map = true;
            break;
        }
        /* mappers return "next option" if they didn't attempt to
         * map the job. anything else is a true error.
         */
        if (ORTE_ERR_TAKE_NEXT_OPTION != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    } else {
        /* cycle thru the available mappers until one agrees to map
         * the job
         */
        did_map = false;
        for (item = opal_list_get_first(&orte_rmaps_base.selected_modules);
             item != opal_list_get_end(&orte_rmaps_base.selected_modules);
             item = opal_list_get_next(item)) {
            mod = (orte_rmaps_base_selected_module_t*)item;
            if (ORTE_SUCCESS == (rc = mod->module->map_job(jdata))) {
                did_map = true;
                break;
            }
            /* mappers return "next option" if they didn't attempt to
             * map the job. anything else is a true error.
             */
            if (ORTE_ERR_TAKE_NEXT_OPTION != rc) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
        }
        /* if we get here without doing the map, or with zero procs in
         * the map, then that's an error
         */
        if (!did_map || 0 == jdata->num_procs) {
            orte_show_help("help-orte-rmaps-base.txt", "failed-map", true);
            return ORTE_ERR_FAILED_TO_MAP;
        }
    }
    /* if we get here without doing the map, or with zero procs in
     * the map, then that's an error
     */
    if (!did_map || 0 == jdata->num_procs) {
        orte_show_help("help-orte-rmaps-base.txt", "failed-map", true);
        return ORTE_ERR_FAILED_TO_MAP;
    }

    /* compute and save local ranks */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* if we wanted to display the map, now is the time to do it */
#if OPAL_HAVE_HWLOC
    /* compute and save bindings */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
#endif

    /* if we wanted to display the map, now is the time to do it - ignore
     * daemon job
     */
    if (jdata->map->display_map) {
        char *output;
        opal_dss.print(&output, NULL, jdata->map, ORTE_JOB_MAP);
        if (orte_xml_output) {
            fprintf(orte_xml_fp, "%s\n", output);
            fflush(orte_xml_fp);
        int i, j;
        orte_node_t *node;
        orte_proc_t *proc;

        if (orte_display_diffable_output) {
            /* intended solely to test mapping methods, this output
             * can become quite long when testing at scale. Rather
             * than enduring all the malloc/free's required to
             * create an arbitrary-length string, custom-generate
             * the output a line at a time here
             */
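            /* Sample of the diffable output emitted below (added for clarity;
             * derived from the format strings that follow - all values are
             * made up):
             *
             *   <map>
             *       <host name=node0>
             *           <process rank=0 app_idx=0 local_rank=0 node_rank=0 locale=0-3 binding=0-1[CORE:0]>
             *       </host>
             *       <locality>
             *           <bind_level=CORE rank=0 bind_idx=0 rank=1 bind_idx=1 locality=...>
             *       </locality>
             *   </map>
             */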
            /* display just the procs in a diffable format */
            opal_output(orte_clean_output, "<map>");
            fflush(stderr);
            /* loop through nodes */
            for (i=0; i < jdata->map->nodes->size; i++) {
                if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) {
                    continue;
                }
                opal_output(orte_clean_output, "\t<host name=%s>", (NULL == node->name) ? "UNKNOWN" : node->name);
                fflush(stderr);
                for (j=0; j < node->procs->size; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
#if OPAL_HAVE_HWLOC
                    {
                        char locale[64];

                        if (NULL != proc->locale) {
                            hwloc_bitmap_list_snprintf(locale, 64, proc->locale->cpuset);
                        }
                        opal_output(orte_clean_output, "\t\t<process rank=%s app_idx=%ld local_rank=%lu node_rank=%lu locale=%s binding=%s[%s:%u]>",
                                    ORTE_VPID_PRINT(proc->name.vpid), (long)proc->app_idx,
                                    (unsigned long)proc->local_rank,
                                    (unsigned long)proc->node_rank, locale,
                                    (NULL == proc->cpu_bitmap) ? "NULL" : proc->cpu_bitmap,
                                    opal_hwloc_base_print_level(jdata->map->bind_level), proc->bind_idx);
                    }
#else
                    opal_output(orte_clean_output, "\t\t<process rank=%s app_idx=%ld local_rank=%lu node_rank=%lu>",
                                ORTE_VPID_PRINT(proc->name.vpid), (long)proc->app_idx,
                                (unsigned long)proc->local_rank,
                                (unsigned long)proc->node_rank);
#endif
                    fflush(stderr);
                }
                opal_output(orte_clean_output, "\t</host>");
                fflush(stderr);
            }
#if OPAL_HAVE_HWLOC
            {
                opal_paffinity_locality_t locality;
                orte_proc_t *p0;

                /* test locality - for the first node, print the locality of each proc relative to the first one */
                node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, 0);
                p0 = (orte_proc_t*)opal_pointer_array_get_item(node->procs, 0);
                opal_output(orte_clean_output, "\t<locality>");
                for (j=1; j < node->procs->size; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
                    locality = opal_hwloc_base_get_relative_locality(node->topology,
                                                                     jdata->map->bind_level,
                                                                     p0->bind_idx,
                                                                     jdata->map->bind_level,
                                                                     proc->bind_idx);
                    opal_output(orte_clean_output, "\t\t<bind_level=%s rank=%s bind_idx=%u rank=%s bind_idx=%u locality=%s>",
                                opal_hwloc_base_print_level(jdata->map->bind_level),
                                ORTE_VPID_PRINT(p0->name.vpid),
                                p0->bind_idx, ORTE_VPID_PRINT(proc->name.vpid),
                                proc->bind_idx, opal_hwloc_base_print_locality(locality));
                }
                opal_output(orte_clean_output, "\t</locality>\n</map>");
                fflush(stderr);
            }
#else
            opal_output(orte_clean_output, "\n</map>");
            fflush(stderr);
#endif
        } else {
            opal_output(orte_clean_output, "%s", output);
            opal_dss.print(&output, NULL, jdata->map, ORTE_JOB_MAP);
            if (orte_xml_output) {
                fprintf(orte_xml_fp, "%s\n", output);
                fflush(orte_xml_fp);
            } else {
                opal_output(orte_clean_output, "%s", output);
            }
            free(output);
        }
        free(output);
    }

    return ORTE_SUCCESS;

@ -30,7 +30,6 @@
#include "opal/util/output.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/paffinity/paffinity.h"

#include "orte/runtime/orte_globals.h"
#include "orte/util/show_help.h"
@ -82,36 +81,231 @@ orte_rmaps_t orte_rmaps = {
 */
int orte_rmaps_base_open(void)
{
    int param, value;
    int param, value, i;
    char *policy;
    bool btmp;
    orte_mapping_policy_t tmp=0;
    orte_ranking_policy_t rtmp=0;
    char **ck, **ck2;
    size_t len;

    /* init the globals */
    OBJ_CONSTRUCT(&orte_rmaps_base.selected_modules, opal_list_t);
    orte_rmaps_base.ppr = NULL;
    orte_rmaps_base.cpus_per_rank = 1;
    orte_rmaps_base.display_map = false;
    orte_rmaps_base.slot_list = NULL;
    orte_rmaps_base.mapping = 0;
    orte_rmaps_base.ranking = 0;

    /* Debugging / verbose output.  Always have stream open, with
       verbose set by the mca open system... */
       verbose set by the mca open system... */
    orte_rmaps_base.rmaps_output = opal_output_open(NULL);

    /* Are we scheduling by node or by slot? */
    param = mca_base_param_reg_string_name("rmaps", "base_schedule_policy",
                                           "Scheduling Policy for RMAPS. [slot (alias:core) | socket | board | node]",
                                           false, false, "slot", &policy);

    /* if something is specified, do not override what may already
     * be present - could have been given on cmd line
     */
    if (0 == strcasecmp(policy, "slot") ||
        0 == strcasecmp(policy, "core")) {
        ORTE_XSET_MAPPING_POLICY(ORTE_MAPPING_BYSLOT);
    } else if (0 == strcasecmp(policy, "socket")) {
        ORTE_XSET_MAPPING_POLICY(ORTE_MAPPING_BYSOCKET);
    } else if (0 == strcasecmp(policy, "board")) {
        ORTE_XSET_MAPPING_POLICY(ORTE_MAPPING_BYBOARD);
    } else if (0 == strcasecmp(policy, "node")) {
        ORTE_XSET_MAPPING_POLICY(ORTE_MAPPING_BYNODE);
    /* define default mapping policy */
    param = mca_base_param_reg_string_name("rmaps", "base_mapping_policy",
#if OPAL_HAVE_HWLOC
                                           "Mapping Policy [slot (default) | hwthread | core | l1cache | l2cache | l3cache | socket | numa | board | node | seq], with allowed modifiers :SPAN,OVERSUBSCRIBE,NOOVERSUBSCRIBE",
#else
                                           "Mapping Policy [slot (default) | node], with allowed modifiers :SPAN,OVERSUBSCRIBE,NOOVERSUBSCRIBE",
#endif
                                           false, false, NULL, &policy);
    mca_base_param_reg_syn_name(param, "rmaps", "base_schedule_policy", true);

    if (NULL == policy) {
        ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYSLOT);
        ORTE_UNSET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
    } else {
        ck = opal_argv_split(policy, ':');
        if (2 < opal_argv_count(ck)) {
            /* incorrect format */
            orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy", true, "mapping", policy);
            opal_argv_free(ck);
            return ORTE_ERR_SILENT;
        }
        if (2 == opal_argv_count(ck)) {
            ck2 = opal_argv_split(ck[1], ',');
            for (i=0; NULL != ck2[i]; i++) {
                if (0 == strncasecmp(ck2[i], "span", strlen(ck2[i]))) {
                    orte_rmaps_base.mapping |= ORTE_MAPPING_SPAN;
                } else if (0 == strncasecmp(ck2[i], "oversubscribe", strlen(ck2[i]))) {
                    if (ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
                        /* error - cannot redefine the default mapping policy */
                        orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
                                       "oversubscribe", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
                        return ORTE_ERR_SILENT;
                    }
                    ORTE_UNSET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
                    ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_SUBSCRIBE_GIVEN);
                } else if (0 == strncasecmp(ck2[i], "nooversubscribe", strlen(ck2[i]))) {
                    if (ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
                        /* error - cannot redefine the default mapping policy */
                        orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
                                       "nooversubscribe", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
                        return ORTE_ERR_SILENT;
                    }
                    ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
                    ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_SUBSCRIBE_GIVEN);
                } else {
                    /* unrecognized modifier */
                    orte_show_help("help-orte-rmaps-base.txt", "unrecognized-modifier", true, "mapping", ck2[i]);
                    opal_argv_free(ck);
                    opal_argv_free(ck2);
                    return ORTE_ERR_SILENT;
                }
            }
            opal_argv_free(ck2);
        }
        len = strlen(ck[0]);
        if (0 == strncasecmp(ck[0], "slot", len)) {
            tmp = ORTE_MAPPING_BYSLOT;
        } else if (0 == strncasecmp(ck[0], "node", len)) {
            tmp = ORTE_MAPPING_BYNODE;
#if OPAL_HAVE_HWLOC
        } else if (0 == strncasecmp(ck[0], "core", len)) {
            tmp = ORTE_MAPPING_BYCORE;
        } else if (0 == strncasecmp(ck[0], "l1cache", len)) {
            tmp = ORTE_MAPPING_BYL1CACHE;
        } else if (0 == strncasecmp(ck[0], "l2cache", len)) {
            tmp = ORTE_MAPPING_BYL2CACHE;
        } else if (0 == strncasecmp(ck[0], "l3cache", len)) {
            tmp = ORTE_MAPPING_BYL3CACHE;
        } else if (0 == strncasecmp(ck[0], "socket", len)) {
            tmp = ORTE_MAPPING_BYSOCKET;
        } else if (0 == strncasecmp(ck[0], "numa", len)) {
            tmp = ORTE_MAPPING_BYNUMA;
        } else if (0 == strncasecmp(ck[0], "board", len)) {
            tmp = ORTE_MAPPING_BYBOARD;
        } else if (0 == strncasecmp(ck[0], "hwthread", len)) {
            tmp = ORTE_MAPPING_BYHWTHREAD;
            /* if we are mapping processes to individual hwthreads, then
             * we need to treat those hwthreads as separate cpus
             */
            opal_hwloc_use_hwthreads_as_cpus = true;
#endif
        } else {
            orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy", true, "mapping", policy);
            opal_argv_free(ck);
            return ORTE_ERR_SILENT;
        }
        ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, tmp);
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
        opal_argv_free(ck);
    }
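    /* Usage example (added for clarity): with the parsing above, a mapping
     * policy plus modifiers can be selected through the new MCA parameter,
     * e.g.
     *
     *     mpirun -mca rmaps_base_mapping_policy l2cache:span,oversubscribe ...
     *
     * which sets ORTE_MAPPING_BYL2CACHE, adds the SPAN directive, and lifts
     * the no-oversubscribe restriction.
     */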

    /* define default ranking policy */
    param = mca_base_param_reg_string_name("rmaps", "base_ranking_policy",
#if OPAL_HAVE_HWLOC
                                           "Ranking Policy [slot (default) | hwthread | core | l1cache | l2cache | l3cache | socket | numa | board | node], with modifier :SPAN or :FILL",
#else
                                           "Ranking Policy [slot (default) | node]",
#endif
                                           false, false, NULL, &policy);
    if (NULL == policy) {
        ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, ORTE_RANK_BY_SLOT);
    } else {
        ck = opal_argv_split(policy, ':');
        if (2 < opal_argv_count(ck)) {
            /* incorrect format */
            orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy", true, "ranking", policy);
            opal_argv_free(ck);
            return ORTE_ERR_SILENT;
        }
        if (2 == opal_argv_count(ck)) {
            if (0 == strncasecmp(ck[1], "span", strlen(ck[1]))) {
                orte_rmaps_base.ranking |= ORTE_RANKING_SPAN;
            } else if (0 == strncasecmp(ck[1], "fill", strlen(ck[1]))) {
                orte_rmaps_base.ranking |= ORTE_RANKING_FILL;
            } else {
                /* unrecognized modifier */
                orte_show_help("help-orte-rmaps-base.txt", "unrecognized-modifier", true, "ranking", ck[1]);
                opal_argv_free(ck);
                return ORTE_ERR_SILENT;
            }
        }
        len = strlen(ck[0]);
        if (0 == strncasecmp(ck[0], "slot", len)) {
            rtmp = ORTE_RANK_BY_SLOT;
        } else if (0 == strncasecmp(ck[0], "node", len)) {
            rtmp = ORTE_RANK_BY_NODE;
#if OPAL_HAVE_HWLOC
        } else if (0 == strncasecmp(ck[0], "hwthread", len)) {
            rtmp = ORTE_RANK_BY_HWTHREAD;
        } else if (0 == strncasecmp(ck[0], "core", len)) {
            rtmp = ORTE_RANK_BY_CORE;
        } else if (0 == strncasecmp(ck[0], "l1cache", len)) {
            rtmp = ORTE_RANK_BY_L1CACHE;
        } else if (0 == strncasecmp(ck[0], "l2cache", len)) {
            rtmp = ORTE_RANK_BY_L2CACHE;
        } else if (0 == strncasecmp(ck[0], "l3cache", len)) {
            rtmp = ORTE_RANK_BY_L3CACHE;
        } else if (0 == strncasecmp(ck[0], "socket", len)) {
            rtmp = ORTE_RANK_BY_SOCKET;
        } else if (0 == strncasecmp(ck[0], "numa", len)) {
            rtmp = ORTE_RANK_BY_NUMA;
        } else if (0 == strncasecmp(ck[0], "board", len)) {
            rtmp = ORTE_RANK_BY_BOARD;
#endif
        } else {
            orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy", true, "ranking", policy);
            return ORTE_ERR_SILENT;
        }
        ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, rtmp);
        ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, ORTE_RANKING_GIVEN);
    }
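    /* Usage example (added for clarity): ranking is selected the same way,
     * e.g.
     *
     *     mpirun -mca rmaps_base_ranking_policy socket:fill ...
     *
     * which sets ORTE_RANK_BY_SOCKET with the FILL directive, so ranks fill
     * one socket before moving to the next.
     */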

    /* backward compatibility */
    mca_base_param_reg_int_name("rmaps", "base_byslot",
                                "Whether to map and rank processes round-robin by slot",
                                false, false, (int)false, &value);
    if (value) {
        /* set mapping policy to byslot - error if something else already set */
        if ((ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) &&
            ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) != ORTE_MAPPING_BYSLOT) {
            /* error - cannot redefine the default mapping policy */
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
                           "byslot", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
            return ORTE_ERR_SILENT;
        }
        ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYSLOT);
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
        /* set ranking policy to byslot - error if something else already set */
        if ((ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(orte_rmaps_base.ranking)) &&
            ORTE_GET_RANKING_POLICY(orte_rmaps_base.ranking) != ORTE_RANK_BY_SLOT) {
            /* error - cannot redefine the default ranking policy */
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "ranking",
                           "byslot", orte_rmaps_base_print_ranking(orte_rmaps_base.ranking));
            return ORTE_ERR_SILENT;
        }
        ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, ORTE_RANK_BY_SLOT);
        ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, ORTE_RANKING_GIVEN);
    }
    mca_base_param_reg_int_name("rmaps", "base_bynode",
                                "Whether to map and rank processes round-robin by node",
                                false, false, (int)false, &value);
    if (value) {
        /* set mapping policy to bynode - error if something else already set */
        if ((ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) &&
            ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) != ORTE_MAPPING_BYNODE) {
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
                           "bynode", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
            return ORTE_ERR_SILENT;
        }
        ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYNODE);
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
        /* set ranking policy to bynode - error if something else already set */
        if ((ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(orte_rmaps_base.ranking)) &&
            ORTE_GET_RANKING_POLICY(orte_rmaps_base.ranking) != ORTE_RANK_BY_NODE) {
            /* error - cannot redefine the default ranking policy */
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "ranking",
                           "bynode", orte_rmaps_base_print_ranking(orte_rmaps_base.ranking));
            return ORTE_ERR_SILENT;
        }
        ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, ORTE_RANK_BY_NODE);
        ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, ORTE_RANKING_GIVEN);
    }

#if 0
    /* #cpus/rank to use */
    param = mca_base_param_reg_int_name("rmaps", "base_cpus_per_proc",
                                        "Number of cpus to use for each rank [1-2**15 (default=1)]",
@ -119,38 +313,21 @@ int orte_rmaps_base_open(void)
    mca_base_param_reg_syn_name(param, "rmaps", "base_cpus_per_rank", false);
    mca_base_param_lookup_int(param, &value);
    orte_rmaps_base.cpus_per_rank = value;
    /* if the #cpus/rank > #cpus/socket, politely tell the user and abort
     *
     * NOTE: have to check that the default_num_cores_per_socket was set
     * as ompi_info doesn't call the ess init function, and thus might
     * leave this value at its default of zero
     */
    if (0 < orte_default_num_cores_per_socket &&
        orte_rmaps_base.cpus_per_rank > orte_default_num_cores_per_socket) {
        orte_show_help("help-orte-rmaps-base.txt", "too-many-cpus-per-rank",
                       true, orte_rmaps_base.cpus_per_rank,
                       orte_default_num_cores_per_socket);
        return ORTE_ERR_SILENT;
    }
    /* if the cpus/rank > 1, then we have to bind to cores UNLESS the binding has
     * already been set to something else
     */
    if (1 < orte_rmaps_base.cpus_per_rank) {
        ORTE_XSET_BINDING_POLICY(ORTE_BIND_TO_CORE);
    if (1 < orte_rmaps_base.cpus_per_rank &&
        !OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) {
        opal_hwloc_binding_policy |= OPAL_BIND_TO_CORE;
    }

    /* stride to use */
    param = mca_base_param_reg_int_name("rmaps", "base_stride",
                                        "When binding multiple cores to a rank, the step size to use between cores [1-2**15 (default: 1)]",
                                        false, false, 1, &value);
    orte_rmaps_base.stride = value;

#endif

    /* Should we schedule on the local node or not? */
    mca_base_param_reg_int_name("rmaps", "base_no_schedule_local",
                                "If false, allow scheduling MPI applications on the same node as mpirun (default). If true, do not schedule any MPI applications on the same node as mpirun",
                                false, false, (int)false, &value);
    if (value) {
        orte_default_mapping_policy |= ORTE_MAPPING_NO_USE_LOCAL;
        orte_rmaps_base.mapping |= ORTE_MAPPING_NO_USE_LOCAL;
    }

    /* Should we oversubscribe or not? */
@ -159,11 +336,33 @@ int orte_rmaps_base_open(void)
                                "If true, then do not allow oversubscription of nodes - mpirun will return an error if there aren't enough nodes to launch all processes without oversubscribing",
                                false, false, (int)false, &value);
    if (value) {
        orte_rmaps_base.oversubscribe = false;
    } else {
        orte_rmaps_base.oversubscribe = true;
        if ((ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) &&
            !(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
            /* error - cannot redefine the default mapping policy */
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
                           "no-oversubscribe", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
            return ORTE_ERR_SILENT;
        }
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_SUBSCRIBE_GIVEN);
    }


    /** force oversubscription permission */
    mca_base_param_reg_int_name("rmaps", "base_oversubscribe",
                                "If true, then allow oversubscription of nodes",
                                false, false, (int)false, &value);
    if (value) {
        if ((ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) &&
            (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
            /* error - cannot redefine the default mapping policy */
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
                           "oversubscribe", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
            return ORTE_ERR_SILENT;
        }
        ORTE_UNSET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_SUBSCRIBE_GIVEN);
    }

    /* should we display the map after determining it? */
    mca_base_param_reg_int_name("rmaps", "base_display_map",
                                "Whether to display the process map after it is computed",
@ -200,11 +399,18 @@ int orte_rmaps_base_open(void)
        mca_base_components_open("rmaps", orte_rmaps_base.rmaps_output,
                                 mca_rmaps_base_static_components,
                                 &orte_rmaps_base.available_components, true)) {
        return ORTE_ERROR;
        return ORTE_ERROR;
    }

    /* check to see if any component indicated a problem */
    if (ORTE_MAPPING_CONFLICTED & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
        /* the component would have already reported the error, so
         * tell the rest of the chain to shut up
         */
        return ORTE_ERR_SILENT;
    }

    /* All done */

    return ORTE_SUCCESS;
}

222  orte/mca/rmaps/base/rmaps_base_print_fns.c  (new file)
@ -0,0 +1,222 @@
/*
 * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2011 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2011      Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "orte_config.h"
#include "orte/constants.h"

#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif  /* HAVE_UNISTD_H */
#include <string.h>

#include "opal/util/if.h"
#include "opal/util/output.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/threads/tsd.h"

#include "orte/types.h"
#include "orte/util/show_help.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/hostfile/hostfile.h"
#include "orte/util/dash_host/dash_host.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/runtime/data_type_support/orte_dt_support.h"

#include "orte/mca/rmaps/base/rmaps_private.h"
#include "orte/mca/rmaps/base/base.h"

#define ORTE_RMAPS_PRINT_MAX_SIZE   50
#define ORTE_RMAPS_PRINT_NUM_BUFS   16

static bool fns_init=false;
static opal_tsd_key_t print_tsd_key;
static char* orte_rmaps_print_null = "NULL";
typedef struct {
    char *buffers[ORTE_RMAPS_PRINT_NUM_BUFS];
    int cntr;
} orte_rmaps_print_buffers_t;

static void buffer_cleanup(void *value)
{
    int i;
    orte_rmaps_print_buffers_t *ptr;

    if (NULL != value) {
        ptr = (orte_rmaps_print_buffers_t*)value;
        for (i=0; i < ORTE_RMAPS_PRINT_NUM_BUFS; i++) {
            free(ptr->buffers[i]);
        }
    }
}

static orte_rmaps_print_buffers_t *get_print_buffer(void)
{
    orte_rmaps_print_buffers_t *ptr;
    int ret, i;

    if (!fns_init) {
        /* setup the print_args function */
        if (ORTE_SUCCESS != (ret = opal_tsd_key_create(&print_tsd_key, buffer_cleanup))) {
            ORTE_ERROR_LOG(ret);
            return NULL;
        }
        fns_init = true;
    }

    ret = opal_tsd_getspecific(print_tsd_key, (void**)&ptr);
    if (OPAL_SUCCESS != ret) return NULL;

    if (NULL == ptr) {
        ptr = (orte_rmaps_print_buffers_t*)malloc(sizeof(orte_rmaps_print_buffers_t));
        for (i=0; i < ORTE_RMAPS_PRINT_NUM_BUFS; i++) {
            ptr->buffers[i] = (char *) malloc((ORTE_RMAPS_PRINT_MAX_SIZE+1) * sizeof(char));
        }
        ptr->cntr = 0;
        ret = opal_tsd_setspecific(print_tsd_key, (void*)ptr);
    }

    return (orte_rmaps_print_buffers_t*) ptr;
}
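/* Usage note (added for clarity): the per-thread ring of
 * ORTE_RMAPS_PRINT_NUM_BUFS buffers lets several print results coexist in a
 * single output statement without clobbering each other, e.g.
 *
 *     opal_output(0, "map: %s rank: %s",
 *                 orte_rmaps_base_print_mapping(map->mapping),
 *                 orte_rmaps_base_print_ranking(map->ranking));
 *
 * Each call to orte_rmaps_base_print_mapping consumes the next buffer in the
 * ring, so up to 16 results remain valid at once on a given thread.
 */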

char* orte_rmaps_base_print_mapping(orte_mapping_policy_t mapping)
{
    char *ret, *map, *mymap, *tmp;
    orte_rmaps_print_buffers_t *ptr;

    if (ORTE_MAPPING_CONFLICTED & ORTE_GET_MAPPING_DIRECTIVE(mapping)) {
        return "CONFLICTED";
    }

    ptr = get_print_buffer();
    if (NULL == ptr) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return orte_rmaps_print_null;
    }
    /* cycle around the ring */
    if (ORTE_RMAPS_PRINT_NUM_BUFS == ptr->cntr) {
        ptr->cntr = 0;
    }

    switch(ORTE_GET_MAPPING_POLICY(mapping)) {
    case ORTE_MAPPING_BYNODE:
        map = "BYNODE";
        break;
    case ORTE_MAPPING_BYBOARD:
        map = "BYBOARD";
        break;
    case ORTE_MAPPING_BYNUMA:
        map = "BYNUMA";
        break;
    case ORTE_MAPPING_BYSOCKET:
        map = "BYSOCKET";
        break;
    case ORTE_MAPPING_BYL3CACHE:
        map = "BYL3CACHE";
        break;
    case ORTE_MAPPING_BYL2CACHE:
        map = "BYL2CACHE";
        break;
    case ORTE_MAPPING_BYL1CACHE:
        map = "BYL1CACHE";
        break;
    case ORTE_MAPPING_BYCORE:
        map = "BYCORE";
        break;
    case ORTE_MAPPING_BYHWTHREAD:
        map = "BYHWTHREAD";
        break;
    case ORTE_MAPPING_BYSLOT:
        map = "BYSLOT";
        break;
    case ORTE_MAPPING_SEQ:
        map = "SEQUENTIAL";
        break;
    case ORTE_MAPPING_BYUSER:
        map = "BYUSER";
        break;
    default:
        if (ORTE_MAPPING_PPR & ORTE_GET_MAPPING_DIRECTIVE(mapping)) {
            map = "PPR";
        } else {
            map = "UNKNOWN";
        }
    }
    if (0 != strcmp(map, "PPR") && (ORTE_MAPPING_PPR & ORTE_GET_MAPPING_DIRECTIVE(mapping))) {
        asprintf(&mymap, "%s[PPR]:", map);
    } else {
        asprintf(&mymap, "%s:", map);
    }
    if (ORTE_MAPPING_NO_USE_LOCAL & ORTE_GET_MAPPING_DIRECTIVE(mapping)) {
        asprintf(&tmp, "%sNO_USE_LOCAL,", mymap);
        free(mymap);
        mymap = tmp;
    }
    if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(mapping)) {
        asprintf(&tmp, "%sNOOVERSUBSCRIBE,", mymap);
        free(mymap);
        mymap = tmp;
    } else if (ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(mapping)) {
        asprintf(&tmp, "%sOVERSUBSCRIBE,", mymap);
        free(mymap);
        mymap = tmp;
    }
    if (ORTE_MAPPING_SPAN & ORTE_GET_MAPPING_DIRECTIVE(mapping)) {
        asprintf(&tmp, "%sSPAN,", mymap);
        free(mymap);
        mymap = tmp;
    }

    /* remove the trailing mark */
    mymap[strlen(mymap)-1] = '\0';

    snprintf(ptr->buffers[ptr->cntr], ORTE_RMAPS_PRINT_MAX_SIZE, "%s", mymap);
    free(mymap);
    ret = ptr->buffers[ptr->cntr];
    ptr->cntr++;

    return ret;
}

char* orte_rmaps_base_print_ranking(orte_ranking_policy_t ranking)
{
    switch(ORTE_GET_RANKING_POLICY(ranking)) {
    case ORTE_RANK_BY_NODE:
        return "NODE";
    case ORTE_RANK_BY_BOARD:
        return "BOARD";
    case ORTE_RANK_BY_NUMA:
        return "NUMA";
    case ORTE_RANK_BY_SOCKET:
        return "SOCKET";
    case ORTE_RANK_BY_CORE:
        return "CORE";
    case ORTE_RANK_BY_HWTHREAD:
        return "HWTHREAD";
    case ORTE_RANK_BY_SLOT:
        return "SLOT";
    default:
        return "UNKNOWN";
    }
}
737  orte/mca/rmaps/base/rmaps_base_ranking.c  (new file)
@ -0,0 +1,737 @@
/*
 * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2011 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2011      Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "orte_config.h"
#include "orte/constants.h"

#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif  /* HAVE_UNISTD_H */
#include <string.h>

#include "opal/class/opal_pointer_array.h"
#include "opal/util/if.h"
#include "opal/util/output.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/threads/tsd.h"

#include "orte/types.h"
#include "orte/util/show_help.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/hostfile/hostfile.h"
#include "orte/util/dash_host/dash_host.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/runtime/data_type_support/orte_dt_support.h"

#include "orte/mca/rmaps/base/rmaps_private.h"
#include "orte/mca/rmaps/base/base.h"

#if OPAL_HAVE_HWLOC
static int rank_span(orte_job_t *jdata,
                     hwloc_obj_type_t target,
                     unsigned cache_level)
{
    orte_job_map_t *map;
    hwloc_obj_t obj;
    int num_objs, i, j, n, rc;
    orte_vpid_t num_ranked=0;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_vpid_t vpid;

    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:rank_span: for job %s",
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* if the ranking is spanned, then we perform the
     * ranking as if it was one big node - i.e., we
     * rank one proc on each object, step to the next object
     * moving across all the nodes, then wrap around to the
     * first object on the first node.
     *
     *     Node 0          Node 1
     *     Obj 0   Obj 1   Obj 0   Obj 1
     *     0 4     1 5     2 6     3 7
     *     8 12    9 13    10 14   11 15
     */

    /* In the interest of getting this committed in finite time,
     * just loop across the nodes and objects until all procs
     * are mapped
     */

    map = jdata->map;
    vpid = 0;
    while (vpid < jdata->num_procs) {
        for (n=0; n < map->nodes->size && vpid < jdata->num_procs; n++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) {
                continue;
            }
            /* get the number of objects - only consider those we can actually use */
            num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target,
                                                          cache_level, OPAL_HWLOC_AVAILABLE);
            opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                "mca:rmaps:rank_span: found %d objects on node %s with %d procs",
                                num_objs, node->name, (int)node->num_procs);

            /* for each object */
            for (i=0; i < num_objs && vpid < jdata->num_procs; i++) {
                obj = opal_hwloc_base_get_obj_by_type(node->topology, target,
                                                      cache_level, i, OPAL_HWLOC_AVAILABLE);

                opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                    "mca:rmaps:rank_span: working object %d", i);

                /* cycle thru the procs on this node */
                for (j=0; j < node->procs->size && vpid < jdata->num_procs; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
                    /* ignore procs from other jobs */
                    if (proc->name.jobid != jdata->jobid) {
                        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                            "mca:rmaps:rank_span skipping proc %s - from another job, num_ranked %d",
                                            ORTE_NAME_PRINT(&proc->name), num_ranked);
                        continue;
                    }
                    /* ignore procs that are already assigned */
                    if (ORTE_VPID_INVALID != proc->name.vpid) {
                        continue;
                    }
                    /* protect against bozo case */
                    if (NULL == proc->locale) {
                        ORTE_ERROR_LOG(ORTE_ERROR);
                        return ORTE_ERROR;
                    }
                    /* ignore procs not on this object */
                    if (!hwloc_bitmap_intersects(obj->cpuset, proc->locale->cpuset)) {
                        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                            "mca:rmaps:rank_span: proc at position %d is not on object %d",
                                            j, i);
                        continue;
                    }
                    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                        "mca:rmaps:rank_span: assigning vpid %s", ORTE_VPID_PRINT(vpid));
                    proc->name.vpid = vpid++;
                    ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
                    ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));

                    /* If there is an invalid epoch here, it's because it doesn't exist yet. */
                    if (0 == ORTE_EPOCH_CMP(ORTE_EPOCH_INVALID,proc->name.epoch)) {
                        ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
                    }
                    if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
                        ORTE_ERROR_LOG(rc);
                        return rc;
                    }
                    /* move to next object */
                    break;
                }
            }
        }
    }

    return ORTE_SUCCESS;
}

static int rank_fill(orte_job_t *jdata,
                     hwloc_obj_type_t target,
                     unsigned cache_level)
{
    orte_job_map_t *map;
    hwloc_obj_t obj;
    int num_objs, i, j, n, rc;
    orte_vpid_t num_ranked=0;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_vpid_t vpid;

    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:rank_fill: for job %s",
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* if the ranking is fill, then we rank all the procs
     * within a given object before moving on to the next
     *
     *        Node 0                Node 1
     *    Obj 0    Obj 1        Obj 0    Obj 1
     *     0 1      4 5          8 9     12 13
     *     2 3      6 7         10 11    14 15
     */

    map = jdata->map;
    vpid = 0;
    for (n=0; n < map->nodes->size && vpid < jdata->num_procs; n++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) {
            continue;
        }
        /* get the number of objects - only consider those we can actually use */
        num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target,
                                                      cache_level, OPAL_HWLOC_AVAILABLE);
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:rank_fill: found %d objects on node %s with %d procs",
                            num_objs, node->name, (int)node->num_procs);

        /* for each object */
        for (i=0; i < num_objs && vpid < jdata->num_procs; i++) {
            obj = opal_hwloc_base_get_obj_by_type(node->topology, target,
                                                  cache_level, i, OPAL_HWLOC_AVAILABLE);

            opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                "mca:rmaps:rank_fill: working object %d", i);

            /* cycle thru the procs on this node */
            for (j=0; j < node->procs->size && vpid < jdata->num_procs; j++) {
                if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                    continue;
                }
                /* ignore procs from other jobs */
                if (proc->name.jobid != jdata->jobid) {
                    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                        "mca:rmaps:rank_fill skipping proc %s - from another job, num_ranked %d",
                                        ORTE_NAME_PRINT(&proc->name), num_ranked);
                    continue;
                }
                /* ignore procs that are already assigned */
                if (ORTE_VPID_INVALID != proc->name.vpid) {
                    continue;
                }
                /* protect against bozo case */
                if (NULL == proc->locale) {
                    ORTE_ERROR_LOG(ORTE_ERROR);
                    return ORTE_ERROR;
                }
                /* ignore procs not on this object */
                if (!hwloc_bitmap_intersects(obj->cpuset, proc->locale->cpuset)) {
                    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                        "mca:rmaps:rank_fill: proc at position %d is not on object %d",
                                        j, i);
                    continue;
                }
                opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                    "mca:rmaps:rank_fill: assigning vpid %s", ORTE_VPID_PRINT(vpid));
                proc->name.vpid = vpid++;
                ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
                ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));

                /* If there is an invalid epoch here, it's because it doesn't exist yet. */
                if (0 == ORTE_EPOCH_CMP(ORTE_EPOCH_INVALID,proc->name.epoch)) {
                    ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
                }
                if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
            }
        }
    }

    return ORTE_SUCCESS;
}

static int rank_by(orte_job_t *jdata,
                   hwloc_obj_type_t target,
                   unsigned cache_level)
{
    orte_job_map_t *map;
    hwloc_obj_t obj;
    int num_objs, i, j, n;
    orte_vpid_t num_ranked=0;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_vpid_t vpid;
    opal_pointer_array_t objs;
    bool all_done;

    if (ORTE_RANKING_SPAN & ORTE_GET_RANKING_DIRECTIVE(jdata->map->ranking)) {
        return rank_span(jdata, target, cache_level);
    } else if (ORTE_RANKING_FILL & ORTE_GET_RANKING_DIRECTIVE(jdata->map->ranking)) {
        return rank_fill(jdata, target, cache_level);
    }

    /* if ranking is not spanned or filled, then we
     * default to assigning ranks sequentially across
     * target objects within a node until that node
     * is fully ranked, and then move on to the next
     * node
     *
     *        Node 0                Node 1
     *    Obj 0    Obj 1        Obj 0    Obj 1
     *     0 2      1 3          8 10     9 11
     *     4 6      5 7         12 14    13 15
     */

    /* setup the pointer array */
    OBJ_CONSTRUCT(&objs, opal_pointer_array_t);
    opal_pointer_array_init(&objs, 2, INT_MAX, 2);
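    /* (per the opal_pointer_array_init signature, the arguments are:
     * array, initial allocation, max size, and block growth increment)
     */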

    map = jdata->map;
    vpid = 0;
    for (n=0; n < map->nodes->size && vpid < jdata->num_procs; n++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) {
            continue;
        }
        /* get the number of objects - only consider those we can actually use */
        num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target,
                                                      cache_level, OPAL_HWLOC_AVAILABLE);
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:rank_by: found %d objects on node %s with %d procs",
                            num_objs, node->name, (int)node->num_procs);
        /* collect all the objects */
        for (i=0; i < num_objs; i++) {
            obj = opal_hwloc_base_get_obj_by_type(node->topology, target,
                                                  cache_level, i, OPAL_HWLOC_AVAILABLE);
            opal_pointer_array_set_item(&objs, i, obj);
        }

        /* cycle across the objects, assigning a proc to each one,
         * until all procs have been assigned - unfortunately, since
         * jobs other than this one may be mapped onto a node, the number
         * of procs on the node can't be used to tell us when we
         * are done. Instead, we have to just keep going until all
         * procs are ranked - which means we have to make one extra
         * pass thru the loop
         *
         * Perhaps someday someone will come up with a more efficient
         * algorithm, but this works for now.
         */
        all_done = false;
        while (!all_done && vpid < jdata->num_procs) {
            all_done = true;
            /* cycle across the objects */
            for (i=0; i < num_objs && vpid < jdata->num_procs; i++) {
                obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i);

                /* find the next proc on this object */
                for (j=0; j < node->procs->size && vpid < jdata->num_procs; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
                    /* ignore procs from other jobs */
                    if (proc->name.jobid != jdata->jobid) {
                        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                            "mca:rmaps:rank_by skipping proc %s - from another job, num_ranked %d",
                                            ORTE_NAME_PRINT(&proc->name), num_ranked);
                        continue;
                    }
                    /* ignore procs that are already ranked */
                    if (ORTE_VPID_INVALID != proc->name.vpid) {
                        continue;
                    }
                    /* ignore procs on other objects */
                    if (!hwloc_bitmap_intersects(obj->cpuset, proc->locale->cpuset)) {
                        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                            "mca:rmaps:rank_by: proc at position %d is not on object %d",
                                            j, i);
                        continue;
                    }
                    proc->name.vpid = vpid++;
                    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                        "mca:rmaps:rank_by: assigned rank %s", ORTE_VPID_PRINT(proc->name.vpid));
                    ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
                    ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));

                    /* If there is an invalid epoch here, it's because it doesn't exist yet. */
                    if (0 == ORTE_EPOCH_CMP(ORTE_EPOCH_INVALID,proc->name.epoch)) {
                        ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
                    }
                    /* flag that one was mapped */
                    all_done = false;
                    /* move to next object */
                    break;
                }
            }
        }
    }

    /* cleanup */
    OBJ_DESTRUCT(&objs);

    return ORTE_SUCCESS;
}
#endif

int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
{
    orte_job_map_t *map;
    orte_vpid_t vpid, cnt;
    int i, j;
    orte_node_t *node;
    orte_proc_t *proc, *ptr;
    int rc;

    map = jdata->map;

    if (ORTE_RANK_BY_NODE == ORTE_GET_RANKING_POLICY(map->ranking) ||
        ORTE_RANK_BY_BOARD == ORTE_GET_RANKING_POLICY(map->ranking)) {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:base: computing vpids by node for job %s",
                            ORTE_JOBID_PRINT(jdata->jobid));
        /* assign the ranks round-robin across nodes - only one board/node
         * at this time, so they are equivalent
         */
        cnt=0;
        vpid=0;
        while (cnt < jdata->num_procs) {
            for (i=0; i < map->nodes->size; i++) {
                if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
                    continue;
                }
                for (j=0; j < node->procs->size; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
                    /* ignore procs from other jobs */
                    if (proc->name.jobid != jdata->jobid) {
                        continue;
                    }
                    if (ORTE_VPID_INVALID != proc->name.vpid) {
                        /* vpid was already assigned, probably by the
                         * round-robin mapper. Some mappers require that
                         * we insert the proc into the jdata->procs
                         * array, while others will have already done it - so check and
                         * do the operation if required
                         */
                        if (NULL == opal_pointer_array_get_item(jdata->procs, proc->name.vpid)) {
                            if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
                                ORTE_ERROR_LOG(rc);
                                return rc;
                            }
                            /* if we added it to the array, then account for
                             * it in our loop - otherwise don't as we would be
                             * double counting
                             */
                            cnt++;
                        }
                        continue;
                    }
                    /* find next available vpid */
                    while (NULL != (ptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, vpid)) &&
                           ORTE_VPID_INVALID != ptr->name.vpid) {
                        vpid++;
                    }
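                    /* at this point vpid indexes the first slot in
                     * jdata->procs not already claimed by an
                     * earlier-ranked proc
                     */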
                    proc->name.vpid = vpid++;
                    ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
                    ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
                    /* insert the proc into the jdata->procs array - can't already
                     * be there as the only way to this point in the code is for the
                     * vpid to have been INVALID
                     */
                    if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
                        ORTE_ERROR_LOG(rc);
                        return rc;
                    }
                    cnt++;
                    break;  /* move on to next node */
                }
            }
        }
        return ORTE_SUCCESS;
    }

    if (ORTE_RANK_BY_SLOT == ORTE_GET_RANKING_POLICY(map->ranking)) {
        /* assign the ranks sequentially */
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:base: computing vpids by slot for job %s",
                            ORTE_JOBID_PRINT(jdata->jobid));
        vpid = 0;
        for (i=0; i < map->nodes->size; i++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
                continue;
            }
            for (j=0; j < node->procs->size; j++) {
                if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                    continue;
                }
                /* ignore procs from other jobs */
                if (proc->name.jobid != jdata->jobid) {
                    continue;
                }
                if (ORTE_VPID_INVALID == proc->name.vpid) {
                    /* find the next available vpid */
                    while (NULL != (ptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, vpid)) &&
                           ORTE_VPID_INVALID != ptr->name.vpid) {
                        vpid++;
                    }
                    proc->name.vpid = vpid++;
                    ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
                    ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));

                    /* If there is an invalid epoch here, it's because it doesn't exist yet. */
                    if (0 == ORTE_EPOCH_CMP(ORTE_EPOCH_INVALID,proc->name.epoch)) {
                        ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
                    }
                }
                /* some mappers require that we insert the proc into the jdata->procs
                 * array, while others will have already done it - so check and
                 * do the operation if required
                 */
                if (NULL == opal_pointer_array_get_item(jdata->procs, proc->name.vpid)) {
                    if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
                        ORTE_ERROR_LOG(rc);
                        return rc;
                    }
                }
            }
        }
        return ORTE_SUCCESS;
    }

#if OPAL_HAVE_HWLOC
    if (ORTE_RANK_BY_NUMA == ORTE_GET_RANKING_POLICY(map->ranking)) {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps: computing ranks by NUMA for job %s",
                            ORTE_JOBID_PRINT(jdata->jobid));
        if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_NODE, 0))) {
            ORTE_ERROR_LOG(rc);
        }
        return rc;
    }
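    /* (note: in this generation of the hwloc API, HWLOC_OBJ_NODE denotes
     * a NUMA node, which is why it is the target object for by-NUMA
     * ranking above)
     */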

    if (ORTE_RANK_BY_SOCKET == ORTE_GET_RANKING_POLICY(map->ranking)) {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps: computing ranks by socket for job %s",
                            ORTE_JOBID_PRINT(jdata->jobid));
        if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_SOCKET, 0))) {
            ORTE_ERROR_LOG(rc);
        }
        return rc;
    }

    if (ORTE_RANK_BY_L3CACHE == ORTE_GET_RANKING_POLICY(map->ranking)) {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps: computing ranks by L3cache for job %s",
                            ORTE_JOBID_PRINT(jdata->jobid));
        if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_CACHE, 3))) {
            ORTE_ERROR_LOG(rc);
        }
        return rc;
    }

    if (ORTE_RANK_BY_L2CACHE == ORTE_GET_RANKING_POLICY(map->ranking)) {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps: computing ranks by L2cache for job %s",
                            ORTE_JOBID_PRINT(jdata->jobid));
        if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_CACHE, 2))) {
            ORTE_ERROR_LOG(rc);
        }
        return rc;
    }

    if (ORTE_RANK_BY_L1CACHE == ORTE_GET_RANKING_POLICY(map->ranking)) {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps: computing ranks by L1cache for job %s",
                            ORTE_JOBID_PRINT(jdata->jobid));
        if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_CACHE, 1))) {
            ORTE_ERROR_LOG(rc);
        }
        return rc;
    }

    if (ORTE_RANK_BY_CORE == ORTE_GET_RANKING_POLICY(map->ranking)) {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps: computing ranks by core for job %s",
                            ORTE_JOBID_PRINT(jdata->jobid));
        if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_CORE, 0))) {
            ORTE_ERROR_LOG(rc);
        }
        return rc;
    }

    if (ORTE_RANK_BY_HWTHREAD == ORTE_GET_RANKING_POLICY(map->ranking)) {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps: computing ranks by hwthread for job %s",
                            ORTE_JOBID_PRINT(jdata->jobid));
        if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_PU, 0))) {
            ORTE_ERROR_LOG(rc);
        }
        return rc;
    }
#endif

    return ORTE_ERR_NOT_IMPLEMENTED;
}

int orte_rmaps_base_compute_local_ranks(orte_job_t *jdata)
{
    orte_std_cntr_t i;
    int j, k;
    orte_node_t *node;
    orte_proc_t *proc, *psave, *psave2;
    orte_vpid_t minv, minv2;
    orte_local_rank_t local_rank;
    orte_job_map_t *map;
    orte_app_context_t *app;

    OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                         "%s rmaps:base:compute_usage",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* point to map */
    map = jdata->map;

    /* for each node in the map... */
    for (i=0; i < map->nodes->size; i++) {
        /* cycle through the array of procs on this node, setting
         * local and node ranks, until we
         * have done so for all procs on nodes in this map
         */
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }

        /* init search values */
        local_rank = 0;

        /* the proc map may have holes in it, so cycle
         * all the way through and avoid the holes
         */
        for (k=0; k < node->procs->size; k++) {
            /* if this proc is NULL, skip it */
            if (NULL == opal_pointer_array_get_item(node->procs, k)) {
                continue;
            }
            minv = ORTE_VPID_MAX;
            minv2 = ORTE_VPID_MAX;
            psave = NULL;
            psave2 = NULL;
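            /* psave will track the lowest unranked vpid from this job
             * (for local_rank); psave2 tracks the lowest across all jobs
             * on the node (for node_rank)
             */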
            /* find the minimum vpid proc */
            for (j=0; j < node->procs->size; j++) {
                /* if this proc is NULL, skip it */
                if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                    continue;
                }
                /* only look at procs for this job when
                 * determining local rank
                 */
                if (proc->name.jobid == jdata->jobid &&
                    ORTE_LOCAL_RANK_INVALID == proc->local_rank &&
                    proc->name.vpid < minv) {
                    minv = proc->name.vpid;
                    psave = proc;
                }
                /* no matter what job...still have to handle node_rank */
                if (ORTE_NODE_RANK_INVALID == proc->node_rank &&
                    proc->name.vpid < minv2) {
                    minv2 = proc->name.vpid;
                    psave2 = proc;
                }
            }
            if (NULL == psave && NULL == psave2) {
                /* we must have processed them all for this node! */
                break;
            }
            if (NULL != psave) {
                psave->local_rank = local_rank;
                ++local_rank;
            }
            if (NULL != psave2) {
                psave2->node_rank = node->next_node_rank;
                node->next_node_rank++;
            }
        }
    }

    /* compute app_rank */
    for (i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        k=0;
        /* loop thru all procs in job to find those from this app_context */
        for (j=0; j < jdata->procs->size; j++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
                continue;
            }
            if (proc->app_idx != app->idx) {
                continue;
            }
            proc->app_rank = k++;
        }
    }

    return ORTE_SUCCESS;
}

/* when we restart a process on a different node, we have to
 * ensure that the node and local ranks assigned to the proc
 * don't overlap with any pre-existing proc on that node. If
 * we don't, then it would be possible for procs to conflict
 * when opening static ports, should that be enabled.
 */
void orte_rmaps_base_update_local_ranks(orte_job_t *jdata, orte_node_t *oldnode,
                                        orte_node_t *newnode, orte_proc_t *newproc)
{
    int k;
    orte_node_rank_t node_rank;
    orte_local_rank_t local_rank;
    orte_proc_t *proc;

    OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                         "%s rmaps:base:update_usage",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* if the node hasn't changed, then we can just use the
     * pre-defined values
     */
    if (oldnode == newnode) {
        return;
    }

    /* if the node has changed, then search the new node for the
     * lowest unused local and node rank
     */
    node_rank = 0;
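    /* the retry loops below restart the scan whenever a collision is
     * found, so we settle on the lowest unused value - e.g., if the
     * existing node_ranks are {0, 1, 3}, the scan stops at 2
     */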
retry_nr:
    for (k=0; k < newnode->procs->size; k++) {
        /* if this proc is NULL, skip it */
        if (NULL == (proc = (orte_proc_t *) opal_pointer_array_get_item(newnode->procs, k))) {
            continue;
        }
        if (node_rank == proc->node_rank) {
            node_rank++;
            goto retry_nr;
        }
    }
    newproc->node_rank = node_rank;

    local_rank = 0;
retry_lr:
    for (k=0; k < newnode->procs->size; k++) {
        /* if this proc is NULL, skip it */
        if (NULL == (proc = (orte_proc_t *) opal_pointer_array_get_item(newnode->procs, k))) {
            continue;
        }
        /* ignore procs from other jobs */
        if (proc->name.jobid != jdata->jobid) {
            continue;
        }
        if (local_rank == proc->local_rank) {
            local_rank++;
            goto retry_lr;
        }
    }
    newproc->local_rank = local_rank;
}

@ -9,6 +9,7 @@
 * University of Stuttgart. All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2011      Cisco Systems, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -30,7 +31,8 @@
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/mca/ess/ess.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/threads/tsd.h"

#include "orte/types.h"
#include "orte/util/show_help.h"
@ -39,6 +41,7 @@
#include "orte/util/hostfile/hostfile.h"
#include "orte/util/dash_host/dash_host.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/runtime/data_type_support/orte_dt_support.h"

#include "orte/mca/rmaps/base/rmaps_private.h"
@ -60,13 +63,14 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
    *total_num_slots = 0;

    /* if the hnp was allocated, include it unless flagged not to */
    if (orte_hnp_is_allocated) {
    if (orte_hnp_is_allocated && !(policy & ORTE_MAPPING_NO_USE_LOCAL)) {
        if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0))) {
            if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
                /* clear this for future use, but don't include it */
                node->state = ORTE_NODE_STATE_UP;
            } else if (ORTE_NODE_STATE_NOT_INCLUDED != node->state) {
                OBJ_RETAIN(node);
                node->mapped = false;
                opal_list_append(allocated_nodes, &node->super);
            }
        }
@ -92,6 +96,7 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
             * destructed along the way
             */
            OBJ_RETAIN(node);
            node->mapped = false;
            opal_list_append(allocated_nodes, &node->super);
        }
    }
@ -264,32 +269,6 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
    }
#endif

    /* If the "no local" option was set, then remove the local node
     * from the list
     */
    if (policy & ORTE_MAPPING_NO_USE_LOCAL) {
        /* we don't need to check through the entire list as
         * the head node - if it is on the list at all - will
         * always be in the first position
         */
        item = opal_list_get_first(allocated_nodes);
        node = (orte_node_t*)item;
        /* need to check opal_ifislocal() because the name in the
         * hostfile may not have been FQDN, while the name returned
         * by gethostname may have been (or vice versa)
         */
        if (opal_ifislocal(node->name)) {
            opal_list_remove_item(allocated_nodes, item);
            OBJ_RELEASE(item);  /* "un-retain" it */
        }
        /** if we aren't mapping daemons, check that anything is left! */
        if (NULL != app && 0 == opal_list_get_size(allocated_nodes)) {
            orte_show_help("help-orte-rmaps-base.txt",
                           "orte-rmaps-base:nolocal-no-available-resources", true);
            return ORTE_ERR_SILENT;
        }
    }

    /* if the app is NULL, then we are mapping daemons - so remove
     * all nodes that already have a daemon on them
     *
@ -340,7 +319,12 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
            opal_list_remove_item(allocated_nodes, item);
            OBJ_RELEASE(item);  /* "un-retain" it */
        } else { /** otherwise, add the slots for our job to the total */
            num_slots += node->slots_alloc;
            if (0 == node->slots_alloc) {
                /* always allocate at least one */
                num_slots++;
            } else {
                num_slots += node->slots_alloc;
            }
        }

        /** go on to next item */
@ -359,682 +343,39 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
    return ORTE_SUCCESS;
}


int orte_rmaps_base_add_proc_to_map(orte_job_map_t *map, orte_node_t *node,
                                    bool oversubscribed, orte_proc_t *proc)
{
    orte_std_cntr_t i;
    orte_node_t *node_from_map;
    int rc;

    /* see if this node has already been assigned to the map - if
     * not, then add the pointer to the pointer array
     */
    for (i=0; i < map->nodes->size; i++) {
        if (NULL == (node_from_map = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }
        if (node_from_map->index == node->index) {
            /* we have this node in the array */
            goto PROCESS;
        }
    }
    /* if we get here, then this node isn't already in the map - add it */
    OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                         "%s rmaps:base: adding node %s to map",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (NULL == node->name) ? "NULL" : node->name));

    if (ORTE_SUCCESS > (rc = opal_pointer_array_add(map->nodes, (void*)node))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    OBJ_RETAIN(node);  /* maintain accounting on object */
    ++map->num_nodes;

PROCESS:
    /* add the proc to this node's local processes - it is assumed
     * that the proc isn't already there as this would be an error
     * in the mapper
     */
    OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                         "%s rmaps:base: mapping proc for job %s to node %s whose daemon is %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(proc->name.jobid),
                         (NULL == node->name) ? "NULL" : node->name,
                         (NULL == node->daemon) ? "NULL" : ORTE_NAME_PRINT(&(node->daemon->name))));

    if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    /* retain the proc struct so that we correctly track its release */
    OBJ_RETAIN(proc);
    ++node->num_procs;

    /* update the oversubscribed state of the node */
    node->oversubscribed = oversubscribed;

    return ORTE_SUCCESS;
}


/*
 * Claim a slot for a specified job on a node
 */
int orte_rmaps_base_claim_slot(orte_job_t *jdata,
                               orte_node_t *current_node,
                               int32_t cpus_per_rank,
                               orte_std_cntr_t app_idx,
                               opal_list_t *nodes,
                               bool oversubscribe,
                               bool remove_from_list,
                               orte_proc_t **returnproc)
{
    orte_proc_t *proc;
    bool oversub;
    int rc;

    /* if we were given a proc, just use it */
    if (NULL != returnproc && NULL != *returnproc) {
        proc = *returnproc;
    } else {
        /* create mapped_proc object */
        proc = OBJ_NEW(orte_proc_t);
        if (NULL == proc) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        /* set the jobid */
        proc->name.jobid = jdata->jobid;
        /* flag the proc as ready for launch */
        proc->state = ORTE_PROC_STATE_INIT;
        /* we do not set the vpid here - this will be done
         * during a second phase
         */

        /* We do set the epoch here since they all start with the same value. */
        ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);

        proc->app_idx = app_idx;
        OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                             "%s rmaps:base:claim_slot: created new proc %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&proc->name)));

        /* provide returned proc, if requested */
        if (NULL != returnproc) {
            *returnproc = proc;
        }
    }

    OBJ_RETAIN(current_node);  /* maintain accounting on object */

    proc->node = current_node;
    proc->nodename = current_node->name;

    OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                         "%s rmaps:base:claim_slot mapping proc in job %s to node %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid), current_node->name));

    /* Be sure to demarcate the slots for this proc as claimed from the node */
    current_node->slots_inuse += 1;

    /* see if this node is oversubscribed now */
    if (current_node->slots_inuse > current_node->slots) {
        oversub = true;
    } else {
        oversub = false;
    }

    /* assign the proc to the node and ensure the node is on the map */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_add_proc_to_map(jdata->map, current_node,
                                                              oversub, proc))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(proc);
        return rc;
    }

    /* If this node has reached its max number of allocatable slots OR it has
     * reached the soft limit AND we are in a "no oversubscribe" state, then
     * we need to return a flag telling the mapper this is the case so it
     * can move on to the next node
     */
    if ((0 != current_node->slots_max &&
         current_node->slots_inuse >= current_node->slots_max) ||
        (!oversubscribe && current_node->slots_inuse >= current_node->slots)) {
        /* see if we are supposed to remove the node from the list - some
         * mappers want us to do so to avoid any chance of continuing to
         * add procs to it
         */
        if (NULL != nodes && remove_from_list) {
            opal_list_remove_item(nodes, (opal_list_item_t*)current_node);
            /* release it - it was retained when we started, so this
             * just ensures the instance counter is correctly updated
             */
            OBJ_RELEASE(current_node);
        }
        /* now return the proper code so the caller knows this node
         * is fully used
         */
        return ORTE_ERR_NODE_FULLY_USED;
    }

    return ORTE_SUCCESS;
}
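
/* Illustrative (hypothetical) caller pattern - ORTE_ERR_NODE_FULLY_USED
 * is advisory ("advance to the next node"), not a failure:
 *
 *     rc = orte_rmaps_base_claim_slot(jdata, node, 1, app->idx,
 *                                     &node_list, oversubscribe, true, NULL);
 *     if (ORTE_ERR_NODE_FULLY_USED == rc) {
 *         // move on to the next node and keep mapping
 *     } else if (ORTE_SUCCESS != rc) {
 *         return rc;  // a real error
 *     }
 */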

int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
{
    orte_job_map_t *map;
    orte_vpid_t vpid, cnt;
    int i, j;
    orte_node_t *node;
    orte_proc_t *proc, *ptr;
    int rc;

    map = jdata->map;

    if (ORTE_MAPPING_BYSLOT & map->policy ||
        ORTE_MAPPING_BYSOCKET & map->policy ||
        ORTE_MAPPING_BYBOARD & map->policy) {
        /* assign the ranks sequentially */
        vpid = 0;
        for (i=0; i < map->nodes->size; i++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
                continue;
            }
            for (j=0; j < node->procs->size; j++) {
                if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                    continue;
                }
                /* ignore procs from other jobs */
                if (proc->name.jobid != jdata->jobid) {
                    continue;
                }
                if (ORTE_VPID_INVALID == proc->name.vpid) {
                    /* find the next available vpid */
                    while (NULL != (ptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, vpid)) &&
                           ORTE_VPID_INVALID != ptr->name.vpid) {
                        vpid++;
                    }
                    proc->name.vpid = vpid++;
                    ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
                    ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));

                    /* If there is an invalid epoch here, it's because it doesn't exist yet. */
                    if (0 == ORTE_EPOCH_CMP(ORTE_EPOCH_INVALID,proc->name.epoch)) {
                        ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
                    }
                }
                /* some mappers require that we insert the proc into the jdata->procs
                 * array, while others will have already done it - so check and
                 * do the operation if required
                 */
                if (NULL == opal_pointer_array_get_item(jdata->procs, proc->name.vpid)) {
                    if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
                        ORTE_ERROR_LOG(rc);
                        return rc;
                    }
                }
            }
        }
        return ORTE_SUCCESS;
    }

    if (ORTE_MAPPING_BYNODE & map->policy) {
        /* assign the ranks round-robin across nodes */
        cnt=0;
        vpid=0;
        while (cnt < jdata->num_procs) {
            for (i=0; i < map->nodes->size; i++) {
                if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
                    continue;
                }
                for (j=0; j < node->procs->size; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
                    /* ignore procs from other jobs */
                    if (proc->name.jobid != jdata->jobid) {
                        continue;
                    }
                    if (ORTE_VPID_INVALID != proc->name.vpid) {
                        /* vpid was already assigned, probably by the
                         * round-robin mapper. Some mappers require that
                         * we insert the proc into the jdata->procs
                         * array, while others will have already done it - so check and
                         * do the operation if required
                         */
                        if (NULL == opal_pointer_array_get_item(jdata->procs, proc->name.vpid)) {
                            if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
                                ORTE_ERROR_LOG(rc);
                                return rc;
                            }
                            /* if we added it to the array, then account for
                             * it in our loop - otherwise don't as we would be
                             * double counting
                             */
                            cnt++;
                        }
                        continue;
                    }
                    /* find next available vpid */
                    while (NULL != (ptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, vpid)) &&
                           ORTE_VPID_INVALID != ptr->name.vpid) {
                        vpid++;
                    }
                    proc->name.vpid = vpid++;
                    ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
                    ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
                    /* insert the proc into the jdata->procs array - can't already
                     * be there as the only way to this point in the code is for the
                     * vpid to have been INVALID
                     */
                    if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
                        ORTE_ERROR_LOG(rc);
                        return rc;
                    }
                    cnt++;
                    break;  /* move on to next node */
                }
            }
        }
        return ORTE_SUCCESS;
    }

    return ORTE_ERR_NOT_IMPLEMENTED;
}

int orte_rmaps_base_compute_local_ranks(orte_job_t *jdata)
{
    orte_std_cntr_t i;
    int j, k;
    orte_node_t *node;
    orte_proc_t *proc, *psave, *psave2;
    orte_vpid_t minv, minv2;
    orte_local_rank_t local_rank;
    orte_job_map_t *map;
    orte_app_context_t *app;

    OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                         "%s rmaps:base:compute_usage",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* point to map */
    map = jdata->map;

    /* for each node in the map... */
    for (i=0; i < map->nodes->size; i++) {
        /* cycle through the array of procs on this node, setting
         * local and node ranks, until we
         * have done so for all procs on nodes in this map
         */
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }

        /* init search values */
        local_rank = 0;

        /* the proc map may have holes in it, so cycle
         * all the way through and avoid the holes
         */
        for (k=0; k < node->procs->size; k++) {
            /* if this proc is NULL, skip it */
            if (NULL == opal_pointer_array_get_item(node->procs, k)) {
                continue;
            }
            minv = ORTE_VPID_MAX;
            minv2 = ORTE_VPID_MAX;
            psave = NULL;
            psave2 = NULL;
            /* find the minimum vpid proc */
            for (j=0; j < node->procs->size; j++) {
                /* if this proc is NULL, skip it */
                if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                    continue;
                }
                /* only look at procs for this job when
                 * determining local rank
                 */
                if (proc->name.jobid == jdata->jobid &&
                    ORTE_LOCAL_RANK_INVALID == proc->local_rank &&
                    proc->name.vpid < minv) {
                    minv = proc->name.vpid;
                    psave = proc;
                }
                /* no matter what job...still have to handle node_rank */
                if (ORTE_NODE_RANK_INVALID == proc->node_rank &&
                    proc->name.vpid < minv2) {
                    minv2 = proc->name.vpid;
                    psave2 = proc;
                }
            }
            if (NULL == psave && NULL == psave2) {
                /* we must have processed them all for this node! */
                break;
            }
            if (NULL != psave) {
                psave->local_rank = local_rank;
                ++local_rank;
            }
            if (NULL != psave2) {
                psave2->node_rank = node->next_node_rank;
                node->next_node_rank++;
            }
        }
    }

    /* compute app_rank */
    for (i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        k=0;
        /* loop thru all procs in job to find those from this app_context */
        for (j=0; j < jdata->procs->size; j++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
                continue;
            }
            if (proc->app_idx != app->idx) {
                continue;
            }
            proc->app_rank = k++;
        }
    }

    return ORTE_SUCCESS;
}

/* when we restart a process on a different node, we have to
 * ensure that the node and local ranks assigned to the proc
 * don't overlap with any pre-existing proc on that node. If
 * we don't, then it would be possible for procs to conflict
 * when opening static ports, should that be enabled.
 */
void orte_rmaps_base_update_local_ranks(orte_job_t *jdata, orte_node_t *oldnode,
                                        orte_node_t *newnode, orte_proc_t *newproc)
{
    int k;
    orte_node_rank_t node_rank;
    orte_local_rank_t local_rank;
    orte_proc_t *proc;

    OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                         "%s rmaps:base:update_usage",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* if the node hasn't changed, then we can just use the
     * pre-defined values
     */
    if (oldnode == newnode) {
        return;
    }

    /* if the node has changed, then search the new node for the
     * lowest unused local and node rank
     */
    node_rank = 0;
retry_nr:
    for (k=0; k < newnode->procs->size; k++) {
        /* if this proc is NULL, skip it */
        if (NULL == (proc = (orte_proc_t *) opal_pointer_array_get_item(newnode->procs, k))) {
            continue;
        }
        if (node_rank == proc->node_rank) {
            node_rank++;
            goto retry_nr;
        }
    }
    newproc->node_rank = node_rank;

    local_rank = 0;
retry_lr:
    for (k=0; k < newnode->procs->size; k++) {
        /* if this proc is NULL, skip it */
        if (NULL == (proc = (orte_proc_t *) opal_pointer_array_get_item(newnode->procs, k))) {
            continue;
        }
        /* ignore procs from other jobs */
        if (proc->name.jobid != jdata->jobid) {
            continue;
        }
        if (local_rank == proc->local_rank) {
            local_rank++;
            goto retry_lr;
        }
    }
    newproc->local_rank = local_rank;
}


int orte_rmaps_base_define_daemons(orte_job_t *jdata)
{
    orte_job_map_t *map;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_job_t *daemons;
    int i;
    int rc;

    OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                         "%s rmaps:base:define_daemons",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    if (ORTE_MAPPING_USE_VM & jdata->map->policy) {
        /* nothing for us to do - all daemons are
         * defined by definition!
         */
        return ORTE_SUCCESS;
    }

    /* get the daemon job data struct */
    if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_HNP->jobid))) {
        /* bad news */
        ORTE_ERROR_LOG(ORTE_ERR_FATAL);
        return ORTE_ERR_FATAL;
    }

    /* initialize the count of new daemons */
    map = jdata->map;
    map->num_new_daemons = 0;

    /* go through the nodes in the map, checking each one's daemon name
     */
    for (i=0; i < map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }
        if (NULL == node->daemon) {
            /* we haven't defined one for it
             * yet, so do so now and indicate it is to be launched
             */
            proc = OBJ_NEW(orte_proc_t);
            if (NULL == proc) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            proc->name.jobid = ORTE_PROC_MY_HNP->jobid;
            if (ORTE_VPID_MAX-1 <= daemons->num_procs) {
                /* no more daemons available */
                orte_show_help("help-orte-rmaps-base.txt", "out-of-vpids", true);
                OBJ_RELEASE(proc);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            proc->name.vpid = daemons->num_procs;  /* take the next available vpid */
            ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
            proc->node = node;
            proc->nodename = node->name;
            OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                                 "%s rmaps:base:define_daemons add new daemon %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&proc->name)));
            /* add the daemon to the daemon job object */
            if (0 > (rc = opal_pointer_array_add(daemons->procs, (void*)proc))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            ++daemons->num_procs;
            /* point the node to the daemon */
            node->daemon = proc;
            OBJ_RETAIN(proc);  /* maintain accounting */
            /* track number of daemons to be launched */
            ++map->num_new_daemons;
            /* and their starting vpid */
            if (ORTE_VPID_INVALID == map->daemon_vpid_start) {
                map->daemon_vpid_start = proc->name.vpid;
            }
        }
        /*
         * If we are launching on a node where there used to be a daemon, but
         * it had previously failed, try to relaunch it. (Daemon Recovery) Do
         * this ONLY if there are procs mapped to that daemon!
         */
        else if (node->daemon->state > ORTE_PROC_STATE_UNTERMINATED) {
            /* If no processes are to be launched on this node, then exclude it */
            if( 0 >= node->num_procs ) {
                OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                                     "%s rmaps:base:define_daemons Skipping the Recovery of daemon %s [0x%x] Launched: %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&node->daemon->name),
                                     node->daemon->state,
                                     (node->daemon_launched ? "T" : "F")
                                     ));
                /* since this daemon exists but is not needed, then flag it
                 * as "launched" to avoid relaunching it for no reason
                 */
                node->daemon_launched = true;
                continue;
            }

            OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                                 "%s rmaps:base:define_daemons RECOVERING daemon %s [0x%x] Launched: %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&node->daemon->name),
                                 node->daemon->state,
                                 (node->daemon_launched ? "T" : "F")
                                 ));

            /* flag that the daemon is no longer launched */
            node->daemon_launched = false;

            /* set the state to indicate launch is in progress */
            node->daemon->state = ORTE_PROC_STATE_RESTART;
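
            /* drop the stale contact info - the restarted daemon is
             * expected to report a fresh URI */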
|
||||
|
||||
free(node->daemon->rml_uri);
|
||||
node->daemon->rml_uri = NULL;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
|
||||
"%s rmaps:base:define_daemons add new daemon %s (Recovering old daemon)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&node->daemon->name)));
|
||||
|
||||
/* track number of daemons to be launched */
|
||||
++map->num_new_daemons;
|
||||
}
|
||||
else {
|
||||
/* this daemon was previously defined - flag it */
|
||||
node->daemon_launched = true;
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
|
||||
"%s rmaps:base:define_daemons existing daemon %s already launched",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&node->daemon->name)));
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_rmaps_base_setup_virtual_machine(orte_job_t *jdata)
|
||||
{
|
||||
orte_job_t *jdat;
|
||||
orte_node_t *node;
|
||||
orte_proc_t *proc;
|
||||
orte_job_map_t *map;
|
||||
opal_list_t node_list;
|
||||
opal_list_item_t *item;
|
||||
orte_app_context_t *app;
|
||||
orte_std_cntr_t num_slots;
|
||||
int rc, i, n;
|
||||
bool ignored;
|
||||
int rc, i;
|
||||
|
||||
/* get the daemon app if provided - may include -host or hostfile
|
||||
* info about available nodes
|
||||
*/
|
||||
app = (orte_app_context_t *) opal_pointer_array_get_item(jdata->apps, 0);
|
||||
|
||||
map = jdata->map;
|
||||
|
||||
/* get the list of all available nodes that do not already
|
||||
* have a daemon on them
|
||||
/* cycle thru all available nodes and find those that do not already
|
||||
* have a daemon on them - no need to include our own as we are
|
||||
* obviously already here!
|
||||
*/
|
||||
OBJ_CONSTRUCT(&node_list, opal_list_t);
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots,
|
||||
app, map->policy))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&node_list);
|
||||
return rc;
|
||||
}
|
||||
/* check all other known jobs to see if they have something to
|
||||
* add to the allocation - we won't have seen these and the
|
||||
* daemon job won't have any in its app
|
||||
*/
|
||||
for (i=0; i < orte_job_data->size; i++) {
|
||||
if (NULL == (jdat = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) {
|
||||
for (i=1; i < orte_node_pool->size; i++) {
|
||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
|
||||
continue;
|
||||
}
|
||||
for (n=0; n < jdat->apps->size; n++) {
|
||||
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdat->apps, n))) {
|
||||
continue;
|
||||
}
|
||||
if (NULL != app->hostfile) {
|
||||
/* hostfile was specified - parse it and add it to the list. The
|
||||
* function automatically ignores duplicates
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&node_list,
|
||||
&ignored,
|
||||
app->hostfile))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&node_list);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
if (NULL != app->dash_host) {
|
||||
/* parse and add to list, ignoring duplicates */
|
||||
if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&node_list,
|
||||
&ignored,
|
||||
app->dash_host))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&node_list);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* add all these nodes to the map */
|
||||
while (NULL != (item = opal_list_remove_first(&node_list))) {
|
||||
node = (orte_node_t*)item;
|
||||
/* if this is my node, ignore it - we are already here */
|
||||
if (0 == strcmp(node->name, orte_process_info.nodename)) {
|
||||
/* if this node already has a daemon, skip it */
|
||||
if (NULL != node->daemon) {
|
||||
continue;
|
||||
}
|
||||
/* add the node to the map */
|
||||
opal_pointer_array_add(map->nodes, (void*)node);
|
||||
++(map->num_nodes);
|
||||
/* if this node already has a daemon, release that object
|
||||
* to maintain bookkeeping
|
||||
*/
|
||||
if (NULL != node->daemon) {
|
||||
OBJ_RELEASE(node->daemon);
|
||||
}
|
||||
/* maintain accounting */
|
||||
OBJ_RETAIN(node);
|
||||
/* create a new daemon object for this node */
|
||||
proc = OBJ_NEW(orte_proc_t);
|
||||
if (NULL == proc) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
proc->name.jobid = ORTE_PROC_MY_HNP->jobid;
|
||||
proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
if (ORTE_VPID_MAX-1 <= jdata->num_procs) {
|
||||
/* no more daemons available */
|
||||
orte_show_help("help-orte-rmaps-base.txt", "out-of-vpids", true);
|
||||
@ -1044,6 +385,8 @@ int orte_rmaps_base_setup_virtual_machine(orte_job_t *jdata)
|
||||
proc->name.vpid = jdata->num_procs; /* take the next available vpid */
|
||||
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
|
||||
ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
|
||||
/* point the proc to the node and maintain accounting */
|
||||
OBJ_RETAIN(node);
|
||||
proc->node = node;
|
||||
proc->nodename = node->name;
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
|
||||
@ -1051,7 +394,7 @@ int orte_rmaps_base_setup_virtual_machine(orte_job_t *jdata)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name)));
|
||||
/* add the daemon to the daemon job object */
|
||||
if (0 > (rc = opal_pointer_array_add(jdata->procs, (void*)proc))) {
|
||||
if (0 > (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, (void*)proc))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
@ -1066,7 +409,6 @@ int orte_rmaps_base_setup_virtual_machine(orte_job_t *jdata)
|
||||
map->daemon_vpid_start = proc->name.vpid;
|
||||
}
|
||||
}
|
||||
OBJ_DESTRUCT(&node_list);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -9,6 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -48,16 +49,6 @@ ORTE_DECLSPEC orte_job_map_t* orte_rmaps_base_get_job_map(orte_jobid_t job);
|
||||
|
||||
/* LOCAL FUNCTIONS for use by RMAPS components */
|
||||
|
||||
/*
|
||||
* Function to add a mapped_proc entry to a map
|
||||
* Scans list of nodes on map to see if the specified one already
|
||||
* exists - if so, just add this entry to that node's list of
|
||||
* procs. If not, then add new node entry and put this proc
|
||||
* on its list.
|
||||
*/
|
||||
int orte_rmaps_base_add_proc_to_map(orte_job_map_t *map, orte_node_t *node,
|
||||
bool oversubscribed, orte_proc_t *proc);
|
||||
|
||||
ORTE_DECLSPEC int orte_rmaps_base_get_target_nodes(opal_list_t* node_list,
|
||||
orte_std_cntr_t *total_num_slots,
|
||||
orte_app_context_t *app,
|
||||
@ -70,39 +61,19 @@ ORTE_DECLSPEC int orte_rmaps_base_get_mapped_targets(opal_list_t *mapped_node_li
|
||||
opal_list_t *master_node_list,
|
||||
orte_std_cntr_t *total_num_slots);
|
||||
|
||||
ORTE_DECLSPEC int orte_rmaps_base_claim_slot(orte_job_t *jdata,
|
||||
orte_node_t *current_node,
|
||||
int32_t stride,
|
||||
orte_std_cntr_t app_idx,
|
||||
opal_list_t *nodes,
|
||||
bool oversubscribe,
|
||||
bool remove_from_list,
|
||||
orte_proc_t **returnproc);
|
||||
|
||||
ORTE_DECLSPEC int orte_rmaps_base_compute_vpids(orte_job_t *jdata);
|
||||
|
||||
ORTE_DECLSPEC int orte_rmaps_base_compute_local_ranks(orte_job_t *jdata);
|
||||
|
||||
ORTE_DECLSPEC int orte_rmaps_base_compute_bindings(orte_job_t *jdata);
|
||||
|
||||
ORTE_DECLSPEC void orte_rmaps_base_update_local_ranks(orte_job_t *jdata, orte_node_t *oldnode,
|
||||
orte_node_t *newnode, orte_proc_t *newproc);
|
||||
|
||||
ORTE_DECLSPEC int orte_rmaps_base_rearrange_map(orte_app_context_t *app, orte_job_map_t *map, opal_list_t *procs);
|
||||
|
||||
ORTE_DECLSPEC int orte_rmaps_base_define_daemons(orte_job_t *jdata);
|
||||
|
||||
ORTE_DECLSPEC int orte_rmaps_base_setup_virtual_machine(orte_job_t *jdata);
|
||||
|
||||
ORTE_DECLSPEC opal_list_item_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list, orte_job_t *jdata);
|
||||
|
||||
ORTE_DECLSPEC int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
|
||||
opal_list_t *node_list, orte_vpid_t num_procs,
|
||||
opal_list_item_t *cur_node_item);
|
||||
|
||||
ORTE_DECLSPEC int orte_rmaps_base_map_bynode(orte_job_t *jdata, orte_app_context_t *app,
|
||||
opal_list_t *node_list, orte_vpid_t num_procs,
|
||||
opal_list_item_t *cur_node_item);
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
||||
|
@ -1,46 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_pkgdata_DATA = help-orte-rmaps-lb.txt
|
||||
|
||||
sources = \
|
||||
rmaps_lb.c \
|
||||
rmaps_lb.h \
|
||||
rmaps_lb_component.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if MCA_BUILD_orte_rmaps_load_balance_DSO
|
||||
component_noinst =
|
||||
component_install = mca_rmaps_load_balance.la
|
||||
else
|
||||
component_noinst = libmca_rmaps_load_balance.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(pkglibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_rmaps_load_balance_la_SOURCES = $(sources)
|
||||
mca_rmaps_load_balance_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_rmaps_load_balance_la_SOURCES =$(sources)
|
||||
libmca_rmaps_load_balance_la_LDFLAGS = -module -avoid-version
|
@ -1,53 +0,0 @@
# -*- text -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
#                         University Research and Technology
#                         Corporation.  All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
#                         of Tennessee Research Foundation.  All rights
#                         reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
#                         University of Stuttgart.  All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
#                         All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for Open RTE's orterun.
#
[orte-rmaps-rr:alloc-error]
There are not enough slots available in the system to satisfy the %d slots
that were requested by the application:
  %s

Either request fewer slots for your application, or make more slots available
for use.
[orte-rmaps-rr:multi-apps-and-zero-np]
RMAPS found multiple applications to be launched, with
at least one that failed to specify the number of processes to execute.
When specifying multiple applications, you must specify how many processes
of each to launch via the -np argument.

[orte-rmaps-rr:per-node-and-too-many-procs]
There are not enough nodes in your allocation to satisfy your request to launch
%d processes on a per-node basis - only %d nodes were available.

Either request fewer processes, or obtain a larger allocation.
[orte-rmaps-rr:n-per-node-and-too-many-procs]
There are not enough nodes in your allocation to satisfy your request to launch
%d processes on a %d per-node basis - only %d nodes with a total of %d slots were available.

Either request fewer processes, or obtain a larger allocation.
[orte-rmaps-rr:n-per-node-and-not-enough-slots]
There are not enough slots on the nodes in your allocation to satisfy your request to launch on a %d process-per-node basis - only %d slots/node were available.

Either request fewer processes/node, or obtain a larger allocation.

[orte-rmaps-rr:no-np-and-user-map]
You have specified a rank-to-node/slot mapping, but failed to provide
the number of processes to be executed. For some reason, this information
could not be obtained from the mapping you provided, so we cannot continue
with executing the specified application.
@ -1,544 +0,0 @@
/*
 * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2006 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2006      Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"

#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif  /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif  /* HAVE_STRING_H */

#include "opal/mca/base/mca_base_param.h"
#include "opal/util/opal_sos.h"

#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"

#include "orte/mca/rmaps/base/rmaps_private.h"
#include "orte/mca/rmaps/base/base.h"
#include "rmaps_lb.h"

static int switchyard(orte_job_t *jdata);

orte_rmaps_base_module_t orte_rmaps_load_balance_module = {
    switchyard
};

/* Local functions */
static int npernode(orte_job_t *jdata);
static int nperboard(orte_job_t *jdata);
static int npersocket(orte_job_t *jdata);
static int loadbalance(orte_job_t *jdata);

static int switchyard(orte_job_t *jdata)
{
    int rc;
    mca_base_component_t *c = &mca_rmaps_load_balance_component.super.base_version;

    /* only handle initial launch of loadbalanced
     * or NPERxxx jobs - allow restarting of failed apps
     */
    if (ORTE_JOB_STATE_INIT != jdata->state) {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:lb: job %s not in initial state - loadbalance cannot map",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL != jdata->map->req_mapper &&
        0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
        /* a mapper has been specified, and it isn't me */
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:lb: job %s not using loadbalance mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:loadbalance: mapping job %s",
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* flag that I did the mapping */
    if (NULL != jdata->map->last_mapper) {
        free(jdata->map->last_mapper);
    }
    jdata->map->last_mapper = strdup(c->mca_component_name);

    if (0 < mca_rmaps_load_balance_component.npernode ||
        0 < jdata->map->npernode) {
        rc = npernode(jdata);
    } else if (0 < mca_rmaps_load_balance_component.nperboard ||
               0 < jdata->map->nperboard) {
        rc = nperboard(jdata);
    } else if (0 < mca_rmaps_load_balance_component.npersocket ||
               0 < jdata->map->npersocket) {
        rc = npersocket(jdata);
    } else {
        rc = loadbalance(jdata);
    }

    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    /* compute and save local ranks */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* define the daemons that we will use for this job */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(jdata))) {
        ORTE_ERROR_LOG(rc);
    }

    return rc;
}

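switchyard() declines jobs that are not its to map by returning ORTE_ERR_TAKE_NEXT_OPTION, which is the sentinel the rmaps framework uses to move on to the next candidate mapper. A hedged sketch of that chaining loop; the "map_job" member name and the "modules" array are assumptions for illustration, not the framework's actual structures:

    /* Sketch: walk a list of mapper modules, letting each decline with
     * ORTE_ERR_TAKE_NEXT_OPTION until one claims the job. */
    static int map_job_sketch(orte_job_t *jdata,
                              orte_rmaps_base_module_t **modules, int nmodules)
    {
        int i, rc = ORTE_ERR_NOT_FOUND;
        for (i = 0; i < nmodules; i++) {
            rc = modules[i]->map_job(jdata);  /* member name assumed */
            if (ORTE_ERR_TAKE_NEXT_OPTION != rc) {
                break;  /* either mapped, or hit a hard error */
            }
        }
        return rc;
    }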
/* place specified #procs on each node, up to the specified total
 * number of procs (if one was given).
 */
static int npernode(orte_job_t *jdata)
{
    orte_app_context_t *app;
    int j, rc=ORTE_SUCCESS;
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_std_cntr_t num_slots;
    orte_node_t *node;
    int np, nprocs;
    int num_nodes;

    /* setup the node list */
    OBJ_CONSTRUCT(&node_list, opal_list_t);

    /* can only have one app_context here */
    if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }
    /* use the number of procs if one was given */
    if (0 < app->num_procs) {
        np = app->num_procs;
    } else {
        np = INT_MAX;
    }
    /* for each app_context, we have to get the list of nodes that it can
     * use since that can now be modified with a hostfile and/or -host
     * option
     */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                               jdata->map->policy))) {
        ORTE_ERROR_LOG(rc);
        goto error;
    }
    /* loop through the list of nodes */
    num_nodes = opal_list_get_size(&node_list);
    nprocs = 0;
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        node = (orte_node_t*)item;
        /* put the specified number of procs on each node */
        for (j=0; j < mca_rmaps_load_balance_component.npernode && nprocs < np; j++) {
            if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
                                                                 jdata->map->cpus_per_rank, app->idx,
                                                                 &node_list, jdata->map->oversubscribe,
                                                                 false, NULL))) {
                /** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have
                 * more procs to place, then that is an error
                 */
                if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc) ||
                    j < mca_rmaps_load_balance_component.npernode-1) {
                    ORTE_ERROR_LOG(rc);
                    OBJ_RELEASE(node);
                    goto error;
                }
            }
            nprocs++;
        }
        OBJ_RELEASE(node);
    }
    /* if the user requested a specific number of procs and
     * the total number of procs we were able to assign
     * doesn't equal the number requested, then we have a
     * problem
     */
    if (0 < app->num_procs && nprocs < app->num_procs) {
        orte_show_help("help-orte-rmaps-base.txt", "rmaps:too-many-procs", true,
                       app->app, app->num_procs,
                       "number of nodes", num_nodes,
                       "npernode", mca_rmaps_load_balance_component.npernode);
        return ORTE_ERR_SILENT;
    }
    /* update the number of procs in the job */
    jdata->num_procs += nprocs;
    /* compute vpids and add proc objects to the job - this has to be
     * done after each app_context is mapped in order to keep the
     * vpids contiguous within an app_context
     */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

 error:
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);
    return rc;
}

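In effect, the loop above places min(np, npernode * num_nodes) processes, where np is INT_MAX when no -np value was given. A tiny self-contained restatement of that bound - a sketch detached from the ORTE types, not part of the commit:

    /* How many procs npernode() effectively places. */
    static int npernode_count(int npernode, int num_nodes, int np)
    {
        long capacity = (long)npernode * num_nodes;  /* widen to avoid overflow */
        return (np < capacity) ? np : (int)capacity;
    }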
static int nperboard(orte_job_t *jdata)
{
    orte_app_context_t *app;
    int j, k, rc=ORTE_SUCCESS;
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_std_cntr_t num_slots;
    orte_node_t *node;
    int np, nprocs;
    int num_boards=orte_default_num_boards;

    /* setup the node list */
    OBJ_CONSTRUCT(&node_list, opal_list_t);

    /* can only have one app_context here */
    if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }
    /* use the number of procs if one was given */
    if (0 < app->num_procs) {
        np = app->num_procs;
    } else {
        np = INT_MAX;
    }
    /* for each app_context, we have to get the list of nodes that it can
     * use since that can now be modified with a hostfile and/or -host
     * option
     */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                               jdata->map->policy))) {
        ORTE_ERROR_LOG(rc);
        goto error;
    }
    /* loop through the list of nodes */
    nprocs = 0;
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        node = (orte_node_t*)item;
        num_boards = node->boards;
        /* loop through the number of boards in this node */
        for (k=0; k < node->boards && nprocs < np; k++) {
            /* put the specified number of procs on each board */
            for (j=0; j < mca_rmaps_load_balance_component.nperboard && nprocs < np; j++) {
                if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
                                                                     jdata->map->cpus_per_rank, app->idx,
                                                                     &node_list, jdata->map->oversubscribe,
                                                                     false, NULL))) {
                    /** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have
                     * more procs to place, then that is an error
                     */
                    if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc) ||
                        j < mca_rmaps_load_balance_component.nperboard-1) {
                        ORTE_ERROR_LOG(rc);
                        OBJ_RELEASE(node);
                        goto error;
                    }
                }
                nprocs++;
            }
        }
        OBJ_RELEASE(node);
    }
    /* if the user requested a specific number of procs and
     * the total number of procs we were able to assign
     * doesn't equal the number requested, then we have a
     * problem
     */
    if (0 < app->num_procs && nprocs < app->num_procs) {
        orte_show_help("help-orte-rmaps-base.txt", "rmaps:too-many-procs", true,
                       app->app, app->num_procs,
                       "number of boards", num_boards,
                       "nperboard", mca_rmaps_load_balance_component.nperboard);
        return ORTE_ERR_SILENT;
    }
    /* update the number of procs in the job */
    jdata->num_procs += nprocs;
    /* compute vpids and add proc objects to the job - this has to be
     * done after each app_context is mapped in order to keep the
     * vpids contiguous within an app_context
     */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

 error:
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);
    return rc;
}

static int npersocket(orte_job_t *jdata)
{
    orte_app_context_t *app;
    int j, k, n, rc=ORTE_SUCCESS;
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_std_cntr_t num_slots;
    orte_node_t *node;
    int np, nprocs;
    int num_sockets=orte_default_num_sockets_per_board;

    /* setup the node list */
    OBJ_CONSTRUCT(&node_list, opal_list_t);

    /* can only have one app_context here */
    if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }
    /* use the number of procs if one was given */
    if (0 < app->num_procs) {
        np = app->num_procs;
    } else {
        np = INT_MAX;
    }
    /* for each app_context, we have to get the list of nodes that it can
     * use since that can now be modified with a hostfile and/or -host
     * option
     */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                               jdata->map->policy))) {
        ORTE_ERROR_LOG(rc);
        goto error;
    }
    /* loop through the list of nodes */
    nprocs = 0;
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        node = (orte_node_t*)item;
        num_sockets = node->sockets_per_board;
        /* loop through the number of boards in this node */
        for (k=0; k < node->boards && nprocs < np; k++) {
            /* loop through the number of sockets/board */
            for (n=0; n < node->sockets_per_board && nprocs < np; n++) {
                /* put the specified number of procs on each socket */
                for (j=0; j < mca_rmaps_load_balance_component.npersocket && nprocs < np; j++) {
                    if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
                                                                         jdata->map->cpus_per_rank, app->idx,
                                                                         &node_list, jdata->map->oversubscribe,
                                                                         false, NULL))) {
                        /** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have
                         * more procs to place, then that is an error
                         */
                        if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc) ||
                            j < mca_rmaps_load_balance_component.npersocket-1) {
                            ORTE_ERROR_LOG(rc);
                            OBJ_RELEASE(node);
                            goto error;
                        }
                    }
                    /* track the number of procs */
                    nprocs++;
                }
            }
        }
        OBJ_RELEASE(node);
    }
    /* if the user requested a specific number of procs and
     * the total number of procs we were able to assign
     * doesn't equal the number requested, then we have a
     * problem
     */
    if (0 < app->num_procs && nprocs < app->num_procs) {
        orte_show_help("help-orte-rmaps-base.txt", "rmaps:too-many-procs", true,
                       app->app, app->num_procs,
                       "number of sockets", num_sockets,
                       "npersocket", mca_rmaps_load_balance_component.npersocket);
        return ORTE_ERR_SILENT;
    }
    /* update the number of procs in the job */
    jdata->num_procs += nprocs;
    /* compute vpids and add proc objects to the job - this has to be
     * done after each app_context is mapped in order to keep the
     * vpids contiguous within an app_context
     */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

 error:
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);
    return rc;
}

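nperboard() and npersocket() differ from npernode() only in how deeply they iterate within each node, so the per-node target they aim for reduces to a product of the topology counts. A small sketch of that arithmetic (an illustration, assuming exactly one of nperboard/npersocket is active per the switchyard dispatch above):

    /* Per-node target the nested loops above iterate toward. */
    static int node_capacity(int boards, int sockets_per_board,
                             int nperboard, int npersocket)
    {
        if (0 < npersocket) {
            return boards * sockets_per_board * npersocket;
        }
        return boards * nperboard;
    }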
/*
 * Create a load balanced mapping for the job by assigning a constant #procs/node, with
 * leftovers being spread one/node starting from the first node.
 */
static int loadbalance(orte_job_t *jdata)
{
    orte_app_context_t *app;
    int i, j;
    opal_list_t node_list;
    orte_std_cntr_t num_nodes, num_slots;
    int rc=ORTE_SUCCESS, np, nprocs;
    int ppn = 0;
    opal_list_item_t *item, *start;
    orte_node_t *node;

    /* setup */
    OBJ_CONSTRUCT(&node_list, opal_list_t);

    /* compute total #procs we are going to add and the total number of nodes available */
    for (i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        /* get the nodes and #slots available for this app_context */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                   jdata->map->policy))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
        if (0 < app->num_procs) {
            np = app->num_procs;
        } else {
            /* set the num_procs to the #slots */
            np = num_slots;
        }
        num_nodes = opal_list_get_size(&node_list);
        /* compute the base ppn */
        ppn = np / num_nodes;
        /* if a bookmark exists from some prior mapping, set us to start there */
        start = orte_rmaps_base_get_starting_point(&node_list, jdata);
        /* loop through the list of nodes until we either assign all the procs
         * or return to the starting point
         */
        item = start;
        nprocs = 0;
        do {
            node = (orte_node_t*)item;
            /* put the specified number of procs on each node */
            for (j=0; j < ppn; j++) {
                if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
                                                                     jdata->map->cpus_per_rank, app->idx,
                                                                     &node_list, jdata->map->oversubscribe,
                                                                     false, NULL))) {
                    /** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have
                     * more procs to place, then that is an error
                     */
                    if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc) ||
                        j < ppn-1) {
                        ORTE_ERROR_LOG(rc);
                        goto error;
                    }
                }
                nprocs++;
            }
            /* move to next node */
            if (opal_list_get_end(&node_list) == opal_list_get_next(item)) {
                item = opal_list_get_first(&node_list);
            } else {
                item = opal_list_get_next(item);
            }
        } while (item != start && nprocs < np);

        /* save the bookmark */
        jdata->bookmark = node;

        /* if we haven't assigned all the procs, then loop through the list
         * again, assigning 1 per node until all are assigned
         */
        item = start;
        while (nprocs < np) {
            node = (orte_node_t*)item;
            if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
                                                                 jdata->map->cpus_per_rank, app->idx,
                                                                 &node_list, jdata->map->oversubscribe,
                                                                 false, NULL))) {
                /* if the code is not ORTE_ERR_NODE_FULLY_USED, then that is an error */
                if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc)) {
                    ORTE_ERROR_LOG(rc);
                    goto error;
                }
            }
            nprocs++;
            /* move to next node */
            if (opal_list_get_end(&node_list) == opal_list_get_next(item)) {
                item = opal_list_get_first(&node_list);
            } else {
                item = opal_list_get_next(item);
            }
        }
        /* save the bookmark */
        jdata->bookmark = node;

        /* cleanup */
        while (NULL != (item = opal_list_remove_first(&node_list))) {
            OBJ_RELEASE(item);
        }
        /* if the user requested a specific number of procs and
         * the total number of procs we were able to assign
         * doesn't equal the number requested, then we have a
         * problem
         */
        if (0 < app->num_procs && nprocs < app->num_procs) {
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:too-many-procs", true,
                           app->app, app->num_procs,
                           "number of slots", nprocs,
                           "number of nodes", num_nodes);
            return ORTE_ERR_SILENT;
        }
        /* update the number of procs in the job */
        jdata->num_procs += nprocs;
        /* compute vpids and add proc objects to the job - this has to be
         * done after each app_context is mapped in order to keep the
         * vpids contiguous within an app_context
         */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    }

 error:
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);

    return rc;
}
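The distribution loadbalance() produces is a base of np / num_nodes procs on every node, with the np % num_nodes leftovers spread one per node from the front of the list. A compact sketch of the resulting shape (illustrative only; "loadbalance_shape" is not part of the commit):

    /* Fill procs_per_node[] with the counts loadbalance() converges on. */
    static void loadbalance_shape(int np, int num_nodes, int *procs_per_node)
    {
        int base = np / num_nodes, extra = np % num_nodes, i;
        for (i = 0; i < num_nodes; i++) {
            procs_per_node[i] = base + ((i < extra) ? 1 : 0);
        }
    }

For example, np=10 over 4 nodes yields 3, 3, 2, 2.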
@ -1,45 +0,0 @@
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2006 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
/**
 * @file
 *
 * Resource Mapping
 */
#ifndef ORTE_RMAPS_LB_H
#define ORTE_RMAPS_LB_H

#include "orte_config.h"
#include "orte/mca/rmaps/rmaps.h"

BEGIN_C_DECLS

struct orte_rmaps_lb_component_t {
    orte_rmaps_base_component_t super;
    int npernode;
    int nperboard;
    int npersocket;
};
typedef struct orte_rmaps_lb_component_t orte_rmaps_lb_component_t;

ORTE_MODULE_DECLSPEC extern orte_rmaps_lb_component_t mca_rmaps_load_balance_component;
extern orte_rmaps_base_module_t orte_rmaps_load_balance_module;

END_C_DECLS

#endif
@ -1,143 +0,0 @@
/*
 * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "orte_config.h"
#include "orte/constants.h"

#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/paffinity/paffinity.h"

#include "orte/mca/rmaps/base/base.h"
#include "rmaps_lb.h"

/*
 * Local functions
 */

static int orte_rmaps_lb_open(void);
static int orte_rmaps_lb_close(void);
static int orte_rmaps_lb_query(mca_base_module_t **module, int *priority);

static int my_priority;

orte_rmaps_lb_component_t mca_rmaps_load_balance_component = {
    {
        {
            ORTE_RMAPS_BASE_VERSION_2_0_0,

            "load_balance", /* MCA component name */
            ORTE_MAJOR_VERSION, /* MCA component major version */
            ORTE_MINOR_VERSION, /* MCA component minor version */
            ORTE_RELEASE_VERSION, /* MCA component release version */
            orte_rmaps_lb_open, /* component open */
            orte_rmaps_lb_close, /* component close */
            orte_rmaps_lb_query /* component query */
        },
        {
            /* The component is checkpoint ready */
            MCA_BASE_METADATA_PARAM_CHECKPOINT
        }
    }
};


/**
 * component open/close/init function
 */
static int orte_rmaps_lb_open(void)
{
    mca_base_component_t *c = &mca_rmaps_load_balance_component.super.base_version;
    int value, tmp;

    /* initialize */
    mca_rmaps_load_balance_component.npernode = 0;
    mca_rmaps_load_balance_component.nperboard = 0;
    mca_rmaps_load_balance_component.npersocket = 0;

    mca_base_param_reg_int(c, "priority",
                           "Priority of the loadbalance rmaps component",
                           false, false, 80,
                           &my_priority);

    /* check for procs/xxx directives */
    tmp = mca_base_param_reg_int(c, "pernode",
                                 "Launch one ppn as directed",
                                 false, false, (int)false, NULL);
    mca_base_param_reg_syn_name(tmp, "rmaps", "base_pernode", false);
    mca_base_param_lookup_int(tmp, &value);
    if (value) {
        mca_rmaps_load_balance_component.npernode = 1;
    }

    /* #procs/node */
    tmp = mca_base_param_reg_int(c, "n_pernode",
                                 "Launch n procs/node",
                                 false, false, mca_rmaps_load_balance_component.npernode, NULL);
    mca_base_param_reg_syn_name(tmp, "rmaps", "base_n_pernode", false);
    mca_base_param_lookup_int(tmp, &mca_rmaps_load_balance_component.npernode);

    /* #procs/board */
    tmp = mca_base_param_reg_int(c, "n_perboard",
                                 "Launch n procs/board",
                                 false, false, -1, NULL);
    mca_base_param_reg_syn_name(tmp, "rmaps", "base_n_perboard", false);
    mca_base_param_lookup_int(tmp, &mca_rmaps_load_balance_component.nperboard);
    if (0 < mca_rmaps_load_balance_component.nperboard) {
        ORTE_ADD_MAPPING_POLICY(ORTE_MAPPING_NPERXXX);
    }

    /* #procs/socket */
    tmp = mca_base_param_reg_int(c, "n_persocket",
                                 "Launch n procs/socket",
                                 false, false, -1, NULL);
    mca_base_param_reg_syn_name(tmp, "rmaps", "base_n_persocket", false);
    mca_base_param_lookup_int(tmp, &mca_rmaps_load_balance_component.npersocket);
    if (0 < mca_rmaps_load_balance_component.npersocket) {
        ORTE_ADD_MAPPING_POLICY(ORTE_MAPPING_NPERXXX);
        /* force bind to socket if not overridden by user */
        ORTE_XSET_BINDING_POLICY(ORTE_BIND_TO_SOCKET);
    }

    return ORTE_SUCCESS;
}


static int orte_rmaps_lb_query(mca_base_module_t **module, int *priority)
{
    /* after rr, unless lb values are set */
    if (0 < mca_rmaps_load_balance_component.npernode ||
        0 < mca_rmaps_load_balance_component.nperboard ||
        0 < mca_rmaps_load_balance_component.npersocket) {
        my_priority = 10000;
    }
    *priority = my_priority;
    *module = (mca_base_module_t *)&orte_rmaps_load_balance_module;
    return ORTE_SUCCESS;
}

/**
 * Close all subsystems.
 */

static int orte_rmaps_lb_close(void)
{
    return ORTE_SUCCESS;
}

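The open function above shows the registration idiom this codebase uses throughout: register a component-level integer parameter, expose a framework-level synonym, then read back the resolved value. A distilled sketch of that three-call pattern; "example" is a hypothetical parameter name used only for illustration:

    /* Register <component>_example with an rmaps_base_example synonym
     * and return its resolved value. */
    static int reg_example_param(mca_base_component_t *c)
    {
        int idx, value;
        idx = mca_base_param_reg_int(c, "example",
                                     "An illustrative integer parameter",
                                     false, false, 0, NULL);
        mca_base_param_reg_syn_name(idx, "rmaps", "base_example", false);
        mca_base_param_lookup_int(idx, &value);
        return value;
    }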
@ -1,26 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2011      Cisco Systems, Inc.  All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

# MCA_orte_rmaps_ppr_CONFIG([action-if-found], [action-if-not-found])
# -------------------------------------------------------------------------
AC_DEFUN([MCA_orte_rmaps_ppr_CONFIG],[
    AC_REQUIRE([MCA_opal_hwloc_CONFIG_REQUIRE])
    AC_CONFIG_FILES([orte/mca/rmaps/ppr/Makefile])

    # All we check for is whether $OPAL_HAVE_HWLOC is 1.
    # See big comment in opal/mca/hwloc/configure.m4.
    AC_MSG_CHECKING([if hwloc is enabled])
    AS_IF([test $OPAL_HAVE_HWLOC -eq 1],
          [AC_MSG_RESULT([yes])
           $1],
          [AC_MSG_RESULT([no])
           $2])
])dnl
@ -1,23 +1,12 @@
# -*- text -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
#                         University Research and Technology
#                         Corporation.  All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
#                         of Tennessee Research Foundation.  All rights
#                         reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
#                         University of Stuttgart.  All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
#                         All rights reserved.
# Copyright (c) 2011      Cisco Systems, Inc.  All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for Open RTE's orterun.
#
#
[invalid-ppr]
An invalid value was given for the number of processes
@ -29,37 +29,48 @@
#include "orte/mca/rmaps/base/base.h"
#include "rmaps_ppr.h"

static int ppr(orte_job_t *jdata);
static int ppr_mapper(orte_job_t *jdata);

orte_rmaps_base_module_t orte_rmaps_ppr_module = {
    ppr
    ppr_mapper
};

static orte_proc_t* setup_proc(orte_job_t *jdata, orte_node_t *node,
static orte_proc_t* setup_proc(orte_job_t *jdata,
                               orte_node_t *node,
                               orte_app_idx_t idx);

#if OPAL_HAVE_HWLOC
static void prune(orte_jobid_t jobid,
                  orte_app_idx_t app_idx,
                  orte_node_t *node,
                  opal_hwloc_level_t *level,
                  orte_vpid_t *nmapped);
#endif

static int ppr(orte_job_t *jdata)
static int ppr[OPAL_HWLOC_HWTHREAD_LEVEL+1];

static int ppr_mapper(orte_job_t *jdata)
{
    int rc, local_limit, j;
    orte_rmaps_ppr_component_t *c = &mca_rmaps_ppr_component;
    int rc, j, n;
    mca_base_component_t *c=&mca_rmaps_ppr_component.base_version;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_app_context_t *app;
    orte_vpid_t total_procs, nprocs_mapped;
    opal_hwloc_level_t level, start=OPAL_HWLOC_NODE_LEVEL;
#if OPAL_HAVE_HWLOC
    hwloc_obj_t obj;
    hwloc_obj_type_t lowest;
    opal_hwloc_level_t level;
    unsigned cache_level;
    unsigned cache_level=0;
    unsigned int nobjs, i;
#endif
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_std_cntr_t num_slots;
    unsigned int nobjs, i;
    orte_app_idx_t idx;
    char **ppr_req, **ck;
    size_t len;
    bool pruning_reqd = false;

    /* only handle initial launch of loadbalanced
     * or NPERxxx jobs - allow restarting of failed apps
@ -71,37 +82,138 @@ static int ppr(orte_job_t *jdata)
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL != jdata->map->req_mapper &&
        0 != strcasecmp(jdata->map->req_mapper, c->super.base_version.mca_component_name)) {
        0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
        /* a mapper has been specified, and it isn't me */
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:ppr: job %s not using ppr mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL == jdata->map->ppr ||
        !(ORTE_MAPPING_PPR & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping))) {
        /* not for us */
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:ppr: job %s not using ppr mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:ppr: mapping job %s",
                        ORTE_JOBID_PRINT(jdata->jobid));
                        "mca:rmaps:ppr: mapping job %s with ppr %s",
                        ORTE_JOBID_PRINT(jdata->jobid), jdata->map->ppr);

    /* flag that I did the mapping */
    if (NULL != jdata->map->last_mapper) {
        free(jdata->map->last_mapper);
    }
    jdata->map->last_mapper = strdup(c->super.base_version.mca_component_name);
    jdata->map->last_mapper = strdup(c->mca_component_name);

    /* initialize */
    memset(ppr, 0, OPAL_HWLOC_HWTHREAD_LEVEL * sizeof(opal_hwloc_level_t));

    /* parse option */
    n=0;
    ppr_req = opal_argv_split(jdata->map->ppr, ',');
    for (j=0; NULL != ppr_req[j]; j++) {
        /* split on the colon */
        ck = opal_argv_split(ppr_req[j], ':');
        if (2 != opal_argv_count(ck)) {
            /* must provide a specification */
            orte_show_help("help-orte-rmaps-ppr.txt", "invalid-ppr", true, jdata->map->ppr);
            opal_argv_free(ppr_req);
            opal_argv_free(ck);
            return ORTE_ERR_SILENT;
        }
        len = strlen(ck[1]);
        if (0 == strncasecmp(ck[1], "node", len)) {
            ppr[OPAL_HWLOC_NODE_LEVEL] = strtol(ck[0], NULL, 10);
            ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYNODE);
            start = OPAL_HWLOC_NODE_LEVEL;
            n++;
#if OPAL_HAVE_HWLOC
        } else if (0 == strncasecmp(ck[1], "hwthread", len) ||
                   0 == strncasecmp(ck[1], "thread", len)) {
            ppr[OPAL_HWLOC_HWTHREAD_LEVEL] = strtol(ck[0], NULL, 10);
            start = OPAL_HWLOC_HWTHREAD_LEVEL;
            ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYHWTHREAD);
            n++;
        } else if (0 == strncasecmp(ck[1], "core", len)) {
            ppr[OPAL_HWLOC_CORE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_CORE_LEVEL) {
                start = OPAL_HWLOC_CORE_LEVEL;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYCORE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "socket", len) ||
                   0 == strncasecmp(ck[1], "skt", len)) {
            ppr[OPAL_HWLOC_SOCKET_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_SOCKET_LEVEL) {
                start = OPAL_HWLOC_SOCKET_LEVEL;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSOCKET);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "l1cache", len)) {
            ppr[OPAL_HWLOC_L1CACHE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_L1CACHE_LEVEL) {
                start = OPAL_HWLOC_L1CACHE_LEVEL;
                cache_level = 1;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL1CACHE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "l2cache", len)) {
            ppr[OPAL_HWLOC_L2CACHE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_L2CACHE_LEVEL) {
                start = OPAL_HWLOC_L2CACHE_LEVEL;
                cache_level = 2;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL2CACHE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "l3cache", len)) {
            ppr[OPAL_HWLOC_L3CACHE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_L3CACHE_LEVEL) {
                start = OPAL_HWLOC_L3CACHE_LEVEL;
                cache_level = 3;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL3CACHE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "numa", len)) {
            ppr[OPAL_HWLOC_NUMA_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_NUMA_LEVEL) {
                start = OPAL_HWLOC_NUMA_LEVEL;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYNUMA);
            }
            n++;
#endif
        } else {
            /* unknown spec */
            orte_show_help("help-orte-rmaps-ppr.txt", "unrecognized-ppr-option", true, ck[1], jdata->map->ppr);
            opal_argv_free(ppr_req);
            opal_argv_free(ck);
            return ORTE_ERR_SILENT;
        }
        opal_argv_free(ck);
    }
    opal_argv_free(ppr_req);
    /* if nothing was given, that's an error */
    if (0 == n) {
        opal_output(0, "NOTHING GIVEN");
        return ORTE_ERR_SILENT;
    }
    /* if more than one level was specified, then pruning will be reqd */
    if (1 < n) {
        pruning_reqd = true;
    }

    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:ppr: job %s assigned policy %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        orte_rmaps_base_print_mapping(jdata->map->mapping));

    /* convenience */
    local_limit = mca_rmaps_ppr_component.ppr[mca_rmaps_ppr_component.start];
    level = mca_rmaps_ppr_component.start;

    /* find the lowest level that was defined in the ppr */
    lowest = opal_hwloc_levels[mca_rmaps_ppr_component.start];
    if (OPAL_HWLOC_L3CACHE_LEVEL == mca_rmaps_ppr_component.start) {
        cache_level = 3;
    } else if (OPAL_HWLOC_L2CACHE_LEVEL == mca_rmaps_ppr_component.start) {
        cache_level = 2;
    } else if (OPAL_HWLOC_L1CACHE_LEVEL == mca_rmaps_ppr_component.start) {
        cache_level = 1;
    }
    level = start;
#if OPAL_HAVE_HWLOC
    lowest = opal_hwloc_levels[start];
#endif

    for (idx=0; idx < (orte_app_idx_t)jdata->apps->size; idx++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx))) {
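The hunk above parses ppr strings of the form "n:resource" separated by commas (for example "2:socket" or "1:node,2:socket") via opal_argv_split. A compact, self-contained sketch of the same term parsing using plain libc, for illustration only - not the committed implementation:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Parse one "n:resource" ppr term; returns 0 on success. */
    static int parse_ppr_term(const char *term, int *n,
                              char *resource, size_t rlen)
    {
        const char *colon = strchr(term, ':');
        if (NULL == colon || colon == term || '\0' == colon[1]) {
            return -1;  /* must look like "<count>:<level>" */
        }
        *n = (int)strtol(term, NULL, 10);   /* strtol stops at the colon */
        snprintf(resource, rlen, "%s", colon + 1);
        return 0;
    }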
@ -121,7 +233,7 @@ static int ppr(orte_job_t *jdata)
        /* get the available nodes */
        OBJ_CONSTRUCT(&node_list, opal_list_t);
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                   jdata->map->policy))) {
                                                                   jdata->map->mapping))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
@ -129,6 +241,7 @@ static int ppr(orte_job_t *jdata)
        /* cycle across the nodes */
        nprocs_mapped = 0;
        while (NULL != (node = (orte_node_t*)opal_list_remove_first(&node_list))) {
#if OPAL_HAVE_HWLOC
            /* bozo check */
            if (NULL == node->topology) {
                orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing",
@ -136,6 +249,7 @@ static int ppr(orte_job_t *jdata)
                rc = ORTE_ERR_SILENT;
                goto error;
            }
#endif
            /* add the node to the map */
            if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
                ORTE_ERROR_LOG(rc);
@ -146,15 +260,21 @@ static int ppr(orte_job_t *jdata)
            /* if we are mapping solely at the node level, just put
             * that many procs on this node
             */
            if (HWLOC_OBJ_MACHINE == lowest) {
                for (j=0; j < local_limit && nprocs_mapped < total_procs; j++) {
            if (OPAL_HWLOC_NODE_LEVEL == start) {
#if OPAL_HAVE_HWLOC
                obj = hwloc_get_root_obj(node->topology);
#endif
                for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) {
                    if (NULL == (proc = setup_proc(jdata, node, idx))) {
                        rc = ORTE_ERR_OUT_OF_RESOURCE;
                        goto error;
                    }
                    nprocs_mapped++;
#if OPAL_HAVE_HWLOC
                    proc->locale = obj;
#endif
                }
#if OPAL_HAVE_HWLOC
            } else {
                /* get the number of lowest resources on this node */
                nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology,
@ -168,7 +288,7 @@ static int ppr(orte_job_t *jdata)
                    obj = opal_hwloc_base_get_obj_by_type(node->topology,
                                                          lowest, cache_level,
                                                          i, OPAL_HWLOC_AVAILABLE);
                    for (j=0; j < local_limit && nprocs_mapped < total_procs; j++) {
                    for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) {
                        if (NULL == (proc = setup_proc(jdata, node, idx))) {
                            rc = ORTE_ERR_OUT_OF_RESOURCE;
                            goto error;
@ -178,7 +298,7 @@ static int ppr(orte_job_t *jdata)
                    }
                }

                if (mca_rmaps_ppr_component.pruning_reqd) {
                if (pruning_reqd) {
                    /* go up the ladder and prune the procs according to
                     * the specification, adjusting the count of procs on the
                     * node as we go
@ -186,6 +306,7 @@ static int ppr(orte_job_t *jdata)
                    level--;
                    prune(jdata->jobid, idx, node, &level, &nprocs_mapped);
                }
#endif
            }

            /* set the total slots used to the number of procs placed
@ -197,12 +318,18 @@ static int ppr(orte_job_t *jdata)
             * we have violated the total slot specification - regardless,
             * if slots_max was given, we are not allowed to violate it!
             */
            if ((!(jdata->map->oversubscribe) && node->slots < node->slots_inuse) ||
            if ((node->slots < node->slots_inuse) ||
                (0 < node->slots_max && node->slots_max < node->slots_inuse)) {
                orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                               true, node->num_procs, app->app);
                rc = ORTE_ERR_SILENT;
                goto error;
                if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
                    orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                   true, node->num_procs, app->app);
                    rc = ORTE_ERR_SILENT;
                    goto error;
                }
                /* flag the node as oversubscribed so that sched-yield gets
                 * properly set
                 */
                node->oversubscribed = true;
            }

        /* update the number of procs in the job and the app */
@ -216,10 +343,10 @@ static int ppr(orte_job_t *jdata)
            break;
        }
    }
    if (nprocs_mapped < total_procs) {
    if (ORTE_VPID_MAX != total_procs && nprocs_mapped < total_procs) {
        /* couldn't map them all */
        orte_show_help("help-orte-rmaps-ppr.txt", "ppr-too-many-procs",
                       true, app->app, app->num_procs, mca_rmaps_ppr_component.given_ppr);
                       true, app->app, app->num_procs, jdata->map->ppr);
        rc = ORTE_ERR_SILENT;
        goto error;
    }
@ -231,17 +358,6 @@ static int ppr(orte_job_t *jdata)
    }


    /* compute and save local ranks */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
        ORTE_ERROR_LOG(rc);
        goto error;
    }

    /* define the daemons that we will use for this job */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(jdata))) {
        ORTE_ERROR_LOG(rc);
    }

 error:
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
@ -250,6 +366,7 @@ static int ppr(orte_job_t *jdata)
    return rc;
}

#if OPAL_HAVE_HWLOC
static hwloc_obj_t find_split(hwloc_topology_t topo, hwloc_obj_t obj)
{
    unsigned k;
@ -296,7 +413,7 @@ static void prune(orte_jobid_t jobid,

    /* convenience */
    lvl = opal_hwloc_levels[ll];
    limit = mca_rmaps_ppr_component.ppr[ll];
    limit = ppr[ll];

    if (0 == limit) {
        /* no limit at this level, so move up if necessary */
@ -440,6 +557,7 @@ static void prune(orte_jobid_t jobid,
 error:
    opal_output(0, "INFINITE LOOP");
}
#endif

static orte_proc_t* setup_proc(orte_job_t *jdata,
                               orte_node_t *node,
@ -18,17 +18,7 @@

BEGIN_C_DECLS

struct orte_rmaps_ppr_component_t {
    orte_rmaps_base_component_t super;
    char *given_ppr;
    bool selected;
    bool pruning_reqd;
    int ppr[OPAL_HWLOC_HWTHREAD_LEVEL];
    opal_hwloc_level_t start;
};
typedef struct orte_rmaps_ppr_component_t orte_rmaps_ppr_component_t;

ORTE_MODULE_DECLSPEC extern orte_rmaps_ppr_component_t mca_rmaps_ppr_component;
ORTE_MODULE_DECLSPEC extern orte_rmaps_base_component_t mca_rmaps_ppr_component;
extern orte_rmaps_base_module_t orte_rmaps_ppr_module;


@ -26,23 +26,21 @@ static int orte_rmaps_ppr_open(void);
static int orte_rmaps_ppr_close(void);
static int orte_rmaps_ppr_query(mca_base_module_t **module, int *priority);

orte_rmaps_ppr_component_t mca_rmaps_ppr_component = {
orte_rmaps_base_component_t mca_rmaps_ppr_component = {
    {
        {
            ORTE_RMAPS_BASE_VERSION_2_0_0,
        ORTE_RMAPS_BASE_VERSION_2_0_0,

            "ppr", /* MCA component name */
            ORTE_MAJOR_VERSION, /* MCA component major version */
            ORTE_MINOR_VERSION, /* MCA component minor version */
            ORTE_RELEASE_VERSION, /* MCA component release version */
            orte_rmaps_ppr_open, /* component open */
            orte_rmaps_ppr_close, /* component close */
            orte_rmaps_ppr_query /* component query */
        },
        {
            /* The component is checkpoint ready */
            MCA_BASE_METADATA_PARAM_CHECKPOINT
        }
        "ppr", /* MCA component name */
        ORTE_MAJOR_VERSION, /* MCA component major version */
        ORTE_MINOR_VERSION, /* MCA component minor version */
        ORTE_RELEASE_VERSION, /* MCA component release version */
        orte_rmaps_ppr_open, /* component open */
        orte_rmaps_ppr_close, /* component close */
        orte_rmaps_ppr_query /* component query */
    },
    {
        /* The component is checkpoint ready */
        MCA_BASE_METADATA_PARAM_CHECKPOINT
    }
};

@ -52,101 +50,104 @@ orte_rmaps_ppr_component_t mca_rmaps_ppr_component = {
 */
static int orte_rmaps_ppr_open(void)
{
    char **ppr, *ctmp, **ck;
    int i, n;
    size_t value;
    opal_hwloc_level_t start=OPAL_HWLOC_NODE_LEVEL;
    int tmp, value;
    mca_base_component_t *c=&mca_rmaps_ppr_component.base_version;

    /* initialize */
    mca_rmaps_ppr_component.selected = false;
    mca_rmaps_ppr_component.pruning_reqd = false;
    memset(mca_rmaps_ppr_component.ppr, 0, OPAL_HWLOC_HWTHREAD_LEVEL * sizeof(opal_hwloc_level_t));
    n=0;

    mca_base_param_reg_string(&mca_rmaps_ppr_component.super.base_version,
                              "pattern",
                              "Comma-separate list of number of processes on a given resource type [default: none]",
                              false, false, NULL, &mca_rmaps_ppr_component.given_ppr);
    ctmp = mca_rmaps_ppr_component.given_ppr;
    if (NULL != ctmp) {
        ppr = opal_argv_split(ctmp, ',');

        /* check validity of mppr spec */
        for (i=0; NULL != ppr[i]; i++) {
            /* split on the colon */
            ck = opal_argv_split(ppr[i], ':');
            if (2 != opal_argv_count(ck)) {
                /* must provide a specification */
                orte_show_help("help-orte-rmaps-ppr.txt", "invalid-ppr", true, ctmp);
                opal_argv_free(ppr);
                opal_argv_free(ck);
                free(ctmp);
                return ORTE_ERR_SILENT;
            }
            value = strlen(ck[1]);
            if (0 == strncasecmp(ck[1], "hwthread", value) ||
                0 == strncasecmp(ck[1], "thread", value)) {
                mca_rmaps_ppr_component.ppr[OPAL_HWLOC_HWTHREAD_LEVEL] = strtol(ck[0], NULL, 10);
                start = OPAL_HWLOC_HWTHREAD_LEVEL;
                n++;
            } else if (0 == strncasecmp(ck[1], "core", value)) {
                mca_rmaps_ppr_component.ppr[OPAL_HWLOC_CORE_LEVEL] = strtol(ck[0], NULL, 10);
                if (start < OPAL_HWLOC_CORE_LEVEL) {
                    start = OPAL_HWLOC_CORE_LEVEL;
                }
                n++;
            } else if (0 == strncasecmp(ck[1], "socket", value) ||
                       0 == strncasecmp(ck[1], "skt", value)) {
                mca_rmaps_ppr_component.ppr[OPAL_HWLOC_SOCKET_LEVEL] = strtol(ck[0], NULL, 10);
                if (start < OPAL_HWLOC_SOCKET_LEVEL) {
                    start = OPAL_HWLOC_SOCKET_LEVEL;
                }
                n++;
            } else if (0 == strncasecmp(ck[1], "l1cache", value)) {
                mca_rmaps_ppr_component.ppr[OPAL_HWLOC_L1CACHE_LEVEL] = strtol(ck[0], NULL, 10);
                if (start < OPAL_HWLOC_L1CACHE_LEVEL) {
                    start = OPAL_HWLOC_L1CACHE_LEVEL;
                }
                n++;
            } else if (0 == strncasecmp(ck[1], "l2cache", value)) {
                mca_rmaps_ppr_component.ppr[OPAL_HWLOC_L2CACHE_LEVEL] = strtol(ck[0], NULL, 10);
                if (start < OPAL_HWLOC_L2CACHE_LEVEL) {
                    start = OPAL_HWLOC_L2CACHE_LEVEL;
                }
                n++;
            } else if (0 == strncasecmp(ck[1], "l3cache", value)) {
                mca_rmaps_ppr_component.ppr[OPAL_HWLOC_L3CACHE_LEVEL] = strtol(ck[0], NULL, 10);
                if (start < OPAL_HWLOC_L3CACHE_LEVEL) {
                    start = OPAL_HWLOC_L3CACHE_LEVEL;
                }
                n++;
            } else if (0 == strncasecmp(ck[1], "numa", value)) {
                mca_rmaps_ppr_component.ppr[OPAL_HWLOC_NUMA_LEVEL] = strtol(ck[0], NULL, 10);
                if (start < OPAL_HWLOC_NUMA_LEVEL) {
                    start = OPAL_HWLOC_NUMA_LEVEL;
                }
                n++;
            } else if (0 == strncasecmp(ck[1], "node", value)) {
                mca_rmaps_ppr_component.ppr[OPAL_HWLOC_NODE_LEVEL] = strtol(ck[0], NULL, 10);
                n++;
            } else {
                /* unknown spec */
                orte_show_help("help-orte-rmaps-ppr.txt", "unrecognized-ppr-option", true, ck[1], ctmp);
                opal_argv_free(ppr);
                opal_argv_free(ck);
                free(ctmp);
                return ORTE_ERR_SILENT;
            }
            opal_argv_free(ck);
    /* check for pernode, npernode, and npersocket directives - reqd for backward compatibility */
    tmp = mca_base_param_reg_int(c, "pernode",
                                 "Launch one ppn as directed",
                                 false, false, (int)false, NULL);
    mca_base_param_reg_syn_name(tmp, "rmaps", "base_pernode", false);
    mca_base_param_lookup_int(tmp, &value);
    if (value) {
        if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
            /* if a non-default mapping is already specified, then we
             * have an error
             */
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
                           "PERNODE", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
            ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_CONFLICTED);
            return ORTE_ERR_SILENT;
        }
        opal_argv_free(ppr);
        mca_rmaps_ppr_component.selected = true;
        mca_rmaps_ppr_component.start = start;
        /* if more than one level was specified, then pruning will be reqd */
        if (1 < n) {
            mca_rmaps_ppr_component.pruning_reqd = true;
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_PPR);
        ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYNODE);
        orte_rmaps_base.ppr = strdup("1:node");
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
    }

    tmp = mca_base_param_reg_int(c, "n_pernode",
                                 "Launch n procs/node",
                                 false, false, (int)false, NULL);
    mca_base_param_reg_syn_name(tmp, "rmaps", "base_n_pernode", false);
    mca_base_param_lookup_int(tmp, &value);
    if (value) {
        if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
            /* if a non-default mapping is already specified, then we
             * have an error
             */
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
                           "NPERNODE", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
            ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_CONFLICTED);
            return ORTE_ERR_SILENT;
        }
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_PPR);
        ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYNODE);
        asprintf(&orte_rmaps_base.ppr, "%d:node", value);
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
    }

#if OPAL_HAVE_HWLOC
    {
        char *ppr;

        tmp = mca_base_param_reg_int(c, "n_persocket",
                                     "Launch n procs/socket",
                                     false, false, (int)false, NULL);
        mca_base_param_reg_syn_name(tmp, "rmaps", "base_n_persocket", false);
        mca_base_param_lookup_int(tmp, &value);
        if (value) {
            if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
                /* if a non-default mapping is already specified, then we
                 * have an error
                 */
                orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
                               "NPERSOCKET", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
                ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_CONFLICTED);
                return ORTE_ERR_SILENT;
            }
            ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_PPR);
            ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYSOCKET);
            /* this implies binding to the sockets, unless otherwise directed */
            if (!OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) {
                OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_SOCKET);
                opal_hwloc_binding_policy |= OPAL_BIND_GIVEN;
            }
            asprintf(&orte_rmaps_base.ppr, "%d:socket", value);
            ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
        }

        mca_base_param_reg_string(c, "pattern",
                                  "Comma-separated list of number of processes on a given resource type [default: none]",
                                  false, false, NULL, &ppr);
        if (NULL != ppr) {
            if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
                /* if a non-default mapping is already specified, then we
                 * have an error
                 */
                orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
                               "PPR", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
                ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_CONFLICTED);
                return ORTE_ERR_SILENT;
            }
            ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_PPR);
            /* since we don't know what pattern was given, leave the policy undefined
             * for now - we will assign it when we analyze the pattern later
             */
            orte_rmaps_base.ppr = ppr;
            ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
        }
    }
#endif

    return ORTE_SUCCESS;
}
@ -154,16 +155,9 @@ static int orte_rmaps_ppr_open(void)

static int orte_rmaps_ppr_query(mca_base_module_t **module, int *priority)
{
    if (mca_rmaps_ppr_component.selected) {
        *priority = 1000;
        *module = (mca_base_module_t *)&orte_rmaps_ppr_module;
        return ORTE_SUCCESS;
    }

    /* cannot run without ppr spec */
    *priority = 0;
    *module = NULL;
    return ORTE_ERROR;
    *priority = 90;
    *module = (mca_base_module_t *)&orte_rmaps_ppr_module;
    return ORTE_SUCCESS;
}

/**
@ -172,9 +166,6 @@ static int orte_rmaps_ppr_query(mca_base_module_t **module, int *priority)

static int orte_rmaps_ppr_close(void)
{
    if (NULL != mca_rmaps_ppr_component.given_ppr) {
        free(mca_rmaps_ppr_component.given_ppr);
    }
    return ORTE_SUCCESS;
}

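The new open function keeps the legacy pernode/n_pernode/n_persocket parameters working by rewriting them as ppr strings via asprintf(), as the hunk above shows. A distilled sketch of that translation, for illustration only ("legacy_to_ppr" is hypothetical; asprintf is the GNU/BSD extension the file itself relies on):

    /* Map legacy directives onto the equivalent ppr string. */
    static char *legacy_to_ppr(int npernode, int npersocket)
    {
        char *ppr = NULL;
        if (0 < npernode) {
            asprintf(&ppr, "%d:node", npernode);      /* e.g. "2:node" */
        } else if (0 < npersocket) {
            asprintf(&ppr, "%d:socket", npersocket);  /* e.g. "4:socket" */
        }
        return ppr;  /* NULL when no legacy directive was given */
    }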
@ -1,5 +1,6 @@
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -9,17 +10,24 @@
|
||||
# This is the US/English general help file for rankle utilities.
|
||||
#
# Voltaire
[no-hwloc]
A slot_list containing detailed location info was given, but
hwloc support is not available:

Rank: %d
Slot list: %s

Unfortunately, hwloc support is required for this action.
Please reconfigure OMPI for hwloc if binding to specified
cpus is desired.
[no-rankfile]
Open RTE was unable to open the rankfile:
%s
Check to make sure the path and filename are correct.

usage: mpirun mca rmaps_rankfile_path rankfile ./app
usage: mpirun -mca rmaps_rankfile_path rankfile ./app

all unspecified by rankfile ranks are assigned using
byslot or bynode policy.

example: cat hosfile
example: cat hostfile
host1
host2
host3
@ -89,24 +97,12 @@ at least one that failed to specify the number of processes to execute.
When specifying multiple applications, you must specify how many processes
of each to launch via the -np argument.
#
[orte-rmaps-rf:per-node-and-too-many-procs]
There are not enough nodes in your allocation to satisfy your request to
launch
%d processes on a per-node basis - only %d nodes were available.
[missing-rank]
A rank is missing its location specification:

Either request fewer processes, or obtain a larger allocation.
#
[orte-rmaps-rf:n-per-node-and-too-many-procs]
There are not enough nodes in your allocation to satisfy your request to
launch
%d processes on a %d per-node basis - only %d nodes with a total of %d slots
were available.
Rank: %d
Rank file: %s

Either request fewer processes, or obtain a larger allocation.
#
[orte-rmaps-rf:n-per-node-and-not-enough-slots]
There are not enough slots on the nodes in your allocation to satisfy your
request to launch on a %d process-per-node basis - only %d slots/node were
available.

Either request fewer processes/node, or obtain a larger allocation.
All processes must have their location specified in the rank file. Either
add an entry to the file, or provide a default slot_list to use for
any unspecified ranks.

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
*
* Copyright (c) 2008 Voltaire. All rights reserved
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
@ -36,8 +36,8 @@
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/argv.h"
#include "opal/util/if.h"
#include "opal/util/opal_sos.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/mca/hwloc/base/base.h"

#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
@ -58,224 +58,8 @@ char *orte_rmaps_rank_file_slot_list;
/*
* Local variable
*/
static opal_list_item_t *cur_node_item = NULL;
static opal_pointer_array_t rankmap;

/*
* Create a default mapping for the application, mapping rank by rank_file and
* by node.
*/
static int map_app_by_node(orte_app_context_t* app,
orte_job_t* jdata,
orte_vpid_t vpid_start,
opal_list_t* nodes )
{
int rc = ORTE_SUCCESS;
opal_list_item_t *next;
orte_node_t *node;
orte_std_cntr_t num_alloc = 0;
orte_proc_t *proc;

/* This loop continues until all procs have been mapped or we run
out of resources. We determine that we have "run out of
resources" when all nodes have slots_max processes mapped to them,
thus there are no free slots for a process to be mapped, or we have
hit the soft limit on all nodes and are in a "no oversubscribe" state.
If we still have processes that haven't been mapped yet, then it's an
"out of resources" error.

In this scenario, we rely on the claim_slot function to handle the
oversubscribed case. The claim_slot function will leave a node on the
list until it either reaches slots_max OR reaches the
soft limit and the "no_oversubscribe" flag has been set - at which point,
the node will be removed to prevent any more processes from being mapped to
it. Since we are taking one slot from each node as we cycle through the
list, oversubscription is automatically taken care of via this logic.
*/

while (num_alloc < app->num_procs) {
if (NULL != opal_pointer_array_get_item(&rankmap, vpid_start+num_alloc)) {
/* this rank was already mapped */
++num_alloc;
continue;
}
/** see if any nodes remain unused and available. We need to do this check
* each time since we may remove nodes from the list (as they become fully
* used) as we cycle through the loop */
if(0 >= opal_list_get_size(nodes) ) {
/* No more nodes to allocate :( */
orte_show_help("help-rmaps_rank_file.txt", "orte-rmaps-rf:alloc-error",
true, app->num_procs, app->app);
return ORTE_ERR_SILENT;
}

/* Save the next node we can use before claiming slots, since
* we may need to prune the nodes list removing overused nodes.
* Wrap around to beginning if we are at the end of the list */
if (opal_list_get_end(nodes) == opal_list_get_next(cur_node_item)) {
next = opal_list_get_first(nodes);
}
else {
next = opal_list_get_next(cur_node_item);
}
/* Allocate a slot on this node */
node = (orte_node_t*) cur_node_item;
/* grab the slot - have a new proc object created */
proc = NULL;
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, 1, app->idx,
nodes, jdata->map->oversubscribe, true, &proc))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
* really isn't an error - we just need to break from the loop
* since the node is fully used up. For now, just don't report
* an error
*/
if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc)) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
if (NULL != mca_rmaps_rank_file_component.slot_list) {
proc->slot_list = strdup(mca_rmaps_rank_file_component.slot_list);
}
++num_alloc;
cur_node_item = next;
}

return ORTE_SUCCESS;
}

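The save-next/wrap-around idiom above recurs in both mappers; here is a minimal standalone sketch of it (the helper name advance_node is ours, not part of the commit):

    static opal_list_item_t* advance_node(opal_list_t *nodes,
                                          opal_list_item_t *cur)
    {
        /* wrap to the head once the end sentinel is reached */
        if (opal_list_get_end(nodes) == opal_list_get_next(cur)) {
            return opal_list_get_first(nodes);
        }
        return opal_list_get_next(cur);
    }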
/*
* Create a default mapping for the application, scheduling ranks by rank_file
* and by slot.
*/
static int map_app_by_slot(orte_app_context_t* app,
orte_job_t* jdata,
orte_vpid_t vpid_start,
opal_list_t* nodes )
{
int rc = ORTE_SUCCESS;
orte_std_cntr_t i, num_slots_to_take, num_alloc = 0;
orte_node_t *node;
opal_list_item_t *next;
orte_proc_t *proc;

/* This loop continues until all procs have been mapped or we run
out of resources. We determine that we have "run out of
resources" when either all nodes have slots_max processes mapped to them,
(thus there are no free slots for a process to be mapped), OR all nodes
have reached their soft limit and the user directed us to "no oversubscribe".
If we still have processes that haven't been mapped yet, then it's an
"out of resources" error. */
while ( num_alloc < app->num_procs) {
/** see if any nodes remain unused and available. We need to do this check
* each time since we may remove nodes from the list (as they become fully
* used) as we cycle through the loop */
if(0 >= opal_list_get_size(nodes) ) {
/* Everything is at max usage! :( */
orte_show_help("help-rmaps_rank_file.txt", "orte-rmaps-rf:alloc-error",
true, app->num_procs, app->app);
return ORTE_ERR_SILENT;
}

/* Save the next node we can use before claiming slots, since
* we may need to prune the nodes list removing overused nodes.
* Wrap around to beginning if we are at the end of the list */
if (opal_list_get_end(nodes) == opal_list_get_next(cur_node_item)) {
next = opal_list_get_first(nodes);
} else {
next = opal_list_get_next(cur_node_item);
}

/** declare a shorter name for convenience in the code below */
node = (orte_node_t*) cur_node_item;
/* If we have available slots on this node, claim all of them
* If node_slots == 0, assume 1 slot for that node.
* JJH - is this assumption fully justified?
*
* If we are now oversubscribing the nodes, then we still take:
* (a) if the node has not been used yet, we take a full node_slots
* (b) if some of the slots are in-use, then we take the number of
* remaining slots before hitting the soft limit (node_slots)
* (c) if we are at or above the soft limit, we take a full node_slots
*
* Note: if node_slots is zero, then we always just take 1 slot
*
* We continue this process until either everything is done,
* or all nodes have hit their hard limit. This algorithm ensures we
* fully utilize each node before oversubscribing, and preserves the ratio
* of processes between the nodes thereafter (e.g., if one node has twice as
* many processes as another before oversubscribing, it will continue
* to do so after oversubscribing).
*/

if (0 == node->slots_inuse ||
node->slots_inuse >= node->slots_alloc) {
num_slots_to_take = (node->slots_alloc == 0) ? 1 : node->slots_alloc;
} else {
num_slots_to_take = node->slots_alloc - node->slots_inuse;
}

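    /* To make the slot-claiming arithmetic above concrete, a worked example
     * with hypothetical numbers (not from the commit). For a node with
     * slots_alloc = 4:
     *   slots_inuse = 0  -> take all 4 slots       (fresh node)
     *   slots_inuse = 3  -> take 4 - 3 = 1 slot    (fill to the soft limit)
     *   slots_inuse = 5  -> take a full 4 again    (already oversubscribed)
     * and with slots_alloc = 0 the code always takes exactly 1 slot. */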
/* check if we are in npernode mode - if so, then set the num_slots_to_take
* to the num_per_node
*/
if (0 < jdata->map->npernode) {
num_slots_to_take = jdata->map->npernode;
}

i = 0;
while (num_alloc < app->num_procs && i < num_slots_to_take) {
if (NULL != opal_pointer_array_get_item(&rankmap, vpid_start+num_alloc)) {
/* this rank was already mapped */
++num_alloc;
continue;
}
/* grab the slot - have a new proc object created */
proc = NULL;
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, 1, app->idx,
nodes, jdata->map->oversubscribe, true, &proc))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
* really isn't an error - we just need to break from the loop
* since the node is fully used up. For now, just don't report
* an error
*/
if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc)) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
if (NULL != mca_rmaps_rank_file_component.slot_list) {
proc->slot_list = strdup(mca_rmaps_rank_file_component.slot_list);
}
/* Update the rank */
++num_alloc;
/* track #slots taken */
i++;

/** if all the procs have been mapped OR we have fully used up this node, then
* break from the loop
*/
if(num_alloc == app->num_procs ||
ORTE_ERR_NODE_FULLY_USED == OPAL_SOS_GET_ERROR_CODE(rc)) {
break;
}
}

/* we move on to the next node in all cases EXCEPT if we came
* out of the loop without having taken a full bite AND the
* node is NOT max'd out
*
*/
if (i < (num_slots_to_take-1) &&
ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc)) {
continue;
}
cur_node_item = next;
}

return ORTE_SUCCESS;
}

/*
* Create a rank_file mapping for the job.
*/
@ -284,17 +68,17 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
orte_job_map_t *map;
orte_app_context_t *app=NULL;
orte_std_cntr_t i, k;
orte_vpid_t total_procs;
opal_list_t node_list;
opal_list_item_t *item;
orte_node_t *node, *nd, *root_node;
orte_vpid_t rank, vpid_start;
orte_std_cntr_t num_nodes, num_slots;
orte_rmaps_rank_file_map_t *rfmap;
orte_std_cntr_t slots_per_node, relative_index, tmp_cnt;
orte_std_cntr_t relative_index, tmp_cnt;
int rc;
orte_proc_t *proc;
mca_base_component_t *c = &mca_rmaps_rank_file_component.super.base_version;
char *slots;

/* only handle initial launch of rf job */
if (ORTE_JOB_STATE_INIT != jdata->state) {
@ -311,7 +95,10 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
ORTE_JOBID_PRINT(jdata->jobid));
return ORTE_ERR_TAKE_NEXT_OPTION;
}

if (ORTE_MAPPING_BYUSER != ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping)) {
/* NOT FOR US */
return ORTE_ERR_TAKE_NEXT_OPTION;
}
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rank_file: mapping job %s",
ORTE_JOBID_PRINT(jdata->jobid));
@ -335,7 +122,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)

/* if the number of processes wasn't specified, then we know there can be only
* one app_context allowed in the launch, and that we are to launch it across
* all available slots. We'll double-check the single app_context rule first
* all available slots.
*/
if (0 == app->num_procs && 1 < jdata->num_apps) {
orte_show_help("help-rmaps_rank_file.txt", "orte-rmaps-rf:multi-apps-and-zero-np",
@ -344,24 +131,11 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
goto error;
}

/* likewise, we only support pernode options for a single app_context */
if (0 < map->npernode && 1 < jdata->num_apps) {
orte_show_help("help-rmaps_rank_file.txt", "orte-rmaps-rf:multi-apps-and-zero-np",
true, jdata->num_apps, NULL);
rc = ORTE_ERR_SILENT;
goto error;

}

/* END SANITY CHECKS */

/* flag the map as containing cpu_lists */
map->cpu_lists = true;

/* start at the beginning... */
vpid_start = 0;
jdata->num_procs = 0;
total_procs = 0;
OBJ_CONSTRUCT(&node_list, opal_list_t);
OBJ_CONSTRUCT(&rankmap, opal_pointer_array_t);

@ -384,70 +158,45 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
* option
*/
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
map->policy))) {
map->mapping))) {
ORTE_ERROR_LOG(rc);
goto error;
}
num_nodes = (orte_std_cntr_t)opal_list_get_size(&node_list);

/* we already checked for sanity, so these are okay to just do here */
if (map->npernode == 1) {
/* there are three use-cases that we need to deal with:
* (a) if -np was not provided, then we just use the number of nodes
* (b) if -np was provided AND #procs > #nodes, then error out
* (c) if -np was provided AND #procs <= #nodes, then launch
* the specified #procs one/node. In this case, we just
* leave app->num_procs alone
*/
if (0 == app->num_procs) {
app->num_procs = num_nodes;
} else if (app->num_procs > num_nodes) {
orte_show_help("help-rmaps_rank_file.txt", "orte-rmaps-rf:per-node-and-too-many-procs",
true, app->num_procs, num_nodes, NULL);
rc = ORTE_ERR_SILENT;
goto error;
}
} else if (map->npernode > 1) {
/* first, let's check to see if there are enough slots/node to
* meet the request - error out if not
*/
slots_per_node = num_slots / num_nodes;
if (map->npernode > slots_per_node) {
orte_show_help("help-rmaps_rank_file.txt", "orte-rmaps-rf:n-per-node-and-not-enough-slots",
true, map->npernode, slots_per_node, NULL);
rc = ORTE_ERR_SILENT;
goto error;
}
/* there are three use-cases that we need to deal with:
* (a) if -np was not provided, then we just use the n/node * #nodes
* (b) if -np was provided AND #procs > (n/node * #nodes), then error out
* (c) if -np was provided AND #procs <= (n/node * #nodes), then launch
* the specified #procs n/node. In this case, we just
* leave app->num_procs alone
*/
if (0 == app->num_procs) {
/* set the num_procs to equal the specified num/node * the number of nodes */
app->num_procs = map->npernode * num_nodes;
} else if (app->num_procs > (map->npernode * num_nodes)) {
orte_show_help("help-rmaps_rank_file.txt", "orte-rmaps-rf:n-per-node-and-too-many-procs",
true, app->num_procs, map->npernode, num_nodes, num_slots, NULL);
rc = ORTE_ERR_SILENT;
goto error;
}
} else if (0 == app->num_procs) {
/* we already checked for sanity, so it's okay to just do here */
if (0 == app->num_procs) {
/** set the num_procs to equal the number of slots on these mapped nodes */
app->num_procs = num_slots;
}

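    /* A quick numerical check of the npernode branches above (values are
     * hypothetical, chosen only to illustrate the arithmetic):
     *   npernode = 1, num_nodes = 3:
     *     -np absent -> app->num_procs = 3 (one per node)
     *     -np 5      -> 5 > 3 nodes, so error out
     *   npernode = 4, num_slots = 12, num_nodes = 4:
     *     slots_per_node = 12/4 = 3 < 4 -> not-enough-slots error
     *   npernode = 2, num_nodes = 3, -np 5:
     *     5 <= 2*3 = 6 -> launch 5 procs on a 2-per-node pattern */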
/* keep track of the total #procs in this job */
total_procs += app->num_procs;

for (k=0; k < app->num_procs; k++) {
rank = vpid_start + k;
/* get the rankfile entry for this rank */
if (NULL == (rfmap = (orte_rmaps_rank_file_map_t*)opal_pointer_array_get_item(&rankmap, rank))) {
/* no entry for this rank */
continue;
#if OPAL_HAVE_HWLOC
/* no entry for this rank - if a default slot_list was given,
* then use it instead
*/
if (NULL != opal_hwloc_base_slot_list) {
slots = opal_hwloc_base_slot_list;
} else {
#endif
/* all ranks must be specified */
orte_show_help("help-rmaps_rank_file.txt", "missing-rank", true, rank, orte_rankfile);
rc = ORTE_ERR_SILENT;
goto error;
#if OPAL_HAVE_HWLOC
}
} else {
if (0 == strlen(rfmap->slot_list)) {
/* rank was specified but no slot list given - that's an error */
orte_show_help("help-rmaps_rank_file.txt","no-slot-list", true, rank, rfmap->node_name);
rc = ORTE_ERR_SILENT;
goto error;
}
slots = rfmap->slot_list;
#endif
}

/* find the node where this proc was assigned */
@ -460,51 +209,110 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
0 == strcmp(nd->name, rfmap->node_name)) {
node = nd;
break;
} else if (NULL != rfmap->node_name &&
(('+' == rfmap->node_name[0]) &&
(('n' == rfmap->node_name[1]) ||
('N' == rfmap->node_name[1])))) {
} else if (NULL != rfmap->node_name &&
(('+' == rfmap->node_name[0]) &&
(('n' == rfmap->node_name[1]) ||
('N' == rfmap->node_name[1])))) {

relative_index=atoi(strtok(rfmap->node_name,"+n"));
if ( relative_index >= (int)opal_list_get_size (&node_list) || ( 0 > relative_index)){
orte_show_help("help-rmaps_rank_file.txt","bad-index", true,rfmap->node_name);
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
return ORTE_ERR_BAD_PARAM;
}
root_node = (orte_node_t*) opal_list_get_first(&node_list);
for(tmp_cnt=0; tmp_cnt<relative_index; tmp_cnt++) {
root_node = (orte_node_t*) opal_list_get_next(root_node);
}
node = root_node;
break;
}
relative_index=atoi(strtok(rfmap->node_name,"+n"));
if ( relative_index >= (int)opal_list_get_size (&node_list) || ( 0 > relative_index)){
orte_show_help("help-rmaps_rank_file.txt","bad-index", true,rfmap->node_name);
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
return ORTE_ERR_BAD_PARAM;
}
root_node = (orte_node_t*) opal_list_get_first(&node_list);
for(tmp_cnt=0; tmp_cnt<relative_index; tmp_cnt++) {
root_node = (orte_node_t*) opal_list_get_next(root_node);
}
node = root_node;
break;
}

}
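    /* The branch above implements relative node indexing. An illustration
     * of what it accepts (the entries are hypothetical): a rankfile may name
     * nodes relative to the allocation instead of by hostname, so "+n0" is
     * the first node in node_list and "+n2" the third.
     * strtok(rfmap->node_name, "+n") strips the prefix so atoi() sees only
     * the index; anything outside [0, list size) hits the bad-index error. */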
if (NULL == node) {
orte_show_help("help-rmaps_rank_file.txt","bad-host", true, rfmap->node_name);
return ORTE_ERR_SILENT;
rc = ORTE_ERR_SILENT;
goto error;
}
if (0 == strlen(rfmap->slot_list)) {
/* rank was specified but no slot list given - that's an error */
orte_show_help("help-rmaps_rank_file.txt","no-slot-list", true, rank, rfmap->node_name);
return ORTE_ERR_SILENT;
}
proc = NULL;
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, 1, app->idx,
&node_list, jdata->map->oversubscribe, true, &proc))) {
if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc)) {
/* if this is a true error and not the node just being
* full, then report the error and abort
*/
ORTE_ERROR_LOG(rc);
return rc;
}
/* ensure the node is in the map */
if (!node->mapped) {
OBJ_RETAIN(node);
opal_pointer_array_add(map->nodes, node);
node->mapped = true;
}
proc = OBJ_NEW(orte_proc_t);
/* set the jobid */
proc->name.jobid = jdata->jobid;
proc->name.vpid = rank;
/* Either init or update the epoch. */
ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
/* flag the proc as ready for launch */
proc->state = ORTE_PROC_STATE_INIT;
proc->app_idx = i;

proc->slot_list = strdup(rfmap->slot_list);
OBJ_RETAIN(node); /* maintain accounting on object */
proc->node = node;
proc->nodename = node->name;
node->num_procs++;
if ((node->slots < node->slots_inuse) ||
(0 < node->slots_max && node->slots_max < node->slots_inuse)) {
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
true, node->num_procs, app->app);
rc = ORTE_ERR_SILENT;
goto error;
}
/* flag the node as oversubscribed so that sched-yield gets
* properly set
*/
node->oversubscribed = true;
}
if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(proc);
return rc;
}
/* retain the proc struct so that we correctly track its release */
OBJ_RETAIN(proc);

#if OPAL_HAVE_HWLOC
if (NULL != slots) {
/* setup the bitmap */
hwloc_cpuset_t bitmap;
if (NULL == node->topology) {
/* not allowed - for rank-file, we must have
* the topology info
*/
orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-topology", true, node->name);
rc = ORTE_ERR_SILENT;
goto error;
}
bitmap = hwloc_bitmap_alloc();
/* parse the slot_list to find the socket and core */
if (ORTE_SUCCESS != (rc = opal_hwloc_base_slot_list_parse(slots, node->topology, bitmap))) {
ORTE_ERROR_LOG(rc);
goto error;
}
/* note that we cannot set the proc locale to any specific object
* as the slot list may have assigned it to more than one - so
* leave that field NULL
*/
/* set the proc to the specified map */
hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, bitmap);
/* cleanup */
hwloc_bitmap_free(bitmap);
}
#else
/* if we don't have hwloc, then all the rank_file can contain
* is the node assignment - it cannot contain any directives
* for socket, cores, etc. as we cannot honor them
*/
if (NULL != slots) {
orte_show_help("help-rmaps_rank_file.txt", "no-hwloc", true, rank, slots);
rc = ORTE_ERR_SILENT;
goto error;
}
#endif
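    /* A brief illustrative note on what the hwloc path above stores
     * (the PU numbers are hypothetical): if opal_hwloc_base_slot_list_parse()
     * marks PUs 0, 1 and 4 in the bitmap, hwloc_bitmap_list_asprintf()
     * renders proc->cpu_bitmap as the string "0-1,4" - hwloc's list
     * syntax - which travels with the proc for later binding. */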

/* insert the proc into the proper place */
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs,
proc->name.vpid, proc))) {
@ -518,7 +326,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
/* cleanup the node list - it can differ from one app_context
* to another, so we have to get it every time
*/
while(NULL != (item = opal_list_remove_first(&node_list))) {
while (NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&node_list);
@ -526,94 +334,6 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
}
OBJ_DESTRUCT(&node_list);

/* did we map all the procs, or did the user's rankfile not contain
* a specification for every rank?
*/
if (jdata->num_procs < total_procs) {
/* we need to map the remainder of the procs according to the
* mapping policy
*/
vpid_start = 0;
for(i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}

/* for each app_context, we have to get the list of nodes that it can
* use since that can now be modified with a hostfile and/or -host
* option
*/
OBJ_CONSTRUCT(&node_list, opal_list_t);
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
map->policy))) {
ORTE_ERROR_LOG(rc);
goto error;
}
/* if a bookmark exists from some prior mapping, set us to start there */
if (NULL != jdata->bookmark) {
cur_node_item = NULL;
/* find this node on the list */
for (item = opal_list_get_first(&node_list);
item != opal_list_get_end(&node_list);
item = opal_list_get_next(item)) {
node = (orte_node_t*)item;

if (node->index == jdata->bookmark->index) {
cur_node_item = item;
break;
}
}
/* see if we found it - if not, just start at the beginning */
if (NULL == cur_node_item) {
cur_node_item = opal_list_get_first(&node_list);
}
} else {
/* if no bookmark, then just start at the beginning of the list */
cur_node_item = opal_list_get_first(&node_list);
}
if (map->policy & ORTE_MAPPING_BYNODE) {
rc = map_app_by_node(app, jdata, vpid_start, &node_list);
} else {
rc = map_app_by_slot(app, jdata, vpid_start, &node_list);
}
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto error;
}
vpid_start += app->num_procs;
/* cleanup the node list - it can differ from one app_context
* to another, so we have to get it every time
*/
while(NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&node_list);
}
/* save the bookmark */
jdata->bookmark = (orte_node_t*)cur_node_item;
}

/* update the job's number of procs */
jdata->num_procs = total_procs;

/* compute vpids and add proc objects to the job */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}

/* compute and save convenience values */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}

/* define the daemons that we will use for this job */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}

/* cleanup the rankmap */
for (i=0; i < rankmap.size; i++) {
if (NULL != (rfmap = opal_pointer_array_get_item(&rankmap, i))) {
@ -621,9 +341,9 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
}
}
OBJ_DESTRUCT(&rankmap);
return ORTE_SUCCESS;
return rc;

error:
error:
while(NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
}

@ -11,6 +11,7 @@
* All rights reserved.
* Copyright (c) 2008 Voltaire. All rights reserved
*
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -30,7 +31,6 @@
#include "orte_config.h"

#include "opal/class/opal_object.h"
#include "opal/mca/paffinity/paffinity.h"

#include "orte/mca/rmaps/rmaps.h"

@ -11,6 +11,7 @@
* All rights reserved.
* Copyright (c) 2008 Voltaire. All rights reserved
*
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -25,10 +26,11 @@
#include <string.h>
#endif

#include "orte/mca/ras/ras_types.h"

#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/hwloc/base/base.h"

#include "orte/util/show_help.h"

#include "orte/mca/rmaps/base/base.h"
#include "orte/mca/rmaps/base/rmaps_private.h"
@ -82,17 +84,29 @@ static int orte_rmaps_rank_file_open(void)
false, false, 0,
&my_priority);

/* did the user provide a slot list? */
tmp = mca_base_param_reg_string(c, "slot_list",
"List of processor IDs to bind MPI processes to (e.g., used in conjunction with rank files) [default=NULL]",
false, false, NULL, NULL);
mca_base_param_reg_syn_name(tmp, "rmaps", "base_slot_list", false);
mca_base_param_lookup_string(tmp, &mca_rmaps_rank_file_component.slot_list);
tmp = mca_base_param_reg_string(c, "path",
"Name of the rankfile to be used for mapping processes (relative or absolute path)",
false, false, NULL, NULL);
mca_base_param_reg_syn_name(tmp, "orte", "rankfile", false);
mca_base_param_lookup_string(tmp, &orte_rankfile);

/* ensure we flag mapping by user */
if (NULL != mca_rmaps_rank_file_component.slot_list ||
NULL != orte_rankfile) {
ORTE_ADD_MAPPING_POLICY(ORTE_MAPPING_BYUSER);
#if OPAL_HAVE_HWLOC
if (NULL != opal_hwloc_base_slot_list || NULL != orte_rankfile) {
#else
if (NULL != orte_rankfile) {
#endif
if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
/* if a non-default mapping is already specified, then we
* have an error
*/
orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
"RANK_FILE", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_CONFLICTED);
return ORTE_ERR_SILENT;
}
ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYUSER);
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
/* make us first */
my_priority = 10000;
}

@ -1,5 +1,5 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
@ -59,9 +59,9 @@ static int map_to_ftgrps(orte_job_t *jdata);
static int orte_rmaps_resilient_map(orte_job_t *jdata)
{
orte_app_context_t *app;
int i;
int i, j;
int rc = ORTE_SUCCESS;
orte_node_t *nd=NULL, *oldnode, *node;
orte_node_t *nd=NULL, *oldnode, *node, *nptr;
orte_rmaps_res_ftgrp_t *target = NULL;
orte_proc_t *proc;
orte_vpid_t totprocs;
@ -69,6 +69,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
orte_std_cntr_t num_slots;
opal_list_item_t *item;
mca_base_component_t *c = &mca_rmaps_resilient_component.super.base_version;
bool found;

if (ORTE_JOB_STATE_INIT == jdata->state) {
if (NULL != jdata->map->req_mapper &&
@ -172,7 +173,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list,
&num_slots,
app,
jdata->map->policy))) {
jdata->map->mapping))) {
ORTE_ERROR_LOG(rc);
while (NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
@ -231,25 +232,31 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
}
}
}
/*
* Put the process on the found node (add it if not already in the map)
/* add node to map if necessary - nothing we can do here
* but search for it
*/
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata,
nd,
jdata->map->cpus_per_rank,
proc->app_idx,
NULL,
jdata->map->oversubscribe,
false,
&proc))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
* really isn't an error
*/
if (ORTE_ERR_NODE_FULLY_USED != rc) {
ORTE_ERROR_LOG(rc);
goto error;
found = false;
for (j=0; j < jdata->map->nodes->size; j++) {
if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, j))) {
continue;
}
if (nptr == nd) {
found = true;
break;
}
}
if (!found) {
OBJ_RETAIN(nd);
opal_pointer_array_add(jdata->map->nodes, nd);
nd->mapped = true;
}
OBJ_RETAIN(nd); /* maintain accounting on object */
proc->node = nd;
proc->nodename = nd->name;
nd->num_procs++;
opal_pointer_array_add(nd->procs, (void*)proc);
/* retain the proc struct so that we correctly track its release */
OBJ_RETAIN(proc);

/* flag the proc state as non-launched so we'll know to launch it */
proc->state = ORTE_PROC_STATE_INIT;
@ -259,11 +266,6 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
*/
orte_rmaps_base_update_local_ranks(jdata, oldnode, nd, proc);
}
/* define the daemons that we will use for this job */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}

error:
return rc;
@ -474,7 +476,7 @@ static int get_new_node(orte_proc_t *proc,
if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list,
&num_slots,
app,
map->policy))) {
map->mapping))) {
ORTE_ERROR_LOG(rc);
goto release;
}
@ -716,7 +718,7 @@ static int map_to_ftgrps(orte_job_t *jdata)
*/
OBJ_CONSTRUCT(&node_list, opal_list_t);
if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
map->policy))) {
map->mapping))) {
ORTE_ERROR_LOG(rc);
return rc;
}
@ -813,18 +815,36 @@ static int map_to_ftgrps(orte_job_t *jdata)
"%s rmaps:resilient: placing proc into fault group %d node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == target) ? -1 : target->ftgrp, nd->name));
/* put proc on that node */
proc=NULL;
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, jdata->map->cpus_per_rank, app->idx,
&node_list, jdata->map->oversubscribe, false, &proc))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
* really isn't an error
*/
if (ORTE_ERR_NODE_FULLY_USED != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* if the node isn't in the map, add it */
if (!nd->mapped) {
OBJ_RETAIN(node);
opal_pointer_array_add(map->nodes, nd);
nd->mapped = true;
}
proc = OBJ_NEW(orte_proc_t);
/* set the jobid */
proc->name.jobid = jdata->jobid;
proc->app_idx = app->idx;
OBJ_RETAIN(node); /* maintain accounting on object */
proc->node = nd;
proc->nodename = nd->name;
nd->num_procs++;
if ((nd->slots < nd->slots_inuse) ||
(0 < nd->slots_max && nd->slots_max < nd->slots_inuse)) {
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
true, nd->num_procs, app->app);
return ORTE_ERR_SILENT;
}
/* flag the node as oversubscribed so that sched-yield gets
* properly set
*/
nd->oversubscribed = true;
}
opal_pointer_array_add(nd->procs, (void*)proc);
/* retain the proc struct so that we correctly track its release */
OBJ_RETAIN(proc);

/* flag the proc as ready for launch */
proc->state = ORTE_PROC_STATE_INIT;

@ -864,11 +884,5 @@ static int map_to_ftgrps(orte_job_t *jdata)
return rc;
}

/* define the daemons that we will use for this job */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}

return ORTE_SUCCESS;
}

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -110,13 +111,6 @@ typedef struct orte_rmaps_base_component_2_0_0_t orte_rmaps_base_component_2_0_0
typedef orte_rmaps_base_component_2_0_0_t orte_rmaps_base_component_t;

/**
* Macro for use in components that are of type rmaps
*/
#define ORTE_RMAPS_BASE_VERSION_2_0_0 \
MCA_BASE_VERSION_2_0_0, \
"rmaps", 2, 0, 0

END_C_DECLS

#endif

@ -8,6 +8,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -24,6 +25,7 @@
#include "orte/constants.h"

#include "opal/class/opal_pointer_array.h"
#include "opal/mca/hwloc/hwloc.h"

#include "orte/runtime/orte_globals.h"

@ -33,6 +35,11 @@

BEGIN_C_DECLS

typedef uint16_t orte_mapping_policy_t;
#define ORTE_MAPPING_POLICY OPAL_UINT16
typedef uint16_t orte_ranking_policy_t;
#define ORTE_RANKING_POLICY OPAL_UINT16

/*
* Structure that represents the mapping of a job to an
* allocated set of resources.
@ -42,16 +49,16 @@ struct orte_job_map_t {
/* user-specified mapping params */
char *req_mapper; /* requested mapper */
char *last_mapper; /* last mapper used */
orte_mapping_policy_t policy;
int npernode;
int nperboard;
int npersocket;
orte_mapping_policy_t mapping;
orte_ranking_policy_t ranking;
#if OPAL_HAVE_HWLOC
opal_binding_policy_t binding;
opal_hwloc_level_t bind_level;
#endif
/* mapping options */
char *ppr;
int16_t cpus_per_rank;
int16_t stride;
/* are we allowed to oversubscribe the nodes in this job */
bool oversubscribe;
bool display_map;
bool cpu_lists;
/* *** */
/* number of new daemons required to be launched
* to support this job map
@ -69,6 +76,87 @@ struct orte_job_map_t {
typedef struct orte_job_map_t orte_job_map_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_job_map_t);

/**
* Macro for use in components that are of type rmaps
*/
#define ORTE_RMAPS_BASE_VERSION_2_0_0 \
MCA_BASE_VERSION_2_0_0, \
"rmaps", 2, 0, 0

/* define map-related directives */
#define ORTE_MAPPING_NO_USE_LOCAL 0x0100
#define ORTE_MAPPING_NO_OVERSUBSCRIBE 0x0200
#define ORTE_MAPPING_SUBSCRIBE_GIVEN 0x0400
#define ORTE_MAPPING_SPAN 0x0800
#define ORTE_MAPPING_PPR 0x1000
/* an error flag */
#define ORTE_MAPPING_CONFLICTED 0x2000
#define ORTE_MAPPING_GIVEN 0x4000
#define ORTE_SET_MAPPING_DIRECTIVE(target, pol) \
(target) |= (pol)
#define ORTE_UNSET_MAPPING_DIRECTIVE(target, pol) \
(target) &= ~(pol)
#define ORTE_GET_MAPPING_DIRECTIVE(pol) \
((pol) & 0xff00)

/* round-robin policies */
#define ORTE_MAPPING_BYSLOT 1
#define ORTE_MAPPING_BYNODE 2
#define ORTE_MAPPING_BYBOARD 3
#define ORTE_MAPPING_BYNUMA 4
#define ORTE_MAPPING_BYSOCKET 5
#define ORTE_MAPPING_BYL3CACHE 6
#define ORTE_MAPPING_BYL2CACHE 7
#define ORTE_MAPPING_BYL1CACHE 8
#define ORTE_MAPPING_BYCORE 9
#define ORTE_MAPPING_BYHWTHREAD 10
/* convenience - declare anything <= 15 to be round-robin*/
#define ORTE_MAPPING_RR 0x000f
/* sequential policy */
#define ORTE_MAPPING_SEQ 20
/* rank file and other user-defined mapping */
#define ORTE_MAPPING_BYUSER 22
/* macro to separate out the mapping policy
* from the directives
*/
#define ORTE_GET_MAPPING_POLICY(pol) \
((pol) & 0x00ff)
/* macro to determine if mapping policy is set */
#define ORTE_MAPPING_POLICY_IS_SET(pol) \
((pol) & 0x00ff)
#define ORTE_SET_MAPPING_POLICY(target, pol) \
(target) = (pol) | ((target) & 0xff00)

/* define ranking directives */
#define ORTE_RANKING_SPAN 0x1000
#define ORTE_RANKING_FILL 0x2000
#define ORTE_RANKING_GIVEN 0x4000
#define ORTE_SET_RANKING_DIRECTIVE(target, pol) \
(target) |= (pol)
#define ORTE_UNSET_RANKING_DIRECTIVE(target, pol) \
(target) &= ~(pol)
#define ORTE_GET_RANKING_DIRECTIVE(pol) \
((pol) & 0xf000)

/* define ranking policies */
#define ORTE_RANK_BY_NODE 1
#define ORTE_RANK_BY_BOARD 2
#define ORTE_RANK_BY_NUMA 3
#define ORTE_RANK_BY_SOCKET 4
#define ORTE_RANK_BY_L3CACHE 5
#define ORTE_RANK_BY_L2CACHE 6
#define ORTE_RANK_BY_L1CACHE 7
#define ORTE_RANK_BY_CORE 8
#define ORTE_RANK_BY_HWTHREAD 9
#define ORTE_RANK_BY_SLOT 10
#define ORTE_GET_RANKING_POLICY(pol) \
((pol) & 0x0fff)
/* macro to determine if ranking policy is set */
#define ORTE_RANKING_POLICY_IS_SET(pol) \
((pol) & 0x0fff)
#define ORTE_SET_RANKING_POLICY(target, pol) \
(target) = (pol) | ((target) & 0xf000)

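Since the policy and directive fields share a single uint16_t, here is a small self-contained example of how the macros above compose (a hypothetical snippet, not part of the commit):

    #include <assert.h>

    void mapping_bits_demo(void)
    {
        orte_mapping_policy_t map = 0;
        orte_ranking_policy_t rank = 0;

        /* mapping uses an 8/8 split: low byte = policy, high byte = directives */
        ORTE_SET_MAPPING_POLICY(map, ORTE_MAPPING_BYSOCKET);
        ORTE_SET_MAPPING_DIRECTIVE(map, ORTE_MAPPING_NO_OVERSUBSCRIBE | ORTE_MAPPING_GIVEN);

        assert(ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(map));
        assert(ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(map));
        assert(ORTE_GET_MAPPING_POLICY(map) <= ORTE_MAPPING_RR); /* still round-robin */

        /* ranking uses a 12/4 split instead */
        ORTE_SET_RANKING_POLICY(rank, ORTE_RANK_BY_CORE);
        ORTE_SET_RANKING_DIRECTIVE(rank, ORTE_RANKING_SPAN);
        assert(ORTE_RANK_BY_CORE == ORTE_GET_RANKING_POLICY(rank));
    }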
END_C_DECLS

#endif

@ -24,7 +24,8 @@ dist_pkgdata_DATA = help-orte-rmaps-rr.txt
sources = \
rmaps_rr.c \
rmaps_rr.h \
rmaps_rr_component.c
rmaps_rr_component.c \
rmaps_rr_mappers.c

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la

@ -51,3 +51,11 @@ You have specified a rank-to-node/slot mapping, but failed to provide
the number of processes to be executed. For some reason, this information
could not be obtained from the mapping you provided, so we cannot continue
with executing the specified application.
#
[orte-rmaps-rr:not-enough-objs]
There are not enough resources on the available nodes
to meet the requested mapping.

Application: %s
Number of procs: %d
Number of resources: %d

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -40,6 +40,7 @@
#include "orte/mca/rmaps/base/base.h"
#include "rmaps_rr.h"

static orte_node_t* get_starting_point(opal_list_t *node_list, orte_job_t *jdata);

/*
* Create a round-robin mapping for the job.
@ -52,7 +53,6 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
opal_list_item_t *item;
orte_std_cntr_t num_nodes, num_slots;
int rc;
opal_list_item_t *cur_node_item;
mca_base_component_t *c = &mca_rmaps_round_robin_component.base_version;

/* this mapper can only handle initial launch
@ -74,9 +74,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
ORTE_JOBID_PRINT(jdata->jobid));
return ORTE_ERR_TAKE_NEXT_OPTION;
}
if (0 < jdata->map->npernode ||
0 < jdata->map->nperboard ||
0 < jdata->map->npersocket) {
if (ORTE_MAPPING_RR < ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
/* I don't know how to do these - defer */
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rr: job %s not using rr mapper",
@ -122,14 +120,14 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
* option
*/
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
jdata->map->policy))) {
jdata->map->mapping))) {
ORTE_ERROR_LOG(rc);
goto error;
}
num_nodes = (orte_std_cntr_t)opal_list_get_size(&node_list);

/* if a bookmark exists from some prior mapping, set us to start there */
cur_node_item = orte_rmaps_base_get_starting_point(&node_list, jdata);
jdata->bookmark = get_starting_point(&node_list, jdata);

if (0 == app->num_procs) {
/* set the num_procs to equal the number of slots on these mapped nodes */
@ -137,12 +135,42 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
}

/* Make assignments */
if (jdata->map->policy & ORTE_MAPPING_BYNODE) {
rc = orte_rmaps_base_map_bynode(jdata, app, &node_list,
app->num_procs, cur_node_item);
if (ORTE_MAPPING_BYNODE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
rc = orte_rmaps_rr_bynode(jdata, app, &node_list, num_slots,
app->num_procs);
} else if (ORTE_MAPPING_BYSLOT == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
app->num_procs);
#if OPAL_HAVE_HWLOC
} else if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
app->num_procs, HWLOC_OBJ_PU, 0);
} else if (ORTE_MAPPING_BYCORE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
app->num_procs, HWLOC_OBJ_CORE, 0);
} else if (ORTE_MAPPING_BYL1CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
app->num_procs, HWLOC_OBJ_CACHE, 1);
} else if (ORTE_MAPPING_BYL2CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
app->num_procs, HWLOC_OBJ_CACHE, 2);
} else if (ORTE_MAPPING_BYL3CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
app->num_procs, HWLOC_OBJ_CACHE, 3);
} else if (ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
app->num_procs, HWLOC_OBJ_SOCKET, 0);
} else if (ORTE_MAPPING_BYNUMA == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
app->num_procs, HWLOC_OBJ_NODE, 0);
#endif
} else {
rc = orte_rmaps_base_map_byslot(jdata, app, &node_list,
app->num_procs, cur_node_item);
/* unrecognized mapping directive */
orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy",
true, "mapping",
orte_rmaps_base_print_mapping(jdata->map->mapping));
rc = ORTE_ERR_SILENT;
goto error;
}
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
@ -155,28 +183,19 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
/* cleanup the node list - it can differ from one app_context
* to another, so we have to get it every time
*/
while(NULL != (item = opal_list_remove_first(&node_list))) {
while (NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&node_list);
}

/* compute vpids and add proc objects to the job */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}

/* compute and save local ranks */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}

/* define the daemons that we will use for this job */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
/* compute vpids and add proc objects to the job - do this after
* each app_context so that the ranks within each context are
* contiguous
*/
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}

return ORTE_SUCCESS;
@ -190,6 +209,85 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
return rc;
}

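The if/else chain in orte_rmaps_rr_map above dispatches each round-robin policy to a hwloc object type; the same relationship expressed as a lookup table, purely as a reading aid (this table is ours, not part of the commit):

    /* policy -> hwloc target, with the cache depth passed to rr_byobj */
    static const struct {
        int              policy;
        hwloc_obj_type_t type;
        unsigned         cache_level;
    } rr_dispatch[] = {
        { ORTE_MAPPING_BYHWTHREAD, HWLOC_OBJ_PU,     0 },
        { ORTE_MAPPING_BYCORE,     HWLOC_OBJ_CORE,   0 },
        { ORTE_MAPPING_BYL1CACHE,  HWLOC_OBJ_CACHE,  1 },
        { ORTE_MAPPING_BYL2CACHE,  HWLOC_OBJ_CACHE,  2 },
        { ORTE_MAPPING_BYL3CACHE,  HWLOC_OBJ_CACHE,  3 },
        { ORTE_MAPPING_BYSOCKET,   HWLOC_OBJ_SOCKET, 0 },
        { ORTE_MAPPING_BYNUMA,     HWLOC_OBJ_NODE,   0 },
    };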
/*
* determine the proper starting point for the next mapping operation
*/
static orte_node_t* get_starting_point(opal_list_t *node_list, orte_job_t *jdata)
{
opal_list_item_t *item, *cur_node_item;
orte_node_t *node, *nd1, *ndmin;
int overload;

/* if a bookmark exists from some prior mapping, set us to start there */
if (NULL != jdata->bookmark) {
cur_node_item = NULL;
/* find this node on the list */
for (item = opal_list_get_first(node_list);
item != opal_list_get_end(node_list);
item = opal_list_get_next(item)) {
node = (orte_node_t*)item;

if (node->index == jdata->bookmark->index) {
cur_node_item = item;
break;
}
}
/* see if we found it - if not, just start at the beginning */
if (NULL == cur_node_item) {
cur_node_item = opal_list_get_first(node_list);
}
} else {
/* if no bookmark, then just start at the beginning of the list */
cur_node_item = opal_list_get_first(node_list);
}

/* is this node fully subscribed? If so, then the first
* proc we assign will oversubscribe it, so let's look
* for another candidate
*/
node = (orte_node_t*)cur_node_item;
ndmin = node;
overload = ndmin->slots_inuse - ndmin->slots_alloc;
if (node->slots_inuse >= node->slots_alloc) {
/* work down the list - is there another node that
* would not be oversubscribed?
*/
if (cur_node_item != opal_list_get_last(node_list)) {
item = opal_list_get_next(cur_node_item);
} else {
item = opal_list_get_first(node_list);
}
while (item != cur_node_item) {
nd1 = (orte_node_t*)item;
if (nd1->slots_inuse < nd1->slots_alloc) {
/* this node is not oversubscribed! use it! */
return (orte_node_t*)item;
}
/* this one was also oversubscribed, keep track of the
* node that has the least usage - if we can't
* find anyone who isn't fully utilized, we will
* start with the least used node
*/
if (overload >= (nd1->slots_inuse - nd1->slots_alloc)) {
ndmin = nd1;
overload = ndmin->slots_inuse - ndmin->slots_alloc;
}
if (item == opal_list_get_last(node_list)) {
item = opal_list_get_first(node_list);
} else {
item = opal_list_get_next(item);
}
}
/* if we get here, then we cycled all the way around the
* list without finding a better answer - just use the node
* that is minimally overloaded
*/
cur_node_item = (opal_list_item_t*)ndmin;
}

return (orte_node_t*)cur_node_item;
}

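    /* A short worked example of the overload bookkeeping above (hypothetical
     * slot counts): three nodes with slots_alloc = 4 each and
     * slots_inuse = {6, 5, 4}. The bookmark node has overload 6-4 = 2, so we
     * scan onward; no node has inuse < alloc, so the scan keeps the minimum
     * overload seen (the third node, overload 0) and the next mapping cycle
     * starts there. */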
orte_rmaps_base_module_t orte_rmaps_round_robin_module = {
orte_rmaps_rr_map
};

@ -24,6 +24,10 @@
#define ORTE_RMAPS_RR_H

#include "orte_config.h"

#include "opal/mca/hwloc/hwloc.h"
#include "opal/class/opal_list.h"

#include "orte/mca/rmaps/rmaps.h"

BEGIN_C_DECLS
@ -31,6 +35,24 @@ BEGIN_C_DECLS
ORTE_MODULE_DECLSPEC extern orte_rmaps_base_component_t mca_rmaps_round_robin_component;
extern orte_rmaps_base_module_t orte_rmaps_round_robin_module;

ORTE_MODULE_DECLSPEC int orte_rmaps_rr_bynode(orte_job_t *jdata,
orte_app_context_t *app,
opal_list_t *node_list,
orte_std_cntr_t num_slots,
orte_vpid_t nprocs);
ORTE_MODULE_DECLSPEC int orte_rmaps_rr_byslot(orte_job_t *jdata,
orte_app_context_t *app,
opal_list_t *node_list,
orte_std_cntr_t num_slots,
orte_vpid_t nprocs);

#if OPAL_HAVE_HWLOC
ORTE_MODULE_DECLSPEC int orte_rmaps_rr_byobj(orte_job_t *jdata, orte_app_context_t *app,
opal_list_t *node_list,
orte_std_cntr_t num_slots,
orte_vpid_t num_procs,
hwloc_obj_type_t target, unsigned cache_level);
#endif

END_C_DECLS

@ -63,7 +63,7 @@ static int orte_rmaps_round_robin_open(void)

mca_base_param_reg_int(c, "priority",
"Priority of the rr rmaps component",
false, false, 100,
false, false, 10,
&my_priority);
return ORTE_SUCCESS;
}

orte/mca/rmaps/round_robin/rmaps_rr_mappers.c (new file, 712 lines)
@ -0,0 +1,712 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/

#include "orte_config.h"
#include "orte/constants.h"

#include <string.h>

#include "opal/util/output.h"
#include "opal/mca/hwloc/base/base.h"

#include "orte/util/show_help.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"

#include "orte/mca/rmaps/base/rmaps_private.h"
#include "orte/mca/rmaps/base/base.h"
#include "rmaps_rr.h"

static orte_proc_t* setup_proc(orte_job_t *jdata,
orte_node_t *node,
orte_app_idx_t idx);

int orte_rmaps_rr_byslot(orte_job_t *jdata,
orte_app_context_t *app,
opal_list_t *node_list,
orte_std_cntr_t num_slots,
orte_vpid_t num_procs)
{
int rc, i, nprocs_mapped;
orte_node_t *node;
orte_proc_t *proc;
opal_list_item_t *item;
int num_procs_to_assign, extra_procs_to_assign=0, nxtra_nodes=0;
#if OPAL_HAVE_HWLOC
hwloc_obj_t obj=NULL;
#endif
float balance;
bool add_one=false;
bool oversubscribed = false;

opal_output_verbose(2, orte_rmaps_base.rmaps_output,
"mca:rmaps:rr: mapping by slot for job %s slots %d num_procs %lu",
ORTE_JOBID_PRINT(jdata->jobid), (int)num_slots, (unsigned long)num_procs);

/* check to see if we can map all the procs */
if (num_slots < app->num_procs) {
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
true, app->num_procs, app->app);
return ORTE_ERR_SILENT;
}
oversubscribed = true;
/* compute how many extra procs to put on each node */
balance = (float)(app->num_procs - num_slots) / (float)opal_list_get_size(node_list);
extra_procs_to_assign = (int)balance;
if (0 < (balance - (float)extra_procs_to_assign)) {
/* compute how many nodes need an extra proc */
nxtra_nodes = app->num_procs - num_slots - (extra_procs_to_assign * opal_list_get_size(node_list));
/* add one so that we add an extra proc to the first nodes
* until all procs are mapped
*/
extra_procs_to_assign++;
/* flag that we added one */
add_one = true;
}
}
/* map the number of procs to each node until we
|
||||
* map all specified procs
|
||||
*/
|
||||
nprocs_mapped = 0;
|
||||
while (NULL != (item = opal_list_remove_first(node_list))) {
|
||||
node = (orte_node_t*)item;
|
||||
#if OPAL_HAVE_HWLOC
|
||||
/* get the root object as we are not assigning
|
||||
* locale except at the node level
|
||||
*/
|
||||
if (NULL != node->topology) {
|
||||
obj = hwloc_get_root_obj(node->topology);
|
||||
}
|
||||
#endif
|
||||
if (add_one) {
|
||||
if (0 == nxtra_nodes) {
|
||||
--extra_procs_to_assign;
|
||||
add_one = false;
|
||||
} else {
|
||||
--nxtra_nodes;
|
||||
}
|
||||
}
|
||||
if (oversubscribed) {
|
||||
/* flag the node as oversubscribed so that sched-yield gets
|
||||
* properly set
|
||||
*/
|
||||
node->oversubscribed = true;
|
||||
}
|
||||
if (0 == node->slots_alloc) {
|
||||
num_procs_to_assign = 1 + extra_procs_to_assign;
|
||||
} else {
|
||||
num_procs_to_assign = node->slots_alloc + extra_procs_to_assign;
|
||||
}
|
||||
for (i=0; i < num_procs_to_assign && nprocs_mapped < app->num_procs; i++) {
|
||||
if (0 == i) {
|
||||
/* add this node to the map - do it only once */
|
||||
if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
OBJ_RETAIN(node); /* maintain accounting on object */
|
||||
++(jdata->map->num_nodes);
|
||||
}
|
||||
if (NULL == (proc = setup_proc(jdata, node, app->idx))) {
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
nprocs_mapped++;
|
||||
#if OPAL_HAVE_HWLOC
|
||||
proc->locale = obj;
|
||||
#endif
|
||||
}
|
||||
jdata->bookmark = node;
|
||||
/* release the node - the object will persist */
|
||||
OBJ_RELEASE(node);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
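/* An illustrative trace (not part of the commit) of the balancing
 * arithmetic above, assuming 10 procs, 6 slots, and 3 nodes:
 *   balance = (10 - 6) / 3 = 1.33  ->  extra_procs_to_assign = 1
 *   a fraction remains, so nxtra_nodes = 10 - 6 - (1 * 3) = 1
 *   and extra_procs_to_assign becomes 2 with add_one = true.
 * In the loop, the first node then takes slots_alloc + 2 procs and the
 * remaining two nodes take slots_alloc + 1 each, accounting for all
 * four oversubscribed procs.
 */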
int orte_rmaps_rr_bynode(orte_job_t *jdata,
                         orte_app_context_t *app,
                         opal_list_t *node_list,
                         orte_std_cntr_t num_slots,
                         orte_vpid_t num_procs)
{
    int j, nprocs_mapped, lag, delta;
    orte_node_t *node;
    orte_proc_t *proc;
    opal_list_item_t *item;
    int num_procs_to_assign, navg, idx;
    int extra_procs_to_assign=0, nxtra_nodes=0;
#if OPAL_HAVE_HWLOC
    hwloc_obj_t obj=NULL;
#endif
    float balance;
    bool add_one=false;
    bool oversubscribed=false;

    opal_output_verbose(2, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:rr: mapping by node for job %s slots %d num_procs %lu",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        (int)num_slots, (unsigned long)num_procs);

    /* quick check to see if we can map all the procs */
    if (num_slots < app->num_procs) {
        if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
            orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                           true, app->num_procs, app->app);
            return ORTE_ERR_SILENT;
        }
        oversubscribed = true;
    }

    /* divide the procs evenly across all nodes - this is the
     * average we have to maintain as we go, but we adjust
     * the number on each node to reflect its available slots.
     * Obviously, if all nodes have the same number of slots,
     * then the avg is what we get on each node - this is
     * the most common situation.
     */
    navg = app->num_procs / opal_list_get_size(node_list);
    if (0 == navg) {
        /* if there are less procs than nodes, we have to
         * place at least one/node
         */
        navg = 1;
    }

    /* compute how many extra procs to put on each node */
    balance = (float)(app->num_procs - (navg * opal_list_get_size(node_list))) / (float)opal_list_get_size(node_list);
    extra_procs_to_assign = (int)balance;
    if (0 < (balance - (float)extra_procs_to_assign)) {
        /* compute how many nodes need an extra proc */
        nxtra_nodes = app->num_procs - ((navg + extra_procs_to_assign) * opal_list_get_size(node_list));
        /* add one so that we add an extra proc to the first nodes
         * until all procs are mapped
         */
        extra_procs_to_assign++;
        /* flag that we added one */
        add_one = true;
    }

    opal_output_verbose(2, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:rr: mapping by node navg %d extra_procs %d extra_nodes %d",
                        navg, extra_procs_to_assign, nxtra_nodes);

    nprocs_mapped = 0;
    lag = 0;
    while (NULL != (item = opal_list_remove_first(node_list))) {
        node = (orte_node_t*)item;
#if OPAL_HAVE_HWLOC
        /* get the root object as we are not assigning
         * locale except at the node level
         */
        if (NULL != node->topology) {
            obj = hwloc_get_root_obj(node->topology);
        }
#endif
        /* add this node to the map */
        if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
            ORTE_ERROR_LOG(idx);
            return idx;
        }
        OBJ_RETAIN(node);  /* maintain accounting on object */
        ++(jdata->map->num_nodes);
        /* compute the number of procs to go on this node */
        if (add_one) {
            if (0 == nxtra_nodes) {
                --extra_procs_to_assign;
                add_one = false;
            } else {
                --nxtra_nodes;
            }
        }
        if (oversubscribed) {
            /* everybody just takes their share */
            num_procs_to_assign = navg + extra_procs_to_assign;
            /* flag the node as oversubscribed so that sched-yield gets
             * properly set
             */
            node->oversubscribed = true;
        } else {
            /* if we are not oversubscribed, then there are enough
             * slots to handle all the procs. However, not every
             * node will have the same number of slots, so we
             * have to track how many procs to "shift" elsewhere
             * to make up the difference
             */
            if (0 == node->slots_alloc) {
                /* if there are no extras to take, then we can
                 * safely remove this node as we don't need it
                 */
                if (0 == extra_procs_to_assign) {
                    opal_pointer_array_set_item(jdata->map->nodes, idx, NULL);
                    OBJ_RELEASE(node);
                    --(jdata->map->num_nodes);
                    /* update how many we are lagging behind */
                    lag += navg;
                    continue;
                }
                /* everybody has to take at least the extras */
                num_procs_to_assign = extra_procs_to_assign;
                /* update how many we are lagging behind */
                lag += navg;
            } else {
                /* if slots_alloc < avg, then take all */
                if (node->slots_alloc < navg) {
                    num_procs_to_assign = node->slots_alloc + extra_procs_to_assign;
                    /* update how many we are lagging behind */
                    lag += navg - node->slots_alloc;
                } else {
                    /* take the avg plus as much of the "lag" as we can */
                    delta = 0;
                    if (0 < lag) {
                        delta = node->slots_alloc - navg;
                        if (lag < delta) {
                            delta = lag;
                        }
                        lag -= delta;
                    }
                    num_procs_to_assign = navg + delta + extra_procs_to_assign;
                }
            }
        }
        for (j=0; j < num_procs_to_assign && nprocs_mapped < app->num_procs; j++) {
            if (NULL == (proc = setup_proc(jdata, node, app->idx))) {
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            nprocs_mapped++;
#if OPAL_HAVE_HWLOC
            proc->locale = obj;
#endif
        }
        jdata->bookmark = node;
        /* maintain acctg */
        OBJ_RELEASE(node);
        if (nprocs_mapped == app->num_procs) {
            /* we are done */
            break;
        }
    }

    return ORTE_SUCCESS;
}
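/* An illustrative trace (not part of the commit) of the lag accounting
 * above, assuming 8 procs across two nodes with slots_alloc of 2 and 6:
 *   navg = 8 / 2 = 4; balance = 0, so no extras and no add_one.
 *   node1: slots_alloc (2) < navg (4) -> assign 2, lag += 4 - 2 = 2
 *   node2: slots_alloc (6) >= navg; delta = min(lag, 6 - 4) = 2
 *          -> assign navg + delta = 6, lag back to 0
 * All 8 procs are placed without oversubscribing either node.
 */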
#if OPAL_HAVE_HWLOC
static int byobj_span(orte_job_t *jdata,
                      orte_app_context_t *app,
                      opal_list_t *node_list,
                      orte_std_cntr_t num_slots,
                      orte_vpid_t num_procs,
                      hwloc_obj_type_t target, unsigned cache_level);

/* mapping by hwloc object looks a lot like mapping by node,
 * but has the added complication of possibly having different
 * numbers of objects on each node
 */
int orte_rmaps_rr_byobj(orte_job_t *jdata,
                        orte_app_context_t *app,
                        opal_list_t *node_list,
                        orte_std_cntr_t num_slots,
                        orte_vpid_t num_procs,
                        hwloc_obj_type_t target, unsigned cache_level)
{
    int i, j, nprocs_mapped;
    orte_node_t *node;
    orte_proc_t *proc;
    opal_list_item_t *item;
    int num_procs_to_assign, nperobj, nprocs, nxtra_objs=0;
    int extra_procs_to_assign=0, nxtra_nodes=0, idx;
    hwloc_obj_t obj=NULL;
    unsigned int nobjs;
    float balance;
    bool add_one=false;
    bool oversubscribed = false;

    /* there are two modes for mapping by object: span and not-span. The
     * span mode essentially operates as if there was just a single
     * "super-node" in the system - i.e., it balances the load across
     * all objects of the indicated type regardless of their location.
     * In essence, it acts as if we placed one proc on each object, cycling
     * across all objects on all nodes, and then wrapped around to place
     * another proc on each object, doing so until all procs were placed.
     *
     * In contrast, the non-span mode operates similar to byslot mapping.
     * All slots on each node are filled, assigning each proc to an object
     * on that node in a balanced fashion, and then the mapper moves on
     * to the next node. Thus, procs tend to be "front loaded" onto the
     * list of nodes, as opposed to being "load balanced" in the span mode
     */
    if (ORTE_MAPPING_SPAN & jdata->map->mapping) {
        return byobj_span(jdata, app, node_list, num_slots,
                          num_procs, target, cache_level);
    }

    opal_output_verbose(2, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:rr: mapping no-span by %s for job %s slots %d num_procs %lu",
                        hwloc_obj_type_string(target),
                        ORTE_JOBID_PRINT(jdata->jobid),
                        (int)num_slots, (unsigned long)num_procs);

    /* quick check to see if we can map all the procs - can't
     * do more because we don't know how many total objects exist
     * across all the nodes
     */
    if (num_slots < app->num_procs) {
        if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
            orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                           true, app->num_procs, app->app);
            return ORTE_ERR_SILENT;
        }
        oversubscribed = true;
        /* compute how many extra procs to put on each node */
        balance = (float)(app->num_procs - num_slots) / (float)opal_list_get_size(node_list);
        extra_procs_to_assign = (int)balance;
        if (0 < (balance - (float)extra_procs_to_assign)) {
            /* compute how many nodes need an extra proc */
            nxtra_nodes = app->num_procs - num_slots - (extra_procs_to_assign * opal_list_get_size(node_list));
            /* add one so that we add an extra proc to the first nodes
             * until all procs are mapped
             */
            extra_procs_to_assign++;
            /* flag that we added one */
            add_one = true;
        }
    }

    opal_output_verbose(2, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:rr: mapping no-span by %s extra_procs %d extra_nodes %d",
                        hwloc_obj_type_string(target),
                        extra_procs_to_assign, nxtra_nodes);

    nprocs_mapped = 0;
    while (NULL != (item = opal_list_remove_first(node_list))) {
        node = (orte_node_t*)item;
        /* bozo check */
        if (NULL == node->topology) {
            orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing",
                           true, node->name);
            return ORTE_ERR_SILENT;
        }
        /* add this node to the map */
        if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
            ORTE_ERROR_LOG(idx);
            return idx;
        }
        OBJ_RETAIN(node);  /* maintain accounting on object */
        ++(jdata->map->num_nodes);

        if (oversubscribed) {
            /* flag the node as oversubscribed so that sched-yield gets
             * properly set
             */
            node->oversubscribed = true;
        }
        /* compute the number of procs to go on this node */
        if (add_one) {
            if (0 == nxtra_nodes) {
                --extra_procs_to_assign;
                add_one = false;
            } else {
                --nxtra_nodes;
            }
        }
        if (0 == node->slots_alloc) {
            /* everybody takes at least the extras */
            num_procs_to_assign = extra_procs_to_assign;
        } else {
            num_procs_to_assign = node->slots_alloc + extra_procs_to_assign;
        }

        /* get the number of objects of this type on this node */
        nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target, cache_level, OPAL_HWLOC_AVAILABLE);
        opal_output_verbose(2, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:rr:byobj: found %d objs on node %s", nobjs, node->name);
        /* compute the number of procs to go on each object */
        nperobj = num_procs_to_assign / nobjs;
        opal_output_verbose(2, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:rr:byobj: placing %d procs on each object", nperobj);
        if ((int)(nperobj * nobjs) < num_procs_to_assign) {
            /* compute how many objs need an extra proc */
            nxtra_objs = num_procs_to_assign - nperobj * nobjs;
            opal_output_verbose(2, orte_rmaps_base.rmaps_output,
                                "mca:rmaps:rr:byobj: adding 1 extra proc to the first %d objects, if needed", nxtra_objs);
        }
        /* loop through the number of objects */
        for (i=0; i < (int)nobjs && nprocs_mapped < (int)app->num_procs; i++) {
            /* get the hwloc object */
            if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology, target, cache_level, i, OPAL_HWLOC_AVAILABLE))) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                return ORTE_ERR_NOT_FOUND;
            }
            /* map the reqd number of procs */
            if (0 < nxtra_objs) {
                nprocs = nperobj + 1;
                --nxtra_objs;
            } else {
                nprocs = nperobj;
            }
            for (j=0; j < nprocs && nprocs_mapped < app->num_procs; j++) {
                if (NULL == (proc = setup_proc(jdata, node, app->idx))) {
                    return ORTE_ERR_OUT_OF_RESOURCE;
                }
                nprocs_mapped++;
                proc->locale = obj;
            }
        }
        jdata->bookmark = node;
        /* maintain acctg */
        OBJ_RELEASE(node);
        if (nprocs_mapped == app->num_procs) {
            /* we are done */
            break;
        }
    }

    return ORTE_SUCCESS;
}
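/* An illustrative trace (not part of the commit) of the per-object split
 * above, assuming num_procs_to_assign = 8 on a node with nobjs = 3 sockets:
 *   nperobj = 8 / 3 = 2; since 2 * 3 < 8, nxtra_objs = 8 - 6 = 2
 *   -> the first two sockets receive 3 procs each, the third receives 2.
 */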
static int byobj_span(orte_job_t *jdata,
                      orte_app_context_t *app,
                      opal_list_t *node_list,
                      orte_std_cntr_t num_slots,
                      orte_vpid_t num_procs,
                      hwloc_obj_type_t target, unsigned cache_level)
{
    int i, j, nprocs_mapped, lag, delta, navg;
    orte_node_t *node;
    orte_proc_t *proc;
    opal_list_item_t *item;
    int num_procs_to_assign, nperobj, nprocs, nxtra_objs=0;
    int extra_procs_to_assign=0, nxtra_nodes=0, idx;
    hwloc_obj_t obj=NULL;
    unsigned int nobjs;
    float balance;
    bool add_one=false;
    bool oversubscribed=false;

    opal_output_verbose(2, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:rr: mapping span by %s for job %s slots %d num_procs %lu",
                        hwloc_obj_type_string(target),
                        ORTE_JOBID_PRINT(jdata->jobid),
                        (int)num_slots, (unsigned long)num_procs);

    /* quick check to see if we can map all the procs - can't
     * do more because we don't know how many total objects exist
     * across all the nodes
     */
    if (num_slots < app->num_procs) {
        if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
            orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                           true, app->num_procs, app->app);
            return ORTE_ERR_SILENT;
        }
        oversubscribed = true;
    }

    /* divide the procs evenly across all nodes - this is the
     * average we have to maintain as we go, but we adjust
     * the number on each node to reflect its available slots.
     * Obviously, if all nodes have the same number of slots,
     * then the avg is what we get on each node - this is
     * the most common situation.
     */
    navg = app->num_procs / opal_list_get_size(node_list);
    if (0 == navg) {
        /* if there are less procs than nodes, we have to
         * place at least one/node
         */
        navg = 1;
    }

    /* compute how many extra procs to put on each node */
    balance = (float)(app->num_procs - (navg * opal_list_get_size(node_list))) / (float)opal_list_get_size(node_list);
    extra_procs_to_assign = (int)balance;
    if (0 < (balance - (float)extra_procs_to_assign)) {
        /* compute how many nodes need an extra proc */
        nxtra_nodes = app->num_procs - ((navg + extra_procs_to_assign) * opal_list_get_size(node_list));
        /* add one so that we add an extra proc to the first nodes
         * until all procs are mapped
         */
        extra_procs_to_assign++;
        /* flag that we added one */
        add_one = true;
    }

    opal_output_verbose(2, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:rr: mapping by %s navg %d extra_procs %d extra_nodes %d",
                        hwloc_obj_type_string(target),
                        navg, extra_procs_to_assign, nxtra_nodes);

    nprocs_mapped = 0;
    lag = 0;
    while (NULL != (item = opal_list_remove_first(node_list))) {
        node = (orte_node_t*)item;
        /* bozo check */
        if (NULL == node->topology) {
            orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing",
                           true, node->name);
            return ORTE_ERR_SILENT;
        }
        /* add this node to the map */
        if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
            ORTE_ERROR_LOG(idx);
            return idx;
        }
        OBJ_RETAIN(node);  /* maintain accounting on object */
        ++(jdata->map->num_nodes);
        /* compute the number of procs to go on this node */
        if (add_one) {
            if (0 == nxtra_nodes) {
                --extra_procs_to_assign;
                add_one = false;
            } else {
                --nxtra_nodes;
            }
        }
        if (oversubscribed) {
            /* everybody just takes their share */
            num_procs_to_assign = navg + extra_procs_to_assign;
        } else {
            /* if we are not oversubscribed, then there are enough
             * slots to handle all the procs. However, not every
             * node will have the same number of slots, so we
             * have to track how many procs to "shift" elsewhere
             * to make up the difference
             */
            if (0 == node->slots_alloc) {
                /* if there are no extras to take, then we can
                 * safely remove this node as we don't need it
                 */
                if (0 == extra_procs_to_assign) {
                    opal_pointer_array_set_item(jdata->map->nodes, idx, NULL);
                    OBJ_RELEASE(node);
                    --(jdata->map->num_nodes);
                    /* update how many we are lagging behind */
                    lag += navg;
                    continue;
                }
                /* everybody has to take at least the extras */
                num_procs_to_assign = extra_procs_to_assign;
                /* update how many we are lagging behind */
                lag += navg;
            } else {
                /* if slots_alloc < avg, then take all */
                if (node->slots_alloc < navg) {
                    num_procs_to_assign = node->slots_alloc + extra_procs_to_assign;
                    /* update how many we are lagging behind */
                    lag += navg - node->slots_alloc;
                } else {
                    /* take the avg plus as much of the "lag" as we can */
                    delta = 0;
                    if (0 < lag) {
                        delta = node->slots_alloc - navg;
                        if (lag < delta) {
                            delta = lag;
                        }
                        lag -= delta;
                    }
                    num_procs_to_assign = navg + delta + extra_procs_to_assign;
                }
            }
        }

        /* get the number of objects of this type on this node */
        nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target, cache_level, OPAL_HWLOC_AVAILABLE);
        opal_output_verbose(2, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:rr:byobj: found %d objs on node %s", nobjs, node->name);
        /* compute the number of procs to go on each object */
        nperobj = num_procs_to_assign / nobjs;
        opal_output_verbose(2, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:rr:byobj: placing %d procs on each object", nperobj);
        if ((int)(nperobj * nobjs) < num_procs_to_assign) {
            /* compute how many objs need an extra proc */
            nxtra_objs = num_procs_to_assign - nperobj * nobjs;
            opal_output_verbose(2, orte_rmaps_base.rmaps_output,
                                "mca:rmaps:rr:byobj: adding 1 extra proc to the first %d objects, if needed", nxtra_objs);
        }
        /* loop through the number of objects */
        for (i=0; i < (int)nobjs && nprocs_mapped < (int)app->num_procs; i++) {
            /* get the hwloc object */
            if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology, target, cache_level, i, OPAL_HWLOC_AVAILABLE))) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                return ORTE_ERR_NOT_FOUND;
            }
            /* map the reqd number of procs */
            if (0 < nxtra_objs) {
                nprocs = nperobj + 1;
                --nxtra_objs;
            } else {
                nprocs = nperobj;
            }
            for (j=0; j < nprocs && nprocs_mapped < app->num_procs; j++) {
                if (NULL == (proc = setup_proc(jdata, node, app->idx))) {
                    return ORTE_ERR_OUT_OF_RESOURCE;
                }
                nprocs_mapped++;
                proc->locale = obj;
            }
        }
        jdata->bookmark = node;
        /* maintain acctg */
        OBJ_RELEASE(node);
        if (nprocs_mapped == app->num_procs) {
            /* we are done */
            break;
        }
    }

    return ORTE_SUCCESS;
}
#endif
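/* A worked contrast (not part of the commit) between the two modes,
 * assuming 6 procs on two 4-slot nodes with two sockets each:
 *   no-span (orte_rmaps_rr_byobj): node1 fills its 4 slots split 2/2
 *     across its sockets; node2 then front-loads the remaining 2 procs
 *     onto its first socket (2/0) and stops.
 *   span (byobj_span): navg = 3, so each node takes 3 procs balanced
 *     across its sockets as 2/1 - the load is spread over all four
 *     sockets as if they formed one "super-node".
 */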
static orte_proc_t* setup_proc(orte_job_t *jdata,
                               orte_node_t *node,
                               orte_app_idx_t idx)
{
    orte_proc_t *proc;
    int rc;

    proc = OBJ_NEW(orte_proc_t);
    /* set the jobid */
    proc->name.jobid = jdata->jobid;
    /* we do not set the vpid here - this will be done
     * during a second phase, but we do set the epoch here
     * since they all start with the same value.
     */
    ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
    /* flag the proc as ready for launch */
    proc->state = ORTE_PROC_STATE_INIT;
    proc->app_idx = idx;

    OBJ_RETAIN(node);  /* maintain accounting on object */
    proc->node = node;
    proc->nodename = node->name;
    node->num_procs++;
    if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(proc);
        return NULL;
    }
    /* retain the proc struct so that we correctly track its release */
    OBJ_RETAIN(proc);

    return proc;
}
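/* Reference-counting sketch (not from the commit) of the accounting above:
 * OBJ_NEW leaves the proc at refcount 1, covering its slot in node->procs,
 * and the extra OBJ_RETAIN raises it to 2 so a later insertion into the
 * job-level proc array can be released independently of the node's copy.
 * The node is likewise retained once per proc that points back at it via
 * proc->node, matching the OBJ_RELEASE done in the node destructor.
 */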
@@ -9,7 +9,7 @@
 * University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2006      Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2006-2011 Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow

@@ -31,7 +31,7 @@

#include "opal/mca/base/mca_base_param.h"
#include "opal/util/if.h"
#include "opal/util/opal_sos.h"
#include "opal/mca/hwloc/hwloc.h"

#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"

@@ -72,7 +72,7 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
    opal_list_t *node_list=NULL;
    orte_proc_t *proc;
    mca_base_component_t *c = &mca_rmaps_seq_component.base_version;

    OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
                         "%s rmaps:seq mapping job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),

@@ -96,9 +96,7 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (0 < jdata->map->npernode ||
        0 < jdata->map->nperboard ||
        0 < jdata->map->npersocket) {
    if (ORTE_MAPPING_SEQ != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        /* I don't know how to do these - defer */
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:seq: job %s not using seq mapper",

@@ -168,7 +166,7 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
    }

    /* check for nolocal and remove the head node, if required */
    if (map->policy & ORTE_MAPPING_NO_USE_LOCAL) {
    if (map->mapping & ORTE_MAPPING_NO_USE_LOCAL) {
        for (item  = opal_list_get_first(node_list);
             item != opal_list_get_end(node_list);
             item  = opal_list_get_next(item) ) {

@@ -218,24 +216,54 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
                rc = ORTE_ERR_SILENT;
                goto error;
            }
            /* ensure the node is in the map */
            if (!node->mapped) {
                OBJ_RETAIN(node);
                opal_pointer_array_add(map->nodes, node);
                node->mapped = true;
            }
            proc = OBJ_NEW(orte_proc_t);
            /* set the jobid */
            proc->name.jobid = jdata->jobid;
            proc->name.vpid = vpid++;
            ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
            /* flag the proc as ready for launch */
            proc->state = ORTE_PROC_STATE_INIT;
            proc->app_idx = i;

            /* assign proc to this node - do NOT allow claim_slot to remove
             * an oversubscribed node from the list!
             */
            proc = NULL;
            if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
                                                                 jdata->map->cpus_per_rank, app->idx,
                                                                 node_list,
                                                                 jdata->map->oversubscribe,
                                                                 false, &proc))) {
                if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc)) {
                    ORTE_ERROR_LOG(rc);
            OBJ_RETAIN(node);  /* maintain accounting on object */
            proc->node = node;
            proc->nodename = node->name;
            node->num_procs++;
            if ((node->slots < node->slots_inuse) ||
                (0 < node->slots_max && node->slots_max < node->slots_inuse)) {
                if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
                    orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                   true, node->num_procs, app->app);
                    rc = ORTE_ERR_SILENT;
                    goto error;
                }
                /* flag the node as oversubscribed so that sched-yield gets
                 * properly set
                 */
                node->oversubscribed = true;
            }
            /* assign the vpid */
            proc->name.vpid = vpid++;
            ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
            if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(proc);
                return rc;
            }
            /* retain the proc struct so that we correctly track its release */
            OBJ_RETAIN(proc);

#if OPAL_HAVE_HWLOC
            /* assign the locale - okay for the topo to be null as
             * it just means it wasn't returned
             */
            if (NULL != node->topology) {
                proc->locale = hwloc_get_root_obj(node->topology);
            }
#endif

            /* add to the jdata proc array */
            if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {

@@ -260,21 +288,9 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
        }
    }

    /* compute and save local ranks */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* define the daemons that we will use for this job */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(jdata))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    return ORTE_SUCCESS;

 error:
    if (NULL != default_node_list) {
        while (NULL != (item = opal_list_remove_first(default_node_list))) {
            OBJ_RELEASE(item);
@@ -9,7 +9,7 @@
 * University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2007      Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2007-2011 Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2007      Los Alamos National Security, LLC.  All rights
 *                         reserved.
 * Copyright (c) 2009      Institut National de Recherche en Informatique

@@ -195,9 +195,11 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
      NULL, OPAL_CMD_LINE_TYPE_STRING,
      "Regular expression defining nodes in system" },

#if OPAL_HAVE_HWLOC
    { "orte", "hetero", "nodes", '\0', NULL, "hetero-nodes", 0,
      NULL, OPAL_CMD_LINE_TYPE_BOOL,
      "Nodes in cluster may differ in topology, so send the topology back from each node [Default = false]" },
#endif

    /* End of list */
    { NULL, NULL, NULL, '\0', NULL, NULL, 0,
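The hetero-nodes entry registered above surfaces as a boolean mpirun option; presumably a launch such as `mpirun --hetero-nodes -np 8 ./app` would ask each daemon to ship its discovered hwloc topology back to the HNP rather than assuming every node matches the head node's layout.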
@@ -9,6 +9,7 @@
 * University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2011      Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow

@@ -311,9 +312,14 @@ int orte_dt_copy_map(orte_job_map_t **dest, orte_job_map_t *src, opal_data_type_
    }

    /* copy data into it */
    (*dest)->policy = src->policy;
    (*dest)->npernode = src->npernode;
    (*dest)->oversubscribe = src->oversubscribe;
    (*dest)->mapping = src->mapping;
    (*dest)->ranking = src->ranking;
#if OPAL_HAVE_HWLOC
    (*dest)->binding = src->binding;
#endif
    if (NULL != src->ppr) {
        (*dest)->ppr = strdup(src->ppr);
    }
    (*dest)->display_map = src->display_map;
    (*dest)->num_new_daemons = src->num_new_daemons;
    (*dest)->daemon_vpid_start = src->daemon_vpid_start;
@@ -9,6 +9,7 @@
 * University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2011      Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow

@@ -22,7 +23,9 @@
#include <sys/types.h>

#include "opal/util/argv.h"
#include "opal/util/opal_sos.h"
#include "opal/dss/dss.h"
#include "opal/dss/dss_internal.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/class/opal_pointer_array.h"

#include "orte/mca/errmgr/errmgr.h"

@@ -442,15 +445,6 @@ int orte_dt_pack_node(opal_buffer_t *buffer, const void *src,
            return rc;
        }

        /* do not pack the local board, socket, and core info */

        /* pack the cpu set info */
        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
                                                       (void*)(&(nodes[i]->cpu_set)), 1, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* do not pack the username */
    }
    return ORTE_SUCCESS;

@@ -477,13 +471,6 @@ int orte_dt_pack_proc(opal_buffer_t *buffer, const void *src,
            return rc;
        }

        /* pack the pid */
        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
                                                       (void*)(&(procs[i]->pid)), 1, OPAL_PID))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* pack the local rank */
        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
                                                       (void*)(&(procs[i]->local_rank)), 1, ORTE_LOCAL_RANK))) {

@@ -498,6 +485,14 @@ int orte_dt_pack_proc(opal_buffer_t *buffer, const void *src,
            return rc;
        }

#if OPAL_HAVE_HWLOC
        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
                                                       (void*)(&procs[i]->cpu_bitmap), 1, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
#endif

        /* pack the state */
        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
                                                       (void*)(&(procs[i]->state)), 1, ORTE_PROC_STATE))) {

@@ -512,13 +507,6 @@ int orte_dt_pack_proc(opal_buffer_t *buffer, const void *src,
            return rc;
        }

        /* pack the name of the node where this proc is executing */
        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
                                                       (void*)(&(procs[i]->nodename)), 1, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* pack the number of restarts */
        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
                                                       (void*)&(procs[i]->restarts), 1, OPAL_INT32))) {

@@ -906,26 +894,23 @@ int orte_dt_pack_map(opal_buffer_t *buffer, const void *src,
            return rc;
        }

        /* pack the mapper used */
        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->last_mapper), 1, OPAL_STRING))) {
        /* pack the policies */
        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->mapping), 1, ORTE_MAPPING_POLICY))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* pack the policy used to generate it */
        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->policy), 1, ORTE_MAPPING_POLICY))) {
        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->ranking), 1, ORTE_RANKING_POLICY))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* pack the #procs/node */
        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->npernode), 1, ORTE_STD_CNTR))) {
#if OPAL_HAVE_HWLOC
        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->binding), 1, OPAL_BINDING_POLICY))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* pack the oversubscribe flag */
        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->oversubscribe), 1, OPAL_BOOL))) {
#endif
        /* pack any ppr */
        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->ppr), 1, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

@@ -935,24 +920,6 @@ int orte_dt_pack_map(opal_buffer_t *buffer, const void *src,
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* pack the number of new daemons */
        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->num_new_daemons), 1, ORTE_STD_CNTR))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* pack the daemon starting vpid */
        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->daemon_vpid_start), 1, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* pack the number of nodes */
        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->num_nodes), 1, ORTE_STD_CNTR))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    }

    return ORTE_SUCCESS;
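The reordered pack sequence above only works because the unpack side (in the hunks further below) is changed in lockstep. A minimal sketch of the symmetry rule, using hypothetical `buf` and `map` variables - whatever the pack side writes, the unpack side must read back in the same order and with the same DSS types, or the byte stream misparses:

    /* pack side */
    opal_dss.pack(buf, &map->mapping, 1, ORTE_MAPPING_POLICY);
    opal_dss.pack(buf, &map->ranking, 1, ORTE_RANKING_POLICY);

    /* unpack side - identical order and types */
    n = 1;
    opal_dss.unpack(buf, &map->mapping, &n, ORTE_MAPPING_POLICY);
    n = 1;
    opal_dss.unpack(buf, &map->ranking, &n, ORTE_RANKING_POLICY);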
@@ -10,6 +10,7 @@
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2010      Oracle and/or its affiliates.  All rights reserved.
 * Copyright (c) 2011      Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow

@@ -23,7 +24,7 @@
#include <sys/types.h>

#include "opal/util/argv.h"
#include "opal/mca/hwloc/hwloc.h"
#include "opal/mca/hwloc/base/base.h"

#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/base/base.h"

@@ -279,9 +280,9 @@ int orte_dt_print_job(char **output, char *prefix, orte_job_t *src, opal_data_ty
        tmp = tmp3;
    }

    asprintf(&tmp2, "%s\n%s\tNum launched: %ld\tNum reported: %ld\n%s\tNum terminated: %ld\tOversubscribe override?: %s",
             tmp, pfx, (long)src->num_launched, (long)src->num_reported, pfx,
             (long)src->num_terminated, src->oversubscribe_override ? "True" : "False");
    asprintf(&tmp2, "%s\n%s\tNum launched: %ld\tNum reported: %ld\tNum terminated: %ld",
             tmp, pfx, (long)src->num_launched, (long)src->num_reported,
             (long)src->num_terminated);
    free(tmp);
    tmp = tmp2;

@@ -376,11 +377,6 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
        }
    }

    asprintf(&tmp2, "%s\n%s\tNum boards: %ld\tNum sockets/board: %ld\tNum cores/socket: %ld", tmp, pfx2,
             (long)src->boards, (long)src->sockets_per_board, (long)src->cores_per_socket);
    free(tmp);
    tmp = tmp2;

    if (NULL == src->daemon) {
        asprintf(&tmp2, "%s\n%s\tDaemon: %s\tDaemon launched: %s", tmp, pfx2,
                 "Not defined", src->daemon_launched ? "True" : "False");

@@ -397,9 +393,8 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
    free(tmp);
    tmp = tmp2;

    asprintf(&tmp2, "%s\n%s\tNum slots allocated: %ld\tMax slots: %ld:\tCpu set: %s", tmp, pfx2,
             (long)src->slots_alloc, (long)src->slots_max,
             (NULL == src->cpu_set) ? "NULL" : src->cpu_set);
    asprintf(&tmp2, "%s\n%s\tNum slots allocated: %ld\tMax slots: %ld", tmp, pfx2,
             (long)src->slots_alloc, (long)src->slots_max);
    free(tmp);
    tmp = tmp2;

@@ -462,7 +457,6 @@ PRINT_PROCS:
int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_type_t type)
{
    char *tmp, *tmp2, *pfx2;
    char *locale=NULL;

    /* set default result */
    *output = NULL;

@@ -474,23 +468,6 @@ int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_
        asprintf(&pfx2, "%s", prefix);
    }

    if (orte_display_diffable_output) {
        /* print only the parts important to testing
         * mapping operations
         */
#if OPAL_HAVE_HWLOC
        if (NULL != src->locale) {
            hwloc_bitmap_list_asprintf(&locale, src->locale->cpuset);
        }
#endif
        asprintf(output, "%s<process rank=%s app_idx=%ld local_rank=%lu node_rank=%lu locale=%s>",
                 pfx2, ORTE_VPID_PRINT(src->name.vpid), (long)src->app_idx,
                 (unsigned long)src->local_rank,
                 (unsigned long)src->node_rank,
                 (NULL == locale) ? "UNKNOWN" : locale);
        return ORTE_SUCCESS;
    }

    if (orte_xml_output) {
        /* need to create the output in XML format */
        if (0 == src->pid) {

@@ -541,14 +518,24 @@ int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_
    tmp = tmp2;

#if OPAL_HAVE_HWLOC
    if (NULL != src->locale) {
        hwloc_bitmap_list_asprintf(&locale, src->locale->cpuset);
    }
#endif
    {
        char *locale=NULL;

        asprintf(&tmp2, "%s\n%s\tState: %s\tRestarts: %d\tApp_context: %ld\tLocale: %s\tSlot list: %s", tmp, pfx2,
                 orte_proc_state_to_str(src->state), src->restarts, (long)src->app_idx,
                 (NULL == locale) ? "UNKNOWN" : locale, (NULL == src->slot_list) ? "NULL" : src->slot_list);
        if (NULL != src->locale) {
            hwloc_bitmap_list_asprintf(&locale, src->locale->cpuset);
        }
        asprintf(&tmp2, "%s\n%s\tState: %s\tRestarts: %d\tApp_context: %ld\tLocale: %s\tBinding: %s[%u]", tmp, pfx2,
                 orte_proc_state_to_str(src->state), src->restarts, (long)src->app_idx,
                 (NULL == locale) ? "UNKNOWN" : locale,
                 (NULL == src->cpu_bitmap) ? "NULL" : src->cpu_bitmap, src->bind_idx);
        if (NULL != locale) {
            free(locale);
        }
    }
#else
    asprintf(&tmp2, "%s\n%s\tState: %s\tRestarts: %d\tApp_context: %ld", tmp, pfx2,
             orte_proc_state_to_str(src->state), src->restarts, (long)src->app_idx);
#endif
    free(tmp);

    /* set the return */

@@ -662,38 +649,6 @@ int orte_dt_print_map(char **output, char *prefix, orte_job_map_t *src, opal_dat
        asprintf(&pfx2, "%s", prefix);
    }

    if (orte_display_diffable_output) {
        /* display just the procs in a diffable format */
        asprintf(&tmp, "<map>\n");
        /* loop through nodes */
        for (i=0; i < src->nodes->size; i++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(src->nodes, i))) {
                continue;
            }
            asprintf(&tmp2, "%s\n\t<host name=%s>", tmp, (NULL == node->name) ? "UNKNOWN" : node->name);
            free(tmp);
            tmp = tmp2;
            for (j=0; j < node->procs->size; j++) {
                if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                    continue;
                }
                orte_dt_print_proc(&tmp2, "\t\t", proc, ORTE_PROC);
                asprintf(&tmp3, "%s\n%s", tmp, tmp2);
                free(tmp2);
                free(tmp);
                tmp = tmp3;
            }
            asprintf(&tmp2, "%s\n\t</host>", tmp);
            free(tmp);
            tmp = tmp2;
        }
        asprintf(&tmp2, "%s\n</map>\n", tmp);
        free(tmp);
        free(pfx2);
        *output = tmp2;
        return ORTE_SUCCESS;
    }

    if (orte_xml_output) {
        /* need to create the output in XML format */
        asprintf(&tmp, "<map>\n");

@@ -733,13 +688,25 @@ int orte_dt_print_map(char **output, char *prefix, orte_job_map_t *src, opal_dat
        asprintf(&pfx, "%s\t", pfx2);

    if (orte_devel_level_output) {
        asprintf(&tmp, "\n%sMapper requested: %s\tLast mapper: %s\tMapping policy: %04x\n%s\tNpernode: %ld\tOversubscribe allowed: %s\tCPU Lists: %s",
#if OPAL_HAVE_HWLOC
        asprintf(&tmp, "\n%sMapper requested: %s Last mapper: %s Mapping policy: %s Ranking policy: %s Binding policy: %s[%s] Cpu set: %s PPR: %s",
                 pfx2, (NULL == src->req_mapper) ? "NULL" : src->req_mapper,
                 (NULL == src->last_mapper) ? "NULL" : src->last_mapper,
                 src->policy, pfx2, (long)src->npernode,
                 (src->oversubscribe) ? "TRUE" : "FALSE",
                 (src->cpu_lists) ? "TRUE" : "FALSE");

                 orte_rmaps_base_print_mapping(src->mapping),
                 orte_rmaps_base_print_ranking(src->ranking),
                 opal_hwloc_base_print_binding(src->binding),
                 opal_hwloc_base_print_level(src->bind_level),
                 (NULL == opal_hwloc_base_cpu_set) ? "NULL" : opal_hwloc_base_cpu_set,
                 (NULL == src->ppr) ? "NULL" : src->ppr);
#else
        asprintf(&tmp, "\n%sMapper requested: %s Last mapper: %s Mapping policy: %s Ranking policy: %s PPR: %s",
                 pfx2, (NULL == src->req_mapper) ? "NULL" : src->req_mapper,
                 (NULL == src->last_mapper) ? "NULL" : src->last_mapper,
                 orte_rmaps_base_print_mapping(src->mapping),
                 orte_rmaps_base_print_ranking(src->ranking),
                 (NULL == src->ppr) ? "NULL" : src->ppr);
#endif

    if (ORTE_VPID_INVALID == src->daemon_vpid_start) {
        asprintf(&tmp2, "%s\n%sNum new daemons: %ld\tNew daemon starting vpid INVALID\n%sNum nodes: %ld",
                 tmp, pfx, (long)src->num_new_daemons, pfx, (long)src->num_nodes);
@@ -7,6 +7,7 @@
 * University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2011      Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow

@@ -196,10 +197,6 @@ int orte_dt_size_proc(size_t *size, orte_proc_t *src, opal_data_type_t type)
    /* if src is NULL, then that's all we wanted */
    if (NULL == src) return ORTE_SUCCESS;

    if (NULL != src->slot_list) {
        *size += strlen(src->slot_list);
    }

#if OPAL_ENABLE_FT_CR == 1
    if (NULL != src->ckpt_snapshot_ref) {
        *size += strlen(src->ckpt_snapshot_ref);
@@ -9,6 +9,7 @@
 * University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2011      Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow

@@ -21,10 +22,11 @@

#include <sys/types.h>

#include "orte/mca/errmgr/errmgr.h"
#include "opal/dss/dss.h"
#include "opal/dss/dss_internal.h"
#include "opal/util/opal_sos.h"
#include "opal/mca/hwloc/hwloc.h"

#include "orte/mca/errmgr/errmgr.h"
#include "orte/runtime/data_type_support/orte_dt_support.h"

/*

@@ -474,16 +476,6 @@ int orte_dt_unpack_node(opal_buffer_t *buffer, void *dest,
            return rc;
        }

        /* do not unpack the board, socket, and core info */

        /* unpack the cpu set */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                                         &(nodes[i]->cpu_set), &n, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* do not unpack the username */
    }
    return ORTE_SUCCESS;

@@ -518,14 +510,6 @@ int orte_dt_unpack_proc(opal_buffer_t *buffer, void *dest,
            return rc;
        }

        /* unpack the pid */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                                         (&(procs[i]->pid)), &n, OPAL_PID))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* unpack the local rank */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,

@@ -534,7 +518,7 @@ int orte_dt_unpack_proc(opal_buffer_t *buffer, void *dest,
            return rc;
        }

        /* unpack the local rank */
        /* unpack the node rank */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                                         (&(procs[i]->node_rank)), &n, ORTE_NODE_RANK))) {

@@ -542,6 +526,16 @@ int orte_dt_unpack_proc(opal_buffer_t *buffer, void *dest,
            return rc;
        }

#if OPAL_HAVE_HWLOC
        /* unpack the binding pattern */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                                         (void*)(&(procs[i]->cpu_bitmap)), &n, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
#endif

        /* unpack the state */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,

@@ -558,13 +552,6 @@ int orte_dt_unpack_proc(opal_buffer_t *buffer, void *dest,
            return rc;
        }

        /* unpack the name of the node where this proc is executing */
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                                         (void*)(&(procs[i]->nodename)), &n, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* unpack the number of restarts */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,

@@ -965,7 +952,7 @@ int orte_dt_unpack_job_state(opal_buffer_t *buffer, void *dest,
 * sending a map - hence, we do not pack that field, so don't unpack it here
 */
int orte_dt_unpack_map(opal_buffer_t *buffer, void *dest,
                       int32_t *num_vals, opal_data_type_t type)
                       int32_t *num_vals, opal_data_type_t type)
{
    int rc;
    int32_t i, n;

@@ -990,34 +977,31 @@ int orte_dt_unpack_map(opal_buffer_t *buffer, void *dest,
            return rc;
        }

        /* unpack the mapper used */
        /* unpack the policies */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                                         &(maps[i]->last_mapper), &n, OPAL_STRING))) {
                                                         &(maps[i]->mapping), &n, ORTE_MAPPING_POLICY))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* unpack the policy */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                                         &(maps[i]->policy), &n, ORTE_MAPPING_POLICY))) {
                                                         &(maps[i]->ranking), &n, ORTE_RANKING_POLICY))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* unpack the #procs/node */
#if OPAL_HAVE_HWLOC
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                                         &(maps[i]->npernode), &n, ORTE_STD_CNTR))) {
                                                         &(maps[i]->binding), &n, OPAL_BINDING_POLICY))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* unpack the oversubscribe flag */
#endif
        /* unpack the ppr */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                                         &(maps[i]->oversubscribe), &n, OPAL_BOOL))) {
                                                         &(maps[i]->ppr), &n, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

@@ -1029,28 +1013,6 @@ int orte_dt_unpack_map(opal_buffer_t *buffer, void *dest,
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* unpack the number of daemons to be created */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &(maps[i]->num_new_daemons), &n, ORTE_STD_CNTR))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* unpack the starting vpid of the new daemons */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &(maps[i]->daemon_vpid_start), &n, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* unpack the number of nodes */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                                         &(maps[i]->num_nodes), &n, ORTE_STD_CNTR))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    }

    return ORTE_SUCCESS;
@@ -9,7 +9,7 @@
 * University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2007-2009 Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2007-2011 Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2009-2010 Oracle and/or its affiliates.  All rights reserved.
 * $COPYRIGHT$
 *

@@ -27,7 +27,7 @@
#endif

#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/paffinity/paffinity.h"
#include "opal/mca/hwloc/hwloc.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/opal_sos.h"

@@ -146,24 +146,15 @@ bool orte_assume_same_shell = true;
/* report launch progress */
bool orte_report_launch_progress = false;

/* cluster hardware info */
uint8_t orte_default_num_boards;
uint8_t orte_default_num_sockets_per_board;
uint8_t orte_default_num_cores_per_socket;

/* allocation specification */
char *orte_default_cpu_set;
char *orte_default_hostfile = NULL;
char *orte_rankfile;
char *orte_rankfile = NULL;
#ifdef __WINDOWS__
char *orte_ccp_headnode;
#endif
int orte_num_allocated_nodes = 0;
char *orte_node_regex = NULL;

/* default rank assigment and binding policy */
orte_mapping_policy_t orte_default_mapping_policy = 0;

/* tool communication controls */
bool orte_report_events = false;
char *orte_report_events_uri = NULL;

@@ -705,7 +696,6 @@ static void orte_job_construct(orte_job_t* job)

    job->map = NULL;
    job->bookmark = NULL;
    job->oversubscribe_override = false;
    job->state = ORTE_JOB_STATE_UNDEF;

    job->num_launched = 0;

@@ -839,15 +829,6 @@ static void orte_node_construct(orte_node_t* node)
    node->slots_alloc = 0;
    node->slots_max = 0;

    node->boards = orte_default_num_boards;
    node->sockets_per_board = orte_default_num_sockets_per_board;
    node->cores_per_socket = orte_default_num_cores_per_socket;
    if (NULL != orte_default_cpu_set) {
        node->cpu_set = strdup(orte_default_cpu_set);
    } else {
        node->cpu_set = NULL;
    }

    node->username = NULL;

#if OPAL_HAVE_HWLOC

@@ -862,6 +843,7 @@ static void orte_node_destruct(orte_node_t* node)
{
    int i;
    opal_node_stats_t *stats;
    orte_proc_t *proc;

    if (NULL != node->name) {
        free(node->name);

@@ -880,18 +862,15 @@ static void orte_node_destruct(orte_node_t* node)
    }

    for (i=0; i < node->procs->size; i++) {
        if (NULL != node->procs->addr[i]) {
            ((orte_proc_t*)(node->procs->addr[i]))->node = NULL;
            OBJ_RELEASE(node->procs->addr[i]);
            node->procs->addr[i] = NULL;
        if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
            opal_pointer_array_set_item(node->procs, i, NULL);
            OBJ_RELEASE(proc);
        }
    }
    OBJ_RELEASE(node->procs);

    if (NULL != node->cpu_set) {
        free(node->cpu_set);
        node->cpu_set = NULL;
    }
    /* we release the topology elsewhere */

    if (NULL != node->username) {
        free(node->username);
        node->username = NULL;

@@ -925,8 +904,9 @@ static void orte_proc_construct(orte_proc_t* proc)
    proc->app_idx = 0;
#if OPAL_HAVE_HWLOC
    proc->locale = NULL;
    proc->bind_idx = 0;
    proc->cpu_bitmap = NULL;
#endif
    proc->slot_list = NULL;
    proc->node = NULL;
    proc->prior_node = NULL;
    proc->nodename = NULL;

@@ -957,11 +937,11 @@ static void orte_proc_destruct(orte_proc_t* proc)
     * associated node object - the node object
     * will free it
     */

    if (NULL != proc->slot_list) {
        free(proc->slot_list);
        proc->slot_list = NULL;
#if OPAL_HAVE_HWLOC
    if (NULL != proc->cpu_bitmap) {
        free(proc->cpu_bitmap);
    }
#endif

    if (NULL != proc->node) {
        OBJ_RELEASE(proc->node);

@@ -1000,21 +980,14 @@ static void orte_nid_construct(orte_nid_t *ptr)
    ptr->name = NULL;
    ptr->daemon = ORTE_VPID_INVALID;
    ptr->oversubscribed = false;
    OBJ_CONSTRUCT(&ptr->sysinfo, opal_list_t);
}

static void orte_nid_destruct(orte_nid_t *ptr)
{
    opal_list_item_t *item;

    if (NULL != ptr->name) {
        free(ptr->name);
        ptr->name = NULL;
    }
    while (NULL != (item = opal_list_remove_first(&ptr->sysinfo))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&ptr->sysinfo);
}

OBJ_CLASS_INSTANCE(orte_nid_t,

@@ -1039,6 +1012,9 @@ static void orte_jmap_construct(orte_jmap_t *ptr)
{
    ptr->job = ORTE_JOBID_INVALID;
    ptr->num_procs = 0;
#if OPAL_HAVE_HWLOC
    ptr->bind_level = OPAL_HWLOC_NODE_LEVEL;
#endif
    OBJ_CONSTRUCT(&ptr->pmap, opal_pointer_array_t);
    opal_pointer_array_init(&ptr->pmap,
                            ORTE_GLOBAL_ARRAY_BLOCK_SIZE,

@@ -1048,12 +1024,13 @@ static void orte_jmap_construct(orte_jmap_t *ptr)

static void orte_jmap_destruct(orte_jmap_t *ptr)
{
    orte_pmap_t **pmaps;
    orte_pmap_t *pmap;
    int i;

    pmaps = (orte_pmap_t**)ptr->pmap.addr;
    for (i=0; i < ptr->pmap.size && NULL != pmaps[i]; i++) {
        OBJ_RELEASE(pmaps[i]);
    for (i=0; i < ptr->pmap.size; i++) {
        if (NULL != (pmap = (orte_pmap_t*)opal_pointer_array_get_item(&ptr->pmap, i))) {
            OBJ_RELEASE(pmap);
        }
    }
    OBJ_DESTRUCT(&ptr->pmap);
}

@@ -1064,20 +1041,19 @@ OBJ_CLASS_INSTANCE(orte_jmap_t,
                   orte_jmap_destruct);


static void orte_job_map_construct(orte_job_map_t* map)
{
    map->req_mapper = NULL;
    map->last_mapper = NULL;
    map->policy = 0;
    map->npernode = 0;
    map->nperboard = 0;
    map->npersocket = 0;
    map->mapping = 0;
    map->ranking = 0;
#if OPAL_HAVE_HWLOC
    map->binding = 0;
    map->bind_level = OPAL_HWLOC_NODE_LEVEL;
#endif
    map->ppr = NULL;
    map->cpus_per_rank = 1;
    map->stride = 1;
    map->oversubscribe = true;  /* default to allowing oversubscribe */
    map->display_map = false;
    map->cpu_lists = false;
    map->num_new_daemons = 0;
    map->daemon_vpid_start = ORTE_VPID_INVALID;
    map->num_nodes = 0;

@@ -1091,17 +1067,21 @@ static void orte_job_map_construct(orte_job_map_t* map)
static void orte_job_map_destruct(orte_job_map_t* map)
{
    orte_std_cntr_t i;
    orte_node_t *node;

    if (NULL != map->req_mapper) {
        free(map->req_mapper);
    }
    if (NULL != map->last_mapper) {
        free(map->last_mapper);
    }
    if (NULL != map->ppr) {
        free(map->ppr);
    }
    for (i=0; i < map->nodes->size; i++) {
        if (NULL != map->nodes->addr[i]) {
            OBJ_RELEASE(map->nodes->addr[i]);
            map->nodes->addr[i] = NULL;
        if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            OBJ_RELEASE(node);
            opal_pointer_array_set_item(map->nodes, i, NULL);
        }
    }
    OBJ_RELEASE(map->nodes);
@ -10,7 +10,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -85,8 +85,6 @@ ORTE_DECLSPEC extern bool orte_in_parallel_debugger;
|
||||
/* error manager callback function */
|
||||
typedef void (*orte_err_cb_fn_t)(orte_process_name_t *proc, orte_proc_state_t state, void *cbdata);
|
||||
|
||||
typedef uint16_t orte_mapping_policy_t;
|
||||
|
||||
ORTE_DECLSPEC extern int orte_exit_status;
|
||||
|
||||
#if ORTE_DISABLE_FULL_SUPPORT
|
||||
@ -169,7 +167,20 @@ typedef struct orte_app_context_t orte_app_context_t;
|
||||
} \
|
||||
} while(0);
|
||||
|
||||
|
||||
/* define a set of flags to control the launch of a job */
|
||||
typedef uint16_t orte_job_controls_t;
|
||||
#define ORTE_JOB_CONTROL OPAL_UINT16
|
||||
|
||||
#define ORTE_JOB_CONTROL_LOCAL_SLAVE 0x0001
|
||||
#define ORTE_JOB_CONTROL_NON_ORTE_JOB 0x0002
|
||||
#define ORTE_JOB_CONTROL_DEBUGGER_DAEMON 0x0014
|
||||
#define ORTE_JOB_CONTROL_FORWARD_OUTPUT 0x0008
|
||||
#define ORTE_JOB_CONTROL_DO_NOT_MONITOR 0x0010
|
||||
#define ORTE_JOB_CONTROL_FORWARD_COMM 0x0020
|
||||
#define ORTE_JOB_CONTROL_CONTINUOUS_OP 0x0040
|
||||
#define ORTE_JOB_CONTROL_RECOVERABLE 0x0080
|
||||
#define ORTE_JOB_CONTROL_SPIN_FOR_DEBUG 0x0100
|
||||
|
||||
/* global type definitions used by RTE - instanced in orte_globals.c */
|
||||
|
||||
/************
|
||||
@ -264,6 +275,8 @@ typedef struct {
|
||||
orte_node_rank_t next_node_rank;
|
||||
/* whether or not we are oversubscribed */
|
||||
bool oversubscribed;
|
||||
/* whether we have been added to the current map */
|
||||
bool mapped;
|
||||
/** State of this node */
|
||||
orte_node_state_t state;
|
||||
/** A "soft" limit on the number of slots available on the node.
|
||||
@ -290,14 +303,6 @@ typedef struct {
|
||||
specified limit. For example, if we have two processors, we
|
||||
may want to allow up to four processes but no more. */
|
||||
orte_std_cntr_t slots_max;
|
||||
/* number of physical boards in the node - defaults to 1 */
|
||||
uint8_t boards;
|
||||
/* number of sockets on each board - defaults to 1 */
|
||||
uint8_t sockets_per_board;
|
||||
/* number of cores per socket - defaults to 1 */
|
||||
uint8_t cores_per_socket;
|
||||
/* cpus on this node that are assigned for our use */
|
||||
char *cpu_set;
|
||||
/** Username on this node, if specified */
|
||||
char *username;
|
||||
#if OPAL_HAVE_HWLOC
|
||||
@ -309,70 +314,6 @@ typedef struct {
|
||||
} orte_node_t;
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_node_t);
|
||||
|
||||
/* define a set of flags to control the launch of a job */
|
||||
typedef uint16_t orte_job_controls_t;
|
||||
#define ORTE_JOB_CONTROL OPAL_UINT16
|
||||
|
||||
#define ORTE_JOB_CONTROL_LOCAL_SLAVE 0x0001
|
||||
#define ORTE_JOB_CONTROL_NON_ORTE_JOB 0x0002
|
||||
#define ORTE_JOB_CONTROL_DEBUGGER_DAEMON 0x0014
|
||||
#define ORTE_JOB_CONTROL_FORWARD_OUTPUT 0x0008
|
||||
#define ORTE_JOB_CONTROL_DO_NOT_MONITOR 0x0010
|
||||
#define ORTE_JOB_CONTROL_FORWARD_COMM 0x0020
|
||||
#define ORTE_JOB_CONTROL_CONTINUOUS_OP 0x0040
|
||||
#define ORTE_JOB_CONTROL_RECOVERABLE 0x0080
|
||||
#define ORTE_JOB_CONTROL_SPIN_FOR_DEBUG 0x0100
|
||||
|
||||
#define ORTE_MAPPING_POLICY OPAL_UINT16
|
||||
/* put the rank assignment method in the upper 8 bits */
|
||||
#define ORTE_MAPPING_USE_VM 0x0100
|
||||
#define ORTE_MAPPING_BYNODE 0x0200
|
||||
#define ORTE_MAPPING_BYSLOT 0x0400
|
||||
#define ORTE_MAPPING_BYSOCKET 0x0800
|
||||
#define ORTE_MAPPING_BYBOARD 0x1000
|
||||
#define ORTE_MAPPING_NO_USE_LOCAL 0x2000
|
||||
#define ORTE_MAPPING_NPERXXX 0x4000
|
||||
#define ORTE_MAPPING_BYUSER 0x8000
|
||||
/* check if policy is set */
|
||||
#define ORTE_MAPPING_POLICY_IS_SET(pol) (pol & 0xff00)
|
||||
/* nice macro for setting these */
|
||||
#define ORTE_SET_MAPPING_POLICY(pol) \
|
||||
orte_default_mapping_policy = (orte_default_mapping_policy & 0x00ff) | (pol);
|
||||
/* macro to detect if some other policy has been set */
|
||||
#define ORTE_XSET_MAPPING_POLICY(pol) \
|
||||
do { \
|
||||
orte_mapping_policy_t tmp; \
|
||||
tmp = (orte_default_mapping_policy & 0xff00) & ~(pol); \
|
||||
if (0 == tmp) { \
|
||||
ORTE_SET_MAPPING_POLICY((pol)); \
|
||||
} \
|
||||
} while(0);
|
||||
/* macro to add another mapping policy */
|
||||
#define ORTE_ADD_MAPPING_POLICY(pol) \
|
||||
orte_default_mapping_policy |= (pol);
|
||||
|
||||
/* put the binding policy in the lower 8 bits, using the paffinity values */
|
||||
#define ORTE_BIND_TO_NONE (uint16_t)OPAL_PAFFINITY_DO_NOT_BIND
|
||||
#define ORTE_BIND_TO_CORE (uint16_t)OPAL_PAFFINITY_BIND_TO_CORE
|
||||
#define ORTE_BIND_TO_SOCKET (uint16_t)OPAL_PAFFINITY_BIND_TO_SOCKET
|
||||
#define ORTE_BIND_TO_BOARD (uint16_t)OPAL_PAFFINITY_BIND_TO_BOARD
|
||||
#define ORTE_BIND_IF_SUPPORTED (uint16_t)OPAL_PAFFINITY_BIND_IF_SUPPORTED
|
||||
/* nice macro for setting these */
|
||||
#define ORTE_SET_BINDING_POLICY(pol) \
|
||||
orte_default_mapping_policy = (orte_default_mapping_policy & 0xff00) | (pol);
|
||||
/* macro to detect if some other policy has been set */
|
||||
#define ORTE_XSET_BINDING_POLICY(pol) \
|
||||
do { \
|
||||
orte_mapping_policy_t tmp; \
|
||||
tmp = (orte_default_mapping_policy & 0x00ff) & ~(pol); \
|
||||
if (0 == tmp) { \
|
||||
ORTE_SET_BINDING_POLICY((pol)); \
|
||||
} \
|
||||
} while(0);
|
||||
/* macro to detect if binding was qualified */
|
||||
#define ORTE_BINDING_NOT_REQUIRED(n) \
|
||||
(ORTE_BIND_IF_SUPPORTED & (n))
|
||||
|
||||
typedef struct {
|
||||
/** Base object so this can be put on a list */
|
||||
opal_list_item_t super;
|
||||
@ -406,11 +347,6 @@ typedef struct {
|
||||
* indicates the node where we stopped
|
||||
*/
|
||||
orte_node_t *bookmark;
|
||||
/** Whether or not to override oversubscription based on local
|
||||
* hardware - used to indicate uncertainty in number of
|
||||
* actual processors available on this node
|
||||
*/
|
||||
bool oversubscribe_override;
|
||||
/* state of the overall job */
|
||||
orte_job_state_t state;
|
||||
/* number of procs launched */
|
||||
@ -484,9 +420,11 @@ struct orte_proc_t {
|
||||
#if OPAL_HAVE_HWLOC
|
||||
/* hwloc object to which this process was mapped */
|
||||
hwloc_obj_t locale;
|
||||
/* where the proc was bound */
|
||||
unsigned int bind_idx;
|
||||
/* string representation of cpu bindings */
|
||||
char *cpu_bitmap;
|
||||
#endif
|
||||
/* a cpu list, if specified by the user */
|
||||
char *slot_list;
|
||||
/* pointer to the node where this proc is executing */
|
||||
orte_node_t *node;
|
||||
/* pointer to the node where this proc last executed */
|
||||
@ -533,8 +471,6 @@ typedef struct {
|
||||
orte_vpid_t daemon;
|
||||
/* whether or not this node is oversubscribed */
|
||||
bool oversubscribed;
|
||||
/* list of system info */
|
||||
opal_list_t sysinfo;
|
||||
} orte_nid_t;
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_nid_t);
|
||||
|
||||
@ -559,6 +495,10 @@ typedef struct {
|
||||
orte_jobid_t job;
|
||||
/* number of procs in this job */
|
||||
orte_vpid_t num_procs;
|
||||
#if OPAL_HAVE_HWLOC
|
||||
/* binding level of the job */
|
||||
opal_hwloc_level_t bind_level;
|
||||
#endif
|
||||
/* array of data for procs */
|
||||
opal_pointer_array_t pmap;
|
||||
} orte_jmap_t;
|
||||
@ -673,13 +613,7 @@ ORTE_DECLSPEC extern bool orte_assume_same_shell;
|
||||
/* whether or not to report launch progress */
|
||||
ORTE_DECLSPEC extern bool orte_report_launch_progress;
|
||||
|
||||
/* cluster hardware info */
|
||||
ORTE_DECLSPEC extern uint8_t orte_default_num_boards;
|
||||
ORTE_DECLSPEC extern uint8_t orte_default_num_sockets_per_board;
|
||||
ORTE_DECLSPEC extern uint8_t orte_default_num_cores_per_socket;
|
||||
|
||||
/* allocation specification */
|
||||
ORTE_DECLSPEC extern char *orte_default_cpu_set;
|
||||
ORTE_DECLSPEC extern char *orte_default_hostfile;
|
||||
ORTE_DECLSPEC extern char *orte_rankfile;
|
||||
#ifdef __WINDOWS__
|
||||
@ -688,16 +622,10 @@ ORTE_DECLSPEC extern char *orte_ccp_headnode;
|
||||
ORTE_DECLSPEC extern int orte_num_allocated_nodes;
|
||||
ORTE_DECLSPEC extern char *orte_node_regex;
|
||||
|
||||
/* default rank assignment and binding policy */
ORTE_DECLSPEC extern orte_mapping_policy_t orte_default_mapping_policy;

/* tool communication controls */
ORTE_DECLSPEC extern bool orte_report_events;
ORTE_DECLSPEC extern char *orte_report_events_uri;

/* report bindings */
ORTE_DECLSPEC extern bool orte_report_bindings;

/* barrier control */
ORTE_DECLSPEC extern bool orte_do_not_barrier;

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
* $COPYRIGHT$
*
@ -28,7 +28,6 @@
#include <stdio.h>

#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/paffinity/base/base.h"
#include "opal/util/output.h"
#include "opal/util/argv.h"

@ -43,8 +42,7 @@ static bool passed_thru = false;
int orte_register_params(void)
{
int value, tmp;
char *strval, **params;
uint16_t binding;
char *strval;

/* only go thru this once - mpirun calls it twice, which causes
* any error messages to show up twice
@ -234,12 +232,6 @@ int orte_register_params(void)
mca_base_param_reg_string_name("orte", "default_hostfile",
"Name of the default hostfile (relative or absolute path)",
false, false, NULL, &orte_default_hostfile);
/* rankfile */
tmp = mca_base_param_reg_string_name("orte", "rankfile",
"Name of the rankfile to be used for mapping processes (relative or absolute path)",
false, false, NULL, NULL);
mca_base_param_reg_syn_name(tmp, "rmaps", "rank_file_path", false);
mca_base_param_lookup_string(tmp, &orte_rankfile);

#ifdef __WINDOWS__
mca_base_param_reg_string_name("orte", "ccp_headnode",
@ -315,11 +307,14 @@ int orte_register_params(void)
"Indicates that multiple app_contexts are being provided that are a mix of 32/64 bit binaries (default: false)",
false, false, (int) false, &value);
orte_hetero_apps = OPAL_INT_TO_BOOL(value);

#if OPAL_HAVE_HWLOC
mca_base_param_reg_int_name("orte", "hetero_nodes",
"Nodes in cluster may differ in topology, so send the topology back from each node [Default = false]",
false, false, (int) false, &value);
orte_hetero_nodes = OPAL_INT_TO_BOOL(value);

#endif

/* allow specification of the launch agent */
mca_base_param_reg_string_name("orte", "launch_agent",
"Command used to start processes on remote nodes (default: orted)",
@ -394,71 +389,6 @@ int orte_register_params(void)
"cpu model detected in node",
true, false, NULL, &orte_local_cpu_model);

/* cluster hardware info */
mca_base_param_reg_int_name("orte", "num_boards",
"Number of processor boards/node (1-256) [default: 1]",
false, false, 1, &value);
orte_default_num_boards = (uint8_t)value;

mca_base_param_reg_int_name("orte", "num_sockets",
"Number of sockets/board (1-256)",
false, false, 0, &value);
orte_default_num_sockets_per_board = (uint8_t)value;

mca_base_param_reg_int_name("orte", "num_cores",
"Number of cores/socket (1-256)",
false, false, 0, &value);
orte_default_num_cores_per_socket = (uint8_t)value;

/* cpu allocation specification */
mca_base_param_reg_string_name("orte", "cpu_set",
"Comma-separated list of ranges specifying logical cpus allocated to this job [default: none]",
false, false, NULL, &orte_default_cpu_set);

/* binding specification - this will be overridden by any cmd line directive, and
* ignored unless opal_paffinity_alone is set
*/
mca_base_param_reg_string_name("orte", "process_binding",
"Policy for binding processes [none | core | socket | board] (supported qualifier: if-avail)",
false, false, NULL, &strval);
if (NULL != strval) {
if (0 == strcasecmp(strval, "none")) {
/* no binding */
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_NONE);
} else {
binding = 0;
params = opal_argv_split(strval, ':');
if (1 < opal_argv_count(params)) {
if (0 != strcasecmp(params[1], "if-avail")) {
/* unknown option */
opal_output(0, "Unknown qualifier to orte_process_binding: %s", strval);
return ORTE_ERR_BAD_PARAM;
}
binding = ORTE_BIND_IF_SUPPORTED;
}
if (0 == strcasecmp(params[0], "socket")) {
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_SOCKET | binding);
} else if (0 == strcasecmp(params[0], "board")) {
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_BOARD | binding);
} else if (0 == strcasecmp(params[0], "core")) {
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_CORE | binding);
}
}
}
/* if nothing was set, but opal_paffinity_alone is set, then default
* to bind-to-core
*/
if (opal_paffinity_alone) {
ORTE_XSET_BINDING_POLICY(ORTE_BIND_TO_CORE);
}

/* whether or not to report bindings */
mca_base_param_reg_int_name("orte", "report_bindings",
"Report bindings",
false, false,
(int) false, &value);
orte_report_bindings = OPAL_INT_TO_BOOL(value);

/* tool communication controls */
mca_base_param_reg_string_name("orte", "report_events",
"URI to which events are to be reported (default: NULL)",

@ -10,7 +10,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
@ -274,13 +274,13 @@ static void dump_aborted_procs(void)
break;
case ORTE_ERR_MULTIPLE_AFFINITIES:
orte_show_help("help-orterun.txt",
"orterun:multiple-paffinity-schemes", true, proc->slot_list);
"orterun:multiple-paffinity-schemes", true, NULL);
break;
case ORTE_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED:
orte_show_help("help-orterun.txt",
"orterun:topo-not-supported",
true, orte_process_info.nodename, "rankfile containing a slot_list of ",
proc->slot_list, approc->app);
NULL, approc->app);
break;
case ORTE_ERR_INVALID_NODE_RANK:
orte_show_help("help-orterun.txt",
@ -326,7 +326,7 @@ static void dump_aborted_procs(void)
case ORTE_ERR_SLOT_LIST_RANGE:
orte_show_help("help-orterun.txt",
"orterun:invalid-slot-list-range",
true, node->name, proc->slot_list);
true, node->name, NULL);
break;
case ORTE_ERR_PIPE_READ_FAILURE:
orte_show_help("help-orterun.txt", "orterun:pipe-read-failure", true,

@ -13,7 +13,7 @@ int main(int argc, char* argv[])
pid_t pid;

pid = getpid();
printf("Parent [pid %ld] starting up!\n", (long)pid);
printf("[pid %ld] starting up!\n", (long)pid);
MPI_Init(NULL, NULL);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
printf("%d completed MPI_Init\n", rank);

@ -10,7 +10,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
@ -50,7 +50,6 @@
#include "opal/mca/event/event.h"
#include "opal/mca/installdirs/installdirs.h"
#include "opal/mca/base/base.h"
#include "opal/mca/paffinity/base/base.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/opal_sos.h"
@ -81,6 +80,7 @@
#include "orte/mca/odls/odls.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/ras/ras.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/rml/base/rml_contact.h"
@ -254,37 +254,7 @@ static opal_cmd_line_init_t cmd_line_init[] = {
NULL, OPAL_CMD_LINE_TYPE_NULL,
"Export an environment variable, optionally specifying a value (e.g., \"-x foo\" exports the environment variable foo and takes its value from the current environment; \"-x foo=bar\" exports the environment variable name foo and sets its value to \"bar\" in the started processes)" },

/* Mapping options */
{ NULL, NULL, NULL, '\0', "bynode", "bynode", 0,
&orterun_globals.by_node, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to assign processes round-robin by node" },
{ NULL, NULL, NULL, '\0', "byslot", "byslot", 0,
&orterun_globals.by_slot, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to assign processes round-robin by slot (the default)" },
{ NULL, NULL, NULL, '\0', "bycore", "bycore", 0,
&orterun_globals.by_slot, OPAL_CMD_LINE_TYPE_BOOL,
"Alias for byslot" },
{ NULL, NULL, NULL, '\0', "bysocket", "bysocket", 0,
&orterun_globals.by_socket, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to assign processes round-robin by socket" },
{ NULL, NULL, NULL, '\0', "byboard", "byboard", 0,
&orterun_globals.by_slot, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to assign processes round-robin by board (equivalent to bynode if only 1 board/node)" },
{ "rmaps", "base", "pernode", '\0', "pernode", "pernode", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Launch one process per available node on the specified number of nodes [no -np => use all allocated nodes]" },
{ "rmaps", "base", "n_pernode", '\0', "npernode", "npernode", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Launch n processes per node on all allocated nodes" },
{ "rmaps", "base", "slot_list", '\0', "slot-list", "slot-list", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"List of processor IDs to bind MPI processes to (e.g., used in conjunction with rank files)" },
{ "rmaps", "base", "no_oversubscribe", '\0', "nooversubscribe", "nooversubscribe", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Nodes are not to be oversubscribed, even if the system supports such operation"},
{ "rmaps", "base", "loadbalance", '\0', "loadbalance", "loadbalance", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Balance total number of procs across all allocated nodes"},
/* Mapping controls */
{ "rmaps", "base", "display_map", '\0', "display-map", "display-map", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Display the process map just before launch"},
@ -303,39 +273,98 @@ static opal_cmd_line_init_t cmd_line_init[] = {
{ "rmaps", "base", "no_schedule_local", '\0', "nolocal", "nolocal", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Do not run any MPI applications on the local node" },
{ "rmaps", "base", "no_oversubscribe", '\0', "nooversubscribe", "nooversubscribe", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Nodes are not to be oversubscribed, even if the system supports such operation"},
{ "rmaps", "base", "oversubscribe", '\0', "oversubscribe", "oversubscribe", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Nodes are allowed to be oversubscribed, even on a managed system"},
#if 0
{ "rmaps", "base", "cpus_per_rank", '\0', "cpus-per-proc", "cpus-per-proc", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Number of cpus to use for each process [default=1]" },
{ "rmaps", "base", "cpus_per_rank", '\0', "cpus-per-rank", "cpus-per-rank", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Synonym for cpus-per-proc" },
{ "rmaps", "base", "n_perboard", '\0', "nperboard", "nperboard", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Launch n processes per board on all allocated nodes" },
{ "rmaps", "base", "n_persocket", '\0', "npersocket", "npersocket", 1,
#endif

/* backward compatibility */
{ "rmaps", "base", "bynode", '\0', "bynode", "bynode", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to map and rank processes round-robin by node" },
{ "rmaps", "base", "byslot", '\0', "byslot", "byslot", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to map and rank processes round-robin by slot" },

/* Nperxxx options that do not require topology and are always
* available - included for backwards compatibility
*/
{ "rmaps", "ppr", "pernode", '\0', "pernode", "pernode", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Launch one process per available node" },
{ "rmaps", "ppr", "n_pernode", '\0', "npernode", "npernode", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Launch n processes per node on all allocated nodes" },

#if OPAL_HAVE_HWLOC
/* declare hardware threads as independent cpus */
{ "hwloc", "base", "use_hwthreads_as_cpus", '\0', "use-hwthread-cpus", "use-hwthread-cpus", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Use hardware threads as independent cpus" },

/* include npersocket for backwards compatibility */
{ "rmaps", "ppr", "n_persocket", '\0', "npersocket", "npersocket", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Launch n processes per socket on all allocated nodes" },

/* binding options */
{ NULL, NULL, NULL, '\0', "bind-to-none", "bind-to-none", 0,
&orterun_globals.bind_to_none, OPAL_CMD_LINE_TYPE_BOOL,
"Do not bind processes to cores or sockets (default)" },
{ NULL, NULL, NULL, '\0', "bind-to-core", "bind-to-core", 0,
&orterun_globals.bind_to_core, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to bind processes to specific cores" },
{ NULL, NULL, NULL, '\0', "bind-to-board", "bind-to-board", 0,
&orterun_globals.bind_to_board, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to bind processes to specific boards (meaningless on 1 board/node)" },
{ NULL, NULL, NULL, '\0', "bind-to-socket", "bind-to-socket", 0,
&orterun_globals.bind_to_socket, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to bind processes to sockets" },
{ "rmaps", "base", "stride", '\0', "stride", "stride", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"When binding multiple cores to a rank, the step size to use between cores [default: 1]" },
{ "orte", "report", "bindings", '\0', "report-bindings", "report-bindings", 0,
/* Mapping options */
{ "rmaps", "base", "mapping_policy", '\0', NULL, "map-by", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Mapping Policy [slot (default) | hwthread | core | socket | numa | board | node]" },

/* Ranking options */
{ "rmaps", "base", "ranking_policy", '\0', NULL, "rank-by", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Ranking Policy [slot (default) | hwthread | core | socket | numa | board | node]" },

/* Binding options */
{ "hwloc", "base", "binding_policy", '\0', NULL, "bind-to", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Policy for binding processes [none (default) | hwthread | core | socket | numa | board] (supported qualifiers: overload-allowed,if-supported)" },

/* backward compatibility */
{ "hwloc", "base", "bind_to_core", '\0', "bind-to-core", "bind-to-core", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Bind processes to cores" },
{ "hwloc", "base", "bind_to_socket", '\0', "bind-to-socket", "bind-to-socket", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Bind processes to sockets" },

{ "hwloc", "base", "report_bindings", '\0', "report-bindings", "report-bindings", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to report process bindings to stderr" },

/* slot list option */
{ "hwloc", "base", "slot_list", '\0', "slot-list", "slot-list", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"List of processor IDs to bind processes to [default=NULL]"},

/* generalized pattern mapping option */
{ "rmaps", "ppr", "pattern", '\0', NULL, "ppr", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Comma-separated list of number of processes on a given resource type [default: none]" },
#else
/* Mapping options */
{ "rmaps", "base", "mapping_policy", '\0', NULL, "map-by", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Mapping Policy [slot (default) | node]" },

/* Ranking options */
{ "rmaps", "base", "ranking_policy", '\0', NULL, "rank-by", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Ranking Policy [slot (default) | node]" },
#endif

/* Allocation options */
{ "ras", "base", "display_alloc", '\0', "display-allocation", "display-allocation", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
@ -343,20 +372,14 @@ static opal_cmd_line_init_t cmd_line_init[] = {
{ "ras", "base", "display_devel_alloc", '\0', "display-devel-allocation", "display-devel-allocation", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Display a detailed list (mostly intended for developers) of the allocation being used by this job"},
{ "orte", "cpu", "set", '\0', "cpu-set", "cpu-set", 1,
#if OPAL_HAVE_HWLOC
{ "hwloc", "base", "cpu_set", '\0', "cpu-set", "cpu-set", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Comma-separated list of ranges specifying logical cpus allocated to this job [default: none]"},

/* cluster hardware info */
{ "orte", "num", "boards", '\0', "num-boards", "num-boards", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Number of processor boards/node (1-256) [default: 1]"},
{ "orte", "num", "sockets", '\0', "num-sockets", "num-sockets", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Number of sockets/board (1-256) [default: 1]"},
{ "orte", "num", "cores", '\0', "num-cores", "num-cores", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Number of cores/socket (1-256) [default: 1]"},
#endif
{ NULL, NULL, NULL, 'H', "host", "host", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"List of hosts to invoke processes on" },

/* mpiexec-like arguments */
{ NULL, NULL, NULL, '\0', "wdir", "wdir", 1,
@ -435,13 +458,11 @@ static opal_cmd_line_init_t cmd_line_init[] = {
NULL, OPAL_CMD_LINE_TYPE_INT,
"Max number of times to restart a failed process" },

{ "orte", "vm", "launch", '\0', "vm", "vm", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Launch daemons on all nodes at start to create a virtual machine [Default = false]" },

#if OPAL_HAVE_HWLOC
{ "orte", "hetero", "nodes", '\0', NULL, "hetero-nodes", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Nodes in cluster may differ in topology, so send the topology back from each node [Default = false]" },
#endif

#if OPAL_ENABLE_CRDEBUG == 1
{ "opal", "cr", "enable_crdebug", '\0', "crdebug", "crdebug", 0,
@ -477,6 +498,8 @@ int orterun(int argc, char *argv[])
char * tmp_env_var = NULL;
orte_debugger_breakpoint_fn_t foo;
orte_job_t *daemons;
int32_t ljob, i;
orte_app_context_t *app, *dapp;

/* find our basename (the name of the executable) so that we can
use it in pretty-print error messages */
@ -572,7 +595,9 @@ int orterun(int argc, char *argv[])
*/
jdata = OBJ_NEW(orte_job_t);
if (NULL == jdata) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
/* cannot call ORTE_ERROR_LOG as the errmgr
* hasn't been loaded yet!
*/
return ORTE_ERR_OUT_OF_RESOURCE;
}

@ -625,7 +650,9 @@ int orterun(int argc, char *argv[])
* require
*/
if (ORTE_SUCCESS != (rc = orte_init(&argc, &argv, ORTE_PROC_HNP))) {
ORTE_ERROR_LOG(rc);
/* cannot call ORTE_ERROR_LOG as it could be the errmgr
* never got loaded!
*/
return rc;
}
/* finalize the OPAL utils. As they are opened again from orte_init->opal_init
@ -633,6 +660,9 @@ int orterun(int argc, char *argv[])
*/
opal_finalize_util();

/* get the daemon job object */
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);

/* check for request to report uri */
if (NULL != orterun_globals.report_uri) {
FILE *fp;
@ -678,14 +708,25 @@ int orterun(int argc, char *argv[])
Since there always MUST be at least one app_context, we are safe in
doing this.
*/
if (NULL != ((orte_app_context_t*)jdata->apps->addr[0])->prefix_dir) {
if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0)) &&
NULL != app->prefix_dir) {
char *oldenv, *newenv, *lib_base, *bin_base;

/* copy the prefix into the daemon job so that any launcher
* can find the orteds when we launch the virtual machine
*/
if (NULL == (dapp = (orte_app_context_t*)opal_pointer_array_get_item(daemons->apps, 0))) {
/* that's an error in the ess */
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
dapp->prefix_dir = strdup(app->prefix_dir);

lib_base = opal_basename(opal_install_dirs.libdir);
bin_base = opal_basename(opal_install_dirs.bindir);

/* Reset PATH */
newenv = opal_os_path( false, ((orte_app_context_t*)jdata->apps->addr[0])->prefix_dir, bin_base, NULL );
newenv = opal_os_path( false, app->prefix_dir, bin_base, NULL );
oldenv = getenv("PATH");
if (NULL != oldenv) {
char *temp;
@ -701,7 +742,7 @@ int orterun(int argc, char *argv[])
free(bin_base);

/* Reset LD_LIBRARY_PATH */
newenv = opal_os_path( false, ((orte_app_context_t*)jdata->apps->addr[0])->prefix_dir, lib_base, NULL );
newenv = opal_os_path( false, app->prefix_dir, lib_base, NULL );
oldenv = getenv("LD_LIBRARY_PATH");
if (NULL != oldenv) {
char* temp;
@ -783,58 +824,63 @@ int orterun(int argc, char *argv[])
}
}

/* if we are launching the vm, now is the time to do so */
if (orte_vm_launch) {
int32_t ljob, i;
orte_app_context_t *app;
/*** LAUNCH THE ORTE VIRTUAL MACHINE ***/

/* we may need to look at the apps for the user's job
* to get our full list of nodes, so prep the job for
* launch. This duplicates some code in orte_plm_base_setup_job
* that won't run if we do this here - eventually, we'll want
* to refactor the plm_base routine to avoid the duplication
/* we may need to look at the apps for the user's job
* to get our full list of nodes, so prep the job for
* launch - start by getting a jobid for it */
if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(jdata))) {
ORTE_ERROR_LOG(rc);
goto DONE;
}

/* store it on the global job data pool - this is the key
* step required before we launch the daemons. It allows
* the orte_rmaps_base_setup_virtual_machine routine to
* search all apps for any hosts to be used by the vm
*/
ljob = ORTE_LOCAL_JOBID(jdata->jobid);
opal_pointer_array_set_item(orte_job_data, ljob, jdata);

/* set the job state */
jdata->state = ORTE_JOB_STATE_INIT;

/* if job recovery is not defined, set it to default */
if (!jdata->recovery_defined) {
/* set to system default */
jdata->enable_recovery = orte_enable_recovery;
}
/* if app recovery is not defined, set apps to defaults */
for (i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
if (!app->recovery_defined) {
app->max_restarts = orte_max_restarts;
}
}

/* if we don't want to launch, then don't attempt to
* launch the daemons - the user really wants to just
* look at the proposed process map
*/
if (!orte_do_not_launch) {
/* run the allocator on the application job - this allows us to
* pickup any host or hostfile arguments so we get the full
* array of nodes in our allocation
*/
/* get a jobid for it */
if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(jdata))) {
ORTE_ERROR_LOG(rc);
if (ORTE_SUCCESS != (rc = orte_ras.allocate(jdata))) {
goto DONE;
}
/* store it on the global job data pool - this is the key
* step required before we launch the daemons. It allows
* the orte_rmaps_base_setup_virtual_machine routine to
* search all apps for any hosts to be used by the vm
*/
ljob = ORTE_LOCAL_JOBID(jdata->jobid);
opal_pointer_array_set_item(orte_job_data, ljob, jdata);

/* set the job state */
jdata->state = ORTE_JOB_STATE_INIT;

/* if job recovery is not defined, set it to default */
if (!jdata->recovery_defined) {
/* set to system default */
jdata->enable_recovery = orte_enable_recovery;
}
/* if app recovery is not defined, set apps to defaults */
for (i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
if (!app->recovery_defined) {
app->max_restarts = orte_max_restarts;
}
}
/* get the daemon job object */
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
/* launch the daemons */
if (ORTE_SUCCESS != (rc = orte_plm.spawn(daemons))) {
fprintf(stderr, "%s: UNABLE TO LAUNCH VIRTUAL MACHINE\n", orte_basename);
goto DONE;
}
/* ensure all future jobs use the VM */
orte_default_mapping_policy |= ORTE_MAPPING_USE_VM;
}

/*** LAUNCH THE APPLICATION ***/
/* setup for debugging */
orte_debugger.init_before_spawn(jdata);

@ -880,13 +926,6 @@ static int init_globals(void)
orterun_globals.help = false;
orterun_globals.version = false;
orterun_globals.verbose = false;
orterun_globals.by_node = false;
orterun_globals.by_slot = false;
orterun_globals.by_board = false;
orterun_globals.by_socket = false;
orterun_globals.bind_to_core = false;
orterun_globals.bind_to_board = false;
orterun_globals.bind_to_socket = false;
orterun_globals.debugger = false;
orterun_globals.num_procs = 0;
if( NULL != orterun_globals.env_val )
@ -982,35 +1021,7 @@ static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line)
run_debugger(orte_basename, cmd_line, argc, argv, orterun_globals.num_procs);
}

/* extract any rank assignment policy directives */
if (orterun_globals.by_node) {
ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYNODE);
} else if (orterun_globals.by_board) {
ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYBOARD);
} else if (orterun_globals.by_socket) {
ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYSOCKET);
} else if (orterun_globals.by_slot) {
ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYSLOT);
}
/* if nothing was specified, leave it as set by
* mca param
*/

/* extract any binding policy directives */
if (orterun_globals.bind_to_socket) {
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_SOCKET);
} else if (orterun_globals.bind_to_board) {
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_BOARD);
} else if (orterun_globals.bind_to_core) {
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_CORE);
} else if (orterun_globals.bind_to_none) {
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_NONE);
}
/* if nothing was specified, leave it as set
* by mca param
*/

/* if recovery was disabled on the cmd line, do so */
/* if recovery was disabled on the cmd line, do so */
if (orterun_globals.disable_recovery) {
orte_enable_recovery = false;
orte_max_restarts = 0;

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -40,14 +40,6 @@ struct orterun_globals_t {
char *report_pid;
char *report_uri;
bool exit;
bool by_node;
bool by_slot;
bool by_board;
bool by_socket;
bool bind_to_none;
bool bind_to_core;
bool bind_to_board;
bool bind_to_socket;
bool debugger;
int num_procs;
char *env_val;

@ -42,7 +42,6 @@
* relative node syntax should generate an immediate error
*/
int orte_util_add_dash_host_nodes(opal_list_t *nodes,
bool *override_oversubscribed,
char ** host_argv)
{
opal_list_item_t* item;
@ -129,14 +128,6 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes,
node->slots_inuse = 0;
node->slots_max = 0;
node->slots = 1;
/* indicate that ORTE should override any oversubscribed conditions
* based on local hardware limits since the user (a) might not have
* provided us any info on the #slots for a node, and (b) the user
* might have been wrong! If we don't check the number of local physical
* processors, then we could be too aggressive on our sched_yield setting
* and cause performance problems.
*/
*override_oversubscribed = true;
opal_list_append(nodes, &node->super);
}
}

@ -30,7 +30,6 @@
BEGIN_C_DECLS

ORTE_DECLSPEC int orte_util_add_dash_host_nodes(opal_list_t *nodes,
bool *override_oversubscribed,
char ** host_argv);

ORTE_DECLSPEC int orte_util_filter_dash_host_nodes(opal_list_t *nodes,

@ -11,6 +11,7 @@
* All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -332,49 +333,6 @@ static int hostfile_parse_line(int token, opal_list_t* updates, opal_list_t* exc
node->username = hostfile_parse_string();
break;

case ORTE_HOSTFILE_BOARDS:
rc = hostfile_parse_int();
if (rc < 0) {
orte_show_help("help-hostfile.txt", "boards",
true,
cur_hostfile_name, rc);
OBJ_RELEASE(node);
return ORTE_ERROR;
}
node->boards = rc;
break;

case ORTE_HOSTFILE_SOCKETS_PER_BOARD:
rc = hostfile_parse_int();
if (rc < 0) {
orte_show_help("help-hostfile.txt", "sockets",
true,
cur_hostfile_name, rc);
OBJ_RELEASE(node);
return ORTE_ERROR;
}
node->sockets_per_board = rc;
break;

case ORTE_HOSTFILE_CORES_PER_SOCKET:
rc = hostfile_parse_int();
if (rc < 0) {
orte_show_help("help-hostfile.txt", "cores",
true,
cur_hostfile_name, rc);
OBJ_RELEASE(node);
return ORTE_ERROR;
}
node->cores_per_socket = rc;
break;

case ORTE_HOSTFILE_CPU_SET:
if (NULL != node->cpu_set) {
free(node->cpu_set);
}
node->cpu_set = hostfile_parse_string();
break;

case ORTE_HOSTFILE_COUNT:
case ORTE_HOSTFILE_CPU:
case ORTE_HOSTFILE_SLOTS:
@ -516,7 +474,6 @@ unlock:
*/

int orte_util_add_hostfile_nodes(opal_list_t *nodes,
bool *override_oversubscribed,
char *hostfile)
{
opal_list_t exclude;
@ -567,15 +524,6 @@ int orte_util_add_hostfile_nodes(opal_list_t *nodes,
OBJ_RELEASE(item);
}

/* indicate that ORTE should override any oversubscribed conditions
* based on local hardware limits since the user (a) might not have
* provided us any info on the #slots for a node, and (b) the user
* might have been wrong! If we don't check the number of local physical
* processors, then we could be too aggressive on our sched_yield setting
* and cause performance problems.
*/
*override_oversubscribed = true;

cleanup:
OBJ_DESTRUCT(&exclude);

@ -31,7 +31,6 @@
BEGIN_C_DECLS

ORTE_DECLSPEC int orte_util_add_hostfile_nodes(opal_list_t *nodes,
bool *override_oversubscribed,
char *hostfile);

ORTE_DECLSPEC int orte_util_filter_hostfile_nodes(opal_list_t *nodes,

@ -10,6 +10,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -75,8 +76,7 @@ extern orte_hostfile_value_t orte_util_hostfile_value;
#define ORTE_HOSTFILE_BOARDS 17
#define ORTE_HOSTFILE_SOCKETS_PER_BOARD 18
#define ORTE_HOSTFILE_CORES_PER_SOCKET 19
#define ORTE_HOSTFILE_CPU_SET 20
/* ensure we can handle a rank_file input */
#define ORTE_HOSTFILE_RANK 21
#define ORTE_HOSTFILE_RANK 20

#endif

@ -12,6 +12,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -144,12 +145,6 @@ cores_per_socket { orte_util_hostfile_value.sval = yytext;
"cores-per-socket" { orte_util_hostfile_value.sval = yytext;
return ORTE_HOSTFILE_CORES_PER_SOCKET; }

cpu_set { orte_util_hostfile_value.sval = yytext;
return ORTE_HOSTFILE_CPU_SET; }

"cpu-set" { orte_util_hostfile_value.sval = yytext;
return ORTE_HOSTFILE_CPU_SET; }

\+n[0-9]+ { orte_util_hostfile_value.sval = yytext;
return ORTE_HOSTFILE_RELATIVE; }
\+[eE][\:][0-9]+ { orte_util_hostfile_value.sval = yytext;

@ -46,7 +46,7 @@
#include "opal/dss/dss.h"
#include "opal/runtime/opal.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/mca/hwloc/hwloc.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/util/output.h"
#include "opal/util/argv.h"

@ -88,6 +88,24 @@ int orte_util_nidmap_init(opal_buffer_t *buffer)
return ORTE_SUCCESS;
}

#if OPAL_HAVE_HWLOC
{
hwloc_topology_t topo;

/* extract the topology */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &topo, &cnt, OPAL_HWLOC_TOPO))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (NULL == opal_hwloc_topology) {
opal_hwloc_topology = topo;
} else {
hwloc_topology_destroy(topo);
}
}
#endif

/* extract the byte object holding the daemonmap */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &bo, &cnt, OPAL_BYTE_OBJECT))) {
@ -113,16 +131,6 @@ int orte_util_nidmap_init(opal_buffer_t *buffer)
return rc;
}
/* the bytes in the object were free'd by the decode */
#if OPAL_HAVE_HWLOC
/* extract the topology */
if (NULL == opal_hwloc_topology) {
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &opal_hwloc_topology, &cnt, OPAL_HWLOC_TOPO))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
#endif

return ORTE_SUCCESS;
}
@ -522,6 +530,9 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr)
orte_job_t *jdata = NULL;
int32_t *nodes = NULL;
int i, j, k, rc = ORTE_SUCCESS;
#if OPAL_HAVE_HWLOC
unsigned int *bind_idx=NULL;
#endif

/* setup the working buffer */
OBJ_CONSTRUCT(&buf, opal_buffer_t);
@ -550,12 +561,21 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr)
ORTE_ERROR_LOG(rc);
goto cleanup_and_return;
}

/* allocate memory for the nodes, local ranks and node ranks */
#if OPAL_HAVE_HWLOC
/* pack the bind level */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &(jdata->map->bind_level), 1, OPAL_HWLOC_LEVEL_T))) {
ORTE_ERROR_LOG(rc);
goto cleanup_and_return;
}
#endif

/* allocate memory for the nodes, local ranks, node ranks, and bind_idx */
nodes = (int32_t*)malloc(jdata->num_procs * sizeof(int32_t));
lrank = (orte_local_rank_t*)malloc(jdata->num_procs*sizeof(orte_local_rank_t));
nrank = (orte_node_rank_t*)malloc(jdata->num_procs*sizeof(orte_node_rank_t));

#if OPAL_HAVE_HWLOC
bind_idx = (unsigned int*)malloc(jdata->num_procs*sizeof(unsigned int));
#endif
/* transfer and pack the node info in one pack */
for (i=0, k=0; i < jdata->procs->size; i++) {
if (NULL == (proc = (orte_proc_t *) opal_pointer_array_get_item(jdata->procs, i))) {
@ -569,6 +589,9 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr)
nodes[k] = proc->node->index;
lrank[k] = proc->local_rank;
nrank[k] = proc->node_rank;
#if OPAL_HAVE_HWLOC
bind_idx[k] = proc->bind_idx;
#endif
++k;
}
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, nodes, jdata->num_procs, OPAL_INT32))) {
@ -585,6 +608,13 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr)
ORTE_ERROR_LOG(rc);
goto cleanup_and_return;
}
#if OPAL_HAVE_HWLOC
/* transfer and pack the bind_idx in one pack */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, bind_idx, jdata->num_procs, OPAL_UINT))) {
ORTE_ERROR_LOG(rc);
goto cleanup_and_return;
}
#endif
}

/* transfer the payload to the byte object */
@ -601,6 +631,11 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr)
if( NULL != nodes ) {
free(nodes);
}
#if OPAL_HAVE_HWLOC
if( NULL != bind_idx ) {
free(bind_idx);
}
#endif
OBJ_DESTRUCT(&buf);

return rc;
@ -612,9 +647,13 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
orte_jobid_t jobid;
orte_vpid_t i, num_procs;
orte_pmap_t *pmap;
int32_t *nodes, my_node;
orte_local_rank_t *local_rank;
orte_node_rank_t *node_rank;
int32_t *nodes=NULL, my_node;
orte_local_rank_t *local_rank=NULL;
orte_node_rank_t *node_rank=NULL;
#if OPAL_HAVE_HWLOC
opal_hwloc_level_t bind_level;
unsigned int *bind_idx=NULL;
#endif
orte_std_cntr_t n;
opal_buffer_t buf;
orte_jmap_t *jmap;
@ -658,6 +697,17 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
goto cleanup;
}

#if OPAL_HAVE_HWLOC
/* unpack the binding level */
n=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &bind_level, &n, OPAL_HWLOC_LEVEL_T))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* set mine */
orte_process_info.bind_level = bind_level;
#endif

/* allocate memory for the node info */
nodes = (int32_t*)malloc(num_procs * 4);
/* unpack it in one shot */
@ -687,6 +737,19 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
goto cleanup;
}

#if OPAL_HAVE_HWLOC
/* allocate memory for bind_idx */
bind_idx = (unsigned int*)malloc(num_procs*sizeof(unsigned int));
/* unpack bind_idx in one shot */
n=num_procs;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, bind_idx, &n, OPAL_UINT))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* set mine */
orte_process_info.bind_idx = bind_idx[ORTE_PROC_MY_NAME->vpid];
#endif

/* if we already know about this job, we need to check the data to see
* if something has changed - e.g., a proc that is being restarted somewhere
* other than where it previously was
@ -703,34 +766,6 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
}
/* now use the opal function to reset the internal pointers */
opal_pointer_array_remove_all(&jmap->pmap);
/* set the size of the storage so we minimize realloc's */
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_size(&jmap->pmap, num_procs))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* add in the updated array */
for (i=0; i < num_procs; i++) {
pmap = OBJ_NEW(orte_pmap_t);
/* add the pidmap entry at the specific site corresponding
* to the proc's vpid
*/
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(&jmap->pmap, i, pmap))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* add/update the data */
pmap->node = nodes[i];
pmap->local_rank = local_rank[i];
pmap->node_rank = node_rank[i];
/* set locality - for now, just do node level */
if (pmap->node == my_node) {
pmap->locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
} else {
pmap->locality = OPAL_PROC_NON_LOCAL;
}
}
/* update the #procs */
jmap->num_procs = num_procs;
} else {
/* if we don't already have this data, store it
* unfortunately, job objects cannot be stored
@ -740,40 +775,67 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
*/
jmap = OBJ_NEW(orte_jmap_t);
jmap->job = jobid;
jmap->num_procs = num_procs;
if (0 > (j = opal_pointer_array_add(&orte_jobmap, jmap))) {
ORTE_ERROR_LOG(j);
rc = j;
goto cleanup;
}
/* allocate memory for the procs array */
opal_pointer_array_set_size(&jmap->pmap, num_procs);
/* xfer the data */
for (i=0; i < num_procs; i++) {
pmap = OBJ_NEW(orte_pmap_t);
pmap->node = nodes[i];
pmap->local_rank = local_rank[i];
pmap->node_rank = node_rank[i];
/* set locality - for now, just do node level */
if (pmap->node == my_node) {
pmap->locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
} else {
pmap->locality = OPAL_PROC_NON_LOCAL;
}
/* add the pidmap entry at the specific site corresponding
* to the proc's vpid
*/
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(&jmap->pmap, i, pmap))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
}
/* update the binding level and num_procs */
#if OPAL_HAVE_HWLOC
jmap->bind_level = bind_level;
#endif
jmap->num_procs = num_procs;
/* set the size of the storage so we minimize realloc's */
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_size(&jmap->pmap, num_procs))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* xfer the data */
for (i=0; i < num_procs; i++) {
pmap = OBJ_NEW(orte_pmap_t);
pmap->node = nodes[i];
pmap->local_rank = local_rank[i];
pmap->node_rank = node_rank[i];
/* set locality */
if (ORTE_PROC_MY_NAME->vpid == i) {
/* this is me */
pmap->locality = OPAL_PROC_ALL_LOCAL;
#if OPAL_HAVE_HWLOC
} else if (pmap->node == my_node) {
/* we share a node - see what else we share */
pmap->locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
orte_process_info.bind_level,
orte_process_info.bind_idx,
jmap->bind_level,
bind_idx[i]);
#else
} else if (pmap->node == my_node) {
pmap->locality = OPAL_PROC_ON_NODE;
#endif
} else {
pmap->locality = OPAL_PROC_NON_LOCAL;
}
/* add the pidmap entry at the specific site corresponding
* to the proc's vpid
*/
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(&jmap->pmap, i, pmap))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
}

/* release data */
free(nodes);
nodes = NULL;
free(local_rank);
local_rank = NULL;
free(node_rank);
node_rank = NULL;
#if OPAL_HAVE_HWLOC
free(bind_idx);
bind_idx = NULL;
#endif
/* setup for next cycle */
n = 1;
}
@ -781,7 +843,21 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
rc = ORTE_SUCCESS;
}

cleanup:
if (NULL != nodes) {
free(nodes);
}
if (NULL != local_rank) {
free(local_rank);
}
if (NULL != node_rank) {
free(node_rank);
}
#if OPAL_HAVE_HWLOC
if (NULL != bind_idx) {
free(bind_idx);
}
#endif
OBJ_DESTRUCT(&buf);
return rc;
}

@ -69,6 +69,10 @@ ORTE_DECLSPEC orte_proc_info_t orte_process_info = {
/* .sock_stdin = */ NULL,
/* .sock_stdout = */ NULL,
/* .sock_stderr = */ NULL,
#if OPAL_HAVE_HWLOC
/* .bind_level = */ OPAL_HWLOC_NODE_LEVEL,
/* .bind_idx = */ 0,
#endif
/* .job_name = */ NULL,
/* .job_instance = */ NULL,
/* .executable = */ NULL,

@ -37,7 +37,9 @@
#endif

#include "orte/types.h"

#include "opal/dss/dss_types.h"
#include "opal/mca/hwloc/hwloc.h"

BEGIN_C_DECLS

@ -111,6 +113,10 @@ struct orte_proc_info_t {
char *sock_stdin; /**< Path name to temp file for stdin. */
char *sock_stdout; /**< Path name to temp file for stdout. */
char *sock_stderr; /**< Path name to temp file for stderr. */
#if OPAL_HAVE_HWLOC
opal_hwloc_level_t bind_level;
unsigned int bind_idx;
#endif
/* name/instance info for debug support */
char *job_name;
char *job_instance;
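Usage sketch (illustrative only, not part of this commit's diff; ./my_app is a placeholder executable, and the option names are taken from the map-by/rank-by/bind-to entries registered in the cmd_line_init table above):

    mpirun -np 8 --map-by socket --rank-by core --bind-to core --report-bindings ./my_app

This would map processes round-robin by socket, assign ranks by core, bind each process to a core, and report the resulting bindings to stderr.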