Get staged execution working on multi-node setups. Improve efficiency by remapping only when some procs in the job have not yet been mapped.
This commit was SVN r27181.
Этот коммит содержится в:
родитель
dd5bd99942
Коммит
1b659de132
@ -34,6 +34,7 @@ BEGIN_C_DECLS
|
|||||||
#define ORTE_DB_ARCH "orte.arch"
|
#define ORTE_DB_ARCH "orte.arch"
|
||||||
#define ORTE_DB_NPROCS "orte.nprocs"
|
#define ORTE_DB_NPROCS "orte.nprocs"
|
||||||
#define ORTE_DB_RMLURI "orte.rmluri"
|
#define ORTE_DB_RMLURI "orte.rmluri"
|
||||||
|
#define ORTE_DB_BIND_BITMAP "orte.bind.bitmap"
|
||||||
|
|
||||||
END_C_DECLS
|
END_C_DECLS
|
||||||
|
|
||||||
|
@ -98,7 +98,6 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
|
|||||||
{
|
{
|
||||||
int rc;
|
int rc;
|
||||||
orte_job_t *jdata=NULL;
|
orte_job_t *jdata=NULL;
|
||||||
orte_proc_t *proc;
|
|
||||||
orte_job_map_t *map=NULL;
|
orte_job_map_t *map=NULL;
|
||||||
opal_buffer_t *wireup;
|
opal_buffer_t *wireup;
|
||||||
opal_byte_object_t bo, *boptr;
|
opal_byte_object_t bo, *boptr;
|
||||||
@ -259,18 +258,6 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* pack the binding bitmaps */
|
|
||||||
for (j=0; j < jdata->procs->size; j++) {
|
|
||||||
if (NULL == (proc = (orte_proc_t *) opal_pointer_array_get_item(jdata->procs, j))) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
/* okay to pack NULL strings */
|
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &proc->cpu_bitmap, 1, OPAL_STRING))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
return rc;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* pack the collective ids */
|
/* pack the collective ids */
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->peer_modex, 1, ORTE_GRPCOMM_COLL_ID_T))) {
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->peer_modex, 1, ORTE_GRPCOMM_COLL_ID_T))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
@ -324,7 +311,8 @@ static int check_local_proc(orte_job_t *jdata, orte_proc_t *pptr)
|
|||||||
if (!pptr->local_proc) {
|
if (!pptr->local_proc) {
|
||||||
/* not on the local list */
|
/* not on the local list */
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||||
"adding proc %s to my local list",
|
"%s adding proc %s to my local list",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(&pptr->name)));
|
ORTE_NAME_PRINT(&pptr->name)));
|
||||||
/* keep tabs of the number of local procs */
|
/* keep tabs of the number of local procs */
|
||||||
jdata->num_local_procs++;
|
jdata->num_local_procs++;
|
||||||
@ -351,7 +339,6 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
|||||||
orte_vpid_t j;
|
orte_vpid_t j;
|
||||||
orte_std_cntr_t cnt;
|
orte_std_cntr_t cnt;
|
||||||
orte_job_t *jdata=NULL;
|
orte_job_t *jdata=NULL;
|
||||||
orte_proc_t *proc;
|
|
||||||
opal_byte_object_t *bo;
|
opal_byte_object_t *bo;
|
||||||
int8_t flag;
|
int8_t flag;
|
||||||
int32_t n;
|
int32_t n;
|
||||||
@ -539,20 +526,6 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
|||||||
goto REPORT_ERROR;
|
goto REPORT_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* unpack the binding bitmaps */
|
|
||||||
for (j=0; j < jdata->num_procs; j++) {
|
|
||||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
|
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
|
||||||
rc = ORTE_ERR_NOT_FOUND;
|
|
||||||
goto REPORT_ERROR;
|
|
||||||
}
|
|
||||||
cnt = 1;
|
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &proc->cpu_bitmap, &cnt, OPAL_STRING))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
goto REPORT_ERROR;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* unpack the collective ids */
|
/* unpack the collective ids */
|
||||||
cnt=1;
|
cnt=1;
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->peer_modex, &cnt, ORTE_GRPCOMM_COLL_ID_T))) {
|
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->peer_modex, &cnt, ORTE_GRPCOMM_COLL_ID_T))) {
|
||||||
|
@ -10,7 +10,7 @@
|
|||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2011 Los Alamos National Security, LLC.
|
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
|
@ -9,6 +9,8 @@
|
|||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||||
|
* Copyright (c) 2012 Los Alamos National Security, LLC.
|
||||||
|
* All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
|
@ -48,6 +48,7 @@ static int staged_mapper(orte_job_t *jdata)
|
|||||||
orte_proc_t *proc;
|
orte_proc_t *proc;
|
||||||
orte_node_t *node;
|
orte_node_t *node;
|
||||||
bool work_to_do = false;
|
bool work_to_do = false;
|
||||||
|
opal_list_item_t *item;
|
||||||
|
|
||||||
/* only use this mapper if it was specified */
|
/* only use this mapper if it was specified */
|
||||||
if (NULL == jdata->map->req_mapper ||
|
if (NULL == jdata->map->req_mapper ||
|
||||||
@ -100,6 +101,9 @@ static int staged_mapper(orte_job_t *jdata)
|
|||||||
}
|
}
|
||||||
/* if nothing is available, then move on */
|
/* if nothing is available, then move on */
|
||||||
if (0 == num_slots || 0 == opal_list_get_size(&node_list)) {
|
if (0 == num_slots || 0 == opal_list_get_size(&node_list)) {
|
||||||
|
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
|
||||||
|
"%s mca:rmaps:staged: no nodes available for this app",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||||
OBJ_DESTRUCT(&node_list);
|
OBJ_DESTRUCT(&node_list);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -110,20 +114,46 @@ static int staged_mapper(orte_job_t *jdata)
|
|||||||
}
|
}
|
||||||
if (ORTE_PROC_STATE_UNDEF != proc->state) {
|
if (ORTE_PROC_STATE_UNDEF != proc->state) {
|
||||||
/* this proc has already been mapped or executed */
|
/* this proc has already been mapped or executed */
|
||||||
|
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
|
||||||
|
"%s mca:rmaps:staged: proc %s has already been mapped",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(&proc->name));
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
/* flag that there is at least one proc still to
|
/* flag that there is at least one proc still to
|
||||||
* be executed
|
* be executed
|
||||||
*/
|
*/
|
||||||
work_to_do = true;
|
work_to_do = true;
|
||||||
|
/* track number mapped */
|
||||||
|
jdata->num_mapped++;
|
||||||
/* map this proc to the first available slot */
|
/* map this proc to the first available slot */
|
||||||
node = (orte_node_t*)opal_list_get_first(&node_list);
|
node = (orte_node_t*)opal_list_get_first(&node_list);
|
||||||
OBJ_RETAIN(node); /* maintain accounting on object */
|
OBJ_RETAIN(node); /* maintain accounting on object */
|
||||||
|
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
|
||||||
|
"%s mca:rmaps:staged: assigning proc %s to node %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(&proc->name), node->name);
|
||||||
proc->node = node;
|
proc->node = node;
|
||||||
proc->nodename = node->name;
|
proc->nodename = node->name;
|
||||||
|
/* the local rank is the number of procs
|
||||||
|
* on this node from this job - we don't
|
||||||
|
* directly track this number, so it must
|
||||||
|
* be found by looping across the node->procs
|
||||||
|
* array and counting it each time. For now,
|
||||||
|
* since we don't use this value in this mode
|
||||||
|
* of operation, just set it to something arbitrary
|
||||||
|
*/
|
||||||
|
proc->local_rank = node->num_procs;
|
||||||
|
/* the node rank is simply the number of procs
|
||||||
|
* on the node at this time
|
||||||
|
*/
|
||||||
|
proc->node_rank = node->num_procs;
|
||||||
|
/* track number of procs on node and number of slots used */
|
||||||
node->num_procs++;
|
node->num_procs++;
|
||||||
node->slots_inuse++;
|
node->slots_inuse++;
|
||||||
if (node->slots_inuse == node->slots_alloc) {
|
if (node->slots_inuse == node->slots_alloc) {
|
||||||
|
opal_output(0, "%s slots on node %s are fully used",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name);
|
||||||
opal_list_remove_item(&node_list, &node->super);
|
opal_list_remove_item(&node_list, &node->super);
|
||||||
OBJ_RELEASE(node);
|
OBJ_RELEASE(node);
|
||||||
}
|
}
|
||||||
@ -154,6 +184,11 @@ static int staged_mapper(orte_job_t *jdata)
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
/* clear the list */
|
||||||
|
while (NULL != (item = opal_list_remove_first(&node_list))) {
|
||||||
|
OBJ_RELEASE(item);
|
||||||
|
}
|
||||||
|
OBJ_DESTRUCT(&node_list);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* if there isn't at least one proc that can be launched,
|
/* if there isn't at least one proc that can be launched,
|
||||||
|
@ -253,6 +253,7 @@ static void setup_job_complete(int fd, short args, void *cbdata)
|
|||||||
jdata->map = OBJ_NEW(orte_job_map_t);
|
jdata->map = OBJ_NEW(orte_job_map_t);
|
||||||
jdata->map->req_mapper = strdup("staged");
|
jdata->map->req_mapper = strdup("staged");
|
||||||
ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_STAGED);
|
ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_STAGED);
|
||||||
|
ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
|
||||||
jdata->map->display_map = orte_rmaps_base.display_map;
|
jdata->map->display_map = orte_rmaps_base.display_map;
|
||||||
}
|
}
|
||||||
orte_plm_base_setup_job_complete(0, 0, (void*)caddy);
|
orte_plm_base_setup_job_complete(0, 0, (void*)caddy);
|
||||||
@ -347,12 +348,13 @@ static void track_procs(int fd, short args, void *cbdata)
|
|||||||
if (jdata->num_terminated == jdata->num_procs) {
|
if (jdata->num_terminated == jdata->num_procs) {
|
||||||
/* no other procs are waiting, so end this job */
|
/* no other procs are waiting, so end this job */
|
||||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
|
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
|
||||||
} else {
|
} else if (jdata->num_mapped < jdata->num_procs) {
|
||||||
/* schedule the job for re-mapping so that any procs
|
/* schedule the job for re-mapping so that procs
|
||||||
* waiting for resources can execute
|
* waiting for resources can execute
|
||||||
*/
|
*/
|
||||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);
|
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);
|
||||||
}
|
}
|
||||||
|
/* otherwise, do nothing until more procs terminate */
|
||||||
OBJ_RELEASE(caddy);
|
OBJ_RELEASE(caddy);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -710,6 +710,7 @@ static void orte_job_construct(orte_job_t* job)
|
|||||||
job->state = ORTE_JOB_STATE_UNDEF;
|
job->state = ORTE_JOB_STATE_UNDEF;
|
||||||
job->restart = false;
|
job->restart = false;
|
||||||
|
|
||||||
|
job->num_mapped = 0;
|
||||||
job->num_launched = 0;
|
job->num_launched = 0;
|
||||||
job->num_reported = 0;
|
job->num_reported = 0;
|
||||||
job->num_terminated = 0;
|
job->num_terminated = 0;
|
||||||
|
@ -406,6 +406,8 @@ typedef struct {
|
|||||||
orte_job_state_t state;
|
orte_job_state_t state;
|
||||||
/* some procs in this job are being restarted */
|
/* some procs in this job are being restarted */
|
||||||
bool restart;
|
bool restart;
|
||||||
|
/* number of procs mapped */
|
||||||
|
orte_vpid_t num_mapped;
|
||||||
/* number of procs launched */
|
/* number of procs launched */
|
||||||
orte_vpid_t num_launched;
|
orte_vpid_t num_launched;
|
||||||
/* number of procs reporting contact info */
|
/* number of procs reporting contact info */
|
||||||
|
@ -602,6 +602,10 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr, bool update)
|
|||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
goto cleanup_and_return;
|
goto cleanup_and_return;
|
||||||
}
|
}
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &proc->cpu_bitmap, 1, OPAL_STRING))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
goto cleanup_and_return;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &proc->state, 1, ORTE_PROC_STATE))) {
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &proc->state, 1, ORTE_PROC_STATE))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
@ -642,6 +646,7 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
|
|||||||
opal_hwloc_level_t bind_level = OPAL_HWLOC_NODE_LEVEL, pbind, *lvptr;
|
opal_hwloc_level_t bind_level = OPAL_HWLOC_NODE_LEVEL, pbind, *lvptr;
|
||||||
unsigned int bind_idx, pbidx, *uiptr;
|
unsigned int bind_idx, pbidx, *uiptr;
|
||||||
opal_hwloc_locality_t locality;
|
opal_hwloc_locality_t locality;
|
||||||
|
char *cpu_bitmap;
|
||||||
#endif
|
#endif
|
||||||
orte_std_cntr_t n;
|
orte_std_cntr_t n;
|
||||||
opal_buffer_t buf;
|
opal_buffer_t buf;
|
||||||
@ -742,29 +747,19 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
|
|||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
n=1;
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &cpu_bitmap, &n, OPAL_STRING))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
if (proc.jobid == ORTE_PROC_MY_NAME->jobid &&
|
if (proc.jobid == ORTE_PROC_MY_NAME->jobid &&
|
||||||
proc.vpid == ORTE_PROC_MY_NAME->vpid) {
|
proc.vpid == ORTE_PROC_MY_NAME->vpid) {
|
||||||
/* set mine */
|
/* set mine */
|
||||||
orte_process_info.my_local_rank = local_rank;
|
orte_process_info.my_local_rank = local_rank;
|
||||||
if (ORTE_SUCCESS != (rc = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_LOCALRANK,
|
|
||||||
&orte_process_info.my_local_rank, ORTE_LOCAL_RANK))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
orte_process_info.my_node_rank = node_rank;
|
orte_process_info.my_node_rank = node_rank;
|
||||||
if (ORTE_SUCCESS != (rc = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_NODERANK,
|
|
||||||
&orte_process_info.my_node_rank, ORTE_NODE_RANK))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
#if OPAL_HAVE_HWLOC
|
#if OPAL_HAVE_HWLOC
|
||||||
orte_process_info.bind_idx = bind_idx;
|
orte_process_info.bind_idx = bind_idx;
|
||||||
if (ORTE_SUCCESS != (rc = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_BIND_INDEX,
|
|
||||||
&orte_process_info.bind_idx, OPAL_UINT))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
/* apps don't need the rest of the data in the buffer for this proc,
|
/* apps don't need the rest of the data in the buffer for this proc,
|
||||||
@ -785,13 +780,31 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
|
|||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
/* we don't need to store data for ourself in the database
|
/* store the values in the database */
|
||||||
* as we already did so
|
if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_LOCALRANK, &local_rank, ORTE_LOCAL_RANK))) {
|
||||||
*/
|
ORTE_ERROR_LOG(rc);
|
||||||
if (proc.jobid == ORTE_PROC_MY_NAME->jobid &&
|
goto cleanup;
|
||||||
proc.vpid == ORTE_PROC_MY_NAME->vpid) {
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
|
if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_NODERANK, &node_rank, ORTE_NODE_RANK))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
#if OPAL_HAVE_HWLOC
|
||||||
|
if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_BIND_INDEX, &bind_idx, OPAL_UINT))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_BIND_BITMAP, cpu_bitmap, OPAL_STRING))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
/* we don't need to store the rest of the values
|
||||||
|
* for ourself in the database
|
||||||
|
* as we already did so during startup
|
||||||
|
*/
|
||||||
|
if (proc.jobid != ORTE_PROC_MY_NAME->jobid ||
|
||||||
|
proc.vpid != ORTE_PROC_MY_NAME->vpid) {
|
||||||
/* store the data for this proc */
|
/* store the data for this proc */
|
||||||
if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_DAEMON_VPID, &dmn.vpid, ORTE_VPID))) {
|
if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_DAEMON_VPID, &dmn.vpid, ORTE_VPID))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
@ -806,20 +819,7 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
|
|||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_LOCALRANK, &local_rank, ORTE_LOCAL_RANK))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
goto cleanup;
|
|
||||||
}
|
}
|
||||||
if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_NODERANK, &node_rank, ORTE_NODE_RANK))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
#if OPAL_HAVE_HWLOC
|
|
||||||
if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_BIND_INDEX, &bind_idx, OPAL_UINT))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
/* setup for next cycle */
|
/* setup for next cycle */
|
||||||
n = 1;
|
n = 1;
|
||||||
@ -929,6 +929,7 @@ int orte_util_decode_daemon_pidmap(opal_byte_object_t *bo)
|
|||||||
#if OPAL_HAVE_HWLOC
|
#if OPAL_HAVE_HWLOC
|
||||||
opal_hwloc_level_t bind_level = OPAL_HWLOC_NODE_LEVEL;
|
opal_hwloc_level_t bind_level = OPAL_HWLOC_NODE_LEVEL;
|
||||||
unsigned int bind_idx;
|
unsigned int bind_idx;
|
||||||
|
char *cpu_bitmap;
|
||||||
#endif
|
#endif
|
||||||
orte_std_cntr_t n;
|
orte_std_cntr_t n;
|
||||||
opal_buffer_t buf;
|
opal_buffer_t buf;
|
||||||
@ -1014,6 +1015,11 @@ int orte_util_decode_daemon_pidmap(opal_byte_object_t *bo)
|
|||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
n=1;
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &cpu_bitmap, &n, OPAL_STRING))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
n=1;
|
n=1;
|
||||||
if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, &state, &n, ORTE_PROC_STATE))) {
|
if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, &state, &n, ORTE_PROC_STATE))) {
|
||||||
@ -1111,6 +1117,10 @@ int orte_util_decode_daemon_pidmap(opal_byte_object_t *bo)
|
|||||||
proc->app_idx = app_idx;
|
proc->app_idx = app_idx;
|
||||||
proc->restarts = restarts;
|
proc->restarts = restarts;
|
||||||
proc->state = state;
|
proc->state = state;
|
||||||
|
#if OPAL_HAVE_HWLOC
|
||||||
|
proc->bind_idx = bind_idx;
|
||||||
|
proc->cpu_bitmap = cpu_bitmap;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
/* setup for next cycle */
|
/* setup for next cycle */
|
||||||
n = 1;
|
n = 1;
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user