1
1

Get staged execution working on multi-node setups. Improve efficiency by remapping only when not all procs in the job have been mapped yet.

This commit was SVN r27181.
Этот коммит содержится в:
Ralph Castain 2012-08-29 20:35:52 +00:00
родитель dd5bd99942
Коммит 1b659de132
9 изменённых файлов: 95 добавлений и 69 удалений

Просмотреть файл

@ -34,6 +34,7 @@ BEGIN_C_DECLS
#define ORTE_DB_ARCH "orte.arch" #define ORTE_DB_ARCH "orte.arch"
#define ORTE_DB_NPROCS "orte.nprocs" #define ORTE_DB_NPROCS "orte.nprocs"
#define ORTE_DB_RMLURI "orte.rmluri" #define ORTE_DB_RMLURI "orte.rmluri"
#define ORTE_DB_BIND_BITMAP "orte.bind.bitmap"
END_C_DECLS END_C_DECLS

Просмотреть файл

@ -98,7 +98,6 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
{ {
int rc; int rc;
orte_job_t *jdata=NULL; orte_job_t *jdata=NULL;
orte_proc_t *proc;
orte_job_map_t *map=NULL; orte_job_map_t *map=NULL;
opal_buffer_t *wireup; opal_buffer_t *wireup;
opal_byte_object_t bo, *boptr; opal_byte_object_t bo, *boptr;
@ -259,18 +258,6 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
return rc; return rc;
} }
/* pack the binding bitmaps */
for (j=0; j < jdata->procs->size; j++) {
if (NULL == (proc = (orte_proc_t *) opal_pointer_array_get_item(jdata->procs, j))) {
continue;
}
/* okay to pack NULL strings */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &proc->cpu_bitmap, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* pack the collective ids */ /* pack the collective ids */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->peer_modex, 1, ORTE_GRPCOMM_COLL_ID_T))) { if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->peer_modex, 1, ORTE_GRPCOMM_COLL_ID_T))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
@ -324,7 +311,8 @@ static int check_local_proc(orte_job_t *jdata, orte_proc_t *pptr)
if (!pptr->local_proc) { if (!pptr->local_proc) {
/* not on the local list */ /* not on the local list */
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"adding proc %s to my local list", "%s adding proc %s to my local list",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&pptr->name))); ORTE_NAME_PRINT(&pptr->name)));
/* keep tabs of the number of local procs */ /* keep tabs of the number of local procs */
jdata->num_local_procs++; jdata->num_local_procs++;
@ -351,7 +339,6 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
orte_vpid_t j; orte_vpid_t j;
orte_std_cntr_t cnt; orte_std_cntr_t cnt;
orte_job_t *jdata=NULL; orte_job_t *jdata=NULL;
orte_proc_t *proc;
opal_byte_object_t *bo; opal_byte_object_t *bo;
int8_t flag; int8_t flag;
int32_t n; int32_t n;
@ -539,20 +526,6 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
goto REPORT_ERROR; goto REPORT_ERROR;
} }
/* unpack the binding bitmaps */
for (j=0; j < jdata->num_procs; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
rc = ORTE_ERR_NOT_FOUND;
goto REPORT_ERROR;
}
cnt = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &proc->cpu_bitmap, &cnt, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
}
/* unpack the collective ids */ /* unpack the collective ids */
cnt=1; cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->peer_modex, &cnt, ORTE_GRPCOMM_COLL_ID_T))) { if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->peer_modex, &cnt, ORTE_GRPCOMM_COLL_ID_T))) {

Просмотреть файл

@ -10,7 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Los Alamos National Security, LLC. * Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved. * All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *

Просмотреть файл

@ -8,7 +8,9 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow

Просмотреть файл

@ -48,6 +48,7 @@ static int staged_mapper(orte_job_t *jdata)
orte_proc_t *proc; orte_proc_t *proc;
orte_node_t *node; orte_node_t *node;
bool work_to_do = false; bool work_to_do = false;
opal_list_item_t *item;
/* only use this mapper if it was specified */ /* only use this mapper if it was specified */
if (NULL == jdata->map->req_mapper || if (NULL == jdata->map->req_mapper ||
@ -100,6 +101,9 @@ static int staged_mapper(orte_job_t *jdata)
} }
/* if nothing is available, then move on */ /* if nothing is available, then move on */
if (0 == num_slots || 0 == opal_list_get_size(&node_list)) { if (0 == num_slots || 0 == opal_list_get_size(&node_list)) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"%s mca:rmaps:staged: no nodes available for this app",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
OBJ_DESTRUCT(&node_list); OBJ_DESTRUCT(&node_list);
continue; continue;
} }
@ -110,20 +114,46 @@ static int staged_mapper(orte_job_t *jdata)
} }
if (ORTE_PROC_STATE_UNDEF != proc->state) { if (ORTE_PROC_STATE_UNDEF != proc->state) {
/* this proc has already been mapped or executed */ /* this proc has already been mapped or executed */
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"%s mca:rmaps:staged: proc %s has already been mapped",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name));
continue; continue;
} }
/* flag that there is at least one proc still to /* flag that there is at least one proc still to
* be executed * be executed
*/ */
work_to_do = true; work_to_do = true;
/* track number mapped */
jdata->num_mapped++;
/* map this proc to the first available slot */ /* map this proc to the first available slot */
node = (orte_node_t*)opal_list_get_first(&node_list); node = (orte_node_t*)opal_list_get_first(&node_list);
OBJ_RETAIN(node); /* maintain accounting on object */ OBJ_RETAIN(node); /* maintain accounting on object */
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"%s mca:rmaps:staged: assigning proc %s to node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name), node->name);
proc->node = node; proc->node = node;
proc->nodename = node->name; proc->nodename = node->name;
/* the local rank is the number of procs
* on this node from this job - we don't
* directly track this number, so it must
* be found by looping across the node->procs
* array and counting it each time. For now,
* since we don't use this value in this mode
* of operation, just set it to something arbitrary
*/
proc->local_rank = node->num_procs;
/* the node rank is simply the number of procs
* on the node at this time
*/
proc->node_rank = node->num_procs;
/* track number of procs on node and number of slots used */
node->num_procs++; node->num_procs++;
node->slots_inuse++; node->slots_inuse++;
if (node->slots_inuse == node->slots_alloc) { if (node->slots_inuse == node->slots_alloc) {
opal_output(0, "%s slots on node %s are fully used",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name);
opal_list_remove_item(&node_list, &node->super); opal_list_remove_item(&node_list, &node->super);
OBJ_RELEASE(node); OBJ_RELEASE(node);
} }
@ -154,6 +184,11 @@ static int staged_mapper(orte_job_t *jdata)
break; break;
} }
} }
/* clear the list */
while (NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&node_list);
} }
/* if there isn't at least one proc that can be launched, /* if there isn't at least one proc that can be launched,

Просмотреть файл

@ -253,6 +253,7 @@ static void setup_job_complete(int fd, short args, void *cbdata)
jdata->map = OBJ_NEW(orte_job_map_t); jdata->map = OBJ_NEW(orte_job_map_t);
jdata->map->req_mapper = strdup("staged"); jdata->map->req_mapper = strdup("staged");
ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_STAGED); ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_STAGED);
ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
jdata->map->display_map = orte_rmaps_base.display_map; jdata->map->display_map = orte_rmaps_base.display_map;
} }
orte_plm_base_setup_job_complete(0, 0, (void*)caddy); orte_plm_base_setup_job_complete(0, 0, (void*)caddy);
@ -347,12 +348,13 @@ static void track_procs(int fd, short args, void *cbdata)
if (jdata->num_terminated == jdata->num_procs) { if (jdata->num_terminated == jdata->num_procs) {
/* no other procs are waiting, so end this job */ /* no other procs are waiting, so end this job */
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
} else { } else if (jdata->num_mapped < jdata->num_procs) {
/* schedule the job for re-mapping so that any procs /* schedule the job for re-mapping so that procs
* waiting for resources can execute * waiting for resources can execute
*/ */
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);
} }
/* otherwise, do nothing until more procs terminate */
OBJ_RELEASE(caddy); OBJ_RELEASE(caddy);
return; return;
} }

Просмотреть файл

@ -710,6 +710,7 @@ static void orte_job_construct(orte_job_t* job)
job->state = ORTE_JOB_STATE_UNDEF; job->state = ORTE_JOB_STATE_UNDEF;
job->restart = false; job->restart = false;
job->num_mapped = 0;
job->num_launched = 0; job->num_launched = 0;
job->num_reported = 0; job->num_reported = 0;
job->num_terminated = 0; job->num_terminated = 0;

Просмотреть файл

@ -406,6 +406,8 @@ typedef struct {
orte_job_state_t state; orte_job_state_t state;
/* some procs in this job are being restarted */ /* some procs in this job are being restarted */
bool restart; bool restart;
/* number of procs mapped */
orte_vpid_t num_mapped;
/* number of procs launched */ /* number of procs launched */
orte_vpid_t num_launched; orte_vpid_t num_launched;
/* number of procs reporting contact info */ /* number of procs reporting contact info */

Просмотреть файл

@ -602,6 +602,10 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr, bool update)
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
goto cleanup_and_return; goto cleanup_and_return;
} }
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &proc->cpu_bitmap, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
goto cleanup_and_return;
}
#endif #endif
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &proc->state, 1, ORTE_PROC_STATE))) { if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &proc->state, 1, ORTE_PROC_STATE))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
@ -642,6 +646,7 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
opal_hwloc_level_t bind_level = OPAL_HWLOC_NODE_LEVEL, pbind, *lvptr; opal_hwloc_level_t bind_level = OPAL_HWLOC_NODE_LEVEL, pbind, *lvptr;
unsigned int bind_idx, pbidx, *uiptr; unsigned int bind_idx, pbidx, *uiptr;
opal_hwloc_locality_t locality; opal_hwloc_locality_t locality;
char *cpu_bitmap;
#endif #endif
orte_std_cntr_t n; orte_std_cntr_t n;
opal_buffer_t buf; opal_buffer_t buf;
@ -742,29 +747,19 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
goto cleanup; goto cleanup;
} }
n=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &cpu_bitmap, &n, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
#endif #endif
if (proc.jobid == ORTE_PROC_MY_NAME->jobid && if (proc.jobid == ORTE_PROC_MY_NAME->jobid &&
proc.vpid == ORTE_PROC_MY_NAME->vpid) { proc.vpid == ORTE_PROC_MY_NAME->vpid) {
/* set mine */ /* set mine */
orte_process_info.my_local_rank = local_rank; orte_process_info.my_local_rank = local_rank;
if (ORTE_SUCCESS != (rc = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_LOCALRANK,
&orte_process_info.my_local_rank, ORTE_LOCAL_RANK))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
orte_process_info.my_node_rank = node_rank; orte_process_info.my_node_rank = node_rank;
if (ORTE_SUCCESS != (rc = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_NODERANK,
&orte_process_info.my_node_rank, ORTE_NODE_RANK))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
#if OPAL_HAVE_HWLOC #if OPAL_HAVE_HWLOC
orte_process_info.bind_idx = bind_idx; orte_process_info.bind_idx = bind_idx;
if (ORTE_SUCCESS != (rc = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_BIND_INDEX,
&orte_process_info.bind_idx, OPAL_UINT))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
#endif #endif
} }
/* apps don't need the rest of the data in the buffer for this proc, /* apps don't need the rest of the data in the buffer for this proc,
@ -785,27 +780,7 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
goto cleanup; goto cleanup;
} }
/* we don't need to store data for ourself in the database /* store the values in the database */
* as we already did so
*/
if (proc.jobid == ORTE_PROC_MY_NAME->jobid &&
proc.vpid == ORTE_PROC_MY_NAME->vpid) {
continue;
}
/* store the data for this proc */
if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_DAEMON_VPID, &dmn.vpid, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* lookup and store the hostname for this proc */
if (ORTE_SUCCESS != (rc = orte_db.fetch_pointer(&dmn, ORTE_DB_HOSTNAME, (void**)&hostname, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_HOSTNAME, hostname, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_LOCALRANK, &local_rank, ORTE_LOCAL_RANK))) { if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_LOCALRANK, &local_rank, ORTE_LOCAL_RANK))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
goto cleanup; goto cleanup;
@ -819,7 +794,32 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
goto cleanup; goto cleanup;
} }
if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_BIND_BITMAP, cpu_bitmap, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
#endif #endif
/* we don't need to store the rest of the values
* for ourself in the database
* as we already did so during startup
*/
if (proc.jobid != ORTE_PROC_MY_NAME->jobid ||
proc.vpid != ORTE_PROC_MY_NAME->vpid) {
/* store the data for this proc */
if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_DAEMON_VPID, &dmn.vpid, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* lookup and store the hostname for this proc */
if (ORTE_SUCCESS != (rc = orte_db.fetch_pointer(&dmn, ORTE_DB_HOSTNAME, (void**)&hostname, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_HOSTNAME, hostname, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
}
} }
/* setup for next cycle */ /* setup for next cycle */
n = 1; n = 1;
@ -929,6 +929,7 @@ int orte_util_decode_daemon_pidmap(opal_byte_object_t *bo)
#if OPAL_HAVE_HWLOC #if OPAL_HAVE_HWLOC
opal_hwloc_level_t bind_level = OPAL_HWLOC_NODE_LEVEL; opal_hwloc_level_t bind_level = OPAL_HWLOC_NODE_LEVEL;
unsigned int bind_idx; unsigned int bind_idx;
char *cpu_bitmap;
#endif #endif
orte_std_cntr_t n; orte_std_cntr_t n;
opal_buffer_t buf; opal_buffer_t buf;
@ -1014,6 +1015,11 @@ int orte_util_decode_daemon_pidmap(opal_byte_object_t *bo)
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
goto cleanup; goto cleanup;
} }
n=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &cpu_bitmap, &n, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
#endif #endif
n=1; n=1;
if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, &state, &n, ORTE_PROC_STATE))) { if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, &state, &n, ORTE_PROC_STATE))) {
@ -1111,6 +1117,10 @@ int orte_util_decode_daemon_pidmap(opal_byte_object_t *bo)
proc->app_idx = app_idx; proc->app_idx = app_idx;
proc->restarts = restarts; proc->restarts = restarts;
proc->state = state; proc->state = state;
#if OPAL_HAVE_HWLOC
proc->bind_idx = bind_idx;
proc->cpu_bitmap = cpu_bitmap;
#endif
} }
/* setup for next cycle */ /* setup for next cycle */
n = 1; n = 1;