diff --git a/orte/mca/db/db_types.h b/orte/mca/db/db_types.h
index d45735002c..aee66bc195 100644
--- a/orte/mca/db/db_types.h
+++ b/orte/mca/db/db_types.h
@@ -34,6 +34,7 @@ BEGIN_C_DECLS
 #define ORTE_DB_ARCH        "orte.arch"
 #define ORTE_DB_NPROCS      "orte.nprocs"
 #define ORTE_DB_RMLURI      "orte.rmluri"
+#define ORTE_DB_BIND_BITMAP "orte.bind.bitmap"
 
 END_C_DECLS
 
diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c
index 08cda9a6b4..b2b09fdc00 100644
--- a/orte/mca/odls/base/odls_base_default_fns.c
+++ b/orte/mca/odls/base/odls_base_default_fns.c
@@ -98,7 +98,6 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
 {
     int rc;
     orte_job_t *jdata=NULL;
-    orte_proc_t *proc;
     orte_job_map_t *map=NULL;
     opal_buffer_t *wireup;
     opal_byte_object_t bo, *boptr;
@@ -259,18 +258,6 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
         return rc;
     }
 
-    /* pack the binding bitmaps */
-    for (j=0; j < jdata->procs->size; j++) {
-        if (NULL == (proc = (orte_proc_t *) opal_pointer_array_get_item(jdata->procs, j))) {
-            continue;
-        }
-        /* okay to pack NULL strings */
-        if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &proc->cpu_bitmap, 1, OPAL_STRING))) {
-            ORTE_ERROR_LOG(rc);
-            return rc;
-        }
-    }
-
     /* pack the collective ids */
     if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->peer_modex, 1, ORTE_GRPCOMM_COLL_ID_T))) {
         ORTE_ERROR_LOG(rc);
@@ -324,7 +311,8 @@ static int check_local_proc(orte_job_t *jdata, orte_proc_t *pptr)
     if (!pptr->local_proc) {
         /* not on the local list */
         OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
-                             "adding proc %s to my local list",
+                             "%s adding proc %s to my local list",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                              ORTE_NAME_PRINT(&pptr->name)));
         /* keep tabs of the number of local procs */
         jdata->num_local_procs++;
@@ -351,7 +339,6 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
     orte_vpid_t j;
     orte_std_cntr_t cnt;
     orte_job_t *jdata=NULL;
-    orte_proc_t *proc;
     opal_byte_object_t *bo;
     int8_t flag;
     int32_t n;
@@ -539,20 +526,6 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
         goto REPORT_ERROR;
     }
 
-    /* unpack the binding bitmaps */
-    for (j=0; j < jdata->num_procs; j++) {
-        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
-            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
-            rc = ORTE_ERR_NOT_FOUND;
-            goto REPORT_ERROR;
-        }
-        cnt = 1;
-        if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &proc->cpu_bitmap, &cnt, OPAL_STRING))) {
-            ORTE_ERROR_LOG(rc);
-            goto REPORT_ERROR;
-        }
-    }
-
     /* unpack the collective ids */
     cnt=1;
     if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->peer_modex, &cnt, ORTE_GRPCOMM_COLL_ID_T))) {
diff --git a/orte/mca/rmaps/base/rmaps_base_support_fns.c b/orte/mca/rmaps/base/rmaps_base_support_fns.c
index d9dccceeda..c1828ec3e6 100644
--- a/orte/mca/rmaps/base/rmaps_base_support_fns.c
+++ b/orte/mca/rmaps/base/rmaps_base_support_fns.c
@@ -10,7 +10,7 @@
  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
  * Copyright (c) 2011      Cisco Systems, Inc.  All rights reserved.
- * Copyright (c) 2011      Los Alamos National Security, LLC.
+ * Copyright (c) 2011-2012 Los Alamos National Security, LLC.
  *                         All rights reserved.
  * $COPYRIGHT$
  *
diff --git a/orte/mca/rmaps/rmaps_types.h b/orte/mca/rmaps/rmaps_types.h
index cf5c436b32..c0e45ebb57 100644
--- a/orte/mca/rmaps/rmaps_types.h
+++ b/orte/mca/rmaps/rmaps_types.h
@@ -8,7 +8,9 @@
  *                         University of Stuttgart.  All rights reserved.
  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
- * Copyright (c) 2011      Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2011      Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2012      Los Alamos National Security, LLC.
+ *                         All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
diff --git a/orte/mca/rmaps/staged/rmaps_staged.c b/orte/mca/rmaps/staged/rmaps_staged.c
index ed3880ef16..4182fd1e38 100644
--- a/orte/mca/rmaps/staged/rmaps_staged.c
+++ b/orte/mca/rmaps/staged/rmaps_staged.c
@@ -48,6 +48,7 @@ static int staged_mapper(orte_job_t *jdata)
     orte_proc_t *proc;
     orte_node_t *node;
     bool work_to_do = false;
+    opal_list_item_t *item;
 
     /* only use this mapper if it was specified */
     if (NULL == jdata->map->req_mapper ||
@@ -100,6 +101,9 @@ static int staged_mapper(orte_job_t *jdata)
         }
         /* if nothing is available, then move on */
         if (0 == num_slots || 0 == opal_list_get_size(&node_list)) {
+            opal_output_verbose(5, orte_rmaps_base.rmaps_output,
+                                "%s mca:rmaps:staged: no nodes available for this app",
+                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
             OBJ_DESTRUCT(&node_list);
             continue;
         }
@@ -110,20 +114,46 @@ static int staged_mapper(orte_job_t *jdata)
             }
             if (ORTE_PROC_STATE_UNDEF != proc->state) {
                 /* this proc has already been mapped or executed */
+                opal_output_verbose(5, orte_rmaps_base.rmaps_output,
+                                    "%s mca:rmaps:staged: proc %s has already been mapped",
+                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                    ORTE_NAME_PRINT(&proc->name));
                 continue;
             }
             /* flag that there is at least one proc still to
              * be executed
              */
             work_to_do = true;
+            /* track number mapped */
+            jdata->num_mapped++;
             /* map this proc to the first available slot */
             node = (orte_node_t*)opal_list_get_first(&node_list);
             OBJ_RETAIN(node);  /* maintain accounting on object */
+            opal_output_verbose(5, orte_rmaps_base.rmaps_output,
+                                "%s mca:rmaps:staged: assigning proc %s to node %s",
+                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                ORTE_NAME_PRINT(&proc->name), node->name);
             proc->node = node;
             proc->nodename = node->name;
+            /* the local rank is the number of procs
+             * on this node from this job - we don't
+             * directly track this number, so it must
+             * be found by looping across the node->procs
+             * array and counting it each time.  For now,
+             * since we don't use this value in this mode
+             * of operation, just set it to something arbitrary
+             */
+            proc->local_rank = node->num_procs;
+            /* the node rank is simply the number of procs
+             * on the node at this time
+             */
+            proc->node_rank = node->num_procs;
+            /* track number of procs on node and number of slots used */
             node->num_procs++;
             node->slots_inuse++;
             if (node->slots_inuse == node->slots_alloc) {
+                opal_output(0, "%s slots on node %s are fully used",
+                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name);
                 opal_list_remove_item(&node_list, &node->super);
                 OBJ_RELEASE(node);
             }
@@ -154,6 +184,11 @@ static int staged_mapper(orte_job_t *jdata)
                 break;
             }
         }
+        /* clear the list */
+        while (NULL != (item = opal_list_remove_first(&node_list))) {
+            OBJ_RELEASE(item);
+        }
+        OBJ_DESTRUCT(&node_list);
     }
 
     /* if there isn't at least one proc that can be launched,
diff --git a/orte/mca/state/staged/state_staged.c b/orte/mca/state/staged/state_staged.c
index 89300409dd..c1528911a2 100644
--- a/orte/mca/state/staged/state_staged.c
+++ b/orte/mca/state/staged/state_staged.c
@@ -253,6 +253,7 @@ static void setup_job_complete(int fd, short args, void *cbdata)
         jdata->map = OBJ_NEW(orte_job_map_t);
         jdata->map->req_mapper = strdup("staged");
         ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_STAGED);
+        ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
         jdata->map->display_map = orte_rmaps_base.display_map;
     }
     orte_plm_base_setup_job_complete(0, 0, (void*)caddy);
@@ -347,12 +348,13 @@ static void track_procs(int fd, short args, void *cbdata)
         if (jdata->num_terminated == jdata->num_procs) {
             /* no other procs are waiting, so end this job */
             ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
-        } else {
-            /* schedule the job for re-mapping so that any procs
+        } else if (jdata->num_mapped < jdata->num_procs) {
+            /* schedule the job for re-mapping so that procs
              * waiting for resources can execute
              */
             ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);
         }
+        /* otherwise, do nothing until more procs terminate */
         OBJ_RELEASE(caddy);
         return;
     }
diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c
index 5b139addbf..8f108541de 100644
--- a/orte/runtime/orte_globals.c
+++ b/orte/runtime/orte_globals.c
@@ -710,6 +710,7 @@ static void orte_job_construct(orte_job_t* job)
     job->state = ORTE_JOB_STATE_UNDEF;
     job->restart = false;
 
+    job->num_mapped = 0;
     job->num_launched = 0;
     job->num_reported = 0;
     job->num_terminated = 0;
diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h
index 95639d09ec..d3fca7f2b3 100644
--- a/orte/runtime/orte_globals.h
+++ b/orte/runtime/orte_globals.h
@@ -406,6 +406,8 @@ typedef struct {
     orte_job_state_t state;
     /* some procs in this job are being restarted */
     bool restart;
+    /* number of procs mapped */
+    orte_vpid_t num_mapped;
     /* number of procs launched */
     orte_vpid_t num_launched;
     /* number of procs reporting contact info */
diff --git a/orte/util/nidmap.c b/orte/util/nidmap.c
index 01c8f41e7f..7ffa33a522 100644
--- a/orte/util/nidmap.c
+++ b/orte/util/nidmap.c
@@ -602,6 +602,10 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr, bool update)
             ORTE_ERROR_LOG(rc);
             goto cleanup_and_return;
         }
+        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &proc->cpu_bitmap, 1, OPAL_STRING))) {
+            ORTE_ERROR_LOG(rc);
+            goto cleanup_and_return;
+        }
 #endif
         if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &proc->state, 1, ORTE_PROC_STATE))) {
             ORTE_ERROR_LOG(rc);
@@ -642,6 +646,7 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
     opal_hwloc_level_t bind_level = OPAL_HWLOC_NODE_LEVEL, pbind, *lvptr;
     unsigned int bind_idx, pbidx, *uiptr;
     opal_hwloc_locality_t locality;
+    char *cpu_bitmap;
 #endif
     orte_std_cntr_t n;
     opal_buffer_t buf;
@@ -742,29 +747,19 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
             ORTE_ERROR_LOG(rc);
             goto cleanup;
         }
+        n=1;
+        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &cpu_bitmap, &n, OPAL_STRING))) {
+            ORTE_ERROR_LOG(rc);
+            goto cleanup;
+        }
 #endif
         if (proc.jobid == ORTE_PROC_MY_NAME->jobid &&
             proc.vpid == ORTE_PROC_MY_NAME->vpid) {
             /* set mine */
             orte_process_info.my_local_rank = local_rank;
-            if (ORTE_SUCCESS != (rc = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_LOCALRANK,
-                                                    &orte_process_info.my_local_rank, ORTE_LOCAL_RANK))) {
-                ORTE_ERROR_LOG(rc);
-                goto cleanup;
-            }
             orte_process_info.my_node_rank = node_rank;
-            if (ORTE_SUCCESS != (rc = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_NODERANK,
-                                                    &orte_process_info.my_node_rank, ORTE_NODE_RANK))) {
-                ORTE_ERROR_LOG(rc);
-                goto cleanup;
-            }
 #if OPAL_HAVE_HWLOC
             orte_process_info.bind_idx = bind_idx;
-            if (ORTE_SUCCESS != (rc = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_BIND_INDEX,
-                                                    &orte_process_info.bind_idx, OPAL_UINT))) {
-                ORTE_ERROR_LOG(rc);
-                goto cleanup;
-            }
 #endif
         }
         /* apps don't need the rest of the data in the buffer for this proc,
@@ -785,27 +780,7 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
             ORTE_ERROR_LOG(rc);
             goto cleanup;
         }
-        /* we don't need to store data for ourself in the database
-         * as we already did so
-         */
-        if (proc.jobid == ORTE_PROC_MY_NAME->jobid &&
-            proc.vpid == ORTE_PROC_MY_NAME->vpid) {
-            continue;
-        }
-        /* store the data for this proc */
-        if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_DAEMON_VPID, &dmn.vpid, ORTE_VPID))) {
-            ORTE_ERROR_LOG(rc);
-            goto cleanup;
-        }
-        /* lookup and store the hostname for this proc */
-        if (ORTE_SUCCESS != (rc = orte_db.fetch_pointer(&dmn, ORTE_DB_HOSTNAME, (void**)&hostname, OPAL_STRING))) {
-            ORTE_ERROR_LOG(rc);
-            goto cleanup;
-        }
-        if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_HOSTNAME, hostname, OPAL_STRING))) {
-            ORTE_ERROR_LOG(rc);
-            goto cleanup;
-        }
+        /* store the values in the database */
         if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_LOCALRANK, &local_rank, ORTE_LOCAL_RANK))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
@@ -819,7 +794,32 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
             ORTE_ERROR_LOG(rc);
             goto cleanup;
         }
+        if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_BIND_BITMAP, cpu_bitmap, OPAL_STRING))) {
+            ORTE_ERROR_LOG(rc);
+            goto cleanup;
+        }
 #endif
+        /* we don't need to store the rest of the values
+         * for ourself in the database
+         * as we already did so during startup
+         */
+        if (proc.jobid != ORTE_PROC_MY_NAME->jobid ||
+            proc.vpid != ORTE_PROC_MY_NAME->vpid) {
+            /* store the data for this proc */
+            if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_DAEMON_VPID, &dmn.vpid, ORTE_VPID))) {
+                ORTE_ERROR_LOG(rc);
+                goto cleanup;
+            }
+            /* lookup and store the hostname for this proc */
+            if (ORTE_SUCCESS != (rc = orte_db.fetch_pointer(&dmn, ORTE_DB_HOSTNAME, (void**)&hostname, OPAL_STRING))) {
+                ORTE_ERROR_LOG(rc);
+                goto cleanup;
+            }
+            if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_HOSTNAME, hostname, OPAL_STRING))) {
+                ORTE_ERROR_LOG(rc);
+                goto cleanup;
+            }
+        }
     }
     /* setup for next cycle */
     n = 1;
@@ -929,6 +929,7 @@ int orte_util_decode_daemon_pidmap(opal_byte_object_t *bo)
 #if OPAL_HAVE_HWLOC
     opal_hwloc_level_t bind_level = OPAL_HWLOC_NODE_LEVEL;
     unsigned int bind_idx;
+    char *cpu_bitmap;
 #endif
     orte_std_cntr_t n;
     opal_buffer_t buf;
@@ -1014,6 +1015,11 @@ int orte_util_decode_daemon_pidmap(opal_byte_object_t *bo)
             ORTE_ERROR_LOG(rc);
             goto cleanup;
         }
+        n=1;
+        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &cpu_bitmap, &n, OPAL_STRING))) {
+            ORTE_ERROR_LOG(rc);
+            goto cleanup;
+        }
 #endif
         n=1;
         if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, &state, &n, ORTE_PROC_STATE))) {
@@ -1111,6 +1117,10 @@ int orte_util_decode_daemon_pidmap(opal_byte_object_t *bo)
         proc->app_idx = app_idx;
         proc->restarts = restarts;
         proc->state = state;
+#if OPAL_HAVE_HWLOC
+        proc->bind_idx = bind_idx;
+        proc->cpu_bitmap = cpu_bitmap;
+#endif
     }
     /* setup for next cycle */
     n = 1;
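
With this change the binding bitmap is no longer carried in the odls add_procs launch message: orte_util_encode_pidmap() now packs proc->cpu_bitmap, orte_util_decode_pidmap() unpacks it, and each process stores it under the new ORTE_DB_BIND_BITMAP key. The sketch below is a hedged illustration of how a caller could read a peer's bitmap back out of the database once the pidmap has been decoded; it mirrors the orte_db.fetch_pointer() usage already present in this diff for ORTE_DB_HOSTNAME. The helper name lookup_peer_binding() and the choice of output stream are illustrative assumptions, not code from this change set.

/* Hypothetical usage sketch (not part of this diff): fetch the cpu
 * bitmap string that orte_util_decode_pidmap() stored for a peer
 * under ORTE_DB_BIND_BITMAP.  The helper name is illustrative only.
 */
#include "orte_config.h"

#include "opal/util/output.h"
#include "opal/dss/dss_types.h"

#include "orte/constants.h"
#include "orte/types.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/db/db.h"
#include "orte/mca/db/db_types.h"

static int lookup_peer_binding(orte_process_name_t *peer)
{
    int rc;
    char *cpu_bitmap = NULL;

    /* the db component retains ownership of the string, so fetch a pointer to it */
    if (ORTE_SUCCESS != (rc = orte_db.fetch_pointer(peer, ORTE_DB_BIND_BITMAP,
                                                    (void**)&cpu_bitmap, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    if (NULL == cpu_bitmap) {
        /* defensive check: an unbound proc may have had a NULL bitmap packed */
        opal_output(0, "%s peer %s is unbound",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    ORTE_NAME_PRINT(peer));
    } else {
        opal_output(0, "%s peer %s bound to cpus %s",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    ORTE_NAME_PRINT(peer), cpu_bitmap);
    }
    return ORTE_SUCCESS;
}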