diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c
index b5bc86cf84..becb83250c 100644
--- a/orte/mca/odls/base/odls_base_default_fns.c
+++ b/orte/mca/odls/base/odls_base_default_fns.c
@@ -608,7 +608,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
     char **slot_str=NULL;
     orte_jobid_t debugger;
     bool add_child;
-    
+
     OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
                          "%s odls:constructing child list",
                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@@ -1414,7 +1414,9 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
     int inm;
     opal_event_t *delay;
     int num_procs_alive;
-    
+    orte_nid_t *nid;
+    orte_node_t *node;
+
     /* protect operations involving the global list of children */
     OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
 
@@ -1453,7 +1455,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
     if (NULL == jobdat) {
         ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
         rc = ORTE_ERR_NOT_FOUND;
-        goto CLEANUP;
+        goto GETOUT;
     }
 
     /* do we have any local procs to launch? */
@@ -1466,6 +1468,29 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
     apps = jobdat->apps;
     num_apps = jobdat->num_apps;
 
+    /* see if the mapper thinks we are oversubscribed */
+    oversubscribed = false;
+    if (ORTE_PROC_IS_HNP) {
+        /* just fake it - we don't keep a local nidmap */
+        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0))) {
+            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+            rc = ORTE_ERR_NOT_FOUND;
+            goto CLEANUP;
+        }
+        if (node->oversubscribed) {
+            oversubscribed = true;
+        }
+    } else {
+        if (NULL == (nid = orte_util_lookup_nid(ORTE_PROC_MY_NAME))) {
+            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+            rc = ORTE_ERR_NOT_FOUND;
+            goto CLEANUP;
+        }
+        if (nid->oversubscribed) {
+            oversubscribed = true;
+        }
+    }
+
 #if OPAL_ENABLE_FT_CR == 1
     /*
      * Notify the local SnapC component regarding new job
@@ -1496,30 +1521,41 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
     orte_sstore.wait_all_deps();
 #endif
 
-    if (ORTE_SUCCESS != (rc = opal_paffinity_base_get_processor_info(&num_processors))) {
-        /* if we cannot find the number of local processors, we have no choice
-         * but to default to conservative settings
-         */
-        oversubscribed = true;
+    /* if the mapper says we are oversubscribed, then we trust it */
+    if (oversubscribed) {
+        OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
+                             "%s odls:launch mapper declares this node oversubscribed",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
     } else {
-        if (num_procs_alive > num_processors) {
-            /* if the #procs > #processors, declare us oversubscribed. This
-             * covers the case where the user didn't tell us anything about the
-             * number of available slots, so we defaulted to a value of 1
+        /* if the mapper thinks we are not oversubscribed, then we
+         * do a final smoke test by checking against the #processors. This
+         * is done solely in case the mapper had incorrect knowledge of
+         * the #local processors
+         */
+        if (ORTE_SUCCESS != (rc = opal_paffinity_base_get_processor_info(&num_processors))) {
+            /* if we cannot find the number of local processors, we have no choice
+             * but to default to conservative settings
             */
            oversubscribed = true;
        } else {
-            /* otherwise, declare us to not be oversubscribed so we can be aggressive */
-            oversubscribed = false;
+            if (num_procs_alive > num_processors) {
+                /* if the #procs > #processors, declare us oversubscribed. This
+                 * covers the case where the user didn't tell us anything about the
+                 * number of available slots, so we defaulted to a value of 1
+                 */
+                oversubscribed = true;
+            } else {
+                /* otherwise, declare us to not be oversubscribed so we can be aggressive */
+                oversubscribed = false;
+            }
        }
+        OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
+                             "%s odls:launch found %d processors for %d children and set oversubscribed to %s",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                             (ORTE_SUCCESS == rc) ? num_processors : -1, (int)opal_list_get_size(&orte_local_children),
+                             oversubscribed ? "true" : "false"));
    }
-    OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
-                         "%s odls:launch found %d processors for %d children and set oversubscribed to %s",
-                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                         (ORTE_SUCCESS == rc) ? num_processors: -1, (int)opal_list_get_size(&orte_local_children),
-                         oversubscribed ? "true" : "false"));
-    
 
     /* setup to report the proc state to the HNP */
     OBJ_CONSTRUCT(&alert, opal_buffer_t);
 
@@ -1975,6 +2011,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
         }
     }
 
+ GETOUT:
     opal_condition_signal(&orte_odls_globals.cond);
     OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
     return rc;
diff --git a/orte/mca/rmaps/base/rmaps_base_common_mappers.c b/orte/mca/rmaps/base/rmaps_base_common_mappers.c
index 92d2ba36c0..2c05f453e4 100644
--- a/orte/mca/rmaps/base/rmaps_base_common_mappers.c
+++ b/orte/mca/rmaps/base/rmaps_base_common_mappers.c
@@ -192,7 +192,8 @@ int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
             if (0 == node->slots_alloc) {
                 num_procs_to_assign = 1;
             } else {
-                num_possible_procs = node->slots_alloc / jdata->map->cpus_per_rank;
+                /* 'num_possible_procs' defines the number of ranks */
+                num_possible_procs = node->slots_alloc;
                 if (0 == num_possible_procs) {
                     num_procs_to_assign = 1;
                 } else {
@@ -200,7 +201,11 @@ int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
                 }
             }
         } else {
-            num_possible_procs = (node->slots_alloc - node->slots_inuse) / jdata->map->cpus_per_rank;
+            /* 'num_possible_procs' defines the number of ranks on the node. Each
+             * rank occupies one slot. Each slot may represent more than one
+             * cpu, depending on the cpus-per-rank setting
+             */
+            num_possible_procs = (node->slots_alloc - node->slots_inuse);
             if (0 == num_possible_procs) {
                 num_procs_to_assign = 1;
             } else {
diff --git a/orte/mca/rmaps/base/rmaps_base_support_fns.c b/orte/mca/rmaps/base/rmaps_base_support_fns.c
index 39e1f5eae5..5f04588777 100644
--- a/orte/mca/rmaps/base/rmaps_base_support_fns.c
+++ b/orte/mca/rmaps/base/rmaps_base_support_fns.c
@@ -364,6 +364,9 @@ PROCESS:
     /* retain the proc struct so that we correctly track its release */
     OBJ_RETAIN(proc);
     ++node->num_procs;
+
+    /* update the oversubscribed state of the node */
+    node->oversubscribed = oversubscribed;
 
     return ORTE_SUCCESS;
 }
@@ -423,7 +426,7 @@ int orte_rmaps_base_claim_slot(orte_job_t *jdata,
                          ORTE_JOBID_PRINT(jdata->jobid), current_node->name));
 
     /* Be sure to demarcate the slots for this proc as claimed from the node */
-    current_node->slots_inuse += cpus_per_rank;
+    current_node->slots_inuse += 1;
 
     /* see if this node is oversubscribed now */
     if (current_node->slots_inuse > current_node->slots) {
diff --git a/orte/mca/rmaps/rmaps_types.h b/orte/mca/rmaps/rmaps_types.h
index 7dd63e99b9..25fb3cb87c 100644
--- a/orte/mca/rmaps/rmaps_types.h
+++ b/orte/mca/rmaps/rmaps_types.h
@@ -46,6 +46,7 @@ struct orte_job_map_t {
     int npersocket;
     int16_t cpus_per_rank;
     int16_t stride;
+    /* are we allowed to oversubscribe the nodes in this job */
     bool oversubscribe;
     bool display_map;
     bool cpu_lists;
diff --git a/orte/runtime/data_type_support/orte_dt_print_fns.c b/orte/runtime/data_type_support/orte_dt_print_fns.c
index b3237ef94f..836ee55174 100644
--- a/orte/runtime/data_type_support/orte_dt_print_fns.c
+++ b/orte/runtime/data_type_support/orte_dt_print_fns.c
@@ -384,8 +384,9 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
     free(tmp);
     tmp = tmp2;
 
-    asprintf(&tmp2, "%s\n%s\tNum slots: %ld\tSlots in use: %ld", tmp, pfx2,
-             (long)src->slots, (long)src->slots_inuse);
+    asprintf(&tmp2, "%s\n%s\tNum slots: %ld\tSlots in use: %ld\tOversubscribed: %s", tmp, pfx2,
+             (long)src->slots, (long)src->slots_inuse,
+             (src->oversubscribed) ? "TRUE" : "FALSE");
     free(tmp);
     tmp = tmp2;
 
diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h
index 39507df64b..9335ff100d 100644
--- a/orte/runtime/orte_globals.h
+++ b/orte/runtime/orte_globals.h
@@ -516,6 +516,8 @@ typedef struct {
     char *name;
     /* vpid of this job family's daemon on this node */
     orte_vpid_t daemon;
+    /* whether or not this node is oversubscribed */
+    bool oversubscribed;
     /* list of interface attributes */
     opal_list_t attrs;
     /* list of system info */
diff --git a/orte/util/nidmap.c b/orte/util/nidmap.c
index bacdef9258..417d4efdfe 100644
--- a/orte/util/nidmap.c
+++ b/orte/util/nidmap.c
@@ -299,6 +299,7 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
     char *nodename;
     opal_buffer_t buf;
     char *ptr;
+    uint8_t *oversub=NULL;
 
     /* setup a buffer for tmp use */
     OBJ_CONSTRUCT(&buf, opal_buffer_t);
@@ -357,8 +358,9 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
      * match their node array index
      */
 
-    /* allocate space for the daemon vpids */
+    /* allocate space for the daemon vpids and oversubscribed flags */
     vpids = (orte_vpid_t*)malloc(num_nodes * sizeof(orte_vpid_t));
+    oversub = (uint8_t*)malloc(num_nodes * sizeof(uint8_t));
     for (i=0; i < orte_node_pool->size; i++) {
         if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
             continue;
@@ -369,12 +371,18 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
             continue;
         }
         vpids[i] = node->daemon->name.vpid;
+        oversub[i] = node->oversubscribed;
     }
     if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, vpids, num_nodes, ORTE_VPID))) {
         ORTE_ERROR_LOG(rc);
         return rc;
     }
     free(vpids);
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, oversub, num_nodes, OPAL_UINT8))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+    free(oversub);
 
     /* check if we are to send the profile file data */
     if (orte_send_profile) {
@@ -426,6 +434,7 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo)
     opal_buffer_t buf;
     opal_byte_object_t *boptr;
     int rc;
+    uint8_t *oversub;
 
     OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
                          "%s decode:nidmap decoding nodemap",
@@ -490,6 +499,15 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo)
         ORTE_ERROR_LOG(rc);
         return rc;
     }
+
+    /* unpack the oversubscribed flags */
+    oversub = (uint8_t*)malloc(num_nodes * sizeof(uint8_t));
+    n=num_nodes;
+    if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, oversub, &n, OPAL_UINT8))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+
     /* transfer the data to the nidmap, counting the number of
      * daemons in the system
      */
@@ -497,13 +515,19 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo)
     for (i=0; i < num_nodes; i++) {
         if (NULL != (ndptr = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i))) {
            ndptr->daemon = vpids[i];
+            if (0 == oversub[i]) {
+                ndptr->oversubscribed = false;
+            } else {
+                ndptr->oversubscribed = true;
+            }
             if (ORTE_VPID_INVALID != vpids[i]) {
                 ++num_daemons;
             }
         }
     }
     free(vpids);
-    
+    free(oversub);
+
     /* if we are a daemon or the HNP, update our num_procs */
     if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
         orte_process_info.num_procs = num_daemons;