/* * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 Los Alamos National Security, LLC. * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ #include "orte_config.h" #include "orte/constants.h" #include "orte/types.h" #include #ifdef HAVE_UNISTD_H #include #endif /* HAVE_UNISTD_H */ #ifdef HAVE_STRING_H #include #endif /* HAVE_STRING_H */ #include "opal/mca/base/mca_base_param.h" #include "opal/mca/hwloc/base/base.h" #include "orte/util/show_help.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/rmaps/base/rmaps_private.h" #include "orte/mca/rmaps/base/base.h" #include "rmaps_ppr.h" static int ppr_mapper(orte_job_t *jdata); orte_rmaps_base_module_t orte_rmaps_ppr_module = { ppr_mapper }; #if OPAL_HAVE_HWLOC static void prune(orte_jobid_t jobid, orte_app_idx_t app_idx, orte_node_t *node, opal_hwloc_level_t *level, orte_vpid_t *nmapped); #endif static int ppr[OPAL_HWLOC_HWTHREAD_LEVEL+1]; static int ppr_mapper(orte_job_t *jdata) { int rc = ORTE_SUCCESS, j, n; mca_base_component_t *c=&mca_rmaps_ppr_component.base_version; orte_node_t *node; orte_proc_t *proc; orte_app_context_t *app; orte_vpid_t total_procs, nprocs_mapped; opal_hwloc_level_t level, start=OPAL_HWLOC_NODE_LEVEL; #if OPAL_HAVE_HWLOC hwloc_obj_t obj; hwloc_obj_type_t lowest; unsigned cache_level=0; unsigned int nobjs, i; #endif opal_list_t node_list; opal_list_item_t *item; orte_std_cntr_t num_slots; orte_app_idx_t idx; char **ppr_req, **ck; size_t len; bool pruning_reqd = false; bool initial_map=true; /* only handle initial launch of loadbalanced * or NPERxxx jobs - allow restarting of failed apps */ if (ORTE_JOB_CONTROL_RESTART & jdata->controls) { opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:ppr: job %s being restarted - ppr cannot map", ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } if (NULL != jdata->map->req_mapper && 0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) { /* a mapper has been specified, and it isn't me */ opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:ppr: job %s not using ppr mapper", ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } if (NULL == jdata->map->ppr || !(ORTE_MAPPING_PPR & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping))) { /* not for us */ opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:ppr: job %s not using ppr mapper", ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:ppr: mapping job %s with ppr %s", ORTE_JOBID_PRINT(jdata->jobid), jdata->map->ppr); /* flag that I did the mapping */ if (NULL != jdata->map->last_mapper) { free(jdata->map->last_mapper); } jdata->map->last_mapper = strdup(c->mca_component_name); /* initialize */ memset(ppr, 0, OPAL_HWLOC_HWTHREAD_LEVEL * sizeof(opal_hwloc_level_t)); /* parse option */ n=0; ppr_req = opal_argv_split(jdata->map->ppr, ','); for (j=0; NULL != ppr_req[j]; j++) { /* split on the colon */ ck = opal_argv_split(ppr_req[j], ':'); if (2 != opal_argv_count(ck)) { /* must provide a specification */ orte_show_help("help-orte-rmaps-ppr.txt", "invalid-ppr", true, jdata->map->ppr); opal_argv_free(ppr_req); opal_argv_free(ck); return ORTE_ERR_SILENT; } len = strlen(ck[1]); if (0 == strncasecmp(ck[1], "node", len)) { ppr[OPAL_HWLOC_NODE_LEVEL] = strtol(ck[0], NULL, 10); ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYNODE); start = OPAL_HWLOC_NODE_LEVEL; n++; #if OPAL_HAVE_HWLOC } else if (0 == strncasecmp(ck[1], "hwthread", len) || 0 == strncasecmp(ck[1], "thread", len)) { ppr[OPAL_HWLOC_HWTHREAD_LEVEL] = strtol(ck[0], NULL, 10); start = OPAL_HWLOC_HWTHREAD_LEVEL; ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYHWTHREAD); n++; } else if (0 == strncasecmp(ck[1], "core", len)) { ppr[OPAL_HWLOC_CORE_LEVEL] = strtol(ck[0], NULL, 10); if (start < OPAL_HWLOC_CORE_LEVEL) { start = OPAL_HWLOC_CORE_LEVEL; ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYCORE); } n++; } else if (0 == strncasecmp(ck[1], "socket", len) || 0 == strncasecmp(ck[1], "skt", len)) { ppr[OPAL_HWLOC_SOCKET_LEVEL] = strtol(ck[0], NULL, 10); if (start < OPAL_HWLOC_SOCKET_LEVEL) { start = OPAL_HWLOC_SOCKET_LEVEL; ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSOCKET); } n++; } else if (0 == strncasecmp(ck[1], "l1cache", len)) { ppr[OPAL_HWLOC_L1CACHE_LEVEL] = strtol(ck[0], NULL, 10); if (start < OPAL_HWLOC_L1CACHE_LEVEL) { start = OPAL_HWLOC_L1CACHE_LEVEL; cache_level = 1; ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL1CACHE); } n++; } else if (0 == strncasecmp(ck[1], "l2cache", len)) { ppr[OPAL_HWLOC_L2CACHE_LEVEL] = strtol(ck[0], NULL, 10); if (start < OPAL_HWLOC_L2CACHE_LEVEL) { start = OPAL_HWLOC_L2CACHE_LEVEL; cache_level = 2; ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL2CACHE); } n++; } else if (0 == strncasecmp(ck[1], "l3cache", len)) { ppr[OPAL_HWLOC_L3CACHE_LEVEL] = strtol(ck[0], NULL, 10); if (start < OPAL_HWLOC_L3CACHE_LEVEL) { start = OPAL_HWLOC_L3CACHE_LEVEL; cache_level = 3; ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL3CACHE); } n++; } else if (0 == strncasecmp(ck[1], "numa", len)) { ppr[OPAL_HWLOC_NUMA_LEVEL] = strtol(ck[0], NULL, 10); if (start < OPAL_HWLOC_NUMA_LEVEL) { start = OPAL_HWLOC_NUMA_LEVEL; ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYNUMA); } n++; #endif } else { /* unknown spec */ orte_show_help("help-orte-rmaps-ppr.txt", "unrecognized-ppr-option", true, ck[1], jdata->map->ppr); opal_argv_free(ppr_req); opal_argv_free(ck); return ORTE_ERR_SILENT; } opal_argv_free(ck); } opal_argv_free(ppr_req); /* if nothing was given, that's an error */ if (0 == n) { opal_output(0, "NOTHING GIVEN"); return ORTE_ERR_SILENT; } /* if more than one level was specified, then pruning will be reqd */ if (1 < n) { pruning_reqd = true; } opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:ppr: job %s assigned policy %s", ORTE_JOBID_PRINT(jdata->jobid), orte_rmaps_base_print_mapping(jdata->map->mapping)); /* convenience */ level = start; #if OPAL_HAVE_HWLOC lowest = opal_hwloc_levels[start]; #endif for (idx=0; idx < (orte_app_idx_t)jdata->apps->size; idx++) { if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx))) { continue; } /* if the number of total procs was given, set that * limit - otherwise, set to max so we simply fill * all the nodes with the pattern */ if (0 < app->num_procs) { total_procs = app->num_procs; } else { total_procs = ORTE_VPID_MAX; } /* get the available nodes */ OBJ_CONSTRUCT(&node_list, opal_list_t); if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app, jdata->map->mapping, initial_map))) { ORTE_ERROR_LOG(rc); goto error; } /* flag that all subsequent requests should not reset the node->mapped flag */ initial_map = false; /* if a bookmark exists from some prior mapping, set us to start there */ jdata->bookmark = orte_rmaps_base_get_starting_point(&node_list, jdata); /* cycle across the nodes */ nprocs_mapped = 0; for (item = opal_list_get_first(&node_list); item != opal_list_get_end(&node_list); item = opal_list_get_next(item)) { node = (orte_node_t*)item; #if OPAL_HAVE_HWLOC /* bozo check */ if (NULL == node->topology) { orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing", true, node->name); rc = ORTE_ERR_SILENT; goto error; } #endif /* add the node to the map, if needed */ if (!node->mapped) { if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) { ORTE_ERROR_LOG(rc); goto error; } node->mapped = true; OBJ_RETAIN(node); /* maintain accounting on object */ jdata->map->num_nodes++; } /* if we are mapping solely at the node level, just put * that many procs on this node */ if (OPAL_HWLOC_NODE_LEVEL == start) { #if OPAL_HAVE_HWLOC obj = hwloc_get_root_obj(node->topology); #endif for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) { if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, idx))) { rc = ORTE_ERR_OUT_OF_RESOURCE; goto error; } nprocs_mapped++; #if OPAL_HAVE_HWLOC proc->locale = obj; #endif } #if OPAL_HAVE_HWLOC } else { /* get the number of lowest resources on this node */ nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology, lowest, cache_level, OPAL_HWLOC_AVAILABLE); /* map the specified number of procs to each such resource on this node, * recording the locale of each proc so we know its cpuset */ for (i=0; i < nobjs; i++) { obj = opal_hwloc_base_get_obj_by_type(node->topology, lowest, cache_level, i, OPAL_HWLOC_AVAILABLE); for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) { if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, idx))) { rc = ORTE_ERR_OUT_OF_RESOURCE; goto error; } nprocs_mapped++; proc->locale = obj; } } if (pruning_reqd) { /* go up the ladder and prune the procs according to * the specification, adjusting the count of procs on the * node as we go */ level--; prune(jdata->jobid, idx, node, &level, &nprocs_mapped); } #endif } /* set the total slots used */ if ((int)node->num_procs <= node->slots) { node->slots_inuse = (int)node->num_procs; } else { node->slots_inuse = node->slots; } /* if no-oversubscribe was specified, check to see if * we have violated the total slot specification - regardless, * if slots_max was given, we are not allowed to violate it! */ if ((node->slots < (int)node->num_procs) || (0 < node->slots_max && node->slots_max < (int)node->num_procs)) { if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, node->num_procs, app->app); rc = ORTE_ERR_SILENT; goto error; } /* flag the node as oversubscribed so that sched-yield gets * properly set */ node->oversubscribed = true; } /* if we haven't mapped all the procs, continue on to the * next node */ if (total_procs == nprocs_mapped) { break; } } if (0 == app->num_procs) { app->num_procs = nprocs_mapped; } if (ORTE_VPID_MAX != total_procs && nprocs_mapped < total_procs) { /* couldn't map them all */ orte_show_help("help-orte-rmaps-ppr.txt", "ppr-too-many-procs", true, app->app, app->num_procs, jdata->map->ppr); rc = ORTE_ERR_SILENT; goto error; } /* compute vpids and add proc objects to the job */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) { ORTE_ERROR_LOG(rc); goto error; } /* track the total number of processes we mapped - must update * this AFTER we compute vpids so that computation is done * correctly */ jdata->num_procs += app->num_procs; while (NULL != (item = opal_list_remove_first(&node_list))) { OBJ_RELEASE(item); } OBJ_DESTRUCT(&node_list); } return ORTE_SUCCESS; error: while (NULL != (item = opal_list_remove_first(&node_list))) { OBJ_RELEASE(item); } OBJ_DESTRUCT(&node_list); return rc; } #if OPAL_HAVE_HWLOC static hwloc_obj_t find_split(hwloc_topology_t topo, hwloc_obj_t obj) { unsigned k; hwloc_obj_t nxt; if (1 < obj->arity) { return obj; } for (k=0; k < obj->arity; k++) { nxt = find_split(topo, obj->children[k]); if (NULL != nxt) { return nxt; } } return NULL; } /* recursively climb the topology, pruning procs beyond that allowed * by the given ppr */ static void prune(orte_jobid_t jobid, orte_app_idx_t app_idx, orte_node_t *node, opal_hwloc_level_t *level, orte_vpid_t *nmapped) { hwloc_obj_t obj, top; unsigned int i, nobjs; hwloc_obj_type_t lvl; unsigned cache_level = 0, k; int nprocs; hwloc_cpuset_t avail, cpus, childcpus; int n, limit, nmax, nunder, idx, idxmax = 0; orte_proc_t *proc, *pptr, *procmax; opal_hwloc_level_t ll; char dang[64]; opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:ppr: pruning level %d", *level); /* convenience */ ll = *level; /* convenience */ lvl = opal_hwloc_levels[ll]; limit = ppr[ll]; if (0 == limit) { /* no limit at this level, so move up if necessary */ if (0 == ll) { /* done */ return; } --(*level); prune(jobid, app_idx, node, level, nmapped); return; } /* handle the darn cache thing again */ if (OPAL_HWLOC_L3CACHE_LEVEL == ll) { cache_level = 3; } else if (OPAL_HWLOC_L2CACHE_LEVEL == ll) { cache_level = 2; } else if (OPAL_HWLOC_L1CACHE_LEVEL == ll) { cache_level = 1; } /* get the number of resources at this level on this node */ nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology, lvl, cache_level, OPAL_HWLOC_AVAILABLE); /* for each resource, compute the number of procs sitting * underneath it and check against the limit */ for (i=0; i < nobjs; i++) { obj = opal_hwloc_base_get_obj_by_type(node->topology, lvl, cache_level, i, OPAL_HWLOC_AVAILABLE); /* get the available cpuset */ avail = opal_hwloc_base_get_available_cpus(node->topology, obj); /* look at the intersection of this object's cpuset and that * of each proc in the job/app - if they intersect, then count this proc * against the limit */ nprocs = 0; for (n=0; n < node->procs->size; n++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) { continue; } if (proc->name.jobid != jobid || proc->app_idx != app_idx) { continue; } cpus = opal_hwloc_base_get_available_cpus(node->topology, proc->locale); if (hwloc_bitmap_intersects(avail, cpus)) { nprocs++; } } opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:ppr: found %d procs limit %d", nprocs, limit); /* check against the limit */ while (limit < nprocs) { /* need to remove procs - do this in a semi-intelligent * manner to provide a little load balancing by cycling * across the objects beneath this one, removing procs * in a round-robin fashion until the limit is satisfied * * NOTE: I'm sure someone more knowledgeable with hwloc * will come up with a more efficient way to do this, so * consider this is a starting point */ /* find the first level that has more than * one child beneath it - if all levels * have only one child, then return this * object */ top = find_split(node->topology, obj); hwloc_obj_type_snprintf(dang, 64, top, 1); opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:ppr: SPLIT AT LEVEL %s", dang); /* cycle across the children of this object */ nmax = 0; procmax = NULL; idx = 0; /* find the child with the most procs underneath it */ for (k=0; k < top->arity && limit < nprocs; k++) { /* get this object's available cpuset */ childcpus = opal_hwloc_base_get_available_cpus(node->topology, top->children[k]); nunder = 0; pptr = NULL; for (n=0; n < node->procs->size; n++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) { continue; } if (proc->name.jobid != jobid || proc->app_idx != app_idx) { continue; } cpus = opal_hwloc_base_get_available_cpus(node->topology, proc->locale); if (hwloc_bitmap_intersects(childcpus, cpus)) { nunder++; if (NULL == pptr) { /* save the location of the first proc under this object */ pptr = proc; idx = n; } } } if (nmax < nunder) { opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:ppr: PROCS UNDER CHILD %d %d MAX %d", k, nunder, nmax); nmax = nunder; procmax = pptr; idxmax = idx; } } if (NULL == procmax) { /* can't find anything to remove - error out */ goto error; } /* remove it */ opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:ppr: removing proc at posn %d", idxmax); opal_pointer_array_set_item(node->procs, idxmax, NULL); node->num_procs--; node->slots_inuse--; if (node->slots_inuse < 0) { node->slots_inuse = 0; } nprocs--; *nmapped -= 1; OBJ_RELEASE(procmax); } } /* finished with this level - move up if necessary */ if (0 == ll) { return; } --(*level); prune(jobid, app_idx, node, level, nmapped); return; error: opal_output(0, "INFINITE LOOP"); } #endif