diff --git a/ompi/mca/rte/orte/rte_orte.h b/ompi/mca/rte/orte/rte_orte.h
index 7a9aba9c9c..4a3bd78c73 100644
--- a/ompi/mca/rte/orte/rte_orte.h
+++ b/ompi/mca/rte/orte/rte_orte.h
@@ -56,6 +56,7 @@ typedef orte_ns_cmp_bitmask_t ompi_rte_cmp_bitmask_t;
 #define OMPI_PROCESS_NAME_NTOH ORTE_PROCESS_NAME_NTOH
 #define OMPI_RTE_NODE_ID ORTE_DB_DAEMON_VPID
 #define OMPI_RTE_MY_NODEID ORTE_PROC_MY_DAEMON->vpid
+#define OMPI_RTE_HOST_ID ORTE_DB_HOSTID
 
 /* Collective objects and operations */
 #define ompi_rte_collective_t orte_grpcomm_collective_t
diff --git a/ompi/proc/proc.c b/ompi/proc/proc.c
index d17eb32abc..e156c6c3e2 100644
--- a/ompi/proc/proc.c
+++ b/ompi/proc/proc.c
@@ -148,9 +148,33 @@ static int ompi_proc_set_locality(ompi_proc_t *proc)
                                              (void**)&vptr, OPAL_UINT32))) {
         return ret;
     }
-    /* if we are on different nodes, then we are non-local */
+    /* if we are on different nodes, then we are probably non-local */
     if (vpid != OMPI_RTE_MY_NODEID) {
+#ifdef OMPI_RTE_HOST_ID
+        /* see if coprocessors were detected - if the hostid isn't
+         * present, then no coprocessors were detected and the peer
+         * is simply non-local. Note that the fetch reuses vpid, so
+         * on success vpid holds the peer's hostid, not its node id
+         */
+        vptr = &vpid;
+        if (OMPI_SUCCESS == opal_db.fetch((opal_identifier_t*)&proc->proc_name, OMPI_RTE_HOST_ID,
+                                          (void**)&vptr, OPAL_UINT32)) {
+            /* if this matches my host id, then we are on the same host,
+             * but not on the same board
+             */
+            if (vpid == ompi_process_info.my_hostid) {
+                locality = OPAL_PROC_ON_HOST;
+            } else {
+                locality = OPAL_PROC_NON_LOCAL;
+            }
+        } else {
+            /* no hostid was published for this peer - always assign
+             * locality here so both build configurations agree
+             */
+            locality = OPAL_PROC_NON_LOCAL;
+        }
+#else
         locality = OPAL_PROC_NON_LOCAL;
+#endif
     } else {
 #if OPAL_HAVE_HWLOC
         {
diff --git a/opal/mca/hwloc/base/Makefile.am b/opal/mca/hwloc/base/Makefile.am
index 9f4332cfc5..38db09052a 100644
--- a/opal/mca/hwloc/base/Makefile.am
+++ b/opal/mca/hwloc/base/Makefile.am
@@ -13,8 +13,7 @@ headers += \
         base/base.h
 
 libmca_hwloc_la_SOURCES += \
-        base/hwloc_base_close.c \
-        base/hwloc_base_open.c
+        base/hwloc_base_frame.c
 
 if OPAL_HAVE_HWLOC
 libmca_hwloc_la_SOURCES += \
diff --git
a/opal/mca/hwloc/base/base.h b/opal/mca/hwloc/base/base.h index 080a68bd18..553d67aacc 100644 --- a/opal/mca/hwloc/base/base.h +++ b/opal/mca/hwloc/base/base.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -60,6 +61,7 @@ OPAL_DECLSPEC char* opal_hwloc_base_print_locality(opal_hwloc_locality_t localit OPAL_DECLSPEC extern char *opal_hwloc_base_slot_list; OPAL_DECLSPEC extern char *opal_hwloc_base_cpu_set; OPAL_DECLSPEC extern hwloc_cpuset_t opal_hwloc_base_given_cpus; +OPAL_DECLSPEC extern char *opal_hwloc_base_topo_file; /* convenience macro for debugging */ #define OPAL_HWLOC_SHOW_BINDING(n, v) \ @@ -193,6 +195,10 @@ OPAL_DECLSPEC int opal_hwloc_base_slot_list_parse(const char *slot_str, hwloc_topology_t topo, hwloc_cpuset_t cpumask); +OPAL_DECLSPEC char* opal_hwloc_base_find_coprocessors(hwloc_topology_t topo); +OPAL_DECLSPEC char* opal_hwloc_base_check_on_coprocessor(void); + + /** * Report a bind failure using the normal mechanisms if a component * fails to bind memory -- according to the value of the diff --git a/opal/mca/hwloc/base/hwloc_base_close.c b/opal/mca/hwloc/base/hwloc_base_close.c deleted file mode 100644 index 5a27253733..0000000000 --- a/opal/mca/hwloc/base/hwloc_base_close.c +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "opal_config.h" - -#include "opal/constants.h" -#include "opal/mca/mca.h" -#include "opal/mca/base/base.h" -#include "opal/mca/hwloc/hwloc.h" -#include "opal/mca/hwloc/base/base.h" -#include "opal/util/output.h" - -int opal_hwloc_base_close(void); - -int opal_hwloc_base_close(void) -{ - if (!opal_hwloc_base_inited) { - return OPAL_SUCCESS; - } - -#if OPAL_HAVE_HWLOC - { - int ret; - - /* no need to close the component as it was statically opened */ - - /* for support of tools such as ompi_info */ - ret = mca_base_framework_components_close (&opal_hwloc_base_framework, NULL); - if (OPAL_SUCCESS != ret) { - return ret; - } - - /* free memory */ - if (NULL != opal_hwloc_my_cpuset) { - hwloc_bitmap_free(opal_hwloc_my_cpuset); - opal_hwloc_my_cpuset = NULL; - } - } -#endif - - /* All done */ - opal_hwloc_base_inited = false; - return OPAL_SUCCESS; -} diff --git a/opal/mca/hwloc/base/hwloc_base_open.c b/opal/mca/hwloc/base/hwloc_base_frame.c similarity index 94% rename from opal/mca/hwloc/base/hwloc_base_open.c rename to opal/mca/hwloc/base/hwloc_base_frame.c index cf765a2b05..0b776ba451 100644 --- a/opal/mca/hwloc/base/hwloc_base_open.c +++ b/opal/mca/hwloc/base/hwloc_base_frame.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2011-2013 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -56,6 +57,7 @@ hwloc_obj_type_t opal_hwloc_levels[] = { HWLOC_OBJ_PU }; bool opal_hwloc_use_hwthreads_as_cpus = false; +char *opal_hwloc_base_topo_file = NULL; #endif #if OPAL_HAVE_HWLOC @@ -75,8 +77,7 @@ static mca_base_var_enum_value_t hwloc_failure_action[] = { static int opal_hwloc_base_register(mca_base_register_flag_t flags); static int opal_hwloc_base_open(mca_base_open_flag_t flags); -/* defined in hwloc_base_close.c */ -int opal_hwloc_base_close(void); +static int opal_hwloc_base_close(void); MCA_BASE_FRAMEWORK_DECLARE(opal, hwloc, NULL, opal_hwloc_base_register, opal_hwloc_base_open, opal_hwloc_base_close, mca_hwloc_base_static_components, 0); @@ -162,6 +163,12 @@ static int opal_hwloc_base_register(mca_base_register_flag_t flags) MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &opal_hwloc_use_hwthreads_as_cpus); + opal_hwloc_base_topo_file = NULL; + (void) mca_base_var_register("opal", "hwloc", "base", "topo_file", + "Read local topology from file instead of directly sensing it", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &opal_hwloc_base_topo_file); + #endif /* register parameters */ return OPAL_SUCCESS; @@ -299,6 +306,37 @@ static int opal_hwloc_base_open(mca_base_open_flag_t flags) return OPAL_SUCCESS; } +static int opal_hwloc_base_close(void) +{ + if (!opal_hwloc_base_inited) { + return OPAL_SUCCESS; + } + +#if OPAL_HAVE_HWLOC + { + int ret; + + /* no need to close the component as it was statically opened */ + + /* for support of tools such as ompi_info */ + ret = mca_base_framework_components_close (&opal_hwloc_base_framework, NULL); + if (OPAL_SUCCESS != ret) { + return ret; + } + + /* free memory */ + if (NULL != opal_hwloc_my_cpuset) { + hwloc_bitmap_free(opal_hwloc_my_cpuset); + opal_hwloc_my_cpuset = NULL; + } + } +#endif + + /* All done */ + opal_hwloc_base_inited = false; + return OPAL_SUCCESS; +} + static bool 
fns_init=false; static opal_tsd_key_t print_tsd_key; char* opal_hwloc_print_null = "NULL"; diff --git a/opal/mca/hwloc/base/hwloc_base_util.c b/opal/mca/hwloc/base/hwloc_base_util.c index 17c33d3eb7..86c5f4a316 100644 --- a/opal/mca/hwloc/base/hwloc_base_util.c +++ b/opal/mca/hwloc/base/hwloc_base_util.c @@ -12,6 +12,7 @@ * Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012-2013 Los Alamos National Security, LLC. * All rights reserved. + * Copyright (c) 2013 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -33,6 +34,7 @@ #include "opal/constants.h" #include "opal/util/argv.h" #include "opal/util/output.h" +#include "opal/util/os_dirpath.h" #include "opal/util/show_help.h" #include "opal/threads/tsd.h" @@ -211,18 +213,25 @@ int opal_hwloc_base_get_topology(void) OPAL_OUTPUT_VERBOSE((5, opal_hwloc_base_framework.framework_output, "hwloc:base:get_topology")); - if (0 != hwloc_topology_init(&opal_hwloc_topology) || - 0 != hwloc_topology_set_flags(opal_hwloc_topology, - (HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM | - HWLOC_TOPOLOGY_FLAG_IO_DEVICES)) || - 0 != hwloc_topology_load(opal_hwloc_topology)) { - return OPAL_ERR_NOT_SUPPORTED; - } + if (NULL == opal_hwloc_base_topo_file) { + if (0 != hwloc_topology_init(&opal_hwloc_topology) || + 0 != hwloc_topology_set_flags(opal_hwloc_topology, + (HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM | + HWLOC_TOPOLOGY_FLAG_IO_DEVICES)) || + 0 != hwloc_topology_load(opal_hwloc_topology)) { + return OPAL_ERR_NOT_SUPPORTED; + } - /* filter the cpus thru any default cpu set */ - rc = opal_hwloc_base_filter_cpus(opal_hwloc_topology); - if (OPAL_SUCCESS != rc) { - return rc; + /* filter the cpus thru any default cpu set */ + rc = opal_hwloc_base_filter_cpus(opal_hwloc_topology); + if (OPAL_SUCCESS != rc) { + return rc; + } + } else { + rc = opal_hwloc_base_set_topology(opal_hwloc_base_topo_file); + if (OPAL_SUCCESS != rc) { + return rc; + } } /* fill opal_cache_line_size global with the 
smallest L1 cache @@ -1309,7 +1318,7 @@ opal_hwloc_locality_t opal_hwloc_base_get_relative_locality(hwloc_topology_t top * NOTE: we may alter that latter part as hwloc's ability to * sense multi-cu, multi-cluster systems grows */ - locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE | OPAL_PROC_ON_BOARD; + locality = OPAL_PROC_ON_NODE; /* if either cpuset is NULL, then that isn't bound */ if (NULL == cpuset1 || NULL == cpuset2) { @@ -1357,25 +1366,25 @@ opal_hwloc_locality_t opal_hwloc_base_get_relative_locality(hwloc_topology_t top shared = true; switch(obj->type) { case HWLOC_OBJ_NODE: - locality |= OPAL_PROC_ON_NUMA; + locality = OPAL_PROC_ON_NUMA; break; case HWLOC_OBJ_SOCKET: - locality |= OPAL_PROC_ON_SOCKET; + locality = OPAL_PROC_ON_SOCKET; break; case HWLOC_OBJ_CACHE: if (3 == obj->attr->cache.depth) { - locality |= OPAL_PROC_ON_L3CACHE; + locality = OPAL_PROC_ON_L3CACHE; } else if (2 == obj->attr->cache.depth) { - locality |= OPAL_PROC_ON_L2CACHE; + locality = OPAL_PROC_ON_L2CACHE; } else { - locality |= OPAL_PROC_ON_L1CACHE; + locality = OPAL_PROC_ON_L1CACHE; } break; case HWLOC_OBJ_CORE: - locality |= OPAL_PROC_ON_CORE; + locality = OPAL_PROC_ON_CORE; break; case HWLOC_OBJ_PU: - locality |= OPAL_PROC_ON_HWTHREAD; + locality = OPAL_PROC_ON_HWTHREAD; break; default: /* just ignore it */ @@ -1404,6 +1413,110 @@ opal_hwloc_locality_t opal_hwloc_base_get_relative_locality(hwloc_topology_t top return locality; } +/* searches the given topology for coprocessor objects and returns + * their serial numbers as a comma-delimited string, or NULL + * if no coprocessors are found + */ +char* opal_hwloc_base_find_coprocessors(hwloc_topology_t topo) +{ + hwloc_obj_t osdev; + unsigned i; + char **cps = NULL; + char *cpstring = NULL; + int depth; + + /* coprocessors are recorded under OS_DEVICEs, so first + * see if we have any of those + */ + if (HWLOC_TYPE_DEPTH_UNKNOWN == (depth = hwloc_get_type_depth(topo, HWLOC_OBJ_OS_DEVICE))) { + return NULL; + } 
+    /* check the device objects for coprocessors */
+    osdev = hwloc_get_obj_by_depth(topo, depth, 0);
+    while (NULL != osdev) {
+        if (HWLOC_OBJ_OSDEV_COPROC == osdev->attr->osdev.type) {
+            /* got one! find and save its serial number */
+            for (i=0; i < osdev->infos_count; i++) {
+                if (0 == strncmp(osdev->infos[i].name, "MICSerialNumber", strlen("MICSerialNumber"))) {
+                    opal_argv_append_nosize(&cps, osdev->infos[i].value);
+                }
+            }
+        }
+        osdev = osdev->next_cousin;
+    }
+    if (NULL != cps) {
+        cpstring = opal_argv_join(cps, ',');
+        opal_argv_free(cps);
+    }
+    return cpstring;
+}
+
+#define OPAL_HWLOC_MAX_ELOG_LINE 1024
+/* reads one line from fp; returns a malloc'd copy the caller frees, or NULL at EOF/error */
+static char *hwloc_getline(FILE *fp)
+{
+    char *ret, *buff;
+    char input[OPAL_HWLOC_MAX_ELOG_LINE];
+
+    ret = fgets(input, OPAL_HWLOC_MAX_ELOG_LINE, fp);
+    if (NULL != ret) {
+        input[strcspn(input, "\n")] = '\0'; /* strip the newline only if one was read */
+        buff = strdup(input);
+        return buff;
+    }
+
+    return NULL;
+}
+
+/* checks local environment to determine if this process
+ * is on a coprocessor - if so, it returns the serial number
+ * as a string, or NULL if it isn't on a coprocessor
+ */
+char* opal_hwloc_base_check_on_coprocessor(void)
+{
+    /* this support currently is limited to Intel Phi processors
+     * but will hopefully be extended as we get better, more
+     * generalized ways of identifying coprocessors
+     */
+    FILE *fp;
+    char *t, *cptr, *e, *cp=NULL;
+
+    if (OPAL_SUCCESS != opal_os_dirpath_access("/tmp/elog", S_IRUSR)) {
+        /* if the file isn't there, or we don't have permission
+         * to read it, then we are not on a coprocessor so far
+         * as we can tell
+         */
+        return NULL;
+    }
+    if (NULL == (fp = fopen("/tmp/elog", "r"))) {
+        /* nothing we can do */
+        return NULL;
+    }
+    /* look for the line containing the serial number of this
+     * card - usually the first line in the file
+     */
+    while (NULL != (cptr = hwloc_getline(fp))) {
+        if (NULL != (t = strstr(cptr, "Card")) && '\0' != t[4]) {
+            /* we want the string right after this - delimited by
+             * a colon at the end; t[4] is not the terminator, so
+             * stepping five characters stays inside the string */
+            t += 5; // move past
"Card " + if (NULL == (e = strchr(t, ':'))) { + /* not what we were expecting */ + free(cptr); + continue; + } + *e = '\0'; + cp = strdup(t); + free(cptr); + break; + } + free(cptr); + } + fclose(fp); + return cp; +} + char* opal_hwloc_base_print_binding(opal_binding_policy_t binding) { char *ret, *bind; diff --git a/opal/mca/hwloc/hwloc.h b/opal/mca/hwloc/hwloc.h index d14f2ccc80..b39620712f 100644 --- a/opal/mca/hwloc/hwloc.h +++ b/opal/mca/hwloc/hwloc.h @@ -79,16 +79,17 @@ enum { OPAL_PROC_LOCALITY_UNKNOWN = 0x0000, OPAL_PROC_NON_LOCAL = 0x8000, OPAL_PROC_ON_CLUSTER = 0x0400, - OPAL_PROC_ON_CU = 0x0200, - OPAL_PROC_ON_NODE = 0x0100, - OPAL_PROC_ON_BOARD = 0x0080, - OPAL_PROC_ON_NUMA = 0x0040, - OPAL_PROC_ON_SOCKET = 0x0020, - OPAL_PROC_ON_L3CACHE = 0x0010, - OPAL_PROC_ON_L2CACHE = 0x0008, - OPAL_PROC_ON_L1CACHE = 0x0004, - OPAL_PROC_ON_CORE = 0x0002, - OPAL_PROC_ON_HWTHREAD = 0x0001, + OPAL_PROC_ON_CU = 0x0600, + OPAL_PROC_ON_HOST = 0x0700, + OPAL_PROC_ON_BOARD = 0x0680, + OPAL_PROC_ON_NODE = 0x0780, // same host and board + OPAL_PROC_ON_NUMA = 0x07c0, + OPAL_PROC_ON_SOCKET = 0x07b0, + OPAL_PROC_ON_L3CACHE = 0x07a0, + OPAL_PROC_ON_L2CACHE = 0x07a8, + OPAL_PROC_ON_L1CACHE = 0x07ac, + OPAL_PROC_ON_CORE = 0x07ab, + OPAL_PROC_ON_HWTHREAD = 0x07aa, OPAL_PROC_ALL_LOCAL = 0x0fff }; @@ -101,7 +102,8 @@ enum { #define OPAL_PROC_ON_LOCAL_SOCKET(n) ((n) & OPAL_PROC_ON_SOCKET) #define OPAL_PROC_ON_LOCAL_NUMA(n) ((n) & OPAL_PROC_ON_NUMA) #define OPAL_PROC_ON_LOCAL_BOARD(n) ((n) & OPAL_PROC_ON_BOARD) -#define OPAL_PROC_ON_LOCAL_NODE(n) ((n) & OPAL_PROC_ON_NODE) +#define OPAL_PROC_ON_LOCAL_HOST(n) ((n) & OPAL_PROC_ON_HOST) +#define OPAL_PROC_ON_LOCAL_NODE(n) (((n) & OPAL_PROC_ON_HOST) && ((n) & OPAL_PROC_ON_BOARD)) #define OPAL_PROC_ON_LOCAL_CU(n) ((n) & OPAL_PROC_ON_CU) #define OPAL_PROC_ON_LOCAL_CLUSTER(n) ((n) & OPAL_PROC_ON_CLUSTER) diff --git a/orte/mca/ess/hnp/ess_hnp_module.c b/orte/mca/ess/hnp/ess_hnp_module.c index 14db0216cb..a761660a4e 100644 --- 
a/orte/mca/ess/hnp/ess_hnp_module.c +++ b/orte/mca/ess/hnp/ess_hnp_module.c @@ -389,8 +389,26 @@ static int rte_init(void) node->name = strdup(orte_process_info.nodename); node->index = opal_pointer_array_set_item(orte_node_pool, 0, node); #if OPAL_HAVE_HWLOC - /* add it to the array of known topologies */ - opal_pointer_array_add(orte_node_topologies, opal_hwloc_topology); + { + char *coprocessors; + /* add it to the array of known topologies */ + opal_pointer_array_add(orte_node_topologies, opal_hwloc_topology); + /* detect and add any coprocessors */ + coprocessors = opal_hwloc_base_find_coprocessors(opal_hwloc_topology); + if (NULL != coprocessors) { + node->coprocessors = opal_argv_split(coprocessors, ','); + node->coprocessor_host = true; + free(coprocessors); + orte_coprocessors_detected = true; + } + /* see if I am on a coprocessor */ + coprocessors = opal_hwloc_base_check_on_coprocessor(); + if (NULL != coprocessors) { + node->coprocessors = opal_argv_split(coprocessors, ','); + free(coprocessors); + orte_coprocessors_detected = true; + } + } #endif /* create and store a proc object for us */ diff --git a/orte/mca/grpcomm/pmi/grpcomm_pmi_module.c b/orte/mca/grpcomm/pmi/grpcomm_pmi_module.c index e372343e77..0d1493c3be 100644 --- a/orte/mca/grpcomm/pmi/grpcomm_pmi_module.c +++ b/orte/mca/grpcomm/pmi/grpcomm_pmi_module.c @@ -252,7 +252,7 @@ static int modex(orte_grpcomm_collective_t *coll) /* if we share a node, but we don't know anything more, then * mark us as on the node as this is all we know */ - locality = OPAL_PROC_ON_NODE; + locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE; } else { /* determine relative location on our node */ locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology, diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index 119831c820..c12b9b7d94 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ 
b/orte/mca/plm/base/plm_base_launch_support.c @@ -13,6 +13,7 @@ * Copyright (c) 2009 Institut National de Recherche en Informatique * et Automatique. All rights reserved. * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * Copyright (c) 2013 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -334,6 +335,8 @@ void orte_plm_base_complete_setup(int fd, short args, void *cbdata) #endif orte_job_t *jdata, *jdatorted; orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + int i, j, k; + orte_node_t *node, *nptr; /* if we don't want to launch the apps, now is the time to leave */ if (orte_do_not_launch) { @@ -406,6 +409,51 @@ void orte_plm_base_complete_setup(int fd, short args, void *cbdata) ORTE_ERROR_LOG(rc); } #endif + + /* if coprocessors were detected, now is the time to + * identify who is attached to what host - this info + * will be shipped to the daemons in the nidmap. Someday, + * there may be a direct way for daemons on coprocessors + * to detect their hosts - but not today. 
+ */ + if (orte_coprocessors_detected) { + /* cycle thru the nodes looking for hosts with + * coprocessors present + */ + for (i=0; i < orte_node_pool->size; i++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { + continue; + } + if (!node->coprocessor_host) { + continue; + } + /* set our hostid to our own daemon vpid */ + node->hostid = node->daemon->name.vpid; + /* cycle thru our list of coprocessors */ + for (j=0; NULL != node->coprocessors[j]; j++) { + /* search the list of nodes for this coprocessor - yes, + * this search stinks for scalability, but we'll have to + * find a more scalable method at some point + */ + for (k=0; k < orte_node_pool->size; k++) { + if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, k))) { + continue; + } + if (nptr->coprocessor_host || NULL == nptr->coprocessors) { + continue; + } + if (0 == strcmp(node->coprocessors[j], nptr->coprocessors[0])) { + /* found it - record the hostid as the vpid of the + * daemon on the host + */ + nptr->hostid = node->daemon->name.vpid; + break; + } + } + } + } + } + /* set the job state to the next position */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_LAUNCH_APPS); @@ -772,49 +820,87 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender, } #if OPAL_HAVE_HWLOC - /* store the local resources for that node */ - if (1 == dname.vpid || orte_hetero_nodes) { - hwloc_topology_t topo, t; - int i; - bool found; - + { + char *coprocessors; + + /* store the local resources for that node */ + if (1 == dname.vpid || orte_hetero_nodes) { + hwloc_topology_t topo, t; + int i; + bool found; + + idx=1; + if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &topo, &idx, OPAL_HWLOC_TOPO))) { + ORTE_ERROR_LOG(rc); + orted_failed_launch = true; + goto CLEANUP; + } + OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, + "%s RECEIVED TOPOLOGY FROM NODE %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodename)); + if (10 < 
opal_output_get_verbosity(orte_plm_base_framework.framework_output)) { + opal_dss.dump(0, topo, OPAL_HWLOC_TOPO); + } + /* do we already have this topology from some other node? */ + found = false; + for (i=0; i < orte_node_topologies->size; i++) { + if (NULL == (t = (hwloc_topology_t)opal_pointer_array_get_item(orte_node_topologies, i))) { + continue; + } + if (OPAL_EQUAL == opal_dss.compare(topo, t, OPAL_HWLOC_TOPO)) { + /* yes - just point to it */ + OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, + "%s TOPOLOGY MATCHES - DISCARDING", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + found = true; + node->topology = t; + hwloc_topology_destroy(topo); + break; + } + } + if (!found) { + /* nope - add it */ + OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, + "%s NEW TOPOLOGY - ADDING", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + opal_pointer_array_add(orte_node_topologies, topo); + node->topology = topo; + } + } + /* unpack any coprocessors */ idx=1; - if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &topo, &idx, OPAL_HWLOC_TOPO))) { + if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &coprocessors, &idx, OPAL_STRING))) { ORTE_ERROR_LOG(rc); orted_failed_launch = true; goto CLEANUP; } - OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, - "%s RECEIVED TOPOLOGY FROM NODE %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodename)); - if (10 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) { - opal_dss.dump(0, topo, OPAL_HWLOC_TOPO); + if (NULL != coprocessors) { + node->coprocessors = opal_argv_split(coprocessors, ','); + node->coprocessor_host = true; + free(coprocessors); + orte_coprocessors_detected = true; } - /* do we already have this topology from some other node? 
*/ - found = false; - for (i=0; i < orte_node_topologies->size; i++) { - if (NULL == (t = (hwloc_topology_t)opal_pointer_array_get_item(orte_node_topologies, i))) { - continue; - } - if (OPAL_EQUAL == opal_dss.compare(topo, t, OPAL_HWLOC_TOPO)) { - /* yes - just point to it */ - OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, - "%s TOPOLOGY MATCHES - DISCARDING", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - found = true; - node->topology = t; - hwloc_topology_destroy(topo); - break; - } + /* see if this daemon is on a coprocessor */ + idx=1; + if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &coprocessors, &idx, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + orted_failed_launch = true; + goto CLEANUP; } - if (!found) { - /* nope - add it */ - OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, - "%s NEW TOPOLOGY - ADDING", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - opal_pointer_array_add(orte_node_topologies, topo); - node->topology = topo; + if (NULL != coprocessors) { + if (NULL != node->coprocessors) { + /* this is not allowed - a coprocessor cannot be host + * to another coprocessor at this time + */ + ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED); + orted_failed_launch = true; + free(coprocessors); + goto CLEANUP; + } + node->coprocessors = opal_argv_split(coprocessors, ','); + free(coprocessors); + orte_coprocessors_detected = true; } } #endif diff --git a/orte/orted/orted_main.c b/orte/orted/orted_main.c index 85222de15e..bf7173fed9 100644 --- a/orte/orted/orted_main.c +++ b/orte/orted/orted_main.c @@ -722,10 +722,23 @@ int orte_daemon(int argc, char *argv[]) } #if OPAL_HAVE_HWLOC - /* add the local topology */ - if (NULL != opal_hwloc_topology && - (1 == ORTE_PROC_MY_NAME->vpid || orte_hetero_nodes)) { - if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO))) { + { + char *coprocessors; + /* add the local topology */ + if (NULL != opal_hwloc_topology && + (1 == ORTE_PROC_MY_NAME->vpid || 
orte_hetero_nodes)) { + if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO))) { + ORTE_ERROR_LOG(ret); + } + } + /* detect and add any coprocessors */ + coprocessors = opal_hwloc_base_find_coprocessors(opal_hwloc_topology); + if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &coprocessors, 1, OPAL_STRING))) { + ORTE_ERROR_LOG(ret); + } + /* see if I am on a coprocessor */ + coprocessors = opal_hwloc_base_check_on_coprocessor(); + if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &coprocessors, 1, OPAL_STRING))) { ORTE_ERROR_LOG(ret); } } diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index 01de9245f3..f479eb9501 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -68,6 +68,7 @@ bool orted_spin_flag = false; char *orte_local_cpu_type = NULL; char *orte_local_cpu_model = NULL; char *orte_basename = NULL; +bool orte_coprocessors_detected = false; /* ORTE OOB port flags */ bool orte_static_ports = false; @@ -815,9 +816,12 @@ OBJ_CLASS_INSTANCE(orte_job_t, static void orte_node_construct(orte_node_t* node) { + node->index = -1; node->name = NULL; node->alias = NULL; - node->index = -1; + node->coprocessors = NULL; + node->coprocessor_host = false; + node->hostid = ORTE_VPID_INVALID; node->daemon = NULL; node->daemon_launched = false; node->location_verified = false; @@ -865,6 +869,11 @@ static void orte_node_destruct(orte_node_t* node) node->alias = NULL; } + if (NULL != node->coprocessors) { + opal_argv_free(node->coprocessors); + node->coprocessors = NULL; + } + if (NULL != node->daemon) { node->daemon->node = NULL; OBJ_RELEASE(node->daemon); diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index 32e366c599..6459517589 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -124,6 +124,7 @@ ORTE_DECLSPEC extern int orte_exit_status; #define ORTE_DB_ARCH "orte.arch" #define ORTE_DB_NPROCS "orte.nprocs" #define ORTE_DB_RMLURI "orte.rmluri" 
+#define ORTE_DB_HOSTID "orte.hostid" /* State Machine lists */ @@ -305,6 +306,18 @@ typedef struct { char *name; /* argv-like array of aliases for this node */ char **alias; + /* argv-like array of co-processor id's on this node */ + char **coprocessors; + /* whether or not this node hosts coprocessors - will + * be true if the coprocessor array contains hosted + * processors, false if this node itself is a coprocessor + */ + bool coprocessor_host; + /* if this "node" is a coprocessor being hosted on a + * different node, then we need to know the id of our + * "host" to help any procs on us to determine locality + */ + orte_vpid_t hostid; /* daemon on this node */ struct orte_proc_t *daemon; /* whether or not this daemon has been launched */ @@ -591,6 +604,7 @@ ORTE_DECLSPEC extern bool orted_spin_flag; ORTE_DECLSPEC extern char *orte_local_cpu_type; ORTE_DECLSPEC extern char *orte_local_cpu_model; ORTE_DECLSPEC extern char *orte_basename; +ORTE_DECLSPEC extern bool orte_coprocessors_detected; /* ORTE OOB port flags */ ORTE_DECLSPEC extern bool orte_static_ports; diff --git a/orte/util/nidmap.c b/orte/util/nidmap.c index e370ddf6ad..b0a128a581 100644 --- a/orte/util/nidmap.c +++ b/orte/util/nidmap.c @@ -264,6 +264,7 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr, bool update) char *ptr, *nodename; orte_job_t *daemons; orte_proc_t *dmn; + uint8_t flag; OPAL_OUTPUT_VERBOSE((2, orte_nidmap_output, "%s orte:util:encode_nidmap", @@ -288,6 +289,17 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr, bool update) return rc; } + /* flag if coprocessors were detected */ + if (orte_coprocessors_detected) { + flag = 1; + } else { + flag = 0; + } + if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &flag, 1, OPAL_UINT8))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* only send info on nodes that have daemons on them, and * only regarding daemons that have changed - i.e., new * daemons since the last time we sent the info - so we @@ -357,6 +369,14 @@ int 
orte_util_encode_nodemap(opal_byte_object_t *boptr, bool update) ORTE_ERROR_LOG(rc); return rc; } + + /* if coprocessors were detected, send the hostid for this node */ + if (orte_coprocessors_detected) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &node->hostid, 1, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } } /* transfer the payload to the byte object */ @@ -380,6 +400,7 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo) int rc=ORTE_SUCCESS; uint8_t oversub; char *nodename; + orte_vpid_t hostid; OPAL_OUTPUT_VERBOSE((1, orte_nidmap_output, "%s decode:nidmap decoding nodemap", @@ -401,6 +422,18 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo) return rc; } + /* see if coprocessors were detected */ + n=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &oversub, &n, OPAL_UINT8))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if (0 == oversub) { + orte_coprocessors_detected = false; + } else { + orte_coprocessors_detected = true; + } + /* set the daemon jobid */ daemon.jobid = ORTE_DAEMON_JOBID(ORTE_PROC_MY_NAME->jobid); @@ -484,6 +517,32 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo) ORTE_ERROR_LOG(rc); return rc; } + + /* if coprocessors were detected, unpack the hostid for the node - this + * value is associate with this daemon, not with any application process + */ + if (orte_coprocessors_detected) { + n=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &hostid, &n, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&daemon, OPAL_SCOPE_NON_PEER, + ORTE_DB_HOSTID, &hostid, OPAL_UINT32))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* if this is my daemon, then store it as my hostid as well */ + if (daemon.vpid == ORTE_PROC_MY_DAEMON->vpid) { + if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)ORTE_PROC_MY_NAME, OPAL_SCOPE_NON_PEER, + ORTE_DB_HOSTID, &hostid, OPAL_UINT32))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* and record it */ + 
orte_process_info.my_hostid = hostid; + } + } } if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { ORTE_ERROR_LOG(rc); @@ -506,7 +565,7 @@ int orte_util_decode_daemon_nodemap(opal_byte_object_t *bo) orte_node_t *node; opal_buffer_t buf; int rc=ORTE_SUCCESS; - uint8_t *oversub; + uint8_t oversub; char *name; orte_job_t *daemons; orte_proc_t *dptr; @@ -532,6 +591,18 @@ int orte_util_decode_daemon_nodemap(opal_byte_object_t *bo) return rc; } + /* see if coprocessors were detected */ + n=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &oversub, &n, OPAL_UINT8))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if (0 == oversub) { + orte_coprocessors_detected = false; + } else { + orte_coprocessors_detected = true; + } + /* transfer the data to the nodes */ daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); daemons->num_procs = num_daemons; @@ -597,6 +668,16 @@ int orte_util_decode_daemon_nodemap(opal_byte_object_t *bo) } else { node->oversubscribed = true; } + + /* if coprocessors were detected, unpack the hostid */ + if (orte_coprocessors_detected) { + n=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &node->hostid, &n, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + } if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { ORTE_ERROR_LOG(rc); @@ -789,7 +870,7 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr, bool update) /* only APPS call this function - daemons have their own */ int orte_util_decode_pidmap(opal_byte_object_t *bo) { - orte_vpid_t num_procs; + orte_vpid_t num_procs, hostid, *vptr; orte_local_rank_t local_rank; orte_node_rank_t node_rank; #if OPAL_HAVE_HWLOC @@ -950,6 +1031,26 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo) ORTE_ERROR_LOG(rc); goto cleanup; } + /* if coprocessors were detected, lookup and store the hostid for this proc */ + if (orte_coprocessors_detected) { + /* lookup the hostid for this daemon */ + vptr = &hostid; + if (ORTE_SUCCESS != (rc = opal_db.fetch((opal_identifier_t*)&dmn, 
ORTE_DB_HOSTID, + (void**)&vptr, OPAL_UINT32))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + OPAL_OUTPUT_VERBOSE((2, orte_nidmap_output, + "%s FOUND HOSTID %s FOR DAEMON %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_VPID_PRINT(hostid), ORTE_VPID_PRINT(dmn.vpid))); + /* store it as hostid for this proc */ + if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&proc, OPAL_SCOPE_NON_PEER, + ORTE_DB_HOSTID, &hostid, OPAL_UINT32))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + } /* lookup and store the hostname for this proc */ if (ORTE_SUCCESS != (rc = opal_db.fetch_pointer((opal_identifier_t*)&dmn, ORTE_DB_HOSTNAME, (void**)&hostname, OPAL_STRING))) { diff --git a/orte/util/proc_info.c b/orte/util/proc_info.c index 6d6c5702aa..54fe4d62e1 100644 --- a/orte/util/proc_info.c +++ b/orte/util/proc_info.c @@ -81,7 +81,8 @@ ORTE_DECLSPEC orte_proc_info_t orte_process_info = { /* .app_rank = */ -1, /* .peer_modex = */ -1, /* .peer_init_barrier = */ -1, - /* .peer_fini_barrier = */ -1 + /* .peer_fini_barrier = */ -1, + /* .my_hostid = */ ORTE_VPID_INVALID }; static bool init=false; diff --git a/orte/util/proc_info.h b/orte/util/proc_info.h index c1dca059e5..dc3b67cfc3 100644 --- a/orte/util/proc_info.h +++ b/orte/util/proc_info.h @@ -11,6 +11,7 @@ * All rights reserved. * Copyright (c) 2011-2012 Los Alamos National Security, LLC. * All rights reserved. + * Copyright (c) 2013 Intel, Inc. All rights reserved * $COPYRIGHT$ * * Additional copyrights may follow @@ -128,6 +129,7 @@ struct orte_proc_info_t { orte_grpcomm_coll_id_t peer_modex; /**< modex collective id */ orte_grpcomm_coll_id_t peer_init_barrier; /**< barrier id during init */ orte_grpcomm_coll_id_t peer_fini_barrier; /**< barrier id during finalize */ + orte_vpid_t my_hostid; /** identifies the local host for a coprocessor */ }; typedef struct orte_proc_info_t orte_proc_info_t;