****************************************************************
This change contains a non-mandatory modification of the
MPI-RTE interface. Anyone wishing to support coprocessors
such as the Xeon Phi may wish to add the required definition
and underlying support.
****************************************************************

Add locality support for coprocessors such as the Intel Xeon Phi.

Detecting that we are on a coprocessor inside a host node isn't straightforward. There are no good "hooks" provided for programmatically detecting that "we are on a coprocessor running its own OS" - the ORTE daemon just thinks it is on another node. However, to properly use the Phi's public interface for MPI transport, the daemon must detect that it is colocated with procs on the host.

So we have to split the locality to separately record "on the same host" vs "on the same board". We already have a board-level locality flag, but not quite enough flexibility to handle this use case. Thus, do the following (a usage sketch follows the list):

1. add an OPAL_PROC_ON_HOST flag to indicate that we share a host, but not necessarily the same board

2. modify OPAL_PROC_ON_NODE to indicate that we share both a host AND the same board. Note that the OPAL_PROC_ON_LOCAL_NODE macro must now explicitly check both conditions

3. add support in opal/mca/hwloc/base/hwloc_base_util.c for the host to check for coprocessors, and for daemons to check whether they are on a coprocessor. The former is done via hwloc, but support for the latter is not yet provided by hwloc, so the detection code is currently Xeon Phi specific - hopefully, more generic methods will be found in the future.

4. modify the orted and HNP startup so they check for coprocessors and whether they themselves are on a coprocessor, and have the orteds pass that info back in their callback message. Automatically detect that coprocessors have been found and identify which coprocessors are on which hosts. Note that this algorithm isn't scalable at the moment - this will hopefully be improved over time.

5. modify the ompi proc locality detection function to look for coprocessor host info IF the OMPI_RTE_HOST_ID database key has been defined. RTEs that choose not to provide this support need not do anything - the associated code will simply be ignored.

6. include some cleanup of the hwloc open/close code so it conforms to how we did things in other frameworks (e.g., a single "frame" file instead of separate open/close files). Also, fix the locality flags - e.g., being on the same node means you must also be on the same cluster/cu, so ensure those flags are also set.
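
Usage sketch (illustrative only, not part of this commit): a transport-selection routine could use the revised macros to tell the three cases apart. The helper name and the transport choices below are hypothetical; the macros and the opal_hwloc_locality_t type come from the opal hwloc header as modified here.

    #include "opal/mca/hwloc/hwloc.h"   /* opal_hwloc_locality_t, OPAL_PROC_ON_LOCAL_* */

    /* hypothetical helper - shows the intent of the split: shared memory
     * is only safe when peers share the actual board, while host-level
     * locality (e.g., host CPU <-> Phi) still implies a faster path than
     * going off-host entirely.
     */
    static const char *pick_transport(opal_hwloc_locality_t locality)
    {
        if (OPAL_PROC_ON_LOCAL_NODE(locality)) {
            return "sm";    /* same host AND same board: shared memory */
        }
        if (OPAL_PROC_ON_LOCAL_HOST(locality)) {
            return "scif";  /* same host, different board: use the Phi's public interface */
        }
        return "tcp";       /* different hosts */
    }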

cmr:v1.7.4:reviewer=hjelmn

This commit was SVN r29435.
This commit is contained in:
Ralph Castain 2013-10-14 16:52:58 +00:00
parent 48c2728b1d
commit 24c811805f
17 changed files with 504 additions and 132 deletions

View file

@@ -56,6 +56,7 @@ typedef orte_ns_cmp_bitmask_t ompi_rte_cmp_bitmask_t;
 #define OMPI_PROCESS_NAME_NTOH ORTE_PROCESS_NAME_NTOH
 #define OMPI_RTE_NODE_ID ORTE_DB_DAEMON_VPID
 #define OMPI_RTE_MY_NODEID ORTE_PROC_MY_DAEMON->vpid
+#define OMPI_RTE_HOST_ID ORTE_DB_HOSTID
 /* Collective objects and operations */
 #define ompi_rte_collective_t orte_grpcomm_collective_t
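
As the banner above notes, this definition is optional. An RTE other than ORTE that wants coprocessor locality support would, roughly, map the key onto whatever database key it uses to publish each daemon's hostid - a sketch with hypothetical names:

    /* hypothetical my_rte.h - merely defining OMPI_RTE_HOST_ID opts in to
     * the coprocessor test in ompi_proc_set_locality() (next file below);
     * leaving it undefined compiles that branch out entirely.
     */
    #define OMPI_RTE_HOST_ID  MYRTE_DB_HOSTID  /* key under which this RTE stores hostids */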

View file

@@ -148,9 +148,28 @@ static int ompi_proc_set_locality(ompi_proc_t *proc)
                                            (void**)&vptr, OPAL_UINT32))) {
         return ret;
     }
-    /* if we are on different nodes, then we are non-local */
+    /* if we are on different nodes, then we are probably non-local */
     if (vpid != OMPI_RTE_MY_NODEID) {
+#ifdef OMPI_RTE_HOST_ID
+        /* see if coprocessors were detected - if the hostid isn't
+         * present, then no coprocessors were detected and we can
+         * ignore this test
+         */
+        vptr = &vpid;
+        if (OMPI_SUCCESS == opal_db.fetch((opal_identifier_t*)&proc->proc_name, OMPI_RTE_HOST_ID,
+                                          (void**)&vptr, OPAL_UINT32)) {
+            /* if this matches my host id, then we are on the same host,
+             * but not on the same board
+             */
+            if (vpid == ompi_process_info.my_hostid) {
+                locality = OPAL_PROC_ON_HOST;
+            } else {
+                locality = OPAL_PROC_NON_LOCAL;
+            }
+        }
+#else
         locality = OPAL_PROC_NON_LOCAL;
+#endif
     } else {
 #if OPAL_HAVE_HWLOC
     {

View file

@@ -13,8 +13,7 @@ headers += \
         base/base.h
 libmca_hwloc_la_SOURCES += \
-        base/hwloc_base_close.c \
-        base/hwloc_base_open.c
+        base/hwloc_base_frame.c
 if OPAL_HAVE_HWLOC
 libmca_hwloc_la_SOURCES += \

View file

@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -60,6 +61,7 @@ OPAL_DECLSPEC char* opal_hwloc_base_print_locality(opal_hwloc_locality_t localit
 OPAL_DECLSPEC extern char *opal_hwloc_base_slot_list;
 OPAL_DECLSPEC extern char *opal_hwloc_base_cpu_set;
 OPAL_DECLSPEC extern hwloc_cpuset_t opal_hwloc_base_given_cpus;
+OPAL_DECLSPEC extern char *opal_hwloc_base_topo_file;
 /* convenience macro for debugging */
 #define OPAL_HWLOC_SHOW_BINDING(n, v) \
@@ -193,6 +195,10 @@ OPAL_DECLSPEC int opal_hwloc_base_slot_list_parse(const char *slot_str,
                                                   hwloc_topology_t topo,
                                                   hwloc_cpuset_t cpumask);
+OPAL_DECLSPEC char* opal_hwloc_base_find_coprocessors(hwloc_topology_t topo);
+OPAL_DECLSPEC char* opal_hwloc_base_check_on_coprocessor(void);
 /**
  * Report a bind failure using the normal mechanisms if a component
  * fails to bind memory -- according to the value of the

View file

@@ -1,50 +0,0 @@
-/*
- * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
- * $COPYRIGHT$
- *
- * Additional copyrights may follow
- *
- * $HEADER$
- */
-#include "opal_config.h"
-#include "opal/constants.h"
-#include "opal/mca/mca.h"
-#include "opal/mca/base/base.h"
-#include "opal/mca/hwloc/hwloc.h"
-#include "opal/mca/hwloc/base/base.h"
-#include "opal/util/output.h"
-int opal_hwloc_base_close(void);
-int opal_hwloc_base_close(void)
-{
-    if (!opal_hwloc_base_inited) {
-        return OPAL_SUCCESS;
-    }
-#if OPAL_HAVE_HWLOC
-    {
-        int ret;
-        /* no need to close the component as it was statically opened */
-        /* for support of tools such as ompi_info */
-        ret = mca_base_framework_components_close (&opal_hwloc_base_framework, NULL);
-        if (OPAL_SUCCESS != ret) {
-            return ret;
-        }
-        /* free memory */
-        if (NULL != opal_hwloc_my_cpuset) {
-            hwloc_bitmap_free(opal_hwloc_my_cpuset);
-            opal_hwloc_my_cpuset = NULL;
-        }
-    }
-#endif
-    /* All done */
-    opal_hwloc_base_inited = false;
-    return OPAL_SUCCESS;
-}

View file

@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2011-2013 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -56,6 +57,7 @@ hwloc_obj_type_t opal_hwloc_levels[] = {
     HWLOC_OBJ_PU
 };
 bool opal_hwloc_use_hwthreads_as_cpus = false;
+char *opal_hwloc_base_topo_file = NULL;
 #endif
 #if OPAL_HAVE_HWLOC
@@ -75,8 +77,7 @@ static mca_base_var_enum_value_t hwloc_failure_action[] = {
 static int opal_hwloc_base_register(mca_base_register_flag_t flags);
 static int opal_hwloc_base_open(mca_base_open_flag_t flags);
-/* defined in hwloc_base_close.c */
-int opal_hwloc_base_close(void);
+static int opal_hwloc_base_close(void);
 MCA_BASE_FRAMEWORK_DECLARE(opal, hwloc, NULL, opal_hwloc_base_register, opal_hwloc_base_open, opal_hwloc_base_close,
                            mca_hwloc_base_static_components, 0);
@@ -162,6 +163,12 @@ static int opal_hwloc_base_register(mca_base_register_flag_t flags)
                                  MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9,
                                  MCA_BASE_VAR_SCOPE_READONLY, &opal_hwloc_use_hwthreads_as_cpus);
+    opal_hwloc_base_topo_file = NULL;
+    (void) mca_base_var_register("opal", "hwloc", "base", "topo_file",
+                                 "Read local topology from file instead of directly sensing it",
+                                 MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_9,
+                                 MCA_BASE_VAR_SCOPE_READONLY, &opal_hwloc_base_topo_file);
 #endif
     /* register parameters */
     return OPAL_SUCCESS;
@@ -299,6 +306,37 @@ static int opal_hwloc_base_open(mca_base_open_flag_t flags)
     return OPAL_SUCCESS;
 }
+static int opal_hwloc_base_close(void)
+{
+    if (!opal_hwloc_base_inited) {
+        return OPAL_SUCCESS;
+    }
+#if OPAL_HAVE_HWLOC
+    {
+        int ret;
+        /* no need to close the component as it was statically opened */
+        /* for support of tools such as ompi_info */
+        ret = mca_base_framework_components_close (&opal_hwloc_base_framework, NULL);
+        if (OPAL_SUCCESS != ret) {
+            return ret;
+        }
+        /* free memory */
+        if (NULL != opal_hwloc_my_cpuset) {
+            hwloc_bitmap_free(opal_hwloc_my_cpuset);
+            opal_hwloc_my_cpuset = NULL;
+        }
+    }
+#endif
+    /* All done */
+    opal_hwloc_base_inited = false;
+    return OPAL_SUCCESS;
+}
 static bool fns_init=false;
 static opal_tsd_key_t print_tsd_key;
 char* opal_hwloc_print_null = "NULL";

View file

@@ -12,6 +12,7 @@
  * Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
  * Copyright (c) 2012-2013 Los Alamos National Security, LLC.
  *                         All rights reserved.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -33,6 +34,7 @@
 #include "opal/constants.h"
 #include "opal/util/argv.h"
 #include "opal/util/output.h"
+#include "opal/util/os_dirpath.h"
 #include "opal/util/show_help.h"
 #include "opal/threads/tsd.h"
@@ -211,18 +213,25 @@ int opal_hwloc_base_get_topology(void)
     OPAL_OUTPUT_VERBOSE((5, opal_hwloc_base_framework.framework_output,
                          "hwloc:base:get_topology"));
-    if (0 != hwloc_topology_init(&opal_hwloc_topology) ||
-        0 != hwloc_topology_set_flags(opal_hwloc_topology,
-                                      (HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM |
-                                       HWLOC_TOPOLOGY_FLAG_IO_DEVICES)) ||
-        0 != hwloc_topology_load(opal_hwloc_topology)) {
-        return OPAL_ERR_NOT_SUPPORTED;
-    }
-    /* filter the cpus thru any default cpu set */
-    rc = opal_hwloc_base_filter_cpus(opal_hwloc_topology);
-    if (OPAL_SUCCESS != rc) {
-        return rc;
+    if (NULL == opal_hwloc_base_topo_file) {
+        if (0 != hwloc_topology_init(&opal_hwloc_topology) ||
+            0 != hwloc_topology_set_flags(opal_hwloc_topology,
+                                          (HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM |
+                                           HWLOC_TOPOLOGY_FLAG_IO_DEVICES)) ||
+            0 != hwloc_topology_load(opal_hwloc_topology)) {
+            return OPAL_ERR_NOT_SUPPORTED;
+        }
+        /* filter the cpus thru any default cpu set */
+        rc = opal_hwloc_base_filter_cpus(opal_hwloc_topology);
+        if (OPAL_SUCCESS != rc) {
+            return rc;
+        }
+    } else {
+        rc = opal_hwloc_base_set_topology(opal_hwloc_base_topo_file);
+        if (OPAL_SUCCESS != rc) {
+            return rc;
+        }
     }
     /* fill opal_cache_line_size global with the smallest L1 cache
@@ -1309,7 +1318,7 @@ opal_hwloc_locality_t opal_hwloc_base_get_relative_locality(hwloc_topology_t top
      * NOTE: we may alter that latter part as hwloc's ability to
      * sense multi-cu, multi-cluster systems grows
      */
-    locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE | OPAL_PROC_ON_BOARD;
+    locality = OPAL_PROC_ON_NODE;
     /* if either cpuset is NULL, then that isn't bound */
     if (NULL == cpuset1 || NULL == cpuset2) {
@@ -1357,25 +1366,25 @@ opal_hwloc_locality_t opal_hwloc_base_get_relative_locality(hwloc_topology_t top
             shared = true;
             switch(obj->type) {
             case HWLOC_OBJ_NODE:
-                locality |= OPAL_PROC_ON_NUMA;
+                locality = OPAL_PROC_ON_NUMA;
                 break;
             case HWLOC_OBJ_SOCKET:
-                locality |= OPAL_PROC_ON_SOCKET;
+                locality = OPAL_PROC_ON_SOCKET;
                 break;
             case HWLOC_OBJ_CACHE:
                 if (3 == obj->attr->cache.depth) {
-                    locality |= OPAL_PROC_ON_L3CACHE;
+                    locality = OPAL_PROC_ON_L3CACHE;
                 } else if (2 == obj->attr->cache.depth) {
-                    locality |= OPAL_PROC_ON_L2CACHE;
+                    locality = OPAL_PROC_ON_L2CACHE;
                 } else {
-                    locality |= OPAL_PROC_ON_L1CACHE;
+                    locality = OPAL_PROC_ON_L1CACHE;
                 }
                 break;
             case HWLOC_OBJ_CORE:
-                locality |= OPAL_PROC_ON_CORE;
+                locality = OPAL_PROC_ON_CORE;
                 break;
             case HWLOC_OBJ_PU:
-                locality |= OPAL_PROC_ON_HWTHREAD;
+                locality = OPAL_PROC_ON_HWTHREAD;
                 break;
             default:
                 /* just ignore it */
@@ -1404,6 +1413,110 @@ opal_hwloc_locality_t opal_hwloc_base_get_relative_locality(hwloc_topology_t top
     return locality;
 }
+/* searches the given topology for coprocessor objects and returns
+ * their serial numbers as a comma-delimited string, or NULL
+ * if no coprocessors are found
+ */
+char* opal_hwloc_base_find_coprocessors(hwloc_topology_t topo)
+{
+    hwloc_obj_t osdev;
+    unsigned i;
+    char **cps = NULL;
+    char *cpstring = NULL;
+    int depth;
+    /* coprocessors are recorded under OS_DEVICEs, so first
+     * see if we have any of those
+     */
+    if (HWLOC_TYPE_DEPTH_UNKNOWN == (depth = hwloc_get_type_depth(topo, HWLOC_OBJ_OS_DEVICE))) {
+        return NULL;
+    }
+    /* check the device objects for coprocessors */
+    osdev = hwloc_get_obj_by_depth(topo, depth, 0);
+    while (NULL != osdev) {
+        if (HWLOC_OBJ_OSDEV_COPROC == osdev->attr->osdev.type) {
+            /* got one! find and save its serial number */
+            for (i=0; i < osdev->infos_count; i++) {
+                if (0 == strncmp(osdev->infos[i].name, "MICSerialNumber", strlen("MICSerialNumber"))) {
+                    opal_argv_append_nosize(&cps, osdev->infos[i].value);
+                }
+            }
+        }
+        osdev = osdev->next_cousin;
+    }
+    if (NULL != cps) {
+        cpstring = opal_argv_join(cps, ',');
+        opal_argv_free(cps);
+    }
+    return cpstring;
+}
+#define OPAL_HWLOC_MAX_ELOG_LINE 1024
+static char *hwloc_getline(FILE *fp)
+{
+    char *ret, *buff;
+    char input[OPAL_HWLOC_MAX_ELOG_LINE];
+    ret = fgets(input, OPAL_HWLOC_MAX_ELOG_LINE, fp);
+    if (NULL != ret) {
+        input[strlen(input)-1] = '\0';  /* remove newline */
+        buff = strdup(input);
+        return buff;
+    }
+    return NULL;
+}
+/* checks local environment to determine if this process
+ * is on a coprocessor - if so, it returns the serial number
+ * as a string, or NULL if it isn't on a coprocessor
+ */
+char* opal_hwloc_base_check_on_coprocessor(void)
+{
+    /* this support currently is limited to Intel Phi processors
+     * but will hopefully be extended as we get better, more
+     * generalized ways of identifying coprocessors
+     */
+    FILE *fp;
+    char *t, *cptr, *e, *cp=NULL;
+    if (OPAL_SUCCESS != opal_os_dirpath_access("/tmp/elog", S_IRUSR)) {
+        /* if the file isn't there, or we don't have permission
+         * to read it, then we are not on a coprocessor so far
+         * as we can tell
+         */
+        return NULL;
+    }
+    if (NULL == (fp = fopen("/tmp/elog", "r"))) {
+        /* nothing we can do */
+        return NULL;
+    }
+    /* look for the line containing the serial number of this
+     * card - usually the first line in the file
+     */
+    while (NULL != (cptr = hwloc_getline(fp))) {
+        if (NULL != (t = strstr(cptr, "Card"))) {
+            /* we want the string right after this - delimited by
+             * a colon at the end
+             */
+            t += 5;  // move past "Card "
+            if (NULL == (e = strchr(t, ':'))) {
+                /* not what we were expecting */
+                free(cptr);
+                continue;
+            }
+            *e = '\0';
+            cp = strdup(t);
+            free(cptr);
+            break;
+        }
+        free(cptr);
+    }
+    fclose(fp);
+    return cp;
+}
 char* opal_hwloc_base_print_binding(opal_binding_policy_t binding)
 {
     char *ret, *bind;

View file

@@ -79,16 +79,17 @@ enum {
     OPAL_PROC_LOCALITY_UNKNOWN  = 0x0000,
     OPAL_PROC_NON_LOCAL         = 0x8000,
     OPAL_PROC_ON_CLUSTER        = 0x0400,
-    OPAL_PROC_ON_CU             = 0x0200,
-    OPAL_PROC_ON_NODE           = 0x0100,
-    OPAL_PROC_ON_BOARD          = 0x0080,
-    OPAL_PROC_ON_NUMA           = 0x0040,
-    OPAL_PROC_ON_SOCKET         = 0x0020,
-    OPAL_PROC_ON_L3CACHE        = 0x0010,
-    OPAL_PROC_ON_L2CACHE        = 0x0008,
-    OPAL_PROC_ON_L1CACHE        = 0x0004,
-    OPAL_PROC_ON_CORE           = 0x0002,
-    OPAL_PROC_ON_HWTHREAD       = 0x0001,
+    OPAL_PROC_ON_CU             = 0x0600,
+    OPAL_PROC_ON_HOST           = 0x0700,
+    OPAL_PROC_ON_BOARD          = 0x0680,
+    OPAL_PROC_ON_NODE           = 0x0780,   // same host and board
+    OPAL_PROC_ON_NUMA           = 0x07c0,
+    OPAL_PROC_ON_SOCKET         = 0x07b0,
+    OPAL_PROC_ON_L3CACHE        = 0x07a0,
+    OPAL_PROC_ON_L2CACHE        = 0x07a8,
+    OPAL_PROC_ON_L1CACHE        = 0x07ac,
+    OPAL_PROC_ON_CORE           = 0x07ab,
+    OPAL_PROC_ON_HWTHREAD       = 0x07aa,
     OPAL_PROC_ALL_LOCAL         = 0x0fff
 };
@@ -101,7 +102,8 @@ enum {
 #define OPAL_PROC_ON_LOCAL_SOCKET(n)    ((n) & OPAL_PROC_ON_SOCKET)
 #define OPAL_PROC_ON_LOCAL_NUMA(n)      ((n) & OPAL_PROC_ON_NUMA)
 #define OPAL_PROC_ON_LOCAL_BOARD(n)     ((n) & OPAL_PROC_ON_BOARD)
-#define OPAL_PROC_ON_LOCAL_NODE(n)      ((n) & OPAL_PROC_ON_NODE)
+#define OPAL_PROC_ON_LOCAL_HOST(n)      ((n) & OPAL_PROC_ON_HOST)
+#define OPAL_PROC_ON_LOCAL_NODE(n)      (((n) & OPAL_PROC_ON_HOST) && ((n) & OPAL_PROC_ON_BOARD))
 #define OPAL_PROC_ON_LOCAL_CU(n)        ((n) & OPAL_PROC_ON_CU)
 #define OPAL_PROC_ON_LOCAL_CLUSTER(n)   ((n) & OPAL_PROC_ON_CLUSTER)
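
A quick standalone check of the implications encoded above (illustrative, not part of the commit; the constants simply mirror the new enum values):

    #include <assert.h>

    #define ON_CLUSTER 0x0400
    #define ON_CU      0x0600
    #define ON_HOST    0x0700
    #define ON_BOARD   0x0680
    #define ON_NODE    0x0780   /* same host AND board */

    int main(void)
    {
        /* a node-local peer also tests as cluster-, cu-, host-, and board-local,
         * per item 6 of the commit message */
        assert((ON_NODE & ON_CLUSTER) == ON_CLUSTER);
        assert((ON_NODE & ON_CU) == ON_CU);
        assert((ON_NODE & ON_HOST) == ON_HOST);
        assert((ON_NODE & ON_BOARD) == ON_BOARD);
        /* and node locality is exactly the union of host and board locality */
        assert((ON_HOST | ON_BOARD) == ON_NODE);
        return 0;
    }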

View file

@@ -389,8 +389,26 @@ static int rte_init(void)
     node->name = strdup(orte_process_info.nodename);
     node->index = opal_pointer_array_set_item(orte_node_pool, 0, node);
 #if OPAL_HAVE_HWLOC
-    /* add it to the array of known topologies */
-    opal_pointer_array_add(orte_node_topologies, opal_hwloc_topology);
+    {
+        char *coprocessors;
+        /* add it to the array of known topologies */
+        opal_pointer_array_add(orte_node_topologies, opal_hwloc_topology);
+        /* detect and add any coprocessors */
+        coprocessors = opal_hwloc_base_find_coprocessors(opal_hwloc_topology);
+        if (NULL != coprocessors) {
+            node->coprocessors = opal_argv_split(coprocessors, ',');
+            node->coprocessor_host = true;
+            free(coprocessors);
+            orte_coprocessors_detected = true;
+        }
+        /* see if I am on a coprocessor */
+        coprocessors = opal_hwloc_base_check_on_coprocessor();
+        if (NULL != coprocessors) {
+            node->coprocessors = opal_argv_split(coprocessors, ',');
+            free(coprocessors);
+            orte_coprocessors_detected = true;
+        }
+    }
 #endif
     /* create and store a proc object for us */

View file

@@ -252,7 +252,7 @@ static int modex(orte_grpcomm_collective_t *coll)
             /* if we share a node, but we don't know anything more, then
              * mark us as on the node as this is all we know
              */
-            locality = OPAL_PROC_ON_NODE;
+            locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
         } else {
             /* determine relative location on our node */
             locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,

View file

@@ -13,6 +13,7 @@
  * Copyright (c) 2009      Institut National de Recherche en Informatique
  *                         et Automatique. All rights reserved.
  * Copyright (c) 2011-2012 Los Alamos National Security, LLC.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -334,6 +335,8 @@ void orte_plm_base_complete_setup(int fd, short args, void *cbdata)
 #endif
     orte_job_t *jdata, *jdatorted;
     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
+    int i, j, k;
+    orte_node_t *node, *nptr;
     /* if we don't want to launch the apps, now is the time to leave */
     if (orte_do_not_launch) {
@@ -406,6 +409,51 @@ void orte_plm_base_complete_setup(int fd, short args, void *cbdata)
         ORTE_ERROR_LOG(rc);
     }
 #endif
+    /* if coprocessors were detected, now is the time to
+     * identify who is attached to what host - this info
+     * will be shipped to the daemons in the nidmap. Someday,
+     * there may be a direct way for daemons on coprocessors
+     * to detect their hosts - but not today.
+     */
+    if (orte_coprocessors_detected) {
+        /* cycle thru the nodes looking for hosts with
+         * coprocessors present
+         */
+        for (i=0; i < orte_node_pool->size; i++) {
+            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
+                continue;
+            }
+            if (!node->coprocessor_host) {
+                continue;
+            }
+            /* set our hostid to our own daemon vpid */
+            node->hostid = node->daemon->name.vpid;
+            /* cycle thru our list of coprocessors */
+            for (j=0; NULL != node->coprocessors[j]; j++) {
+                /* search the list of nodes for this coprocessor - yes,
+                 * this search stinks for scalability, but we'll have to
+                 * find a more scalable method at some point
+                 */
+                for (k=0; k < orte_node_pool->size; k++) {
+                    if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, k))) {
+                        continue;
+                    }
+                    if (nptr->coprocessor_host || NULL == nptr->coprocessors) {
+                        continue;
+                    }
+                    if (0 == strcmp(node->coprocessors[j], nptr->coprocessors[0])) {
+                        /* found it - record the hostid as the vpid of the
+                         * daemon on the host
+                         */
+                        nptr->hostid = node->daemon->name.vpid;
+                        break;
+                    }
+                }
+            }
+        }
+    }
     /* set the job state to the next position */
     ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_LAUNCH_APPS);
@@ -772,49 +820,87 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
     }
 #if OPAL_HAVE_HWLOC
-        /* store the local resources for that node */
-        if (1 == dname.vpid || orte_hetero_nodes) {
-            hwloc_topology_t topo, t;
-            int i;
-            bool found;
-            idx=1;
-            if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &topo, &idx, OPAL_HWLOC_TOPO))) {
-                ORTE_ERROR_LOG(rc);
-                orted_failed_launch = true;
-                goto CLEANUP;
-            }
-            OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
-                                 "%s RECEIVED TOPOLOGY FROM NODE %s",
-                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodename));
-            if (10 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
-                opal_dss.dump(0, topo, OPAL_HWLOC_TOPO);
-            }
-            /* do we already have this topology from some other node? */
-            found = false;
-            for (i=0; i < orte_node_topologies->size; i++) {
-                if (NULL == (t = (hwloc_topology_t)opal_pointer_array_get_item(orte_node_topologies, i))) {
-                    continue;
-                }
-                if (OPAL_EQUAL == opal_dss.compare(topo, t, OPAL_HWLOC_TOPO)) {
-                    /* yes - just point to it */
-                    OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
-                                         "%s TOPOLOGY MATCHES - DISCARDING",
-                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
-                    found = true;
-                    node->topology = t;
-                    hwloc_topology_destroy(topo);
-                    break;
-                }
-            }
-            if (!found) {
-                /* nope - add it */
-                OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
-                                     "%s NEW TOPOLOGY - ADDING",
-                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
-                opal_pointer_array_add(orte_node_topologies, topo);
-                node->topology = topo;
-            }
-        }
+        {
+            char *coprocessors;
+            /* store the local resources for that node */
+            if (1 == dname.vpid || orte_hetero_nodes) {
+                hwloc_topology_t topo, t;
+                int i;
+                bool found;
+                idx=1;
+                if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &topo, &idx, OPAL_HWLOC_TOPO))) {
+                    ORTE_ERROR_LOG(rc);
+                    orted_failed_launch = true;
+                    goto CLEANUP;
+                }
+                OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
+                                     "%s RECEIVED TOPOLOGY FROM NODE %s",
+                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodename));
+                if (10 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
+                    opal_dss.dump(0, topo, OPAL_HWLOC_TOPO);
+                }
+                /* do we already have this topology from some other node? */
+                found = false;
+                for (i=0; i < orte_node_topologies->size; i++) {
+                    if (NULL == (t = (hwloc_topology_t)opal_pointer_array_get_item(orte_node_topologies, i))) {
+                        continue;
+                    }
+                    if (OPAL_EQUAL == opal_dss.compare(topo, t, OPAL_HWLOC_TOPO)) {
+                        /* yes - just point to it */
+                        OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
+                                             "%s TOPOLOGY MATCHES - DISCARDING",
+                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+                        found = true;
+                        node->topology = t;
+                        hwloc_topology_destroy(topo);
+                        break;
+                    }
+                }
+                if (!found) {
+                    /* nope - add it */
+                    OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
+                                         "%s NEW TOPOLOGY - ADDING",
+                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+                    opal_pointer_array_add(orte_node_topologies, topo);
+                    node->topology = topo;
+                }
+            }
+            /* unpack any coprocessors */
+            idx=1;
+            if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &coprocessors, &idx, OPAL_STRING))) {
+                ORTE_ERROR_LOG(rc);
+                orted_failed_launch = true;
+                goto CLEANUP;
+            }
+            if (NULL != coprocessors) {
+                node->coprocessors = opal_argv_split(coprocessors, ',');
+                node->coprocessor_host = true;
+                free(coprocessors);
+                orte_coprocessors_detected = true;
+            }
+            /* see if this daemon is on a coprocessor */
+            idx=1;
+            if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &coprocessors, &idx, OPAL_STRING))) {
+                ORTE_ERROR_LOG(rc);
+                orted_failed_launch = true;
+                goto CLEANUP;
+            }
+            if (NULL != coprocessors) {
+                if (NULL != node->coprocessors) {
+                    /* this is not allowed - a coprocessor cannot be host
+                     * to another coprocessor at this time
+                     */
+                    ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED);
+                    orted_failed_launch = true;
+                    free(coprocessors);
+                    goto CLEANUP;
+                }
+                node->coprocessors = opal_argv_split(coprocessors, ',');
+                free(coprocessors);
+                orte_coprocessors_detected = true;
+            }
+        }
 #endif

View file

@@ -722,10 +722,23 @@ int orte_daemon(int argc, char *argv[])
     }
 #if OPAL_HAVE_HWLOC
-    /* add the local topology */
-    if (NULL != opal_hwloc_topology &&
-        (1 == ORTE_PROC_MY_NAME->vpid || orte_hetero_nodes)) {
-        if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO))) {
-            ORTE_ERROR_LOG(ret);
-        }
-    }
+    {
+        char *coprocessors;
+        /* add the local topology */
+        if (NULL != opal_hwloc_topology &&
+            (1 == ORTE_PROC_MY_NAME->vpid || orte_hetero_nodes)) {
+            if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO))) {
+                ORTE_ERROR_LOG(ret);
+            }
+        }
+        /* detect and add any coprocessors */
+        coprocessors = opal_hwloc_base_find_coprocessors(opal_hwloc_topology);
+        if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &coprocessors, 1, OPAL_STRING))) {
+            ORTE_ERROR_LOG(ret);
+        }
+        /* see if I am on a coprocessor */
+        coprocessors = opal_hwloc_base_check_on_coprocessor();
+        if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &coprocessors, 1, OPAL_STRING))) {
+            ORTE_ERROR_LOG(ret);
+        }
+    }

View file

@@ -68,6 +68,7 @@ bool orted_spin_flag = false;
 char *orte_local_cpu_type = NULL;
 char *orte_local_cpu_model = NULL;
 char *orte_basename = NULL;
+bool orte_coprocessors_detected = false;
 /* ORTE OOB port flags */
 bool orte_static_ports = false;
@@ -815,9 +816,12 @@ OBJ_CLASS_INSTANCE(orte_job_t,
 static void orte_node_construct(orte_node_t* node)
 {
+    node->index = -1;
     node->name = NULL;
     node->alias = NULL;
-    node->index = -1;
+    node->coprocessors = NULL;
+    node->coprocessor_host = false;
+    node->hostid = ORTE_VPID_INVALID;
     node->daemon = NULL;
     node->daemon_launched = false;
     node->location_verified = false;
@@ -865,6 +869,11 @@ static void orte_node_destruct(orte_node_t* node)
         node->alias = NULL;
     }
+    if (NULL != node->coprocessors) {
+        opal_argv_free(node->coprocessors);
+        node->coprocessors = NULL;
+    }
     if (NULL != node->daemon) {
         node->daemon->node = NULL;
         OBJ_RELEASE(node->daemon);

View file

@@ -124,6 +124,7 @@ ORTE_DECLSPEC extern int orte_exit_status;
 #define ORTE_DB_ARCH     "orte.arch"
 #define ORTE_DB_NPROCS   "orte.nprocs"
 #define ORTE_DB_RMLURI   "orte.rmluri"
+#define ORTE_DB_HOSTID   "orte.hostid"
 /* State Machine lists */
@@ -305,6 +306,18 @@ typedef struct {
     char *name;
     /* argv-like array of aliases for this node */
     char **alias;
+    /* argv-like array of coprocessor ids on this node */
+    char **coprocessors;
+    /* whether or not this node hosts coprocessors - will
+     * be true if the coprocessor array contains hosted
+     * coprocessors, false if this node itself is a coprocessor
+     */
+    bool coprocessor_host;
+    /* if this "node" is a coprocessor being hosted on a
+     * different node, then we need to know the id of our
+     * "host" to help any procs on us to determine locality
+     */
+    orte_vpid_t hostid;
     /* daemon on this node */
     struct orte_proc_t *daemon;
     /* whether or not this daemon has been launched */
@@ -591,6 +604,7 @@ ORTE_DECLSPEC extern bool orted_spin_flag;
 ORTE_DECLSPEC extern char *orte_local_cpu_type;
 ORTE_DECLSPEC extern char *orte_local_cpu_model;
 ORTE_DECLSPEC extern char *orte_basename;
+ORTE_DECLSPEC extern bool orte_coprocessors_detected;
 /* ORTE OOB port flags */
 ORTE_DECLSPEC extern bool orte_static_ports;

View file

@@ -264,6 +264,7 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr, bool update)
     char *ptr, *nodename;
     orte_job_t *daemons;
     orte_proc_t *dmn;
+    uint8_t flag;
     OPAL_OUTPUT_VERBOSE((2, orte_nidmap_output,
                          "%s orte:util:encode_nidmap",
@@ -288,6 +289,17 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr, bool update)
         return rc;
     }
+    /* flag if coprocessors were detected */
+    if (orte_coprocessors_detected) {
+        flag = 1;
+    } else {
+        flag = 0;
+    }
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &flag, 1, OPAL_UINT8))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
     /* only send info on nodes that have daemons on them, and
      * only regarding daemons that have changed - i.e., new
      * daemons since the last time we sent the info - so we
@@ -357,6 +369,14 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr, bool update)
             ORTE_ERROR_LOG(rc);
             return rc;
         }
+        /* if coprocessors were detected, send the hostid for this node */
+        if (orte_coprocessors_detected) {
+            if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &node->hostid, 1, ORTE_VPID))) {
+                ORTE_ERROR_LOG(rc);
+                return rc;
+            }
+        }
     }
     /* transfer the payload to the byte object */
@@ -380,6 +400,7 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo)
     int rc=ORTE_SUCCESS;
     uint8_t oversub;
     char *nodename;
+    orte_vpid_t hostid;
     OPAL_OUTPUT_VERBOSE((1, orte_nidmap_output,
                          "%s decode:nidmap decoding nodemap",
@@ -401,6 +422,18 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo)
         return rc;
     }
+    /* see if coprocessors were detected */
+    n=1;
+    if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &oversub, &n, OPAL_UINT8))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+    if (0 == oversub) {
+        orte_coprocessors_detected = false;
+    } else {
+        orte_coprocessors_detected = true;
+    }
     /* set the daemon jobid */
     daemon.jobid = ORTE_DAEMON_JOBID(ORTE_PROC_MY_NAME->jobid);
@@ -484,6 +517,32 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo)
             ORTE_ERROR_LOG(rc);
             return rc;
         }
+        /* if coprocessors were detected, unpack the hostid for the node - this
+         * value is associated with this daemon, not with any application process
+         */
+        if (orte_coprocessors_detected) {
+            n=1;
+            if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &hostid, &n, ORTE_VPID))) {
+                ORTE_ERROR_LOG(rc);
+                return rc;
+            }
+            if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&daemon, OPAL_SCOPE_NON_PEER,
+                                                    ORTE_DB_HOSTID, &hostid, OPAL_UINT32))) {
+                ORTE_ERROR_LOG(rc);
+                return rc;
+            }
+            /* if this is my daemon, then store it as my hostid as well */
+            if (daemon.vpid == ORTE_PROC_MY_DAEMON->vpid) {
+                if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)ORTE_PROC_MY_NAME, OPAL_SCOPE_NON_PEER,
+                                                        ORTE_DB_HOSTID, &hostid, OPAL_UINT32))) {
+                    ORTE_ERROR_LOG(rc);
+                    return rc;
+                }
+                /* and record it */
+                orte_process_info.my_hostid = hostid;
+            }
+        }
     }
     if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
         ORTE_ERROR_LOG(rc);
@@ -506,7 +565,7 @@ int orte_util_decode_daemon_nodemap(opal_byte_object_t *bo)
     orte_node_t *node;
     opal_buffer_t buf;
     int rc=ORTE_SUCCESS;
-    uint8_t *oversub;
+    uint8_t oversub;
     char *name;
     orte_job_t *daemons;
     orte_proc_t *dptr;
@@ -532,6 +591,18 @@ int orte_util_decode_daemon_nodemap(opal_byte_object_t *bo)
         return rc;
     }
+    /* see if coprocessors were detected */
+    n=1;
+    if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &oversub, &n, OPAL_UINT8))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+    if (0 == oversub) {
+        orte_coprocessors_detected = false;
+    } else {
+        orte_coprocessors_detected = true;
+    }
     /* transfer the data to the nodes */
     daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
     daemons->num_procs = num_daemons;
@@ -597,6 +668,16 @@ int orte_util_decode_daemon_nodemap(opal_byte_object_t *bo)
         } else {
             node->oversubscribed = true;
         }
+        /* if coprocessors were detected, unpack the hostid */
+        if (orte_coprocessors_detected) {
+            n=1;
+            if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &node->hostid, &n, ORTE_VPID))) {
+                ORTE_ERROR_LOG(rc);
+                return rc;
+            }
+        }
     }
     if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
         ORTE_ERROR_LOG(rc);
@@ -789,7 +870,7 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr, bool update)
 /* only APPS call this function - daemons have their own */
 int orte_util_decode_pidmap(opal_byte_object_t *bo)
 {
-    orte_vpid_t num_procs;
+    orte_vpid_t num_procs, hostid, *vptr;
     orte_local_rank_t local_rank;
     orte_node_rank_t node_rank;
 #if OPAL_HAVE_HWLOC
@@ -950,6 +1031,26 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
             ORTE_ERROR_LOG(rc);
             goto cleanup;
         }
+        /* if coprocessors were detected, lookup and store the hostid for this proc */
+        if (orte_coprocessors_detected) {
+            /* lookup the hostid for this daemon */
+            vptr = &hostid;
+            if (ORTE_SUCCESS != (rc = opal_db.fetch((opal_identifier_t*)&dmn, ORTE_DB_HOSTID,
+                                                    (void**)&vptr, OPAL_UINT32))) {
+                ORTE_ERROR_LOG(rc);
+                goto cleanup;
+            }
+            OPAL_OUTPUT_VERBOSE((2, orte_nidmap_output,
+                                 "%s FOUND HOSTID %s FOR DAEMON %s",
+                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                 ORTE_VPID_PRINT(hostid), ORTE_VPID_PRINT(dmn.vpid)));
+            /* store it as hostid for this proc */
+            if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&proc, OPAL_SCOPE_NON_PEER,
+                                                    ORTE_DB_HOSTID, &hostid, OPAL_UINT32))) {
+                ORTE_ERROR_LOG(rc);
+                goto cleanup;
+            }
+        }
         /* lookup and store the hostname for this proc */
         if (ORTE_SUCCESS != (rc = opal_db.fetch_pointer((opal_identifier_t*)&dmn, ORTE_DB_HOSTNAME,
                                                         (void**)&hostname, OPAL_STRING))) {

View file

@@ -81,7 +81,8 @@ ORTE_DECLSPEC orte_proc_info_t orte_process_info = {
     /* .app_rank = */              -1,
     /* .peer_modex = */            -1,
     /* .peer_init_barrier = */     -1,
-    /* .peer_fini_barrier = */     -1
+    /* .peer_fini_barrier = */     -1,
+    /* .my_hostid = */             ORTE_VPID_INVALID
 };
 static bool init=false;

View file

@@ -11,6 +11,7 @@
  *                         All rights reserved.
  * Copyright (c) 2011-2012 Los Alamos National Security, LLC.
  *                         All rights reserved.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -128,6 +129,7 @@ struct orte_proc_info_t {
     orte_grpcomm_coll_id_t peer_modex;        /**< modex collective id */
     orte_grpcomm_coll_id_t peer_init_barrier; /**< barrier id during init */
     orte_grpcomm_coll_id_t peer_fini_barrier; /**< barrier id during finalize */
+    orte_vpid_t my_hostid;                    /**< identifies the local host for a coprocessor */
 };
 typedef struct orte_proc_info_t orte_proc_info_t;