From 2e2794fa15eb2a21bb7b98b03fbb0c02a6b4ae59 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Wed, 23 Oct 2013 15:56:23 +0000 Subject: [PATCH] Fix coprocessor detection by always adding the local daemon's co-processors to the hash table. Tested and working on a system with 2 Xeon Phi co-processors. cmr=v1.7.4:ticket=3847:reviewer=ompi-rm1.7 This commit was SVN r29489. The following Trac tickets were found above: Ticket 3847 --> https://svn.open-mpi.org/trac/ompi/ticket/3847 --- orte/mca/plm/base/plm_base_launch_support.c | 40 +++++++++++++++++++-- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index 887a181187..0c3442a62b 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana * University Research and Technology @@ -12,7 +13,8 @@ * Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2009 Institut National de Recherche en Informatique * et Automatique. All rights reserved. - * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights + * reserved. * Copyright (c) 2013 Intel, Inc. All rights reserved. * $COPYRIGHT$ * @@ -677,6 +679,38 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender, jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); } +#if OPAL_HAVE_HWLOC + { + char *coprocessors, **sns; + + /* detect and add any of my coprocessors to the hash table */ + coprocessors = opal_hwloc_base_find_coprocessors(opal_hwloc_topology); + + if (NULL != coprocessors) { + /* init the hash table, if necessary */ + if (NULL == orte_coprocessors) { + orte_coprocessors = OBJ_NEW(opal_hash_table_t); + opal_hash_table_init(orte_coprocessors, orte_process_info.num_procs); + } + /* separate the serial numbers of the coprocessors + * on this host + */ + sns = opal_argv_split(coprocessors, ','); + for (int idx = 0 ; NULL != sns[idx] ; ++idx) { + uint32_t h; + + /* compute the hash */ + OPAL_HASH_STR(sns[idx], h); + /* mark that this coprocessor is hosted by this daemon */ + opal_hash_table_set_value_uint32(orte_coprocessors, h, (void*)&ORTE_PROC_MY_NAME->vpid); + } + opal_argv_free(sns); + free(coprocessors); + orte_coprocessors_detected = true; + } + } +#endif + /* multiple daemons could be in this buffer, so unpack until we exhaust the data */ idx = 1; while (OPAL_SUCCESS == (rc = opal_dss.unpack(buffer, &dname, &idx, ORTE_NAME))) { @@ -1271,7 +1305,7 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv, /* check for duplicate */ ignore = false; for (j=0; j < *argc; j++) { - if (0 == strcmp((*argv)[j], orted_cmd_line[i+1])) { + if (0 == strcmp((*argv)[j], orted_cmd_line[i+1])) { ignore = true; break; } @@ -1589,7 +1623,7 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata) OBJ_DESTRUCT(&nodes); /* mark that the daemons have reported so we can proceed */ daemons->state = ORTE_JOB_STATE_DAEMONS_REPORTED; - daemons->updated = false; + daemons->updated = false; return ORTE_SUCCESS; }