1
1
openmpi/orte/mca/grpcomm/base/grpcomm_base_modex.c

952 строки
33 KiB
C
Исходник Обычный вид История

/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2009 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <string.h>
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif /* HAVE_SYS_TIME_H */
#include "opal/threads/condition.h"
#include "opal/util/output.h"
#include "opal/class/opal_hash_table.h"
#include "opal/dss/dss.h"
#include "opal/mca/hwloc/base/base.h"
#include "orte/util/proc_info.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h"
#include "orte/util/nidmap.h"
#include "orte/orted/orted.h"
#include "orte/runtime/orte_wait.h"
#include "orte/mca/grpcomm/base/base.h"
#include "orte/mca/grpcomm/grpcomm.h"
/*************** MODEX SECTION **************/
int orte_grpcomm_base_full_modex(opal_list_t *procs)
{
opal_buffer_t buf, rbuf;
int32_t i, n, num_procs;
orte_std_cntr_t cnt;
orte_process_name_t proc_name;
int rc=ORTE_SUCCESS;
orte_nid_t *nid;
orte_local_rank_t local_rank;
orte_node_rank_t node_rank;
orte_jmap_t *jmap;
orte_pmap_t *pmap;
orte_vpid_t daemon;
char *hostname;
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
"%s grpcomm:base:full:modex: performing modex",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* setup the buffer that will actually be sent */
OBJ_CONSTRUCT(&buf, opal_buffer_t);
OBJ_CONSTRUCT(&rbuf, opal_buffer_t);
/* put our process name in the buffer so it can be unpacked later */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* pack our hostname */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &orte_process_info.nodename, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* pack our daemon's vpid */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &ORTE_PROC_MY_DAEMON->vpid, 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* pack our node rank */
node_rank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &node_rank, 1, ORTE_NODE_RANK))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* pack our local rank */
local_rank = orte_ess.get_local_rank(ORTE_PROC_MY_NAME);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &local_rank, 1, ORTE_LOCAL_RANK))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
#if OPAL_HAVE_HWLOC
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 03:40:11 +00:00
/* pack our binding info so other procs can determine our locality */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &orte_process_info.bind_level, 1, OPAL_HWLOC_LEVEL_T))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 03:40:11 +00:00
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &orte_process_info.bind_idx, 1, OPAL_UINT))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
#endif
/* pack the entries we have received */
if (ORTE_SUCCESS != (rc = orte_grpcomm_base_pack_modex_entries(&buf))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
"%s grpcomm:base:full:modex: executing allgather",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 03:40:11 +00:00
if (NULL == procs) {
/* exchange the buffer with my peers */
if (ORTE_SUCCESS != (rc = orte_grpcomm.allgather(&buf, &rbuf))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
} else {
/* exchange the buffer with the list of peers */
if (ORTE_SUCCESS != (rc = orte_grpcomm.allgather_list(procs, &buf, &rbuf))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
}
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 03:40:11 +00:00
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
"%s grpcomm:base:full:modex: processing modex info",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* extract the number of procs that put data in the buffer */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &num_procs, &cnt, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:full:modex: received %ld data bytes from %d procs",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(long)(rbuf.pack_ptr - rbuf.unpack_ptr), num_procs));
/* if the buffer doesn't have any more data, ignore it */
if (0 >= (rbuf.pack_ptr - rbuf.unpack_ptr)) {
goto cleanup;
}
/* otherwise, process it */
for (i=0; i < num_procs; i++) {
/* unpack the process name */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &proc_name, &cnt, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* unpack the hostname */
cnt = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &hostname, &cnt, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* unpack the daemon vpid */
cnt = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &daemon, &cnt, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* unpack the node rank */
cnt = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &node_rank, &cnt, ORTE_NODE_RANK))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* unpack the local rank */
cnt = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &local_rank, &cnt, ORTE_LOCAL_RANK))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* UPDATE THE NIDMAP/PIDMAP TO SUPPORT DYNAMIC OPERATIONS */
/* find this proc's node in the nidmap */
nid = NULL;
for (n=0; NULL != (nid = (orte_nid_t *) opal_pointer_array_get_item(&orte_nidmap, n)); n++) {
if (0 == strcmp(hostname, nid->name)) {
break;
}
}
if (NULL == nid) {
/* node wasn't found - let's add it */
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:full:modex no nidmap entry for node %s",
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 03:40:11 +00:00
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == hostname) ? "NULL" : hostname));
nid = OBJ_NEW(orte_nid_t);
nid->name = strdup(hostname);
nid->daemon = daemon;
nid->index = opal_pointer_array_add(&orte_nidmap, nid);
}
/* see if we have this job in a jobmap */
if (NULL == (jmap = orte_util_lookup_jmap(proc_name.jobid))) {
/* proc wasn't found - let's add it */
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:full:modex no jobmap entry for job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(proc_name.jobid)));
jmap = OBJ_NEW(orte_jmap_t);
jmap->job = proc_name.jobid;
/* unfortunately, job objects cannot be stored
* by index number as the jobid is a constructed
* value. So we have to just add it to the end
* of the array
*/
opal_pointer_array_add(&orte_jobmap, jmap);
jmap->num_procs = 1;
/* have to add the pidmap entry too, but this
* can be done at the specific site corresponding
* to the proc's vpid
*/
pmap = OBJ_NEW(orte_pmap_t);
pmap->node = nid->index;
pmap->local_rank = local_rank;
pmap->node_rank = node_rank;
opal_pointer_array_set_item(&jmap->pmap, proc_name.vpid, pmap);
} else {
/* see if we have this proc in a pidmap */
if (NULL == (pmap = orte_util_lookup_pmap(&proc_name))) {
/* proc wasn't found - let's add it */
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:full:modex no pidmap entry for proc %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc_name)));
pmap = OBJ_NEW(orte_pmap_t);
pmap->node = nid->index;
pmap->local_rank = local_rank;
pmap->node_rank = node_rank;
/* this can be done at the specific site corresponding
* to the proc's vpid
*/
opal_pointer_array_set_item(&jmap->pmap, proc_name.vpid, pmap);
/* account for the proc entry in the jmap */
jmap->num_procs++;
}
}
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 03:40:11 +00:00
#if OPAL_HAVE_HWLOC
{
opal_hwloc_level_t bind_level;
unsigned int bind_idx;
/* unpack the locality info */
cnt = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &bind_level, &cnt, OPAL_HWLOC_LEVEL_T))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
cnt = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &bind_idx, &cnt, OPAL_UINT))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
"%s grpcomm:base:modex setting proc %s level %s idx %u",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc_name),
opal_hwloc_base_print_level(bind_level), bind_idx));
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 03:40:11 +00:00
/* store on the pmap */
if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &proc_name, ORTE_PROC_MY_NAME)) {
/* if this data is from myself, then set locality to all */
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:modex setting proc %s locale ALL",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc_name)));
pmap->locality = OPAL_PROC_ALL_LOCAL;
} else if (daemon != ORTE_PROC_MY_DAEMON->vpid) {
/* this is on a different node, then mark as non-local */
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:modex setting proc %s locale NONLOCAL",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc_name)));
pmap->locality = OPAL_PROC_NON_LOCAL;
} else if (OPAL_HWLOC_NODE_LEVEL == orte_process_info.bind_level ||
OPAL_HWLOC_NODE_LEVEL == bind_level) {
/* one or both of us is not bound, so all we can say is we are on the
* same node
*/
pmap->locality = OPAL_PROC_ON_NODE;
} else {
/* determine relative location on our node */
pmap->locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
orte_process_info.bind_level,
orte_process_info.bind_idx,
bind_level, bind_idx);
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:modex setting proc %s locale %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc_name),
opal_hwloc_base_print_locality(pmap->locality)));
}
}
#else
if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &proc_name, ORTE_PROC_MY_NAME)) {
/* if this data is from myself, then set locality to all */
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:modex setting proc %s locale ALL",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc_name)));
pmap->locality = OPAL_PROC_ALL_LOCAL;
} else if (daemon != ORTE_PROC_MY_DAEMON->vpid) {
/* this is on a different node, then mark as non-local */
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:modex setting proc %s locale NONLOCAL",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc_name)));
pmap->locality = OPAL_PROC_NON_LOCAL;
} else {
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 03:40:11 +00:00
/* must be on our node */
pmap->locality = OPAL_PROC_ON_NODE;
}
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 03:40:11 +00:00
#endif
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:full:modex: adding modex entry for proc %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc_name)));
/* update the modex database */
if (ORTE_SUCCESS != (rc = orte_grpcomm_base_update_modex_entries(&proc_name, &rbuf))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
}
cleanup:
OBJ_DESTRUCT(&buf);
OBJ_DESTRUCT(&rbuf);
return rc;
}
int orte_grpcomm_base_modex_unpack( opal_buffer_t* rbuf)
{
int32_t i, num_procs;
orte_std_cntr_t cnt;
orte_process_name_t proc_name;
int rc=ORTE_SUCCESS;
orte_vpid_t daemon;
orte_pmap_t *pmap;
/* process the results */
/* extract the number of procs that put data in the buffer */
cnt = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &num_procs, &cnt, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
"%s grpcomm:base:modex:unpack: received %ld data bytes from %d procs",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(long)(rbuf->pack_ptr - rbuf->unpack_ptr), num_procs));
/* if the buffer doesn't have any more data, ignore it */
if (0 >= (rbuf->pack_ptr - rbuf->unpack_ptr)) {
goto cleanup;
}
/* otherwise, process it */
for (i = 0; i < num_procs; i++) {
/* unpack the process name */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &proc_name, &cnt, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 03:40:11 +00:00
/* SINCE THIS IS AMONGST PEERS, THERE IS NO NEED TO UPDATE THE NIDMAP/PIDMAP */
if (ORTE_VPID_INVALID == (daemon = orte_ess.proc_get_daemon(&proc_name))) {
/* clear problem */
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
rc = ORTE_ERR_NOT_FOUND;
goto cleanup;
}
if (NULL == (pmap = orte_util_lookup_pmap(&proc_name))) {
/* clear problem */
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
rc = ORTE_ERR_NOT_FOUND;
goto cleanup;
}
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:modex:unpack: adding modex entry for proc %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc_name)));
/* update the modex database */
if (ORTE_SUCCESS != (rc = orte_grpcomm_base_update_modex_entries(&proc_name, rbuf))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
}
cleanup:
return rc;
}
/**
* MODEX DATABASE DESIGN
*
* Modex data is always associated with a given orte process name, in
* an opal hash table. The hash table is necessary because modex data is
* received for entire jobids and when working with
* dynamic processes, it is possible we will receive data for a
* process not yet in the ompi_proc_all() list of processes. This
* information must be kept for later use, because if accept/connect
* causes the proc to be added to the ompi_proc_all() list, it could
* cause a connection storm. Therefore, we use an
* opal_hast_table_t backing store to contain all modex information.
*
* While we could add the now discovered proc into the ompi_proc_all()
* list, this has some problems, in that we don't have the
* architecture and hostname information needed to properly fill in
* the ompi_proc_t structure and we don't want to cause RML
* communication to get it when we don't really need to know anything
* about the remote proc.
*
* All data put into the modex (or received from the modex) is
* associated with a given proc,attr_name pair. The data structures
* to maintain this data look something like:
*
* opal_hash_table_t modex_data -> list of attr_proc_t objects
*
* +-----------------------------+
* | modex_proc_data_t |
* | - opal_list_item_t |
* +-----------------------------+
* | opal_mutex_t modex_lock |
* | bool modex_received_data | 1
* | opal_list_t modules | ---------+
* +-----------------------------+ |
* * |
* +--------------------------------+ <--------+
* | modex_module_data_t |
* | - opal_list_item_t |
* +--------------------------------+
* | mca_base_component_t component |
* | void *module_data |
* | size_t module_data_size |
* +--------------------------------+
*
*/
/* Local "globals" */
static orte_std_cntr_t num_entries;
static opal_buffer_t *modex_buffer;
static opal_hash_table_t *modex_data;
static opal_mutex_t mutex;
static opal_condition_t cond;
/**
* Modex data for a particular orte process
*
* Locking infrastructure and list of module data for a given orte
* process name. The name association is maintained in the
* modex_data hash table.
*/
struct modex_proc_data_t {
/** Structure can be put on lists (including in hash tables) */
opal_list_item_t super;
/* Lock held whenever the modex data for this proc is being
modified */
opal_mutex_t modex_lock;
/* True if modex data has ever been received from this process,
false otherwise. */
bool modex_received_data;
/* List of modex_module_data_t structures containing all data
received from this process, sorted by component name. */
opal_list_t modex_module_data;
};
typedef struct modex_proc_data_t modex_proc_data_t;
static void
modex_construct(modex_proc_data_t * modex)
{
OBJ_CONSTRUCT(&modex->modex_lock, opal_mutex_t);
modex->modex_received_data = false;
OBJ_CONSTRUCT(&modex->modex_module_data, opal_list_t);
}
static void
modex_destruct(modex_proc_data_t * modex)
{
OBJ_DESTRUCT(&modex->modex_module_data);
OBJ_DESTRUCT(&modex->modex_lock);
}
OBJ_CLASS_INSTANCE(modex_proc_data_t, opal_object_t,
modex_construct, modex_destruct);
/**
* Data for a particular attribute
*
* Container for data for a particular module,attribute pair. This
* structure should be contained in the modex_module_data list in an
* modex_proc_data_t structure to maintain an association with a
* given proc. The list is then searched for a matching attribute
* name.
*
* While searching the list or reading from (or writing to) this
* structure, the lock in the proc_data_t should be held.
*/
struct modex_attr_data_t {
/** Structure can be put on lists */
opal_list_item_t super;
/** Attribute name */
char * attr_name;
/** Binary blob of data associated with this proc,component pair */
void *attr_data;
/** Size (in bytes) of module_data */
size_t attr_data_size;
};
typedef struct modex_attr_data_t modex_attr_data_t;
static void
modex_attr_construct(modex_attr_data_t * module)
{
module->attr_name = NULL;
module->attr_data = NULL;
module->attr_data_size = 0;
}
static void
modex_attr_destruct(modex_attr_data_t * module)
{
if (NULL != module->attr_name) {
free(module->attr_name);
}
if (NULL != module->attr_data) {
free(module->attr_data);
}
}
OBJ_CLASS_INSTANCE(modex_attr_data_t,
opal_list_item_t,
modex_attr_construct,
modex_attr_destruct);
/**
* Find data for a given attribute in a given modex_proc_data_t
* container.
*
* The proc_data's modex_lock must be held during this
* search.
*/
static modex_attr_data_t *
modex_lookup_attr_data(modex_proc_data_t *proc_data,
const char *attr_name,
bool create_if_not_found)
{
modex_attr_data_t *attr_data = NULL;
for (attr_data = (modex_attr_data_t *) opal_list_get_first(&proc_data->modex_module_data);
attr_data != (modex_attr_data_t *) opal_list_get_end(&proc_data->modex_module_data);
attr_data = (modex_attr_data_t *) opal_list_get_next(attr_data)) {
if (0 == strcmp(attr_name, attr_data->attr_name)) {
return attr_data;
}
}
if (create_if_not_found) {
attr_data = OBJ_NEW(modex_attr_data_t);
if (NULL == attr_data) return NULL;
attr_data->attr_name = strdup(attr_name);
opal_list_append(&proc_data->modex_module_data, &attr_data->super);
return attr_data;
}
return NULL;
}
/**
* Find modex_proc_data_t container associated with given
* orte_process_name_t.
*
* The global lock should *NOT* be held when
* calling this function.
*/
static modex_proc_data_t*
modex_lookup_orte_proc(const orte_process_name_t *orte_proc)
{
modex_proc_data_t *proc_data = NULL;
OPAL_THREAD_LOCK(&mutex);
opal_hash_table_get_value_uint64(modex_data,
orte_util_hash_name(orte_proc), (void**)&proc_data);
if (NULL == proc_data) {
/* The proc clearly exists, so create a modex structure
for it */
proc_data = OBJ_NEW(modex_proc_data_t);
if (NULL == proc_data) {
opal_output(0, "grpcomm_basic_modex_lookup_orte_proc: unable to allocate modex_proc_data_t\n");
OPAL_THREAD_UNLOCK(&mutex);
return NULL;
}
opal_hash_table_set_value_uint64(modex_data,
orte_util_hash_name(orte_proc), proc_data);
}
OPAL_THREAD_UNLOCK(&mutex);
return proc_data;
}
int orte_grpcomm_base_modex_init(void)
{
OBJ_CONSTRUCT(&mutex, opal_mutex_t);
OBJ_CONSTRUCT(&cond, opal_condition_t);
modex_data = OBJ_NEW(opal_hash_table_t);
opal_hash_table_init(modex_data, 256);
num_entries = 0;
modex_buffer = OBJ_NEW(opal_buffer_t);
return ORTE_SUCCESS;
}
void orte_grpcomm_base_modex_finalize(void)
{
OBJ_DESTRUCT(&mutex);
OBJ_DESTRUCT(&cond);
opal_hash_table_remove_all(modex_data);
OBJ_RELEASE(modex_data);
OBJ_RELEASE(modex_buffer);
}
int orte_grpcomm_base_set_proc_attr(const char *attr_name,
const void *data,
size_t size)
{
int rc;
OPAL_THREAD_LOCK(&mutex);
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:set_proc_attr: setting attribute %s data size %lu",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
attr_name, (unsigned long)size));
/* Pack the attribute name information into the local buffer */
if (ORTE_SUCCESS != (rc = opal_dss.pack(modex_buffer, &attr_name, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* pack the size */
if (ORTE_SUCCESS != (rc = opal_dss.pack(modex_buffer, &size, 1, OPAL_SIZE))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* Pack the actual data into the buffer */
if (0 != size) {
if (ORTE_SUCCESS != (rc = opal_dss.pack(modex_buffer, (void *) data, size, OPAL_BYTE))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
}
/* track the number of entries */
++num_entries;
cleanup:
OPAL_THREAD_UNLOCK(&mutex);
return rc;
}
int orte_grpcomm_base_get_proc_attr(const orte_process_name_t proc,
const char * attribute_name, void **val,
size_t *size)
{
modex_proc_data_t *proc_data;
modex_attr_data_t *attr_data;
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
Enable modex-less launch. Consists of: 1. minor modification to include two new opal MCA params: (a) opal_profile: outputs what components were selected by each framework currently enabled for most, but not all, frameworks (b) opal_profile_file: name of file that contains profile info required for modex 2. introduction of two new tools: (a) ompi-probe: MPI process that simply calls MPI_Init/Finalize with opal_profile set. Also reports back the rml IP address for all interfaces on the node (b) ompi-profiler: uses ompi-probe to create the profile_file, also reports out a summary of what framework components are actually being used to help with configuration options 3. modification of the grpcomm basic component to utilize the profile file in place of the modex where possible 4. modification of orterun so it properly sees opal mca params and handles opal_profile correctly to ensure we don't get its profile 5. similar mod to orted as for orterun 6. addition of new test that calls orte_init followed by calls to grpcomm.barrier This is all completely benign unless actively selected. At the moment, it only supports modex-less launch for openib-based systems. Minor mod to the TCP btl would be required to enable it as well, if people are interested. Similarly, anyone interested in enabling other BTL's for modex-less operation should let me know and I'll give you the magic details. This seems to significantly improve scalability provided the file can be locally located on the nodes. I'm looking at an alternative means of disseminating the info (perhaps in launch message) as an option for removing that constraint. This commit was SVN r20098.
2008-12-09 23:49:02 +00:00
"%s grpcomm:get_proc_attr: searching for attr %s on proc %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), attribute_name,
ORTE_NAME_PRINT(&proc)));
Enable modex-less launch. Consists of: 1. minor modification to include two new opal MCA params: (a) opal_profile: outputs what components were selected by each framework currently enabled for most, but not all, frameworks (b) opal_profile_file: name of file that contains profile info required for modex 2. introduction of two new tools: (a) ompi-probe: MPI process that simply calls MPI_Init/Finalize with opal_profile set. Also reports back the rml IP address for all interfaces on the node (b) ompi-profiler: uses ompi-probe to create the profile_file, also reports out a summary of what framework components are actually being used to help with configuration options 3. modification of the grpcomm basic component to utilize the profile file in place of the modex where possible 4. modification of orterun so it properly sees opal mca params and handles opal_profile correctly to ensure we don't get its profile 5. similar mod to orted as for orterun 6. addition of new test that calls orte_init followed by calls to grpcomm.barrier This is all completely benign unless actively selected. At the moment, it only supports modex-less launch for openib-based systems. Minor mod to the TCP btl would be required to enable it as well, if people are interested. Similarly, anyone interested in enabling other BTL's for modex-less operation should let me know and I'll give you the magic details. This seems to significantly improve scalability provided the file can be locally located on the nodes. I'm looking at an alternative means of disseminating the info (perhaps in launch message) as an option for removing that constraint. This commit was SVN r20098.
2008-12-09 23:49:02 +00:00
proc_data = modex_lookup_orte_proc(&proc);
Repair the MPI-2 dynamic operations. This includes: 1. repair of the linear and direct routed modules 2. repair of the ompi/pubsub/orte module to correctly init routes to the ompi-server, and correctly handle failure to correctly parse the provided ompi-server URI 3. modification of orterun to accept both "file" and "FILE" for designating where the ompi-server URI is to be found - purely a convenience feature 4. resolution of a message ordering problem during the connect/accept handshake that allowed the "send-first" proc to attempt to send to the "recv-first" proc before the HNP had actually updated its routes. Let this be a further reminder to all - message ordering is NOT guaranteed in the OOB 5. Repair the ompi/dpm/orte module to correctly init routes during connect/accept. Reminder to all: messages sent to procs in another job family (i.e., started by a different mpirun) are ALWAYS routed through the respective HNPs. As per the comments in orte/routed, this is REQUIRED to maintain connect/accept (where only the root proc on each side is capable of init'ing the routes), allow communication between mpirun's using different routing modules, and to minimize connections on tools such as ompi-server. It is all taken care of "under the covers" by the OOB to ensure that a route back to the sender is maintained, even when the different mpirun's are using different routed modules. 6. corrections in the orte/odls to ensure proper identification of daemons participating in a dynamic launch 7. corrections in build/nidmap to support update of an existing nidmap during dynamic launch 8. corrected implementation of the update_arch function in the ESS, along with consolidation of a number of ESS operations into base functions for easier maintenance. The ability to support info from multiple jobs was added, although we don't currently do so - this will come later to support further fault recovery strategies 9. minor updates to several functions to remove unnecessary and/or no longer used variables and envar's, add some debugging output, etc. 10. addition of a new macro ORTE_PROC_IS_DAEMON that resolves to true if the provided proc is a daemon There is still more cleanup to be done for efficiency, but this at least works. Tested on single-node Mac, multi-node SLURM via odin. Tests included connect/accept, publish/lookup/unpublish, comm_spawn, comm_spawn_multiple, and singleton comm_spawn. Fixes ticket #1256 This commit was SVN r18804.
2008-07-03 17:53:37 +00:00
if (NULL == proc_data) {
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
Repair the MPI-2 dynamic operations. This includes: 1. repair of the linear and direct routed modules 2. repair of the ompi/pubsub/orte module to correctly init routes to the ompi-server, and correctly handle failure to correctly parse the provided ompi-server URI 3. modification of orterun to accept both "file" and "FILE" for designating where the ompi-server URI is to be found - purely a convenience feature 4. resolution of a message ordering problem during the connect/accept handshake that allowed the "send-first" proc to attempt to send to the "recv-first" proc before the HNP had actually updated its routes. Let this be a further reminder to all - message ordering is NOT guaranteed in the OOB 5. Repair the ompi/dpm/orte module to correctly init routes during connect/accept. Reminder to all: messages sent to procs in another job family (i.e., started by a different mpirun) are ALWAYS routed through the respective HNPs. As per the comments in orte/routed, this is REQUIRED to maintain connect/accept (where only the root proc on each side is capable of init'ing the routes), allow communication between mpirun's using different routing modules, and to minimize connections on tools such as ompi-server. It is all taken care of "under the covers" by the OOB to ensure that a route back to the sender is maintained, even when the different mpirun's are using different routed modules. 6. corrections in the orte/odls to ensure proper identification of daemons participating in a dynamic launch 7. corrections in build/nidmap to support update of an existing nidmap during dynamic launch 8. corrected implementation of the update_arch function in the ESS, along with consolidation of a number of ESS operations into base functions for easier maintenance. The ability to support info from multiple jobs was added, although we don't currently do so - this will come later to support further fault recovery strategies 9. minor updates to several functions to remove unnecessary and/or no longer used variables and envar's, add some debugging output, etc. 10. addition of a new macro ORTE_PROC_IS_DAEMON that resolves to true if the provided proc is a daemon There is still more cleanup to be done for efficiency, but this at least works. Tested on single-node Mac, multi-node SLURM via odin. Tests included connect/accept, publish/lookup/unpublish, comm_spawn, comm_spawn_multiple, and singleton comm_spawn. Fixes ticket #1256 This commit was SVN r18804.
2008-07-03 17:53:37 +00:00
"%s grpcomm:get_proc_attr: no modex entry for proc %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc)));
Repair the MPI-2 dynamic operations. This includes: 1. repair of the linear and direct routed modules 2. repair of the ompi/pubsub/orte module to correctly init routes to the ompi-server, and correctly handle failure to correctly parse the provided ompi-server URI 3. modification of orterun to accept both "file" and "FILE" for designating where the ompi-server URI is to be found - purely a convenience feature 4. resolution of a message ordering problem during the connect/accept handshake that allowed the "send-first" proc to attempt to send to the "recv-first" proc before the HNP had actually updated its routes. Let this be a further reminder to all - message ordering is NOT guaranteed in the OOB 5. Repair the ompi/dpm/orte module to correctly init routes during connect/accept. Reminder to all: messages sent to procs in another job family (i.e., started by a different mpirun) are ALWAYS routed through the respective HNPs. As per the comments in orte/routed, this is REQUIRED to maintain connect/accept (where only the root proc on each side is capable of init'ing the routes), allow communication between mpirun's using different routing modules, and to minimize connections on tools such as ompi-server. It is all taken care of "under the covers" by the OOB to ensure that a route back to the sender is maintained, even when the different mpirun's are using different routed modules. 6. corrections in the orte/odls to ensure proper identification of daemons participating in a dynamic launch 7. corrections in build/nidmap to support update of an existing nidmap during dynamic launch 8. corrected implementation of the update_arch function in the ESS, along with consolidation of a number of ESS operations into base functions for easier maintenance. The ability to support info from multiple jobs was added, although we don't currently do so - this will come later to support further fault recovery strategies 9. minor updates to several functions to remove unnecessary and/or no longer used variables and envar's, add some debugging output, etc. 10. addition of a new macro ORTE_PROC_IS_DAEMON that resolves to true if the provided proc is a daemon There is still more cleanup to be done for efficiency, but this at least works. Tested on single-node Mac, multi-node SLURM via odin. Tests included connect/accept, publish/lookup/unpublish, comm_spawn, comm_spawn_multiple, and singleton comm_spawn. Fixes ticket #1256 This commit was SVN r18804.
2008-07-03 17:53:37 +00:00
return ORTE_ERR_NOT_FOUND;
}
OPAL_THREAD_LOCK(&proc_data->modex_lock);
/* look up attribute */
attr_data = modex_lookup_attr_data(proc_data, attribute_name, false);
/* copy the data out to the user */
if ((NULL == attr_data) ||
(attr_data->attr_data_size == 0)) {
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
Repair the MPI-2 dynamic operations. This includes: 1. repair of the linear and direct routed modules 2. repair of the ompi/pubsub/orte module to correctly init routes to the ompi-server, and correctly handle failure to correctly parse the provided ompi-server URI 3. modification of orterun to accept both "file" and "FILE" for designating where the ompi-server URI is to be found - purely a convenience feature 4. resolution of a message ordering problem during the connect/accept handshake that allowed the "send-first" proc to attempt to send to the "recv-first" proc before the HNP had actually updated its routes. Let this be a further reminder to all - message ordering is NOT guaranteed in the OOB 5. Repair the ompi/dpm/orte module to correctly init routes during connect/accept. Reminder to all: messages sent to procs in another job family (i.e., started by a different mpirun) are ALWAYS routed through the respective HNPs. As per the comments in orte/routed, this is REQUIRED to maintain connect/accept (where only the root proc on each side is capable of init'ing the routes), allow communication between mpirun's using different routing modules, and to minimize connections on tools such as ompi-server. It is all taken care of "under the covers" by the OOB to ensure that a route back to the sender is maintained, even when the different mpirun's are using different routed modules. 6. corrections in the orte/odls to ensure proper identification of daemons participating in a dynamic launch 7. corrections in build/nidmap to support update of an existing nidmap during dynamic launch 8. corrected implementation of the update_arch function in the ESS, along with consolidation of a number of ESS operations into base functions for easier maintenance. The ability to support info from multiple jobs was added, although we don't currently do so - this will come later to support further fault recovery strategies 9. minor updates to several functions to remove unnecessary and/or no longer used variables and envar's, add some debugging output, etc. 10. addition of a new macro ORTE_PROC_IS_DAEMON that resolves to true if the provided proc is a daemon There is still more cleanup to be done for efficiency, but this at least works. Tested on single-node Mac, multi-node SLURM via odin. Tests included connect/accept, publish/lookup/unpublish, comm_spawn, comm_spawn_multiple, and singleton comm_spawn. Fixes ticket #1256 This commit was SVN r18804.
2008-07-03 17:53:37 +00:00
"%s grpcomm:get_proc_attr: no attr avail or zero byte size for proc %s attribute %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc), attribute_name));
*val = NULL;
*size = 0;
} else {
void *copy = malloc(attr_data->attr_data_size);
if (copy == NULL) {
OPAL_THREAD_UNLOCK(&proc_data->modex_lock);
return ORTE_ERR_OUT_OF_RESOURCE;
}
memcpy(copy, attr_data->attr_data, attr_data->attr_data_size);
*val = copy;
*size = attr_data->attr_data_size;
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
Enable modex-less launch. Consists of: 1. minor modification to include two new opal MCA params: (a) opal_profile: outputs what components were selected by each framework currently enabled for most, but not all, frameworks (b) opal_profile_file: name of file that contains profile info required for modex 2. introduction of two new tools: (a) ompi-probe: MPI process that simply calls MPI_Init/Finalize with opal_profile set. Also reports back the rml IP address for all interfaces on the node (b) ompi-profiler: uses ompi-probe to create the profile_file, also reports out a summary of what framework components are actually being used to help with configuration options 3. modification of the grpcomm basic component to utilize the profile file in place of the modex where possible 4. modification of orterun so it properly sees opal mca params and handles opal_profile correctly to ensure we don't get its profile 5. similar mod to orted as for orterun 6. addition of new test that calls orte_init followed by calls to grpcomm.barrier This is all completely benign unless actively selected. At the moment, it only supports modex-less launch for openib-based systems. Minor mod to the TCP btl would be required to enable it as well, if people are interested. Similarly, anyone interested in enabling other BTL's for modex-less operation should let me know and I'll give you the magic details. This seems to significantly improve scalability provided the file can be locally located on the nodes. I'm looking at an alternative means of disseminating the info (perhaps in launch message) as an option for removing that constraint. This commit was SVN r20098.
2008-12-09 23:49:02 +00:00
"%s grpcomm:get_proc_attr: found %d bytes for attr %s on proc %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)attr_data->attr_data_size,
attribute_name, ORTE_NAME_PRINT(&proc)));
Enable modex-less launch. Consists of: 1. minor modification to include two new opal MCA params: (a) opal_profile: outputs what components were selected by each framework currently enabled for most, but not all, frameworks (b) opal_profile_file: name of file that contains profile info required for modex 2. introduction of two new tools: (a) ompi-probe: MPI process that simply calls MPI_Init/Finalize with opal_profile set. Also reports back the rml IP address for all interfaces on the node (b) ompi-profiler: uses ompi-probe to create the profile_file, also reports out a summary of what framework components are actually being used to help with configuration options 3. modification of the grpcomm basic component to utilize the profile file in place of the modex where possible 4. modification of orterun so it properly sees opal mca params and handles opal_profile correctly to ensure we don't get its profile 5. similar mod to orted as for orterun 6. addition of new test that calls orte_init followed by calls to grpcomm.barrier This is all completely benign unless actively selected. At the moment, it only supports modex-less launch for openib-based systems. Minor mod to the TCP btl would be required to enable it as well, if people are interested. Similarly, anyone interested in enabling other BTL's for modex-less operation should let me know and I'll give you the magic details. This seems to significantly improve scalability provided the file can be locally located on the nodes. I'm looking at an alternative means of disseminating the info (perhaps in launch message) as an option for removing that constraint. This commit was SVN r20098.
2008-12-09 23:49:02 +00:00
}
OPAL_THREAD_UNLOCK(&proc_data->modex_lock);
return ORTE_SUCCESS;
}
int orte_grpcomm_base_purge_proc_attrs(void)
{
/*
* Purge the attributes
*/
opal_hash_table_remove_all(modex_data);
OBJ_RELEASE(modex_data);
modex_data = OBJ_NEW(opal_hash_table_t);
opal_hash_table_init(modex_data, 256);
/*
* Clear the modex buffer
*/
OBJ_RELEASE(modex_buffer);
num_entries = 0;
modex_buffer = OBJ_NEW(opal_buffer_t);
return ORTE_SUCCESS;
}
int orte_grpcomm_base_pack_modex_entries(opal_buffer_t *buf)
{
int rc;
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:pack_modex: reporting %ld entries",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(long)num_entries));
/* put the number of entries into the buffer */
OPAL_THREAD_LOCK(&mutex);
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &num_entries, 1, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* if there are entries, non-destructively copy the data across */
if (0 < num_entries) {
if (ORTE_SUCCESS != (opal_dss.copy_payload(buf, modex_buffer))) {
ORTE_ERROR_LOG(rc);
}
}
cleanup:
OPAL_THREAD_UNLOCK(&mutex);
return rc;
}
int orte_grpcomm_base_update_modex_entries(orte_process_name_t *proc_name,
opal_buffer_t *rbuf)
{
modex_proc_data_t *proc_data;
modex_attr_data_t *attr_data;
int rc = ORTE_SUCCESS;
orte_std_cntr_t num_recvd_entries;
orte_std_cntr_t cnt;
orte_std_cntr_t j;
/* look up the modex data structure */
proc_data = modex_lookup_orte_proc(proc_name);
if (proc_data == NULL) {
/* report the error */
opal_output(0, "grpcomm:base:update_modex: received modex info for unknown proc %s\n",
ORTE_NAME_PRINT(proc_name));
return ORTE_ERR_NOT_FOUND;
}
OPAL_THREAD_LOCK(&proc_data->modex_lock);
/* unpack the number of entries for this proc */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &num_recvd_entries, &cnt, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
Enable modex-less launch. Consists of: 1. minor modification to include two new opal MCA params: (a) opal_profile: outputs what components were selected by each framework currently enabled for most, but not all, frameworks (b) opal_profile_file: name of file that contains profile info required for modex 2. introduction of two new tools: (a) ompi-probe: MPI process that simply calls MPI_Init/Finalize with opal_profile set. Also reports back the rml IP address for all interfaces on the node (b) ompi-profiler: uses ompi-probe to create the profile_file, also reports out a summary of what framework components are actually being used to help with configuration options 3. modification of the grpcomm basic component to utilize the profile file in place of the modex where possible 4. modification of orterun so it properly sees opal mca params and handles opal_profile correctly to ensure we don't get its profile 5. similar mod to orted as for orterun 6. addition of new test that calls orte_init followed by calls to grpcomm.barrier This is all completely benign unless actively selected. At the moment, it only supports modex-less launch for openib-based systems. Minor mod to the TCP btl would be required to enable it as well, if people are interested. Similarly, anyone interested in enabling other BTL's for modex-less operation should let me know and I'll give you the magic details. This seems to significantly improve scalability provided the file can be locally located on the nodes. I'm looking at an alternative means of disseminating the info (perhaps in launch message) as an option for removing that constraint. This commit was SVN r20098.
2008-12-09 23:49:02 +00:00
"%s grpcomm:base:update_modex_entries: adding %d entries for proc %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), num_recvd_entries,
ORTE_NAME_PRINT(proc_name)));
Enable modex-less launch. Consists of: 1. minor modification to include two new opal MCA params: (a) opal_profile: outputs what components were selected by each framework currently enabled for most, but not all, frameworks (b) opal_profile_file: name of file that contains profile info required for modex 2. introduction of two new tools: (a) ompi-probe: MPI process that simply calls MPI_Init/Finalize with opal_profile set. Also reports back the rml IP address for all interfaces on the node (b) ompi-profiler: uses ompi-probe to create the profile_file, also reports out a summary of what framework components are actually being used to help with configuration options 3. modification of the grpcomm basic component to utilize the profile file in place of the modex where possible 4. modification of orterun so it properly sees opal mca params and handles opal_profile correctly to ensure we don't get its profile 5. similar mod to orted as for orterun 6. addition of new test that calls orte_init followed by calls to grpcomm.barrier This is all completely benign unless actively selected. At the moment, it only supports modex-less launch for openib-based systems. Minor mod to the TCP btl would be required to enable it as well, if people are interested. Similarly, anyone interested in enabling other BTL's for modex-less operation should let me know and I'll give you the magic details. This seems to significantly improve scalability provided the file can be locally located on the nodes. I'm looking at an alternative means of disseminating the info (perhaps in launch message) as an option for removing that constraint. This commit was SVN r20098.
2008-12-09 23:49:02 +00:00
/*
* Extract the attribute names and values
*/
for (j = 0; j < num_recvd_entries; j++) {
size_t num_bytes;
void *bytes = NULL;
char *attr_name;
cnt = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &attr_name, &cnt, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
cnt = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &num_bytes, &cnt, OPAL_SIZE))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
if (num_bytes != 0) {
if (NULL == (bytes = malloc(num_bytes))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
cnt = (orte_std_cntr_t) num_bytes;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, bytes, &cnt, OPAL_BYTE))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
num_bytes = cnt;
}
/*
* Lookup the corresponding modex structure
*/
if (NULL == (attr_data = modex_lookup_attr_data(proc_data,
attr_name, true))) {
opal_output(0, "grpcomm:base:update_modex: modex_lookup_attr_data failed\n");
rc = ORTE_ERR_NOT_FOUND;
goto cleanup;
}
if (NULL != attr_data->attr_data) {
/* some pre-existing value must be here - release it */
free(attr_data->attr_data);
}
attr_data->attr_data = bytes;
attr_data->attr_data_size = num_bytes;
proc_data->modex_received_data = true;
}
cleanup:
OPAL_THREAD_UNLOCK(&proc_data->modex_lock);
return rc;
}
Enable modex-less launch. Consists of: 1. minor modification to include two new opal MCA params: (a) opal_profile: outputs what components were selected by each framework currently enabled for most, but not all, frameworks (b) opal_profile_file: name of file that contains profile info required for modex 2. introduction of two new tools: (a) ompi-probe: MPI process that simply calls MPI_Init/Finalize with opal_profile set. Also reports back the rml IP address for all interfaces on the node (b) ompi-profiler: uses ompi-probe to create the profile_file, also reports out a summary of what framework components are actually being used to help with configuration options 3. modification of the grpcomm basic component to utilize the profile file in place of the modex where possible 4. modification of orterun so it properly sees opal mca params and handles opal_profile correctly to ensure we don't get its profile 5. similar mod to orted as for orterun 6. addition of new test that calls orte_init followed by calls to grpcomm.barrier This is all completely benign unless actively selected. At the moment, it only supports modex-less launch for openib-based systems. Minor mod to the TCP btl would be required to enable it as well, if people are interested. Similarly, anyone interested in enabling other BTL's for modex-less operation should let me know and I'll give you the magic details. This seems to significantly improve scalability provided the file can be locally located on the nodes. I'm looking at an alternative means of disseminating the info (perhaps in launch message) as an option for removing that constraint. This commit was SVN r20098.
2008-12-09 23:49:02 +00:00
int orte_grpcomm_base_load_modex_data(orte_process_name_t *proc_name, char *attr_name,
void *data, int num_bytes)
{
modex_proc_data_t *proc_data;
modex_attr_data_t *attr_data;
int rc = ORTE_SUCCESS;
void *bytes;
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
Enable modex-less launch. Consists of: 1. minor modification to include two new opal MCA params: (a) opal_profile: outputs what components were selected by each framework currently enabled for most, but not all, frameworks (b) opal_profile_file: name of file that contains profile info required for modex 2. introduction of two new tools: (a) ompi-probe: MPI process that simply calls MPI_Init/Finalize with opal_profile set. Also reports back the rml IP address for all interfaces on the node (b) ompi-profiler: uses ompi-probe to create the profile_file, also reports out a summary of what framework components are actually being used to help with configuration options 3. modification of the grpcomm basic component to utilize the profile file in place of the modex where possible 4. modification of orterun so it properly sees opal mca params and handles opal_profile correctly to ensure we don't get its profile 5. similar mod to orted as for orterun 6. addition of new test that calls orte_init followed by calls to grpcomm.barrier This is all completely benign unless actively selected. At the moment, it only supports modex-less launch for openib-based systems. Minor mod to the TCP btl would be required to enable it as well, if people are interested. Similarly, anyone interested in enabling other BTL's for modex-less operation should let me know and I'll give you the magic details. This seems to significantly improve scalability provided the file can be locally located on the nodes. I'm looking at an alternative means of disseminating the info (perhaps in launch message) as an option for removing that constraint. This commit was SVN r20098.
2008-12-09 23:49:02 +00:00
"%s grpcomm:base:load_modex_data: loading %ld bytes for attr %s on proc %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(long)num_bytes, attr_name, ORTE_NAME_PRINT(proc_name)));
Enable modex-less launch. Consists of: 1. minor modification to include two new opal MCA params: (a) opal_profile: outputs what components were selected by each framework currently enabled for most, but not all, frameworks (b) opal_profile_file: name of file that contains profile info required for modex 2. introduction of two new tools: (a) ompi-probe: MPI process that simply calls MPI_Init/Finalize with opal_profile set. Also reports back the rml IP address for all interfaces on the node (b) ompi-profiler: uses ompi-probe to create the profile_file, also reports out a summary of what framework components are actually being used to help with configuration options 3. modification of the grpcomm basic component to utilize the profile file in place of the modex where possible 4. modification of orterun so it properly sees opal mca params and handles opal_profile correctly to ensure we don't get its profile 5. similar mod to orted as for orterun 6. addition of new test that calls orte_init followed by calls to grpcomm.barrier This is all completely benign unless actively selected. At the moment, it only supports modex-less launch for openib-based systems. Minor mod to the TCP btl would be required to enable it as well, if people are interested. Similarly, anyone interested in enabling other BTL's for modex-less operation should let me know and I'll give you the magic details. This seems to significantly improve scalability provided the file can be locally located on the nodes. I'm looking at an alternative means of disseminating the info (perhaps in launch message) as an option for removing that constraint. This commit was SVN r20098.
2008-12-09 23:49:02 +00:00
/* look up the modex data structure */
proc_data = modex_lookup_orte_proc(proc_name);
if (proc_data == NULL) {
/* report the error */
opal_output(0, "grpcomm:base:update_modex: received modex info for unknown proc %s\n",
ORTE_NAME_PRINT(proc_name));
Enable modex-less launch. Consists of: 1. minor modification to include two new opal MCA params: (a) opal_profile: outputs what components were selected by each framework currently enabled for most, but not all, frameworks (b) opal_profile_file: name of file that contains profile info required for modex 2. introduction of two new tools: (a) ompi-probe: MPI process that simply calls MPI_Init/Finalize with opal_profile set. Also reports back the rml IP address for all interfaces on the node (b) ompi-profiler: uses ompi-probe to create the profile_file, also reports out a summary of what framework components are actually being used to help with configuration options 3. modification of the grpcomm basic component to utilize the profile file in place of the modex where possible 4. modification of orterun so it properly sees opal mca params and handles opal_profile correctly to ensure we don't get its profile 5. similar mod to orted as for orterun 6. addition of new test that calls orte_init followed by calls to grpcomm.barrier This is all completely benign unless actively selected. At the moment, it only supports modex-less launch for openib-based systems. Minor mod to the TCP btl would be required to enable it as well, if people are interested. Similarly, anyone interested in enabling other BTL's for modex-less operation should let me know and I'll give you the magic details. This seems to significantly improve scalability provided the file can be locally located on the nodes. I'm looking at an alternative means of disseminating the info (perhaps in launch message) as an option for removing that constraint. This commit was SVN r20098.
2008-12-09 23:49:02 +00:00
return ORTE_ERR_NOT_FOUND;
}
OPAL_THREAD_LOCK(&proc_data->modex_lock);
/*
* Lookup the corresponding modex structure
*/
if (NULL == (attr_data = modex_lookup_attr_data(proc_data,
attr_name, true))) {
opal_output(0, "grpcomm:base:update_modex: modex_lookup_attr_data failed\n");
rc = ORTE_ERR_NOT_FOUND;
goto cleanup;
}
if (NULL != attr_data->attr_data) {
/* some pre-existing value must be here - release it */
free(attr_data->attr_data);
}
/* create space for the data - this is necessary since the data being
* passed to us may be static or released on the other end
*/
bytes = (void*)malloc(num_bytes);
memcpy(bytes, data, num_bytes);
attr_data->attr_data = bytes;
attr_data->attr_data_size = num_bytes;
proc_data->modex_received_data = true;
cleanup:
OPAL_THREAD_UNLOCK(&proc_data->modex_lock);
return rc;
}