/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2011 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "orte_config.h"
#include "orte/types.h"
#include "orte/constants.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <fcntl.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_SYS_SOCKET_H
#include <sys/socket.h>
#endif
#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#ifdef HAVE_ARPA_INET_H
#include <arpa/inet.h>
#endif
#ifdef HAVE_NETDB_H
#include <netdb.h>
#endif
#ifdef HAVE_IFADDRS_H
#include <ifaddrs.h>
#endif

#include "opal/dss/dss.h"
#include "opal/runtime/opal.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/util/output.h"
#include "opal/util/argv.h"

#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/base/odls_private.h"
#include "orte/util/show_help.h"
#include "orte/util/proc_info.h"
#include "orte/util/name_fns.h"
#include "orte/util/regex.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/rml/base/rml_contact.h"

#include "orte/util/nidmap.h"

static bool initialized = false;
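
/* Initialize the global nidmap and jobmap arrays and, when a launch
 * buffer is provided, populate them from its contents.  The buffer is
 * expected to carry (in that order) an optional hwloc topology, the
 * encoded node map, and the encoded process map; an empty buffer is
 * not an error.
 */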
int orte_util_nidmap_init(opal_buffer_t *buffer)
{
    int32_t cnt;
    int rc;
    opal_byte_object_t *bo;

    if (!initialized) {
        /* need to construct the global arrays */
        /* setup the nidmap array */
        OBJ_CONSTRUCT(&orte_nidmap, opal_pointer_array_t);
        opal_pointer_array_init(&orte_nidmap, 8, INT32_MAX, 8);

        /* setup array of jmaps */
        OBJ_CONSTRUCT(&orte_jobmap, opal_pointer_array_t);
        opal_pointer_array_init(&orte_jobmap, 1, INT32_MAX, 1);

        /* make sure we don't do this twice */
        initialized = true;
    }

    /* it is okay if the buffer is empty */
    if (NULL == buffer || 0 == buffer->bytes_used) {
        return ORTE_SUCCESS;
    }

#if OPAL_HAVE_HWLOC
    {
        hwloc_topology_t topo;

        /* extract the topology */
        cnt=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &topo, &cnt, OPAL_HWLOC_TOPO))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        if (NULL == opal_hwloc_topology) {
            opal_hwloc_topology = topo;
        } else {
            hwloc_topology_destroy(topo);
        }
    }
#endif

    /* extract the byte object holding the daemonmap */
    cnt=1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &bo, &cnt, OPAL_BYTE_OBJECT))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    /* unpack the node map */
    if (ORTE_SUCCESS != (rc = orte_util_decode_nodemap(bo))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    /* the bytes in the object were free'd by the decode */

    /* extract the byte object holding the process map */
    cnt=1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &bo, &cnt, OPAL_BYTE_OBJECT))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    /* unpack the process map */
    if (ORTE_SUCCESS != (rc = orte_util_decode_pidmap(bo))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    /* the bytes in the object were free'd by the decode */

    return ORTE_SUCCESS;
}
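
/* Release all entries in the global nidmap and jobmap arrays, destroy
 * the cached hwloc topology (if any), and mark the maps as uninitialized
 * so a later call to orte_util_nidmap_init can rebuild them.
 */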
void orte_util_nidmap_finalize(void)
{
    orte_nid_t *nid;
    orte_jmap_t *jmap;
    int32_t i;

    if (!initialized) {
        /* nothing to do */
        return;
    }

    /* deconstruct the global nidmap and jobmap arrays */
    for (i=0; i < orte_nidmap.size; i++) {
        if (NULL == (nid = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i))) {
            continue;
        }
        OBJ_RELEASE(nid);
    }
    OBJ_DESTRUCT(&orte_nidmap);
    for (i=0; i < orte_jobmap.size; i++) {
        if (NULL == (jmap = (orte_jmap_t*)opal_pointer_array_get_item(&orte_jobmap, i))) {
            continue;
        }
        OBJ_RELEASE(jmap);
    }
    OBJ_DESTRUCT(&orte_jobmap);

#if OPAL_HAVE_HWLOC
    /* destroy the topology */
    if (NULL != opal_hwloc_topology) {
        hwloc_topology_destroy(opal_hwloc_topology);
        opal_hwloc_topology = NULL;
    }
#endif

    /* flag that these are no longer initialized */
    initialized = false;
}
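
/* Populate the nidmap and jobmap with entries describing only this
 * process and its own node - used when no launch message is available
 * to decode (e.g., for singletons).
 */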
int orte_util_setup_local_nidmap_entries(void)
{
    orte_nid_t *node;
    orte_jmap_t *jmap;
    orte_pmap_t *pmap;

    /* add a jmap entry for myself */
    jmap = OBJ_NEW(orte_jmap_t);
    jmap->job = ORTE_PROC_MY_NAME->jobid;
    opal_pointer_array_add(&orte_jobmap, jmap);
    jmap->num_procs = 1;

    /* create a nidmap entry for this node */
    node = OBJ_NEW(orte_nid_t);
    node->name = strdup(orte_process_info.nodename);
    node->daemon = ORTE_PROC_MY_DAEMON->vpid;
    pmap = OBJ_NEW(orte_pmap_t);
    pmap->local_rank = 0;
    pmap->node_rank = 0;
    node->index = opal_pointer_array_add(&orte_nidmap, node);
    pmap->node = node->index;
    opal_pointer_array_set_item(&jmap->pmap, ORTE_PROC_MY_NAME->vpid, pmap);

    /* all done */
    return ORTE_SUCCESS;
}
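
/* Build a nidmap covering the HNP plus the given list of node names,
 * assigning daemon vpids in list order (vpid 0 for the HNP, vpid 1 for
 * the first listed node, and so on), and load the RML contact info for
 * each daemon under the static-port assumption that every daemon
 * listens on our own port.
 */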
int orte_util_build_daemon_nidmap(char **nodes)
{
    orte_nid_t *node;
    int i, num_nodes;
    int rc;
    struct hostent *h;
    opal_buffer_t buf;
    orte_process_name_t proc;
    char *uri, *addr;
    char *proc_name;

    num_nodes = opal_argv_count(nodes);

    OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
                         "%s orte:util:build:daemon:nidmap found %d nodes",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), num_nodes));

    if (0 == num_nodes) {
        /* nothing to do */
        return ORTE_SUCCESS;
    }

    /* set the size of the nidmap storage so we minimize realloc's */
    if (ORTE_SUCCESS != (rc = opal_pointer_array_set_size(&orte_nidmap, num_nodes+1))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* install the entry for the HNP */
    node = OBJ_NEW(orte_nid_t);
    node->name = strdup("HNP");
    node->daemon = 0;
    /* the arch defaults to our arch so that non-hetero
     * case will yield correct behavior
     */
    opal_pointer_array_set_item(&orte_nidmap, 0, node);

    /* the daemon vpids will be assigned in order,
     * starting with vpid=1 for the first node in
     * the list
     */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    proc.jobid = ORTE_PROC_MY_NAME->jobid;
    for (i=0; i < num_nodes; i++) {
        node = OBJ_NEW(orte_nid_t);
        node->name = strdup(nodes[i]);
        node->daemon = i+1;
        /* the arch defaults to our arch so that non-hetero
         * case will yield correct behavior
         */
        opal_pointer_array_set_item(&orte_nidmap, node->daemon, node);

        /* lookup the address of this node */
        if (NULL == (h = gethostbyname(node->name))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            return ORTE_ERR_NOT_FOUND;
        }
        addr = inet_ntoa(*(struct in_addr*)h->h_addr_list[0]);

        /* since we are using static ports, all my fellow daemons will be on my
         * port. Setup the contact info for each daemon in my hash tables. Note
         * that this will -not- open a port to those daemons, but will only
         * define the info necessary for opening such a port if/when I communicate
         * to them
         */
        /* construct the URI */
        proc.vpid = node->daemon;
        orte_util_convert_process_name_to_string(&proc_name, &proc);
        asprintf(&uri, "%s;tcp://%s:%d", proc_name, addr, (int)orte_process_info.my_port);
        OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
                             "%s orte:util:build:daemon:nidmap node %s daemon %d addr %s uri %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             node->name, (int)node->daemon, addr, uri));
        opal_dss.pack(&buf, &uri, 1, OPAL_STRING);
        free(proc_name);
        free(uri);
    }

    /* load the hash tables */
    if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(&buf))) {
        ORTE_ERROR_LOG(rc);
    }
    OBJ_DESTRUCT(&buf);

    return rc;
}
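
/* Pack the global node pool into a byte object: the node count, each
 * nodename (stripped to its unqualified form unless FQDNs are being
 * kept), the daemon vpid assigned to every node, and the per-node
 * oversubscription flags.
 */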
int orte_util_encode_nodemap(opal_byte_object_t *boptr)
{
    orte_vpid_t *vpids;
    orte_node_t *node, *hnp;
    int32_t i, num_nodes;
    int rc;
    char *nodename;
    opal_buffer_t buf;
    char *ptr;
    uint8_t *oversub=NULL;

    /* setup a buffer for tmp use */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);

    /* determine the number of nodes in the global node array */
    num_nodes = 0;
    for (i=0; i < orte_node_pool->size; i++) {
        if (NULL == opal_pointer_array_get_item(orte_node_pool, i)) {
            continue;
        }
        ++num_nodes;
    }

    /* pack number of nodes */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &num_nodes, 1, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* the HNP always has an entry at posn 0 - get its pointer as
     * we will need it later
     */
    hnp = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);

    /* pack every nodename individually */
    for (i=0; i < orte_node_pool->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
            continue;
        }
        if (!orte_keep_fqdn_hostnames) {
            nodename = strdup(node->name);
            if (NULL != (ptr = strchr(nodename, '.'))) {
                *ptr = '\0';
            }
            if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &nodename, 1, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            free(nodename);
        } else {
            if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &node->name, 1, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
        }
    }

    /* since the daemon vpids may not correspond to the node
     * index, we need to also pack the vpid array for all
     * daemons. This scenario can happen when the user is
     * employing a mapping algo that doesn't use all allocated
     * nodes, and sprinkles procs across them in some non-contig
     * manner. For example, use of the seq mapper where only
     * some nodes are used, and where the usage leaves "holes"
     * in the node array, will cause the daemon vpids to not
     * match their node array index
     */

    /* allocate space for the daemon vpids and oversubscribed flags */
    vpids = (orte_vpid_t*)malloc(num_nodes * sizeof(orte_vpid_t));
    oversub = (uint8_t*)malloc(num_nodes * sizeof(uint8_t));
    for (i=0; i < orte_node_pool->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
            continue;
        }
        if (NULL == node->daemon) {
            /* some nodes may not have daemons on them */
            vpids[i] = ORTE_VPID_INVALID;
            continue;
        }
        vpids[i] = node->daemon->name.vpid;
        oversub[i] = node->oversubscribed;
    }
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, vpids, num_nodes, ORTE_VPID))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    free(vpids);
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, oversub, num_nodes, OPAL_UINT8))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    free(oversub);

    /* transfer the payload to the byte object */
    opal_dss.unload(&buf, (void**)&boptr->bytes, &boptr->size);
    OBJ_DESTRUCT(&buf);

    return ORTE_SUCCESS;
}
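
/* Counterpart to orte_util_encode_nodemap: clear any existing nidmap
 * entries, then unpack the node names, daemon vpids, and
 * oversubscription flags, updating the daemon and process counts
 * along the way.
 */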
int orte_util_decode_nodemap(opal_byte_object_t *bo)
{
    int n;
    int32_t num_nodes, i, num_daemons;
    orte_nid_t *node;
    orte_vpid_t *vpids;
    orte_nid_t *nd, *ndptr;
    opal_buffer_t buf;
    int rc;
    uint8_t *oversub;

    OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                         "%s decode:nidmap decoding nodemap",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* if there are any entries already in the node array, clear it out */
    if (0 < orte_nidmap.size) {
        /* unfortunately, the opal function "remove_all" doesn't release
         * the memory pointed to by the elements in the array, so we need
         * to release those first
         */
        for (i=0; i < orte_nidmap.size; i++) {
            if (NULL != (ndptr = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i))) {
                OBJ_RELEASE(ndptr);
            }
        }
        /* now use the opal function to reset the internal pointers */
        opal_pointer_array_remove_all(&orte_nidmap);
    }

    /* xfer the byte object to a buffer for unpacking */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    opal_dss.load(&buf, bo->bytes, bo->size);

    /* unpack number of nodes */
    n=1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &num_nodes, &n, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                         "%s decode:nidmap decoding %d nodes",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), num_nodes));

    /* set the size of the nidmap storage so we minimize realloc's */
    if (ORTE_SUCCESS != (rc = opal_pointer_array_set_size(&orte_nidmap, num_nodes))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* loop over nodes and unpack the raw nodename */
    for (i=0; i < num_nodes; i++) {
        node = OBJ_NEW(orte_nid_t);
        /* the arch defaults to our arch so that non-hetero
         * case will yield correct behavior
         */
        opal_pointer_array_set_item(&orte_nidmap, i, node);

        /* unpack the node's name */
        n=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &(node->name), &n, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    }

    /* unpack the daemon vpids */
    vpids = (orte_vpid_t*)malloc(num_nodes * sizeof(orte_vpid_t));
    n=num_nodes;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, vpids, &n, ORTE_VPID))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* unpack the oversubscribed flags */
    oversub = (uint8_t*)malloc(num_nodes * sizeof(uint8_t));
    n=num_nodes;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, oversub, &n, OPAL_UINT8))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* transfer the data to the nidmap, counting the number of
     * daemons in the system
     */
    num_daemons = 0;
    for (i=0; i < num_nodes; i++) {
        if (NULL != (ndptr = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i))) {
            ndptr->daemon = vpids[i];
            if (0 == oversub[i]) {
                ndptr->oversubscribed = false;
            } else {
                ndptr->oversubscribed = true;
            }
            if (ORTE_VPID_INVALID != vpids[i]) {
                ++num_daemons;
            }
        }
    }
    free(vpids);
    free(oversub);

    /* if we are a daemon or the HNP, update our num_procs */
    if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
        orte_process_info.num_procs = num_daemons;

        if (orte_process_info.max_procs < orte_process_info.num_procs) {
            orte_process_info.max_procs = orte_process_info.num_procs;
        }
    }
    /* update num_daemons */
    orte_process_info.num_daemons = num_daemons;

    if (0 < opal_output_get_verbosity(orte_debug_output)) {
        for (i=0; i < num_nodes; i++) {
            if (NULL == (nd = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i))) {
                continue;
            }
            opal_output(5, "%s node[%d].name %s daemon %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i,
                        (NULL == nd->name) ? "NULL" : nd->name,
                        ORTE_VPID_PRINT(nd->daemon));
        }
    }

    OBJ_DESTRUCT(&buf);
    return ORTE_SUCCESS;
}
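
/* Pack the process map for every known job (skipping jobs without a
 * map, i.e. tools): the jobid, number of procs, bind level (when hwloc
 * support is built), and the per-proc node index, local rank, node
 * rank, and bind index arrays.
 */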
int orte_util_encode_pidmap(opal_byte_object_t *boptr)
|
2008-04-30 23:49:53 +04:00
|
|
|
{
|
2009-05-11 07:24:49 +04:00
|
|
|
orte_proc_t *proc;
|
2008-04-30 23:49:53 +04:00
|
|
|
opal_buffer_t buf;
|
2009-07-15 23:36:53 +04:00
|
|
|
orte_local_rank_t *lrank = NULL;
|
|
|
|
orte_node_rank_t *nrank = NULL;
|
|
|
|
orte_job_t *jdata = NULL;
|
2009-07-16 21:38:09 +04:00
|
|
|
int32_t *nodes = NULL;
|
|
|
|
int i, j, k, rc = ORTE_SUCCESS;
|
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here:
https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement
The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation.
In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions:
1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior.
2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation.
3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so.
As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes.
This commit was SVN r25476.
2011-11-15 07:40:11 +04:00
|
|
|
#if OPAL_HAVE_HWLOC
|
|
|
|
unsigned int *bind_idx=NULL;
|
|
|
|
#endif
|
2008-04-30 23:49:53 +04:00
|
|
|
|
|
|
|
/* setup the working buffer */
|
|
|
|
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
|
|
|
|
2009-03-03 19:39:13 +03:00
|
|
|
for (j=1; j < orte_job_data->size; j++) {
|
|
|
|
/* the job array is no longer left-justified and may
|
|
|
|
* have holes in it as we recover resources at job
|
|
|
|
* completion
|
|
|
|
*/
|
2009-04-13 23:06:54 +04:00
|
|
|
if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, j))) {
|
2009-03-03 19:39:13 +03:00
|
|
|
continue;
|
2010-03-26 01:54:57 +03:00
|
|
|
}
|
|
|
|
/* if this job doesn't have a map, then it is a tool
|
|
|
|
* and doesn't need to be included
|
|
|
|
*/
|
|
|
|
if (NULL == jdata->map) {
|
|
|
|
continue;
|
2010-05-14 22:44:49 +04:00
|
|
|
}
|
2008-11-18 18:35:50 +03:00
|
|
|
/* pack the jobid */
|
|
|
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jdata->jobid, 1, ORTE_JOBID))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
2009-07-15 23:36:53 +04:00
|
|
|
goto cleanup_and_return;
|
2008-11-18 18:35:50 +03:00
|
|
|
}
|
|
|
|
/* pack the number of procs */
|
|
|
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jdata->num_procs, 1, ORTE_VPID))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
2009-07-15 23:36:53 +04:00
|
|
|
goto cleanup_and_return;
|
2008-11-18 18:35:50 +03:00
|
|
|
}
|
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here:
https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement
The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation.
In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions:
1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior.
2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation.
3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so.
As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes.
This commit was SVN r25476.
2011-11-15 07:40:11 +04:00
|
|
|
#if OPAL_HAVE_HWLOC
|
|
|
|
/* pack the bind level */
|
|
|
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &(jdata->map->bind_level), 1, OPAL_HWLOC_LEVEL_T))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto cleanup_and_return;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* allocate memory for the nodes, local ranks, node ranks, and bind_idx */
|
2009-07-15 23:36:53 +04:00
|
|
|
nodes = (int32_t*)malloc(jdata->num_procs * sizeof(int32_t));
|
|
|
|
lrank = (orte_local_rank_t*)malloc(jdata->num_procs*sizeof(orte_local_rank_t));
|
|
|
|
nrank = (orte_node_rank_t*)malloc(jdata->num_procs*sizeof(orte_node_rank_t));
|
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here:
https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement
The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation.
In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions:
1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior.
2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation.
3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so.
As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes.
This commit was SVN r25476.
2011-11-15 07:40:11 +04:00
|
|
|
#if OPAL_HAVE_HWLOC
|
|
|
|
bind_idx = (unsigned int*)malloc(jdata->num_procs*sizeof(unsigned int));
|
|
|
|
#endif
|
2008-11-18 18:35:50 +03:00
|
|
|
/* transfer and pack the node info in one pack */
|
2009-07-14 00:03:41 +04:00
|
|
|
for (i=0, k=0; i < jdata->procs->size; i++) {
|
2009-05-12 13:46:52 +04:00
|
|
|
if (NULL == (proc = (orte_proc_t *) opal_pointer_array_get_item(jdata->procs, i))) {
|
2009-05-11 07:24:49 +04:00
|
|
|
continue;
|
|
|
|
}
|
2009-07-16 00:06:45 +04:00
|
|
|
if( k >= (int)jdata->num_procs ) {
|
2009-07-15 23:36:53 +04:00
|
|
|
orte_show_help("help-orte-runtime.txt", "orte_nidmap:too_many_nodes",
|
|
|
|
true, jdata->num_procs);
|
2009-07-16 00:06:45 +04:00
|
|
|
break;
|
2009-07-15 23:36:53 +04:00
|
|
|
}
|
2009-07-16 00:06:45 +04:00
|
|
|
nodes[k] = proc->node->index;
|
|
|
|
lrank[k] = proc->local_rank;
|
|
|
|
nrank[k] = proc->node_rank;
|
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here:
https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement
The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation.
In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions:
1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior.
2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation.
3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so.
As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes.
This commit was SVN r25476.
2011-11-15 07:40:11 +04:00
|
|
|
#if OPAL_HAVE_HWLOC
|
|
|
|
bind_idx[k] = proc->bind_idx;
|
|
|
|
#endif
|
2009-07-16 00:06:45 +04:00
|
|
|
++k;
|
2008-11-18 18:35:50 +03:00
|
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, nodes, jdata->num_procs, OPAL_INT32))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
2009-07-15 23:36:53 +04:00
|
|
|
goto cleanup_and_return;
|
2008-11-18 18:35:50 +03:00
|
|
|
}
|
|
|
|
/* transfer and pack the local_ranks in one pack */
|
|
|
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, lrank, jdata->num_procs, ORTE_LOCAL_RANK))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
2009-07-15 23:36:53 +04:00
|
|
|
goto cleanup_and_return;
|
2008-11-18 18:35:50 +03:00
|
|
|
}
|
|
|
|
/* transfer and pack the node ranks in one pack */
|
|
|
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, nrank, jdata->num_procs, ORTE_NODE_RANK))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
2009-07-15 23:36:53 +04:00
|
|
|
goto cleanup_and_return;
|
2008-11-18 18:35:50 +03:00
|
|
|
}
|
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here:
https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement
The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation.
In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions:
1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior.
2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation.
3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so.
As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes.
This commit was SVN r25476.
2011-11-15 07:40:11 +04:00
|
|
|
#if OPAL_HAVE_HWLOC
|
|
|
|
/* transfer and pack the bind_idx in one pack */
|
|
|
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, bind_idx, jdata->num_procs, OPAL_UINT))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto cleanup_and_return;
|
|
|
|
}
|
|
|
|
#endif
|
2008-09-25 17:39:08 +04:00
|
|
|
}
|
2008-04-30 23:49:53 +04:00
|
|
|
|
|
|
|
/* transfer the payload to the byte object */
|
|
|
|
opal_dss.unload(&buf, (void**)&boptr->bytes, &boptr->size);
|
2009-07-15 23:36:53 +04:00
|
|
|
|
|
|
|
cleanup_and_return:
|
|
|
|
|
|
|
|
if( NULL != lrank ) {
|
|
|
|
free(lrank);
|
|
|
|
}
|
|
|
|
if( NULL != nrank ) {
|
|
|
|
free(nrank);
|
|
|
|
}
|
|
|
|
if( NULL != nodes ) {
|
|
|
|
free(nodes);
|
|
|
|
}
|
#if OPAL_HAVE_HWLOC
    if (NULL != bind_idx) {
        free(bind_idx);
    }
#endif

    OBJ_DESTRUCT(&buf);

    return rc;
}

int orte_util_decode_pidmap(opal_byte_object_t *bo)
{
    orte_jobid_t jobid;
    orte_vpid_t i, num_procs;
    orte_pmap_t *pmap;
    int32_t *nodes=NULL, my_node = 0;
    orte_local_rank_t *local_rank=NULL;
    orte_node_rank_t *node_rank=NULL;
#if OPAL_HAVE_HWLOC
    opal_hwloc_level_t bind_level = OPAL_HWLOC_NODE_LEVEL;
    unsigned int *bind_idx=NULL;
#endif
    orte_std_cntr_t n;
    opal_buffer_t buf;
    orte_jmap_t *jmap;
    bool already_present;
    int j;
    int rc;

    /* xfer the byte object to a buffer for unpacking */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    if (ORTE_SUCCESS != (rc = opal_dss.load(&buf, bo->bytes, bo->size))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    n = 1;
    /* cycle through the buffer */
    while (ORTE_SUCCESS == (rc = opal_dss.unpack(&buf, &jobid, &n, ORTE_JOBID))) {
        /* unfortunately, job objects cannot be stored
         * by index number as the jobid is a constructed
         * value. So we have no choice but to cycle through
         * the jobmap pointer array and look for this entry. Since
         * jobs are cleaned up as they complete, check the
         * entire array
         */
        jmap = NULL;
        already_present = false;
        for (j=0; j < orte_jobmap.size; j++) {
            if (NULL == (jmap = (orte_jmap_t*)opal_pointer_array_get_item(&orte_jobmap, j))) {
                continue;
            }
            if (jobid == jmap->job) {
                already_present = true;
                break;
            }
        }

        /* unpack the number of procs */
        n=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &num_procs, &n, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
#if OPAL_HAVE_HWLOC
        /* unpack the binding level */
        n=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &bind_level, &n, OPAL_HWLOC_LEVEL_T))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* set mine */
        orte_process_info.bind_level = bind_level;
#endif

        /* allocate memory for the node info */
        nodes = (int32_t*)malloc(num_procs * sizeof(int32_t));
        /* unpack it in one shot */
        n=num_procs;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, nodes, &n, OPAL_INT32))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        /* allocate memory for local ranks */
        local_rank = (orte_local_rank_t*)malloc(num_procs*sizeof(orte_local_rank_t));
        /* unpack them in one shot */
        n=num_procs;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, local_rank, &n, ORTE_LOCAL_RANK))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        /* allocate memory for node ranks */
        node_rank = (orte_node_rank_t*)malloc(num_procs*sizeof(orte_node_rank_t));
        /* unpack node ranks in one shot */
        n=num_procs;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, node_rank, &n, ORTE_NODE_RANK))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
#if OPAL_HAVE_HWLOC
        /* allocate memory for bind_idx */
        bind_idx = (unsigned int*)malloc(num_procs*sizeof(unsigned int));
        /* unpack bind_idx in one shot */
        n=num_procs;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, bind_idx, &n, OPAL_UINT))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        if (ORTE_PROC_IS_APP) {
            /* set mine */
            orte_process_info.bind_idx = bind_idx[ORTE_PROC_MY_NAME->vpid];
        }
#endif

        /* if we already know about this job, we need to check the data to see
         * if something has changed - e.g., a proc that is being restarted somewhere
         * other than where it previously was
         */
        if (already_present) {
            /* we already have the jmap object, so let's refresh its pidmap
             * using the new data - start by cleaning out the old array
             */
            for (j=0; j < jmap->pmap.size; j++) {
                if (NULL == (pmap = (orte_pmap_t*)opal_pointer_array_get_item(&jmap->pmap, j))) {
                    continue;
                }
                OBJ_RELEASE(pmap);
            }
            /* now use the opal function to reset the internal pointers */
            opal_pointer_array_remove_all(&jmap->pmap);
        } else {
            /* if we don't already have this data, store it
             * unfortunately, job objects cannot be stored
             * by index number as the jobid is a constructed
             * value. So we have to just add it to the end
             * of the array
             */
            jmap = OBJ_NEW(orte_jmap_t);
            jmap->job = jobid;
            if (0 > (j = opal_pointer_array_add(&orte_jobmap, jmap))) {
                ORTE_ERROR_LOG(j);
                rc = j;
                goto cleanup;
            }
        }

        /* update the binding level and num_procs */
#if OPAL_HAVE_HWLOC
        jmap->bind_level = bind_level;
#endif
        jmap->num_procs = num_procs;
        /* set the size of the storage so we minimize realloc's */
        if (ORTE_SUCCESS != (rc = opal_pointer_array_set_size(&jmap->pmap, num_procs))) {
            ORTE_ERROR_LOG(rc);
            /* release the unpacked arrays and the buffer before returning */
            goto cleanup;
        }
        if (ORTE_PROC_IS_APP) {
            /* track my node */
            my_node = nodes[ORTE_PROC_MY_NAME->vpid];
        }
        /* xfer the data */
        for (i=0; i < num_procs; i++) {
            pmap = OBJ_NEW(orte_pmap_t);
            pmap->node = nodes[i];
            pmap->local_rank = local_rank[i];
            pmap->node_rank = node_rank[i];
            /* if I am an app, record the locality of this proc
             * relative to me - daemons don't need this info
             */
            if (ORTE_PROC_IS_APP) {
                if (ORTE_PROC_MY_NAME->vpid == i) {
                    /* this is me */
                    pmap->locality = OPAL_PROC_ALL_LOCAL;
                } else if (pmap->node == my_node) {
#if OPAL_HAVE_HWLOC
                    /* we share a node - see what else we share */
                    pmap->locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
                                                                           orte_process_info.bind_level,
                                                                           orte_process_info.bind_idx,
                                                                           jmap->bind_level,
                                                                           bind_idx[i]);
#else
                    pmap->locality = OPAL_PROC_ON_NODE;
#endif
                } else {
                    pmap->locality = OPAL_PROC_NON_LOCAL;
                }
            }
            /* add the pidmap entry at the specific site corresponding
             * to the proc's vpid
             */
            if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(&jmap->pmap, i, pmap))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
        }

        /* release data */
        free(nodes);
        nodes = NULL;
        free(local_rank);
        local_rank = NULL;
        free(node_rank);
        node_rank = NULL;
#if OPAL_HAVE_HWLOC
        free(bind_idx);
        bind_idx = NULL;
#endif
        /* setup for next cycle */
        n = 1;
    }
    if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER == rc) {
        rc = ORTE_SUCCESS;
    }
 cleanup:
    if (NULL != nodes) {
        free(nodes);
    }
    if (NULL != local_rank) {
        free(local_rank);
    }
    if (NULL != node_rank) {
        free(node_rank);
    }
#if OPAL_HAVE_HWLOC
    if (NULL != bind_idx) {
        free(bind_idx);
    }
#endif
    OBJ_DESTRUCT(&buf);
    return rc;
}
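
/*
 * Editorial sketch (not part of the original source, guarded out of the
 * build): one plausible way a caller could hand a received pidmap byte
 * object to orte_util_decode_pidmap() above.  The wrapper name
 * example_update_pidmap is hypothetical.
 */
#if 0
static int example_update_pidmap(opal_byte_object_t *bo)
{
    int rc;

    /* decode the packed pidmap into the global orte_jobmap array */
    if (ORTE_SUCCESS != (rc = orte_util_decode_pidmap(bo))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    /* the decoded data can then be queried via the lookup utilities below */
    return ORTE_SUCCESS;
}
#endif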

/*** NIDMAP UTILITIES ***/
orte_jmap_t* orte_util_lookup_jmap(orte_jobid_t job)
{
    int i;
    orte_jmap_t *jmap;

    /* unfortunately, job objects cannot be stored
     * by index number as the jobid is a constructed
     * value. So we have no choice but to cycle through
     * the jobmap pointer array and look for the entry
     * we want. We also cannot trust that the array is
     * left-justified as cleanup is done - and array
     * entries set to NULL - upon job completion.
     */
    for (i=0; i < orte_jobmap.size; i++) {
        if (NULL == (jmap = (orte_jmap_t*)opal_pointer_array_get_item(&orte_jobmap, i))) {
            continue;
        }
        OPAL_OUTPUT_VERBOSE((10, orte_debug_output,
                             "%s lookup:pmap: checking job %s for job %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jmap->job), ORTE_JOBID_PRINT(job)));
        if (job == jmap->job) {
            return jmap;
        }
    }

    /* if we didn't find it, return NULL */
    return NULL;
}

orte_pmap_t* orte_util_lookup_pmap(orte_process_name_t *proc)
{
    orte_jmap_t *jmap;

    if (NULL == (jmap = orte_util_lookup_jmap(proc->jobid))) {
        return NULL;
    }

    /* the get_item function will check the array index range,
     * so we can just access it here
     */
    return (orte_pmap_t *) opal_pointer_array_get_item(&jmap->pmap, proc->vpid);
}
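
/*
 * Editorial sketch (not part of the original source, guarded out of the
 * build): using orte_util_lookup_pmap() to report where a peer process was
 * placed.  The helper name example_print_peer_placement is hypothetical.
 */
#if 0
static void example_print_peer_placement(orte_process_name_t *peer)
{
    orte_pmap_t *pmap;

    /* NULL is returned if the peer's job or vpid is not known locally */
    if (NULL == (pmap = orte_util_lookup_pmap(peer))) {
        return;
    }
    opal_output(0, "%s peer %s: node %d local_rank %d node_rank %d",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(peer),
                pmap->node, (int)pmap->local_rank, (int)pmap->node_rank);
}
#endif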

/* the daemon's vpid does not necessarily correlate
 * to the node's index in the node array since
 * some nodes may not have a daemon on them. Thus,
 * we have to search for the daemon in the array.
 * Fortunately, this is rarely done
 */
static orte_nid_t* find_daemon_node(orte_process_name_t *proc)
{
    int32_t i;
    orte_nid_t *nid;

    for (i=0; i < orte_nidmap.size; i++) {
        if (NULL == (nid = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i))) {
            continue;
        }
        OPAL_OUTPUT_VERBOSE((10, orte_debug_output,
                             "%s find:daemon:node: checking daemon %s for %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_VPID_PRINT(nid->daemon), ORTE_VPID_PRINT(proc->vpid)));
        if (nid->daemon == proc->vpid) {
            return nid;
        }
    }

    /* if we didn't find it, return NULL */
    return NULL;
}

orte_nid_t* orte_util_lookup_nid(orte_process_name_t *proc)
{
    orte_pmap_t *pmap;

    OPAL_OUTPUT_VERBOSE((10, orte_debug_output,
                         "%s lookup:nid: looking for proc %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc)));

    if (ORTE_JOBID_IS_DAEMON(proc->jobid)) {
        /* looking for a daemon */
        return find_daemon_node(proc);
    }

    /* looking for an application proc */
    if (NULL == (pmap = orte_util_lookup_pmap(proc))) {
        return NULL;
    }

    /* the get_item function will check the array index range,
     * so we can just access it here
     */
    return (orte_nid_t *) opal_pointer_array_get_item(&orte_nidmap, pmap->node);
}
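
/*
 * Editorial sketch (not part of the original source, guarded out of the
 * build): resolving a process name to the node it runs on with
 * orte_util_lookup_nid().  The helper name example_hostname_of is
 * hypothetical.
 */
#if 0
static const char* example_hostname_of(orte_process_name_t *proc)
{
    orte_nid_t *nid;

    /* handles both daemon and application names, per the logic above */
    if (NULL == (nid = orte_util_lookup_nid(proc))) {
        return NULL;
    }
    return nid->name;
}
#endif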

void orte_nidmap_dump(void)
{
    int i;
    orte_nid_t *nid;

    opal_output(orte_clean_output, "*** DUMP OF NIDMAP ***");
    for (i=0; i < orte_nidmap.size; i++) {
        if (NULL == (nid = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i))) {
            continue;
        }
        opal_output(orte_clean_output, "%s node[%d].name %s daemon %s",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i,
                    (NULL == nid->name) ? "NULL" : nid->name,
                    ORTE_VPID_PRINT(nid->daemon));
    }
    opal_output(orte_clean_output, "\n\n");
}

void orte_jmap_dump(orte_jmap_t *jmap)
{
    int i;
    orte_pmap_t *pmap;

    opal_output(orte_clean_output, "**** DUMP OF JOB %s (%s procs) ***",
                ORTE_JOBID_PRINT(jmap->job), ORTE_VPID_PRINT(jmap->num_procs));

    for (i=0; i < jmap->pmap.size; i++) {
        if (NULL == (pmap = (orte_pmap_t*)opal_pointer_array_get_item(&jmap->pmap, i))) {
            continue;
        }
        opal_output(orte_clean_output, "\tnode %d local_rank %d node_rank %d",
                    pmap->node, (int)pmap->local_rank, (int)pmap->node_rank);
    }
    opal_output(orte_clean_output, "\n");
}

void orte_jobmap_dump(void)
{
    int i;
    orte_jmap_t *jmap;

    opal_output(orte_clean_output, "*** DUMP OF JOBMAP ***");
    for (i=0; i < orte_jobmap.size; i++) {
        if (NULL == (jmap = (orte_jmap_t*)opal_pointer_array_get_item(&orte_jobmap, i))) {
            continue;
        }
        orte_jmap_dump(jmap);
    }
    opal_output(orte_clean_output, "\n\n");
}
|