diff --git a/NEWS b/NEWS index 0d8fc06229..c3e289f2bf 100644 --- a/NEWS +++ b/NEWS @@ -62,6 +62,11 @@ Trunk (not on release branches yet) OPAL levels - intended for use when configuring without MPI support - Modified paffinity system to provide warning when bindings result in being "bound to all", which is equivalent to "not bound" +- Added btl_tcp_if_seq MCA parameter to select a different ethernet + interface for each MPI process on a node. This parameter is only + useful when used with virtual ethernet interfaces on a single + network card (e.g., when virtual interfaces give dedicated + hardware resources on the NIC to each process). 1.5.1 diff --git a/ompi/mca/btl/tcp/btl_tcp.h b/ompi/mca/btl/tcp/btl_tcp.h index 708341dd1d..8859977fa5 100644 --- a/ompi/mca/btl/tcp/btl_tcp.h +++ b/ompi/mca/btl/tcp/btl_tcp.h @@ -1,4 +1,3 @@ - /* * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology @@ -10,6 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -38,7 +38,7 @@ #include "ompi/class/ompi_free_list.h" #include "ompi/mca/btl/btl.h" #include "ompi/mca/btl/base/base.h" -#include "ompi/mca/mpool/mpool.h" +#include "ompi/mca/mpool/mpool.h" #include "ompi/mca/btl/btl.h" #include "opal/class/opal_hash_table.h" @@ -92,6 +92,10 @@ struct mca_btl_tcp_component_t { /* Do we want to use TCP_NODELAY? */ int tcp_use_nodelay; + + /* If btl_tcp_if_seq was specified, this is the one interface + (name) that we're supposed to use. 
*/ + char *tcp_if_seq; }; typedef struct mca_btl_tcp_component_t mca_btl_tcp_component_t; diff --git a/ompi/mca/btl/tcp/btl_tcp_component.c b/ompi/mca/btl/tcp/btl_tcp_component.c index 515262f80a..ce8384c3b2 100644 --- a/ompi/mca/btl/tcp/btl_tcp_component.c +++ b/ompi/mca/btl/tcp/btl_tcp_component.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2009 Oak Ridge National Laboratory * $COPYRIGHT$ @@ -48,19 +48,21 @@ #include #include -#include "ompi/constants.h" #include "opal/mca/event/event.h" #include "opal/util/if.h" #include "opal/util/output.h" #include "opal/util/argv.h" #include "opal/util/net.h" #include "opal/util/opal_sos.h" +#include "opal/mca/base/mca_base_param.h" #include "orte/types.h" #include "orte/util/show_help.h" +#include "orte/mca/ess/ess.h" +#include "ompi/constants.h" #include "ompi/mca/btl/btl.h" -#include "opal/mca/base/mca_base_param.h" +#include "ompi/mca/btl/base/base.h" #include "ompi/runtime/ompi_module_exchange.h" #include "ompi/mca/mpool/base/base.h" #include "ompi/mca/btl/base/btl_base_error.h" @@ -69,7 +71,6 @@ #include "btl_tcp_proc.h" #include "btl_tcp_frag.h" #include "btl_tcp_endpoint.h" -#include "ompi/mca/btl/base/base.h" mca_btl_tcp_component_t mca_btl_tcp_component = { @@ -281,6 +282,53 @@ int mca_btl_tcp_component_open(void) mca_btl_tcp_component.tcp_disable_family = mca_btl_tcp_param_register_int ("disable_family", NULL, 0); + /* Register a list of interfaces to use in sequence */ + message = mca_btl_tcp_param_register_string("if_seq", + "If specified, a comma-delimited list of TCP interfaces. Interfaces will be assigned, one to each MPI process, in a round-robin fashion on each server. 
For example, if the list is \"eth0,eth1\" and four MPI processes are run on a single server, then local ranks 0 and 2 will use eth0 and local ranks 1 and 3 will use eth1.", NULL); + mca_btl_tcp_component.tcp_if_seq = NULL; + if (NULL != message && '\0' != *message) { + char **argv = opal_argv_split(message, ','); + + if (NULL != argv && '\0' != *(argv[0])) { + int if_index, rc, count; + orte_node_rank_t node_rank; + char name[256]; + + node_rank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME); + + /* Now that we've got that local rank, take the + corresponding entry from the tcp_if_seq list (wrapping + if necessary) */ + count = opal_argv_count(argv); + mca_btl_tcp_component.tcp_if_seq = + strdup(argv[node_rank % count]); + opal_argv_free(argv); + + /* Double check that the selected interface actually exists */ + for (if_index = opal_ifbegin(); if_index >= 0; + if_index = opal_ifnext(if_index)){ + if (OPAL_SUCCESS != + (rc = opal_ifindextoname(if_index, name, sizeof(name)))) { + return rc; + } + if (0 == strcmp(name, mca_btl_tcp_component.tcp_if_seq)) { + break; + } + } + if (if_index < 0) { + orte_show_help("help-mpi-btl-tcp.txt", + "invalid if_inexclude", + true, "if_seq", + orte_process_info.nodename, + mca_btl_tcp_component.tcp_if_seq, + "Interface does not exist"); + return OMPI_ERR_BAD_PARAM; + } + BTL_VERBOSE(("Node rank %d using TCP interface %s", + node_rank, mca_btl_tcp_component.tcp_if_seq)); + } + } + return OMPI_SUCCESS; } @@ -302,6 +350,9 @@ int mca_btl_tcp_component_close(void) free(mca_btl_tcp_component.tcp_if_exclude); mca_btl_tcp_component.tcp_if_exclude = NULL; } + if (NULL != mca_btl_tcp_component.tcp_if_seq) { + free(mca_btl_tcp_component.tcp_if_seq); + } if (NULL != mca_btl_tcp_component.tcp_btls) free(mca_btl_tcp_component.tcp_btls); @@ -319,7 +370,6 @@ int mca_btl_tcp_component_close(void) } #endif - /* cleanup any pending events */ OPAL_THREAD_LOCK(&mca_btl_tcp_component.tcp_lock); for(item = 
opal_list_get_first(&mca_btl_tcp_component.tcp_events); @@ -546,14 +596,26 @@ static int mca_btl_tcp_component_create_instances(void) for(if_index = opal_ifbegin(); if_index >= 0; if_index = opal_ifnext(if_index)){ int index = opal_ifindextokindex (if_index); if (index > 0) { - bool already_seen = false; - for (j=0; (false == already_seen) && (j < kif_count); j++) { + bool want_this_if = true; + + /* Have we seen this if already? */ + for (j = 0; want_this_if && (j < kif_count); j++) { if (kindexes[j] == index) { - already_seen = true; + want_this_if = false; } } - if (false == already_seen) { + /* If we have an if_seq list, see if this is the one + interface that we're supposed to have */ + if (NULL != mca_btl_tcp_component.tcp_if_seq) { + char name[256]; + opal_ifindextoname(if_index, name, sizeof(name)); + if (0 != strcmp(mca_btl_tcp_component.tcp_if_seq, name)) { + want_this_if = false; + } + } + + if (want_this_if) { kindexes[kif_count] = index; kif_count++; }