fix for #1130 - adds support for multi-rail configurations
This commit was SVN r17152.
Этот коммит содержится в:
родитель
601fb4389d
Коммит
5f884b1ca4
@ -12,7 +12,7 @@
|
|||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
* Copyright (c) 2007-2008 Sun Microsystems, Inc. All rights reserved.
|
||||||
*
|
*
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
@ -48,6 +48,7 @@ static int udapl_dereg_mr(void *reg_data, mca_mpool_base_registration_t *reg);
|
|||||||
static int mca_btl_udapl_set_peer_parameters(
|
static int mca_btl_udapl_set_peer_parameters(
|
||||||
struct mca_btl_udapl_module_t* udapl_btl,
|
struct mca_btl_udapl_module_t* udapl_btl,
|
||||||
size_t nprocs);
|
size_t nprocs);
|
||||||
|
static int mca_btl_udapl_assign_netmask(mca_btl_udapl_module_t* udapl_btl);
|
||||||
|
|
||||||
mca_btl_udapl_module_t mca_btl_udapl_module = {
|
mca_btl_udapl_module_t mca_btl_udapl_module = {
|
||||||
{
|
{
|
||||||
@ -189,6 +190,9 @@ mca_btl_udapl_init(DAT_NAME_PTR ia_name, mca_btl_udapl_module_t* btl)
|
|||||||
memcpy(&btl->udapl_addr.addr, (btl->udapl_ia_attr).ia_address_ptr,
|
memcpy(&btl->udapl_addr.addr, (btl->udapl_ia_attr).ia_address_ptr,
|
||||||
sizeof(DAT_SOCK_ADDR));
|
sizeof(DAT_SOCK_ADDR));
|
||||||
|
|
||||||
|
/* determine netmask */
|
||||||
|
mca_btl_udapl_assign_netmask(btl);
|
||||||
|
|
||||||
/* check evd qlen against adapter max */
|
/* check evd qlen against adapter max */
|
||||||
if (btl->udapl_dto_evd_qlen > (btl->udapl_ia_attr).max_evd_qlen) {
|
if (btl->udapl_dto_evd_qlen > (btl->udapl_ia_attr).max_evd_qlen) {
|
||||||
BTL_UDAPL_VERBOSE_HELP(VERBOSE_SHOW_HELP, ("help-mpi-btl-udapl.txt",
|
BTL_UDAPL_VERBOSE_HELP(VERBOSE_SHOW_HELP, ("help-mpi-btl-udapl.txt",
|
||||||
@ -415,6 +419,14 @@ int mca_btl_udapl_finalize(struct mca_btl_base_module_t* base_btl)
|
|||||||
OBJ_DESTRUCT(&udapl_btl->udapl_frag_control);
|
OBJ_DESTRUCT(&udapl_btl->udapl_frag_control);
|
||||||
OBJ_DESTRUCT(&udapl_btl->udapl_eager_rdma_lock);
|
OBJ_DESTRUCT(&udapl_btl->udapl_eager_rdma_lock);
|
||||||
|
|
||||||
|
/* destroy mpool */
|
||||||
|
if (OMPI_SUCCESS !=
|
||||||
|
mca_mpool_base_module_destroy(udapl_btl->super.btl_mpool)) {
|
||||||
|
BTL_UDAPL_VERBOSE_OUTPUT(VERBOSE_INFORM,
|
||||||
|
("WARNING: Failed to release mpool"));
|
||||||
|
return OMPI_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
free(udapl_btl);
|
free(udapl_btl);
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
@ -635,6 +647,97 @@ static int mca_btl_udapl_set_peer_parameters(
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Find and assign system netmask for the address of the uDAPL BTL
|
||||||
|
* module, but only if udapl_if_mask has not been set by the "--mca
|
||||||
|
* btl_udapl_if_mask" parameter. This routine will either find
|
||||||
|
* the system netmask or set the value to 0.
|
||||||
|
*
|
||||||
|
* @param udapl_btl (IN) BTL module
|
||||||
|
*
|
||||||
|
* @return OMPI_SUCCESS or OMPI_ERROR
|
||||||
|
*/
|
||||||
|
static int mca_btl_udapl_assign_netmask(mca_btl_udapl_module_t* udapl_btl)
|
||||||
|
{
|
||||||
|
struct sockaddr *saddr;
|
||||||
|
struct sockaddr_in *btl_addr;
|
||||||
|
char btl_addr_string[INET_ADDRSTRLEN];
|
||||||
|
char btl_ifname[INET_ADDRSTRLEN];
|
||||||
|
|
||||||
|
/* Setting if_mask to 0 informs future steps to assume all
|
||||||
|
* addresses are reachable.
|
||||||
|
*/
|
||||||
|
udapl_btl->udapl_if_mask = 0;
|
||||||
|
|
||||||
|
if (mca_btl_udapl_component.udapl_compare_subnet) {
|
||||||
|
/* go get system netmask value */
|
||||||
|
|
||||||
|
/* use generic address to find address family */
|
||||||
|
saddr = (struct sockaddr *)&(udapl_btl->udapl_addr.addr);
|
||||||
|
|
||||||
|
if (saddr->sa_family == AF_INET) {
|
||||||
|
|
||||||
|
btl_addr = (struct sockaddr_in *)saddr;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Retrieve the netmask of the udapl btl address. To
|
||||||
|
* accomplish this requires 4 steps and the use of an opal
|
||||||
|
* utility. This same utility is used by the tcp oob.
|
||||||
|
* Steps:
|
||||||
|
* 1. Get string value of known udapl btl module address.
|
||||||
|
* 2. Use string value to find the interface name of address.
|
||||||
|
* 3. Use interface name to find its index.
|
||||||
|
* 4. From the index get the netmask.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* retrieve string value of udapl btl address */
|
||||||
|
inet_ntop(AF_INET, (void *) &btl_addr->sin_addr,
|
||||||
|
btl_addr_string, INET_ADDRSTRLEN);
|
||||||
|
|
||||||
|
/* use address string to retrieve associated interface name */
|
||||||
|
if (OPAL_SUCCESS !=
|
||||||
|
opal_ifaddrtoname(btl_addr_string,
|
||||||
|
btl_ifname, INET_ADDRSTRLEN)) {
|
||||||
|
|
||||||
|
BTL_UDAPL_VERBOSE_HELP(VERBOSE_SHOW_HELP,
|
||||||
|
("help-mpi-btl-udapl.txt", "interface not found",
|
||||||
|
true, orte_system_info.nodename, btl_addr_string));
|
||||||
|
|
||||||
|
return OMPI_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* use interface name to retrieve index; then
|
||||||
|
* use index to retrieve udapl btl address netmask
|
||||||
|
*/
|
||||||
|
if (OPAL_SUCCESS !=
|
||||||
|
opal_ifindextomask(opal_ifnametoindex(btl_ifname),
|
||||||
|
&(udapl_btl->udapl_if_mask), sizeof(udapl_btl->udapl_if_mask))) {
|
||||||
|
|
||||||
|
BTL_UDAPL_VERBOSE_HELP(VERBOSE_SHOW_HELP,
|
||||||
|
("help-mpi-btl-udapl.txt", "netmask not found",
|
||||||
|
true, orte_system_info.nodename, btl_addr_string));
|
||||||
|
|
||||||
|
return OMPI_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* report if_mask used by address */
|
||||||
|
BTL_UDAPL_VERBOSE_OUTPUT(VERBOSE_INFORM,
|
||||||
|
("uDAPL BTL address %s : if_mask = %d",
|
||||||
|
btl_addr_string, udapl_btl->udapl_if_mask));
|
||||||
|
|
||||||
|
} else {
|
||||||
|
/* current uDAPL BTL does not support IPv6 */
|
||||||
|
BTL_UDAPL_VERBOSE_HELP(VERBOSE_SHOW_HELP,
|
||||||
|
("help-mpi-btl-udapl.txt", "IPv4 only",
|
||||||
|
true, orte_system_info.nodename));
|
||||||
|
|
||||||
|
return OMPI_ERROR;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return OMPI_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
@ -662,12 +765,6 @@ int mca_btl_udapl_add_procs(
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* Check to make sure that the peer has at least as many interface
|
|
||||||
* addresses exported as we are trying to use. If not, then
|
|
||||||
* don't bind this BTL instance to the proc.
|
|
||||||
*/
|
|
||||||
|
|
||||||
OPAL_THREAD_LOCK(&udapl_proc->proc_lock);
|
OPAL_THREAD_LOCK(&udapl_proc->proc_lock);
|
||||||
|
|
||||||
/* The btl_proc datastructure is shared by all uDAPL BTL
|
/* The btl_proc datastructure is shared by all uDAPL BTL
|
||||||
|
@ -10,7 +10,7 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
* Copyright (c) 2007-2008 Sun Microsystems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||||
*
|
*
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
@ -88,6 +88,7 @@ struct mca_btl_udapl_component_t {
|
|||||||
opal_list_t udapl_procs; /**< list of udapl proc structures */
|
opal_list_t udapl_procs; /**< list of udapl proc structures */
|
||||||
opal_mutex_t udapl_lock; /**< lock for accessing module state */
|
opal_mutex_t udapl_lock; /**< lock for accessing module state */
|
||||||
char* udapl_mpool_name; /**< name of memory pool */
|
char* udapl_mpool_name; /**< name of memory pool */
|
||||||
|
int32_t udapl_compare_subnet;/**< whether to compare with netmask or not */
|
||||||
char *if_include;
|
char *if_include;
|
||||||
char **if_include_list;
|
char **if_include_list;
|
||||||
char *if_exclude;
|
char *if_exclude;
|
||||||
@ -149,6 +150,7 @@ struct mca_btl_udapl_module_t {
|
|||||||
int udapl_max_recv_dtos; /**< maximum number of outstanding consumer
|
int udapl_max_recv_dtos; /**< maximum number of outstanding consumer
|
||||||
submitted recv operations, see section
|
submitted recv operations, see section
|
||||||
6.6.6 of uDAPL Spec */
|
6.6.6 of uDAPL Spec */
|
||||||
|
uint32_t udapl_if_mask; /**< netmask value btl module */
|
||||||
};
|
};
|
||||||
typedef struct mca_btl_udapl_module_t mca_btl_udapl_module_t;
|
typedef struct mca_btl_udapl_module_t mca_btl_udapl_module_t;
|
||||||
extern mca_btl_udapl_module_t mca_btl_udapl_module;
|
extern mca_btl_udapl_module_t mca_btl_udapl_module;
|
||||||
|
@ -261,6 +261,8 @@ mca_btl_udapl_modex_send(void)
|
|||||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
memset(addrs, 0, size);
|
||||||
|
|
||||||
for (i = 0; i < mca_btl_udapl_component.udapl_num_btls; i++) {
|
for (i = 0; i < mca_btl_udapl_component.udapl_num_btls; i++) {
|
||||||
mca_btl_udapl_module_t* btl = mca_btl_udapl_component.udapl_btls[i];
|
mca_btl_udapl_module_t* btl = mca_btl_udapl_component.udapl_btls[i];
|
||||||
addrs[i] = btl->udapl_addr;
|
addrs[i] = btl->udapl_addr;
|
||||||
|
@ -254,7 +254,6 @@ int mca_btl_udapl_endpoint_send(mca_btl_base_endpoint_t* endpoint,
|
|||||||
} else {
|
} else {
|
||||||
assert(frag->size ==
|
assert(frag->size ==
|
||||||
mca_btl_udapl_component.udapl_max_frag_size);
|
mca_btl_udapl_component.udapl_max_frag_size);
|
||||||
OPAL_THREAD_ADD32(&endpoint->endpoint_max_sends, -1);
|
|
||||||
opal_list_append(&endpoint->endpoint_max_frags,
|
opal_list_append(&endpoint->endpoint_max_frags,
|
||||||
(opal_list_item_t*)frag);
|
(opal_list_item_t*)frag);
|
||||||
}
|
}
|
||||||
@ -540,9 +539,11 @@ void mca_btl_udapl_endpoint_recv(int status, orte_process_name_t* endpoint,
|
|||||||
for(i = 0; i < proc->proc_endpoint_count; i++) {
|
for(i = 0; i < proc->proc_endpoint_count; i++) {
|
||||||
ep = proc->proc_endpoints[i];
|
ep = proc->proc_endpoints[i];
|
||||||
|
|
||||||
/* Does this endpoint match? */
|
/* Does this endpoint match? Only compare the address
|
||||||
|
* portion of mca_btl_udapl_addr_t.
|
||||||
|
*/
|
||||||
if(!memcmp(&addr, &ep->endpoint_addr,
|
if(!memcmp(&addr, &ep->endpoint_addr,
|
||||||
sizeof(mca_btl_udapl_addr_t))) {
|
(sizeof(DAT_CONN_QUAL) + sizeof(DAT_SOCK_ADDR)))) {
|
||||||
OPAL_THREAD_UNLOCK(&mca_btl_udapl_component.udapl_lock);
|
OPAL_THREAD_UNLOCK(&mca_btl_udapl_component.udapl_lock);
|
||||||
mca_btl_udapl_endpoint_connect(ep);
|
mca_btl_udapl_endpoint_connect(ep);
|
||||||
return;
|
return;
|
||||||
@ -805,11 +806,10 @@ static int mca_btl_udapl_endpoint_finish_max(mca_btl_udapl_endpoint_t* endpoint)
|
|||||||
opal_list_remove_first(&endpoint->endpoint_max_frags))) {
|
opal_list_remove_first(&endpoint->endpoint_max_frags))) {
|
||||||
cookie.as_ptr = frag;
|
cookie.as_ptr = frag;
|
||||||
|
|
||||||
assert(frag->triplet.virtual_address == (DAT_VADDR)frag->ftr);
|
|
||||||
assert(frag->triplet.segment_length ==
|
assert(frag->triplet.segment_length ==
|
||||||
frag->segment.seg_len + sizeof(mca_btl_udapl_footer_t));
|
frag->segment.seg_len + sizeof(mca_btl_udapl_footer_t));
|
||||||
assert(frag->size ==
|
assert(frag->size ==
|
||||||
mca_btl_udapl_component.udapl_eager_frag_size);
|
mca_btl_udapl_component.udapl_max_frag_size);
|
||||||
|
|
||||||
rc = dat_ep_post_send(endpoint->endpoint_max, 1,
|
rc = dat_ep_post_send(endpoint->endpoint_max, 1,
|
||||||
&frag->triplet, cookie, DAT_COMPLETION_DEFAULT_FLAG);
|
&frag->triplet, cookie, DAT_COMPLETION_DEFAULT_FLAG);
|
||||||
@ -947,10 +947,14 @@ static void mca_btl_udapl_endpoint_destruct(mca_btl_base_endpoint_t* endpoint)
|
|||||||
OBJ_DESTRUCT(&endpoint->endpoint_lock);
|
OBJ_DESTRUCT(&endpoint->endpoint_lock);
|
||||||
|
|
||||||
/* release eager rdma resources */
|
/* release eager rdma resources */
|
||||||
udapl_btl->super.btl_mpool->mpool_free(udapl_btl->super.btl_mpool,
|
if (NULL != reg) {
|
||||||
NULL,
|
udapl_btl->super.btl_mpool->mpool_free(udapl_btl->super.btl_mpool,
|
||||||
reg);
|
NULL, reg);
|
||||||
free(endpoint->endpoint_eager_rdma_local.base.pval);
|
}
|
||||||
|
|
||||||
|
if (NULL != endpoint->endpoint_eager_rdma_local.base.pval) {
|
||||||
|
free(endpoint->endpoint_eager_rdma_local.base.pval);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -11,7 +11,7 @@
|
|||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
|
* Copyright (c) 2006-2008 Sun Microsystems, Inc. All rights reserved.
|
||||||
*
|
*
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
@ -46,6 +46,7 @@ extern "C" {
|
|||||||
struct mca_btl_udapl_addr_t {
|
struct mca_btl_udapl_addr_t {
|
||||||
DAT_CONN_QUAL port;
|
DAT_CONN_QUAL port;
|
||||||
DAT_SOCK_ADDR addr;
|
DAT_SOCK_ADDR addr;
|
||||||
|
bool inuse;
|
||||||
};
|
};
|
||||||
typedef struct mca_btl_udapl_addr_t mca_btl_udapl_addr_t;
|
typedef struct mca_btl_udapl_addr_t mca_btl_udapl_addr_t;
|
||||||
|
|
||||||
|
@ -10,7 +10,7 @@
|
|||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
* Copyright (c) 2007-2008 Sun Microsystems, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -242,6 +242,15 @@ int mca_btl_udapl_register_mca_params(void)
|
|||||||
&(mca_btl_udapl_component.udapl_verbosity),
|
&(mca_btl_udapl_component.udapl_verbosity),
|
||||||
REGINT_NEG_ONE_OK), tmp_rc, rc);
|
REGINT_NEG_ONE_OK), tmp_rc, rc);
|
||||||
|
|
||||||
|
CHECK_PARAM_REGISTER_RETURN_VALUE(mca_btl_udapl_reg_int("compare_subnet",
|
||||||
|
"By default uDAPL BTL will compare subnets using netmask to "
|
||||||
|
"determine if an interface is reachable. Setting this parameter to "
|
||||||
|
"0 will essentially turn this comparison off and the uDAPL BTL will "
|
||||||
|
"assume all uDAPL interfaces are reachable (0 or 1, default==1).",
|
||||||
|
1,
|
||||||
|
&(mca_btl_udapl_component.udapl_compare_subnet),
|
||||||
|
REGINT_GE_ZERO), tmp_rc, rc);
|
||||||
|
|
||||||
/* register uDAPL module parameters */
|
/* register uDAPL module parameters */
|
||||||
CHECK_PARAM_REGISTER_RETURN_VALUE(mca_btl_udapl_reg_int("async_evd_qlen",
|
CHECK_PARAM_REGISTER_RETURN_VALUE(mca_btl_udapl_reg_int("async_evd_qlen",
|
||||||
"The asynchronous event dispatcher queue length.",
|
"The asynchronous event dispatcher queue length.",
|
||||||
|
@ -11,7 +11,7 @@
|
|||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -22,8 +22,9 @@
|
|||||||
#include "ompi_config.h"
|
#include "ompi_config.h"
|
||||||
|
|
||||||
#include "opal/class/opal_hash_table.h"
|
#include "opal/class/opal_hash_table.h"
|
||||||
|
#include "opal/util/show_help.h"
|
||||||
#include "ompi/runtime/ompi_module_exchange.h"
|
#include "ompi/runtime/ompi_module_exchange.h"
|
||||||
|
#include "opal/util/net.h"
|
||||||
#include "btl_udapl.h"
|
#include "btl_udapl.h"
|
||||||
#include "btl_udapl_endpoint.h"
|
#include "btl_udapl_endpoint.h"
|
||||||
#include "btl_udapl_proc.h"
|
#include "btl_udapl_proc.h"
|
||||||
@ -160,6 +161,114 @@ mca_btl_udapl_proc_t* mca_btl_udapl_proc_create(ompi_proc_t* ompi_proc)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Find an address on the peer_process which matches stated criteria
|
||||||
|
* to the udapl btl module address information. Return in peer_addr_idx
|
||||||
|
* the index to the peer_process address that matches the btl module
|
||||||
|
* address. Where match criteria is:
|
||||||
|
* - the address in not already in use
|
||||||
|
* - compare addresses using netmask, the netmask value can be modified with
|
||||||
|
* "--mca btl_udapl_if_mask"
|
||||||
|
*
|
||||||
|
* Note: since this is called from mca_btl_udapl_proc_insert() it
|
||||||
|
* is assumed that the process lock is locked when entered.
|
||||||
|
*
|
||||||
|
* @param udapl_btl (IN) BTL module
|
||||||
|
* @param peer_process (IN) BTL peer process
|
||||||
|
* @param peer_addr_idx(IN/OUT) Index of address on peer_process
|
||||||
|
* which matches the udapl_btl address data.
|
||||||
|
* On success should be >= 0.
|
||||||
|
* @return OMPI_SUCCESS or error status on failure
|
||||||
|
*/
|
||||||
|
static int mca_btl_udapl_proc_address_match(
|
||||||
|
mca_btl_udapl_module_t* udapl_btl,
|
||||||
|
mca_btl_udapl_proc_t* peer_proc,
|
||||||
|
int* peer_addr_idx)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
struct sockaddr *saddr;
|
||||||
|
struct sockaddr_in *btl_addr;
|
||||||
|
struct sockaddr_in *peer_addr;
|
||||||
|
char btl_addr_string[INET_ADDRSTRLEN];
|
||||||
|
char peer_addr_string[INET_ADDRSTRLEN];
|
||||||
|
|
||||||
|
*peer_addr_idx = MCA_BTL_UDAPL_INVALID_PEER_ADDR_IDX;
|
||||||
|
|
||||||
|
/* use generic address to find address family */
|
||||||
|
saddr = (struct sockaddr *)&(udapl_btl->udapl_addr.addr);
|
||||||
|
|
||||||
|
if (saddr->sa_family == AF_INET) {
|
||||||
|
|
||||||
|
btl_addr = (struct sockaddr_in *)saddr;
|
||||||
|
|
||||||
|
/* Loop thru peer process addresses looking for match.
|
||||||
|
* Match criteria:
|
||||||
|
* - address should not be "inuse"
|
||||||
|
* - both udapl btl module and peer address should be on
|
||||||
|
* the same subnet (compare with if_mask value)
|
||||||
|
*/
|
||||||
|
for(i = 0; i < (int) peer_proc->proc_addr_count; i++) {
|
||||||
|
|
||||||
|
peer_addr =
|
||||||
|
(struct sockaddr_in *)&(peer_proc->proc_addrs[i].addr);
|
||||||
|
|
||||||
|
if (VERBOSE_INFORM <=
|
||||||
|
mca_btl_udapl_component.udapl_verbosity) {
|
||||||
|
|
||||||
|
/* retrieve udapl btl and peer address string for reporting */
|
||||||
|
inet_ntop(AF_INET, (void *) &btl_addr->sin_addr,
|
||||||
|
btl_addr_string, INET_ADDRSTRLEN);
|
||||||
|
inet_ntop(AF_INET, (void *) &peer_addr->sin_addr,
|
||||||
|
peer_addr_string, INET_ADDRSTRLEN);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((false == peer_proc->proc_addrs[i].inuse) &&
|
||||||
|
(opal_net_samenetwork((struct sockaddr *)btl_addr,
|
||||||
|
(struct sockaddr *)peer_addr, udapl_btl->udapl_if_mask))) {
|
||||||
|
|
||||||
|
/* capture index of remote address where match found */
|
||||||
|
*peer_addr_idx = i;
|
||||||
|
|
||||||
|
/* mark this address as now being used */
|
||||||
|
peer_proc->proc_addrs[i].inuse = true;
|
||||||
|
|
||||||
|
/* report what address was found to match */
|
||||||
|
BTL_UDAPL_VERBOSE_OUTPUT(VERBOSE_INFORM,
|
||||||
|
("uDAPL BTL module(%s) matched %s",
|
||||||
|
btl_addr_string, peer_addr_string));
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
/* peer address already used by another udapl btl
|
||||||
|
* module or netmask check not successful so skip
|
||||||
|
*/
|
||||||
|
BTL_UDAPL_VERBOSE_OUTPUT(VERBOSE_INFORM,
|
||||||
|
("uDAPL BTL module(%s) either skipped because it "
|
||||||
|
"is already in use or match criteria not successful "
|
||||||
|
"for peer address %s",
|
||||||
|
btl_addr_string, peer_addr_string));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
/* current uDAPL BTL only supports IPv4 */
|
||||||
|
BTL_UDAPL_VERBOSE_HELP(VERBOSE_SHOW_HELP,
|
||||||
|
("help-mpi-btl-udapl.txt", "IPv4 only",
|
||||||
|
true, orte_system_info.nodename));
|
||||||
|
return OMPI_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (MCA_BTL_UDAPL_INVALID_PEER_ADDR_IDX == *peer_addr_idx) {
|
||||||
|
BTL_UDAPL_VERBOSE_HELP(VERBOSE_SHOW_HELP,
|
||||||
|
("help-mpi-btl-udapl.txt", "no network match",
|
||||||
|
true, btl_addr_string, orte_system_info.nodename,
|
||||||
|
peer_proc->proc_ompi->proc_hostname));
|
||||||
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||||
|
}
|
||||||
|
|
||||||
|
return OMPI_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Note that this routine must be called with the lock on the process
|
* Note that this routine must be called with the lock on the process
|
||||||
* already held. Insert a btl instance into the proc array and assign
|
* already held. Insert a btl instance into the proc array and assign
|
||||||
@ -169,13 +278,32 @@ int mca_btl_udapl_proc_insert(
|
|||||||
mca_btl_udapl_proc_t* udapl_proc,
|
mca_btl_udapl_proc_t* udapl_proc,
|
||||||
mca_btl_base_endpoint_t* udapl_endpoint)
|
mca_btl_base_endpoint_t* udapl_endpoint)
|
||||||
{
|
{
|
||||||
/* insert into endpoint array */
|
int peer_address_idx;
|
||||||
if(udapl_proc->proc_endpoint_count > udapl_proc->proc_addr_count)
|
mca_btl_udapl_module_t* udapl_btl = udapl_endpoint->endpoint_btl;
|
||||||
|
|
||||||
|
/* Check so as not to create more endpoints than addresses.
|
||||||
|
* Example: If one node has 3 btl modules and another only has 2,
|
||||||
|
* this check prevents the node with 3 btl modules from
|
||||||
|
* overloading the other, i.e. only 2 possible connections will
|
||||||
|
* be possible.
|
||||||
|
*/
|
||||||
|
if (udapl_proc->proc_endpoint_count > udapl_proc->proc_addr_count)
|
||||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||||
|
|
||||||
|
/* Find an endpoint on the udapl process of interest that matches
|
||||||
|
* the endpoint information of the current udapl btl module
|
||||||
|
*/
|
||||||
|
if (OMPI_SUCCESS !=
|
||||||
|
mca_btl_udapl_proc_address_match(udapl_btl, udapl_proc,
|
||||||
|
&peer_address_idx)) {
|
||||||
|
/* no address on peer proc met criteria */
|
||||||
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* insert into endpoint array */
|
||||||
udapl_endpoint->endpoint_proc = udapl_proc;
|
udapl_endpoint->endpoint_proc = udapl_proc;
|
||||||
udapl_endpoint->endpoint_addr =
|
udapl_endpoint->endpoint_addr =
|
||||||
udapl_proc->proc_addrs[udapl_proc->proc_endpoint_count];
|
udapl_proc->proc_addrs[peer_address_idx];
|
||||||
|
|
||||||
udapl_proc->proc_endpoints[udapl_proc->proc_endpoint_count] = udapl_endpoint;
|
udapl_proc->proc_endpoints[udapl_proc->proc_endpoint_count] = udapl_endpoint;
|
||||||
udapl_proc->proc_endpoint_count++;
|
udapl_proc->proc_endpoint_count++;
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -63,6 +64,8 @@ struct mca_btl_udapl_proc_t {
|
|||||||
typedef struct mca_btl_udapl_proc_t mca_btl_udapl_proc_t;
|
typedef struct mca_btl_udapl_proc_t mca_btl_udapl_proc_t;
|
||||||
OBJ_CLASS_DECLARATION(mca_btl_udapl_proc_t);
|
OBJ_CLASS_DECLARATION(mca_btl_udapl_proc_t);
|
||||||
|
|
||||||
|
/* Sentinel for "no matching peer address".  The index it marks is an
 * int whose valid values are >= 0, so the sentinel is negative: it fits
 * in an int and compares equal to itself after being stored in one
 * (4294967295 does neither).
 */
#define MCA_BTL_UDAPL_INVALID_PEER_ADDR_IDX (-1)
|
||||||
|
|
||||||
mca_btl_udapl_proc_t* mca_btl_udapl_proc_create(ompi_proc_t* ompi_proc);
|
mca_btl_udapl_proc_t* mca_btl_udapl_proc_create(ompi_proc_t* ompi_proc);
|
||||||
int mca_btl_udapl_proc_insert(mca_btl_udapl_proc_t*, mca_btl_base_endpoint_t*);
|
int mca_btl_udapl_proc_insert(mca_btl_udapl_proc_t*, mca_btl_base_endpoint_t*);
|
||||||
|
|
||||||
|
@ -10,7 +10,7 @@
|
|||||||
# University of Stuttgart. All rights reserved.
|
# University of Stuttgart. All rights reserved.
|
||||||
# Copyright (c) 2004-2006 The Regents of the University of California.
|
# Copyright (c) 2004-2006 The Regents of the University of California.
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
# Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
# Copyright (c) 2007-2008 Sun Microsystems, Inc. All rights reserved.
|
||||||
#
|
#
|
||||||
# $COPYRIGHT$
|
# $COPYRIGHT$
|
||||||
#
|
#
|
||||||
@ -21,81 +21,69 @@
|
|||||||
# This is the US/English general help file for Open MPI.
|
# This is the US/English general help file for Open MPI.
|
||||||
#
|
#
|
||||||
[invalid num rdma segments]
|
[invalid num rdma segments]
|
||||||
|
|
||||||
WARNING: MCA parameter [btl_udapl_eager_rdma_num = %d] is not valid.
|
WARNING: MCA parameter [btl_udapl_eager_rdma_num = %d] is not valid.
|
||||||
RDMA will not be used for short messages. Try setting to positive
|
RDMA will not be used for short messages. Try setting to positive
|
||||||
value, e.g. 16.
|
value, e.g. 16.
|
||||||
|
#
|
||||||
[use default endpoint params]
|
[use default endpoint params]
|
||||||
|
|
||||||
WARNING: Using default uDAPL endpoint parameters not those that
|
WARNING: Using default uDAPL endpoint parameters not those that
|
||||||
would have been modified by MCA parameters.
|
would have been modified by MCA parameters.
|
||||||
|
#
|
||||||
[optimal buffer alignment mismatch]
|
[optimal buffer alignment mismatch]
|
||||||
|
|
||||||
WARNING: DAT_OPTIMAL_ALIGNMENT = %d : BTL buffer_alignment = %d.
|
WARNING: DAT_OPTIMAL_ALIGNMENT = %d : BTL buffer_alignment = %d.
|
||||||
The BTL buffer_alignment value may not be optimal. If all nodes
|
The BTL buffer_alignment value may not be optimal. If all nodes
|
||||||
report the same DAT_OPTIMAL_ALIGNMENT value and this differs from
|
report the same DAT_OPTIMAL_ALIGNMENT value and this differs from
|
||||||
BTL buffer_alignment then setting "--mca btl_udapl_buffer_alignment
|
BTL buffer_alignment then setting "--mca btl_udapl_buffer_alignment
|
||||||
%d" may improve performance.
|
%d" may improve performance.
|
||||||
|
#
|
||||||
[max_recv_dtos too low]
|
[max_recv_dtos too low]
|
||||||
|
|
||||||
WARNING: The MCA parameter max_recv_dtos has been modified to a value,
|
WARNING: The MCA parameter max_recv_dtos has been modified to a value,
|
||||||
%d, that is insufficient. This value must be greater than or equal to
|
%d, that is insufficient. This value must be greater than or equal to
|
||||||
num_recvs, %d. The uDAPL BTL will adjust to allow the program to
|
num_recvs, %d. The uDAPL BTL will adjust to allow the program to
|
||||||
proceed.
|
proceed.
|
||||||
|
#
|
||||||
[max_request_dtos too low]
|
[max_request_dtos too low]
|
||||||
|
|
||||||
WARNING: The MCA parameter max_request_dtos has been modified to a
|
WARNING: The MCA parameter max_request_dtos has been modified to a
|
||||||
value, %d, which may not be sufficient. Try setting max_request_dtos
|
value, %d, which may not be sufficient. Try setting max_request_dtos
|
||||||
to %d if program fails.
|
to %d if program fails.
|
||||||
|
#
|
||||||
[max_recv_dtos system max]
|
[max_recv_dtos system max]
|
||||||
|
|
||||||
WARNING: The MCA parameter max_recv_dtos is trying to be set to,
|
WARNING: The MCA parameter max_recv_dtos is trying to be set to,
|
||||||
%d, which is larger than allowable so the value will be set to maximum
|
%d, which is larger than allowable so the value will be set to maximum
|
||||||
allowed, %d.
|
allowed, %d.
|
||||||
|
#
|
||||||
[max_request_dtos system max]
|
[max_request_dtos system max]
|
||||||
|
|
||||||
WARNING: The MCA parameter max_request_dtos is trying to be set to,
|
WARNING: The MCA parameter max_request_dtos is trying to be set to,
|
||||||
%d, which is larger than allowable so the value will be set to maximum
|
%d, which is larger than allowable so the value will be set to maximum
|
||||||
allowed, %d.
|
allowed, %d.
|
||||||
|
#
|
||||||
[evd_qlen adapter max]
|
[evd_qlen adapter max]
|
||||||
|
|
||||||
WARNING: The MCA parameter %s is trying to be set to %d,
|
WARNING: The MCA parameter %s is trying to be set to %d,
|
||||||
which is larger than allowable so the value will be set to maximum
|
which is larger than allowable so the value will be set to maximum
|
||||||
allowed, %d.
|
allowed, %d.
|
||||||
|
#
|
||||||
[evd_qlen too low]
|
[evd_qlen too low]
|
||||||
|
|
||||||
WARNING: The MCA parameter %s has been modified to a value,
|
WARNING: The MCA parameter %s has been modified to a value,
|
||||||
%d, which may not be sufficient. Try setting %s to %d if
|
%d, which may not be sufficient. Try setting %s to %d if
|
||||||
program fails.
|
program fails.
|
||||||
|
#
|
||||||
[connection timeout low]
|
[connection timeout low]
|
||||||
|
|
||||||
WARNING: The MCA parameter %s has been modified to a value,
|
WARNING: The MCA parameter %s has been modified to a value,
|
||||||
%d, which may not be sufficient. Try setting %s to %d if
|
%d, which may not be sufficient. Try setting %s to %d if
|
||||||
program fails.
|
program fails.
|
||||||
|
#
|
||||||
[dat_lmr_create DAT_INSUFFICIENT_RESOURCES]
|
[dat_lmr_create DAT_INSUFFICIENT_RESOURCES]
|
||||||
|
|
||||||
WARNING: The uDAPL BTL is not able to register memory. Possibly out of
|
WARNING: The uDAPL BTL is not able to register memory. Possibly out of
|
||||||
allowed privileged memory (i.e. memory that can be pinned). Increasing
|
allowed privileged memory (i.e. memory that can be pinned). Increasing
|
||||||
the allowed privileged memory may alleviate this issue.
|
the allowed privileged memory may alleviate this issue.
|
||||||
|
#
|
||||||
[dat_ia_open fail]
|
[dat_ia_open fail]
|
||||||
|
|
||||||
WARNING: Failed to open "%s" [%s:%s].
|
WARNING: Failed to open "%s" [%s:%s].
|
||||||
This may be a real error or it may be an invalid entry in the uDAPL
|
This may be a real error or it may be an invalid entry in the uDAPL
|
||||||
Registry which is contained in the dat.conf file. Contact your local
|
Registry which is contained in the dat.conf file. Contact your local
|
||||||
System Administrator to confirm the availability of the interfaces in
|
System Administrator to confirm the availability of the interfaces in
|
||||||
the dat.conf file.
|
the dat.conf file.
|
||||||
|
#
|
||||||
[specified include and exclude]
|
[specified include and exclude]
|
||||||
ERROR: You have specified both the btl_udapl_if_include and
|
ERROR: You have specified both the btl_udapl_if_include and
|
||||||
btl_udapl_if_exclude MCA parameters. These two parameters are
|
btl_udapl_if_exclude MCA parameters. These two parameters are
|
||||||
@ -105,7 +93,7 @@ For reference, the values that you specified are:
|
|||||||
|
|
||||||
btl_udapl_if_include: %s
|
btl_udapl_if_include: %s
|
||||||
btl_udapl_if_exclude: %s
|
btl_udapl_if_exclude: %s
|
||||||
|
#
|
||||||
[nonexistent entry]
|
[nonexistent entry]
|
||||||
WARNING: One or more nonexistent interfaces were specified:
|
WARNING: One or more nonexistent interfaces were specified:
|
||||||
|
|
||||||
@ -114,3 +102,23 @@ WARNING: One or more nonexistent interfaces were specified:
|
|||||||
Nonexistent entities: %s
|
Nonexistent entities: %s
|
||||||
|
|
||||||
These entities will be ignored.
|
These entities will be ignored.
|
||||||
|
#
|
||||||
|
[IPv4 only]
|
||||||
|
WARNING: uDAPL BTL only supports IPv4 addressing at this time.
|
||||||
|
Something other than an IPv4 address was detected on %s.
|
||||||
|
#
|
||||||
|
[no network match]
|
||||||
|
WARNING: Interface %s on node %s not able to find matching
|
||||||
|
interface on peer node %s. Could be that the interfaces are on
|
||||||
|
different subnets or there are fewer available uDAPL interfaces on peer.
|
||||||
|
#
|
||||||
|
[interface not found]
|
||||||
|
WARNING: Host %s, not able to determine interface name for
|
||||||
|
address %s. Will attempt to continue, assuming all addresses to
|
||||||
|
peer are reachable.
|
||||||
|
#
|
||||||
|
[netmask not found]
|
||||||
|
WARNING: Host %s, not able to determine netmask for address
|
||||||
|
%s. Will attempt to continue assuming all addresses to
|
||||||
|
peer are reachable.
|
||||||
|
#
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user