diff --git a/opal/mca/btl/tcp/btl_tcp.c b/opal/mca/btl/tcp/btl_tcp.c index c14d655f9b..87e5b0ef15 100644 --- a/opal/mca/btl/tcp/btl_tcp.c +++ b/opal/mca/btl/tcp/btl_tcp.c @@ -10,7 +10,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2015 Los Alamos National Security, LLC. All rights * reserved. * * $COPYRIGHT$ @@ -72,6 +72,7 @@ int mca_btl_tcp_add_procs( struct mca_btl_base_module_t* btl, struct opal_proc_t* opal_proc = procs[i]; mca_btl_tcp_proc_t* tcp_proc; mca_btl_base_endpoint_t* tcp_endpoint; + bool existing_found = false; /* Do not create loopback TCP connections */ if( my_proc == opal_proc ) { @@ -90,28 +91,43 @@ int mca_btl_tcp_add_procs( struct mca_btl_base_module_t* btl, OPAL_THREAD_LOCK(&tcp_proc->proc_lock); - /* The btl_proc datastructure is shared by all TCP BTL - * instances that are trying to reach this destination. - * Cache the peer instance on the btl_proc. - */ - tcp_endpoint = OBJ_NEW(mca_btl_tcp_endpoint_t); - if(NULL == tcp_endpoint) { - OPAL_THREAD_UNLOCK(&tcp_proc->proc_lock); - return OPAL_ERR_OUT_OF_RESOURCE; + for (int j = 0 ; j < tcp_proc->proc_endpoint_count ; ++j) { + tcp_endpoint = tcp_proc->proc_endpoints[j]; + if (tcp_endpoint->endpoint_btl == tcp_btl) { + existing_found = true; + break; + } } - tcp_endpoint->endpoint_btl = tcp_btl; - rc = mca_btl_tcp_proc_insert(tcp_proc, tcp_endpoint); - if(rc != OPAL_SUCCESS) { - OPAL_THREAD_UNLOCK(&tcp_proc->proc_lock); - OBJ_RELEASE(tcp_endpoint); - continue; + if (!existing_found) { + /* The btl_proc datastructure is shared by all TCP BTL + * instances that are trying to reach this destination. + * Cache the peer instance on the btl_proc. 
+ */ + tcp_endpoint = OBJ_NEW(mca_btl_tcp_endpoint_t); + if(NULL == tcp_endpoint) { + OPAL_THREAD_UNLOCK(&tcp_proc->proc_lock); + return OPAL_ERR_OUT_OF_RESOURCE; + } + + tcp_endpoint->endpoint_btl = tcp_btl; + rc = mca_btl_tcp_proc_insert(tcp_proc, tcp_endpoint); + if(rc != OPAL_SUCCESS) { + OPAL_THREAD_UNLOCK(&tcp_proc->proc_lock); + OBJ_RELEASE(tcp_endpoint); + continue; + } + + opal_list_append(&tcp_btl->tcp_endpoints, (opal_list_item_t*)tcp_endpoint); } - opal_bitmap_set_bit(reachable, i); OPAL_THREAD_UNLOCK(&tcp_proc->proc_lock); + + if (NULL != reachable) { + opal_bitmap_set_bit(reachable, i); + } + peers[i] = tcp_endpoint; - opal_list_append(&tcp_btl->tcp_endpoints, (opal_list_item_t*)tcp_endpoint); /* we increase the count of MPI users of the event library once per peer, so that we are used until we aren't diff --git a/opal/mca/btl/tcp/btl_tcp_proc.c b/opal/mca/btl/tcp/btl_tcp_proc.c index c86977dde3..c0d3399fb8 100644 --- a/opal/mca/btl/tcp/btl_tcp_proc.c +++ b/opal/mca/btl/tcp/btl_tcp_proc.c @@ -14,7 +14,9 @@ * Copyright (c) 2013-2015 Intel, Inc. All rights reserved * Copyright (c) 2014-2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -738,6 +740,31 @@ mca_btl_tcp_proc_t* mca_btl_tcp_proc_lookup(const opal_process_name_t *name) opal_proc_table_get_value(&mca_btl_tcp_component.tcp_procs, *name, (void**)&proc); OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock); + if (OPAL_UNLIKELY(NULL == proc)) { + mca_btl_base_endpoint_t *endpoint; + opal_proc_t *opal_proc; + int rc; + + BTL_VERBOSE(("adding tcp proc for unknown peer {.jobid = 0x%x, .vpid = 0x%x}", + name->jobid, name->vpid)); + + opal_proc = opal_proc_for_name (*name); + if (NULL == opal_proc) { + return NULL; + } + + /* try adding this proc to every btl; the proc is taken from the first endpoint returned */ + for (int i = 0 ; i < mca_btl_tcp_component.tcp_num_btls ; ++i) { + endpoint = NULL; + (void) mca_btl_tcp_add_procs (&mca_btl_tcp_component.tcp_btls[i]->super, 1, &opal_proc, + &endpoint, NULL); + if (NULL != endpoint && NULL == proc) { + /* get the proc and continue on (could probably just break here) */ + proc = endpoint->endpoint_proc; + } + } + } + return proc; }