From ec4a235e6a55af7aaa03d7cdc30853016509651d Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 1 Mar 2017 13:17:54 -0500 Subject: [PATCH] Allow a TCP proc release during the create. This is mostly for error cases, where we need to release the newly created proc. Currently the code deadlocks because the endpoint lock is help at the release and the lock is not recursive. Aslo added some code to print the IP addresses that don't match during the TCP connection step. Signed-off-by: George Bosilca --- opal/mca/btl/tcp/btl_tcp_proc.c | 46 ++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/opal/mca/btl/tcp/btl_tcp_proc.c b/opal/mca/btl/tcp/btl_tcp_proc.c index 60c807cffc..3b84ee22a3 100644 --- a/opal/mca/btl/tcp/btl_tcp_proc.c +++ b/opal/mca/btl/tcp/btl_tcp_proc.c @@ -125,16 +125,18 @@ mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc) return btl_proc; } - do { + do { /* This loop is only necessary so that we can break out of the serial code */ btl_proc = OBJ_NEW(mca_btl_tcp_proc_t); if(NULL == btl_proc) { rc = OPAL_ERR_OUT_OF_RESOURCE; break; } - btl_proc->proc_opal = proc; - - OBJ_RETAIN(btl_proc->proc_opal); + /* Retain the proc, but don't store the ref into the btl_proc just yet. This + * provides a way to release the btl_proc in case of failure without having to + * unlock the mutex. + */ + OBJ_RETAIN(proc); /* lookup tcp parameters exported by this proc */ OPAL_MODEX_RECV(rc, &mca_btl_tcp_component.super.btl_version, @@ -180,12 +182,14 @@ mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc) } while (0); if (OPAL_SUCCESS == rc) { + btl_proc->proc_opal = proc; /* link with the proc */ /* add to hash table of all proc instance. */ opal_proc_table_set_value(&mca_btl_tcp_component.tcp_procs, proc->proc_name, btl_proc); } else { if (btl_proc) { - OBJ_RELEASE(btl_proc); + OBJ_RELEASE(btl_proc); /* release the local proc */ + OBJ_RELEASE(proc); /* and the ref on the OMPI proc */ btl_proc = NULL; } } @@ -820,12 +824,36 @@ void mca_btl_tcp_proc_accept(mca_btl_tcp_proc_t* btl_proc, struct sockaddr* addr OPAL_THREAD_UNLOCK(&btl_proc->proc_lock); return; } - OPAL_THREAD_UNLOCK(&btl_proc->proc_lock); - opal_output(0, "btl: tcp: Incoming connection from %s does not match known addresses for peer %s. Drop !\n", - opal_net_get_hostname((struct sockaddr*)addr), - OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name)); /* No further use of this socket. Close it */ CLOSE_THE_SOCKET(sd); + { + size_t len = 1024; + char* addr_str = (char*)malloc(len); + if( NULL != addr_str ) { + memset(addr_str, 0, len); + for (size_t i = 0; i < btl_proc->proc_endpoint_count; i++) { + mca_btl_base_endpoint_t* btl_endpoint = btl_proc->proc_endpoints[i]; + if (btl_endpoint->endpoint_addr->addr_family != addr->sa_family) { + continue; + } + + if (addr_str[0] != '\0') { + strncat(addr_str, ", ", len); + len -= 2; + } + strncat(addr_str, inet_ntop(AF_INET6, (void*)(struct in6_addr*)&btl_endpoint->endpoint_addr->addr_inet, + addr_str + 1024 - len, INET6_ADDRSTRLEN), len); + len = 1024 - strlen(addr_str); + } + } + opal_output(0, "btl: tcp: Incoming connection from %s does not match known addresses for peer %s [hostname=%s addr=%s]. Drop !\n", + opal_net_get_hostname((struct sockaddr*)addr), + OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name), + btl_proc->proc_opal->proc_hostname, + addr_str); + free(addr_str); + } + OPAL_THREAD_UNLOCK(&btl_proc->proc_lock); } /*