diff --git a/opal/mca/btl/openib/btl_openib.c b/opal/mca/btl/openib/btl_openib.c index 1f048408c2..f03bd70576 100644 --- a/opal/mca/btl/openib/btl_openib.c +++ b/opal/mca/btl/openib/btl_openib.c @@ -798,16 +798,14 @@ static int prepare_device_for_use (mca_btl_openib_device_t *device) static int init_ib_proc_nolock(mca_btl_openib_module_t* openib_btl, mca_btl_openib_proc_t* ib_proc, mca_btl_base_endpoint_t **endpoint_ptr, - int local_port_cnt, int btl_rank, bool *is_reachable) + int local_port_cnt, int btl_rank) { int rem_port_cnt, matching_port = -1, j, rc; mca_btl_base_endpoint_t *endpoint; opal_btl_openib_connect_base_module_t *local_cpc; opal_btl_openib_connect_base_module_data_t *remote_cpc_data; - *endpoint_ptr = NULL; - *is_reachable = false; /* check if the remote proc has any ports that: - on the same subnet as the local proc, and @@ -939,9 +937,7 @@ static int init_ib_proc_nolock(mca_btl_openib_module_t* openib_btl, mca_btl_open } } - *is_reachable = true; *endpoint_ptr = endpoint; - return OPAL_SUCCESS; } @@ -1005,7 +1001,6 @@ int mca_btl_openib_add_procs( struct opal_proc_t* proc = procs[i]; mca_btl_openib_proc_t* ib_proc; bool found_existing = false; - bool is_reachable; bool is_new; opal_output(-1, "add procs: adding proc %d", i); @@ -1026,39 +1021,41 @@ int mca_btl_openib_add_procs( } #endif - if(NULL == (ib_proc = mca_btl_openib_proc_get_locked(proc))) { + if(NULL == (ib_proc = mca_btl_openib_proc_get_locked(proc, &is_new)) ) { /* if we don't have connection info for this process, it's * okay because some other method might be able to reach it, * so just mark it as unreachable by us */ continue; } - for (j = 0 ; j < (int) ib_proc->proc_endpoint_count ; ++j) { - endpoint = ib_proc->proc_endpoints[j]; - if (endpoint->endpoint_btl == openib_btl) { - found_existing = true; - break; + found_existing = false; + + if( !is_new ){ + for (j = 0 ; j < (int) ib_proc->proc_endpoint_count ; ++j) { + endpoint = ib_proc->proc_endpoints[j]; + if (endpoint->endpoint_btl == openib_btl) { + found_existing = true; + break; + } } } + if( !found_existing ) { + rc = init_ib_proc_nolock(openib_btl, ib_proc, &endpoint, + lcl_subnet_id_port_cnt, btl_rank); + if( OPAL_SUCCESS == rc ){ + found_existing = true; + } + } + OPAL_THREAD_UNLOCK( &ib_proc->proc_lock ); + if (found_existing) { if (reachable) { opal_bitmap_set_bit(reachable, i); } peers[i] = endpoint; - OPAL_THREAD_UNLOCK( &ib_proc->proc_lock ); - continue; } - rc = init_ib_proc_nolock(openib_btl, ib_proc, &endpoint, lcl_subnet_id_port_cnt, - btl_rank, &is_reachable); - OPAL_THREAD_UNLOCK( &ib_proc->proc_lock ); - if( OPAL_SUCCESS == rc ){ - peers[i] = endpoint; - if( is_reachable && NULL != reachable ){ - opal_bitmap_set_bit(reachable, i); - } - } } openib_btl->local_procs += local_procs; @@ -1070,11 +1067,11 @@ int mca_btl_openib_add_procs( struct mca_btl_base_endpoint_t *mca_btl_openib_get_ep (struct mca_btl_base_module_t *btl, struct opal_proc_t *proc) { mca_btl_openib_module_t *openib_btl = (mca_btl_openib_module_t *) btl; - mca_btl_base_endpoint_t *endpoint; + mca_btl_base_endpoint_t *endpoint = NULL; mca_btl_openib_proc_t *ib_proc; int j, rc; int local_port_cnt = 0, btl_rank; - bool is_reachable; + bool is_new; // TODO: shift to the separate function /* protect the device */ @@ -1083,34 +1080,35 @@ struct mca_btl_base_endpoint_t *mca_btl_openib_get_ep (struct mca_btl_base_modul if (OPAL_SUCCESS != rc) { BTL_ERROR(("could not prepare openib device for use")); OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock); - return rc; + return NULL; } rc = mca_btl_openib_size_queues(openib_btl, 1); if (OPAL_SUCCESS != rc) { BTL_ERROR(("error creating cqs")); OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock); - return rc; + return NULL; } OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock); - if (NULL == (ib_proc = mca_btl_openib_proc_get_locked(proc))) { + if (NULL == (ib_proc = mca_btl_openib_proc_get_locked(proc, &is_new))) { /* if we don't have connection info for this process, it's * okay because some other method might be able to reach it, * so just mark it as unreachable by us */ return NULL; } - - for (size_t j = 0 ; j < ib_proc->proc_endpoint_count ; ++j) { - endpoint = ib_proc->proc_endpoints[j]; - if (endpoint->endpoint_btl == openib_btl) { - OPAL_THREAD_UNLOCK(&ib_proc->proc_lock); - return endpoint; + if( !is_new ){ + for (size_t j = 0 ; j < ib_proc->proc_endpoint_count ; ++j) { + endpoint = ib_proc->proc_endpoints[j]; + if (endpoint->endpoint_btl == openib_btl) { + goto exit; + } } } + endpoint = NULL; for(j=0; j < mca_btl_openib_component.ib_num_btls; j++){ if(mca_btl_openib_component.openib_btls[j]->port_info.subnet_id == openib_btl->port_info.subnet_id) { @@ -1122,7 +1120,9 @@ struct mca_btl_base_endpoint_t *mca_btl_openib_get_ep (struct mca_btl_base_modul } (void)init_ib_proc_nolock(openib_btl, ib_proc, &endpoint, - local_port_cnt, btl_rank, &is_reachable); + local_port_cnt, btl_rank); +exit: + OPAL_THREAD_UNLOCK(&ib_proc->proc_lock); return endpoint; } diff --git a/opal/mca/btl/openib/btl_openib_proc.c b/opal/mca/btl/openib/btl_openib_proc.c index fce5d5347c..ff84c4820a 100644 --- a/opal/mca/btl/openib/btl_openib_proc.c +++ b/opal/mca/btl/openib/btl_openib_proc.c @@ -123,7 +123,7 @@ static void inline unpack8(char **src, uint8_t *value) * associated w/ a given destination on this datastructure. */ -mca_btl_openib_proc_t* mca_btl_openib_proc_get_locked(opal_proc_t* proc) +mca_btl_openib_proc_t* mca_btl_openib_proc_get_locked(opal_proc_t* proc, bool *is_new) { mca_btl_openib_proc_t *ib_proc = NULL, *ib_proc_ret = NULL; size_t msg_size; @@ -133,7 +133,7 @@ mca_btl_openib_proc_t* mca_btl_openib_proc_get_locked(opal_proc_t* proc) char *offset; int modex_message_size; mca_btl_openib_modex_message_t dummy; - bool found = false; + *is_new = false; /* Check if we have already created a IB proc * structure for this ompi process */ @@ -273,17 +273,16 @@ mca_btl_openib_proc_t* mca_btl_openib_proc_get_locked(opal_proc_t* proc) OPAL_THREAD_LOCK(&ib_proc->proc_lock); opal_list_append(&mca_btl_openib_component.ib_procs, &ib_proc->super); ib_proc_ret = ib_proc; - found = true; + *is_new = true; } else { /* otherwise - release module_proc */ OBJ_RELEASE(ib_proc); - found = false; } OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock); /* if we haven't insert the process - lock it here so we * won't lock mca_btl_openib_component.ib_lock */ - if( !found ){ + if( !(*is_new) ){ OPAL_THREAD_LOCK(&ib_proc_ret->proc_lock); } diff --git a/opal/mca/btl/openib/btl_openib_proc.h b/opal/mca/btl/openib/btl_openib_proc.h index 4088ba95b1..47a3bda7d8 100644 --- a/opal/mca/btl/openib/btl_openib_proc.h +++ b/opal/mca/btl/openib/btl_openib_proc.h @@ -84,7 +84,7 @@ typedef struct mca_btl_openib_proc_t mca_btl_openib_proc_t; OBJ_CLASS_DECLARATION(mca_btl_openib_proc_t); -mca_btl_openib_proc_t* mca_btl_openib_proc_get_locked(opal_proc_t* proc); +mca_btl_openib_proc_t* mca_btl_openib_proc_get_locked(opal_proc_t* proc, bool *is_new); int mca_btl_openib_proc_insert(mca_btl_openib_proc_t*, mca_btl_base_endpoint_t*); int mca_btl_openib_proc_remove(opal_proc_t* proc, mca_btl_base_endpoint_t* module_endpoint);