diff --git a/ompi/mca/ptl/mx/ptl_mx.c b/ompi/mca/ptl/mx/ptl_mx.c index d8cb404902..691680de9e 100644 --- a/ompi/mca/ptl/mx/ptl_mx.c +++ b/ompi/mca/ptl/mx/ptl_mx.c @@ -287,14 +287,13 @@ int mca_ptl_mx_send( sendfrag->frag_send.frag_base.frag_size); /* start the fragment */ - mx_return = mx_isend( - mx_ptl->mx_endpoint, - segments, - sendfrag->frag_segment_count, - ptl_peer->peer_addr, - match.lval, - sendfrag, - &sendfrag->frag_request); + mx_return = mx_isend( mx_ptl->mx_endpoint, + segments, + sendfrag->frag_segment_count, + ptl_peer->peer_addr, + match.lval, + sendfrag, + &sendfrag->frag_request ); if(mx_return != MX_SUCCESS) { opal_output(0, "mca_ptl_mx_send: mx_isend() failed with return value=%d\n", mx_return); return OMPI_ERROR; diff --git a/ompi/mca/ptl/mx/ptl_mx.h b/ompi/mca/ptl/mx/ptl_mx.h index 149ef4982d..f059941511 100644 --- a/ompi/mca/ptl/mx/ptl_mx.h +++ b/ompi/mca/ptl/mx/ptl_mx.h @@ -42,7 +42,7 @@ struct mca_ptl_mx_component_t { uint32_t mx_filter; /**< filter assigned to application */ uint32_t mx_num_ptls; /**< number of MX NICs available to app */ uint32_t mx_max_ptls; /**< max number of MX NICs to use */ - struct mca_ptl_mx_module_t** mx_ptls; /**< array of available PTL moduless */ + struct mca_ptl_mx_module_t** mx_ptls; /**< array of available PTL modules */ ompi_free_list_t mx_send_frags; /**< free list of mx send fragments */ ompi_free_list_t mx_recv_frags; /**< free list of mx recv fragments */ opal_hash_table_t mx_procs; /**< hash table of procs */ @@ -133,9 +133,6 @@ extern int mca_ptl_mx_component_progress( struct mca_ptl_mx_module_t { mca_ptl_base_module_t super; /**< base PTL module interface */ opal_list_t mx_peers; /**< list of peers */ - uint64_t mx_nic_addr; /**< NIC MAC address */ - uint32_t mx_filter; /**< endpoint filter */ - uint32_t mx_endpoint_id; /**< endpoint ID */ bool mx_enabled; /**< flag to indicate if endpoint enabled */ mx_endpoint_t mx_endpoint; /**< endpoint */ mx_endpoint_addr_t mx_endpoint_addr; /**< endpoint address */ diff --git a/ompi/mca/ptl/mx/ptl_mx_component.c b/ompi/mca/ptl/mx/ptl_mx_component.c index 4528889524..459b5357e9 100644 --- a/ompi/mca/ptl/mx/ptl_mx_component.c +++ b/ompi/mca/ptl/mx/ptl_mx_component.c @@ -150,11 +150,11 @@ int mca_ptl_mx_component_close(void) #endif /* release resources */ - OBJ_DESTRUCT(&mca_ptl_mx_component.mx_lock); OBJ_DESTRUCT(&mca_ptl_mx_component.mx_send_frags); OBJ_DESTRUCT(&mca_ptl_mx_component.mx_recv_frags); OBJ_DESTRUCT(&mca_ptl_mx_component.mx_procs); OBJ_DESTRUCT(&mca_ptl_mx_component.mx_pending_acks); + OBJ_DESTRUCT(&mca_ptl_mx_component.mx_lock); return OMPI_SUCCESS; } @@ -171,44 +171,42 @@ mca_ptl_base_module_t** mca_ptl_mx_component_init( *num_ptls = 0; /* initialize objects */ - OBJ_CONSTRUCT(&mca_ptl_mx_component.mx_lock, opal_mutex_t); OBJ_CONSTRUCT(&mca_ptl_mx_component.mx_send_frags, ompi_free_list_t); OBJ_CONSTRUCT(&mca_ptl_mx_component.mx_recv_frags, ompi_free_list_t); OBJ_CONSTRUCT(&mca_ptl_mx_component.mx_procs, opal_hash_table_t); OBJ_CONSTRUCT(&mca_ptl_mx_component.mx_pending_acks, opal_list_t); + OBJ_CONSTRUCT(&mca_ptl_mx_component.mx_lock, opal_mutex_t); - ompi_free_list_init(&mca_ptl_mx_component.mx_send_frags, - sizeof(mca_ptl_mx_send_frag_t), - OBJ_CLASS(mca_ptl_mx_send_frag_t), - mca_ptl_mx_component.mx_free_list_num, - mca_ptl_mx_component.mx_free_list_max, - mca_ptl_mx_component.mx_free_list_inc, - NULL); /* use default allocator */ + ompi_free_list_init( &mca_ptl_mx_component.mx_send_frags, + sizeof(mca_ptl_mx_send_frag_t), + OBJ_CLASS(mca_ptl_mx_send_frag_t), + mca_ptl_mx_component.mx_free_list_num, + mca_ptl_mx_component.mx_free_list_max, + mca_ptl_mx_component.mx_free_list_inc, + NULL ); /* use default allocator */ - ompi_free_list_init(&mca_ptl_mx_component.mx_recv_frags, - sizeof(mca_ptl_mx_recv_frag_t), - OBJ_CLASS(mca_ptl_mx_recv_frag_t), - mca_ptl_mx_component.mx_free_list_num, - mca_ptl_mx_component.mx_free_list_max, - mca_ptl_mx_component.mx_free_list_inc, - NULL); /* use default allocator */ + ompi_free_list_init( &mca_ptl_mx_component.mx_recv_frags, + sizeof(mca_ptl_mx_recv_frag_t), + OBJ_CLASS(mca_ptl_mx_recv_frag_t), + mca_ptl_mx_component.mx_free_list_num, + mca_ptl_mx_component.mx_free_list_max, + mca_ptl_mx_component.mx_free_list_inc, + NULL ); /* use default allocator */ /* intialize process hash table */ - opal_hash_table_init(&mca_ptl_mx_component.mx_procs, 256); + opal_hash_table_init( &mca_ptl_mx_component.mx_procs, 256 ); /* initialize mx ptls */ if(OMPI_SUCCESS != mca_ptl_mx_module_init()) return NULL; /* allocate and return a copy of the ptl array */ - ptls = malloc(mca_ptl_mx_component.mx_num_ptls * - sizeof(mca_ptl_base_module_t*)); + ptls = malloc( mca_ptl_mx_component.mx_num_ptls * sizeof(mca_ptl_base_module_t*) ); if(NULL == ptls) return NULL; - memcpy(ptls, - mca_ptl_mx_component.mx_ptls, - mca_ptl_mx_component.mx_num_ptls*sizeof(mca_ptl_mx_module_t*)); + memcpy( ptls, mca_ptl_mx_component.mx_ptls, + mca_ptl_mx_component.mx_num_ptls*sizeof(mca_ptl_mx_module_t*) ); *num_ptls = mca_ptl_mx_component.mx_num_ptls; return ptls; } diff --git a/ompi/mca/ptl/mx/ptl_mx_module.c b/ompi/mca/ptl/mx/ptl_mx_module.c index a84329d0b7..46e66489c7 100644 --- a/ompi/mca/ptl/mx/ptl_mx_module.c +++ b/ompi/mca/ptl/mx/ptl_mx_module.c @@ -40,7 +40,7 @@ int mca_ptl_mx_module_init(void) uint32_t i; int rc; uint64_t *nic_addrs; - mx_endpoint_addr_t *endpoint_addrs; + mca_ptl_mx_endpoint_t *endpoint_addrs; mx_return_t status; /* intialize MX library */ @@ -92,20 +92,19 @@ int mca_ptl_mx_module_init(void) free(nic_addrs); /* post local endpoint addresses */ - size = mca_ptl_mx_component.mx_num_ptls * sizeof(mx_endpoint_addr_t); - endpoint_addrs = (mx_endpoint_addr_t*)malloc(size); + size = mca_ptl_mx_component.mx_num_ptls * sizeof(mca_ptl_mx_endpoint_t); + endpoint_addrs = (mca_ptl_mx_endpoint_t*)malloc(size); if(NULL == endpoint_addrs) { opal_output(0, "mca_ptl_mx_module_init: malloc() failed\n"); return OMPI_ERR_OUT_OF_RESOURCE; } - for(i=0; imx_endpoint_addr; + mx_decompose_endpoint_addr( ptl->mx_endpoint_addr, + &(endpoint_addrs[i].nic_id), &(endpoint_addrs[i].endpoint_id) ); } if((rc = mca_base_modex_send( &mca_ptl_mx_component.super.ptlm_version, - endpoint_addrs, - mca_ptl_mx_component.mx_num_ptls * sizeof(mx_endpoint_addr_t))) - != OMPI_SUCCESS) + endpoint_addrs, size )) != OMPI_SUCCESS ) return rc; return OMPI_SUCCESS; } @@ -294,7 +293,7 @@ static mca_ptl_mx_module_t* mca_ptl_mx_create(uint64_t addr) mca_ptl_mx_finalize(&ptl->super); return NULL; } - + /* query the endpoint address */ if((status = mx_get_endpoint_addr( ptl->mx_endpoint, &ptl->mx_endpoint_addr)) != MX_SUCCESS) { @@ -303,23 +302,6 @@ static mca_ptl_mx_module_t* mca_ptl_mx_create(uint64_t addr) return NULL; } - /* breakup the endpoint address */ - if((status = mx_decompose_endpoint_addr( ptl->mx_endpoint_addr, - &ptl->mx_nic_addr, - &ptl->mx_endpoint_id)) != MX_SUCCESS) { - opal_output(0, "mca_ptl_mx_init: mx_decompose_endpoint_addr() failed with status=%d\n", status); - mca_ptl_mx_finalize(&ptl->super); - return NULL; - } - - if(mca_ptl_mx_component.mx_debug) { - opal_output(0, "mca_ptl_mx_create: opened %08X:%08X:%08X:%08X\n", - (uint32_t)(ptl->mx_nic_addr >> 32), - (uint32_t)ptl->mx_nic_addr, - ptl->mx_endpoint_id, - ptl->mx_filter); - } - /* prepost a receive buffer */ ptl->mx_recvs_posted = 1; MCA_PTL_MX_POST(ptl, MCA_PTL_HDR_TYPE_MATCH, sizeof(mca_ptl_base_match_header_t)); @@ -376,6 +358,7 @@ int mca_ptl_mx_finalize(struct mca_ptl_base_module_t* ptl) opal_thread_join(&mx_ptl->mx_thread, NULL); #endif mx_close_endpoint(mx_ptl->mx_endpoint); + mx_ptl->mx_endpoint = NULL; free(mx_ptl); return OMPI_SUCCESS; } diff --git a/ompi/mca/ptl/mx/ptl_mx_proc.c b/ompi/mca/ptl/mx/ptl_mx_proc.c index df530b1e4f..3fa3810184 100644 --- a/ompi/mca/ptl/mx/ptl_mx_proc.c +++ b/ompi/mca/ptl/mx/ptl_mx_proc.c @@ -114,11 +114,11 @@ mca_ptl_mx_proc_t* mca_ptl_mx_proc_create(ompi_proc_t* ompi_proc) OBJ_RELEASE(ptl_proc); return NULL; } - if(0 != (size % sizeof(mx_endpoint_addr_t))) { + if(0 != (size % sizeof(mca_ptl_mx_endpoint_t))) { opal_output(0, "mca_ptl_mx_proc_create: mca_base_modex_recv: invalid size %d\n", size); return NULL; } - ptl_proc->proc_addr_count = size / sizeof(mx_endpoint_addr_t); + ptl_proc->proc_addr_count = size / sizeof(mca_ptl_mx_endpoint_t); /* allocate space for peer array - one for each exported address */ ptl_proc->proc_peers = (mca_ptl_mx_peer_t**) @@ -152,12 +152,28 @@ mca_ptl_mx_proc_t* mca_ptl_mx_proc_lookup(const orte_process_name_t *name) */ int mca_ptl_mx_proc_insert(mca_ptl_mx_proc_t* ptl_proc, mca_ptl_base_peer_t* ptl_peer) { + mx_return_t mx_status; + mca_ptl_mx_endpoint_t* remote = &(ptl_proc->proc_addrs[ptl_proc->proc_peer_count]); + int num_retry = 0; /* insert into peer array */ ptl_peer->peer_proc = ptl_proc; - ptl_peer->peer_addr = ptl_proc->proc_addrs[ptl_proc->proc_peer_count]; ptl_proc->proc_peers[ptl_proc->proc_peer_count] = ptl_peer; - ptl_proc->proc_peer_count++; + /* construct the remote endpoint addr */ + retry_connect: + mx_status = mx_connect( ptl_peer->peer_ptl->mx_endpoint, remote->nic_id, remote->endpoint_id, + mca_ptl_mx_component.mx_filter, 1, &(ptl_peer->peer_addr) ); + if( OMPI_SUCCESS != mx_status ) { + if( MX_TIMEOUT == mx_status ) + if( num_retry++ < 5 ) + goto retry_connect; + opal_output( 0, "mx_connect fail for peer %d remote %lx %d filter %x with error %s\n", + ptl_proc->proc_peer_count, + remote->nic_id, remote->endpoint_id, mca_ptl_mx_component.mx_filter, + mx_strerror(mx_status) ); + return OMPI_ERROR; + } + ptl_proc->proc_peer_count++; return OMPI_SUCCESS; } diff --git a/ompi/mca/ptl/mx/ptl_mx_proc.h b/ompi/mca/ptl/mx/ptl_mx_proc.h index 577dc899a6..7180f9df03 100644 --- a/ompi/mca/ptl/mx/ptl_mx_proc.h +++ b/ompi/mca/ptl/mx/ptl_mx_proc.h @@ -30,6 +30,11 @@ extern "C" { #endif +typedef struct mca_ptl_mx_endpoint_t { + uint64_t nic_id; + uint32_t endpoint_id; +} mca_ptl_mx_endpoint_t; + /** * Represents the state of a remote process and the set of addresses * that it exports. Also cache an instance of mca_ptl_base_peer_t for each @@ -39,7 +44,7 @@ struct mca_ptl_mx_proc_t { opal_list_item_t super; /**< allow proc to be placed on a list */ ompi_proc_t *proc_ompi; /**< pointer to corresponding ompi_proc_t */ orte_process_name_t proc_name; /**< globally unique identifier for the process */ - mx_endpoint_addr_t *proc_addrs; /**< peer endpoint address */ + mca_ptl_mx_endpoint_t *proc_addrs; /**< peer endpoint address */ size_t proc_addr_count; /**< number of addresses published by peer */ mca_ptl_mx_peer_t **proc_peers; /**< array of peers that have been created to access this proc */ size_t proc_peer_count; /**< number of peers */