1
1

Do not return errors in add_proc. If something bad happened or if one of the nodes is unreachable,

just don't create the ptl_peer struct for this node. Later the PML has to take into account the
management of the reachable nodes.

This commit was SVN r5314.
Этот коммит содержится в:
George Bosilca 2005-04-13 18:25:07 +00:00
родитель befac00371
Коммит 685ab2a54e
2 изменённых файлов: 18 добавлений и 11 удалений

Просмотреть файл

@ -83,21 +83,24 @@ mca_ptl_gm_add_procs (struct mca_ptl_base_module_t *ptl,
if( orte_proc == local_proc ) continue;
ptl_proc = mca_ptl_gm_proc_create ((mca_ptl_gm_module_t *) ptl, orte_proc);
if (NULL == ptl_proc) {
return OMPI_ERR_OUT_OF_RESOURCE;
ompi_output( 0, "[%s:%d] cannot allocate memory for the GM module", __FILE__, __LINE__ );
continue;
}
OMPI_THREAD_LOCK (&ptl_proc->proc_lock);
if (ptl_proc->proc_addr_count == ptl_proc->proc_peer_count) {
OMPI_THREAD_UNLOCK (&ptl_proc->proc_lock);
return OMPI_ERR_UNREACH;
ompi_output( 0, "[%s:%d] modex exchange failed for GM module", __FILE__, __LINE__ );
continue;
}
ptl_peer = NULL; /* force it to NULL before looping through the ptls */
/* TODO: make this extensible to multiple nics */
for( j = 0; j < num_peer_ptls; j++ ) {
ptl_peer = OBJ_NEW (mca_ptl_gm_peer_t);
if (NULL == ptl_peer) {
OMPI_THREAD_UNLOCK (&ptl_proc->proc_lock);
return OMPI_ERR_OUT_OF_RESOURCE;
ompi_output( 0, "[%s:%d] cannot allocate memory for one of the GM ptl", __FILE__, __LINE__ );
continue;
}
ptl_peer->peer_ptl = (mca_ptl_gm_module_t *) ptl;
@ -110,7 +113,9 @@ mca_ptl_gm_add_procs (struct mca_ptl_base_module_t *ptl,
&(ptl_peer->peer_addr.local_id))) {
ompi_output( 0, "[%s:%d] error in converting global to local id \n",
__FILE__, __LINE__ );
return OMPI_ERR_BAD_PARAM;
OBJ_RELEASE( ptl_peer );
assert( NULL == ptl_peer );
continue;
}
#else
strncpy( ptl_peer->peer_addr.global_id, ptl_proc->proc_addrs->global_id, GM_MAX_HOST_NAME_LEN );
@ -119,13 +124,15 @@ mca_ptl_gm_add_procs (struct mca_ptl_base_module_t *ptl,
if( GM_NO_SUCH_NODE_ID == ptl_peer->peer_addr.local_id ) {
ompi_output( 0, "Unable to convert the remote host name (%s) to a host id",
ptl_proc->proc_addrs[j].global_id );
return OMPI_ERR_BAD_PARAM;
OBJ_RELEASE( ptl_peer );
assert( NULL == ptl_peer );
continue;
}
#endif /* GM_API_VERSION > 0x200 */
ptl_proc->peer_arr[ptl_proc->proc_peer_count] = ptl_peer;
ptl_proc->proc_peer_count++;
ompi_bitmap_set_bit (reachable, i); /* set the bit again and again */
}
ompi_bitmap_set_bit (reachable, i);
OMPI_THREAD_UNLOCK (&ptl_proc->proc_lock);
peers[i] = (struct mca_ptl_base_peer_t*)ptl_peer;
}
@ -341,7 +348,7 @@ mca_ptl_gm_matched( mca_ptl_base_module_t* ptl,
if( frag->frag_base.frag_header.hdr_common.hdr_flags & MCA_PTL_FLAGS_ACK ) { /* need to send an ack back */
ompi_list_item_t *item;
OMPI_FREE_LIST_TRY_GET( &(gm_ptl->gm_send_dma_frags), item );
OMPI_FREE_LIST_WAIT( &(gm_ptl->gm_send_dma_frags), item, rc );
if( NULL == item ) {
ompi_output(0,"[%s:%d] unable to alloc a gm fragment\n", __FILE__,__LINE__);
OMPI_THREAD_LOCK (&mca_ptl_gm_component.gm_lock);

Просмотреть файл

@ -463,9 +463,9 @@ mca_ptl_gm_init( mca_ptl_gm_component_t * gm )
mca_ptl_gm_component.gm_num_ptl_modules =
mca_ptl_gm_discover_boards( mca_ptl_gm_component.gm_ptl_modules,
mca_ptl_gm_component.gm_max_ptl_modules,
mca_ptl_gm_component.gm_max_boards_number,
mca_ptl_gm_component.gm_max_port_number );
mca_ptl_gm_component.gm_max_ptl_modules,
mca_ptl_gm_component.gm_max_boards_number,
mca_ptl_gm_component.gm_max_port_number );
/* In the case when we are in a multi-threaded environment each
* PTL will have its own thread. At this point all structures are