Add more output when calls to the MX library fail.
Move the connection status from the proc into the endpoint. This commit was SVN r12924.
Этот коммит содержится в:
родитель
14dc72f595
Коммит
e8bd985870
@ -31,36 +31,6 @@
|
||||
#include "ompi/mca/mpool/base/base.h"
|
||||
#include "ompi/mca/mpool/mpool.h"
|
||||
|
||||
|
||||
mca_btl_mx_module_t mca_btl_mx_module = {
|
||||
{
|
||||
&mca_btl_mx_component.super,
|
||||
0, /* max size of first fragment */
|
||||
0, /* min send fragment size */
|
||||
0, /* max send fragment size */
|
||||
0, /* min rdma fragment size */
|
||||
0, /* max rdma fragment size */
|
||||
0, /* exclusivity */
|
||||
0, /* latency */
|
||||
0, /* bandwidth */
|
||||
MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_PUT, /* flags */
|
||||
mca_btl_mx_add_procs,
|
||||
mca_btl_mx_del_procs,
|
||||
mca_btl_mx_register,
|
||||
mca_btl_mx_finalize,
|
||||
mca_btl_mx_alloc,
|
||||
mca_btl_mx_free,
|
||||
mca_btl_mx_prepare_src,
|
||||
mca_btl_mx_prepare_dst,
|
||||
mca_btl_mx_send,
|
||||
NULL, /* put */
|
||||
NULL, /* get */
|
||||
mca_btl_base_dump,
|
||||
NULL, /* mpool */
|
||||
NULL /* register error */
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
@ -378,8 +348,8 @@ int mca_btl_mx_send( struct mca_btl_base_module_t* btl,
|
||||
mx_return_t mx_return;
|
||||
uint64_t total_length;
|
||||
|
||||
if( MCA_BTL_MX_CONNECTED != ((mca_btl_mx_endpoint_t*)endpoint)->endpoint_proc->status ) {
|
||||
if( MCA_BTL_MX_NOT_REACHEABLE == ((mca_btl_mx_endpoint_t*)endpoint)->endpoint_proc->status )
|
||||
if( MCA_BTL_MX_CONNECTED != ((mca_btl_mx_endpoint_t*)endpoint)->status ) {
|
||||
if( MCA_BTL_MX_NOT_REACHEABLE == ((mca_btl_mx_endpoint_t*)endpoint)->status )
|
||||
return OMPI_ERROR;
|
||||
if( OMPI_SUCCESS != mca_btl_mx_proc_connect( (mca_btl_mx_endpoint_t*)endpoint ) )
|
||||
return OMPI_ERROR;
|
||||
@ -433,3 +403,32 @@ int mca_btl_mx_finalize( struct mca_btl_base_module_t* btl )
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
mca_btl_mx_module_t mca_btl_mx_module = {
|
||||
{
|
||||
&mca_btl_mx_component.super,
|
||||
0, /* max size of first fragment */
|
||||
0, /* min send fragment size */
|
||||
0, /* max send fragment size */
|
||||
0, /* min rdma fragment size */
|
||||
0, /* max rdma fragment size */
|
||||
0, /* exclusivity */
|
||||
0, /* latency */
|
||||
0, /* bandwidth */
|
||||
MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_PUT, /* flags */
|
||||
mca_btl_mx_add_procs,
|
||||
mca_btl_mx_del_procs,
|
||||
mca_btl_mx_register,
|
||||
mca_btl_mx_finalize,
|
||||
mca_btl_mx_alloc,
|
||||
mca_btl_mx_free,
|
||||
mca_btl_mx_prepare_src,
|
||||
mca_btl_mx_prepare_dst,
|
||||
mca_btl_mx_send,
|
||||
NULL, /* put */
|
||||
NULL, /* get */
|
||||
mca_btl_base_dump,
|
||||
NULL, /* mpool */
|
||||
NULL /* register error */
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -336,6 +336,8 @@ mca_btl_base_module_t** mca_btl_mx_component_init(int *num_btl_modules,
|
||||
return NULL;
|
||||
if( (status = mx_get_info( NULL, MX_NIC_IDS, NULL, 0,
|
||||
nic_addrs, size)) != MX_SUCCESS) {
|
||||
opal_output(0, "MX BTL error (mx_get_info failed) size = %ld [%s] #cards %d\n",
|
||||
size, mx_strerror(status), mca_btl_mx_component.mx_num_btls );
|
||||
free(nic_addrs);
|
||||
return NULL;
|
||||
}
|
||||
|
@ -35,18 +35,17 @@
|
||||
* Initialize state of the endpoint instance.
|
||||
*
|
||||
*/
|
||||
|
||||
static void mca_btl_mx_endpoint_construct(mca_btl_base_endpoint_t* endpoint)
|
||||
{
|
||||
endpoint->endpoint_btl = NULL;
|
||||
endpoint->endpoint_btl = NULL;
|
||||
endpoint->endpoint_proc = NULL;
|
||||
endpoint->status = MCA_BTL_MX_NOT_CONNECTED;
|
||||
}
|
||||
|
||||
/*
|
||||
* Destroy an endpoint
|
||||
*
|
||||
*/
|
||||
|
||||
static void mca_btl_mx_endpoint_destruct(mca_btl_base_endpoint_t* endpoint)
|
||||
{
|
||||
}
|
||||
|
@ -32,6 +32,10 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define MCA_BTL_MX_NOT_CONNECTED 0x0000
|
||||
#define MCA_BTL_MX_NOT_REACHEABLE 0x0001
|
||||
#define MCA_BTL_MX_CONNECTED 0x0002
|
||||
|
||||
/**
|
||||
* Structure used to publish MX information to peers
|
||||
*/
|
||||
@ -62,6 +66,8 @@ struct mca_btl_base_endpoint_t {
|
||||
|
||||
mx_endpoint_addr_t mx_peer_addr;
|
||||
/** the remote MX endpoint address */
|
||||
|
||||
int status; /**< status of the endpoint */
|
||||
};
|
||||
|
||||
typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
|
||||
|
@ -159,7 +159,6 @@ int mca_btl_mx_proc_insert( mca_btl_mx_proc_t* module_proc,
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
module_proc->status = MCA_BTL_MX_NOT_CONNECTED;
|
||||
module_proc->mx_peers = mx_peers;
|
||||
|
||||
if( NULL == module_proc->proc_endpoints ) {
|
||||
@ -169,7 +168,6 @@ int mca_btl_mx_proc_insert( mca_btl_mx_proc_t* module_proc,
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
}
|
||||
/* insert into endpoint array */
|
||||
module_endpoint->endpoint_proc = module_proc;
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
@ -206,13 +204,12 @@ int mca_btl_mx_proc_connect( mca_btl_mx_endpoint_t* module_endpoint )
|
||||
module_endpoint->mx_peer.nic_id = module_proc->mx_peers[i].nic_id;
|
||||
module_endpoint->mx_peer.endpoint_id = module_proc->mx_peers[i].endpoint_id;
|
||||
module_endpoint->mx_peer_addr = mx_remote_addr;
|
||||
module_endpoint->status = MCA_BTL_MX_CONNECTED;
|
||||
module_proc->proc_addr_index = i;
|
||||
module_proc->status = MCA_BTL_MX_CONNECTED;
|
||||
break;
|
||||
}
|
||||
|
||||
if( i == module_proc->mx_peers_count ) { /* no available connection */
|
||||
module_proc->status = MCA_BTL_MX_NOT_REACHEABLE;
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
|
@ -29,10 +29,6 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define MCA_BTL_MX_NOT_CONNECTED 0x0000
|
||||
#define MCA_BTL_MX_NOT_REACHEABLE 0x0001
|
||||
#define MCA_BTL_MX_CONNECTED 0x0002
|
||||
|
||||
/**
|
||||
* Represents the state of a remote process and the set of addresses
|
||||
* that it exports. Also cache an instance of mca_btl_base_endpoint_t for
|
||||
@ -46,8 +42,6 @@ extern "C" {
|
||||
ompi_proc_t *proc_ompi;
|
||||
/**< pointer to corresponding ompi_proc_t */
|
||||
|
||||
int status; /**< status of the connection */
|
||||
|
||||
mca_btl_mx_addr_t *mx_peers; /**< peers addresses */
|
||||
int mx_peers_count;
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user