Add more output when calls to the MX library fail.
Move the connection status from the proc into the endpoint. This commit was SVN r12924.
Этот коммит содержится в:
родитель
14dc72f595
Коммит
e8bd985870
@ -31,36 +31,6 @@
|
|||||||
#include "ompi/mca/mpool/base/base.h"
|
#include "ompi/mca/mpool/base/base.h"
|
||||||
#include "ompi/mca/mpool/mpool.h"
|
#include "ompi/mca/mpool/mpool.h"
|
||||||
|
|
||||||
|
|
||||||
mca_btl_mx_module_t mca_btl_mx_module = {
|
|
||||||
{
|
|
||||||
&mca_btl_mx_component.super,
|
|
||||||
0, /* max size of first fragment */
|
|
||||||
0, /* min send fragment size */
|
|
||||||
0, /* max send fragment size */
|
|
||||||
0, /* min rdma fragment size */
|
|
||||||
0, /* max rdma fragment size */
|
|
||||||
0, /* exclusivity */
|
|
||||||
0, /* latency */
|
|
||||||
0, /* bandwidth */
|
|
||||||
MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_PUT, /* flags */
|
|
||||||
mca_btl_mx_add_procs,
|
|
||||||
mca_btl_mx_del_procs,
|
|
||||||
mca_btl_mx_register,
|
|
||||||
mca_btl_mx_finalize,
|
|
||||||
mca_btl_mx_alloc,
|
|
||||||
mca_btl_mx_free,
|
|
||||||
mca_btl_mx_prepare_src,
|
|
||||||
mca_btl_mx_prepare_dst,
|
|
||||||
mca_btl_mx_send,
|
|
||||||
NULL, /* put */
|
|
||||||
NULL, /* get */
|
|
||||||
mca_btl_base_dump,
|
|
||||||
NULL, /* mpool */
|
|
||||||
NULL /* register error */
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
@ -378,8 +348,8 @@ int mca_btl_mx_send( struct mca_btl_base_module_t* btl,
|
|||||||
mx_return_t mx_return;
|
mx_return_t mx_return;
|
||||||
uint64_t total_length;
|
uint64_t total_length;
|
||||||
|
|
||||||
if( MCA_BTL_MX_CONNECTED != ((mca_btl_mx_endpoint_t*)endpoint)->endpoint_proc->status ) {
|
if( MCA_BTL_MX_CONNECTED != ((mca_btl_mx_endpoint_t*)endpoint)->status ) {
|
||||||
if( MCA_BTL_MX_NOT_REACHEABLE == ((mca_btl_mx_endpoint_t*)endpoint)->endpoint_proc->status )
|
if( MCA_BTL_MX_NOT_REACHEABLE == ((mca_btl_mx_endpoint_t*)endpoint)->status )
|
||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
if( OMPI_SUCCESS != mca_btl_mx_proc_connect( (mca_btl_mx_endpoint_t*)endpoint ) )
|
if( OMPI_SUCCESS != mca_btl_mx_proc_connect( (mca_btl_mx_endpoint_t*)endpoint ) )
|
||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
@ -433,3 +403,32 @@ int mca_btl_mx_finalize( struct mca_btl_base_module_t* btl )
|
|||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
mca_btl_mx_module_t mca_btl_mx_module = {
|
||||||
|
{
|
||||||
|
&mca_btl_mx_component.super,
|
||||||
|
0, /* max size of first fragment */
|
||||||
|
0, /* min send fragment size */
|
||||||
|
0, /* max send fragment size */
|
||||||
|
0, /* min rdma fragment size */
|
||||||
|
0, /* max rdma fragment size */
|
||||||
|
0, /* exclusivity */
|
||||||
|
0, /* latency */
|
||||||
|
0, /* bandwidth */
|
||||||
|
MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_PUT, /* flags */
|
||||||
|
mca_btl_mx_add_procs,
|
||||||
|
mca_btl_mx_del_procs,
|
||||||
|
mca_btl_mx_register,
|
||||||
|
mca_btl_mx_finalize,
|
||||||
|
mca_btl_mx_alloc,
|
||||||
|
mca_btl_mx_free,
|
||||||
|
mca_btl_mx_prepare_src,
|
||||||
|
mca_btl_mx_prepare_dst,
|
||||||
|
mca_btl_mx_send,
|
||||||
|
NULL, /* put */
|
||||||
|
NULL, /* get */
|
||||||
|
mca_btl_base_dump,
|
||||||
|
NULL, /* mpool */
|
||||||
|
NULL /* register error */
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
@ -336,6 +336,8 @@ mca_btl_base_module_t** mca_btl_mx_component_init(int *num_btl_modules,
|
|||||||
return NULL;
|
return NULL;
|
||||||
if( (status = mx_get_info( NULL, MX_NIC_IDS, NULL, 0,
|
if( (status = mx_get_info( NULL, MX_NIC_IDS, NULL, 0,
|
||||||
nic_addrs, size)) != MX_SUCCESS) {
|
nic_addrs, size)) != MX_SUCCESS) {
|
||||||
|
opal_output(0, "MX BTL error (mx_get_info failed) size = %ld [%s] #cards %d\n",
|
||||||
|
size, mx_strerror(status), mca_btl_mx_component.mx_num_btls );
|
||||||
free(nic_addrs);
|
free(nic_addrs);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
@ -35,18 +35,17 @@
|
|||||||
* Initialize state of the endpoint instance.
|
* Initialize state of the endpoint instance.
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
|
||||||
static void mca_btl_mx_endpoint_construct(mca_btl_base_endpoint_t* endpoint)
|
static void mca_btl_mx_endpoint_construct(mca_btl_base_endpoint_t* endpoint)
|
||||||
{
|
{
|
||||||
endpoint->endpoint_btl = NULL;
|
endpoint->endpoint_btl = NULL;
|
||||||
endpoint->endpoint_proc = NULL;
|
endpoint->endpoint_proc = NULL;
|
||||||
|
endpoint->status = MCA_BTL_MX_NOT_CONNECTED;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Destroy a endpoint
|
* Destroy a endpoint
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
|
||||||
static void mca_btl_mx_endpoint_destruct(mca_btl_base_endpoint_t* endpoint)
|
static void mca_btl_mx_endpoint_destruct(mca_btl_base_endpoint_t* endpoint)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
@ -32,6 +32,10 @@
|
|||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#define MCA_BTL_MX_NOT_CONNECTED 0x0000
|
||||||
|
#define MCA_BTL_MX_NOT_REACHEABLE 0x0001
|
||||||
|
#define MCA_BTL_MX_CONNECTED 0x0002
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Structure used to publish MX information to peers
|
* Structure used to publish MX information to peers
|
||||||
*/
|
*/
|
||||||
@ -62,6 +66,8 @@ struct mca_btl_base_endpoint_t {
|
|||||||
|
|
||||||
mx_endpoint_addr_t mx_peer_addr;
|
mx_endpoint_addr_t mx_peer_addr;
|
||||||
/** the remote MX endpoint address */
|
/** the remote MX endpoint address */
|
||||||
|
|
||||||
|
int status; /**< status of the endpoint */
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
|
typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
|
||||||
|
@ -159,7 +159,6 @@ int mca_btl_mx_proc_insert( mca_btl_mx_proc_t* module_proc,
|
|||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
module_proc->status = MCA_BTL_MX_NOT_CONNECTED;
|
|
||||||
module_proc->mx_peers = mx_peers;
|
module_proc->mx_peers = mx_peers;
|
||||||
|
|
||||||
if( NULL == module_proc->proc_endpoints ) {
|
if( NULL == module_proc->proc_endpoints ) {
|
||||||
@ -169,7 +168,6 @@ int mca_btl_mx_proc_insert( mca_btl_mx_proc_t* module_proc,
|
|||||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* insert into endpoint array */
|
|
||||||
module_endpoint->endpoint_proc = module_proc;
|
module_endpoint->endpoint_proc = module_proc;
|
||||||
|
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
@ -206,13 +204,12 @@ int mca_btl_mx_proc_connect( mca_btl_mx_endpoint_t* module_endpoint )
|
|||||||
module_endpoint->mx_peer.nic_id = module_proc->mx_peers[i].nic_id;
|
module_endpoint->mx_peer.nic_id = module_proc->mx_peers[i].nic_id;
|
||||||
module_endpoint->mx_peer.endpoint_id = module_proc->mx_peers[i].endpoint_id;
|
module_endpoint->mx_peer.endpoint_id = module_proc->mx_peers[i].endpoint_id;
|
||||||
module_endpoint->mx_peer_addr = mx_remote_addr;
|
module_endpoint->mx_peer_addr = mx_remote_addr;
|
||||||
|
module_endpoint->status = MCA_BTL_MX_CONNECTED;
|
||||||
module_proc->proc_addr_index = i;
|
module_proc->proc_addr_index = i;
|
||||||
module_proc->status = MCA_BTL_MX_CONNECTED;
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if( i == module_proc->mx_peers_count ) { /* no available connection */
|
if( i == module_proc->mx_peers_count ) { /* no available connection */
|
||||||
module_proc->status = MCA_BTL_MX_NOT_REACHEABLE;
|
|
||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -29,10 +29,6 @@
|
|||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define MCA_BTL_MX_NOT_CONNECTED 0x0000
|
|
||||||
#define MCA_BTL_MX_NOT_REACHEABLE 0x0001
|
|
||||||
#define MCA_BTL_MX_CONNECTED 0x0002
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Represents the state of a remote process and the set of addresses
|
* Represents the state of a remote process and the set of addresses
|
||||||
* that it exports. Also cache an instance of mca_btl_base_endpoint_t for
|
* that it exports. Also cache an instance of mca_btl_base_endpoint_t for
|
||||||
@ -46,8 +42,6 @@ extern "C" {
|
|||||||
ompi_proc_t *proc_ompi;
|
ompi_proc_t *proc_ompi;
|
||||||
/**< pointer to corresponding ompi_proc_t */
|
/**< pointer to corresponding ompi_proc_t */
|
||||||
|
|
||||||
int status; /**< status of the connection */
|
|
||||||
|
|
||||||
mca_btl_mx_addr_t *mx_peers; /**< peers addresses */
|
mca_btl_mx_addr_t *mx_peers; /**< peers addresses */
|
||||||
int mx_peers_count;
|
int mx_peers_count;
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user