From e8bd9858704ce6e5e9527de1f6f55429f47a8166 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Sun, 24 Dec 2006 22:34:48 +0000 Subject: [PATCH] Add more output when calls to the MX library fail. Move the connection status from the proc into the endpoint. This commit was SVN r12924. --- ompi/mca/btl/mx/btl_mx.c | 63 +++++++++++++++--------------- ompi/mca/btl/mx/btl_mx_component.c | 2 + ompi/mca/btl/mx/btl_mx_endpoint.c | 5 +-- ompi/mca/btl/mx/btl_mx_endpoint.h | 6 +++ ompi/mca/btl/mx/btl_mx_proc.c | 5 +-- ompi/mca/btl/mx/btl_mx_proc.h | 6 --- 6 files changed, 42 insertions(+), 45 deletions(-) diff --git a/ompi/mca/btl/mx/btl_mx.c b/ompi/mca/btl/mx/btl_mx.c index 353d2e9491..9f02e1aa08 100644 --- a/ompi/mca/btl/mx/btl_mx.c +++ b/ompi/mca/btl/mx/btl_mx.c @@ -31,36 +31,6 @@ #include "ompi/mca/mpool/base/base.h" #include "ompi/mca/mpool/mpool.h" - -mca_btl_mx_module_t mca_btl_mx_module = { - { - &mca_btl_mx_component.super, - 0, /* max size of first fragment */ - 0, /* min send fragment size */ - 0, /* max send fragment size */ - 0, /* min rdma fragment size */ - 0, /* max rdma fragment size */ - 0, /* exclusivity */ - 0, /* latency */ - 0, /* bandwidth */ - MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_PUT, /* flags */ - mca_btl_mx_add_procs, - mca_btl_mx_del_procs, - mca_btl_mx_register, - mca_btl_mx_finalize, - mca_btl_mx_alloc, - mca_btl_mx_free, - mca_btl_mx_prepare_src, - mca_btl_mx_prepare_dst, - mca_btl_mx_send, - NULL, /* put */ - NULL, /* get */ - mca_btl_base_dump, - NULL, /* mpool */ - NULL /* register error */ - } -}; - /** * */ @@ -378,8 +348,8 @@ int mca_btl_mx_send( struct mca_btl_base_module_t* btl, mx_return_t mx_return; uint64_t total_length; - if( MCA_BTL_MX_CONNECTED != ((mca_btl_mx_endpoint_t*)endpoint)->endpoint_proc->status ) { - if( MCA_BTL_MX_NOT_REACHEABLE == ((mca_btl_mx_endpoint_t*)endpoint)->endpoint_proc->status ) + if( MCA_BTL_MX_CONNECTED != ((mca_btl_mx_endpoint_t*)endpoint)->status ) { + if( MCA_BTL_MX_NOT_REACHEABLE == 
((mca_btl_mx_endpoint_t*)endpoint)->status ) return OMPI_ERROR; if( OMPI_SUCCESS != mca_btl_mx_proc_connect( (mca_btl_mx_endpoint_t*)endpoint ) ) return OMPI_ERROR; @@ -433,3 +403,32 @@ int mca_btl_mx_finalize( struct mca_btl_base_module_t* btl ) return OMPI_SUCCESS; } +mca_btl_mx_module_t mca_btl_mx_module = { + { + &mca_btl_mx_component.super, + 0, /* max size of first fragment */ + 0, /* min send fragment size */ + 0, /* max send fragment size */ + 0, /* min rdma fragment size */ + 0, /* max rdma fragment size */ + 0, /* exclusivity */ + 0, /* latency */ + 0, /* bandwidth */ + MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_PUT, /* flags */ + mca_btl_mx_add_procs, + mca_btl_mx_del_procs, + mca_btl_mx_register, + mca_btl_mx_finalize, + mca_btl_mx_alloc, + mca_btl_mx_free, + mca_btl_mx_prepare_src, + mca_btl_mx_prepare_dst, + mca_btl_mx_send, + NULL, /* put */ + NULL, /* get */ + mca_btl_base_dump, + NULL, /* mpool */ + NULL /* register error */ + } +}; + diff --git a/ompi/mca/btl/mx/btl_mx_component.c b/ompi/mca/btl/mx/btl_mx_component.c index 6898806891..623866cfc8 100644 --- a/ompi/mca/btl/mx/btl_mx_component.c +++ b/ompi/mca/btl/mx/btl_mx_component.c @@ -336,6 +336,8 @@ mca_btl_base_module_t** mca_btl_mx_component_init(int *num_btl_modules, return NULL; if( (status = mx_get_info( NULL, MX_NIC_IDS, NULL, 0, nic_addrs, size)) != MX_SUCCESS) { + opal_output(0, "MX BTL error (mx_get_info failed) size = %ld [%s] #cards %d\n", + size, mx_strerror(status), mca_btl_mx_component.mx_num_btls ); free(nic_addrs); return NULL; } diff --git a/ompi/mca/btl/mx/btl_mx_endpoint.c b/ompi/mca/btl/mx/btl_mx_endpoint.c index de1ced6d20..bc0a07068f 100644 --- a/ompi/mca/btl/mx/btl_mx_endpoint.c +++ b/ompi/mca/btl/mx/btl_mx_endpoint.c @@ -35,18 +35,17 @@ * Initialize state of the endpoint instance. 
* */ - static void mca_btl_mx_endpoint_construct(mca_btl_base_endpoint_t* endpoint) { - endpoint->endpoint_btl = NULL; + endpoint->endpoint_btl = NULL; endpoint->endpoint_proc = NULL; + endpoint->status = MCA_BTL_MX_NOT_CONNECTED; } /* * Destroy a endpoint * */ - static void mca_btl_mx_endpoint_destruct(mca_btl_base_endpoint_t* endpoint) { } diff --git a/ompi/mca/btl/mx/btl_mx_endpoint.h b/ompi/mca/btl/mx/btl_mx_endpoint.h index 79cfcea3d6..5f4c725c80 100644 --- a/ompi/mca/btl/mx/btl_mx_endpoint.h +++ b/ompi/mca/btl/mx/btl_mx_endpoint.h @@ -32,6 +32,10 @@ extern "C" { #endif +#define MCA_BTL_MX_NOT_CONNECTED 0x0000 +#define MCA_BTL_MX_NOT_REACHEABLE 0x0001 +#define MCA_BTL_MX_CONNECTED 0x0002 + /** * Structure used to publish MX information to peers */ @@ -62,6 +66,8 @@ struct mca_btl_base_endpoint_t { mx_endpoint_addr_t mx_peer_addr; /** the remote MX endpoint address */ + + int status; /**< status of the endpoint */ }; typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t; diff --git a/ompi/mca/btl/mx/btl_mx_proc.c b/ompi/mca/btl/mx/btl_mx_proc.c index 165d174ae1..fa4d8e9408 100644 --- a/ompi/mca/btl/mx/btl_mx_proc.c +++ b/ompi/mca/btl/mx/btl_mx_proc.c @@ -159,7 +159,6 @@ int mca_btl_mx_proc_insert( mca_btl_mx_proc_t* module_proc, return OMPI_ERROR; } - module_proc->status = MCA_BTL_MX_NOT_CONNECTED; module_proc->mx_peers = mx_peers; if( NULL == module_proc->proc_endpoints ) { @@ -169,7 +168,6 @@ int mca_btl_mx_proc_insert( mca_btl_mx_proc_t* module_proc, return OMPI_ERR_OUT_OF_RESOURCE; } } - /* insert into endpoint array */ module_endpoint->endpoint_proc = module_proc; return OMPI_SUCCESS; @@ -206,13 +204,12 @@ int mca_btl_mx_proc_connect( mca_btl_mx_endpoint_t* module_endpoint ) module_endpoint->mx_peer.nic_id = module_proc->mx_peers[i].nic_id; module_endpoint->mx_peer.endpoint_id = module_proc->mx_peers[i].endpoint_id; module_endpoint->mx_peer_addr = mx_remote_addr; + module_endpoint->status = MCA_BTL_MX_CONNECTED; module_proc->proc_addr_index = i; - 
module_proc->status = MCA_BTL_MX_CONNECTED; break; } if( i == module_proc->mx_peers_count ) { /* no available connection */ - module_proc->status = MCA_BTL_MX_NOT_REACHEABLE; return OMPI_ERROR; } diff --git a/ompi/mca/btl/mx/btl_mx_proc.h b/ompi/mca/btl/mx/btl_mx_proc.h index 11ceae22b0..f742503763 100644 --- a/ompi/mca/btl/mx/btl_mx_proc.h +++ b/ompi/mca/btl/mx/btl_mx_proc.h @@ -29,10 +29,6 @@ extern "C" { #endif -#define MCA_BTL_MX_NOT_CONNECTED 0x0000 -#define MCA_BTL_MX_NOT_REACHEABLE 0x0001 -#define MCA_BTL_MX_CONNECTED 0x0002 - /** * Represents the state of a remote process and the set of addresses * that it exports. Also cache an instance of mca_btl_base_endpoint_t for @@ -46,8 +42,6 @@ extern "C" { ompi_proc_t *proc_ompi; /**< pointer to corresponding ompi_proc_t */ - int status; /**< status of the connection */ - mca_btl_mx_addr_t *mx_peers; /**< peers addresses */ int mx_peers_count;