diff --git a/ompi/mca/btl/base/Makefile.am b/ompi/mca/btl/base/Makefile.am
index 7597a039ba..0ec617d2ce 100644
--- a/ompi/mca/btl/base/Makefile.am
+++ b/ompi/mca/btl/base/Makefile.am
@@ -30,6 +30,7 @@ headers = \
 libmca_btl_base_la_SOURCES = \
         $(headers) \
         btl_base_close.c \
+        btl_base_error.c \
         btl_base_open.c \
         btl_base_select.c
diff --git a/ompi/mca/btl/base/btl_base_error.c b/ompi/mca/btl/base/btl_base_error.c
new file mode 100644
index 0000000000..cdbf2a92a0
--- /dev/null
+++ b/ompi/mca/btl/base/btl_base_error.c
@@ -0,0 +1,32 @@
+#include "btl_base_error.h"
+#include
+
+#if OMPI_ENABLE_DEBUG
+int mca_btl_base_debug = 1;
+#endif
+
+
+int mca_btl_base_err(const char* fmt, ...)
+{
+    va_list list;
+    int ret;
+
+    va_start(list, fmt);
+    ret = vfprintf(stderr, fmt, list);
+    va_end(list);
+    return ret;
+}
+
+
+int mca_btl_base_out(const char* fmt, ...)
+{
+    va_list list;
+    int ret;
+
+    va_start(list, fmt);
+    ret = vfprintf(stdout, fmt, list);
+    va_end(list);
+    return ret;
+}
+
+
diff --git a/ompi/mca/btl/base/btl_base_error.h b/ompi/mca/btl/base/btl_base_error.h
index a0da44c5d9..e99c05670c 100644
--- a/ompi/mca/btl/base/btl_base_error.h
+++ b/ompi/mca/btl/base/btl_base_error.h
@@ -17,67 +17,49 @@
 #ifndef MCA_BTL_BASE_ERROR_H
 #define MCA_BTL_BASE_ERROR_H
 
+#include "ompi_config.h"
+#include
 
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
-# define BTL_ERROR(fmt, ...) { \
-    opal_output(0, "[%s:%d:%s] my_name: [%lu,%lu,%lu] " fmt "\n", __FILE__, __LINE__, __func__, \
-        ORTE_NAME_ARGS(orte_process_info.my_name), __VA_ARGS__); \
-    }
+extern int mca_btl_base_debug;
+
+extern int mca_btl_base_err(const char*, ...);
+extern int mca_btl_base_out(const char*, ...);
+
+
+#define BTL_OUTPUT(args) \
+do { \
+    mca_btl_base_out("[%lu,%lu,%lu][%s:%d:%s] ", \
+                     ORTE_NAME_ARGS(orte_process_info.my_name), \
+                     __FILE__, __LINE__, __func__); \
+    mca_btl_base_out args; \
+    mca_btl_base_out("\n"); \
+} while(0);
+
+
+#define BTL_ERROR(args) \
+do { \
+    mca_btl_base_err("[%lu,%lu,%lu][%s:%d:%s] ", \
+                     ORTE_NAME_ARGS(orte_process_info.my_name), \
+                     __FILE__, __LINE__, __func__); \
+    mca_btl_base_err args; \
+    mca_btl_base_out("\n"); \
+} while(0);
+
+
+#if OMPI_ENABLE_DEBUG
+#define BTL_DEBUG(args) \
+do { \
+    if(mca_btl_base_debug) { \
+        mca_btl_base_err("[%lu,%lu,%lu][%s:%d:%s] ", \
+                         ORTE_NAME_ARGS(orte_process_info.my_name), \
+                         __FILE__, __LINE__, __func__); \
+        mca_btl_base_err args; \
+        mca_btl_base_out("\n"); \
+    } \
+} while(0);
 #else
-# if defined(__GNUC__) && !defined(__STDC__)
-#define BTL_ERROR(fmt, args...) { \
-    opal_output(0, "[%s:%d:%s] my_name: [%lu,%lu,%lu]" fmt "\n", __FILE__, __LINE__, __func__,\
-        ORTE_NAME_ARGS(orte_process_info.my_name), ##args); \
-    }
-#else
-static inline void BTL_ERROR(char *fmt, ... )
-{
-    va_list list;
-    va_start(list, fmt);
-    fprintf(stderr,"[%s:%d:%s] my_name: [%lu,%lu,%lu]",
-            __FILE__, __LINE__, __func__,
-            ORTE_NAME_ARGS(orte_process_info.my_name));
-
-    vfprintf(stderr, fmt, list);
-    fprintf(stderr, "\n");
-    va_end(list);
-}
-#endif
-#endif
-#if 0
-    #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901)L
-    # define BTL_DEBUG_OUT(fmt, ...) { \
-        opal_output(0, "[%s:%d:%s] " fmt "\n", __FILE__, __LINE__, __func__, __VA_ARGS__); \
-    }
-    #else
-    # if defined(__GNUC__) && !defined(__STDC__)
-    #define BTL_DEBUG_OUT(fmt, args...) { \
-        opal_output(0, "[%s:%d:%s] " fmt "\n", __FILE__, __LINE__, __func__, ##args); \
-    }
-    #else
-    static inline void BTL_DEBUG_OUT(char *fmt, ...
) - { - va_list list; - va_start(list, fmt); - fprintf(stderr, "[%s:%d:%s]", __FILE__, __LINE__, __func__, list); - vfprintf(stderr, fmt, list); - vfpritnf(stderr, "\n"); - va_end(list); - } - #endif - #endif -#else - #if defined(ACCEPT_C99) && __STDC_VERSION__ >= 199901L - # define BTL_DEBUG_OUT(fmt, ...) - #else - # if defined(__GNUC__) && !defined(__STDC__) - #define BTL_DEBUG_OUT(fmt, args...) - #else - static inline void BTL_DEBUG_OUT(char *fmt, ... ) - { - } - #endif - #endif -#endif +#define BTL_DEBUG(args) +#endif + #endif diff --git a/ompi/mca/btl/base/btl_base_open.c b/ompi/mca/btl/base/btl_base_open.c index 62c13964f7..45879423f4 100644 --- a/ompi/mca/btl/base/btl_base_open.c +++ b/ompi/mca/btl/base/btl_base_open.c @@ -25,6 +25,9 @@ #include "mca/btl/btl.h" #include "mca/btl/base/base.h" +int mca_btl_base_debug; + + /* * mca_btl_base_descriptor_t */ diff --git a/ompi/mca/btl/mvapi/btl_mvapi.c b/ompi/mca/btl/mvapi/btl_mvapi.c index 385cf1bd90..82ec45382b 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi.c +++ b/ompi/mca/btl/mvapi/btl_mvapi.c @@ -127,7 +127,7 @@ int mca_btl_mvapi_del_procs(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t ** peers) { /* Stub */ - BTL_DEBUG_OUT("Stub\n"); + BTL_DEBUG(("Stub\n")); return OMPI_SUCCESS; } @@ -209,7 +209,7 @@ int mca_btl_mvapi_free( } else if(frag->size == mca_btl_mvapi_component.eager_limit){ MCA_BTL_IB_FRAG_RETURN_EAGER(btl, frag); } else { - BTL_ERROR("invalid descriptor"); + BTL_ERROR(("invalid descriptor")); } return OMPI_SUCCESS; } @@ -288,13 +288,13 @@ mca_btl_base_descriptor_t* mca_btl_mvapi_prepare_src( rc = mca_mpool_base_remove((void*) vapi_reg->base_reg.base); if(OMPI_SUCCESS != rc) { - BTL_ERROR("error removing memory region from memory pool tree"); + BTL_ERROR(("error removing memory region from memory pool tree")); return NULL; } if(is_leave_pinned) { if(NULL == opal_list_remove_item(&mvapi_btl->reg_mru_list, (opal_list_item_t*) vapi_reg)){ - BTL_ERROR("error removing item from reg_mru_list"); + BTL_ERROR(("error removing item from reg_mru_list")); return NULL; } } @@ -315,7 +315,7 @@ mca_btl_base_descriptor_t* mca_btl_mvapi_prepare_src( if(rc != OMPI_SUCCESS) { - BTL_ERROR("error inserting memory region into memory pool tree"); + BTL_ERROR(("error inserting memory region into memory pool tree")); return NULL; } @@ -329,7 +329,7 @@ mca_btl_base_descriptor_t* mca_btl_mvapi_prepare_src( else if(is_leave_pinned) { /* the current memory region is large enough and we should leave the memory pinned */ if(NULL == opal_list_remove_item(&mvapi_btl->reg_mru_list, (opal_list_item_t*) vapi_reg)) { - BTL_ERROR("error removing item from reg_mru_list"); + BTL_ERROR(("error removing item from reg_mru_list")); return NULL; } @@ -391,7 +391,7 @@ mca_btl_base_descriptor_t* mca_btl_mvapi_prepare_src( opal_list_remove_first(&mvapi_btl->reg_mru_list); if( NULL == old_reg) { - BTL_ERROR("error removing item from reg_mru_list"); + BTL_ERROR(("error removing item from reg_mru_list")); return NULL; } @@ -399,7 +399,7 @@ mca_btl_base_descriptor_t* mca_btl_mvapi_prepare_src( rc = mca_mpool_base_remove((void*) old_reg->base_reg.base); if(OMPI_SUCCESS != rc) { - BTL_ERROR("error removing memory region from memory pool tree"); + BTL_ERROR(("error removing memory region from memory pool tree")); return NULL; } @@ -570,7 +570,7 @@ mca_btl_base_descriptor_t* mca_btl_mvapi_prepare_dst( rc = mca_mpool_base_remove((void*) vapi_reg->base_reg.base); if(OMPI_SUCCESS != rc) { - BTL_ERROR("error removing memory region from memory pool tree"); + 
BTL_ERROR(("error removing memory region from memory pool tree")); return NULL; } @@ -580,7 +580,7 @@ mca_btl_base_descriptor_t* mca_btl_mvapi_prepare_dst( */ if(NULL == opal_list_remove_item(&mvapi_btl->reg_mru_list, (opal_list_item_t*) vapi_reg)) { - BTL_ERROR("error removing item from reg_mru_list"); + BTL_ERROR(("error removing item from reg_mru_list")); return NULL; } } @@ -599,7 +599,7 @@ mca_btl_base_descriptor_t* mca_btl_mvapi_prepare_dst( (mca_mpool_base_registration_t*) vapi_reg); if(OMPI_SUCCESS != rc) { - BTL_ERROR("error inserting memory region into memory pool tree"); + BTL_ERROR(("error inserting memory region into memory pool tree")); return NULL; } OBJ_RETAIN(vapi_reg); @@ -614,7 +614,7 @@ mca_btl_base_descriptor_t* mca_btl_mvapi_prepare_dst( else if(is_leave_pinned){ /* the current memory region is large enough and we should leave the memory pinned */ if(NULL == opal_list_remove_item(&mvapi_btl->reg_mru_list, (opal_list_item_t*) vapi_reg)) { - BTL_ERROR("error removing item from reg_mru_list"); + BTL_ERROR(("error removing item from reg_mru_list")); return NULL; } opal_list_append(&mvapi_btl->reg_mru_list, (opal_list_item_t*) vapi_reg); @@ -639,13 +639,13 @@ mca_btl_base_descriptor_t* mca_btl_mvapi_prepare_dst( opal_list_remove_first(&mvapi_btl->reg_mru_list); if( NULL == old_reg) { - BTL_ERROR("error removing item from reg_mru_list"); + BTL_ERROR(("error removing item from reg_mru_list")); return NULL; } rc = mca_mpool_base_remove((void*) old_reg->base_reg.base); if(OMPI_SUCCESS !=rc ) { - BTL_ERROR("error removing memory region from memory pool tree"); + BTL_ERROR(("error removing memory region from memory pool tree")); return NULL; } @@ -665,7 +665,7 @@ mca_btl_base_descriptor_t* mca_btl_mvapi_prepare_dst( (void*) (&mvapi_btl->super), (mca_mpool_base_registration_t*) vapi_reg); if(OMPI_SUCCESS != rc){ - BTL_ERROR("error inserting memory region into memory pool"); + BTL_ERROR(("error inserting memory region into memory pool")); return NULL; } @@ -822,8 +822,7 @@ static void async_event_handler(VAPI_hca_hndl_t hca_hndl, case VAPI_SEND_QUEUE_DRAINED: case VAPI_PORT_ACTIVE: { - BTL_DEBUG_OUT("Got an asynchronous event: %s\n", - VAPI_event_record_sym(event_p->type)); + BTL_DEBUG(("Got an asynchronous event: %s\n", VAPI_event_record_sym(event_p->type))); break; } case VAPI_CQ_ERROR: @@ -835,14 +834,14 @@ static void async_event_handler(VAPI_hca_hndl_t hca_hndl, case VAPI_LOCAL_CATASTROPHIC_ERROR: case VAPI_PORT_ERROR: { - BTL_ERROR("Got an asynchronous event: %s (%s)", + BTL_ERROR(("Got an asynchronous event: %s (%s)", VAPI_event_record_sym(event_p->type), - VAPI_event_syndrome_sym(event_p->syndrome)); + VAPI_event_syndrome_sym(event_p->syndrome))); break; } default: - BTL_ERROR("Warning!! Got an undefined " - "asynchronous event"); + BTL_ERROR(("Warning!! 
Got an undefined " + "asynchronous event")); } } @@ -863,7 +862,7 @@ int mca_btl_mvapi_module_init(mca_btl_mvapi_module_t *mvapi_btl) ret = VAPI_alloc_pd(mvapi_btl->nic, &mvapi_btl->ptag); if(ret != VAPI_OK) { - BTL_ERROR("error in VAPI_alloc_pd: %s", VAPI_strerror(ret)); + BTL_ERROR(("error in VAPI_alloc_pd: %s", VAPI_strerror(ret))); return OMPI_ERROR; } @@ -880,7 +879,7 @@ int mca_btl_mvapi_module_init(mca_btl_mvapi_module_t *mvapi_btl) &mvapi_btl->srq_hndl_high, &srq_attr_out); if(ret != VAPI_OK) { - BTL_ERROR("error in VAPI_create_srq: %s", VAPI_strerror(ret)); + BTL_ERROR(("error in VAPI_create_srq: %s", VAPI_strerror(ret))); return OMPI_ERROR; } ret = VAPI_create_srq(mvapi_btl->nic, @@ -888,7 +887,7 @@ int mca_btl_mvapi_module_init(mca_btl_mvapi_module_t *mvapi_btl) &mvapi_btl->srq_hndl_low, &srq_attr_out); if(ret != VAPI_OK) { - BTL_ERROR("error in VAPI_create_srq: %s", VAPI_strerror(ret)); + BTL_ERROR(("error in VAPI_create_srq: %s", VAPI_strerror(ret))); return OMPI_ERROR; } @@ -901,7 +900,7 @@ int mca_btl_mvapi_module_init(mca_btl_mvapi_module_t *mvapi_btl) if( VAPI_OK != ret) { - BTL_ERROR("error in VAPI_create_cq: %s", VAPI_strerror(ret)); + BTL_ERROR(("error in VAPI_create_cq: %s", VAPI_strerror(ret))); return OMPI_ERROR; } @@ -910,13 +909,13 @@ int mca_btl_mvapi_module_init(mca_btl_mvapi_module_t *mvapi_btl) if( VAPI_OK != ret) { - BTL_ERROR("error in VAPI_create_cq: %s", VAPI_strerror(ret)); + BTL_ERROR(("error in VAPI_create_cq: %s", VAPI_strerror(ret))); return OMPI_ERROR; } if(cqe_cnt <= 0) { - BTL_ERROR("error creating completion queue "); + BTL_ERROR(("error creating completion queue ")); return OMPI_ERROR; } @@ -924,7 +923,7 @@ int mca_btl_mvapi_module_init(mca_btl_mvapi_module_t *mvapi_btl) async_event_handler, 0, &mvapi_btl->async_handler); if(VAPI_OK != ret) { - BTL_ERROR("error in EVAPI_set_async_event_handler: %s", VAPI_strerror(ret)); + BTL_ERROR(("error in EVAPI_set_async_event_handler: %s", VAPI_strerror(ret))); return OMPI_ERROR; } diff --git a/ompi/mca/btl/mvapi/btl_mvapi.h b/ompi/mca/btl/mvapi/btl_mvapi.h index 6dac862d0f..2fb1b02410 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi.h +++ b/ompi/mca/btl/mvapi/btl_mvapi.h @@ -240,10 +240,10 @@ struct mca_btl_mvapi_module_t { post_srr_sub_desc_post, \ &post_srr_sub_rwqe_posted); \ if(VAPI_OK != post_srr_sub_frag->ret) { \ - BTL_ERROR("error posting receive descriptors to shared receive queue: %s",\ - VAPI_strerror(post_srr_sub_frag->ret)); \ + BTL_ERROR(("error posting receive descriptors to shared receive queue: %s",\ + VAPI_strerror(post_srr_sub_frag->ret))); \ } else if(post_srr_sub_rwqe_posted < 1) { \ - BTL_ERROR("error posting receive descriptors to shared receive queue, number of entries posted is %d", post_srr_sub_rwqe_posted); \ + BTL_ERROR(("error posting receive descriptors to shared receive queue, number of entries posted is %d", post_srr_sub_rwqe_posted)); \ } else {\ OPAL_THREAD_ADD32(post_srr_sub_srr_posted, post_srr_sub_cnt); \ }\ diff --git a/ompi/mca/btl/mvapi/btl_mvapi_component.c b/ompi/mca/btl/mvapi/btl_mvapi_component.c index fa15937db3..80f7995bf4 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi_component.c +++ b/ompi/mca/btl/mvapi/btl_mvapi_component.c @@ -255,7 +255,7 @@ mca_btl_base_module_t** mca_btl_mvapi_component_init(int *num_btl_modules, /* Determine the number of hca's available on the host */ vapi_ret=EVAPI_list_hcas(0, &num_hcas, NULL); if( VAPI_EAGAIN != vapi_ret || 0 == num_hcas ) { - BTL_ERROR("No hca's found on this host!"); + BTL_ERROR(("No hca's found on this host!")); return NULL; } 
@@ -284,14 +284,14 @@ mca_btl_base_module_t** mca_btl_mvapi_component_init(int *num_btl_modules, for(i = 0; i < num_hcas; i++){ vapi_ret = EVAPI_get_hca_hndl(hca_ids[i], &hca_hndl); if(VAPI_OK != vapi_ret) { - BTL_ERROR("error getting hca handle: %s", VAPI_strerror(vapi_ret)); + BTL_ERROR(("error getting hca handle: %s", VAPI_strerror(vapi_ret))); return NULL; } vapi_ret = VAPI_query_hca_cap(hca_hndl, &hca_vendor, &hca_cap); if(VAPI_OK != vapi_ret) { - BTL_ERROR("error getting hca properties %s", VAPI_strerror(vapi_ret)); + BTL_ERROR(("error getting hca properties %s", VAPI_strerror(vapi_ret))); return NULL; } @@ -300,7 +300,7 @@ mca_btl_base_module_t** mca_btl_mvapi_component_init(int *num_btl_modules, for(j = 1; j <= hca_cap.phys_port_num; j++){ vapi_ret = VAPI_query_hca_port_prop(hca_hndl, (IB_port_t) j, &hca_port); if(VAPI_OK != vapi_ret) { - BTL_ERROR("error getting hca port properties %s", VAPI_strerror(vapi_ret)); + BTL_ERROR(("error getting hca port properties %s", VAPI_strerror(vapi_ret))); return NULL; } @@ -385,7 +385,7 @@ mca_btl_base_module_t** mca_btl_mvapi_component_init(int *num_btl_modules, &hca_pd); if(NULL == mvapi_btl->ib_pool) { - BTL_ERROR("error creating vapi memory pool! aborting mvapi btl initialization"); + BTL_ERROR(("error creating vapi memory pool! aborting mvapi btl initialization")); return NULL; } /* Initialize pool of send fragments */ @@ -495,16 +495,16 @@ int mca_btl_mvapi_component_progress() ret = VAPI_poll_cq(mvapi_btl->nic, mvapi_btl->cq_hndl_high, &comp); if(VAPI_OK == ret) { if(comp.status != VAPI_SUCCESS) { - BTL_ERROR("Got error : %s, Vendor code : %d Frag : %p", + BTL_ERROR(("Got error : %s, Vendor code : %d Frag : %p", VAPI_wc_status_sym(comp.status), - comp.vendor_err_syndrome, comp.id); + comp.vendor_err_syndrome, comp.id)); return OMPI_ERROR; } /* Handle work completions */ switch(comp.opcode) { case VAPI_CQE_RQ_RDMA_WITH_IMM: - BTL_ERROR("Got an RDMA with Immediate data!, not supported!"); + BTL_ERROR(("Got an RDMA with Immediate data!, not supported!")); return OMPI_ERROR; case VAPI_CQE_SQ_RDMA_WRITE: @@ -520,7 +520,7 @@ int mca_btl_mvapi_component_progress() case VAPI_CQE_RQ_SEND_DATA: /* Process a RECV */ - BTL_DEBUG_OUT("Got a recv completion"); + BTL_DEBUG(("Got a recv completion")); frag = (mca_btl_mvapi_frag_t*) comp.id; endpoint = (mca_btl_mvapi_endpoint_t*) frag->endpoint; @@ -543,7 +543,7 @@ int mca_btl_mvapi_component_progress() break; default: - BTL_ERROR("Unhandled work completion opcode is %d", comp.opcode); + BTL_ERROR(("Unhandled work completion opcode is %d", comp.opcode)); break; } } @@ -553,16 +553,16 @@ int mca_btl_mvapi_component_progress() ret = VAPI_poll_cq(mvapi_btl->nic, mvapi_btl->cq_hndl_low, &comp); if(VAPI_OK == ret) { if(comp.status != VAPI_SUCCESS) { - BTL_ERROR("Got error : %s, Vendor code : %d Frag : %p", + BTL_ERROR(("Got error : %s, Vendor code : %d Frag : %p", VAPI_wc_status_sym(comp.status), - comp.vendor_err_syndrome, comp.id); + comp.vendor_err_syndrome, comp.id)); return OMPI_ERROR; } /* Handle n/w completions */ switch(comp.opcode) { case VAPI_CQE_RQ_RDMA_WITH_IMM: - BTL_ERROR("Got an RDMA with Immediate data!, not supported!"); + BTL_ERROR(("Got an RDMA with Immediate data!, not supported!")); return OMPI_ERROR; case VAPI_CQE_SQ_RDMA_WRITE: @@ -577,7 +577,7 @@ int mca_btl_mvapi_component_progress() case VAPI_CQE_RQ_SEND_DATA: - BTL_DEBUG_OUT("Got a recv completion"); + BTL_DEBUG(("Got a recv completion")); frag = (mca_btl_mvapi_frag_t*) comp.id; endpoint = (mca_btl_mvapi_endpoint_t*) frag->endpoint; 
frag->rc=OMPI_SUCCESS; @@ -599,7 +599,7 @@ int mca_btl_mvapi_component_progress() break; default: - BTL_ERROR("Errorneous network completion"); + BTL_ERROR(("Errorneous network completion")); break; } } diff --git a/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c b/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c index 8ebb07c25d..548c58e9a6 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c +++ b/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c @@ -184,10 +184,10 @@ static int mca_btl_mvapi_endpoint_send_connect_req(mca_btl_base_endpoint_t* endp mca_btl_mvapi_endpoint_send_cb, NULL); - BTL_DEBUG_OUT("Sending High Priority QP num = %d, Low Priority QP num = %d, LID = %d", + BTL_DEBUG(("Sending High Priority QP num = %d, Low Priority QP num = %d, LID = %d", endpoint->lcl_qp_prop_high.qp_num, endpoint->lcl_qp_prop_low.qp_num, - endpoint->endpoint_btl->port.lid); + endpoint->endpoint_btl->port.lid)); if(rc < 0) { ORTE_ERROR_LOG(rc); @@ -260,10 +260,10 @@ static int mca_btl_mvapi_endpoint_set_remote_info(mca_btl_base_endpoint_t* endpo ORTE_ERROR_LOG(rc); return rc; } - BTL_DEBUG_OUT("Received High Priority QP num = %d, Low Priority QP num %d, LID = %d", + BTL_DEBUG(("Received High Priority QP num = %d, Low Priority QP num %d, LID = %d", endpoint->rem_qp_num_high, endpoint->rem_qp_num_low, - endpoint->rem_lid); + endpoint->rem_lid)); return ORTE_SUCCESS; } @@ -293,7 +293,7 @@ static int mca_btl_mvapi_endpoint_start_connect(mca_btl_base_endpoint_t* endpoin &endpoint->lcl_qp_hndl_high, &endpoint->lcl_qp_prop_high, VAPI_TS_RC))) { - BTL_ERROR("error creating queue pair, error code %d", rc); + BTL_ERROR(("error creating queue pair, error code %d", rc)); return rc; } @@ -308,19 +308,19 @@ static int mca_btl_mvapi_endpoint_start_connect(mca_btl_base_endpoint_t* endpoin &endpoint->lcl_qp_prop_low, VAPI_TS_RC))) { - BTL_ERROR("error creating queue pair, error code %d", rc); + BTL_ERROR(("error creating queue pair, error code %d", rc)); return rc; } - BTL_DEBUG_OUT("Initialized High Priority QP num = %d, Low Priority QP num = %d, LID = %d", + BTL_DEBUG(("Initialized High Priority QP num = %d, Low Priority QP num = %d, LID = %d", endpoint->lcl_qp_prop_high.qp_num, endpoint->lcl_qp_prop_low.qp_num, - endpoint->endpoint_btl->port.lid); + endpoint->endpoint_btl->port.lid)); /* Send connection info over to remote endpoint */ endpoint->endpoint_state = MCA_BTL_IB_CONNECTING; if(OMPI_SUCCESS != (rc = mca_btl_mvapi_endpoint_send_connect_req(endpoint))) { - BTL_ERROR("error sending connect request, error code %d", rc); + BTL_ERROR(("error sending connect request, error code %d", rc)); return rc; } return OMPI_SUCCESS; @@ -344,7 +344,7 @@ static int mca_btl_mvapi_endpoint_reply_start_connect(mca_btl_mvapi_endpoint_t * &endpoint->lcl_qp_hndl_high, &endpoint->lcl_qp_prop_high, VAPI_TS_RC))) { - BTL_ERROR("error creating queue pair, error code %d", rc); + BTL_ERROR(("error creating queue pair, error code %d", rc)); return rc; } @@ -358,14 +358,14 @@ static int mca_btl_mvapi_endpoint_reply_start_connect(mca_btl_mvapi_endpoint_t * &endpoint->lcl_qp_hndl_low, &endpoint->lcl_qp_prop_low, VAPI_TS_RC))) { - BTL_ERROR("error creating queue pair, error code %d", rc); + BTL_ERROR(("error creating queue pair, error code %d", rc)); return rc; } - BTL_DEBUG_OUT("Initialized High Priority QP num = %d, Low Priority QP num = %d, LID = %d", + BTL_DEBUG(("Initialized High Priority QP num = %d, Low Priority QP num = %d, LID = %d", endpoint->lcl_qp_prop_high.qp_num, endpoint->lcl_qp_prop_low.qp_num, - endpoint->endpoint_btl->port.lid); + 
endpoint->endpoint_btl->port.lid)); @@ -377,13 +377,13 @@ static int mca_btl_mvapi_endpoint_reply_start_connect(mca_btl_mvapi_endpoint_t * rc = mca_btl_mvapi_endpoint_connect(endpoint); if(rc != OMPI_SUCCESS) { - BTL_ERROR("error in endpoint connect error code is %d", rc); + BTL_ERROR(("error in endpoint connect error code is %d", rc)); return rc; } /* Send connection info over to remote endpoint */ if(OMPI_SUCCESS != (rc = mca_btl_mvapi_endpoint_send_connect_req(endpoint))) { - BTL_ERROR("error in endpoint send connect request error code is %d", rc); + BTL_ERROR(("error in endpoint send connect request error code is %d", rc)); return rc; } return OMPI_SUCCESS; @@ -446,7 +446,7 @@ static void mca_btl_mvapi_endpoint_recv( * and then reply with our QP information */ if(OMPI_SUCCESS != (rc = mca_btl_mvapi_endpoint_reply_start_connect(ib_endpoint, buffer))) { - BTL_ERROR("error in endpoint reply start connect"); + BTL_ERROR(("error in endpoint reply start connect")); break; } @@ -458,7 +458,7 @@ static void mca_btl_mvapi_endpoint_recv( mca_btl_mvapi_endpoint_set_remote_info(ib_endpoint, buffer); if(OMPI_SUCCESS != (rc = mca_btl_mvapi_endpoint_connect(ib_endpoint))) { - BTL_ERROR("endpoint connect error: %d", rc); + BTL_ERROR(("endpoint connect error: %d", rc)); break; } @@ -478,7 +478,7 @@ static void mca_btl_mvapi_endpoint_recv( case MCA_BTL_IB_CONNECTED : break; default : - BTL_ERROR("Invalid endpoint state %d", endpoint_state); + BTL_ERROR(("Invalid endpoint state %d", endpoint_state)); } break; @@ -519,7 +519,7 @@ int mca_btl_mvapi_endpoint_send( switch(endpoint->endpoint_state) { case MCA_BTL_IB_CONNECTING: - BTL_DEBUG_OUT("Queing because state is connecting"); + BTL_DEBUG(("Queing because state is connecting")); opal_list_append(&endpoint->pending_send_frags, (opal_list_item_t *)frag); @@ -529,7 +529,7 @@ int mca_btl_mvapi_endpoint_send( case MCA_BTL_IB_CONNECT_ACK: - BTL_DEBUG_OUT("Queuing because waiting for ack"); + BTL_DEBUG(("Queuing because waiting for ack")); opal_list_append(&endpoint->pending_send_frags, (opal_list_item_t *)frag); @@ -539,7 +539,7 @@ int mca_btl_mvapi_endpoint_send( case MCA_BTL_IB_CLOSED: - BTL_DEBUG_OUT("Connection to endpoint closed ... connecting ..."); + BTL_DEBUG(("Connection to endpoint closed ... 
connecting ...")); opal_list_append(&endpoint->pending_send_frags, (opal_list_item_t *)frag); @@ -558,10 +558,10 @@ int mca_btl_mvapi_endpoint_send( mvapi_btl = endpoint->endpoint_btl; - BTL_DEBUG_OUT("Send to : %d, len : %d, frag : %p", + BTL_DEBUG(("Send to : %d, len : %d, frag : %p", endpoint->endpoint_proc->proc_guid.vpid, frag->sg_entry.len, - frag); + frag)); rc = mca_btl_mvapi_endpoint_post_send(mvapi_btl, endpoint, frag); @@ -598,7 +598,7 @@ void mca_btl_mvapi_progress_send_frags(mca_btl_mvapi_endpoint_t* endpoint) /* We need to post this one */ if(OMPI_SUCCESS != mca_btl_mvapi_endpoint_post_send(mvapi_btl, endpoint, frag)) - BTL_ERROR("error in mca_btl_mvapi_endpoint_send"); + BTL_ERROR(("error in mca_btl_mvapi_endpoint_send")); } } @@ -703,7 +703,7 @@ int mca_btl_mvapi_endpoint_create_qp( qp_prop); if(VAPI_OK != ret) { - BTL_ERROR("error creating the queue pair: %s", VAPI_strerror(ret)); + BTL_ERROR(("error creating the queue pair: %s", VAPI_strerror(ret))); return OMPI_ERROR; } return OMPI_SUCCESS; @@ -749,11 +749,11 @@ int mca_btl_mvapi_endpoint_qp_init_query( &qp_attr, &qp_attr_mask, &qp_cap); if(VAPI_OK != ret) { - BTL_ERROR("Error modifying the queue pair: %s", VAPI_strerror(ret)); + BTL_ERROR(("Error modifying the queue pair: %s", VAPI_strerror(ret))); return OMPI_ERROR; } - BTL_DEBUG_OUT("Modified to init..Qp %d", qp_hndl); + BTL_DEBUG(("Modified to init..Qp %d", qp_hndl)); /********************** INIT --> RTR ************************/ QP_ATTR_MASK_CLR_ALL(qp_attr_mask); @@ -784,11 +784,11 @@ int mca_btl_mvapi_endpoint_qp_init_query( &qp_attr, &qp_attr_mask, &qp_cap); if(VAPI_OK != ret) { - BTL_ERROR("Error modifying the queue pair: %s", VAPI_strerror(ret)); + BTL_ERROR(("Error modifying the queue pair: %s", VAPI_strerror(ret))); return OMPI_ERROR; } - BTL_DEBUG_OUT("Modified to RTR..Qp %d", qp_hndl); + BTL_DEBUG(("Modified to RTR..Qp %d", qp_hndl)); /************** RTS *******************/ QP_ATTR_MASK_CLR_ALL(qp_attr_mask); @@ -811,11 +811,11 @@ int mca_btl_mvapi_endpoint_qp_init_query( if(VAPI_OK != ret) { return OMPI_ERROR; } - BTL_DEBUG_OUT("Modified to RTS..Qp %d", qp_hndl); + BTL_DEBUG(("Modified to RTS..Qp %d", qp_hndl)); ret = VAPI_query_qp(nic, qp_hndl, &qp_attr, &qp_attr_mask, &qp_init_attr ); if (ret != VAPI_OK) { - BTL_ERROR("Error modifying the queue pair: %s", VAPI_strerror(ret)); + BTL_ERROR(("Error modifying the queue pair: %s", VAPI_strerror(ret))); return OMPI_ERROR; } diff --git a/ompi/mca/btl/mvapi/btl_mvapi_endpoint.h b/ompi/mca/btl/mvapi/btl_mvapi_endpoint.h index 820c76f9a3..881cd58e80 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi_endpoint.h +++ b/ompi/mca/btl/mvapi/btl_mvapi_endpoint.h @@ -189,8 +189,8 @@ void mca_btl_mvapi_progress_send_frags(mca_btl_mvapi_endpoint_t*); post_rr_sub_cnt, \ post_rr_sub_desc_post); \ if(VAPI_OK != post_rr_sub_frag->ret) { \ - BTL_ERROR("error posting receive descriptors: %s",\ - VAPI_strerror(post_rr_sub_frag->ret)); \ + BTL_ERROR(("error posting receive descriptors: %s",\ + VAPI_strerror(post_rr_sub_frag->ret))); \ } else {\ OPAL_THREAD_ADD32(post_rr_sub_rr_posted, post_rr_sub_cnt); \ }\ @@ -227,7 +227,7 @@ void mca_btl_mvapi_progress_send_frags(mca_btl_mvapi_endpoint_t*); /* cnt, */ /* rr_desc_post); */ /* if(VAPI_OK != frag->ret) { */ -/* BTL_ERROR("error posting receive descriptors: %s", VAPI_strerror(frag->ret)); */ +/* BTL_ERROR(("error posting receive descriptors: %s", VAPI_strerror(frag->ret))); */ /* return OMPI_ERROR; */ /* } */ /* OPAL_THREAD_ADD32(rr_posted, cnt); */ diff --git 
a/ompi/mca/btl/openib/btl_openib.c b/ompi/mca/btl/openib/btl_openib.c index 7da277f3f4..a60e054e70 100644 --- a/ompi/mca/btl/openib/btl_openib.c +++ b/ompi/mca/btl/openib/btl_openib.c @@ -131,8 +131,7 @@ int mca_btl_openib_del_procs(struct mca_btl_base_module_t* btl, struct ompi_proc_t **procs, struct mca_btl_base_endpoint_t ** peers) { - /* TODO */ - BTL_DEBUG_OUT("Stub\n"); + BTL_DEBUG(("TODO\n")); return OMPI_SUCCESS; } @@ -215,7 +214,7 @@ int mca_btl_openib_free( } else if(frag->size == mca_btl_openib_component.eager_limit){ MCA_BTL_IB_FRAG_RETURN_EAGER(btl, frag); } else { - BTL_ERROR("invalid descriptor"); + BTL_ERROR(("invalid descriptor")); } return OMPI_SUCCESS; @@ -299,13 +298,13 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( rc = mca_mpool_base_remove((void*) openib_reg->base_reg.base); if(OMPI_SUCCESS != rc) { - BTL_ERROR("error removing memory region from memory pool tree"); + BTL_ERROR(("error removing memory region from memory pool tree")); return NULL; } if(is_leave_pinned) { if(NULL == opal_list_remove_item(&openib_btl->reg_mru_list, (opal_list_item_t*) openib_reg)){ - BTL_ERROR("error removing item from reg_mru_list"); + BTL_ERROR(("error removing item from reg_mru_list")); return NULL; } } @@ -324,7 +323,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( if(rc != OMPI_SUCCESS) { - BTL_ERROR("error inserting memory region into memory pool tree"); + BTL_ERROR(("error inserting memory region into memory pool tree")); return NULL; } @@ -338,7 +337,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( } else if(is_leave_pinned) { /* the current memory region is large enough and we should leave the memory pinned */ if(NULL == opal_list_remove_item(&openib_btl->reg_mru_list, (opal_list_item_t*) openib_reg)) { - BTL_ERROR("error removing item from reg_mru_list"); + BTL_ERROR(("error removing item from reg_mru_list")); return NULL; } opal_list_append(&openib_btl->reg_mru_list, (opal_list_item_t*) openib_reg); @@ -398,7 +397,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( opal_list_remove_last(&openib_btl->reg_mru_list); if( NULL == old_reg) { - BTL_ERROR("error removing item from reg_mru_list"); + BTL_ERROR(("error removing item from reg_mru_list")); return NULL; } @@ -406,7 +405,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( rc = mca_mpool_base_remove((void*) old_reg->base_reg.base); if(OMPI_SUCCESS != rc) { - BTL_ERROR("error removing memory region from memory pool tree"); + BTL_ERROR(("error removing memory region from memory pool tree")); return NULL; } @@ -452,8 +451,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( frag->base.des_dst = NULL; frag->base.des_dst_cnt = 0; frag->openib_reg = openib_reg; - BTL_DEBUG_OUT("frag->sg_entry.lkey = %lu .addr = %llu", frag->sg_entry.lkey, frag->sg_entry.addr); - + BTL_DEBUG(("frag->sg_entry.lkey = %lu .addr = %llu", frag->sg_entry.lkey, frag->sg_entry.addr)); return &frag->base; @@ -580,7 +578,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( rc = mca_mpool_base_remove((void*) openib_reg->base_reg.base); if(OMPI_SUCCESS != rc) { - BTL_ERROR("error removing memory region from memory pool tree"); + BTL_ERROR(("error removing memory region from memory pool tree")); return NULL; } @@ -589,7 +587,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( * pull it off the MRU list */ if(NULL == opal_list_remove_item(&openib_btl->reg_mru_list, (opal_list_item_t*) openib_reg)) { - BTL_ERROR("error removing item from reg_mru_list"); + BTL_ERROR(("error removing item from 
reg_mru_list")); return NULL; } } @@ -608,7 +606,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( (mca_mpool_base_registration_t*) openib_reg); if(OMPI_SUCCESS != rc) { - BTL_ERROR("error inserting memory region into memory pool tree"); + BTL_ERROR(("error inserting memory region into memory pool tree")); return NULL; } OBJ_RETAIN(openib_reg); @@ -623,7 +621,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( else if(is_leave_pinned){ /* the current memory region is large enough and we should leave the memory pinned */ if(NULL == opal_list_remove_item(&openib_btl->reg_mru_list, (opal_list_item_t*) openib_reg)) { - BTL_ERROR("error removing item from reg_mru_list"); + BTL_ERROR(("error removing item from reg_mru_list")); return NULL; } opal_list_append(&openib_btl->reg_mru_list, (opal_list_item_t*) openib_reg); @@ -647,13 +645,13 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( opal_list_remove_last(&openib_btl->reg_mru_list); if( NULL == old_reg) { - BTL_ERROR("error removing item from reg_mru_list"); + BTL_ERROR(("error removing item from reg_mru_list")); return NULL; } rc = mca_mpool_base_remove((void*) old_reg->base_reg.base); if(OMPI_SUCCESS !=rc ) { - BTL_ERROR("error removing memory region from memory pool tree"); + BTL_ERROR(("error removing memory region from memory pool tree")); return NULL; } @@ -673,7 +671,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( (void*) (&openib_btl->super), (mca_mpool_base_registration_t*) openib_reg); if(OMPI_SUCCESS != rc){ - BTL_ERROR("error inserting memory region into memory pool"); + BTL_ERROR(("error inserting memory region into memory pool")); return NULL; } @@ -705,7 +703,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( frag->base.des_src = NULL; frag->base.des_src_cnt = 0; frag->openib_reg = openib_reg; - BTL_DEBUG_OUT("frag->sg_entry.lkey = %lu .addr = %llu frag->segment.seg_key.key32[0] = %lu" , frag->sg_entry.lkey, frag->sg_entry.addr, frag->segment.seg_key.key32[0]); + BTL_DEBUG(("frag->sg_entry.lkey = %lu .addr = %llu frag->segment.seg_key.key32[0] = %lu" , frag->sg_entry.lkey, frag->sg_entry.addr, frag->segment.seg_key.key32[0])); return &frag->base; @@ -794,16 +792,16 @@ int mca_btl_openib_put( mca_btl_base_module_t* btl, frag->sg_entry.addr = (uintptr_t) frag->base.des_src->seg_addr.pval; frag->sg_entry.length = frag->base.des_src->seg_len; - BTL_DEBUG_OUT("frag->wr_desc.sr_desc.wr.rdma.remote_addr = %llu .rkey = %lu frag->sg_entry.addr = %llu .length = %lu" + BTL_DEBUG(("frag->wr_desc.sr_desc.wr.rdma.remote_addr = %llu .rkey = %lu frag->sg_entry.addr = %llu .length = %lu" , frag->wr_desc.sr_desc.wr.rdma.remote_addr , frag->wr_desc.sr_desc.wr.rdma.rkey , frag->sg_entry.addr - , frag->sg_entry.length); + , frag->sg_entry.length)); if(ibv_post_send(endpoint->lcl_qp_low, &frag->wr_desc.sr_desc, &bad_wr)){ - BTL_ERROR("error posting send request errno says %s", strerror(errno)); + BTL_ERROR(("error posting send request errno says %s", strerror(errno))); return OMPI_ERROR; } @@ -830,9 +828,9 @@ int mca_btl_openib_module_init(mca_btl_openib_module_t *openib_btl) if(NULL == openib_btl->ib_pd) { - BTL_ERROR("error allocating pd for %s errno says %s\n", + BTL_ERROR(("error allocating pd for %s errno says %s\n", ibv_get_device_name(openib_btl->ib_dev), - strerror(errno)); + strerror(errno))); return OMPI_ERROR; } @@ -840,18 +838,18 @@ int mca_btl_openib_module_init(mca_btl_openib_module_t *openib_btl) openib_btl->ib_cq_low = ibv_create_cq(ctx, mca_btl_openib_component.ib_cq_size, NULL); if(NULL 
== openib_btl->ib_cq_low) { - BTL_ERROR("error creating low priority cq for %s errno says %s\n", + BTL_ERROR(("error creating low priority cq for %s errno says %s\n", ibv_get_device_name(openib_btl->ib_dev), - strerror(errno)); + strerror(errno))); return OMPI_ERROR; } openib_btl->ib_cq_high = ibv_create_cq(ctx, mca_btl_openib_component.ib_cq_size, NULL); if(NULL == openib_btl->ib_cq_high) { - BTL_ERROR("error creating high priority cq for %s errno says %s\n", + BTL_ERROR(("error creating high priority cq for %s errno says %s\n", ibv_get_device_name(openib_btl->ib_dev), - strerror(errno)); + strerror(errno))); return OMPI_ERROR; } diff --git a/ompi/mca/btl/openib/btl_openib_component.c b/ompi/mca/btl/openib/btl_openib_component.c index 8fe5b07358..eeca74bc86 100644 --- a/ompi/mca/btl/openib/btl_openib_component.c +++ b/ompi/mca/btl/openib/btl_openib_component.c @@ -265,7 +265,7 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules, num_devs++; if(0 == num_devs) { - BTL_ERROR("No hca's found on this host!"); + BTL_ERROR(("No hca's found on this host!")); return NULL; } @@ -300,12 +300,12 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules, ib_dev_context = ibv_open_device(ib_dev); if(!ib_dev_context) { - BTL_ERROR(" error obtaining device context for %s errno says %s\n", ibv_get_device_name(ib_dev), strerror(errno)); + BTL_ERROR((" error obtaining device context for %s errno says %s\n", ibv_get_device_name(ib_dev), strerror(errno))); return NULL; } if(ibv_query_device(ib_dev_context, &ib_dev_attr)){ - BTL_ERROR("error obtaining device attributes for %s errno says %s\n", ibv_get_device_name(ib_dev), strerror(errno)); + BTL_ERROR(("error obtaining device attributes for %s errno says %s\n", ibv_get_device_name(ib_dev), strerror(errno))); return NULL; } @@ -316,8 +316,8 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules, struct ibv_port_attr* ib_port_attr; ib_port_attr = (struct ibv_port_attr*) malloc(sizeof(struct ibv_port_attr)); if(ibv_query_port(ib_dev_context, (uint8_t) j, ib_port_attr)){ - BTL_ERROR("error getting port attributes for device %s port number %d errno says %s", - ibv_get_device_name(ib_dev), j, strerror(errno)); + BTL_ERROR(("error getting port attributes for device %s port number %d errno says %s", + ibv_get_device_name(ib_dev), j, strerror(errno))); return NULL; } @@ -402,7 +402,7 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules, &mpool_resources); if(NULL == openib_btl->ib_pool) { - BTL_ERROR("error creating vapi memory pool! aborting openib btl initialization"); + BTL_ERROR(("error creating vapi memory pool! 
aborting openib btl initialization")); return NULL; } @@ -510,27 +510,27 @@ int mca_btl_openib_component_progress() do{ ne=ibv_poll_cq(openib_btl->ib_cq_high, 1, &wc ); if(ne < 0 ){ - BTL_ERROR("error polling CQ with %d errno says %s\n", ne, strerror(errno)); + BTL_ERROR(("error polling CQ with %d errno says %s\n", ne, strerror(errno))); return OMPI_ERROR; } else if(wc.status != IBV_WC_SUCCESS) { - BTL_ERROR("error polling CQ with status %d for wr_id %llu\n", - wc.status, wc.wr_id); + BTL_ERROR(("error polling CQ with status %d for wr_id %llu\n", + wc.status, wc.wr_id)); return OMPI_ERROR; } else if(1 == ne) { - BTL_DEBUG_OUT("completion queue event says opcode is %d\n", wc.opcode); + BTL_DEBUG(("completion queue event says opcode is %d\n", wc.opcode)); /* Handle work completions */ switch(wc.opcode) { case IBV_WC_RECV_RDMA_WITH_IMM: - BTL_ERROR("Got an RDMA with Immediate data Not supported!"); + BTL_ERROR(("Got an RDMA with Immediate data Not supported!")); return OMPI_ERROR; case IBV_WC_RECV: /* Process a RECV */ - BTL_DEBUG_OUT("Got an recv on the completion queue"); + BTL_DEBUG(("Got an recv on the completion queue")); frag = (mca_btl_openib_frag_t*) wc.wr_id; endpoint = (mca_btl_openib_endpoint_t*) frag->endpoint; frag->rc=OMPI_SUCCESS; @@ -566,7 +566,7 @@ int mca_btl_openib_component_progress() break; default: - BTL_ERROR("Unhandled work completion opcode is %d", wc.opcode); + BTL_ERROR(("Unhandled work completion opcode is %d", wc.opcode)); break; } } @@ -575,24 +575,24 @@ int mca_btl_openib_component_progress() ne=ibv_poll_cq(openib_btl->ib_cq_low, 1, &wc ); if(ne < 0){ - BTL_ERROR("error polling CQ with %d errno says %s", ne, strerror(errno)); + BTL_ERROR(("error polling CQ with %d errno says %s", ne, strerror(errno))); return OMPI_ERROR; } else if(wc.status != IBV_WC_SUCCESS) { - BTL_ERROR("error polling CQ with status %d for wr_id %llu", - wc.status, wc.wr_id); + BTL_ERROR(("error polling CQ with status %d for wr_id %llu", + wc.status, wc.wr_id)); return OMPI_ERROR; } else if(1 == ne) { /* Handle n/w completions */ switch(wc.opcode) { case IBV_WC_RECV_RDMA_WITH_IMM: - BTL_ERROR("Got an RDMA with Immediate data Not supported!"); + BTL_ERROR(("Got an RDMA with Immediate data Not supported!")); return OMPI_ERROR; case IBV_WC_RECV: /* process a recv completion (this should only occur for a send not an rdma) */ - BTL_DEBUG_OUT( "Got a recv completion"); + BTL_DEBUG(( "Got a recv completion")); frag = (mca_btl_openib_frag_t*) wc.wr_id; endpoint = (mca_btl_openib_endpoint_t*) frag->endpoint; frag->rc=OMPI_SUCCESS; @@ -624,7 +624,7 @@ int mca_btl_openib_component_progress() break; default: - BTL_ERROR("Unhandled work completion opcode is %d", wc.opcode); + BTL_ERROR(("Unhandled work completion opcode is %d", wc.opcode)); break; } } diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.c b/ompi/mca/btl/openib/btl_openib_endpoint.c index a08f6e806c..ca67e5e13c 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.c +++ b/ompi/mca/btl/openib/btl_openib_endpoint.c @@ -93,7 +93,7 @@ static inline int mca_btl_openib_endpoint_post_send(mca_btl_openib_module_t* ope if(ibv_post_send(ib_qp, &frag->wr_desc.sr_desc, &bad_wr)) { - BTL_ERROR("error posting send request errno says %s", strerror(errno)); + BTL_ERROR(("error posting send request errno says %s", strerror(errno))); return OMPI_ERROR; } MCA_BTL_OPENIB_ENDPOINT_POST_RR_HIGH(endpoint, 1); @@ -206,10 +206,10 @@ static int mca_btl_openib_endpoint_send_connect_req(mca_btl_base_endpoint_t* end mca_btl_openib_endpoint_send_cb, NULL); - 
BTL_DEBUG_OUT("Sending High Priority QP num = %d, Low Priority QP num = %d, LID = %d", + BTL_DEBUG(("Sending High Priority QP num = %d, Low Priority QP num = %d, LID = %d", endpoint->lcl_qp_high->qp_num, endpoint->lcl_qp_low->qp_num, - endpoint->endpoint_btl->ib_port_attr->lid); + endpoint->endpoint_btl->ib_port_attr->lid)); if(rc < 0) { ORTE_ERROR_LOG(rc); @@ -297,10 +297,10 @@ static int mca_btl_openib_endpoint_set_remote_info(mca_btl_base_endpoint_t* endp ORTE_ERROR_LOG(rc); return rc; } - BTL_DEBUG_OUT("Received High Priority QP num = %d, Low Priority QP num %d, LID = %d", + BTL_DEBUG(("Received High Priority QP num = %d, Low Priority QP num %d, LID = %d", endpoint->rem_qp_num_high, endpoint->rem_qp_num_low, - endpoint->rem_lid); + endpoint->rem_lid)); return ORTE_SUCCESS; } @@ -328,7 +328,7 @@ static int mca_btl_openib_endpoint_start_connect(mca_btl_base_endpoint_t* endpoi openib_btl->ib_cq_high, endpoint->lcl_qp_attr_high, &endpoint->lcl_qp_high))) { - BTL_ERROR("error creating queue pair, error code %d", rc); + BTL_ERROR(("error creating queue pair, error code %d", rc)); return rc; } srand48(getpid() * time(NULL)); @@ -340,20 +340,20 @@ static int mca_btl_openib_endpoint_start_connect(mca_btl_base_endpoint_t* endpoi openib_btl->ib_cq_low, endpoint->lcl_qp_attr_low, &endpoint->lcl_qp_low))) { - BTL_ERROR("error creating queue pair, error code %d", rc); + BTL_ERROR(("error creating queue pair, error code %d", rc)); return rc; } endpoint->lcl_psn_low = lrand48() & 0xffffff; - BTL_DEBUG_OUT("Initialized High Priority QP num = %d, Low Priority QP num = %d, LID = %d", + BTL_DEBUG(("Initialized High Priority QP num = %d, Low Priority QP num = %d, LID = %d", endpoint->lcl_qp_high->qp_num, endpoint->lcl_qp_low->qp_num, - openib_btl->ib_port_attr->lid); + openib_btl->ib_port_attr->lid)); /* Send connection info over to remote endpoint */ endpoint->endpoint_state = MCA_BTL_IB_CONNECTING; if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_send_connect_req(endpoint))) { - BTL_ERROR("error sending connect request, error code %d", rc); + BTL_ERROR(("error sending connect request, error code %d", rc)); return rc; } return OMPI_SUCCESS; @@ -375,7 +375,7 @@ static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t openib_btl->ib_cq_high, endpoint->lcl_qp_attr_high, &endpoint->lcl_qp_high))) { - BTL_ERROR("error creating queue pair, error code %d", rc); + BTL_ERROR(("error creating queue pair, error code %d", rc)); return rc; } srand48(getpid() * time(NULL)); @@ -387,15 +387,15 @@ static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t openib_btl->ib_cq_low, endpoint->lcl_qp_attr_low, &endpoint->lcl_qp_low))) { - BTL_ERROR("error creating queue pair, error code %d", rc); + BTL_ERROR(("error creating queue pair, error code %d", rc)); return rc; } endpoint->lcl_psn_low = lrand48() & 0xffffff; - BTL_DEBUG_OUT("Initialized High Priority QP num = %d, Low Priority QP num = %d, LID = %d", + BTL_DEBUG(("Initialized High Priority QP num = %d, Low Priority QP num = %d, LID = %d", endpoint->lcl_qp_high->qp_num, endpoint->lcl_qp_low->qp_num, - openib_btl->ib_port_attr->lid); + openib_btl->ib_port_attr->lid)); /* Set the remote side info */ @@ -405,13 +405,13 @@ static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t rc = mca_btl_openib_endpoint_connect(endpoint); if(rc != OMPI_SUCCESS) { - BTL_ERROR("error in endpoint connect error code is %d", rc); + BTL_ERROR(("error in endpoint connect error code is %d", rc)); return rc; } /* Send 
connection info over to remote endpoint */ if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_send_connect_req(endpoint))) { - BTL_ERROR("error in endpoint send connect request error code is %d", rc); + BTL_ERROR(("error in endpoint send connect request error code is %d", rc)); return rc; } return OMPI_SUCCESS; @@ -476,7 +476,7 @@ static void mca_btl_openib_endpoint_recv( * and then reply with our QP information */ if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_reply_start_connect(ib_endpoint, buffer))) { - BTL_ERROR("error in endpoint reply start connect"); + BTL_ERROR(("error in endpoint reply start connect")); break; } @@ -488,7 +488,7 @@ static void mca_btl_openib_endpoint_recv( mca_btl_openib_endpoint_set_remote_info(ib_endpoint, buffer); if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_connect(ib_endpoint))) { - BTL_ERROR("endpoint connect error: %d", rc); + BTL_ERROR(("endpoint connect error: %d", rc)); break; } @@ -508,7 +508,7 @@ static void mca_btl_openib_endpoint_recv( break; default : - BTL_ERROR("Invalid endpoint state %d", endpoint_state); + BTL_ERROR(("Invalid endpoint state %d", endpoint_state)); } break; @@ -553,7 +553,7 @@ int mca_btl_openib_endpoint_send( switch(endpoint->endpoint_state) { case MCA_BTL_IB_CONNECTING: - BTL_DEBUG_OUT("Queing because state is connecting"); + BTL_DEBUG(("Queing because state is connecting")); opal_list_append(&endpoint->pending_send_frags, (opal_list_item_t *)frag); @@ -563,7 +563,7 @@ int mca_btl_openib_endpoint_send( case MCA_BTL_IB_CONNECT_ACK: - BTL_DEBUG_OUT("Queuing because waiting for ack"); + BTL_DEBUG(("Queuing because waiting for ack")); opal_list_append(&endpoint->pending_send_frags, (opal_list_item_t *)frag); @@ -573,7 +573,7 @@ int mca_btl_openib_endpoint_send( case MCA_BTL_IB_CLOSED: - BTL_DEBUG_OUT("Connection to endpoint closed ... connecting ..."); + BTL_DEBUG(("Connection to endpoint closed ... 
connecting ...")); opal_list_append(&endpoint->pending_send_frags, (opal_list_item_t *)frag); rc = mca_btl_openib_endpoint_start_connect(endpoint); @@ -587,10 +587,10 @@ int mca_btl_openib_endpoint_send( case MCA_BTL_IB_CONNECTED: { openib_btl = endpoint->endpoint_btl; - BTL_DEBUG_OUT("Send to : %d, len : %lu, frag : %llu", + BTL_DEBUG(("Send to : %d, len : %lu, frag : %llu", endpoint->endpoint_proc->proc_guid.vpid, frag->sg_entry.length, - (unsigned long long) frag); + (unsigned long long) frag)); rc = mca_btl_openib_endpoint_post_send(openib_btl, endpoint, frag); break; } @@ -628,7 +628,7 @@ void mca_btl_openib_progress_send_frags(mca_btl_openib_endpoint_t* endpoint) /* We need to post this one */ if(OMPI_SUCCESS != mca_btl_openib_endpoint_post_send(openib_btl, endpoint, frag)) - BTL_ERROR("Error posting send"); + BTL_ERROR(("Error posting send")); } } @@ -716,7 +716,7 @@ int mca_btl_openib_endpoint_create_qp( my_qp = ibv_create_qp(pd, &qp_init_attr); if(NULL == my_qp) { - BTL_ERROR("error creating qp errno says %s", strerror(errno)); + BTL_ERROR(("error creating qp errno says %s", strerror(errno))); return OMPI_ERROR; } (*qp) = my_qp; @@ -735,7 +735,7 @@ int mca_btl_openib_endpoint_create_qp( IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS )) { - BTL_ERROR("error modifying qp to INIT errno says %s", strerror(errno)); + BTL_ERROR(("error modifying qp to INIT errno says %s", strerror(errno))); return OMPI_ERROR; } } @@ -781,7 +781,7 @@ int mca_btl_openib_endpoint_qp_init_query( IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER)) { - BTL_ERROR("error modifing QP to RTR errno says %s", strerror(errno)); + BTL_ERROR(("error modifing QP to RTR errno says %s", strerror(errno))); return OMPI_ERROR; } attr->qp_state = IBV_QPS_RTS; @@ -797,7 +797,7 @@ int mca_btl_openib_endpoint_qp_init_query( IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC)) { - BTL_ERROR("error modifying QP to RTS errno says %s", strerror(errno)); + BTL_ERROR(("error modifying QP to RTS errno says %s", strerror(errno))); return OMPI_ERROR; } return OMPI_SUCCESS; diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.h b/ompi/mca/btl/openib/btl_openib_endpoint.h index 32b0db1884..973f70b688 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.h +++ b/ompi/mca/btl/openib/btl_openib_endpoint.h @@ -188,7 +188,7 @@ void mca_btl_openib_progress_send_frags(mca_btl_openib_endpoint_t*); if(ibv_post_recv(post_rr_sub_qp, \ &post_rr_sub_frag->wr_desc.rr_desc, \ &post_rr_sub_bad_wr)) { \ - BTL_ERROR("error posting receive errno says %s\n", strerror(errno)); \ + BTL_ERROR(("error posting receive errno says %s\n", strerror(errno))); \ return OMPI_ERROR; \ }\ }\ diff --git a/ompi/mca/btl/tcp/Makefile.am b/ompi/mca/btl/tcp/Makefile.am new file mode 100644 index 0000000000..b34267dcfe --- /dev/null +++ b/ompi/mca/btl/tcp/Makefile.am @@ -0,0 +1,56 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University. +# All rights reserved. +# Copyright (c) 2004-2005 The Trustees of the University of Tennessee. +# All rights reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Use the top-level Makefile.options + +include $(top_ompi_srcdir)/config/Makefile.options + +sources = \ + btl_tcp.c \ + btl_tcp.h \ + btl_tcp_component.c \ + btl_tcp_endpoint.c \ + btl_tcp_endpoint.h \ + btl_tcp_frag.c \ + btl_tcp_frag.h \ + btl_tcp_proc.c \ + btl_tcp_proc.h \ + btl_tcp_error.h + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). + +if OMPI_BUILD_btl_tcp_DSO +lib = +lib_sources = +component = mca_btl_tcp.la +component_sources = $(sources) +else +lib = libmca_btl_tcp.la +lib_sources = $(sources) +component = +component_sources = +endif + +mcacomponentdir = $(libdir)/openmpi +mcacomponent_LTLIBRARIES = $(component) +mca_btl_tcp_la_SOURCES = $(component_sources) +mca_btl_tcp_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(lib) +libmca_btl_tcp_la_SOURCES = $(lib_sources) +libmca_btl_tcp_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/btl/tcp/btl_tcp.c b/ompi/mca/btl/tcp/btl_tcp.c new file mode 100644 index 0000000000..46894cedc9 --- /dev/null +++ b/ompi/mca/btl/tcp/btl_tcp.c @@ -0,0 +1,426 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include +#include "util/output.h" +#include "util/if.h" +#include "mca/pml/pml.h" +#include "mca/btl/btl.h" + +#include "btl_tcp.h" +#include "btl_tcp_frag.h" +#include "btl_tcp_proc.h" +#include "btl_tcp_endpoint.h" +#include "datatype/convertor.h" +#include "mca/mpool/base/base.h" +#include "mca/mpool/mpool.h" + + +mca_btl_tcp_module_t mca_btl_tcp_module = { + { + &mca_btl_tcp_component.super, + 0, /* max size of first fragment */ + 0, /* min send fragment size */ + 0, /* max send fragment size */ + 0, /* min rdma fragment size */ + 0, /* max rdma fragment size */ + 0, /* exclusivity */ + 0, /* latency */ + 0, /* bandwidth */ + 0, /* flags */ + mca_btl_tcp_add_procs, + mca_btl_tcp_del_procs, + mca_btl_tcp_register, + mca_btl_tcp_finalize, + mca_btl_tcp_alloc, + mca_btl_tcp_free, + mca_btl_tcp_prepare_src, + mca_btl_tcp_prepare_dst, + mca_btl_tcp_send, + mca_btl_tcp_put, + NULL /* get */ + } +}; + +/** + * + */ + +int mca_btl_tcp_add_procs( + struct mca_btl_base_module_t* btl, + size_t nprocs, + struct ompi_proc_t **ompi_procs, + struct mca_btl_base_endpoint_t** peers, + ompi_bitmap_t* reachable) +{ + mca_btl_tcp_module_t* tcp_btl = (mca_btl_tcp_module_t*)btl; + int i, rc; + + for(i = 0; i < (int) nprocs; i++) { + + struct ompi_proc_t* ompi_proc = ompi_procs[i]; + mca_btl_tcp_proc_t* tcp_proc; + mca_btl_base_endpoint_t* tcp_endpoint; + + if(NULL == (tcp_proc = mca_btl_tcp_proc_create(ompi_proc))) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + /* + * Check to make sure that the peer has at least as many interface + * addresses exported as we are trying to use. If not, then + * don't bind this BTL instance to the proc. + */ + + OPAL_THREAD_LOCK(&tcp_proc->proc_lock); + + /* The btl_proc datastructure is shared by all TCP BTL + * instances that are trying to reach this destination. 
+ * Cache the peer instance on the btl_proc. + */ + tcp_endpoint = OBJ_NEW(mca_btl_tcp_endpoint_t); + if(NULL == tcp_endpoint) { + OPAL_THREAD_UNLOCK(&module_proc->proc_lock); + return OMPI_ERR_OUT_OF_RESOURCE; + } + + tcp_endpoint->endpoint_btl = tcp_btl; + rc = mca_btl_tcp_proc_insert(tcp_proc, tcp_endpoint); + if(rc != OMPI_SUCCESS) { + OBJ_RELEASE(tcp_endpoint); + OPAL_THREAD_UNLOCK(&module_proc->proc_lock); + continue; + } + + ompi_bitmap_set_bit(reachable, i); + OPAL_THREAD_UNLOCK(&module_proc->proc_lock); + peers[i] = tcp_endpoint; + opal_list_append(&tcp_btl->tcp_endpoints, (opal_list_item_t*)tcp_endpoint); + + /* we increase the count of MPI users of the event library + once per peer, so that we are used until we aren't + connected to a peer */ + opal_progress_event_increment(); + } + + return OMPI_SUCCESS; +} + +int mca_btl_tcp_del_procs(struct mca_btl_base_module_t* btl, + size_t nprocs, + struct ompi_proc_t **procs, + struct mca_btl_base_endpoint_t ** peers) +{ + /* TODO */ + return OMPI_SUCCESS; +} + + +/** + * Register callback function to support send/recv semantics + */ + +int mca_btl_tcp_register( + struct mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_module_recv_cb_fn_t cbfunc, + void* cbdata) +{ + mca_btl_tcp_module_t* tcp_btl = (mca_btl_tcp_module_t*) btl; + tcp_btl->tcp_reg[tag].cbfunc = cbfunc; + tcp_btl->tcp_reg[tag].cbdata = cbdata; + return OMPI_SUCCESS; +} + + +/** + * Allocate a segment. + * + * @param btl (IN) BTL module + * @param size (IN) Request segment size. + */ + +mca_btl_base_descriptor_t* mca_btl_tcp_alloc( + struct mca_btl_base_module_t* btl, + size_t size) +{ + mca_btl_tcp_frag_t* frag; + int rc; + + if(size <= btl->btl_eager_limit){ + MCA_BTL_TCP_FRAG_ALLOC_EAGER(frag, rc); + frag->segments[0].seg_len = + size <= btl->btl_eager_limit ? + size : btl->btl_eager_limit ; + } else { + MCA_BTL_TCP_FRAG_ALLOC_MAX(frag, rc); + frag->segments[0].seg_len = + size <= btl->btl_max_send_size ? + size : btl->btl_max_send_size ; + } + frag->segments[0].seg_addr.pval = frag+1; + + frag->base.des_src = frag->segments; + frag->base.des_src_cnt = 1; + frag->base.des_dst = NULL; + frag->base.des_dst_cnt = 0; + frag->base.des_flags = 0; + frag->btl = (mca_btl_tcp_module_t*)btl; + return (mca_btl_base_descriptor_t*)frag; +} + + +/** + * Return a segment + */ + +int mca_btl_tcp_free( + struct mca_btl_base_module_t* btl, + mca_btl_base_descriptor_t* des) +{ + mca_btl_tcp_frag_t* frag = (mca_btl_tcp_frag_t*)des; + if(frag->size == 0) { + MCA_BTL_TCP_FRAG_RETURN_USER(frag); + } else if(frag->size == btl->btl_eager_limit){ + MCA_BTL_TCP_FRAG_RETURN_EAGER(frag); + } else if(frag->size == btl->btl_max_send_size) { + MCA_BTL_TCP_FRAG_RETURN_MAX(frag); + } else { + return OMPI_ERR_BAD_PARAM; + } + return OMPI_SUCCESS; +} + +/** + * Pack data and return a descriptor that can be + * used for send/put. 
+ * + * @param btl (IN) BTL module + * @param peer (IN) BTL peer addressing + */ +mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src( + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + struct mca_mpool_base_registration_t* registration, + struct ompi_convertor_t* convertor, + size_t reserve, + size_t* size) +{ + mca_btl_tcp_frag_t* frag; + struct iovec iov; + uint32_t iov_count = 1; + size_t max_data = *size; + int32_t free_after; + int rc; + + /* + * if we aren't pinning the data and the requested size is less + * than the eager limit pack into a fragment from the eager pool + */ + + if (max_data+reserve <= btl->btl_eager_limit) { + MCA_BTL_TCP_FRAG_ALLOC_EAGER(frag, rc); + } + + /* + * otherwise pack as much data as we can into a fragment + * that is the max send size. + */ + else { + MCA_BTL_TCP_FRAG_ALLOC_MAX(frag, rc); + } + if(NULL == frag) { + return NULL; + } + + if(ompi_convertor_need_buffers(convertor)) { + + if (max_data + reserve > frag->size) { + max_data = frag->size - reserve; + } + iov.iov_len = max_data; + iov.iov_base = (void*)(((unsigned char*)(frag+1)) + reserve); + + rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data, &free_after); + *size = max_data; + if( rc < 0 ) { + MCA_BTL_TCP_FRAG_RETURN_EAGER(frag); + return NULL; + } + + frag->segments[0].seg_addr.pval = iov.iov_base; + frag->segments[0].seg_len = max_data + reserve; + frag->base.des_src_cnt = 1; + + } else { + + iov.iov_len = max_data; + iov.iov_base = NULL; + + rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data, &free_after); + *size = max_data; + if( rc < 0 ) { + MCA_BTL_TCP_FRAG_RETURN_EAGER(frag); + return NULL; + } + + frag->segments[0].seg_addr.pval = frag+1; + frag->segments[0].seg_len = reserve; + frag->segments[1].seg_addr.pval = iov.iov_base; + frag->segments[1].seg_len = max_data; + frag->base.des_src_cnt = 2; + } + + frag->base.des_src = frag->segments; + frag->base.des_dst = NULL; + frag->base.des_dst_cnt = 0; + frag->base.des_flags = 0; + return &frag->base; +} + + +/** + * Prepare a descriptor for send/rdma using the supplied + * convertor. If the convertor references data that is contigous, + * the descriptor may simply point to the user buffer. Otherwise, + * this routine is responsible for allocating buffer space and + * packing if required. + * + * @param btl (IN) BTL module + * @param endpoint (IN) BTL peer addressing + * @param convertor (IN) Data type convertor + * @param reserve (IN) Additional bytes requested by upper layer to precede user data + * @param size (IN/OUT) Number of bytes to prepare (IN), number of bytes actually prepared (OUT) + */ + +mca_btl_base_descriptor_t* mca_btl_tcp_prepare_dst( + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + struct mca_mpool_base_registration_t* registration, + struct ompi_convertor_t* convertor, + size_t reserve, + size_t* size) +{ + mca_btl_tcp_frag_t* frag; + int rc; + + MCA_BTL_TCP_FRAG_ALLOC_USER(frag, rc); + if(NULL == frag) { + return NULL; + } + + frag->segments->seg_len = *size; + frag->segments->seg_addr.pval = convertor->pBaseBuf + convertor->bConverted; + + frag->base.des_src = NULL; + frag->base.des_src_cnt = 0; + frag->base.des_dst = frag->segments; + frag->base.des_dst_cnt = 1; + frag->base.des_flags = 0; + return &frag->base; +} + + +/** + * Initiate an asynchronous send. 
+ * + * @param btl (IN) BTL module + * @param endpoint (IN) BTL addressing information + * @param descriptor (IN) Description of the data to be transfered + * @param tag (IN) The tag value used to notify the peer. + */ + +int mca_btl_tcp_send( + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + struct mca_btl_base_descriptor_t* descriptor, + mca_btl_base_tag_t tag) +{ + mca_btl_tcp_module_t* tcp_btl = (mca_btl_tcp_module_t*) btl; + mca_btl_tcp_frag_t* frag = (mca_btl_tcp_frag_t*)descriptor; + frag->btl = tcp_btl; + frag->hdr.base.tag = tag; + frag->hdr.type = MCA_BTL_TCP_HDR_TYPE_SEND; + MCA_BTL_TCP_FRAG_INIT_SRC(frag,endpoint); + return mca_btl_tcp_endpoint_send(endpoint,frag); +} + + +/** + * Initiate an asynchronous put. + * + * @param btl (IN) BTL module + * @param endpoint (IN) BTL addressing information + * @param descriptor (IN) Description of the data to be transferred + */ + +int mca_btl_tcp_put( + mca_btl_base_module_t* btl, + mca_btl_base_endpoint_t* endpoint, + mca_btl_base_descriptor_t* descriptor) +{ + mca_btl_tcp_module_t* tcp_btl = (mca_btl_tcp_module_t*) btl; + mca_btl_tcp_frag_t* frag = (mca_btl_tcp_frag_t*) descriptor; + frag->btl = tcp_btl; + frag->endpoint = endpoint; + /* TODO */ + return OMPI_ERR_NOT_IMPLEMENTED; +} + + +/** + * Initiate an asynchronous get. + * + * @param btl (IN) BTL module + * @param endpoint (IN) BTL addressing information + * @param descriptor (IN) Description of the data to be transferred + * + */ + +int mca_btl_tcp_get( + mca_btl_base_module_t* btl, + mca_btl_base_endpoint_t* endpoint, + mca_btl_base_descriptor_t* descriptor) +{ + mca_btl_tcp_module_t* tcp_btl = (mca_btl_tcp_module_t*) btl; + mca_btl_tcp_frag_t* frag = (mca_btl_tcp_frag_t*) descriptor; + frag->btl = tcp_btl; + frag->endpoint = endpoint; + /* TODO */ + return OMPI_ERR_NOT_IMPLEMENTED; +} + + +/* + * Cleanup/release module resources. + */ + +int mca_btl_tcp_finalize(struct mca_btl_base_module_t* btl) +{ + mca_btl_tcp_module_t* tcp_btl = (mca_btl_tcp_module_t*) btl; + opal_list_item_t* item; + for( item = opal_list_remove_first(&tcp_btl->tcp_endpoints); + item != NULL; + item = opal_list_remove_first(&tcp_btl->tcp_endpoints)) { + mca_btl_tcp_endpoint_t *endpoint = (mca_btl_tcp_endpoint_t*)item; + OBJ_RELEASE(endpoint); + opal_progress_event_decrement(); + } + free(tcp_btl); + return OMPI_SUCCESS; +} + diff --git a/ompi/mca/btl/tcp/btl_tcp.h b/ompi/mca/btl/tcp/btl_tcp.h new file mode 100644 index 0000000000..2651cf2f44 --- /dev/null +++ b/ompi/mca/btl/tcp/btl_tcp.h @@ -0,0 +1,319 @@ + +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ +#ifndef MCA_BTL_TCP_H +#define MCA_BTL_TCP_H + +/* Standard system includes */ +#include "ompi_config.h" +#ifdef HAVE_SYS_TYPES_H +#include +#endif +#ifdef HAVE_SYS_SOCKET_H +#include +#endif +#ifdef HAVE_NETINET_IN_H +#include +#endif + +/* Open MPI includes */ +#include "opal/event/event.h" +#include "opal/util/output.h" +#include "ompi/class/ompi_bitmap.h" +#include "ompi/class/ompi_free_list.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/btl/btl.h" +#include "ompi/mca/btl/base/base.h" +#include "ompi/mca/mpool/mpool.h" +#include "ompi/mca/btl/btl.h" + +#define MCA_BTL_TCP_STATISTICS 0 +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif + + +/** + * Infiniband (TCP) BTL component. + */ + +struct mca_btl_tcp_component_t { + mca_btl_base_component_1_0_0_t super; /**< base BTL component */ + uint32_t tcp_num_btls; /**< number of hcas available to the TCP component */ + struct mca_btl_tcp_module_t **tcp_btls; /**< array of available BTL modules */ + struct mca_btl_tcp_proc_t* tcp_local; /**< local proc struct */ + int tcp_free_list_num; /**< initial size of free lists */ + int tcp_free_list_max; /**< maximum size of free lists */ + int tcp_free_list_inc; /**< number of elements to alloc when growing free lists */ + opal_hash_table_t tcp_procs; /**< hash table of tcp proc structures */ + opal_list_t tcp_events; /**< list of pending tcp events */ + opal_mutex_t tcp_lock; /**< lock for accessing module state */ + opal_event_t tcp_recv_event; /**< recv event for listen socket */ + int tcp_listen_sd; /**< listen socket for incoming connection requests */ + unsigned short tcp_listen_port; /**< listen port */ + char* tcp_if_include; /**< comma seperated list of interface to include */ + char* tcp_if_exclude; /**< comma seperated list of interface to exclude */ + int tcp_sndbuf; /**< socket sndbuf size */ + int tcp_rcvbuf; /**< socket rcvbuf size */ + + /* free list of fragment descriptors */ + ompi_free_list_t tcp_frag_eager; + ompi_free_list_t tcp_frag_max; + ompi_free_list_t tcp_frag_user; +}; +typedef struct mca_btl_tcp_component_t mca_btl_tcp_component_t; + +extern mca_btl_tcp_component_t mca_btl_tcp_component; + + + +/** + * BTL Module Interface + */ +struct mca_btl_tcp_module_t { + mca_btl_base_module_t super; /**< base BTL interface */ + mca_btl_base_recv_reg_t tcp_reg[256]; + int tcp_ifindex; /**< PTL interface index */ + struct sockaddr_in tcp_ifaddr; /**< PTL interface address */ + struct sockaddr_in tcp_ifmask; /**< PTL interface netmask */ + opal_list_t tcp_endpoints; +#if MCA_BTL_TCP_STATISTICS + size_t tcp_bytes_sent; + size_t tcp_bytes_recv; + size_t tcp_send_handler; +#endif +}; +typedef struct mca_btl_tcp_module_t mca_btl_tcp_module_t; +extern mca_btl_tcp_module_t mca_btl_tcp_module; + + +/** + * Register TCP component parameters with the MCA framework + */ +extern int mca_btl_tcp_component_open(void); + +/** + * Any final cleanup before being unloaded. + */ +extern int mca_btl_tcp_component_close(void); + +/** + * TCP component initialization. + * + * @param num_btl_modules (OUT) Number of BTLs returned in BTL array. 
+ * @param allow_multi_user_threads (OUT) Flag indicating wether BTL supports user threads (TRUE) + * @param have_hidden_threads (OUT) Flag indicating wether BTL uses threads (TRUE) + */ +extern mca_btl_base_module_t** mca_btl_tcp_component_init( + int *num_btl_modules, + bool allow_multi_user_threads, + bool have_hidden_threads +); + +/** + * TCP component control. + */ +int mca_btl_tcp_component_control( + int param, + void* value, + size_t size +); + + +/** + * TCP component progress. + */ +extern int mca_btl_tcp_component_progress(void); + + + +/** + * Cleanup any resources held by the BTL. + * + * @param btl BTL instance. + * @return OMPI_SUCCESS or error status on failure. + */ + +extern int mca_btl_tcp_finalize( + struct mca_btl_base_module_t* btl +); + + +/** + * PML->BTL notification of change in the process list. + * + * @param btl (IN) + * @param nprocs (IN) Number of processes + * @param procs (IN) Set of processes + * @param peers (OUT) Set of (optional) peer addressing info. + * @param peers (IN/OUT) Set of processes that are reachable via this BTL. + * @return OMPI_SUCCESS or error status on failure. + * + */ + +extern int mca_btl_tcp_add_procs( + struct mca_btl_base_module_t* btl, + size_t nprocs, + struct ompi_proc_t **procs, + struct mca_btl_base_endpoint_t** peers, + ompi_bitmap_t* reachable +); + +/** + * PML->BTL notification of change in the process list. + * + * @param btl (IN) BTL instance + * @param nproc (IN) Number of processes. + * @param procs (IN) Set of processes. + * @param peers (IN) Set of peer data structures. + * @return Status indicating if cleanup was successful + * + */ + +extern int mca_btl_tcp_del_procs( + struct mca_btl_base_module_t* btl, + size_t nprocs, + struct ompi_proc_t **procs, + struct mca_btl_base_endpoint_t** peers +); + + +/** + * Initiate an asynchronous send. + * + * @param btl (IN) BTL module + * @param endpoint (IN) BTL addressing information + * @param descriptor (IN) Description of the data to be transfered + * @param tag (IN) The tag value used to notify the peer. + */ + +extern int mca_btl_tcp_send( + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* btl_peer, + struct mca_btl_base_descriptor_t* descriptor, + mca_btl_base_tag_t tag +); + + +/** + * Initiate an asynchronous put. + * + * @param btl (IN) BTL module + * @param endpoint (IN) BTL addressing information + * @param descriptor (IN) Description of the data to be transferred + */ + +extern int mca_btl_tcp_put( + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* btl_peer, + struct mca_btl_base_descriptor_t* decriptor +); + + +/** + * Initiate an asynchronous get. + * + * @param btl (IN) BTL module + * @param endpoint (IN) BTL addressing information + * @param descriptor (IN) Description of the data to be transferred + */ + +extern int mca_btl_tcp_get( + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* btl_peer, + struct mca_btl_base_descriptor_t* decriptor +); + +/** + * Register a callback function that is called on receipt + * of a fragment. + * + * @param btl (IN) BTL module + * @return Status indicating if registration was successful + * + */ + +extern int mca_btl_tcp_register( + struct mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_module_recv_cb_fn_t cbfunc, + void* cbdata); + +/** + * Allocate a descriptor with a segment of the requested size. + * Note that the BTL layer may choose to return a smaller size + * if it cannot support the request. 
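These prototypes follow the usual BTL lifecycle: the framework opens the component, calls init once to obtain the per-interface modules, then drives each module through add_procs, data transfer, del_procs, and finalize before closing the component. A rough sketch of that call order (error handling omitted; procs, peers, and reachable are supplied by the PML during wire-up):

```c
/* Sketch of the order in which the framework exercises these entry points. */
static void tcp_btl_lifecycle_sketch(size_t nprocs, struct ompi_proc_t** procs,
                                     struct mca_btl_base_endpoint_t** peers,
                                     ompi_bitmap_t* reachable)
{
    int num_modules, i;
    mca_btl_base_module_t** modules;

    mca_btl_tcp_component_open();                                    /* register MCA params */
    modules = mca_btl_tcp_component_init(&num_modules, true, true);  /* one module per NIC  */

    for (i = 0; i < num_modules; i++) {
        mca_btl_tcp_add_procs(modules[i], nprocs, procs, peers, reachable);
    }

    /* ... mca_btl_tcp_send() and mca_btl_tcp_component_progress() during the run ... */

    for (i = 0; i < num_modules; i++) {
        mca_btl_tcp_del_procs(modules[i], nprocs, procs, peers);
        mca_btl_tcp_finalize(modules[i]);
    }
    mca_btl_tcp_component_close();
}
```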
+ * + * @param btl (IN) BTL module + * @param size (IN) Request segment size. + */ + +extern mca_btl_base_descriptor_t* mca_btl_tcp_alloc( + struct mca_btl_base_module_t* btl, + size_t size); + + +/** + * Return a segment allocated by this BTL. + * + * @param btl (IN) BTL module + * @param descriptor (IN) Allocated descriptor. + */ + +extern int mca_btl_tcp_free( + struct mca_btl_base_module_t* btl, + mca_btl_base_descriptor_t* des); + + +/** + * Prepare a descriptor for send/rdma using the supplied + * convertor. If the convertor references data that is contigous, + * the descriptor may simply point to the user buffer. Otherwise, + * this routine is responsible for allocating buffer space and + * packing if required. + * + * @param btl (IN) BTL module + * @param endpoint (IN) BTL peer addressing + * @param convertor (IN) Data type convertor + * @param reserve (IN) Additional bytes requested by upper layer to precede user data + * @param size (IN/OUT) Number of bytes to prepare (IN), number of bytes actually prepared (OUT) +*/ + +mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src( + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* peer, + struct mca_mpool_base_registration_t*, + struct ompi_convertor_t* convertor, + size_t reserve, + size_t* size +); + +extern mca_btl_base_descriptor_t* mca_btl_tcp_prepare_dst( + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* peer, + struct mca_mpool_base_registration_t*, + struct ompi_convertor_t* convertor, + size_t reserve, + size_t* size); + + +#if defined(c_plusplus) || defined(__cplusplus) +} +#endif +#endif diff --git a/ompi/mca/btl/tcp/btl_tcp_addr.h b/ompi/mca/btl/tcp/btl_tcp_addr.h new file mode 100644 index 0000000000..c89145d063 --- /dev/null +++ b/ompi/mca/btl/tcp/btl_tcp_addr.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ +#ifndef MCA_BTL_TCP_ADDR_H +#define MCA_BTL_TCP_ADDR_H + +#ifdef HAVE_SYS_TYPES_H +#include +#endif +#ifdef HAVE_SYS_SOCKET_H +#include +#endif +#ifdef HAVE_NETINET_IN_H +#include +#endif + + +/** + * Structure used to publish TCP connection information to peers. + */ +struct mca_btl_tcp_addr_t { + struct in_addr addr_inet; /**< IPv4 address in network byte order */ + in_port_t addr_port; /**< listen port */ + unsigned short addr_inuse; /**< local meaning only */ +}; +typedef struct mca_btl_tcp_addr_t mca_btl_tcp_addr_t; + +#endif + diff --git a/ompi/mca/btl/tcp/btl_tcp_component.c b/ompi/mca/btl/tcp/btl_tcp_component.c new file mode 100644 index 0000000000..f6cb19f22d --- /dev/null +++ b/ompi/mca/btl/tcp/btl_tcp_component.c @@ -0,0 +1,630 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "include/ompi_socket_errno.h" +#ifdef HAVE_UNISTD_H +#include +#endif +#include +#include +#ifdef HAVE_SYS_TYPES_H +#include +#endif +#ifdef HAVE_SYS_SOCKET_H +#include +#endif +#ifdef HAVE_NETINET_IN_H +#include +#endif +#ifdef HAVE_ARPA_INET_H +#include +#endif + +#include "include/constants.h" +#include "opal/event/event.h" +#include "opal/util/if.h" +#include "opal/util/argv.h" +#include "opal/util/output.h" +#include "orte/mca/oob/base/base.h" +#include "orte/mca/ns/ns_types.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/btl/btl.h" + +#include "mca/base/mca_base_param.h" +#include "mca/pml/base/pml_base_module_exchange.h" +#include "mca/errmgr/errmgr.h" +#include "mca/mpool/base/base.h" +#include "mca/btl/base/btl_base_error.h" +#include "btl_tcp.h" +#include "btl_tcp_addr.h" +#include "btl_tcp_proc.h" +#include "btl_tcp_frag.h" +#include "btl_tcp_endpoint.h" +#include "mca/btl/base/base.h" +#include "datatype/convertor.h" + + +#define IMPORTANT_WINDOWS_COMMENT() \ + /* In windows, many of the socket functions return an EWOULDBLOCK instead of \ + things like EAGAIN, EINPROGRESS, etc. It has been verified that this will \ + not conflict with other error codes that are returned by these functions \ + under UNIX/Linux environments */ + + +mca_btl_tcp_component_t mca_btl_tcp_component = { + { + /* First, the mca_base_component_t struct containing meta information + about the component itself */ + + { + /* Indicate that we are a pml v1.0.0 component (which also implies a + specific MCA version) */ + + MCA_BTL_BASE_VERSION_1_0_0, + + "tcp", /* MCA component name */ + 1, /* MCA component major version */ + 0, /* MCA component minor version */ + 0, /* MCA component release version */ + mca_btl_tcp_component_open, /* component open */ + mca_btl_tcp_component_close /* component close */ + }, + + /* Next the MCA v1.0.0 component meta data */ + + { + /* Whether the component is checkpointable or not */ + + false + }, + + mca_btl_tcp_component_init, + NULL, + } +}; + + +/* + * utility routines for parameter registration + */ + +static inline char* mca_btl_tcp_param_register_string( + const char* param_name, + const char* default_value) +{ + char *param_value; + int id = mca_base_param_register_string("btl","tcp",param_name,NULL,default_value); + mca_base_param_lookup_string(id, ¶m_value); + return param_value; +} + +static inline int mca_btl_tcp_param_register_int( + const char* param_name, + int default_value) +{ + int id = mca_base_param_register_int("btl","tcp",param_name,NULL,default_value); + int param_value = default_value; + mca_base_param_lookup_int(id,¶m_value); + return param_value; +} + + +/* + * Data structure for accepting connections. 
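The two static helpers above simply namespace every parameter as btl_tcp_&lt;name&gt;; once registered this way the values should be tunable from the command line (for example `-mca btl_tcp_sndbuf 262144` — syntax recalled from general Open MPI usage, not from this patch). Usage matches the registrations in component_open further down:

```c
/* Example use of the registration helpers (these exact defaults appear in
 * mca_btl_tcp_component_open() below). */
mca_btl_tcp_component.tcp_sndbuf =
    mca_btl_tcp_param_register_int("sndbuf", 128*1024);
mca_btl_tcp_component.tcp_if_exclude =
    mca_btl_tcp_param_register_string("if_exclude", "lo");
```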
+ */ + +struct mca_btl_tcp_event_t { + opal_list_item_t item; + opal_event_t event; +}; +typedef struct mca_btl_tcp_event_t mca_btl_tcp_event_t; + +static void mca_btl_tcp_event_construct(mca_btl_tcp_event_t* event) +{ + OPAL_THREAD_LOCK(&mca_btl_tcp_component.tcp_lock); + opal_list_append(&mca_btl_tcp_component.tcp_events, &event->item); + OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock); +} + +static void mca_btl_tcp_event_destruct(mca_btl_tcp_event_t* event) +{ + OPAL_THREAD_LOCK(&mca_btl_tcp_component.tcp_lock); + opal_list_remove_item(&mca_btl_tcp_component.tcp_events, &event->item); + OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock); +} + +OBJ_CLASS_INSTANCE( + mca_btl_tcp_event_t, + opal_list_item_t, + mca_btl_tcp_event_construct, + mca_btl_tcp_event_destruct); + + +/* + * functions for receiving event callbacks + */ + +static void mca_btl_tcp_component_recv_handler(int, short, void*); + + +/* + * Called by MCA framework to open the component, registers + * component parameters. + */ + +int mca_btl_tcp_component_open(void) +{ +#ifdef WIN32 + WSADATA win_sock_data; + if (WSAStartup(MAKEWORD(2,2), &win_sock_data) != 0) { + BTL_ERROR(("failed to initialise windows sockets:%d", WSAGetLastError())); + return OMPI_ERROR; + } +#endif + + /* initialize state */ + mca_btl_tcp_component.tcp_listen_sd = -1; + mca_btl_tcp_component.tcp_num_btls=0; + mca_btl_tcp_component.tcp_btls=NULL; + + /* initialize objects */ + OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_lock, opal_mutex_t); + OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_procs, opal_hash_table_t); + OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_events, opal_list_t); + OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_eager, ompi_free_list_t); + OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_max, ompi_free_list_t); + OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_user, ompi_free_list_t); + opal_hash_table_init(&mca_btl_tcp_component.tcp_procs, 256); + + /* register TCP component parameters */ + mca_btl_tcp_component.tcp_if_include = + mca_btl_tcp_param_register_string("if_include", ""); + mca_btl_tcp_component.tcp_if_exclude = + mca_btl_tcp_param_register_string("if_exclude", "lo"); + mca_btl_tcp_component.tcp_free_list_num = + mca_btl_tcp_param_register_int ("free_list_num", 8); + mca_btl_tcp_component.tcp_free_list_max = + mca_btl_tcp_param_register_int ("free_list_max", 1024); + mca_btl_tcp_component.tcp_free_list_inc = + mca_btl_tcp_param_register_int ("free_list_inc", 32); + mca_btl_tcp_component.tcp_sndbuf = + mca_btl_tcp_param_register_int ("sndbuf", 128*1024); + mca_btl_tcp_component.tcp_rcvbuf = + mca_btl_tcp_param_register_int ("rcvbuf", 128*1024); + mca_btl_tcp_module.super.btl_exclusivity = + mca_btl_tcp_param_register_int ("exclusivity", 0); + mca_btl_tcp_module.super.btl_eager_limit = + mca_btl_tcp_param_register_int ("first_frag_size", 64*1024) - sizeof(mca_btl_base_header_t); + mca_btl_tcp_module.super.btl_min_send_size = + mca_btl_tcp_param_register_int ("min_send_size", 64*1024) - sizeof(mca_btl_base_header_t); + mca_btl_tcp_module.super.btl_max_send_size = + mca_btl_tcp_param_register_int ("max_send_size", 256*1024) - sizeof(mca_btl_base_header_t); + mca_btl_tcp_module.super.btl_min_rdma_size = + mca_btl_tcp_param_register_int("min_rdma_size", 1024*1024); + mca_btl_tcp_module.super.btl_max_rdma_size = + mca_btl_tcp_param_register_int("max_rdma_size", 2*1024*1024); + mca_btl_tcp_module.super.btl_flags = + mca_btl_tcp_param_register_int("flags", 0); + return OMPI_SUCCESS; +} + + +/* + * module cleanup - sanity checking of queue lengths 
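One detail worth calling out in component_open above: the user-facing sizes (first_frag_size, min/max_send_size) describe bytes on the wire, so the limits stored in the module have sizeof(mca_btl_base_header_t) subtracted, leaving only payload. Illustrative arithmetic, assuming the 64 KB default:

```c
/* Illustrative only: the MCA parameter is the wire size; the stored limit is
 * what remains for payload once the BTL header is accounted for. */
static size_t example_eager_limit(void)
{
    size_t first_frag_size = 64 * 1024;        /* btl_tcp_first_frag_size default */
    return first_frag_size - sizeof(mca_btl_base_header_t);
}
```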
+ */ + +int mca_btl_tcp_component_close(void) +{ + opal_list_item_t* item; +#ifdef WIN32 + WSACleanup(); +#endif + + if(NULL != mca_btl_tcp_component.tcp_if_include) + free(mca_btl_tcp_component.tcp_if_include); + if(NULL != mca_btl_tcp_component.tcp_if_exclude) + free(mca_btl_tcp_component.tcp_if_exclude); + if (NULL != mca_btl_tcp_component.tcp_btls) + free(mca_btl_tcp_component.tcp_btls); + + if (mca_btl_tcp_component.tcp_listen_sd >= 0) { + opal_event_del(&mca_btl_tcp_component.tcp_recv_event); + close(mca_btl_tcp_component.tcp_listen_sd); + mca_btl_tcp_component.tcp_listen_sd = -1; + } + + /* cleanup any pending events */ + OPAL_THREAD_LOCK(&mca_btl_tcp_component.tcp_lock); + for(item = opal_list_remove_first(&mca_btl_tcp_component.tcp_events); + item != NULL; + item = opal_list_remove_first(&mca_btl_tcp_component.tcp_events)) { + mca_btl_tcp_event_t* event = (mca_btl_tcp_event_t*)item; + opal_event_del(&event->event); + OBJ_RELEASE(event); + } + OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock); + + /* release resources */ + OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_procs); + OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_events); + OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_frag_eager); + OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_frag_max); + OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_frag_user); + OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_lock); + return OMPI_SUCCESS; +} + + +/* + * Create a btl instance and add to modules list. + */ + +static int mca_btl_tcp_create(int if_index, const char* if_name) +{ + struct mca_btl_tcp_module_t* btl = (struct mca_btl_tcp_module_t *)malloc(sizeof(mca_btl_tcp_module_t)); + char param[256]; + if(NULL == btl) + return OMPI_ERR_OUT_OF_RESOURCE; + memcpy(btl, &mca_btl_tcp_module, sizeof(mca_btl_tcp_module)); + OBJ_CONSTRUCT(&btl->tcp_endpoints, opal_list_t); + mca_btl_tcp_component.tcp_btls[mca_btl_tcp_component.tcp_num_btls++] = btl; + + /* initialize the btl */ + btl->tcp_ifindex = if_index; +#if MCA_BTL_TCP_STATISTICS + btl->tcp_bytes_recv = 0; + btl->tcp_bytes_sent = 0; + btl->tcp_send_handler = 0; +#endif + opal_ifindextoaddr(if_index, (struct sockaddr*)&btl->tcp_ifaddr, sizeof(btl->tcp_ifaddr)); + opal_ifindextomask(if_index, (struct sockaddr*)&btl->tcp_ifmask, sizeof(btl->tcp_ifmask)); + + /* allow user to specify interface bandwidth */ + sprintf(param, "bandwidth_%s", if_name); + btl->super.btl_bandwidth = mca_btl_tcp_param_register_int(param, 0); + + /* allow user to override/specify latency ranking */ + sprintf(param, "latency_%s", if_name); + btl->super.btl_latency = mca_btl_tcp_param_register_int(param, 0); + +#if 0 && OMPI_ENABLE_DEBUG + BTL_OUTPUT(("interface: %s bandwidth %d latency %d", + if_name, btl->super.btl_bandwidth, btl->super.btl_latency)); +#endif + return OMPI_SUCCESS; +} + +/* + * Create a TCP BTL instance for either: + * (1) all interfaces specified by the user + * (2) all available interfaces + * (3) all available interfaces except for those excluded by the user + */ + +static int mca_btl_tcp_component_create_instances(void) +{ + int if_count = opal_ifcount(); + int if_index; + char **include; + char **exclude; + char **argv; + + if(if_count <= 0) + return OMPI_ERROR; + + /* allocate memory for btls */ + mca_btl_tcp_component.tcp_btls = (mca_btl_tcp_module_t **)malloc(if_count * sizeof(mca_btl_tcp_module_t*)); + if(NULL == mca_btl_tcp_component.tcp_btls) + return OMPI_ERR_OUT_OF_RESOURCE; + + /* if the user specified an interface list - use these exclusively */ + argv = include = opal_argv_split(mca_btl_tcp_component.tcp_if_include,','); 
+ while(argv && *argv) { + char* if_name = *argv; + int if_index = opal_ifnametoindex(if_name); + if(if_index < 0) { + BTL_ERROR(("invalid interface \"%s\"", if_name)); + } else { + mca_btl_tcp_create(if_index, if_name); + } + argv++; + } + opal_argv_free(include); + if(mca_btl_tcp_component.tcp_num_btls) + return OMPI_SUCCESS; + + /* if the interface list was not specified by the user, create + * a BTL for each interface that was not excluded. + */ + exclude = opal_argv_split(mca_btl_tcp_component.tcp_if_exclude,','); + for(if_index = opal_ifbegin(); if_index >= 0; if_index = opal_ifnext(if_index)) { + char if_name[32]; + opal_ifindextoname(if_index, if_name, sizeof(if_name)); + + /* check to see if this interface exists in the exclude list */ + if(opal_ifcount() > 1) { + argv = exclude; + while(argv && *argv) { + if(strncmp(*argv,if_name,strlen(*argv)) == 0) + break; + argv++; + } + /* if this interface was not found in the excluded list - create a BTL */ + if(argv == 0 || *argv == 0) { + mca_btl_tcp_create(if_index, if_name); + } + } else { + mca_btl_tcp_create(if_index, if_name); + } + } + opal_argv_free(exclude); + return OMPI_SUCCESS; +} + +/* + * Create a listen socket and bind to all interfaces + */ + +static int mca_btl_tcp_component_create_listen(void) +{ + int flags; + struct sockaddr_in inaddr; + ompi_socklen_t addrlen; + + /* create a listen socket for incoming connections */ + mca_btl_tcp_component.tcp_listen_sd = socket(AF_INET, SOCK_STREAM, 0); + if(mca_btl_tcp_component.tcp_listen_sd < 0) { + BTL_ERROR(("socket() failed with errno=%d", ompi_socket_errno)); + return OMPI_ERROR; + } + mca_btl_tcp_set_socket_options(mca_btl_tcp_component.tcp_listen_sd); + + /* bind to all addresses and dynamically assigned port */ + memset(&inaddr, 0, sizeof(inaddr)); + inaddr.sin_family = AF_INET; + inaddr.sin_addr.s_addr = INADDR_ANY; + inaddr.sin_port = 0; + + if(bind(mca_btl_tcp_component.tcp_listen_sd, (struct sockaddr*)&inaddr, sizeof(inaddr)) < 0) { + BTL_ERROR(("bind() failed with errno=%d", ompi_socket_errno)); + return OMPI_ERROR; + } + + /* resolve system assignend port */ + addrlen = sizeof(struct sockaddr_in); + if(getsockname(mca_btl_tcp_component.tcp_listen_sd, (struct sockaddr*)&inaddr, &addrlen) < 0) { + BTL_ERROR(("getsockname() failed with errno=%d", ompi_socket_errno)); + return OMPI_ERROR; + } + mca_btl_tcp_component.tcp_listen_port = inaddr.sin_port; + + /* setup listen backlog to maximum allowed by kernel */ + if(listen(mca_btl_tcp_component.tcp_listen_sd, SOMAXCONN) < 0) { + BTL_ERROR(("listen() failed with errno=%d", ompi_socket_errno)); + return OMPI_ERROR; + } + + /* set socket up to be non-blocking, otherwise accept could block */ + if((flags = fcntl(mca_btl_tcp_component.tcp_listen_sd, F_GETFL, 0)) < 0) { + BTL_ERROR(("fcntl(F_GETFL) failed with errno=%d", ompi_socket_errno)); + return OMPI_ERROR; + } else { + flags |= O_NONBLOCK; + if(fcntl(mca_btl_tcp_component.tcp_listen_sd, F_SETFL, flags) < 0) { + BTL_ERROR(("fcntl(F_SETFL) failed with errno=%d", ompi_socket_errno)); + return OMPI_ERROR; + } + } + + /* register listen port */ + opal_event_set( + &mca_btl_tcp_component.tcp_recv_event, + mca_btl_tcp_component.tcp_listen_sd, + OPAL_EV_READ|OPAL_EV_PERSIST, + mca_btl_tcp_component_recv_handler, + 0); + opal_event_add(&mca_btl_tcp_component.tcp_recv_event,0); + return OMPI_SUCCESS; +} + +/* + * Register TCP module addressing information. The MCA framework + * will make this available to all peers. 
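create_listen above follows the standard ephemeral-port recipe: bind to port 0, read back the kernel-chosen port with getsockname, then listen and switch the socket to non-blocking. The same pattern in isolation (generic POSIX sketch, detached from the OMPI data structures):

```c
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>

/* Generic sketch: listen on a kernel-assigned port and report which one we got. */
static int listen_on_ephemeral_port(unsigned short* port_out)
{
    struct sockaddr_in addr;
    socklen_t len = sizeof(addr);
    int sd = socket(AF_INET, SOCK_STREAM, 0);
    if (sd < 0) return -1;

    memset(&addr, 0, sizeof(addr));
    addr.sin_family      = AF_INET;
    addr.sin_addr.s_addr = INADDR_ANY;
    addr.sin_port        = 0;                         /* 0 = let the kernel pick */

    if (bind(sd, (struct sockaddr*)&addr, sizeof(addr)) < 0 ||
        getsockname(sd, (struct sockaddr*)&addr, &len) < 0 ||
        listen(sd, SOMAXCONN) < 0) {
        close(sd);
        return -1;
    }
    *port_out = addr.sin_port;                        /* network byte order, as published */
    return sd;
}
```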
+ */
+
+static int mca_btl_tcp_component_exchange(void)
+{
+    int rc=0;
+    size_t i=0;
+    size_t size = mca_btl_tcp_component.tcp_num_btls * sizeof(mca_btl_tcp_addr_t);
+    if(mca_btl_tcp_component.tcp_num_btls != 0) {
+        mca_btl_tcp_addr_t *addrs = (mca_btl_tcp_addr_t *)malloc(size);
+        for(i=0; i<mca_btl_tcp_component.tcp_num_btls; i++) {
+            addrs[i].addr_inet = mca_btl_tcp_component.tcp_btls[i]->tcp_ifaddr.sin_addr;
+            addrs[i].addr_port = mca_btl_tcp_component.tcp_listen_port;
+            addrs[i].addr_inuse = 0;
+        }
+        rc = mca_base_modex_send(&mca_btl_tcp_component.super.btl_version, addrs, size);
+        free(addrs);
+    }
+    return rc;
+}
+
+/*
+ *  TCP module initialization:
+ *  (1) read interface list from kernel and compare against module parameters
+ *      then create a BTL instance for selected interfaces
+ *  (2) setup TCP listen socket for incoming connection attempts
+ *  (3) register BTL parameters with the MCA
+ */
+mca_btl_base_module_t** mca_btl_tcp_component_init(int *num_btl_modules,
+                                                   bool enable_progress_threads,
+                                                   bool enable_mpi_threads)
+{
+    mca_btl_base_module_t **btls;
+    *num_btl_modules = 0;
+
+    /* initialize free lists */
+    ompi_free_list_init( &mca_btl_tcp_component.tcp_frag_eager,
+                         sizeof (mca_btl_tcp_frag_eager_t) + mca_btl_tcp_module.super.btl_eager_limit,
+                         OBJ_CLASS (mca_btl_tcp_frag_eager_t),
+                         mca_btl_tcp_component.tcp_free_list_num,
+                         mca_btl_tcp_component.tcp_free_list_max,
+                         mca_btl_tcp_component.tcp_free_list_inc,
+                         NULL );
+
+    ompi_free_list_init( &mca_btl_tcp_component.tcp_frag_max,
+                         sizeof (mca_btl_tcp_frag_max_t) + mca_btl_tcp_module.super.btl_max_send_size,
+                         OBJ_CLASS (mca_btl_tcp_frag_max_t),
+                         mca_btl_tcp_component.tcp_free_list_num,
+                         mca_btl_tcp_component.tcp_free_list_max,
+                         mca_btl_tcp_component.tcp_free_list_inc,
+                         NULL );
+
+    ompi_free_list_init( &mca_btl_tcp_component.tcp_frag_user,
+                         sizeof (mca_btl_tcp_frag_user_t),
+                         OBJ_CLASS (mca_btl_tcp_frag_user_t),
+                         mca_btl_tcp_component.tcp_free_list_num,
+                         mca_btl_tcp_component.tcp_free_list_max,
+                         mca_btl_tcp_component.tcp_free_list_inc,
+                         NULL );
+
+    /* create a BTL TCP module for selected interfaces */
+    if(mca_btl_tcp_component_create_instances() != OMPI_SUCCESS)
+        return 0;
+
+    /* create a TCP listen socket for incoming connection attempts */
+    if(mca_btl_tcp_component_create_listen() != OMPI_SUCCESS)
+        return 0;
+
+    /* publish TCP parameters with the MCA framework */
+    if(mca_btl_tcp_component_exchange() != OMPI_SUCCESS)
+        return 0;
+
+    btls = (mca_btl_base_module_t **)malloc(mca_btl_tcp_component.tcp_num_btls *
+                                            sizeof(mca_btl_base_module_t*));
+    if(NULL == btls)
+        return NULL;
+
+    memcpy(btls, mca_btl_tcp_component.tcp_btls, mca_btl_tcp_component.tcp_num_btls*sizeof(mca_btl_tcp_module_t*));
+    *num_btl_modules = mca_btl_tcp_component.tcp_num_btls;
+    return btls;
+}
+
+/*
+ *  TCP module control
+ */
+
+int mca_btl_tcp_component_control(int param, void* value, size_t size)
+{
+    return OMPI_SUCCESS;
+}
+
+
+/*
+ *  Called by mca_btl_tcp_component_recv() when the TCP listen
+ *  socket has pending connection requests. Accept incoming
+ *  requests and queue for completion of the connection handshake.
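The element sizes passed to ompi_free_list_init above also explain the frag+1 idiom used throughout this BTL: each eager/max element is allocated as the fragment descriptor immediately followed by its payload buffer, so alloc and prepare_src can locate the buffer without a second allocation. Sketch of the layout:

```c
/* An eager free-list element, as sized above:
 *
 *   +--------------------------+-------------------------------------+
 *   | mca_btl_tcp_frag_eager_t |  btl_eager_limit bytes of payload   |
 *   +--------------------------+-------------------------------------+
 *   ^ frag                      ^ (void*)(frag + 1)
 */
static void* frag_payload(mca_btl_tcp_frag_t* frag)
{
    return (void*)(frag + 1);    /* what alloc/prepare_src point seg_addr.pval at */
}
```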
+*/ + + +static void mca_btl_tcp_component_accept(void) +{ + while(true) { + ompi_socklen_t addrlen = sizeof(struct sockaddr_in); + struct sockaddr_in addr; + mca_btl_tcp_event_t *event; + int sd = accept(mca_btl_tcp_component.tcp_listen_sd, (struct sockaddr*)&addr, &addrlen); + if(sd < 0) { + IMPORTANT_WINDOWS_COMMENT(); + if(ompi_socket_errno == EINTR) + continue; + if(ompi_socket_errno != EAGAIN || ompi_socket_errno != EWOULDBLOCK) + BTL_ERROR(("accept() failed with errno %d.", ompi_socket_errno)); + return; + } + mca_btl_tcp_set_socket_options(sd); + + /* wait for receipt of peers process identifier to complete this connection */ + + event = OBJ_NEW(mca_btl_tcp_event_t); + opal_event_set(&event->event, sd, OPAL_EV_READ, mca_btl_tcp_component_recv_handler, event); + opal_event_add(&event->event, 0); + } +} + + +/* + * Event callback when there is data available on the registered + * socket to recv. + */ +static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user) +{ + orte_process_name_t guid; + struct sockaddr_in addr; + int retval; + mca_btl_tcp_proc_t* btl_proc; + ompi_socklen_t addr_len = sizeof(addr); + mca_btl_tcp_event_t *event = (mca_btl_tcp_event_t *)user; + + /* accept new connections on the listen socket */ + if(mca_btl_tcp_component.tcp_listen_sd == sd) { + mca_btl_tcp_component_accept(); + return; + } + OBJ_RELEASE(event); + + /* recv the process identifier */ + retval = recv(sd, (char *)&guid, sizeof(guid), 0); + if(retval != sizeof(guid)) { + close(sd); + return; + } + + /* now set socket up to be non-blocking */ + if((flags = fcntl(sd, F_GETFL, 0)) < 0) { + BTL_ERROR(("fcntl(F_GETFL) failed with errno=%d", ompi_socket_errno)); + } else { + flags |= O_NONBLOCK; + if(fcntl(sd, F_SETFL, flags) < 0) { + BTL_ERROR(("fcntl(F_SETFL) failed with errno=%d", ompi_socket_errno)); + } + } + + /* lookup the corresponding process */ + btl_proc = mca_btl_tcp_proc_lookup(&guid); + if(NULL == btl_proc) { + BTL_ERROR(("errno=%d",errno)); + close(sd); + return; + } + + /* lookup peer address */ + if(getpeername(sd, (struct sockaddr*)&addr, &addr_len) != 0) { + BTL_ERROR(("getpeername() failed with errno=%d", ompi_socket_errno)); + close(sd); + return; + } + + /* are there any existing peer instances will to accept this connection */ + if(mca_btl_tcp_proc_accept(btl_proc, &addr, sd) == false) { + close(sd); + return; + } +} + diff --git a/ompi/mca/btl/tcp/btl_tcp_endpoint.c b/ompi/mca/btl/tcp/btl_tcp_endpoint.c new file mode 100644 index 0000000000..7a5deac9c6 --- /dev/null +++ b/ompi/mca/btl/tcp/btl_tcp_endpoint.c @@ -0,0 +1,686 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#include "ompi_config.h" +#include +#include +#ifdef HAVE_UNISTD_H +#include +#endif +#include "include/ompi_socket_errno.h" +#ifdef HAVE_SYS_TYPES_H +#include +#endif +#ifdef HAVE_FCNTL_H +#include +#endif +#ifdef HAVE_NETINET_IN_H +#include +#endif +#ifdef HAVE_NETINET_TCP_H +#include +#endif +#ifdef HAVE_ARPA_INET_H +#include +#endif +#include +#include +#include "include/types.h" +#include "mca/ns/base/base.h" +#include "mca/oob/base/base.h" +#include "mca/rml/rml.h" +#include "mca/errmgr/errmgr.h" +#include "mca/btl/base/btl_base_error.h" +#include "dps/dps.h" +#include "btl_tcp.h" +#include "btl_tcp_endpoint.h" +#include "btl_tcp_proc.h" +#include "btl_tcp_frag.h" +#include "btl_tcp_addr.h" + + +/* + * Initialize state of the endpoint instance. + * + */ + +static void mca_btl_tcp_endpoint_construct(mca_btl_tcp_endpoint_t* endpoint) +{ + endpoint->endpoint_btl = NULL; + endpoint->endpoint_proc = NULL; + endpoint->endpoint_addr = NULL; + endpoint->endpoint_sd = -1; + endpoint->endpoint_send_frag = 0; + endpoint->endpoint_recv_frag = 0; + endpoint->endpoint_send_event.ev_flags = 0; + endpoint->endpoint_recv_event.ev_flags = 0; + endpoint->endpoint_state = MCA_BTL_TCP_CLOSED; + endpoint->endpoint_retries = 0; + endpoint->endpoint_nbo = false; + OBJ_CONSTRUCT(&endpoint->endpoint_frags, opal_list_t); + OBJ_CONSTRUCT(&endpoint->endpoint_send_lock, opal_mutex_t); + OBJ_CONSTRUCT(&endpoint->endpoint_recv_lock, opal_mutex_t); +} + +/* + * Destroy a endpoint + * + */ + + +static void mca_btl_tcp_endpoint_destruct(mca_btl_tcp_endpoint_t* endpoint) +{ + mca_btl_tcp_proc_remove(endpoint->endpoint_proc, endpoint); + mca_btl_tcp_endpoint_close(endpoint); + OBJ_DESTRUCT(&endpoint->endpoint_frags); + OBJ_DESTRUCT(&endpoint->endpoint_send_lock); + OBJ_DESTRUCT(&endpoint->endpoint_recv_lock); +} + +OBJ_CLASS_INSTANCE( + mca_btl_tcp_endpoint_t, + opal_list_item_t, + mca_btl_tcp_endpoint_construct, + mca_btl_tcp_endpoint_destruct); + + +#define IMPORTANT_WINDOWS_COMMENT() \ + /* In windows, many of the socket functions return an EWOULDBLOCK instead of \ + things like EAGAIN, EINPROGRESS, etc. 
It has been verified that this will \ + not conflict with other error codes that are returned by these functions \ + under UNIX/Linux environments */ + +static void mca_btl_tcp_endpoint_construct(mca_btl_base_endpoint_t* btl_endpoint); +static void mca_btl_tcp_endpoint_destruct(mca_btl_base_endpoint_t* btl_endpoint); +static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t*); +static void mca_btl_tcp_endpoint_connected(mca_btl_base_endpoint_t*); +static void mca_btl_tcp_endpoint_recv_handler(int sd, short flags, void* user); +static void mca_btl_tcp_endpoint_send_handler(int sd, short flags, void* user); + +/* + * Diagnostics: change this to "1" to enable the function + * mca_btl_tcp_endpoint_dump(), below + */ +#define WANT_PEER_DUMP 0 +/* + * diagnostics + */ + +#if WANT_PEER_DUMP +static void mca_btl_tcp_endpoint_dump(mca_btl_base_endpoint_t* btl_endpoint, const char* msg) +{ + char src[64]; + char dst[64]; + int sndbuf,rcvbuf,nodelay,flags; + struct sockaddr_in inaddr; + ompi_socklen_t obtlen; + ompi_socklen_t addrlen = sizeof(struct sockaddr_in); + + getsockname(btl_endpoint->endpoint_sd, (struct sockaddr*)&inaddr, &addrlen); + sprintf(src, "%s", inet_ntoa(inaddr.sin_addr)); + getpeername(btl_endpoint->endpoint_sd, (struct sockaddr*)&inaddr, &addrlen); + sprintf(dst, "%s", inet_ntoa(inaddr.sin_addr)); + + if((flags = fcntl(btl_endpoint->endpoint_sd, F_GETFL, 0)) < 0) { + BTL_ERROR(("fcntl(F_GETFL) failed with errno=%d", ompi_socket_errno)); + } + +#if defined(SO_SNDBUF) + obtlen = sizeof(sndbuf); + if(getsockopt(btl_endpoint->endpoint_sd, SOL_SOCKET, SO_SNDBUF, (char *)&sndbuf, &obtlen) < 0) { + BTL_ERROR(("SO_SNDBUF option: errno %d", ompi_socket_errno)); + } +#else + sndbuf = -1; +#endif +#if defined(SO_RCVBUF) + obtlen = sizeof(rcvbuf); + if(getsockopt(btl_endpoint->endpoint_sd, SOL_SOCKET, SO_RCVBUF, (char *)&rcvbuf, &obtlen) < 0) { + BTL_ERROR(("SO_RCVBUF option: errno %d", ompi_socket_errno)); + } +#else + rcvbuf = -1; +#endif +#if defined(TCP_NODELAY) + obtlen = sizeof(nodelay); + if(getsockopt(btl_endpoint->endpoint_sd, IPPROTO_TCP, TCP_NODELAY, (char *)&nodelay, &obtlen) < 0) { + BTL_ERROR(("TCP_NODELAY option: errno %d", ompi_socket_errno)); + } +#else + nodelay = 0; +#endif + + BTL_DEBUG(("%s: %s - %s nodelay %d sndbuf %d rcvbuf %d flags %08x", + msg, src, dst, nodelay, sndbuf, rcvbuf, flags)); +} +#endif + +/* + * Initialize events to be used by the endpoint instance for TCP select/poll callbacks. + */ + +static inline void mca_btl_tcp_endpoint_event_init(mca_btl_base_endpoint_t* btl_endpoint, int sd) +{ + opal_event_set( + &btl_endpoint->endpoint_recv_event, + btl_endpoint->endpoint_sd, + OPAL_EV_READ|OPAL_EV_PERSIST, + mca_btl_tcp_endpoint_recv_handler, + btl_endpoint); + opal_event_set( + &btl_endpoint->endpoint_send_event, + btl_endpoint->endpoint_sd, + OPAL_EV_WRITE|OPAL_EV_PERSIST, + mca_btl_tcp_endpoint_send_handler, + btl_endpoint); +} + + +/* + * Attempt to send a fragment using a given endpoint. If the endpoint is not connected, + * queue the fragment and start the connection as required. 
+ */ + +int mca_btl_tcp_endpoint_send(mca_btl_base_endpoint_t* btl_endpoint, mca_btl_tcp_frag_t* frag) +{ + int rc = OMPI_SUCCESS; + OPAL_THREAD_LOCK(&btl_endpoint->endpoint_send_lock); + switch(btl_endpoint->endpoint_state) { + case MCA_BTL_TCP_CONNECTING: + case MCA_BTL_TCP_CONNECT_ACK: + case MCA_BTL_TCP_CLOSED: + opal_list_append(&btl_endpoint->endpoint_frags, (opal_list_item_t*)frag); + if(btl_endpoint->endpoint_state == MCA_BTL_TCP_CLOSED) + rc = mca_btl_tcp_endpoint_start_connect(btl_endpoint); + break; + case MCA_BTL_TCP_FAILED: + rc = OMPI_ERR_UNREACH; + break; + case MCA_BTL_TCP_CONNECTED: + if (NULL != btl_endpoint->endpoint_send_frag) { + opal_list_append(&btl_endpoint->endpoint_frags, (opal_list_item_t*)frag); + } else { + if(mca_btl_tcp_frag_send(frag, btl_endpoint->endpoint_sd)) { + OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock); + frag->base.des_cbfunc(&frag->btl->super, frag->endpoint, &frag->base, frag->rc); + return OMPI_SUCCESS; + } else { + btl_endpoint->endpoint_send_frag = frag; + opal_event_add(&btl_endpoint->endpoint_send_event, 0); + } + } + break; + case MCA_BTL_TCP_SHUTDOWN: + rc = OMPI_ERROR; + break; + } + OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock); + return rc; +} + + +/* + * A blocking send on a non-blocking socket. Used to send the small amount of connection + * information that identifies the endpoints endpoint. + */ +static int mca_btl_tcp_endpoint_send_blocking(mca_btl_base_endpoint_t* btl_endpoint, void* data, size_t size) +{ + unsigned char* ptr = (unsigned char*)data; + size_t cnt = 0; + while(cnt < size) { + int retval = send(btl_endpoint->endpoint_sd, (const char *)ptr+cnt, size-cnt, 0); + if(retval < 0) { + IMPORTANT_WINDOWS_COMMENT(); + if(ompi_socket_errno != EINTR && ompi_socket_errno != EAGAIN && ompi_socket_errno != EWOULDBLOCK) { + BTL_ERROR(("send() failed with errno=%d",ompi_socket_errno)); + mca_btl_tcp_endpoint_close(btl_endpoint); + return -1; + } + continue; + } + cnt += retval; + } + return cnt; +} + + +/* + * Send the globally unique identifier for this process to a endpoint on + * a newly connected socket. + */ + +static int mca_btl_tcp_endpoint_send_connect_ack(mca_btl_base_endpoint_t* btl_endpoint) +{ + /* send process identifier to remote endpoint */ + mca_btl_tcp_proc_t* btl_proc = mca_btl_tcp_proc_local(); + if(mca_btl_tcp_endpoint_send_blocking(btl_endpoint, &btl_proc->proc_name, sizeof(btl_proc->proc_name)) != + sizeof(btl_proc->proc_name)) { + return OMPI_ERR_UNREACH; + } + return OMPI_SUCCESS; +} + +/* + * Check the state of this endpoint. 
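send_blocking above is the classic "write it all" loop over a socket that is (or soon will be) non-blocking: retry on EINTR/EAGAIN/EWOULDBLOCK, advance by whatever partial count the kernel accepted, and only give up on a real error. The generic shape of that loop, detached from the endpoint bookkeeping (it busy-waits on EAGAIN, exactly like the code above, which is tolerable only because the connect-ack message is tiny):

```c
#include <errno.h>
#include <sys/socket.h>

/* Generic sketch of a full-size send over a (possibly) non-blocking socket. */
static ssize_t send_all(int sd, const void* data, size_t size)
{
    const char* ptr = (const char*)data;
    size_t cnt = 0;
    while (cnt < size) {
        ssize_t n = send(sd, ptr + cnt, size - cnt, 0);
        if (n < 0) {
            if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK)
                continue;                  /* transient: try again             */
            return -1;                     /* real error: caller closes socket */
        }
        cnt += (size_t)n;
    }
    return (ssize_t)cnt;
}
```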
If the incoming connection request matches + * our endpoints address, check the state of our connection: + * (1) if a connection has not been attempted, accept the connection + * (2) if a connection has not been established, and the endpoints process identifier + * is less than the local process, accept the connection + * otherwise, reject the connection and continue with the current connection + */ + +bool mca_btl_tcp_endpoint_accept(mca_btl_base_endpoint_t* btl_endpoint, struct sockaddr_in* addr, int sd) +{ + mca_btl_tcp_addr_t* btl_addr; + mca_btl_tcp_proc_t* this_proc = mca_btl_tcp_proc_local(); + orte_ns_cmp_bitmask_t mask = ORTE_NS_CMP_ALL; + int cmpval; + + OPAL_THREAD_LOCK(&btl_endpoint->endpoint_recv_lock); + OPAL_THREAD_LOCK(&btl_endpoint->endpoint_send_lock); + if((btl_addr = btl_endpoint->endpoint_addr) != NULL && + btl_addr->addr_inet.s_addr == addr->sin_addr.s_addr) { + mca_btl_tcp_proc_t *endpoint_proc = btl_endpoint->endpoint_proc; + cmpval = orte_ns.compare(mask, + &endpoint_proc->proc_ompi->proc_name, + &this_proc->proc_ompi->proc_name); + if((btl_endpoint->endpoint_sd < 0) || + (btl_endpoint->endpoint_state != MCA_BTL_TCP_CONNECTED && + cmpval < 0)) { + mca_btl_tcp_endpoint_close(btl_endpoint); + btl_endpoint->endpoint_sd = sd; + if(mca_btl_tcp_endpoint_send_connect_ack(btl_endpoint) != OMPI_SUCCESS) { + mca_btl_tcp_endpoint_close(btl_endpoint); + OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock); + OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock); + return false; + } + mca_btl_tcp_endpoint_event_init(btl_endpoint, sd); + opal_event_add(&btl_endpoint->endpoint_recv_event, 0); + mca_btl_tcp_endpoint_connected(btl_endpoint); +#if OMPI_ENABLE_DEBUG && WANT_PEER_DUMP + mca_btl_tcp_endpoint_dump(btl_endpoint, "accepted"); +#endif + OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock); + OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock); + return true; + } + } + OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock); + OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock); + return false; +} + + +/* + * Remove any event registrations associated with the socket + * and update the endpoint state to reflect the connection has + * been closed. + */ + +void mca_btl_tcp_endpoint_close(mca_btl_base_endpoint_t* btl_endpoint) +{ + if(btl_endpoint->endpoint_sd >= 0) { + opal_event_del(&btl_endpoint->endpoint_recv_event); + opal_event_del(&btl_endpoint->endpoint_send_event); + close(btl_endpoint->endpoint_sd); + btl_endpoint->endpoint_sd = -1; + } + btl_endpoint->endpoint_state = MCA_BTL_TCP_CLOSED; + btl_endpoint->endpoint_retries++; +} + +void mca_btl_tcp_endpoint_shutdown(mca_btl_base_endpoint_t* btl_endpoint) +{ + OPAL_THREAD_LOCK(&btl_endpoint->endpoint_recv_lock); + OPAL_THREAD_LOCK(&btl_endpoint->endpoint_send_lock); + mca_btl_tcp_endpoint_close(btl_endpoint); + btl_endpoint->endpoint_state = MCA_BTL_TCP_SHUTDOWN; + OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock); + OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock); +} + + +/* + * Setup endpoint state to reflect that connection has been established, + * and start any pending sends. 
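The accept path resolves simultaneous connects deterministically: an incoming socket replaces the local attempt only when no connection exists yet, or when the connection is not fully established and the peer's process name orders before ours, so both sides agree on which socket survives. The decision rule reduced to a sketch (the real code also re-checks the peer address and performs the ack exchange):

```c
/* Sketch of the tie-break used by mca_btl_tcp_endpoint_accept().
 * name_cmp is the result of comparing the peer's process name to ours
 * (negative means the peer orders first). */
static int should_accept_incoming(int have_socket, mca_btl_tcp_state_t state, int name_cmp)
{
    if (!have_socket)
        return 1;                                    /* nothing to give up        */
    if (state != MCA_BTL_TCP_CONNECTED && name_cmp < 0)
        return 1;                                    /* peer wins the tie-break   */
    return 0;                                        /* keep our own connection   */
}
```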
+ */ + +static void mca_btl_tcp_endpoint_connected(mca_btl_base_endpoint_t* btl_endpoint) +{ + /* setup socket options */ + btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECTED; + btl_endpoint->endpoint_retries = 0; + if(opal_list_get_size(&btl_endpoint->endpoint_frags) > 0) { + if(NULL == btl_endpoint->endpoint_send_frag) + btl_endpoint->endpoint_send_frag = (mca_btl_tcp_frag_t*) + opal_list_remove_first(&btl_endpoint->endpoint_frags); + opal_event_add(&btl_endpoint->endpoint_send_event, 0); + } +} + + +/* + * A blocking recv on a non-blocking socket. Used to receive the small amount of connection + * information that identifies the endpoints endpoint. + */ +static int mca_btl_tcp_endpoint_recv_blocking(mca_btl_base_endpoint_t* btl_endpoint, void* data, size_t size) +{ + unsigned char* ptr = (unsigned char*)data; + size_t cnt = 0; + while(cnt < size) { + int retval = recv(btl_endpoint->endpoint_sd, (char *)ptr+cnt, size-cnt, 0); + + /* remote closed connection */ + if(retval == 0) { + mca_btl_tcp_endpoint_close(btl_endpoint); + return -1; + } + + /* socket is non-blocking so handle errors */ + if(retval < 0) { + IMPORTANT_WINDOWS_COMMENT(); + if(ompi_socket_errno != EINTR && ompi_socket_errno != EAGAIN && ompi_socket_errno != EWOULDBLOCK) { + BTL_ERROR(("recv() failed with errno=%d",ompi_socket_errno)); + mca_btl_tcp_endpoint_close(btl_endpoint); + return -1; + } + continue; + } + cnt += retval; + } + return cnt; +} + + + +/* + * Receive the endpoints globally unique process identification from a newly + * connected socket and verify the expected response. If so, move the + * socket to a connected state. + */ + +static int mca_btl_tcp_endpoint_recv_connect_ack(mca_btl_base_endpoint_t* btl_endpoint) +{ + orte_process_name_t guid; + mca_btl_tcp_proc_t* btl_proc = btl_endpoint->endpoint_proc; + + if((mca_btl_tcp_endpoint_recv_blocking(btl_endpoint, &guid, sizeof(orte_process_name_t))) != sizeof(orte_process_name_t)) { + return OMPI_ERR_UNREACH; + } + + /* compare this to the expected values */ + if(memcmp(&btl_proc->proc_name, &guid, sizeof(orte_process_name_t)) != 0) { + BTL_ERROR(("received unexpected process identifier [%lu,%lu,%lu]", + ORTE_NAME_ARGS(&guid))); + mca_btl_tcp_endpoint_close(btl_endpoint); + return OMPI_ERR_UNREACH; + } + + /* connected */ + mca_btl_tcp_endpoint_connected(btl_endpoint); +#if OMPI_ENABLE_DEBUG && WANT_PEER_DUMP + mca_btl_tcp_endpoint_dump(btl_endpoint, "connected"); +#endif + return OMPI_SUCCESS; +} + + +void mca_btl_tcp_set_socket_options(int sd) +{ + int optval; +#if defined(TCP_NODELAY) + optval = 1; + if(setsockopt(sd, IPPROTO_TCP, TCP_NODELAY, (char *)&optval, sizeof(optval)) < 0) { + BTL_ERROR(("setsockopt(TCP_NODELAY) failed with errno=%d", ompi_socket_errno)); + } +#endif +#if defined(SO_SNDBUF) + if(mca_btl_tcp_component.tcp_sndbuf > 0 && + setsockopt(sd, SOL_SOCKET, SO_SNDBUF, (char *)&mca_btl_tcp_component.tcp_sndbuf, sizeof(int)) < 0) { + BTL_ERROR(("setsockopt(SO_SNDBUF) failed with errno %d", ompi_socket_errno)); + } +#endif +#if defined(SO_RCVBUF) + if(mca_btl_tcp_component.tcp_rcvbuf > 0 && + setsockopt(sd, SOL_SOCKET, SO_RCVBUF, (char *)&mca_btl_tcp_component.tcp_rcvbuf, sizeof(int)) < 0) { + BTL_ERROR(("setsockopt(SO_RCVBUF) failed with errno %d", ompi_socket_errno)); + } +#endif +} + + + +/* + * Start a connection to the endpoint. This will likely not complete, + * as the socket is set to non-blocking, so register for event + * notification of connect completion. 
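Both sides of a connection finish wire-up with the same tiny handshake: write the local orte_process_name_t, read the peer's, and compare it against the name learned from the modex; any mismatch tears the socket down. The identity check, isolated as a sketch:

```c
/* Sketch of the identity check done after the connect-ack exchange: the name
 * read off the socket must be the one the modex advertised for this peer. */
static int verify_peer_identity(const orte_process_name_t* received,
                                const orte_process_name_t* expected)
{
    if (memcmp(received, expected, sizeof(orte_process_name_t)) != 0) {
        return OMPI_ERR_UNREACH;     /* wrong peer: caller closes the socket */
    }
    return OMPI_SUCCESS;
}
```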
On connection we send + * our globally unique process identifier to the endpoint and wait for + * the endpoints response. + */ + +static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endpoint) +{ + int rc,flags; + struct sockaddr_in endpoint_addr; + + btl_endpoint->endpoint_sd = socket(AF_INET, SOCK_STREAM, 0); + if (btl_endpoint->endpoint_sd < 0) { + btl_endpoint->endpoint_retries++; + return OMPI_ERR_UNREACH; + } + + /* setup socket buffer sizes */ + mca_btl_tcp_set_socket_options(btl_endpoint->endpoint_sd); + + /* setup event callbacks */ + mca_btl_tcp_endpoint_event_init(btl_endpoint, btl_endpoint->endpoint_sd); + + /* setup the socket as non-blocking */ + if((flags = fcntl(btl_endpoint->endpoint_sd, F_GETFL, 0)) < 0) { + BTL_ERROR(("fcntl(F_GETFL) failed with errno=%d", ompi_socket_errno)); + } else { + flags |= O_NONBLOCK; + if(fcntl(btl_endpoint->endpoint_sd, F_SETFL, flags) < 0) + BTL_ERROR(("fcntl(F_SETFL) failed with errno=%d", ompi_socket_errno)); + } + + /* start the connect - will likely fail with EINPROGRESS */ + endpoint_addr.sin_family = AF_INET; + endpoint_addr.sin_addr = btl_endpoint->endpoint_addr->addr_inet; + endpoint_addr.sin_port = btl_endpoint->endpoint_addr->addr_port; + if(connect(btl_endpoint->endpoint_sd, (struct sockaddr*)&endpoint_addr, sizeof(endpoint_addr)) < 0) { + /* non-blocking so wait for completion */ + IMPORTANT_WINDOWS_COMMENT(); + if(ompi_socket_errno == EINPROGRESS || ompi_socket_errno == EWOULDBLOCK) { + btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECTING; + opal_event_add(&btl_endpoint->endpoint_send_event, 0); + return OMPI_SUCCESS; + } + mca_btl_tcp_endpoint_close(btl_endpoint); + btl_endpoint->endpoint_retries++; + return OMPI_ERR_UNREACH; + } + + /* send our globally unique process identifier to the endpoint */ + if((rc = mca_btl_tcp_endpoint_send_connect_ack(btl_endpoint)) == OMPI_SUCCESS) { + btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK; + opal_event_add(&btl_endpoint->endpoint_recv_event, 0); + } else { + mca_btl_tcp_endpoint_close(btl_endpoint); + } + return rc; +} + + +/* + * Check the status of the connection. If the connection failed, will retry + * later. Otherwise, send this processes identifier to the endpoint on the + * newly connected socket. + */ + +static void mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_endpoint) +{ + int so_error = 0; + ompi_socklen_t so_length = sizeof(so_error); + + /* unregister from receiving event notifications */ + opal_event_del(&btl_endpoint->endpoint_send_event); + + /* check connect completion status */ + if(getsockopt(btl_endpoint->endpoint_sd, SOL_SOCKET, SO_ERROR, (char *)&so_error, &so_length) < 0) { + BTL_ERROR(("getsockopt() failed with errno=%d", ompi_socket_errno)); + mca_btl_tcp_endpoint_close(btl_endpoint); + return; + } + IMPORTANT_WINDOWS_COMMENT(); + if(so_error == EINPROGRESS || so_error == EWOULDBLOCK) { + opal_event_add(&btl_endpoint->endpoint_send_event, 0); + return; + } + if(so_error != 0) { + BTL_ERROR(("connect() failed with errno=%d", so_error)); + mca_btl_tcp_endpoint_close(btl_endpoint); + return; + } + + if(mca_btl_tcp_endpoint_send_connect_ack(btl_endpoint) == OMPI_SUCCESS) { + btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK; + opal_event_add(&btl_endpoint->endpoint_recv_event, 0); + } else { + mca_btl_tcp_endpoint_close(btl_endpoint); + } +} + + +/* + * A file descriptor is available/ready for recv. Check the state + * of the socket and take the appropriate action. 
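start_connect and complete_connect together are the textbook non-blocking connect: call connect(), expect EINPROGRESS, wait for the socket to turn writable, then read SO_ERROR to find out whether the handshake actually succeeded. The same pattern in isolation (generic POSIX sketch, error handling trimmed):

```c
#include <errno.h>
#include <fcntl.h>
#include <sys/socket.h>
#include <netinet/in.h>

/* Kick off a non-blocking connect; 0 means "connected or in progress". */
static int start_nonblocking_connect(int sd, const struct sockaddr_in* to)
{
    int flags = fcntl(sd, F_GETFL, 0);
    fcntl(sd, F_SETFL, flags | O_NONBLOCK);

    if (connect(sd, (const struct sockaddr*)to, sizeof(*to)) == 0)
        return 0;                                    /* connected immediately    */
    return (errno == EINPROGRESS) ? 0 : -1;          /* otherwise wait for write */
}

/* Later, when the fd is reported writable: did the connect really succeed? */
static int finish_nonblocking_connect(int sd)
{
    int so_error = 0;
    socklen_t len = sizeof(so_error);
    if (getsockopt(sd, SOL_SOCKET, SO_ERROR, &so_error, &len) < 0)
        return -1;
    return (so_error == 0) ? 0 : -1;                 /* 0: three-way handshake done */
}
```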
+ */ + +static void mca_btl_tcp_endpoint_recv_handler(int sd, short flags, void* user) +{ + mca_btl_base_endpoint_t* btl_endpoint = (mca_btl_base_endpoint_t *)user; + OPAL_THREAD_LOCK(&btl_endpoint->endpoint_recv_lock); + switch(btl_endpoint->endpoint_state) { + case MCA_BTL_TCP_CONNECT_ACK: + { + mca_btl_tcp_endpoint_recv_connect_ack(btl_endpoint); + OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock); + break; + } + case MCA_BTL_TCP_CONNECTED: + { + mca_btl_tcp_frag_t* frag = btl_endpoint->endpoint_recv_frag; + if(NULL == frag) { + int rc; + MCA_BTL_TCP_FRAG_ALLOC_MAX(frag, rc); + if(NULL == frag) { + OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock); + return; + } + MCA_BTL_TCP_FRAG_INIT_DST(frag, btl_endpoint); + } + + /* check for completion of non-blocking recv on the current fragment */ + if(mca_btl_tcp_frag_recv(frag, sd) == false) { + btl_endpoint->endpoint_recv_frag = frag; + OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock); + } else { + btl_endpoint->endpoint_recv_frag = NULL; + OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock); + switch(frag->hdr.type) { + case MCA_BTL_TCP_HDR_TYPE_SEND: + { + mca_btl_base_recv_reg_t* reg = frag->btl->tcp_reg + frag->hdr.base.tag; + reg->cbfunc(&frag->btl->super, frag->hdr.base.tag, &frag->base, reg->cbdata); + break; + } + default: + { + break; + } + } + MCA_BTL_TCP_FRAG_RETURN_MAX(frag); + } + break; + } + case MCA_BTL_TCP_SHUTDOWN: + { + OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock); + break; + } + default: + { + OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock); + BTL_ERROR(("invalid socket state(%d)", btl_endpoint->endpoint_state)); + mca_btl_tcp_endpoint_close(btl_endpoint); + break; + } + } +} + + +/* + * A file descriptor is available/ready for send. Check the state + * of the socket and take the appropriate action. 
+ */ + +static void mca_btl_tcp_endpoint_send_handler(int sd, short flags, void* user) +{ + mca_btl_tcp_endpoint_t* btl_endpoint = (mca_btl_tcp_endpoint_t *)user; + OPAL_THREAD_LOCK(&btl_endpoint->endpoint_send_lock); + switch(btl_endpoint->endpoint_state) { + case MCA_BTL_TCP_CONNECTING: + mca_btl_tcp_endpoint_complete_connect(btl_endpoint); + break; + case MCA_BTL_TCP_CONNECTED: + { + /* complete the current send */ + do { + mca_btl_tcp_frag_t* frag = btl_endpoint->endpoint_send_frag; + if(mca_btl_tcp_frag_send(frag, btl_endpoint->endpoint_sd) == false) { + break; + } + + /* if required - update request status and release fragment */ + OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock); + frag->base.des_cbfunc(&frag->btl->super, frag->endpoint, &frag->base, frag->rc); + OPAL_THREAD_LOCK(&btl_endpoint->endpoint_send_lock); + + /* progress any pending sends */ + btl_endpoint->endpoint_send_frag = (mca_btl_tcp_frag_t*) + opal_list_remove_first(&btl_endpoint->endpoint_frags); + } while (NULL != btl_endpoint->endpoint_send_frag); + + /* if nothing else to do unregister for send event notifications */ + if(NULL == btl_endpoint->endpoint_send_frag) { + opal_event_del(&btl_endpoint->endpoint_send_event); + } + break; + } + default: + BTL_ERROR(("invalid connection state (%d)", + btl_endpoint->endpoint_state)); + opal_event_del(&btl_endpoint->endpoint_send_event); + break; + } + OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock); +} + + + diff --git a/ompi/mca/btl/tcp/btl_tcp_endpoint.h b/ompi/mca/btl/tcp/btl_tcp_endpoint.h new file mode 100644 index 0000000000..311a8f9f24 --- /dev/null +++ b/ompi/mca/btl/tcp/btl_tcp_endpoint.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_BTL_TCP_ENDPOINT_H +#define MCA_BTL_TCP_ENDPOINT_H + +#include "opal/class/opal_list.h" +#include "opal/event/event.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/btl/btl.h" +#include "btl_tcp_frag.h" +#include "btl_tcp.h" +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif + +OBJ_CLASS_DECLARATION(mca_btl_tcp_endpoint_t); + + +/** + * State of TCP endpoint connection. + */ + +typedef enum { + MCA_BTL_TCP_CLOSED, + MCA_BTL_TCP_CONNECTING, + MCA_BTL_TCP_CONNECT_ACK, + MCA_BTL_TCP_CONNECTED, + MCA_BTL_TCP_SHUTDOWN, + MCA_BTL_TCP_FAILED +} mca_btl_tcp_state_t; + +/** + * An abstraction that represents a connection to a endpoint process. + * An instance of mca_btl_base_endpoint_t is associated w/ each process + * and BTL pair at startup. 
However, connections to the endpoint + * are established dynamically on an as-needed basis: + */ + +struct mca_btl_base_endpoint_t { + opal_list_item_t super; + struct mca_btl_tcp_module_t* endpoint_btl; /**< BTL instance that created this connection */ + struct mca_btl_tcp_proc_t* endpoint_proc; /**< proc structure corresponding to endpoint */ + struct mca_btl_tcp_addr_t* endpoint_addr; /**< address of endpoint */ + int endpoint_sd; /**< socket connection to endpoint */ + struct mca_btl_tcp_frag_t* endpoint_send_frag; /**< current send frag being processed */ + struct mca_btl_tcp_frag_t* endpoint_recv_frag; /**< current recv frag being processed */ + mca_btl_tcp_state_t endpoint_state; /**< current state of the connection */ + size_t endpoint_retries; /**< number of connection retries attempted */ + opal_list_t endpoint_frags; /**< list of pending frags to send */ + opal_mutex_t endpoint_send_lock; /**< lock for concurrent access to endpoint state */ + opal_mutex_t endpoint_recv_lock; /**< lock for concurrent access to endpoint state */ + opal_event_t endpoint_send_event; /**< event for async processing of send frags */ + opal_event_t endpoint_recv_event; /**< event for async processing of recv frags */ + bool endpoint_nbo; /**< convert headers to network byte order? */ +}; + +typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t; +typedef mca_btl_base_endpoint_t mca_btl_tcp_endpoint_t; + +void mca_btl_tcp_set_socket_options(int sd); +void mca_btl_tcp_endpoint_close(mca_btl_base_endpoint_t*); +int mca_btl_tcp_endpoint_send(mca_btl_base_endpoint_t*, struct mca_btl_tcp_frag_t*); +bool mca_btl_tcp_endpoint_accept(mca_btl_base_endpoint_t*, struct sockaddr_in*, int); +void mca_btl_tcp_endpoint_shutdown(mca_btl_base_endpoint_t*); + +#if defined(c_plusplus) || defined(__cplusplus) +} +#endif +#endif diff --git a/ompi/mca/btl/tcp/btl_tcp_frag.c b/ompi/mca/btl/tcp/btl_tcp_frag.c new file mode 100644 index 0000000000..8e74ee14da --- /dev/null +++ b/ompi/mca/btl/tcp/btl_tcp_frag.c @@ -0,0 +1,170 @@ +#include "ompi_config.h" +#include "include/ompi_socket_errno.h" +#include "ompi/mca/btl/base/btl_base_error.h" +#include "btl_tcp_frag.h" +#include "btl_tcp_endpoint.h" + + + +static void mca_btl_tcp_frag_common_constructor(mca_btl_tcp_frag_t* frag) +{ + frag->base.des_src = NULL; + frag->base.des_src_cnt = 0; + frag->base.des_dst = NULL; + frag->base.des_dst_cnt = 0; +} + +static void mca_btl_tcp_frag_eager_constructor(mca_btl_tcp_frag_t* frag) +{ + frag->size = mca_btl_tcp_module.super.btl_eager_limit; + mca_btl_tcp_frag_common_constructor(frag); +} + +static void mca_btl_tcp_frag_max_constructor(mca_btl_tcp_frag_t* frag) +{ + frag->size = mca_btl_tcp_module.super.btl_max_send_size; + mca_btl_tcp_frag_common_constructor(frag); +} + +static void mca_btl_tcp_frag_user_constructor(mca_btl_tcp_frag_t* frag) +{ + frag->size = 0; + mca_btl_tcp_frag_common_constructor(frag); +} + + +OBJ_CLASS_INSTANCE( + mca_btl_tcp_frag_t, + mca_btl_base_descriptor_t, + NULL, + NULL); + +OBJ_CLASS_INSTANCE( + mca_btl_tcp_frag_eager_t, + mca_btl_base_descriptor_t, + mca_btl_tcp_frag_eager_constructor, + NULL); + +OBJ_CLASS_INSTANCE( + mca_btl_tcp_frag_max_t, + mca_btl_base_descriptor_t, + mca_btl_tcp_frag_max_constructor, + NULL); + +OBJ_CLASS_INSTANCE( + mca_btl_tcp_frag_user_t, + mca_btl_base_descriptor_t, + mca_btl_tcp_frag_user_constructor, + NULL); + + +bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t* frag, int sd) +{ + int cnt=-1; + size_t i, num_vecs; + + /* non-blocking write, but continue if interrupted */ + 
while(cnt < 0) { + cnt = writev(sd, frag->iov_ptr, frag->iov_cnt); + if(cnt < 0) { + switch(ompi_socket_errno) { + case EINTR: + continue; + case EWOULDBLOCK: + /* opal_output(0, "mca_btl_tcp_frag_send: EWOULDBLOCK\n"); */ + return false; + case EFAULT: + BTL_ERROR(("writev error (%p, %d)\n\t%s(%d)\n", + frag->iov_ptr[0].iov_base, frag->iov_ptr[0].iov_len, + strerror(ompi_socket_errno), frag->iov_cnt)); + default: + { + BTL_ERROR(("writev failed with errno=%d", ompi_socket_errno)); + mca_btl_tcp_endpoint_close(frag->endpoint); + return false; + } + } + } + } + + /* if the write didn't complete - update the iovec state */ + num_vecs = frag->iov_cnt; + for(i=0; i<num_vecs; i++) { + if(cnt >= (int)frag->iov_ptr->iov_len) { + cnt -= frag->iov_ptr->iov_len; + frag->iov_ptr++; + frag->iov_idx++; + frag->iov_cnt--; + } else { + frag->iov_ptr->iov_base = (ompi_iov_base_ptr_t) + (((unsigned char*)frag->iov_ptr->iov_base) + cnt); + frag->iov_ptr->iov_len -= cnt; + break; + } + } + return (frag->iov_cnt == 0); +} + + +bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd) +{ + int cnt=-1; + size_t i, num_vecs; + + /* non-blocking read, but continue if interrupted */ + while(cnt < 0) { + cnt = readv(sd, frag->iov_ptr, frag->iov_cnt); + if(cnt < 0) { + switch(ompi_socket_errno) { + case EINTR: + continue; + case EWOULDBLOCK: + return false; + case EFAULT: + opal_output( 0, "mca_btl_tcp_frag_recv: readv error (%p, %d)\n\t%s(%d)\n", + frag->iov_ptr[0].iov_base, frag->iov_ptr[0].iov_len, + strerror(ompi_socket_errno), frag->iov_cnt ); + default: + { + opal_output(0, "mca_btl_tcp_frag_recv: readv failed with errno=%d", + ompi_socket_errno); + mca_btl_tcp_endpoint_close(frag->endpoint); + return false; + } + } + } + } + + /* if the read didn't complete - update the iovec state */ + num_vecs = frag->iov_cnt; + for(i=0; i<num_vecs; i++) { + if(cnt >= (int)frag->iov_ptr->iov_len) { + cnt -= frag->iov_ptr->iov_len; + frag->iov_idx++; + frag->iov_ptr++; + frag->iov_cnt--; + } else { + frag->iov_ptr->iov_base = (ompi_iov_base_ptr_t) + (((unsigned char*)frag->iov_ptr->iov_base) + cnt); + frag->iov_ptr->iov_len -= cnt; + break; + } + } + + /* read header */ + if(frag->iov_cnt == 0 && frag->iov_idx == 1) { + switch(frag->hdr.type) { + case MCA_BTL_TCP_HDR_TYPE_SEND: + frag->iov[1].iov_base = (frag+1); + frag->iov[1].iov_len = frag->hdr.size; + frag->segments[0].seg_addr.pval = frag+1; + frag->segments[0].seg_len = frag->hdr.size; + frag->iov_cnt++; + return false; + default: + break; + } + } + return (frag->iov_cnt == 0); +} +
diff --git a/ompi/mca/btl/tcp/btl_tcp_frag.h b/ompi/mca/btl/tcp/btl_tcp_frag.h new file mode 100644 index 0000000000..0ea13377c0 --- /dev/null +++ b/ompi/mca/btl/tcp/btl_tcp_frag.h @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_BTL_TCP_FRAG_H +#define MCA_BTL_TCP_FRAG_H + + +#define MCA_BTL_TCP_FRAG_ALIGN (8) +#include "ompi_config.h" +#include "btl_tcp.h" +#include "btl_tcp_hdr.h" + +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif +OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_btl_tcp_frag_t); + + + +/** + * TCP fragment derived type.
+ */ +struct mca_btl_tcp_frag_t { + mca_btl_base_descriptor_t base; + mca_btl_base_segment_t segments[2]; + struct mca_btl_base_endpoint_t *endpoint; + struct mca_btl_tcp_module_t* btl; + mca_btl_tcp_hdr_t hdr; + struct iovec iov[3]; + struct iovec *iov_ptr; + size_t iov_cnt; + size_t iov_idx; + size_t size; + int rc; +}; +typedef struct mca_btl_tcp_frag_t mca_btl_tcp_frag_t; +OBJ_CLASS_DECLARATION(mca_btl_tcp_frag_t); + + +typedef struct mca_btl_tcp_frag_t mca_btl_tcp_frag_eager_t; + +OBJ_CLASS_DECLARATION(mca_btl_tcp_frag_eager_t); + +typedef struct mca_btl_tcp_frag_t mca_btl_tcp_frag_max_t; + +OBJ_CLASS_DECLARATION(mca_btl_tcp_frag_max_t); + +typedef struct mca_btl_tcp_frag_t mca_btl_tcp_frag_user_t; + +OBJ_CLASS_DECLARATION(mca_btl_tcp_frag_user_t); + + +/* + * Macros to allocate/return descriptors from module specific + * free list(s). + */ + +#define MCA_BTL_TCP_FRAG_ALLOC_EAGER(frag, rc) \ +{ \ + \ + opal_list_item_t *item; \ + OMPI_FREE_LIST_WAIT(&mca_btl_tcp_component.tcp_frag_eager, item, rc); \ + frag = (mca_btl_tcp_frag_t*) item; \ +} + +#define MCA_BTL_TCP_FRAG_RETURN_EAGER(frag) \ +{ \ + OMPI_FREE_LIST_RETURN(&mca_btl_tcp_component.tcp_frag_eager, \ + (opal_list_item_t*)(frag)); \ +} + +#define MCA_BTL_TCP_FRAG_ALLOC_MAX(frag, rc) \ +{ \ + \ + opal_list_item_t *item; \ + OMPI_FREE_LIST_WAIT(&mca_btl_tcp_component.tcp_frag_max, item, rc); \ + frag = (mca_btl_tcp_frag_t*) item; \ +} + +#define MCA_BTL_TCP_FRAG_RETURN_MAX(frag) \ +{ \ + OMPI_FREE_LIST_RETURN(&mca_btl_tcp_component.tcp_frag_max, \ + (opal_list_item_t*)(frag)); \ +} + + +#define MCA_BTL_TCP_FRAG_ALLOC_USER(frag, rc) \ +{ \ + opal_list_item_t *item; \ + OMPI_FREE_LIST_WAIT(&mca_btl_tcp_component.tcp_frag_user, item, rc); \ + frag = (mca_btl_tcp_frag_t*) item; \ +} + +#define MCA_BTL_TCP_FRAG_RETURN_USER(frag) \ +{ \ + OMPI_FREE_LIST_RETURN(&mca_btl_tcp_component.tcp_frag_user, \ + (opal_list_item_t*)(frag)); \ +} + +#define MCA_BTL_TCP_FRAG_INIT_SRC(frag,endpoint) \ +do { \ + size_t i; \ + frag->rc = 0; \ + frag->hdr.size = 0; \ + frag->iov_idx = 0; \ + frag->endpoint = endpoint; \ + frag->hdr.size = 0; \ + frag->iov_cnt = 1; \ + frag->iov_ptr = frag->iov; \ + frag->iov[0].iov_base = &frag->hdr; \ + frag->iov[0].iov_len = sizeof(frag->hdr); \ + for(i=0; i<frag->base.des_src_cnt; i++) { \ + frag->hdr.size += frag->segments[i].seg_len; \ + frag->iov[i+1].iov_len = frag->segments[i].seg_len; \ + frag->iov[i+1].iov_base = frag->segments[i].seg_addr.pval; \ + frag->iov_cnt++; \ + } \ +} while(0) + +#define MCA_BTL_TCP_FRAG_INIT_DST(frag,ep) \ +do { \ + frag->rc = 0; \ + frag->btl = ep->endpoint_btl; \ + frag->endpoint = ep; \ + frag->iov[0].iov_len = sizeof(frag->hdr); \ + frag->iov[0].iov_base = &frag->hdr; \ + frag->iov_cnt = 1; \ + frag->iov_idx = 0; \ + frag->iov_ptr = frag->iov; \ + frag->base.des_src = NULL; \ + frag->base.des_src_cnt = 0; \ + frag->base.des_dst = frag->segments; \ + frag->base.des_dst_cnt = 1; \ +} while(0) + + +bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t*, int sd); +bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t*, int sd); + + +#if defined(c_plusplus) || defined(__cplusplus) +} +#endif +#endif
diff --git a/ompi/mca/btl/tcp/btl_tcp_hdr.h b/ompi/mca/btl/tcp/btl_tcp_hdr.h new file mode 100644 index 0000000000..27dd313d7d --- /dev/null +++ b/ompi/mca/btl/tcp/btl_tcp_hdr.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_BTL_TCP_HDR_H +#define MCA_BTL_TCP_HDR_H + + +#include "ompi_config.h" +#include "mca/btl/base/base.h" +#include "btl_tcp.h" + +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif + +/** + * TCP header. + */ + +#define MCA_BTL_TCP_HDR_TYPE_SEND 1 +#define MCA_BTL_TCP_HDR_TYPE_PUT 2 +#define MCA_BTL_TCP_HDR_TYPE_GET 3 + + +struct mca_btl_tcp_hdr_t { + mca_btl_base_header_t base; + uint8_t type; + uint64_t size; +}; +typedef struct mca_btl_tcp_hdr_t mca_btl_tcp_hdr_t; + +#if defined(c_plusplus) || defined(__cplusplus) +} +#endif +#endif
diff --git a/ompi/mca/btl/tcp/btl_tcp_proc.c b/ompi/mca/btl/tcp/btl_tcp_proc.c new file mode 100644 index 0000000000..723f56668a --- /dev/null +++ b/ompi/mca/btl/tcp/btl_tcp_proc.c @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "orte/class/orte_proc_table.h" +#include "ompi/mca/btl/base/btl_base_error.h" +#include "ompi/mca/pml/base/pml_base_module_exchange.h" + +#include "btl_tcp.h" +#include "btl_tcp_proc.h" + +static void mca_btl_tcp_proc_construct(mca_btl_tcp_proc_t* proc); +static void mca_btl_tcp_proc_destruct(mca_btl_tcp_proc_t* proc); + + +OBJ_CLASS_INSTANCE( + mca_btl_tcp_proc_t, + opal_list_item_t, + mca_btl_tcp_proc_construct, + mca_btl_tcp_proc_destruct); + + +void mca_btl_tcp_proc_construct(mca_btl_tcp_proc_t* proc) +{ + proc->proc_ompi = 0; + proc->proc_addrs = NULL; + proc->proc_addr_count = 0; + proc->proc_endpoints = NULL; + proc->proc_endpoint_count = 0; + OBJ_CONSTRUCT(&proc->proc_lock, opal_mutex_t); +} + +/* + * Cleanup tcp proc instance + */ + +void mca_btl_tcp_proc_destruct(mca_btl_tcp_proc_t* proc) +{ + /* remove from list of all proc instances */ + OPAL_THREAD_LOCK(&mca_btl_tcp_component.tcp_lock); + orte_hash_table_remove_proc(&mca_btl_tcp_component.tcp_procs, &proc->proc_name); + OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock); + + /* release resources */ + if(NULL != proc->proc_endpoints) { + free(proc->proc_endpoints); + OBJ_DESTRUCT(&proc->proc_lock); + } +} + + +/* + * Create a TCP process structure. There is a one-to-one correspondence + * between an ompi_proc_t and a mca_btl_tcp_proc_t instance. We cache + * additional data (specifically the list of mca_btl_tcp_endpoint_t instances, + * and published addresses) associated w/ a given destination on this + * datastructure.
+ */ + +mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(ompi_proc_t* ompi_proc) +{ + int rc; + size_t size; + mca_btl_tcp_proc_t* btl_proc; + + OPAL_THREAD_LOCK(&mca_btl_tcp_component.tcp_lock); + btl_proc = (mca_btl_tcp_proc_t*)orte_hash_table_get_proc( + &mca_btl_tcp_component.tcp_procs, &ompi_proc->proc_name); + if(NULL != btl_proc) { + OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock); + return btl_proc; + } + + btl_proc = OBJ_NEW(mca_btl_tcp_proc_t); + if(NULL == btl_proc) + return NULL; + btl_proc->proc_ompi = ompi_proc; + btl_proc->proc_name = ompi_proc->proc_name; + + /* add to hash table of all proc instances */ + orte_hash_table_set_proc( + &mca_btl_tcp_component.tcp_procs, + &btl_proc->proc_name, + btl_proc); + OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock); + + /* lookup tcp parameters exported by this proc */ + rc = mca_base_modex_recv( &mca_btl_tcp_component.super.btl_version, + ompi_proc, + (void**)&btl_proc->proc_addrs, + &size); + if(rc != OMPI_SUCCESS) { + BTL_ERROR(("mca_base_modex_recv: failed with return value=%d", rc)); + OBJ_RELEASE(btl_proc); + return NULL; + } + if(0 != (size % sizeof(mca_btl_tcp_addr_t))) { + BTL_ERROR(("mca_base_modex_recv: invalid size %d\n", size)); + return NULL; + } + btl_proc->proc_addr_count = size / sizeof(mca_btl_tcp_addr_t); + + /* allocate space for endpoint array - one for each exported address */ + btl_proc->proc_endpoints = (mca_btl_base_endpoint_t**) + malloc(btl_proc->proc_addr_count * sizeof(mca_btl_base_endpoint_t*)); + if(NULL == btl_proc->proc_endpoints) { + OBJ_RELEASE(btl_proc); + return NULL; + } + if(NULL == mca_btl_tcp_component.tcp_local && ompi_proc == ompi_proc_local()) + mca_btl_tcp_component.tcp_local = btl_proc; + return btl_proc; +} + + +/* + * Note that this routine must be called with the lock on the process + * already held. Insert a btl instance into the proc array and assign + * it an address. + */ +int mca_btl_tcp_proc_insert( + mca_btl_tcp_proc_t* btl_proc, + mca_btl_base_endpoint_t* btl_endpoint) +{ + struct mca_btl_tcp_module_t *btl_tcp = btl_endpoint->endpoint_btl; + size_t i; + unsigned long net1; + + /* insert into endpoint array */ + btl_endpoint->endpoint_proc = btl_proc; + btl_proc->proc_endpoints[btl_proc->proc_endpoint_count++] = btl_endpoint; + + net1 = btl_tcp->tcp_ifaddr.sin_addr.s_addr & btl_tcp->tcp_ifmask.sin_addr.s_addr; + + /* + * Look through the proc instance for an address that is on the + * directly attached network. If we don't find one, pick the first + * unused address. + */ + for(i=0; i<btl_proc->proc_addr_count; i++) { + mca_btl_tcp_addr_t* endpoint_addr = btl_proc->proc_addrs + i; + unsigned long net2 = endpoint_addr->addr_inet.s_addr & btl_tcp->tcp_ifmask.sin_addr.s_addr; + if(endpoint_addr->addr_inuse != 0) + continue; + if(net1 == net2) { + btl_endpoint->endpoint_addr = endpoint_addr; + break; + } else if(btl_endpoint->endpoint_addr == 0) + btl_endpoint->endpoint_addr = endpoint_addr; + } + + /* Make sure there is a common interface */ + if( NULL != btl_endpoint->endpoint_addr ) { + btl_endpoint->endpoint_addr->addr_inuse++; + return OMPI_SUCCESS; + } + return OMPI_ERR_UNREACH; +} + +/* + * Remove an endpoint from the proc array and indicate the address is + * no longer in use.
+ */ + +int mca_btl_tcp_proc_remove(mca_btl_tcp_proc_t* btl_proc, mca_btl_base_endpoint_t* btl_endpoint) +{ + size_t i; + OPAL_THREAD_LOCK(&btl_proc->proc_lock); + for(i=0; i<btl_proc->proc_endpoint_count; i++) { + if(btl_proc->proc_endpoints[i] == btl_endpoint) { + memmove(btl_proc->proc_endpoints+i, btl_proc->proc_endpoints+i+1, + (btl_proc->proc_endpoint_count-i-1)*sizeof(mca_btl_base_endpoint_t*)); + if(--btl_proc->proc_endpoint_count == 0) { + OPAL_THREAD_UNLOCK(&btl_proc->proc_lock); + OBJ_RELEASE(btl_proc); + return OMPI_SUCCESS; + } + btl_endpoint->endpoint_addr->addr_inuse--; + break; + } + } + OPAL_THREAD_UNLOCK(&btl_proc->proc_lock); + return OMPI_SUCCESS; +} + +/* + * Look for an existing TCP process instance based on the globally unique + * process identifier. + */ +mca_btl_tcp_proc_t* mca_btl_tcp_proc_lookup(const orte_process_name_t *name) +{ + mca_btl_tcp_proc_t* proc; + OPAL_THREAD_LOCK(&mca_btl_tcp_component.tcp_lock); + proc = (mca_btl_tcp_proc_t*)orte_hash_table_get_proc( + &mca_btl_tcp_component.tcp_procs, name); + OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock); + return proc; +} + +/* + * Loop through all available endpoints for one matching the source address + * of the request. + */ +bool mca_btl_tcp_proc_accept(mca_btl_tcp_proc_t* btl_proc, struct sockaddr_in* addr, int sd) +{ + size_t i; + OPAL_THREAD_LOCK(&btl_proc->proc_lock); + for(i=0; i<btl_proc->proc_endpoint_count; i++) { + mca_btl_base_endpoint_t* btl_endpoint = btl_proc->proc_endpoints[i]; + if(mca_btl_tcp_endpoint_accept(btl_endpoint, addr, sd)) { + OPAL_THREAD_UNLOCK(&btl_proc->proc_lock); + return true; + } + } + OPAL_THREAD_UNLOCK(&btl_proc->proc_lock); + return false; +} + +
diff --git a/ompi/mca/btl/tcp/btl_tcp_proc.h b/ompi/mca/btl/tcp/btl_tcp_proc.h new file mode 100644 index 0000000000..40691c28ef --- /dev/null +++ b/ompi/mca/btl/tcp/btl_tcp_proc.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_BTL_TCP_PROC_H +#define MCA_BTL_TCP_PROC_H + +#include "opal/class/opal_object.h" +#include "orte/mca/ns/ns.h" +#include "ompi/proc/proc.h" +#include "btl_tcp.h" +#include "btl_tcp_addr.h" +#include "btl_tcp_endpoint.h" + +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif +OBJ_CLASS_DECLARATION(mca_btl_tcp_proc_t); + +/** + * Represents the state of a remote process and the set of addresses + * that it exports. Also caches an instance of mca_btl_base_endpoint_t for each + * BTL instance that attempts to open a connection to the process.
+ */ +struct mca_btl_tcp_proc_t { + opal_list_item_t super; + /**< allow proc to be placed on a list */ + + ompi_proc_t *proc_ompi; + /**< pointer to corresponding ompi_proc_t */ + + orte_process_name_t proc_name; + /**< globally unique identifier for the process */ + + struct mca_btl_tcp_addr_t* proc_addrs; + /**< array of addresses exported by peer */ + + size_t proc_addr_count; + /**< number of addresses published by endpoint */ + + struct mca_btl_base_endpoint_t **proc_endpoints; + /**< array of endpoints that have been created to access this proc */ + + size_t proc_endpoint_count; + /**< number of endpoints */ + + opal_mutex_t proc_lock; + /**< lock to protect against concurrent access to proc state */ +}; +typedef struct mca_btl_tcp_proc_t mca_btl_tcp_proc_t; + +mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(ompi_proc_t* ompi_proc); +mca_btl_tcp_proc_t* mca_btl_tcp_proc_lookup(const orte_process_name_t* name); +int mca_btl_tcp_proc_insert(mca_btl_tcp_proc_t*, mca_btl_base_endpoint_t*); +int mca_btl_tcp_proc_remove(mca_btl_tcp_proc_t*, mca_btl_base_endpoint_t*); +bool mca_btl_tcp_proc_accept(mca_btl_tcp_proc_t*, struct sockaddr_in*, int); + +/** + * Inlined function to return local TCP proc instance. + */ + +static inline mca_btl_tcp_proc_t* mca_btl_tcp_proc_local(void) +{ + if(NULL == mca_btl_tcp_component.tcp_local) + mca_btl_tcp_component.tcp_local = mca_btl_tcp_proc_create(ompi_proc_local()); + return mca_btl_tcp_component.tcp_local; +} + +#if defined(c_plusplus) || defined(__cplusplus) +} +#endif +#endif diff --git a/ompi/mca/btl/tcp/configure.params b/ompi/mca/btl/tcp/configure.params new file mode 100644 index 0000000000..e99d97d9b9 --- /dev/null +++ b/ompi/mca/btl/tcp/configure.params @@ -0,0 +1,22 @@ +# -*- shell-script -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University. +# All rights reserved. +# Copyright (c) 2004-2005 The Trustees of the University of Tennessee. +# All rights reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module + +PARAM_INIT_FILE=btl_tcp.c +PARAM_CONFIG_HEADER_FILE="tcp_config.h" +PARAM_CONFIG_FILES="Makefile"
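Note on the non-blocking send/receive path in btl_tcp_frag.c: mca_btl_tcp_frag_send() and mca_btl_tcp_frag_recv() issue a single writev()/readv() and, when the call moves fewer bytes than requested, consume the transferred bytes from the front of the fragment's iovec array so that the next invocation (driven by the endpoint's send/recv events) resumes exactly where the previous one stopped. The sketch below restates that bookkeeping as a standalone, compilable helper; the names advance_iov and send_once_nonblocking are illustrative only and are not part of the BTL.

#include <errno.h>
#include <stdbool.h>
#include <sys/uio.h>

/* Consume 'cnt' bytes from the front of an iovec array, shrinking the first
 * partially transferred vector in place (mirrors the update loop in
 * mca_btl_tcp_frag_send/_recv). */
static void advance_iov(struct iovec **iov, size_t *iov_cnt, size_t cnt)
{
    while (cnt > 0 && *iov_cnt > 0) {
        if (cnt >= (*iov)->iov_len) {
            cnt -= (*iov)->iov_len;   /* vector fully transferred - drop it */
            (*iov)++;
            (*iov_cnt)--;
        } else {
            (*iov)->iov_base = (char *)(*iov)->iov_base + cnt;
            (*iov)->iov_len -= cnt;   /* vector partially transferred - shrink it */
            cnt = 0;
        }
    }
}

/* One non-blocking writev() attempt; returns true once nothing is left to
 * send, false when the caller should wait for the next write event. */
static bool send_once_nonblocking(int sd, struct iovec **iov, size_t *iov_cnt)
{
    ssize_t cnt = -1;
    while (cnt < 0) {
        cnt = writev(sd, *iov, (int)*iov_cnt);
        if (cnt < 0) {
            if (errno == EINTR)
                continue;             /* interrupted - retry immediately */
            return false;             /* EWOULDBLOCK or hard error */
        }
    }
    advance_iov(iov, iov_cnt, (size_t)cnt);
    return (*iov_cnt == 0);
}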
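The receive side is header-driven: iov[0] always covers the fixed-size mca_btl_tcp_hdr_t, and only after the header has been read in full does the code add a second iovec sized from hdr.size for the payload (the MCA_BTL_TCP_HDR_TYPE_SEND branch in mca_btl_tcp_frag_recv). A minimal sketch of that two-phase setup, using toy types rather than the real mca_btl_tcp_frag_t:

#include <stdint.h>
#include <sys/uio.h>

/* Toy stand-ins for mca_btl_tcp_hdr_t / mca_btl_tcp_frag_t (illustrative only). */
struct toy_hdr  { uint8_t type; uint64_t size; };
struct toy_frag {
    struct toy_hdr hdr;
    char           payload[4096];
    struct iovec   iov[2];
    size_t         iov_cnt;   /* vectors still to be filled by readv() */
    size_t         iov_idx;   /* vectors already completed */
};

/* Phase 1: post a read for the fixed-size header only. */
static void toy_frag_post_header(struct toy_frag *frag)
{
    frag->iov[0].iov_base = &frag->hdr;
    frag->iov[0].iov_len  = sizeof(frag->hdr);
    frag->iov_cnt = 1;
    frag->iov_idx = 0;
}

/* Phase 2: once the header is complete, size the payload vector from it. */
static void toy_frag_post_payload(struct toy_frag *frag)
{
    frag->iov[1].iov_base = frag->payload;
    frag->iov[1].iov_len  = (size_t)frag->hdr.size;
    frag->iov_cnt++;          /* the frag is done when this vector drains too */
}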
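Address selection in mca_btl_tcp_proc_insert() works by masking both the local interface address and each candidate peer address with the local netmask and preferring a match (the net1 == net2 comparison), falling back to the first unused address otherwise. The same comparison as a standalone helper; the name on_same_subnet is ours, not part of the BTL:

#include <netinet/in.h>
#include <stdbool.h>

/* True when 'local' and 'peer' fall in the same IPv4 subnet under 'netmask'
 * (all values in network byte order, as stored in struct in_addr). */
static bool on_same_subnet(struct in_addr local, struct in_addr peer,
                           struct in_addr netmask)
{
    return (local.s_addr & netmask.s_addr) == (peer.s_addr & netmask.s_addr);
}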