diff --git a/opal/mca/btl/usnic/README.txt b/opal/mca/btl/usnic/README.txt index 1af846b1c9..ab0b7d12b7 100644 --- a/opal/mca/btl/usnic/README.txt +++ b/opal/mca/btl/usnic/README.txt @@ -335,3 +335,40 @@ libfabric abstractions: fi_fabric: corresponds to a VIC PF fi_domain: corresponds to a VIC VF fi_endpoint: resources inside the VIC VF (basically a QP) + +====================================== + +MPI_THREAD_MULTIPLE support + +In order to make the usnic btl thread-safe, mutex locks are used +to protect the critical paths, i.e., libfabric routines, bookkeeping, etc. + +The lock in question is btl_usnic_lock. It is a RECURSIVE lock, meaning that +the same thread can take the lock again even if it already has the lock, to +allow the callback function to post another segment right away if we know +that the current segment is completed inline. (So we can call send from +within send without deadlocking.) + +These two functions take care of hotel checkin/checkout, and we +have to protect that part. So we take the mutex lock before we enter each +function. + +- opal_btl_usnic_check_rts() +- opal_btl_usnic_handle_ack() + +The calls to the libfabric routines + +- opal_btl_usnic_endpoint_send_segment() (fi_send) +- opal_btl_usnic_recv_call() (fi_recvmsg) + +have to be protected as well. + +Also, cclient connection checking (opal_btl_usnic_connectivity_ping) has to be +protected. This happens only in the beginning, but the cclient communicates with +the cagent through opal_fd_read/write(), and if two or more clients do opal_fd_write() +at the same time, the data might be corrupted. + +With this scheme, many functions in btl/usnic that make calls to the +listed functions are protected by the OPAL_THREAD_LOCK macro, which will only +be active if the user specifies MPI_Init_thread() with MPI_THREAD_MULTIPLE
+support. 
diff --git a/opal/mca/btl/usnic/btl_usnic_cclient.c b/opal/mca/btl/usnic/btl_usnic_cclient.c index d76b3b8ca9..6370c2c790 100644 --- a/opal/mca/btl/usnic/btl_usnic_cclient.c +++ b/opal/mca/btl/usnic/btl_usnic_cclient.c @@ -228,7 +228,8 @@ int opal_btl_usnic_connectivity_ping(uint32_t src_ipv4_addr, int src_port, uint32_t dest_netmask, int dest_port, char *dest_nodename, size_t max_msg_size) -{ +{ + OPAL_THREAD_LOCK(&btl_usnic_lock); /* If connectivity checking is not enabled, do nothing */ if (!mca_btl_usnic_component.connectivity_enabled) { return OPAL_SUCCESS; @@ -259,6 +260,7 @@ int opal_btl_usnic_connectivity_ping(uint32_t src_ipv4_addr, int src_port, ABORT("usnic connectivity client IPC write failed"); /* Will not return */ } + OPAL_THREAD_UNLOCK(&btl_usnic_lock); return OPAL_SUCCESS; } diff --git a/opal/mca/btl/usnic/btl_usnic_compat.c b/opal/mca/btl/usnic/btl_usnic_compat.c index 1289093ac1..2346dc56c5 100644 --- a/opal/mca/btl/usnic/btl_usnic_compat.c +++ b/opal/mca/btl/usnic/btl_usnic_compat.c @@ -509,6 +509,7 @@ opal_btl_usnic_prepare_src( size_t* size, uint32_t flags) { + OPAL_THREAD_LOCK(&btl_usnic_lock); opal_btl_usnic_module_t *module = (opal_btl_usnic_module_t*) base_module; opal_btl_usnic_send_frag_t *frag; uint32_t payload_len; @@ -552,6 +553,7 @@ opal_btl_usnic_prepare_src( #endif #endif + OPAL_THREAD_UNLOCK(&btl_usnic_lock); return &frag->sf_base.uf_base; } diff --git a/opal/mca/btl/usnic/btl_usnic_component.c b/opal/mca/btl/usnic/btl_usnic_component.c index 07803ce9ee..8a3b6e1800 100644 --- a/opal/mca/btl/usnic/btl_usnic_component.c +++ b/opal/mca/btl/usnic/btl_usnic_component.c @@ -86,6 +86,9 @@ #define OPAL_BTL_USNIC_NUM_COMPLETIONS 500 +/* MPI_THREAD_MULTIPLE_SUPPORT */ +opal_recursive_mutex_t btl_usnic_lock; + /* RNG buffer definition */ opal_rng_buff_t opal_btl_usnic_rand_buff = {0}; @@ -222,6 +225,8 @@ static int usnic_component_close(void) opal_btl_usnic_cleanup_tests(); #endif + OBJ_DESTRUCT(&btl_usnic_lock); + return OPAL_SUCCESS; } @@ 
-615,13 +620,22 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, *num_btl_modules = 0; - /* Currently refuse to run if MPI_THREAD_MULTIPLE is enabled */ + /* MPI_THREAD_MULTIPLE is only supported in 2.0+ */ if (want_mpi_threads && !mca_btl_base_thread_multiple_override) { - opal_output_verbose(5, USNIC_OUT, - "btl:usnic: MPI_THREAD_MULTIPLE not supported; skipping this component"); - return NULL; + + if (OMPI_MAJOR_VERSION >= 2) { + opal_output_verbose(5, USNIC_OUT, + "btl:usnic: MPI_THREAD_MULTIPLE support is in testing phase."); + } + else { + opal_output_verbose(5, USNIC_OUT, + "btl:usnic: MPI_THREAD_MULTIPLE is not supported in version < 2."); + return NULL; + } } + OBJ_CONSTRUCT(&btl_usnic_lock, opal_recursive_mutex_t); + /* We only want providers named "usnic that are of type EP_DGRAM */ fabric_attr.prov_name = "usnic"; ep_attr.type = FI_EP_DGRAM; @@ -1151,6 +1165,8 @@ static int usnic_handle_completion( /* Make the completion be Valgrind-defined */ opal_memchecker_base_mem_defined(seg, sizeof(*seg)); + OPAL_THREAD_LOCK(&btl_usnic_lock); + /* Handle work completions */ switch(seg->us_type) { @@ -1181,6 +1197,8 @@ static int usnic_handle_completion( BTL_ERROR(("Unhandled completion segment type %d", seg->us_type)); break; } + + OPAL_THREAD_UNLOCK(&btl_usnic_lock); return 1; } diff --git a/opal/mca/btl/usnic/btl_usnic_module.c b/opal/mca/btl/usnic/btl_usnic_module.c index 786f444da4..7cca5354ab 100644 --- a/opal/mca/btl/usnic/btl_usnic_module.c +++ b/opal/mca/btl/usnic/btl_usnic_module.c @@ -1086,6 +1086,7 @@ opal_btl_usnic_module_progress_sends( /* * Handle all the retransmits we can */ + OPAL_THREAD_LOCK(&btl_usnic_lock); if (OPAL_UNLIKELY(!opal_list_is_empty(&module->pending_resend_segs))) { usnic_do_resends(module); } @@ -1195,6 +1196,7 @@ opal_btl_usnic_module_progress_sends( endpoint = next_endpoint; } + OPAL_THREAD_UNLOCK(&btl_usnic_lock); } /* @@ -1229,6 +1231,7 @@ usnic_send( opal_btl_usnic_module_t *module; 
opal_btl_usnic_send_segment_t *sseg; + OPAL_THREAD_LOCK(&btl_usnic_lock); endpoint = (opal_btl_usnic_endpoint_t *)base_endpoint; module = (opal_btl_usnic_module_t *)base_module; frag = (opal_btl_usnic_send_frag_t*) descriptor; @@ -1337,6 +1340,7 @@ usnic_send( ++module->stats.pml_module_sends; + OPAL_THREAD_UNLOCK(&btl_usnic_lock); return rc; } diff --git a/opal/mca/btl/usnic/btl_usnic_module.h b/opal/mca/btl/usnic/btl_usnic_module.h index b7f49c596b..15cc1933ef 100644 --- a/opal/mca/btl/usnic/btl_usnic_module.h +++ b/opal/mca/btl/usnic/btl_usnic_module.h @@ -53,6 +53,12 @@ BEGIN_C_DECLS +/* + * MPI_THREAD_MULTIPLE support + */ +extern opal_recursive_mutex_t btl_usnic_lock; + + /* * Forward declarations to avoid include loops */ diff --git a/opal/mca/btl/usnic/btl_usnic_recv.c b/opal/mca/btl/usnic/btl_usnic_recv.c index c77388ef23..443e2b0e96 100644 --- a/opal/mca/btl/usnic/btl_usnic_recv.c +++ b/opal/mca/btl/usnic/btl_usnic_recv.c @@ -340,8 +340,9 @@ void opal_btl_usnic_recv_call(opal_btl_usnic_module_t *module, opal_output(0, " Received ACK for sequence number %" UDSEQ " from %s to %s\n", bseg->us_btl_header->ack_seq, remote_ip, local_ip); #endif + OPAL_THREAD_LOCK(&btl_usnic_lock); opal_btl_usnic_handle_ack(endpoint, ack_seq); - + OPAL_THREAD_UNLOCK(&btl_usnic_lock); goto repost; } diff --git a/opal/mca/btl/usnic/btl_usnic_recv.h b/opal/mca/btl/usnic/btl_usnic_recv.h index 4773bba4aa..70ffa7d4db 100644 --- a/opal/mca/btl/usnic/btl_usnic_recv.h +++ b/opal/mca/btl/usnic/btl_usnic_recv.h @@ -157,8 +157,10 @@ opal_btl_usnic_check_rx_seq( #if MSGDEBUG1 opal_output(0, "Handle piggy-packed ACK seq %"UDSEQ"\n", seg->rs_base.us_btl_header->ack_seq); #endif + OPAL_THREAD_LOCK(&btl_usnic_lock); opal_btl_usnic_handle_ack(endpoint, seg->rs_base.us_btl_header->ack_seq); + OPAL_THREAD_UNLOCK(&btl_usnic_lock); } /* Do we have room in the endpoint's receiver window?