From 732d89095b22ebc2db6ae6ef7c7b457796e67084 Mon Sep 17 00:00:00 2001 From: Artem Polyakov Date: Tue, 28 Jun 2016 18:34:09 +0300 Subject: [PATCH] MPI_Waitsome performance improvement by avoiding extra atomic exchanges. Use indices array to mark already completed requests in the pre-wait loop to avoid extra atomic exchanges in the after-wait loop. --- ompi/request/req_wait.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/ompi/request/req_wait.c b/ompi/request/req_wait.c index 5ab2b39648..9473563d0e 100644 --- a/ompi/request/req_wait.c +++ b/ompi/request/req_wait.c @@ -391,8 +391,8 @@ int ompi_request_default_wait_some(size_t count, num_requests_null_inactive++; continue; } - - if( !OPAL_ATOMIC_CMPSET_PTR(&request->req_complete, REQUEST_PENDING, &sync) ) { + indices[i] = OPAL_ATOMIC_CMPSET_PTR(&request->req_complete, REQUEST_PENDING, &sync); + if( !indices[i] ) { /* If the request is completed go ahead and mark it as such */ assert( REQUEST_COMPLETE(request) ); num_requests_done++; @@ -423,15 +423,23 @@ int ompi_request_default_wait_some(size_t count, if( request->req_state == OMPI_REQUEST_INACTIVE ) { continue; } - /* Atomically mark the request as pending. If this succeed - * then the request was not completed, and it is now marked as - * pending. Otherwise, the request is complete )either it was - * before or it has been meanwhile). The major drawback here - * is that we will do all the atomics operations in all cases. + /* Here we have 3 possibilities: + * a) request was found completed in the first loop + * => ( indices[i] == 0 ) + * b) request was completed between first loop and this check + * => ( indices[i] == 1 ) and we can NOT atomically mark the + * request as pending. + * c) request wasn't finished yet + * => ( indices[i] == 1 ) and we CAN atomically mark the + * request as pending. 
+ * NOTE that (i >= num_requests_done) always holds, so the slot written + * below was already read: the counter grows either slower than `i` + * (partial completion) OR in lockstep with `i` (full set completion) */ - if( !OPAL_ATOMIC_CMPSET_PTR(&request->req_complete, &sync, REQUEST_PENDING) ) { - indices[num_requests_done] = i; - num_requests_done++; + if( !indices[i] ){ + indices[num_requests_done++] = i; + } else if( !OPAL_ATOMIC_CMPSET_PTR(&request->req_complete, &sync, REQUEST_PENDING) ) { + indices[num_requests_done++] = i; } } sync_unsets = count - num_requests_null_inactive - num_requests_done;