
ob1: optimize blocking send and receive paths

Per RFC. There are two optimizations in this commit:

 - Allocate requests for blocking sends and receives on the stack. This
   bypasses the request free list and saves two atomic operations on the
   critical path. This change improves small-message ping-pong latency by
   50-200ns on both AMD and Intel CPUs. A sketch of the pattern follows
   this list.

 - For small messages, try the btl sendi function before initializing a
   send request. If the sendi fails or the btl does not have a sendi
   function, silently fall back to the standard send path. See the second
   sketch after this list.
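
The heap-versus-stack change in the blocking receive path reduces to the
shape below. This is a condensed sketch assembled from the diff that
follows, not a complete function: argument checking, the status copy-out,
and PERUSE tracing are elided.

    /* before: the request is popped from, and later pushed back to, the
     * global free list -- one atomic operation each way */
    mca_pml_ob1_recv_request_t *recvreq;
    MCA_PML_OB1_RECV_REQUEST_ALLOC(recvreq);            /* atomic pop */
    /* ... init, start, wait ... */
    ompi_request_free ((ompi_request_t **) &recvreq);   /* atomic push */

    /* after: the request lives on the caller's stack for the duration of
     * the blocking call, so no free-list traffic is needed */
    mca_pml_ob1_recv_request_t recvreq;
    OBJ_CONSTRUCT(&recvreq, mca_pml_ob1_recv_request_t);
    MCA_PML_OB1_RECV_REQUEST_INIT(&recvreq, addr, count, datatype,
                                  src, tag, comm, false);
    MCA_PML_OB1_RECV_REQUEST_START(&recvreq);
    ompi_request_wait_completion(&recvreq.req_recv.req_base.req_ompi);
    MCA_PML_BASE_RECV_REQUEST_FINI(&recvreq.req_recv);
    OBJ_DESTRUCT(&recvreq);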
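
The send-side fast path hinges on one decision in mca_pml_ob1_send(),
reproduced below from the diff (the comments are added here for
explanation). Synchronous sends are excluded because a fire-and-forget
sendi cannot provide the wait-for-matching-receive semantics they require.

    /* only non-synchronous sends may take the fast path */
    if (MCA_PML_BASE_SEND_SYNCHRONOUS != sendmode) {
        rc = mca_pml_ob1_send_inline (buf, count, datatype, dst, tag, seqn,
                                      dst_proc, endpoint, comm);
        if (OPAL_LIKELY(0 <= rc)) {
            return OMPI_SUCCESS;  /* the btl put the message on the wire */
        }
        /* sendi was unavailable or failed: fall through to the standard
         * send-request path */
    }

mca_pml_ob1_send_inline() itself returns OMPI_ERR_NOT_AVAILABLE when the
message is larger than 256 bytes or the selected eager btl has no sendi
function, so the fallback happens silently, as the commit message states.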

cmr=v1.7.5:reviewer=brbarret

This commit was SVN r30343.
This commit is contained in:
Nathan Hjelm 2014-01-21 15:16:21 +00:00
parent 7e1593ef80
commit 2b57f4227e
4 changed files with 157 additions and 67 deletions

View file

@@ -1,22 +1,23 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2013 The University of Tennessee and The University
* Copyright (c) 2004-2014 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
@@ -43,10 +44,10 @@ int mca_pml_ob1_irecv_init(void *addr,
MCA_PML_OB1_RECV_REQUEST_INIT(recvreq,
addr,
count, datatype, src, tag, comm, true);
PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
&((recvreq)->req_recv.req_base),
PERUSE_RECV);
*request = (ompi_request_t *) recvreq;
return OMPI_SUCCESS;
@@ -87,28 +88,29 @@ int mca_pml_ob1_recv(void *addr,
struct ompi_communicator_t *comm,
ompi_status_public_t * status)
{
mca_pml_ob1_recv_request_t recvreq;
int rc;
mca_pml_ob1_recv_request_t *recvreq;
MCA_PML_OB1_RECV_REQUEST_ALLOC(recvreq);
if (NULL == recvreq)
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
MCA_PML_OB1_RECV_REQUEST_INIT(recvreq,
addr,
count, datatype, src, tag, comm, false);
OBJ_CONSTRUCT(&recvreq, mca_pml_ob1_recv_request_t);
MCA_PML_OB1_RECV_REQUEST_INIT(&recvreq, addr, count, datatype,
src, tag, comm, false);
PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
&((recvreq)->req_recv.req_base),
&(recvreq.req_recv.req_base),
PERUSE_RECV);
MCA_PML_OB1_RECV_REQUEST_START(recvreq);
ompi_request_wait_completion(&recvreq->req_recv.req_base.req_ompi);
MCA_PML_OB1_RECV_REQUEST_START(&recvreq);
ompi_request_wait_completion(&recvreq.req_recv.req_base.req_ompi);
if (NULL != status) { /* return status */
*status = recvreq->req_recv.req_base.req_ompi.req_status;
*status = recvreq.req_recv.req_base.req_ompi.req_status;
}
rc = recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR;
ompi_request_free( (ompi_request_t**)&recvreq );
rc = recvreq.req_recv.req_base.req_ompi.req_status.MPI_ERROR;
MCA_PML_BASE_RECV_REQUEST_FINI(&recvreq.req_recv);
OBJ_DESTRUCT(&recvreq);
return rc;
}
@@ -128,7 +130,7 @@ mca_pml_ob1_imrecv( void *buf,
mca_pml_ob1_comm_proc_t* proc;
mca_pml_ob1_comm_t* ob1_comm;
uint64_t seq;
/* get the request from the message and the frag from the request
before we overwrite everything */
recvreq = (mca_pml_ob1_recv_request_t*) (*message)->req_ptr;
@@ -152,7 +154,7 @@ mca_pml_ob1_imrecv( void *buf,
recvreq->req_recv.req_base.req_type = MCA_PML_REQUEST_RECV;
MCA_PML_OB1_RECV_REQUEST_INIT(recvreq,
buf,
count, datatype,
src, tag, comm, false);
OBJ_RELEASE(comm);
@@ -199,7 +201,7 @@ mca_pml_ob1_imrecv( void *buf,
assert(0);
}
MCA_PML_OB1_RECV_FRAG_RETURN(frag);
ompi_message_return(*message);
*message = MPI_MESSAGE_NULL;
*request = (ompi_request_t *) recvreq;
@@ -247,7 +249,7 @@ mca_pml_ob1_mrecv( void *buf,
recvreq->req_recv.req_base.req_type = MCA_PML_REQUEST_RECV;
MCA_PML_OB1_RECV_REQUEST_INIT(recvreq,
buf,
count, datatype,
src, tag, comm, false);
OBJ_RELEASE(comm);
@@ -292,7 +294,7 @@ mca_pml_ob1_mrecv( void *buf,
default:
assert(0);
}
ompi_message_return(*message);
*message = MPI_MESSAGE_NULL;
ompi_request_wait_completion(&(recvreq->req_recv.req_base.req_ompi));

View file

@@ -1,20 +1,21 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2013 The University of Tennessee and The University
* Copyright (c) 2004-2014 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
@@ -45,7 +46,7 @@ int mca_pml_ob1_isend_init(void *buf,
datatype,
dst, tag,
comm, sendmode, true);
PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
&(sendreq)->req_send.req_base,
PERUSE_SEND);
@@ -54,6 +55,62 @@ int mca_pml_ob1_isend_init(void *buf,
return OMPI_SUCCESS;
}
/* try to get a small message out on to the wire quickly */
static inline int mca_pml_ob1_send_inline (void *buf, size_t count,
ompi_datatype_t * datatype,
int dst, int tag, int16_t seqn,
ompi_proc_t *dst_proc, mca_bml_base_endpoint_t* endpoint,
ompi_communicator_t * comm)
{
mca_pml_ob1_match_hdr_t match;
mca_bml_base_btl_t* bml_btl;
OPAL_PTRDIFF_TYPE lb, extent;
opal_convertor_t convertor;
size_t size = 0;
int rc;
bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
ompi_datatype_get_extent (datatype, &lb, &extent);
if (OPAL_UNLIKELY((extent * count) > 256 || !bml_btl->btl->btl_sendi)) {
return OMPI_ERR_NOT_AVAILABLE;
}
if (count > 0) {
/* initialize just enough of the convertor to avoid a SEGV in opal_convertor_cleanup */
convertor.stack_size = 0;
/* We will create a convertor specialized for the */
/* remote architecture and prepared with the datatype. */
opal_convertor_copy_and_prepare_for_send (dst_proc->proc_convertor,
(const struct opal_datatype_t *) datatype,
count, buf, 0, &convertor);
/* find out the packed size of the data */
opal_convertor_get_packed_size (&convertor, &size);
}
match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
match.hdr_ctx = comm->c_contextid;
match.hdr_src = comm->c_my_rank;
match.hdr_tag = tag;
match.hdr_seq = seqn;
/* try to send immediately */
rc = mca_bml_base_sendi (bml_btl, &convertor, &match, OMPI_PML_OB1_MATCH_HDR_LEN,
size, MCA_BTL_NO_ORDER, MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP,
MCA_PML_OB1_HDR_TYPE_MATCH, NULL);
if (count > 0) {
opal_convertor_cleanup (&convertor);
}
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
return rc;
}
return (int) size;
}
int mca_pml_ob1_isend(void *buf,
size_t count,
@@ -66,11 +123,11 @@ int mca_pml_ob1_isend(void *buf,
{
int rc;
mca_pml_ob1_send_request_t *sendreq = NULL;
MCA_PML_OB1_SEND_REQUEST_ALLOC(comm, dst, sendreq);
if (NULL == sendreq)
return OMPI_ERR_OUT_OF_RESOURCE;
MCA_PML_OB1_SEND_REQUEST_INIT(sendreq,
buf,
count,
@@ -87,7 +144,6 @@ int mca_pml_ob1_isend(void *buf,
return rc;
}
int mca_pml_ob1_send(void *buf,
size_t count,
ompi_datatype_t * datatype,
@@ -96,14 +152,33 @@ int mca_pml_ob1_send(void *buf,
mca_pml_base_send_mode_t sendmode,
ompi_communicator_t * comm)
{
mca_pml_ob1_comm_t* ob1_comm = comm->c_pml_comm;
ompi_proc_t *dst_proc = ompi_comm_peer_lookup (comm, dst);
mca_bml_base_endpoint_t* endpoint = (mca_bml_base_endpoint_t*)
dst_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
mca_pml_ob1_send_request_t sendreq;
int16_t seqn;
int rc;
mca_pml_ob1_send_request_t *sendreq = NULL;
MCA_PML_OB1_SEND_REQUEST_ALLOC(comm, dst, sendreq);
if (NULL == sendreq)
return OMPI_ERR_OUT_OF_RESOURCE;
MCA_PML_OB1_SEND_REQUEST_INIT(sendreq,
if (OPAL_UNLIKELY(NULL == endpoint)) {
return OMPI_ERR_UNREACH;
}
seqn = (uint16_t) OPAL_THREAD_ADD32(&ob1_comm->procs[dst].send_sequence, 1);
if (MCA_PML_BASE_SEND_SYNCHRONOUS != sendmode) {
rc = mca_pml_ob1_send_inline (buf, count, datatype, dst, tag, seqn, dst_proc,
endpoint, comm);
if (OPAL_LIKELY(0 <= rc)) {
return OMPI_SUCCESS;
}
}
OBJ_CONSTRUCT(&sendreq, mca_pml_ob1_send_request_t);
sendreq.req_send.req_base.req_proc = dst_proc;
sendreq.src_des = NULL;
MCA_PML_OB1_SEND_REQUEST_INIT(&sendreq,
buf,
count,
datatype,
@@ -111,18 +186,19 @@ int mca_pml_ob1_send(void *buf,
comm, sendmode, false);
PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
&(sendreq)->req_send.req_base,
&sendreq.req_send.req_base,
PERUSE_SEND);
MCA_PML_OB1_SEND_REQUEST_START(sendreq, rc);
MCA_PML_OB1_SEND_REQUEST_START_W_SEQ(&sendreq, endpoint, seqn, rc);
if (rc != OMPI_SUCCESS) {
MCA_PML_OB1_SEND_REQUEST_RETURN( sendreq );
return rc;
}
ompi_request_wait_completion(&sendreq->req_send.req_base.req_ompi);
ompi_request_wait_completion(&sendreq.req_send.req_base.req_ompi);
rc = sendreq.req_send.req_base.req_ompi.req_status.MPI_ERROR;
MCA_PML_BASE_SEND_REQUEST_FINI(&sendreq.req_send);
OBJ_DESTRUCT(&sendreq);
rc = sendreq->req_send.req_base.req_ompi.req_status.MPI_ERROR;
ompi_request_free( (ompi_request_t**)&sendreq );
return rc;
}

View file

@@ -145,7 +145,7 @@ static int mca_pml_ob1_recv_request_cancel(struct ompi_request_t* ompi_request,
static void mca_pml_ob1_recv_request_construct(mca_pml_ob1_recv_request_t* request)
{
request->req_recv.req_base.req_type = MCA_PML_REQUEST_RECV;
/* the request type is set by the superclass */
request->req_recv.req_base.req_ompi.req_free = mca_pml_ob1_recv_request_free;
request->req_recv.req_base.req_ompi.req_cancel = mca_pml_ob1_recv_request_cancel;
request->req_rdma_cnt = 0;

View file

@@ -143,7 +143,7 @@ get_request_from_send_pending(mca_pml_ob1_send_pending_t *type)
sendmode, \
persistent) \
{ \
MCA_PML_BASE_SEND_REQUEST_INIT(&sendreq->req_send, \
MCA_PML_BASE_SEND_REQUEST_INIT(&(sendreq)->req_send, \
buf, \
count, \
datatype, \
@@ -157,9 +157,9 @@ get_request_from_send_pending(mca_pml_ob1_send_pending_t *type)
}
#define MCA_PML_OB1_SEND_REQUEST_RESET(sendreq) \
if (sendreq->req_send.req_bytes_packed > 0) { \
if ((sendreq)->req_send.req_bytes_packed > 0) { \
size_t _position = 0; \
opal_convertor_set_position(&sendreq->req_send.req_base.req_convertor, \
opal_convertor_set_position(&(sendreq)->req_send.req_base.req_convertor, \
&_position); \
assert( 0 == _position ); \
}
@@ -188,6 +188,11 @@ static inline void mca_pml_ob1_free_rdma_resources(mca_pml_ob1_send_request_t* s
rc = mca_pml_ob1_send_request_start(sendreq); \
} while (0)
#define MCA_PML_OB1_SEND_REQUEST_START_W_SEQ(sendreq, endpoint, seq, rc) \
do { \
rc = mca_pml_ob1_send_request_start_seq (sendreq, endpoint, seq); \
} while (0)
/*
* Mark a send request as completed at the MPI level.
@@ -430,29 +435,19 @@ mca_pml_ob1_send_request_start_btl( mca_pml_ob1_send_request_t* sendreq,
}
static inline int
mca_pml_ob1_send_request_start( mca_pml_ob1_send_request_t* sendreq )
{
mca_pml_ob1_comm_t* comm = sendreq->req_send.req_base.req_comm->c_pml_comm;
mca_bml_base_endpoint_t* endpoint = (mca_bml_base_endpoint_t*)
sendreq->req_send.req_base.req_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
size_t i;
if( OPAL_UNLIKELY(endpoint == NULL) ) {
return OMPI_ERR_UNREACH;
}
mca_pml_ob1_send_request_start_seq (mca_pml_ob1_send_request_t* sendreq, mca_bml_base_endpoint_t* endpoint, int32_t seqn)
{
sendreq->req_endpoint = endpoint;
sendreq->req_state = 0;
sendreq->req_lock = 0;
sendreq->req_pipeline_depth = 0;
sendreq->req_bytes_delivered = 0;
sendreq->req_pending = MCA_PML_OB1_SEND_PENDING_NONE;
sendreq->req_send.req_base.req_sequence = OPAL_THREAD_ADD32(
&comm->procs[sendreq->req_send.req_base.req_peer].send_sequence,1);
sendreq->req_send.req_base.req_sequence = seqn;
MCA_PML_BASE_SEND_START( &sendreq->req_send.req_base );
for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) {
for(size_t i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) {
mca_bml_base_btl_t* bml_btl;
int rc;
@@ -467,6 +462,23 @@ mca_pml_ob1_send_request_start( mca_pml_ob1_send_request_t* sendreq )
return OMPI_SUCCESS;
}
static inline int
mca_pml_ob1_send_request_start( mca_pml_ob1_send_request_t* sendreq )
{
mca_bml_base_endpoint_t* endpoint = (mca_bml_base_endpoint_t*)
sendreq->req_send.req_base.req_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
mca_pml_ob1_comm_t* comm = sendreq->req_send.req_base.req_comm->c_pml_comm;
int32_t seqn;
if (OPAL_UNLIKELY(NULL == endpoint)) {
return OMPI_ERR_UNREACH;
}
seqn = OPAL_THREAD_ADD32(&comm->procs[sendreq->req_send.req_base.req_peer].send_sequence, 1);
return mca_pml_ob1_send_request_start_seq (sendreq, endpoint, seqn);
}
/**
* Initiate a put scheduled by the receiver.
*/