1
1

Merge pull request #2762 from rhc54/topic/oobfast

Speed up the OOB/TCP communications by using writev instead of writing the header and then separately writing the body
Этот коммит содержится в:
Ralph Castain 2017-01-19 15:39:06 -08:00 коммит произвёл GitHub
родитель 63caeba84d e5f687f896
Коммит ca50b31de1

Просмотреть файл

@ -13,7 +13,7 @@
* All rights reserved. * All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved. * Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -75,47 +75,89 @@
#include "orte/mca/oob/tcp/oob_tcp_common.h" #include "orte/mca/oob/tcp/oob_tcp_common.h"
#include "orte/mca/oob/tcp/oob_tcp_connection.h" #include "orte/mca/oob/tcp/oob_tcp_connection.h"
static int send_bytes(mca_oob_tcp_peer_t* peer) static int send_msg(mca_oob_tcp_peer_t* peer, mca_oob_tcp_send_t* msg)
{ {
mca_oob_tcp_send_t* msg = peer->send_msg; struct iovec iov[2];
int rc; int iov_count;
ssize_t remain = msg->sdbytes, rc;
OPAL_TIMING_EVENT((&tm_oob, "to %s %d bytes", OPAL_TIMING_EVENT((&tm_oob, "to %s %d bytes",
ORTE_NAME_PRINT(&(peer->name)), msg->sdbytes)); ORTE_NAME_PRINT(&(peer->name)), msg->sdbytes));
while (0 < msg->sdbytes) { iov[0].iov_base = msg->sdptr;
rc = write(peer->sd, msg->sdptr, msg->sdbytes); iov[0].iov_len = msg->sdbytes;
if (rc < 0) { if (!msg->hdr_sent) {
if (opal_socket_errno == EINTR) { if (NULL != msg->data) {
continue; /* relay message - just send that data */
} else if (opal_socket_errno == EAGAIN) { iov[1].iov_base = msg->data;
/* tell the caller to keep this message on active, } else if (NULL != msg->msg->buffer) {
* but let the event lib cycle so other messages /* buffer send */
* can progress while this socket is busy iov[1].iov_base = msg->msg->buffer->base_ptr;
*/ } else {
return ORTE_ERR_RESOURCE_BUSY; iov[1].iov_base = msg->msg->data;
} else if (opal_socket_errno == EWOULDBLOCK) {
/* tell the caller to keep this message on active,
* but let the event lib cycle so other messages
* can progress while this socket is busy
*/
return ORTE_ERR_WOULD_BLOCK;
}
/* we hit an error and cannot progress this message */
opal_output(0, "%s->%s mca_oob_tcp_msg_send_bytes: write failed: %s (%d) [sd = %d]",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&(peer->name)),
strerror(opal_socket_errno),
opal_socket_errno,
peer->sd);
return ORTE_ERR_COMM_FAILURE;
} }
/* update location */ iov[1].iov_len = ntohl(msg->hdr.nbytes);
msg->sdbytes -= rc; remain += ntohl(msg->hdr.nbytes);
msg->sdptr += rc; iov_count = 2;
} else {
iov_count = 1;
}
retry:
rc = writev(peer->sd, iov, iov_count);
if (OPAL_LIKELY(rc == remain)) {
/* we successfully sent the header and the msg data if any */
msg->hdr_sent = true;
msg->sdbytes = 0;
msg->sdptr = (char *)iov[iov_count-1].iov_base + iov[iov_count-1].iov_len;
return ORTE_SUCCESS;
} else if (rc < 0) {
if (opal_socket_errno == EINTR) {
goto retry;
} else if (opal_socket_errno == EAGAIN) {
/* tell the caller to keep this message on active,
* but let the event lib cycle so other messages
* can progress while this socket is busy
*/
return ORTE_ERR_RESOURCE_BUSY;
} else if (opal_socket_errno == EWOULDBLOCK) {
/* tell the caller to keep this message on active,
* but let the event lib cycle so other messages
* can progress while this socket is busy
*/
return ORTE_ERR_WOULD_BLOCK;
} else {
/* we hit an error and cannot progress this message */
opal_output(0, "oob:tcp: send_msg: write failed: %s (%d) [sd = %d]",
strerror(opal_socket_errno),
opal_socket_errno, peer->sd);
return ORTE_ERR_UNREACH;
}
} else {
/* short writev. This usually means the kernel buffer is full,
* so there is no point in retrying at this time.
* Simply update the msg and return with ORTE_ERR_RESOURCE_BUSY */
if ((size_t)rc < msg->sdbytes) {
/* partial write of the header or the msg data */
msg->sdptr = (char *)msg->sdptr + rc;
msg->sdbytes -= rc;
} else {
/* header was fully written, but only a part of the msg data was written */
msg->hdr_sent = true;
rc -= msg->sdbytes;
if (NULL != msg->data) {
/* technically, this should never happen as iov_count
* would be 1 for a zero-byte message, and so we cannot
* have a case where we write the header and part of the
* msg. However, code checkers don't know that and are
* fooled by our earlier check for NULL, and so
* we silence their warnings by using this check */
msg->sdptr = (char *)msg->data + rc;
}
msg->sdbytes = ntohl(msg->hdr.nbytes) - rc;
}
return ORTE_ERR_RESOURCE_BUSY;
} }
/* we sent the full data block */
return ORTE_SUCCESS;
} }
/* /*
@ -155,68 +197,56 @@ void mca_oob_tcp_send_handler(int sd, short flags, void *cbdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == peer->send_msg) ? "NULL" : ORTE_NAME_PRINT(&peer->name)); (NULL == peer->send_msg) ? "NULL" : ORTE_NAME_PRINT(&peer->name));
if (NULL != msg) { if (NULL != msg) {
/* if the header hasn't been completely sent, send it */ opal_output_verbose(2, orte_oob_base_framework.framework_output,
if (!msg->hdr_sent) { "oob:tcp:send_handler SENDING MSG");
if (ORTE_SUCCESS == (rc = send_bytes(peer))) { if (ORTE_SUCCESS == (rc = send_msg(peer, msg))) {
/* header is completely sent */ /* this msg is complete */
msg->hdr_sent = true; if (NULL != msg->data || NULL == msg->msg) {
/* setup to send the data */ /* the relay is complete - release the data */
if (NULL != msg->data) { opal_output_verbose(2, orte_oob_base_framework.framework_output,
/* relay msg - send that data */ "%s MESSAGE RELAY COMPLETE TO %s OF %d BYTES ON SOCKET %d",
msg->sdptr = msg->data; ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
msg->sdbytes = (int)ntohl(msg->hdr.nbytes); ORTE_NAME_PRINT(&(peer->name)),
} else if (NULL == msg->msg) { (int)ntohl(msg->hdr.nbytes), peer->sd);
/* this was a zero-byte relay - nothing more to do */ OBJ_RELEASE(msg);
OBJ_RELEASE(msg); peer->send_msg = NULL;
peer->send_msg = NULL; } else if (NULL != msg->msg->buffer) {
goto next; /* we are done - notify the RML */
} else if (NULL != msg->msg->buffer) { opal_output_verbose(2, orte_oob_base_framework.framework_output,
/* send the buffer data as a single block */ "%s MESSAGE SEND COMPLETE TO %s OF %d BYTES ON SOCKET %d",
msg->sdptr = msg->msg->buffer->base_ptr; ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
msg->sdbytes = msg->msg->buffer->bytes_used; ORTE_NAME_PRINT(&(peer->name)),
} else if (NULL != msg->msg->iov) { (int)ntohl(msg->hdr.nbytes), peer->sd);
/* start with the first iovec */ msg->msg->status = ORTE_SUCCESS;
msg->sdptr = msg->msg->iov[0].iov_base;
msg->sdbytes = msg->msg->iov[0].iov_len;
msg->iovnum = 0;
} else {
/* just send the data */
msg->sdptr = msg->msg->data;
msg->sdbytes = msg->msg->count;
}
/* fall thru and let the send progress */
} else if (ORTE_ERR_RESOURCE_BUSY == rc ||
ORTE_ERR_WOULD_BLOCK == rc) {
/* exit this event and let the event lib progress */
return;
} else {
// report the error
opal_output(0, "%s-%s mca_oob_tcp_peer_send_handler: unable to send header",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&(peer->name)));
opal_event_del(&peer->send_event);
msg->msg->status = rc;
ORTE_RML_SEND_COMPLETE(msg->msg); ORTE_RML_SEND_COMPLETE(msg->msg);
OBJ_RELEASE(msg); OBJ_RELEASE(msg);
peer->send_msg = NULL; peer->send_msg = NULL;
goto next; } else if (NULL != msg->msg->data) {
} /* this was a relay we have now completed - no need to
} * notify the RML as the local proc didn't initiate
/* progress the data transmission */ * the send
if (msg->hdr_sent) { */
if (ORTE_SUCCESS == (rc = send_bytes(peer))) { opal_output_verbose(2, orte_oob_base_framework.framework_output,
/* this block is complete */ "%s MESSAGE RELAY COMPLETE TO %s OF %d BYTES ON SOCKET %d",
if (NULL != msg->data || NULL == msg->msg) { ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
/* the relay is complete - release the data */ ORTE_NAME_PRINT(&(peer->name)),
opal_output_verbose(2, orte_oob_base_framework.framework_output, (int)ntohl(msg->hdr.nbytes), peer->sd);
"%s MESSAGE RELAY COMPLETE TO %s OF %d BYTES ON SOCKET %d", msg->msg->status = ORTE_SUCCESS;
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), OBJ_RELEASE(msg);
ORTE_NAME_PRINT(&(peer->name)), peer->send_msg = NULL;
(int)ntohl(msg->hdr.nbytes), peer->sd); } else {
OBJ_RELEASE(msg); /* rotate to the next iovec */
peer->send_msg = NULL; msg->iovnum++;
} else if (NULL != msg->msg->buffer) { if (msg->iovnum < msg->msg->count) {
/* we are done - notify the RML */ msg->sdptr = msg->msg->iov[msg->iovnum].iov_base;
msg->sdbytes = msg->msg->iov[msg->iovnum].iov_len;
/* exit this event to give the event lib
* a chance to progress any other pending
* actions
*/
return;
} else {
/* this message is complete - notify the RML */
opal_output_verbose(2, orte_oob_base_framework.framework_output, opal_output_verbose(2, orte_oob_base_framework.framework_output,
"%s MESSAGE SEND COMPLETE TO %s OF %d BYTES ON SOCKET %d", "%s MESSAGE SEND COMPLETE TO %s OF %d BYTES ON SOCKET %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -226,64 +256,27 @@ void mca_oob_tcp_send_handler(int sd, short flags, void *cbdata)
ORTE_RML_SEND_COMPLETE(msg->msg); ORTE_RML_SEND_COMPLETE(msg->msg);
OBJ_RELEASE(msg); OBJ_RELEASE(msg);
peer->send_msg = NULL; peer->send_msg = NULL;
} else if (NULL != msg->msg->data) {
/* this was a relay we have now completed - no need to
* notify the RML as the local proc didn't initiate
* the send
*/
opal_output_verbose(2, orte_oob_base_framework.framework_output,
"%s MESSAGE RELAY COMPLETE TO %s OF %d BYTES ON SOCKET %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&(peer->name)),
(int)ntohl(msg->hdr.nbytes), peer->sd);
msg->msg->status = ORTE_SUCCESS;
OBJ_RELEASE(msg);
peer->send_msg = NULL;
} else {
/* rotate to the next iovec */
msg->iovnum++;
if (msg->iovnum < msg->msg->count) {
msg->sdptr = msg->msg->iov[msg->iovnum].iov_base;
msg->sdbytes = msg->msg->iov[msg->iovnum].iov_len;
/* exit this event to give the event lib
* a chance to progress any other pending
* actions
*/
return;
} else {
/* this message is complete - notify the RML */
opal_output_verbose(2, orte_oob_base_framework.framework_output,
"%s MESSAGE SEND COMPLETE TO %s OF %d BYTES ON SOCKET %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&(peer->name)),
(int)ntohl(msg->hdr.nbytes), peer->sd);
msg->msg->status = ORTE_SUCCESS;
ORTE_RML_SEND_COMPLETE(msg->msg);
OBJ_RELEASE(msg);
peer->send_msg = NULL;
}
} }
/* fall thru to queue the next message */
} else if (ORTE_ERR_RESOURCE_BUSY == rc ||
ORTE_ERR_WOULD_BLOCK == rc) {
/* exit this event and let the event lib progress */
return;
} else {
// report the error
opal_output(0, "%s-%s mca_oob_tcp_peer_send_handler: unable to send message ON SOCKET %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&(peer->name)), peer->sd);
opal_event_del(&peer->send_event);
msg->msg->status = rc;
ORTE_RML_SEND_COMPLETE(msg->msg);
OBJ_RELEASE(msg);
peer->send_msg = NULL;
ORTE_FORCED_TERMINATE(1);
return;
} }
/* fall thru to queue the next message */
} else if (ORTE_ERR_RESOURCE_BUSY == rc ||
ORTE_ERR_WOULD_BLOCK == rc) {
/* exit this event and let the event lib progress */
return;
} else {
// report the error
opal_output(0, "%s-%s mca_oob_tcp_peer_send_handler: unable to send message ON SOCKET %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&(peer->name)), peer->sd);
opal_event_del(&peer->send_event);
msg->msg->status = rc;
ORTE_RML_SEND_COMPLETE(msg->msg);
OBJ_RELEASE(msg);
peer->send_msg = NULL;
ORTE_FORCED_TERMINATE(1);
return;
} }
next:
/* if current message completed - progress any pending sends by /* if current message completed - progress any pending sends by
* moving the next in the queue into the "on-deck" position. Note * moving the next in the queue into the "on-deck" position. Note
* that this doesn't mean we send the message right now - we will * that this doesn't mean we send the message right now - we will
@ -657,4 +650,3 @@ static void err_cons(mca_oob_tcp_msg_error_t *ptr)
OBJ_CLASS_INSTANCE(mca_oob_tcp_msg_error_t, OBJ_CLASS_INSTANCE(mca_oob_tcp_msg_error_t,
opal_object_t, opal_object_t,
err_cons, NULL); err_cons, NULL);