1
1

Merge pull request #2762 from rhc54/topic/oobfast

Speed up OOB/TCP communications by using writev to send the header and body together, instead of writing the header and then writing the body separately
Этот коммит содержится в:
Ralph Castain 2017-01-19 15:39:06 -08:00 коммит произвёл GitHub
родитель 63caeba84d e5f687f896
Коммит ca50b31de1

Просмотреть файл

@ -13,7 +13,7 @@
* All rights reserved. * All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved. * Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -75,19 +75,45 @@
#include "orte/mca/oob/tcp/oob_tcp_common.h" #include "orte/mca/oob/tcp/oob_tcp_common.h"
#include "orte/mca/oob/tcp/oob_tcp_connection.h" #include "orte/mca/oob/tcp/oob_tcp_connection.h"
static int send_bytes(mca_oob_tcp_peer_t* peer) static int send_msg(mca_oob_tcp_peer_t* peer, mca_oob_tcp_send_t* msg)
{ {
mca_oob_tcp_send_t* msg = peer->send_msg; struct iovec iov[2];
int rc; int iov_count;
ssize_t remain = msg->sdbytes, rc;
OPAL_TIMING_EVENT((&tm_oob, "to %s %d bytes", OPAL_TIMING_EVENT((&tm_oob, "to %s %d bytes",
ORTE_NAME_PRINT(&(peer->name)), msg->sdbytes)); ORTE_NAME_PRINT(&(peer->name)), msg->sdbytes));
while (0 < msg->sdbytes) { iov[0].iov_base = msg->sdptr;
rc = write(peer->sd, msg->sdptr, msg->sdbytes); iov[0].iov_len = msg->sdbytes;
if (rc < 0) { if (!msg->hdr_sent) {
if (NULL != msg->data) {
/* relay message - just send that data */
iov[1].iov_base = msg->data;
} else if (NULL != msg->msg->buffer) {
/* buffer send */
iov[1].iov_base = msg->msg->buffer->base_ptr;
} else {
iov[1].iov_base = msg->msg->data;
}
iov[1].iov_len = ntohl(msg->hdr.nbytes);
remain += ntohl(msg->hdr.nbytes);
iov_count = 2;
} else {
iov_count = 1;
}
retry:
rc = writev(peer->sd, iov, iov_count);
if (OPAL_LIKELY(rc == remain)) {
/* we successfully sent the header and the msg data if any */
msg->hdr_sent = true;
msg->sdbytes = 0;
msg->sdptr = (char *)iov[iov_count-1].iov_base + iov[iov_count-1].iov_len;
return ORTE_SUCCESS;
} else if (rc < 0) {
if (opal_socket_errno == EINTR) { if (opal_socket_errno == EINTR) {
continue; goto retry;
} else if (opal_socket_errno == EAGAIN) { } else if (opal_socket_errno == EAGAIN) {
/* tell the caller to keep this message on active, /* tell the caller to keep this message on active,
* but let the event lib cycle so other messages * but let the event lib cycle so other messages
@ -100,22 +126,38 @@ static int send_bytes(mca_oob_tcp_peer_t* peer)
* can progress while this socket is busy * can progress while this socket is busy
*/ */
return ORTE_ERR_WOULD_BLOCK; return ORTE_ERR_WOULD_BLOCK;
} } else {
/* we hit an error and cannot progress this message */ /* we hit an error and cannot progress this message */
opal_output(0, "%s->%s mca_oob_tcp_msg_send_bytes: write failed: %s (%d) [sd = %d]", opal_output(0, "oob:tcp: send_msg: write failed: %s (%d) [sd = %d]",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&(peer->name)),
strerror(opal_socket_errno), strerror(opal_socket_errno),
opal_socket_errno, opal_socket_errno, peer->sd);
peer->sd); return ORTE_ERR_UNREACH;
return ORTE_ERR_COMM_FAILURE;
} }
/* update location */ } else {
/* short writev. This usually means the kernel buffer is full,
* so there is no point in retrying right now.
* Simply update the msg and return ORTE_ERR_RESOURCE_BUSY */
if ((size_t)rc < msg->sdbytes) {
/* partial write of the header or the msg data */
msg->sdptr = (char *)msg->sdptr + rc;
msg->sdbytes -= rc; msg->sdbytes -= rc;
msg->sdptr += rc; } else {
/* header was fully written, but only a part of the msg data was written */
msg->hdr_sent = true;
rc -= msg->sdbytes;
if (NULL != msg->data) {
/* technically, this should never happen as iov_count
* would be 1 for a zero-byte message, and so we cannot
* have a case where we write the header and part of the
* msg. However, code checkers don't know that and are
* fooled by our earlier check for NULL, and so
* we silence their warnings by using this check */
msg->sdptr = (char *)msg->data + rc;
}
msg->sdbytes = ntohl(msg->hdr.nbytes) - rc;
}
return ORTE_ERR_RESOURCE_BUSY;
} }
/* we sent the full data block */
return ORTE_SUCCESS;
} }
/* /*
@ -155,57 +197,10 @@ void mca_oob_tcp_send_handler(int sd, short flags, void *cbdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == peer->send_msg) ? "NULL" : ORTE_NAME_PRINT(&peer->name)); (NULL == peer->send_msg) ? "NULL" : ORTE_NAME_PRINT(&peer->name));
if (NULL != msg) { if (NULL != msg) {
/* if the header hasn't been completely sent, send it */ opal_output_verbose(2, orte_oob_base_framework.framework_output,
if (!msg->hdr_sent) { "oob:tcp:send_handler SENDING MSG");
if (ORTE_SUCCESS == (rc = send_bytes(peer))) { if (ORTE_SUCCESS == (rc = send_msg(peer, msg))) {
/* header is completely sent */ /* this msg is complete */
msg->hdr_sent = true;
/* setup to send the data */
if (NULL != msg->data) {
/* relay msg - send that data */
msg->sdptr = msg->data;
msg->sdbytes = (int)ntohl(msg->hdr.nbytes);
} else if (NULL == msg->msg) {
/* this was a zero-byte relay - nothing more to do */
OBJ_RELEASE(msg);
peer->send_msg = NULL;
goto next;
} else if (NULL != msg->msg->buffer) {
/* send the buffer data as a single block */
msg->sdptr = msg->msg->buffer->base_ptr;
msg->sdbytes = msg->msg->buffer->bytes_used;
} else if (NULL != msg->msg->iov) {
/* start with the first iovec */
msg->sdptr = msg->msg->iov[0].iov_base;
msg->sdbytes = msg->msg->iov[0].iov_len;
msg->iovnum = 0;
} else {
/* just send the data */
msg->sdptr = msg->msg->data;
msg->sdbytes = msg->msg->count;
}
/* fall thru and let the send progress */
} else if (ORTE_ERR_RESOURCE_BUSY == rc ||
ORTE_ERR_WOULD_BLOCK == rc) {
/* exit this event and let the event lib progress */
return;
} else {
// report the error
opal_output(0, "%s-%s mca_oob_tcp_peer_send_handler: unable to send header",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&(peer->name)));
opal_event_del(&peer->send_event);
msg->msg->status = rc;
ORTE_RML_SEND_COMPLETE(msg->msg);
OBJ_RELEASE(msg);
peer->send_msg = NULL;
goto next;
}
}
/* progress the data transmission */
if (msg->hdr_sent) {
if (ORTE_SUCCESS == (rc = send_bytes(peer))) {
/* this block is complete */
if (NULL != msg->data || NULL == msg->msg) { if (NULL != msg->data || NULL == msg->msg) {
/* the relay is complete - release the data */ /* the relay is complete - release the data */
opal_output_verbose(2, orte_oob_base_framework.framework_output, opal_output_verbose(2, orte_oob_base_framework.framework_output,
@ -281,9 +276,7 @@ void mca_oob_tcp_send_handler(int sd, short flags, void *cbdata)
ORTE_FORCED_TERMINATE(1); ORTE_FORCED_TERMINATE(1);
return; return;
} }
}
next:
/* if current message completed - progress any pending sends by /* if current message completed - progress any pending sends by
* moving the next in the queue into the "on-deck" position. Note * moving the next in the queue into the "on-deck" position. Note
* that this doesn't mean we send the message right now - we will * that this doesn't mean we send the message right now - we will
@ -657,4 +650,3 @@ static void err_cons(mca_oob_tcp_msg_error_t *ptr)
OBJ_CLASS_INSTANCE(mca_oob_tcp_msg_error_t, OBJ_CLASS_INSTANCE(mca_oob_tcp_msg_error_t,
opal_object_t, opal_object_t,
err_cons, NULL); err_cons, NULL);