1
1
openmpi/opal/mca/btl/scif/btl_scif_endpoint.c
Ralph Castain 552c9ca5a0 George did the work and deserves all the credit for it. Ralph did the merge, and deserves whatever blame results from errors in it :-)
WHAT:    Open our low-level communication infrastructure by moving all necessary components (btl/rcache/allocator/mpool) down in OPAL

All the components required for inter-process communications are currently deeply integrated in the OMPI layer. Several groups/institutions have expressed interest in having a more generic communication infrastructure, without all the OMPI layer dependencies.  This communication layer should be made available at a different software level, available to all layers in the Open MPI software stack. As an example, our ORTE layer could replace the current OOB and instead use the BTL directly, gaining access to more reactive network interfaces than TCP.  Similarly, external software libraries could take advantage of our highly optimized AM (active message) communication layer for their own purpose.  UTK, with support from Sandia, developed a version of Open MPI where the entire communication infrastructure has been moved down to OPAL (btl/rcache/allocator/mpool). Most of the moved components have been updated to match the new schema, with few exceptions (mainly BTLs where I have no way of compiling/testing them). Thus, the completion of this RFC is tied to being able to complete this move for all BTLs. For this we need help from the rest of the Open MPI community, especially those supporting some of the BTLs.  A non-exhaustive list of BTLs that qualify here is: mx, portals4, scif, udapl, ugni, usnic.

This commit was SVN r32317.
2014-07-26 00:47:28 +00:00

298 строки
10 KiB
C

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "btl_scif.h"
#include "btl_scif_endpoint.h"
#include "opal/mca/memchecker/base/base.h"
/**
 * Constructor for a scif endpoint object.
 *
 * Clears every byte of the endpoint except the leading sizeof (ep->super)
 * bytes (presumably the embedded parent object at offset 0 -- confirm against
 * the struct definition), then constructs the endpoint's lock and its
 * frag_wait_list.
 *
 * @param ep  endpoint being constructed
 */
static void mca_btl_scif_ep_construct (mca_btl_base_endpoint_t *ep) {
    char *own_fields = (char *) ep + sizeof (ep->super);
    size_t own_fields_len = sizeof (*ep) - sizeof (ep->super);

    /* zero only the endpoint-specific part; the super part is left untouched */
    memset (own_fields, 0, own_fields_len);

    OBJ_CONSTRUCT(&ep->lock, opal_mutex_t);
    OBJ_CONSTRUCT(&ep->frag_wait_list, opal_list_t);
}
/**
 * Destructor for a scif endpoint object.
 *
 * Releases, in order: the mapping of the peer's segment (send buffer), the
 * registration and memory of the local receive buffer, and the scif endpoint
 * descriptor. Finally destructs the lock and the frag_wait_list.
 *
 * @param ep  endpoint being destroyed
 */
static void mca_btl_scif_ep_destruct (mca_btl_base_endpoint_t *ep) {
    if (ep->send_buffer.buffer) {
        scif_munmap (ep->send_buffer.buffer, mca_btl_scif_component.segment_size);
    }

    if (ep->recv_buffer.buffer) {
        /* the registration must be dropped before the backing memory is freed */
        scif_unregister (ep->scif_epd, ep->recv_buffer.scif_offset, mca_btl_scif_component.segment_size);
        free (ep->recv_buffer.buffer);
    }

    /* 0 is the value left by the constructor's zero-fill (never opened) and -1
     * is what the connection routines in this file store after a failed
     * attempt; neither refers to an open descriptor, so skip both. */
    if (0 != ep->scif_epd && -1 != ep->scif_epd) {
        scif_close (ep->scif_epd);
    }

    OBJ_DESTRUCT(&ep->lock);
    OBJ_DESTRUCT(&ep->frag_wait_list);
}
/* Instantiate the class for mca_btl_scif_endpoint_t: the parent class is
 * opal_list_item_t, and the constructor/destructor are the two static
 * functions defined above. */
OBJ_CLASS_INSTANCE(mca_btl_scif_endpoint_t, opal_list_item_t,
mca_btl_scif_ep_construct, mca_btl_scif_ep_destruct);
/**
 * Release the endpoint's local receive buffer, if it has one.
 *
 * Unregisters the buffer from the endpoint's scif descriptor, frees the
 * memory, and resets the bookkeeping (buffer = NULL, scif_offset = -1) so a
 * later call is a no-op.
 *
 * @param ep  endpoint whose receive buffer is released
 */
static void mca_btl_scif_ep_free_buffer (mca_btl_base_endpoint_t *ep) {
    if (NULL == ep->recv_buffer.buffer) {
        /* nothing was allocated (or it has already been released) */
        return;
    }

    scif_unregister (ep->scif_epd, ep->recv_buffer.scif_offset,
                     mca_btl_scif_component.segment_size);
    free (ep->recv_buffer.buffer);

    ep->recv_buffer.buffer = NULL;
    ep->recv_buffer.scif_offset = (off_t) -1;
}
/**
 * Allocate, zero, and scif-register the endpoint's local receive buffer.
 *
 * The buffer is page aligned and mca_btl_scif_component.segment_size bytes
 * long. On success recv_buffer.startp and recv_buffer.endp point at the first
 * two 32-bit words of the buffer and both words are initialised to 64.
 *
 * @param ep  endpoint to set up; ep->scif_epd must already be usable for
 *            scif_register
 *
 * @retval OPAL_SUCCESS              buffer allocated and registered
 * @retval OPAL_ERR_OUT_OF_RESOURCE  the aligned allocation failed
 * @retval OPAL_ERROR                scif_register failed (buffer is freed and
 *                                   recv_buffer.buffer reset to NULL)
 */
static inline int mca_btl_scif_ep_get_buffer (mca_btl_base_endpoint_t *ep) {
    int rc;

    /* posix_memalign returns 0 on success and a POSITIVE error number on
     * failure (it does not use -1/errno), so any non-zero value is an error. */
    rc = posix_memalign ((void **) &ep->recv_buffer.buffer, getpagesize(), mca_btl_scif_component.segment_size);
    if (0 != rc) {
        /* do not rely on what the failed call left in the pointer */
        ep->recv_buffer.buffer = NULL;
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    memset (ep->recv_buffer.buffer, 0, mca_btl_scif_component.segment_size);

    ep->recv_buffer.scif_offset = scif_register (ep->scif_epd, ep->recv_buffer.buffer,
                                                 mca_btl_scif_component.segment_size, 0,
                                                 SCIF_PROT_READ | SCIF_PROT_WRITE, 0);
    if (SCIF_REGISTER_FAILED == ep->recv_buffer.scif_offset) {
        BTL_VERBOSE(("failed to register a scif buffer of size %d. errno = %d",
                     mca_btl_scif_component.segment_size, errno));
        free (ep->recv_buffer.buffer);
        ep->recv_buffer.buffer = NULL;
        return OPAL_ERROR;
    }

    /* the first two 32-bit words of the segment hold the start and end
     * indices; both begin at 64 (presumably the first byte past a reserved
     * header area -- confirm against the send/receive code) */
    ep->recv_buffer.startp = (uint32_t *) ep->recv_buffer.buffer;
    ep->recv_buffer.endp = ep->recv_buffer.startp + 1;
    ep->recv_buffer.startp[0] = ep->recv_buffer.endp[0] = 64;

    BTL_VERBOSE(("allocated buffer of size %d bytes. with scif registration %lu",
                 mca_btl_scif_component.segment_size, (unsigned long) ep->recv_buffer.scif_offset));

    return OPAL_SUCCESS;
}
/**
 * Complete a connection once ep->scif_epd is connected to the peer.
 *
 * Must be called with the endpoint lock held.
 *
 * Steps: allocate and register the local receive buffer, exchange the
 * registered offsets with the peer, map the peer's segment as the local send
 * buffer, initialise the circular-buffer indices, and mark the endpoint
 * connected. On any failure after the receive buffer was obtained it is
 * released again; closing ep->scif_epd is left to the caller.
 *
 * @param ep       endpoint being connected
 * @param passive  true on the accepting side (receive the peer's offset
 *                 first, then send ours); false on the initiating side (send
 *                 first, then receive). The opposite ordering on the two
 *                 sides keeps the blocking send/recv calls paired up.
 *
 * @retval OPAL_SUCCESS  connection established
 * @retval OPAL_ERROR    offset exchange or scif_mmap failed
 * @return any error returned by mca_btl_scif_ep_get_buffer
 */
static int mca_btl_scif_ep_connect_finish (mca_btl_base_endpoint_t *ep, bool passive) {
    int rc;

    rc = mca_btl_scif_ep_get_buffer (ep);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        BTL_VERBOSE(("error allocating buffer for scif peer"));
        return rc;
    }

    if (passive) {
        rc = scif_recv (ep->scif_epd, &ep->send_buffer.scif_offset,
                        sizeof (ep->send_buffer.scif_offset), SCIF_RECV_BLOCK);
        if (OPAL_LIKELY(-1 != rc)) {
            rc = scif_send (ep->scif_epd, &ep->recv_buffer.scif_offset,
                            sizeof (ep->recv_buffer.scif_offset), SCIF_SEND_BLOCK);
        }
    } else {
        rc = scif_send (ep->scif_epd, &ep->recv_buffer.scif_offset,
                        sizeof (ep->recv_buffer.scif_offset), SCIF_SEND_BLOCK);
        if (OPAL_LIKELY(-1 != rc)) {
            rc = scif_recv (ep->scif_epd, &ep->send_buffer.scif_offset,
                            sizeof (ep->send_buffer.scif_offset), SCIF_RECV_BLOCK);
        }
    }

    if (OPAL_UNLIKELY(-1 == rc)) {
        BTL_VERBOSE(("error exchanging connection data with peer %d", ep->peer_proc->proc_name.vpid));
        mca_btl_scif_ep_free_buffer (ep);
        return OPAL_ERROR;
    }

    BTL_VERBOSE(("remote peer %d has scif offset %lu", ep->peer_proc->proc_name.vpid,
                 (unsigned long) ep->send_buffer.scif_offset));

    ep->send_buffer.buffer = scif_mmap (0, mca_btl_scif_component.segment_size,
                                        SCIF_PROT_READ | SCIF_PROT_WRITE,
                                        0, ep->scif_epd, ep->send_buffer.scif_offset);
    /* scif_mmap signals failure by returning SCIF_MMAP_FAILED, which scif.h
     * defines as ((void *) -1) -- not NULL. The NULL test is kept only as a
     * defensive extra. */
    if (OPAL_UNLIKELY((void *) -1 == (void *) ep->send_buffer.buffer ||
                      NULL == ep->send_buffer.buffer)) {
        BTL_VERBOSE(("error in scif_mmap"));
        /* leave no bogus pointer behind for the destructor to scif_munmap */
        ep->send_buffer.buffer = NULL;
        mca_btl_scif_ep_free_buffer (ep);
        return OPAL_ERROR;
    }

    opal_memchecker_base_mem_defined (ep->send_buffer.buffer, mca_btl_scif_component.segment_size);

    BTL_VERBOSE(("remote peer %d buffer mapped to local pointer %p", ep->peer_proc->proc_name.vpid,
                 ep->send_buffer.buffer));

    /* setup the circular send buffers: the local start/end indices begin at
     * 64 and startp/endp point at the first two 32-bit words of the peer's
     * mapped segment */
    ep->send_buffer.start = ep->send_buffer.end = 64;

    ep->send_buffer.startp = (uint32_t *) ep->send_buffer.buffer;
    ep->send_buffer.endp = ep->send_buffer.startp + 1;

    ep->recv_buffer.start = 64;

    /* connection complete */
    ep->state = MCA_BTL_SCIF_EP_STATE_CONNECTED;

    BTL_VERBOSE(("btl/scif connection to remote peer %d established", ep->peer_proc->proc_name.vpid));

    return OPAL_SUCCESS;
}
/**
 * Accept one incoming scif connection request and, unless it is rejected,
 * finish the connection on the passive side.
 *
 * The peer identifies itself by sending its process name as the first message
 * on the new descriptor; the matching endpoint is looked up by vpid in
 * mca_btl_scif_module.endpoints.
 *
 * @retval OPAL_SUCCESS  the connection was established, or the request was
 *                       deliberately dropped because the endpoint is already
 *                       connected or our own active attempt takes precedence
 * @retval OPAL_ERROR    accept/recv failed or the peer is unknown
 * @return any error returned by mca_btl_scif_ep_connect_finish
 */
int mca_btl_scif_ep_connect_start_passive (void) {
    mca_btl_base_endpoint_t *ep = NULL;
    orte_process_name_t remote_name;
    struct scif_portID port_id;
    scif_epd_t new_epd;
    unsigned int idx;
    int rc;

    /* accept the connection request. if the endpoint is already connecting we
     * may close this descriptor and allow mca_btl_scif_ep_connect_start_active
     * to finish the connection. */
    rc = scif_accept (mca_btl_scif_module.scif_fd, &port_id, &new_epd, SCIF_ACCEPT_SYNC);
    if (OPAL_UNLIKELY(0 > rc)) {
        BTL_VERBOSE(("error accepting connecton from scif peer. %d", errno));
        return OPAL_ERROR;
    }

    /* the first message on the new descriptor names the requesting peer */
    rc = scif_recv (new_epd, &remote_name, sizeof (remote_name), SCIF_RECV_BLOCK);
    if (OPAL_UNLIKELY(-1 == rc)) {
        BTL_VERBOSE(("error in scif_recv"));
        scif_close (new_epd);
        return OPAL_ERROR;
    }

    BTL_VERBOSE(("got connection request from vpid %d on port %u on node %u",
                 remote_name.vpid, port_id.port, port_id.node));

    /* linear search for the endpoint whose peer has the announced vpid */
    for (idx = 0 ; idx < mca_btl_scif_module.endpoint_count ; ++idx) {
        if (mca_btl_scif_module.endpoints[idx].peer_proc->proc_name.vpid == remote_name.vpid) {
            ep = mca_btl_scif_module.endpoints + idx;
            break;
        }
    }

    /* ep is still NULL exactly when the loop ran to completion: peer not found */
    if (NULL == ep) {
        BTL_VERBOSE(("remote peer %d unknown", remote_name.vpid));
        scif_close (new_epd);
        return OPAL_ERROR;
    }

    /* simultaneous connections (active side): drop the request when our own
     * active attempt is in flight and the peer's port is lower than ours, or
     * when the endpoint is already connected */
    if ((MCA_BTL_SCIF_EP_STATE_CONNECTING == ep->state &&
         ep->port_id.port < mca_btl_scif_module.port_id.port) ||
        MCA_BTL_SCIF_EP_STATE_CONNECTED == ep->state) {
        BTL_VERBOSE(("active connection in progress. connection request from peer %d rejected", remote_name.vpid));
        scif_close (new_epd);
        return OPAL_SUCCESS;
    }

    opal_mutex_lock (&ep->lock);

    /* re-check under the lock: the connection may have completed while we
     * were waiting for it */
    if (MCA_BTL_SCIF_EP_STATE_CONNECTED == ep->state) {
        opal_mutex_unlock (&ep->lock);
        scif_close (new_epd);
        return OPAL_SUCCESS;
    }

    BTL_VERBOSE(("accepted connection from port %d", ep->port_id.port));

    ep->state = MCA_BTL_SCIF_EP_STATE_CONNECTING;
    ep->scif_epd = new_epd;

    rc = mca_btl_scif_ep_connect_finish (ep, true);
    if (OPAL_SUCCESS != rc) {
        /* roll the endpoint back so that a later attempt starts from scratch */
        scif_close (ep->scif_epd);
        ep->scif_epd = -1;
        ep->state = MCA_BTL_SCIF_EP_STATE_INIT;
    }

    opal_mutex_unlock (&ep->lock);

    return rc;
}
/**
 * Initiate a connection to the endpoint's peer (active side).
 *
 * Takes the endpoint lock for the whole attempt. If the endpoint is no longer
 * in the INIT state once the lock is held, nothing is done and OPAL_SUCCESS
 * is returned. Otherwise a new scif descriptor is opened and connected to
 * ep->port_id, the local process name is sent so the peer can identify us,
 * and mca_btl_scif_ep_connect_finish completes the setup.
 *
 * On any failure the descriptor (if one was opened) is closed and the
 * endpoint is reset to the INIT state with scif_epd = -1.
 *
 * @param ep  endpoint to connect
 *
 * @retval OPAL_SUCCESS            connected, or the state was not INIT
 * @retval OPAL_ERR_RESOURCE_BUSY  scif_connect failed; the caller may retry
 * @retval OPAL_ERROR              scif_open or scif_send failed
 * @return any error returned by mca_btl_scif_ep_connect_finish
 */
static inline int mca_btl_scif_ep_connect_start_active (mca_btl_base_endpoint_t *ep) {
    int rc = OPAL_SUCCESS;

    BTL_VERBOSE(("initiaiting connection to remote peer %d with port: %u on local scif node: %u",
                 ep->peer_proc->proc_name.vpid, ep->port_id.port, ep->port_id.node));

    opal_mutex_lock (&ep->lock);
    do {
        if (MCA_BTL_SCIF_EP_STATE_INIT != ep->state) {
            /* the accept thread has already finished this connection */
            rc = OPAL_SUCCESS;
            break;
        }

        ep->state = MCA_BTL_SCIF_EP_STATE_CONNECTING;

        ep->scif_epd = scif_open ();
        if (OPAL_UNLIKELY(SCIF_OPEN_FAILED == ep->scif_epd)) {
            BTL_VERBOSE(("error creating new scif endpoint"));
            rc = OPAL_ERROR;
            break;
        }

        rc = scif_connect (ep->scif_epd, &ep->port_id);
        if (OPAL_UNLIKELY(-1 == rc)) {
            /* the connection attempt failed. this could mean the peer is
             * currently processing connections. we will try again later. */
            BTL_VERBOSE(("error connecting to scif peer. %d", errno));
            rc = OPAL_ERR_RESOURCE_BUSY;
            break;
        }

        /* identify ourselves to the peer's accept path */
        rc = scif_send (ep->scif_epd, &OPAL_PROC_MY_NAME, sizeof (OPAL_PROC_MY_NAME), SCIF_SEND_BLOCK);
        if (OPAL_UNLIKELY(-1 == rc)) {
            BTL_VERBOSE(("error in scif_send"));
            rc = OPAL_ERROR;
            break;
        }

        /* build connection data */
        rc = mca_btl_scif_ep_connect_finish (ep, false);
    } while (0);

    if (OPAL_SUCCESS != rc) {
        /* every failure path runs after the scif_open call above, so scif_epd
         * is either a descriptor we own or SCIF_OPEN_FAILED; only the former
         * may be closed */
        if (SCIF_OPEN_FAILED != ep->scif_epd) {
            scif_close (ep->scif_epd);
        }
        ep->scif_epd = -1;
        ep->state = MCA_BTL_SCIF_EP_STATE_INIT;
    }

    opal_mutex_unlock (&ep->lock);

    return rc;
}
/**
 * Make sure a connection to the endpoint's peer exists or is under way.
 *
 * @param ep  endpoint to connect
 *
 * @retval OPAL_SUCCESS            the endpoint is connected (already, or as a
 *                                 result of this call), or its state is none
 *                                 of CONNECTED / CONNECTING / INIT
 * @retval OPAL_ERR_RESOURCE_BUSY  a connection attempt is currently in
 *                                 progress
 * @return any error returned by mca_btl_scif_ep_connect_start_active
 */
int mca_btl_scif_ep_connect (mca_btl_base_endpoint_t *ep) {
    /* common case: nothing to do */
    if (OPAL_LIKELY(MCA_BTL_SCIF_EP_STATE_CONNECTED == ep->state)) {
        return OPAL_SUCCESS;
    }

    if (MCA_BTL_SCIF_EP_STATE_CONNECTING == ep->state) {
        return OPAL_ERR_RESOURCE_BUSY;
    }

    if (MCA_BTL_SCIF_EP_STATE_INIT != ep->state) {
        return OPAL_SUCCESS;
    }

    /* still in the initial state: start the active side of the connection
     * and hand its result straight back to the caller */
    return mca_btl_scif_ep_connect_start_active (ep);
}