1
1
openmpi/orte/mca/oob/usock/oob_usock.c
Ralph Castain d6d69e2b13 Get the direct routed component to work with both TCP and USOCK OOB components. We previously had setup the direct component so it would only support direct-launched applications. Thus, all routes went direct between processes. However, if the job had been launched by mpirun, this made no sense - what you wanted instead was to have each app proc talk directly to its daemon, but have the daemons all directly connect to each other.
So we need all the routing code for dealing with cross-job communications, lifelines, etc. The HNP will be directly connected to all daemons as they must callback at startup, and so we need to track those children correctly so we know when it is okay to terminate.

We still have to support direct launch, though, as this is the only component we can use in that scenario. So if the app doesn't have daemon URI info, then it must fall back to directly connecting to everything.
2014-12-07 09:11:48 -08:00

475 строки
16 KiB
C

/*
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "orte_config.h"
#include "orte/types.h"
#include "opal/types.h"
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#include <fcntl.h>
#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#ifdef HAVE_ARPA_INET_H
#include <arpa/inet.h>
#endif
#ifdef HAVE_NETDB_H
#include <netdb.h>
#endif
#include <ctype.h>
#include "opal/util/show_help.h"
#include "opal/util/error.h"
#include "opal/util/output.h"
#include "opal/opal_socket_errno.h"
#include "opal/util/if.h"
#include "opal/util/net.h"
#include "opal/util/argv.h"
#include "opal/class/opal_hash_table.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/util/name_fns.h"
#include "orte/util/parse_options.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/oob/usock/oob_usock.h"
#include "orte/mca/oob/usock/oob_usock_component.h"
#include "orte/mca/oob/usock/oob_usock_peer.h"
#include "orte/mca/oob/usock/oob_usock_connection.h"
#include "orte/mca/oob/usock/oob_usock_ping.h"
static void usock_init(void);
static void usock_fini(void);
static void accept_connection(const int accepted_fd,
const struct sockaddr *addr);
static void ping(const orte_process_name_t *proc);
static void send_nb(orte_rml_send_t *msg);
static void ft_event(int state);
mca_oob_usock_module_t mca_oob_usock_module = {
{
usock_init,
usock_fini,
accept_connection,
ping,
send_nb,
ft_event
}
};
/*
* Local utility functions
*/
static void recv_handler(int sd, short flags, void* user);
static void* progress_thread_engine(opal_object_t *obj)
{
opal_output_verbose(2, orte_oob_base_framework.framework_output,
"%s USOCK PROGRESS THREAD RUNNING",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
while (mca_oob_usock_module.ev_active) {
opal_event_loop(mca_oob_usock_module.ev_base, OPAL_EVLOOP_ONCE);
}
return OPAL_THREAD_CANCELLED;
}
/*
* Initialize global variables used w/in this module.
*/
static void usock_init(void)
{
/* setup the module's state variables */
OBJ_CONSTRUCT(&mca_oob_usock_module.peers, opal_hash_table_t);
opal_hash_table_init(&mca_oob_usock_module.peers, 32);
mca_oob_usock_module.ev_active = false;
if (orte_oob_base.use_module_threads) {
/* if we are to use independent progress threads at
* the module level, start it now
*/
opal_output_verbose(2, orte_oob_base_framework.framework_output,
"%s STARTING USOCK PROGRESS THREAD",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
mca_oob_usock_module.ev_base = opal_event_base_create();
/* construct the thread object */
OBJ_CONSTRUCT(&mca_oob_usock_module.progress_thread, opal_thread_t);
/* fork off a thread to progress it */
mca_oob_usock_module.progress_thread.t_run = progress_thread_engine;
mca_oob_usock_module.progress_thread.t_arg = NULL;
mca_oob_usock_module.ev_active = true;
if (OPAL_SUCCESS != opal_thread_start(&mca_oob_usock_module.progress_thread)) {
opal_output(0, "%s USOCK progress thread failed to start",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
}
} else {
mca_oob_usock_module.ev_base = orte_event_base;
}
}
/*
* Module cleanup.
*/
static void usock_fini(void)
{
/* cleanup all peers */
OBJ_DESTRUCT(&mca_oob_usock_module.peers);
if (mca_oob_usock_module.ev_active) {
/* if we used an independent progress thread at
* the module level, stop it now
*/
opal_output_verbose(2, orte_oob_base_framework.framework_output,
"%s STOPPING USOCK PROGRESS THREAD",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
/* stop the progress thread */
mca_oob_usock_module.ev_active = false;
/* break the event loop */
opal_event_base_loopexit(mca_oob_usock_module.ev_base);
/* wait for thread to exit */
opal_thread_join(&mca_oob_usock_module.progress_thread, NULL);
OBJ_DESTRUCT(&mca_oob_usock_module.progress_thread);
/* release the event base */
opal_event_base_free(mca_oob_usock_module.ev_base);
}
}
/* Called by mca_oob_usock_accept() and connection_handler() on
* a socket that has been accepted. This call finishes processing the
* socket by registering for the OOB-level connection handshake. Used
* in both the threaded and event listen modes.
*/
static void accept_connection(const int accepted_fd,
const struct sockaddr *addr)
{
opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s accept_connection",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
/* use a one-time event to wait for receipt of peer's
* process ident message to complete this connection
*/
ORTE_ACTIVATE_USOCK_ACCEPT_STATE(accepted_fd, addr, recv_handler);
}
/* API functions */
static void process_ping(int fd, short args, void *cbdata)
{
mca_oob_usock_ping_t *op = (mca_oob_usock_ping_t*)cbdata;
mca_oob_usock_peer_t *peer;
opal_output_verbose(2, orte_oob_base_framework.framework_output,
"%s:[%s:%d] processing ping to peer %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
__FILE__, __LINE__,
ORTE_NAME_PRINT(&op->peer));
/* do we know this peer? */
if (NULL == (peer = mca_oob_usock_peer_lookup(&op->peer))) {
/* push this back to the framework so another component can try */
opal_output_verbose(2, orte_oob_base_framework.framework_output,
"%s:[%s:%d] hop %s unknown",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
__FILE__, __LINE__,
ORTE_NAME_PRINT(&op->peer));
#if 0
ORTE_ACTIVATE_USOCK_MSG_ERROR(NULL, NULL, &op->peer, mca_oob_usock_component_hop_unknown);
#endif
goto cleanup;
}
/* if we are already connected, there is nothing to do */
if (MCA_OOB_USOCK_CONNECTED == peer->state) {
opal_output_verbose(2, orte_oob_base_framework.framework_output,
"%s:[%s:%d] already connected to peer %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
__FILE__, __LINE__,
ORTE_NAME_PRINT(&op->peer));
goto cleanup;
}
/* if we are already connecting, there is nothing to do */
if (MCA_OOB_USOCK_CONNECTING == peer->state &&
MCA_OOB_USOCK_CONNECT_ACK == peer->state) {
opal_output_verbose(2, orte_oob_base_framework.framework_output,
"%s:[%s:%d] already connecting to peer %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
__FILE__, __LINE__,
ORTE_NAME_PRINT(&op->peer));
goto cleanup;
}
/* attempt the connection */
peer->state = MCA_OOB_USOCK_CONNECTING;
ORTE_ACTIVATE_USOCK_CONN_STATE(peer, mca_oob_usock_peer_try_connect);
cleanup:
OBJ_RELEASE(op);
}
static void ping(const orte_process_name_t *proc)
{
opal_output_verbose(2, orte_oob_base_framework.framework_output,
"%s:[%s:%d] pinging peer %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
__FILE__, __LINE__,
ORTE_NAME_PRINT(proc));
/* push this into our event base for processing */
ORTE_ACTIVATE_USOCK_PING(proc, process_ping);
}
static void process_send(int fd, short args, void *cbdata)
{
mca_oob_usock_msg_op_t *op = (mca_oob_usock_msg_op_t*)cbdata;
mca_oob_usock_peer_t *peer;
opal_output_verbose(2, orte_oob_base_framework.framework_output,
"%s:[%s:%d] processing send to peer %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
__FILE__, __LINE__,
ORTE_NAME_PRINT(&op->msg->dst));
/* if I am an app, the only route is to my daemon, so
* send the msg there
*/
if (ORTE_PROC_IS_APP) {
if (NULL == (peer = mca_oob_usock_peer_lookup(ORTE_PROC_MY_DAEMON))) {
/* we don't know how to talk to our daemon,
* which is strange since we already got here.
* likely means we lost a race condition, so
*
*/
ORTE_ACTIVATE_USOCK_MSG_ERROR(NULL, op->msg,
ORTE_PROC_MY_DAEMON,
mca_oob_usock_component_cannot_send);
goto cleanup;
}
} else if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) {
/* if I am a daemon, the only way I should be given this
* message to send is if the proc is local to me
*/
if (NULL == (peer = mca_oob_usock_peer_lookup(&op->msg->dst))) {
/* we don't know how to talk to this proc,
* so send this back up to the OOB base so it
* can try another transport
*/
ORTE_ACTIVATE_USOCK_MSG_ERROR(NULL, op->msg,
&op->msg->dst,
mca_oob_usock_component_cannot_send);
goto cleanup;
}
} else {
/* otherwise, this message can't be handled by me, so
* notify the component of the mistake
*/
opal_output(0, "CAN'T BE HANDLED");
goto cleanup;
}
/* add the msg to the target's send queue */
if (MCA_OOB_USOCK_CONNECTED == peer->state) {
opal_output_verbose(2, orte_oob_base_framework.framework_output,
"%s usock:send_nb: already connected to %s - queueing for send",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&peer->name));
MCA_OOB_USOCK_QUEUE_SEND(op->msg, peer);
goto cleanup;
}
/* add the message to the queue for sending after the
* connection is formed
*/
MCA_OOB_USOCK_QUEUE_PENDING(op->msg, peer);
if (MCA_OOB_USOCK_CONNECTING != peer->state &&
MCA_OOB_USOCK_CONNECT_ACK != peer->state) {
/* we have to initiate the connection - again, we do not
* want to block while the connection is created.
* So throw us into an event that will create
* the connection via a mini-state-machine :-)
*/
opal_output_verbose(2, orte_oob_base_framework.framework_output,
"%s usock:send_nb: initiating connection to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&peer->name));
peer->state = MCA_OOB_USOCK_CONNECTING;
ORTE_ACTIVATE_USOCK_CONN_STATE(peer, mca_oob_usock_peer_try_connect);
}
cleanup:
OBJ_RELEASE(op);
}
static void send_nb(orte_rml_send_t *msg)
{
opal_output_verbose(2, orte_oob_base_framework.framework_output,
"%s usock:send_nb to peer %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&msg->dst));
/* push this into our event base for processing */
ORTE_ACTIVATE_USOCK_POST_SEND(msg, process_send);
}
/*
* Event callback when there is data available on the registered
* socket to recv. This is called for the listen sockets to accept an
* incoming connection, on new sockets trying to complete the software
* connection process, and for probes. Data on an established
* connection is handled elsewhere.
*/
static void recv_handler(int sd, short flags, void *cbdata)
{
mca_oob_usock_conn_op_t *op = (mca_oob_usock_conn_op_t*)cbdata;
mca_oob_usock_hdr_t hdr;
mca_oob_usock_peer_t *peer;
uint64_t *ui64;
opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s:usock:recv:handler called",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
/* get the handshake */
if (ORTE_SUCCESS != mca_oob_usock_peer_recv_connect_ack(NULL, sd, &hdr)) {
goto cleanup;
}
/* finish processing ident */
if (MCA_OOB_USOCK_IDENT == hdr.type) {
if (NULL == (peer = mca_oob_usock_peer_lookup(&hdr.origin))) {
/* should never happen */
mca_oob_usock_peer_close(peer);
goto cleanup;
}
/* set socket up to be non-blocking */
if ((flags = fcntl(sd, F_GETFL, 0)) < 0) {
opal_output(0, "%s mca_oob_usock_recv_connect: fcntl(F_GETFL) failed: %s (%d)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), strerror(opal_socket_errno), opal_socket_errno);
} else {
flags |= O_NONBLOCK;
if (fcntl(sd, F_SETFL, flags) < 0) {
opal_output(0, "%s mca_oob_usock_recv_connect: fcntl(F_SETFL) failed: %s (%d)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), strerror(opal_socket_errno), opal_socket_errno);
}
}
/* is the peer instance willing to accept this connection */
peer->sd = sd;
if (mca_oob_usock_peer_accept(peer) == false) {
if (OOB_USOCK_DEBUG_CONNECT <= opal_output_get_verbosity(orte_oob_base_framework.framework_output)) {
opal_output(0, "%s-%s mca_oob_usock_recv_connect: "
"rejected connection state %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&(peer->name)),
peer->state);
}
CLOSE_THE_SOCKET(sd);
ui64 = (uint64_t*)(&peer->name);
opal_hash_table_set_value_uint64(&mca_oob_usock_module.peers, (*ui64), NULL);
OBJ_RELEASE(peer);
}
}
cleanup:
OBJ_RELEASE(op);
}
/* Dummy function for when we are not using FT. */
#if OPAL_ENABLE_FT_CR == 0
static void ft_event(int state)
{
return;
}
#else
static void ft_event(int state) {
#if 0
opal_list_item_t *item;
#endif
if(OPAL_CRS_CHECKPOINT == state) {
#if 0
/*
* Disable event processing while we are working
*/
opal_event_disable();
#endif
}
else if(OPAL_CRS_CONTINUE == state) {
#if 0
/*
* Resume event processing
*/
opal_event_enable();
}
else if(OPAL_CRS_RESTART == state) {
/*
* Clean out cached connection information
* Select pieces of finalize/init
*/
for (item = opal_list_remove_first(&mod->peer_list);
item != NULL;
item = opal_list_remove_first(&mod->peer_list)) {
mca_oob_usock_peer_t* peer = (mca_oob_usock_peer_t*)item;
/* JJH: Use the below command for debugging restarts with invalid sockets
* mca_oob_usock_peer_dump(peer, "RESTART CLEAN")
*/
MCA_OOB_USOCK_PEER_RETURN(peer);
}
OBJ_DESTRUCT(&mod->peer_free);
OBJ_DESTRUCT(&mod->peer_names);
OBJ_DESTRUCT(&mod->peers);
OBJ_DESTRUCT(&mod->peer_list);
OBJ_CONSTRUCT(&mod->peer_list, opal_list_t);
OBJ_CONSTRUCT(&mod->peers, opal_hash_table_t);
OBJ_CONSTRUCT(&mod->peer_names, opal_hash_table_t);
OBJ_CONSTRUCT(&mod->peer_free, opal_free_list_t);
/*
* Resume event processing
*/
opal_event_enable();
#endif
}
else if(OPAL_CRS_TERM == state ) {
;
}
else {
;
}
return;
}
#endif