2004-08-25 21:39:08 +04:00
|
|
|
/*
|
2005-11-05 22:57:48 +03:00
|
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
2004-11-28 23:09:25 +03:00
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
|
|
* University of Stuttgart. All rights reserved.
|
2005-03-24 15:43:37 +03:00
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
2004-11-22 04:38:40 +03:00
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
2004-08-25 21:39:08 +04:00
|
|
|
* $HEADER$
|
2006-03-11 06:09:24 +03:00
|
|
|
*
|
|
|
|
* In windows, many of the socket functions return an EWOULDBLOCK
|
|
|
|
 * instead of things like EAGAIN, EINPROGRESS, etc. It has been
|
|
|
|
 * verified that this will not conflict with other error codes that
|
|
|
|
 * are returned by these functions under UNIX/Linux environments
|
2004-08-25 21:39:08 +04:00
|
|
|
*/
|
2006-03-11 06:09:24 +03:00
|
|
|
|
2006-02-12 04:33:29 +03:00
|
|
|
#include "orte_config.h"
|
2004-10-22 20:06:05 +04:00
|
|
|
#ifdef HAVE_UNISTD_H
|
2004-07-13 02:46:57 +04:00
|
|
|
#include <unistd.h>
|
2004-10-22 20:06:05 +04:00
|
|
|
#endif
|
2004-07-13 02:46:57 +04:00
|
|
|
#include <fcntl.h>
|
2004-10-22 20:06:05 +04:00
|
|
|
#ifdef HAVE_SYS_UIO_H
|
2004-07-15 17:51:40 +04:00
|
|
|
#include <sys/uio.h>
|
2004-10-22 20:06:05 +04:00
|
|
|
#endif
|
|
|
|
#ifdef HAVE_SYS_TYPES_H
|
2004-07-13 02:46:57 +04:00
|
|
|
#include <sys/types.h>
|
2004-10-22 20:06:05 +04:00
|
|
|
#endif
|
2006-02-12 04:33:29 +03:00
|
|
|
#include "orte/orte_socket_errno.h"
|
2004-10-22 20:06:05 +04:00
|
|
|
#ifdef HAVE_NETINET_IN_H
|
2004-07-13 02:46:57 +04:00
|
|
|
#include <netinet/in.h>
|
2004-10-22 20:06:05 +04:00
|
|
|
#endif
|
|
|
|
#ifdef HAVE_ARPA_INET_H
|
2004-07-13 02:46:57 +04:00
|
|
|
#include <arpa/inet.h>
|
2004-10-22 20:06:05 +04:00
|
|
|
#endif
|
2005-04-12 02:48:50 +04:00
|
|
|
#ifdef HAVE_NETINET_TCP_H
|
|
|
|
#include <netinet/tcp.h>
|
|
|
|
#endif
|
2005-03-14 23:57:21 +03:00
|
|
|
|
2006-02-12 04:33:29 +03:00
|
|
|
#include "orte/class/orte_proc_table.h"
|
2005-07-04 03:31:27 +04:00
|
|
|
#include "opal/util/output.h"
|
2006-02-12 04:33:29 +03:00
|
|
|
#include "orte/util/univ_info.h"
|
2005-03-14 23:57:21 +03:00
|
|
|
|
2006-02-12 04:33:29 +03:00
|
|
|
#include "orte/mca/gpr/gpr.h"
|
|
|
|
#include "orte/mca/ns/ns.h"
|
|
|
|
#include "orte/mca/errmgr/errmgr.h"
|
2005-03-14 23:57:21 +03:00
|
|
|
|
2004-09-02 03:07:40 +04:00
|
|
|
#include "oob_tcp.h"
|
|
|
|
#include "oob_tcp_peer.h"
|
2004-07-01 18:49:54 +04:00
|
|
|
|
2004-07-13 02:46:57 +04:00
|
|
|
static int mca_oob_tcp_peer_start_connect(mca_oob_tcp_peer_t* peer);
|
2004-07-15 17:51:40 +04:00
|
|
|
static int mca_oob_tcp_peer_event_init(mca_oob_tcp_peer_t* peer);
|
2004-07-13 02:46:57 +04:00
|
|
|
static void mca_oob_tcp_peer_connected(mca_oob_tcp_peer_t* peer);
|
|
|
|
static void mca_oob_tcp_peer_construct(mca_oob_tcp_peer_t* peer);
|
|
|
|
static void mca_oob_tcp_peer_destruct(mca_oob_tcp_peer_t* peer);
|
|
|
|
static int mca_oob_tcp_peer_send_connect_ack(mca_oob_tcp_peer_t* peer);
|
|
|
|
static int mca_oob_tcp_peer_recv_connect_ack(mca_oob_tcp_peer_t* peer);
|
|
|
|
static int mca_oob_tcp_peer_recv_blocking(mca_oob_tcp_peer_t* peer, void* data, size_t size);
|
|
|
|
static int mca_oob_tcp_peer_send_blocking(mca_oob_tcp_peer_t* peer, void* data, size_t size);
|
|
|
|
static void mca_oob_tcp_peer_recv_handler(int sd, short flags, void* user);
|
|
|
|
static void mca_oob_tcp_peer_send_handler(int sd, short flags, void* user);
|
2004-08-28 05:15:19 +04:00
|
|
|
static void mca_oob_tcp_peer_timer_handler(int sd, short flags, void* user);
|
2004-07-13 02:46:57 +04:00
|
|
|
static void mca_oob_tcp_peer_dump(mca_oob_tcp_peer_t* peer, const char* msg);
|
|
|
|
|
|
|
|
|
2004-07-01 21:45:34 +04:00
|
|
|
OBJ_CLASS_INSTANCE(
|
|
|
|
mca_oob_tcp_peer_t,
|
2005-07-03 20:22:16 +04:00
|
|
|
opal_list_item_t,
|
2004-07-13 02:46:57 +04:00
|
|
|
mca_oob_tcp_peer_construct,
|
|
|
|
mca_oob_tcp_peer_destruct);
|
2004-07-01 21:45:34 +04:00
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is the constructor function for the mca_oob_tcp_peer
|
|
|
|
* struct. Note that this function and OBJ_NEW should NEVER
|
|
|
|
* be called directly. Instead, use mca_oob_tcp_add_peer
|
|
|
|
*
|
|
|
|
* @param peer a pointer to the mca_oob_tcp_peer_t struct to be initialized
|
|
|
|
* @retval none
|
|
|
|
*/
|
2004-07-13 02:46:57 +04:00
|
|
|
static void mca_oob_tcp_peer_construct(mca_oob_tcp_peer_t* peer)
|
2004-07-01 21:45:34 +04:00
|
|
|
{
|
2005-07-03 20:22:16 +04:00
|
|
|
OBJ_CONSTRUCT(&(peer->peer_send_queue), opal_list_t);
|
2005-07-04 02:45:48 +04:00
|
|
|
OBJ_CONSTRUCT(&(peer->peer_lock), opal_mutex_t);
|
2004-09-30 19:09:29 +04:00
|
|
|
memset(&peer->peer_send_event, 0, sizeof(peer->peer_send_event));
|
|
|
|
memset(&peer->peer_recv_event, 0, sizeof(peer->peer_recv_event));
|
2004-08-28 05:15:19 +04:00
|
|
|
memset(&peer->peer_timer_event, 0, sizeof(peer->peer_timer_event));
|
2005-07-04 03:09:55 +04:00
|
|
|
opal_evtimer_set(&peer->peer_timer_event, mca_oob_tcp_peer_timer_handler, peer);
|
2004-07-01 21:45:34 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is the destructor function for the mca_oob_tcp_peer
|
|
|
|
* struct. Note that this function and OBJ_RELEASE should NEVER
|
|
|
|
* be called directly. Instead, use mca_oob_tcp_del_peer
|
|
|
|
*
|
|
|
|
* @param peer a pointer to the mca_oob_tcp_peer_t struct to be destroyed
|
|
|
|
* @retval none
|
|
|
|
*/
|
2004-07-13 02:46:57 +04:00
|
|
|
static void mca_oob_tcp_peer_destruct(mca_oob_tcp_peer_t * peer)
|
2004-07-01 21:45:34 +04:00
|
|
|
{
|
2005-03-14 23:57:21 +03:00
|
|
|
mca_oob_tcp_peer_shutdown(peer);
|
2004-07-13 02:46:57 +04:00
|
|
|
OBJ_DESTRUCT(&(peer->peer_send_queue));
|
2004-07-01 21:45:34 +04:00
|
|
|
OBJ_DESTRUCT(&(peer->peer_lock));
|
2004-07-13 02:46:57 +04:00
|
|
|
}
|
2004-07-01 21:45:34 +04:00
|
|
|
|
2004-07-13 02:46:57 +04:00
|
|
|
/*
|
|
|
|
* Initialize events to be used by the peer instance for TCP select/poll callbacks.
|
|
|
|
*/
|
2004-07-15 17:51:40 +04:00
|
|
|
static int mca_oob_tcp_peer_event_init(mca_oob_tcp_peer_t* peer)
|
2004-07-13 02:46:57 +04:00
|
|
|
{
|
2004-09-02 03:07:40 +04:00
|
|
|
memset(&peer->peer_recv_event, 0, sizeof(peer->peer_recv_event));
|
|
|
|
memset(&peer->peer_send_event, 0, sizeof(peer->peer_send_event));
|
2005-07-04 03:09:55 +04:00
|
|
|
opal_event_set(
|
2004-07-13 02:46:57 +04:00
|
|
|
&peer->peer_recv_event,
|
|
|
|
peer->peer_sd,
|
2005-07-04 03:09:55 +04:00
|
|
|
OPAL_EV_READ|OPAL_EV_PERSIST,
|
2004-07-13 02:46:57 +04:00
|
|
|
mca_oob_tcp_peer_recv_handler,
|
|
|
|
peer);
|
2005-07-04 03:09:55 +04:00
|
|
|
opal_event_set(
|
2004-07-13 02:46:57 +04:00
|
|
|
&peer->peer_send_event,
|
|
|
|
peer->peer_sd,
|
2005-07-04 03:09:55 +04:00
|
|
|
OPAL_EV_WRITE|OPAL_EV_PERSIST,
|
2004-07-13 02:46:57 +04:00
|
|
|
mca_oob_tcp_peer_send_handler,
|
|
|
|
peer);
|
2006-02-12 04:33:29 +03:00
|
|
|
return ORTE_SUCCESS;
|
2004-07-01 21:45:34 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2004-08-03 01:24:00 +04:00
|
|
|
* Initiate the appropriate action based on the state of the connection
|
|
|
|
* to the peer.
|
2004-07-01 21:45:34 +04:00
|
|
|
*
|
|
|
|
*/
|
2004-07-13 02:46:57 +04:00
|
|
|
int mca_oob_tcp_peer_send(mca_oob_tcp_peer_t* peer, mca_oob_tcp_msg_t* msg)
|
|
|
|
{
|
2006-02-12 04:33:29 +03:00
|
|
|
int rc = ORTE_SUCCESS;
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_LOCK(&peer->peer_lock);
|
2004-07-13 02:46:57 +04:00
|
|
|
switch(peer->peer_state) {
|
|
|
|
case MCA_OOB_TCP_CONNECTING:
|
|
|
|
case MCA_OOB_TCP_CONNECT_ACK:
|
|
|
|
case MCA_OOB_TCP_CLOSED:
|
2004-09-02 03:07:40 +04:00
|
|
|
case MCA_OOB_TCP_RESOLVE:
|
2004-07-13 02:46:57 +04:00
|
|
|
/*
|
2004-09-02 03:07:40 +04:00
|
|
|
* queue the message and attempt to resolve the peer address
|
2004-07-13 02:46:57 +04:00
|
|
|
*/
|
2005-07-03 20:22:16 +04:00
|
|
|
opal_list_append(&peer->peer_send_queue, (opal_list_item_t*)msg);
|
2004-08-28 05:15:19 +04:00
|
|
|
if(peer->peer_state == MCA_OOB_TCP_CLOSED) {
|
2004-09-02 03:07:40 +04:00
|
|
|
peer->peer_state = MCA_OOB_TCP_RESOLVE;
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&peer->peer_lock);
|
2004-09-02 03:07:40 +04:00
|
|
|
return mca_oob_tcp_resolve(peer);
|
2004-08-28 05:15:19 +04:00
|
|
|
}
|
2004-07-13 02:46:57 +04:00
|
|
|
break;
|
|
|
|
case MCA_OOB_TCP_FAILED:
|
2006-02-12 04:33:29 +03:00
|
|
|
rc = ORTE_ERR_UNREACH;
|
2004-07-13 02:46:57 +04:00
|
|
|
break;
|
|
|
|
case MCA_OOB_TCP_CONNECTED:
|
|
|
|
/*
|
|
|
|
* start the message and queue if not completed
|
|
|
|
*/
|
|
|
|
if (NULL != peer->peer_send_msg) {
|
2005-07-03 20:22:16 +04:00
|
|
|
opal_list_append(&peer->peer_send_queue, (opal_list_item_t*)msg);
|
2004-07-13 02:46:57 +04:00
|
|
|
} else {
|
2004-07-15 17:51:40 +04:00
|
|
|
/*if the send does not complete */
|
|
|
|
if(!mca_oob_tcp_msg_send_handler(msg, peer)) {
|
2004-07-13 02:46:57 +04:00
|
|
|
peer->peer_send_msg = msg;
|
2005-07-04 03:09:55 +04:00
|
|
|
opal_event_add(&peer->peer_send_event, 0);
|
2004-08-03 20:34:59 +04:00
|
|
|
} else {
|
|
|
|
mca_oob_tcp_msg_complete(msg, &peer->peer_name);
|
2004-07-13 02:46:57 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&peer->peer_lock);
|
2004-07-13 02:46:57 +04:00
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Lookup a peer by name, create one if it doesn't exist.
|
|
|
|
* @param name Peers globally unique identifier.
|
|
|
|
* @retval Pointer to the newly created struture or NULL on error.
|
|
|
|
*/
|
2005-03-14 23:57:21 +03:00
|
|
|
mca_oob_tcp_peer_t * mca_oob_tcp_peer_lookup(const orte_process_name_t* name)
|
2004-07-01 21:45:34 +04:00
|
|
|
{
|
2004-07-13 02:46:57 +04:00
|
|
|
int rc;
|
Not as bad as this all may look. Tim and I made a significant change to the way we handle the startup of the oob, the seed, etc. We have made it backwards-compatible so that mpirun2 and singleton operations remain working. We had to adjust the name server and gpr as well, plus the process_info structure.
This also includes a checkpoint update to openmpi.c and ompid.c. I have re-enabled the ompid compile.
This latter raises an important point. The trunk compiles the programs like ompid just fine under Linux. It also does just fine for OSX under the dynamic libraries. However, we are seeing errors when compiling under OSX for the static case - the linker seems to have trouble resolving some variable names, even though linker diagnostics show the variables as being defined. Thus, a warning to Mac users that you may have to locally turn things off if you are trying to do static compiles. We ask, however, that you don't commit those changes that turn things off for everyone else - instead, let's try to figure out why the static compile is having a problem, and let everyone else continue to work.
Thanks
Ralph
This commit was SVN r2534.
2004-09-08 07:59:06 +04:00
|
|
|
mca_oob_tcp_peer_t * peer, *old;
|
2005-07-03 20:22:16 +04:00
|
|
|
opal_list_item_t* item;
|
2004-07-13 02:46:57 +04:00
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
if (NULL == name) { /* can't look this one up */
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock);
|
2005-07-19 16:25:19 +04:00
|
|
|
peer = (mca_oob_tcp_peer_t*)orte_hash_table_get_proc(
|
2005-03-14 23:57:21 +03:00
|
|
|
&mca_oob_tcp_component.tcp_peers, name);
|
2004-09-29 21:18:14 +04:00
|
|
|
if(NULL != peer && memcmp(&peer->peer_name,name,sizeof(peer->peer_name)) == 0) {
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
|
2004-07-13 02:46:57 +04:00
|
|
|
return peer;
|
|
|
|
}
|
|
|
|
|
2004-09-29 21:18:14 +04:00
|
|
|
/* search the peer list - if we find it here this is a bug in the tree */
|
2005-07-03 20:22:16 +04:00
|
|
|
for(item = opal_list_get_first(&mca_oob_tcp_component.tcp_peer_list);
|
|
|
|
item != opal_list_get_end(&mca_oob_tcp_component.tcp_peer_list);
|
|
|
|
item = opal_list_get_next(item)) {
|
2004-09-29 21:18:14 +04:00
|
|
|
peer = (mca_oob_tcp_peer_t*)item;
|
|
|
|
if (memcmp(&peer->peer_name, name, sizeof(peer->peer_name)) == 0) {
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
|
2004-09-29 21:18:14 +04:00
|
|
|
return peer;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2004-08-28 05:15:19 +04:00
|
|
|
/* allocate from free list */
|
2004-07-13 02:46:57 +04:00
|
|
|
MCA_OOB_TCP_PEER_ALLOC(peer, rc);
|
|
|
|
if(NULL == peer) {
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
|
2004-07-01 21:45:34 +04:00
|
|
|
return NULL;
|
|
|
|
}
|
2004-07-13 02:46:57 +04:00
|
|
|
|
2004-08-03 01:24:00 +04:00
|
|
|
/* initialize peer state */
|
2004-07-13 02:46:57 +04:00
|
|
|
peer->peer_name = *name;
|
2004-09-02 03:07:40 +04:00
|
|
|
peer->peer_addr = NULL;
|
2004-08-03 01:24:00 +04:00
|
|
|
peer->peer_sd = -1;
|
|
|
|
peer->peer_state = MCA_OOB_TCP_CLOSED;
|
2004-08-04 18:33:02 +04:00
|
|
|
peer->peer_recv_msg = NULL;
|
|
|
|
peer->peer_send_msg = NULL;
|
|
|
|
peer->peer_retries = 0;
|
2004-07-15 17:51:40 +04:00
|
|
|
|
2004-08-28 05:15:19 +04:00
|
|
|
/* add to lookup table */
|
2006-02-12 04:33:29 +03:00
|
|
|
if(ORTE_SUCCESS != orte_hash_table_set_proc(&mca_oob_tcp_component.tcp_peers,
|
2005-03-14 23:57:21 +03:00
|
|
|
&peer->peer_name, peer)) {
|
2004-07-13 02:46:57 +04:00
|
|
|
MCA_OOB_TCP_PEER_RETURN(peer);
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
|
2004-07-13 02:46:57 +04:00
|
|
|
return NULL;
|
|
|
|
}
|
2004-08-03 01:24:00 +04:00
|
|
|
|
2004-07-14 01:03:03 +04:00
|
|
|
/* if the peer list is over the maximum size, remove one unsed peer */
|
2005-07-03 20:22:16 +04:00
|
|
|
opal_list_prepend(&mca_oob_tcp_component.tcp_peer_list, (opal_list_item_t *) peer);
|
Not as bad as this all may look. Tim and I made a significant change to the way we handle the startup of the oob, the seed, etc. We have made it backwards-compatible so that mpirun2 and singleton operations remain working. We had to adjust the name server and gpr as well, plus the process_info structure.
This also includes a checkpoint update to openmpi.c and ompid.c. I have re-enabled the ompid compile.
This latter raises an important point. The trunk compiles the programs like ompid just fine under Linux. It also does just fine for OSX under the dynamic libraries. However, we are seeing errors when compiling under OSX for the static case - the linker seems to have trouble resolving some variable names, even though linker diagnostics show the variables as being defined. Thus, a warning to Mac users that you may have to locally turn things off if you are trying to do static compiles. We ask, however, that you don't commit those changes that turn things off for everyone else - instead, let's try to figure out why the static compile is having a problem, and let everyone else continue to work.
Thanks
Ralph
This commit was SVN r2534.
2004-09-08 07:59:06 +04:00
|
|
|
if(mca_oob_tcp_component.tcp_peer_limit > 0 &&
|
2005-07-03 20:22:16 +04:00
|
|
|
(int)opal_list_get_size(&mca_oob_tcp_component.tcp_peer_list) >
|
2005-03-14 23:57:21 +03:00
|
|
|
mca_oob_tcp_component.tcp_peer_limit) {
|
2004-07-15 17:51:40 +04:00
|
|
|
old = (mca_oob_tcp_peer_t *)
|
2005-07-03 20:22:16 +04:00
|
|
|
opal_list_get_last(&mca_oob_tcp_component.tcp_peer_list);
|
2004-07-15 17:51:40 +04:00
|
|
|
while(1) {
|
2005-07-03 20:22:16 +04:00
|
|
|
if(0 == opal_list_get_size(&(old->peer_send_queue)) &&
|
2004-07-15 17:51:40 +04:00
|
|
|
NULL == peer->peer_recv_msg) {
|
2005-07-03 20:22:16 +04:00
|
|
|
opal_list_remove_item(&mca_oob_tcp_component.tcp_peer_list,
|
|
|
|
(opal_list_item_t *) old);
|
2004-07-15 17:51:40 +04:00
|
|
|
MCA_OOB_TCP_PEER_RETURN(old);
|
|
|
|
break;
|
|
|
|
} else {
|
2005-07-03 20:22:16 +04:00
|
|
|
old = (mca_oob_tcp_peer_t *) opal_list_get_prev(old);
|
|
|
|
if(opal_list_get_begin(&mca_oob_tcp_component.tcp_peer_list) == (opal_list_item_t*)old) {
|
2004-07-15 17:51:40 +04:00
|
|
|
/* we tried, but we couldn't find one that was valid to get rid
|
|
|
|
* of. Oh well. */
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2004-07-13 02:46:57 +04:00
|
|
|
}
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
|
2004-07-13 02:46:57 +04:00
|
|
|
return peer;
|
2004-07-01 21:45:34 +04:00
|
|
|
}
|
|
|
|
|
2004-09-02 03:07:40 +04:00
|
|
|
|
2004-07-01 21:45:34 +04:00
|
|
|
/*
|
2004-07-13 02:46:57 +04:00
|
|
|
* Start a connection to the peer. This will likely not complete,
|
|
|
|
* as the socket is set to non-blocking, so register for event
|
|
|
|
* notification of connect completion. On connection we send
|
|
|
|
* our globally unique process identifier to the peer and wait for
|
|
|
|
* the peers response.
|
|
|
|
*/
|
|
|
|
|
|
|
|
static int mca_oob_tcp_peer_start_connect(mca_oob_tcp_peer_t* peer)
|
|
|
|
{
|
2005-08-27 21:03:19 +04:00
|
|
|
int rc, flags;
|
2004-09-02 03:07:40 +04:00
|
|
|
struct sockaddr_in inaddr;
|
|
|
|
|
|
|
|
/* create socket */
|
|
|
|
peer->peer_state = MCA_OOB_TCP_CONNECTING;
|
2004-07-13 02:46:57 +04:00
|
|
|
peer->peer_sd = socket(AF_INET, SOCK_STREAM, 0);
|
|
|
|
if (peer->peer_sd < 0) {
|
Not as bad as this all may look. Tim and I made a significant change to the way we handle the startup of the oob, the seed, etc. We have made it backwards-compatible so that mpirun2 and singleton operations remain working. We had to adjust the name server and gpr as well, plus the process_info structure.
This also includes a checkpoint update to openmpi.c and ompid.c. I have re-enabled the ompid compile.
This latter raises an important point. The trunk compiles the programs like ompid just fine under Linux. It also does just fine for OSX under the dynamic libraries. However, we are seeing errors when compiling under OSX for the static case - the linker seems to have trouble resolving some variable names, even though linker diagnostics show the variables as being defined. Thus, a warning to Mac users that you may have to locally turn things off if you are trying to do static compiles. We ask, however, that you don't commit those changes that turn things off for everyone else - instead, let's try to figure out why the static compile is having a problem, and let everyone else continue to work.
Thanks
Ralph
This commit was SVN r2534.
2004-09-08 07:59:06 +04:00
|
|
|
struct timeval tv = { 1,0 };
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0,
|
2005-05-08 17:22:55 +04:00
|
|
|
"[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_start_connect: socket() failed with errno=%d\n",
|
2005-03-14 23:57:21 +03:00
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name),
|
|
|
|
ORTE_NAME_ARGS(&(peer->peer_name)),
|
2005-01-20 03:03:23 +03:00
|
|
|
ompi_socket_errno);
|
2005-03-19 02:40:08 +03:00
|
|
|
mca_oob_tcp_peer_shutdown(peer);
|
2005-07-04 03:09:55 +04:00
|
|
|
opal_evtimer_add(&peer->peer_timer_event, &tv);
|
2006-02-12 04:33:29 +03:00
|
|
|
return ORTE_ERR_UNREACH;
|
2004-07-13 02:46:57 +04:00
|
|
|
}
|
|
|
|
|
2005-10-31 19:21:11 +03:00
|
|
|
/* setup socket options */
|
|
|
|
mca_oob_tcp_set_socket_options(peer->peer_sd);
|
|
|
|
|
2004-07-13 02:46:57 +04:00
|
|
|
/* setup event callbacks */
|
2004-07-15 17:51:40 +04:00
|
|
|
mca_oob_tcp_peer_event_init(peer);
|
2004-07-13 02:46:57 +04:00
|
|
|
|
|
|
|
/* setup the socket as non-blocking */
|
|
|
|
if((flags = fcntl(peer->peer_sd, F_GETFL, 0)) < 0) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_connect: fcntl(F_GETFL) failed with errno=%d\n",
|
2005-03-14 23:57:21 +03:00
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name),
|
|
|
|
ORTE_NAME_ARGS(&(peer->peer_name)),
|
2005-01-20 03:03:23 +03:00
|
|
|
ompi_socket_errno);
|
2004-07-13 02:46:57 +04:00
|
|
|
} else {
|
2004-09-02 03:07:40 +04:00
|
|
|
flags |= O_NONBLOCK;
|
2004-07-13 02:46:57 +04:00
|
|
|
if(fcntl(peer->peer_sd, F_SETFL, flags) < 0)
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_connect: fcntl(F_SETFL) failed with errno=%d\n",
|
2005-03-14 23:57:21 +03:00
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name),
|
|
|
|
ORTE_NAME_ARGS(&(peer->peer_name)),
|
2005-01-20 03:03:23 +03:00
|
|
|
ompi_socket_errno);
|
2004-09-02 03:07:40 +04:00
|
|
|
}
|
|
|
|
|
2005-03-19 02:40:08 +03:00
|
|
|
if(mca_oob_tcp_component.tcp_debug > 0) {
|
2005-08-27 21:03:19 +04:00
|
|
|
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_start_connect: trying all %d addresses\n",
|
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name),
|
|
|
|
ORTE_NAME_ARGS(&(peer->peer_name)), peer->peer_addr->addr_count );
|
2004-07-13 02:46:57 +04:00
|
|
|
}
|
|
|
|
|
2005-08-27 21:03:19 +04:00
|
|
|
/*
|
|
|
|
* We should parse all the IP addresses exported by the peer and try to connect to each of them.
|
|
|
|
*/
|
|
|
|
do {
|
|
|
|
/* pick an address in round-robin fashion from the list exported by the peer */
|
2006-02-12 04:33:29 +03:00
|
|
|
if((rc = mca_oob_tcp_addr_get_next(peer->peer_addr, &inaddr)) != ORTE_SUCCESS) {
|
2005-08-27 21:03:19 +04:00
|
|
|
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_start_connect: mca_oob_tcp_addr_get_next failed with error=%d",
|
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name),
|
|
|
|
ORTE_NAME_ARGS(&(peer->peer_name)),
|
|
|
|
rc);
|
|
|
|
break;
|
2004-07-13 02:46:57 +04:00
|
|
|
}
|
2005-08-27 21:03:19 +04:00
|
|
|
if(mca_oob_tcp_component.tcp_debug > 0) {
|
|
|
|
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_start_connect: connecting port %d to: %s:%d\n",
|
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name),
|
|
|
|
ORTE_NAME_ARGS(&(peer->peer_name)),
|
|
|
|
ntohs(mca_oob_tcp_component.tcp_listen_port),
|
|
|
|
inet_ntoa(inaddr.sin_addr),
|
|
|
|
ntohs(inaddr.sin_port));
|
|
|
|
}
|
|
|
|
|
|
|
|
/* start the connect - will likely fail with EINPROGRESS */
|
|
|
|
if(connect(peer->peer_sd, (struct sockaddr*)&inaddr, sizeof(inaddr)) < 0) {
|
|
|
|
/* non-blocking so wait for completion */
|
|
|
|
if(ompi_socket_errno == EINPROGRESS || ompi_socket_errno == EWOULDBLOCK) {
|
|
|
|
opal_event_add(&peer->peer_send_event, 0);
|
|
|
|
/* Waiting for completion in the middle of the list ?! Let's just hope we try with the
|
|
|
|
* correct IP address...
|
|
|
|
*/
|
2006-02-12 04:33:29 +03:00
|
|
|
return ORTE_SUCCESS;
|
2005-08-27 21:03:19 +04:00
|
|
|
}
|
|
|
|
if(mca_oob_tcp_component.tcp_debug > 0) {
|
|
|
|
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_start_connect: connect to %s:%d failed with errno=%d",
|
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name),
|
|
|
|
ORTE_NAME_ARGS(&(peer->peer_name)),
|
|
|
|
inet_ntoa(inaddr.sin_addr),
|
|
|
|
ntohs(inaddr.sin_port),
|
|
|
|
ompi_socket_errno);
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* send our globally unique process identifier to the peer */
|
2006-02-12 04:33:29 +03:00
|
|
|
if((rc = mca_oob_tcp_peer_send_connect_ack(peer)) == ORTE_SUCCESS) {
|
2005-08-27 21:03:19 +04:00
|
|
|
peer->peer_state = MCA_OOB_TCP_CONNECT_ACK;
|
|
|
|
opal_event_add(&peer->peer_recv_event, 0);
|
2006-02-12 04:33:29 +03:00
|
|
|
return ORTE_SUCCESS; /* successfully connect to the peer */
|
2005-08-27 21:03:19 +04:00
|
|
|
} else {
|
|
|
|
opal_output(0,
|
|
|
|
"[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_start_connect: "
|
|
|
|
"mca_oob_tcp_peer_send_connect_ack to %s:%d failed with errno=%d",
|
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name),
|
|
|
|
ORTE_NAME_ARGS(&(peer->peer_name)),
|
|
|
|
inet_ntoa(inaddr.sin_addr),
|
|
|
|
ntohs(inaddr.sin_port),
|
|
|
|
rc);
|
|
|
|
}
|
|
|
|
} while( peer->peer_addr->addr_next != 0 );
|
|
|
|
mca_oob_tcp_peer_close(peer);
|
2006-02-12 04:33:29 +03:00
|
|
|
return ORTE_ERR_UNREACH;
|
2004-07-13 02:46:57 +04:00
|
|
|
}
|
|
|
|
|
2004-08-03 01:24:00 +04:00
|
|
|
|
2004-07-13 02:46:57 +04:00
|
|
|
/*
|
|
|
|
* Check the status of the connection. If the connection failed, will retry
|
|
|
|
* later. Otherwise, send this processes identifier to the peer on the
|
|
|
|
* newly connected socket.
|
|
|
|
*/
|
|
|
|
static void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t* peer)
|
|
|
|
{
|
|
|
|
int so_error = 0;
|
|
|
|
ompi_socklen_t so_length = sizeof(so_error);
|
2004-08-03 01:24:00 +04:00
|
|
|
|
2004-07-13 02:46:57 +04:00
|
|
|
/* unregister from receiving event notifications */
|
2005-07-04 03:09:55 +04:00
|
|
|
opal_event_del(&peer->peer_send_event);
|
2004-08-03 01:24:00 +04:00
|
|
|
|
2004-07-13 02:46:57 +04:00
|
|
|
/* check connect completion status */
|
2004-10-28 22:13:43 +04:00
|
|
|
if(getsockopt(peer->peer_sd, SOL_SOCKET, SO_ERROR, (char *)&so_error, &so_length) < 0) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_complete_connect: getsockopt() failed with errno=%d\n",
|
2005-03-14 23:57:21 +03:00
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name),
|
|
|
|
ORTE_NAME_ARGS(&(peer->peer_name)),
|
2005-01-20 03:03:23 +03:00
|
|
|
ompi_socket_errno);
|
2004-07-13 02:46:57 +04:00
|
|
|
mca_oob_tcp_peer_close(peer);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if(so_error == EINPROGRESS) {
|
2005-07-04 03:09:55 +04:00
|
|
|
opal_event_add(&peer->peer_send_event, 0);
|
2004-07-13 02:46:57 +04:00
|
|
|
return;
|
2005-03-19 02:40:08 +03:00
|
|
|
} else if (so_error == ECONNREFUSED || so_error == ETIMEDOUT) {
|
2004-08-28 05:15:19 +04:00
|
|
|
struct timeval tv = { 1,0 };
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_complete_connect: "
|
2005-03-19 02:40:08 +03:00
|
|
|
"connection failed (errno=%d) - retrying (pid=%d)\n",
|
2005-03-14 23:57:21 +03:00
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name),
|
2005-03-19 02:40:08 +03:00
|
|
|
ORTE_NAME_ARGS(&(peer->peer_name)),
|
|
|
|
so_error, getpid());
|
|
|
|
mca_oob_tcp_peer_shutdown(peer);
|
2005-07-04 03:09:55 +04:00
|
|
|
opal_evtimer_add(&peer->peer_timer_event, &tv);
|
2004-08-03 01:24:00 +04:00
|
|
|
return;
|
|
|
|
} else if(so_error != 0) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_complete_connect: connect() failed with errno=%d\n",
|
2005-03-14 23:57:21 +03:00
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name),
|
|
|
|
ORTE_NAME_ARGS(&(peer->peer_name)),
|
2004-09-02 03:07:40 +04:00
|
|
|
so_error);
|
2004-07-13 02:46:57 +04:00
|
|
|
mca_oob_tcp_peer_close(peer);
|
|
|
|
return;
|
|
|
|
}
|
2004-07-14 01:03:03 +04:00
|
|
|
|
2006-02-12 04:33:29 +03:00
|
|
|
if(mca_oob_tcp_peer_send_connect_ack(peer) == ORTE_SUCCESS) {
|
2004-07-13 02:46:57 +04:00
|
|
|
peer->peer_state = MCA_OOB_TCP_CONNECT_ACK;
|
2005-07-04 03:09:55 +04:00
|
|
|
opal_event_add(&peer->peer_recv_event, 0);
|
2004-07-13 02:46:57 +04:00
|
|
|
} else {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_complete_connect: unable to send connect ack.",
|
2005-03-14 23:57:21 +03:00
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name),
|
|
|
|
ORTE_NAME_ARGS(&(peer->peer_name)));
|
2004-07-13 02:46:57 +04:00
|
|
|
mca_oob_tcp_peer_close(peer);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Setup peer state to reflect that connection has been established,
|
|
|
|
* and start any pending sends.
|
|
|
|
*/
|
|
|
|
static void mca_oob_tcp_peer_connected(mca_oob_tcp_peer_t* peer)
|
2004-07-01 21:45:34 +04:00
|
|
|
{
|
2005-07-04 03:09:55 +04:00
|
|
|
opal_event_del(&peer->peer_timer_event);
|
2004-07-13 02:46:57 +04:00
|
|
|
peer->peer_state = MCA_OOB_TCP_CONNECTED;
|
|
|
|
peer->peer_retries = 0;
|
2005-07-03 20:22:16 +04:00
|
|
|
if(opal_list_get_size(&peer->peer_send_queue) > 0) {
|
2004-07-13 02:46:57 +04:00
|
|
|
if(NULL == peer->peer_send_msg)
|
|
|
|
peer->peer_send_msg = (mca_oob_tcp_msg_t*)
|
2005-07-03 20:22:16 +04:00
|
|
|
opal_list_remove_first(&peer->peer_send_queue);
|
2005-07-04 03:09:55 +04:00
|
|
|
opal_event_add(&peer->peer_send_event, 0);
|
2004-07-13 02:46:57 +04:00
|
|
|
}
|
|
|
|
}
|
2004-07-15 17:51:40 +04:00
|
|
|
|
2004-07-13 02:46:57 +04:00
|
|
|
/*
|
|
|
|
* Remove any event registrations associated with the socket
|
|
|
|
* and update the peer state to reflect the connection has
|
|
|
|
* been closed.
|
|
|
|
*/
|
2004-08-03 01:24:00 +04:00
|
|
|
void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t* peer)
|
2004-07-13 02:46:57 +04:00
|
|
|
{
|
2005-03-19 02:40:08 +03:00
|
|
|
if(mca_oob_tcp_component.tcp_debug > 0) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_close(%p) sd %d state %d\n",
|
2005-03-14 23:57:21 +03:00
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name),
|
|
|
|
ORTE_NAME_ARGS(&(peer->peer_name)),
|
2004-09-29 21:18:14 +04:00
|
|
|
peer,
|
2004-09-02 03:07:40 +04:00
|
|
|
peer->peer_sd,
|
|
|
|
peer->peer_state);
|
|
|
|
}
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
/* if we lose the connection to the seed - abort */
|
|
|
|
if(memcmp(&peer->peer_name,&mca_oob_name_seed,sizeof(mca_oob_name_seed)) == 0) {
|
2005-09-03 01:07:21 +04:00
|
|
|
/* If we are not already inside orte_finalize, then call abort */
|
|
|
|
if (ORTE_UNIVERSE_STATE_FINALIZE > orte_universe_info.state) {
|
2005-09-29 09:04:43 +04:00
|
|
|
/* Should free the peer lock before we abort so we don't
|
|
|
|
* get stuck in the orte_wait_kill when receiving messages in the
|
|
|
|
* tcp OOB. */
|
|
|
|
OPAL_THREAD_UNLOCK(&peer->peer_lock);
|
2005-09-03 01:07:21 +04:00
|
|
|
orte_errmgr.abort();
|
|
|
|
}
|
2005-03-14 23:57:21 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
mca_oob_tcp_peer_shutdown(peer);
|
|
|
|
}
|
|
|
|
|
|
|
|
void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer)
|
|
|
|
{
|
2004-08-28 05:15:19 +04:00
|
|
|
/* giving up and cleanup any pending messages */
|
|
|
|
if(peer->peer_retries++ > mca_oob_tcp_component.tcp_peer_retries) {
|
|
|
|
mca_oob_tcp_msg_t *msg = peer->peer_send_msg;
|
|
|
|
while(msg != NULL) {
|
2006-02-12 04:33:29 +03:00
|
|
|
msg->msg_rc = ORTE_ERR_UNREACH;
|
2004-08-28 05:15:19 +04:00
|
|
|
mca_oob_tcp_msg_complete(msg, &peer->peer_name);
|
2005-07-03 20:22:16 +04:00
|
|
|
msg = (mca_oob_tcp_msg_t*)opal_list_remove_first(&peer->peer_send_queue);
|
2004-08-28 05:15:19 +04:00
|
|
|
}
|
|
|
|
peer->peer_send_msg = NULL;
|
|
|
|
}
|
|
|
|
|
2004-09-30 19:09:29 +04:00
|
|
|
if (peer->peer_sd >= 0) {
|
2005-07-04 03:09:55 +04:00
|
|
|
opal_event_del(&peer->peer_recv_event);
|
|
|
|
opal_event_del(&peer->peer_send_event);
|
2004-07-13 02:46:57 +04:00
|
|
|
close(peer->peer_sd);
|
|
|
|
peer->peer_sd = -1;
|
2004-08-31 06:57:39 +04:00
|
|
|
}
|
2004-09-02 03:07:40 +04:00
|
|
|
|
2005-07-04 03:09:55 +04:00
|
|
|
opal_event_del(&peer->peer_timer_event);
|
2004-07-13 02:46:57 +04:00
|
|
|
peer->peer_state = MCA_OOB_TCP_CLOSED;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Send the globally unique identifier for this process to a peer on
|
|
|
|
* a newly connected socket.
|
|
|
|
*/
|
|
|
|
static int mca_oob_tcp_peer_send_connect_ack(mca_oob_tcp_peer_t* peer)
|
|
|
|
{
|
2004-08-25 21:39:08 +04:00
|
|
|
/* send process identifier of self and peer - note that we may
|
|
|
|
* have assigned the peer a unique process name - if it came up
|
|
|
|
* without one.
|
|
|
|
*/
|
2005-05-05 20:31:40 +04:00
|
|
|
mca_oob_tcp_hdr_t hdr;
|
|
|
|
memset(&hdr,0,sizeof(hdr));
|
2005-03-14 23:57:21 +03:00
|
|
|
if (NULL == orte_process_info.my_name) { /* my name isn't defined yet */
|
2005-05-05 20:31:40 +04:00
|
|
|
hdr.msg_src = *MCA_OOB_NAME_ANY;
|
2005-03-14 23:57:21 +03:00
|
|
|
} else {
|
2005-05-05 20:31:40 +04:00
|
|
|
hdr.msg_src = *(orte_process_info.my_name);
|
2005-03-14 23:57:21 +03:00
|
|
|
}
|
2005-05-05 20:31:40 +04:00
|
|
|
hdr.msg_dst = peer->peer_name;
|
|
|
|
hdr.msg_type = MCA_OOB_TCP_CONNECT;
|
|
|
|
MCA_OOB_TCP_HDR_HTON(&hdr);
|
|
|
|
if(mca_oob_tcp_peer_send_blocking(peer, &hdr, sizeof(hdr)) != sizeof(hdr)) {
|
2006-02-12 04:33:29 +03:00
|
|
|
return ORTE_ERR_UNREACH;
|
2004-07-13 02:46:57 +04:00
|
|
|
}
|
2006-02-12 04:33:29 +03:00
|
|
|
return ORTE_SUCCESS;
|
2004-07-01 21:45:34 +04:00
|
|
|
}
|
|
|
|
|
2004-07-13 02:46:57 +04:00
|
|
|
/*
|
|
|
|
* Receive the peers globally unique process identification from a newly
|
|
|
|
* connected socket and verify the expected response. If so, move the
|
|
|
|
* socket to a connected state.
|
|
|
|
*/
|
|
|
|
static int mca_oob_tcp_peer_recv_connect_ack(mca_oob_tcp_peer_t* peer)
|
|
|
|
{
|
2005-05-05 20:31:40 +04:00
|
|
|
mca_oob_tcp_hdr_t hdr;
|
|
|
|
if((mca_oob_tcp_peer_recv_blocking(peer, &hdr, sizeof(hdr))) != sizeof(hdr)) {
|
|
|
|
mca_oob_tcp_peer_close(peer);
|
2006-02-12 04:33:29 +03:00
|
|
|
return ORTE_ERR_UNREACH;
|
2005-05-05 20:31:40 +04:00
|
|
|
}
|
|
|
|
MCA_OOB_TCP_HDR_NTOH(&hdr);
|
|
|
|
if(hdr.msg_type != MCA_OOB_TCP_CONNECT) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "mca_oob_tcp_peer_recv_connect_ack: invalid header type: %d\n", hdr.msg_type);
|
2004-09-29 21:18:14 +04:00
|
|
|
mca_oob_tcp_peer_close(peer);
|
2006-02-12 04:33:29 +03:00
|
|
|
return ORTE_ERR_UNREACH;
|
2004-07-13 02:46:57 +04:00
|
|
|
}
|
|
|
|
|
2004-08-25 21:39:08 +04:00
|
|
|
/* compare the peers name to the expected value */
|
2005-05-05 20:31:40 +04:00
|
|
|
if(memcmp(&peer->peer_name, &hdr.msg_src, sizeof(orte_process_name_t)) != 0) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_recv_connect_ack: "
|
Not as bad as this all may look. Tim and I made a significant change to the way we handle the startup of the oob, the seed, etc. We have made it backwards-compatible so that mpirun2 and singleton operations remain working. We had to adjust the name server and gpr as well, plus the process_info structure.
This also includes a checkpoint update to openmpi.c and ompid.c. I have re-enabled the ompid compile.
This latter raises an important point. The trunk compiles the programs like ompid just fine under Linux. It also does just fine for OSX under the dynamic libraries. However, we are seeing errors when compiling under OSX for the static case - the linker seems to have trouble resolving some variable names, even though linker diagnostics show the variables as being defined. Thus, a warning to Mac users that you may have to locally turn things off if you are trying to do static compiles. We ask, however, that you don't commit those changes that turn things off for everyone else - instead, let's try to figure out why the static compile is having a problem, and let everyone else continue to work.
Thanks
Ralph
This commit was SVN r2534.
2004-09-08 07:59:06 +04:00
|
|
|
"received unexpected process identifier [%d,%d,%d]\n",
|
2005-03-14 23:57:21 +03:00
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name),
|
|
|
|
ORTE_NAME_ARGS(&(peer->peer_name)),
|
2005-05-05 20:31:40 +04:00
|
|
|
ORTE_NAME_ARGS(&(hdr.msg_src)));
|
2004-07-13 02:46:57 +04:00
|
|
|
mca_oob_tcp_peer_close(peer);
|
2006-02-12 04:33:29 +03:00
|
|
|
return ORTE_ERR_UNREACH;
|
2004-07-13 02:46:57 +04:00
|
|
|
}
|
|
|
|
|
2004-08-25 21:39:08 +04:00
|
|
|
/* if we have a wildcard name - use the name returned by the peer */
|
2005-05-23 20:23:11 +04:00
|
|
|
if(orte_process_info.my_name == NULL) {
|
|
|
|
orte_ns.create_process_name(&orte_process_info.my_name,
|
|
|
|
hdr.msg_dst.cellid, hdr.msg_dst.jobid, hdr.msg_dst.vpid);
|
|
|
|
} else if(orte_ns.compare(ORTE_NS_CMP_ALL, orte_process_info.my_name, &mca_oob_name_any) == 0) {
|
2005-05-05 20:31:40 +04:00
|
|
|
*orte_process_info.my_name = hdr.msg_dst;
|
2004-08-25 21:39:08 +04:00
|
|
|
}
|
|
|
|
|
2004-07-13 02:46:57 +04:00
|
|
|
/* connected */
|
|
|
|
mca_oob_tcp_peer_connected(peer);
|
2005-03-19 02:40:08 +03:00
|
|
|
if(mca_oob_tcp_component.tcp_debug > 0) {
|
2004-09-02 03:07:40 +04:00
|
|
|
mca_oob_tcp_peer_dump(peer, "connected");
|
|
|
|
}
|
2006-02-12 04:33:29 +03:00
|
|
|
return ORTE_SUCCESS;
|
2004-07-13 02:46:57 +04:00
|
|
|
}
|
2004-07-15 17:51:40 +04:00
|
|
|
|
2004-09-29 21:18:14 +04:00
|
|
|
|
2004-07-13 02:46:57 +04:00
|
|
|
/*
|
|
|
|
* A blocking recv on a non-blocking socket. Used to receive the small amount of connection
|
|
|
|
* information that identifies the peers endpoint.
|
|
|
|
*/
|
|
|
|
static int mca_oob_tcp_peer_recv_blocking(mca_oob_tcp_peer_t* peer, void* data, size_t size)
|
|
|
|
{
|
|
|
|
unsigned char* ptr = (unsigned char*)data;
|
|
|
|
size_t cnt = 0;
|
|
|
|
while(cnt < size) {
|
2004-10-28 22:13:43 +04:00
|
|
|
int retval = recv(peer->peer_sd,(char *)ptr+cnt, size-cnt, 0);
|
2004-07-13 02:46:57 +04:00
|
|
|
|
|
|
|
/* remote closed connection */
|
|
|
|
if(retval == 0) {
|
2005-03-19 02:40:08 +03:00
|
|
|
if(mca_oob_tcp_component.tcp_debug > 0) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_recv_blocking: "
|
2004-09-08 21:02:24 +04:00
|
|
|
"peer closed connection: peer state %d",
|
2005-03-14 23:57:21 +03:00
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name),
|
|
|
|
ORTE_NAME_ARGS(&(peer->peer_name)),
|
2004-09-08 21:02:24 +04:00
|
|
|
peer->peer_state);
|
|
|
|
}
|
2004-07-13 02:46:57 +04:00
|
|
|
mca_oob_tcp_peer_close(peer);
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* socket is non-blocking so handle errors */
|
|
|
|
if(retval < 0) {
|
2005-01-20 03:03:23 +03:00
|
|
|
if(ompi_socket_errno != EINTR && ompi_socket_errno != EAGAIN && ompi_socket_errno != EWOULDBLOCK) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_recv_blocking: recv() failed with errno=%d\n",
|
2005-03-14 23:57:21 +03:00
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name),
|
|
|
|
ORTE_NAME_ARGS(&(peer->peer_name)),
|
2005-01-20 03:03:23 +03:00
|
|
|
errno);
|
2004-07-13 02:46:57 +04:00
|
|
|
mca_oob_tcp_peer_close(peer);
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
cnt += retval;
|
|
|
|
}
|
|
|
|
return cnt;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A blocking send on a non-blocking socket. Used to send the small amount of connection
|
|
|
|
* information that identifies the peers endpoint.
|
|
|
|
*/
|
|
|
|
static int mca_oob_tcp_peer_send_blocking(mca_oob_tcp_peer_t* peer, void* data, size_t size)
|
|
|
|
{
|
|
|
|
unsigned char* ptr = (unsigned char*)data;
|
|
|
|
size_t cnt = 0;
|
|
|
|
while(cnt < size) {
|
2004-10-28 22:13:43 +04:00
|
|
|
int retval = send(peer->peer_sd, (char *)ptr+cnt, size-cnt, 0);
|
2004-07-13 02:46:57 +04:00
|
|
|
if(retval < 0) {
|
2005-01-20 03:03:23 +03:00
|
|
|
if(ompi_socket_errno != EINTR && ompi_socket_errno != EAGAIN && ompi_socket_errno != EWOULDBLOCK) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_send_blocking: send() failed with errno=%d\n",
|
2005-03-14 23:57:21 +03:00
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name),
|
|
|
|
ORTE_NAME_ARGS(&(peer->peer_name)),
|
2005-01-20 03:03:23 +03:00
|
|
|
ompi_socket_errno);
|
2004-07-13 02:46:57 +04:00
|
|
|
mca_oob_tcp_peer_close(peer);
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
cnt += retval;
|
|
|
|
}
|
|
|
|
return cnt;
|
|
|
|
}
|
|
|
|
|
2004-08-03 01:24:00 +04:00
|
|
|
|
2004-09-02 03:07:40 +04:00
|
|
|
/*
 * Send an IDENT header (our process name and the peer's name, no payload)
 * to a peer that is already connected.
 *
 * Does nothing and returns ORTE_SUCCESS if the peer is not in the
 * connected state. Returns ORTE_ERR_UNREACH if the header could not be
 * sent in full, ORTE_SUCCESS otherwise.
 */
int mca_oob_tcp_peer_send_ident(mca_oob_tcp_peer_t* peer)
{
    mca_oob_tcp_hdr_t hdr;

    if(peer->peer_state != MCA_OOB_TCP_CONNECTED) {
        return ORTE_SUCCESS;
    }

    /* Zero the whole header first, as mca_oob_tcp_peer_send_connect_ack()
     * does, so that padding and any field not assigned below are not sent
     * with indeterminate stack contents.
     */
    memset(&hdr, 0, sizeof(hdr));

    /* Same fallback as mca_oob_tcp_peer_send_connect_ack(): if our own
     * name has not been defined yet, identify ourselves with the wildcard
     * name rather than dereferencing a NULL pointer.
     */
    if(NULL == orte_process_info.my_name) {
        hdr.msg_src = *MCA_OOB_NAME_ANY;
    } else {
        hdr.msg_src = *orte_process_info.my_name;
    }
    hdr.msg_dst = peer->peer_name;
    hdr.msg_type = MCA_OOB_TCP_IDENT;
    hdr.msg_size = 0;
    hdr.msg_tag = 0;
    MCA_OOB_TCP_HDR_HTON(&hdr);
    if(mca_oob_tcp_peer_send_blocking(peer, &hdr, sizeof(hdr)) != sizeof(hdr)) {
        return ORTE_ERR_UNREACH;
    }
    return ORTE_SUCCESS;
}
|
|
|
|
|
|
|
|
|
2004-11-21 20:20:42 +03:00
|
|
|
/* static void mca_oob_tcp_peer_recv_ident(mca_oob_tcp_peer_t* peer, mca_oob_tcp_hdr_t* hdr) */
|
|
|
|
/* { */
|
2005-07-04 02:45:48 +04:00
|
|
|
/* OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock); */
|
2004-11-21 20:20:42 +03:00
|
|
|
/* ompi_rb_tree_delete(&mca_oob_tcp_component.tcp_peer_tree, &peer->peer_name); */
|
|
|
|
/* peer->peer_name = hdr->msg_src; */
|
|
|
|
/* ompi_rb_tree_insert(&mca_oob_tcp_component.tcp_peer_tree, &peer->peer_name, peer); */
|
2005-07-04 02:45:48 +04:00
|
|
|
/* OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock); */
|
2004-11-21 20:20:42 +03:00
|
|
|
/* } */
|
2004-09-02 03:07:40 +04:00
|
|
|
|
|
|
|
|
2004-08-03 01:24:00 +04:00
|
|
|
/*
|
|
|
|
* Dispatch to the appropriate action routine based on the state
|
|
|
|
* of the connection with the peer.
|
|
|
|
*/
|
2004-07-13 02:46:57 +04:00
|
|
|
|
|
|
|
static void mca_oob_tcp_peer_recv_handler(int sd, short flags, void* user)
|
|
|
|
{
|
2004-10-28 22:13:43 +04:00
|
|
|
mca_oob_tcp_peer_t* peer = (mca_oob_tcp_peer_t *)user;
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_LOCK(&peer->peer_lock);
|
2004-07-13 02:46:57 +04:00
|
|
|
switch(peer->peer_state) {
|
2004-08-03 01:24:00 +04:00
|
|
|
case MCA_OOB_TCP_CONNECT_ACK:
|
2004-07-13 02:46:57 +04:00
|
|
|
{
|
2004-08-03 01:24:00 +04:00
|
|
|
mca_oob_tcp_peer_recv_connect_ack(peer);
|
|
|
|
break;
|
2004-07-13 02:46:57 +04:00
|
|
|
}
|
2004-08-03 01:24:00 +04:00
|
|
|
case MCA_OOB_TCP_CONNECTED:
|
2004-07-13 02:46:57 +04:00
|
|
|
{
|
Not as bad as this all may look. Tim and I made a significant change to the way we handle the startup of the oob, the seed, etc. We have made it backwards-compatible so that mpirun2 and singleton operations remain working. We had to adjust the name server and gpr as well, plus the process_info structure.
This also includes a checkpoint update to openmpi.c and ompid.c. I have re-enabled the ompid compile.
This latter raises an important point. The trunk compiles the programs like ompid just fine under Linux. It also does just fine for OSX under the dynamic libraries. However, we are seeing errors when compiling under OSX for the static case - the linker seems to have trouble resolving some variable names, even though linker diagnostics show the variables as being defined. Thus, a warning to Mac users that you may have to locally turn things off if you are trying to do static compiles. We ask, however, that you don't commit those changes that turn things off for everyone else - instead, let's try to figure out why the static compile is having a problem, and let everyone else continue to work.
Thanks
Ralph
This commit was SVN r2534.
2004-09-08 07:59:06 +04:00
|
|
|
/* allocate a new message and setup for recv */
|
2004-08-03 01:24:00 +04:00
|
|
|
if(NULL == peer->peer_recv_msg) {
|
Not as bad as this all may look. Tim and I made a significant change to the way we handle the startup of the oob, the seed, etc. We have made it backwards-compatible so that mpirun2 and singleton operations remain working. We had to adjust the name server and gpr as well, plus the process_info structure.
This also includes a checkpoint update to openmpi.c and ompid.c. I have re-enabled the ompid compile.
This latter raises an important point. The trunk compiles the programs like ompid just fine under Linux. It also does just fine for OSX under the dynamic libraries. However, we are seeing errors when compiling under OSX for the static case - the linker seems to have trouble resolving some variable names, even though linker diagnostics show the variables as being defined. Thus, a warning to Mac users that you may have to locally turn things off if you are trying to do static compiles. We ask, however, that you don't commit those changes that turn things off for everyone else - instead, let's try to figure out why the static compile is having a problem, and let everyone else continue to work.
Thanks
Ralph
This commit was SVN r2534.
2004-09-08 07:59:06 +04:00
|
|
|
int rc;
|
|
|
|
mca_oob_tcp_msg_t* msg;
|
|
|
|
MCA_OOB_TCP_MSG_ALLOC(msg, rc);
|
|
|
|
if(NULL == msg) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_recv_handler: unable to allocate recv message\n",
|
2005-03-14 23:57:21 +03:00
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name),
|
|
|
|
ORTE_NAME_ARGS(&(peer->peer_name)));
|
Not as bad as this all may look. Tim and I made a significant change to the way we handle the startup of the oob, the seed, etc. We have made it backwards-compatible so that mpirun2 and singleton operations remain working. We had to adjust the name server and gpr as well, plus the process_info structure.
This also includes a checkpoint update to openmpi.c and ompid.c. I have re-enabled the ompid compile.
This latter raises an important point. The trunk compiles the programs like ompid just fine under Linux. It also does just fine for OSX under the dynamic libraries. However, we are seeing errors when compiling under OSX for the static case - the linker seems to have trouble resolving some variable names, even though linker diagnostics show the variables as being defined. Thus, a warning to Mac users that you may have to locally turn things off if you are trying to do static compiles. We ask, however, that you don't commit those changes that turn things off for everyone else - instead, let's try to figure out why the static compile is having a problem, and let everyone else continue to work.
Thanks
Ralph
This commit was SVN r2534.
2004-09-08 07:59:06 +04:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
msg->msg_type = MCA_OOB_TCP_UNEXPECTED;
|
|
|
|
msg->msg_rc = 0;
|
|
|
|
msg->msg_flags = 0;
|
|
|
|
msg->msg_peer = peer->peer_name;
|
|
|
|
msg->msg_rwiov = mca_oob_tcp_msg_iov_alloc(msg,2);
|
|
|
|
msg->msg_rwbuf = NULL;
|
2005-01-26 03:20:35 +03:00
|
|
|
msg->msg_rwiov->iov_base = (ompi_iov_base_ptr_t)msg->msg_rwbuf;
|
Not as bad as this all may look. Tim and I made a significant change to the way we handle the startup of the oob, the seed, etc. We have made it backwards-compatible so that mpirun2 and singleton operations remain working. We had to adjust the name server and gpr as well, plus the process_info structure.
This also includes a checkpoint update to openmpi.c and ompid.c. I have re-enabled the ompid compile.
This latter raises an important point. The trunk compiles the programs like ompid just fine under Linux. It also does just fine for OSX under the dynamic libraries. However, we are seeing errors when compiling under OSX for the static case - the linker seems to have trouble resolving some variable names, even though linker diagnostics show the variables as being defined. Thus, a warning to Mac users that you may have to locally turn things off if you are trying to do static compiles. We ask, however, that you don't commit those changes that turn things off for everyone else - instead, let's try to figure out why the static compile is having a problem, and let everyone else continue to work.
Thanks
Ralph
This commit was SVN r2534.
2004-09-08 07:59:06 +04:00
|
|
|
msg->msg_rwiov->iov_len = 1;
|
|
|
|
msg->msg_rwcnt = msg->msg_rwnum = 1;
|
|
|
|
msg->msg_rwptr = msg->msg_rwiov;
|
2005-01-26 03:20:35 +03:00
|
|
|
msg->msg_rwiov[0].iov_base = (ompi_iov_base_ptr_t)&msg->msg_hdr;
|
Not as bad as this all may look. Tim and I made a significant change to the way we handle the startup of the oob, the seed, etc. We have made it backwards-compatible so that mpirun2 and singleton operations remain working. We had to adjust the name server and gpr as well, plus the process_info structure.
This also includes a checkpoint update to openmpi.c and ompid.c. I have re-enabled the ompid compile.
This latter raises an important point. The trunk compiles the programs like ompid just fine under Linux. It also does just fine for OSX under the dynamic libraries. However, we are seeing errors when compiling under OSX for the static case - the linker seems to have trouble resolving some variable names, even though linker diagnostics show the variables as being defined. Thus, a warning to Mac users that you may have to locally turn things off if you are trying to do static compiles. We ask, however, that you don't commit those changes that turn things off for everyone else - instead, let's try to figure out why the static compile is having a problem, and let everyone else continue to work.
Thanks
Ralph
This commit was SVN r2534.
2004-09-08 07:59:06 +04:00
|
|
|
msg->msg_rwiov[0].iov_len = sizeof(msg->msg_hdr);
|
|
|
|
peer->peer_recv_msg = msg;
|
2004-08-28 05:15:19 +04:00
|
|
|
}
|
Not as bad as this all may look. Tim and I made a significant change to the way we handle the startup of the oob, the seed, etc. We have made it backwards-compatible so that mpirun2 and singleton operations remain working. We had to adjust the name server and gpr as well, plus the process_info structure.
This also includes a checkpoint update to openmpi.c and ompid.c. I have re-enabled the ompid compile.
This latter raises an important point. The trunk compiles the programs like ompid just fine under Linux. It also does just fine for OSX under the dynamic libraries. However, we are seeing errors when compiling under OSX for the static case - the linker seems to have trouble resolving some variable names, even though linker diagnostics show the variables as being defined. Thus, a warning to Mac users that you may have to locally turn things off if you are trying to do static compiles. We ask, however, that you don't commit those changes that turn things off for everyone else - instead, let's try to figure out why the static compile is having a problem, and let everyone else continue to work.
Thanks
Ralph
This commit was SVN r2534.
2004-09-08 07:59:06 +04:00
|
|
|
|
2004-08-28 05:15:19 +04:00
|
|
|
if (peer->peer_recv_msg &&
|
|
|
|
mca_oob_tcp_msg_recv_handler(peer->peer_recv_msg, peer)) {
|
|
|
|
mca_oob_tcp_msg_t* msg = peer->peer_recv_msg;
|
2004-08-03 01:24:00 +04:00
|
|
|
peer->peer_recv_msg = NULL;
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&peer->peer_lock);
|
Not as bad as this all may look. Tim and I made a significant change to the way we handle the startup of the oob, the seed, etc. We have made it backwards-compatible so that mpirun2 and singleton operations remain working. We had to adjust the name server and gpr as well, plus the process_info structure.
This also includes a checkpoint update to openmpi.c and ompid.c. I have re-enabled the ompid compile.
This latter raises an important point. The trunk compiles the programs like ompid just fine under Linux. It also does just fine for OSX under the dynamic libraries. However, we are seeing errors when compiling under OSX for the static case - the linker seems to have trouble resolving some variable names, even though linker diagnostics show the variables as being defined. Thus, a warning to Mac users that you may have to locally turn things off if you are trying to do static compiles. We ask, however, that you don't commit those changes that turn things off for everyone else - instead, let's try to figure out why the static compile is having a problem, and let everyone else continue to work.
Thanks
Ralph
This commit was SVN r2534.
2004-09-08 07:59:06 +04:00
|
|
|
mca_oob_tcp_msg_recv_complete(msg, peer);
|
2004-08-28 05:15:19 +04:00
|
|
|
return;
|
2004-07-13 02:46:57 +04:00
|
|
|
}
|
2004-08-03 01:24:00 +04:00
|
|
|
break;
|
2004-07-13 02:46:57 +04:00
|
|
|
}
|
2004-08-03 01:24:00 +04:00
|
|
|
default:
|
2004-07-13 02:46:57 +04:00
|
|
|
{
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_recv_handler: invalid socket state(%d)",
|
2005-03-14 23:57:21 +03:00
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name),
|
|
|
|
ORTE_NAME_ARGS(&(peer->peer_name)),
|
2004-08-31 06:57:39 +04:00
|
|
|
peer->peer_state);
|
2004-08-03 01:24:00 +04:00
|
|
|
mca_oob_tcp_peer_close(peer);
|
|
|
|
break;
|
2004-07-13 02:46:57 +04:00
|
|
|
}
|
|
|
|
}
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&peer->peer_lock);
|
2004-07-13 02:46:57 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A file descriptor is available/ready for send. Check the state
|
|
|
|
* of the socket and take the appropriate action.
|
|
|
|
*/
|
|
|
|
static void mca_oob_tcp_peer_send_handler(int sd, short flags, void* user)
|
|
|
|
{
|
2004-10-28 22:13:43 +04:00
|
|
|
mca_oob_tcp_peer_t* peer = (mca_oob_tcp_peer_t *)user;
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_LOCK(&peer->peer_lock);
|
2004-07-13 02:46:57 +04:00
|
|
|
switch(peer->peer_state) {
|
|
|
|
case MCA_OOB_TCP_CONNECTING:
|
|
|
|
mca_oob_tcp_peer_complete_connect(peer);
|
|
|
|
break;
|
|
|
|
case MCA_OOB_TCP_CONNECTED:
|
|
|
|
{
|
2004-09-16 16:58:50 +04:00
|
|
|
while(peer->peer_send_msg != NULL) {
|
|
|
|
|
2004-08-03 01:24:00 +04:00
|
|
|
/* complete the current send */
|
2004-07-13 02:46:57 +04:00
|
|
|
mca_oob_tcp_msg_t* msg = peer->peer_send_msg;
|
2004-08-03 01:24:00 +04:00
|
|
|
if(mca_oob_tcp_msg_send_handler(msg, peer)) {
|
|
|
|
mca_oob_tcp_msg_complete(msg, &peer->peer_name);
|
|
|
|
} else {
|
2004-07-13 02:46:57 +04:00
|
|
|
break;
|
|
|
|
}
|
2004-08-03 01:24:00 +04:00
|
|
|
|
|
|
|
/* if current completed - progress any pending sends */
|
2004-07-13 02:46:57 +04:00
|
|
|
peer->peer_send_msg = (mca_oob_tcp_msg_t*)
|
2005-07-03 20:22:16 +04:00
|
|
|
opal_list_remove_first(&peer->peer_send_queue);
|
2004-09-16 16:58:50 +04:00
|
|
|
}
|
2004-07-15 17:51:40 +04:00
|
|
|
|
2004-07-13 02:46:57 +04:00
|
|
|
/* if nothing else to do unregister for send event notifications */
|
|
|
|
if(NULL == peer->peer_send_msg) {
|
2005-07-04 03:09:55 +04:00
|
|
|
opal_event_del(&peer->peer_send_event);
|
2004-07-13 02:46:57 +04:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
default:
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_send_handler: invalid connection state (%d)",
|
2005-03-14 23:57:21 +03:00
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name),
|
|
|
|
ORTE_NAME_ARGS(&(peer->peer_name)),
|
2004-07-13 02:46:57 +04:00
|
|
|
peer->peer_state);
|
2005-07-04 03:09:55 +04:00
|
|
|
opal_event_del(&peer->peer_send_event);
|
2004-07-13 02:46:57 +04:00
|
|
|
break;
|
|
|
|
}
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&peer->peer_lock);
|
2004-07-13 02:46:57 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Routine for debugging to print the connection state and socket options
|
|
|
|
*/
|
|
|
|
static void mca_oob_tcp_peer_dump(mca_oob_tcp_peer_t* peer, const char* msg)
|
|
|
|
{
|
|
|
|
char src[64];
|
|
|
|
char dst[64];
|
|
|
|
char buff[255];
|
|
|
|
int sndbuf,rcvbuf,nodelay,flags;
|
|
|
|
struct sockaddr_in inaddr;
|
|
|
|
ompi_socklen_t optlen;
|
|
|
|
ompi_socklen_t addrlen = sizeof(struct sockaddr_in);
|
|
|
|
|
|
|
|
getsockname(peer->peer_sd, (struct sockaddr*)&inaddr, &addrlen);
|
|
|
|
sprintf(src, "%s", inet_ntoa(inaddr.sin_addr));
|
|
|
|
getpeername(peer->peer_sd, (struct sockaddr*)&inaddr, &addrlen);
|
|
|
|
sprintf(dst, "%s", inet_ntoa(inaddr.sin_addr));
|
|
|
|
|
|
|
|
if((flags = fcntl(peer->peer_sd, F_GETFL, 0)) < 0) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "mca_oob_tcp_peer_dump: fcntl(F_GETFL) failed with errno=%d\n", ompi_socket_errno);
|
2004-07-13 02:46:57 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
#if defined(SO_SNDBUF)
|
|
|
|
optlen = sizeof(sndbuf);
|
|
|
|
if(getsockopt(peer->peer_sd, SOL_SOCKET, SO_SNDBUF, (char *)&sndbuf, &optlen) < 0) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "mca_oob_tcp_peer_dump: SO_SNDBUF option: errno %d\n", ompi_socket_errno);
|
2004-07-13 02:46:57 +04:00
|
|
|
}
|
|
|
|
#else
|
|
|
|
sndbuf = -1;
|
|
|
|
#endif
|
|
|
|
#if defined(SO_RCVBUF)
|
|
|
|
optlen = sizeof(rcvbuf);
|
|
|
|
if(getsockopt(peer->peer_sd, SOL_SOCKET, SO_RCVBUF, (char *)&rcvbuf, &optlen) < 0) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "mca_oob_tcp_peer_dump: SO_RCVBUF option: errno %d\n", ompi_socket_errno);
|
2004-07-13 02:46:57 +04:00
|
|
|
}
|
|
|
|
#else
|
|
|
|
rcvbuf = -1;
|
|
|
|
#endif
|
|
|
|
#if defined(TCP_NODELAY)
|
|
|
|
optlen = sizeof(nodelay);
|
2004-10-28 22:13:43 +04:00
|
|
|
if(getsockopt(peer->peer_sd, IPPROTO_TCP, TCP_NODELAY, (char *)&nodelay, &optlen) < 0) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "mca_oob_tcp_peer_dump: TCP_NODELAY option: errno %d\n", ompi_socket_errno);
|
2004-07-13 02:46:57 +04:00
|
|
|
}
|
|
|
|
#else
|
|
|
|
nodelay = 0;
|
|
|
|
#endif
|
2004-09-29 21:18:14 +04:00
|
|
|
|
2005-05-08 17:22:55 +04:00
|
|
|
sprintf(buff, "[%lu,%lu,%lu]-[%lu,%lu,%lu] %s: %s - %s nodelay %d sndbuf %d rcvbuf %d flags %08x\n",
|
2005-03-14 23:57:21 +03:00
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name),
|
|
|
|
ORTE_NAME_ARGS(&(peer->peer_name)),
|
2004-07-13 02:46:57 +04:00
|
|
|
msg, src, dst, nodelay, sndbuf, rcvbuf, flags);
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, buff);
|
2004-07-13 02:46:57 +04:00
|
|
|
}
|
|
|
|
|
2004-08-03 01:24:00 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Accept incoming connection - if not already connected.
|
|
|
|
*/
|
|
|
|
|
|
|
|
bool mca_oob_tcp_peer_accept(mca_oob_tcp_peer_t* peer, int sd)
|
|
|
|
{
|
2005-03-14 23:57:21 +03:00
|
|
|
int cmpval;
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_LOCK(&peer->peer_lock);
|
2005-03-14 23:57:21 +03:00
|
|
|
cmpval = orte_ns.compare(ORTE_NS_CMP_ALL, &peer->peer_name, orte_process_info.my_name);
|
2004-08-03 01:24:00 +04:00
|
|
|
if ((peer->peer_state == MCA_OOB_TCP_CLOSED) ||
|
Not as bad as this all may look. Tim and I made a significant change to the way we handle the startup of the oob, the seed, etc. We have made it backwards-compatible so that mpirun2 and singleton operations remain working. We had to adjust the name server and gpr as well, plus the process_info structure.
This also includes a checkpoint update to openmpi.c and ompid.c. I have re-enabled the ompid compile.
This latter raises an important point. The trunk compiles the programs like ompid just fine under Linux. It also does just fine for OSX under the dynamic libraries. However, we are seeing errors when compiling under OSX for the static case - the linker seems to have trouble resolving some variable names, even though linker diagnostics show the variables as being defined. Thus, a warning to Mac users that you may have to locally turn things off if you are trying to do static compiles. We ask, however, that you don't commit those changes that turn things off for everyone else - instead, let's try to figure out why the static compile is having a problem, and let everyone else continue to work.
Thanks
Ralph
This commit was SVN r2534.
2004-09-08 07:59:06 +04:00
|
|
|
(peer->peer_state == MCA_OOB_TCP_RESOLVE) ||
|
2004-08-03 01:24:00 +04:00
|
|
|
(peer->peer_state != MCA_OOB_TCP_CONNECTED &&
|
2005-03-14 23:57:21 +03:00
|
|
|
cmpval < 0)) {
|
2004-08-31 06:57:39 +04:00
|
|
|
|
|
|
|
if(peer->peer_state != MCA_OOB_TCP_CLOSED) {
|
|
|
|
mca_oob_tcp_peer_close(peer);
|
|
|
|
}
|
2004-08-03 01:24:00 +04:00
|
|
|
peer->peer_sd = sd;
|
|
|
|
mca_oob_tcp_peer_event_init(peer);
|
|
|
|
|
2006-02-12 04:33:29 +03:00
|
|
|
if(mca_oob_tcp_peer_send_connect_ack(peer) != ORTE_SUCCESS) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_accept: "
|
2004-09-29 21:18:14 +04:00
|
|
|
"mca_oob_tcp_peer_send_connect_ack failed\n",
|
2005-03-14 23:57:21 +03:00
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name),
|
|
|
|
ORTE_NAME_ARGS(&(peer->peer_name)));
|
2004-08-03 01:24:00 +04:00
|
|
|
mca_oob_tcp_peer_close(peer);
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&peer->peer_lock);
|
2004-08-03 01:24:00 +04:00
|
|
|
return false;
|
|
|
|
}
|
2004-08-31 06:57:39 +04:00
|
|
|
|
2004-08-03 01:24:00 +04:00
|
|
|
mca_oob_tcp_peer_connected(peer);
|
2005-07-04 03:09:55 +04:00
|
|
|
opal_event_add(&peer->peer_recv_event, 0);
|
2005-03-19 02:40:08 +03:00
|
|
|
if(mca_oob_tcp_component.tcp_debug > 0) {
|
2004-09-02 03:07:40 +04:00
|
|
|
mca_oob_tcp_peer_dump(peer, "accepted");
|
|
|
|
}
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&peer->peer_lock);
|
2004-08-03 01:24:00 +04:00
|
|
|
return true;
|
|
|
|
}
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&peer->peer_lock);
|
2004-08-03 01:24:00 +04:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2004-08-16 23:39:54 +04:00
|
|
|
/*
|
|
|
|
* resolve process name to an actual internet address.
|
|
|
|
*/
|
|
|
|
|
2004-09-02 03:07:40 +04:00
|
|
|
void mca_oob_tcp_peer_resolved(mca_oob_tcp_peer_t* peer, mca_oob_tcp_addr_t* addr)
|
2004-08-16 23:39:54 +04:00
|
|
|
{
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_LOCK(&peer->peer_lock);
|
2004-09-02 03:07:40 +04:00
|
|
|
peer->peer_addr = addr;
|
Not as bad as this all may look. Tim and I made a significant change to the way we handle the startup of the oob, the seed, etc. We have made it backwards-compatible so that mpirun2 and singleton operations remain working. We had to adjust the name server and gpr as well, plus the process_info structure.
This also includes a checkpoint update to openmpi.c and ompid.c. I have re-enabled the ompid compile.
This latter raises an important point. The trunk compiles the programs like ompid just fine under Linux. It also does just fine for OSX under the dynamic libraries. However, we are seeing errors when compiling under OSX for the static case - the linker seems to have trouble resolving some variable names, even though linker diagnostics show the variables as being defined. Thus, a warning to Mac users that you may have to locally turn things off if you are trying to do static compiles. We ask, however, that you don't commit those changes that turn things off for everyone else - instead, let's try to figure out why the static compile is having a problem, and let everyone else continue to work.
Thanks
Ralph
This commit was SVN r2534.
2004-09-08 07:59:06 +04:00
|
|
|
if((peer->peer_state == MCA_OOB_TCP_RESOLVE) ||
|
2005-07-03 20:22:16 +04:00
|
|
|
(peer->peer_state == MCA_OOB_TCP_CLOSED && opal_list_get_size(&peer->peer_send_queue))) {
|
2004-09-02 03:07:40 +04:00
|
|
|
mca_oob_tcp_peer_start_connect(peer);
|
2004-08-16 23:39:54 +04:00
|
|
|
}
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&peer->peer_lock);
|
2004-08-16 23:39:54 +04:00
|
|
|
}
|
|
|
|
|
2004-08-28 05:15:19 +04:00
|
|
|
/*
|
|
|
|
* Callback on timeout - retry connection attempt.
|
|
|
|
*/
|
|
|
|
|
|
|
|
static void mca_oob_tcp_peer_timer_handler(int sd, short flags, void* user)
|
|
|
|
{
|
2004-09-02 03:07:40 +04:00
|
|
|
/* start the connection to the peer */
|
|
|
|
mca_oob_tcp_peer_t* peer = (mca_oob_tcp_peer_t*)user;
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "mca_oob_tcp_peer_timer_handler\n");
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_LOCK(&peer->peer_lock);
|
Not as bad as this all may look. Tim and I made a significant change to the way we handle the startup of the oob, the seed, etc. We have made it backwards-compatible so that mpirun2 and singleton operations remain working. We had to adjust the name server and gpr as well, plus the process_info structure.
This also includes a checkpoint update to openmpi.c and ompid.c. I have re-enabled the ompid compile.
This latter raises an important point. The trunk compiles the programs like ompid just fine under Linux. It also does just fine for OSX under the dynamic libraries. However, we are seeing errors when compiling under OSX for the static case - the linker seems to have trouble resolving some variable names, even though linker diagnostics show the variables as being defined. Thus, a warning to Mac users that you may have to locally turn things off if you are trying to do static compiles. We ask, however, that you don't commit those changes that turn things off for everyone else - instead, let's try to figure out why the static compile is having a problem, and let everyone else continue to work.
Thanks
Ralph
This commit was SVN r2534.
2004-09-08 07:59:06 +04:00
|
|
|
if(peer->peer_state == MCA_OOB_TCP_CLOSED)
|
|
|
|
mca_oob_tcp_peer_start_connect(peer);
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&peer->peer_lock);
|
2004-08-28 05:15:19 +04:00
|
|
|
}
|
2004-08-19 23:34:37 +04:00
|
|
|
|
2004-09-16 18:12:22 +04:00
|
|
|
/*
|
|
|
|
* Remove any references to the indicated message.
|
|
|
|
*/
|
|
|
|
|
|
|
|
void mca_oob_tcp_peer_dequeue_msg(mca_oob_tcp_peer_t* peer, mca_oob_tcp_msg_t* msg)
|
|
|
|
{
|
2005-07-03 20:22:16 +04:00
|
|
|
opal_list_item_t* item;
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_LOCK(&peer->peer_lock);
|
2004-09-16 18:12:22 +04:00
|
|
|
if (peer->peer_send_msg == msg)
|
|
|
|
peer->peer_send_msg = NULL;
|
|
|
|
if (peer->peer_recv_msg == msg)
|
|
|
|
peer->peer_recv_msg = NULL;
|
|
|
|
|
2005-07-03 20:22:16 +04:00
|
|
|
for( item = opal_list_get_first(&peer->peer_send_queue);
|
|
|
|
item != opal_list_get_end(&peer->peer_send_queue);
|
|
|
|
item = opal_list_get_next(item)) {
|
|
|
|
if(item == (opal_list_item_t*)msg) {
|
|
|
|
opal_list_remove_item(&peer->peer_send_queue, item);
|
2004-09-16 18:12:22 +04:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&peer->peer_lock);
|
2004-09-16 18:12:22 +04:00
|
|
|
}
|
|
|
|
|
2004-08-31 06:57:39 +04:00
|
|
|
|
2005-10-31 19:21:11 +03:00
|
|
|
/**
|
|
|
|
* Set socket buffering
|
|
|
|
*/
|
|
|
|
|
|
|
|
void mca_oob_tcp_set_socket_options(int sd)
|
|
|
|
{
|
|
|
|
int optval;
|
|
|
|
#if defined(TCP_NODELAY)
|
|
|
|
optval = 1;
|
|
|
|
if(setsockopt(sd, IPPROTO_TCP, TCP_NODELAY, (char *)&optval, sizeof(optval)) < 0) {
|
|
|
|
opal_output(0, "[%s:%d] setsockopt(TCP_NODELAY) failed with errno=%d", __FILE__, __LINE__, ompi_socket_errno);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
#if defined(SO_SNDBUF)
|
|
|
|
if(mca_oob_tcp_component.tcp_sndbuf > 0 &&
|
|
|
|
setsockopt(sd, SOL_SOCKET, SO_SNDBUF, (char *)&mca_oob_tcp_component.tcp_sndbuf, sizeof(int)) < 0) {
|
|
|
|
opal_output(0, "[%s:%d] setsockopt(SO_SNDBUF) failed with errno %d", __FILE__, __LINE__, ompi_socket_errno);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
#if defined(SO_RCVBUF)
|
|
|
|
if(mca_oob_tcp_component.tcp_rcvbuf > 0 &&
|
|
|
|
setsockopt(sd, SOL_SOCKET, SO_RCVBUF, (char *)&mca_oob_tcp_component.tcp_rcvbuf, sizeof(int)) < 0) {
|
|
|
|
opal_output(0, "[%s:%d] setsockopt(SO_RCVBUF) failed with errno %d", __FILE__, __LINE__, ompi_socket_errno);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|