2005-09-01 05:07:30 +04:00
|
|
|
/*
|
2007-03-17 02:11:45 +03:00
|
|
|
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
2005-11-05 22:57:48 +03:00
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
2007-04-12 09:01:29 +04:00
|
|
|
* Copyright (c) 2004-2007 The University of Tennessee and The University
|
2005-11-05 22:57:48 +03:00
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
2005-09-01 05:07:30 +04:00
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
2004-11-28 23:09:25 +03:00
|
|
|
* University of Stuttgart. All rights reserved.
|
2005-03-24 15:43:37 +03:00
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
Fix a number of OOB issues:
* Remove the connect() timeout code, as it had some nasty race conditions
when connections were established as the trigger was firing. A better
solution has been found for the cluster where this was needed, so just
removing it was easiest.
* When a fatal error (too many connection failures) occurs, set an error
on messages in the queue even if there isn't an active message. The
first message to any peer will be queued without being active (and
so will all subsequent messages until the connection is established),
and the orteds will hang until that first message completes. So if
an orted can never contact its peer, it will never exit and just sit
waiting for that message to complete.
* Cover an interesting RST condition in the connect code. A connection
can complete the three-way handshake, the connector can even send
some data, but the server side will drop the connection because it
can't move it from the half-connected to fully-connected state because
of space shortage in the listen backlog queue. This causes a RST to
be received first time that recv() is called, which will be when waiting
for the remote side of the OOB ack. In this case, transition the
connection back into a CLOSED state and try to connect again.
* Add levels of debugging, rather than all or nothing, each building on
the previous level. 0 (default) is hard errors. 1 is connection
error debugging info. 2 is all connection info. 3 is more state
info. 4 includes all message info.
* Add some hopefully useful comments
This commit was SVN r14261.
2007-04-08 02:33:30 +04:00
|
|
|
* Copyright (c) 2006-2007 Los Alamos National Security, LLC.
|
|
|
|
* All rights reserved.
|
2004-11-22 04:38:40 +03:00
|
|
|
* $COPYRIGHT$
|
2005-09-01 05:07:30 +04:00
|
|
|
*
|
2004-11-22 04:38:40 +03:00
|
|
|
* Additional copyrights may follow
|
2005-09-01 05:07:30 +04:00
|
|
|
*
|
2004-07-01 18:49:54 +04:00
|
|
|
* $HEADER$
|
2006-03-11 06:09:24 +03:00
|
|
|
*
|
|
|
|
* In windows, many of the socket functions return an EWOULDBLOCK
|
2007-04-12 09:01:29 +04:00
|
|
|
* instead of things like EAGAIN, EINPROGRESS, etc. It has been
|
|
|
|
* verified that this will not conflict with other error codes that
|
|
|
|
* are returned by these functions under UNIX/Linux environments
|
2004-07-01 18:49:54 +04:00
|
|
|
*/
|
|
|
|
|
2006-02-12 04:33:29 +03:00
|
|
|
#include "orte_config.h"
|
2005-03-14 23:57:21 +03:00
|
|
|
|
2006-02-12 04:33:29 +03:00
|
|
|
#include "orte/orte_types.h"
|
2005-03-14 23:57:21 +03:00
|
|
|
|
2004-10-22 20:06:05 +04:00
|
|
|
#ifdef HAVE_UNISTD_H
|
2004-08-03 01:24:00 +04:00
|
|
|
#include <unistd.h>
|
2004-10-22 20:06:05 +04:00
|
|
|
#endif
|
|
|
|
#ifdef HAVE_SYS_TYPES_H
|
2004-08-03 01:24:00 +04:00
|
|
|
#include <sys/types.h>
|
2004-10-22 20:06:05 +04:00
|
|
|
#endif
|
2004-08-03 01:24:00 +04:00
|
|
|
#include <fcntl.h>
|
2004-10-22 20:06:05 +04:00
|
|
|
#ifdef HAVE_NETINET_IN_H
|
2004-08-16 23:39:54 +04:00
|
|
|
#include <netinet/in.h>
|
2004-10-22 20:06:05 +04:00
|
|
|
#endif
|
|
|
|
#ifdef HAVE_ARPA_INET_H
|
2004-08-16 23:39:54 +04:00
|
|
|
#include <arpa/inet.h>
|
2004-10-22 20:06:05 +04:00
|
|
|
#endif
|
2007-04-25 05:55:40 +04:00
|
|
|
#if OPAL_WANT_IPV6
|
|
|
|
# ifdef HAVE_NETDB_H
|
|
|
|
# include <netdb.h>
|
|
|
|
# endif
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#include "opal/util/error.h"
|
2006-08-15 00:14:44 +04:00
|
|
|
#include "opal/opal_socket_errno.h"
|
2005-07-04 03:31:27 +04:00
|
|
|
#include "opal/util/output.h"
|
2005-07-04 05:36:20 +04:00
|
|
|
#include "opal/util/if.h"
|
2007-05-17 05:17:59 +04:00
|
|
|
#include "opal/util/net.h"
|
2007-03-17 02:11:45 +03:00
|
|
|
#include "opal/class/opal_hash_table.h"
|
2005-07-19 16:25:19 +04:00
|
|
|
#include "orte/class/orte_proc_table.h"
|
2006-02-12 04:33:29 +03:00
|
|
|
#include "orte/mca/oob/tcp/oob_tcp.h"
|
|
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
|
|
#include "orte/mca/ns/ns.h"
|
|
|
|
#include "orte/mca/gpr/gpr.h"
|
2004-07-01 18:49:54 +04:00
|
|
|
|
2007-06-15 02:35:38 +04:00
|
|
|
#if defined(__WINDOWS__)
/*
 * Guards oob_tcp_windows_progress_callback() so that only one thread at a
 * time runs the completion callbacks.  Destructed in
 * mca_oob_tcp_component_close().
 */
static opal_mutex_t windows_callback;
#endif  /* defined(__WINDOWS__) */
|
|
|
|
|
2004-09-30 19:09:29 +04:00
|
|
|
/*
|
|
|
|
* Data structure for accepting connections.
|
|
|
|
*/
|
|
|
|
struct mca_oob_tcp_event_t {
|
2005-07-03 20:22:16 +04:00
|
|
|
opal_list_item_t item;
|
2005-07-04 03:09:55 +04:00
|
|
|
opal_event_t event;
|
2004-09-30 19:09:29 +04:00
|
|
|
};
|
|
|
|
typedef struct mca_oob_tcp_event_t mca_oob_tcp_event_t;
|
2005-09-01 05:07:30 +04:00
|
|
|
|
2004-09-30 19:09:29 +04:00
|
|
|
/*
 * Constructor for mca_oob_tcp_event_t: appends the new event to the
 * component-wide tcp_events list while holding tcp_lock.
 */
static void mca_oob_tcp_event_construct(mca_oob_tcp_event_t* event)
{
    opal_list_t *events = &mca_oob_tcp_component.tcp_events;

    OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock);
    opal_list_append(events, &event->item);
    OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
}
|
2005-09-01 05:07:30 +04:00
|
|
|
|
2004-09-30 19:09:29 +04:00
|
|
|
/*
 * Destructor for mca_oob_tcp_event_t: unlinks the event from the
 * component-wide tcp_events list while holding tcp_lock.
 */
static void mca_oob_tcp_event_destruct(mca_oob_tcp_event_t* event)
{
    opal_list_t *events = &mca_oob_tcp_component.tcp_events;

    OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock);
    opal_list_remove_item(events, &event->item);
    OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
}
|
2005-09-01 05:07:30 +04:00
|
|
|
|
2004-09-30 19:09:29 +04:00
|
|
|
/*
 * Class instance for mca_oob_tcp_event_t: parent class opal_list_item_t,
 * with the constructor / destructor defined above.
 */
OBJ_CLASS_INSTANCE(mca_oob_tcp_event_t,
                   opal_list_item_t,
                   mca_oob_tcp_event_construct,
                   mca_oob_tcp_event_destruct);
|
2005-09-01 05:07:30 +04:00
|
|
|
|
2004-09-30 19:09:29 +04:00
|
|
|
/*
|
|
|
|
* Local utility functions
|
|
|
|
*/
|
2004-08-03 01:24:00 +04:00
|
|
|
|
2007-04-25 05:55:40 +04:00
|
|
|
static int mca_oob_tcp_create_listen(int *target_sd, uint16_t af_family);
|
2006-09-15 01:29:51 +04:00
|
|
|
static int mca_oob_tcp_create_listen_thread(void);
|
2004-08-03 01:24:00 +04:00
|
|
|
static void mca_oob_tcp_recv_handler(int sd, short flags, void* user);
|
2007-04-25 05:55:40 +04:00
|
|
|
static void mca_oob_tcp_accept(int incoming_sd);
|
2004-08-03 01:24:00 +04:00
|
|
|
|
2004-09-02 03:07:40 +04:00
|
|
|
struct mca_oob_tcp_subscription_t {
|
2005-07-03 20:22:16 +04:00
|
|
|
opal_list_item_t item;
|
2005-03-14 23:57:21 +03:00
|
|
|
orte_jobid_t jobid;
|
2005-06-24 20:59:37 +04:00
|
|
|
orte_gpr_subscription_id_t subid;
|
2004-09-02 03:07:40 +04:00
|
|
|
};
|
|
|
|
typedef struct mca_oob_tcp_subscription_t mca_oob_tcp_subscription_t;
|
|
|
|
|
|
|
|
/*
 * Class instance for mca_oob_tcp_subscription_t: parent class
 * opal_list_item_t, no constructor or destructor.
 */
OBJ_CLASS_INSTANCE(mca_oob_tcp_subscription_t,
                   opal_list_item_t,
                   NULL,
                   NULL);
|
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
/*
 * Class instance for mca_oob_tcp_pending_connection_t: parent class
 * opal_free_list_item_t, no constructor or destructor.
 */
OBJ_CLASS_INSTANCE(mca_oob_tcp_pending_connection_t,
                   opal_free_list_item_t,
                   NULL,
                   NULL);
|
|
|
|
|
2007-03-17 02:11:45 +03:00
|
|
|
/* Output stream handle for this component; opened with opal_output_open()
 * and given the "verbose" parameter's level in mca_oob_tcp_component_open(). */
int mca_oob_tcp_output_handle = 0;
|
2004-09-02 03:07:40 +04:00
|
|
|
|
|
|
|
|
2004-07-01 18:49:54 +04:00
|
|
|
/*
|
|
|
|
* Struct of function pointers and all that to let us be initialized
|
|
|
|
*/
|
2004-08-02 04:24:22 +04:00
|
|
|
mca_oob_tcp_component_t mca_oob_tcp_component = {
|
|
|
|
{
|
2004-07-01 18:49:54 +04:00
|
|
|
{
|
|
|
|
MCA_OOB_BASE_VERSION_1_0_0,
|
|
|
|
"tcp", /* MCA module name */
|
2004-08-19 23:34:37 +04:00
|
|
|
1, /* MCA component major version */
|
|
|
|
0, /* MCA component minor version */
|
|
|
|
0, /* MCA component release version */
|
|
|
|
mca_oob_tcp_component_open, /* component open */
|
|
|
|
mca_oob_tcp_component_close /* component close */
|
2004-07-01 18:49:54 +04:00
|
|
|
},
|
|
|
|
{
|
2007-03-17 02:11:45 +03:00
|
|
|
/* The component is checkpoint ready */
|
|
|
|
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
2004-07-01 18:49:54 +04:00
|
|
|
},
|
2005-09-01 05:07:30 +04:00
|
|
|
mca_oob_tcp_component_init
|
2004-08-02 04:24:22 +04:00
|
|
|
}
|
2004-07-01 18:49:54 +04:00
|
|
|
};
|
|
|
|
|
2004-08-03 01:24:00 +04:00
|
|
|
static mca_oob_t mca_oob_tcp = {
|
2004-08-16 23:39:54 +04:00
|
|
|
mca_oob_tcp_get_addr,
|
Not as bad as this all may look. Tim and I made a significant change to the way we handle the startup of the oob, the seed, etc. We have made it backwards-compatible so that mpirun2 and singleton operations remain working. We had to adjust the name server and gpr as well, plus the process_info structure.
This also includes a checkpoint update to openmpi.c and ompid.c. I have re-enabled the ompid compile.
This latter raises an important point. The trunk compiles the programs like ompid just fine under Linux. It also does just fine for OSX under the dynamic libraries. However, we are seeing errors when compiling under OSX for the static case - the linker seems to have trouble resolving some variable names, even though linker diagnostics show the variables as being defined. Thus, a warning to Mac users that you may have to locally turn things off if you are trying to do static compiles. We ask, however, that you don't commit those changes that turn things off for everyone else - instead, let's try to figure out why the static compile is having a problem, and let everyone else continue to work.
Thanks
Ralph
This commit was SVN r2534.
2004-09-08 07:59:06 +04:00
|
|
|
mca_oob_tcp_set_addr,
|
2004-09-08 21:02:24 +04:00
|
|
|
mca_oob_tcp_ping,
|
2004-07-01 18:49:54 +04:00
|
|
|
mca_oob_tcp_send,
|
|
|
|
mca_oob_tcp_recv,
|
|
|
|
mca_oob_tcp_send_nb,
|
2004-08-11 01:02:36 +04:00
|
|
|
mca_oob_tcp_recv_nb,
|
2004-09-30 19:09:29 +04:00
|
|
|
mca_oob_tcp_recv_cancel,
|
2004-08-19 23:34:37 +04:00
|
|
|
mca_oob_tcp_init,
|
|
|
|
mca_oob_tcp_fini,
|
2007-03-17 02:11:45 +03:00
|
|
|
mca_oob_xcast,
|
Commit the orted-failed-to-start code. This correctly causes the system to detect the failure of an orted to start and allows the system to terminate all procs/orteds that *did* start.
The primary change that underlies all this is in the OOB. Specifically, the problem in the code until now has been that the OOB attempts to resolve an address when we call the "send" to an unknown recipient. The OOB would then wait forever if that recipient never actually started (and hence, never reported back its OOB contact info). In the case of an orted that failed to start, we would correctly detect that the orted hadn't started, but then we would attempt to order all orteds (including the one that failed to start) to die. This would cause the OOB to "hang" the system.
Unfortunately, revising how the OOB resolves addresses introduced a number of additional problems. Specifically, and most troublesome, was the fact that comm_spawn involved the immediate transmission of the rendezvous point from parent-to-child after the child was spawned. The current code used the OOB address resolution as a "barrier" - basically, the parent would attempt to send the info to the child, and then "hold" there until the child's contact info had arrived (meaning the child had started) and the send could be completed.
Note that this also caused comm_spawn to "hang" the entire system if the child never started... The app-failed-to-start helped improve that behavior - this code provides additional relief.
With this change, the OOB will return an ADDRESSEE_UNKNOWN error if you attempt to send to a recipient whose contact info isn't already in the OOB's hash tables. To resolve comm_spawn issues, we also now force the cross-sharing of connection info between parent and child jobs during spawn.
Finally, to aid in setting triggers to the right values, we introduce the "arith" API for the GPR. This function allows you to atomically change the value in a registry location (either divide, multiply, add, or subtract) by the provided operand. It is equivalent to first fetching the value using a "get", then modifying it, and then putting the result back into the registry via a "put".
This commit was SVN r14711.
2007-05-21 22:31:28 +04:00
|
|
|
mca_oob_xcast_nb,
|
|
|
|
mca_oob_xcast_gate,
|
|
|
|
mca_oob_tcp_ft_event,
|
|
|
|
mca_oob_tcp_register_contact_info,
|
|
|
|
mca_oob_tcp_register_subscription,
|
|
|
|
mca_oob_tcp_get_contact_info,
|
|
|
|
mca_oob_tcp_registry_callback
|
2004-07-01 18:49:54 +04:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
2004-07-13 02:46:57 +04:00
|
|
|
* Initialize global variables used w/in this module.
|
2004-07-01 18:49:54 +04:00
|
|
|
*/
|
2004-08-19 23:34:37 +04:00
|
|
|
int mca_oob_tcp_component_open(void)
|
2004-07-01 18:49:54 +04:00
|
|
|
{
|
2007-03-17 02:11:45 +03:00
|
|
|
int value = 0;
|
2007-05-24 16:52:26 +04:00
|
|
|
char *listen_type, *str;
|
2006-09-15 01:29:51 +04:00
|
|
|
int tmp;
|
|
|
|
|
2005-12-12 23:04:00 +03:00
|
|
|
#ifdef __WINDOWS__
|
2004-11-02 16:14:34 +03:00
|
|
|
WSADATA win_sock_data;
|
|
|
|
if (WSAStartup(MAKEWORD(2,2), &win_sock_data) != 0) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output (0, "mca_oob_tcp_component_init: failed to initialise windows sockets: error %d\n", WSAGetLastError());
|
2006-02-12 04:33:29 +03:00
|
|
|
return ORTE_ERROR;
|
2004-11-02 16:14:34 +03:00
|
|
|
}
|
|
|
|
#endif
|
2005-09-01 05:07:30 +04:00
|
|
|
|
2007-03-17 02:11:45 +03:00
|
|
|
mca_base_param_reg_int(&mca_oob_tcp_component.super.oob_base,
|
|
|
|
"verbose",
|
|
|
|
"Verbose level for the OOB tcp component",
|
|
|
|
false, false,
|
|
|
|
0,
|
|
|
|
&value);
|
|
|
|
mca_oob_tcp_output_handle = opal_output_open(NULL);
|
|
|
|
opal_output_set_verbosity(mca_oob_tcp_output_handle, value);
|
|
|
|
|
2005-07-03 20:22:16 +04:00
|
|
|
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_subscriptions, opal_list_t);
|
|
|
|
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_peer_list, opal_list_t);
|
2005-10-25 17:48:08 +04:00
|
|
|
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_peers, opal_hash_table_t);
|
2005-07-03 20:52:32 +04:00
|
|
|
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_peer_names, opal_hash_table_t);
|
2005-07-02 20:46:27 +04:00
|
|
|
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_peer_free, opal_free_list_t);
|
|
|
|
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_msgs, opal_free_list_t);
|
2005-07-04 02:45:48 +04:00
|
|
|
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_lock, opal_mutex_t);
|
2005-07-03 20:22:16 +04:00
|
|
|
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_events, opal_list_t);
|
|
|
|
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_msg_post, opal_list_t);
|
|
|
|
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_msg_recv, opal_list_t);
|
2005-10-25 17:48:08 +04:00
|
|
|
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_msg_completed, opal_list_t);
|
2005-07-04 02:45:48 +04:00
|
|
|
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_match_lock, opal_mutex_t);
|
|
|
|
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_match_cond, opal_condition_t);
|
2006-09-15 01:29:51 +04:00
|
|
|
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_listen_thread, opal_thread_t);
|
|
|
|
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_pending_connections_fl, opal_free_list_t);
|
|
|
|
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_pending_connections, opal_list_t);
|
|
|
|
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_copy_out_connections, opal_list_t);
|
|
|
|
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_copy_in_connections, opal_list_t);
|
Fix a number of OOB issues:
* Remove the connect() timeout code, as it had some nasty race conditions
when connections were established as the trigger was firing. A better
solution has been found for the cluster where this was needed, so just
removing it was easiest.
* When a fatal error (too many connection failures) occurs, set an error
on messages in the queue even if there isn't an active message. The
first message to any peer will be queued without being active (and
so will all subsequent messages until the connection is established),
and the orteds will hang until that first message completes. So if
an orted can never contact it's peer, it will never exit and just sit
waiting for that message to complete.
* Cover an interesting RST condition in the connect code. A connection
can complete the three-way handshake, the connector can even send
some data, but the server side will drop the connection because it
can't move it from the half-connected to fully-connected state because
of space shortage in the listen backlog queue. This causes a RST to
be received first time that recv() is called, which will be when waiting
for the remote side of the OOB ack. In this case, transition the
connection back into a CLOSED state and try to connect again.
* Add levels of debugging, rather than all or nothing, each building on
the previous level. 0 (default) is hard errors. 1 is connection
error debugging info. 2 is all connection info. 3 is more state
info. 4 includes all message info.
* Add some hopefully useful comments
This commit was SVN r14261.
2007-04-08 02:33:30 +04:00
|
|
|
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_connections_return, opal_list_t);
|
|
|
|
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_connections_return_copy, opal_list_t);
|
2006-09-15 01:29:51 +04:00
|
|
|
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_pending_connections_lock, opal_mutex_t);
|
2004-08-03 01:24:00 +04:00
|
|
|
|
|
|
|
/* register oob module parameters */
|
2007-05-24 17:01:55 +04:00
|
|
|
mca_base_param_reg_int(&mca_oob_tcp_component.super.oob_base,
|
|
|
|
"peer_limit",
|
|
|
|
"Maximum number of peer connections to simultaneously maintain (-1 = infinite)",
|
|
|
|
false, false, -1,
|
|
|
|
&mca_oob_tcp_component.tcp_peer_limit);
|
|
|
|
|
|
|
|
mca_base_param_reg_int(&mca_oob_tcp_component.super.oob_base,
|
|
|
|
"peer_retries",
|
|
|
|
"Number of times to try shutting down a connection before giving up",
|
|
|
|
false, false, 60,
|
|
|
|
&mca_oob_tcp_component.tcp_peer_retries);
|
|
|
|
|
|
|
|
mca_base_param_reg_int(&mca_oob_tcp_component.super.oob_base,
|
|
|
|
"debug",
|
|
|
|
"Enable (1) / disable (0) debugging output for this component",
|
|
|
|
false, false, 0,
|
|
|
|
&mca_oob_tcp_component.tcp_debug);
|
|
|
|
|
|
|
|
mca_base_param_reg_int(&mca_oob_tcp_component.super.oob_base,
|
|
|
|
"sndbuf",
|
|
|
|
"TCP socket send buffering size (in bytes)",
|
|
|
|
false, false, 128 * 1024,
|
|
|
|
&mca_oob_tcp_component.tcp_sndbuf);
|
|
|
|
|
|
|
|
mca_base_param_reg_int(&mca_oob_tcp_component.super.oob_base,
|
|
|
|
"rcvbuf",
|
|
|
|
"TCP socket receive buffering size (in bytes)",
|
|
|
|
false, false, 128 * 1024,
|
|
|
|
&mca_oob_tcp_component.tcp_rcvbuf);
|
2007-05-24 16:52:26 +04:00
|
|
|
|
|
|
|
mca_base_param_reg_string(&mca_oob_tcp_component.super.oob_base,
|
|
|
|
"if_include",
|
|
|
|
"Comma-delimited list of TCP interfaces to use",
|
|
|
|
false, false, NULL,
|
|
|
|
&mca_oob_tcp_component.tcp_include);
|
|
|
|
mca_base_param_reg_string(&mca_oob_tcp_component.super.oob_base,
|
|
|
|
"include",
|
|
|
|
"Obsolete synonym for oob_tcp_if_include",
|
|
|
|
true, false, NULL, &str);
|
|
|
|
if (NULL != str) {
|
|
|
|
if (NULL == mca_oob_tcp_component.tcp_include) {
|
|
|
|
mca_oob_tcp_component.tcp_include = str;
|
|
|
|
} else {
|
|
|
|
free(str);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
mca_base_param_reg_string(&mca_oob_tcp_component.super.oob_base,
|
|
|
|
"if_exclude",
|
|
|
|
"Comma-delimited list of TCP interfaces to exclude",
|
|
|
|
false, false, NULL,
|
|
|
|
&mca_oob_tcp_component.tcp_exclude);
|
|
|
|
mca_base_param_reg_string(&mca_oob_tcp_component.super.oob_base,
|
|
|
|
"exclude",
|
|
|
|
"Obsolete synonym for oob_tcp_if_exclude",
|
|
|
|
true, false, NULL, &str);
|
|
|
|
if (NULL != str) {
|
|
|
|
if (NULL == mca_oob_tcp_component.tcp_exclude) {
|
|
|
|
mca_oob_tcp_component.tcp_exclude = str;
|
|
|
|
} else {
|
|
|
|
free(str);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2006-11-06 21:00:46 +03:00
|
|
|
mca_base_param_reg_int(&mca_oob_tcp_component.super.oob_base,
|
|
|
|
"connect_sleep",
|
2007-05-24 16:52:26 +04:00
|
|
|
"Enable (1) / disable (0) random sleep for connection wireup",
|
2006-11-06 21:00:46 +03:00
|
|
|
false,
|
|
|
|
false,
|
|
|
|
1,
|
|
|
|
&mca_oob_tcp_component.connect_sleep);
|
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
mca_base_param_reg_string(&mca_oob_tcp_component.super.oob_base,
|
|
|
|
"listen_mode",
|
|
|
|
"Mode for HNP to accept incoming connections: event, listen_thread",
|
|
|
|
false,
|
|
|
|
false,
|
|
|
|
"event",
|
|
|
|
&listen_type);
|
2006-09-19 23:33:49 +04:00
|
|
|
|
2006-10-12 01:29:29 +04:00
|
|
|
if (0 == strcmp(listen_type, "event")) {
|
2006-09-15 01:29:51 +04:00
|
|
|
mca_oob_tcp_component.tcp_listen_type = OOB_TCP_EVENT;
|
|
|
|
} else if (0 == strcmp(listen_type, "listen_thread")) {
|
|
|
|
mca_oob_tcp_component.tcp_listen_type = OOB_TCP_LISTEN_THREAD;
|
|
|
|
} else {
|
|
|
|
opal_output(0, "Invalid value for oob_tcp_listen_mode parameter: %s",
|
|
|
|
listen_type);
|
|
|
|
return ORTE_ERROR;
|
|
|
|
}
|
|
|
|
|
|
|
|
mca_base_param_reg_int(&mca_oob_tcp_component.super.oob_base,
|
|
|
|
"listen_thread_max_queue",
|
|
|
|
"High water mark for queued accepted socket list size",
|
|
|
|
false,
|
|
|
|
false,
|
|
|
|
10,
|
|
|
|
&mca_oob_tcp_component.tcp_copy_max_size);
|
|
|
|
|
|
|
|
mca_base_param_reg_int(&mca_oob_tcp_component.super.oob_base,
|
|
|
|
"listen_thread_max_time",
|
|
|
|
"Maximum amount of time (in milliseconds) to wait between processing accepted socket list",
|
|
|
|
false,
|
|
|
|
false,
|
|
|
|
10,
|
|
|
|
&tmp);
|
|
|
|
|
|
|
|
#if OPAL_TIMER_USEC_NATIVE
|
|
|
|
mca_oob_tcp_component.tcp_copy_delta = tmp * 1000;
|
|
|
|
#else
|
|
|
|
mca_oob_tcp_component.tcp_copy_delta = tmp *
|
|
|
|
opal_timer_base_get_freq() / 1000;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
mca_base_param_reg_int(&mca_oob_tcp_component.super.oob_base,
|
|
|
|
"accept_spin_count",
|
|
|
|
"Number of times to let accept return EWOULDBLOCK before updating accepted socket list",
|
|
|
|
false,
|
|
|
|
false,
|
|
|
|
10,
|
|
|
|
&mca_oob_tcp_component.tcp_copy_spin_count);
|
|
|
|
|
2004-08-03 02:16:35 +04:00
|
|
|
/* initialize state */
|
2006-09-15 01:29:51 +04:00
|
|
|
mca_oob_tcp_component.tcp_shutdown = false;
|
2004-08-03 02:16:35 +04:00
|
|
|
mca_oob_tcp_component.tcp_listen_sd = -1;
|
2007-04-25 05:55:40 +04:00
|
|
|
#if OPAL_WANT_IPV6
|
|
|
|
mca_oob_tcp_component.tcp6_listen_sd = -1;
|
|
|
|
#endif
|
2004-09-30 19:09:29 +04:00
|
|
|
mca_oob_tcp_component.tcp_match_count = 0;
|
2007-03-17 02:11:45 +03:00
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
mca_oob_tcp_component.tcp_last_copy_time = 0;
|
2007-03-17 02:11:45 +03:00
|
|
|
|
2007-05-31 06:29:44 +04:00
|
|
|
/* updated with real value during tcp_init */
|
|
|
|
mca_oob_tcp_component.tcp_ignore_localhost = true;
|
|
|
|
|
2006-02-12 04:33:29 +03:00
|
|
|
return ORTE_SUCCESS;
|
2004-07-01 18:49:54 +04:00
|
|
|
}
|
|
|
|
|
2007-06-14 08:38:06 +04:00
|
|
|
#if defined(__WINDOWS__)
/*
 * Windows-only progress callback (unregistered from opal_progress in
 * mca_oob_tcp_component_close()).
 *
 * Drains mca_oob_tcp_component.tcp_msg_completed: for every message on
 * the list the user callback is invoked with tcp_lock released, then the
 * message is returned via MCA_OOB_TCP_MSG_RETURN with tcp_lock held again.
 *
 * Returns the number of callbacks that were invoked, or 0 when the
 * trylock test below makes the function bail out early.
 */
static int oob_tcp_windows_progress_callback( void )
{
    opal_list_t *completed = &mca_oob_tcp_component.tcp_msg_completed;
    opal_list_item_t* item;
    int event_count = 0;

    /* Only one thread at the time is allowed to execute callbacks */
    /* NOTE(review): this relies on opal_mutex_trylock() returning non-zero
       when the lock WAS acquired on this platform - confirm against the
       Windows mutex implementation; with a pthread-style "0 on success"
       convention the test would be inverted. */
    if( !opal_mutex_trylock(&windows_callback) ) {
        return 0;
    }

    OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock);
    for( item = opal_list_remove_first(completed);
         NULL != item;
         item = opal_list_remove_first(completed) ) {
        mca_oob_tcp_msg_t* msg = (mca_oob_tcp_msg_t*)item;

        /* run the user callback without holding the component lock */
        OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
        msg->msg_cbfunc( msg->msg_rc,
                         &msg->msg_peer,
                         msg->msg_uiov,
                         msg->msg_ucnt,
                         msg->msg_hdr.msg_tag,
                         msg->msg_cbdata);
        event_count++;

        /* re-take the lock before handing the message back and before
           touching the completed list again */
        OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock);
        MCA_OOB_TCP_MSG_RETURN(msg);
    }
    OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);

    opal_mutex_unlock(&windows_callback);

    return event_count;
}
#endif  /* defined(__WINDOWS__) */
|
2004-07-01 18:49:54 +04:00
|
|
|
|
2004-07-13 02:46:57 +04:00
|
|
|
/*
|
|
|
|
* Cleanup of global variables used by this module.
|
|
|
|
*/
|
|
|
|
|
2004-08-19 23:34:37 +04:00
|
|
|
int mca_oob_tcp_component_close(void)
|
2004-07-01 18:49:54 +04:00
|
|
|
{
|
2007-06-14 08:38:06 +04:00
|
|
|
#if defined(__WINDOWS__)
|
|
|
|
opal_progress_unregister(oob_tcp_windows_progress_callback);
|
2007-06-15 02:35:38 +04:00
|
|
|
OBJ_DESTRUCT( &windows_callback );
|
2004-11-02 16:14:34 +03:00
|
|
|
WSACleanup();
|
2007-06-14 08:38:06 +04:00
|
|
|
#endif /* defined(__WINDOWS__) */
|
2005-09-01 05:07:30 +04:00
|
|
|
|
2004-09-30 19:09:29 +04:00
|
|
|
/* cleanup resources */
|
Fix a number of OOB issues:
* Remove the connect() timeout code, as it had some nasty race conditions
when connections were established as the trigger was firing. A better
solution has been found for the cluster where this was needed, so just
removing it was easiest.
* When a fatal error (too many connection failures) occurs, set an error
on messages in the queue even if there isn't an active message. The
first message to any peer will be queued without being active (and
so will all subsequent messages until the connection is established),
and the orteds will hang until that first message completes. So if
an orted can never contact it's peer, it will never exit and just sit
waiting for that message to complete.
* Cover an interesting RST condition in the connect code. A connection
can complete the three-way handshake, the connector can even send
some data, but the server side will drop the connection because it
can't move it from the half-connected to fully-connected state because
of space shortage in the listen backlog queue. This causes a RST to
be received first time that recv() is called, which will be when waiting
for the remote side of the OOB ack. In this case, transition the
connection back into a CLOSED state and try to connect again.
* Add levels of debugging, rather than all or nothing, each building on
the previous level. 0 (default) is hard errors. 1 is connection
error debugging info. 2 is all connection info. 3 is more state
info. 4 includes all message info.
* Add some hopefully useful comments
This commit was SVN r14261.
2007-04-08 02:33:30 +04:00
|
|
|
|
|
|
|
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_pending_connections_lock);
|
|
|
|
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_connections_return_copy);
|
|
|
|
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_connections_return);
|
|
|
|
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_copy_out_connections);
|
|
|
|
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_pending_connections);
|
|
|
|
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_pending_connections_fl);
|
|
|
|
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_listen_thread);
|
|
|
|
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_match_cond);
|
|
|
|
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_match_lock);
|
|
|
|
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_msg_completed);
|
|
|
|
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_msg_recv);
|
|
|
|
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_msg_post);
|
2004-09-30 19:09:29 +04:00
|
|
|
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_events);
|
2004-08-03 01:24:00 +04:00
|
|
|
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_lock);
|
Fix a number of OOB issues:
* Remove the connect() timeout code, as it had some nasty race conditions
when connections were established as the trigger was firing. A better
solution has been found for the cluster where this was needed, so just
removing it was easiest.
* When a fatal error (too many connection failures) occurs, set an error
on messages in the queue even if there isn't an active message. The
first message to any peer will be queued without being active (and
so will all subsequent messages until the connection is established),
and the orteds will hang until that first message completes. So if
an orted can never contact it's peer, it will never exit and just sit
waiting for that message to complete.
* Cover an interesting RST condition in the connect code. A connection
can complete the three-way handshake, the connector can even send
some data, but the server side will drop the connection because it
can't move it from the half-connected to fully-connected state because
of space shortage in the listen backlog queue. This causes a RST to
be received first time that recv() is called, which will be when waiting
for the remote side of the OOB ack. In this case, transition the
connection back into a CLOSED state and try to connect again.
* Add levels of debugging, rather than all or nothing, each building on
the previous level. 0 (default) is hard errors. 1 is connection
error debugging info. 2 is all connection info. 3 is more state
info. 4 includes all message info.
* Add some hopefully useful comments
This commit was SVN r14261.
2007-04-08 02:33:30 +04:00
|
|
|
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_msgs);
|
|
|
|
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peer_free);
|
|
|
|
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peer_names);
|
|
|
|
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peers);
|
|
|
|
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_subscriptions);
|
|
|
|
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peer_list);
|
2007-03-17 02:11:45 +03:00
|
|
|
|
2006-02-12 04:33:29 +03:00
|
|
|
return ORTE_SUCCESS;
|
2004-07-01 18:49:54 +04:00
|
|
|
}
|
|
|
|
|
2004-07-13 02:46:57 +04:00
|
|
|
|
2004-08-03 01:24:00 +04:00
|
|
|
/*
|
|
|
|
* Called by mca_oob_tcp_recv_handler() when the TCP listen
|
|
|
|
* socket has pending connection requests. Accept incoming
|
|
|
|
* requests and queue for completion of the connection handshake.
|
2004-07-13 02:46:57 +04:00
|
|
|
*/
|
|
|
|
|
2007-04-25 05:55:40 +04:00
|
|
|
static void mca_oob_tcp_accept(int incoming_sd)
|
2004-07-13 02:46:57 +04:00
|
|
|
{
|
2004-08-03 01:24:00 +04:00
|
|
|
while(true) {
|
2007-04-25 05:55:40 +04:00
|
|
|
#if OPAL_WANT_IPV6
|
|
|
|
opal_socklen_t addrlen = sizeof(struct sockaddr_in6);
|
|
|
|
struct sockaddr_in6 addr;
|
|
|
|
#else
|
2006-09-15 01:29:51 +04:00
|
|
|
opal_socklen_t addrlen = sizeof(struct sockaddr_in);
|
2004-08-03 01:24:00 +04:00
|
|
|
struct sockaddr_in addr;
|
2007-04-25 05:55:40 +04:00
|
|
|
#endif
|
2004-09-30 19:09:29 +04:00
|
|
|
mca_oob_tcp_event_t* event;
|
|
|
|
int sd;
|
2005-09-01 05:07:30 +04:00
|
|
|
|
2007-04-25 05:55:40 +04:00
|
|
|
sd = accept(incoming_sd, (struct sockaddr*)&addr, &addrlen);
|
2004-08-03 01:24:00 +04:00
|
|
|
if(sd < 0) {
|
2007-04-25 05:55:40 +04:00
|
|
|
if(opal_socket_errno == EINTR) {
|
2004-08-03 01:24:00 +04:00
|
|
|
continue;
|
2007-04-25 05:55:40 +04:00
|
|
|
}
|
|
|
|
if(opal_socket_errno != EAGAIN && opal_socket_errno != EWOULDBLOCK) {
|
2006-12-14 21:20:43 +03:00
|
|
|
opal_output(0, "mca_oob_tcp_accept: accept() failed: %s (%d).",
|
|
|
|
strerror(opal_socket_errno), opal_socket_errno);
|
2007-04-25 05:55:40 +04:00
|
|
|
}
|
2004-08-03 01:24:00 +04:00
|
|
|
return;
|
|
|
|
}
|
2005-03-19 02:40:08 +03:00
|
|
|
|
2005-10-31 19:21:11 +03:00
|
|
|
/* setup socket options */
|
|
|
|
mca_oob_tcp_set_socket_options(sd);
|
|
|
|
|
2005-03-19 02:40:08 +03:00
|
|
|
/* log the accept */
|
Fix a number of OOB issues:
* Remove the connect() timeout code, as it had some nasty race conditions
when connections were established as the trigger was firing. A better
solution has been found for the cluster where this was needed, so just
removing it was easiest.
* When a fatal error (too many connection failures) occurs, set an error
on messages in the queue even if there isn't an active message. The
first message to any peer will be queued without being active (and
so will all subsequent messages until the connection is established),
and the orteds will hang until that first message completes. So if
an orted can never contact it's peer, it will never exit and just sit
waiting for that message to complete.
* Cover an interesting RST condition in the connect code. A connection
can complete the three-way handshake, the connector can even send
some data, but the server side will drop the connection because it
can't move it from the half-connected to fully-connected state because
of space shortage in the listen backlog queue. This causes a RST to
be received first time that recv() is called, which will be when waiting
for the remote side of the OOB ack. In this case, transition the
connection back into a CLOSED state and try to connect again.
* Add levels of debugging, rather than all or nothing, each building on
the previous level. 0 (default) is hard errors. 1 is connection
error debugging info. 2 is all connection info. 3 is more state
info. 4 includes all message info.
* Add some hopefully useful comments
This commit was SVN r14261.
2007-04-08 02:33:30 +04:00
|
|
|
if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_accept: %s:%d\n",
|
Fix a number of OOB issues:
* Remove the connect() timeout code, as it had some nasty race conditions
when connections were established as the trigger was firing. A better
solution has been found for the cluster where this was needed, so just
removing it was easiest.
* When a fatal error (too many connection failures) occurs, set an error
on messages in the queue even if there isn't an active message. The
first message to any peer will be queued without being active (and
so will all subsequent messages until the connection is established),
and the orteds will hang until that first message completes. So if
an orted can never contact it's peer, it will never exit and just sit
waiting for that message to complete.
* Cover an interesting RST condition in the connect code. A connection
can complete the three-way handshake, the connector can even send
some data, but the server side will drop the connection because it
can't move it from the half-connected to fully-connected state because
of space shortage in the listen backlog queue. This causes a RST to
be received first time that recv() is called, which will be when waiting
for the remote side of the OOB ack. In this case, transition the
connection back into a CLOSED state and try to connect again.
* Add levels of debugging, rather than all or nothing, each building on
the previous level. 0 (default) is hard errors. 1 is connection
error debugging info. 2 is all connection info. 3 is more state
info. 4 includes all message info.
* Add some hopefully useful comments
This commit was SVN r14261.
2007-04-08 02:33:30 +04:00
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name),
|
2007-05-17 05:17:59 +04:00
|
|
|
opal_net_get_hostname((struct sockaddr*) &addr),
|
|
|
|
opal_net_get_port((struct sockaddr*) &addr));
|
2005-03-19 02:40:08 +03:00
|
|
|
}
|
2005-09-01 05:07:30 +04:00
|
|
|
|
2004-08-03 01:24:00 +04:00
|
|
|
/* wait for receipt of peers process identifier to complete this connection */
|
2004-09-30 19:09:29 +04:00
|
|
|
event = OBJ_NEW(mca_oob_tcp_event_t);
|
2005-07-04 03:09:55 +04:00
|
|
|
opal_event_set(&event->event, sd, OPAL_EV_READ, mca_oob_tcp_recv_handler, event);
|
|
|
|
opal_event_add(&event->event, 0);
|
2004-08-03 01:24:00 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Create a listen socket and bind to all interfaces
|
|
|
|
*/
|
2005-09-01 05:07:30 +04:00
|
|
|
|
2007-04-25 05:55:40 +04:00
|
|
|
static int mca_oob_tcp_create_listen(int *target_sd, uint16_t af_family)
|
2004-08-03 01:24:00 +04:00
|
|
|
{
|
|
|
|
int flags;
|
2007-04-25 05:55:40 +04:00
|
|
|
#if OPAL_WANT_IPV6
|
|
|
|
struct sockaddr_in6 inaddr;
|
|
|
|
#else
|
2004-08-03 01:24:00 +04:00
|
|
|
struct sockaddr_in inaddr;
|
2007-04-25 05:55:40 +04:00
|
|
|
#endif
|
2006-09-15 01:29:51 +04:00
|
|
|
opal_socklen_t addrlen;
|
2004-08-12 17:29:37 +04:00
|
|
|
|
2004-08-03 01:24:00 +04:00
|
|
|
/* create a listen socket for incoming connections */
|
2007-04-25 05:55:40 +04:00
|
|
|
*target_sd = socket(af_family, SOCK_STREAM, 0);
|
|
|
|
|
|
|
|
if(*target_sd < 0) {
|
|
|
|
if (EAFNOSUPPORT != opal_socket_errno) {
|
|
|
|
opal_output(0,"mca_oob_tcp_component_init: socket() failed: %s (%d)",
|
|
|
|
strerror(opal_socket_errno), opal_socket_errno);
|
|
|
|
}
|
|
|
|
return ORTE_ERR_IN_ERRNO;
|
2004-08-03 01:24:00 +04:00
|
|
|
}
|
2005-10-31 19:21:11 +03:00
|
|
|
|
|
|
|
/* setup socket options */
|
2007-04-25 05:55:40 +04:00
|
|
|
mca_oob_tcp_set_socket_options(*target_sd);
|
2005-10-31 19:21:11 +03:00
|
|
|
|
|
|
|
/* bind address */
|
2004-08-03 01:24:00 +04:00
|
|
|
memset(&inaddr, 0, sizeof(inaddr));
|
2007-04-25 05:55:40 +04:00
|
|
|
#if OPAL_WANT_IPV6
|
|
|
|
{
|
|
|
|
struct addrinfo hints, *res = NULL;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
memset (&hints, 0, sizeof(hints));
|
|
|
|
hints.ai_family = af_family;
|
|
|
|
hints.ai_socktype = SOCK_STREAM;
|
|
|
|
hints.ai_flags = AI_PASSIVE;
|
|
|
|
|
|
|
|
if ((error = getaddrinfo(NULL, "0", &hints, &res))) {
|
|
|
|
opal_output (0,
|
|
|
|
"mca_oob_tcp_create_listen: unable to resolve. %s\n",
|
|
|
|
gai_strerror (error));
|
|
|
|
return ORTE_ERROR;
|
|
|
|
}
|
|
|
|
|
|
|
|
memcpy (&inaddr, res->ai_addr, res->ai_addrlen);
|
|
|
|
addrlen = res->ai_addrlen;
|
|
|
|
freeaddrinfo (res);
|
|
|
|
|
2007-06-28 22:52:15 +04:00
|
|
|
#ifdef IPV6_V6ONLY
|
2007-04-25 05:55:40 +04:00
|
|
|
/* in case of AF_INET6, disable v4-mapped addresses */
|
|
|
|
if (AF_INET6 == af_family) {
|
|
|
|
int flg = 0;
|
|
|
|
if (setsockopt (*target_sd, IPPROTO_IPV6, IPV6_V6ONLY,
|
|
|
|
&flg, sizeof (flg)) < 0) {
|
|
|
|
opal_output(0,
|
|
|
|
"mca_oob_tcp_create_listen: unable to disable v4-mapped addresses\n");
|
|
|
|
}
|
|
|
|
}
|
2007-06-28 22:52:15 +04:00
|
|
|
#endif /* IPV6_V6ONLY */
|
2007-04-25 05:55:40 +04:00
|
|
|
}
|
|
|
|
#else
|
2004-08-03 01:24:00 +04:00
|
|
|
inaddr.sin_family = AF_INET;
|
|
|
|
inaddr.sin_addr.s_addr = INADDR_ANY;
|
2004-08-19 23:34:37 +04:00
|
|
|
inaddr.sin_port = 0;
|
2007-04-25 05:55:40 +04:00
|
|
|
addrlen = sizeof(struct sockaddr_in);
|
|
|
|
#endif
|
2004-08-12 17:29:37 +04:00
|
|
|
|
2007-04-25 05:55:40 +04:00
|
|
|
if(bind(*target_sd, (struct sockaddr*)&inaddr, addrlen) < 0) {
|
2006-12-14 21:20:43 +03:00
|
|
|
opal_output(0,"mca_oob_tcp_create_listen: bind() failed: %s (%d)",
|
|
|
|
strerror(opal_socket_errno), opal_socket_errno);
|
2006-02-12 04:33:29 +03:00
|
|
|
return ORTE_ERROR;
|
2004-08-03 01:24:00 +04:00
|
|
|
}
|
2004-08-12 17:29:37 +04:00
|
|
|
|
2004-09-02 03:07:40 +04:00
|
|
|
/* resolve system assigned port */
|
2007-04-25 05:55:40 +04:00
|
|
|
if(getsockname(*target_sd, (struct sockaddr*)&inaddr, &addrlen) < 0) {
|
2006-12-14 21:20:43 +03:00
|
|
|
opal_output(0, "mca_oob_tcp_create_listen: getsockname(): %s (%d)",
|
|
|
|
strerror(opal_socket_errno), opal_socket_errno);
|
2006-02-12 04:33:29 +03:00
|
|
|
return ORTE_ERROR;
|
2004-08-03 01:24:00 +04:00
|
|
|
}
|
2007-04-25 05:55:40 +04:00
|
|
|
#if OPAL_WANT_IPV6
|
|
|
|
if (AF_INET == af_family) {
|
|
|
|
mca_oob_tcp_component.tcp_listen_port = inaddr.sin6_port;
|
|
|
|
}
|
|
|
|
if (AF_INET6 == af_family) {
|
|
|
|
mca_oob_tcp_component.tcp6_listen_port = inaddr.sin6_port;
|
|
|
|
}
|
|
|
|
#else
|
2004-08-03 01:24:00 +04:00
|
|
|
mca_oob_tcp_component.tcp_listen_port = inaddr.sin_port;
|
2007-04-25 05:55:40 +04:00
|
|
|
#endif
|
|
|
|
|
2004-08-03 01:24:00 +04:00
|
|
|
/* setup listen backlog to maximum allowed by kernel */
|
2007-04-25 05:55:40 +04:00
|
|
|
if(listen(*target_sd, SOMAXCONN) < 0) {
|
2006-12-14 21:20:43 +03:00
|
|
|
opal_output(0, "mca_oob_tcp_component_init: listen(): %s (%d)",
|
|
|
|
strerror(opal_socket_errno), opal_socket_errno);
|
2006-02-12 04:33:29 +03:00
|
|
|
return ORTE_ERROR;
|
2004-08-03 01:24:00 +04:00
|
|
|
}
|
2005-09-01 05:07:30 +04:00
|
|
|
|
2004-08-03 01:24:00 +04:00
|
|
|
/* set socket up to be non-blocking, otherwise accept could block */
|
2007-04-25 05:55:40 +04:00
|
|
|
if((flags = fcntl(*target_sd, F_GETFL, 0)) < 0) {
|
2006-12-14 21:20:43 +03:00
|
|
|
opal_output(0, "mca_oob_tcp_component_init: fcntl(F_GETFL) failed: %s (%d)",
|
|
|
|
strerror(opal_socket_errno), opal_socket_errno);
|
2006-02-12 04:33:29 +03:00
|
|
|
return ORTE_ERROR;
|
2004-08-03 01:24:00 +04:00
|
|
|
} else {
|
|
|
|
flags |= O_NONBLOCK;
|
2007-04-25 05:55:40 +04:00
|
|
|
if(fcntl(*target_sd, F_SETFL, flags) < 0) {
|
2006-12-14 21:20:43 +03:00
|
|
|
opal_output(0, "mca_oob_tcp_component_init: fcntl(F_SETFL) failed: %s (%d)",
|
|
|
|
strerror(opal_socket_errno), opal_socket_errno);
|
2006-02-12 04:33:29 +03:00
|
|
|
return ORTE_ERROR;
|
2004-08-03 01:24:00 +04:00
|
|
|
}
|
|
|
|
}
|
2005-09-01 05:07:30 +04:00
|
|
|
|
2004-08-03 01:24:00 +04:00
|
|
|
/* register listen port */
|
2007-04-25 05:55:40 +04:00
|
|
|
#if OPAL_WANT_IPV6
|
|
|
|
if (AF_INET == af_family) {
|
|
|
|
opal_event_set(
|
|
|
|
&mca_oob_tcp_component.tcp_recv_event,
|
|
|
|
*target_sd,
|
|
|
|
OPAL_EV_READ|OPAL_EV_PERSIST,
|
|
|
|
mca_oob_tcp_recv_handler,
|
|
|
|
0);
|
|
|
|
opal_event_add(&mca_oob_tcp_component.tcp_recv_event, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (AF_INET6 == af_family) {
|
|
|
|
opal_event_set(
|
|
|
|
&mca_oob_tcp_component.tcp6_recv_event,
|
|
|
|
*target_sd,
|
|
|
|
OPAL_EV_READ|OPAL_EV_PERSIST,
|
|
|
|
mca_oob_tcp_recv_handler,
|
|
|
|
0);
|
|
|
|
opal_event_add(&mca_oob_tcp_component.tcp6_recv_event, 0);
|
|
|
|
}
|
|
|
|
#else
|
2005-07-04 03:09:55 +04:00
|
|
|
opal_event_set(
|
2007-04-25 05:55:40 +04:00
|
|
|
&mca_oob_tcp_component.tcp_recv_event,
|
|
|
|
*target_sd,
|
|
|
|
OPAL_EV_READ|OPAL_EV_PERSIST,
|
|
|
|
mca_oob_tcp_recv_handler,
|
|
|
|
0);
|
2005-07-04 03:09:55 +04:00
|
|
|
opal_event_add(&mca_oob_tcp_component.tcp_recv_event, 0);
|
2007-04-25 05:55:40 +04:00
|
|
|
#endif
|
2006-02-12 04:33:29 +03:00
|
|
|
return ORTE_SUCCESS;
|
2004-08-03 01:24:00 +04:00
|
|
|
}
|
2004-07-13 02:46:57 +04:00
|
|
|
|
2004-07-10 05:27:07 +04:00
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
/*
 * Body of the accept thread (installed as tcp_listen_thread.t_run).
 *
 * Until mca_oob_tcp_component.tcp_shutdown is set, the thread waits up to
 * 10 ms in select() for the listen socket to become readable and then
 * accepts connections into free-list items collected on
 * tcp_copy_in_connections.  The inner loop stops after tcp_copy_spin_count
 * unsuccessful accept() attempts or once tcp_copy_max_size connections
 * have been gathered.  The gathered connections are then appended to
 * tcp_pending_connections under tcp_pending_connections_lock, and the
 * items found on tcp_connections_return_copy are handed back to the free
 * list.
 *
 * obj  unused.
 *
 * Returns NULL, either on shutdown or after a hard accept() failure.
 */
static void* mca_oob_tcp_listen_thread(opal_object_t *obj)
{
    int rc, count;
    opal_socklen_t addrlen;
    opal_free_list_item_t *fl_item;
    mca_oob_tcp_pending_connection_t *item;
    struct timeval timeout;
    fd_set readfds;

    while (false == mca_oob_tcp_component.tcp_shutdown) {
        count = 0;

        FD_ZERO(&readfds);
        FD_SET(mca_oob_tcp_component.tcp_listen_sd, &readfds);
        timeout.tv_sec = 0;
        timeout.tv_usec = 10000;

        rc = select(mca_oob_tcp_component.tcp_listen_sd + 1, &readfds,
                    NULL, NULL, &timeout);
        if (rc < 0) {
            if (EAGAIN != opal_socket_errno && EINTR != opal_socket_errno) {
                perror("select");
            }
            continue;
        }

        while (count < mca_oob_tcp_component.tcp_copy_spin_count &&
               opal_list_get_size(&mca_oob_tcp_component.tcp_copy_in_connections) <
               (size_t) mca_oob_tcp_component.tcp_copy_max_size) {
            OPAL_FREE_LIST_WAIT(&mca_oob_tcp_component.tcp_pending_connections_fl,
                                fl_item, rc);
            item = (mca_oob_tcp_pending_connection_t*) fl_item;

            /* accept() updates the length in place: reset it every time */
            addrlen = sizeof(struct sockaddr_in);
            item->fd = accept(mca_oob_tcp_component.tcp_listen_sd,
                              (struct sockaddr*)&(item->addr), &addrlen);
            if (item->fd < 0) {
                /* capture the error before any other call can disturb it */
                const int accept_errno = opal_socket_errno;

                /* the item is back on the free list from here on and must
                   not be touched again; there is no descriptor to close
                   since accept() failed */
                OPAL_FREE_LIST_RETURN(&mca_oob_tcp_component.tcp_pending_connections_fl,
                                      fl_item);
                if (mca_oob_tcp_component.tcp_shutdown) return NULL;

                /* only errors that are neither "nothing pending" nor an
                   interrupted call are fatal for the thread */
                if (EAGAIN != accept_errno && EWOULDBLOCK != accept_errno &&
                    EINTR != accept_errno) {
                    opal_output(0, "mca_oob_tcp_accept: accept() failed: %s (%d).",
                                strerror(accept_errno), accept_errno);
                    return NULL;
                }

                count++;
                continue;
            }

            if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
                /* sin_port is kept in network byte order: convert for display */
                opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_listen_thread: (%d, %d) %s:%d\n",
                            ORTE_NAME_ARGS(orte_process_info.my_name),
                            item->fd, opal_socket_errno,
                            inet_ntoa(item->addr.sin_addr),
                            ntohs(item->addr.sin_port));
            }

            opal_list_append(&mca_oob_tcp_component.tcp_copy_in_connections,
                             (opal_list_item_t*) item);
        }

        if (0 < opal_list_get_size(&mca_oob_tcp_component.tcp_copy_in_connections)) {
            opal_mutex_lock(&mca_oob_tcp_component.tcp_pending_connections_lock);
            /* publish the new connections ... */
            opal_list_join(&mca_oob_tcp_component.tcp_pending_connections,
                           opal_list_get_end(&mca_oob_tcp_component.tcp_pending_connections),
                           &mca_oob_tcp_component.tcp_copy_in_connections);
            /* ... and recycle the items that were handed back */
            while (NULL != (fl_item = (opal_free_list_item_t*) opal_list_remove_first(&mca_oob_tcp_component.tcp_connections_return_copy))) {
                OPAL_FREE_LIST_RETURN(&mca_oob_tcp_component.tcp_pending_connections_fl, fl_item);
            }
            opal_mutex_unlock(&mca_oob_tcp_component.tcp_pending_connections_lock);
        }
    }

    return NULL;
}
|
|
|
|
|
|
|
|
/* called from opal_progress() to create the oob contact information
|
|
|
|
for the file descriptors accepted() by the accept thread. */
|
|
|
|
static int mca_oob_tcp_listen_progress(void)
|
|
|
|
{
|
|
|
|
int count = 0;
|
|
|
|
mca_oob_tcp_pending_connection_t *item;
|
|
|
|
mca_oob_tcp_event_t* event;
|
|
|
|
#if OPAL_TIMER_USEC_NATIVE
|
|
|
|
opal_timer_t now = opal_timer_base_get_usec();
|
|
|
|
#else
|
|
|
|
opal_timer_t now = opal_timer_base_get_cycles();
|
|
|
|
#endif /* OPAL_TIMER_USEC_NATIVE */
|
|
|
|
|
|
|
|
/* if we've not pulled pending connections for a while OR we've
|
|
|
|
hit the high water mark of pending connections, grab all the
|
|
|
|
pending connections */
|
|
|
|
if ((now - mca_oob_tcp_component.tcp_last_copy_time >
|
|
|
|
mca_oob_tcp_component.tcp_copy_delta) ||
|
|
|
|
((size_t) mca_oob_tcp_component.tcp_copy_max_size <
|
|
|
|
opal_list_get_size(&mca_oob_tcp_component.tcp_pending_connections))) {
|
|
|
|
|
|
|
|
/* copy the pending connections from the list the accept
|
|
|
|
thread is inserting into into a temporary list for us to
|
Fix a number of OOB issues:
* Remove the connect() timeout code, as it had some nasty race conditions
when connections were established as the trigger was firing. A better
solution has been found for the cluster where this was needed, so just
removing it was easiest.
* When a fatal error (too many connection failures) occurs, set an error
on messages in the queue even if there isn't an active message. The
first message to any peer will be queued without being active (and
so will all subsequent messages until the connection is established),
and the orteds will hang until that first message completes. So if
an orted can never contact it's peer, it will never exit and just sit
waiting for that message to complete.
* Cover an interesting RST condition in the connect code. A connection
can complete the three-way handshake, the connector can even send
some data, but the server side will drop the connection because it
can't move it from the half-connected to fully-connected state because
of space shortage in the listen backlog queue. This causes a RST to
be received first time that recv() is called, which will be when waiting
for the remote side of the OOB ack. In this case, transition the
connection back into a CLOSED state and try to connect again.
* Add levels of debugging, rather than all or nothing, each building on
the previous level. 0 (default) is hard errors. 1 is connection
error debugging info. 2 is all connection info. 3 is more state
info. 4 includes all message info.
* Add some hopefully useful comments
This commit was SVN r14261.
2007-04-08 02:33:30 +04:00
|
|
|
process from. Then copy the returned free list items into
|
|
|
|
that thread's return list, so it can free them soonish.
|
|
|
|
This is an O(1) operation, so we minimize the lock time. */
|
2006-09-15 01:29:51 +04:00
|
|
|
opal_mutex_lock(&mca_oob_tcp_component.tcp_pending_connections_lock);
|
|
|
|
opal_list_join(&mca_oob_tcp_component.tcp_copy_out_connections,
|
|
|
|
opal_list_get_end(&mca_oob_tcp_component.tcp_copy_out_connections),
|
|
|
|
&mca_oob_tcp_component.tcp_pending_connections);
|
Fix a number of OOB issues:
* Remove the connect() timeout code, as it had some nasty race conditions
when connections were established as the trigger was firing. A better
solution has been found for the cluster where this was needed, so just
removing it was easiest.
* When a fatal error (too many connection failures) occurs, set an error
on messages in the queue even if there isn't an active message. The
first message to any peer will be queued without being active (and
so will all subsequent messages until the connection is established),
and the orteds will hang until that first message completes. So if
an orted can never contact it's peer, it will never exit and just sit
waiting for that message to complete.
* Cover an interesting RST condition in the connect code. A connection
can complete the three-way handshake, the connector can even send
some data, but the server side will drop the connection because it
can't move it from the half-connected to fully-connected state because
of space shortage in the listen backlog queue. This causes a RST to
be received first time that recv() is called, which will be when waiting
for the remote side of the OOB ack. In this case, transition the
connection back into a CLOSED state and try to connect again.
* Add levels of debugging, rather than all or nothing, each building on
the previous level. 0 (default) is hard errors. 1 is connection
error debugging info. 2 is all connection info. 3 is more state
info. 4 includes all message info.
* Add some hopefully useful comments
This commit was SVN r14261.
2007-04-08 02:33:30 +04:00
|
|
|
opal_list_join(&mca_oob_tcp_component.tcp_connections_return_copy,
|
|
|
|
opal_list_get_end(&mca_oob_tcp_component.tcp_connections_return_copy),
|
|
|
|
&mca_oob_tcp_component.tcp_connections_return);
|
2006-09-15 01:29:51 +04:00
|
|
|
opal_mutex_unlock(&mca_oob_tcp_component.tcp_pending_connections_lock);
|
|
|
|
|
|
|
|
/* process al the connections */
|
|
|
|
while (NULL != (item = (mca_oob_tcp_pending_connection_t*)
|
|
|
|
opal_list_remove_first(&mca_oob_tcp_component.
|
|
|
|
tcp_copy_out_connections))) {
|
|
|
|
|
|
|
|
/* setup socket options */
|
|
|
|
mca_oob_tcp_set_socket_options(item->fd);
|
|
|
|
|
|
|
|
/* log the accept */
|
Fix a number of OOB issues:
* Remove the connect() timeout code, as it had some nasty race conditions
when connections were established as the trigger was firing. A better
solution has been found for the cluster where this was needed, so just
removing it was easiest.
* When a fatal error (too many connection failures) occurs, set an error
on messages in the queue even if there isn't an active message. The
first message to any peer will be queued without being active (and
so will all subsequent messages until the connection is established),
and the orteds will hang until that first message completes. So if
an orted can never contact it's peer, it will never exit and just sit
waiting for that message to complete.
* Cover an interesting RST condition in the connect code. A connection
can complete the three-way handshake, the connector can even send
some data, but the server side will drop the connection because it
can't move it from the half-connected to fully-connected state because
of space shortage in the listen backlog queue. This causes a RST to
be received first time that recv() is called, which will be when waiting
for the remote side of the OOB ack. In this case, transition the
connection back into a CLOSED state and try to connect again.
* Add levels of debugging, rather than all or nothing, each building on
the previous level. 0 (default) is hard errors. 1 is connection
error debugging info. 2 is all connection info. 3 is more state
info. 4 includes all message info.
* Add some hopefully useful comments
This commit was SVN r14261.
2007-04-08 02:33:30 +04:00
|
|
|
if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
|
2006-09-15 01:29:51 +04:00
|
|
|
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_listen_progress: %s:%d\n",
|
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name),
|
|
|
|
inet_ntoa(item->addr.sin_addr),
|
|
|
|
item->addr.sin_port);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* wait for receipt of peers process identifier to
|
|
|
|
complete this connection */
|
|
|
|
event = OBJ_NEW(mca_oob_tcp_event_t);
|
|
|
|
opal_event_set(&event->event, item->fd, OPAL_EV_READ, mca_oob_tcp_recv_handler, event);
|
|
|
|
opal_event_add(&event->event, 0);
|
Fix a number of OOB issues:
* Remove the connect() timeout code, as it had some nasty race conditions
when connections were established as the trigger was firing. A better
solution has been found for the cluster where this was needed, so just
removing it was easiest.
* When a fatal error (too many connection failures) occurs, set an error
on messages in the queue even if there isn't an active message. The
first message to any peer will be queued without being active (and
so will all subsequent messages until the connection is established),
and the orteds will hang until that first message completes. So if
an orted can never contact it's peer, it will never exit and just sit
waiting for that message to complete.
* Cover an interesting RST condition in the connect code. A connection
can complete the three-way handshake, the connector can even send
some data, but the server side will drop the connection because it
can't move it from the half-connected to fully-connected state because
of space shortage in the listen backlog queue. This causes a RST to
be received first time that recv() is called, which will be when waiting
for the remote side of the OOB ack. In this case, transition the
connection back into a CLOSED state and try to connect again.
* Add levels of debugging, rather than all or nothing, each building on
the previous level. 0 (default) is hard errors. 1 is connection
error debugging info. 2 is all connection info. 3 is more state
info. 4 includes all message info.
* Add some hopefully useful comments
This commit was SVN r14261.
2007-04-08 02:33:30 +04:00
|
|
|
/* put on the needs returning list */
|
|
|
|
opal_list_append(&mca_oob_tcp_component.tcp_connections_return,
|
|
|
|
(opal_list_item_t*) item);
|
2006-09-15 01:29:51 +04:00
|
|
|
count++;
|
|
|
|
}
|
|
|
|
|
|
|
|
mca_oob_tcp_component.tcp_last_copy_time = now;
|
|
|
|
}
|
|
|
|
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static int mca_oob_tcp_create_listen_thread(void)
|
|
|
|
{
|
|
|
|
struct sockaddr_in inaddr;
|
|
|
|
opal_socklen_t addrlen;
|
|
|
|
int flags;
|
|
|
|
|
|
|
|
/* create a listen socket for incoming connections */
|
|
|
|
mca_oob_tcp_component.tcp_listen_sd = socket(AF_INET, SOCK_STREAM, 0);
|
|
|
|
if(mca_oob_tcp_component.tcp_listen_sd < 0) {
|
2006-12-14 21:20:43 +03:00
|
|
|
opal_output(0,"mca_oob_tcp_component_init: socket() failed: %s (%d)",
|
|
|
|
strerror(opal_socket_errno), opal_socket_errno);
|
2006-09-15 01:29:51 +04:00
|
|
|
return ORTE_ERROR;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* setup socket options */
|
|
|
|
mca_oob_tcp_set_socket_options(mca_oob_tcp_component.tcp_listen_sd);
|
|
|
|
|
|
|
|
/* bind address */
|
|
|
|
memset(&inaddr, 0, sizeof(inaddr));
|
|
|
|
inaddr.sin_family = AF_INET;
|
|
|
|
inaddr.sin_addr.s_addr = INADDR_ANY;
|
|
|
|
inaddr.sin_port = 0;
|
|
|
|
|
|
|
|
if(bind(mca_oob_tcp_component.tcp_listen_sd, (struct sockaddr*)&inaddr, sizeof(inaddr)) < 0) {
|
2006-12-14 21:20:43 +03:00
|
|
|
opal_output(0,"mca_oob_tcp_create_listen: bind() failed: %s (%d)",
|
|
|
|
strerror(opal_socket_errno), opal_socket_errno);
|
2006-09-15 01:29:51 +04:00
|
|
|
return ORTE_ERROR;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* resolve system assigned port */
|
|
|
|
addrlen = sizeof(struct sockaddr_in);
|
|
|
|
if(getsockname(mca_oob_tcp_component.tcp_listen_sd, (struct sockaddr*)&inaddr, &addrlen) < 0) {
|
2006-12-14 21:20:43 +03:00
|
|
|
opal_output(0, "mca_oob_tcp_create_listen: getsockname() failed: %s (%d)",
|
|
|
|
strerror(opal_socket_errno), opal_socket_errno);
|
2006-09-15 01:29:51 +04:00
|
|
|
return ORTE_ERROR;
|
|
|
|
}
|
|
|
|
mca_oob_tcp_component.tcp_listen_port = inaddr.sin_port;
|
|
|
|
|
|
|
|
/* setup listen backlog to maximum allowed by kernel */
|
2007-02-09 23:13:02 +03:00
|
|
|
if(listen(mca_oob_tcp_component.tcp_listen_sd, SOMAXCONN) < 0) {
|
2006-12-14 21:20:43 +03:00
|
|
|
opal_output(0, "mca_oob_tcp_component_init: listen() failed: %s (%d)",
|
|
|
|
strerror(opal_socket_errno), opal_socket_errno);
|
2006-09-15 01:29:51 +04:00
|
|
|
return ORTE_ERROR;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* set socket up to be non-blocking, otherwise accept could block */
|
|
|
|
if((flags = fcntl(mca_oob_tcp_component.tcp_listen_sd, F_GETFL, 0)) < 0) {
|
2006-12-14 21:20:43 +03:00
|
|
|
opal_output(0, "mca_oob_tcp_component_init: fcntl(F_GETFL) failed: %s (%d)",
|
|
|
|
strerror(opal_socket_errno), opal_socket_errno);
|
2006-09-15 01:29:51 +04:00
|
|
|
return ORTE_ERROR;
|
|
|
|
} else {
|
|
|
|
flags |= O_NONBLOCK;
|
|
|
|
if(fcntl(mca_oob_tcp_component.tcp_listen_sd, F_SETFL, flags) < 0) {
|
2006-12-14 21:20:43 +03:00
|
|
|
opal_output(0, "mca_oob_tcp_component_init: fcntl(F_SETFL) failed: %s (%d)",
|
|
|
|
strerror(opal_socket_errno), opal_socket_errno);
|
2006-09-15 01:29:51 +04:00
|
|
|
return ORTE_ERROR;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* start the listen thread */
|
|
|
|
mca_oob_tcp_component.tcp_listen_thread.t_run = mca_oob_tcp_listen_thread;
|
|
|
|
mca_oob_tcp_component.tcp_listen_thread.t_arg = NULL;
|
|
|
|
|
|
|
|
return opal_thread_start(&mca_oob_tcp_component.tcp_listen_thread);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2004-07-01 18:49:54 +04:00
|
|
|
/*
|
2005-05-05 20:31:40 +04:00
|
|
|
* Handle probe
|
2004-07-01 18:49:54 +04:00
|
|
|
*/
|
2005-05-05 20:31:40 +04:00
|
|
|
static void mca_oob_tcp_recv_probe(int sd, mca_oob_tcp_hdr_t* hdr)
|
2004-08-03 01:24:00 +04:00
|
|
|
{
|
2005-05-19 20:16:19 +04:00
|
|
|
unsigned char* ptr = (unsigned char*)hdr;
|
2005-05-05 20:31:40 +04:00
|
|
|
size_t cnt = 0;
|
|
|
|
|
2005-05-19 20:16:19 +04:00
|
|
|
hdr->msg_type = MCA_OOB_TCP_PROBE;
|
2005-05-05 20:31:40 +04:00
|
|
|
hdr->msg_dst = hdr->msg_src;
|
|
|
|
hdr->msg_src = *orte_process_info.my_name;
|
2005-05-19 20:16:19 +04:00
|
|
|
MCA_OOB_TCP_HDR_HTON(hdr);
|
|
|
|
|
2005-05-05 20:31:40 +04:00
|
|
|
while(cnt < sizeof(mca_oob_tcp_hdr_t)) {
|
|
|
|
int retval = send(sd, (char *)ptr+cnt, sizeof(mca_oob_tcp_hdr_t)-cnt, 0);
|
|
|
|
if(retval < 0) {
|
2006-08-15 00:14:44 +04:00
|
|
|
if(opal_socket_errno != EINTR && opal_socket_errno != EAGAIN && opal_socket_errno != EWOULDBLOCK) {
|
2006-12-14 21:20:43 +03:00
|
|
|
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_recv_probe: send() failed: %s (%d)\n",
|
2005-05-05 20:31:40 +04:00
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name),
|
|
|
|
ORTE_NAME_ARGS(&(hdr->msg_src)),
|
2006-12-14 21:20:43 +03:00
|
|
|
strerror(opal_socket_errno),
|
2006-08-15 00:14:44 +04:00
|
|
|
opal_socket_errno);
|
2006-08-23 07:32:36 +04:00
|
|
|
CLOSE_THE_SOCKET(sd);
|
2005-05-05 20:31:40 +04:00
|
|
|
return;
|
2004-09-09 23:21:34 +04:00
|
|
|
}
|
2005-05-05 20:31:40 +04:00
|
|
|
continue;
|
2004-09-02 03:07:40 +04:00
|
|
|
}
|
2005-05-05 20:31:40 +04:00
|
|
|
cnt += retval;
|
2004-08-03 01:24:00 +04:00
|
|
|
}
|
2006-08-23 07:32:36 +04:00
|
|
|
CLOSE_THE_SOCKET(sd);
|
2005-05-05 20:31:40 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Handle connection request
|
|
|
|
*/
|
|
|
|
static void mca_oob_tcp_recv_connect(int sd, mca_oob_tcp_hdr_t* hdr)
|
|
|
|
{
|
|
|
|
mca_oob_tcp_peer_t* peer;
|
|
|
|
int flags;
|
|
|
|
int cmpval;
|
2004-08-03 01:24:00 +04:00
|
|
|
|
|
|
|
/* now set socket up to be non-blocking */
|
|
|
|
if((flags = fcntl(sd, F_GETFL, 0)) < 0) {
|
2006-12-14 21:20:43 +03:00
|
|
|
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_recv_handler: fcntl(F_GETFL) failed: %s (%d)",
|
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name), strerror(opal_socket_errno), opal_socket_errno);
|
2004-08-03 01:24:00 +04:00
|
|
|
} else {
|
|
|
|
flags |= O_NONBLOCK;
|
|
|
|
if(fcntl(sd, F_SETFL, flags) < 0) {
|
2006-12-14 21:20:43 +03:00
|
|
|
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_recv_handler: fcntl(F_SETFL) failed: %s (%d)",
|
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name), strerror(opal_socket_errno), opal_socket_errno);
|
2004-08-03 01:24:00 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 22:34:59 +03:00
|
|
|
/* check for invalid name - if this is true - we allocate a name from the name server
|
2005-09-01 05:07:30 +04:00
|
|
|
* and return to the peer
|
2004-08-25 21:39:08 +04:00
|
|
|
*/
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 22:34:59 +03:00
|
|
|
cmpval = orte_ns.compare_fields(ORTE_NS_CMP_ALL, &hdr->msg_src, ORTE_NAME_INVALID);
|
|
|
|
if (cmpval == ORTE_EQUAL) {
|
|
|
|
if (ORTE_SUCCESS != orte_ns.create_jobid(&hdr->msg_src.jobid, NULL)) {
|
2005-03-14 23:57:21 +03:00
|
|
|
return;
|
|
|
|
}
|
2005-05-05 20:31:40 +04:00
|
|
|
if (ORTE_SUCCESS != orte_ns.reserve_range(hdr->msg_src.jobid, 1, &hdr->msg_src.vpid)) {
|
2005-03-14 23:57:21 +03:00
|
|
|
return;
|
|
|
|
}
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 22:34:59 +03:00
|
|
|
hdr->msg_src.cellid = ORTE_PROC_MY_NAME->cellid;
|
2004-08-25 21:39:08 +04:00
|
|
|
}
|
|
|
|
|
2004-08-03 01:24:00 +04:00
|
|
|
/* lookup the corresponding process */
|
2005-05-05 20:31:40 +04:00
|
|
|
peer = mca_oob_tcp_peer_lookup(&hdr->msg_src);
|
2004-08-03 01:24:00 +04:00
|
|
|
if(NULL == peer) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_recv_handler: unable to locate peer",
|
2005-03-14 23:57:21 +03:00
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name));
|
2006-08-23 07:32:36 +04:00
|
|
|
CLOSE_THE_SOCKET(sd);
|
2004-08-03 01:24:00 +04:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
/* is the peer instance willing to accept this connection */
|
|
|
|
if(mca_oob_tcp_peer_accept(peer, sd) == false) {
|
Fix a number of OOB issues:
* Remove the connect() timeout code, as it had some nasty race conditions
when connections were established as the trigger was firing. A better
solution has been found for the cluster where this was needed, so just
removing it was easiest.
* When a fatal error (too many connection failures) occurs, set an error
on messages in the queue even if there isn't an active message. The
first message to any peer will be queued without being active (and
so will all subsequent messages until the connection is established),
and the orteds will hang until that first message completes. So if
an orted can never contact it's peer, it will never exit and just sit
waiting for that message to complete.
* Cover an interesting RST condition in the connect code. A connection
can complete the three-way handshake, the connector can even send
some data, but the server side will drop the connection because it
can't move it from the half-connected to fully-connected state because
of space shortage in the listen backlog queue. This causes a RST to
be received first time that recv() is called, which will be when waiting
for the remote side of the OOB ack. In this case, transition the
connection back into a CLOSED state and try to connect again.
* Add levels of debugging, rather than all or nothing, each building on
the previous level. 0 (default) is hard errors. 1 is connection
error debugging info. 2 is all connection info. 3 is more state
info. 4 includes all message info.
* Add some hopefully useful comments
This commit was SVN r14261.
2007-04-08 02:33:30 +04:00
|
|
|
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT_FAIL) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_recv_handler: "
|
2005-05-08 17:22:55 +04:00
|
|
|
"rejected connection from [%lu,%lu,%lu] connection state %d",
|
2005-03-14 23:57:21 +03:00
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name),
|
|
|
|
ORTE_NAME_ARGS(&(peer->peer_name)),
|
2005-05-05 20:31:40 +04:00
|
|
|
ORTE_NAME_ARGS(&(hdr->msg_src)),
|
2004-09-10 01:57:45 +04:00
|
|
|
peer->peer_state);
|
|
|
|
}
|
2006-08-23 07:32:36 +04:00
|
|
|
CLOSE_THE_SOCKET(sd);
|
2004-08-03 01:24:00 +04:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-05-05 20:31:40 +04:00
|
|
|
/*
|
|
|
|
* Event callback when there is data available on the registered
|
|
|
|
* socket to recv.
|
|
|
|
*/
|
|
|
|
|
|
|
|
static void mca_oob_tcp_recv_handler(int sd, short flags, void* user)
|
|
|
|
{
|
|
|
|
mca_oob_tcp_hdr_t hdr;
|
|
|
|
mca_oob_tcp_event_t* event = (mca_oob_tcp_event_t *)user;
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
/* accept new connections on the listen socket */
|
2007-04-25 05:55:40 +04:00
|
|
|
#if OPAL_WANT_IPV6
|
|
|
|
if((mca_oob_tcp_component.tcp_listen_sd == sd) ||
|
|
|
|
(mca_oob_tcp_component.tcp6_listen_sd == sd)) {
|
|
|
|
#else
|
2005-05-05 20:31:40 +04:00
|
|
|
if(mca_oob_tcp_component.tcp_listen_sd == sd) {
|
2007-04-25 05:55:40 +04:00
|
|
|
#endif
|
|
|
|
mca_oob_tcp_accept(sd);
|
2005-05-05 20:31:40 +04:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
OBJ_RELEASE(event);
|
|
|
|
|
2006-01-05 01:29:09 +03:00
|
|
|
/* Some mem checkers don't realize that hdr will guarantee to be
|
|
|
|
fully filled in during the read(), below :-( */
|
|
|
|
OMPI_DEBUG_ZERO(hdr);
|
|
|
|
|
2005-05-05 20:31:40 +04:00
|
|
|
/* recv the process identifier */
|
|
|
|
while((rc = recv(sd, (char *)&hdr, sizeof(hdr), 0)) != sizeof(hdr)) {
|
|
|
|
if(rc >= 0) {
|
Fix a number of OOB issues:
* Remove the connect() timeout code, as it had some nasty race conditions
when connections were established as the trigger was firing. A better
solution has been found for the cluster where this was needed, so just
removing it was easiest.
* When a fatal error (too many connection failures) occurs, set an error
on messages in the queue even if there isn't an active message. The
first message to any peer will be queued without being active (and
so will all subsequent messages until the connection is established),
and the orteds will hang until that first message completes. So if
an orted can never contact it's peer, it will never exit and just sit
waiting for that message to complete.
* Cover an interesting RST condition in the connect code. A connection
can complete the three-way handshake, the connector can even send
some data, but the server side will drop the connection because it
can't move it from the half-connected to fully-connected state because
of space shortage in the listen backlog queue. This causes a RST to
be received first time that recv() is called, which will be when waiting
for the remote side of the OOB ack. In this case, transition the
connection back into a CLOSED state and try to connect again.
* Add levels of debugging, rather than all or nothing, each building on
the previous level. 0 (default) is hard errors. 1 is connection
error debugging info. 2 is all connection info. 3 is more state
info. 4 includes all message info.
* Add some hopefully useful comments
This commit was SVN r14261.
2007-04-08 02:33:30 +04:00
|
|
|
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT_FAIL) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_recv_handler: peer closed connection",
|
2005-05-05 20:31:40 +04:00
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name));
|
|
|
|
}
|
2006-08-23 07:32:36 +04:00
|
|
|
CLOSE_THE_SOCKET(sd);
|
2005-05-05 20:31:40 +04:00
|
|
|
return;
|
|
|
|
}
|
2006-08-15 00:14:44 +04:00
|
|
|
if(opal_socket_errno != EINTR) {
|
2006-12-14 21:20:43 +03:00
|
|
|
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_recv_handler: recv() failed: %s (%d)\n",
|
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name), strerror(opal_socket_errno), opal_socket_errno);
|
2006-08-23 07:32:36 +04:00
|
|
|
CLOSE_THE_SOCKET(sd);
|
2005-05-05 20:31:40 +04:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
MCA_OOB_TCP_HDR_NTOH(&hdr);
|
|
|
|
|
|
|
|
/* dispatch based on message type */
|
|
|
|
switch(hdr.msg_type) {
|
|
|
|
case MCA_OOB_TCP_PROBE:
|
|
|
|
mca_oob_tcp_recv_probe(sd, &hdr);
|
|
|
|
break;
|
|
|
|
case MCA_OOB_TCP_CONNECT:
|
|
|
|
mca_oob_tcp_recv_connect(sd, &hdr);
|
|
|
|
break;
|
|
|
|
default:
|
2005-09-01 05:07:30 +04:00
|
|
|
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_recv_handler: invalid message type: %d\n",
|
2005-05-18 19:31:23 +04:00
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name), hdr.msg_type);
|
2006-08-23 07:32:36 +04:00
|
|
|
CLOSE_THE_SOCKET(sd);
|
2005-05-05 20:31:40 +04:00
|
|
|
break;
|
|
|
|
}
|
2007-03-17 02:11:45 +03:00
|
|
|
|
2005-05-05 20:31:40 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2004-08-03 01:24:00 +04:00
|
|
|
/*
|
2004-08-19 23:34:37 +04:00
|
|
|
* Component initialization - create a module.
|
2004-08-03 01:24:00 +04:00
|
|
|
* (1) initialize static resources
|
|
|
|
* (2) create listen socket
|
|
|
|
*/
|
2005-03-14 23:57:21 +03:00
|
|
|
mca_oob_t* mca_oob_tcp_component_init(int* priority)
|
2004-07-01 18:49:54 +04:00
|
|
|
{
|
2007-05-31 06:29:44 +04:00
|
|
|
int i;
|
|
|
|
|
2004-08-28 05:15:19 +04:00
|
|
|
*priority = 1;
|
2004-11-02 16:14:34 +03:00
|
|
|
|
2004-10-01 01:23:10 +04:00
|
|
|
/* are there any interfaces? */
|
2006-09-26 20:37:04 +04:00
|
|
|
if(opal_ifcount() <= 0)
|
2004-10-01 01:23:10 +04:00
|
|
|
return NULL;
|
|
|
|
|
2007-05-31 06:29:44 +04:00
|
|
|
/* see if we should use localhost as an address. We should do so
|
|
|
|
if after looking at all available interfaces (based on what we
|
|
|
|
find and what the user restricts with MCA parameters) there are
|
|
|
|
only local addresses available. */
|
|
|
|
mca_oob_tcp_component.tcp_ignore_localhost = false;
|
|
|
|
for (i = opal_ifbegin() ; i > 0 ; i = opal_ifnext(i)) {
|
|
|
|
char name[32];
|
|
|
|
struct sockaddr_storage inaddr;
|
|
|
|
opal_ifindextoname(i, name, sizeof(name));
|
|
|
|
if (mca_oob_tcp_component.tcp_include != NULL &&
|
|
|
|
strstr(mca_oob_tcp_component.tcp_include,name) == NULL) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (mca_oob_tcp_component.tcp_exclude != NULL &&
|
|
|
|
strstr(mca_oob_tcp_component.tcp_exclude,name) != NULL) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
opal_ifindextoaddr(i, (struct sockaddr*) &inaddr, sizeof(inaddr));
|
|
|
|
if(!opal_net_islocalhost((struct sockaddr*) &inaddr)) {
|
|
|
|
mca_oob_tcp_component.tcp_ignore_localhost = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2004-07-13 02:46:57 +04:00
|
|
|
/* initialize data structures */
|
2005-07-03 20:52:32 +04:00
|
|
|
opal_hash_table_init(&mca_oob_tcp_component.tcp_peers, 128);
|
|
|
|
opal_hash_table_init(&mca_oob_tcp_component.tcp_peer_names, 128);
|
2004-08-03 01:24:00 +04:00
|
|
|
|
2005-07-02 20:46:27 +04:00
|
|
|
opal_free_list_init(&mca_oob_tcp_component.tcp_peer_free,
|
2004-08-03 01:24:00 +04:00
|
|
|
sizeof(mca_oob_tcp_peer_t),
|
|
|
|
OBJ_CLASS(mca_oob_tcp_peer_t),
|
|
|
|
8, /* initial number */
|
|
|
|
mca_oob_tcp_component.tcp_peer_limit, /* maximum number */
|
2005-07-02 20:46:27 +04:00
|
|
|
8); /* increment to grow by */
|
2004-08-03 01:24:00 +04:00
|
|
|
|
2005-07-02 20:46:27 +04:00
|
|
|
opal_free_list_init(&mca_oob_tcp_component.tcp_msgs,
|
2004-08-03 01:24:00 +04:00
|
|
|
sizeof(mca_oob_tcp_msg_t),
|
|
|
|
OBJ_CLASS(mca_oob_tcp_msg_t),
|
|
|
|
8, /* initial number */
|
2004-08-03 02:16:35 +04:00
|
|
|
-1, /* maximum number */
|
2005-07-02 20:46:27 +04:00
|
|
|
8); /* increment to grow by */
|
2004-08-03 01:24:00 +04:00
|
|
|
|
|
|
|
/* intialize event library */
|
2005-07-04 03:09:55 +04:00
|
|
|
memset(&mca_oob_tcp_component.tcp_recv_event, 0, sizeof(opal_event_t));
|
|
|
|
memset(&mca_oob_tcp_component.tcp_send_event, 0, sizeof(opal_event_t));
|
2007-04-25 05:55:40 +04:00
|
|
|
#if OPAL_WANT_IPV6
|
|
|
|
memset(&mca_oob_tcp_component.tcp6_recv_event, 0, sizeof(opal_event_t));
|
|
|
|
memset(&mca_oob_tcp_component.tcp6_send_event, 0, sizeof(opal_event_t));
|
|
|
|
#endif
|
2007-06-14 08:38:06 +04:00
|
|
|
|
|
|
|
#if defined(__WINDOWS__)
|
|
|
|
/* Register the libevent callback which will trigger the OOB
|
|
|
|
* completion callbacks. */
|
2007-06-15 02:35:38 +04:00
|
|
|
OBJ_CONSTRUCT(&windows_callback, opal_mutex_t);
|
2007-06-14 08:38:06 +04:00
|
|
|
opal_progress_register(oob_tcp_windows_progress_callback);
|
|
|
|
#endif /* defined(__WINDOWS__) */
|
|
|
|
|
2004-08-03 01:24:00 +04:00
|
|
|
return &mca_oob_tcp;
|
2004-07-01 18:49:54 +04:00
|
|
|
}
|
|
|
|
|
2004-09-02 03:07:40 +04:00
|
|
|
/*
|
|
|
|
* Callback from registry on change to subscribed segments.
|
|
|
|
*/
|
|
|
|
|
2004-11-20 22:12:43 +03:00
|
|
|
void mca_oob_tcp_registry_callback(
|
2005-03-14 23:57:21 +03:00
|
|
|
orte_gpr_notify_data_t* data,
|
2004-09-02 03:07:40 +04:00
|
|
|
void* cbdata)
|
|
|
|
{
|
2006-08-15 23:54:10 +04:00
|
|
|
orte_std_cntr_t i, j, k;
|
2006-02-07 06:32:36 +03:00
|
|
|
int rc;
|
2005-07-18 22:49:00 +04:00
|
|
|
orte_gpr_value_t **values, *value;
|
|
|
|
orte_gpr_keyval_t *keyval;
|
2006-02-07 06:32:36 +03:00
|
|
|
orte_byte_object_t *bo;
|
2005-07-18 22:49:00 +04:00
|
|
|
orte_buffer_t buffer;
|
|
|
|
mca_oob_tcp_addr_t* addr, *existing;
|
|
|
|
mca_oob_tcp_peer_t* peer;
|
|
|
|
|
Fix a number of OOB issues:
* Remove the connect() timeout code, as it had some nasty race conditions
when connections were established as the trigger was firing. A better
solution has been found for the cluster where this was needed, so just
removing it was easiest.
* When a fatal error (too many connection failures) occurs, set an error
on messages in the queue even if there isn't an active message. The
first message to any peer will be queued without being active (and
so will all subsequent messages until the connection is established),
and the orteds will hang until that first message completes. So if
an orted can never contact it's peer, it will never exit and just sit
waiting for that message to complete.
* Cover an interesting RST condition in the connect code. A connection
can complete the three-way handshake, the connector can even send
some data, but the server side will drop the connection because it
can't move it from the half-connected to fully-connected state because
of space shortage in the listen backlog queue. This causes a RST to
be received first time that recv() is called, which will be when waiting
for the remote side of the OOB ack. In this case, transition the
connection back into a CLOSED state and try to connect again.
* Add levels of debugging, rather than all or nothing, each building on
the previous level. 0 (default) is hard errors. 1 is connection
error debugging info. 2 is all connection info. 3 is more state
info. 4 includes all message info.
* Add some hopefully useful comments
This commit was SVN r14261.
2007-04-08 02:33:30 +04:00
|
|
|
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_INFO) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_registry_callback\n",
|
2005-03-14 23:57:21 +03:00
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name));
|
2004-09-02 03:07:40 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/* process the callback */
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock);
|
2005-07-18 22:49:00 +04:00
|
|
|
values = (orte_gpr_value_t**)(data->values)->addr;
|
|
|
|
for(i = 0, k=0; k < data->cnt &&
|
|
|
|
i < (data->values)->size; i++) {
|
|
|
|
if (NULL != values[i]) {
|
|
|
|
k++;
|
|
|
|
value = values[i];
|
|
|
|
for(j = 0; j < value->cnt; j++) {
|
2005-09-01 05:07:30 +04:00
|
|
|
|
2005-07-18 22:49:00 +04:00
|
|
|
/* check to make sure this is the requested key */
|
|
|
|
keyval = value->keyvals[j];
|
2007-03-17 02:11:45 +03:00
|
|
|
if(strcmp(keyval->key, ORTE_OOB_TCP_KEY) != 0)
|
2005-07-18 22:49:00 +04:00
|
|
|
continue;
|
2005-09-01 05:07:30 +04:00
|
|
|
|
2005-07-18 22:49:00 +04:00
|
|
|
/* transfer ownership of registry object to buffer and unpack */
|
|
|
|
OBJ_CONSTRUCT(&buffer, orte_buffer_t);
|
2006-02-07 06:32:36 +03:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&bo, keyval->value, ORTE_BYTE_OBJECT))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if(orte_dss.load(&buffer, bo->bytes, bo->size) != ORTE_SUCCESS) {
|
2005-07-18 22:49:00 +04:00
|
|
|
/* TSW - throw ERROR */
|
|
|
|
continue;
|
|
|
|
}
|
2006-02-07 06:32:36 +03:00
|
|
|
/* protect the values from the release */
|
|
|
|
keyval->value->type = ORTE_NULL;
|
|
|
|
keyval->value->data = NULL;
|
|
|
|
/* unpack the buffer */
|
2005-07-18 22:49:00 +04:00
|
|
|
addr = mca_oob_tcp_addr_unpack(&buffer);
|
|
|
|
OBJ_DESTRUCT(&buffer);
|
|
|
|
if(NULL == addr) {
|
|
|
|
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_registry_callback: unable to unpack peer address\n",
|
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name));
|
|
|
|
continue;
|
|
|
|
}
|
2005-09-01 05:07:30 +04:00
|
|
|
|
Fix a number of OOB issues:
* Remove the connect() timeout code, as it had some nasty race conditions
when connections were established as the trigger was firing. A better
solution has been found for the cluster where this was needed, so just
removing it was easiest.
* When a fatal error (too many connection failures) occurs, set an error
on messages in the queue even if there isn't an active message. The
first message to any peer will be queued without being active (and
so will all subsequent messages until the connection is established),
and the orteds will hang until that first message completes. So if
an orted can never contact it's peer, it will never exit and just sit
waiting for that message to complete.
* Cover an interesting RST condition in the connect code. A connection
can complete the three-way handshake, the connector can even send
some data, but the server side will drop the connection because it
can't move it from the half-connected to fully-connected state because
of space shortage in the listen backlog queue. This causes a RST to
be received first time that recv() is called, which will be when waiting
for the remote side of the OOB ack. In this case, transition the
connection back into a CLOSED state and try to connect again.
* Add levels of debugging, rather than all or nothing, each building on
the previous level. 0 (default) is hard errors. 1 is connection
error debugging info. 2 is all connection info. 3 is more state
info. 4 includes all message info.
* Add some hopefully useful comments
This commit was SVN r14261.
2007-04-08 02:33:30 +04:00
|
|
|
if(mca_oob_tcp_component.tcp_debug > OOB_TCP_DEBUG_INFO) {
|
2005-07-18 22:49:00 +04:00
|
|
|
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_registry_callback: received peer [%lu,%lu,%lu]\n",
|
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name),
|
|
|
|
ORTE_NAME_ARGS(&(addr->addr_name)));
|
|
|
|
}
|
2005-09-01 05:07:30 +04:00
|
|
|
|
2005-07-18 22:49:00 +04:00
|
|
|
/* check for existing cache entry */
|
2005-07-19 16:25:19 +04:00
|
|
|
existing = (mca_oob_tcp_addr_t *)orte_hash_table_get_proc(
|
2005-07-18 22:49:00 +04:00
|
|
|
&mca_oob_tcp_component.tcp_peer_names, &addr->addr_name);
|
2007-05-22 17:28:23 +04:00
|
|
|
if(NULL != existing && ORTE_EQUAL != orte_dss.compare(ORTE_PROC_MY_NAME, &addr->addr_name, ORTE_NAME)) {
|
|
|
|
/* need to update existing entry - but don't update our own entry! */
|
2007-06-06 21:39:23 +04:00
|
|
|
if(mca_oob_tcp_component.tcp_debug > OOB_TCP_DEBUG_INFO) {
|
|
|
|
opal_output( 0, "[%ld,%ld,%ld] Received OOB update for [%ld,%ld,%ld]",
|
|
|
|
ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), ORTE_NAME_ARGS(&addr->addr_name) );
|
|
|
|
}
|
2007-03-17 02:11:45 +03:00
|
|
|
orte_hash_table_set_proc(&mca_oob_tcp_component.tcp_peer_names, &addr->addr_name, addr);
|
2005-07-18 22:49:00 +04:00
|
|
|
OBJ_RELEASE(addr);
|
|
|
|
continue;
|
|
|
|
}
|
2005-09-01 05:07:30 +04:00
|
|
|
|
2005-07-18 22:49:00 +04:00
|
|
|
/* insert into cache and notify peer */
|
2005-07-19 16:25:19 +04:00
|
|
|
orte_hash_table_set_proc(&mca_oob_tcp_component.tcp_peer_names, &addr->addr_name, addr);
|
2005-07-19 17:33:04 +04:00
|
|
|
peer = (mca_oob_tcp_peer_t *)orte_hash_table_get_proc(
|
2005-07-18 22:49:00 +04:00
|
|
|
&mca_oob_tcp_component.tcp_peers, &addr->addr_name);
|
|
|
|
if(NULL != peer)
|
|
|
|
mca_oob_tcp_peer_resolved(peer, addr);
|
2005-03-14 23:57:21 +03:00
|
|
|
}
|
|
|
|
}
|
2004-09-02 03:07:40 +04:00
|
|
|
}
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
|
2004-09-02 03:07:40 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Attempt to resolve peer name.
|
|
|
|
*/
|
|
|
|
|
|
|
|
int mca_oob_tcp_resolve(mca_oob_tcp_peer_t* peer)
|
|
|
|
{
|
2005-04-13 01:25:51 +04:00
|
|
|
mca_oob_tcp_addr_t* addr;
|
2005-09-01 05:07:30 +04:00
|
|
|
|
2005-04-13 01:25:51 +04:00
|
|
|
/* if the address is already cached - simply return it */
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock);
|
2005-09-01 05:07:30 +04:00
|
|
|
addr = (mca_oob_tcp_addr_t *)orte_hash_table_get_proc(&mca_oob_tcp_component.tcp_peer_names,
|
2005-03-14 23:57:21 +03:00
|
|
|
&peer->peer_name);
|
2005-04-13 01:25:51 +04:00
|
|
|
if(NULL != addr) {
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
|
2004-09-02 03:07:40 +04:00
|
|
|
mca_oob_tcp_peer_resolved(peer, addr);
|
2006-02-12 04:33:29 +03:00
|
|
|
return ORTE_SUCCESS;
|
2005-04-13 01:25:51 +04:00
|
|
|
}
|
2007-06-12 20:25:26 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
|
2004-09-02 03:07:40 +04:00
|
|
|
|
Commit the orted-failed-to-start code. This correctly causes the system to detect the failure of an orted to start and allows the system to terminate all procs/orteds that *did* start.
The primary change that underlies all this is in the OOB. Specifically, the problem in the code until now has been that the OOB attempts to resolve an address when we call the "send" to an unknown recipient. The OOB would then wait forever if that recipient never actually started (and hence, never reported back its OOB contact info). In the case of an orted that failed to start, we would correctly detect that the orted hadn't started, but then we would attempt to order all orteds (including the one that failed to start) to die. This would cause the OOB to "hang" the system.
Unfortunately, revising how the OOB resolves addresses introduced a number of additional problems. Specifically, and most troublesome, was the fact that comm_spawn involved the immediate transmission of the rendezvous point from parent-to-child after the child was spawned. The current code used the OOB address resolution as a "barrier" - basically, the parent would attempt to send the info to the child, and then "hold" there until the child's contact info had arrived (meaning the child had started) and the send could be completed.
Note that this also caused comm_spawn to "hang" the entire system if the child never started... The app-failed-to-start helped improve that behavior - this code provides additional relief.
With this change, the OOB will return an ADDRESSEE_UNKNOWN error if you attempt to send to a recipient whose contact info isn't already in the OOB's hash tables. To resolve comm_spawn issues, we also now force the cross-sharing of connection info between parent and child jobs during spawn.
Finally, to aid in setting triggers to the right values, we introduce the "arith" API for the GPR. This function allows you to atomically change the value in a registry location (either divide, multiply, add, or subtract) by the provided operand. It is equivalent to first fetching the value using a "get", then modifying it, and then putting the result back into the registry via a "put".
This commit was SVN r14711.
2007-05-21 22:31:28 +04:00
|
|
|
/* if we don't know it, then report unknown - don't try to go get it */
|
|
|
|
return ORTE_ERR_ADDRESSEE_UNKNOWN;
|
2004-09-02 03:07:40 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2004-08-19 23:34:37 +04:00
|
|
|
/*
|
|
|
|
* Setup contact information in the registry.
|
|
|
|
*/
|
|
|
|
int mca_oob_tcp_init(void)
|
|
|
|
{
|
2005-03-14 23:57:21 +03:00
|
|
|
orte_jobid_t jobid;
|
2004-08-19 23:34:37 +04:00
|
|
|
int rc;
|
2006-10-02 05:27:22 +04:00
|
|
|
int randval = orte_process_info.num_procs;
|
|
|
|
|
|
|
|
if (0 == randval) randval = 10;
|
2005-03-14 23:57:21 +03:00
|
|
|
|
2005-03-19 02:40:08 +03:00
|
|
|
/* random delay to stagger connections back to seed */
|
2005-12-12 23:04:00 +03:00
|
|
|
#if defined(__WINDOWS__)
|
2006-11-06 21:00:46 +03:00
|
|
|
if(1 == mca_oob_tcp_component.connect_sleep) {
|
|
|
|
Sleep((orte_process_info.my_name->vpid % randval % 1000) * 100);
|
|
|
|
}
|
2005-04-19 08:38:48 +04:00
|
|
|
#else
|
2006-11-06 21:00:46 +03:00
|
|
|
if(1 == mca_oob_tcp_component.connect_sleep) {
|
|
|
|
usleep((orte_process_info.my_name->vpid % randval % 1000) * 1000);
|
|
|
|
}
|
2005-04-19 08:38:48 +04:00
|
|
|
#endif
|
2006-07-13 02:18:53 +04:00
|
|
|
|
2005-03-19 02:40:08 +03:00
|
|
|
/* get my jobid */
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 22:34:59 +03:00
|
|
|
jobid = ORTE_PROC_MY_NAME->jobid;
|
|
|
|
|
2006-10-12 01:29:29 +04:00
|
|
|
/* create a listen socket */
|
2007-03-23 16:29:18 +03:00
|
|
|
if ((OOB_TCP_LISTEN_THREAD == mca_oob_tcp_component.tcp_listen_type) &&
|
|
|
|
orte_process_info.seed) {
|
2006-10-12 01:29:29 +04:00
|
|
|
if (mca_oob_tcp_create_listen_thread() != ORTE_SUCCESS) {
|
|
|
|
opal_output(0, "mca_oob_tcp_init: unable to create listen thread");
|
|
|
|
return ORTE_ERROR;
|
|
|
|
}
|
|
|
|
opal_free_list_init(&mca_oob_tcp_component.tcp_pending_connections_fl,
|
|
|
|
sizeof(mca_oob_tcp_pending_connection_t),
|
|
|
|
OBJ_CLASS(mca_oob_tcp_pending_connection_t),
|
|
|
|
16, /* initial number */
|
|
|
|
-1, /* maximum number */
|
|
|
|
16); /* increment to grow by */
|
|
|
|
opal_progress_register(mca_oob_tcp_listen_progress);
|
Fix a number of OOB issues:
* Remove the connect() timeout code, as it had some nasty race conditions
when connections were established as the trigger was firing. A better
solution has been found for the cluster where this was needed, so just
removing it was easiest.
* When a fatal error (too many connection failures) occurs, set an error
on messages in the queue even if there isn't an active message. The
first message to any peer will be queued without being active (and
so will all subsequent messages until the connection is established),
and the orteds will hang until that first message completes. So if
an orted can never contact it's peer, it will never exit and just sit
waiting for that message to complete.
* Cover an interesting RST condition in the connect code. A connection
can complete the three-way handshake, the connector can even send
some data, but the server side will drop the connection because it
can't move it from the half-connected to fully-connected state because
of space shortage in the listen backlog queue. This causes a RST to
be received first time that recv() is called, which will be when waiting
for the remote side of the OOB ack. In this case, transition the
connection back into a CLOSED state and try to connect again.
* Add levels of debugging, rather than all or nothing, each building on
the previous level. 0 (default) is hard errors. 1 is connection
error debugging info. 2 is all connection info. 3 is more state
info. 4 includes all message info.
* Add some hopefully useful comments
This commit was SVN r14261.
2007-04-08 02:33:30 +04:00
|
|
|
if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_INFO) {
|
2007-03-23 16:29:18 +03:00
|
|
|
opal_output(0, "[%lu,%lu,%lu] accepting connections via listen thread",
|
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name));
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
/* fix up the listen_type, since we might have been in thread,
|
|
|
|
but can't do that since we weren't the HNP. */
|
|
|
|
mca_oob_tcp_component.tcp_listen_type = OOB_TCP_EVENT;
|
|
|
|
|
2007-04-25 05:55:40 +04:00
|
|
|
rc = mca_oob_tcp_create_listen(&mca_oob_tcp_component.tcp_listen_sd,
|
|
|
|
AF_INET);
|
|
|
|
if (ORTE_SUCCESS != rc &&
|
|
|
|
(EAFNOSUPPORT != opal_socket_errno ||
|
|
|
|
mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT)) {
|
|
|
|
opal_output(0,
|
|
|
|
"mca_oob_tcp_init: unable to create IPv4 listen socket: %s\n",
|
|
|
|
opal_strerror(rc));
|
|
|
|
}
|
|
|
|
#if OPAL_WANT_IPV6
|
|
|
|
rc = mca_oob_tcp_create_listen(&mca_oob_tcp_component.tcp6_listen_sd,
|
|
|
|
AF_INET6);
|
|
|
|
if (ORTE_SUCCESS != rc &&
|
|
|
|
(EAFNOSUPPORT != opal_socket_errno ||
|
|
|
|
mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT)) {
|
|
|
|
opal_output(0,
|
|
|
|
"mca_oob_tcp_init: unable to create IPv6 listen socket: %s\n",
|
|
|
|
opal_strerror(rc));
|
2007-03-23 16:29:18 +03:00
|
|
|
}
|
2007-04-25 05:55:40 +04:00
|
|
|
#endif
|
Fix a number of OOB issues:
* Remove the connect() timeout code, as it had some nasty race conditions
when connections were established as the trigger was firing. A better
solution has been found for the cluster where this was needed, so just
removing it was easiest.
* When a fatal error (too many connection failures) occurs, set an error
on messages in the queue even if there isn't an active message. The
first message to any peer will be queued without being active (and
so will all subsequent messages until the connection is established),
and the orteds will hang until that first message completes. So if
an orted can never contact it's peer, it will never exit and just sit
waiting for that message to complete.
* Cover an interesting RST condition in the connect code. A connection
can complete the three-way handshake, the connector can even send
some data, but the server side will drop the connection because it
can't move it from the half-connected to fully-connected state because
of space shortage in the listen backlog queue. This causes a RST to
be received first time that recv() is called, which will be when waiting
for the remote side of the OOB ack. In this case, transition the
connection back into a CLOSED state and try to connect again.
* Add levels of debugging, rather than all or nothing, each building on
the previous level. 0 (default) is hard errors. 1 is connection
error debugging info. 2 is all connection info. 3 is more state
info. 4 includes all message info.
* Add some hopefully useful comments
This commit was SVN r14261.
2007-04-08 02:33:30 +04:00
|
|
|
if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_INFO) {
|
2007-03-23 16:29:18 +03:00
|
|
|
opal_output(0, "[%lu,%lu,%lu] accepting connections via event library",
|
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name));
|
|
|
|
}
|
2006-10-12 01:29:29 +04:00
|
|
|
}
|
|
|
|
|
Commit the orted-failed-to-start code. This correctly causes the system to detect the failure of an orted to start and allows the system to terminate all procs/orteds that *did* start.
The primary change that underlies all this is in the OOB. Specifically, the problem in the code until now has been that the OOB attempts to resolve an address when we call the "send" to an unknown recipient. The OOB would then wait forever if that recipient never actually started (and hence, never reported back its OOB contact info). In the case of an orted that failed to start, we would correctly detect that the orted hadn't started, but then we would attempt to order all orteds (including the one that failed to start) to die. This would cause the OOB to "hang" the system.
Unfortunately, revising how the OOB resolves addresses introduced a number of additional problems. Specifically, and most troublesome, was the fact that comm_spawn involved the immediate transmission of the rendezvous point from parent-to-child after the child was spawned. The current code used the OOB address resolution as a "barrier" - basically, the parent would attempt to send the info to the child, and then "hold" there until the child's contact info had arrived (meaning the child had started) and the send could be completed.
Note that this also caused comm_spawn to "hang" the entire system if the child never started... The app-failed-to-start helped improve that behavior - this code provides additional relief.
With this change, the OOB will return an ADDRESSEE_UNKNOWN error if you attempt to send to a recipient whose contact info isn't already in the OOB's hash tables. To resolve comm_spawn issues, we also now force the cross-sharing of connection info between parent and child jobs during spawn.
Finally, to aid in setting triggers to the right values, we introduce the "arith" API for the GPR. This function allows you to atomically change the value in a registry location (either divide, multiply, add, or subtract) by the provided operand. It is equivalent to first fetching the value using a "get", then modifying it, and then putting the result back into the registry via a "put".
This commit was SVN r14711.
2007-05-21 22:31:28 +04:00
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
2004-09-02 03:07:40 +04:00
|
|
|
|
Commit the orted-failed-to-start code. This correctly causes the system to detect the failure of an orted to start and allows the system to terminate all procs/orteds that *did* start.
The primary change that underlies all this is in the OOB. Specifically, the problem in the code until now has been that the OOB attempts to resolve an address when we call the "send" to an unknown recipient. The OOB would then wait forever if that recipient never actually started (and hence, never reported back its OOB contact info). In the case of an orted that failed to start, we would correctly detect that the orted hadn't started, but then we would attempt to order all orteds (including the one that failed to start) to die. This would cause the OOB to "hang" the system.
Unfortunately, revising how the OOB resolves addresses introduced a number of additional problems. Specifically, and most troublesome, was the fact that comm_spawn involved the immediate transmission of the rendezvous point from parent-to-child after the child was spawned. The current code used the OOB address resolution as a "barrier" - basically, the parent would attempt to send the info to the child, and then "hold" there until the child's contact info had arrived (meaning the child had started) and the send could be completed.
Note that this also caused comm_spawn to "hang" the entire system if the child never started... The app-failed-to-start helped improve that behavior - this code provides additional relief.
With this change, the OOB will return an ADDRESSEE_UNKNOWN error if you attempt to send to a recipient whose contact info isn't already in the OOB's hash tables. To resolve comm_spawn issues, we also now force the cross-sharing of connection info between parent and child jobs during spawn.
Finally, to aid in setting triggers to the right values, we introduce the "arith" API for the GPR. This function allows you to atomically change the value in a registry location (either divide, multiply, add, or subtract) by the provided operand. It is equivalent to first fetching the value using a "get", then modifying it, and then putting the result back into the registry via a "put".
This commit was SVN r14711.
2007-05-21 22:31:28 +04:00
|
|
|
|
|
|
|
/*
 * Subscribe to the OOB TCP contact information of a job.
 *
 * A subscription record for the job is appended to the component's
 * tcp_subscriptions list, and a GPR subscription on ORTE_OOB_TCP_KEY in
 * the job's segment is attached to the given trigger, with
 * mca_oob_tcp_registry_callback as the notification callback.
 *
 * jobid    job whose segment is subscribed to
 * trigger  name handed to orte_schema.get_std_trigger_name() to build
 *          the trigger this subscription is attached to
 *
 * Returns ORTE_SUCCESS, or the error code of the first schema/GPR call
 * that failed.  All temporary name strings obtained here are released
 * before returning, on success as well as on failure.
 */
int mca_oob_tcp_register_subscription(orte_jobid_t jobid, char *trigger)
{
    char *sub_name = NULL, *segment = NULL, *trig_name = NULL;
    mca_oob_tcp_subscription_t *subscription;
    orte_gpr_subscription_id_t sub_id;
    int rc;

    /* register subscribe callback to receive notification when all processes have registered */
    subscription = OBJ_NEW(mca_oob_tcp_subscription_t);
    subscription->jobid = jobid;
    OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock);
    opal_list_append(&mca_oob_tcp_component.tcp_subscriptions, &subscription->item);
    OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
    /* NOTE(review): the record is already on the list at this point, so
     * if any step below fails it stays there with subid never assigned.
     * Confirm whether the failure paths should unlink and release it. */

    /* the message text still names mca_oob_tcp_init; kept unchanged */
    if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_ALL) {
        opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_init: calling orte_gpr.subscribe\n",
                    ORTE_NAME_ARGS(orte_process_info.my_name));
    }

    if (ORTE_SUCCESS != (rc = orte_schema.get_std_subscription_name(&sub_name,
                                ORTE_OOB_SUBSCRIPTION, jobid))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* attach to the specified trigger */
    if (ORTE_SUCCESS != (rc = orte_schema.get_std_trigger_name(&trig_name,
                                trigger, jobid))) {
        ORTE_ERROR_LOG(rc);
        goto release_sub_name;
    }

    /* define the segment */
    if (ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, jobid))) {
        ORTE_ERROR_LOG(rc);
        goto release_trig_name;
    }

    if (ORTE_SUCCESS != (rc = orte_gpr.subscribe_1(&sub_id, trig_name, sub_name,
                                ORTE_GPR_NOTIFY_DELETE_AFTER_TRIG,
                                ORTE_GPR_KEYS_OR | ORTE_GPR_TOKENS_OR | ORTE_GPR_STRIPPED,
                                segment,
                                NULL,  /* look at all containers on this segment */
                                ORTE_OOB_TCP_KEY,
                                mca_oob_tcp_registry_callback, NULL))) {
        ORTE_ERROR_LOG(rc);
        goto release_segment;
    }

    /* the id of each subscription is recorded
     * here so we can (if desired) cancel that subscription later
     */
    subscription->subid = sub_id;

    /* Success falls through with rc == ORTE_SUCCESS.  The segment name
     * is released here as well; previously it was only freed on the
     * subscribe_1 failure path and leaked on every successful call. */
release_segment:
    free(segment);
release_trig_name:
    free(trig_name);
release_sub_name:
    free(sub_name);
    return rc;
}
|
2005-03-14 23:57:21 +03:00
|
|
|
|
Commit the orted-failed-to-start code. This correctly causes the system to detect the failure of an orted to start and allows the system to terminate all procs/orteds that *did* start.
The primary change that underlies all this is in the OOB. Specifically, the problem in the code until now has been that the OOB attempts to resolve an address when we call the "send" to an unknown recipient. The OOB would then wait forever if that recipient never actually started (and hence, never reported back its OOB contact info). In the case of an orted that failed to start, we would correctly detect that the orted hadn't started, but then we would attempt to order all orteds (including the one that failed to start) to die. This would cause the OOB to "hang" the system.
Unfortunately, revising how the OOB resolves addresses introduced a number of additional problems. Specifically, and most troublesome, was the fact that comm_spawn involved the immediate transmission of the rendezvous point from parent-to-child after the child was spawned. The current code used the OOB address resolution as a "barrier" - basically, the parent would attempt to send the info to the child, and then "hold" there until the child's contact info had arrived (meaning the child had started) and the send could be completed.
Note that this also caused comm_spawn to "hang" the entire system if the child never started... The app-failed-to-start helped improve that behavior - this code provides additional relief.
With this change, the OOB will return an ADDRESSEE_UNKNOWN error if you attempt to send to a recipient whose contact info isn't already in the OOB's hash tables. To resolve comm_spawn issues, we also now force the cross-sharing of connection info between parent and child jobs during spawn.
Finally, to aid in setting triggers to the right values, we introduce the "arith" API for the GPR. This function allows you to atomically change the value in a registry location (either divide, multiply, add, or subtract) by the provided operand. It is equivalent to first fetching the value using a "get", then modifying it, and then putting the result back into the registry via a "put".
This commit was SVN r14711.
2007-05-21 22:31:28 +04:00
|
|
|
int mca_oob_tcp_register_contact_info(void)
|
|
|
|
{
|
|
|
|
orte_std_cntr_t i, num_tokens;
|
|
|
|
orte_buffer_t *buffer;
|
|
|
|
orte_data_value_t *values[2];
|
|
|
|
orte_byte_object_t bo;
|
|
|
|
char *tmp, *tmp2, *tmp3;
|
|
|
|
char *segment, **tokens;
|
|
|
|
char *keys[] = { ORTE_OOB_TCP_KEY, ORTE_PROC_RML_IP_ADDRESS_KEY};
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
/* setup to put our contact info on registry */
|
2005-03-14 23:57:21 +03:00
|
|
|
buffer = OBJ_NEW(orte_buffer_t);
|
|
|
|
if(buffer == NULL) {
|
2005-07-18 22:49:00 +04:00
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
|
|
return ORTE_ERR_OUT_OF_RESOURCE;
|
|
|
|
}
|
2005-09-01 05:07:30 +04:00
|
|
|
if (ORTE_SUCCESS != (rc = mca_oob_tcp_addr_pack(buffer))) {
|
2005-03-14 23:57:21 +03:00
|
|
|
ORTE_ERROR_LOG(rc);
|
2005-09-01 05:07:30 +04:00
|
|
|
OBJ_RELEASE(buffer);
|
2005-03-14 23:57:21 +03:00
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2005-09-01 05:07:30 +04:00
|
|
|
/* extract payload for storage */
|
2006-02-07 06:32:36 +03:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_dss.unload(buffer, (void**)&(bo.bytes), &(bo.size)))) {
|
2005-03-14 23:57:21 +03:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
OBJ_RELEASE(buffer);
|
|
|
|
return rc;
|
|
|
|
}
|
2005-09-01 05:07:30 +04:00
|
|
|
OBJ_RELEASE(buffer);
|
2006-02-07 06:32:36 +03:00
|
|
|
values[0] = OBJ_NEW(orte_data_value_t);
|
|
|
|
if (NULL == values[0]) {
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
|
|
return ORTE_ERR_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
values[0]->type = ORTE_BYTE_OBJECT;
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_dss.copy(&(values[0]->data), &bo, ORTE_BYTE_OBJECT))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2004-08-19 23:34:37 +04:00
|
|
|
|
2005-09-01 05:07:30 +04:00
|
|
|
/* setup the IP address for storage */
|
2005-07-18 22:49:00 +04:00
|
|
|
tmp = mca_oob.oob_get_addr();
|
Add a job_info segment to the system that holds a container for each job. Within each container is a keyval indicating the job state (i.e., all procs at stage1, finalized, etc.). This provides a rough state-of-health for the job.
This required a little fiddling with a number of areas. Biggest problem was that it uncovered a potential for an infinite loop to be created in the registry. If a callback function modified the registry, the registry checked the triggers to see if anything had fired. Well, if the original callback was due to a trigger firing, that condition hadn't changed - so the trigger fired again....which caused the callback to be called, which modified the registry, which checked the triggers, etc. etc.
Triggers are now checked and then "flagged" as being "in process" so that the registry will NOT recheck that trigger until all callbacks have been processed. Tried doing this with subscriptions as well, but that caused a problem - when we release processes from a stagegate, they (at the moment) immediately place data on the registry that should cause a subscription to fire. Unfortunately, the system will just hang if that subscription doesn't get processed. So, I have left the subscription system alone - any callback function that modifies the registry in a fashion that will fire a subscription will indeed fire that subscription. We'll have to see if this causes problems - it shouldn't, but a careless user could lock things up if the callback generates a callback to itself.
Also fixed the code that placed a process' RML contact info on the registry to eliminate the leading '/' from the string.
This commit was SVN r6684.
2005-07-29 18:11:19 +04:00
|
|
|
tmp2 = strrchr(tmp, '/') + 1;
|
2005-07-18 22:49:00 +04:00
|
|
|
tmp3 = strrchr(tmp, ':');
|
2005-07-20 00:12:51 +04:00
|
|
|
if(NULL == tmp2 || NULL == tmp3) {
|
|
|
|
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_init: invalid address \'%s\' "
|
2005-09-01 05:07:30 +04:00
|
|
|
"returned for selected oob interfaces.\n",
|
2005-07-20 00:12:51 +04:00
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name), tmp);
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERROR);
|
2005-09-01 05:07:30 +04:00
|
|
|
free(tmp);
|
2006-02-07 06:32:36 +03:00
|
|
|
free(bo.bytes);
|
2005-07-20 00:12:51 +04:00
|
|
|
return ORTE_ERROR;
|
|
|
|
}
|
2005-07-18 22:49:00 +04:00
|
|
|
*tmp3 = '\0';
|
2006-02-07 06:32:36 +03:00
|
|
|
values[1] = OBJ_NEW(orte_data_value_t);
|
|
|
|
if (NULL == values[1]) {
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
|
|
return ORTE_ERR_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
values[1]->type = ORTE_STRING;
|
|
|
|
values[1]->data = strdup(tmp2);
|
2005-07-18 22:49:00 +04:00
|
|
|
free(tmp);
|
2004-09-02 03:07:40 +04:00
|
|
|
|
Commit the orted-failed-to-start code. This correctly causes the system to detect the failure of an orted to start and allows the system to terminate all procs/orteds that *did* start.
The primary change that underlies all this is in the OOB. Specifically, the problem in the code until now has been that the OOB attempts to resolve an address when we call the "send" to an unknown recipient. The OOB would then wait forever if that recipient never actually started (and hence, never reported back its OOB contact info). In the case of an orted that failed to start, we would correctly detect that the orted hadn't started, but then we would attempt to order all orteds (including the one that failed to start) to die. This would cause the OOB to "hang" the system.
Unfortunately, revising how the OOB resolves addresses introduced a number of additional problems. Specifically, and most troublesome, was the fact that comm_spawn involved the immediate transmission of the rendezvous point from parent-to-child after the child was spawned. The current code used the OOB address resolution as a "barrier" - basically, the parent would attempt to send the info to the child, and then "hold" there until the child's contact info had arrived (meaning the child had started) and the send could be completed.
Note that this also caused comm_spawn to "hang" the entire system if the child never started... The app-failed-to-start helped improve that behavior - this code provides additional relief.
With this change, the OOB will return an ADDRESSEE_UNKNOWN error if you attempt to send to a recipient whose contact info isn't already in the OOB's hash tables. To resolve comm_spawn issues, we also now force the cross-sharing of connection info between parent and child jobs during spawn.
Finally, to aid in setting triggers to the right values, we introduce the "arith" API for the GPR. This function allows you to atomically change the value in a registry location (either divide, multiply, add, or subtract) by the provided operand. It is equivalent to first fetching the value using a "get", then modifying it, and then putting the result back into the registry via a "put".
This commit was SVN r14711.
2007-05-21 22:31:28 +04:00
|
|
|
/* define the segment */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, ORTE_PROC_MY_NAME->jobid))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
OBJ_RELEASE(values[0]);
|
|
|
|
OBJ_RELEASE(values[1]);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2005-09-01 05:07:30 +04:00
|
|
|
/* get the process tokens */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_schema.get_proc_tokens(&tokens, &num_tokens,
|
|
|
|
orte_process_info.my_name))) {
|
2005-03-14 23:57:21 +03:00
|
|
|
ORTE_ERROR_LOG(rc);
|
2005-09-01 05:07:30 +04:00
|
|
|
free(segment);
|
2006-02-07 06:32:36 +03:00
|
|
|
OBJ_RELEASE(values[0]);
|
|
|
|
OBJ_RELEASE(values[1]);
|
2005-03-14 23:57:21 +03:00
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2005-09-01 05:07:30 +04:00
|
|
|
/* put our contact info in registry */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_gpr.put_N(ORTE_GPR_OVERWRITE | ORTE_GPR_TOKENS_XAND,
|
2006-02-07 06:32:36 +03:00
|
|
|
segment, tokens, 2, keys, values))) {
|
2005-03-14 23:57:21 +03:00
|
|
|
ORTE_ERROR_LOG(rc);
|
2004-08-19 23:34:37 +04:00
|
|
|
}
|
2005-09-01 05:07:30 +04:00
|
|
|
|
|
|
|
free(segment);
|
2005-09-01 21:38:04 +04:00
|
|
|
for(i=0; i < num_tokens; i++) {
|
|
|
|
free(tokens[i]);
|
|
|
|
tokens[i] = NULL;
|
|
|
|
}
|
|
|
|
if (NULL != tokens) free(tokens);
|
2006-02-07 06:32:36 +03:00
|
|
|
OBJ_RELEASE(values[0]);
|
|
|
|
OBJ_RELEASE(values[1]);
|
2005-09-01 05:07:30 +04:00
|
|
|
|
|
|
|
return rc;
|
2004-08-19 23:34:37 +04:00
|
|
|
}
|
2004-07-01 18:49:54 +04:00
|
|
|
|
Commit the orted-failed-to-start code. This correctly causes the system to detect the failure of an orted to start and allows the system to terminate all procs/orteds that *did* start.
The primary change that underlies all this is in the OOB. Specifically, the problem in the code until now has been that the OOB attempts to resolve an address when we call the "send" to an unknown recipient. The OOB would then wait forever if that recipient never actually started (and hence, never reported back its OOB contact info). In the case of an orted that failed to start, we would correctly detect that the orted hadn't started, but then we would attempt to order all orteds (including the one that failed to start) to die. This would cause the OOB to "hang" the system.
Unfortunately, revising how the OOB resolves addresses introduced a number of additional problems. Specifically, and most troublesome, was the fact that comm_spawn involved the immediate transmission of the rendezvous point from parent-to-child after the child was spawned. The current code used the OOB address resolution as a "barrier" - basically, the parent would attempt to send the info to the child, and then "hold" there until the child's contact info had arrived (meaning the child had started) and the send could be completed.
Note that this also caused comm_spawn to "hang" the entire system if the child never started... The app-failed-to-start helped improve that behavior - this code provides additional relief.
With this change, the OOB will return an ADDRESSEE_UNKNOWN error if you attempt to send to a recipient whose contact info isn't already in the OOB's hash tables. To resolve comm_spawn issues, we also now force the cross-sharing of connection info between parent and child jobs during spawn.
Finally, to aid in setting triggers to the right values, we introduce the "arith" API for the GPR. This function allows you to atomically change the value in a registry location (either divide, multiply, add, or subtract) by the provided operand. It is equivalent to first fetching the value using a "get", then modifying it, and then putting the result back into the registry via a "put".
This commit was SVN r14711.
2007-05-21 22:31:28 +04:00
|
|
|
|
|
|
|
/*
 * Fetch the OOB contact info stored on the registry for one job and append
 * the returned values to a notify-data object.
 *
 * @param job    Job whose segment is queried.
 * @param tokens Tokens passed through to orte_gpr.get (may be NULL).
 * @param data   In/out: if *data is NULL and at least one value is returned,
 *               a new orte_gpr_notify_data_t is created; the returned values
 *               are added to (*data)->values and (*data)->cnt is incremented
 *               for each one.
 * @return ORTE_SUCCESS, or the error code of the step that failed.
 */
static int get_contact_info(orte_jobid_t job, char **tokens, orte_gpr_notify_data_t **data)
{
    char *segment;
    char *keys[] = {
        ORTE_OOB_TCP_KEY,
        ORTE_PROC_RML_IP_ADDRESS_KEY,
        NULL
    };
    orte_gpr_value_t **values = NULL;
    orte_std_cntr_t cnt = 0, i, idx;
    int rc;

    /* define the segment */
    if (ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, job))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* get the data; the segment name is not needed afterwards, so free it on
     * both the success and the failure path */
    rc = orte_gpr.get(ORTE_GPR_TOKENS_AND | ORTE_GPR_KEYS_OR,
                      segment, tokens, keys, &cnt, &values);
    free(segment);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* see if we got data back */
    i = 0;
    if (0 < cnt) {
        /* build the data into the notify_data object. If the data
         * pointer is NULL, then we are the first values, so initialize
         * it. Otherwise, just add the data to it
         */
        if (NULL == *data) {
            *data = OBJ_NEW(orte_gpr_notify_data_t);
            if (NULL == *data) {
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                ORTE_ERROR_LOG(rc);
            }
        }
        if (ORTE_SUCCESS == rc) {
            for (i = 0; i < cnt; i++) {
                if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&idx, (*data)->values, (void*)values[i]))) {
                    ORTE_ERROR_LOG(rc);
                    break;
                }
                ++(*data)->cnt;
            }
        }
        /* on failure, values[i..cnt-1] were not handed over: drop them */
        for (; i < cnt; i++) {
            if (NULL != values[i]) {
                OBJ_RELEASE(values[i]);
            }
        }
    }

    /* the individual values now live in *data (or were released above); only
     * the array that carried them is left to free.
     * NOTE(review): assumes orte_gpr.get returns a malloc'ed array - confirm. */
    free(values);

    return rc;
}
|
|
|
|
|
|
|
|
/*
 * Retrieve OOB contact info from the registry for a process or a whole job.
 *
 * @param name Process of interest. A vpid of ORTE_VPID_WILDCARD selects all
 *             procs in the job; otherwise the process tokens are looked up
 *             and used to narrow the query.
 * @param data In/out notify-data object handed to get_contact_info().
 * @return ORTE_SUCCESS, or the error code of the step that failed. When the
 *         jobid is ORTE_JOBID_WILDCARD nothing is retrieved and ORTE_SUCCESS
 *         is returned.
 */
int mca_oob_tcp_get_contact_info(orte_process_name_t *name, orte_gpr_notify_data_t **data)
{
    char **tokens=NULL;
    orte_std_cntr_t num_tokens = 0, i;
    int rc = ORTE_SUCCESS;

    /* if the vpid is WILDCARD, then we want the info from all procs in the specified job. This
     * is the default condition, so do nothing for this case. If the vpid is not WILDCARD,
     * then go get the process tokens
     */
    if (ORTE_VPID_WILDCARD != name->vpid) {
        if (ORTE_SUCCESS != (rc = orte_schema.get_proc_tokens(&tokens, &num_tokens, name))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    }

    /* If the jobid is not WILDCARD, then we only want the info from the specified job -
     * this is the most common case, so treat it first
     */
    if (ORTE_JOBID_WILDCARD != name->jobid) {
        if (ORTE_SUCCESS != (rc = get_contact_info(name->jobid, tokens, data))) {
            ORTE_ERROR_LOG(rc);
        }
    }
    /* if the jobid is WILDCARD, then we want the info from all jobs; that
     * case retrieves nothing here and rc stays ORTE_SUCCESS */

    /* the tokens were allocated for us by get_proc_tokens: release them */
    if (NULL != tokens) {
        for (i = 0; i < num_tokens; i++) {
            free(tokens[i]);
        }
        free(tokens);
    }

    return rc;
}
|
|
|
|
|
|
|
|
|
2004-08-03 01:24:00 +04:00
|
|
|
/*
|
|
|
|
* Module cleanup.
|
|
|
|
*/
|
2004-08-19 23:34:37 +04:00
|
|
|
int mca_oob_tcp_fini(void)
|
2004-07-01 18:49:54 +04:00
|
|
|
{
|
2005-07-03 20:22:16 +04:00
|
|
|
opal_list_item_t *item;
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock);
|
2005-07-04 03:09:55 +04:00
|
|
|
opal_event_disable(); /* disable event processing */
|
2004-09-30 19:09:29 +04:00
|
|
|
|
2004-08-12 17:29:37 +04:00
|
|
|
/* close listen socket */
|
2004-08-05 23:37:48 +04:00
|
|
|
if (mca_oob_tcp_component.tcp_listen_sd >= 0) {
|
2006-09-15 01:29:51 +04:00
|
|
|
if (OOB_TCP_EVENT == mca_oob_tcp_component.tcp_listen_type) {
|
|
|
|
opal_event_del(&mca_oob_tcp_component.tcp_recv_event);
|
2007-01-23 06:17:23 +03:00
|
|
|
CLOSE_THE_SOCKET(mca_oob_tcp_component.tcp_listen_sd);
|
2007-04-25 05:55:40 +04:00
|
|
|
#if OPAL_WANT_IPV6
|
|
|
|
if (mca_oob_tcp_component.tcp6_listen_sd >= 0) {
|
|
|
|
opal_event_del(&mca_oob_tcp_component.tcp6_recv_event);
|
|
|
|
CLOSE_THE_SOCKET(mca_oob_tcp_component.tcp6_listen_sd);
|
|
|
|
mca_oob_tcp_component.tcp6_listen_sd = -1;
|
|
|
|
}
|
|
|
|
#endif
|
2006-09-15 01:29:51 +04:00
|
|
|
} else if (OOB_TCP_LISTEN_THREAD == mca_oob_tcp_component.tcp_listen_type) {
|
|
|
|
void *data;
|
2007-04-25 05:55:40 +04:00
|
|
|
/* adi@2007-04-12: Bug, FIXME:
|
|
|
|
* once the thread listener is IPv6 capable, don't forget to
|
|
|
|
* close the v6 socket
|
|
|
|
*/
|
2006-09-15 01:29:51 +04:00
|
|
|
mca_oob_tcp_component.tcp_shutdown = true;
|
2007-01-23 06:17:23 +03:00
|
|
|
CLOSE_THE_SOCKET(mca_oob_tcp_component.tcp_listen_sd);
|
2006-09-15 01:29:51 +04:00
|
|
|
opal_thread_join(&mca_oob_tcp_component.tcp_listen_thread, &data);
|
|
|
|
opal_progress_unregister(mca_oob_tcp_listen_progress);
|
|
|
|
}
|
2004-09-30 19:09:29 +04:00
|
|
|
mca_oob_tcp_component.tcp_listen_sd = -1;
|
2004-08-05 23:37:48 +04:00
|
|
|
}
|
2004-08-12 17:29:37 +04:00
|
|
|
|
|
|
|
/* cleanup all peers */
|
2005-07-03 20:22:16 +04:00
|
|
|
for(item = opal_list_remove_first(&mca_oob_tcp_component.tcp_peer_list);
|
2004-09-30 19:09:29 +04:00
|
|
|
item != NULL;
|
2005-07-03 20:22:16 +04:00
|
|
|
item = opal_list_remove_first(&mca_oob_tcp_component.tcp_peer_list)) {
|
2004-09-30 19:09:29 +04:00
|
|
|
mca_oob_tcp_peer_t* peer = (mca_oob_tcp_peer_t*)item;
|
|
|
|
MCA_OOB_TCP_PEER_RETURN(peer);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* delete any pending events */
|
2007-06-15 02:33:09 +04:00
|
|
|
for( item = opal_list_get_first(&mca_oob_tcp_component.tcp_events);
|
|
|
|
item != opal_list_get_end(&mca_oob_tcp_component.tcp_events);
|
|
|
|
item = opal_list_get_first(&mca_oob_tcp_component.tcp_events) ) {
|
2004-09-30 19:09:29 +04:00
|
|
|
mca_oob_tcp_event_t* event = (mca_oob_tcp_event_t*)item;
|
2005-07-04 03:09:55 +04:00
|
|
|
opal_event_del(&event->event);
|
2004-09-30 19:09:29 +04:00
|
|
|
OBJ_RELEASE(event);
|
2004-08-06 21:23:37 +04:00
|
|
|
}
|
2004-09-30 19:09:29 +04:00
|
|
|
|
2005-07-04 03:09:55 +04:00
|
|
|
opal_event_enable();
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
|
2006-02-12 04:33:29 +03:00
|
|
|
return ORTE_SUCCESS;
|
2004-07-01 18:49:54 +04:00
|
|
|
}
|
2004-07-13 02:46:57 +04:00
|
|
|
|
2004-09-30 19:09:29 +04:00
|
|
|
|
2004-08-10 03:07:53 +04:00
|
|
|
/*
|
2004-08-05 19:30:36 +04:00
|
|
|
* Compare two process names for equality.
|
|
|
|
*
|
|
|
|
* @param n1 Process name 1.
|
|
|
|
* @param n2 Process name 2.
|
|
|
|
* @return (-1 for n1<n2 0 for equality, 1 for n1>n2)
|
|
|
|
*
|
|
|
|
* Note that the definition of < or > is somewhat arbitrary -
|
|
|
|
* just needs to be consistently applied to maintain an ordering
|
|
|
|
* when process names are used as indices.
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 22:34:59 +03:00
|
|
|
*
|
|
|
|
* Currently, this function is ONLY used in one place - in oob_tcp_send.c to
|
|
|
|
* determine if the recipient of the message-to-be-sent is ourselves. Hence,
|
|
|
|
* this comparison is okay to be LITERAL and can/should use the ns.compare_fields
|
|
|
|
* function
|
2004-08-05 19:30:36 +04:00
|
|
|
*/
|
|
|
|
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
int mca_oob_tcp_process_name_compare(const orte_process_name_t* n1, const orte_process_name_t* n2)
|
2004-08-05 19:30:36 +04:00
|
|
|
{
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 22:34:59 +03:00
|
|
|
return orte_ns.compare_fields(ORTE_NS_CMP_ALL, n1, n2);
|
2004-08-05 19:30:36 +04:00
|
|
|
}
|
|
|
|
|
2004-08-16 23:39:54 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Return local process address as a URI string.
|
|
|
|
*/
|
|
|
|
|
|
|
|
char* mca_oob_tcp_get_addr(void)
|
|
|
|
{
|
|
|
|
int i;
|
2005-07-04 05:36:20 +04:00
|
|
|
char *contact_info = (char *)malloc((opal_ifcount()+1) * 32);
|
2004-08-16 23:39:54 +04:00
|
|
|
char *ptr = contact_info;
|
|
|
|
*ptr = 0;
|
|
|
|
|
2005-07-04 05:36:20 +04:00
|
|
|
for(i=opal_ifbegin(); i>0; i=opal_ifnext(i)) {
|
2007-04-25 23:08:07 +04:00
|
|
|
struct sockaddr_storage addr;
|
2005-01-14 00:44:58 +03:00
|
|
|
char name[32];
|
2005-07-04 05:36:20 +04:00
|
|
|
opal_ifindextoname(i, name, sizeof(name));
|
2005-01-14 00:44:58 +03:00
|
|
|
if (mca_oob_tcp_component.tcp_include != NULL &&
|
2007-04-25 05:55:40 +04:00
|
|
|
strstr(mca_oob_tcp_component.tcp_include,name) == NULL) {
|
2005-01-14 00:44:58 +03:00
|
|
|
continue;
|
2007-04-25 05:55:40 +04:00
|
|
|
}
|
2005-01-14 00:44:58 +03:00
|
|
|
if (mca_oob_tcp_component.tcp_exclude != NULL &&
|
2007-04-25 05:55:40 +04:00
|
|
|
strstr(mca_oob_tcp_component.tcp_exclude,name) != NULL) {
|
2005-01-14 00:44:58 +03:00
|
|
|
continue;
|
2007-04-25 05:55:40 +04:00
|
|
|
}
|
2007-05-17 05:17:59 +04:00
|
|
|
opal_ifindextoaddr(i, (struct sockaddr*) &addr, sizeof(addr));
|
2007-05-31 06:29:44 +04:00
|
|
|
if(mca_oob_tcp_component.tcp_ignore_localhost &&
|
2007-05-17 05:17:59 +04:00
|
|
|
opal_net_islocalhost((struct sockaddr*) &addr)) {
|
2004-09-02 03:07:40 +04:00
|
|
|
continue;
|
2007-04-25 05:55:40 +04:00
|
|
|
}
|
2004-08-16 23:39:54 +04:00
|
|
|
if(ptr != contact_info) {
|
|
|
|
ptr += sprintf(ptr, ";");
|
|
|
|
}
|
2007-04-25 23:08:07 +04:00
|
|
|
|
|
|
|
if (addr.ss_family == AF_INET) {
|
2007-05-17 05:17:59 +04:00
|
|
|
ptr += sprintf(ptr, "tcp://%s:%d", opal_net_get_hostname((struct sockaddr*) &addr),
|
2007-04-25 05:55:40 +04:00
|
|
|
ntohs(mca_oob_tcp_component.tcp_listen_port));
|
|
|
|
}
|
|
|
|
|
2007-04-25 23:08:07 +04:00
|
|
|
#if OPAL_WANT_IPV6
|
|
|
|
if (addr.ss_family == AF_INET6) {
|
2007-05-17 05:17:59 +04:00
|
|
|
ptr += sprintf(ptr, "tcp6://%s:%d", opal_net_get_hostname((struct sockaddr*) &addr),
|
2007-04-25 05:55:40 +04:00
|
|
|
ntohs(mca_oob_tcp_component.tcp6_listen_port));
|
|
|
|
}
|
|
|
|
#endif
|
2007-04-25 23:08:07 +04:00
|
|
|
|
2004-08-16 23:39:54 +04:00
|
|
|
}
|
|
|
|
return contact_info;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Parse a URI string into an IP address and port number.
|
|
|
|
*/
|
|
|
|
|
2007-04-25 05:55:40 +04:00
|
|
|
#if OPAL_WANT_IPV6
|
|
|
|
int mca_oob_tcp_parse_uri(const char* uri, struct sockaddr_in6* inaddr)
|
|
|
|
#else
|
2004-08-19 23:34:37 +04:00
|
|
|
int mca_oob_tcp_parse_uri(const char* uri, struct sockaddr_in* inaddr)
|
2007-04-25 05:55:40 +04:00
|
|
|
#endif
|
2004-08-16 23:39:54 +04:00
|
|
|
{
|
|
|
|
char* tmp = strdup(uri);
|
|
|
|
char* ptr = tmp + 6;
|
|
|
|
char* addr = ptr;
|
|
|
|
char* port;
|
2007-04-25 15:51:18 +04:00
|
|
|
#if OPAL_WANT_IPV6
|
2007-04-25 05:55:40 +04:00
|
|
|
uint16_t af_family = AF_INET;
|
2007-04-25 15:51:18 +04:00
|
|
|
#endif
|
2007-04-25 05:55:40 +04:00
|
|
|
|
|
|
|
#if OPAL_WANT_IPV6
|
|
|
|
if(strncmp(tmp, "tcp6://", 7) == 0) {
|
|
|
|
af_family = AF_INET6;
|
|
|
|
addr++; /* we have one more character to skip ('[') */
|
|
|
|
} else {
|
|
|
|
if(strncmp(tmp, "tcp://", 6) != 0) {
|
|
|
|
free(tmp);
|
|
|
|
return ORTE_ERR_BAD_PARAM;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#else
|
2004-08-16 23:39:54 +04:00
|
|
|
if(strncmp(tmp, "tcp://", 6) != 0) {
|
|
|
|
free(tmp);
|
2006-02-12 04:33:29 +03:00
|
|
|
return ORTE_ERR_BAD_PARAM;
|
2004-08-16 23:39:54 +04:00
|
|
|
}
|
2007-04-25 05:55:40 +04:00
|
|
|
#endif
|
|
|
|
|
|
|
|
ptr = strrchr(addr, ':');
|
2004-08-16 23:39:54 +04:00
|
|
|
if(NULL == ptr) {
|
|
|
|
free(tmp);
|
2006-02-12 04:33:29 +03:00
|
|
|
return ORTE_ERR_BAD_PARAM;
|
2004-08-16 23:39:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
*ptr = '\0';
|
|
|
|
ptr++;
|
|
|
|
port = ptr;
|
|
|
|
|
|
|
|
memset(inaddr, 0, sizeof(inaddr));
|
2007-04-25 05:55:40 +04:00
|
|
|
#if OPAL_WANT_IPV6
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
struct addrinfo hints, *res;
|
|
|
|
memset(&hints, 0, sizeof(hints));
|
|
|
|
hints.ai_family = af_family;
|
|
|
|
hints.ai_socktype = SOCK_STREAM;
|
|
|
|
error = getaddrinfo (addr, NULL, &hints, &res);
|
|
|
|
|
|
|
|
if (error) {
|
|
|
|
opal_output (0, "oob_tcp_parse_uri: Could not resolve %s. [Error: %s]\n",
|
|
|
|
addr, gai_strerror (error));
|
|
|
|
free (tmp);
|
|
|
|
return ORTE_ERR_BAD_PARAM;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (res->ai_family != af_family) {
|
|
|
|
/* should never happen */
|
|
|
|
opal_output (0, "oob_tcp_parse_uri: getaddrinfo returned wrong af_family for %s",
|
|
|
|
addr);
|
|
|
|
free (tmp);
|
|
|
|
return ORTE_ERROR;
|
|
|
|
}
|
|
|
|
|
|
|
|
memcpy (inaddr, res->ai_addr, res->ai_addrlen);
|
|
|
|
freeaddrinfo (res);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (inaddr->sin6_family != af_family) {
|
|
|
|
/* should never happen */
|
|
|
|
opal_output (0, "oob_tcp_parse_uri: getaddrinfo+memcpy resulted in wrong af_family for %s",
|
|
|
|
addr);
|
|
|
|
free (tmp);
|
|
|
|
return ORTE_ERROR;
|
|
|
|
}
|
|
|
|
|
|
|
|
inaddr->sin6_port = htons(atoi(port));
|
|
|
|
#else
|
2004-08-16 23:39:54 +04:00
|
|
|
inaddr->sin_family = AF_INET;
|
|
|
|
inaddr->sin_addr.s_addr = inet_addr(addr);
|
|
|
|
if(inaddr->sin_addr.s_addr == INADDR_ANY) {
|
|
|
|
free(tmp);
|
2006-02-12 04:33:29 +03:00
|
|
|
return ORTE_ERR_BAD_PARAM;
|
2004-08-16 23:39:54 +04:00
|
|
|
}
|
|
|
|
inaddr->sin_port = htons(atoi(port));
|
2007-04-25 05:55:40 +04:00
|
|
|
#endif
|
2004-08-16 23:39:54 +04:00
|
|
|
free(tmp);
|
2006-02-12 04:33:29 +03:00
|
|
|
return ORTE_SUCCESS;
|
2004-08-16 23:39:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
Not as bad as this all may look. Tim and I made a significant change to the way we handle the startup of the oob, the seed, etc. We have made it backwards-compatible so that mpirun2 and singleton operations remain working. We had to adjust the name server and gpr as well, plus the process_info structure.
This also includes a checkpoint update to openmpi.c and ompid.c. I have re-enabled the ompid compile.
This latter raises an important point. The trunk compiles the programs like ompid just fine under Linux. It also does just fine for OSX under the dynamic libraries. However, we are seeing errors when compiling under OSX for the static case - the linker seems to have trouble resolving some variable names, even though linker diagnostics show the variables as being defined. Thus, a warning to Mac users that you may have to locally turn things off if you are trying to do static compiles. We ask, however, that you don't commit those changes that turn things off for everyone else - instead, let's try to figure out why the static compile is having a problem, and let everyone else continue to work.
Thanks
Ralph
This commit was SVN r2534.
2004-09-08 07:59:06 +04:00
|
|
|
* Setup address in the cache. Note that this could be called multiple
|
|
|
|
* times if a given destination exports multiple addresses.
|
2004-08-16 23:39:54 +04:00
|
|
|
*/
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
int mca_oob_tcp_set_addr(const orte_process_name_t* name, const char* uri)
|
2004-08-16 23:39:54 +04:00
|
|
|
{
|
2007-04-25 05:55:40 +04:00
|
|
|
#if OPAL_WANT_IPV6
|
|
|
|
struct sockaddr_in6 inaddr;
|
|
|
|
#else
|
2004-08-16 23:39:54 +04:00
|
|
|
struct sockaddr_in inaddr;
|
2007-04-25 05:55:40 +04:00
|
|
|
#endif
|
2004-09-02 03:07:40 +04:00
|
|
|
mca_oob_tcp_addr_t* addr;
|
2005-03-14 23:57:21 +03:00
|
|
|
mca_oob_tcp_peer_t* peer;
|
2004-08-16 23:39:54 +04:00
|
|
|
int rc;
|
2006-02-12 04:33:29 +03:00
|
|
|
if((rc = mca_oob_tcp_parse_uri(uri,&inaddr)) != ORTE_SUCCESS)
|
2004-08-16 23:39:54 +04:00
|
|
|
return rc;
|
|
|
|
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock);
|
2005-07-19 17:33:04 +04:00
|
|
|
addr = (mca_oob_tcp_addr_t*)orte_hash_table_get_proc(&mca_oob_tcp_component.tcp_peer_names, name);
|
2004-09-02 03:07:40 +04:00
|
|
|
if(NULL == addr) {
|
|
|
|
addr = OBJ_NEW(mca_oob_tcp_addr_t);
|
Not as bad as this all may look. Tim and I made a significant change to the way we handle the startup of the oob, the seed, etc. We have made it backwards-compatible so that mpirun2 and singleton operations remain working. We had to adjust the name server and gpr as well, plus the process_info structure.
This also includes a checkpoint update to openmpi.c and ompid.c. I have re-enabled the ompid compile.
This latter raises an important point. The trunk compiles the programs like ompid just fine under Linux. It also does just fine for OSX under the dynamic libraries. However, we are seeing errors when compiling under OSX for the static case - the linker seems to have trouble resolving some variable names, even though linker diagnostics show the variables as being defined. Thus, a warning to Mac users that you may have to locally turn things off if you are trying to do static compiles. We ask, however, that you don't commit those changes that turn things off for everyone else - instead, let's try to figure out why the static compile is having a problem, and let everyone else continue to work.
Thanks
Ralph
This commit was SVN r2534.
2004-09-08 07:59:06 +04:00
|
|
|
addr->addr_name = *name;
|
2005-07-19 17:33:04 +04:00
|
|
|
orte_hash_table_set_proc(&mca_oob_tcp_component.tcp_peer_names, &addr->addr_name, addr);
|
2004-08-16 23:39:54 +04:00
|
|
|
}
|
2004-09-02 03:07:40 +04:00
|
|
|
rc = mca_oob_tcp_addr_insert(addr, &inaddr);
|
2005-07-19 17:33:04 +04:00
|
|
|
peer = (mca_oob_tcp_peer_t *)orte_hash_table_get_proc(
|
2005-03-14 23:57:21 +03:00
|
|
|
&mca_oob_tcp_component.tcp_peers, &addr->addr_name);
|
|
|
|
if(NULL != peer) {
|
|
|
|
mca_oob_tcp_peer_resolved(peer, addr);
|
|
|
|
}
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
|
2004-09-02 03:07:40 +04:00
|
|
|
return rc;
|
2004-08-16 23:39:54 +04:00
|
|
|
}
|
|
|
|
|
2007-03-17 02:11:45 +03:00
|
|
|
|
|
|
|
/* Dummy function for when we are not using FT. */
|
|
|
|
#if OPAL_ENABLE_FT == 0
|
|
|
|
int mca_oob_tcp_ft_event(int state) {
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
int mca_oob_tcp_ft_event(int state) {
|
|
|
|
int exit_status = ORTE_SUCCESS;
|
|
|
|
|
|
|
|
if(OPAL_CRS_CHECKPOINT == state) {
|
|
|
|
/*
|
|
|
|
* Disable event processing while we are working
|
|
|
|
*/
|
|
|
|
OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock);
|
|
|
|
opal_event_disable();
|
|
|
|
|
|
|
|
}
|
|
|
|
else if(OPAL_CRS_CONTINUE == state) {
|
|
|
|
/*
|
|
|
|
* Resume event processing
|
|
|
|
*/
|
|
|
|
opal_event_enable();
|
|
|
|
OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
|
|
|
|
}
|
|
|
|
else if(OPAL_CRS_RESTART == state) {
|
2007-04-25 23:51:52 +04:00
|
|
|
/*
|
|
|
|
* Resume event processing
|
|
|
|
*/
|
|
|
|
opal_event_enable();
|
2007-03-17 02:11:45 +03:00
|
|
|
OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
|
|
|
|
}
|
|
|
|
else if(OPAL_CRS_TERM == state ) {
|
|
|
|
;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
;
|
|
|
|
}
|
|
|
|
|
|
|
|
return exit_status;
|
|
|
|
}
|
|
|
|
#endif
|