2004-07-01 18:49:54 +04:00
|
|
|
/*
|
2007-03-17 02:11:45 +03:00
|
|
|
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
2005-11-05 22:57:48 +03:00
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
2006-08-23 07:32:36 +04:00
|
|
|
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
2005-11-05 22:57:48 +03:00
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
2004-11-28 23:09:25 +03:00
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
|
|
* University of Stuttgart. All rights reserved.
|
2005-03-24 15:43:37 +03:00
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
Fix a number of OOB issues:
* Remove the connect() timeout code, as it had some nasty race conditions
when connections were established as the trigger was firing. A better
solution has been found for the cluster where this was needed, so just
removing it was easiest.
* When a fatal error (too many connection failures) occurs, set an error
on messages in the queue even if there isn't an active message. The
first message to any peer will be queued without being active (and
so will all subsequent messages until the connection is established),
and the orteds will hang until that first message completes. So if
an orted can never contact it's peer, it will never exit and just sit
waiting for that message to complete.
* Cover an interesting RST condition in the connect code. A connection
can complete the three-way handshake, the connector can even send
some data, but the server side will drop the connection because it
can't move it from the half-connected to fully-connected state because
of space shortage in the listen backlog queue. This causes a RST to
be received first time that recv() is called, which will be when waiting
for the remote side of the OOB ack. In this case, transition the
connection back into a CLOSED state and try to connect again.
* Add levels of debugging, rather than all or nothing, each building on
the previous level. 0 (default) is hard errors. 1 is connection
error debugging info. 2 is all connection info. 3 is more state
info. 4 includes all message info.
* Add some hopefully useful comments
This commit was SVN r14261.
2007-04-08 02:33:30 +04:00
|
|
|
* Copyright (c) 2006-2007 Los Alamos National Security, LLC.
|
|
|
|
* All rights reserved.
|
2004-11-22 04:38:40 +03:00
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
2004-07-01 18:49:54 +04:00
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
/** @file:
|
|
|
|
*
|
|
|
|
* Defines the functions for the tcp module.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef _MCA_OOB_TCP_H_
|
|
|
|
#define _MCA_OOB_TCP_H_
|
|
|
|
|
2006-02-12 04:33:29 +03:00
|
|
|
#include "orte/mca/oob/oob.h"
|
|
|
|
#include "orte/mca/oob/base/base.h"
|
|
|
|
#include "opal/mca/base/base.h"
|
|
|
|
#include "orte/mca/ns/ns_types.h"
|
2005-07-02 20:46:27 +04:00
|
|
|
#include "opal/class/opal_free_list.h"
|
2006-02-12 04:33:29 +03:00
|
|
|
#include "opal/class/opal_hash_table.h"
|
2006-03-29 02:09:40 +04:00
|
|
|
#include "opal/runtime/opal_progress.h"
|
2007-03-17 02:11:45 +03:00
|
|
|
#include "opal/runtime/opal_cr.h"
|
2005-07-04 02:45:48 +04:00
|
|
|
#include "opal/threads/mutex.h"
|
|
|
|
#include "opal/threads/condition.h"
|
2006-02-12 04:33:29 +03:00
|
|
|
#include "orte/mca/oob/tcp/oob_tcp_peer.h"
|
|
|
|
#include "orte/mca/oob/tcp/oob_tcp_msg.h"
|
2007-03-17 02:11:45 +03:00
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
#include "opal/mca/timer/base/base.h"
|
2004-07-01 18:49:54 +04:00
|
|
|
|
|
|
|
|
|
|
|
#if defined(c_plusplus) || defined(__cplusplus)
|
|
|
|
extern "C" {
|
|
|
|
#endif
|
|
|
|
|
2007-03-17 02:11:45 +03:00
|
|
|
/* String key associated with the OOB TCP component.
 * NOTE(review): presumably the key under which TCP contact info is stored or
 * looked up -- confirm against the users of this macro. */
#define ORTE_OOB_TCP_KEY "oob-tcp"
|
|
|
|
|
Fix a number of OOB issues:
* Remove the connect() timeout code, as it had some nasty race conditions
when connections were established as the trigger was firing. A better
solution has been found for the cluster where this was needed, so just
removing it was easiest.
* When a fatal error (too many connection failures) occurs, set an error
on messages in the queue even if there isn't an active message. The
first message to any peer will be queued without being active (and
so will all subsequent messages until the connection is established),
and the orteds will hang until that first message completes. So if
an orted can never contact it's peer, it will never exit and just sit
waiting for that message to complete.
* Cover an interesting RST condition in the connect code. A connection
can complete the three-way handshake, the connector can even send
some data, but the server side will drop the connection because it
can't move it from the half-connected to fully-connected state because
of space shortage in the listen backlog queue. This causes a RST to
be received first time that recv() is called, which will be when waiting
for the remote side of the OOB ack. In this case, transition the
connection back into a CLOSED state and try to connect again.
* Add levels of debugging, rather than all or nothing, each building on
the previous level. 0 (default) is hard errors. 1 is connection
error debugging info. 2 is all connection info. 3 is more state
info. 4 includes all message info.
* Add some hopefully useful comments
This commit was SVN r14261.
2007-04-08 02:33:30 +04:00
|
|
|
/*
 * Debug verbosity levels for the OOB TCP component.  Each level builds on
 * the previous one; level 0 (the default) reports hard errors only.
 */
#define OOB_TCP_DEBUG_CONNECT_FAIL 1  /* debug connection establishment failures */
#define OOB_TCP_DEBUG_CONNECT      2  /* other connection information */
#define OOB_TCP_DEBUG_INFO         3  /* information about startup, connection establish, etc. */
#define OOB_TCP_DEBUG_ALL          4  /* everything else */
|
|
|
|
|
|
|
|
|
2004-07-01 18:49:54 +04:00
|
|
|
/*
|
2004-08-19 23:34:37 +04:00
|
|
|
* standard component functions
|
2004-07-01 18:49:54 +04:00
|
|
|
*/
|
2004-08-19 23:34:37 +04:00
|
|
|
int mca_oob_tcp_component_open(void);
|
|
|
|
int mca_oob_tcp_component_close(void);
|
2005-03-14 23:57:21 +03:00
|
|
|
mca_oob_t* mca_oob_tcp_component_init(int* priority);
|
2004-08-19 23:34:37 +04:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Hook function to allow the selected oob components
|
|
|
|
* to register their contact info with the registry
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Takes no arguments; the outcome is reported through the int return value
 * (NOTE(review): presumably an ORTE status code -- confirm). */
int mca_oob_tcp_init(void);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Cleanup resources during shutdown.
|
|
|
|
*/
|
|
|
|
/* Takes no arguments; the outcome is reported through the int return value
 * (NOTE(review): presumably an ORTE status code -- confirm). */
int mca_oob_tcp_fini(void);
|
2004-07-01 18:49:54 +04:00
|
|
|
|
Commit the orted-failed-to-start code. This correctly causes the system to detect the failure of an orted to start and allows the system to terminate all procs/orteds that *did* start.
The primary change that underlies all this is in the OOB. Specifically, the problem in the code until now has been that the OOB attempts to resolve an address when we call the "send" to an unknown recipient. The OOB would then wait forever if that recipient never actually started (and hence, never reported back its OOB contact info). In the case of an orted that failed to start, we would correctly detect that the orted hadn't started, but then we would attempt to order all orteds (including the one that failed to start) to die. This would cause the OOB to "hang" the system.
Unfortunately, revising how the OOB resolves addresses introduced a number of additional problems. Specifically, and most troublesome, was the fact that comm_spawn involved the immediate transmission of the rendezvous point from parent-to-child after the child was spawned. The current code used the OOB address resolution as a "barrier" - basically, the parent would attempt to send the info to the child, and then "hold" there until the child's contact info had arrived (meaning the child had started) and the send could be completed.
Note that this also caused comm_spawn to "hang" the entire system if the child never started... The app-failed-to-start helped improve that behavior - this code provides additional relief.
With this change, the OOB will return an ADDRESSEE_UNKNOWN error if you attempt to send to a recipient whose contact info isn't already in the OOB's hash tables. To resolve comm_spawn issues, we also now force the cross-sharing of connection info between parent and child jobs during spawn.
Finally, to aid in setting triggers to the right values, we introduce the "arith" API for the GPR. This function allows you to atomically change the value in a registry location (either divide, multiply, add, or subtract) by the provided operand. It is equivalent to first fetching the value using a "get", then modifying it, and then putting the result back into the registry via a "put".
This commit was SVN r14711.
2007-05-21 22:31:28 +04:00
|
|
|
/*
|
|
|
|
* Register my contact info with the General Purpose Registry
|
|
|
|
* This function causes the component to "put" its contact info
|
|
|
|
* on the registry.
|
|
|
|
*/
|
|
|
|
/* Takes no arguments; the outcome is reported through the int return value
 * (NOTE(review): presumably an ORTE status code -- confirm). */
int mca_oob_tcp_register_contact_info(void);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Register a subscription to receive contact info on other processes
|
|
|
|
* This function will typically be called from within a GPR compound command
|
|
|
|
* to register a subscription against a stage gate trigger. When fired, this
|
|
|
|
* will return the OOB contact info for all processes in the specified job
|
|
|
|
*/
|
|
|
|
/* job:     the job whose processes' contact info is being subscribed to.
 * trigger: NOTE(review): presumably the name of the stage gate trigger the
 *          subscription is attached to -- confirm.
 * Returns an int status (NOTE(review): presumably an ORTE status code). */
int mca_oob_tcp_register_subscription(orte_jobid_t job, char *trigger);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get contact info for a process or job
|
|
|
|
* Returns contact info for the specified process. If the vpid in the process name
|
|
|
|
* is WILDCARD, then it returns the contact info for all processes in the specified
|
|
|
|
* job. If the jobid is WILDCARD, then it returns the contact info for processes
|
|
|
|
* of the specified vpid across all jobs. Obviously, combining the two WILDCARD
|
|
|
|
* values will return contact info for everyone!
|
|
|
|
*/
|
|
|
|
/* name: process name to look up; its vpid and/or jobid may be WILDCARD.
 * data: NOTE(review): presumably an output that receives the contact info;
 *       who owns/frees *data is not stated in this header -- confirm. */
ORTE_DECLSPEC int mca_oob_tcp_get_contact_info(orte_process_name_t *name, orte_gpr_notify_data_t **data);
|
|
|
|
|
2004-08-03 01:24:00 +04:00
|
|
|
/**
|
|
|
|
* Compare two process names for equality.
|
|
|
|
*
|
|
|
|
* @param n1 Process name 1.
|
|
|
|
* @param n2 Process name 2.
|
|
|
|
* @return (-1 for n1<n2, 0 for equality, 1 for n1>n2)
|
|
|
|
*
|
|
|
|
* Note that the definition of < or > is somewhat arbitrary -
|
|
|
|
* just needs to be consistently applied to maintain an ordering
|
|
|
|
* when process names are used as indices.
|
|
|
|
*/
|
2005-03-14 23:57:21 +03:00
|
|
|
/* Both names are taken through pointers to const.  The result is a
 * three-way comparison value: negative, zero or positive. */
int mca_oob_tcp_process_name_compare(const orte_process_name_t* n1, const orte_process_name_t* n2);
|
2005-09-15 21:13:13 +04:00
|
|
|
|
|
|
|
/**
|
2004-08-16 23:39:54 +04:00
|
|
|
* Obtain contact information for this host (e.g. <ipaddress>:<port>)
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Returns the contact string as a char*.
 * NOTE(review): ownership of the returned string is not stated in this
 * header -- confirm whether the caller must free it. */
char* mca_oob_tcp_get_addr(void);
|
|
|
|
|
|
|
|
/**
|
Not as bad as this all may look. Tim and I made a significant change to the way we handle the startup of the oob, the seed, etc. We have made it backwards-compatible so that mpirun2 and singleton operations remain working. We had to adjust the name server and gpr as well, plus the process_info structure.
This also includes a checkpoint update to openmpi.c and ompid.c. I have re-enabled the ompid compile.
This latter raises an important point. The trunk compiles the programs like ompid just fine under Linux. It also does just fine for OSX under the dynamic libraries. However, we are seeing errors when compiling under OSX for the static case - the linker seems to have trouble resolving some variable names, even though linker diagnostics show the variables as being defined. Thus, a warning to Mac users that you may have to locally turn things off if you are trying to do static compiles. We ask, however, that you don't commit those changes that turn things off for everyone else - instead, let's try to figure out why the static compile is having a problem, and let everyone else continue to work.
Thanks
Ralph
This commit was SVN r2534.
2004-09-08 07:59:06 +04:00
|
|
|
* Setup cached addresses for the peers.
|
2004-08-16 23:39:54 +04:00
|
|
|
*/
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
/* name: process name of the peer whose address is being cached.
 * uri:  NOTE(review): presumably the peer's contact string in the same
 *       form produced by mca_oob_tcp_get_addr() -- confirm.
 * Returns an int status (NOTE(review): presumably an ORTE status code). */
int mca_oob_tcp_set_addr(const orte_process_name_t* name, const char* uri);
|
2004-08-16 23:39:54 +04:00
|
|
|
|
2004-09-08 21:02:24 +04:00
|
|
|
/**
|
|
|
|
* A routine to ping a given process name to determine if it is reachable.
|
|
|
|
*
|
|
|
|
* @param name The peer name.
|
|
|
|
* @param tv The length of time to wait on a connection/response.
|
|
|
|
*
|
|
|
|
* Note that this routine blocks up to the specified timeout waiting for a
|
|
|
|
* connection / response from the specified peer. If the peer is unavailable
|
|
|
|
* an error status is returned.
|
|
|
|
*/
|
|
|
|
|
2005-05-05 20:31:40 +04:00
|
|
|
/* name: the peer name.
 * uri:  NOTE(review): presumably the contact URI of the peer to ping --
 *       confirm.
 * tv:   the length of time to wait on a connection/response.
 * Returns an error status if the peer is unavailable. */
int mca_oob_tcp_ping(const orte_process_name_t* name, const char* uri, const struct timeval* tv);
|
2004-09-08 21:02:24 +04:00
|
|
|
|
2004-07-01 18:49:54 +04:00
|
|
|
/**
|
2004-07-14 01:03:03 +04:00
|
|
|
* Similar to unix writev(2).
|
2004-07-01 18:49:54 +04:00
|
|
|
*
|
|
|
|
* @param peer (IN) Opaque name of peer process.
|
|
|
|
* @param msg (IN) Array of iovecs describing user buffers and lengths.
|
|
|
|
* @param count (IN) Number of elements in iovec array.
|
2004-08-03 01:24:00 +04:00
|
|
|
* @param tag (IN) User defined tag for matching send/recv.
|
2004-07-01 18:49:54 +04:00
|
|
|
* @param flags (IN) Currently unused.
|
|
|
|
* @return OMPI error code (<0) on error or number of bytes actually sent.
|
|
|
|
*/
|
|
|
|
|
2004-08-03 01:24:00 +04:00
|
|
|
int mca_oob_tcp_send(
|
2005-03-14 23:57:21 +03:00
|
|
|
orte_process_name_t* peer,
|
2004-08-13 02:41:42 +04:00
|
|
|
struct iovec *msg,
|
2004-08-03 01:24:00 +04:00
|
|
|
int count,
|
|
|
|
int tag,
|
|
|
|
int flags);
|
2004-07-01 18:49:54 +04:00
|
|
|
|
|
|
|
/**
|
2004-07-14 01:03:03 +04:00
|
|
|
* Similar to unix readv(2)
|
2004-07-01 18:49:54 +04:00
|
|
|
*
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 22:34:59 +03:00
|
|
|
* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive.
|
2004-07-01 18:49:54 +04:00
|
|
|
* @param msg (IN) Array of iovecs describing user buffers and lengths.
|
|
|
|
* @param count (IN) Number of elements in iovec array.
|
2004-08-03 01:24:00 +04:00
|
|
|
* @param tag (IN) User defined tag for matching send/recv.
|
2004-07-15 23:08:54 +04:00
|
|
|
* @param flags (IN) May be MCA_OOB_PEEK to return up to the number of bytes provided in the
|
2004-07-01 18:49:54 +04:00
|
|
|
* iovec array without removing the message from the queue.
|
|
|
|
* @return OMPI error code (<0) on error or number of bytes actually received.
|
|
|
|
*/
|
|
|
|
|
2004-08-03 01:24:00 +04:00
|
|
|
int mca_oob_tcp_recv(
|
2005-03-14 23:57:21 +03:00
|
|
|
orte_process_name_t* peer,
|
2004-08-13 02:41:42 +04:00
|
|
|
struct iovec * msg,
|
2004-08-03 01:24:00 +04:00
|
|
|
int count,
|
2005-03-14 23:57:21 +03:00
|
|
|
int tag,
|
2004-08-03 01:24:00 +04:00
|
|
|
int flags);
|
2004-07-01 18:49:54 +04:00
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Non-blocking versions of send/recv.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Non-blocking version of mca_oob_send().
|
|
|
|
*
|
|
|
|
* @param peer (IN) Opaque name of peer process.
|
|
|
|
* @param msg (IN) Array of iovecs describing user buffers and lengths.
|
|
|
|
* @param count (IN) Number of elements in iovec array.
|
2004-08-03 01:24:00 +04:00
|
|
|
* @param tag (IN) User defined tag for matching send/recv.
|
2004-07-01 18:49:54 +04:00
|
|
|
* @param flags (IN) Currently unused.
|
|
|
|
* @param cbfunc (IN) Callback function on send completion.
|
|
|
|
* @param cbdata (IN) User data that is passed to callback function.
|
|
|
|
* @return OMPI error code (<0) on error or number of bytes actually sent.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
2004-08-03 01:24:00 +04:00
|
|
|
int mca_oob_tcp_send_nb(
|
2005-03-14 23:57:21 +03:00
|
|
|
orte_process_name_t* peer,
|
2004-08-13 02:41:42 +04:00
|
|
|
struct iovec* msg,
|
2004-08-03 01:24:00 +04:00
|
|
|
int count,
|
|
|
|
int tag,
|
|
|
|
int flags,
|
|
|
|
mca_oob_callback_fn_t cbfunc,
|
|
|
|
void* cbdata);
|
2004-07-01 18:49:54 +04:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Non-blocking version of mca_oob_recv().
|
|
|
|
*
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 22:34:59 +03:00
|
|
|
* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive.
|
2004-07-01 18:49:54 +04:00
|
|
|
* @param msg (IN) Array of iovecs describing user buffers and lengths.
|
|
|
|
* @param count (IN) Number of elements in iovec array.
|
2004-08-03 01:24:00 +04:00
|
|
|
* @param tag (IN) User defined tag for matching send/recv.
|
2004-07-15 23:08:54 +04:00
|
|
|
* @param flags (IN) May be MCA_OOB_PEEK to return up to size bytes of msg w/out removing it from the queue.
|
2004-07-01 18:49:54 +04:00
|
|
|
* @param cbfunc (IN) Callback function on recv completion.
|
|
|
|
* @param cbdata (IN) User data that is passed to callback function.
|
|
|
|
* @return OMPI error code (<0) on error or number of bytes actually received.
|
|
|
|
*/
|
|
|
|
|
2004-08-03 01:24:00 +04:00
|
|
|
int mca_oob_tcp_recv_nb(
|
2005-03-14 23:57:21 +03:00
|
|
|
orte_process_name_t* peer,
|
2004-08-13 02:41:42 +04:00
|
|
|
struct iovec* msg,
|
2004-08-03 01:24:00 +04:00
|
|
|
int count,
|
|
|
|
int tag,
|
|
|
|
int flags,
|
|
|
|
mca_oob_callback_fn_t cbfunc,
|
|
|
|
void* cbdata);
|
2004-07-01 18:49:54 +04:00
|
|
|
|
2004-09-30 19:09:29 +04:00
|
|
|
/**
|
|
|
|
* Cancel non-blocking receive.
|
|
|
|
*
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 22:34:59 +03:00
|
|
|
* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive.
|
2004-09-30 19:09:29 +04:00
|
|
|
* @param tag (IN) User defined tag for matching send/recv.
|
|
|
|
* @return OMPI error code (<0) on error or number of bytes actually received.
|
|
|
|
*/
|
|
|
|
|
|
|
|
int mca_oob_tcp_recv_cancel(
|
2005-03-14 23:57:21 +03:00
|
|
|
orte_process_name_t* peer,
|
2004-09-30 19:09:29 +04:00
|
|
|
int tag);
|
|
|
|
|
2004-09-02 03:07:40 +04:00
|
|
|
/**
|
|
|
|
* Attempt to map a peer name to its corresponding address.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* peer: the peer whose name should be mapped to an address.
 * Returns an int status (NOTE(review): presumably an ORTE status code --
 * confirm). */
int mca_oob_tcp_resolve(mca_oob_tcp_peer_t* peer);
|
|
|
|
|
2004-08-19 23:34:37 +04:00
|
|
|
/**
|
|
|
|
* Parse a URI string into an IP address and port number.
|
|
|
|
*/
|
|
|
|
/* uri:    the URI string to parse.
 * inaddr: socket address structure for the parsed IP address and port
 *         (NOTE(review): presumably filled in as an output -- confirm).
 *         Its type is sockaddr_in6 when OPAL_WANT_IPV6 is enabled and
 *         sockaddr_in otherwise.
 * Returns an int status (NOTE(review): presumably an ORTE status code). */
int mca_oob_tcp_parse_uri(
    const char* uri,
#if OPAL_WANT_IPV6
    struct sockaddr_in6* inaddr
#else
    struct sockaddr_in* inaddr
#endif
);
|
|
|
|
|
2004-11-20 22:12:43 +03:00
|
|
|
/**
|
|
|
|
* Callback from registry on change to subscribed segments
|
|
|
|
*/
|
|
|
|
void mca_oob_tcp_registry_callback(
|
2005-03-14 23:57:21 +03:00
|
|
|
orte_gpr_notify_data_t* data,
|
2004-11-20 22:12:43 +03:00
|
|
|
void* cbdata);
|
|
|
|
|
2005-10-31 19:21:11 +03:00
|
|
|
/**
|
|
|
|
* Setup socket options
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* sd: descriptor of the socket to configure.
 * NOTE(review): presumably applies the component's tcp_sndbuf/tcp_rcvbuf
 * settings among other options -- confirm in the implementation. */
void mca_oob_tcp_set_socket_options(int sd);
|
2004-07-01 18:49:54 +04:00
|
|
|
|
2007-03-17 02:11:45 +03:00
|
|
|
/* NOTE(review): presumably the fault-tolerance (checkpoint/restart) event
 * hook for this component, with `state` identifying the event being
 * signalled -- confirm against opal_cr.h.  Returns an int status. */
int mca_oob_tcp_ft_event(int state);
|
|
|
|
|
2006-10-05 09:27:04 +04:00
|
|
|
/*
 * Selects the mechanism used to listen for incoming connections.
 * NOTE(review): the per-value meanings below are inferred from the
 * enumerator names -- confirm against the component implementation.
 */
typedef enum {
    OOB_TCP_EVENT,          /* presumably: connections handled through the event library */
    OOB_TCP_LISTEN_THREAD   /* presumably: connections handled by a dedicated listen thread */
} mca_oob_tcp_listen_type_t;
|
2007-03-17 02:11:45 +03:00
|
|
|
|
2004-07-13 02:46:57 +04:00
|
|
|
/**
|
|
|
|
* OOB TCP Component
|
|
|
|
*/
|
|
|
|
struct mca_oob_tcp_component_t {
|
2004-07-15 17:51:40 +04:00
|
|
|
mca_oob_base_component_1_0_0_t super; /**< base OOB component */
|
2005-03-19 02:40:08 +03:00
|
|
|
char* tcp_include; /**< list of ip interfaces to include */
|
|
|
|
char* tcp_exclude; /**< list of ip interfaces to exclude */
|
2007-04-25 05:55:40 +04:00
|
|
|
int tcp_listen_sd; /**< listen socket for incoming IPv4 connection requests */
|
|
|
|
unsigned short tcp_listen_port; /**< IPv4 listen port */
|
|
|
|
#if OPAL_WANT_IPV6
|
|
|
|
int tcp6_listen_sd; /**< listen socket for incoming IPv6 connection requests */
|
|
|
|
unsigned short tcp6_listen_port; /**< IPv6 listen port */
|
|
|
|
#endif
|
2005-07-03 20:22:16 +04:00
|
|
|
opal_list_t tcp_subscriptions; /**< list of registry subscriptions */
|
|
|
|
opal_list_t tcp_peer_list; /**< list of peers sorted in mru order */
|
2005-07-03 20:52:32 +04:00
|
|
|
opal_hash_table_t tcp_peers; /**< peers sorted by name */
|
|
|
|
opal_hash_table_t tcp_peer_names; /**< cache of peer contact info sorted by name */
|
2005-07-02 20:46:27 +04:00
|
|
|
opal_free_list_t tcp_peer_free; /**< free list of peers */
|
Not as bad as this all may look. Tim and I made a significant change to the way we handle the startup of the oob, the seed, etc. We have made it backwards-compatible so that mpirun2 and singleton operations remain working. We had to adjust the name server and gpr as well, plus the process_info structure.
This also includes a checkpoint update to openmpi.c and ompid.c. I have re-enabled the ompid compile.
This latter raises an important point. The trunk compiles the programs like ompid just fine under Linux. It also does just fine for OSX under the dynamic libraries. However, we are seeing errors when compiling under OSX for the static case - the linker seems to have trouble resolving some variable names, even though linker diagnostics show the variables as being defined. Thus, a warning to Mac users that you may have to locally turn things off if you are trying to do static compiles. We ask, however, that you don't commit those changes that turn things off for everyone else - instead, let's try to figure out why the static compile is having a problem, and let everyone else continue to work.
Thanks
Ralph
This commit was SVN r2534.
2004-09-08 07:59:06 +04:00
|
|
|
int tcp_peer_limit; /**< max size of tcp peer cache */
|
2004-08-16 23:39:54 +04:00
|
|
|
int tcp_peer_retries; /**< max number of retries before declaring peer gone */
|
2005-10-31 19:21:11 +03:00
|
|
|
int tcp_sndbuf; /**< socket send buffer size */
|
|
|
|
int tcp_rcvbuf; /**< socket recv buffer size */
|
2005-07-02 20:46:27 +04:00
|
|
|
opal_free_list_t tcp_msgs; /**< free list of messages */
|
2007-04-25 05:55:40 +04:00
|
|
|
opal_event_t tcp_send_event; /**< event structure for IPv4 sends */
|
|
|
|
opal_event_t tcp_recv_event; /**< event structure for IPv4 recvs */
|
|
|
|
#if OPAL_WANT_IPV6
|
|
|
|
opal_event_t tcp6_send_event; /**< event structure for IPv6 sends */
|
|
|
|
opal_event_t tcp6_recv_event; /**< event structure for IPv6 recvs */
|
|
|
|
#endif
|
2005-07-04 02:45:48 +04:00
|
|
|
opal_mutex_t tcp_lock; /**< lock for accessing module state */
|
2005-07-03 20:22:16 +04:00
|
|
|
opal_list_t tcp_events; /**< list of pending events (accepts) */
|
|
|
|
opal_list_t tcp_msg_post; /**< list of recieves user has posted */
|
|
|
|
opal_list_t tcp_msg_recv; /**< list of recieved messages */
|
2005-10-25 17:48:08 +04:00
|
|
|
opal_list_t tcp_msg_completed; /**< list of completed messages */
|
2005-07-04 02:45:48 +04:00
|
|
|
opal_mutex_t tcp_match_lock; /**< lock held while searching/posting messages */
|
|
|
|
opal_condition_t tcp_match_cond; /**< condition variable used in finalize */
|
2004-09-30 19:09:29 +04:00
|
|
|
int tcp_match_count; /**< number of matched recvs in progress */
|
2004-09-02 03:07:40 +04:00
|
|
|
int tcp_debug; /**< debug level */
|
2006-09-15 01:29:51 +04:00
|
|
|
|
|
|
|
bool tcp_shutdown;
|
2006-10-05 09:27:04 +04:00
|
|
|
mca_oob_tcp_listen_type_t tcp_listen_type;
|
Fix a number of OOB issues:
* Remove the connect() timeout code, as it had some nasty race conditions
when connections were established as the trigger was firing. A better
solution has been found for the cluster where this was needed, so just
removing it was easiest.
* When a fatal error (too many connection failures) occurs, set an error
on messages in the queue even if there isn't an active message. The
first message to any peer will be queued without being active (and
so will all subsequent messages until the connection is established),
and the orteds will hang until that first message completes. So if
an orted can never contact it's peer, it will never exit and just sit
waiting for that message to complete.
* Cover an interesting RST condition in the connect code. A connection
can complete the three-way handshake, the connector can even send
some data, but the server side will drop the connection because it
can't move it from the half-connected to fully-connected state because
of space shortage in the listen backlog queue. This causes a RST to
be received first time that recv() is called, which will be when waiting
for the remote side of the OOB ack. In this case, transition the
connection back into a CLOSED state and try to connect again.
* Add levels of debugging, rather than all or nothing, each building on
the previous level. 0 (default) is hard errors. 1 is connection
error debugging info. 2 is all connection info. 3 is more state
info. 4 includes all message info.
* Add some hopefully useful comments
This commit was SVN r14261.
2007-04-08 02:33:30 +04:00
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
opal_thread_t tcp_listen_thread;
|
|
|
|
opal_free_list_t tcp_pending_connections_fl;
|
|
|
|
opal_list_t tcp_pending_connections;
|
|
|
|
opal_list_t tcp_copy_out_connections;
|
|
|
|
opal_list_t tcp_copy_in_connections;
|
Fix a number of OOB issues:
* Remove the connect() timeout code, as it had some nasty race conditions
when connections were established as the trigger was firing. A better
solution has been found for the cluster where this was needed, so just
removing it was easiest.
* When a fatal error (too many connection failures) occurs, set an error
on messages in the queue even if there isn't an active message. The
first message to any peer will be queued without being active (and
so will all subsequent messages until the connection is established),
and the orteds will hang until that first message completes. So if
an orted can never contact it's peer, it will never exit and just sit
waiting for that message to complete.
* Cover an interesting RST condition in the connect code. A connection
can complete the three-way handshake, the connector can even send
some data, but the server side will drop the connection because it
can't move it from the half-connected to fully-connected state because
of space shortage in the listen backlog queue. This causes a RST to
be received first time that recv() is called, which will be when waiting
for the remote side of the OOB ack. In this case, transition the
connection back into a CLOSED state and try to connect again.
* Add levels of debugging, rather than all or nothing, each building on
the previous level. 0 (default) is hard errors. 1 is connection
error debugging info. 2 is all connection info. 3 is more state
info. 4 includes all message info.
* Add some hopefully useful comments
This commit was SVN r14261.
2007-04-08 02:33:30 +04:00
|
|
|
opal_list_t tcp_connections_return;
|
|
|
|
opal_list_t tcp_connections_return_copy;
|
2006-09-15 01:29:51 +04:00
|
|
|
opal_mutex_t tcp_pending_connections_lock;
|
Fix a number of OOB issues:
* Remove the connect() timeout code, as it had some nasty race conditions
when connections were established as the trigger was firing. A better
solution has been found for the cluster where this was needed, so just
removing it was easiest.
* When a fatal error (too many connection failures) occurs, set an error
on messages in the queue even if there isn't an active message. The
first message to any peer will be queued without being active (and
so will all subsequent messages until the connection is established),
and the orteds will hang until that first message completes. So if
an orted can never contact it's peer, it will never exit and just sit
waiting for that message to complete.
* Cover an interesting RST condition in the connect code. A connection
can complete the three-way handshake, the connector can even send
some data, but the server side will drop the connection because it
can't move it from the half-connected to fully-connected state because
of space shortage in the listen backlog queue. This causes a RST to
be received first time that recv() is called, which will be when waiting
for the remote side of the OOB ack. In this case, transition the
connection back into a CLOSED state and try to connect again.
* Add levels of debugging, rather than all or nothing, each building on
the previous level. 0 (default) is hard errors. 1 is connection
error debugging info. 2 is all connection info. 3 is more state
info. 4 includes all message info.
* Add some hopefully useful comments
This commit was SVN r14261.
2007-04-08 02:33:30 +04:00
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
opal_timer_t tcp_last_copy_time;
|
|
|
|
opal_timer_t tcp_copy_delta;
|
|
|
|
int tcp_copy_max_size;
|
|
|
|
int tcp_copy_spin_count;
|
2006-11-06 21:00:46 +03:00
|
|
|
int connect_sleep;
|
2007-05-31 06:29:44 +04:00
|
|
|
|
|
|
|
bool tcp_ignore_localhost; /**< should use localhost as an address or not */
|
2004-07-13 02:46:57 +04:00
|
|
|
};
|
2004-08-16 23:39:54 +04:00
|
|
|
|
2004-08-10 03:07:53 +04:00
|
|
|
/**
|
|
|
|
* Convenience Typedef
|
|
|
|
*/
|
2004-07-13 02:46:57 +04:00
|
|
|
/* Lets the component structure be named without the `struct` keyword. */
typedef struct mca_oob_tcp_component_t mca_oob_tcp_component_t;
|
|
|
|
|
2006-08-23 07:32:36 +04:00
|
|
|
/* Global OOB TCP component instance; only declared here, the definition
 * lives in another translation unit. */
ORTE_MODULE_DECLSPEC extern mca_oob_tcp_component_t mca_oob_tcp_component;
|
2004-07-13 02:46:57 +04:00
|
|
|
|
2007-05-07 17:05:52 +04:00
|
|
|
/* Declared here, defined elsewhere.
 * NOTE(review): presumably the output-stream handle used for this
 * component's diagnostic messages -- confirm. */
extern int mca_oob_tcp_output_handle;
|
2007-03-17 02:11:45 +03:00
|
|
|
|
2006-08-23 07:32:36 +04:00
|
|
|
/*
 * Portable socket close: expands to closesocket() when __WINDOWS__ is
 * defined and to close() otherwise.  The expression evaluates to whatever
 * the underlying call returns.
 */
#if defined(__WINDOWS__)
#define CLOSE_THE_SOCKET(socket)    closesocket(socket)
#else
#define CLOSE_THE_SOCKET(socket)    close(socket)
#endif  /* defined(__WINDOWS__) */
|
2004-07-13 02:46:57 +04:00
|
|
|
|
2007-03-17 02:11:45 +03:00
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
struct mca_oob_tcp_pending_connection_t {
|
|
|
|
opal_free_list_item_t super;
|
|
|
|
int fd;
|
2007-04-25 05:55:40 +04:00
|
|
|
/* Bug, FIXME: Port to IPv6 */
|
2006-09-15 01:29:51 +04:00
|
|
|
struct sockaddr_in addr;
|
|
|
|
};
|
|
|
|
typedef struct mca_oob_tcp_pending_connection_t mca_oob_tcp_pending_connection_t;
|
|
|
|
OBJ_CLASS_DECLARATION(mca_oob_tcp_pending_connection_t);
|
|
|
|
|
2007-03-17 02:11:45 +03:00
|
|
|
|
2004-07-01 18:49:54 +04:00
|
|
|
#if defined(c_plusplus) || defined(__cplusplus)
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#endif /* _MCA_OOB_TCP_H_ */
|
|
|
|
|