1
1
openmpi/orte/mca/oob/tcp/oob_tcp.h
Ralph Castain 2a116ecdfc Fix a race condition created when two processes attempt to send to each other at the same time. This causes both processes to start connection procedures, resulting in a conflict that can cause messages to be lost. Add detection of this condition, and have both processes cancel their connect operations. The process with the higher rank will reconnect, while the lower rank process will simply wait for the connection to be created.

Refs trac:3696

This commit was SVN r29139.

The following Trac tickets were found above:
  Ticket 3696 --> https://svn.open-mpi.org/trac/ompi/ticket/3696
2013-09-06 05:15:25 +00:00

120 строки
4.7 KiB
C

/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef _MCA_OOB_TCP_H_
#define _MCA_OOB_TCP_H_
#include "orte_config.h"
#include "orte/types.h"
#include "opal/mca/base/base.h"
#include "opal/class/opal_free_list.h"
#include "opal/class/opal_hash_table.h"
#include "opal/mca/event/event.h"
#include "orte/mca/oob/oob.h"
#include "orte/mca/oob/base/base.h"
BEGIN_C_DECLS
/*
 * Debug levels used by the OOB TCP code.
 * NOTE(review): presumably these are verbosity thresholds handed to the
 * output/logging calls (failure reports at level 2, connection tracing at
 * level 7) -- confirm against the call sites, which are not in this header.
 */
#define OOB_TCP_DEBUG_FAIL 2
#define OOB_TCP_DEBUG_CONNECT 7
/*
 * Forward declarations of two structure tags. The function-pointer typedefs
 * that follow only take pointers to these types, so incomplete types are
 * sufficient at this point. The definition of struct mca_oob_tcp_msg_error_t
 * is not visible in this header.
 */
struct mca_oob_tcp_module_t;
struct mca_oob_tcp_msg_error_t;
/*
 * Tracks one address of a NIC.
 *
 * The first member is an opal_list_item_t, so instances can presumably be
 * linked onto an opal_list_t -- TODO confirm which list(s) hold them.
 *
 * NOTE(review): addr is a plain struct sockaddr. On common platforms that is
 * smaller than struct sockaddr_in6, so a full IPv6 address would not fit;
 * struct sockaddr_storage is the type sized for any family. Confirm how addr
 * is filled before relying on it for AF_INET6.
 */
typedef struct {
opal_list_item_t super; /* list linkage / object base */
uint16_t af_family; /* address family of addr */
struct sockaddr addr; /* the socket address itself (see size note above) */
} mca_oob_tcp_nicaddr_t;
/* Class declaration for the type above (OPAL object-system macro). */
OBJ_CLASS_DECLARATION(mca_oob_tcp_nicaddr_t);
/*
 * Module definition: function-pointer types for the entry points a module
 * provides. The module is always referred to through a pointer to the
 * forward-declared struct mca_oob_tcp_module_t, and every entry point
 * returns void.
 */

/* Module initialization entry point. */
typedef void (*mca_oob_tcp_module_init_fn_t)(
    struct mca_oob_tcp_module_t *mod);

/* Module finalization entry point. */
typedef void (*mca_oob_tcp_module_fini_fn_t)(
    struct mca_oob_tcp_module_t *mod);

/*
 * Hands a connection to the module: the socket descriptor accepted_fd plus
 * a socket address (presumably the remote end's -- TODO confirm).
 */
typedef void (*mca_oob_tcp_module_accept_connection_fn_t)(
    struct mca_oob_tcp_module_t *md,
    const int accepted_fd,
    const struct sockaddr *addr);

/*
 * Supplies addressing information for the peer called name: an address
 * family and two strings, net and ports. The string formats are not
 * specified in this header.
 */
typedef void (*mca_oob_tcp_module_set_peer_fn_t)(
    struct mca_oob_tcp_module_t *mod,
    const orte_process_name_t *name,
    const uint16_t af_family,
    const char *net,
    const char *ports);

/* Ping the process identified by proc. */
typedef void (*mca_oob_tcp_module_ping_fn_t)(
    struct mca_oob_tcp_module_t *mod,
    const orte_process_name_t *proc);

/* Send msg ("nb" presumably stands for non-blocking -- TODO confirm). */
typedef void (*mca_oob_tcp_module_send_nb_fn_t)(
    struct mca_oob_tcp_module_t *mod,
    orte_rml_send_t *msg);

/*
 * Resend entry point. Unlike the others it takes no module pointer, only a
 * struct mca_oob_tcp_msg_error_t descriptor.
 */
typedef void (*mca_oob_tcp_module_resend_nb_fn_t)(
    struct mca_oob_tcp_msg_error_t *mop);

/*
 * ft_event entry point, called with an integer state
 * (presumably a fault-tolerance notification -- TODO confirm).
 */
typedef void (*mca_oob_tcp_module_ft_event_fn_t)(
    struct mca_oob_tcp_module_t *mod,
    int state);
/*
 * Table of module entry points: one slot per function-pointer type declared
 * above. Keep the member order stable -- any positional initializer of this
 * struct (not visible in this header) depends on it.
 */
typedef struct {
mca_oob_tcp_module_init_fn_t init; /* module initialization */
mca_oob_tcp_module_fini_fn_t finalize; /* module finalization */
mca_oob_tcp_module_accept_connection_fn_t accept_connection; /* take over an accepted socket */
mca_oob_tcp_module_set_peer_fn_t set_peer; /* provide addressing info for a peer */
mca_oob_tcp_module_ping_fn_t ping; /* ping a process */
mca_oob_tcp_module_send_nb_fn_t send_nb; /* send a message */
mca_oob_tcp_module_resend_nb_fn_t resend; /* resend from a message-error descriptor */
mca_oob_tcp_module_ft_event_fn_t ft_event; /* ft_event notification with a state value */
} mca_oob_tcp_module_api_t;
/**
 * OOB TCP module state.
 *
 * The struct carries the tag mca_oob_tcp_module_t on purpose: it completes
 * the "struct mca_oob_tcp_module_t" forward declaration that every
 * mca_oob_tcp_module_*_fn_t prototype in this header uses. With an untagged
 * struct the typedef would name a different, unrelated type, the
 * forward-declared tag would stay incomplete, and handing a
 * mca_oob_tcp_module_t * to an API entry point would be an
 * incompatible-pointer conversion.
 */
typedef struct mca_oob_tcp_module_t {
    mca_oob_tcp_module_api_t api;    /* table of module entry points */
    int idx;                         /* index in the module array */
    opal_event_base_t *ev_base;      /* event base for the module progress thread */
    bool ev_active;                  /* NOTE(review): presumably set while the
                                      * event base is being progressed -- confirm */
    opal_thread_t progress_thread;   /* the module progress thread object */
    int af_family;                   /* interface family - v4 or v6 */
    char *if_name;                   /* string name of the interface */
    int if_kidx;                     /* interface kernel index */
    opal_list_t addresses;           /* list of addresses served by this NIC */
    opal_hash_table_t peers;         /* connection addresses for peers */
} mca_oob_tcp_module_t;

/* Declaration only; the object itself is defined outside this header. */
ORTE_MODULE_DECLSPEC extern mca_oob_tcp_module_t mca_oob_tcp_module;
/**
 * The state of the connection.
 *
 * The numeric values are written out explicitly; they are exactly the values
 * that implicit numbering in declaration order would assign (0 through 7).
 */
typedef enum {
    MCA_OOB_TCP_UNCONNECTED = 0,
    MCA_OOB_TCP_CLOSED      = 1,
    MCA_OOB_TCP_RESOLVE     = 2,
    MCA_OOB_TCP_CONNECTING  = 3,
    MCA_OOB_TCP_CONNECT_ACK = 4,
    MCA_OOB_TCP_CONNECTED   = 5,
    MCA_OOB_TCP_FAILED      = 6,
    MCA_OOB_TCP_ACCEPTING   = 7
} mca_oob_tcp_state_t;
/*
 * Module-level shared functions.
 *
 * Both take (int fd, short args, void *cbdata) and return void.
 * NOTE(review): that parameter list has the shape of an event-library
 * callback, so these are presumably registered as the write-ready and
 * read-ready handlers for a socket -- confirm in the implementation; what
 * cbdata points to is not visible from this header.
 */
ORTE_MODULE_DECLSPEC void mca_oob_tcp_send_handler(int fd, short args, void *cbdata);
ORTE_MODULE_DECLSPEC void mca_oob_tcp_recv_handler(int fd, short args, void *cbdata);
END_C_DECLS
#endif /* _MCA_OOB_TCP_H_ */