2009-12-15 01:18:27 +00:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
2011-06-23 20:38:02 +00:00
|
|
|
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
2009-12-15 01:18:27 +00:00
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
#include "orte_config.h"
|
|
|
|
#include "orte/constants.h"
|
|
|
|
#include "opal/types.h"
|
|
|
|
|
|
|
|
#ifdef HAVE_NETINET_IN_H
|
|
|
|
#include <netinet/in.h>
|
|
|
|
#endif
|
|
|
|
#ifdef HAVE_ARPA_INET_H
|
|
|
|
#include <arpa/inet.h>
|
|
|
|
#endif
|
|
|
|
#include <errno.h>
|
|
|
|
#include <fcntl.h>
|
|
|
|
|
|
|
|
#include "opal/class/opal_list.h"
|
|
|
|
#include "opal/opal_socket_errno.h"
|
|
|
|
#include "opal/util/output.h"
|
|
|
|
#include "opal/util/argv.h"
|
|
|
|
#include "opal/util/if.h"
|
|
|
|
#include "opal/util/net.h"
|
|
|
|
#include "opal/dss/dss.h"
|
|
|
|
|
|
|
|
#include "orte/runtime/orte_globals.h"
|
|
|
|
#include "orte/runtime/orte_wait.h"
|
|
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
|
|
#include "orte/util/name_fns.h"
|
|
|
|
#include "orte/util/show_help.h"
|
|
|
|
#include "orte/mca/grpcomm/grpcomm.h"
|
|
|
|
#include "orte/mca/rml/rml.h"
|
|
|
|
#include "orte/mca/rml/base/rml_contact.h"
|
2010-05-25 22:36:57 +00:00
|
|
|
#include "orte/mca/odls/odls_types.h"
|
2011-01-13 01:54:05 +00:00
|
|
|
#include "orte/threads/threads.h"
|
2009-12-15 01:18:27 +00:00
|
|
|
|
|
|
|
#include "orte/mca/rmcast/base/private.h"
|
|
|
|
#include "orte/mca/rmcast/base/base.h"
|
|
|
|
#include "rmcast_tcp.h"
|
|
|
|
|
|
|
|
/* LOCAL DATA */
|
|
|
|
static bool init_completed = false;
|
2010-05-25 22:36:57 +00:00
|
|
|
static orte_job_t *daemons=NULL;
|
2010-12-01 13:41:19 +00:00
|
|
|
static bool comm_enabled = false;
|
2011-01-13 01:54:05 +00:00
|
|
|
static orte_thread_ctl_t ctl;
|
2011-02-12 16:52:03 +00:00
|
|
|
static opal_list_t tools;
|
2009-12-15 01:18:27 +00:00
|
|
|
|
|
|
|
/* LOCAL FUNCTIONS */
|
|
|
|
static void recv_handler(int status, orte_process_name_t* sender,
|
|
|
|
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
|
|
|
void* cbdata);
|
|
|
|
|
2010-11-08 19:09:23 +00:00
|
|
|
static int send_data(rmcast_base_send_t *snd, orte_rmcast_channel_t channel);
|
|
|
|
|
2009-12-15 01:18:27 +00:00
|
|
|
/* API FUNCTIONS */
|
|
|
|
static int init(void);
|
|
|
|
|
|
|
|
static void finalize(void);
|
|
|
|
|
|
|
|
static int tcp_send_buffer(orte_rmcast_channel_t channel,
|
|
|
|
orte_rmcast_tag_t tag,
|
|
|
|
opal_buffer_t *buf);
|
|
|
|
|
|
|
|
static int tcp_send_buffer_nb(orte_rmcast_channel_t channel,
|
|
|
|
orte_rmcast_tag_t tag,
|
|
|
|
opal_buffer_t *buf,
|
|
|
|
orte_rmcast_callback_buffer_fn_t cbfunc,
|
|
|
|
void *cbdata);
|
|
|
|
|
|
|
|
static int tcp_send(orte_rmcast_channel_t channel,
|
|
|
|
orte_rmcast_tag_t tag,
|
|
|
|
struct iovec *msg, int count);
|
|
|
|
|
|
|
|
static int tcp_send_nb(orte_rmcast_channel_t channel,
|
|
|
|
orte_rmcast_tag_t tag,
|
|
|
|
struct iovec *msg, int count,
|
|
|
|
orte_rmcast_callback_fn_t cbfunc,
|
|
|
|
void *cbdata);
|
|
|
|
|
|
|
|
static int tcp_recv_buffer(orte_process_name_t *sender,
|
|
|
|
orte_rmcast_channel_t channel,
|
|
|
|
orte_rmcast_tag_t tag,
|
2010-11-07 23:29:52 +00:00
|
|
|
orte_rmcast_seq_t *seq_num,
|
2010-02-14 19:20:56 +00:00
|
|
|
opal_buffer_t *buf);
|
2009-12-15 01:18:27 +00:00
|
|
|
|
|
|
|
static int tcp_recv_buffer_nb(orte_rmcast_channel_t channel,
|
|
|
|
orte_rmcast_tag_t tag,
|
|
|
|
orte_rmcast_flag_t flags,
|
|
|
|
orte_rmcast_callback_buffer_fn_t cbfunc,
|
|
|
|
void *cbdata);
|
|
|
|
|
|
|
|
static int tcp_recv(orte_process_name_t *sender,
|
|
|
|
orte_rmcast_channel_t channel,
|
|
|
|
orte_rmcast_tag_t tag,
|
2010-11-07 23:29:52 +00:00
|
|
|
orte_rmcast_seq_t *seq_num,
|
2010-02-14 19:20:56 +00:00
|
|
|
struct iovec **msg, int *count);
|
2009-12-15 01:18:27 +00:00
|
|
|
|
|
|
|
static int tcp_recv_nb(orte_rmcast_channel_t channel,
|
|
|
|
orte_rmcast_tag_t tag,
|
|
|
|
orte_rmcast_flag_t flags,
|
|
|
|
orte_rmcast_callback_fn_t cbfunc,
|
|
|
|
void *cbdata);
|
|
|
|
|
2010-05-25 22:36:57 +00:00
|
|
|
static int open_channel(orte_rmcast_channel_t channel, char *name,
|
2009-12-15 01:18:27 +00:00
|
|
|
char *network, int port, char *interface, uint8_t direction);
|
|
|
|
|
2010-12-01 13:41:19 +00:00
|
|
|
static void enable_comm(void);
|
|
|
|
|
|
|
|
static void disable_comm(void);
|
|
|
|
|
2011-01-13 01:54:05 +00:00
|
|
|
static void process_msg(orte_rmcast_msg_t *msg);
|
|
|
|
|
2009-12-15 01:18:27 +00:00
|
|
|
/* Define the module */
|
|
|
|
|
|
|
|
orte_rmcast_module_t orte_rmcast_tcp_module = {
|
|
|
|
init,
|
|
|
|
finalize,
|
|
|
|
tcp_send,
|
|
|
|
tcp_send_nb,
|
|
|
|
tcp_send_buffer,
|
|
|
|
tcp_send_buffer_nb,
|
|
|
|
tcp_recv,
|
|
|
|
tcp_recv_nb,
|
|
|
|
tcp_recv_buffer,
|
|
|
|
tcp_recv_buffer_nb,
|
2010-05-25 22:36:57 +00:00
|
|
|
orte_rmcast_base_cancel_recv,
|
2009-12-15 01:18:27 +00:00
|
|
|
open_channel,
|
2010-05-25 22:36:57 +00:00
|
|
|
orte_rmcast_base_close_channel,
|
2010-12-01 13:41:19 +00:00
|
|
|
orte_rmcast_base_query,
|
|
|
|
enable_comm,
|
2011-01-13 01:54:05 +00:00
|
|
|
disable_comm,
|
|
|
|
process_msg
|
2009-12-15 01:18:27 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
/* during init, we setup two channels for both xmit and recv:
|
|
|
|
* (a) a public address announcement channel. There are two variants
|
|
|
|
* of this:
|
|
|
|
* (1) system processes - e.g., daemons, tools. This channel
|
|
|
|
* is reserved solely for their use in performing admin
|
|
|
|
* functions
|
|
|
|
* (2) application processes. This channel is used to announce
|
|
|
|
* their existence and contact info for auto-wireup
|
|
|
|
* (b) our own group's channel, which is where our own output
|
|
|
|
* will be sent. At this time, we assume that we always
|
|
|
|
* want to hear our peers, so this channels is also
|
|
|
|
* bidirectional
|
|
|
|
*
|
|
|
|
* In addition, the HNP opens a third channel which is used solely
|
|
|
|
* for cmd-control purposes. This is where a tool, for example, might
|
|
|
|
* send a cmd to the HNP to take some action - there is no point in
|
|
|
|
* having that cmd echo around to every daemon and/or other tool
|
|
|
|
* in the system.
|
|
|
|
*/
|
|
|
|
static int init(void)
|
|
|
|
{
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
if (init_completed) {
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
init_completed = true;
|
|
|
|
|
|
|
|
OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
|
|
|
|
"%s rmcast:tcp: init called",
|
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
|
|
|
|
2011-01-13 01:54:05 +00:00
|
|
|
/* setup local ctl */
|
|
|
|
OBJ_CONSTRUCT(&ctl, orte_thread_ctl_t);
|
2011-02-12 16:52:03 +00:00
|
|
|
OBJ_CONSTRUCT(&tools, opal_list_t);
|
2011-01-13 01:54:05 +00:00
|
|
|
|
2009-12-15 01:18:27 +00:00
|
|
|
/* setup the respective public address channel */
|
2010-05-25 22:36:57 +00:00
|
|
|
if (ORTE_PROC_IS_TOOL) {
|
|
|
|
/* tools only open the sys channel */
|
|
|
|
if (ORTE_SUCCESS != (rc = open_channel(ORTE_RMCAST_SYS_CHANNEL, "system",
|
|
|
|
NULL, -1, NULL, ORTE_RMCAST_BIDIR))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2010-06-15 03:50:31 +00:00
|
|
|
orte_rmcast_base.my_output_channel = (rmcast_base_channel_t*)opal_list_get_last(&orte_rmcast_base.channels);
|
|
|
|
orte_rmcast_base.my_input_channel = NULL;
|
2011-02-12 16:52:03 +00:00
|
|
|
} else if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_SCHEDULER) {
|
2010-05-25 22:36:57 +00:00
|
|
|
/* daemons and hnp open the sys and data server channels */
|
|
|
|
if (ORTE_SUCCESS != (rc = open_channel(ORTE_RMCAST_SYS_CHANNEL, "system",
|
|
|
|
NULL, -1, NULL, ORTE_RMCAST_BIDIR))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2010-06-15 03:50:31 +00:00
|
|
|
orte_rmcast_base.my_output_channel = (rmcast_base_channel_t*)opal_list_get_last(&orte_rmcast_base.channels);
|
|
|
|
orte_rmcast_base.my_input_channel = NULL;
|
2010-05-25 22:36:57 +00:00
|
|
|
if (ORTE_SUCCESS != (rc = open_channel(ORTE_RMCAST_DATA_SERVER_CHANNEL, "data-server",
|
2009-12-15 01:18:27 +00:00
|
|
|
NULL, -1, NULL, ORTE_RMCAST_BIDIR))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2010-08-13 15:04:22 +00:00
|
|
|
/* open the error reporting channel */
|
|
|
|
if (ORTE_SUCCESS != (rc = open_channel(ORTE_RMCAST_ERROR_CHANNEL, "error",
|
|
|
|
NULL, -1, NULL, ORTE_RMCAST_BIDIR))) {
|
2011-03-24 19:05:39 +00:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
/* open the app public channel so we can hear app announcements and commands */
|
|
|
|
if (ORTE_SUCCESS != (rc = open_channel(ORTE_RMCAST_APP_PUBLIC_CHANNEL, "app-announce",
|
|
|
|
NULL, -1, NULL, ORTE_RMCAST_BIDIR))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
/* open the heartbeat channel */
|
|
|
|
if (ORTE_SUCCESS != (rc = open_channel(ORTE_RMCAST_HEARTBEAT_CHANNEL, "heartbeat",
|
|
|
|
NULL, -1, NULL, ORTE_RMCAST_BIDIR))) {
|
2010-08-13 15:04:22 +00:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2009-12-15 01:18:27 +00:00
|
|
|
} else if (ORTE_PROC_IS_APP) {
|
2010-05-25 22:36:57 +00:00
|
|
|
/* apps open the app public and data server channels */
|
|
|
|
if (ORTE_SUCCESS != (rc = open_channel(ORTE_RMCAST_APP_PUBLIC_CHANNEL, "app-announce",
|
2009-12-15 01:18:27 +00:00
|
|
|
NULL, -1, NULL, ORTE_RMCAST_BIDIR))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2010-05-25 22:36:57 +00:00
|
|
|
if (ORTE_SUCCESS != (rc = open_channel(ORTE_RMCAST_DATA_SERVER_CHANNEL, "data-server",
|
2009-12-15 01:18:27 +00:00
|
|
|
NULL, -1, NULL, ORTE_RMCAST_BIDIR))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2010-08-13 15:04:22 +00:00
|
|
|
/* open the error reporting channel */
|
|
|
|
if (ORTE_SUCCESS != (rc = open_channel(ORTE_RMCAST_ERROR_CHANNEL, "error",
|
|
|
|
NULL, -1, NULL, ORTE_RMCAST_BIDIR))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2011-06-10 16:28:18 +00:00
|
|
|
/* finally, setup our grp xmit/recv channels, if given */
|
|
|
|
if (NULL != orte_rmcast_base.my_group_name) {
|
2010-05-25 22:36:57 +00:00
|
|
|
if (ORTE_SUCCESS != (rc = open_channel(orte_rmcast_base.my_group_number,
|
2010-06-16 19:40:59 +00:00
|
|
|
"recv", NULL, -1, NULL, ORTE_RMCAST_RECV))) {
|
2010-05-25 22:36:57 +00:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2010-06-15 03:50:31 +00:00
|
|
|
orte_rmcast_base.my_input_channel = (rmcast_base_channel_t*)opal_list_get_last(&orte_rmcast_base.channels);
|
|
|
|
if (ORTE_SUCCESS != (rc = open_channel(orte_rmcast_base.my_group_number+1,
|
2010-06-16 19:40:59 +00:00
|
|
|
"xmit", NULL, -1, NULL, ORTE_RMCAST_XMIT))) {
|
2010-06-15 03:50:31 +00:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
orte_rmcast_base.my_output_channel = (rmcast_base_channel_t*)opal_list_get_last(&orte_rmcast_base.channels);
|
2010-05-25 22:36:57 +00:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
opal_output(0, "rmcast:tcp:init - unknown process type");
|
|
|
|
return ORTE_ERR_SILENT;
|
2009-12-15 01:18:27 +00:00
|
|
|
}
|
2010-05-25 22:36:57 +00:00
|
|
|
|
2009-12-15 01:18:27 +00:00
|
|
|
if (ORTE_JOBID_WILDCARD == orte_process_info.my_hnp.jobid) {
|
|
|
|
/* set the HNP info in our contact table */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(orte_process_info.my_hnp_uri))) {
|
|
|
|
orte_show_help("help-orcm-ps.txt", "orcm-ps:hnp-uri-bad", true, orte_process_info.my_hnp_uri);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
/* extract the name */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri,
|
|
|
|
&orte_process_info.my_hnp, NULL))) {
|
|
|
|
orte_show_help("help-orcm-ps.txt", "orcm-ps:hnp-uri-bad", true, orte_process_info.my_hnp_uri);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-11-08 19:09:23 +00:00
|
|
|
/* start the processing thread */
|
2011-01-13 01:54:05 +00:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_rmcast_base_start_threads())) {
|
2010-11-08 19:09:23 +00:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2009-12-15 01:18:27 +00:00
|
|
|
/* now activate the non-blocking recv so we catch messages */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
|
|
|
|
ORTE_RML_TAG_MULTICAST,
|
2010-11-08 19:09:23 +00:00
|
|
|
ORTE_RML_PERSISTENT,
|
2009-12-15 01:18:27 +00:00
|
|
|
recv_handler,
|
|
|
|
NULL))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2010-12-01 13:41:19 +00:00
|
|
|
comm_enabled = true;
|
2009-12-15 01:18:27 +00:00
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void finalize(void)
|
|
|
|
{
|
2011-02-12 16:52:03 +00:00
|
|
|
opal_list_item_t *item;
|
|
|
|
|
2009-12-15 01:18:27 +00:00
|
|
|
OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
|
|
|
|
"%s rmcast:tcp: finalize called",
|
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
|
|
|
|
2010-12-01 13:41:19 +00:00
|
|
|
/* stop the chatter */
|
|
|
|
comm_enabled = false;
|
|
|
|
|
2010-10-07 22:02:12 +00:00
|
|
|
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_MULTICAST);
|
2010-11-08 19:09:23 +00:00
|
|
|
|
|
|
|
/* stop the processing thread */
|
|
|
|
orte_rmcast_base_stop_threads();
|
|
|
|
|
2011-02-12 16:52:03 +00:00
|
|
|
while (NULL != (item = opal_list_remove_first(&tools))) {
|
|
|
|
OBJ_RELEASE(item);
|
|
|
|
}
|
|
|
|
OBJ_DESTRUCT(&tools);
|
|
|
|
|
2011-01-13 01:54:05 +00:00
|
|
|
OBJ_DESTRUCT(&ctl);
|
2009-12-15 01:18:27 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2010-12-01 13:41:19 +00:00
|
|
|
static void enable_comm(void)
|
|
|
|
{
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_ACQUIRE_THREAD(&ctl);
|
|
|
|
orte_rmcast_base_start_threads();
|
2010-12-01 13:41:19 +00:00
|
|
|
comm_enabled = true;
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_RELEASE_THREAD(&ctl);
|
2010-12-01 13:41:19 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void disable_comm(void)
|
|
|
|
{
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_ACQUIRE_THREAD(&ctl);
|
2010-12-01 13:41:19 +00:00
|
|
|
comm_enabled = false;
|
|
|
|
orte_rmcast_base_stop_threads();
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_RELEASE_THREAD(&ctl);
|
2009-12-15 01:18:27 +00:00
|
|
|
}
|
|
|
|
|
2011-02-12 16:52:03 +00:00
|
|
|
static void cbfunc(int status,
|
|
|
|
struct orte_process_name_t* peer,
|
|
|
|
struct opal_buffer_t* buffer,
|
|
|
|
orte_rml_tag_t tag,
|
|
|
|
void* cbdata)
|
|
|
|
{
|
|
|
|
OBJ_RELEASE(buffer);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2010-11-08 19:09:23 +00:00
|
|
|
static int send_data(rmcast_base_send_t *snd,
|
|
|
|
orte_rmcast_channel_t channel)
|
2009-12-15 01:18:27 +00:00
|
|
|
{
|
2011-02-12 16:52:03 +00:00
|
|
|
opal_list_item_t *item, *next;
|
2010-05-25 22:36:57 +00:00
|
|
|
orte_proc_t *proc;
|
|
|
|
orte_odls_child_t *child;
|
|
|
|
int rc, v;
|
|
|
|
opal_buffer_t *buf;
|
2010-11-08 19:09:23 +00:00
|
|
|
rmcast_base_channel_t *ch;
|
2011-02-12 16:52:03 +00:00
|
|
|
orte_namelist_t *tool;
|
2010-11-08 19:09:23 +00:00
|
|
|
|
2010-12-01 13:41:19 +00:00
|
|
|
if (!comm_enabled) {
|
|
|
|
return ORTE_ERR_COMM_DISABLED;
|
|
|
|
}
|
|
|
|
|
2010-05-25 22:36:57 +00:00
|
|
|
OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
|
|
|
|
"%s rmcast:tcp: send of %d %s"
|
|
|
|
" called on multicast channel %d",
|
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
|
|
(NULL == snd->iovec_array) ? (int)snd->buf->bytes_used : (int)snd->iovec_count,
|
|
|
|
(NULL == snd->iovec_array) ? "bytes" : "iovecs",
|
|
|
|
(int)channel));
|
2009-12-15 01:18:27 +00:00
|
|
|
|
2010-05-25 22:36:57 +00:00
|
|
|
/* setup the message for xmission */
|
2010-11-08 19:09:23 +00:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_rmcast_base_queue_xmit(snd, channel, &buf, &ch))) {
|
2009-12-15 01:18:27 +00:00
|
|
|
ORTE_ERROR_LOG(rc);
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_RELEASE_THREAD(&ctl);
|
2010-05-25 22:36:57 +00:00
|
|
|
return rc;
|
2009-12-15 01:18:27 +00:00
|
|
|
}
|
2010-05-25 22:36:57 +00:00
|
|
|
|
|
|
|
OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
|
|
|
|
"%s rmcast:tcp multicasting %d bytes to channel %d tag %d",
|
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)buf->bytes_used,
|
|
|
|
(int)ch->channel, (int)snd->tag));
|
2009-12-15 01:18:27 +00:00
|
|
|
|
2011-01-13 01:54:05 +00:00
|
|
|
if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
|
|
|
|
/* if we are a daemon, then we have to send it to the HNP
|
|
|
|
* for relay to all other daemons - we cannot send it
|
|
|
|
* ourselves as, at startup, we won't know who else is
|
|
|
|
* out there until -after- a startup handshake is
|
|
|
|
* exchanged via multicast
|
|
|
|
*/
|
|
|
|
if (ORTE_PROC_IS_DAEMON) {
|
|
|
|
OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
|
|
|
|
"%s rmcast:tcp sending to %s",
|
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_HNP)));
|
|
|
|
/* ignore errors */
|
2011-02-12 16:52:03 +00:00
|
|
|
OBJ_RETAIN(buf);
|
|
|
|
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf, ORTE_RML_TAG_MULTICAST, 0, cbfunc, NULL))) {
|
|
|
|
if (ORTE_ERR_ADDRESSEE_UNKNOWN != rc && ORTE_ERR_UNREACH != rc) {
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
}
|
2011-02-12 16:52:03 +00:00
|
|
|
rc = ORTE_SUCCESS; /* don't confuse up-stream client */
|
2010-05-25 22:36:57 +00:00
|
|
|
}
|
2011-01-13 01:54:05 +00:00
|
|
|
} else {
|
|
|
|
/* if we don't already have it, get the daemon object */
|
|
|
|
if (NULL == daemons) {
|
|
|
|
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
|
|
|
}
|
|
|
|
/* send it to each daemon other than myself */
|
|
|
|
for (v=1; v < daemons->procs->size; v++) {
|
|
|
|
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, v))) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (NULL == proc->rml_uri) {
|
|
|
|
/* not ready yet - don't know contact info */
|
|
|
|
OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
|
|
|
|
"%s rmcast:tcp dont have path to %s",
|
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
|
|
ORTE_NAME_PRINT(&proc->name)));
|
|
|
|
continue;
|
|
|
|
}
|
2010-08-29 13:45:10 +00:00
|
|
|
OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
|
2011-01-13 01:54:05 +00:00
|
|
|
"%s rmcast:tcp sending to %s",
|
2010-08-29 13:45:10 +00:00
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
|
|
ORTE_NAME_PRINT(&proc->name)));
|
2011-01-13 01:54:05 +00:00
|
|
|
/* ignore errors */
|
2011-02-12 16:52:03 +00:00
|
|
|
OBJ_RETAIN(buf);
|
|
|
|
if (0 > (rc = orte_rml.send_buffer_nb(&proc->name, buf, ORTE_RML_TAG_MULTICAST, 0, cbfunc, NULL))) {
|
|
|
|
if (ORTE_ERR_ADDRESSEE_UNKNOWN != rc && ORTE_ERR_UNREACH != rc) {
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
}
|
2011-02-12 16:52:03 +00:00
|
|
|
rc = ORTE_SUCCESS; /* don't confuse up-stream client */
|
|
|
|
}
|
|
|
|
|
|
|
|
/* now send it to all attached tools */
|
|
|
|
item = opal_list_get_first(&tools);
|
|
|
|
while (item != opal_list_get_end(&tools)) {
|
|
|
|
tool = (orte_namelist_t*)item;
|
|
|
|
next = opal_list_get_next(item);
|
|
|
|
OBJ_RETAIN(buf);
|
|
|
|
if (0 > (rc = orte_rml.send_buffer_nb(&tool->name, buf, ORTE_RML_TAG_MULTICAST, 0, cbfunc, NULL))) {
|
|
|
|
if (ORTE_ERR_ADDRESSEE_UNKNOWN != rc && ORTE_ERR_UNREACH != rc) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
}
|
|
|
|
opal_list_remove_item(&tools, item);
|
|
|
|
OBJ_RELEASE(item);
|
|
|
|
OBJ_RELEASE(buf);
|
|
|
|
rc = ORTE_SUCCESS; /* don't confuse up-stream client */
|
|
|
|
}
|
|
|
|
item = next;
|
2011-01-13 01:54:05 +00:00
|
|
|
}
|
2009-12-30 01:45:31 +00:00
|
|
|
}
|
2010-05-25 22:36:57 +00:00
|
|
|
}
|
|
|
|
/* send the message to my children */
|
|
|
|
for (item = opal_list_get_first(&orte_local_children);
|
|
|
|
item != opal_list_get_end(&orte_local_children);
|
|
|
|
item = opal_list_get_next(item)) {
|
|
|
|
child = (orte_odls_child_t*)item;
|
2011-01-13 01:54:05 +00:00
|
|
|
if (!child->alive) {
|
|
|
|
continue;
|
|
|
|
}
|
2010-07-23 19:31:34 +00:00
|
|
|
if (NULL == child->rml_uri) {
|
|
|
|
/* race condition - hasn't reported in yet */
|
|
|
|
continue;
|
|
|
|
}
|
2011-01-13 01:54:05 +00:00
|
|
|
/* ignore errors */
|
2011-02-12 16:52:03 +00:00
|
|
|
OBJ_RETAIN(buf);
|
|
|
|
if (0 > (rc = orte_rml.send_buffer_nb(child->name, buf, ORTE_RML_TAG_MULTICAST, 0, cbfunc, NULL))) {
|
|
|
|
if (ORTE_ERR_ADDRESSEE_UNKNOWN != rc && ORTE_ERR_UNREACH != rc) {
|
2010-08-13 15:04:22 +00:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
}
|
2011-02-12 16:52:03 +00:00
|
|
|
rc = ORTE_SUCCESS; /* don't confuse up-stream client */
|
2010-08-13 15:04:22 +00:00
|
|
|
}
|
|
|
|
}
|
2011-01-13 01:54:05 +00:00
|
|
|
rc = ORTE_SUCCESS;
|
|
|
|
} else {
|
|
|
|
/* I am a tool or an app - send it to my HNP for relay */
|
2010-05-25 22:36:57 +00:00
|
|
|
OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
|
|
|
|
"%s rmcast:tcp sending multicast to HNP %s",
|
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_HNP)));
|
2011-02-12 16:52:03 +00:00
|
|
|
OBJ_RETAIN(buf);
|
|
|
|
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf, ORTE_RML_TAG_MULTICAST, 0, cbfunc, NULL))) {
|
|
|
|
orte_errmgr.abort(rc, "%s Failed to send message to multicast channel %d",
|
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)ch->channel);
|
2009-12-15 01:18:27 +00:00
|
|
|
goto cleanup;
|
|
|
|
}
|
2010-05-25 22:36:57 +00:00
|
|
|
rc = ORTE_SUCCESS;
|
2009-12-15 01:18:27 +00:00
|
|
|
}
|
2010-05-25 22:36:57 +00:00
|
|
|
|
|
|
|
if (NULL != snd->buf) {
|
2009-12-15 01:18:27 +00:00
|
|
|
/* call the cbfunc if required */
|
|
|
|
if (NULL != snd->cbfunc_buffer) {
|
2010-11-07 23:29:52 +00:00
|
|
|
snd->cbfunc_buffer(rc, channel, ch->seq_num, snd->tag,
|
2010-02-14 19:20:56 +00:00
|
|
|
ORTE_PROC_MY_NAME,
|
2009-12-15 01:18:27 +00:00
|
|
|
snd->buf, snd->cbdata);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
/* call the cbfunc if required */
|
|
|
|
if (NULL != snd->cbfunc_iovec) {
|
2010-11-07 23:29:52 +00:00
|
|
|
snd->cbfunc_iovec(rc, channel, ch->seq_num, snd->tag,
|
2010-02-14 19:20:56 +00:00
|
|
|
ORTE_PROC_MY_NAME,
|
2009-12-15 01:18:27 +00:00
|
|
|
snd->iovec_array, snd->iovec_count, snd->cbdata);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-08-13 15:04:22 +00:00
|
|
|
cleanup:
|
2010-05-25 22:36:57 +00:00
|
|
|
OBJ_RELEASE(buf);
|
2009-12-15 01:18:27 +00:00
|
|
|
|
2010-05-25 22:36:57 +00:00
|
|
|
return rc;
|
2009-12-15 01:18:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int tcp_send(orte_rmcast_channel_t channel,
|
2011-01-13 01:54:05 +00:00
|
|
|
orte_rmcast_tag_t tag,
|
|
|
|
struct iovec *msg, int count)
|
2009-12-15 01:18:27 +00:00
|
|
|
{
|
|
|
|
rmcast_base_send_t snd;
|
|
|
|
int ret;
|
|
|
|
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_ACQUIRE_THREAD(&ctl);
|
|
|
|
|
2010-12-01 13:41:19 +00:00
|
|
|
if (!comm_enabled) {
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_RELEASE_THREAD(&ctl);
|
2010-12-01 13:41:19 +00:00
|
|
|
return ORTE_ERR_COMM_DISABLED;
|
|
|
|
}
|
|
|
|
|
2009-12-15 01:18:27 +00:00
|
|
|
/* queue it to be sent - preserves order! */
|
|
|
|
OBJ_CONSTRUCT(&snd, rmcast_base_send_t);
|
|
|
|
snd.iovec_array = msg;
|
|
|
|
snd.iovec_count = count;
|
|
|
|
snd.tag = tag;
|
2010-12-01 04:26:43 +00:00
|
|
|
|
2010-11-08 19:09:23 +00:00
|
|
|
if (ORTE_SUCCESS != (ret = send_data(&snd, channel))) {
|
2009-12-15 01:18:27 +00:00
|
|
|
ORTE_ERROR_LOG(ret);
|
|
|
|
}
|
|
|
|
|
2011-01-13 01:54:05 +00:00
|
|
|
/* carefully cleanup */
|
|
|
|
snd.iovec_array = NULL;
|
|
|
|
snd.iovec_count = 0;
|
2009-12-15 01:18:27 +00:00
|
|
|
OBJ_DESTRUCT(&snd);
|
2011-01-13 01:54:05 +00:00
|
|
|
|
|
|
|
ORTE_RELEASE_THREAD(&ctl);
|
|
|
|
return ret;
|
2009-12-15 01:18:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int tcp_send_nb(orte_rmcast_channel_t channel,
|
2011-01-13 01:54:05 +00:00
|
|
|
orte_rmcast_tag_t tag,
|
|
|
|
struct iovec *msg, int count,
|
|
|
|
orte_rmcast_callback_fn_t cbfunc,
|
|
|
|
void *cbdata)
|
2009-12-15 01:18:27 +00:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
rmcast_base_send_t snd;
|
|
|
|
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_ACQUIRE_THREAD(&ctl);
|
|
|
|
|
2010-12-01 13:41:19 +00:00
|
|
|
if (!comm_enabled) {
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_RELEASE_THREAD(&ctl);
|
2010-12-01 13:41:19 +00:00
|
|
|
return ORTE_ERR_COMM_DISABLED;
|
|
|
|
}
|
|
|
|
|
2009-12-15 01:18:27 +00:00
|
|
|
/* queue it to be sent - preserves order! */
|
|
|
|
OBJ_CONSTRUCT(&snd, rmcast_base_send_t);
|
|
|
|
snd.iovec_array = msg;
|
|
|
|
snd.iovec_count = count;
|
|
|
|
snd.tag = tag;
|
|
|
|
snd.cbfunc_iovec = cbfunc;
|
2010-01-15 20:02:47 +00:00
|
|
|
snd.cbdata = cbdata;
|
2009-12-15 01:18:27 +00:00
|
|
|
|
2010-11-08 19:09:23 +00:00
|
|
|
if (ORTE_SUCCESS != (ret = send_data(&snd, channel))) {
|
2009-12-15 01:18:27 +00:00
|
|
|
ORTE_ERROR_LOG(ret);
|
|
|
|
}
|
2011-01-13 01:54:05 +00:00
|
|
|
|
|
|
|
/* carefully cleanup */
|
|
|
|
snd.iovec_array = NULL;
|
|
|
|
snd.iovec_count = 0;
|
2009-12-15 01:18:27 +00:00
|
|
|
OBJ_DESTRUCT(&snd);
|
|
|
|
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_RELEASE_THREAD(&ctl);
|
|
|
|
return ret;
|
2009-12-15 01:18:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int tcp_send_buffer(orte_rmcast_channel_t channel,
|
2011-01-13 01:54:05 +00:00
|
|
|
orte_rmcast_tag_t tag,
|
|
|
|
opal_buffer_t *buf)
|
2009-12-15 01:18:27 +00:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
rmcast_base_send_t snd;
|
|
|
|
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_ACQUIRE_THREAD(&ctl);
|
|
|
|
|
2010-12-01 13:41:19 +00:00
|
|
|
if (!comm_enabled) {
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_RELEASE_THREAD(&ctl);
|
2010-12-01 13:41:19 +00:00
|
|
|
return ORTE_ERR_COMM_DISABLED;
|
|
|
|
}
|
|
|
|
|
2009-12-15 01:18:27 +00:00
|
|
|
/* queue it to be sent - preserves order! */
|
|
|
|
OBJ_CONSTRUCT(&snd, rmcast_base_send_t);
|
|
|
|
snd.buf = buf;
|
|
|
|
snd.tag = tag;
|
|
|
|
|
2010-11-08 19:09:23 +00:00
|
|
|
if (ORTE_SUCCESS != (ret = send_data(&snd, channel))) {
|
2009-12-15 01:18:27 +00:00
|
|
|
ORTE_ERROR_LOG(ret);
|
|
|
|
}
|
|
|
|
|
2011-01-13 01:54:05 +00:00
|
|
|
/* carefully cleanup */
|
|
|
|
snd.buf = NULL;
|
2009-12-15 01:18:27 +00:00
|
|
|
OBJ_DESTRUCT(&snd);
|
|
|
|
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_RELEASE_THREAD(&ctl);
|
2009-12-15 01:18:27 +00:00
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int tcp_send_buffer_nb(orte_rmcast_channel_t channel,
|
2011-01-13 01:54:05 +00:00
|
|
|
orte_rmcast_tag_t tag,
|
|
|
|
opal_buffer_t *buf,
|
|
|
|
orte_rmcast_callback_buffer_fn_t cbfunc,
|
|
|
|
void *cbdata)
|
2009-12-15 01:18:27 +00:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
rmcast_base_send_t snd;
|
|
|
|
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_ACQUIRE_THREAD(&ctl);
|
|
|
|
|
2010-12-01 13:41:19 +00:00
|
|
|
if (!comm_enabled) {
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_RELEASE_THREAD(&ctl);
|
2010-12-01 13:41:19 +00:00
|
|
|
return ORTE_ERR_COMM_DISABLED;
|
|
|
|
}
|
|
|
|
|
2009-12-15 01:18:27 +00:00
|
|
|
/* queue it to be sent - preserves order! */
|
|
|
|
OBJ_CONSTRUCT(&snd, rmcast_base_send_t);
|
|
|
|
snd.buf = buf;
|
|
|
|
snd.tag = tag;
|
|
|
|
snd.cbfunc_buffer = cbfunc;
|
2010-01-15 20:02:47 +00:00
|
|
|
snd.cbdata = cbdata;
|
2009-12-15 01:18:27 +00:00
|
|
|
|
2010-11-08 19:09:23 +00:00
|
|
|
if (ORTE_SUCCESS != (ret = send_data(&snd, channel))) {
|
2009-12-15 01:18:27 +00:00
|
|
|
ORTE_ERROR_LOG(ret);
|
|
|
|
}
|
2011-01-13 01:54:05 +00:00
|
|
|
|
|
|
|
/* carefully cleanup */
|
|
|
|
snd.buf = NULL;
|
2009-12-15 01:18:27 +00:00
|
|
|
OBJ_DESTRUCT(&snd);
|
|
|
|
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_RELEASE_THREAD(&ctl);
|
|
|
|
return ret;
|
2009-12-15 01:18:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int tcp_recv(orte_process_name_t *name,
|
2010-11-07 23:29:52 +00:00
|
|
|
orte_rmcast_channel_t channel,
|
|
|
|
orte_rmcast_tag_t tag,
|
|
|
|
orte_rmcast_seq_t *seq_num,
|
|
|
|
struct iovec **msg, int *count)
|
2009-12-15 01:18:27 +00:00
|
|
|
{
|
|
|
|
rmcast_base_recv_t *recvptr;
|
|
|
|
int ret;
|
2010-05-25 22:36:57 +00:00
|
|
|
orte_rmcast_channel_t chan;
|
|
|
|
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_ACQUIRE_THREAD(&ctl);
|
|
|
|
|
2010-12-01 13:41:19 +00:00
|
|
|
if (!comm_enabled) {
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_RELEASE_THREAD(&ctl);
|
2010-12-01 13:41:19 +00:00
|
|
|
return ORTE_ERR_COMM_DISABLED;
|
|
|
|
}
|
|
|
|
|
2010-06-15 03:50:31 +00:00
|
|
|
if (ORTE_RMCAST_GROUP_INPUT_CHANNEL == channel) {
|
|
|
|
chan = orte_rmcast_base.my_input_channel->channel;
|
|
|
|
} else if (ORTE_RMCAST_GROUP_OUTPUT_CHANNEL == channel) {
|
|
|
|
chan = orte_rmcast_base.my_output_channel->channel;
|
2010-05-25 22:36:57 +00:00
|
|
|
} else {
|
|
|
|
chan = channel;
|
|
|
|
}
|
2009-12-15 01:18:27 +00:00
|
|
|
|
2010-05-25 22:36:57 +00:00
|
|
|
if (ORTE_SUCCESS != (ret = orte_rmcast_base_queue_recv(&recvptr, chan, tag,
|
|
|
|
ORTE_RMCAST_NON_PERSISTENT,
|
|
|
|
NULL, NULL, NULL, true))) {
|
2009-12-15 01:18:27 +00:00
|
|
|
ORTE_ERROR_LOG(ret);
|
|
|
|
return ret;
|
|
|
|
}
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_RELEASE_THREAD(&ctl);
|
2009-12-15 01:18:27 +00:00
|
|
|
|
2011-03-22 18:45:54 +00:00
|
|
|
recvptr->ctl.active = true;
|
2010-12-01 04:26:43 +00:00
|
|
|
ORTE_ACQUIRE_THREAD(&recvptr->ctl);
|
2009-12-15 01:18:27 +00:00
|
|
|
|
|
|
|
/* xfer the data */
|
|
|
|
if (NULL != name) {
|
|
|
|
/* caller requested id of sender */
|
|
|
|
name->jobid = recvptr->name.jobid;
|
|
|
|
name->vpid = recvptr->name.vpid;
|
2011-08-26 22:16:14 +00:00
|
|
|
ORTE_EPOCH_SET(name->epoch,recvptr->name.epoch);
|
2009-12-15 01:18:27 +00:00
|
|
|
}
|
2010-11-07 23:29:52 +00:00
|
|
|
*seq_num = recvptr->seq_num;
|
2009-12-15 01:18:27 +00:00
|
|
|
*msg = recvptr->iovec_array;
|
|
|
|
*count = recvptr->iovec_count;
|
|
|
|
|
2011-01-13 01:54:05 +00:00
|
|
|
/* remove the recv */
|
2010-12-01 04:26:43 +00:00
|
|
|
recvptr->iovec_array = NULL;
|
|
|
|
recvptr->iovec_count = 0;
|
2009-12-15 01:18:27 +00:00
|
|
|
OBJ_RELEASE(recvptr);
|
|
|
|
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int tcp_recv_nb(orte_rmcast_channel_t channel,
|
|
|
|
orte_rmcast_tag_t tag,
|
|
|
|
orte_rmcast_flag_t flags,
|
|
|
|
orte_rmcast_callback_fn_t cbfunc, void *cbdata)
|
|
|
|
{
|
|
|
|
int ret;
|
2010-05-25 22:36:57 +00:00
|
|
|
orte_rmcast_channel_t chan;
|
|
|
|
|
2009-12-15 01:18:27 +00:00
|
|
|
OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
|
|
|
|
"%s rmcast:tcp: recv_nb called on channel %d",
|
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), channel));
|
|
|
|
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_ACQUIRE_THREAD(&ctl);
|
|
|
|
|
2010-06-15 03:50:31 +00:00
|
|
|
if (ORTE_RMCAST_GROUP_INPUT_CHANNEL == channel) {
|
|
|
|
chan = orte_rmcast_base.my_input_channel->channel;
|
|
|
|
} else if (ORTE_RMCAST_GROUP_OUTPUT_CHANNEL == channel) {
|
|
|
|
chan = orte_rmcast_base.my_output_channel->channel;
|
2010-05-25 22:36:57 +00:00
|
|
|
} else {
|
|
|
|
chan = channel;
|
|
|
|
}
|
2009-12-15 01:18:27 +00:00
|
|
|
|
2010-05-25 22:36:57 +00:00
|
|
|
if (ORTE_SUCCESS != (ret = orte_rmcast_base_queue_recv(NULL, chan, tag, flags,
|
|
|
|
cbfunc, NULL, cbdata, false))) {
|
2010-05-26 14:29:36 +00:00
|
|
|
if (ORTE_EXISTS == ret) {
|
|
|
|
ret = ORTE_SUCCESS;
|
|
|
|
} else {
|
|
|
|
ORTE_ERROR_LOG(ret);
|
|
|
|
}
|
2009-12-15 01:18:27 +00:00
|
|
|
}
|
|
|
|
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_RELEASE_THREAD(&ctl);
|
2010-05-25 22:36:57 +00:00
|
|
|
return ret;
|
2009-12-15 01:18:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int tcp_recv_buffer(orte_process_name_t *name,
|
|
|
|
orte_rmcast_channel_t channel,
|
|
|
|
orte_rmcast_tag_t tag,
|
2010-11-07 23:29:52 +00:00
|
|
|
orte_rmcast_seq_t *seq_num,
|
2010-02-14 19:20:56 +00:00
|
|
|
opal_buffer_t *buf)
|
2009-12-15 01:18:27 +00:00
|
|
|
{
|
|
|
|
rmcast_base_recv_t *recvptr;
|
|
|
|
int ret;
|
2010-05-25 22:36:57 +00:00
|
|
|
orte_rmcast_channel_t chan;
|
|
|
|
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_ACQUIRE_THREAD(&ctl);
|
|
|
|
|
2010-12-01 13:41:19 +00:00
|
|
|
if (!comm_enabled) {
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_RELEASE_THREAD(&ctl);
|
2010-12-01 13:41:19 +00:00
|
|
|
return ORTE_ERR_COMM_DISABLED;
|
|
|
|
}
|
|
|
|
|
2009-12-15 01:18:27 +00:00
|
|
|
OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
|
|
|
|
"%s rmcast:tcp: recv_buffer called on multicast channel %d",
|
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), channel));
|
|
|
|
|
2010-06-15 03:50:31 +00:00
|
|
|
if (ORTE_RMCAST_GROUP_INPUT_CHANNEL == channel) {
|
|
|
|
chan = orte_rmcast_base.my_input_channel->channel;
|
|
|
|
} else if (ORTE_RMCAST_GROUP_OUTPUT_CHANNEL == channel) {
|
|
|
|
chan = orte_rmcast_base.my_output_channel->channel;
|
2010-05-25 22:36:57 +00:00
|
|
|
} else {
|
|
|
|
chan = channel;
|
|
|
|
}
|
2009-12-15 01:18:27 +00:00
|
|
|
|
2010-05-25 22:36:57 +00:00
|
|
|
if (ORTE_SUCCESS != (ret = orte_rmcast_base_queue_recv(&recvptr, chan, tag,
|
|
|
|
ORTE_RMCAST_NON_PERSISTENT,
|
|
|
|
NULL, NULL, NULL, true))) {
|
2009-12-15 01:18:27 +00:00
|
|
|
ORTE_ERROR_LOG(ret);
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_RELEASE_THREAD(&ctl);
|
2010-05-25 22:36:57 +00:00
|
|
|
return ret;
|
2009-12-15 01:18:27 +00:00
|
|
|
}
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_RELEASE_THREAD(&ctl);
|
2009-12-15 01:18:27 +00:00
|
|
|
|
2011-03-22 18:45:54 +00:00
|
|
|
recvptr->ctl.active = true;
|
2010-12-01 04:26:43 +00:00
|
|
|
ORTE_ACQUIRE_THREAD(&recvptr->ctl);
|
2009-12-15 01:18:27 +00:00
|
|
|
|
|
|
|
/* xfer the data */
|
|
|
|
if (NULL != name) {
|
|
|
|
/* caller requested id of sender */
|
|
|
|
name->jobid = recvptr->name.jobid;
|
|
|
|
name->vpid = recvptr->name.vpid;
|
2011-08-26 22:16:14 +00:00
|
|
|
ORTE_EPOCH_SET(name->epoch,recvptr->name.epoch);
|
2009-12-15 01:18:27 +00:00
|
|
|
}
|
2010-11-07 23:29:52 +00:00
|
|
|
*seq_num = recvptr->seq_num;
|
2009-12-15 01:18:27 +00:00
|
|
|
if (ORTE_SUCCESS != (ret = opal_dss.copy_payload(buf, recvptr->buf))) {
|
|
|
|
ORTE_ERROR_LOG(ret);
|
|
|
|
}
|
2010-12-01 04:26:43 +00:00
|
|
|
/* release the recv */
|
2009-12-15 01:18:27 +00:00
|
|
|
OBJ_RELEASE(recvptr);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int tcp_recv_buffer_nb(orte_rmcast_channel_t channel,
|
|
|
|
orte_rmcast_tag_t tag,
|
|
|
|
orte_rmcast_flag_t flags,
|
|
|
|
orte_rmcast_callback_buffer_fn_t cbfunc, void *cbdata)
|
|
|
|
{
|
|
|
|
int ret;
|
2010-05-25 22:36:57 +00:00
|
|
|
orte_rmcast_channel_t chan;
|
2009-12-15 01:18:27 +00:00
|
|
|
|
|
|
|
OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
|
|
|
|
"%s rmcast:tcp: recv_buffer_nb called on multicast channel %d tag %d",
|
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), channel, tag));
|
|
|
|
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_ACQUIRE_THREAD(&ctl);
|
|
|
|
|
2010-06-15 03:50:31 +00:00
|
|
|
if (ORTE_RMCAST_GROUP_INPUT_CHANNEL == channel) {
|
|
|
|
chan = orte_rmcast_base.my_input_channel->channel;
|
|
|
|
} else if (ORTE_RMCAST_GROUP_OUTPUT_CHANNEL == channel) {
|
|
|
|
chan = orte_rmcast_base.my_output_channel->channel;
|
2010-05-25 22:36:57 +00:00
|
|
|
} else {
|
|
|
|
chan = channel;
|
|
|
|
}
|
2009-12-15 01:18:27 +00:00
|
|
|
|
2010-05-25 22:36:57 +00:00
|
|
|
if (ORTE_SUCCESS != (ret = orte_rmcast_base_queue_recv(NULL, chan, tag, flags,
|
|
|
|
NULL, cbfunc, cbdata, false))) {
|
2011-01-13 01:54:05 +00:00
|
|
|
if (ORTE_EXISTS == ret) {
|
|
|
|
ret = ORTE_SUCCESS;
|
|
|
|
} else {
|
2010-05-26 14:29:36 +00:00
|
|
|
ORTE_ERROR_LOG(ret);
|
|
|
|
}
|
2009-12-15 01:18:27 +00:00
|
|
|
}
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_RELEASE_THREAD(&ctl);
|
|
|
|
|
|
|
|
return ret;
|
2009-12-15 01:18:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* for the tcp module, we will be using the RML to "fake" a
|
|
|
|
* multicast in combination with the grpcomm "xcast" interface.
|
|
|
|
* We cannot control the network and interface in this
|
|
|
|
* combination as it gets auto-picked well before us, so we
|
|
|
|
* ignore that info here
|
|
|
|
*/
|
2010-05-25 22:36:57 +00:00
|
|
|
static int open_channel(orte_rmcast_channel_t channel, char *name,
|
2009-12-15 01:18:27 +00:00
|
|
|
char *network, int port, char *interface, uint8_t direction)
|
|
|
|
{
|
|
|
|
opal_list_item_t *item;
|
2010-05-25 22:36:57 +00:00
|
|
|
rmcast_base_channel_t *chan;
|
2009-12-15 01:18:27 +00:00
|
|
|
|
2010-06-16 19:40:59 +00:00
|
|
|
OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
|
|
|
|
"%s opening channel %d for %s",
|
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), channel, name));
|
|
|
|
|
2009-12-15 01:18:27 +00:00
|
|
|
/* see if this name has already been assigned a channel on the specified network */
|
2010-12-01 04:26:43 +00:00
|
|
|
ORTE_ACQUIRE_THREAD(&orte_rmcast_base.main_ctl);
|
2010-05-25 22:36:57 +00:00
|
|
|
for (item = opal_list_get_first(&orte_rmcast_base.channels);
|
|
|
|
item != opal_list_get_end(&orte_rmcast_base.channels);
|
2009-12-15 01:18:27 +00:00
|
|
|
item = opal_list_get_next(item)) {
|
2010-05-25 22:36:57 +00:00
|
|
|
chan = (rmcast_base_channel_t*)item;
|
2009-12-15 01:18:27 +00:00
|
|
|
|
2010-05-25 22:36:57 +00:00
|
|
|
if (0 == strcasecmp(chan->name, name)) {
|
2009-12-15 01:18:27 +00:00
|
|
|
/* check the channel, if one was given */
|
2010-07-27 01:38:39 +00:00
|
|
|
if (ORTE_RMCAST_INVALID_CHANNEL != channel) {
|
|
|
|
if (ORTE_RMCAST_INVALID_CHANNEL == chan->channel) {
|
|
|
|
chan->channel = channel;
|
|
|
|
} else if (chan->channel != channel) {
|
|
|
|
/* another channel for this name */
|
|
|
|
goto newchan;
|
|
|
|
}
|
2009-12-15 01:18:27 +00:00
|
|
|
}
|
|
|
|
/* all setup - nothing to do */
|
|
|
|
OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
|
|
|
|
"%s rmcast:tcp using existing channel",
|
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
2010-12-01 04:26:43 +00:00
|
|
|
ORTE_RELEASE_THREAD(&orte_rmcast_base.main_ctl);
|
2009-12-15 01:18:27 +00:00
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-07-27 01:38:39 +00:00
|
|
|
newchan:
|
2009-12-15 01:18:27 +00:00
|
|
|
/* we didn't find an existing match, so create a new channel */
|
2010-06-16 19:40:59 +00:00
|
|
|
OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
|
|
|
|
"%s creating new channel %d for %s",
|
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), channel, name));
|
|
|
|
|
2009-12-15 01:18:27 +00:00
|
|
|
chan = OBJ_NEW(rmcast_base_channel_t);
|
|
|
|
chan->name = strdup(name);
|
2010-05-25 22:36:57 +00:00
|
|
|
chan->channel = channel;
|
2009-12-15 01:18:27 +00:00
|
|
|
/* add to list of known channels */
|
2010-05-25 22:36:57 +00:00
|
|
|
opal_list_append(&orte_rmcast_base.channels, &chan->item);
|
2010-12-01 04:26:43 +00:00
|
|
|
ORTE_RELEASE_THREAD(&orte_rmcast_base.main_ctl);
|
2009-12-15 01:18:27 +00:00
|
|
|
|
|
|
|
OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
|
|
|
|
"%s rmcast:tcp opening new channel for%s%s",
|
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
2010-07-27 01:38:39 +00:00
|
|
|
(ORTE_RMCAST_RECV & direction) ? " RECV" : " ",
|
2009-12-15 01:18:27 +00:00
|
|
|
(ORTE_RMCAST_XMIT & direction) ? " XMIT" : " "));
|
|
|
|
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2011-01-13 01:54:05 +00:00
|
|
|
static void process_msg(orte_rmcast_msg_t *msg)
|
2010-05-25 22:36:57 +00:00
|
|
|
{
|
2011-01-13 01:54:05 +00:00
|
|
|
int rc;
|
2011-02-12 16:52:03 +00:00
|
|
|
opal_list_item_t *item, *next;
|
2011-01-13 01:54:05 +00:00
|
|
|
int v;
|
|
|
|
orte_proc_t *proc;
|
2010-05-25 22:36:57 +00:00
|
|
|
orte_odls_child_t *child;
|
2011-02-12 16:52:03 +00:00
|
|
|
opal_buffer_t *buf;
|
|
|
|
orte_namelist_t *tool;
|
2010-11-08 19:09:23 +00:00
|
|
|
|
2009-12-15 01:18:27 +00:00
|
|
|
OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
|
2011-01-13 01:54:05 +00:00
|
|
|
"%s rmcast:tcp processing message from %s",
|
2009-12-15 01:18:27 +00:00
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
2010-05-25 22:36:57 +00:00
|
|
|
ORTE_NAME_PRINT(&msg->sender)));
|
2011-01-13 01:54:05 +00:00
|
|
|
|
2011-02-12 16:52:03 +00:00
|
|
|
buf = OBJ_NEW(opal_buffer_t);
|
|
|
|
opal_dss.copy_payload(buf, msg->buf);
|
|
|
|
|
2010-11-30 21:13:53 +00:00
|
|
|
if (ORTE_PROC_IS_HNP) {
|
2011-02-12 16:52:03 +00:00
|
|
|
/* if this message came from a different job family, then we have
|
|
|
|
* to track the sender so we can relay mcast messages to them as
|
|
|
|
* they won't be a member of the daemon job
|
|
|
|
*/
|
|
|
|
if (ORTE_JOB_FAMILY(msg->sender.jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
|
|
|
|
tool = OBJ_NEW(orte_namelist_t);
|
|
|
|
tool->name.jobid = msg->sender.jobid;
|
|
|
|
tool->name.vpid = msg->sender.vpid;
|
|
|
|
opal_list_append(&tools, &tool->item);
|
|
|
|
}
|
|
|
|
|
2010-11-30 21:13:53 +00:00
|
|
|
/* if we don't already have it, get the daemon object */
|
|
|
|
if (NULL == daemons) {
|
|
|
|
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
2010-07-27 18:24:11 +00:00
|
|
|
}
|
2011-01-13 01:54:05 +00:00
|
|
|
/* relay msg to each daemon excluding myself and whomever sent this to me */
|
2010-11-30 21:13:53 +00:00
|
|
|
for (v=1; v < daemons->procs->size; v++) {
|
|
|
|
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, v))) {
|
|
|
|
continue;
|
|
|
|
}
|
2011-01-13 01:54:05 +00:00
|
|
|
if (NULL == proc->rml_uri) {
|
|
|
|
/* not ready yet - don't know contact info */
|
|
|
|
OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
|
|
|
|
"%s rmcast:tcp dont have path to %s",
|
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
|
|
ORTE_NAME_PRINT(&proc->name)));
|
2010-11-30 21:13:53 +00:00
|
|
|
continue;
|
|
|
|
}
|
2011-01-13 01:54:05 +00:00
|
|
|
if (msg->sender.jobid == proc->name.jobid &&
|
|
|
|
msg->sender.vpid == proc->name.vpid) {
|
2010-11-30 21:13:53 +00:00
|
|
|
continue;
|
|
|
|
}
|
2011-01-13 01:54:05 +00:00
|
|
|
OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
|
|
|
|
"%s rmcast:tcp relaying msg to %s",
|
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
|
|
ORTE_NAME_PRINT(&proc->name)));
|
|
|
|
/* ignore errors */
|
2011-02-12 16:52:03 +00:00
|
|
|
OBJ_RETAIN(buf);
|
|
|
|
if (0 > (rc = orte_rml.send_buffer_nb(&proc->name, buf, ORTE_RML_TAG_MULTICAST, 0, cbfunc, NULL))) {
|
|
|
|
if (ORTE_ERR_ADDRESSEE_UNKNOWN != rc && ORTE_ERR_UNREACH != rc) {
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
}
|
2011-02-12 16:52:03 +00:00
|
|
|
rc = ORTE_SUCCESS; /* don't confuse up-stream client */
|
|
|
|
}
|
|
|
|
/* now send it to all attached tools except whomever sent it to me, if applicable */
|
|
|
|
item = opal_list_get_first(&tools);
|
|
|
|
while (item != opal_list_get_end(&tools)) {
|
|
|
|
tool = (orte_namelist_t*)item;
|
|
|
|
next = opal_list_get_next(item);
|
|
|
|
if (msg->sender.jobid == tool->name.jobid &&
|
|
|
|
msg->sender.vpid == tool->name.vpid) {
|
|
|
|
item = next;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
OBJ_RETAIN(buf);
|
|
|
|
if (0 > (rc = orte_rml.send_buffer_nb(&tool->name, buf, ORTE_RML_TAG_MULTICAST, 0, cbfunc, NULL))) {
|
|
|
|
if (ORTE_ERR_ADDRESSEE_UNKNOWN != rc && ORTE_ERR_UNREACH != rc) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
}
|
|
|
|
opal_list_remove_item(&tools, item);
|
|
|
|
OBJ_RELEASE(item);
|
|
|
|
OBJ_RELEASE(buf);
|
|
|
|
rc = ORTE_SUCCESS; /* don't confuse up-stream client */
|
|
|
|
}
|
|
|
|
item = next;
|
2010-11-30 21:13:53 +00:00
|
|
|
}
|
2009-12-15 01:18:27 +00:00
|
|
|
}
|
|
|
|
}
|
2011-01-13 01:54:05 +00:00
|
|
|
|
|
|
|
if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
|
|
|
|
/* need to relay this to my children */
|
|
|
|
for (item = opal_list_get_first(&orte_local_children);
|
|
|
|
item != opal_list_get_end(&orte_local_children);
|
|
|
|
item = opal_list_get_next(item)) {
|
|
|
|
child = (orte_odls_child_t*)item;
|
|
|
|
if (!child->alive) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (NULL == child->rml_uri) {
|
|
|
|
/* race condition */
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (msg->sender.jobid == child->name->jobid &&
|
|
|
|
msg->sender.vpid == child->name->vpid) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
|
|
|
|
"%s relaying multicast to %s",
|
2010-08-04 01:37:54 +00:00
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
|
|
ORTE_NAME_PRINT(child->name)));
|
2011-01-13 01:54:05 +00:00
|
|
|
|
|
|
|
/* ignore errors */
|
2011-02-12 16:52:03 +00:00
|
|
|
OBJ_RETAIN(buf);
|
|
|
|
if (0 > (rc = orte_rml.send_buffer_nb(child->name, buf, ORTE_RML_TAG_MULTICAST, 0, cbfunc, NULL))) {
|
|
|
|
if (ORTE_ERR_ADDRESSEE_UNKNOWN != rc && ORTE_ERR_UNREACH != rc) {
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
}
|
2011-02-12 16:52:03 +00:00
|
|
|
rc = ORTE_SUCCESS; /* don't confuse up-stream client */
|
2011-01-13 01:54:05 +00:00
|
|
|
}
|
2009-12-15 01:18:27 +00:00
|
|
|
}
|
|
|
|
}
|
2011-02-12 16:52:03 +00:00
|
|
|
OBJ_RELEASE(buf);
|
|
|
|
|
2011-01-13 01:54:05 +00:00
|
|
|
/* now process it myself - this releases the msg */
|
|
|
|
orte_rmcast_base_process_msg(msg);
|
2009-12-15 01:18:27 +00:00
|
|
|
}
|
|
|
|
|
2011-01-13 01:54:05 +00:00
|
|
|
/**** LOCAL FUNCTIONS ****/
|
|
|
|
static void recv_handler(int status, orte_process_name_t* sender,
|
|
|
|
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
|
|
|
void* cbdata)
|
2009-12-15 01:18:27 +00:00
|
|
|
{
|
2011-01-13 01:54:05 +00:00
|
|
|
if (!comm_enabled) {
|
2010-06-16 19:40:59 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2009-12-15 01:18:27 +00:00
|
|
|
OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
|
2011-01-13 01:54:05 +00:00
|
|
|
"%s rmcast:tcp recvd multicast msg",
|
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
|
|
|
|
2009-12-15 01:18:27 +00:00
|
|
|
/* clear the way for the next message */
|
2011-01-13 01:54:05 +00:00
|
|
|
ORTE_MULTICAST_MESSAGE_EVENT(sender, buffer);
|
2009-12-15 01:18:27 +00:00
|
|
|
|
|
|
|
return;
|
|
|
|
}
|