diff --git a/contrib/spread/spread.conf b/contrib/spread/spread.conf index 475379bb26..79d7fd6092 100644 --- a/contrib/spread/spread.conf +++ b/contrib/spread/spread.conf @@ -12,12 +12,10 @@ # # This configures one spread daemon running on port 4803 on localhost. -Spread_Segment 172.16.174.255:4803 { +Spread_Segment 192.168.203.255:4803 { - rmcast-1 172.16.174.129 - rmcast-2 172.16.174.130 - rmcast-3 172.16.174.131 - rmcast-4 172.16.174.132 + sjc-rcastain-8713 192.168.203.1 + ubuntu 192.168.203.192 } diff --git a/orte/mca/db/daemon/db_daemon.c b/orte/mca/db/daemon/db_daemon.c index da176ddb5d..302da3b67a 100644 --- a/orte/mca/db/daemon/db_daemon.c +++ b/orte/mca/db/daemon/db_daemon.c @@ -350,10 +350,11 @@ static void recv_cmd(int status, orte_db_cmd_t cmd; opal_buffer_t *ans; int count, i; - int32_t rc, ret; + int32_t rc; char *key; orte_db_data_t *dat; orte_rmcast_channel_t ch; + char *ch_name; OPAL_OUTPUT_VERBOSE((2, orte_db_base_output, "%s db:daemon: cmd recvd from %s", @@ -365,6 +366,8 @@ static void recv_cmd(int status, count=1; opal_dss.unpack(buf, &ch, &count, ORTE_RMCAST_CHANNEL_T); count=1; + opal_dss.unpack(buf, &ch_name, &count, OPAL_STRING); + count=1; opal_dss.unpack(buf, &key, &count, OPAL_STRING); ans = OBJ_NEW(opal_buffer_t); @@ -420,11 +423,9 @@ static void recv_cmd(int status, rc = ORTE_ERR_NOT_FOUND; break; } - /* open a channel back to the sender */ - if (ORTE_SUCCESS != (ret = orte_rmcast.open_channel(&ch, ORTE_NAME_PRINT(sender), - NULL, -1, NULL, ORTE_RMCAST_BIDIR))) { - ORTE_ERROR_LOG(ret); - return; - } + + /* ensure the return channel is open */ + orte_rmcast.open_channel(ch, ch_name, NULL, -1, NULL, ORTE_RMCAST_XMIT); + orte_rmcast.send_buffer_nb(ch, ORTE_RMCAST_TAG_CMD_ACK, ans, callback_fn, NULL); } diff --git a/orte/mca/rmcast/base/Makefile.am b/orte/mca/rmcast/base/Makefile.am index b4d60bc7cc..b8c8801875 100644 --- a/orte/mca/rmcast/base/Makefile.am +++ b/orte/mca/rmcast/base/Makefile.am @@ -20,6 +20,7 @@ if !ORTE_DISABLE_FULL_SUPPORT libmca_rmcast_la_SOURCES += \ base/rmcast_base_close.c \ - base/rmcast_base_select.c + base/rmcast_base_select.c \ + base/rmcast_base_fns.c endif diff --git a/orte/mca/rmcast/base/base.h b/orte/mca/rmcast/base/base.h index 53dde251f4..3d3dd7d50d 100644 --- a/orte/mca/rmcast/base/base.h +++ b/orte/mca/rmcast/base/base.h @@ -21,9 +21,12 @@ #include #endif +#include "opal/class/opal_list.h" #include "opal/event/event.h" +#include "opal/threads/threads.h" #include "orte/mca/rmcast/rmcast.h" +#include "orte/mca/rmcast/base/private.h" BEGIN_C_DECLS @@ -44,6 +47,13 @@ typedef struct { uint32_t interface; uint16_t ports[256]; int cache_size; + bool opened; + opal_mutex_t lock; + opal_condition_t cond; + bool active; + opal_list_t recvs; + opal_list_t channels; + rmcast_base_channel_t *my_group_channel; } orte_rmcast_base_t; ORTE_DECLSPEC extern orte_rmcast_base_t orte_rmcast_base; diff --git a/orte/mca/rmcast/base/private.h b/orte/mca/rmcast/base/private.h index 6905174e58..88ce000c45 100644 --- a/orte/mca/rmcast/base/private.h +++ b/orte/mca/rmcast/base/private.h @@ -41,7 +41,6 @@ BEGIN_C_DECLS } while(0) - /**** CLASS DEFINITIONS ****/ /* * Data structure for tracking assigned channels @@ -77,7 +76,6 @@ typedef struct { orte_process_name_t name; orte_rmcast_channel_t channel; bool recvd; - bool iovecs_requested; orte_rmcast_tag_t tag; orte_rmcast_flag_t flags; struct iovec *iovec_array; @@ -118,9 +116,7 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(rmcast_base_send_t); typedef struct { opal_object_t super; opal_event_t *ev; - uint8_t 
*data; - ssize_t sz; - rmcast_base_channel_t *channel; + opal_buffer_t *buf; } orte_mcast_msg_event_t; ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_mcast_msg_event_t); @@ -145,8 +141,7 @@ typedef struct { } rmcast_send_log_t; ORTE_DECLSPEC OBJ_CLASS_DECLARATION(rmcast_send_log_t); - -#define ORTE_MULTICAST_MESSAGE_EVENT(dat, n, chan, cbfunc) \ +#define ORTE_MULTICAST_MESSAGE_EVENT(bf, cbfunc) \ do { \ orte_mcast_msg_event_t *mev; \ struct timeval now; \ @@ -154,9 +149,7 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(rmcast_send_log_t); "defining mcast msg event: %s %d", \ __FILE__, __LINE__)); \ mev = OBJ_NEW(orte_mcast_msg_event_t); \ - mev->data = (dat); \ - mev->sz = (n); \ - mev->channel = (chan); \ + mev->buf = (bf); \ opal_evtimer_set(mev->ev, (cbfunc), mev); \ now.tv_sec = 0; \ now.tv_usec = 0; \ @@ -164,52 +157,6 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(rmcast_send_log_t); } while(0); -#define ORTE_MULTICAST_MESSAGE_HDR_HTON(bfr, tg, seq) \ - do { \ - uint32_t nm; \ - uint16_t tmp; \ - nm = htonl(ORTE_PROC_MY_NAME->jobid); \ - memcpy((bfr), &nm, 4); \ - nm = htonl(ORTE_PROC_MY_NAME->vpid); \ - memcpy((bfr)+4, &nm, 4); \ - /* add the tag data, also converted */ \ - tmp = htons((tg)); \ - memcpy((bfr)+8, &tmp, 2); \ - /* add the sequence number, also converted */ \ - nm = htonl((seq)); \ - memcpy((bfr)+10, &nm, 4); \ - } while(0); - -#define ORTE_MULTICAST_MESSAGE_HDR_NTOH(bfr, nm, tg, seq) \ - do { \ - uint32_t tmp; \ - uint16_t tmp16; \ - /* extract the name and convert it to host order */ \ - memcpy(&tmp, (bfr), 4); \ - (nm)->jobid = ntohl(tmp); \ - memcpy(&tmp, (bfr)+4, 4); \ - (nm)->vpid = ntohl(tmp); \ - /* extract the target tag */ \ - memcpy(&tmp16, (bfr)+8, 2); \ - (tg) = ntohs(tmp16); \ - /* extract the sequence number */ \ - memcpy(&tmp, (bfr)+10, 4); \ - (seq) = ntohl(tmp); \ - } while(0); - -#define ORTE_MULTICAST_LOAD_MESSAGE(bfr, dat, sz, maxsz, endsz) \ - do { \ - if ((maxsz) <= (sz) + 14) { \ - *(endsz) = -1 * ((sz) + 14); \ - } else { \ - memcpy((bfr)+14, (dat), (sz)); \ - *(endsz) = (sz) + 14; \ - } \ - } while(0); - -#define ORTE_MULTICAST_UNLOAD_MESSAGE(bfr, dat, sz) \ - opal_dss.load((bfr), (dat)+14, (sz)-14); - #define ORTE_MULTICAST_NEXT_SEQUENCE_NUM(seq) \ do { \ if ((seq) < ORTE_RMCAST_SEQ_MAX) { \ @@ -219,6 +166,28 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(rmcast_send_log_t); } \ } while(0); +/**** FUNCTIONS ****/ +ORTE_DECLSPEC int orte_rmcast_base_build_msg(rmcast_base_channel_t *ch, + opal_buffer_t **buffer, + rmcast_base_send_t *snd); + +ORTE_DECLSPEC int orte_rmcast_base_queue_recv(rmcast_base_recv_t **recvptr, + orte_rmcast_channel_t channel, + orte_rmcast_tag_t tag, + orte_rmcast_flag_t flags, + orte_rmcast_callback_fn_t cbfunc_iovec, + orte_rmcast_callback_buffer_fn_t cbfunc_buffer, + void *cbdata, bool blocking); + +ORTE_DECLSPEC void orte_rmcast_base_process_recv(orte_mcast_msg_event_t *msg); + +ORTE_DECLSPEC void orte_rmcast_base_cancel_recv(orte_rmcast_channel_t channel, + orte_rmcast_tag_t tag); + +ORTE_DECLSPEC int orte_rmcast_base_close_channel(orte_rmcast_channel_t channel); + +ORTE_DECLSPEC orte_rmcast_channel_t orte_rmcast_base_query(void); + #endif /* ORTE_DISABLE_FULL_SUPPORT */ END_C_DECLS diff --git a/orte/mca/rmcast/base/rmcast_base_close.c b/orte/mca/rmcast/base/rmcast_base_close.c index b58a17a102..ff7299d421 100644 --- a/orte/mca/rmcast/base/rmcast_base_close.c +++ b/orte/mca/rmcast/base/rmcast_base_close.c @@ -20,6 +20,10 @@ int orte_rmcast_base_close(void) { + if (!orte_rmcast_base.opened) { + return ORTE_SUCCESS; + } + /* finalize the active 
module */ if (NULL != orte_rmcast.finalize) { orte_rmcast.finalize(); @@ -31,5 +35,8 @@ int orte_rmcast_base_close(void) mca_base_components_close(orte_rmcast_base.rmcast_output, &orte_rmcast_base.rmcast_opened, NULL); + orte_rmcast_base.opened = false; + OBJ_DESTRUCT(&orte_rmcast_base.lock); + OBJ_DESTRUCT(&orte_rmcast_base.cond); return ORTE_SUCCESS; } diff --git a/orte/mca/rmcast/base/rmcast_base_fns.c b/orte/mca/rmcast/base/rmcast_base_fns.c new file mode 100644 index 0000000000..cbe73541df --- /dev/null +++ b/orte/mca/rmcast/base/rmcast_base_fns.c @@ -0,0 +1,518 @@ +/* + * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "orte/constants.h" + +#include + +#include "opal/mca/mca.h" +#include "opal/mca/base/base.h" +#include "opal/threads/threads.h" + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/util/name_fns.h" +#include "orte/runtime/orte_globals.h" + +#include "orte/mca/rmcast/base/base.h" +#include "orte/mca/rmcast/base/private.h" + +static int extract_hdr(opal_buffer_t *buf, + orte_process_name_t *name, + orte_rmcast_channel_t *channel, + orte_rmcast_tag_t *tag, + orte_rmcast_seq_t *seq_num); + +static int insert_hdr(opal_buffer_t *buf, + orte_rmcast_channel_t channel, + orte_rmcast_tag_t tag, + orte_rmcast_seq_t seq_num); + +int orte_rmcast_base_build_msg(rmcast_base_channel_t *ch, + opal_buffer_t **buffer, + rmcast_base_send_t *snd) +{ + int32_t sz; + opal_buffer_t *buf; + int rc; + int8_t flag; + int32_t tmp32; + + OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, + "%s rmcast:base:build_msg of %d %s" + " for multicast on channel %d tag %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == snd->iovec_array) ? (int)snd->buf->bytes_used : (int)snd->iovec_count, + (NULL == snd->iovec_array) ? "bytes" : "iovecs", + (int)ch->channel, snd->tag)); + + /* setup a buffer */ + buf = OBJ_NEW(opal_buffer_t); + + /* insert the header */ + if (ORTE_SUCCESS != (rc = insert_hdr(buf, ch->channel, snd->tag, ch->seq_num))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* are we sending a buffer? 
*/ + if (NULL == snd->buf) { + /* no, flag the buffer as containing iovecs */ + flag = 0; + if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &flag, 1, OPAL_INT8))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + + /* pack the number of iovecs */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &snd->iovec_count, 1, OPAL_INT32))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + + /* pack each iovec into a buffer in prep for sending + * so we can recreate the array at the other end + */ + for (sz=0; sz < snd->iovec_count; sz++) { + /* pack the size */ + tmp32 = snd->iovec_array[sz].iov_len; + if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &tmp32, 1, OPAL_INT32))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + if (0 < tmp32) { + /* pack the bytes */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, snd->iovec_array[sz].iov_base, tmp32, OPAL_UINT8))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + } + } + + } else { + /* flag it as being a buffer */ + flag = 1; + if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &flag, 1, OPAL_INT8))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + + /* copy the payload */ + if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(buf, snd->buf))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + } + *buffer = buf; + return ORTE_SUCCESS; + +cleanup: + if (NULL != buf) { + OBJ_RELEASE(buf); + } + return rc; +} + +int orte_rmcast_base_queue_recv(rmcast_base_recv_t **recvptr, + orte_rmcast_channel_t channel, + orte_rmcast_tag_t tag, + orte_rmcast_flag_t flags, + orte_rmcast_callback_fn_t cbfunc_iovec, + orte_rmcast_callback_buffer_fn_t cbfunc_buffer, + void *cbdata, bool blocking) +{ + opal_list_item_t *item; + rmcast_base_recv_t *rptr; + + OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, + "%s rmcast:tcp: queue_recv called on multicast channel %d tag %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), channel, tag)); + + if (!blocking) { + /* do we already have a recv for this channel/tag? 
*/ OPAL_THREAD_LOCK(&orte_rmcast_base.lock); for (item = opal_list_get_first(&orte_rmcast_base.recvs); item != opal_list_get_end(&orte_rmcast_base.recvs); item = opal_list_get_next(item)) { rptr = (rmcast_base_recv_t*)item; if (channel != rptr->channel) { /* different channel */ continue; } if (tag != rptr->tag) { /* different tag */ continue; } if (NULL != cbfunc_iovec) { if (NULL != rptr->cbfunc_iovec) { /* already have one in place */ OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, "%s rmcast:tcp: matching recv already active on multicast channel %d tag %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), channel, tag)); OPAL_THREAD_UNLOCK(&orte_rmcast_base.lock); return ORTE_EXISTS; } rptr->cbfunc_iovec = cbfunc_iovec; } if (NULL != cbfunc_buffer) { if (NULL != rptr->cbfunc_buffer) { /* matching type - recv already in place */ OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, "%s rmcast:tcp: matching recv already active on multicast channel %d tag %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), channel, tag)); OPAL_THREAD_UNLOCK(&orte_rmcast_base.lock); return ORTE_EXISTS; } rptr->cbfunc_buffer = cbfunc_buffer; } if (NULL != recvptr) { *recvptr = rptr; } OPAL_THREAD_UNLOCK(&orte_rmcast_base.lock); return ORTE_SUCCESS; } } /* if we get here, then we need to add a new recv */ OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, "%s rmcast:tcp: adding recv on multicast channel %d tag %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), channel, tag)); OPAL_THREAD_LOCK(&orte_rmcast_base.lock); rptr = OBJ_NEW(rmcast_base_recv_t); rptr->channel = channel; rptr->tag = tag; rptr->flags = flags; rptr->cbfunc_iovec = cbfunc_iovec; rptr->cbfunc_buffer = cbfunc_buffer; rptr->cbdata = cbdata; if (NULL != recvptr) { *recvptr = rptr; } opal_list_append(&orte_rmcast_base.recvs, &rptr->item); OPAL_THREAD_UNLOCK(&orte_rmcast_base.lock); return ORTE_SUCCESS; } void orte_rmcast_base_process_recv(orte_mcast_msg_event_t *msg) { orte_rmcast_channel_t channel; opal_list_item_t *item; rmcast_base_recv_t *ptr; orte_process_name_t name; orte_rmcast_tag_t tag; int8_t flag; struct iovec *iovec_array=NULL; int32_t iovec_count=0, i, n, isz; opal_buffer_t *recvd_buf=NULL; int rc; orte_rmcast_seq_t recvd_seq_num; /* extract the header */ if (ORTE_SUCCESS != (rc = extract_hdr(msg->buf, &name, &channel, &tag, &recvd_seq_num))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* if this message is from myself, ignore it */ if (name.jobid == ORTE_PROC_MY_NAME->jobid && name.vpid == ORTE_PROC_MY_NAME->vpid) { OPAL_OUTPUT_VERBOSE((10, orte_rmcast_base.rmcast_output, "%s rmcast:base:process_recv sent from myself: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&name))); goto cleanup; } /* if this message is from a different job family, ignore it unless * it is on the system channel. We ignore these messages to avoid * confusion between different jobs since we all may be sharing * multicast channels. The system channel is left open to support * cross-job communications via the HNP. 
+ */ + if (ORTE_JOB_FAMILY(name.jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) && + (ORTE_RMCAST_SYS_CHANNEL != channel)) { + OPAL_OUTPUT_VERBOSE((10, orte_rmcast_base.rmcast_output, + "%s rmcast:base:process_recv from a different job family: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&name))); + goto cleanup; + } + + /* unpack the iovec vs buf flag */ + n=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(msg->buf, &flag, &n, OPAL_INT8))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + + OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, + "%s rmcast:base:process_recv sender: %s channel: %d tag: %d %s seq_num: %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&name), channel, (int)tag, + (0 == flag) ? "iovecs" : "buffer", recvd_seq_num)); + + + /* find the recv for this channel, tag, and type */ + for (item = opal_list_get_first(&orte_rmcast_base.recvs); + item != opal_list_get_end(&orte_rmcast_base.recvs); + item = opal_list_get_next(item)) { + ptr = (rmcast_base_recv_t*)item; + + OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, + "%s rmcast:tcp:recv checking channel %d tag %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (int)ptr->channel, (int)ptr->tag)); + + if (channel != ptr->channel) { + continue; + } + + if (tag != ptr->tag && ORTE_RMCAST_TAG_WILDCARD != ptr->tag) { + continue; + } + + OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, + "%s rmcast:tcp:recv delivering message to channel %d tag %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ptr->channel, (int)tag)); + + /* we have a recv - unpack the data */ + if (0 == flag) { + /* get the number of iovecs in the buffer */ + n=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(msg->buf, &iovec_count, &n, OPAL_INT32))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + /* malloc the required space */ + iovec_array = (struct iovec *)malloc(iovec_count * sizeof(struct iovec)); + /* unpack the iovecs */ + for (i=0; i < iovec_count; i++) { + /* unpack the number of bytes in this iovec */ + n=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(msg->buf, &isz, &n, OPAL_INT32))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + iovec_array[i].iov_base = NULL; + iovec_array[i].iov_len = isz; + if (0 < isz) { + /* allocate the space */ + iovec_array[i].iov_base = (uint8_t*)malloc(isz); + /* unpack the data */ + if (ORTE_SUCCESS != (rc = opal_dss.unpack(msg->buf, iovec_array[i].iov_base, &isz, OPAL_UINT8))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + } + } + if (NULL != ptr->cbfunc_iovec) { + ptr->cbfunc_iovec(ORTE_SUCCESS, ptr->channel, tag, + &name, iovec_array, iovec_count, ptr->cbdata); + /* if it isn't persistent, remove it */ + if (!(ORTE_RMCAST_PERSISTENT & ptr->flags)) { + OPAL_THREAD_LOCK(&orte_rmcast_base.lock); + opal_list_remove_item(&orte_rmcast_base.recvs, &ptr->item); + OPAL_THREAD_UNLOCK(&orte_rmcast_base.lock); + OBJ_RELEASE(ptr); + } + } else { + /* if something is already present, then we have a problem */ + if (NULL != ptr->iovec_array) { + OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, + "%s rmcast:tcp:recv blocking recv already fulfilled", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + goto cleanup; + } + /* copy over the iovec array since it will be released by + * the blocking recv + */ + ptr->iovec_array = (struct iovec *)malloc(iovec_count * sizeof(struct iovec)); + ptr->iovec_count = iovec_count; + for (i=0; i < iovec_count; i++) { + ptr->iovec_array[i].iov_base = (uint8_t*)malloc(iovec_array[i].iov_len); + ptr->iovec_array[i].iov_len = iovec_array[i].iov_len; + 
memcpy(ptr->iovec_array[i].iov_base, iovec_array[i].iov_base, iovec_array[i].iov_len); + } + /* flag it as recvd to release blocking recv */ + ptr->recvd = true; + } + } else { + /* buffer was included */ + recvd_buf = OBJ_NEW(opal_buffer_t); + if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(recvd_buf, msg->buf))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + if (NULL != ptr->cbfunc_buffer) { + ptr->cbfunc_buffer(ORTE_SUCCESS, ptr->channel, tag, + &name, recvd_buf, ptr->cbdata); + /* if it isn't persistent, remove it */ + if (!(ORTE_RMCAST_PERSISTENT & ptr->flags)) { + OPAL_THREAD_LOCK(&orte_rmcast_base.lock); + opal_list_remove_item(&orte_rmcast_base.recvs, &ptr->item); + OPAL_THREAD_UNLOCK(&orte_rmcast_base.lock); + OBJ_RELEASE(ptr); + } + } else { + /* if something is already present, then we have a problem */ + if (NULL != ptr->buf) { + OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, + "%s rmcast:tcp:recv blocking recv already fulfilled", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + goto cleanup; + } + /* copy the buffer across since it will be released + * by the blocking recv + */ + ptr->buf = OBJ_NEW(opal_buffer_t); + if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(ptr->buf, recvd_buf))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + /* flag it as recvd to release blocking recv */ + ptr->recvd = true; + } + } + /* we are done - only one recv can match */ + break; + } + +cleanup: + if (NULL != iovec_array) { + for (i=0; i < iovec_count; i++) { + free(iovec_array[i].iov_base); + } + free(iovec_array); + } + if (NULL != recvd_buf) { + OBJ_RELEASE(recvd_buf); + } + return; +} + +void orte_rmcast_base_cancel_recv(orte_rmcast_channel_t channel, + orte_rmcast_tag_t tag) +{ + opal_list_item_t *item, *next; + rmcast_base_recv_t *ptr; + orte_rmcast_channel_t ch; + + if (ORTE_RMCAST_GROUP_CHANNEL == channel) { + ch = orte_rmcast_base.my_group_number; + } else { + ch = channel; + } + + /* find all recv's for this channel and tag */ + item = opal_list_get_first(&orte_rmcast_base.recvs); + while (item != opal_list_get_end(&orte_rmcast_base.recvs)) { + next = opal_list_get_next(item); + + ptr = (rmcast_base_recv_t*)item; + if (ch == ptr->channel && + tag == ptr->tag) { + OPAL_THREAD_LOCK(&orte_rmcast_base.lock); + opal_list_remove_item(&orte_rmcast_base.recvs, &ptr->item); + OBJ_RELEASE(ptr); + OPAL_THREAD_UNLOCK(&orte_rmcast_base.lock); + } + item = next; + } +} + +int orte_rmcast_base_close_channel(orte_rmcast_channel_t channel) +{ + opal_list_item_t *item; + rmcast_base_channel_t *chan; + + OPAL_THREAD_LOCK(&orte_rmcast_base.lock); + for (item = opal_list_get_first(&orte_rmcast_base.channels); + item != opal_list_get_end(&orte_rmcast_base.channels); + item = opal_list_get_next(item)) { + chan = (rmcast_base_channel_t*)item; + + if (channel == chan->channel) { + opal_list_remove_item(&orte_rmcast_base.channels, item); + OBJ_RELEASE(chan); + OPAL_THREAD_UNLOCK(&orte_rmcast_base.lock); + return ORTE_SUCCESS; + } + } + + OPAL_THREAD_UNLOCK(&orte_rmcast_base.lock); + return ORTE_ERR_NOT_FOUND; +} + +orte_rmcast_channel_t orte_rmcast_base_query(void) +{ + return orte_rmcast_base.my_group_channel->channel; +} + +static int extract_hdr(opal_buffer_t *buf, + orte_process_name_t *name, + orte_rmcast_channel_t *channel, + orte_rmcast_tag_t *tag, + orte_rmcast_seq_t *seq_num) +{ + int rc; + int32_t n; + + n=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, name, &n, ORTE_NAME))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + n=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, channel, &n, 
ORTE_RMCAST_CHANNEL_T))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + n=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, tag, &n, ORTE_RMCAST_TAG_T))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + n=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, seq_num, &n, ORTE_RMCAST_SEQ_T))) { + ORTE_ERROR_LOG(rc); + } + return rc; +} + +static int insert_hdr(opal_buffer_t *buf, + orte_rmcast_channel_t channel, + orte_rmcast_tag_t tag, + orte_rmcast_seq_t seq_num) +{ + int rc; + + if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &channel, 1, ORTE_RMCAST_CHANNEL_T))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &tag, 1, ORTE_RMCAST_TAG_T))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &seq_num, 1, ORTE_RMCAST_SEQ_T))) { + ORTE_ERROR_LOG(rc); + } + return rc; +} diff --git a/orte/mca/rmcast/base/rmcast_base_open.c b/orte/mca/rmcast/base/rmcast_base_open.c index bee04bdc28..2da6a8e1b3 100644 --- a/orte/mca/rmcast/base/rmcast_base_open.c +++ b/orte/mca/rmcast/base/rmcast_base_open.c @@ -27,6 +27,7 @@ #include "opal/util/if.h" #include "opal/util/opal_sos.h" #include "opal/class/opal_ring_buffer.h" +#include "opal/class/opal_list.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/util/name_fns.h" @@ -79,8 +80,7 @@ orte_rmcast_module_t orte_rmcast = { NULL }; orte_rmcast_base_t orte_rmcast_base; - -static bool opened = false; +static bool opened=false; /** * Function for finding and opening either all MCA components, or the one @@ -101,8 +101,15 @@ int orte_rmcast_base_open(void) return ORTE_SUCCESS; } opened = true; + orte_rmcast_base.opened = true; /* ensure all global values are initialized */ + OBJ_CONSTRUCT(&orte_rmcast_base.lock, opal_mutex_t); + OBJ_CONSTRUCT(&orte_rmcast_base.cond, opal_condition_t); + orte_rmcast_base.active = false; + OBJ_CONSTRUCT(&orte_rmcast_base.recvs, opal_list_t); + OBJ_CONSTRUCT(&orte_rmcast_base.channels, opal_list_t); + orte_rmcast_base.xmit_network = 0; orte_rmcast_base.my_group_name = NULL; orte_rmcast_base.my_group_number = 0; @@ -291,16 +298,12 @@ int orte_rmcast_base_open(void) static void mcast_event_constructor(orte_mcast_msg_event_t *ev) { ev->ev = (opal_event_t*)malloc(sizeof(opal_event_t)); - ev->data = NULL; } static void mcast_event_destructor(orte_mcast_msg_event_t *ev) { if (NULL != ev->ev) { free(ev->ev); } - if (NULL != ev->data) { - free(ev->data); - } } OBJ_CLASS_INSTANCE(orte_mcast_msg_event_t, opal_object_t, @@ -329,7 +332,6 @@ static void recv_construct(rmcast_base_recv_t *ptr) ptr->name.vpid = ORTE_VPID_INVALID; ptr->channel = ORTE_RMCAST_INVALID_CHANNEL; ptr->recvd = false; - ptr->iovecs_requested = false; ptr->tag = ORTE_RMCAST_TAG_INVALID; ptr->flags = ORTE_RMCAST_NON_PERSISTENT; /* default */ ptr->iovec_array = NULL; diff --git a/orte/mca/rmcast/rmcast.h b/orte/mca/rmcast/rmcast.h index 2fb73de541..be246e6bf4 100644 --- a/orte/mca/rmcast/rmcast.h +++ b/orte/mca/rmcast/rmcast.h @@ -122,8 +122,8 @@ typedef int (*orte_rmcast_base_module_recv_nb_fn_t)(orte_rmcast_channel_t channe typedef void (*orte_rmcast_base_module_cancel_recv_fn_t)(orte_rmcast_channel_t channel, orte_rmcast_tag_t tag); -/* open the next available channel */ -typedef int (*orte_rmcast_base_module_open_channel_fn_t)(orte_rmcast_channel_t *channel, char *name, +/* open the specified channel */ +typedef int 
(*orte_rmcast_base_module_open_channel_fn_t)(orte_rmcast_channel_t channel, char *name, char *network, int port, char *interface, uint8_t direction); /* close the channel */ diff --git a/orte/mca/rmcast/spread/rmcast_spread.c b/orte/mca/rmcast/spread/rmcast_spread.c index e9af9fb3d2..31fba7e37e 100644 --- a/orte/mca/rmcast/spread/rmcast_spread.c +++ b/orte/mca/rmcast/spread/rmcast_spread.c @@ -47,11 +47,7 @@ #define SPREAD_NAME "4803" /* LOCAL DATA */ -static opal_mutex_t lock; -static opal_list_t recvs; -static opal_list_t channels; static bool init_completed = false; -static orte_rmcast_channel_t next_channel; static opal_pointer_array_t msg_log; static char private_group[MAX_GROUP_NAME]; @@ -65,9 +61,6 @@ static int setup_channel(rmcast_base_channel_t *chan, uint8_t direction, mailbox static void xmit_data(int sd, short flags, void* send_req); -/* LOCAL STRUCTURE VALUES */ -static rmcast_base_channel_t *my_group_channel=NULL; - /* API FUNCTIONS */ static int init(void); @@ -115,16 +108,9 @@ static int spread_recv_nb(orte_rmcast_channel_t channel, orte_rmcast_callback_fn_t cbfunc, void *cbdata); -static void cancel_recv(orte_rmcast_channel_t channel, - orte_rmcast_tag_t tag); - -static int open_channel(orte_rmcast_channel_t *channel, char *name, +static int open_channel(orte_rmcast_channel_t channel, char *name, char *network, int port, char *interface, uint8_t direction); -static int close_channel(orte_rmcast_channel_t channel); - -static orte_rmcast_channel_t query(void); - /* Define the module */ orte_rmcast_module_t orte_rmcast_spread_module = { @@ -138,69 +124,69 @@ orte_rmcast_module_t orte_rmcast_spread_module = { spread_recv_nb, spread_recv_buffer, spread_recv_buffer_nb, - cancel_recv, + orte_rmcast_base_cancel_recv, open_channel, - close_channel, - query + orte_rmcast_base_close_channel, + orte_rmcast_base_query }; -static void SP_error2str( int error , char *error_str) +static char* SP_error2str( int error) { switch( error ) { case ILLEGAL_SPREAD: - sprintf( error_str, "SP_error: (%d) Illegal spread was provided\n", error ); + return "SP_error: Illegal spread was provided"; break; case COULD_NOT_CONNECT: - sprintf( error_str, "SP_error: (%d) Could not connect. Is Spread running?\n", error ); + return "SP_error: Could not connect. 
Is Spread running?"; break; case REJECT_QUOTA: - sprintf( error_str, "SP_error: (%d) Connection rejected, to many users\n", error ); + return "SP_error: Connection rejected, too many users"; break; case REJECT_NO_NAME: - sprintf( error_str, "SP_error: (%d) Connection rejected, no name was supplied\n", error ); + return "SP_error: Connection rejected, no name was supplied"; break; case REJECT_ILLEGAL_NAME: - sprintf( error_str, "SP_error: (%d) Connection rejected, illegal name\n", error ); + return "SP_error: Connection rejected, illegal name"; break; case REJECT_NOT_UNIQUE: - sprintf( error_str, "SP_error: (%d) Connection rejected, name not unique\n", error ); + return "SP_error: Connection rejected, name not unique"; break; case REJECT_VERSION: - sprintf( error_str, "SP_error: (%d) Connection rejected, library does not fit daemon\n", error ); + return "SP_error: Connection rejected, library does not fit daemon"; break; case CONNECTION_CLOSED: - sprintf( error_str, "SP_error: (%d) Connection closed by spread\n", error ); + return "SP_error: Connection closed by spread"; break; case REJECT_AUTH: - sprintf( error_str, "SP_error: (%d) Connection rejected, authentication failed\n", error ); + return "SP_error: Connection rejected, authentication failed"; break; case ILLEGAL_SESSION: - sprintf( error_str, "SP_error: (%d) Illegal session was supplied\n", error ); + return "SP_error: Illegal session was supplied"; break; case ILLEGAL_SERVICE: - sprintf( error_str, "SP_error: (%d) Illegal service request\n", error ); + return "SP_error: Illegal service request"; break; case ILLEGAL_MESSAGE: - sprintf( error_str, "SP_error: (%d) Illegal message\n", error ); + return "SP_error: Illegal message"; break; case ILLEGAL_GROUP: - sprintf( error_str, "SP_error: (%d) Illegal group\n", error ); + return "SP_error: Illegal group"; break; case BUFFER_TOO_SHORT: - sprintf( error_str, "SP_error: (%d) The supplied buffer was too short\n", error ); + return "SP_error: The supplied buffer was too short"; break; case GROUPS_TOO_SHORT: - sprintf( error_str, "SP_error: (%d) The supplied groups list was too short\n", error ); + return "SP_error: The supplied groups list was too short"; break; case MESSAGE_TOO_LONG: - sprintf( error_str, "SP_error: (%d) The message body + group names was too large to fit in a message\n", error ); + return "SP_error: The message body + group names was too large to fit in a message"; break; case NET_ERROR_ON_SESSION: - sprintf( error_str, "SP_error: (%d) The network socket experienced an error. This Spread mailbox will no longer work until the connection is disconnected and then reconnected\n", error ); + return "SP_error: The network socket experienced an error. 
This Spread mailbox will no longer work until the connection is disconnected and then reconnected"; break; default: - sprintf( error_str, "SP_error: (%d) unrecognized error\n", error ); + return "SP_error: unrecognized error"; } } @@ -228,19 +214,15 @@ static void SP_error2str( int error , char *error_str) static int init(void) { int rc; - orte_rmcast_channel_t channel; if (init_completed) { return ORTE_SUCCESS; } if ((rc = SP_connect(SPREAD_NAME, getlogin(), 0, 1, &Mbox, private_group)) < 0) { - char error_string[1024]; - - SP_error2str(rc, error_string); OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, "rmcast:spread: init SP_connect failed %s ", - error_string)); + SP_error2str(rc))); rc = ORTE_ERROR; return rc; } @@ -251,75 +233,75 @@ static int init(void) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* setup the globals */ - OBJ_CONSTRUCT(&lock, opal_mutex_t); - OBJ_CONSTRUCT(&recvs, opal_list_t); - OBJ_CONSTRUCT(&channels, opal_list_t); - next_channel = ORTE_RMCAST_DYNAMIC_CHANNELS; OBJ_CONSTRUCT(&msg_log, opal_pointer_array_t); opal_pointer_array_init(&msg_log, 8, INT_MAX, 8); /* setup the respective public address channel */ - if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_TOOL) { - channel = ORTE_RMCAST_SYS_CHANNEL; - if (ORTE_SUCCESS != (rc = open_channel(&channel, "system", + if (ORTE_PROC_IS_TOOL) { + /* tools only open the sys channel */ + if (ORTE_SUCCESS != (rc = open_channel(ORTE_RMCAST_SYS_CHANNEL, "system", NULL, -1, NULL, ORTE_RMCAST_BIDIR))) { ORTE_ERROR_LOG(rc); return rc; } + orte_rmcast_base.my_group_channel = (rmcast_base_channel_t*)opal_list_get_last(&orte_rmcast_base.channels); + } else if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) { + /* daemons and hnp open the sys and data server channels */ + if (ORTE_SUCCESS != (rc = open_channel(ORTE_RMCAST_SYS_CHANNEL, "system", + NULL, -1, NULL, ORTE_RMCAST_BIDIR))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if (ORTE_SUCCESS != (rc = open_channel(ORTE_RMCAST_DATA_SERVER_CHANNEL, "data-server", + NULL, -1, NULL, ORTE_RMCAST_BIDIR))) { + ORTE_ERROR_LOG(rc); + return rc; + } + orte_rmcast_base.my_group_channel = (rmcast_base_channel_t*)opal_list_get_last(&orte_rmcast_base.channels); } else if (ORTE_PROC_IS_APP) { - channel = ORTE_RMCAST_APP_PUBLIC_CHANNEL; - if (ORTE_SUCCESS != (rc = open_channel(&channel, "app-announce", + /* apps open the app public and data server channels */ + if (ORTE_SUCCESS != (rc = open_channel(ORTE_RMCAST_APP_PUBLIC_CHANNEL, "app-announce", NULL, -1, NULL, ORTE_RMCAST_BIDIR))) { ORTE_ERROR_LOG(rc); return rc; } - /* setup our grp channel, if one was given */ + if (ORTE_SUCCESS != (rc = open_channel(ORTE_RMCAST_DATA_SERVER_CHANNEL, "data-server", + NULL, -1, NULL, ORTE_RMCAST_BIDIR))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* also setup our grp channel, if one was given */ if (NULL != orte_rmcast_base.my_group_name) { - channel = orte_rmcast_base.my_group_number; - if (ORTE_SUCCESS != (rc = open_channel(&channel, orte_rmcast_base.my_group_name, + if (ORTE_SUCCESS != (rc = open_channel(orte_rmcast_base.my_group_number, + orte_rmcast_base.my_group_name, NULL, -1, NULL, ORTE_RMCAST_BIDIR))) { ORTE_ERROR_LOG(rc); return rc; } - my_group_channel = (rmcast_base_channel_t*)opal_list_get_last(&channels); + orte_rmcast_base.my_group_channel = (rmcast_base_channel_t*)opal_list_get_last(&orte_rmcast_base.channels); } } else { opal_output(0, "rmcast:spread:init - unknown process type"); return ORTE_ERR_SILENT; } - + return ORTE_SUCCESS; } static void finalize(void) { - opal_list_item_t *item; 
rmcast_recv_log_t *log; int j; OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, "%s rmcast:spread: finalize called", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* deconstruct the globals */ - OPAL_THREAD_LOCK(&lock); - while (NULL != (item = opal_list_remove_first(&recvs))) { - OBJ_RELEASE(item); - } - OBJ_DESTRUCT(&recvs); - while (NULL != (item = opal_list_remove_first(&channels))) { - OBJ_RELEASE(item); - } - OBJ_DESTRUCT(&channels); for (j=0; j < msg_log.size; j++) { if (NULL != (log = opal_pointer_array_get_item(&msg_log, j))) { OBJ_RELEASE(log); } } OBJ_DESTRUCT(&msg_log); - OPAL_THREAD_UNLOCK(&lock); - - OBJ_DESTRUCT(&lock); - return; } @@ -360,8 +342,7 @@ static rmcast_base_channel_t *get_chan_from_name(char *name) } static int queue_xmit(rmcast_base_send_t *snd, - orte_rmcast_channel_t channel, - orte_rmcast_tag_t tag) + orte_rmcast_channel_t channel) { opal_list_item_t *item; rmcast_base_channel_t *chptr, *ch; @@ -375,10 +356,11 @@ static int queue_xmit(rmcast_base_send_t *snd, * channel, substitute it */ if (ORTE_RMCAST_GROUP_CHANNEL == channel) { - if (NULL == my_group_channel) { + if (NULL == orte_rmcast_base.my_group_channel) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } - ch = my_group_channel; + ch = orte_rmcast_base.my_group_channel; goto process; } @@ -438,7 +420,7 @@ static int spread_send(orte_rmcast_channel_t channel, snd->cbdata = snd; snd->send_complete = false; - if ((snd->cbdata == NULL) || (ORTE_SUCCESS != (ret = queue_xmit(snd, channel, tag)))) { + if ((snd->cbdata == NULL) || (ORTE_SUCCESS != (ret = queue_xmit(snd, channel)))) { ORTE_ERROR_LOG(ret); return ret; } @@ -466,7 +448,7 @@ static int spread_send_nb(orte_rmcast_channel_t channel, snd->cbfunc_iovec = cbfunc; snd->cbdata = cbdata; - if (ORTE_SUCCESS != (ret = queue_xmit(snd, channel, tag))) { + if (ORTE_SUCCESS != (ret = queue_xmit(snd, channel))) { ORTE_ERROR_LOG(ret); return ret; } @@ -489,7 +471,7 @@ static int spread_send_buffer(orte_rmcast_channel_t channel, snd->cbdata = snd; snd->send_complete = false; - if (ORTE_SUCCESS != (ret = queue_xmit(snd, channel, tag))) { + if (ORTE_SUCCESS != (ret = queue_xmit(snd, channel))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(snd); return ret; @@ -516,7 +498,7 @@ static int spread_send_buffer_nb(orte_rmcast_channel_t channel, snd->cbfunc_buffer = cbfunc; snd->cbdata = cbdata; - if (ORTE_SUCCESS != (ret = queue_xmit(snd, channel, tag))) { + if (ORTE_SUCCESS != (ret = queue_xmit(snd, channel))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(snd); return ret; @@ -525,75 +507,6 @@ static int spread_send_buffer_nb(orte_rmcast_channel_t channel, return ORTE_SUCCESS; } -static int queue_recv(rmcast_base_recv_t *recvptr, - orte_rmcast_channel_t channel, - orte_rmcast_tag_t tag, - orte_rmcast_callback_fn_t cbfunc_iovec, - orte_rmcast_callback_buffer_fn_t cbfunc_buffer, - bool blocking) -{ - opal_list_item_t *item; - rmcast_base_channel_t *ch, *chptr; - rmcast_base_recv_t *rptr; - - /* find the channel */ - ch = NULL; - for (item = opal_list_get_first(&channels); - item != opal_list_get_end(&channels); - item = opal_list_get_next(item)) { - chptr = (rmcast_base_channel_t*)item; - if (channel == chptr->channel) { - ch = chptr; - break; - } - } - if (NULL == ch) { - /* didn't find it */ - return ORTE_ERR_NOT_FOUND; - } - - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s rmcast:spread: queue_recv called on spread channel %s tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ch->name, tag)); - - if (!blocking) { - /* do we already have a recv for this 
channel/tag/type? */ - OPAL_THREAD_LOCK(&lock); - for (item = opal_list_get_first(&recvs); - item != opal_list_get_end(&recvs); - item = opal_list_get_next(item)) { - rptr = (rmcast_base_recv_t*)item; - if (channel != rptr->channel) { - /* different channel */ - continue; - } - if (tag != rptr->tag) { - /* different tag */ - continue; - } - if ((NULL != cbfunc_iovec && NULL != rptr->cbfunc_iovec) || - (NULL != cbfunc_buffer && NULL != rptr->cbfunc_buffer)) { - /* matching type - recv already in place */ - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s rmcast:spread: matching recv already active on spread channel %s tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ch->name, tag)); - OPAL_THREAD_UNLOCK(&lock); - return ORTE_EXISTS; - } - } - OPAL_THREAD_UNLOCK(&lock); - } - - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s rmcast:spread: adding non-blocking recv on spread channel %s tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ch->name, tag)); - OPAL_THREAD_LOCK(&lock); - opal_list_append(&recvs, &recvptr->item); - OPAL_THREAD_UNLOCK(&lock); - - return ORTE_SUCCESS; -} - static int spread_recv(orte_process_name_t *name, orte_rmcast_channel_t channel, orte_rmcast_tag_t tag, @@ -601,15 +514,18 @@ static int spread_recv(orte_process_name_t *name, { rmcast_base_recv_t *recvptr; int ret; + orte_rmcast_channel_t chan; - recvptr = OBJ_NEW(rmcast_base_recv_t); - recvptr->iovecs_requested = true; - recvptr->channel = channel; - recvptr->tag = tag; + if (ORTE_RMCAST_GROUP_CHANNEL == channel) { + chan = orte_rmcast_base.my_group_number; + } else { + chan = channel; + } - if (ORTE_SUCCESS != (ret = queue_recv(recvptr, channel, tag, NULL, NULL, true))) { + if (ORTE_SUCCESS != (ret = orte_rmcast_base_queue_recv(&recvptr, chan, tag, + ORTE_RMCAST_NON_PERSISTENT, + NULL, NULL, NULL, true))) { ORTE_ERROR_LOG(ret); - OBJ_RELEASE(recvptr); return ret; } @@ -625,9 +541,9 @@ static int spread_recv(orte_process_name_t *name, *count = recvptr->iovec_count; /* remove the recv */ - OPAL_THREAD_LOCK(&lock); - opal_list_remove_item(&recvs, &recvptr->item); - OPAL_THREAD_UNLOCK(&lock); + OPAL_THREAD_LOCK(&orte_rmcast_base.lock); + opal_list_remove_item(&orte_rmcast_base.recvs, &recvptr->item); + OPAL_THREAD_UNLOCK(&orte_rmcast_base.lock); OBJ_RELEASE(recvptr); return ORTE_SUCCESS; @@ -638,29 +554,22 @@ static int spread_recv_nb(orte_rmcast_channel_t channel, orte_rmcast_flag_t flags, orte_rmcast_callback_fn_t cbfunc, void *cbdata) { - rmcast_base_recv_t *recvptr; + orte_rmcast_channel_t chan; int ret; + OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, "%s rmcast:spread: recv_nb called on channel %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), channel)); - recvptr = OBJ_NEW(rmcast_base_recv_t); - recvptr->iovecs_requested = true; - recvptr->channel = channel; - recvptr->tag = tag; - recvptr->flags = flags; - recvptr->cbfunc_iovec = cbfunc; - recvptr->cbdata = cbdata; + if (ORTE_RMCAST_GROUP_CHANNEL == channel) { + chan = orte_rmcast_base.my_group_number; + } else { + chan = channel; + } - if (ORTE_SUCCESS != (ret = queue_recv(recvptr, channel, tag, cbfunc, NULL, false))) { - if (ORTE_EXISTS == ret) { - /* this recv already exists - just release the copy */ - OBJ_RELEASE(recvptr); - return ORTE_SUCCESS; - } + if (ORTE_SUCCESS != (ret = orte_rmcast_base_queue_recv(NULL, chan, tag, flags, + cbfunc, NULL, cbdata, false))) { ORTE_ERROR_LOG(ret); - OBJ_RELEASE(recvptr); - return ret; } return ORTE_SUCCESS; @@ -673,18 +582,23 @@ static int spread_recv_buffer(orte_process_name_t *name, { 
rmcast_base_recv_t *recvptr; int ret; + orte_rmcast_channel_t chan; OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, "%s rmcast:spread: recv_buffer called on multicast channel %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), channel)); - recvptr = OBJ_NEW(rmcast_base_recv_t); - recvptr->channel = channel; - recvptr->tag = tag; + if (ORTE_RMCAST_GROUP_CHANNEL == channel) { + chan = orte_rmcast_base.my_group_number; + } else { + chan = channel; + } - if (ORTE_SUCCESS != (ret = queue_recv(recvptr, channel, tag, NULL, NULL, true))) { + if (ORTE_SUCCESS != (ret = orte_rmcast_base_queue_recv(&recvptr, chan, tag, + ORTE_RMCAST_NON_PERSISTENT, + NULL, NULL, NULL, true))) { ORTE_ERROR_LOG(ret); - goto cleanup; + return ret; } ORTE_PROGRESSED_WAIT(recvptr->recvd, 0, 1); @@ -701,10 +615,9 @@ static int spread_recv_buffer(orte_process_name_t *name, /* release the data */ OBJ_RELEASE(recvptr->buf); -cleanup: - OPAL_THREAD_LOCK(&lock); - opal_list_remove_item(&recvs, &recvptr->item); - OPAL_THREAD_UNLOCK(&lock); + OPAL_THREAD_LOCK(&orte_rmcast_base.lock); + opal_list_remove_item(&orte_rmcast_base.recvs, &recvptr->item); + OPAL_THREAD_UNLOCK(&orte_rmcast_base.lock); OBJ_RELEASE(recvptr); return ret; @@ -723,51 +636,22 @@ static int spread_recv_buffer_nb(orte_rmcast_channel_t channel, "%s rmcast:udp: recv_buffer_nb called on multicast channel %d tag %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), channel, tag)); - recvptr = OBJ_NEW(rmcast_base_recv_t); - recvptr->channel = channel; - recvptr->tag = tag; - recvptr->flags = flags; - recvptr->cbfunc_buffer = cbfunc; - recvptr->cbdata = cbdata; + if (ORTE_RMCAST_GROUP_CHANNEL == channel) { + chan = orte_rmcast_base.my_group_number; + } else { + chan = channel; + } - if (ORTE_SUCCESS != (ret = queue_recv(recvptr, channel, tag, NULL, cbfunc, false))) { - if (ORTE_EXISTS == ret) { - /* this recv already exists - just release the copy */ - OBJ_RELEASE(recvptr); - return ORTE_SUCCESS; - } + if (ORTE_SUCCESS != (ret = orte_rmcast_base_queue_recv(NULL, chan, tag, flags, + NULL, cbfunc, cbdata, false))) { ORTE_ERROR_LOG(ret); - OBJ_RELEASE(recvptr); return ret; } return ORTE_SUCCESS; } -static void cancel_recv(orte_rmcast_channel_t channel, - orte_rmcast_tag_t tag) -{ - opal_list_item_t *item, *next; - rmcast_base_recv_t *ptr; - - /* find all recv's for this channel and tag */ - item = opal_list_get_first(&recvs); - while (item != opal_list_get_end(&recvs)) { - next = opal_list_get_next(item); - - ptr = (rmcast_base_recv_t*)item; - if (channel == ptr->channel && - tag == ptr->tag) { - OPAL_THREAD_LOCK(&lock); - opal_list_remove_item(&recvs, &ptr->item); - OBJ_RELEASE(ptr); - OPAL_THREAD_UNLOCK(&lock); - } - item = next; - } -} - -static int open_channel(orte_rmcast_channel_t *channel, char *name, +static int open_channel(orte_rmcast_channel_t channel, char *name, char *network, int port, char *interface, uint8_t direction) { opal_list_item_t *item; @@ -782,27 +666,21 @@ static int open_channel(orte_rmcast_channel_t *channel, char *name, /* see if this name has already been assigned a channel */ OPAL_OUTPUT_VERBOSE((7, orte_rmcast_base.rmcast_output, "%s open_channel: searching for %s:%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), name, *channel)); + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), name, channel)); chan = NULL; - for (item = opal_list_get_first(&channels); - item != opal_list_get_end(&channels); + for (item = opal_list_get_first(&orte_rmcast_base.channels); + item != opal_list_get_end(&orte_rmcast_base.channels); item = opal_list_get_next(item)) { nchan = 
(rmcast_base_channel_t*)item; OPAL_OUTPUT_VERBOSE((7, orte_rmcast_base.rmcast_output, "%s open_channel: channel %s:%d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - nchan->name, *channel)); + nchan->name, channel)); - if (nchan->channel == *channel || + if (nchan->channel == channel || 0 == strcasecmp(nchan->name, name)) { - - /* check the channel, if one was given */ - if (ORTE_RMCAST_INVALID_CHANNEL != *channel && - nchan->channel != *channel) { - continue; - } chan = nchan; break; } @@ -827,17 +705,11 @@ static int open_channel(orte_rmcast_channel_t *channel, char *name, /* we didn't find an existing match, so create a new channel */ chan = OBJ_NEW(rmcast_base_channel_t); chan->name = strdup(name); - /* if we were given a channel, then just use it */ - if (ORTE_RMCAST_INVALID_CHANNEL != *channel) { - chan->channel = *channel; - } else { - chan->channel = next_channel++; - *channel = chan->channel; - } + chan->channel = channel; - OPAL_THREAD_LOCK(&lock); - opal_list_append(&channels, &chan->item); - OPAL_THREAD_UNLOCK(&lock); + OPAL_THREAD_LOCK(&orte_rmcast_base.lock); + opal_list_append(&orte_rmcast_base.channels, &chan->item); + OPAL_THREAD_UNLOCK(&orte_rmcast_base.lock); OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, "%s rmcast:spread opening new channel %s:%d for%s%s", @@ -854,34 +726,6 @@ static int open_channel(orte_rmcast_channel_t *channel, char *name, return ORTE_SUCCESS; } -static int close_channel(orte_rmcast_channel_t channel) -{ - opal_list_item_t *item; - rmcast_base_channel_t *chan; - - OPAL_THREAD_LOCK(&lock); - for (item = opal_list_get_first(&channels); - item != opal_list_get_end(&channels); - item = opal_list_get_next(item)) { - chan = (rmcast_base_channel_t*)item; - - if (channel == chan->channel) { - opal_list_remove_item(&channels, item); - OBJ_RELEASE(chan); - OPAL_THREAD_UNLOCK(&lock); - return ORTE_SUCCESS; - } - } - - OPAL_THREAD_UNLOCK(&lock); - return ORTE_ERR_NOT_FOUND; -} - -static orte_rmcast_channel_t query(void) -{ - return orte_rmcast_base.my_group_number; -} - static int setup_channel(rmcast_base_channel_t *chan, uint8_t direction, mailbox Mbox) { @@ -931,7 +775,7 @@ static void xmit_data(int sd, short flags, void* send_req) int32_t sz, outbound; int rc; int8_t flag; - opal_buffer_t buf; + opal_buffer_t *buf; int32_t tmp32; rmcast_send_log_t *log, *lg; @@ -944,71 +788,10 @@ static void xmit_data(int sd, short flags, void* send_req) while (NULL != (item = opal_list_remove_first(&chan->pending_sends))) { snd = (rmcast_base_send_t*)item; - /* setup a tmp buffer for a working area */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - - /* start the send data area with our header */ - ORTE_MULTICAST_MESSAGE_HDR_HTON(chan->send_data, snd->tag, chan->seq_num); - - /* are we sending a buffer? 
*/ - if (NULL == snd->buf) { - /* flag the buffer as containing iovecs */ - flag = 0; - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &flag, 1, OPAL_INT8))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s packing %d iovecs", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - snd->iovec_count)); - - /* pack the number of iovecs */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &snd->iovec_count, 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - - /* pack each iovec into a buffer in prep for sending - * so we can recreate the array at the other end - */ - for (sz=0; sz < snd->iovec_count; sz++) { - /* pack the size */ - tmp32 = snd->iovec_array[sz].iov_len; - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s packing %d bytes for iovec %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - tmp32, sz)); - - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &tmp32, 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - if (0 < tmp32) { - /* pack the bytes */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, snd->iovec_array[sz].iov_base, tmp32, OPAL_UINT8))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - } - } - } else { - /* flag it as being a buffer */ - flag = 1; - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &flag, 1, OPAL_INT8))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s copying payload", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* copy the payload */ - if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(&buf, snd->buf))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } + /* setup the message for xmission */ + if (ORTE_SUCCESS != (rc = orte_rmcast_base_build_msg(chan, &buf, snd))) { + ORTE_ERROR_LOG(rc); + goto CLEANUP; } /* store the working buf in the send ring buffer in case we @@ -1017,7 +800,7 @@ static void xmit_data(int sd, short flags, void* send_req) log = OBJ_NEW(rmcast_send_log_t); log->channel = chan->channel; log->seq_num = chan->seq_num; - opal_dss.copy_payload(log->buf, &buf); + opal_dss.copy_payload(log->buf, buf); if (NULL != (lg = (rmcast_send_log_t*)opal_ring_buffer_push(&chan->cache, log))) { /* release the old message */ OPAL_OUTPUT_VERBOSE((5, orte_rmcast_base.rmcast_output, @@ -1028,48 +811,28 @@ static void xmit_data(int sd, short flags, void* send_req) } /* unload the working buf to obtain the payload */ - if (ORTE_SUCCESS != (rc = opal_dss.unload(&buf, (void**)&bytes, &sz))) { + if (ORTE_SUCCESS != (rc = opal_dss.unload(buf, (void**)&bytes, &sz))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } /* done with the working buf */ - OBJ_DESTRUCT(&buf); - - /* add the payload, up to the limit */ - ORTE_MULTICAST_LOAD_MESSAGE(chan->send_data, bytes, sz, - mca_rmcast_spread_component.max_msg_size, - &outbound); - - if (outbound < 0) { - /* message was too large */ - opal_output(0, "%s message to multicast network %03d.%03d.%03d.%03d failed - size %d was too large (limit: %d)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), OPAL_IF_FORMAT_ADDR(chan->network), - -1*outbound, mca_rmcast_spread_component.max_msg_size); - if (1 == flag) { - /* reload into original buffer */ - if (ORTE_SUCCESS != (rc = opal_dss.load(snd->buf, (void*)bytes, sz))) { - ORTE_ERROR_LOG(rc); - } - } - /* cleanup */ - goto CLEANUP; - } + OBJ_RELEASE(buf); OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, "%s rmcast:spread multicasting %d bytes to group %s tag %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), outbound, chan->name, (int)snd->tag)); - if (outbound != (rc = SP_multicast(chan->xmit, 
RELIABLE_MESS, chan->name, 0, outbound, (const char *)chan->send_data))) { + if (0 > (rc = SP_multicast(chan->xmit, RELIABLE_MESS, chan->name, 0, sz, (const char *)bytes))) { /* didn't get the message out */ - opal_output(0, "%s failed to send message to spread group %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), chan->name); - /* cleanup */ - goto CLEANUP; + opal_output(0, "%s failed to send message to spread group %s on\n\terror %s(%d)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), chan->name, + SP_error2str(rc), rc); + rc = errno; } - if (1 == flag) { + if (NULL != snd->buf) { /* call the cbfunc if required */ if (NULL != snd->cbfunc_buffer) { snd->cbfunc_buffer(rc, chan->channel, snd->tag, @@ -1103,281 +866,9 @@ static void xmit_data(int sd, short flags, void* send_req) static void process_recv(int fd, short event, void *cbdata) { orte_mcast_msg_event_t *msg = (orte_mcast_msg_event_t*)cbdata; - rmcast_base_channel_t *chan = msg->channel; - opal_list_item_t *item; - rmcast_base_recv_t *ptr; - orte_process_name_t name; - orte_rmcast_tag_t tag; - opal_buffer_t buf; - int8_t flag; - struct iovec *iovec_array=NULL; - int32_t iovec_count=0, i, sz, n; - opal_buffer_t *recvd_buf=NULL; - int rc; - orte_rmcast_seq_t recvd_seq_num; - rmcast_recv_log_t *log, *lg; - /* extract the header */ - ORTE_MULTICAST_MESSAGE_HDR_NTOH(msg->data, &name, tag, recvd_seq_num); - - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s rmcast:spread:recv sender: %s tag: %d seq_num: %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&name), (int)tag, recvd_seq_num)); - - /* if this message is from myself, ignore it */ - if (name.jobid == ORTE_PROC_MY_NAME->jobid && name.vpid == ORTE_PROC_MY_NAME->vpid) { - OPAL_OUTPUT_VERBOSE((10, orte_rmcast_base.rmcast_output, - "%s rmcast:spread:recv sent from myself: %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&name))); - goto cleanup; - } - - /* if this message is from a different job family, ignore it unless - * it is on the system channel. We ignore these messages to avoid - * confusion between different jobs since we all may be sharing - * multicast channels. The system channel is left open to support - * cross-job communications via the HNP. 
- */ - if (ORTE_JOB_FAMILY(name.jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) { - /* if the channel is other than the system channel, ignore it */ - if (ORTE_RMCAST_SYS_CHANNEL != chan->channel) { - OPAL_OUTPUT_VERBOSE((10, orte_rmcast_base.rmcast_output, - "%s rmcast:spread:recv from a different job family: %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&name))); - goto cleanup; - } - /* if I am other than the HNP or a tool, ignore it */ - if (!ORTE_PROC_IS_HNP && !ORTE_PROC_IS_TOOL) { - OPAL_OUTPUT_VERBOSE((10, orte_rmcast_base.rmcast_output, - "%s rmcast:spread:recv from a different job family: %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&name))); - goto cleanup; - } - } - - /* construct the buffer for unpacking */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - - /* unload the message */ - ORTE_MULTICAST_UNLOAD_MESSAGE(&buf, msg->data, msg->sz); - - /* unpack the iovec vs buf flag */ - n=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &flag, &n, OPAL_INT8))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - /* find the recv for this channel, tag, and type */ - for (item = opal_list_get_first(&recvs); - item != opal_list_get_end(&recvs); - item = opal_list_get_next(item)) { - ptr = (rmcast_base_recv_t*)item; - - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s rmcast:spread:recv checking channel %d tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (int)ptr->channel, (int)ptr->tag)); - - if (chan->channel != ptr->channel) { - continue; - } - - if (tag != ptr->tag && ORTE_RMCAST_TAG_WILDCARD != ptr->tag) { - continue; - } - - if (0 == flag && !ptr->iovecs_requested) { - /* it's an iovec and this recv is for buffers */ - continue; - } - - if (1 == flag && ptr->iovecs_requested) { - /* it's a buffer and this recv is for iovecs */ - continue; - } - - /* we have a recv - unpack the data */ - if (0 == flag) { - /* get the number of iovecs in the buffer */ - n=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &iovec_count, &n, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - /* malloc the required space */ - iovec_array = (struct iovec *)malloc(iovec_count * sizeof(struct iovec)); - /* unpack the iovecs */ - for (i=0; i < iovec_count; i++) { - /* unpack the number of bytes in this iovec */ - n=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &sz, &n, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - iovec_array[i].iov_base = NULL; - iovec_array[i].iov_len = sz; - if (0 < sz) { - /* allocate the space */ - iovec_array[i].iov_base = (uint8_t*)malloc(sz); - /* unpack the data */ - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, iovec_array[i].iov_base, &sz, OPAL_UINT8))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - } - } - } else { - /* buffer was included */ - recvd_buf = OBJ_NEW(opal_buffer_t); - if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(recvd_buf, &buf))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - } - - /* if the sender's vpid is invalid, then this is a request for - * assignment of a name - so don't log the message - */ - if (ORTE_VPID_INVALID == name.vpid) { - goto MATCH; - } - - /* look up the message log for this sender */ - log = NULL; - for (n=0; n < msg_log.size; n++) { - if (NULL == (lg = (rmcast_recv_log_t*)opal_pointer_array_get_item(&msg_log, n))) { - continue; - } - if ((name.jobid == lg->name.jobid && name.vpid == lg->name.vpid) && - chan->channel == lg->channel) { - log = lg; - break; - } - } - if (NULL == log) { - /* new sender - create a log */ - OPAL_OUTPUT_VERBOSE((2, 
orte_rmcast_base.rmcast_output, - "%s rmcast:spread:recv creating new msg log for %s channel %d seq# %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&name), (int)msg->channel->channel, recvd_seq_num)); - log = OBJ_NEW(rmcast_recv_log_t); - log->name.jobid = name.jobid; - log->name.vpid = name.vpid; - log->channel = chan->channel; - log->seq_num = recvd_seq_num; - opal_pointer_array_add(&msg_log, log); - goto MATCH; - } - - if (recvd_seq_num < log->seq_num) { - /* this must be a repeat of an earlier message - ignore it */ - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s rmcast:spread:recv recvd repeat msg %d (log at %d) from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - recvd_seq_num, log->seq_num, ORTE_NAME_PRINT(&name))); - goto cleanup; - } - - if (log->seq_num != (recvd_seq_num-1)) { - /* this message out of sequence - tell - * the sender the last number we got - */ - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s rmcast:spread:recv msg %d is out of sequence (log at %d) from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - recvd_seq_num, log->seq_num, ORTE_NAME_PRINT(&name))); - /* ignore this message */ - goto cleanup; - } - - /* update the seq number */ - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s rmcast:spread:recv update msg log to %d from %s:%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - recvd_seq_num, ORTE_NAME_PRINT(&name), log->channel)); - log->seq_num = recvd_seq_num; - - MATCH: - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s rmcast:spread:recv delivering message to channel %d tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ptr->channel, (int)tag)); - - if (0 == flag) { - /* dealing with iovecs */ - if (NULL != ptr->cbfunc_iovec) { - ptr->cbfunc_iovec(ORTE_SUCCESS, ptr->channel, tag, - &name, iovec_array, iovec_count, ptr->cbdata); - /* if it isn't persistent, remove it */ - if (!(ORTE_RMCAST_PERSISTENT & ptr->flags)) { - OPAL_THREAD_LOCK(&lock); - opal_list_remove_item(&recvs, &ptr->item); - OPAL_THREAD_UNLOCK(&lock); - OBJ_RELEASE(ptr); - } - } else { - /* copy over the iovec array since it will be released by - * the blocking recv - */ - ptr->iovec_array = (struct iovec *)malloc(iovec_count * sizeof(struct iovec)); - ptr->iovec_count = iovec_count; - for (i=0; i < iovec_count; i++) { - ptr->iovec_array[i].iov_base = (uint8_t*)malloc(iovec_array[i].iov_len); - ptr->iovec_array[i].iov_len = iovec_array[i].iov_len; - memcpy(ptr->iovec_array[i].iov_base, iovec_array[i].iov_base, iovec_array[i].iov_len); - } - /* copy the sender's name */ - ptr->name.jobid = name.jobid; - ptr->name.vpid = name.vpid; - /* flag it as recvd to release blocking recv */ - ptr->recvd = true; - } - } else { - if (NULL != ptr->cbfunc_buffer) { - ptr->cbfunc_buffer(ORTE_SUCCESS, ptr->channel, tag, - &name, recvd_buf, ptr->cbdata); - /* if it isn't persistent, remove it */ - if (!(ORTE_RMCAST_PERSISTENT & ptr->flags)) { - OPAL_THREAD_LOCK(&lock); - opal_list_remove_item(&recvs, &ptr->item); - OPAL_THREAD_UNLOCK(&lock); - OBJ_RELEASE(ptr); - } - } else { - /* copy the buffer across since it will be released - * by the blocking recv - */ - ptr->buf = OBJ_NEW(opal_buffer_t); - if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(ptr->buf, recvd_buf))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - /* copy the sender's name */ - ptr->name.jobid = name.jobid; - ptr->name.vpid = name.vpid; - /* flag it as recvd to release blocking recv */ - ptr->recvd = true; - } - } - /* we are done - only one recv can match */ - break; - } - -cleanup: + 
orte_rmcast_base_process_recv(msg); OBJ_RELEASE(msg); - if (NULL != iovec_array) { - for (i=0; i < iovec_count; i++) { - free(iovec_array[i].iov_base); - } - free(iovec_array); - } - if (NULL != recvd_buf) { - OBJ_RELEASE(recvd_buf); - } return; } @@ -1399,6 +890,7 @@ static void recv_handler(int sd, short flags, void* cbdata) int num_groups, size_data; int16 mess_type; int endian_mismatch; + opal_buffer_t *buf; if (!groups) { size_groups = 1; @@ -1413,12 +905,9 @@ static void recv_handler(int sd, short flags, void* cbdata) do { sz = SP_receive(sd, &srvc, sender, size_groups, &num_groups, groups, &mess_type, &endian_mismatch, size_data, (char *)data); if (sz < 0) { - char error_string[1024]; - - SP_error2str(sz, error_string); /* this shouldn't happen - report the errno */ opal_output(0, "%s Error on multicast recv spread event: %s(%d:%d:%d)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), error_string, sz, num_groups, endian_mismatch); + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), SP_error2str(sz), sz, num_groups, endian_mismatch); switch (sz) { @@ -1474,7 +963,9 @@ static void recv_handler(int sd, short flags, void* cbdata) (int)sz, (int)chan->channel, chan->name)); /* clear the way for the next message */ - ORTE_MULTICAST_MESSAGE_EVENT(data, sz, chan, process_recv); + buf = OBJ_NEW(opal_buffer_t); + opal_dss.load(buf, data, sz); + ORTE_MULTICAST_MESSAGE_EVENT(buf, process_recv); } else { /* * We've just received a message on a channel whose name diff --git a/orte/mca/rmcast/spread/rmcast_spread.h b/orte/mca/rmcast/spread/rmcast_spread.h index f25baea22f..4cb761437d 100644 --- a/orte/mca/rmcast/spread/rmcast_spread.h +++ b/orte/mca/rmcast/spread/rmcast_spread.h @@ -19,14 +19,6 @@ BEGIN_C_DECLS -#define ORTE_RMCAST_SPREAD_MAX_MSG_SIZE 1500 - -typedef struct { - orte_rmcast_base_component_t super; - int max_msg_size; -} orte_rmcast_spread_component_t; - - /* * Module open / close */ @@ -35,7 +27,7 @@ int orte_rmcast_spread_component_close(void); int orte_rmcast_spread_component_query(mca_base_module_t **module, int *priority); -ORTE_MODULE_DECLSPEC extern orte_rmcast_spread_component_t mca_rmcast_spread_component; +ORTE_MODULE_DECLSPEC extern orte_rmcast_base_component_t mca_rmcast_spread_component; ORTE_DECLSPEC extern orte_rmcast_module_t orte_rmcast_spread_module; END_C_DECLS diff --git a/orte/mca/rmcast/spread/rmcast_spread_component.c b/orte/mca/rmcast/spread/rmcast_spread_component.c index 89a400d6b7..96c11aff6c 100644 --- a/orte/mca/rmcast/spread/rmcast_spread_component.c +++ b/orte/mca/rmcast/spread/rmcast_spread_component.c @@ -38,26 +38,24 @@ const char *mca_rmcast_spread_component_version_string = * Instantiate the public struct with all of our public information * and pointers to our public functions in it */ -orte_rmcast_spread_component_t mca_rmcast_spread_component = { +orte_rmcast_base_component_t mca_rmcast_spread_component = { { - { - ORTE_RMCAST_BASE_VERSION_1_0_0, - - /* Component name and version */ - "spread", - ORTE_MAJOR_VERSION, - ORTE_MINOR_VERSION, - ORTE_RELEASE_VERSION, - - /* Component open and close functions */ - orte_rmcast_spread_component_open, - orte_rmcast_spread_component_close, - orte_rmcast_spread_component_query - }, - { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - } + ORTE_RMCAST_BASE_VERSION_1_0_0, + + /* Component name and version */ + "spread", + ORTE_MAJOR_VERSION, + ORTE_MINOR_VERSION, + ORTE_RELEASE_VERSION, + + /* Component open and close functions */ + orte_rmcast_spread_component_open, + 
orte_rmcast_spread_component_close, + orte_rmcast_spread_component_query + }, + { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT } }; @@ -65,14 +63,6 @@ orte_rmcast_spread_component_t mca_rmcast_spread_component = { int orte_rmcast_spread_component_open(void) { - mca_base_component_t *c = &mca_rmcast_spread_component.super.version; - - mca_base_param_reg_int(c, "max_msg_size", - "Max #bytes in a single msg (must be > 0)", - false, false, - ORTE_RMCAST_SPREAD_MAX_MSG_SIZE, - &mca_rmcast_spread_component.max_msg_size); - return ORTE_SUCCESS; } diff --git a/orte/mca/rmcast/tcp/rmcast_tcp.c b/orte/mca/rmcast/tcp/rmcast_tcp.c index f85f8ab2cf..a9b28d3fe5 100644 --- a/orte/mca/rmcast/tcp/rmcast_tcp.c +++ b/orte/mca/rmcast/tcp/rmcast_tcp.c @@ -35,25 +35,25 @@ #include "orte/mca/grpcomm/grpcomm.h" #include "orte/mca/rml/rml.h" #include "orte/mca/rml/base/rml_contact.h" +#include "orte/mca/odls/odls_types.h" #include "orte/mca/rmcast/base/private.h" #include "orte/mca/rmcast/base/base.h" #include "rmcast_tcp.h" /* LOCAL DATA */ -static opal_mutex_t lock; -static opal_list_t recvs; -static opal_list_t channels; static bool init_completed = false; -static orte_rmcast_channel_t next_channel; +static orte_job_t *daemons=NULL; /* LOCAL FUNCTIONS */ static void recv_handler(int status, orte_process_name_t* sender, opal_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata); -/* LOCAL STRUCTURE VALUES */ -static rmcast_base_channel_t *my_group_channel=NULL; +static void relay_handler(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata); +static void relay(int fd, short event, void *cbdata); /* API FUNCTIONS */ static int init(void); @@ -102,16 +102,9 @@ static int tcp_recv_nb(orte_rmcast_channel_t channel, orte_rmcast_callback_fn_t cbfunc, void *cbdata); -static void cancel_recv(orte_rmcast_channel_t channel, - orte_rmcast_tag_t tag); - -static int open_channel(orte_rmcast_channel_t *channel, char *name, +static int open_channel(orte_rmcast_channel_t channel, char *name, char *network, int port, char *interface, uint8_t direction); -static int close_channel(orte_rmcast_channel_t channel); - -static orte_rmcast_channel_t query(void); - /* Define the module */ orte_rmcast_module_t orte_rmcast_tcp_module = { @@ -125,10 +118,10 @@ orte_rmcast_module_t orte_rmcast_tcp_module = { tcp_recv_nb, tcp_recv_buffer, tcp_recv_buffer_nb, - cancel_recv, + orte_rmcast_base_cancel_recv, open_channel, - close_channel, - query + orte_rmcast_base_close_channel, + orte_rmcast_base_query }; /* during init, we setup two channels for both xmit and recv: @@ -153,7 +146,6 @@ orte_rmcast_module_t orte_rmcast_tcp_module = { static int init(void) { int rc; - orte_rmcast_channel_t channel; if (init_completed) { return ORTE_SUCCESS; @@ -164,43 +156,64 @@ static int init(void) "%s rmcast:tcp: init called", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* setup the globals */ - OBJ_CONSTRUCT(&lock, opal_mutex_t); - OBJ_CONSTRUCT(&recvs, opal_list_t); - OBJ_CONSTRUCT(&channels, opal_list_t); - next_channel = ORTE_RMCAST_DYNAMIC_CHANNELS; - /* setup the respective public address channel */ - if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_TOOL) { - channel = ORTE_RMCAST_SYS_CHANNEL; - if (ORTE_SUCCESS != (rc = open_channel(&channel, "system", + if (ORTE_PROC_IS_TOOL) { + /* tools only open the sys channel */ + if (ORTE_SUCCESS != (rc = open_channel(ORTE_RMCAST_SYS_CHANNEL, "system", NULL, -1, NULL, ORTE_RMCAST_BIDIR))) { ORTE_ERROR_LOG(rc); return rc; } - } 
else if (ORTE_PROC_IS_APP) { - channel = ORTE_RMCAST_APP_PUBLIC_CHANNEL; - if (ORTE_SUCCESS != (rc = open_channel(&channel, "app-announce", + orte_rmcast_base.my_group_channel = (rmcast_base_channel_t*)opal_list_get_last(&orte_rmcast_base.channels); + } else if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) { + /* daemons and hnp open the sys and data server channels */ + if (ORTE_SUCCESS != (rc = open_channel(ORTE_RMCAST_SYS_CHANNEL, "system", NULL, -1, NULL, ORTE_RMCAST_BIDIR))) { ORTE_ERROR_LOG(rc); return rc; } + orte_rmcast_base.my_group_channel = (rmcast_base_channel_t*)opal_list_get_last(&orte_rmcast_base.channels); + if (ORTE_SUCCESS != (rc = open_channel(ORTE_RMCAST_DATA_SERVER_CHANNEL, "data-server", + NULL, -1, NULL, ORTE_RMCAST_BIDIR))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* activate a recv to catch relays */ + if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, + ORTE_RML_TAG_MULTICAST_RELAY, + ORTE_RML_NON_PERSISTENT, + relay_handler, + NULL))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } else if (ORTE_PROC_IS_APP) { + /* apps open the app public and data server channels */ + if (ORTE_SUCCESS != (rc = open_channel(ORTE_RMCAST_APP_PUBLIC_CHANNEL, "app-announce", + NULL, -1, NULL, ORTE_RMCAST_BIDIR))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if (ORTE_SUCCESS != (rc = open_channel(ORTE_RMCAST_DATA_SERVER_CHANNEL, "data-server", + NULL, -1, NULL, ORTE_RMCAST_BIDIR))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* finally, if we are an app, setup our grp channel, if one was given */ + if (ORTE_PROC_IS_APP && NULL != orte_rmcast_base.my_group_name) { + if (ORTE_SUCCESS != (rc = open_channel(orte_rmcast_base.my_group_number, + orte_rmcast_base.my_group_name, + NULL, -1, NULL, ORTE_RMCAST_BIDIR))) { + ORTE_ERROR_LOG(rc); + return rc; + } + orte_rmcast_base.my_group_channel = (rmcast_base_channel_t*)opal_list_get_last(&orte_rmcast_base.channels); + } } else { opal_output(0, "rmcast:tcp:init - unknown process type"); return ORTE_ERR_SILENT; } - /* finally, if we are an app, setup our grp channel, if one was given */ - if (ORTE_PROC_IS_APP && NULL != orte_rmcast_base.my_group_name) { - channel = orte_rmcast_base.my_group_number; - if (ORTE_SUCCESS != (rc = open_channel(&channel, orte_rmcast_base.my_group_name, - NULL, -1, NULL, ORTE_RMCAST_BIDIR))) { - ORTE_ERROR_LOG(rc); - return rc; - } - my_group_channel = (rmcast_base_channel_t*)opal_list_get_last(&channels); - } - if (ORTE_JOBID_WILDCARD == orte_process_info.my_hnp.jobid) { /* set the HNP info in our contact table */ if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(orte_process_info.my_hnp_uri))) { @@ -230,29 +243,10 @@ static int init(void) static void finalize(void) { - opal_list_item_t *item; - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, "%s rmcast:tcp: finalize called", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* cancel the recv */ - orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_MULTICAST); - /* deconstruct the globals */ - OPAL_THREAD_LOCK(&lock); - while (NULL != (item = opal_list_remove_first(&recvs))) { - OBJ_RELEASE(item); - } - OBJ_DESTRUCT(&recvs); - while (NULL != (item = opal_list_remove_first(&channels))) { - OBJ_RELEASE(item); - } - OBJ_DESTRUCT(&channels); - OPAL_THREAD_UNLOCK(&lock); - - OBJ_DESTRUCT(&lock); - return; } @@ -278,32 +272,39 @@ static void internal_snd_buf_cb(int status, } static int queue_xmit(rmcast_base_send_t *snd, - orte_rmcast_channel_t channel, - orte_rmcast_tag_t tag) + orte_rmcast_channel_t channel) { opal_list_item_t *item; - rmcast_base_channel_t 
*chptr, *ch; - int32_t sz; - int rc; - int8_t flag; - opal_buffer_t buf; - int32_t tmp32; + rmcast_base_channel_t *ch, *chptr; + orte_proc_t *proc; + orte_odls_child_t *child; + int rc, v; + opal_buffer_t *buf; + + OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, + "%s rmcast:tcp: send of %d %s" + " called on multicast channel %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == snd->iovec_array) ? (int)snd->buf->bytes_used : (int)snd->iovec_count, + (NULL == snd->iovec_array) ? "bytes" : "iovecs", + (int)channel)); /* if we were asked to send this on our group output * channel, substitute it */ if (ORTE_RMCAST_GROUP_CHANNEL == channel) { - if (NULL == my_group_channel) { + if (NULL == orte_rmcast_base.my_group_channel) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } - ch = my_group_channel; + ch = orte_rmcast_base.my_group_channel; goto process; } /* find the channel */ ch = NULL; - for (item = opal_list_get_first(&channels); - item != opal_list_get_end(&channels); + for (item = opal_list_get_first(&orte_rmcast_base.channels); + item != opal_list_get_end(&orte_rmcast_base.channels); item = opal_list_get_next(item)) { chptr = (rmcast_base_channel_t*)item; if (channel == chptr->channel) { @@ -313,120 +314,77 @@ static int queue_xmit(rmcast_base_send_t *snd, } if (NULL == ch) { /* didn't find it */ + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } process: - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s rmcast:tcp: send of %d %s" - " called on multicast channel %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (NULL == snd->iovec_array) ? (int)snd->buf->bytes_used : (int)snd->iovec_count, - (NULL == snd->iovec_array) ? "bytes" : "iovecs", - (int)ch->channel)); - + /* setup the message for xmission */ + if (ORTE_SUCCESS != (rc = orte_rmcast_base_build_msg(ch, &buf, snd))) { + ORTE_ERROR_LOG(rc); + return rc; + } OPAL_THREAD_LOCK(&ch->send_lock); - /* setup a buffer */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); + + OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, + "%s rmcast:tcp multicasting %d bytes to channel %d tag %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)buf->bytes_used, + (int)ch->channel, (int)snd->tag)); - /* pack our name */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - /* pack the channel */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &ch->channel, 1, ORTE_RMCAST_CHANNEL_T))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - /* pack the tag */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &tag, 1, ORTE_RMCAST_TAG_T))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - /* pack the sequence number */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &ch->seq_num, 1, ORTE_RMCAST_SEQ_T))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - /* are we sending a buffer? 
*/ - if (NULL == snd->buf) { - /* no, flag the buffer as containing iovecs */ - flag = 0; - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &flag, 1, OPAL_INT8))) { - ORTE_ERROR_LOG(rc); - goto cleanup; + if (ORTE_PROC_IS_HNP) { + /* if we don't already have it, get the daemon object */ + if (NULL == daemons) { + daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); } - - /* pack the number of iovecs */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &snd->iovec_count, 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - /* pack each iovec into a buffer in prep for sending - * so we can recreate the array at the other end - */ - for (sz=0; sz < snd->iovec_count; sz++) { - /* pack the size */ - tmp32 = snd->iovec_array[sz].iov_len; - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &tmp32, 1, OPAL_INT32))) { + /* send it to each daemon */ + for (v=1; v < daemons->procs->size; v++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, v))) { + continue; + } + if (0 > (rc = orte_rml.send_buffer(&proc->name, buf, ORTE_RML_TAG_MULTICAST, 0))) { ORTE_ERROR_LOG(rc); goto cleanup; } - if (0 < tmp32) { - /* pack the bytes */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, snd->iovec_array[sz].iov_base, tmp32, OPAL_UINT8))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } + } + /* send the message to my children */ + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = opal_list_get_next(item)) { + child = (orte_odls_child_t*)item; + if (0 > (rc = orte_rml.send_buffer(child->name, buf, ORTE_RML_TAG_MULTICAST, 0))) { + ORTE_ERROR_LOG(rc); + goto cleanup; } } - + rc = ORTE_SUCCESS; } else { - /* flag it as being a buffer */ - flag = 1; - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &flag, 1, OPAL_INT8))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - /* copy the payload */ - if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(&buf, snd->buf))) { + /* send it to the HNP */ + OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, + "%s rmcast:tcp sending multicast to HNP %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(ORTE_PROC_MY_HNP))); + if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, buf, ORTE_RML_TAG_MULTICAST_RELAY, 0))) { ORTE_ERROR_LOG(rc); + /* didn't get the message out */ + opal_output(0, "%s failed to send message to multicast channel %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)ch->channel); goto cleanup; } + rc = ORTE_SUCCESS; } - - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s rmcast:tcp multicasting %d bytes to channel %d tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)buf.bytes_used, - (int)ch->channel, (int)tag)); - - if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(ORTE_JOBID_WILDCARD, - &buf, ORTE_RML_TAG_MULTICAST))) { - /* didn't get the message out */ - opal_output(0, "%s failed to send message to multicast channel %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)ch->channel); - goto cleanup; - } - - if (1 == flag) { + + if (NULL != snd->buf) { /* call the cbfunc if required */ if (NULL != snd->cbfunc_buffer) { - snd->cbfunc_buffer(rc, channel, tag, + snd->cbfunc_buffer(rc, channel, snd->tag, ORTE_PROC_MY_NAME, snd->buf, snd->cbdata); } } else { /* call the cbfunc if required */ if (NULL != snd->cbfunc_iovec) { - snd->cbfunc_iovec(rc, channel, tag, + snd->cbfunc_iovec(rc, channel, snd->tag, ORTE_PROC_MY_NAME, snd->iovec_array, snd->iovec_count, snd->cbdata); } @@ -436,11 +394,11 @@ process: ORTE_MULTICAST_NEXT_SEQUENCE_NUM(ch->seq_num); cleanup: - 
OBJ_DESTRUCT(&buf); + OBJ_RELEASE(buf); OPAL_THREAD_UNLOCK(&ch->send_lock); - return ORTE_SUCCESS; + return rc; } static int tcp_send(orte_rmcast_channel_t channel, @@ -458,7 +416,7 @@ static int tcp_send(orte_rmcast_channel_t channel, snd.cbfunc_iovec = internal_snd_cb; send_complete = false; - if (ORTE_SUCCESS != (ret = queue_xmit(&snd, channel, tag))) { + if (ORTE_SUCCESS != (ret = queue_xmit(&snd, channel))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&snd); return ret; @@ -488,7 +446,7 @@ static int tcp_send_nb(orte_rmcast_channel_t channel, snd.cbfunc_iovec = cbfunc; snd.cbdata = cbdata; - if (ORTE_SUCCESS != (ret = queue_xmit(&snd, channel, tag))) { + if (ORTE_SUCCESS != (ret = queue_xmit(&snd, channel))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&snd); return ret; @@ -512,7 +470,7 @@ static int tcp_send_buffer(orte_rmcast_channel_t channel, snd.cbfunc_buffer = internal_snd_buf_cb; send_buf_complete = false; - if (ORTE_SUCCESS != (ret = queue_xmit(&snd, channel, tag))) { + if (ORTE_SUCCESS != (ret = queue_xmit(&snd, channel))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&snd); return ret; @@ -541,7 +499,7 @@ static int tcp_send_buffer_nb(orte_rmcast_channel_t channel, snd.cbfunc_buffer = cbfunc; snd.cbdata = cbdata; - if (ORTE_SUCCESS != (ret = queue_xmit(&snd, channel, tag))) { + if (ORTE_SUCCESS != (ret = queue_xmit(&snd, channel))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&snd); return ret; @@ -551,75 +509,6 @@ static int tcp_send_buffer_nb(orte_rmcast_channel_t channel, return ORTE_SUCCESS; } -static int queue_recv(rmcast_base_recv_t *recvptr, - orte_rmcast_channel_t channel, - orte_rmcast_tag_t tag, - orte_rmcast_callback_fn_t cbfunc_iovec, - orte_rmcast_callback_buffer_fn_t cbfunc_buffer, - bool blocking) -{ - opal_list_item_t *item; - rmcast_base_channel_t *ch, *chptr; - rmcast_base_recv_t *rptr; - - /* find the channel */ - ch = NULL; - for (item = opal_list_get_first(&channels); - item != opal_list_get_end(&channels); - item = opal_list_get_next(item)) { - chptr = (rmcast_base_channel_t*)item; - if (channel == chptr->channel) { - ch = chptr; - break; - } - } - if (NULL == ch) { - /* didn't find it */ - return ORTE_ERR_NOT_FOUND; - } - - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s rmcast:tcp: queue_recv called on multicast channel %03d.%03d.%03d.%03d tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), OPAL_IF_FORMAT_ADDR(ch->network), tag)); - - if (!blocking) { - /* do we already have a recv for this channel/tag/cbfunc? 
*/ - OPAL_THREAD_LOCK(&lock); - for (item = opal_list_get_first(&recvs); - item != opal_list_get_end(&recvs); - item = opal_list_get_next(item)) { - rptr = (rmcast_base_recv_t*)item; - if (channel != rptr->channel) { - /* different channel */ - continue; - } - if (tag != rptr->tag) { - /* different tag */ - continue; - } - if ((NULL != cbfunc_iovec && NULL != rptr->cbfunc_iovec) || - (NULL != cbfunc_buffer && NULL != rptr->cbfunc_buffer)) { - /* matching type - recv already in place */ - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s rmcast:tcp: matching recv already active on multicast channel %03d.%03d.%03d.%03d tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), OPAL_IF_FORMAT_ADDR(ch->network), tag)); - OPAL_THREAD_UNLOCK(&lock); - return ORTE_EXISTS; - } - } - OPAL_THREAD_UNLOCK(&lock); - } - - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s rmcast:tcp: adding non-blocking recv on multicast channel %d tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ch->channel, tag)); - OPAL_THREAD_LOCK(&lock); - opal_list_append(&recvs, &recvptr->item); - OPAL_THREAD_UNLOCK(&lock); - - return ORTE_SUCCESS; -} - static int tcp_recv(orte_process_name_t *name, orte_rmcast_channel_t channel, orte_rmcast_tag_t tag, @@ -627,14 +516,18 @@ static int tcp_recv(orte_process_name_t *name, { rmcast_base_recv_t *recvptr; int ret; + orte_rmcast_channel_t chan; + + if (ORTE_RMCAST_GROUP_CHANNEL == channel) { + chan = orte_rmcast_base.my_group_number; + } else { + chan = channel; + } - recvptr = OBJ_NEW(rmcast_base_recv_t); - recvptr->channel = channel; - recvptr->tag = tag; - - if (ORTE_SUCCESS != (ret = queue_recv(recvptr, channel, tag, NULL, NULL, true))) { + if (ORTE_SUCCESS != (ret = orte_rmcast_base_queue_recv(&recvptr, chan, tag, + ORTE_RMCAST_NON_PERSISTENT, + NULL, NULL, NULL, true))) { ORTE_ERROR_LOG(ret); - OBJ_RELEASE(recvptr); return ret; } @@ -650,9 +543,9 @@ static int tcp_recv(orte_process_name_t *name, *count = recvptr->iovec_count; /* remove the recv */ - OPAL_THREAD_LOCK(&lock); - opal_list_remove_item(&recvs, &recvptr->item); - OPAL_THREAD_UNLOCK(&lock); + OPAL_THREAD_LOCK(&orte_rmcast_base.lock); + opal_list_remove_item(&orte_rmcast_base.recvs, &recvptr->item); + OPAL_THREAD_UNLOCK(&orte_rmcast_base.lock); OBJ_RELEASE(recvptr); return ORTE_SUCCESS; @@ -663,31 +556,25 @@ static int tcp_recv_nb(orte_rmcast_channel_t channel, orte_rmcast_flag_t flags, orte_rmcast_callback_fn_t cbfunc, void *cbdata) { - rmcast_base_recv_t *recvptr; int ret; + orte_rmcast_channel_t chan; + OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, "%s rmcast:tcp: recv_nb called on channel %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), channel)); - recvptr = OBJ_NEW(rmcast_base_recv_t); - recvptr->channel = channel; - recvptr->tag = tag; - recvptr->flags = flags; - recvptr->cbfunc_iovec = cbfunc; - recvptr->cbdata = cbdata; - - if (ORTE_SUCCESS != (ret = queue_recv(recvptr, channel, tag, cbfunc, NULL, false))) { - if (ORTE_EXISTS == ret) { - /* this recv already exists - just release the copy */ - OBJ_RELEASE(recvptr); - return ORTE_SUCCESS; - } - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(recvptr); - return ret; + if (ORTE_RMCAST_GROUP_CHANNEL == channel) { + chan = orte_rmcast_base.my_group_number; + } else { + chan = channel; } - return ORTE_SUCCESS; + if (ORTE_SUCCESS != (ret = orte_rmcast_base_queue_recv(NULL, chan, tag, flags, + cbfunc, NULL, cbdata, false))) { + ORTE_ERROR_LOG(ret); + } + + return ret; } static int tcp_recv_buffer(orte_process_name_t *name, @@ -697,18 +584,23 @@ static int 
tcp_recv_buffer(orte_process_name_t *name, { rmcast_base_recv_t *recvptr; int ret; - + orte_rmcast_channel_t chan; + OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, "%s rmcast:tcp: recv_buffer called on multicast channel %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), channel)); - recvptr = OBJ_NEW(rmcast_base_recv_t); - recvptr->channel = channel; - recvptr->tag = tag; + if (ORTE_RMCAST_GROUP_CHANNEL == channel) { + chan = orte_rmcast_base.my_group_number; + } else { + chan = channel; + } - if (ORTE_SUCCESS != (ret = queue_recv(recvptr, channel, tag, NULL, NULL, true))) { + if (ORTE_SUCCESS != (ret = orte_rmcast_base_queue_recv(&recvptr, chan, tag, + ORTE_RMCAST_NON_PERSISTENT, + NULL, NULL, NULL, true))) { ORTE_ERROR_LOG(ret); - goto cleanup; + return ret; } ORTE_PROGRESSED_WAIT(recvptr->recvd, 0, 1); @@ -725,10 +617,9 @@ static int tcp_recv_buffer(orte_process_name_t *name, /* release the data */ OBJ_RELEASE(recvptr->buf); -cleanup: - OPAL_THREAD_LOCK(&lock); - opal_list_remove_item(&recvs, &recvptr->item); - OPAL_THREAD_UNLOCK(&lock); + OPAL_THREAD_LOCK(&orte_rmcast_base.lock); + opal_list_remove_item(&orte_rmcast_base.recvs, &recvptr->item); + OPAL_THREAD_UNLOCK(&orte_rmcast_base.lock); OBJ_RELEASE(recvptr); return ret; @@ -739,81 +630,51 @@ static int tcp_recv_buffer_nb(orte_rmcast_channel_t channel, orte_rmcast_flag_t flags, orte_rmcast_callback_buffer_fn_t cbfunc, void *cbdata) { - rmcast_base_recv_t *recvptr; int ret; + orte_rmcast_channel_t chan; OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, "%s rmcast:tcp: recv_buffer_nb called on multicast channel %d tag %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), channel, tag)); - recvptr = OBJ_NEW(rmcast_base_recv_t); - recvptr->channel = channel; - recvptr->tag = tag; - recvptr->flags = flags; - recvptr->cbfunc_buffer = cbfunc; - recvptr->cbdata = cbdata; + if (ORTE_RMCAST_GROUP_CHANNEL == channel) { + chan = orte_rmcast_base.my_group_number; + } else { + chan = channel; + } - if (ORTE_SUCCESS != (ret = queue_recv(recvptr, channel, tag, NULL, cbfunc, false))) { - if (ORTE_EXISTS == ret) { - /* this recv already exists - just release the copy */ - OBJ_RELEASE(recvptr); - return ORTE_SUCCESS; - } + if (ORTE_SUCCESS != (ret = orte_rmcast_base_queue_recv(NULL, chan, tag, flags, + NULL, cbfunc, cbdata, false))) { ORTE_ERROR_LOG(ret); - OBJ_RELEASE(recvptr); return ret; } return ORTE_SUCCESS; } -static void cancel_recv(orte_rmcast_channel_t channel, - orte_rmcast_tag_t tag) -{ - opal_list_item_t *item, *next; - rmcast_base_recv_t *ptr; - - /* find all recv's for this channel and tag */ - item = opal_list_get_first(&recvs); - while (item != opal_list_get_end(&recvs)) { - next = opal_list_get_next(item); - - ptr = (rmcast_base_recv_t*)item; - if (channel == ptr->channel && - tag == ptr->tag) { - OPAL_THREAD_LOCK(&lock); - opal_list_remove_item(&recvs, &ptr->item); - OBJ_RELEASE(ptr); - OPAL_THREAD_UNLOCK(&lock); - } - item = next; - } -} - /* for the tcp module, we will be using the RML to "fake" a * multicast in combination with the grpcomm "xcast" interface. 
* We cannot control the network and interface in this * combination as it gets auto-picked well before us, so we * ignore that info here */ -static int open_channel(orte_rmcast_channel_t *channel, char *name, +static int open_channel(orte_rmcast_channel_t channel, char *name, char *network, int port, char *interface, uint8_t direction) { opal_list_item_t *item; - rmcast_base_channel_t *nchan, *chan; + rmcast_base_channel_t *chan; /* see if this name has already been assigned a channel on the specified network */ - chan = NULL; - for (item = opal_list_get_first(&channels); - item != opal_list_get_end(&channels); + for (item = opal_list_get_first(&orte_rmcast_base.channels); + item != opal_list_get_end(&orte_rmcast_base.channels); item = opal_list_get_next(item)) { - nchan = (rmcast_base_channel_t*)item; + chan = (rmcast_base_channel_t*)item; - if (0 == strcasecmp(nchan->name, name)) { + if (0 == strcasecmp(chan->name, name)) { /* check the channel, if one was given */ - if (ORTE_RMCAST_INVALID_CHANNEL != *channel && - nchan->channel != *channel) { - continue; + if (ORTE_RMCAST_INVALID_CHANNEL != channel && + ORTE_RMCAST_INVALID_CHANNEL == chan->channel) { + chan->channel = channel; } /* all setup - nothing to do */ OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, @@ -826,17 +687,11 @@ static int open_channel(orte_rmcast_channel_t *channel, char *name, /* we didn't find an existing match, so create a new channel */ chan = OBJ_NEW(rmcast_base_channel_t); chan->name = strdup(name); - /* if we were given a channel, then just use it */ - if (ORTE_RMCAST_INVALID_CHANNEL != *channel) { - chan->channel = *channel; - } else { - chan->channel = next_channel++; - *channel = chan->channel; - } + chan->channel = channel; /* add to list of known channels */ - OPAL_THREAD_LOCK(&lock); - opal_list_append(&channels, &chan->item); - OPAL_THREAD_UNLOCK(&lock); + OPAL_THREAD_LOCK(&orte_rmcast_base.lock); + opal_list_append(&orte_rmcast_base.channels, &chan->item); + OPAL_THREAD_UNLOCK(&orte_rmcast_base.lock); OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, "%s rmcast:tcp opening new channel for%s%s", @@ -847,240 +702,38 @@ static int open_channel(orte_rmcast_channel_t *channel, char *name, return ORTE_SUCCESS; } -static int close_channel(orte_rmcast_channel_t channel) -{ - opal_list_item_t *item; - rmcast_base_channel_t *chan; - - OPAL_THREAD_LOCK(&lock); - for (item = opal_list_get_first(&channels); - item != opal_list_get_end(&channels); - item = opal_list_get_next(item)) { - chan = (rmcast_base_channel_t*)item; - - if (channel == chan->channel) { - opal_list_remove_item(&channels, item); - OBJ_RELEASE(chan); - OPAL_THREAD_UNLOCK(&lock); - return ORTE_SUCCESS; - } - } - - OPAL_THREAD_UNLOCK(&lock); - return ORTE_ERR_NOT_FOUND; -} - -static orte_rmcast_channel_t query(void) -{ - return orte_rmcast_base.my_group_number; -} - /**** LOCAL FUNCTIONS ****/ static void process_recv(int fd, short event, void *cbdata) { - orte_message_event_t *mev = (orte_message_event_t*)cbdata; - opal_buffer_t *buf = mev->buffer; - orte_rmcast_channel_t channel; + orte_mcast_msg_event_t *mev = (orte_mcast_msg_event_t*)cbdata; opal_list_item_t *item; - rmcast_base_recv_t *ptr; - orte_process_name_t name; - orte_rmcast_tag_t tag; - int8_t flag; - struct iovec *iovec_array=NULL; - int32_t iovec_count=0, i, sz, n; - opal_buffer_t *recvd_buf=NULL; + orte_odls_child_t *child; int rc; - orte_rmcast_seq_t recvd_seq_num; - /* extract the name of the original sender */ - n=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, 
&name, &n, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - /* if this message is from myself, ignore it */ - if (name.jobid == ORTE_PROC_MY_NAME->jobid && name.vpid == ORTE_PROC_MY_NAME->vpid) { - OPAL_OUTPUT_VERBOSE((10, orte_rmcast_base.rmcast_output, - "%s rmcast:tcp:recv sent from myself: %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&name))); - goto cleanup; - } - - /* extract the "channel" */ - n=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &channel, &n, ORTE_RMCAST_CHANNEL_T))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - /* extract the tag */ - n=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &tag, &n, ORTE_RMCAST_TAG_T))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - /* extract the sequence number */ - n=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &recvd_seq_num, &n, ORTE_RMCAST_SEQ_T))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s rmcast:tcp:recv sender: %s tag: %d seq_num: %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&name), (int)tag, recvd_seq_num)); - - - /* unpack the iovec vs buf flag */ - n=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &flag, &n, OPAL_INT8))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - /* find the recv for this channel, tag, and type */ - for (item = opal_list_get_first(&recvs); - item != opal_list_get_end(&recvs); - item = opal_list_get_next(item)) { - ptr = (rmcast_base_recv_t*)item; - - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s rmcast:tcp:recv checking channel %d tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (int)ptr->channel, (int)ptr->tag)); - - if (channel != ptr->channel) { - continue; - } - - if (tag != ptr->tag && ORTE_RMCAST_TAG_WILDCARD != ptr->tag) { - continue; - } - - if (0 == flag && !ptr->iovecs_requested) { - /* it's an iovec and this recv is for buffers */ - continue; - } - - if (1 == flag && ptr->iovecs_requested) { - /* it's a buffer and this recv is for iovecs */ - continue; - } - - /* we have a recv - unpack the data */ - if (0 == flag) { - /* get the number of iovecs in the buffer */ - n=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &iovec_count, &n, OPAL_INT32))) { + /* if I am a daemon, I need to relay this to my children first */ + if (ORTE_PROC_IS_DAEMON) { + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = opal_list_get_next(item)) { + child = (orte_odls_child_t*)item; + OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, + "%s relaying multicast to %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(child->name))); + if (0 > (rc = orte_rml.send_buffer(child->name, mev->buf, ORTE_RML_TAG_MULTICAST, 0))) { ORTE_ERROR_LOG(rc); goto cleanup; } - /* malloc the required space */ - iovec_array = (struct iovec *)malloc(iovec_count * sizeof(struct iovec)); - /* unpack the iovecs */ - for (i=0; i < iovec_count; i++) { - /* unpack the number of bytes in this iovec */ - n=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &sz, &n, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - iovec_array[i].iov_base = NULL; - iovec_array[i].iov_len = sz; - if (0 < sz) { - /* allocate the space */ - iovec_array[i].iov_base = (uint8_t*)malloc(sz); - /* unpack the data */ - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, iovec_array[i].iov_base, &sz, OPAL_UINT8))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - } - } - } else { - /* buffer was included */ - recvd_buf = OBJ_NEW(opal_buffer_t); - if 
(ORTE_SUCCESS != (rc = opal_dss.copy_payload(recvd_buf, buf))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } } - - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s rmcast:tcp:recv delivering message to channel %d tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ptr->channel, (int)tag)); - - if (0 == flag) { - /* dealing with iovecs */ - if (NULL != ptr->cbfunc_iovec) { - ptr->cbfunc_iovec(ORTE_SUCCESS, ptr->channel, tag, - &name, iovec_array, iovec_count, ptr->cbdata); - /* if it isn't persistent, remove it */ - if (!(ORTE_RMCAST_PERSISTENT & ptr->flags)) { - OPAL_THREAD_LOCK(&lock); - opal_list_remove_item(&recvs, &ptr->item); - OPAL_THREAD_UNLOCK(&lock); - OBJ_RELEASE(ptr); - } - } else { - /* copy over the iovec array since it will be released by - * the blocking recv - */ - ptr->iovec_array = (struct iovec *)malloc(iovec_count * sizeof(struct iovec)); - ptr->iovec_count = iovec_count; - for (i=0; i < iovec_count; i++) { - ptr->iovec_array[i].iov_base = (uint8_t*)malloc(iovec_array[i].iov_len); - ptr->iovec_array[i].iov_len = iovec_array[i].iov_len; - memcpy(ptr->iovec_array[i].iov_base, iovec_array[i].iov_base, iovec_array[i].iov_len); - } - /* flag it as recvd to release blocking recv */ - ptr->recvd = true; - } - } else { - if (NULL != ptr->cbfunc_buffer) { - ptr->cbfunc_buffer(ORTE_SUCCESS, ptr->channel, tag, - &name, recvd_buf, ptr->cbdata); - /* if it isn't persistent, remove it */ - if (!(ORTE_RMCAST_PERSISTENT & ptr->flags)) { - OPAL_THREAD_LOCK(&lock); - opal_list_remove_item(&recvs, &ptr->item); - OPAL_THREAD_UNLOCK(&lock); - OBJ_RELEASE(ptr); - } - } else { - /* copy the buffer across since it will be released - * by the blocking recv - */ - ptr->buf = OBJ_NEW(opal_buffer_t); - if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(ptr->buf, recvd_buf))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - /* flag it as recvd to release blocking recv */ - ptr->recvd = true; - } - } - /* we are done - only one recv can match */ - break; } - + + /* process the receive */ + orte_rmcast_base_process_recv(mev); + cleanup: OBJ_RELEASE(mev); - if (NULL != iovec_array) { - for (i=0; i < iovec_count; i++) { - free(iovec_array[i].iov_base); - } - free(iovec_array); - } - if (NULL != recvd_buf) { - OBJ_RELEASE(recvd_buf); - } return; } @@ -1089,13 +742,16 @@ static void recv_handler(int status, orte_process_name_t* sender, void* cbdata) { int rc; + opal_buffer_t *buf; OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, "%s rmcast:tcp recvd multicast msg", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* clear the way for the next message */ - ORTE_MESSAGE_EVENT(sender, buffer, tag, process_recv); + buf = OBJ_NEW(opal_buffer_t); + opal_dss.copy_payload(buf, buffer); + ORTE_MULTICAST_MESSAGE_EVENT(buf, process_recv); /* reissue the recv */ if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, @@ -1107,3 +763,75 @@ static void recv_handler(int status, orte_process_name_t* sender, } return; } + +static void relay(int fd, short event, void *cbdata) +{ + orte_message_event_t *msg = (orte_message_event_t*)cbdata; + orte_proc_t *proc; + opal_list_item_t *item; + orte_odls_child_t *child; + int rc, v; + + OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, + "%s rmcast:tcp relaying multicast msg from %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&msg->sender))); + + /* if we don't already have it, get the daemon object */ + if (NULL == daemons) { + daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); + } + /* send it to each daemon other than the one that sent 
it to me */ + for (v=1; v < daemons->procs->size; v++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, v))) { + continue; + } + if (proc->name.vpid == msg->sender.vpid) { + continue; + } + if (0 > (rc = orte_rml.send_buffer(&proc->name, msg->buffer, ORTE_RML_TAG_MULTICAST, 0))) { + ORTE_ERROR_LOG(rc); + } + } + + /* send the message to my children */ + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = opal_list_get_next(item)) { + child = (orte_odls_child_t*)item; + if (0 > (rc = orte_rml.send_buffer(child->name, msg->buffer, ORTE_RML_TAG_MULTICAST, 0))) { + ORTE_ERROR_LOG(rc); + } + } + + /* now process it myself */ + ORTE_MULTICAST_MESSAGE_EVENT(msg->buffer, process_recv); + /* protect the buffer */ + msg->buffer = NULL; + OBJ_RELEASE(msg); +} + +static void relay_handler(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) +{ + int rc; + + OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, + "%s rmcast:tcp relay multicast msg from %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(sender))); + + /* clear the way for the next message */ + ORTE_MESSAGE_EVENT(sender, buffer, tag, relay); + + /* reissue the recv */ + if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, + ORTE_RML_TAG_MULTICAST_RELAY, + ORTE_RML_NON_PERSISTENT, + relay_handler, + NULL))) { + ORTE_ERROR_LOG(rc); + } + return; +} diff --git a/orte/mca/rmcast/tcp/rmcast_tcp_component.c b/orte/mca/rmcast/tcp/rmcast_tcp_component.c index 21f4e11689..e6b90974c3 100644 --- a/orte/mca/rmcast/tcp/rmcast_tcp_component.c +++ b/orte/mca/rmcast/tcp/rmcast_tcp_component.c @@ -80,9 +80,9 @@ static int orte_rmcast_tcp_query(mca_base_module_t **module, int *priority) *module = NULL; return ORTE_ERROR; } - + /* selected by choice */ - *priority = 0; + *priority = 50; *module = (mca_base_module_t *) &orte_rmcast_tcp_module; initialized = true; diff --git a/orte/mca/rmcast/udp/rmcast_udp.c b/orte/mca/rmcast/udp/rmcast_udp.c index 19ca270621..8c1533c25c 100644 --- a/orte/mca/rmcast/udp/rmcast_udp.c +++ b/orte/mca/rmcast/udp/rmcast_udp.c @@ -38,11 +38,7 @@ #include "rmcast_udp.h" /* LOCAL DATA */ -static opal_mutex_t lock; -static opal_list_t recvs; -static opal_list_t channels; static bool init_completed = false; -static orte_rmcast_channel_t next_channel; static opal_pointer_array_t msg_log; /* LOCAL FUNCTIONS */ @@ -54,9 +50,6 @@ static int setup_socket(int *sd, rmcast_base_channel_t *chan, bool recvsocket); static void xmit_data(int sd, short flags, void* send_req); -/* LOCAL STRUCTURE VALUES */ -static rmcast_base_channel_t *my_group_channel=NULL; - /* API FUNCTIONS */ static int init(void); @@ -104,16 +97,9 @@ static int udp_recv_nb(orte_rmcast_channel_t channel, orte_rmcast_callback_fn_t cbfunc, void *cbdata); -static void cancel_recv(orte_rmcast_channel_t channel, - orte_rmcast_tag_t tag); - -static int open_channel(orte_rmcast_channel_t *channel, char *name, +static int open_channel(orte_rmcast_channel_t channel, char *name, char *network, int port, char *interface, uint8_t direction); -static int close_channel(orte_rmcast_channel_t channel); - -static orte_rmcast_channel_t query(void); - /* Define the module */ orte_rmcast_module_t orte_rmcast_udp_module = { @@ -127,10 +113,10 @@ orte_rmcast_module_t orte_rmcast_udp_module = { udp_recv_nb, udp_recv_buffer, udp_recv_buffer_nb, - cancel_recv, + orte_rmcast_base_cancel_recv, open_channel, - close_channel, - query 
+ orte_rmcast_base_close_channel, + orte_rmcast_base_query }; /* during init, we setup two channels for both xmit and recv: @@ -155,7 +141,6 @@ orte_rmcast_module_t orte_rmcast_udp_module = { static int init(void) { int rc; - orte_rmcast_channel_t channel; if (init_completed) { return ORTE_SUCCESS; @@ -166,59 +151,52 @@ static int init(void) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* setup the globals */ - OBJ_CONSTRUCT(&lock, opal_mutex_t); - OBJ_CONSTRUCT(&recvs, opal_list_t); - OBJ_CONSTRUCT(&channels, opal_list_t); - next_channel = ORTE_RMCAST_DYNAMIC_CHANNELS; OBJ_CONSTRUCT(&msg_log, opal_pointer_array_t); opal_pointer_array_init(&msg_log, 8, INT_MAX, 8); /* setup the respective public address channel */ if (ORTE_PROC_IS_TOOL) { /* tools only open the sys channel */ - channel = ORTE_RMCAST_SYS_CHANNEL; - if (ORTE_SUCCESS != (rc = open_channel(&channel, "system", + if (ORTE_SUCCESS != (rc = open_channel(ORTE_RMCAST_SYS_CHANNEL, "system", NULL, -1, NULL, ORTE_RMCAST_BIDIR))) { ORTE_ERROR_LOG(rc); return rc; } + orte_rmcast_base.my_group_channel = (rmcast_base_channel_t*)opal_list_get_last(&orte_rmcast_base.channels); } else if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) { /* daemons and hnp open the sys and data server channels */ - channel = ORTE_RMCAST_SYS_CHANNEL; - if (ORTE_SUCCESS != (rc = open_channel(&channel, "system", + if (ORTE_SUCCESS != (rc = open_channel(ORTE_RMCAST_SYS_CHANNEL, "system", NULL, -1, NULL, ORTE_RMCAST_BIDIR))) { ORTE_ERROR_LOG(rc); return rc; } - channel = ORTE_RMCAST_DATA_SERVER_CHANNEL; - if (ORTE_SUCCESS != (rc = open_channel(&channel, "data-server", + if (ORTE_SUCCESS != (rc = open_channel(ORTE_RMCAST_DATA_SERVER_CHANNEL, "data-server", NULL, -1, NULL, ORTE_RMCAST_BIDIR))) { ORTE_ERROR_LOG(rc); return rc; } + orte_rmcast_base.my_group_channel = (rmcast_base_channel_t*)opal_list_get_last(&orte_rmcast_base.channels); } else if (ORTE_PROC_IS_APP) { /* apps open the app public and data server channels */ - channel = ORTE_RMCAST_APP_PUBLIC_CHANNEL; - if (ORTE_SUCCESS != (rc = open_channel(&channel, "app-announce", + if (ORTE_SUCCESS != (rc = open_channel(ORTE_RMCAST_APP_PUBLIC_CHANNEL, "app-announce", NULL, -1, NULL, ORTE_RMCAST_BIDIR))) { ORTE_ERROR_LOG(rc); return rc; } - channel = ORTE_RMCAST_DATA_SERVER_CHANNEL; - if (ORTE_SUCCESS != (rc = open_channel(&channel, "data-server", + if (ORTE_SUCCESS != (rc = open_channel(ORTE_RMCAST_DATA_SERVER_CHANNEL, "data-server", NULL, -1, NULL, ORTE_RMCAST_BIDIR))) { ORTE_ERROR_LOG(rc); return rc; } /* also setup our grp channel, if one was given */ if (NULL != orte_rmcast_base.my_group_name) { - channel = orte_rmcast_base.my_group_number; - if (ORTE_SUCCESS != (rc = open_channel(&channel, orte_rmcast_base.my_group_name, + if (ORTE_SUCCESS != (rc = open_channel(orte_rmcast_base.my_group_number, + orte_rmcast_base.my_group_name, NULL, -1, NULL, ORTE_RMCAST_BIDIR))) { ORTE_ERROR_LOG(rc); return rc; } - my_group_channel = (rmcast_base_channel_t*)opal_list_get_last(&channels); + orte_rmcast_base.my_group_channel = (rmcast_base_channel_t*)opal_list_get_last(&orte_rmcast_base.channels); } } else { opal_output(0, "rmcast:udp:init - unknown process type"); @@ -230,46 +208,30 @@ static int init(void) static void finalize(void) { - opal_list_item_t *item; rmcast_recv_log_t *log; int j; OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, "%s rmcast:udp: finalize called", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* deconstruct the globals */ - OPAL_THREAD_LOCK(&lock); - while (NULL != (item = opal_list_remove_first(&recvs))) { - 
OBJ_RELEASE(item); - } - OBJ_DESTRUCT(&recvs); - while (NULL != (item = opal_list_remove_first(&channels))) { - OBJ_RELEASE(item); - } - OBJ_DESTRUCT(&channels); for (j=0; j < msg_log.size; j++) { if (NULL != (log = opal_pointer_array_get_item(&msg_log, j))) { OBJ_RELEASE(log); } } OBJ_DESTRUCT(&msg_log); - OPAL_THREAD_UNLOCK(&lock); - - OBJ_DESTRUCT(&lock); return; } /* internal blocking send support */ -static bool send_complete, send_buf_complete; - static void internal_snd_cb(int status, orte_rmcast_channel_t channel, orte_rmcast_tag_t tag, orte_process_name_t *sender, struct iovec *msg, int count, void *cbdata) { - send_complete = true; + ((rmcast_base_send_t *)cbdata)->send_complete = true; } static void internal_snd_buf_cb(int status, @@ -278,37 +240,31 @@ static void internal_snd_buf_cb(int status, orte_process_name_t *sender, opal_buffer_t *buf, void *cbdata) { - send_buf_complete = true; + ((rmcast_base_send_t *)cbdata)->send_complete = true; } static int queue_xmit(rmcast_base_send_t *snd, - orte_rmcast_channel_t channel, - orte_rmcast_tag_t tag) + orte_rmcast_channel_t channel) { + rmcast_base_channel_t *ch, *chptr; opal_list_item_t *item; - rmcast_base_channel_t *chptr, *ch; - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s rmcast:udp: queue_xmit to %d:%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - channel, tag)); - /* if we were asked to send this on our group output * channel, substitute it */ if (ORTE_RMCAST_GROUP_CHANNEL == channel) { - if (NULL == my_group_channel) { + if (NULL == orte_rmcast_base.my_group_channel) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } - ch = my_group_channel; + ch = orte_rmcast_base.my_group_channel; goto process; } /* find the channel */ ch = NULL; - for (item = opal_list_get_first(&channels); - item != opal_list_get_end(&channels); + for (item = opal_list_get_first(&orte_rmcast_base.channels); + item != opal_list_get_end(&orte_rmcast_base.channels); item = opal_list_get_next(item)) { chptr = (rmcast_base_channel_t*)item; if (channel == chptr->channel) { @@ -324,7 +280,7 @@ static int queue_xmit(rmcast_base_send_t *snd, process: OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s rmcast:udp: send of %d %s" + "%s rmcast:udp: queue xmit of %d %s" " called on multicast channel %03d.%03d.%03d.%03d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == snd->iovec_array) ? 
(int)snd->buf->bytes_used : (int)snd->iovec_count, @@ -359,15 +315,14 @@ static int udp_send(orte_rmcast_channel_t channel, snd->tag = tag; snd->cbfunc_iovec = internal_snd_cb; snd->cbdata = snd; - send_complete = false; - if (ORTE_SUCCESS != (ret = queue_xmit(snd, channel, tag))) { + if (ORTE_SUCCESS != (ret = queue_xmit(snd, channel))) { ORTE_ERROR_LOG(ret); return ret; } /* now wait for the send to complete */ - ORTE_PROGRESSED_WAIT(send_complete, 0, 1); + ORTE_PROGRESSED_WAIT(snd->send_complete, 0, 1); return ORTE_SUCCESS; } @@ -389,7 +344,7 @@ static int udp_send_nb(orte_rmcast_channel_t channel, snd->cbfunc_iovec = cbfunc; snd->cbdata = cbdata; - if (ORTE_SUCCESS != (ret = queue_xmit(snd, channel, tag))) { + if (ORTE_SUCCESS != (ret = queue_xmit(snd, channel))) { ORTE_ERROR_LOG(ret); return ret; } @@ -410,16 +365,15 @@ static int udp_send_buffer(orte_rmcast_channel_t channel, snd->tag = tag; snd->cbfunc_buffer = internal_snd_buf_cb; snd->cbdata = snd; - send_buf_complete = false; - if (ORTE_SUCCESS != (ret = queue_xmit(snd, channel, tag))) { + if (ORTE_SUCCESS != (ret = queue_xmit(snd, channel))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(snd); return ret; } /* now wait for the send to complete */ - ORTE_PROGRESSED_WAIT(send_buf_complete, 0, 1); + ORTE_PROGRESSED_WAIT(snd->send_complete, 0, 1); return ORTE_SUCCESS; } @@ -440,7 +394,7 @@ static int udp_send_buffer_nb(orte_rmcast_channel_t channel, snd->cbfunc_buffer = cbfunc; snd->cbdata = cbdata; - if (ORTE_SUCCESS != (ret = queue_xmit(snd, channel, tag))) { + if (ORTE_SUCCESS != (ret = queue_xmit(snd, channel))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(snd); return ret; @@ -449,75 +403,6 @@ static int udp_send_buffer_nb(orte_rmcast_channel_t channel, return ORTE_SUCCESS; } -static int queue_recv(rmcast_base_recv_t *recvptr, - orte_rmcast_channel_t channel, - orte_rmcast_tag_t tag, - orte_rmcast_callback_fn_t cbfunc_iovec, - orte_rmcast_callback_buffer_fn_t cbfunc_buffer, - bool blocking) -{ - opal_list_item_t *item; - rmcast_base_channel_t *ch, *chptr; - rmcast_base_recv_t *rptr; - - /* find the channel */ - ch = NULL; - for (item = opal_list_get_first(&channels); - item != opal_list_get_end(&channels); - item = opal_list_get_next(item)) { - chptr = (rmcast_base_channel_t*)item; - if (channel == chptr->channel) { - ch = chptr; - break; - } - } - if (NULL == ch) { - /* didn't find it */ - return ORTE_ERR_NOT_FOUND; - } - - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s rmcast:udp: queue_recv called on multicast channel %03d.%03d.%03d.%03d tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), OPAL_IF_FORMAT_ADDR(ch->network), tag)); - - if (!blocking) { - /* do we already have a recv for this channel/tag/type? 
*/ - OPAL_THREAD_LOCK(&lock); - for (item = opal_list_get_first(&recvs); - item != opal_list_get_end(&recvs); - item = opal_list_get_next(item)) { - rptr = (rmcast_base_recv_t*)item; - if (channel != rptr->channel) { - /* different channel */ - continue; - } - if (tag != rptr->tag) { - /* different tag */ - continue; - } - if ((NULL != cbfunc_iovec && NULL != rptr->cbfunc_iovec) || - (NULL != cbfunc_buffer && NULL != rptr->cbfunc_buffer)) { - /* matching type - recv already in place */ - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s rmcast:udp: matching recv already active on multicast channel %03d.%03d.%03d.%03d tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), OPAL_IF_FORMAT_ADDR(ch->network), tag)); - OPAL_THREAD_UNLOCK(&lock); - return ORTE_EXISTS; - } - } - OPAL_THREAD_UNLOCK(&lock); - } - - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s rmcast:udp: adding non-blocking recv on multicast channel %03d.%03d.%03d.%03d tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), OPAL_IF_FORMAT_ADDR(ch->network), tag)); - OPAL_THREAD_LOCK(&lock); - opal_list_append(&recvs, &recvptr->item); - OPAL_THREAD_UNLOCK(&lock); - - return ORTE_SUCCESS; -} - static int udp_recv(orte_process_name_t *name, orte_rmcast_channel_t channel, orte_rmcast_tag_t tag, @@ -525,19 +410,18 @@ static int udp_recv(orte_process_name_t *name, { rmcast_base_recv_t *recvptr; int ret; - - recvptr = OBJ_NEW(rmcast_base_recv_t); - recvptr->iovecs_requested = true; + orte_rmcast_channel_t chan; + if (ORTE_RMCAST_GROUP_CHANNEL == channel) { - recvptr->channel = orte_rmcast_base.my_group_number; + chan = orte_rmcast_base.my_group_number; } else { - recvptr->channel = channel; + chan = channel; } - recvptr->tag = tag; - if (ORTE_SUCCESS != (ret = queue_recv(recvptr, recvptr->channel, tag, NULL, NULL, true))) { + if (ORTE_SUCCESS != (ret = orte_rmcast_base_queue_recv(&recvptr, chan, tag, + ORTE_RMCAST_NON_PERSISTENT, + NULL, NULL, NULL, true))) { ORTE_ERROR_LOG(ret); - OBJ_RELEASE(recvptr); return ret; } @@ -553,9 +437,9 @@ static int udp_recv(orte_process_name_t *name, *count = recvptr->iovec_count; /* remove the recv */ - OPAL_THREAD_LOCK(&lock); - opal_list_remove_item(&recvs, &recvptr->item); - OPAL_THREAD_UNLOCK(&lock); + OPAL_THREAD_LOCK(&orte_rmcast_base.lock); + opal_list_remove_item(&orte_rmcast_base.recvs, &recvptr->item); + OPAL_THREAD_UNLOCK(&orte_rmcast_base.lock); OBJ_RELEASE(recvptr); return ORTE_SUCCESS; @@ -566,33 +450,22 @@ static int udp_recv_nb(orte_rmcast_channel_t channel, orte_rmcast_flag_t flags, orte_rmcast_callback_fn_t cbfunc, void *cbdata) { - rmcast_base_recv_t *recvptr; + orte_rmcast_channel_t chan; int ret; + OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, "%s rmcast:udp: recv_nb called on channel %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), channel)); - recvptr = OBJ_NEW(rmcast_base_recv_t); - recvptr->iovecs_requested = true; if (ORTE_RMCAST_GROUP_CHANNEL == channel) { - recvptr->channel = orte_rmcast_base.my_group_number; + chan = orte_rmcast_base.my_group_number; } else { - recvptr->channel = channel; + chan = channel; } - recvptr->tag = tag; - recvptr->flags = flags; - recvptr->cbfunc_iovec = cbfunc; - recvptr->cbdata = cbdata; - if (ORTE_SUCCESS != (ret = queue_recv(recvptr, recvptr->channel, tag, cbfunc, NULL, false))) { - if (ORTE_EXISTS == ret) { - /* this recv already exists - just release the copy */ - OBJ_RELEASE(recvptr); - return ORTE_SUCCESS; - } + if (ORTE_SUCCESS != (ret = orte_rmcast_base_queue_recv(NULL, chan, tag, flags, + cbfunc, NULL, cbdata, false))) { 
ORTE_ERROR_LOG(ret); - OBJ_RELEASE(recvptr); - return ret; } return ORTE_SUCCESS; @@ -605,22 +478,23 @@ static int udp_recv_buffer(orte_process_name_t *name, { rmcast_base_recv_t *recvptr; int ret; - + orte_rmcast_channel_t chan; + OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, "%s rmcast:udp: recv_buffer called on multicast channel %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), channel)); - recvptr = OBJ_NEW(rmcast_base_recv_t); if (ORTE_RMCAST_GROUP_CHANNEL == channel) { - recvptr->channel = orte_rmcast_base.my_group_number; + chan = orte_rmcast_base.my_group_number; } else { - recvptr->channel = channel; + chan = channel; } - recvptr->tag = tag; - if (ORTE_SUCCESS != (ret = queue_recv(recvptr, recvptr->channel, tag, NULL, NULL, true))) { + if (ORTE_SUCCESS != (ret = orte_rmcast_base_queue_recv(&recvptr, chan, tag, + ORTE_RMCAST_NON_PERSISTENT, + NULL, NULL, NULL, true))) { ORTE_ERROR_LOG(ret); - goto cleanup; + return ret; } ORTE_PROGRESSED_WAIT(recvptr->recvd, 0, 1); @@ -637,10 +511,9 @@ static int udp_recv_buffer(orte_process_name_t *name, /* release the data */ OBJ_RELEASE(recvptr->buf); -cleanup: - OPAL_THREAD_LOCK(&lock); - opal_list_remove_item(&recvs, &recvptr->item); - OPAL_THREAD_UNLOCK(&lock); + OPAL_THREAD_LOCK(&orte_rmcast_base.lock); + opal_list_remove_item(&orte_rmcast_base.recvs, &recvptr->item); + OPAL_THREAD_UNLOCK(&orte_rmcast_base.lock); OBJ_RELEASE(recvptr); return ret; @@ -651,68 +524,29 @@ static int udp_recv_buffer_nb(orte_rmcast_channel_t channel, orte_rmcast_flag_t flags, orte_rmcast_callback_buffer_fn_t cbfunc, void *cbdata) { - rmcast_base_recv_t *recvptr; + orte_rmcast_channel_t chan; int ret; OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, "%s rmcast:udp: recv_buffer_nb called on multicast channel %d tag %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), channel, tag)); - recvptr = OBJ_NEW(rmcast_base_recv_t); if (ORTE_RMCAST_GROUP_CHANNEL == channel) { - recvptr->channel = orte_rmcast_base.my_group_number; + chan = orte_rmcast_base.my_group_number; } else { - recvptr->channel = channel; + chan = channel; } - recvptr->tag = tag; - recvptr->flags = flags; - recvptr->cbfunc_buffer = cbfunc; - recvptr->cbdata = cbdata; - if (ORTE_SUCCESS != (ret = queue_recv(recvptr, recvptr->channel, tag, NULL, cbfunc, false))) { - if (ORTE_EXISTS == ret) { - /* this recv already exists - just release the copy */ - OBJ_RELEASE(recvptr); - return ORTE_SUCCESS; - } + if (ORTE_SUCCESS != (ret = orte_rmcast_base_queue_recv(NULL, chan, tag, flags, + NULL, cbfunc, cbdata, false))) { ORTE_ERROR_LOG(ret); - OBJ_RELEASE(recvptr); return ret; } return ORTE_SUCCESS; } -static void cancel_recv(orte_rmcast_channel_t channel, - orte_rmcast_tag_t tag) -{ - opal_list_item_t *item, *next; - rmcast_base_recv_t *ptr; - orte_rmcast_channel_t ch; - - if (ORTE_RMCAST_GROUP_CHANNEL == channel) { - ch = orte_rmcast_base.my_group_number; - } else { - ch = channel; - } - - /* find all recv's for this channel and tag */ - item = opal_list_get_first(&recvs); - while (item != opal_list_get_end(&recvs)) { - next = opal_list_get_next(item); - - ptr = (rmcast_base_recv_t*)item; - if (ch == ptr->channel && tag == ptr->tag) { - OPAL_THREAD_LOCK(&lock); - opal_list_remove_item(&recvs, &ptr->item); - OBJ_RELEASE(ptr); - OPAL_THREAD_UNLOCK(&lock); - } - item = next; - } -} - -static int open_channel(orte_rmcast_channel_t *channel, char *name, +static int open_channel(orte_rmcast_channel_t channel, char *name, char *network, int port, char *interface, uint8_t direction) { opal_list_item_t *item; @@ -739,30 
+573,25 @@ static int open_channel(orte_rmcast_channel_t *channel, char *name, /* see if this name has already been assigned a channel on the specified network */ OPAL_OUTPUT_VERBOSE((7, orte_rmcast_base.rmcast_output, "%s open_channel: searching for %s:%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), name, *channel)); + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), name, channel)); chan = NULL; - for (item = opal_list_get_first(&channels); - item != opal_list_get_end(&channels); + for (item = opal_list_get_first(&orte_rmcast_base.channels); + item != opal_list_get_end(&orte_rmcast_base.channels); item = opal_list_get_next(item)) { nchan = (rmcast_base_channel_t*)item; OPAL_OUTPUT_VERBOSE((7, orte_rmcast_base.rmcast_output, "%s open_channel: channel %s:%d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - nchan->name, *channel)); + nchan->name, channel)); - if (nchan->channel == *channel || + if (nchan->channel == channel || 0 == strcasecmp(nchan->name, name)) { /* check the network, if one was specified */ if (0 != netaddr && netaddr != (nchan->network & netmask)) { continue; } - /* check the channel, if one was given */ - if (ORTE_RMCAST_INVALID_CHANNEL != *channel && - nchan->channel != *channel) { - continue; - } chan = nchan; break; } @@ -789,13 +618,7 @@ static int open_channel(orte_rmcast_channel_t *channel, char *name, /* we didn't find an existing match, so create a new channel */ chan = OBJ_NEW(rmcast_base_channel_t); chan->name = strdup(name); - /* if we were given a channel, then just use it */ - if (ORTE_RMCAST_INVALID_CHANNEL != *channel) { - chan->channel = *channel; - } else { - chan->channel = next_channel++; - *channel = chan->channel; - } + chan->channel = channel; /* if we were not given a network, use the default */ if (NULL == network) { chan->network = orte_rmcast_base.xmit_network + chan->channel; @@ -814,9 +637,9 @@ static int open_channel(orte_rmcast_channel_t *channel, char *name, } else { chan->port = port; } - OPAL_THREAD_LOCK(&lock); - opal_list_append(&channels, &chan->item); - OPAL_THREAD_UNLOCK(&lock); + OPAL_THREAD_LOCK(&orte_rmcast_base.lock); + opal_list_append(&orte_rmcast_base.channels, &chan->item); + OPAL_THREAD_UNLOCK(&orte_rmcast_base.lock); OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, "%s rmcast:udp opening new channel %s:%d network %03d.%03d.%03d.%03d port %d for%s%s", @@ -835,305 +658,14 @@ static int open_channel(orte_rmcast_channel_t *channel, char *name, return ORTE_SUCCESS; } -static int close_channel(orte_rmcast_channel_t channel) -{ - opal_list_item_t *item; - rmcast_base_channel_t *chan; - - OPAL_THREAD_LOCK(&lock); - for (item = opal_list_get_first(&channels); - item != opal_list_get_end(&channels); - item = opal_list_get_next(item)) { - chan = (rmcast_base_channel_t*)item; - - if (channel == chan->channel) { - opal_list_remove_item(&channels, item); - OBJ_RELEASE(chan); - OPAL_THREAD_UNLOCK(&lock); - return ORTE_SUCCESS; - } - } - - OPAL_THREAD_UNLOCK(&lock); - return ORTE_ERR_NOT_FOUND; -} - -static orte_rmcast_channel_t query(void) -{ - return orte_rmcast_base.my_group_number; -} - - /**** LOCAL FUNCTIONS ****/ static void process_recv(int fd, short event, void *cbdata) { orte_mcast_msg_event_t *msg = (orte_mcast_msg_event_t*)cbdata; - rmcast_base_channel_t *chan = msg->channel; - opal_list_item_t *item; - rmcast_base_recv_t *ptr; - orte_process_name_t name; - orte_rmcast_tag_t tag; - opal_buffer_t buf; - int8_t flag; - struct iovec *iovec_array=NULL; - int32_t iovec_count=0, i, sz, n; - opal_buffer_t *recvd_buf=NULL; - int rc; - orte_rmcast_seq_t 
recvd_seq_num; - rmcast_recv_log_t *log, *lg; - - /* extract the header */ - ORTE_MULTICAST_MESSAGE_HDR_NTOH(msg->data, &name, tag, recvd_seq_num); - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s rmcast:udp:recv sender: %s tag: %d seq_num: %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&name), (int)tag, recvd_seq_num)); - - /* if this message is from myself, ignore it */ - if (name.jobid == ORTE_PROC_MY_NAME->jobid && name.vpid == ORTE_PROC_MY_NAME->vpid) { - OPAL_OUTPUT_VERBOSE((10, orte_rmcast_base.rmcast_output, - "%s rmcast:udp:recv sent from myself: %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&name))); - goto cleanup; - } - - /* if this message is from a different job family, ignore it unless - * it is on the system channel. We ignore these messages to avoid - * confusion between different jobs since we all may be sharing - * multicast channels. The system channel is left open to support - * cross-job communications via the HNP. - */ - if (ORTE_JOB_FAMILY(name.jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) && - (ORTE_RMCAST_SYS_CHANNEL != chan->channel)) { - OPAL_OUTPUT_VERBOSE((10, orte_rmcast_base.rmcast_output, - "%s rmcast:udp:recv from a different job family: %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&name))); - goto cleanup; - } - - /* construct the buffer for unpacking */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - - /* unload the message */ - ORTE_MULTICAST_UNLOAD_MESSAGE(&buf, msg->data, msg->sz); - - /* unpack the iovec vs buf flag */ - n=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &flag, &n, OPAL_INT8))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - /* find the recv for this channel, tag, and type */ - for (item = opal_list_get_first(&recvs); - item != opal_list_get_end(&recvs); - item = opal_list_get_next(item)) { - ptr = (rmcast_base_recv_t*)item; - - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s rmcast:udp:recv checking channel %d tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (int)ptr->channel, (int)ptr->tag)); - - if (chan->channel != ptr->channel) { - continue; - } - - if (tag != ptr->tag && ORTE_RMCAST_TAG_WILDCARD != ptr->tag) { - continue; - } - - if (0 == flag && !ptr->iovecs_requested) { - /* it's an iovec and this recv is for buffers */ - continue; - } - - if (1 == flag && ptr->iovecs_requested) { - /* it's a buffer and this recv is for iovecs */ - continue; - } - - /* we have a recv - unpack the data */ - if (0 == flag) { - /* get the number of iovecs in the buffer */ - n=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &iovec_count, &n, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - /* malloc the required space */ - iovec_array = (struct iovec *)malloc(iovec_count * sizeof(struct iovec)); - /* unpack the iovecs */ - for (i=0; i < iovec_count; i++) { - /* unpack the number of bytes in this iovec */ - n=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &sz, &n, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - iovec_array[i].iov_base = NULL; - iovec_array[i].iov_len = sz; - if (0 < sz) { - /* allocate the space */ - iovec_array[i].iov_base = (uint8_t*)malloc(sz); - /* unpack the data */ - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, iovec_array[i].iov_base, &sz, OPAL_UINT8))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - } - } - } else { - /* buffer was included */ - recvd_buf = OBJ_NEW(opal_buffer_t); - if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(recvd_buf, &buf))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - } - - /* if the 
sender's vpid is invalid, then this is a request for - * assignment of a name - so don't log the message - */ - if (ORTE_VPID_INVALID == name.vpid) { - goto MATCH; - } - - /* look up the message log for this sender */ - log = NULL; - for (n=0; n < msg_log.size; n++) { - if (NULL == (lg = (rmcast_recv_log_t*)opal_pointer_array_get_item(&msg_log, n))) { - continue; - } - if ((name.jobid == lg->name.jobid && name.vpid == lg->name.vpid) && - chan->channel == lg->channel) { - log = lg; - break; - } - } - if (NULL == log) { - /* new sender - create a log */ - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s rmcast:udp:recv creating new msg log for %s channel %d seq# %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&name), (int)msg->channel->channel, recvd_seq_num)); - log = OBJ_NEW(rmcast_recv_log_t); - log->name.jobid = name.jobid; - log->name.vpid = name.vpid; - log->channel = chan->channel; - log->seq_num = recvd_seq_num; - opal_pointer_array_add(&msg_log, log); - goto MATCH; - } - - if (recvd_seq_num < log->seq_num) { - /* this must be a repeat of an earlier message - ignore it */ - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s rmcast:udp:recv recvd repeat msg %d (log at %d) from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - recvd_seq_num, log->seq_num, ORTE_NAME_PRINT(&name))); - goto cleanup; - } - - if (log->seq_num != (recvd_seq_num-1)) { - /* this message out of sequence - tell - * the sender the last number we got - */ - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s rmcast:udp:recv msg %d is out of sequence (log at %d) from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - recvd_seq_num, log->seq_num, ORTE_NAME_PRINT(&name))); - /* ignore this message */ - goto cleanup; - } - - /* update the seq number */ - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s rmcast:udp:recv update msg log to %d from %s:%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - recvd_seq_num, ORTE_NAME_PRINT(&name), log->channel)); - log->seq_num = recvd_seq_num; - - MATCH: - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s rmcast:udp:recv delivering message to channel %d tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ptr->channel, (int)tag)); - - if (0 == flag) { - /* dealing with iovecs */ - if (NULL != ptr->cbfunc_iovec) { - ptr->cbfunc_iovec(ORTE_SUCCESS, ptr->channel, tag, - &name, iovec_array, iovec_count, ptr->cbdata); - /* if it isn't persistent, remove it */ - if (!(ORTE_RMCAST_PERSISTENT & ptr->flags)) { - OPAL_THREAD_LOCK(&lock); - opal_list_remove_item(&recvs, &ptr->item); - OPAL_THREAD_UNLOCK(&lock); - OBJ_RELEASE(ptr); - } - } else { - /* copy over the iovec array since it will be released by - * the blocking recv - */ - ptr->iovec_array = (struct iovec *)malloc(iovec_count * sizeof(struct iovec)); - ptr->iovec_count = iovec_count; - for (i=0; i < iovec_count; i++) { - ptr->iovec_array[i].iov_base = (uint8_t*)malloc(iovec_array[i].iov_len); - ptr->iovec_array[i].iov_len = iovec_array[i].iov_len; - memcpy(ptr->iovec_array[i].iov_base, iovec_array[i].iov_base, iovec_array[i].iov_len); - } - /* copy the sender's name */ - ptr->name.jobid = name.jobid; - ptr->name.vpid = name.vpid; - /* flag it as recvd to release blocking recv */ - ptr->recvd = true; - } - } else { - if (NULL != ptr->cbfunc_buffer) { - ptr->cbfunc_buffer(ORTE_SUCCESS, ptr->channel, tag, - &name, recvd_buf, ptr->cbdata); - /* if it isn't persistent, remove it */ - if (!(ORTE_RMCAST_PERSISTENT & ptr->flags)) { - OPAL_THREAD_LOCK(&lock); - opal_list_remove_item(&recvs, 
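The sequence-number bookkeeping deleted here (per-sender log, drop repeats, drop gaps) is the part of process_recv() that the patch moves behind orte_rmcast_base_process_recv(). A plain-C restatement of just that acceptance rule, with a stand-in typedef for orte_rmcast_seq_t:

/* Restatement of the removed checks: accept only the next number in
 * sequence; repeats and gaps are ignored.  A first message from a new
 * sender seeds the log and is delivered unconditionally in the original. */
#include <stdbool.h>
#include <stdint.h>

typedef uint32_t seq_t;             /* stand-in for orte_rmcast_seq_t */

/* Returns true if a message with sequence number 'recvd' should be
 * delivered, updating *last on acceptance. */
static bool accept_in_sequence(seq_t *last, seq_t recvd)
{
    if (recvd < *last) {
        return false;               /* repeat of an earlier message */
    }
    if (*last != recvd - 1) {
        return false;               /* gap: out of sequence, ignored */
    }
    *last = recvd;                  /* in order: log it and deliver */
    return true;
}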
&ptr->item); - OPAL_THREAD_UNLOCK(&lock); - OBJ_RELEASE(ptr); - } - } else { - /* copy the buffer across since it will be released - * by the blocking recv - */ - ptr->buf = OBJ_NEW(opal_buffer_t); - if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(ptr->buf, recvd_buf))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - /* copy the sender's name */ - ptr->name.jobid = name.jobid; - ptr->name.vpid = name.vpid; - /* flag it as recvd to release blocking recv */ - ptr->recvd = true; - } - } - /* we are done - only one recv can match */ - break; - } - -cleanup: + orte_rmcast_base_process_recv(msg); OBJ_RELEASE(msg); - if (NULL != iovec_array) { - for (i=0; i < iovec_count; i++) { - free(iovec_array[i].iov_base); - } - free(iovec_array); - } - if (NULL != recvd_buf) { - OBJ_RELEASE(recvd_buf); - } return; } @@ -1142,10 +674,11 @@ static void recv_handler(int sd, short flags, void* cbdata) uint8_t *data; ssize_t sz; rmcast_base_channel_t *chan = (rmcast_base_channel_t*)cbdata; + opal_buffer_t *buf; /* read the data */ - data = (uint8_t*)malloc(mca_rmcast_udp_component.max_msg_size * sizeof(uint8_t)); - sz = read(sd, data, mca_rmcast_udp_component.max_msg_size); + data = (uint8_t*)malloc(ORTE_RMCAST_UDP_MTU * sizeof(uint8_t)); + sz = read(sd, data, ORTE_RMCAST_UDP_MTU); if (sz <= 0) { /* this shouldn't happen - report the errno */ @@ -1160,7 +693,9 @@ static void recv_handler(int sd, short flags, void* cbdata) (int)sz, (int)chan->channel)); /* clear the way for the next message */ - ORTE_MULTICAST_MESSAGE_EVENT(data, sz, chan, process_recv); + buf = OBJ_NEW(opal_buffer_t); + opal_dss.load(buf, data, sz); + ORTE_MULTICAST_MESSAGE_EVENT(buf, process_recv); return; } @@ -1185,12 +720,11 @@ static int setup_channel(rmcast_base_channel_t *chan, uint8_t direction) chan->addr.sin_port = htons(chan->port); OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s setup:channel addr %03d.%03d.%03d.%03d port %d for %s:%s msg-size: %d", + "%s setup:channel addr %03d.%03d.%03d.%03d port %d for %s:%s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), OPAL_IF_FORMAT_ADDR(chan->network), (int)chan->port, (ORTE_RMCAST_RECV & direction) ? " RECV" : " ", - (ORTE_RMCAST_XMIT & direction) ? " XMIT" : " ", - mca_rmcast_udp_component.max_msg_size)); + (ORTE_RMCAST_XMIT & direction) ? 
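With the header macros gone, recv_handler() above now reads at most ORTE_RMCAST_UDP_MTU bytes, wraps them in an opal_buffer_t via opal_dss.load(), and defers all parsing to the event callback. A condensed sketch of that path using the same ORTE calls as the hunk (not standalone; the ownership comment reflects the fact that the hunk does not free the bytes after the load):

/* Sketch of the post-change receive path: one datagram in, one buffered
 * event out. */
static void sketch_recv_handler(int sd, rmcast_base_channel_t *chan)
{
    uint8_t *data = (uint8_t*)malloc(ORTE_RMCAST_UDP_MTU);
    ssize_t sz = read(sd, data, ORTE_RMCAST_UDP_MTU);
    opal_buffer_t *buf;

    if (sz <= 0) {
        free(data);                 /* nothing arrived; real code reports errno */
        return;
    }
    buf = OBJ_NEW(opal_buffer_t);
    /* hand the malloc'd bytes to the buffer; the hunk above does not free
     * 'data' after this call */
    opal_dss.load(buf, data, sz);
    ORTE_MULTICAST_MESSAGE_EVENT(buf, process_recv);
    (void)chan;                     /* the channel no longer rides on the event */
}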
" XMIT" : " ")); if (0 > chan->xmit && ORTE_RMCAST_XMIT & direction) { /* create a xmit socket */ @@ -1199,7 +733,7 @@ static int setup_channel(rmcast_base_channel_t *chan, uint8_t direction) return rc; } chan->xmit = xmitsd; - chan->send_data = (uint8_t*)malloc(mca_rmcast_udp_component.max_msg_size); + chan->send_data = (uint8_t*)malloc(ORTE_RMCAST_UDP_MTU); /* setup the event to xmit messages, but don't activate it */ opal_event_set(&chan->send_ev, chan->xmit, OPAL_EV_WRITE, xmit_data, chan); } @@ -1299,6 +833,17 @@ static int setup_socket(int *sd, rmcast_base_channel_t *chan, bool recvsocket) CLOSE_THE_SOCKET(target_sd); return ORTE_ERROR; } + /* set the recvbuf size */ + flags = 10*ORTE_RMCAST_UDP_MTU; + if ((setsockopt(target_sd, SOL_SOCKET, SO_RCVBUF, &flags, sizeof(flags))) < 0) { + opal_output(0, "%s rmcast:init: setsockopt() failed on SO_RCVBUF\n" + "\tfor multicast network %03d.%03d.%03d.%03d interface %03d.%03d.%03d.%03d\n\tError: %s (%d)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + OPAL_IF_FORMAT_ADDR(chan->network), OPAL_IF_FORMAT_ADDR(chan->interface), + strerror(opal_socket_errno), opal_socket_errno); + CLOSE_THE_SOCKET(target_sd); + return ORTE_ERROR; + } } else { /* on the xmit side, need to set the interface */ memset(&inaddr, 0, sizeof(inaddr)); @@ -1319,6 +864,17 @@ static int setup_socket(int *sd, rmcast_base_channel_t *chan, bool recvsocket) CLOSE_THE_SOCKET(target_sd); return ORTE_ERROR; } + /* set the sendbuf size */ + flags = ORTE_RMCAST_UDP_MTU; + if ((setsockopt(target_sd, SOL_SOCKET, SO_SNDBUF, &flags, sizeof(flags))) < 0) { + opal_output(0, "%s rmcast:init: setsockopt() failed on SO_SNDBUF\n" + "\tfor multicast network %03d.%03d.%03d.%03d interface %03d.%03d.%03d.%03d\n\tError: %s (%d)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + OPAL_IF_FORMAT_ADDR(chan->network), OPAL_IF_FORMAT_ADDR(chan->interface), + strerror(opal_socket_errno), opal_socket_errno); + CLOSE_THE_SOCKET(target_sd); + return ORTE_ERROR; + } } /* set socket up to be non-blocking */ @@ -1349,9 +905,7 @@ static void xmit_data(int sd, short flags, void* send_req) char *bytes; int32_t sz, outbound; int rc; - int8_t flag; - opal_buffer_t buf; - int32_t tmp32; + opal_buffer_t *buf; rmcast_send_log_t *log, *lg; OPAL_THREAD_LOCK(&chan->send_lock); @@ -1362,71 +916,10 @@ static void xmit_data(int sd, short flags, void* send_req) while (NULL != (item = opal_list_remove_first(&chan->pending_sends))) { snd = (rmcast_base_send_t*)item; - /* setup a tmp buffer for a working area */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - - /* start the send data area with our header */ - ORTE_MULTICAST_MESSAGE_HDR_HTON(chan->send_data, snd->tag, chan->seq_num); - - /* are we sending a buffer? 
*/ - if (NULL == snd->buf) { - /* flag the buffer as containing iovecs */ - flag = 0; - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &flag, 1, OPAL_INT8))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s packing %d iovecs", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - snd->iovec_count)); - - /* pack the number of iovecs */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &snd->iovec_count, 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - - /* pack each iovec into a buffer in prep for sending - * so we can recreate the array at the other end - */ - for (sz=0; sz < snd->iovec_count; sz++) { - /* pack the size */ - tmp32 = snd->iovec_array[sz].iov_len; - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s packing %d bytes for iovec %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - tmp32, sz)); - - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &tmp32, 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - if (0 < tmp32) { - /* pack the bytes */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, snd->iovec_array[sz].iov_base, tmp32, OPAL_UINT8))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - } - } - } else { - /* flag it as being a buffer */ - flag = 1; - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &flag, 1, OPAL_INT8))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, - "%s copying payload", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* copy the payload */ - if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(&buf, snd->buf))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } + /* setup the message for xmission */ + if (ORTE_SUCCESS != (rc = orte_rmcast_base_build_msg(chan, &buf, snd))) { + ORTE_ERROR_LOG(rc); + goto CLEANUP; } /* store the working buf in the send ring buffer in case we @@ -1435,7 +928,7 @@ static void xmit_data(int sd, short flags, void* send_req) log = OBJ_NEW(rmcast_send_log_t); log->channel = chan->channel; log->seq_num = chan->seq_num; - opal_dss.copy_payload(log->buf, &buf); + opal_dss.copy_payload(log->buf, buf); if (NULL != (lg = (rmcast_send_log_t*)opal_ring_buffer_push(&chan->cache, log))) { /* release the old message */ OPAL_OUTPUT_VERBOSE((5, orte_rmcast_base.rmcast_output, @@ -1446,49 +939,29 @@ static void xmit_data(int sd, short flags, void* send_req) } /* unload the working buf to obtain the payload */ - if (ORTE_SUCCESS != (rc = opal_dss.unload(&buf, (void**)&bytes, &sz))) { + if (ORTE_SUCCESS != (rc = opal_dss.unload(buf, (void**)&bytes, &sz))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } /* done with the working buf */ - OBJ_DESTRUCT(&buf); + OBJ_RELEASE(buf); - /* add the payload, up to the limit */ - ORTE_MULTICAST_LOAD_MESSAGE(chan->send_data, bytes, sz, - mca_rmcast_udp_component.max_msg_size, - &outbound); - - if (outbound < 0) { - /* message was too large */ - opal_output(0, "%s message to multicast network %03d.%03d.%03d.%03d failed - size %d was too large (limit: %d)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), OPAL_IF_FORMAT_ADDR(chan->network), - -1*outbound, mca_rmcast_udp_component.max_msg_size); - if (1 == flag) { - /* reload into original buffer */ - if (ORTE_SUCCESS != (rc = opal_dss.load(snd->buf, (void*)bytes, sz))) { - ORTE_ERROR_LOG(rc); - } - } - /* cleanup */ - goto CLEANUP; - } - OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output, "%s rmcast:udp multicasting %d bytes to network %03d.%03d.%03d.%03d port %d tag %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), outbound, OPAL_IF_FORMAT_ADDR(chan->network), (int)chan->port, 
(int)snd->tag)); - if (outbound != (rc = sendto(chan->xmit, chan->send_data, outbound, 0, + if (sz != (rc = sendto(chan->xmit, bytes, sz, 0, (struct sockaddr *)&(chan->addr), sizeof(struct sockaddr_in)))) { /* didn't get the message out */ - opal_output(0, "%s failed to send message to multicast network %03d.%03d.%03d.%03d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), OPAL_IF_FORMAT_ADDR(chan->network)); - /* cleanup */ - goto CLEANUP; + opal_output(0, "%s failed to send message to multicast network %03d.%03d.%03d.%03d on\n\terror %s(%d)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), OPAL_IF_FORMAT_ADDR(chan->network), + strerror(errno), errno); + rc = errno; } - if (1 == flag) { + if (NULL != snd->buf) { /* call the cbfunc if required */ if (NULL != snd->cbfunc_buffer) { snd->cbfunc_buffer(rc, chan->channel, snd->tag, diff --git a/orte/mca/rmcast/udp/rmcast_udp.h b/orte/mca/rmcast/udp/rmcast_udp.h index 964750e157..982558b9cd 100644 --- a/orte/mca/rmcast/udp/rmcast_udp.h +++ b/orte/mca/rmcast/udp/rmcast_udp.h @@ -24,14 +24,9 @@ BEGIN_C_DECLS -#define ORTE_RMCAST_UDP_MAX_MSG_SIZE 1500 +#define ORTE_RMCAST_UDP_MTU 65536 -typedef struct { - orte_rmcast_base_component_t super; - int max_msg_size; -} orte_rmcast_udp_component_t; - -ORTE_MODULE_DECLSPEC extern orte_rmcast_udp_component_t mca_rmcast_udp_component; +ORTE_MODULE_DECLSPEC extern orte_rmcast_base_component_t mca_rmcast_udp_component; extern orte_rmcast_module_t orte_rmcast_udp_module; END_C_DECLS diff --git a/orte/mca/rmcast/udp/rmcast_udp_component.c b/orte/mca/rmcast/udp/rmcast_udp_component.c index a14dd48410..50a48b1d5b 100644 --- a/orte/mca/rmcast/udp/rmcast_udp_component.c +++ b/orte/mca/rmcast/udp/rmcast_udp_component.c @@ -41,25 +41,23 @@ static bool initialized = false; const char *mca_rmcast_udp_component_version_string = "Open MPI udp rmcast MCA component version " ORTE_VERSION; -orte_rmcast_udp_component_t mca_rmcast_udp_component = { +orte_rmcast_base_component_t mca_rmcast_udp_component = { { - { - ORTE_RMCAST_BASE_VERSION_1_0_0, - - "udp", /* MCA component name */ - ORTE_MAJOR_VERSION, /* MCA component major version */ - ORTE_MINOR_VERSION, /* MCA component minor version */ - ORTE_RELEASE_VERSION, /* MCA component release version */ - - /* Component open, close, and query functions */ - orte_rmcast_udp_open, - orte_rmcast_udp_close, - orte_rmcast_udp_query - }, - { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - } + ORTE_RMCAST_BASE_VERSION_1_0_0, + + "udp", /* MCA component name */ + ORTE_MAJOR_VERSION, /* MCA component major version */ + ORTE_MINOR_VERSION, /* MCA component minor version */ + ORTE_RELEASE_VERSION, /* MCA component release version */ + + /* Component open, close, and query functions */ + orte_rmcast_udp_open, + orte_rmcast_udp_close, + orte_rmcast_udp_query + }, + { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT } }; @@ -68,14 +66,6 @@ orte_rmcast_udp_component_t mca_rmcast_udp_component = { */ static int orte_rmcast_udp_open(void) { - mca_base_component_t *c = &mca_rmcast_udp_component.super.version; - - mca_base_param_reg_int(c, "max_msg_size", - "Max #bytes in a single msg (must be > 0)", - false, false, - ORTE_RMCAST_UDP_MAX_MSG_SIZE, - &mca_rmcast_udp_component.max_msg_size); - return ORTE_SUCCESS; } diff --git a/orte/test/system/orte_mcast.c b/orte/test/system/orte_mcast.c index 105fd98b97..7c8ad4bdce 100644 --- a/orte/test/system/orte_mcast.c +++ b/orte/test/system/orte_mcast.c @@ -36,7 +36,7 @@ static void cbfunc_iovec(int status, 
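In the xmit_data() hunk ending above, message framing moves into orte_rmcast_base_build_msg() and the raw payload is sent with a short-write check plus errno reporting. Note that the OPAL_OUTPUT_VERBOSE retained as context still passes outbound, which appears to be left unassigned once ORTE_MULTICAST_LOAD_MESSAGE is removed; the byte count after this change lives in sz. A plain-POSIX sketch of the new send check, with hypothetical names:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <netinet/in.h>
#include <sys/socket.h>

/* Send one datagram and report a short or failed send the way the new
 * xmit_data() error path does (errno text plus number, errno handed back). */
static int send_datagram(int sd, const void *bytes, int32_t sz,
                         const struct sockaddr_in *addr)
{
    ssize_t rc = sendto(sd, bytes, (size_t)sz, 0,
                        (const struct sockaddr *)addr, sizeof(*addr));
    if (rc != (ssize_t)sz) {
        fprintf(stderr, "multicast send failed: %s (%d)\n",
                strerror(errno), errno);
        return errno;
    }
    return 0;
}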
orte_process_name_t *sender, struct iovec *msg, int count, void* cbdata); -orte_rmcast_channel_t chan=4; +static int datasize=1024; static void send_data(int fd, short flags, void *arg) { @@ -49,8 +49,8 @@ static void send_data(int fd, short flags, void *arg) bfptr = OBJ_NEW(opal_buffer_t); i32 = -1; - opal_dss.pack(bfptr, &i32, 1, OPAL_INT32); - if (ORTE_SUCCESS != (rc = orte_rmcast.send_buffer_nb(chan, + opal_dss.pack(bfptr, &i32, datasize, OPAL_INT32); + if (ORTE_SUCCESS != (rc = orte_rmcast.send_buffer_nb(ORTE_RMCAST_GROUP_CHANNEL, ORTE_RMCAST_TAG_OUTPUT, bfptr, cbfunc_buf_snt, NULL))) { ORTE_ERROR_LOG(rc); @@ -59,11 +59,11 @@ static void send_data(int fd, short flags, void *arg) } /* create an iovec array */ for (i=0; i < 3; i++) { - iovec_array[i].iov_base = (uint8_t*)malloc(30); - iovec_array[i].iov_len = 30; + iovec_array[i].iov_base = (uint8_t*)malloc(datasize); + iovec_array[i].iov_len = datasize; } /* send it out */ - if (ORTE_SUCCESS != (rc = orte_rmcast.send(chan, + if (ORTE_SUCCESS != (rc = orte_rmcast.send(ORTE_RMCAST_GROUP_CHANNEL, ORTE_RMCAST_TAG_OUTPUT, iovec_array, 3))) { ORTE_ERROR_LOG(rc); @@ -85,48 +85,41 @@ int main(int argc, char* argv[]) struct iovec iovec_array[3]; if (0 > (rc = orte_init(&argc, &argv, ORTE_PROC_NON_MPI))) { - fprintf(stderr, "orte_nodename: couldn't init orte - error code %d\n", rc); + fprintf(stderr, "orte_mcast: couldn't init orte - error code %d\n", rc); return rc; } gethostname(hostname, 512); pid = getpid(); - printf("orte_mcast: Node %s Name %s Pid %ld\n", - hostname, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)pid); + if (1 < argc) { + datasize = strtol(argv[1], NULL, 10); + } + + printf("orte_mcast: Node %s Name %s Pid %ld datasize %d\n", + hostname, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)pid, datasize); if (0 == ORTE_PROC_MY_NAME->vpid) { orte_grpcomm.barrier(); - /* open a new channel */ - if (ORTE_SUCCESS != (rc = orte_rmcast.open_channel(&chan, "orte_mcast", NULL, -1, NULL, ORTE_RMCAST_XMIT))) { - ORTE_ERROR_LOG(rc); - goto blast; - } - - OBJ_CONSTRUCT(&buf, opal_buffer_t); - /* pass the new channel number */ - i32 = chan; - opal_dss.pack(&buf, &i32, 1, OPAL_INT32); - if (ORTE_SUCCESS != (rc = orte_rmcast.send_buffer(ORTE_RMCAST_APP_PUBLIC_CHANNEL, - ORTE_RMCAST_TAG_ANNOUNCE, &buf))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buf); - goto blast; - } - OBJ_DESTRUCT(&buf); - /* wake up every 5 seconds and send something */ ORTE_TIMER_EVENT(5, 0, send_data); } else { - if (ORTE_SUCCESS != (rc = orte_rmcast.recv_buffer_nb(ORTE_RMCAST_APP_PUBLIC_CHANNEL, - ORTE_RMCAST_TAG_WILDCARD, + /* setup to recv data on our channel */ + if (ORTE_SUCCESS != (rc = orte_rmcast.recv_buffer_nb(ORTE_RMCAST_GROUP_CHANNEL, + ORTE_RMCAST_TAG_OUTPUT, ORTE_RMCAST_PERSISTENT, cbfunc, NULL))) { ORTE_ERROR_LOG(rc); } + if (ORTE_SUCCESS != (rc = orte_rmcast.recv_nb(ORTE_RMCAST_GROUP_CHANNEL, + ORTE_RMCAST_TAG_OUTPUT, + ORTE_RMCAST_PERSISTENT, + cbfunc_iovec, NULL))) { + ORTE_ERROR_LOG(rc); + } orte_grpcomm.barrier(); /* ensure the public recv is ready */ } opal_event_dispatch(); @@ -152,29 +145,6 @@ static void cbfunc(int status, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(sender), channel, tag, i32); - if (i32 < 0) { - return; - } - - /* open a new channel */ - chan = i32; - if (ORTE_SUCCESS != (rc = orte_rmcast.open_channel(&chan, "orte_mcast", NULL, -1, NULL, ORTE_RMCAST_RECV))) { - ORTE_ERROR_LOG(rc); - } - - /* setup to recv data on it */ - if (ORTE_SUCCESS != (rc = orte_rmcast.recv_buffer_nb(chan, - ORTE_RMCAST_TAG_OUTPUT, - ORTE_RMCAST_PERSISTENT, - 
cbfunc, NULL))) { - ORTE_ERROR_LOG(rc); - } - if (ORTE_SUCCESS != (rc = orte_rmcast.recv_nb(chan, - ORTE_RMCAST_TAG_OUTPUT, - ORTE_RMCAST_PERSISTENT, - cbfunc_iovec, NULL))) { - ORTE_ERROR_LOG(rc); - } } static void cbfunc_iovec(int status,
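In the revised test above, rank 0 multicasts on ORTE_RMCAST_GROUP_CHANNEL every five seconds while the other ranks register persistent buffer and iovec receives before the barrier. One caution: opal_dss.pack(bfptr, &i32, datasize, OPAL_INT32) packs datasize int32 values starting at the address of a single variable, which reads past i32 whenever datasize is greater than one (assuming the usual count semantics of the third argument). If the intent is simply a payload of datasize integers, a heap array avoids that; a hedged sketch of such a send_data() variant, using only identifiers already present in the hunk plus a hypothetical payload array:

/* Variant of send_data() that packs a real datasize-element array instead of
 * datasize copies read from a single int32's address.  Same callback and
 * channel as the test above; ORTE-internal, not standalone. */
static void send_data_example(int fd, short flags, void *arg)
{
    opal_buffer_t *bfptr = OBJ_NEW(opal_buffer_t);
    int32_t *payload = (int32_t*)malloc(datasize * sizeof(int32_t));
    int rc, i;

    for (i = 0; i < datasize; i++) {
        payload[i] = -1;
    }
    opal_dss.pack(bfptr, payload, datasize, OPAL_INT32);
    free(payload);                   /* pack copies the values into the buffer */

    if (ORTE_SUCCESS != (rc = orte_rmcast.send_buffer_nb(ORTE_RMCAST_GROUP_CHANNEL,
                                                         ORTE_RMCAST_TAG_OUTPUT,
                                                         bfptr, cbfunc_buf_snt, NULL))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(bfptr);
    }
}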