From c1bbbb5e2f54476f80a42a6390c9532de3e5a66e Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Sun, 13 Sep 2015 12:59:26 -0700 Subject: [PATCH] Remove the last involvement of the OOB system from the MPI layer, remove the no-longer-needed usock/oob component, and have procs no longer open the RML, OOB, ROUTED, and GRPCOMM frameworks as PMIx now provides all required app-mpirun cmds --- contrib/platform/intel/bend/linux.conf | 18 - contrib/platform/intel/bend/mac-orcm.conf | 13 - contrib/platform/intel/bend/mac.conf | 14 - ompi/communicator/comm_cid.c | 161 ++-- ompi/communicator/communicator.h | 8 +- ompi/dpm/dpm.c | 201 +---- ompi/mca/rte/orte/rte_orte.h | 23 - ompi/mca/rte/rte.h | 33 - opal/mca/pmix/base/base.h | 4 +- opal/mca/pmix/base/pmix_base_fns.c | 125 +++ opal/mca/pmix/pmix.h | 21 +- opal/mca/pmix/pmix1xx/pmix_pmix1.c | 1 + opal/mca/pmix/pmix_types.h | 3 +- orte/mca/ess/base/ess_base_std_app.c | 87 +- orte/mca/ess/pmi/ess_pmi_module.c | 13 - orte/mca/oob/usock/Makefile.am | 56 -- orte/mca/oob/usock/configure.m4 | 42 - orte/mca/oob/usock/help-oob-usock.txt | 70 -- orte/mca/oob/usock/oob_usock.c | 473 ----------- orte/mca/oob/usock/oob_usock.h | 97 --- orte/mca/oob/usock/oob_usock_component.c | 593 -------------- orte/mca/oob/usock/oob_usock_component.h | 64 -- orte/mca/oob/usock/oob_usock_connection.c | 940 ---------------------- orte/mca/oob/usock/oob_usock_connection.h | 102 --- orte/mca/oob/usock/oob_usock_hdr.h | 59 -- orte/mca/oob/usock/oob_usock_peer.h | 85 -- orte/mca/oob/usock/oob_usock_ping.h | 52 -- orte/mca/oob/usock/oob_usock_sendrecv.c | 631 --------------- orte/mca/oob/usock/oob_usock_sendrecv.h | 255 ------ orte/mca/oob/usock/owner.txt | 7 - orte/mca/plm/base/Makefile.am | 8 +- orte/mca/plm/base/base.h | 2 +- orte/mca/plm/base/plm_base_proxy.c | 319 -------- orte/orted/pmix/pmix_server_pub.c | 12 +- orte/runtime/orte_data_server.c | 43 +- 35 files changed, 326 insertions(+), 4309 deletions(-) delete mode 100644 orte/mca/oob/usock/Makefile.am 
delete mode 100644 orte/mca/oob/usock/configure.m4 delete mode 100644 orte/mca/oob/usock/help-oob-usock.txt delete mode 100644 orte/mca/oob/usock/oob_usock.c delete mode 100644 orte/mca/oob/usock/oob_usock.h delete mode 100644 orte/mca/oob/usock/oob_usock_component.c delete mode 100644 orte/mca/oob/usock/oob_usock_component.h delete mode 100644 orte/mca/oob/usock/oob_usock_connection.c delete mode 100644 orte/mca/oob/usock/oob_usock_connection.h delete mode 100644 orte/mca/oob/usock/oob_usock_hdr.h delete mode 100644 orte/mca/oob/usock/oob_usock_peer.h delete mode 100644 orte/mca/oob/usock/oob_usock_ping.h delete mode 100644 orte/mca/oob/usock/oob_usock_sendrecv.c delete mode 100644 orte/mca/oob/usock/oob_usock_sendrecv.h delete mode 100644 orte/mca/oob/usock/owner.txt delete mode 100644 orte/mca/plm/base/plm_base_proxy.c diff --git a/contrib/platform/intel/bend/linux.conf b/contrib/platform/intel/bend/linux.conf index 63889a7093..b8b8194e5e 100644 --- a/contrib/platform/intel/bend/linux.conf +++ b/contrib/platform/intel/bend/linux.conf @@ -58,26 +58,8 @@ # parameters available and their default values. 
# -#default hostfile -#orte_default_hostfile = /home/common/hosts -#ras_slurm_enable_dyn_alloc = 1 -#ras_slurm_config_file = /home/common/slurm/conf/slurm.conf - # Basic behavior to smooth startup mca_base_component_show_load_errors = 1 -mpi_param_check = 0 orte_abort_timeout = 10 hwloc_base_mem_bind_failure_action = silent -## Protect the shared file systems - -## Add the interface for out-of-band communication -## and set it up -oob_tcp_peer_retries = 120 -#oob_tcp_connect_timeout=600 - -## Define the MPI interconnects -btl = sm,tcp,self - -## Setup shared memory -btl_sm_free_list_max = 768 diff --git a/contrib/platform/intel/bend/mac-orcm.conf b/contrib/platform/intel/bend/mac-orcm.conf index 90ad4d5cb2..b8b8194e5e 100644 --- a/contrib/platform/intel/bend/mac-orcm.conf +++ b/contrib/platform/intel/bend/mac-orcm.conf @@ -60,19 +60,6 @@ # Basic behavior to smooth startup mca_base_component_show_load_errors = 1 -mpi_param_check = 0 orte_abort_timeout = 10 hwloc_base_mem_bind_failure_action = silent -## Protect the shared file systems - -## Add the interface for out-of-band communication -## and set it up -oob_tcp_peer_retries = 120 -#oob_tcp_connect_timeout=600 - -## Define the MPI interconnects -btl = sm,tcp,self - -## Setup shared memory -btl_sm_free_list_max = 768 diff --git a/contrib/platform/intel/bend/mac.conf b/contrib/platform/intel/bend/mac.conf index 90ad4d5cb2..bec396b832 100644 --- a/contrib/platform/intel/bend/mac.conf +++ b/contrib/platform/intel/bend/mac.conf @@ -60,19 +60,5 @@ # Basic behavior to smooth startup mca_base_component_show_load_errors = 1 -mpi_param_check = 0 orte_abort_timeout = 10 hwloc_base_mem_bind_failure_action = silent - -## Protect the shared file systems - -## Add the interface for out-of-band communication -## and set it up -oob_tcp_peer_retries = 120 -#oob_tcp_connect_timeout=600 - -## Define the MPI interconnects -btl = sm,tcp,self - -## Setup shared memory -btl_sm_free_list_max = 768 diff --git a/ompi/communicator/comm_cid.c 
b/ompi/communicator/comm_cid.c index e9c08d0c38..f0c332b5dc 100644 --- a/ompi/communicator/comm_cid.c +++ b/ompi/communicator/comm_cid.c @@ -17,7 +17,7 @@ * Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2012 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013-2014 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -30,6 +30,7 @@ #include "ompi_config.h" #include "opal/dss/dss.h" +#include "opal/mca/pmix/pmix.h" #include "ompi/proc/proc.h" #include "ompi/communicator/communicator.h" @@ -58,7 +59,7 @@ typedef int ompi_comm_cid_allredfct (int *inbuf, int* outbuf, ompi_communicator_t *comm, ompi_communicator_t *bridgecomm, void* lleader, void* rleader, - int send_first ); + int send_first, char *tag, int iter ); static int ompi_comm_allreduce_intra (int *inbuf, int* outbuf, int count, struct ompi_op_t *op, @@ -66,7 +67,7 @@ static int ompi_comm_allreduce_intra (int *inbuf, int* outbuf, ompi_communicator_t *bridgecomm, void* local_leader, void* remote_ledaer, - int send_first ); + int send_first, char *tag, int iter ); static int ompi_comm_allreduce_inter (int *inbuf, int *outbuf, int count, struct ompi_op_t *op, @@ -74,7 +75,7 @@ static int ompi_comm_allreduce_inter (int *inbuf, int *outbuf, ompi_communicator_t *bridgecomm, void* local_leader, void* remote_leader, - int send_first ); + int send_first, char *tag, int iter ); static int ompi_comm_allreduce_intra_bridge(int *inbuf, int* outbuf, int count, struct ompi_op_t *op, @@ -82,15 +83,15 @@ static int ompi_comm_allreduce_intra_bridge(int *inbuf, int* outbuf, ompi_communicator_t *bridgecomm, void* local_leader, void* remote_leader, - int send_first); + int send_first, char *tag, int iter); -static int ompi_comm_allreduce_intra_oob (int *inbuf, int* outbuf, - int count, struct 
ompi_op_t *op, - ompi_communicator_t *intercomm, - ompi_communicator_t *bridgecomm, - void* local_leader, - void* remote_leader, - int send_first ); +static int ompi_comm_allreduce_intra_pmix (int *inbuf, int* outbuf, + int count, struct ompi_op_t *op, + ompi_communicator_t *intercomm, + ompi_communicator_t *bridgecomm, + void* local_leader, + void* remote_leader, + int send_first, char *tag, int iter ); static int ompi_comm_allreduce_group (int *inbuf, int* outbuf, int count, struct ompi_op_t *op, @@ -98,7 +99,7 @@ static int ompi_comm_allreduce_group (int *inbuf, int* outbuf, ompi_communicator_t *bridgecomm, void* local_leader, void* remote_leader, - int send_first); + int send_first, char *tag, int iter); /* non-blocking intracommunicator allreduce */ static int ompi_comm_allreduce_intra_nb (int *inbuf, int *outbuf, @@ -158,7 +159,7 @@ int ompi_comm_nextcid ( ompi_communicator_t* newcomm, int response, glresponse=0; int start; unsigned int i; - + int iter=0; ompi_comm_cid_allredfct* allredfnct; /** @@ -177,8 +178,8 @@ int ompi_comm_nextcid ( ompi_communicator_t* newcomm, case OMPI_COMM_CID_INTRA_BRIDGE: allredfnct=(ompi_comm_cid_allredfct*)ompi_comm_allreduce_intra_bridge; break; - case OMPI_COMM_CID_INTRA_OOB: - allredfnct=(ompi_comm_cid_allredfct*)ompi_comm_allreduce_intra_oob; + case OMPI_COMM_CID_INTRA_PMIX: + allredfnct=(ompi_comm_cid_allredfct*)ompi_comm_allreduce_intra_pmix; break; case OMPI_COMM_CID_GROUP: allredfnct=(ompi_comm_cid_allredfct*)ompi_comm_allreduce_group; @@ -218,7 +219,8 @@ int ompi_comm_nextcid ( ompi_communicator_t* newcomm, } ret = (allredfnct)(&nextlocal_cid, &nextcid, 1, MPI_MAX, comm, bridgecomm, - local_leader, remote_leader, send_first ); + local_leader, remote_leader, send_first, "nextcid", iter ); + ++iter; if( OMPI_SUCCESS != ret ) { opal_pointer_array_set_item(&ompi_mpi_communicators, nextlocal_cid, NULL); goto release_and_return; @@ -251,7 +253,8 @@ int ompi_comm_nextcid ( ompi_communicator_t* newcomm, } ret = 
(allredfnct)(&response, &glresponse, 1, MPI_MIN, comm, bridgecomm, - local_leader, remote_leader, send_first ); + local_leader, remote_leader, send_first, "nextcid", iter ); + ++iter; if( OMPI_SUCCESS != ret ) { opal_pointer_array_set_item(&ompi_mpi_communicators, nextcid, NULL); goto release_and_return; @@ -614,8 +617,8 @@ int ompi_comm_activate ( ompi_communicator_t** newcomm, case OMPI_COMM_CID_INTRA_BRIDGE: allredfnct=(ompi_comm_cid_allredfct*)ompi_comm_allreduce_intra_bridge; break; - case OMPI_COMM_CID_INTRA_OOB: - allredfnct=(ompi_comm_cid_allredfct*)ompi_comm_allreduce_intra_oob; + case OMPI_COMM_CID_INTRA_PMIX: + allredfnct=(ompi_comm_cid_allredfct*)ompi_comm_allreduce_intra_pmix; break; case OMPI_COMM_CID_GROUP: allredfnct=(ompi_comm_cid_allredfct*)ompi_comm_allreduce_group; @@ -636,7 +639,7 @@ int ompi_comm_activate ( ompi_communicator_t** newcomm, ret = (allredfnct)(&ok, &gok, 1, MPI_MIN, comm, bridgecomm, - local_leader, remote_leader, send_first ); + local_leader, remote_leader, send_first, "activate", 0 ); if( OMPI_SUCCESS != ret ) { goto bail_on_error; } @@ -870,7 +873,7 @@ static int ompi_comm_allreduce_intra ( int *inbuf, int *outbuf, ompi_communicator_t *bridgecomm, void* local_leader, void* remote_leader, - int send_first ) + int send_first, char *tag, int iter ) { return comm->c_coll.coll_allreduce ( inbuf, outbuf, count, MPI_INT, op, comm, comm->c_coll.coll_allreduce_module ); @@ -899,7 +902,7 @@ static int ompi_comm_allreduce_inter ( int *inbuf, int *outbuf, ompi_communicator_t *bridgecomm, void* local_leader, void* remote_leader, - int send_first ) + int send_first, char *tag, int iter ) { int local_rank, rsize; int rc; @@ -1204,7 +1207,7 @@ static int ompi_comm_allreduce_intra_bridge (int *inbuf, int *outbuf, ompi_communicator_t *comm, ompi_communicator_t *bcomm, void* lleader, void* rleader, - int send_first ) + int send_first, char *tag, int iter ) { int *tmpbuf=NULL; int local_rank; @@ -1291,46 +1294,30 @@ static int 
ompi_comm_allreduce_intra_bridge (int *inbuf, int *outbuf, return (rc); } -typedef struct { - opal_buffer_t buf; - bool active; -} comm_cid_return_t; - -static void comm_cid_recv(int status, - ompi_process_name_t* peer, - opal_buffer_t* buffer, - ompi_rml_tag_t tag, - void* cbdata) -{ - comm_cid_return_t *rcid = (comm_cid_return_t*)cbdata; - - opal_dss.copy_payload(&rcid->buf, buffer); - rcid->active = false; -} - /* Arguments not used in this implementation: * - bridgecomm * * lleader is the local rank of root in comm - * rleader is the OOB contact information of the - * root processes in the other world. + * rleader is the port_string */ -static int ompi_comm_allreduce_intra_oob (int *inbuf, int *outbuf, - int count, struct ompi_op_t *op, - ompi_communicator_t *comm, - ompi_communicator_t *bridgecomm, - void* lleader, void* rleader, - int send_first ) +static int ompi_comm_allreduce_intra_pmix (int *inbuf, int *outbuf, + int count, struct ompi_op_t *op, + ompi_communicator_t *comm, + ompi_communicator_t *bridgecomm, + void* lleader, void* rleader, + int send_first, char *tag, int iter ) { int *tmpbuf=NULL; int rc; int local_leader, local_rank; - ompi_process_name_t *remote_leader=NULL; + char *port_string; + opal_value_t info; + opal_pmix_pdata_t pdat; + opal_buffer_t sbuf; int32_t size_count; - comm_cid_return_t rcid; local_leader = (*((int*)lleader)); - remote_leader = (ompi_process_name_t*)rleader; + port_string = (char*)rleader; size_count = count; local_rank = ompi_comm_rank ( comm ); @@ -1348,50 +1335,48 @@ static int ompi_comm_allreduce_intra_oob (int *inbuf, int *outbuf, } if (local_rank == local_leader ) { - opal_buffer_t *sbuf; + OBJ_CONSTRUCT(&sbuf, opal_buffer_t); - sbuf = OBJ_NEW(opal_buffer_t); - - if (OPAL_SUCCESS != (rc = opal_dss.pack(sbuf, tmpbuf, (int32_t)count, OPAL_INT))) { + if (OPAL_SUCCESS != (rc = opal_dss.pack(&sbuf, tmpbuf, (int32_t)count, OPAL_INT))) { goto exit; } + OBJ_CONSTRUCT(&info, opal_value_t); + OBJ_CONSTRUCT(&pdat, 
opal_pmix_pdata_t); - if ( send_first ) { - if (0 > (rc = ompi_rte_send_buffer_nb(remote_leader, sbuf, - OMPI_RML_TAG_COMM_CID_INTRA, - ompi_rte_send_cbfunc, NULL))) { - goto exit; - } - OBJ_CONSTRUCT(&rcid.buf, opal_buffer_t); - rcid.active = true; - ompi_rte_recv_buffer_nb(remote_leader, OMPI_RML_TAG_COMM_CID_INTRA, - OMPI_RML_NON_PERSISTENT, comm_cid_recv, &rcid); - while (rcid.active) { - opal_progress(); - } - } - else { - OBJ_CONSTRUCT(&rcid.buf, opal_buffer_t); - rcid.active = true; - ompi_rte_recv_buffer_nb(remote_leader, OMPI_RML_TAG_COMM_CID_INTRA, - OMPI_RML_NON_PERSISTENT, comm_cid_recv, &rcid); - while (rcid.active) { - opal_progress(); - } - if (0 > (rc = ompi_rte_send_buffer_nb(remote_leader, sbuf, - OMPI_RML_TAG_COMM_CID_INTRA, - ompi_rte_send_cbfunc, NULL))) { - goto exit; - } + info.type = OPAL_BYTE_OBJECT; + pdat.value.type = OPAL_BYTE_OBJECT; + + opal_dss.unload(&sbuf, (void**)&info.data.bo.bytes, &info.data.bo.size); + OBJ_DESTRUCT(&sbuf); + + if (send_first) { + (void)asprintf(&info.key, "%s:%s:send:%d", port_string, tag, iter); + (void)asprintf(&pdat.value.key, "%s:%s:recv:%d", port_string, tag, iter); + } else { + (void)asprintf(&info.key, "%s:%s:recv:%d", port_string, tag, iter); + (void)asprintf(&pdat.value.key, "%s:%s:send:%d", port_string, tag, iter); } - if (OPAL_SUCCESS != (rc = opal_dss.unpack(&rcid.buf, outbuf, &size_count, OPAL_INT))) { + OPAL_PMIX_EXCHANGE(rc, &info, &pdat, 60); + OBJ_DESTRUCT(&info); + + if (OPAL_SUCCESS != rc) { + OBJ_DESTRUCT(&pdat); goto exit; } - OBJ_DESTRUCT(&rcid.buf); + OBJ_CONSTRUCT(&sbuf, opal_buffer_t); + opal_dss.load(&sbuf, pdat.value.data.bo.bytes, pdat.value.data.bo.size); + pdat.value.data.bo.bytes = NULL; + pdat.value.data.bo.size = 0; + OBJ_DESTRUCT(&pdat); + + if (OPAL_SUCCESS != (rc = opal_dss.unpack(&sbuf, outbuf, &size_count, OPAL_INT))) { + OBJ_DESTRUCT(&sbuf); + goto exit; + } + OBJ_DESTRUCT(&sbuf); count = (int)size_count; - - ompi_op_reduce (op, tmpbuf, outbuf, count, MPI_INT); + 
ompi_op_reduce (op, tmpbuf, outbuf, count, MPI_INT); } rc = comm->c_coll.coll_bcast (outbuf, count, MPI_INT, @@ -1412,7 +1397,7 @@ static int ompi_comm_allreduce_group (int *inbuf, int* outbuf, ompi_communicator_t *newcomm, void* local_leader, void* remote_leader, - int send_first) + int send_first, char *intag, int iter) { ompi_group_t *group = newcomm->c_local_group; int peers_group[3], peers_comm[3]; diff --git a/ompi/communicator/communicator.h b/ompi/communicator/communicator.h index 95c721f2e1..57ef9977fc 100644 --- a/ompi/communicator/communicator.h +++ b/ompi/communicator/communicator.h @@ -17,7 +17,7 @@ * Copyright (c) 2011-2013 Universite Bordeaux 1 * Copyright (c) 2012-2013 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2014 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -93,7 +93,7 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_communicator_t); #define OMPI_COMM_CID_INTRA 0x00000020 #define OMPI_COMM_CID_INTER 0x00000040 #define OMPI_COMM_CID_INTRA_BRIDGE 0x00000080 -#define OMPI_COMM_CID_INTRA_OOB 0x00000100 +#define OMPI_COMM_CID_INTRA_PMIX 0x00000100 #define OMPI_COMM_CID_GROUP 0x00000200 /** @@ -497,8 +497,8 @@ ompi_communicator_t* ompi_comm_allocate (int local_group_size, * a bridge comm. local_leader * and remote leader are in this * case an int (rank in bridge-comm). - * OMPI_COMM_CID_INTRA_OOB: 2 intracomms, leaders talk - * through OOB. lleader and rleader + * OMPI_COMM_CID_INTRA_PMIX: 2 intracomms, leaders talk + * through PMIx. lleader and rleader * are the required contact information. * @param send_first: to avoid a potential deadlock for * the OOB version. 
diff --git a/ompi/dpm/dpm.c b/ompi/dpm/dpm.c index 764981a58c..f8de1baf87 100644 --- a/ompi/dpm/dpm.c +++ b/ompi/dpm/dpm.c @@ -72,27 +72,6 @@ static OBJ_CLASS_INSTANCE(ompi_dpm_proct_caddy_t, opal_list_item_t, NULL, NULL); -struct lookup_caddy_t { - volatile bool active; - int status; - opal_pmix_pdata_t *pdat; -}; - -static void lookup_cbfunc(int status, opal_list_t *data, void *cbdata) -{ - struct lookup_caddy_t *cd = (struct lookup_caddy_t*)cbdata; - cd->status = status; - if (OPAL_SUCCESS == status && NULL != data) { - opal_pmix_pdata_t *p = (opal_pmix_pdata_t*)opal_list_get_first(data); - if (NULL != p && OPAL_STRING == p->value.type && - NULL != p->value.data.string) { - cd->pdat->value.data.string = strdup(p->value.data.string); - } - } - cd->active = false; -} - - /* * Init the module */ @@ -113,12 +92,12 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, ompi_communicator_t **newcomm) { int k, size, rsize, rank, rc, rportlen=0; - char **members = NULL, *nstring, *rport=NULL, **pkeys=NULL; + char **members = NULL, *nstring, *rport=NULL; bool dense, isnew; opal_process_name_t pname; opal_list_t ilist, mlist, rlist; - opal_value_t *info; - opal_pmix_pdata_t *pdat; + opal_value_t info; + opal_pmix_pdata_t pdat; opal_namelist_t *nm; opal_jobid_t jobid; @@ -126,7 +105,7 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, ompi_proc_t *proc; ompi_group_t *group=comm->c_local_group; ompi_proc_t **proc_list=NULL, **new_proc_list = NULL; - int32_t i,j; + int32_t i; ompi_group_t *new_group_pointer; ompi_dpm_proct_caddy_t *cd; @@ -200,106 +179,35 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, } if (rank == root) { - /* the root for each side publishes their list of participants */ - OBJ_CONSTRUCT(&ilist, opal_list_t); - /* put my name at the front of the list of members - my - * name will therefore be on the list twice, but the - * other side's root needs to know the root from this side */ - rc = 
opal_convert_process_name_to_string(&nstring, OMPI_PROC_MY_NAME); - if (OPAL_SUCCESS != rc) { - return OMPI_ERROR; - } - opal_argv_prepend_nosize(&members, nstring); - free(nstring); - info = OBJ_NEW(opal_value_t); + /* the roots for each side exchange their list of participants */ + OBJ_CONSTRUCT(&info, opal_value_t); + OBJ_CONSTRUCT(&pdat, opal_pmix_pdata_t); if (send_first) { - (void)asprintf(&info->key, "%s:connect", port_string); + (void)asprintf(&info.key, "%s:connect", port_string); + (void)asprintf(&pdat.value.key, "%s:accept", port_string); } else { - (void)asprintf(&info->key, "%s:accept", port_string); + (void)asprintf(&info.key, "%s:accept", port_string); + (void)asprintf(&pdat.value.key, "%s:connect", port_string); } - info->type = OPAL_STRING; - info->data.string = opal_argv_join(members, ':'); - opal_list_append(&ilist, &info->super); - /* also save the key for later */ - opal_argv_append_nosize(&pkeys, info->key); - /* publish them with "session" scope */ - rc = opal_pmix.publish(&ilist); - OPAL_LIST_DESTRUCT(&ilist); + info.type = OPAL_STRING; + info.data.string = opal_argv_join(members, ':'); + pdat.value.type = OPAL_STRING; + + OPAL_PMIX_EXCHANGE(rc, &info, &pdat, 60); + OBJ_DESTRUCT(&info); if (OPAL_SUCCESS != rc) { - opal_argv_free(members); - opal_argv_free(pkeys); - return OMPI_ERROR; - } - /* lookup the other side's info - if a non-blocking form - * of lookup isn't available, then we use the blocking - * form and trust that the underlying system will WAIT - * until the other side publishes its data */ - OBJ_CONSTRUCT(&ilist, opal_list_t); - pdat = OBJ_NEW(opal_pmix_pdata_t); - if (send_first) { - (void)asprintf(&pdat->value.key, "%s:accept", port_string); - } else { - (void)asprintf(&pdat->value.key, "%s:connect", port_string); - } - opal_list_append(&ilist, &pdat->super); - OBJ_CONSTRUCT(&mlist, opal_list_t); - /* if a non-blocking version of lookup isn't - * available, then use the blocking version */ - if (NULL == opal_pmix.lookup_nb) { - 
rc = opal_pmix.lookup(&ilist, &mlist); - OPAL_LIST_DESTRUCT(&mlist); - if (OPAL_SUCCESS != rc) { - OMPI_ERROR_LOG(rc); - OPAL_LIST_DESTRUCT(&ilist); - opal_argv_free(members); - goto exit; - } - } else { - char **keys = NULL; - struct lookup_caddy_t caddy; - opal_argv_append_nosize(&keys, pdat->value.key); - caddy.active = true; - caddy.pdat = pdat; - /* tell it to wait for the data to arrive */ - info = OBJ_NEW(opal_value_t); - info->key = strdup(OPAL_PMIX_WAIT); - info->type = OPAL_BOOL; - info->data.flag = true; - opal_list_append(&mlist, &info->super); - /* give it a decent timeout as we don't know when - * the other side may call connect - it doesn't - * have to be simultaneous */ - info = OBJ_NEW(opal_value_t); - info->key = strdup(OPAL_PMIX_TIMEOUT); - info->type = OPAL_INT; - info->data.integer = 60; - opal_list_append(&mlist, &info->super); - rc = opal_pmix.lookup_nb(keys, &mlist, lookup_cbfunc, &caddy); - if (OPAL_SUCCESS != rc) { - OPAL_LIST_DESTRUCT(&ilist); - OPAL_LIST_DESTRUCT(&mlist); - opal_argv_free(keys); - opal_argv_free(members); - goto exit; - } - OMPI_WAIT_FOR_COMPLETION(caddy.active); - opal_argv_free(keys); - OPAL_LIST_DESTRUCT(&mlist); - if (OPAL_SUCCESS != caddy.status) { - OMPI_ERROR_LOG(caddy.status); - OPAL_LIST_DESTRUCT(&ilist); - opal_argv_free(members); - goto exit; - } + OBJ_DESTRUCT(&pdat); + return rc; } + /* save the result */ - rport = strdup(pdat->value.data.string); // need this later + rport = strdup(pdat.value.data.string); // need this later rportlen = strlen(rport) + 1; // retain the NULL terminator - OPAL_LIST_DESTRUCT(&ilist); + OBJ_DESTRUCT(&pdat); } /* if we aren't in a comm_spawn, the non-root members won't have - * a port_string - so let's make sure everyone knows the other + * the port_string - so let's make sure everyone knows the other * side's participants */ /* bcast the list-length to all processes in the local comm */ @@ -327,15 +235,9 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, } /* 
initiate a list of participants for the connect, - * starting with our own members, remembering to - * skip the first member if we are the root rank */ - if (rank == root) { - j = 1; - } else { - j = 0; - } + * starting with our own members */ OBJ_CONSTRUCT(&mlist, opal_list_t); - for (i=j; NULL != members[i]; i++) { + for (i=0; NULL != members[i]; i++) { nm = OBJ_NEW(opal_namelist_t); if (OPAL_SUCCESS != (rc = opal_convert_string_to_process_name(&nm->name, members[i]))) { OMPI_ERROR_LOG(rc); @@ -373,33 +275,18 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, opal_argv_free(members); members = NULL; - /* the pdat object will contain a colon-delimited list + /* rport contains a colon-delimited list * of process names for the remote procs - convert it * into an argv array */ members = opal_argv_split(rport, ':'); free(rport); - /* the first entry is the root for the remote side */ - if (OPAL_SUCCESS != (rc = opal_convert_string_to_process_name(&pname, members[0]))) { - OMPI_ERROR_LOG(rc); - opal_argv_free(members); - goto exit; - } - /* check the name - it should never be a wildcard, so - * this is just checking for an error */ - if (OPAL_VPID_WILDCARD == pname.vpid) { - OMPI_ERROR_LOG(OMPI_ERR_BAD_PARAM); - opal_argv_free(members); - rc = OMPI_ERR_BAD_PARAM; - goto exit; - } - /* add the list of remote procs to our list, and * keep a list of them for later */ OBJ_CONSTRUCT(&ilist, opal_list_t); OBJ_CONSTRUCT(&rlist, opal_list_t); - for (i=1; NULL != members[i]; i++) { + for (i=0; NULL != members[i]; i++) { nm = OBJ_NEW(opal_namelist_t); if (OPAL_SUCCESS != (rc = opal_convert_string_to_process_name(&nm->name, members[i]))) { OMPI_ERROR_LOG(rc); @@ -452,7 +339,7 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, opal_list_append(&ilist, &cd->super); } /* either way, add to the remote list */ - cd = OBJ_NEW(ompi_dpm_proct_caddy_t); + cd = OBJ_NEW(ompi_dpm_proct_caddy_t); cd->p = proc; opal_list_append(&rlist, &cd->super); } @@ 
-550,25 +437,25 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, new_group_pointer = MPI_GROUP_NULL; /* allocate comm_cid */ - rc = ompi_comm_nextcid ( newcomp, /* new communicator */ - comm, /* old communicator */ - NULL, /* bridge comm */ - &root, /* local leader */ - &pname, /* remote leader */ - OMPI_COMM_CID_INTRA_OOB, /* mode */ - send_first ); /* send or recv first */ + rc = ompi_comm_nextcid ( newcomp, /* new communicator */ + comm, /* old communicator */ + NULL, /* bridge comm */ + &root, /* local leader */ + (void*)port_string, /* rendezvous point */ + OMPI_COMM_CID_INTRA_PMIX, /* mode */ + send_first ); /* send or recv first */ if (OMPI_SUCCESS != rc) { goto exit; } /* activate comm and init coll-component */ - rc = ompi_comm_activate ( &newcomp, /* new communicator */ - comm, /* old communicator */ - NULL, /* bridge comm */ - &root, /* local leader */ - &pname, /* remote leader */ - OMPI_COMM_CID_INTRA_OOB, /* mode */ - send_first ); /* send or recv first */ + rc = ompi_comm_activate ( &newcomp, /* new communicator */ + comm, /* old communicator */ + NULL, /* bridge comm */ + &root, /* local leader */ + (void*)port_string, /* rendezvous point */ + OMPI_COMM_CID_INTRA_PMIX, /* mode */ + send_first ); /* send or recv first */ if (OMPI_SUCCESS != rc) { goto exit; } @@ -579,10 +466,6 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, */ exit: - if (NULL != pkeys) { - opal_pmix.unpublish(pkeys, NULL); - opal_argv_free(pkeys); - } if (OMPI_SUCCESS != rc) { if (MPI_COMM_NULL != newcomp && NULL != newcomp) { OBJ_RETAIN(newcomp); diff --git a/ompi/mca/rte/orte/rte_orte.h b/ompi/mca/rte/orte/rte_orte.h index 7796204bf6..a5796276ba 100644 --- a/ompi/mca/rte/orte/rte_orte.h +++ b/ompi/mca/rte/orte/rte_orte.h @@ -96,27 +96,6 @@ typedef orte_error_t ompi_rte_error_report_t; #define ompi_rte_finalize() orte_finalize() OMPI_DECLSPEC void ompi_rte_wait_for_debugger(void); -#define OMPI_DB_HOSTNAME ORTE_DB_HOSTNAME -#define 
OMPI_DB_LOCALITY ORTE_DB_LOCALITY -#define OMPI_DB_GLOBAL_RANK ORTE_DB_GLOBAL_RANK - -/* Communications */ -typedef orte_rml_tag_t ompi_rml_tag_t; -#define ompi_rte_send_buffer_nb(a, b, c, d, e) orte_rml.send_buffer_nb(a, b, c, d, e) -#define ompi_rte_recv_buffer_nb(a, b, c, d, e) orte_rml.recv_buffer_nb(a, b, c, d, e) -#define ompi_rte_recv_cancel(a, b) orte_rml.recv_cancel(a, b) -#define ompi_rte_parse_uris(a, b, c) orte_rml_base_parse_uris(a, b, c) -#define ompi_rte_send_cbfunc orte_rml_send_callback - -/* Communication tags */ -/* carry over the INVALID def */ -#define OMPI_RML_TAG_INVALID ORTE_RML_TAG_INVALID -/* define a starting point to avoid conflicts */ -#define OMPI_RML_TAG_BASE ORTE_RML_TAG_MAX - -#define OMPI_RML_PERSISTENT ORTE_RML_PERSISTENT -#define OMPI_RML_NON_PERSISTENT ORTE_RML_NON_PERSISTENT - typedef struct { ompi_rte_component_t super; opal_mutex_t lock; @@ -138,8 +117,6 @@ static inline orte_process_name_t * OMPI_CAST_RTE_NAME(opal_process_name_t * nam } #endif -#define ompi_direct_modex_cutoff orte_direct_modex_cutoff - END_C_DECLS #endif /* MCA_OMPI_RTE_ORTE_H */ diff --git a/ompi/mca/rte/rte.h b/ompi/mca/rte/rte.h index ded5a27ea3..bc14cbc476 100644 --- a/ompi/mca/rte/rte.h +++ b/ompi/mca/rte/rte.h @@ -196,39 +196,6 @@ END_C_DECLS BEGIN_C_DECLS -/* Each RTE is required to define a DB key for identifying the node - * upon which a process resides, and for providing this information - * for each process - * - * #define OMPI_RTE_NODE_ID - */ - -/* Communication tags */ -#define OMPI_RML_TAG_UDAPL OMPI_RML_TAG_BASE+1 -#define OMPI_RML_TAG_OPENIB OMPI_RML_TAG_BASE+2 -#define OMPI_RML_TAG_XOPENIB OMPI_RML_TAG_BASE+3 -#define OMPI_RML_TAG_COMM_CID_INTRA OMPI_RML_TAG_BASE+4 -#define OMPI_RML_TAG_XOOB OMPI_RML_TAG_BASE+5 -#define OMPI_RML_TAG_SM_BACK_FILE_CREATED OMPI_RML_TAG_BASE+6 -#define OMPI_CRCP_COORD_BOOKMARK_TAG OMPI_RML_TAG_BASE+7 -#define OMPI_COMM_JOIN_TAG OMPI_RML_TAG_BASE+8 - -/* support for shared memory collectives */ -#define 
OMPI_RML_TAG_COLL_SM2_BACK_FILE_CREATED OMPI_RML_TAG_BASE+9 -/* common sm component query result index */ -#define OMPI_RML_TAG_COMMON_SM_COMP_INDEX OMPI_RML_TAG_BASE+10 - -/* OFACM RML TAGs */ -#define OMPI_RML_TAG_OFACM OMPI_RML_TAG_BASE+11 -#define OMPI_RML_TAG_XOFACM OMPI_RML_TAG_BASE+12 - -#define OMPI_RML_PCONNECT_TAG OMPI_RML_TAG_BASE+13 - -#define OMPI_RML_TAG_USNIC_CONNECTIVITY OMPI_RML_TAG_BASE+14 -#define OMPI_RML_TAG_USNIC_CONNECTIVITY_REPLY OMPI_RML_TAG_BASE+15 - -#define OMPI_RML_TAG_DYNAMIC OMPI_RML_TAG_BASE+200 - /* * MCA Framework */ diff --git a/opal/mca/pmix/base/base.h b/opal/mca/pmix/base/base.h index 90441a7960..6aaf308c4a 100644 --- a/opal/mca/pmix/base/base.h +++ b/opal/mca/pmix/base/base.h @@ -37,7 +37,9 @@ OPAL_DECLSPEC void opal_pmix_base_deregister_handler(void); OPAL_DECLSPEC void opal_pmix_base_errhandler(int status, opal_list_t *procs, opal_list_t *info); - +OPAL_DECLSPEC int opal_pmix_base_exchange(opal_value_t *info, + opal_pmix_pdata_t *pdat, + int timeout); END_C_DECLS #endif diff --git a/opal/mca/pmix/base/pmix_base_fns.c b/opal/mca/pmix/base/pmix_base_fns.c index ad35455205..6344b8940d 100644 --- a/opal/mca/pmix/base/pmix_base_fns.c +++ b/opal/mca/pmix/base/pmix_base_fns.c @@ -35,6 +35,7 @@ #define OPAL_PMI_PAD 10 +/******** ERRHANDLER SUPPORT ********/ static opal_pmix_errhandler_fn_t errhandler = NULL; void opal_pmix_base_register_handler(opal_pmix_errhandler_fn_t err) @@ -56,6 +57,130 @@ void opal_pmix_base_deregister_handler(void) errhandler = NULL; } +struct lookup_caddy_t { + volatile bool active; + int status; + opal_pmix_pdata_t *pdat; +}; + +/******** DATA EXCHANGE ********/ +static void lookup_cbfunc(int status, opal_list_t *data, void *cbdata) +{ + struct lookup_caddy_t *cd = (struct lookup_caddy_t*)cbdata; + cd->status = status; + if (OPAL_SUCCESS == status && NULL != data) { + opal_pmix_pdata_t *p = (opal_pmix_pdata_t*)opal_list_get_first(data); + if (NULL != p && p->value.type == cd->pdat->value.type) { + 
(void)opal_value_xfer(&cd->pdat->value, &p->value); + } + cd->pdat->proc = p->proc; + } + cd->active = false; +} + +int opal_pmix_base_exchange(opal_value_t *indat, + opal_pmix_pdata_t *outdat, + int timeout) +{ + int rc; + opal_list_t ilist, mlist; + opal_value_t *info; + opal_pmix_pdata_t *pdat; + struct lookup_caddy_t caddy; + char **keys; + + /* protect the incoming value */ + opal_dss.copy((void**)&info, indat, OPAL_VALUE); + OBJ_CONSTRUCT(&ilist, opal_list_t); + opal_list_append(&ilist, &info->super); + /* tell the server to delete upon read */ + info = OBJ_NEW(opal_value_t); + info->key = strdup(OPAL_PMIX_PERSISTENCE); + info->type = OPAL_INT; + info->data.integer = OPAL_PMIX_PERSIST_FIRST_READ; + opal_list_append(&ilist, &info->super); + + /* publish it with "session" scope */ + rc = opal_pmix.publish(&ilist); + OPAL_LIST_DESTRUCT(&ilist); + if (OPAL_SUCCESS != rc) { + OPAL_ERROR_LOG(rc); + return rc; + } + + /* lookup the other side's info - if a non-blocking form + * of lookup isn't available, then we use the blocking + * form and trust that the underlying system will WAIT + * until the other side publishes its data */ + OBJ_CONSTRUCT(&ilist, opal_list_t); + pdat = OBJ_NEW(opal_pmix_pdata_t); + pdat->value.key = strdup(outdat->value.key); + pdat->value.type = outdat->value.type; + opal_list_append(&ilist, &pdat->super); + /* setup the constraints */ + OBJ_CONSTRUCT(&mlist, opal_list_t); + /* tell it to wait for the data to arrive */ + info = OBJ_NEW(opal_value_t); + info->key = strdup(OPAL_PMIX_WAIT); + info->type = OPAL_BOOL; + info->data.flag = true; + opal_list_append(&mlist, &info->super); + if (0 < timeout) { + /* give it a decent timeout as we don't know when + * the other side will publish - it doesn't + * have to be simultaneous */ + info = OBJ_NEW(opal_value_t); + info->key = strdup(OPAL_PMIX_TIMEOUT); + info->type = OPAL_INT; + info->data.integer = timeout; + opal_list_append(&mlist, &info->super); + } + + /* if a non-blocking version of lookup 
isn't + * available, then use the blocking version */ + if (NULL == opal_pmix.lookup_nb) { + rc = opal_pmix.lookup(&ilist, &mlist); + OPAL_LIST_DESTRUCT(&mlist); + if (OPAL_SUCCESS != rc) { + OPAL_ERROR_LOG(rc); + OPAL_LIST_DESTRUCT(&ilist); + return rc; + } + } else { + caddy.active = true; + caddy.pdat = pdat; + keys = NULL; + opal_argv_append_nosize(&keys, pdat->value.key); + rc = opal_pmix.lookup_nb(keys, &mlist, lookup_cbfunc, &caddy); + if (OPAL_SUCCESS != rc) { + OPAL_ERROR_LOG(rc); + OPAL_LIST_DESTRUCT(&ilist); + OPAL_LIST_DESTRUCT(&mlist); + opal_argv_free(keys); + return rc; + } + while (caddy.active) { + usleep(10); + } + opal_argv_free(keys); + OPAL_LIST_DESTRUCT(&mlist); + if (OPAL_SUCCESS != caddy.status) { + OPAL_ERROR_LOG(caddy.status); + OPAL_LIST_DESTRUCT(&ilist); + return caddy.status; + } + } + + /* pass back the result */ + outdat->proc = pdat->proc; + rc = opal_value_xfer(&outdat->value, &pdat->value); + OPAL_LIST_DESTRUCT(&ilist); + return rc; +} + + +/******** DATA CONSOLIDATION ********/ + static char* setup_key(const opal_process_name_t* name, const char *key, int pmix_keylen_max); static char *pmi_encode(const void *val, size_t vallen); static uint8_t *pmi_decode (const char *data, size_t *retlen); diff --git a/opal/mca/pmix/pmix.h b/opal/mca/pmix/pmix.h index 3b1847e445..5ba4bb0078 100644 --- a/opal/mca/pmix/pmix.h +++ b/opal/mca/pmix/pmix.h @@ -36,6 +36,9 @@ BEGIN_C_DECLS /* provide access to the framework verbose output without * exposing the entire base */ extern int opal_pmix_verbose_output; +extern int opal_pmix_base_exchange(opal_value_t *info, + opal_pmix_pdata_t *pdat, + int timeout); /** * Provide a simplified macro for sending data via modex @@ -249,12 +252,24 @@ extern int opal_pmix_verbose_output; opal_pmix.fence((p), (s)); \ } while(0); +/** + * Provide a macro for accessing a base function that exchanges + * data values between two procs using the PMIx Publish/Lookup + * APIs */ + #define OPAL_PMIX_EXCHANGE(r, i, p, t) \ + 
do { \ + OPAL_OUTPUT_VERBOSE((1, opal_pmix_verbose_output, \ + "%s[%s:%d] EXCHANGE %s WITH %s", \ + OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), \ + __FILE__, __LINE__, \ + (i)->key, (p)->value.key)); \ + (r) = opal_pmix_base_exchange((i), (p), (t)); \ + } while(0); + + /* callback handler for errors */ typedef void (*opal_pmix_errhandler_fn_t)(int error); -/* NOTE: calls to these APIs must be thread-protected as there - * currently is NO internal thread safety. */ - /************************************************************ * CLIENT APIs * diff --git a/opal/mca/pmix/pmix1xx/pmix_pmix1.c b/opal/mca/pmix/pmix1xx/pmix_pmix1.c index 84c777400d..316566a983 100644 --- a/opal/mca/pmix/pmix1xx/pmix_pmix1.c +++ b/opal/mca/pmix/pmix1xx/pmix_pmix1.c @@ -262,6 +262,7 @@ void pmix1_value_load(pmix_value_t *v, switch(kv->type) { case OPAL_UNDEF: v->type = PMIX_UNDEF; + opal_output(0, "TYPE WAS UNDEF"); break; case OPAL_BOOL: v->type = PMIX_BOOL; diff --git a/opal/mca/pmix/pmix_types.h b/opal/mca/pmix/pmix_types.h index ca185c1e6e..c02732ed05 100644 --- a/opal/mca/pmix/pmix_types.h +++ b/opal/mca/pmix/pmix_types.h @@ -105,7 +105,7 @@ BEGIN_C_DECLS #define OPAL_PMIX_NOTIFY_COMPLETION "pmix.notecomp" // (bool) notify parent process upon termination of child job #define OPAL_PMIX_RANGE "pmix.range" // (int) opal_pmix_data_range_t value for calls to publish/lookup/unpublish #define OPAL_PMIX_PERSISTENCE "pmix.persist" // (int) opal_pmix_persistence_t value for calls to publish -#define OPAL_PMIX_OPTIONAL "pmix.optional" // (bool) look only in the immediate data store for the requested value - do +#define OPAL_PMIX_OPTIONAL "pmix.optional" // (bool) look only in the immediate data store for the requested value - do // not request data from the server if not found /* attribute used by host server to pass data to the server convenience library - the @@ -171,6 +171,7 @@ typedef enum { * consistent order with the PMIx distro */ typedef enum { OPAL_PMIX_PERSIST_INDEF = 0, // retain until 
specifically deleted + OPAL_PMIX_PERSIST_FIRST_READ, // delete upon first access OPAL_PMIX_PERSIST_PROC, // retain until publishing process terminates OPAL_PMIX_PERSIST_APP, // retain until application terminates OPAL_PMIX_PERSIST_SESSION // retain until session/allocation terminates diff --git a/orte/mca/ess/base/ess_base_std_app.c b/orte/mca/ess/base/ess_base_std_app.c index 2a87067804..9b78e7b66b 100644 --- a/orte/mca/ess/base/ess_base_std_app.c +++ b/orte/mca/ess/base/ess_base_std_app.c @@ -48,14 +48,9 @@ #include "opal/runtime/opal_cr.h" #include "opal/runtime/opal_progress_threads.h" -#include "orte/mca/rml/base/base.h" -#include "orte/mca/routed/base/base.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/dfs/base/base.h" #include "orte/mca/grpcomm/base/base.h" -#include "orte/mca/oob/base/base.h" -#include "orte/mca/rml/rml.h" -#include "orte/mca/qos/base/base.h" #include "orte/mca/odls/odls_types.h" #include "orte/mca/filem/base/base.h" #include "orte/mca/errmgr/base/base.h" @@ -181,84 +176,14 @@ int orte_ess_base_app_setup(bool db_restrict_local) } OBJ_DESTRUCT(&kv); } - /* Setup the communication infrastructure */ - /* - * OOB Layer - */ - if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) { - ORTE_ERROR_LOG(ret); - error = "orte_oob_base_open"; - goto error; - } - if (ORTE_SUCCESS != (ret = orte_oob_base_select())) { - ORTE_ERROR_LOG(ret); - error = "orte_oob_base_select"; - goto error; - } - /* Runtime Messaging Layer */ - if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) { - ORTE_ERROR_LOG(ret); - error = "orte_rml_base_open"; - goto error; - } - if (ORTE_SUCCESS != (ret = orte_rml_base_select())) { - ORTE_ERROR_LOG(ret); - error = "orte_rml_base_select"; - goto error; - } - /* Messaging QoS Layer */ - if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_qos_base_framework, 0))) { - ORTE_ERROR_LOG(ret); - error = "orte_qos_base_open"; - goto error; - } - if (ORTE_SUCCESS != (ret 
= orte_qos_base_select())) { - ORTE_ERROR_LOG(ret); - error = "orte_qos_base_select"; - goto error; - } + /* setup the errmgr */ if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_errmgr_base_select"; goto error; } - /* Routed system */ - if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) { - ORTE_ERROR_LOG(ret); - error = "orte_routed_base_open"; - goto error; - } - if (ORTE_SUCCESS != (ret = orte_routed_base_select())) { - ORTE_ERROR_LOG(ret); - error = "orte_routed_base_select"; - goto error; - } - /* - * Group communications - */ - if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) { - ORTE_ERROR_LOG(ret); - error = "orte_grpcomm_base_open"; - goto error; - } - if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) { - ORTE_ERROR_LOG(ret); - error = "orte_grpcomm_base_select"; - goto error; - } - /* enable communication via the rml */ - if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) { - ORTE_ERROR_LOG(ret); - error = "orte_rml.enable_comm"; - goto error; - } - /* setup the routed info */ - if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) { - ORTE_ERROR_LOG(ret); - error = "orte_routed.init_routes"; - goto error; - } + #if OPAL_ENABLE_FT_CR == 1 /* * Setup the SnapC @@ -335,13 +260,7 @@ int orte_ess_base_app_finalize(void) (void) mca_base_framework_close(&orte_filem_base_framework); (void) mca_base_framework_close(&orte_errmgr_base_framework); - /* now can close the rml and its friendly group comm */ - (void) mca_base_framework_close(&orte_grpcomm_base_framework); (void) mca_base_framework_close(&orte_dfs_base_framework); - (void) mca_base_framework_close(&orte_routed_base_framework); - - (void) mca_base_framework_close(&orte_rml_base_framework); - (void) mca_base_framework_close(&orte_oob_base_framework); (void) mca_base_framework_close(&orte_state_base_framework); orte_session_dir_finalize(ORTE_PROC_MY_NAME); 
@@ -396,7 +315,7 @@ void orte_ess_base_app_abort(int status, bool report) * the message if routing is enabled as this indicates we * have someone to send to */ - if (report && orte_routing_is_enabled && orte_create_session_dirs) { + if (report && orte_create_session_dirs) { myfile = opal_os_path(false, orte_process_info.proc_session_dir, "aborted", NULL); fd = open(myfile, O_CREAT, S_IRUSR); close(fd); diff --git a/orte/mca/ess/pmi/ess_pmi_module.c b/orte/mca/ess/pmi/ess_pmi_module.c index c85881b7cc..7f6bfff6bf 100644 --- a/orte/mca/ess/pmi/ess_pmi_module.c +++ b/orte/mca/ess/pmi/ess_pmi_module.c @@ -47,8 +47,6 @@ #include "opal/mca/pmix/base/base.h" #include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/grpcomm/grpcomm.h" -#include "orte/mca/rml/rml.h" #include "orte/util/proc_info.h" #include "orte/util/show_help.h" #include "orte/util/name_fns.h" @@ -85,7 +83,6 @@ static int rte_init(void) char *envar, *ev1, *ev2; uint64_t unique_key[2]; char *string_key; - char *rmluri; opal_value_t *kv; char *val; int u32, *u32ptr; @@ -355,16 +352,6 @@ static int rte_init(void) /*** PUSH DATA FOR OTHERS TO FIND ***/ - /* push our RML URI in case others need to talk directly to us */ - rmluri = orte_rml.get_contact_info(); - /* push it out for others to use */ - OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_PROC_URI, rmluri, OPAL_STRING); - if (ORTE_SUCCESS != ret) { - error = "pmix put uri"; - goto error; - } - free(rmluri); - /* push our hostname so others can find us, if they need to */ OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_HOSTNAME, orte_process_info.nodename, OPAL_STRING); if (ORTE_SUCCESS != ret) { diff --git a/orte/mca/oob/usock/Makefile.am b/orte/mca/oob/usock/Makefile.am deleted file mode 100644 index b44934e8b6..0000000000 --- a/orte/mca/oob/usock/Makefile.am +++ /dev/null @@ -1,56 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. 
-# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2012-2013 Los Alamos National Security, LLC. -# All rights reserved -# Copyright (c) 2013-2015 Intel, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -sources = \ - oob_usock_component.h \ - oob_usock.h \ - oob_usock_component.c \ - oob_usock_connection.h \ - oob_usock_sendrecv.h \ - oob_usock_hdr.h \ - oob_usock_peer.h \ - oob_usock_ping.h \ - oob_usock.c \ - oob_usock_connection.c \ - oob_usock_sendrecv.c - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). - -if MCA_BUILD_orte_oob_usock_DSO -component_noinst = -component_install = mca_oob_usock.la -else -component_noinst = libmca_oob_usock.la -component_install = -endif - -mcacomponentdir = $(ortelibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_oob_usock_la_SOURCES = $(sources) -mca_oob_usock_la_LDFLAGS = -module -avoid-version - -noinst_LTLIBRARIES = $(component_noinst) -libmca_oob_usock_la_SOURCES = $(sources) -libmca_oob_usock_la_LDFLAGS = -module -avoid-version - diff --git a/orte/mca/oob/usock/configure.m4 b/orte/mca/oob/usock/configure.m4 deleted file mode 100644 index c9a1b59f50..0000000000 --- a/orte/mca/oob/usock/configure.m4 +++ /dev/null @@ -1,42 +0,0 @@ -# -*- shell-script -*- -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. 
-# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2011-2013 Los Alamos National Security, LLC. -# All rights reserved. -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2013 Intel, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# MCA_oob_usock_CONFIG([action-if-found], [action-if-not-found]) -# ----------------------------------------------------------- -AC_DEFUN([MCA_orte_oob_usock_CONFIG],[ - AC_CONFIG_FILES([orte/mca/oob/usock/Makefile]) - - # check for sockaddr_un (a good sign we have Unix domain sockets) - AC_CHECK_TYPES([struct sockaddr_un], - [oob_usock_happy="yes"], - [oob_usock_happy="no"], - [AC_INCLUDES_DEFAULT -#ifdef HAVE_SYS_SOCKET_H -#include -#endif -#ifdef HAVE_SYS_UN_H -#include -#endif]) - - AS_IF([test "$oob_usock_happy" = "yes"], [$1], [$2]) -])dnl diff --git a/orte/mca/oob/usock/help-oob-usock.txt b/orte/mca/oob/usock/help-oob-usock.txt deleted file mode 100644 index 6eb8ac0542..0000000000 --- a/orte/mca/oob/usock/help-oob-usock.txt +++ /dev/null @@ -1,70 +0,0 @@ -# -*- text -*- -# -# Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2006 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. 
-# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -[static-and-dynamic] -Both static and dynamic port ranges were specified for the -out-of-band (OOB) communication subsystem: - -Static ports: %s -Dynamic ports: %s - -Only one can be specified. Please choose either static or -dynamic ports and try again. -# -[include-exclude] -Both TCP interface include and exclude lists were specified: - - Include: %s - Exclude: %s - -Only one of these can be given. -# -[not-parseable] -The specified network is not parseable. Since we cannot determine -your desired intent, we cannot establish a TCP socket for out-of-band -communications and will therefore abort. Please correct the network -specification and retry. -# -[no-included-found] -None of the networks specified to be included for out-of-band communications -could be found: - - Value given: %s - -Please revise the specification and try again. -# -[excluded-all] -The specified list of networks to be excluded for out-of-band communications -resulted in no networks being available: - - Value given: %s - -Please revise the specification and try again. -# -[no-interfaces-avail] -No network interfaces were found for out-of-band communications. We require -at least one available network for TCP-based messaging. -# -[invalid if_inexclude] -WARNING: An invalid value was given for oob_tcp_if_%s. This -value will be ignored. - - Local host: %s - Value: %s - Message: %s -# diff --git a/orte/mca/oob/usock/oob_usock.c b/orte/mca/oob/usock/oob_usock.c deleted file mode 100644 index 3993bc42e9..0000000000 --- a/orte/mca/oob/usock/oob_usock.c +++ /dev/null @@ -1,473 +0,0 @@ -/* - * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. 
- * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006-2013 Los Alamos National Security, LLC. - * All rights reserved. - * Copyright (c) 2009-2015 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013-2014 Intel, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - */ - -#include "orte_config.h" -#include "orte/types.h" -#include "opal/types.h" - -#ifdef HAVE_UNISTD_H -#include -#endif -#ifdef HAVE_SYS_TYPES_H -#include -#endif -#include -#ifdef HAVE_NETINET_IN_H -#include -#endif -#ifdef HAVE_ARPA_INET_H -#include -#endif -#ifdef HAVE_NETDB_H -#include -#endif -#include - -#include "opal/util/show_help.h" -#include "opal/util/error.h" -#include "opal/util/output.h" -#include "opal/opal_socket_errno.h" -#include "opal/util/if.h" -#include "opal/util/net.h" -#include "opal/util/argv.h" -#include "opal/class/opal_hash_table.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/ess/ess.h" -#include "orte/util/name_fns.h" -#include "orte/util/parse_options.h" -#include "orte/util/show_help.h" -#include "orte/runtime/orte_globals.h" - -#include "orte/mca/oob/usock/oob_usock.h" -#include "orte/mca/oob/usock/oob_usock_component.h" -#include "orte/mca/oob/usock/oob_usock_peer.h" -#include "orte/mca/oob/usock/oob_usock_connection.h" -#include "orte/mca/oob/usock/oob_usock_ping.h" - -static void usock_init(void); -static void usock_fini(void); -static void accept_connection(const int accepted_fd, - const struct sockaddr *addr); -static void ping(const orte_process_name_t *proc); -static void send_nb(orte_rml_send_t *msg); -static void ft_event(int state); - -mca_oob_usock_module_t mca_oob_usock_module = { - { - usock_init, - usock_fini, - 
accept_connection, - ping, - send_nb, - ft_event - } -}; - -/* - * Local utility functions - */ -static void recv_handler(int sd, short flags, void* user); -static void* progress_thread_engine(opal_object_t *obj) -{ - opal_output_verbose(2, orte_oob_base_framework.framework_output, - "%s USOCK PROGRESS THREAD RUNNING", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - while (mca_oob_usock_module.ev_active) { - opal_event_loop(mca_oob_usock_module.ev_base, OPAL_EVLOOP_ONCE); - } - return OPAL_THREAD_CANCELLED; -} - - -/* - * Initialize global variables used w/in this module. - */ -static void usock_init(void) -{ - /* setup the module's state variables */ - OBJ_CONSTRUCT(&mca_oob_usock_module.peers, opal_hash_table_t); - opal_hash_table_init(&mca_oob_usock_module.peers, 32); - mca_oob_usock_module.ev_active = false; - - if (orte_oob_base.use_module_threads) { - /* if we are to use independent progress threads at - * the module level, start it now - */ - opal_output_verbose(2, orte_oob_base_framework.framework_output, - "%s STARTING USOCK PROGRESS THREAD", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - mca_oob_usock_module.ev_base = opal_event_base_create(); - /* construct the thread object */ - OBJ_CONSTRUCT(&mca_oob_usock_module.progress_thread, opal_thread_t); - /* fork off a thread to progress it */ - mca_oob_usock_module.progress_thread.t_run = progress_thread_engine; - mca_oob_usock_module.progress_thread.t_arg = NULL; - mca_oob_usock_module.ev_active = true; - if (OPAL_SUCCESS != opal_thread_start(&mca_oob_usock_module.progress_thread)) { - opal_output(0, "%s USOCK progress thread failed to start", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - } - } else { - mca_oob_usock_module.ev_base = orte_event_base; - } -} - -/* - * Module cleanup. 
- */ -static void usock_fini(void) -{ - /* cleanup all peers */ - OBJ_DESTRUCT(&mca_oob_usock_module.peers); - - if (mca_oob_usock_module.ev_active) { - /* if we used an independent progress thread at - * the module level, stop it now - */ - opal_output_verbose(2, orte_oob_base_framework.framework_output, - "%s STOPPING USOCK PROGRESS THREAD", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - /* stop the progress thread */ - mca_oob_usock_module.ev_active = false; - /* break the event loop */ - opal_event_base_loopexit(mca_oob_usock_module.ev_base); - /* wait for thread to exit */ - opal_thread_join(&mca_oob_usock_module.progress_thread, NULL); - OBJ_DESTRUCT(&mca_oob_usock_module.progress_thread); - /* release the event base */ - opal_event_base_free(mca_oob_usock_module.ev_base); - } -} - -/* Called by mca_oob_usock_accept() and connection_handler() on - * a socket that has been accepted. This call finishes processing the - * socket by registering for the OOB-level connection handshake. Used - * in both the threaded and event listen modes. - */ -static void accept_connection(const int accepted_fd, - const struct sockaddr *addr) -{ - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s accept_connection", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - /* use a one-time event to wait for receipt of peer's - * process ident message to complete this connection - */ - ORTE_ACTIVATE_USOCK_ACCEPT_STATE(accepted_fd, addr, recv_handler); -} - -/* API functions */ -static void process_ping(int fd, short args, void *cbdata) -{ - mca_oob_usock_ping_t *op = (mca_oob_usock_ping_t*)cbdata; - mca_oob_usock_peer_t *peer; - - opal_output_verbose(2, orte_oob_base_framework.framework_output, - "%s:[%s:%d] processing ping to peer %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - __FILE__, __LINE__, - ORTE_NAME_PRINT(&op->peer)); - - /* do we know this peer? 
*/ - if (NULL == (peer = mca_oob_usock_peer_lookup(&op->peer))) { - /* push this back to the framework so another component can try */ - opal_output_verbose(2, orte_oob_base_framework.framework_output, - "%s:[%s:%d] hop %s unknown", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - __FILE__, __LINE__, - ORTE_NAME_PRINT(&op->peer)); -#if 0 - ORTE_ACTIVATE_USOCK_MSG_ERROR(NULL, NULL, &op->peer, mca_oob_usock_component_hop_unknown); -#endif - goto cleanup; - } - - /* if we are already connected, there is nothing to do */ - if (MCA_OOB_USOCK_CONNECTED == peer->state) { - opal_output_verbose(2, orte_oob_base_framework.framework_output, - "%s:[%s:%d] already connected to peer %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - __FILE__, __LINE__, - ORTE_NAME_PRINT(&op->peer)); - goto cleanup; - } - - /* if we are already connecting, there is nothing to do */ - if (MCA_OOB_USOCK_CONNECTING == peer->state && - MCA_OOB_USOCK_CONNECT_ACK == peer->state) { - opal_output_verbose(2, orte_oob_base_framework.framework_output, - "%s:[%s:%d] already connecting to peer %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - __FILE__, __LINE__, - ORTE_NAME_PRINT(&op->peer)); - goto cleanup; - } - - /* attempt the connection */ - peer->state = MCA_OOB_USOCK_CONNECTING; - ORTE_ACTIVATE_USOCK_CONN_STATE(peer, mca_oob_usock_peer_try_connect); - - cleanup: - OBJ_RELEASE(op); -} - -static void ping(const orte_process_name_t *proc) -{ - opal_output_verbose(2, orte_oob_base_framework.framework_output, - "%s:[%s:%d] pinging peer %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - __FILE__, __LINE__, - ORTE_NAME_PRINT(proc)); - - /* push this into our event base for processing */ - ORTE_ACTIVATE_USOCK_PING(proc, process_ping); -} - -static void process_send(int fd, short args, void *cbdata) -{ - mca_oob_usock_msg_op_t *op = (mca_oob_usock_msg_op_t*)cbdata; - mca_oob_usock_peer_t *peer; - - opal_output_verbose(2, orte_oob_base_framework.framework_output, - "%s:[%s:%d] processing send to peer %s", - 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - __FILE__, __LINE__, - ORTE_NAME_PRINT(&op->msg->dst)); - - /* if I am an app, the only route is to my daemon, so - * send the msg there - */ - if (ORTE_PROC_IS_APP) { - if (NULL == (peer = mca_oob_usock_peer_lookup(ORTE_PROC_MY_DAEMON))) { - /* we don't know how to talk to our daemon, - * which is strange since we already got here. - * likely means we lost a race condition, so - * - */ - ORTE_ACTIVATE_USOCK_MSG_ERROR(NULL, op->msg, - ORTE_PROC_MY_DAEMON, - mca_oob_usock_component_cannot_send); - goto cleanup; - } - } else if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) { - /* if I am a daemon, the only way I should be given this - * message to send is if the proc is local to me - */ - if (NULL == (peer = mca_oob_usock_peer_lookup(&op->msg->dst))) { - /* we don't know how to talk to this proc, - * so send this back up to the OOB base so it - * can try another transport - */ - ORTE_ACTIVATE_USOCK_MSG_ERROR(NULL, op->msg, - &op->msg->dst, - mca_oob_usock_component_cannot_send); - goto cleanup; - } - } else { - /* otherwise, this message can't be handled by me, so - * notify the component of the mistake - */ - opal_output(0, "CAN'T BE HANDLED"); - goto cleanup; - } - - /* add the msg to the target's send queue */ - if (MCA_OOB_USOCK_CONNECTED == peer->state) { - opal_output_verbose(2, orte_oob_base_framework.framework_output, - "%s usock:send_nb: already connected to %s - queueing for send", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer->name)); - MCA_OOB_USOCK_QUEUE_SEND(op->msg, peer); - goto cleanup; - } - - /* add the message to the queue for sending after the - * connection is formed - */ - MCA_OOB_USOCK_QUEUE_PENDING(op->msg, peer); - - if (MCA_OOB_USOCK_CONNECTING != peer->state && - MCA_OOB_USOCK_CONNECT_ACK != peer->state) { - /* we have to initiate the connection - again, we do not - * want to block while the connection is created. 
- * So throw us into an event that will create - * the connection via a mini-state-machine :-) - */ - opal_output_verbose(2, orte_oob_base_framework.framework_output, - "%s usock:send_nb: initiating connection to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer->name)); - peer->state = MCA_OOB_USOCK_CONNECTING; - ORTE_ACTIVATE_USOCK_CONN_STATE(peer, mca_oob_usock_peer_try_connect); - } - - cleanup: - OBJ_RELEASE(op); -} - -static void send_nb(orte_rml_send_t *msg) -{ - opal_output_verbose(2, orte_oob_base_framework.framework_output, - "%s usock:send_nb to peer %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&msg->dst)); - - /* push this into our event base for processing */ - ORTE_ACTIVATE_USOCK_POST_SEND(msg, process_send); -} - -/* - * Event callback when there is data available on the registered - * socket to recv. This is called for the listen sockets to accept an - * incoming connection, on new sockets trying to complete the software - * connection process, and for probes. Data on an established - * connection is handled elsewhere. 
- */ -static void recv_handler(int sd, short flags, void *cbdata) -{ - mca_oob_usock_conn_op_t *op = (mca_oob_usock_conn_op_t*)cbdata; - mca_oob_usock_hdr_t hdr; - mca_oob_usock_peer_t *peer; - uint64_t *ui64; - - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s:usock:recv:handler called", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - /* get the handshake */ - if (ORTE_SUCCESS != mca_oob_usock_peer_recv_connect_ack(NULL, sd, &hdr)) { - goto cleanup; - } - - /* finish processing ident */ - if (MCA_OOB_USOCK_IDENT == hdr.type) { - if (NULL == (peer = mca_oob_usock_peer_lookup(&hdr.origin))) { - /* should never happen */ - goto cleanup; - } - /* set socket up to be non-blocking */ - if ((flags = fcntl(sd, F_GETFL, 0)) < 0) { - opal_output(0, "%s mca_oob_usock_recv_connect: fcntl(F_GETFL) failed: %s (%d)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), strerror(opal_socket_errno), opal_socket_errno); - } else { - flags |= O_NONBLOCK; - if (fcntl(sd, F_SETFL, flags) < 0) { - opal_output(0, "%s mca_oob_usock_recv_connect: fcntl(F_SETFL) failed: %s (%d)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), strerror(opal_socket_errno), opal_socket_errno); - } - } - - /* is the peer instance willing to accept this connection */ - peer->sd = sd; - if (mca_oob_usock_peer_accept(peer) == false) { - if (OOB_USOCK_DEBUG_CONNECT <= opal_output_get_verbosity(orte_oob_base_framework.framework_output)) { - opal_output(0, "%s-%s mca_oob_usock_recv_connect: " - "rejected connection state %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name)), - peer->state); - } - CLOSE_THE_SOCKET(sd); - ui64 = (uint64_t*)(&peer->name); - opal_hash_table_set_value_uint64(&mca_oob_usock_module.peers, (*ui64), NULL); - OBJ_RELEASE(peer); - } - } - - cleanup: - OBJ_RELEASE(op); -} - -/* Dummy function for when we are not using FT. 
*/ -#if OPAL_ENABLE_FT_CR == 0 -static void ft_event(int state) -{ - return; -} - -#else -static void ft_event(int state) { -#if 0 - opal_list_item_t *item; -#endif - - if(OPAL_CRS_CHECKPOINT == state) { -#if 0 - /* - * Disable event processing while we are working - */ - opal_event_disable(); -#endif - } - else if(OPAL_CRS_CONTINUE == state) { -#if 0 - /* - * Resume event processing - */ - opal_event_enable(); - } - else if(OPAL_CRS_RESTART == state) { - /* - * Clean out cached connection information - * Select pieces of finalize/init - */ - for (item = opal_list_remove_first(&mod->peer_list); - item != NULL; - item = opal_list_remove_first(&mod->peer_list)) { - mca_oob_usock_peer_t* peer = (mca_oob_usock_peer_t*)item; - /* JJH: Use the below command for debugging restarts with invalid sockets - * mca_oob_usock_peer_dump(peer, "RESTART CLEAN") - */ - MCA_OOB_USOCK_PEER_RETURN(peer); - } - - OBJ_DESTRUCT(&mod->peer_free); - OBJ_DESTRUCT(&mod->peer_names); - OBJ_DESTRUCT(&mod->peers); - OBJ_DESTRUCT(&mod->peer_list); - - OBJ_CONSTRUCT(&mod->peer_list, opal_list_t); - OBJ_CONSTRUCT(&mod->peers, opal_hash_table_t); - OBJ_CONSTRUCT(&mod->peer_names, opal_hash_table_t); - OBJ_CONSTRUCT(&mod->peer_free, opal_free_list_t); - - /* - * Resume event processing - */ - opal_event_enable(); -#endif - } - else if(OPAL_CRS_TERM == state ) { - ; - } - else { - ; - } - - return; -} -#endif diff --git a/orte/mca/oob/usock/oob_usock.h b/orte/mca/oob/usock/oob_usock.h deleted file mode 100644 index f6fcbc5680..0000000000 --- a/orte/mca/oob/usock/oob_usock.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. 
All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006-2013 Los Alamos National Security, LLC. - * All rights reserved. - * Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2013-2014 Intel, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef _MCA_OOB_USOCK_H_ -#define _MCA_OOB_USOCK_H_ - -#include "orte_config.h" - -#include "orte/types.h" - -#include "opal/mca/base/base.h" -#include "opal/class/opal_free_list.h" -#include "opal/class/opal_hash_table.h" -#include "opal/mca/event/event.h" - -#include "orte/mca/oob/oob.h" -#include "orte/mca/oob/base/base.h" - - -BEGIN_C_DECLS - -/* define some debug levels */ -#define OOB_USOCK_DEBUG_FAIL 2 -#define OOB_USOCK_DEBUG_CONNECT 7 - -/* forward declare a couple of structures */ -struct mca_oob_usock_module_t; -struct mca_oob_usock_msg_error_t; - -/* Module definition */ -typedef void (*mca_oob_usock_module_init_fn_t)(void); -typedef void (*mca_oob_usock_module_fini_fn_t)(void); -typedef void (*mca_oob_usock_module_accept_connection_fn_t)(const int accepted_fd, - const struct sockaddr *addr); -typedef void (*mca_oob_usock_module_ping_fn_t)(const orte_process_name_t *proc); -typedef void (*mca_oob_usock_module_send_nb_fn_t)(orte_rml_send_t *msg); -typedef void (*mca_oob_usock_module_ft_event_fn_t)(int state); - -typedef struct { - mca_oob_usock_module_init_fn_t init; - mca_oob_usock_module_fini_fn_t finalize; - mca_oob_usock_module_accept_connection_fn_t accept_connection; - mca_oob_usock_module_ping_fn_t ping; - mca_oob_usock_module_send_nb_fn_t send_nb; - mca_oob_usock_module_ft_event_fn_t ft_event; -} mca_oob_usock_module_api_t; -typedef struct { - mca_oob_usock_module_api_t api; - opal_event_base_t *ev_base; /* event base for the module progress thread */ - bool ev_active; - opal_thread_t progress_thread; - opal_hash_table_t peers; // 
peer connection info -} mca_oob_usock_module_t; -ORTE_MODULE_DECLSPEC extern mca_oob_usock_module_t mca_oob_usock_module; - -/** - * the state of the connection - */ -typedef enum { - MCA_OOB_USOCK_UNCONNECTED, - MCA_OOB_USOCK_CLOSED, - MCA_OOB_USOCK_RESOLVE, - MCA_OOB_USOCK_CONNECTING, - MCA_OOB_USOCK_CONNECT_ACK, - MCA_OOB_USOCK_CONNECTED, - MCA_OOB_USOCK_FAILED, - MCA_OOB_USOCK_ACCEPTING -} mca_oob_usock_state_t; - -/* module-level shared functions */ -ORTE_MODULE_DECLSPEC void mca_oob_usock_send_handler(int fd, short args, void *cbdata); -ORTE_MODULE_DECLSPEC void mca_oob_usock_recv_handler(int fd, short args, void *cbdata); - - -END_C_DECLS - -#endif /* MCA_OOB_USOCK_H_ */ - diff --git a/orte/mca/oob/usock/oob_usock_component.c b/orte/mca/oob/usock/oob_usock_component.c deleted file mode 100644 index d3a364d07a..0000000000 --- a/orte/mca/oob/usock/oob_usock_component.c +++ /dev/null @@ -1,593 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006-2015 Los Alamos National Security, LLC. - * All rights reserved. - * Copyright (c) 2009-2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - * In windows, many of the socket functions return an EWOULDBLOCK - * instead of things like EAGAIN, EINPROGRESS, etc. 
It has been - * verified that this will not conflict with other error codes that - * are returned by these functions under UNIX/Linux environments - */ - -#include "orte_config.h" -#include "orte/types.h" -#include "opal/types.h" - -#ifdef HAVE_UNISTD_H -#include -#endif -#ifdef HAVE_SYS_TYPES_H -#include -#endif -#include -#ifdef HAVE_NETINET_IN_H -#include -#endif -#ifdef HAVE_ARPA_INET_H -#include -#endif -#ifdef HAVE_NETDB_H -#include -#endif -#include - -#include "opal/util/show_help.h" -#include "opal/util/error.h" -#include "opal/util/os_path.h" -#include "opal/util/output.h" -#include "opal/opal_socket_errno.h" -#include "opal/util/if.h" -#include "opal/util/net.h" -#include "opal/util/argv.h" -#include "opal/class/opal_hash_table.h" -#include "opal/class/opal_list.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/ess/ess.h" -#include "orte/mca/state/state.h" -#include "orte/util/listener.h" -#include "orte/util/name_fns.h" -#include "orte/util/parse_options.h" -#include "orte/util/session_dir.h" -#include "orte/util/show_help.h" -#include "orte/runtime/orte_globals.h" - -#include "orte/mca/oob/usock/oob_usock.h" -#include "orte/mca/oob/usock/oob_usock_component.h" -#include "orte/mca/oob/usock/oob_usock_peer.h" -#include "orte/mca/oob/usock/oob_usock_connection.h" -#include "orte/mca/oob/usock/oob_usock_ping.h" -/* - * Local utility functions - */ - -static int usock_component_register(void); -static int usock_component_open(void); -static int usock_component_close(void); - -static int component_available(void); -static int component_startup(void); -static void component_shutdown(void); -static int component_send(orte_rml_send_t *msg); -static char* component_get_addr(void); -static int component_set_addr(orte_process_name_t *peer, - char **uris); -static bool component_is_reachable(orte_process_name_t *peer); - -/* - * Struct of function pointers and all that to let us be initialized - */ -mca_oob_usock_component_t mca_oob_usock_component = { 
- { - .oob_base = { - MCA_OOB_BASE_VERSION_2_0_0, - .mca_component_name = "usock", - MCA_BASE_MAKE_VERSION(component, ORTE_MAJOR_VERSION, ORTE_MINOR_VERSION, - ORTE_RELEASE_VERSION), - .mca_open_component = usock_component_open, - .mca_close_component = usock_component_close, - .mca_register_component_params = usock_component_register, - }, - .oob_data = { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - }, - .priority = 100, - .available = component_available, - .startup = component_startup, - .shutdown = component_shutdown, - .send_nb = component_send, - .get_addr = component_get_addr, - .set_addr = component_set_addr, - .is_reachable = component_is_reachable, - }, -}; - -/* - * Initialize global variables used w/in this module. - */ -static int usock_component_open(void) -{ - return ORTE_SUCCESS; -} - -/* - * Cleanup of global variables used by this module. - */ -static int usock_component_close(void) -{ - return ORTE_SUCCESS; -} - - -static int usock_component_register(void) -{ - mca_base_component_t *component = &mca_oob_usock_component.super.oob_base; - - /* register oob module parameters */ - mca_oob_usock_component.max_retries = 2; - (void)mca_base_component_var_register(component, "peer_retries", - "Number of times to try shutting down a connection before giving up", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_LOCAL, - &mca_oob_usock_component.max_retries); - - return ORTE_SUCCESS; -} - - -static int component_available(void) -{ - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "oob:usock: component_available called"); - - /* if session directories were forbidden, then we cannot be used */ - if (!orte_create_session_dirs || - NULL == orte_process_info.tmpdir_base || - NULL == orte_process_info.top_session_dir) { - return ORTE_ERR_NOT_SUPPORTED; - } - - /* this component is not available to tools */ - if (ORTE_PROC_IS_TOOL) { - return ORTE_ERR_NOT_AVAILABLE; - } - - if 
(ORTE_PROC_IS_APP) { - if (NULL == orte_process_info.my_daemon_uri) { - /* direct-launched apps cannot use it */ - return ORTE_ERR_NOT_AVAILABLE; - } - /* apps launched by daemons *must* use it */ - return ORTE_ERR_FORCE_SELECT; - } - - /* otherwise, we are available */ - return ORTE_SUCCESS; -} - -/* - * Handler for accepting connections from the event library - */ -static void connection_event_handler(int incoming_sd, short flags, void* cbdata) -{ - orte_pending_connection_t *pending = (orte_pending_connection_t*)cbdata; - int sd; - - sd = pending->fd; - pending->fd = -1; - OBJ_RELEASE(pending); - - /* process the connection */ - mca_oob_usock_module.api.accept_connection(sd, NULL); -} - -/* Start the module */ -static int component_startup(void) -{ - int rc=ORTE_SUCCESS; - - opal_output_verbose(2, orte_oob_base_framework.framework_output, - "%s USOCK STARTUP", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - /* setup the path to the daemon rendezvous point */ - memset(&mca_oob_usock_component.address, 0, sizeof(struct sockaddr_un)); - mca_oob_usock_component.address.sun_family = AF_UNIX; - snprintf(mca_oob_usock_component.address.sun_path, - sizeof(mca_oob_usock_component.address.sun_path)-1, - "%s/%s/%s/0/%s", orte_process_info.tmpdir_base, - orte_process_info.top_session_dir, - ORTE_JOB_FAMILY_PRINT(ORTE_PROC_MY_NAME->jobid), "usock"); - opal_output_verbose(2, orte_oob_base_framework.framework_output, - "SUNPATH: %s", mca_oob_usock_component.address.sun_path); - - /* if we are a daemon/HNP, register our listener */ - if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) { - if (ORTE_SUCCESS != (rc = orte_register_listener((struct sockaddr*)&mca_oob_usock_component.address, sizeof(struct sockaddr_un), - orte_event_base, connection_event_handler))) { - ORTE_ERROR_LOG(rc); - } - } else { - /* if the rendezvous point isn't there, then that's an error */ - /* if the rendezvous file doesn't exist, that's an error */ - if (0 != access(mca_oob_usock_component.address.sun_path, 
R_OK)) { - opal_output_verbose(2, orte_oob_base_framework.framework_output, - "SUNPATH: %s NOT READABLE", mca_oob_usock_component.address.sun_path); - return OPAL_ERR_NOT_FOUND; - } - } - - /* start the module */ - mca_oob_usock_module.api.init(); - - return rc; -} - -static void component_shutdown(void) -{ - opal_output_verbose(2, orte_oob_base_framework.framework_output, - "%s USOCK SHUTDOWN", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) { - /* delete the rendezvous file */ - unlink(mca_oob_usock_component.address.sun_path); - } - - /* shutdown the module */ - if (NULL != mca_oob_usock_module.api.finalize) { - mca_oob_usock_module.api.finalize(); - } -} - -static int component_send(orte_rml_send_t *msg) -{ - orte_proc_t *proc; - - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:usock:send_nb to peer %s:%d to channel=%d seq_num =%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&msg->dst), msg->tag, msg->dst_channel, msg->seq_num); - - if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) { - /* daemons can only reach local procs */ - if (NULL == (proc = orte_get_proc_object(&msg->dst))) { - return ORTE_ERR_TAKE_NEXT_OPTION; - } - if (!ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_LOCAL)) { - return ORTE_ERR_TAKE_NEXT_OPTION; - } - } - - /* apps can reach anyone via this module as the daemon - * will route the message to the final destination - */ - - mca_oob_usock_module.api.send_nb(msg); - return ORTE_SUCCESS; -} - -/* although we do not use the uri to determine a peer's - * address (since we know the path via the session directory), - * we have to provide something to the uri. This is needed - * as other places in ORTE use a NULL uri to indicate lack - * of a daemon. We may eventually remove that dependency, - * but for now, just ensure that the uri is never NULL, - * even if we are the only active OOB transport. 
- */ -static char* component_get_addr(void) -{ - char *tmp; - tmp = strdup("usock"); - return tmp; -} - -static int component_set_addr(orte_process_name_t *peer, - char **uris) -{ - orte_proc_t *proc; - mca_oob_usock_peer_t *pr; - uint64_t *ui64; - - /* if I am an application, then everything is addressable - * by me via my daemon - */ - if (ORTE_PROC_IS_APP) { - /* if this is my daemon, then take it - otherwise, ignore */ - if (ORTE_PROC_MY_DAEMON->jobid == peer->jobid && - ORTE_PROC_MY_DAEMON->vpid == peer->vpid) { - ui64 = (uint64_t*)peer; - if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&mca_oob_usock_module.peers, - (*ui64), (void**)&pr) || NULL == pr) { - pr = OBJ_NEW(mca_oob_usock_peer_t); - pr->name = *peer; - opal_hash_table_set_value_uint64(&mca_oob_usock_module.peers, (*ui64), pr); - } - /* we have to initiate the connection because otherwise the - * daemon has no way to communicate to us via this component - * as the app doesn't have a listening port */ - pr->state = MCA_OOB_USOCK_CONNECTING; - ORTE_ACTIVATE_USOCK_CONN_STATE(pr, mca_oob_usock_peer_try_connect); - return ORTE_SUCCESS; - } - /* otherwise, indicate that we cannot reach this peer */ - return ORTE_ERR_TAKE_NEXT_OPTION; - } - - /* if I am a daemon or HNP, I can only reach my - * own local procs via this component - */ - if (ORTE_PROC_MY_NAME->jobid == peer->jobid) { - /* another daemon */ - return ORTE_ERR_TAKE_NEXT_OPTION; - } - if (NULL == (proc = orte_get_proc_object(peer)) || - !ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_LOCAL)) { - return ORTE_ERR_TAKE_NEXT_OPTION; - } - /* indicate that this peer is addressable by this component */ - ui64 = (uint64_t*)peer; - if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&mca_oob_usock_module.peers, - (*ui64), (void**)&pr) || NULL == pr) { - pr = OBJ_NEW(mca_oob_usock_peer_t); - pr->name = *peer; - opal_hash_table_set_value_uint64(&mca_oob_usock_module.peers, (*ui64), pr); - } - return ORTE_SUCCESS; -} - -void 
mca_oob_usock_component_set_module(int fd, short args, void *cbdata) -{ - mca_oob_usock_peer_op_t *pop = (mca_oob_usock_peer_op_t*)cbdata; - uint64_t ui64; - int rc; - orte_oob_base_peer_t *bpr; - - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s usock:set_module called for peer %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&pop->peer->name)); - - /* retrieve the peer's name */ - memcpy(&ui64, (char*)&(pop->peer->name), sizeof(uint64_t)); - - /* make sure the OOB knows that we are handling this peer - we - * are in the same event base as the OOB base, so we can - * directly access its storage - */ - if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers, - ui64, (void**)&bpr) || NULL == bpr) { - bpr = OBJ_NEW(orte_oob_base_peer_t); - } - opal_bitmap_set_bit(&bpr->addressable, mca_oob_usock_component.super.idx); - bpr->component = &mca_oob_usock_component.super; - if (OPAL_SUCCESS != (rc = opal_hash_table_set_value_uint64(&orte_oob_base.peers, - ui64, bpr))) { - ORTE_ERROR_LOG(rc); - } - - OBJ_RELEASE(pop); -} - -void mca_oob_usock_component_lost_connection(int fd, short args, void *cbdata) -{ - mca_oob_usock_peer_op_t *pop = (mca_oob_usock_peer_op_t*)cbdata; - uint64_t ui64; - int rc; - - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s usock:lost connection called for peer %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&pop->peer->name)); - - /* retrieve the peer's name */ - memcpy(&ui64, (char*)&(pop->peer->name), sizeof(uint64_t)); - - /* mark the OOB's table that we can't reach it any more - for now, we don't - * worry about shifting to another component. 
Eventually, we will want to push - * this decision to the OOB so it can try other components and eventually error out - */ - if (OPAL_SUCCESS != (rc = opal_hash_table_set_value_uint64(&orte_oob_base.peers, - ui64, NULL))) { - ORTE_ERROR_LOG(rc); - } - - /* activate the proc state - since an app only connects to its parent daemon, - * and the daemon is *always* its lifeline, activate the lifeline lost state */ - if (ORTE_PROC_IS_APP) { - ORTE_ACTIVATE_PROC_STATE(&pop->peer->name, ORTE_PROC_STATE_LIFELINE_LOST); - } else { - /* we are the daemon end, so notify that the child's comm failed */ - ORTE_ACTIVATE_PROC_STATE(&pop->peer->name, ORTE_PROC_STATE_COMM_FAILED); - } - - OBJ_RELEASE(pop); -} - -void mca_oob_usock_component_cannot_send(int fd, short args, void *cbdata) -{ - mca_oob_usock_msg_error_t *pop = (mca_oob_usock_msg_error_t*)cbdata; - uint64_t ui64; - int rc; - - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s usock:unable to send to peer %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&pop->hop)); - - /* retrieve the peer's name */ - memcpy(&ui64, (char*)&(pop->hop), sizeof(uint64_t)); - - /* mark the OOB's table that we can't reach it any more - for now, we don't - * worry about shifting to another component. 
Eventually, we will want to push - * this decision to the OOB so it can try other components and eventually error out - */ - if (OPAL_SUCCESS != (rc = opal_hash_table_set_value_uint64(&orte_oob_base.peers, - ui64, NULL))) { - ORTE_ERROR_LOG(rc); - } - - /* have the OOB base try to send it again */ - ORTE_OOB_SEND(pop->rmsg); - - OBJ_RELEASE(pop); -} - -void mca_oob_usock_component_failed_to_connect(int fd, short args, void *cbdata) -{ - mca_oob_usock_peer_op_t *pop = (mca_oob_usock_peer_op_t*)cbdata; - - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s usock:failed_to_connect called for peer %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&pop->peer->name)); - - /* if we are terminating, then don't do anything further */ - if (orte_orteds_term_ordered || orte_finalizing || orte_abnormal_term_ordered) { - OBJ_RELEASE(pop); - return; - } - - /* activate the proc state */ - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s usock:failed_to_connect unable to reach peer %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&pop->peer->name)); - - /* since an app only connects to its parent daemon, - * and the daemon is *always* its lifeline, activate the lifeline lost state */ - if (ORTE_PROC_IS_APP) { - ORTE_ACTIVATE_PROC_STATE(&pop->peer->name, ORTE_PROC_STATE_LIFELINE_LOST); - } else { - /* we are the daemon end, so notify that the child's comm failed */ - ORTE_ACTIVATE_PROC_STATE(&pop->peer->name, ORTE_PROC_STATE_COMM_FAILED); - } - OBJ_RELEASE(pop); -} - -static bool component_is_reachable(orte_process_name_t *peer) -{ - orte_proc_t *proc; - - /* if I am an application, then everything is reachable - * by me via my daemon - */ - if (ORTE_PROC_IS_APP) { - return true; - } - - /* if I am a daemon or HNP, I can only reach my - * own local procs via this component - */ - if (ORTE_PROC_MY_NAME->jobid == peer->jobid) { - /* another daemon */ - return false; - } - if 
(NULL == (proc = orte_get_proc_object(peer)) || - !ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_LOCAL)) { - return false; - } - /* indicate that this peer is reachable by this component */ - return true; -} - -char* mca_oob_usock_state_print(mca_oob_usock_state_t state) -{ - switch (state) { - case MCA_OOB_USOCK_UNCONNECTED: - return "UNCONNECTED"; - case MCA_OOB_USOCK_CLOSED: - return "CLOSED"; - case MCA_OOB_USOCK_RESOLVE: - return "RESOLVE"; - case MCA_OOB_USOCK_CONNECTING: - return "CONNECTING"; - case MCA_OOB_USOCK_CONNECT_ACK: - return "ACK"; - case MCA_OOB_USOCK_CONNECTED: - return "CONNECTED"; - case MCA_OOB_USOCK_FAILED: - return "FAILED"; - default: - return "UNKNOWN"; - } -} - - -mca_oob_usock_peer_t* mca_oob_usock_peer_lookup(const orte_process_name_t *name) -{ - mca_oob_usock_peer_t *peer; - uint64_t ui64; - - memcpy(&ui64, (char*)name, sizeof(uint64_t)); - if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&mca_oob_usock_module.peers, ui64, (void**)&peer)) { - return NULL; - } - return peer; -} - -/* OOB USOCK Class instances */ - -static void peer_cons(mca_oob_usock_peer_t *peer) -{ - peer->auth_method = NULL; - peer->sd = -1; - peer->state = MCA_OOB_USOCK_UNCONNECTED; - peer->retries = 0; - OBJ_CONSTRUCT(&peer->send_queue, opal_list_t); - peer->send_msg = NULL; - peer->recv_msg = NULL; - peer->send_ev_active = false; - peer->recv_ev_active = false; - peer->timer_ev_active = false; -} -static void peer_des(mca_oob_usock_peer_t *peer) -{ - if (NULL != peer->auth_method) { - free(peer->auth_method); - } - if (0 <= peer->sd) { - CLOSE_THE_SOCKET(peer->sd); - } - OPAL_LIST_DESTRUCT(&peer->send_queue); -} -OBJ_CLASS_INSTANCE(mca_oob_usock_peer_t, - opal_list_item_t, - peer_cons, peer_des); - -OBJ_CLASS_INSTANCE(mca_oob_usock_peer_op_t, - opal_object_t, - NULL, NULL); - -OBJ_CLASS_INSTANCE(mca_oob_usock_msg_op_t, - opal_object_t, - NULL, NULL); - -OBJ_CLASS_INSTANCE(mca_oob_usock_conn_op_t, - opal_object_t, - NULL, NULL); - 
-OBJ_CLASS_INSTANCE(mca_oob_usock_ping_t, - opal_object_t, - NULL, NULL); - diff --git a/orte/mca/oob/usock/oob_usock_component.h b/orte/mca/oob/usock/oob_usock_component.h deleted file mode 100644 index a0bc004e79..0000000000 --- a/orte/mca/oob/usock/oob_usock_component.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006-2013 Los Alamos National Security, LLC. - * All rights reserved. - * Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2013-2014 Intel, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef _MCA_OOB_USOCK_COMPONENT_H_ -#define _MCA_OOB_USOCK_COMPONENT_H_ - -#include "orte_config.h" - -#ifdef HAVE_SYS_TIME_H -#include -#endif -#ifdef HAVE_SYS_SOCKET_H -#include -#endif -#ifdef HAVE_SYS_UN_H -#include -#endif - -#include "opal/class/opal_bitmap.h" -#include "opal/class/opal_list.h" -#include "opal/class/opal_pointer_array.h" - -#include "orte/mca/oob/oob.h" -#include "oob_usock_peer.h" -#include "oob_usock.h" - -/** - * OOB USOCK Component - */ -typedef struct { - mca_oob_base_component_t super; /**< base OOB component */ - int max_retries; /**< max number of retries before declaring peer gone */ - struct sockaddr_un address; /**< address of our rendezvous point */ -} mca_oob_usock_component_t; - -ORTE_MODULE_DECLSPEC extern mca_oob_usock_component_t mca_oob_usock_component; - -ORTE_MODULE_DECLSPEC char* mca_oob_usock_state_print(mca_oob_usock_state_t state); -ORTE_MODULE_DECLSPEC void mca_oob_usock_component_set_module(int fd, short args, void *cbdata); -ORTE_MODULE_DECLSPEC void mca_oob_usock_component_lost_connection(int fd, short args, void *cbdata); -ORTE_MODULE_DECLSPEC void mca_oob_usock_component_failed_to_connect(int fd, short args, void *cbdata); -ORTE_MODULE_DECLSPEC mca_oob_usock_peer_t* mca_oob_usock_peer_lookup(const orte_process_name_t *name); -ORTE_MODULE_DECLSPEC void mca_oob_usock_component_cannot_send(int fd, short args, void *cbdata); - -#endif /* _MCA_OOB_USOCK_COMPONENT_H_ */ diff --git a/orte/mca/oob/usock/oob_usock_connection.c b/orte/mca/oob/usock/oob_usock_connection.c deleted file mode 100644 index fb3ce8dac1..0000000000 --- a/orte/mca/oob/usock/oob_usock_connection.c +++ /dev/null @@ -1,940 +0,0 @@ -/* - * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. 
- * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006-2013 Los Alamos National Security, LLC. - * All rights reserved. - * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. - * Copyright (c) 2014 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" - -#ifdef HAVE_UNISTD_H -#include -#endif -#include -#ifdef HAVE_SYS_UIO_H -#include -#endif -#ifdef HAVE_NET_UIO_H -#include -#endif -#ifdef HAVE_SYS_TYPES_H -#include -#endif -#include "opal/opal_socket_errno.h" -#ifdef HAVE_NETINET_IN_H -#include -#endif -#ifdef HAVE_ARPA_INET_H -#include -#endif -#ifdef HAVE_NETINET_TCP_H -#include -#endif - -#include "opal/types.h" -#include "opal_stdint.h" -#include "opal/mca/backtrace/backtrace.h" -#include "opal/mca/base/mca_base_var.h" -#include "opal/mca/sec/sec.h" -#include "opal/util/output.h" -#include "opal/util/net.h" -#include "opal/util/error.h" -#include "opal/util/fd.h" -#include "opal/class/opal_hash_table.h" -#include "opal/mca/event/event.h" - -#include "orte/util/name_fns.h" -#include "orte/mca/state/state.h" -#include "orte/runtime/orte_globals.h" -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/ess/ess.h" -#include "orte/runtime/orte_wait.h" - -#include "oob_usock.h" -#include "orte/mca/oob/usock/oob_usock_component.h" -#include "orte/mca/oob/usock/oob_usock_peer.h" -#include "orte/mca/oob/usock/oob_usock_connection.h" - -static void 
usock_peer_event_init(mca_oob_usock_peer_t* peer); -static int usock_peer_send_connect_ack(mca_oob_usock_peer_t* peer); -static int usock_peer_send_blocking(mca_oob_usock_peer_t* peer, - int sd, void* data, size_t size); -static bool usock_peer_recv_blocking(mca_oob_usock_peer_t* peer, - int sd, void* data, size_t size); -static void usock_peer_connected(mca_oob_usock_peer_t* peer); - -static int usock_peer_create_socket(mca_oob_usock_peer_t* peer) -{ - int flags; - - if (peer->sd > 0) { - return ORTE_SUCCESS; - } - - OPAL_OUTPUT_VERBOSE((1, orte_oob_base_framework.framework_output, - "%s oob:usock:peer creating socket to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name)))); - peer->sd = socket(PF_UNIX, SOCK_STREAM, 0); - - if (peer->sd < 0) { - opal_output(0, "%s-%s usock_peer_create_socket: socket() failed: %s (%d)\n", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name)), - strerror(opal_socket_errno), - opal_socket_errno); - return ORTE_ERR_UNREACH; - } - /* Set this fd to be close-on-exec so that subsequent children don't see it */ - if (opal_fd_set_cloexec(peer->sd) != OPAL_SUCCESS) { - opal_output(0, "%s unable to set socket to CLOEXEC", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - close(peer->sd); - peer->sd = -1; - return ORTE_ERROR; - } - - /* setup event callbacks */ - usock_peer_event_init(peer); - - /* setup the socket as non-blocking */ - if (peer->sd >= 0) { - if ((flags = fcntl(peer->sd, F_GETFL, 0)) < 0) { - opal_output(0, "%s-%s usock_peer_connect: fcntl(F_GETFL) failed: %s (%d)\n", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name)), - strerror(opal_socket_errno), - opal_socket_errno); - } else { - flags |= O_NONBLOCK; - if(fcntl(peer->sd, F_SETFL, flags) < 0) - opal_output(0, "%s-%s usock_peer_connect: fcntl(F_SETFL) failed: %s (%d)\n", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name)), - strerror(opal_socket_errno), - opal_socket_errno); - } - } - - return 
ORTE_SUCCESS; -} - - -/* - * Try connecting to a peer - */ -void mca_oob_usock_peer_try_connect(int fd, short args, void *cbdata) -{ - mca_oob_usock_conn_op_t *op = (mca_oob_usock_conn_op_t*)cbdata; - mca_oob_usock_peer_t *peer = op->peer; - int rc; - opal_socklen_t addrlen = 0; - - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s orte_usock_peer_try_connect: " - "attempting to connect to proc %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name))); - - rc = usock_peer_create_socket(peer); - if (ORTE_SUCCESS != rc) { - /* FIXME: we cannot create a USOCK socket - report - * back to the component that this peer is - * unreachable so it can remove the peer - * from its list and report back to the base - * NOTE: this could be a reconnect attempt, - * so we also need to mark any queued messages - * and return them as "unreachable" - */ - opal_output(0, "%s CANNOT CREATE SOCKET", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - ORTE_FORCED_TERMINATE(1); - OBJ_RELEASE(op); - return; - } - - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s orte_usock_peer_try_connect: " - "attempting to connect to proc %s on socket %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name)), peer->sd); - - addrlen = sizeof(struct sockaddr_un); - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s orte_usock_peer_try_connect: " - "attempting to connect to proc %s - %d retries", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name)), - peer->retries); - - retry_connect: - peer->retries++; - if (connect(peer->sd, (struct sockaddr *) &mca_oob_usock_component.address, addrlen) < 0) { - /* non-blocking so wait for completion */ - if (opal_socket_errno == EINPROGRESS || opal_socket_errno == EWOULDBLOCK) { - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s waiting for connect 
completion to %s - activating send event", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer->name)); - /* just ensure the send_event is active */ - if (!peer->send_ev_active) { - opal_event_add(&peer->send_event, 0); - peer->send_ev_active = true; - } - OBJ_RELEASE(op); - return; - } - - /* Some kernels (Linux 2.6) will automatically software - abort a connection that was ECONNREFUSED on the last - attempt, without even trying to establish the - connection. Handle that case in a semi-rational - way by trying twice before giving up */ - if (ECONNABORTED == opal_socket_errno) { - if (peer->retries < mca_oob_usock_component.max_retries) { - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s connection aborted by OS to %s - retrying", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer->name)); - goto retry_connect; - } else { - /* We were unsuccessful in establishing this connection, and are - * not likely to suddenly become successful, - */ - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s orte_usock_peer_try_connect: " - "Connection across unix domain socket to local proc %s failed", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer->name)); - peer->state = MCA_OOB_USOCK_FAILED; - CLOSE_THE_SOCKET(peer->sd); - /* let the USOCK component know that this module failed to make - * the connection so it can try other modules, and/or fail back - * to the OOB level so another component can try. 
This will activate - * an event in the component event base, and so it will fire async - * from us if we are in our own progress thread - */ - ORTE_ACTIVATE_USOCK_CMP_OP(peer, mca_oob_usock_component_failed_to_connect); - OBJ_RELEASE(op); - return; - } - } - } - - /* connection succeeded */ - peer->retries = 0; - - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s orte_usock_peer_try_connect: " - "Connection across to proc %s succeeded", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer->name)); - /* setup our recv to catch the return ack call */ - if (!peer->recv_ev_active) { - opal_event_add(&peer->recv_event, 0); - peer->recv_ev_active = true; - } - - /* send our globally unique process identifier to the peer */ - if (ORTE_SUCCESS == (rc = usock_peer_send_connect_ack(peer))) { - peer->state = MCA_OOB_USOCK_CONNECT_ACK; - } else { - opal_output(0, - "%s orte_usock_peer_try_connect: " - "usock_peer_send_connect_ack to proc %s failed: %s (%d)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name)), - opal_strerror(rc), rc); - ORTE_FORCED_TERMINATE(1); - } - - OBJ_RELEASE(op); -} - -static int usock_peer_send_connect_ack(mca_oob_usock_peer_t* peer) -{ - char *msg; - mca_oob_usock_hdr_t hdr; - int rc; - size_t sdsize; - char *cred; - size_t credsize; - - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s SEND CONNECT ACK", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - /* send a handshake that includes our process identifier - * to ensure we are talking to another OMPI process - */ - hdr.origin = *ORTE_PROC_MY_NAME; - hdr.dst = peer->name; - hdr.type = MCA_OOB_USOCK_IDENT; - hdr.tag = 0; - hdr.channel = 0xffffffff; - hdr.seq_num = 0; - - /* get our security credential*/ - if (OPAL_SUCCESS != (rc = opal_sec.get_my_credential(peer->auth_method, - ORTE_PROC_MY_NAME, &cred, &credsize))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* set the number of bytes to be 
read beyond the header */ - hdr.nbytes = strlen(orte_version_string) + 1 + credsize; - - /* create a space for our message */ - sdsize = (sizeof(hdr) + strlen(orte_version_string) + 1 + credsize); - if (NULL == (msg = (char*)malloc(sdsize))) { - return ORTE_ERR_OUT_OF_RESOURCE; - } - memset(msg, 0, sdsize); - - /* load the message */ - memcpy(msg, &hdr, sizeof(hdr)); - memcpy(msg+sizeof(hdr), orte_version_string, strlen(orte_version_string)); - memcpy(msg+sizeof(hdr)+strlen(orte_version_string)+1, cred, credsize); - free(cred); - - if (ORTE_SUCCESS != usock_peer_send_blocking(peer, peer->sd, msg, sdsize)) { - ORTE_ERROR_LOG(ORTE_ERR_UNREACH); - free(msg); - return ORTE_ERR_UNREACH; - } - free(msg); - return ORTE_SUCCESS; -} - -/* - * Initialize events to be used by the peer instance for USOCK select/poll callbacks. - */ -static void usock_peer_event_init(mca_oob_usock_peer_t* peer) -{ - if (peer->sd >= 0) { - opal_event_set(mca_oob_usock_module.ev_base, - &peer->recv_event, - peer->sd, - OPAL_EV_READ|OPAL_EV_PERSIST, - mca_oob_usock_recv_handler, - peer); - opal_event_set_priority(&peer->recv_event, ORTE_MSG_PRI); - if (peer->recv_ev_active) { - opal_event_del(&peer->recv_event); - peer->recv_ev_active = false; - } - opal_event_set(mca_oob_usock_module.ev_base, - &peer->send_event, - peer->sd, - OPAL_EV_WRITE|OPAL_EV_PERSIST, - mca_oob_usock_send_handler, - peer); - opal_event_set_priority(&peer->send_event, ORTE_MSG_PRI); - if (peer->send_ev_active) { - opal_event_del(&peer->send_event); - peer->send_ev_active = false; - } - } -} - -/* - * Check the status of the connection. If the connection failed, will retry - * later. Otherwise, send this processes identifier to the peer on the - * newly connected socket. 
- */ -void mca_oob_usock_peer_complete_connect(mca_oob_usock_peer_t *peer) -{ - int so_error = 0; - opal_socklen_t so_length = sizeof(so_error); - - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s:usock:complete_connect called for peer %s on socket %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer->name), peer->sd); - - /* check connect completion status */ - if (getsockopt(peer->sd, SOL_SOCKET, SO_ERROR, (char *)&so_error, &so_length) < 0) { - opal_output(0, "%s usock_peer_complete_connect: getsockopt() to %s failed: %s (%d)\n", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name)), - strerror(opal_socket_errno), - opal_socket_errno); - peer->state = MCA_OOB_USOCK_FAILED; - mca_oob_usock_peer_close(peer); - return; - } - - if (so_error == EINPROGRESS) { - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s:usock:send:handler still in progress", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - return; - } else if (so_error == ECONNREFUSED || so_error == ETIMEDOUT) { - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s-%s usock_peer_complete_connect: connection failed: %s (%d)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name)), - strerror(so_error), - so_error); - mca_oob_usock_peer_close(peer); - return; - } else if (so_error != 0) { - /* No need to worry about the return code here - we return regardless - at this point, and if an error did occur a message has already been - printed for the user */ - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s-%s usock_peer_complete_connect: " - "connection failed with error %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name)), so_error); - mca_oob_usock_peer_close(peer); - return; - } - - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s 
usock_peer_complete_connect: " - "sending ack to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name))); - - if (usock_peer_send_connect_ack(peer) == ORTE_SUCCESS) { - peer->state = MCA_OOB_USOCK_CONNECT_ACK; - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s usock_peer_complete_connect: " - "setting read event on connection to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name))); - - if (!peer->recv_ev_active) { - opal_event_add(&peer->recv_event, 0); - peer->recv_ev_active = true; - } - } else { - opal_output(0, "%s usock_peer_complete_connect: unable to send connect ack to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name))); - peer->state = MCA_OOB_USOCK_FAILED; - mca_oob_usock_peer_close(peer); - } -} - -/* - * A blocking send on a non-blocking socket. Used to send the small amount of connection - * information that identifies the peers endpoint. - */ -static int usock_peer_send_blocking(mca_oob_usock_peer_t* peer, - int sd, void* data, size_t size) -{ - unsigned char* ptr = (unsigned char*)data; - size_t cnt = 0; - int retval; - - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s send blocking of %"PRIsize_t" bytes to socket %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - size, sd); - - while (cnt < size) { - retval = send(sd, (char*)ptr+cnt, size-cnt, 0); - if (retval < 0) { - if (opal_socket_errno != EINTR && opal_socket_errno != EAGAIN && opal_socket_errno != EWOULDBLOCK) { - opal_output(0, "%s usock_peer_send_blocking: send() to socket %d failed: %s (%d)\n", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), sd, - strerror(opal_socket_errno), - opal_socket_errno); - peer->state = MCA_OOB_USOCK_FAILED; - mca_oob_usock_peer_close(peer); - return ORTE_ERR_UNREACH; - } - continue; - } - cnt += retval; - } - - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s blocking send 
complete to socket %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), sd); - - return ORTE_SUCCESS; -} - -/* - * Receive the peers globally unique process identification from a newly - * connected socket and verify the expected response. If so, move the - * socket to a connected state. - */ -int mca_oob_usock_peer_recv_connect_ack(mca_oob_usock_peer_t* pr, int sd, - mca_oob_usock_hdr_t *dhdr) -{ - char *msg; - char *version; - int rc, cmpval; - char *cred; - size_t credsize; - mca_oob_usock_peer_t *peer; - mca_oob_usock_hdr_t hdr; - uint64_t *ui64; - - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s RECV CONNECT ACK FROM %s ON SOCKET %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (NULL == pr) ? "UNKNOWN" : ORTE_NAME_PRINT(&pr->name), sd); - - peer = pr; - /* ensure all is zero'd */ - memset(&hdr, 0, sizeof(mca_oob_usock_hdr_t)); - - if (usock_peer_recv_blocking(peer, sd, &hdr, sizeof(mca_oob_usock_hdr_t))) { - if (NULL != peer) { - /* If the peer state is CONNECT_ACK, then we were waiting for - * the connection to be ack'd - */ - if (peer->state != MCA_OOB_USOCK_CONNECT_ACK) { - /* handshake broke down - abort this connection */ - opal_output(0, "%s RECV CONNECT BAD HANDSHAKE FROM %s ON SOCKET %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer->name), sd); - mca_oob_usock_peer_close(peer); - return ORTE_ERR_UNREACH; - } - } - } else { - /* unable to complete the recv */ - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s unable to complete recv of connect-ack from %s ON SOCKET %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (NULL == peer) ? 
"UNKNOWN" : ORTE_NAME_PRINT(&peer->name), sd); - return ORTE_ERR_UNREACH; - } - /* if the requestor wanted the header returned, then do so now */ - if (NULL != dhdr) { - *dhdr = hdr; - } - - if (MCA_OOB_USOCK_PROBE == hdr.type) { - /* send a header back */ - hdr.type = MCA_OOB_USOCK_PROBE; - hdr.dst = hdr.origin; - hdr.origin = *ORTE_PROC_MY_NAME; - usock_peer_send_blocking(peer, sd, &hdr, sizeof(mca_oob_usock_hdr_t)); - CLOSE_THE_SOCKET(sd); - return ORTE_SUCCESS; - } - - if (hdr.type != MCA_OOB_USOCK_IDENT) { - opal_output(0, "usock_peer_recv_connect_ack: invalid header type: %d\n", hdr.type); - if (NULL != peer) { - peer->state = MCA_OOB_USOCK_FAILED; - mca_oob_usock_peer_close(peer); - } else { - CLOSE_THE_SOCKET(sd); - } - return ORTE_ERR_UNREACH; - } - - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s connect-ack recvd from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (NULL == peer) ? "UNKNOWN" : ORTE_NAME_PRINT(&peer->name)); - - /* if we don't already have it, get the peer */ - if (NULL == peer) { - peer = mca_oob_usock_peer_lookup(&hdr.origin); - if (NULL == peer) { - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s mca_oob_usock_recv_connect: connection from new peer", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - peer = OBJ_NEW(mca_oob_usock_peer_t); - peer->name = hdr.origin; - peer->state = MCA_OOB_USOCK_ACCEPTING; - peer->sd = sd; - ui64 = (uint64_t*)(&peer->name); - if (OPAL_SUCCESS != opal_hash_table_set_value_uint64(&mca_oob_usock_module.peers, (*ui64), peer)) { - OBJ_RELEASE(peer); - CLOSE_THE_SOCKET(sd); - return ORTE_ERR_UNREACH; - } - } else { - /* check for a race condition - if I was in the process of - * creating a connection to the peer, or have already established - * such a connection, then we need to reject this connection. 
We will - * let the higher ranked process retry - if I'm the lower ranked - * process, I'll simply defer until I receive the request - */ - if (MCA_OOB_USOCK_CONNECTED == peer->state || - MCA_OOB_USOCK_CONNECTING == peer->state || - MCA_OOB_USOCK_CONNECT_ACK == peer->state) { - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s SIMUL CONNECTION WITH %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&hdr.origin)); - if (peer->recv_ev_active) { - opal_event_del(&peer->recv_event); - peer->recv_ev_active = false; - } - if (peer->send_ev_active) { - opal_event_del(&peer->send_event); - peer->send_ev_active = false; - } - if (0 < peer->sd) { - CLOSE_THE_SOCKET(peer->sd); - peer->sd = -1; - } - CLOSE_THE_SOCKET(sd); - peer->retries = 0; - cmpval = orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &hdr.origin, ORTE_PROC_MY_NAME); - if (OPAL_VALUE1_GREATER == cmpval) { - /* force the other end to retry the connection */ - peer->state = MCA_OOB_USOCK_UNCONNECTED; - return ORTE_ERR_UNREACH; - } else { - /* retry the connection */ - peer->state = MCA_OOB_USOCK_CONNECTING; - ORTE_ACTIVATE_USOCK_CONN_STATE(peer, mca_oob_usock_peer_try_connect); - return ORTE_ERR_UNREACH; - } - } - } - } else { - /* compare the peers name to the expected value */ - if (OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &peer->name, &hdr.origin)) { - opal_output(0, "%s usock_peer_recv_connect_ack: " - "received unexpected process identifier %s from %s\n", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(hdr.origin)), - ORTE_NAME_PRINT(&(peer->name))); - peer->state = MCA_OOB_USOCK_FAILED; - mca_oob_usock_peer_close(peer); - return ORTE_ERR_UNREACH; - } - } - - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s connect-ack header from %s is okay", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer->name)); - - /* get the authentication and version payload */ - if (NULL == (msg 
= (char*)malloc(hdr.nbytes))) { - peer->state = MCA_OOB_USOCK_FAILED; - mca_oob_usock_peer_close(peer); - return ORTE_ERR_OUT_OF_RESOURCE; - } - if (!usock_peer_recv_blocking(peer, sd, msg, hdr.nbytes)) { - /* unable to complete the recv */ - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s unable to complete recv of connect-ack from %s ON SOCKET %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer->name), peer->sd); - free(msg); - return ORTE_ERR_UNREACH; - } - - /* check that this is from a matching version */ - version = (char*)(msg); - if (0 != strcmp(version, orte_version_string)) { - opal_output(0, "%s usock_peer_recv_connect_ack: " - "received different version from %s: %s instead of %s\n", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name)), - version, orte_version_string); - peer->state = MCA_OOB_USOCK_FAILED; - mca_oob_usock_peer_close(peer); - free(msg); - return ORTE_ERR_UNREACH; - } - - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s connect-ack version from %s matches ours", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer->name)); - - /* check security token */ - cred = (char*)(msg + strlen(version) + 1); - credsize = hdr.nbytes - strlen(version) - 1; - if (OPAL_SUCCESS != (rc = opal_sec.authenticate(cred, credsize, &peer->auth_method))) { - ORTE_ERROR_LOG(rc); - } - free(msg); - - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s connect-ack %s authenticated", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer->name)); - - /* if the requestor wanted the header returned, then they - * will complete their processing - */ - if (NULL != dhdr) { - return ORTE_SUCCESS; - } - - /* set the peer into the component and OOB-level peer tables to indicate - * that we know this peer and we will be handling him - */ - ORTE_ACTIVATE_USOCK_CMP_OP(peer, 
mca_oob_usock_component_set_module); - - /* connected */ - usock_peer_connected(peer); - if (OOB_USOCK_DEBUG_CONNECT <= opal_output_get_verbosity(orte_oob_base_framework.framework_output)) { - mca_oob_usock_peer_dump(peer, "connected"); - } - return ORTE_SUCCESS; -} - -/* - * Setup peer state to reflect that connection has been established, - * and start any pending sends. - */ -static void usock_peer_connected(mca_oob_usock_peer_t* peer) -{ - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s-%s usock_peer_connected on socket %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name)), peer->sd); - - if (peer->timer_ev_active) { - opal_event_del(&peer->timer_event); - peer->timer_ev_active = false; - } - peer->state = MCA_OOB_USOCK_CONNECTED; - - /* initiate send of first message on queue */ - if (NULL == peer->send_msg) { - peer->send_msg = (mca_oob_usock_send_t*) - opal_list_remove_first(&peer->send_queue); - } - if (NULL != peer->send_msg && !peer->send_ev_active) { - opal_event_add(&peer->send_event, 0); - peer->send_ev_active = true; - } -} - -/* - * Remove any event registrations associated with the socket - * and update the peer state to reflect the connection has - * been closed. - */ -void mca_oob_usock_peer_close(mca_oob_usock_peer_t *peer) -{ - mca_oob_usock_send_t *snd; - - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s usock_peer_close for %s sd %d state %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name)), - peer->sd, mca_oob_usock_state_print(peer->state)); - - peer->state = MCA_OOB_USOCK_CLOSED; - - /* release the socket */ - close(peer->sd); - - /* inform the component-level that we have lost a connection so - * it can decide what to do about it. 
- */ - ORTE_ACTIVATE_USOCK_CMP_OP(peer, mca_oob_usock_component_lost_connection); - - if (orte_orteds_term_ordered || orte_finalizing || orte_abnormal_term_ordered) { - /* nothing more to do */ - return; - } - - /* FIXME: push any queued messages back onto the OOB for retry - note that - * this must be done after the prior call to ensure that the component - * processes the "lost connection" notice before the OOB begins to - * handle these recycled messages. This prevents us from unintentionally - * attempting to send the message again across the now-failed interface - */ - if (NULL != peer->send_msg) { - } - while (NULL != (snd = (mca_oob_usock_send_t*)opal_list_remove_first(&peer->send_queue))) { - } -} - -/* - * A blocking recv on a non-blocking socket. Used to receive the small amount of connection - * information that identifies the peers endpoint. - */ -static bool usock_peer_recv_blocking(mca_oob_usock_peer_t* peer, - int sd, void* data, size_t size) -{ - unsigned char* ptr = (unsigned char*)data; - size_t cnt = 0; - - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s waiting for connect ack from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (NULL == peer) ? "UNKNOWN" : ORTE_NAME_PRINT(&(peer->name))); - - while (cnt < size) { - int retval = recv(sd, (char *)ptr+cnt, size-cnt, 0); - - /* remote closed connection */ - if (retval == 0) { - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s-%s usock_peer_recv_blocking: " - "peer closed connection: peer state %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (NULL == peer) ? "UNKNOWN" : ORTE_NAME_PRINT(&(peer->name)), - (NULL == peer) ? 
0 : peer->state); - mca_oob_usock_peer_close(peer); - return false; - } - - /* socket is non-blocking so handle errors */ - if (retval < 0) { - if (opal_socket_errno != EINTR && - opal_socket_errno != EAGAIN && - opal_socket_errno != EWOULDBLOCK) { - if (peer->state == MCA_OOB_USOCK_CONNECT_ACK) { - /* If we overflow the listen backlog, it's - possible that even though we finished the three - way handshake, the remote host was unable to - transition the connection from half connected - (received the initial SYN) to fully connected - (in the listen backlog). We likely won't see - the failure until we try to receive, due to - timing and the like. The first thing we'll get - in that case is a RST packet, which receive - will turn into a connection reset by peer - errno. In that case, leave the socket in - CONNECT_ACK and propogate the error up to - recv_connect_ack, who will try to establish the - connection again */ - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s connect ack received error %s from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - strerror(opal_socket_errno), - (NULL == peer) ? "UNKNOWN" : ORTE_NAME_PRINT(&(peer->name))); - return false; - } else { - opal_output(0, - "%s usock_peer_recv_blocking: " - "recv() failed for %s: %s (%d)\n", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (NULL == peer) ? "UNKNOWN" : ORTE_NAME_PRINT(&(peer->name)), - strerror(opal_socket_errno), - opal_socket_errno); - if (NULL != peer) { - peer->state = MCA_OOB_USOCK_FAILED; - mca_oob_usock_peer_close(peer); - } else { - CLOSE_THE_SOCKET(sd); - } - return false; - } - } - continue; - } - cnt += retval; - } - - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s connect ack received from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (NULL == peer) ? 
"UNKNOWN" : ORTE_NAME_PRINT(&(peer->name))); - return true; -} - -/* - * Routine for debugging to print the connection state and socket options - */ -void mca_oob_usock_peer_dump(mca_oob_usock_peer_t* peer, const char* msg) -{ - char buff[255]; - int nodelay,flags; - - if ((flags = fcntl(peer->sd, F_GETFL, 0)) < 0) { - opal_output(0, "usock_peer_dump: fcntl(F_GETFL) failed: %s (%d)\n", - strerror(opal_socket_errno), - opal_socket_errno); - } -#if defined(USOCK_NODELAY) - optlen = sizeof(nodelay); - if (getsockopt(peer->sd, IPPROTO_USOCK, USOCK_NODELAY, (char *)&nodelay, &optlen) < 0) { - opal_output(0, "usock_peer_dump: USOCK_NODELAY option: %s (%d)\n", - strerror(opal_socket_errno), - opal_socket_errno); - } -#else - nodelay = 0; -#endif - - snprintf(buff, sizeof(buff), "%s-%s %s: nodelay %d flags %08x\n", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name)), - msg, nodelay, flags); - opal_output(0, "%s", buff); -} - -/* - * Accept incoming connection - if not already connected - */ - -bool mca_oob_usock_peer_accept(mca_oob_usock_peer_t* peer) -{ - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s usock:peer_accept called for peer %s in state %s on socket %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer->name), - mca_oob_usock_state_print(peer->state), peer->sd); - - if (peer->state != MCA_OOB_USOCK_CONNECTED) { - - usock_peer_event_init(peer); - - if (usock_peer_send_connect_ack(peer) != ORTE_SUCCESS) { - opal_output(0, "%s-%s usock_peer_accept: " - "usock_peer_send_connect_ack failed\n", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name))); - peer->state = MCA_OOB_USOCK_FAILED; - mca_oob_usock_peer_close(peer); - return false; - } - - /* set the peer into the component and OOB-level peer tables to indicate - * that we know this peer and we will be handling him - */ - ORTE_ACTIVATE_USOCK_CMP_OP(peer, mca_oob_usock_component_set_module); - - 
usock_peer_connected(peer); - if (!peer->recv_ev_active) { - opal_event_add(&peer->recv_event, 0); - peer->recv_ev_active = true; - } - /* if a message is waiting to be sent, ensure the send event is active */ - if (NULL != peer->send_msg && !peer->send_ev_active) { - opal_event_add(&peer->send_event, 0); - peer->send_ev_active = true; - } - if (OOB_USOCK_DEBUG_CONNECT <= opal_output_get_verbosity(orte_oob_base_framework.framework_output)) { - mca_oob_usock_peer_dump(peer, "accepted"); - } - return true; - } - - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s usock:peer_accept ignored for peer %s in state %s on socket %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer->name), - mca_oob_usock_state_print(peer->state), peer->sd); - return false; -} diff --git a/orte/mca/oob/usock/oob_usock_connection.h b/orte/mca/oob/usock/oob_usock_connection.h deleted file mode 100644 index fe98f6e09c..0000000000 --- a/orte/mca/oob/usock/oob_usock_connection.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006-2013 Los Alamos National Security, LLC. - * All rights reserved. - * Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2013 Intel, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef _MCA_OOB_USOCK_CONNECTION_H_ -#define _MCA_OOB_USOCK_CONNECTION_H_ - -#include "orte_config.h" - -#ifdef HAVE_SYS_TYPES_H -#include -#endif -#ifdef HAVE_SYS_SOCKET_H -#include -#endif - -#include "oob_usock.h" -#include "oob_usock_peer.h" - -/* State machine for connection operations */ -typedef struct { - opal_object_t super; - mca_oob_usock_peer_t *peer; - opal_event_t ev; -} mca_oob_usock_conn_op_t; -OBJ_CLASS_DECLARATION(mca_oob_usock_conn_op_t); - -#define CLOSE_THE_SOCKET(socket) \ - do { \ - shutdown(socket, 2); \ - close(socket); \ - } while(0) - -#define ORTE_ACTIVATE_USOCK_CONN_STATE(p, cbfunc) \ - do { \ - mca_oob_usock_conn_op_t *cop; \ - opal_output_verbose(5, orte_oob_base_framework.framework_output, \ - "%s:[%s:%d] connect to %s", \ - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \ - __FILE__, __LINE__, \ - ORTE_NAME_PRINT((&(p)->name))); \ - cop = OBJ_NEW(mca_oob_usock_conn_op_t); \ - cop->peer = (p); \ - opal_event_set(mca_oob_usock_module.ev_base, &cop->ev, -1, \ - OPAL_EV_WRITE, (cbfunc), cop); \ - opal_event_set_priority(&cop->ev, ORTE_MSG_PRI); \ - opal_event_active(&cop->ev, OPAL_EV_WRITE, 1); \ - } while(0); - -#define ORTE_ACTIVATE_USOCK_ACCEPT_STATE(s, a, cbfunc) \ - do { \ - mca_oob_usock_conn_op_t *cop; \ - cop = OBJ_NEW(mca_oob_usock_conn_op_t); \ - opal_event_set(mca_oob_usock_module.ev_base, &cop->ev, s, \ - OPAL_EV_READ, (cbfunc), cop); \ - opal_event_set_priority(&cop->ev, ORTE_MSG_PRI); \ - opal_event_add(&cop->ev, 0); \ - } while(0); - -#define ORTE_RETRY_USOCK_CONN_STATE(p, cbfunc, tv) \ - do { \ - mca_oob_usock_conn_op_t *cop; \ - opal_output_verbose(5, orte_oob_base_framework.framework_output, \ - "%s:[%s:%d] retry connect to %s", \ - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \ - __FILE__, __LINE__, \ - ORTE_NAME_PRINT((&(p)->name))); \ - cop = OBJ_NEW(mca_oob_usock_conn_op_t); \ - cop->peer = (p); \ - opal_event_evtimer_set(mca_oob_usock_module.ev_base, \ 
- &cop->ev, \ - (cbfunc), cop); \ - opal_event_evtimer_add(&cop->ev, (tv)); \ - } while(0); - -ORTE_MODULE_DECLSPEC void mca_oob_usock_peer_try_connect(int fd, short args, void *cbdata); -ORTE_MODULE_DECLSPEC void mca_oob_usock_peer_dump(mca_oob_usock_peer_t* peer, const char* msg); -ORTE_MODULE_DECLSPEC bool mca_oob_usock_peer_accept(mca_oob_usock_peer_t* peer); -ORTE_MODULE_DECLSPEC void mca_oob_usock_peer_complete_connect(mca_oob_usock_peer_t* peer); -ORTE_MODULE_DECLSPEC int mca_oob_usock_peer_recv_connect_ack(mca_oob_usock_peer_t* peer, - int sd, mca_oob_usock_hdr_t *hdr); -ORTE_MODULE_DECLSPEC void mca_oob_usock_peer_close(mca_oob_usock_peer_t *peer); - -#endif /* _MCA_OOB_USOCK_CONNECTION_H_ */ diff --git a/orte/mca/oob/usock/oob_usock_hdr.h b/orte/mca/oob/usock/oob_usock_hdr.h deleted file mode 100644 index c7cad2d998..0000000000 --- a/orte/mca/oob/usock/oob_usock_hdr.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006-2013 Los Alamos National Security, LLC. - * All rights reserved. - * Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef _MCA_OOB_USOCK_HDR_H_ -#define _MCA_OOB_USOCK_HDR_H_ - -#include "orte_config.h" - -/* define several internal-only message - * types this component uses for its own - * handshake operations, plus one indicating - * the message came from an external (to - * this component) source - */ -typedef enum { - MCA_OOB_USOCK_IDENT, - MCA_OOB_USOCK_PROBE, - MCA_OOB_USOCK_PING, - MCA_OOB_USOCK_USER -} mca_oob_usock_msg_type_t; - -/* header for usock msgs */ -typedef struct { - /* the original sender */ - orte_process_name_t origin; - /* the intended final recipient */ - orte_process_name_t dst; - /* type of message */ - mca_oob_usock_msg_type_t type; - /* the rml tag where this message is headed */ - orte_rml_tag_t tag; - /* the rml channel to which this message is headed */ - orte_rml_channel_num_t channel; - /* msg seq number on the src channel */ - uint32_t seq_num; - /* number of bytes in message */ - uint32_t nbytes; -} mca_oob_usock_hdr_t; - -#endif /* _MCA_OOB_USOCK_HDR_H_ */ diff --git a/orte/mca/oob/usock/oob_usock_peer.h b/orte/mca/oob/usock/oob_usock_peer.h deleted file mode 100644 index cc715d4fdc..0000000000 --- a/orte/mca/oob/usock/oob_usock_peer.h +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006-2013 Los Alamos National Security, LLC. - * All rights reserved. - * Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved. 
- * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef _MCA_OOB_USOCK_PEER_H_ -#define _MCA_OOB_USOCK_PEER_H_ - -#include "orte_config.h" - -#include "oob_usock.h" -#include "oob_usock_sendrecv.h" - -/* object for tracking peers */ -typedef struct { - opal_list_item_t super; - /* although not required, there is enough debug - * value that retaining the name makes sense - */ - orte_process_name_t name; - char *auth_method; // how the peer authenticated themselves to use - int sd; - int retries; // number of times we have tried to connect to this address - mca_oob_usock_state_t state; - opal_event_t op_event; // used for connecting and operations other than read/write - opal_event_t send_event; /**< registration with event thread for send events */ - bool send_ev_active; - opal_event_t recv_event; /**< registration with event thread for recv events */ - bool recv_ev_active; - opal_event_t timer_event; /**< timer for retrying connection failures */ - bool timer_ev_active; - opal_list_t send_queue; /**< list of messages to send */ - mca_oob_usock_send_t *send_msg; /**< current send in progress */ - mca_oob_usock_recv_t *recv_msg; /**< current recv in progress */ -} mca_oob_usock_peer_t; -OBJ_CLASS_DECLARATION(mca_oob_usock_peer_t); - -typedef struct { - opal_object_t super; - opal_event_t ev; - mca_oob_usock_peer_t *peer; -} mca_oob_usock_peer_op_t; -OBJ_CLASS_DECLARATION(mca_oob_usock_peer_op_t); - -#define ORTE_ACTIVATE_USOCK_PEER_OP(p, cbfunc) \ - do { \ - mca_oob_usock_peer_op_t *op; \ - op = OBJ_NEW(mca_oob_usock_peer_op_t); \ - op->peer = (p); \ - opal_event_set(mca_usock_component.ev_base, &op->ev, -1, \ - OPAL_EV_WRITE, (cbfunc), op); \ - opal_event_set_priority(&op->ev, ORTE_MSG_PRI); \ - opal_event_active(&op->ev, OPAL_EV_WRITE, 1); \ - } while(0); - -#define ORTE_ACTIVATE_USOCK_CMP_OP(p, cbfunc) \ - do { \ - mca_oob_usock_peer_op_t *pop; \ - pop = 
OBJ_NEW(mca_oob_usock_peer_op_t); \ - pop->peer = (p); \ - opal_event_set(orte_event_base, &pop->ev, -1, \ - OPAL_EV_WRITE, (cbfunc), pop); \ - opal_event_set_priority(&pop->ev, ORTE_MSG_PRI); \ - opal_event_active(&pop->ev, OPAL_EV_WRITE, 1); \ - } while(0); - - -#endif /* _MCA_OOB_USOCK_PEER_H_ */ diff --git a/orte/mca/oob/usock/oob_usock_ping.h b/orte/mca/oob/usock/oob_usock_ping.h deleted file mode 100644 index 67badb8f05..0000000000 --- a/orte/mca/oob/usock/oob_usock_ping.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006-2013 Los Alamos National Security, LLC. - * All rights reserved. - * Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2013 Intel, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef _MCA_OOB_USOCK_PING_H_ -#define _MCA_OOB_USOCK_PING_H_ - -#include "orte_config.h" - -#include "opal/mca/event/event.h" - -#include "oob_usock.h" -#include "oob_usock_sendrecv.h" - -typedef struct { - opal_object_t super; - opal_event_t ev; - orte_process_name_t peer; -} mca_oob_usock_ping_t; -OBJ_CLASS_DECLARATION(mca_oob_usock_ping_t); - -#define ORTE_ACTIVATE_USOCK_PING(p, cbfunc) \ - do { \ - mca_oob_usock_ping_t *pop; \ - pop = OBJ_NEW(mca_oob_usock_ping_t); \ - pop->peer.jobid = (p)->jobid; \ - pop->peer.vpid = (p)->vpid; \ - opal_event_set(mca_oob_usock_module.ev_base, &pop->ev, -1, \ - OPAL_EV_WRITE, (cbfunc), pop); \ - opal_event_set_priority(&pop->ev, ORTE_MSG_PRI); \ - opal_event_active(&pop->ev, OPAL_EV_WRITE, 1); \ - } while(0); - -#endif /* _MCA_OOB_USOCK_PING_H_ */ diff --git a/orte/mca/oob/usock/oob_usock_sendrecv.c b/orte/mca/oob/usock/oob_usock_sendrecv.c deleted file mode 100644 index b07e42956a..0000000000 --- a/orte/mca/oob/usock/oob_usock_sendrecv.c +++ /dev/null @@ -1,631 +0,0 @@ -/* - * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006-2013 Los Alamos National Security, LLC. - * All rights reserved. - * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - * In windows, many of the socket functions return an EWOULDBLOCK - * instead of \ things like EAGAIN, EINPROGRESS, etc. It has been - * verified that this will \ not conflict with other error codes that - * are returned by these functions \ under UNIX/Linux environments - */ - -#include "orte_config.h" - -#ifdef HAVE_UNISTD_H -#include -#endif -#include -#ifdef HAVE_SYS_UIO_H -#include -#endif -#ifdef HAVE_NET_UIO_H -#include -#endif -#ifdef HAVE_SYS_TYPES_H -#include -#endif -#include "opal/opal_socket_errno.h" -#ifdef HAVE_NETINET_IN_H -#include -#endif -#ifdef HAVE_ARPA_INET_H -#include -#endif -#ifdef HAVE_NETINET_TCP_H -#include -#endif - -#include "opal_stdint.h" -#include "opal/types.h" -#include "opal/mca/backtrace/backtrace.h" -#include "opal/util/output.h" -#include "opal/util/net.h" -#include "opal/util/error.h" -#include "opal/class/opal_hash_table.h" -#include "opal/mca/event/event.h" - -#include "orte/util/name_fns.h" -#include "orte/runtime/orte_globals.h" -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/ess/ess.h" -#include "orte/mca/state/state.h" -#include "orte/runtime/orte_wait.h" - -#include "oob_usock.h" -#include "orte/mca/oob/usock/oob_usock_component.h" -#include "orte/mca/oob/usock/oob_usock_peer.h" -#include "orte/mca/oob/usock/oob_usock_connection.h" - -static int send_bytes(mca_oob_usock_peer_t* peer) -{ - mca_oob_usock_send_t* msg = peer->send_msg; - int rc; - - while (0 < msg->sdbytes) { - rc = write(peer->sd, msg->sdptr, msg->sdbytes); - if (rc < 0) { - if (opal_socket_errno == EINTR) { - continue; - } else if (opal_socket_errno == EAGAIN) { - /* tell the caller to keep this message on active, - * but let the event lib cycle so other messages - * can progress while this socket is busy - */ - return ORTE_ERR_RESOURCE_BUSY; - } else if (opal_socket_errno == EWOULDBLOCK) { - /* tell the caller to keep this message on active, - * but let the event lib 
cycle so other messages - * can progress while this socket is busy - */ - return ORTE_ERR_WOULD_BLOCK; - } - /* we hit an error and cannot progress this message */ - opal_output(0, "%s->%s mca_oob_usock_msg_send_bytes: write failed: %s (%d) [sd = %d]", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name)), - strerror(opal_socket_errno), - opal_socket_errno, - peer->sd); - return ORTE_ERR_COMM_FAILURE; - } - /* update location */ - msg->sdbytes -= rc; - msg->sdptr += rc; - } - /* we sent the full data block */ - return ORTE_SUCCESS; -} - -/* - * A file descriptor is available/ready for send. Check the state - * of the socket and take the appropriate action. - */ -void mca_oob_usock_send_handler(int sd, short flags, void *cbdata) -{ - mca_oob_usock_peer_t* peer = (mca_oob_usock_peer_t*)cbdata; - mca_oob_usock_send_t* msg = peer->send_msg; - int rc; - - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s usock:send_handler called to send to peer %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer->name)); - - switch (peer->state) { - case MCA_OOB_USOCK_CONNECTING: - case MCA_OOB_USOCK_CLOSED: - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s usock:send_handler %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - mca_oob_usock_state_print(peer->state)); - mca_oob_usock_peer_complete_connect(peer); - /* de-activate the send event until the connection - * handshake completes - */ - if (peer->send_ev_active) { - opal_event_del(&peer->send_event); - peer->send_ev_active = false; - } - break; - case MCA_OOB_USOCK_CONNECTED: - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s usock:send_handler SENDING TO %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (NULL == peer->send_msg) ? 
"NULL" : ORTE_NAME_PRINT(&peer->name)); - if (NULL != msg) { - /* if the header hasn't been completely sent, send it */ - if (!msg->hdr_sent) { - if (ORTE_SUCCESS == (rc = send_bytes(peer))) { - /* header is completely sent */ - msg->hdr_sent = true; - /* setup to send the data */ - if (NULL == msg->msg) { - /* this was a zero-byte msg - nothing more to do */ - OBJ_RELEASE(msg); - peer->send_msg = NULL; - goto next; - } else if (NULL != msg->msg->buffer) { - /* send the buffer data as a single block */ - msg->sdptr = msg->msg->buffer->base_ptr; - msg->sdbytes = msg->msg->buffer->bytes_used; - } else if (NULL != msg->msg->iov) { - /* start with the first iovec */ - msg->sdptr = msg->msg->iov[0].iov_base; - msg->sdbytes = msg->msg->iov[0].iov_len; - msg->iovnum = 0; - } else { - msg->sdptr = msg->msg->data; - msg->sdbytes = msg->msg->count; - } - /* fall thru and let the send progress */ - } else if (ORTE_ERR_RESOURCE_BUSY == rc || - ORTE_ERR_WOULD_BLOCK == rc) { - /* exit this event and let the event lib progress */ - return; - } else { - // report the error - opal_output(0, "%s-%s mca_oob_usock_peer_send_handler: unable to send header", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name))); - opal_event_del(&peer->send_event); - peer->send_ev_active = false; - msg->msg->status = rc; - if( NULL == msg->msg->channel) { - ORTE_RML_SEND_COMPLETE(msg->msg); - } - else { - ORTE_QOS_SEND_COMPLETE(msg->msg); - } - OBJ_RELEASE(msg); - peer->send_msg = NULL; - goto next; - } - } - /* progress the data transmission */ - if (msg->hdr_sent) { - if (ORTE_SUCCESS == (rc = send_bytes(peer))) { - /* this block is complete */ - if (NULL != msg->msg->buffer) { - /* we are done - notify the RML */ - opal_output_verbose(2, orte_oob_base_framework.framework_output, - "%s MESSAGE SEND COMPLETE TO %s OF %d BYTES ON SOCKET %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name)), - msg->hdr.nbytes, peer->sd); - msg->msg->status = ORTE_SUCCESS; - if( 
NULL == msg->msg->channel) { - ORTE_RML_SEND_COMPLETE(msg->msg); - } - else { - ORTE_QOS_SEND_COMPLETE(msg->msg); - } - OBJ_RELEASE(msg); - peer->send_msg = NULL; - } else if (NULL != msg->msg->data) { - /* this was a relay message - nothing more to do */ - opal_output_verbose(2, orte_oob_base_framework.framework_output, - "%s MESSAGE SEND COMPLETE TO %s OF %d BYTES ON SOCKET %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name)), - msg->hdr.nbytes, peer->sd); - OBJ_RELEASE(msg); - peer->send_msg = NULL; - } else { - /* rotate to the next iovec */ - msg->iovnum++; - if (msg->iovnum < msg->msg->count) { - msg->sdptr = msg->msg->iov[msg->iovnum].iov_base; - msg->sdbytes = msg->msg->iov[msg->iovnum].iov_len; - /* exit this event to give the event lib - * a chance to progress any other pending - * actions - */ - return; - } else { - /* this message is complete - notify the RML */ - opal_output_verbose(2, orte_oob_base_framework.framework_output, - "%s MESSAGE SEND COMPLETE TO %s OF %d BYTES ON SOCKET %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name)), - msg->hdr.nbytes, peer->sd); - msg->msg->status = ORTE_SUCCESS; - if( NULL == msg->msg->channel) { - ORTE_RML_SEND_COMPLETE(msg->msg); - } - else { - ORTE_QOS_SEND_COMPLETE(msg->msg); - } - OBJ_RELEASE(msg); - peer->send_msg = NULL; - } - } - /* fall thru to queue the next message */ - } else if (ORTE_ERR_RESOURCE_BUSY == rc || - ORTE_ERR_WOULD_BLOCK == rc) { - /* exit this event and let the event lib progress */ - return; - } else { - // report the error - opal_output(0, "%s-%s mca_oob_usock_peer_send_handler: unable to send message ON SOCKET %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name)), peer->sd); - opal_event_del(&peer->send_event); - peer->send_ev_active = false; - msg->msg->status = rc; - if( NULL == msg->msg->channel) { - ORTE_RML_SEND_COMPLETE(msg->msg); - } - else { - ORTE_QOS_SEND_COMPLETE(msg->msg); - } - OBJ_RELEASE(msg); - 
peer->send_msg = NULL; - ORTE_FORCED_TERMINATE(1); - return; - } - } - - next: - /* if current message completed - progress any pending sends by - * moving the next in the queue into the "on-deck" position. Note - * that this doesn't mean we send the message right now - we will - * wait for another send_event to fire before doing so. This gives - * us a chance to service any pending recvs. - */ - peer->send_msg = (mca_oob_usock_send_t*) - opal_list_remove_first(&peer->send_queue); - } - /* if nothing else to do unregister for send event notifications */ - if (NULL == peer->send_msg && peer->send_ev_active) { - opal_event_del(&peer->send_event); - peer->send_ev_active = false; - } - break; - default: - opal_output(0, "%s-%s mca_oob_usock_peer_send_handler: invalid connection state (%d) on socket %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name)), - peer->state, peer->sd); - if (peer->send_ev_active) { - opal_event_del(&peer->send_event); - peer->send_ev_active = false; - } - break; - } -} - -static int read_bytes(mca_oob_usock_peer_t* peer) -{ - int rc; - - /* read until all bytes recvd or error */ - while (0 < peer->recv_msg->rdbytes) { - rc = read(peer->sd, peer->recv_msg->rdptr, peer->recv_msg->rdbytes); - if (rc < 0) { - if(opal_socket_errno == EINTR) { - continue; - } else if (opal_socket_errno == EAGAIN) { - /* tell the caller to keep this message on active, - * but let the event lib cycle so other messages - * can progress while this socket is busy - */ - return ORTE_ERR_RESOURCE_BUSY; - } else if (opal_socket_errno == EWOULDBLOCK) { - /* tell the caller to keep this message on active, - * but let the event lib cycle so other messages - * can progress while this socket is busy - */ - return ORTE_ERR_WOULD_BLOCK; - } - /* we hit an error and cannot progress this message - report - * the error back to the RML and let the caller know - * to abort this message - */ - opal_output_verbose(OOB_USOCK_DEBUG_FAIL, 
orte_oob_base_framework.framework_output, - "%s-%s mca_oob_usock_msg_recv: readv failed: %s (%d)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name)), - strerror(opal_socket_errno), - opal_socket_errno); - // mca_oob_usock_peer_close(peer); - // if (NULL != mca_oob_usock.oob_exception_callback) { - // mca_oob_usock.oob_exception_callback(&peer->name, ORTE_RML_PEER_DISCONNECTED); - //} - return ORTE_ERR_COMM_FAILURE; - } else if (rc == 0) { - /* the remote peer closed the connection - report that condition - * and let the caller know - */ - opal_output_verbose(OOB_USOCK_DEBUG_FAIL, orte_oob_base_framework.framework_output, - "%s-%s mca_oob_usock_msg_recv: peer closed connection", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name))); - /* stop all events */ - if (peer->recv_ev_active) { - opal_event_del(&peer->recv_event); - peer->recv_ev_active = false; - } - if (peer->timer_ev_active) { - opal_event_del(&peer->timer_event); - peer->timer_ev_active = false; - } - if (peer->send_ev_active) { - opal_event_del(&peer->send_event); - peer->send_ev_active = false; - } - if (NULL != peer->recv_msg) { - OBJ_RELEASE(peer->recv_msg); - peer->recv_msg = NULL; - } - mca_oob_usock_peer_close(peer); - //if (NULL != mca_oob_usock.oob_exception_callback) { - // mca_oob_usock.oob_exception_callback(&peer->peer_name, ORTE_RML_PEER_DISCONNECTED); - //} - return ORTE_ERR_WOULD_BLOCK; - } - /* we were able to read something, so adjust counters and location */ - peer->recv_msg->rdbytes -= rc; - peer->recv_msg->rdptr += rc; - } - - /* we read the full data block */ - return ORTE_SUCCESS; -} - -/* - * Dispatch to the appropriate action routine based on the state - * of the connection with the peer. 
- */ - -void mca_oob_usock_recv_handler(int sd, short flags, void *cbdata) -{ - mca_oob_usock_peer_t* peer = (mca_oob_usock_peer_t*)cbdata; - int rc; - orte_rml_send_t *snd; - - if (orte_abnormal_term_ordered) { - return; - } - - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s:usock:recv:handler called for peer %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer->name)); - - switch (peer->state) { - case MCA_OOB_USOCK_CONNECT_ACK: - if (ORTE_SUCCESS == (rc = mca_oob_usock_peer_recv_connect_ack(peer, peer->sd, NULL))) { - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s:usock:recv:handler starting send/recv events", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - /* we connected! Start the send/recv events */ - if (!peer->recv_ev_active) { - opal_event_add(&peer->recv_event, 0); - peer->recv_ev_active = true; - } - if (peer->timer_ev_active) { - opal_event_del(&peer->timer_event); - peer->timer_ev_active = false; - } - /* if there is a message waiting to be sent, queue it */ - if (NULL == peer->send_msg) { - peer->send_msg = (mca_oob_usock_send_t*)opal_list_remove_first(&peer->send_queue); - } - if (NULL != peer->send_msg && !peer->send_ev_active) { - opal_event_add(&peer->send_event, 0); - peer->send_ev_active = true; - } - /* update our state */ - peer->state = MCA_OOB_USOCK_CONNECTED; - } else { - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s UNABLE TO COMPLETE CONNECT ACK WITH %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer->name)); - opal_event_del(&peer->recv_event); - peer->recv_ev_active = false; - ORTE_FORCED_TERMINATE(1); - return; - } - break; - case MCA_OOB_USOCK_CONNECTED: - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s:usock:recv:handler CONNECTED", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - /* allocate a new message and setup for recv */ - if 
(NULL == peer->recv_msg) { - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s:usock:recv:handler allocate new recv msg", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - peer->recv_msg = OBJ_NEW(mca_oob_usock_recv_t); - if (NULL == peer->recv_msg) { - opal_output(0, "%s-%s mca_oob_usock_peer_recv_handler: unable to allocate recv message\n", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name))); - return; - } - /* start by reading the header */ - peer->recv_msg->rdptr = (char*)&peer->recv_msg->hdr; - peer->recv_msg->rdbytes = sizeof(mca_oob_usock_hdr_t); - } - /* if the header hasn't been completely read, read it */ - if (!peer->recv_msg->hdr_recvd) { - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s:usock:recv:handler read hdr", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - if (ORTE_SUCCESS == (rc = read_bytes(peer))) { - /* completed reading the header */ - peer->recv_msg->hdr_recvd = true; - /* if this is a zero-byte message, then we are done */ - if (0 == peer->recv_msg->hdr.nbytes) { - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s RECVD ZERO-BYTE MESSAGE FROM %s for tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer->name), peer->recv_msg->hdr.tag); - peer->recv_msg->data = NULL; // make sure - peer->recv_msg->rdptr = NULL; - peer->recv_msg->rdbytes = 0; - } else { - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s:usock:recv:handler allocate data region of size %lu", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (unsigned long)peer->recv_msg->hdr.nbytes); - /* allocate the data region */ - peer->recv_msg->data = (char*)malloc(peer->recv_msg->hdr.nbytes); - /* point to it */ - peer->recv_msg->rdptr = peer->recv_msg->data; - peer->recv_msg->rdbytes = peer->recv_msg->hdr.nbytes; - } - /* fall thru and attempt to read the data */ - } else if (ORTE_ERR_RESOURCE_BUSY == rc 
|| - ORTE_ERR_WOULD_BLOCK == rc) { - /* exit this event and let the event lib progress */ - return; - } else { - /* close the connection */ - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s:usock:recv:handler error reading bytes - closing connection", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - mca_oob_usock_peer_close(peer); - return; - } - } - - if (peer->recv_msg->hdr_recvd) { - /* continue to read the data block - we start from - * wherever we left off, which could be at the - * beginning or somewhere in the message - */ - if (ORTE_SUCCESS == (rc = read_bytes(peer))) { - /* we recvd all of the message */ - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s RECVD COMPLETE MESSAGE FROM %s OF %d BYTES FOR DEST %s TAG %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer->recv_msg->hdr.origin), - (int)peer->recv_msg->hdr.nbytes, - ORTE_NAME_PRINT(&peer->recv_msg->hdr.dst), - peer->recv_msg->hdr.tag); - /* am I the intended recipient? */ - if (peer->recv_msg->hdr.dst.jobid == ORTE_PROC_MY_NAME->jobid && - peer->recv_msg->hdr.dst.vpid == ORTE_PROC_MY_NAME->vpid) { - /* yes - post it to the RML for delivery */ - opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s DELIVERING TO RML", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - ORTE_RML_POST_MESSAGE(&peer->recv_msg->hdr.origin, peer->recv_msg->hdr.tag, - peer->recv_msg->hdr.channel, peer->recv_msg->hdr.seq_num, - peer->recv_msg->data, - peer->recv_msg->hdr.nbytes); - OBJ_RELEASE(peer->recv_msg); - } else { - /* no - we don't route things, so we promote this - * back to the OOB and let another transport move - * it along. 
If we are a daemon and it is intended - * for another of our local procs, it will just come - * back to us and be handled then - */ - snd = OBJ_NEW(orte_rml_send_t); - snd->dst = peer->recv_msg->hdr.dst; - snd->origin = peer->recv_msg->hdr.origin; - snd->tag = peer->recv_msg->hdr.tag; - snd->data = peer->recv_msg->data; - snd->dst_channel = peer->recv_msg->hdr.channel; - snd->seq_num = peer->recv_msg->hdr.seq_num; - snd->count = peer->recv_msg->hdr.nbytes; - snd->cbfunc.iov = NULL; - snd->cbdata = NULL; - /* activate the OOB send state */ - ORTE_OOB_SEND(snd); - /* protect the data */ - peer->recv_msg->data = NULL; - /* cleanup */ - OBJ_RELEASE(peer->recv_msg); - return; - } - } else if (ORTE_ERR_RESOURCE_BUSY == rc || - ORTE_ERR_WOULD_BLOCK == rc) { - /* exit this event and let the event lib progress */ - return; - } else { - // report the error - opal_output(0, "%s-%s mca_oob_usock_peer_recv_handler: unable to recv message", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name))); - /* turn off the recv event */ - opal_event_del(&peer->recv_event); - peer->recv_ev_active = false; - ORTE_FORCED_TERMINATE(1); - return; - } - } - break; - default: - opal_output(0, "%s-%s mca_oob_usock_peer_recv_handler: invalid socket state(%d)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name)), - peer->state); - // mca_oob_usock_peer_close(peer); - break; - } -} - -static void snd_cons(mca_oob_usock_send_t *ptr) -{ - ptr->msg = NULL; - ptr->data = NULL; - ptr->hdr_sent = false; - ptr->iovnum = 0; - ptr->sdptr = NULL; - ptr->sdbytes = 0; -} -/* we don't destruct any RML msg that is - * attached to our send as the RML owns - * that memory. 
However, if we relay a - * msg, the data in the relay belongs to - * us and must be free'd - */ -static void snd_des(mca_oob_usock_send_t *ptr) -{ - if (NULL != ptr->data) { - free(ptr->data); - } -} -OBJ_CLASS_INSTANCE(mca_oob_usock_send_t, - opal_list_item_t, - snd_cons, snd_des); - -static void rcv_cons(mca_oob_usock_recv_t *ptr) -{ - ptr->hdr_recvd = false; - ptr->rdptr = NULL; - ptr->rdbytes = 0; -} -OBJ_CLASS_INSTANCE(mca_oob_usock_recv_t, - opal_list_item_t, - rcv_cons, NULL); - -static void err_cons(mca_oob_usock_msg_error_t *ptr) -{ - ptr->rmsg = NULL; - ptr->snd = NULL; -} -OBJ_CLASS_INSTANCE(mca_oob_usock_msg_error_t, - opal_object_t, - err_cons, NULL); - diff --git a/orte/mca/oob/usock/oob_usock_sendrecv.h b/orte/mca/oob/usock/oob_usock_sendrecv.h deleted file mode 100644 index 65658da08c..0000000000 --- a/orte/mca/oob/usock/oob_usock_sendrecv.h +++ /dev/null @@ -1,255 +0,0 @@ -/* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006-2013 Los Alamos National Security, LLC. - * All rights reserved. - * Copyright (c) 2010-2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef _MCA_OOB_USOCK_SENDRECV_H_ -#define _MCA_OOB_USOCK_SENDRECV_H_ - -#include "orte_config.h" - -#include "opal/class/opal_list.h" - -#include "orte/mca/rml/base/base.h" - -#include "oob_usock.h" -#include "oob_usock_hdr.h" - -/* usock structure for sending a message */ -typedef struct { - opal_list_item_t super; - mca_oob_usock_hdr_t hdr; - orte_rml_send_t *msg; - char *data; - bool hdr_sent; - int iovnum; - char *sdptr; - size_t sdbytes; -} mca_oob_usock_send_t; -OBJ_CLASS_DECLARATION(mca_oob_usock_send_t); - -/* usock structure for recving a message */ -typedef struct { - opal_list_item_t super; - mca_oob_usock_hdr_t hdr; - bool hdr_recvd; - char *data; - char *rdptr; - size_t rdbytes; -} mca_oob_usock_recv_t; -OBJ_CLASS_DECLARATION(mca_oob_usock_recv_t); - -/* Queue a message to be sent to a specified peer. The macro - * checks to see if a message is already in position to be - * sent - if it is, then the message provided is simply added - * to the peer's message queue. If not, then the provided message - * is placed in the "ready" position - * - * If the provided boolean is true, then the send event for the - * peer is checked and activated if not already active. 
This allows - * the macro to either immediately send the message, or to queue - * it as "pending" for later transmission - e.g., after the - * connection procedure is completed - * - * p => pointer to mca_oob_usock_peer_t - * s => pointer to mca_oob_usock_send_t - * f => true if send event is to be activated - */ -#define MCA_OOB_USOCK_QUEUE_MSG(p, s, f) \ - do { \ - opal_output_verbose(5, orte_oob_base_framework.framework_output, \ - "%s:[%s:%d] queue msg to %s", \ - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \ - __FILE__, __LINE__, \ - ORTE_NAME_PRINT(&((s)->hdr.dst))); \ - /* if there is no message on-deck, put this one there */ \ - if (NULL == (p)->send_msg) { \ - (p)->send_msg = (s); \ - } else { \ - /* add it to the queue */ \ - opal_list_append(&(p)->send_queue, &(s)->super); \ - } \ - if ((f)) { \ - /* if we aren't connected, then start connecting */ \ - if (MCA_OOB_USOCK_CONNECTED != (p)->state) { \ - (p)->state = MCA_OOB_USOCK_CONNECTING; \ - ORTE_ACTIVATE_USOCK_CONN_STATE((p), \ - mca_oob_usock_peer_try_connect); \ - } else { \ - /* ensure the send event is active */ \ - if (!(p)->send_ev_active) { \ - opal_event_add(&(p)->send_event, 0); \ - (p)->send_ev_active = true; \ - } \ - } \ - } \ - }while(0); - -/* queue a message to be sent by one of our modules - must - * provide the following params: - * - * m - the RML message to be sent - * p - the final recipient - */ -#define MCA_OOB_USOCK_QUEUE_SEND(m, p) \ - do { \ - mca_oob_usock_send_t *msg; \ - int i; \ - opal_output_verbose(5, orte_oob_base_framework.framework_output, \ - "%s:[%s:%d] queue send to %s", \ - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \ - __FILE__, __LINE__, \ - ORTE_NAME_PRINT(&((m)->dst))); \ - msg = OBJ_NEW(mca_oob_usock_send_t); \ - /* setup the header */ \ - msg->hdr.origin = (m)->origin; \ - msg->hdr.dst = (m)->dst; \ - msg->hdr.type = MCA_OOB_USOCK_USER; \ - msg->hdr.tag = (m)->tag; \ - msg->hdr.channel = (m)->dst_channel; \ - msg->hdr.seq_num = (m)->seq_num; \ - /* point to the actual 
message */ \ - msg->msg = (m); \ - /* set the total number of bytes to be sent */ \ - if (NULL != (m)->buffer) { \ - msg->hdr.nbytes = (m)->buffer->bytes_used; \ - } else if (NULL != (m)->iov) { \ - msg->hdr.nbytes = 0; \ - for (i=0; i < (m)->count; i++) { \ - msg->hdr.nbytes += (m)->iov[i].iov_len; \ - } \ - } else { \ - msg->hdr.nbytes = (m)->count; \ - } \ - /* start the send with the header */ \ - msg->sdptr = (char*)&msg->hdr; \ - msg->sdbytes = sizeof(mca_oob_usock_hdr_t); \ - /* add to the msg queue for this peer */ \ - MCA_OOB_USOCK_QUEUE_MSG((p), msg, true); \ - }while(0); - -/* queue a message to be sent by one of our modules upon completing - * the connection process - must provide the following params: - * - * m - the RML message to be sent - * p - the final recipient - */ -#define MCA_OOB_USOCK_QUEUE_PENDING(m, p) \ - do { \ - mca_oob_usock_send_t *msg; \ - int i; \ - opal_output_verbose(5, orte_oob_base_framework.framework_output, \ - "%s:[%s:%d] queue pending to %s", \ - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \ - __FILE__, __LINE__, \ - ORTE_NAME_PRINT(&((m)->dst))); \ - msg = OBJ_NEW(mca_oob_usock_send_t); \ - /* setup the header */ \ - msg->hdr.origin = (m)->origin; \ - msg->hdr.dst = (m)->dst; \ - msg->hdr.type = MCA_OOB_USOCK_USER; \ - msg->hdr.tag = (m)->tag; \ - msg->hdr.channel = (m)->dst_channel; \ - msg->hdr.seq_num = (m)->seq_num; \ - /* point to the actual message */ \ - msg->msg = (m); \ - /* set the total number of bytes to be sent */ \ - if (NULL != (m)->buffer) { \ - msg->hdr.nbytes = (m)->buffer->bytes_used; \ - } else if (NULL != (m)->iov) { \ - msg->hdr.nbytes = 0; \ - for (i=0; i < (m)->count; i++) { \ - msg->hdr.nbytes += (m)->iov[i].iov_len; \ - } \ - } else { \ - msg->hdr.nbytes = (m)->count; \ - } \ - /* start the send with the header */ \ - msg->sdptr = (char*)&msg->hdr; \ - msg->sdbytes = sizeof(mca_oob_usock_hdr_t); \ - /* add to the msg queue for this peer */ \ - MCA_OOB_USOCK_QUEUE_MSG((p), msg, false); \ - }while(0); - -/* 
State machine for processing message */ -typedef struct { - opal_object_t super; - opal_event_t ev; - orte_rml_send_t *msg; -} mca_oob_usock_msg_op_t; -OBJ_CLASS_DECLARATION(mca_oob_usock_msg_op_t); - -#define ORTE_ACTIVATE_USOCK_POST_SEND(ms, cbfunc) \ - do { \ - mca_oob_usock_msg_op_t *mop; \ - opal_output_verbose(5, orte_oob_base_framework.framework_output, \ - "%s:[%s:%d] post send to %s", \ - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \ - __FILE__, __LINE__, \ - ORTE_NAME_PRINT(&((ms)->dst))); \ - mop = OBJ_NEW(mca_oob_usock_msg_op_t); \ - mop->msg = (ms); \ - opal_event_set(mca_oob_usock_module.ev_base, &mop->ev, -1, \ - OPAL_EV_WRITE, (cbfunc), mop); \ - opal_event_set_priority(&mop->ev, ORTE_MSG_PRI); \ - opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \ - } while(0); - -typedef struct { - opal_object_t super; - opal_event_t ev; - orte_rml_send_t *rmsg; - mca_oob_usock_send_t *snd; - orte_process_name_t hop; -} mca_oob_usock_msg_error_t; -OBJ_CLASS_DECLARATION(mca_oob_usock_msg_error_t); - -/* macro for reporting delivery errors back to the - * component for error handling - * - * s -> mca_oob_usock_send_t that failed (can be NULL) - * r -> orte_rml_send_t that failed (can be NULL) - * h -> process name for the next recipient - * cbfunc -> function to handle the callback - */ -#define ORTE_ACTIVATE_USOCK_MSG_ERROR(s, r, h, cbfunc) \ - do { \ - mca_oob_usock_msg_error_t *mop; \ - opal_output_verbose(5, orte_oob_base_framework.framework_output, \ - "%s:[%s:%d] post msg error to %s", \ - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \ - __FILE__, __LINE__, \ - ORTE_NAME_PRINT((h))); \ - mop = OBJ_NEW(mca_oob_usock_msg_error_t); \ - if (NULL != (s)) { \ - mop->snd = (s); \ - } else if (NULL != (r)) { \ - /* use a proxy so we can pass NULL into the macro */ \ - mop->rmsg = (r); \ - } \ - mop->hop.jobid = (h)->jobid; \ - mop->hop.vpid = (h)->vpid; \ - opal_event_set(orte_event_base, &mop->ev, -1, \ - OPAL_EV_WRITE, (cbfunc), mop); \ - opal_event_set_priority(&mop->ev, 
ORTE_MSG_PRI); \ - opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \ - } while(0); - -#endif /* _MCA_OOB_USOCK_SENDRECV_H_ */ diff --git a/orte/mca/oob/usock/owner.txt b/orte/mca/oob/usock/owner.txt deleted file mode 100644 index 4ad6f408ca..0000000000 --- a/orte/mca/oob/usock/owner.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# owner/status file -# owner: institution that is responsible for this package -# status: e.g. active, maintenance, unmaintained -# -owner: INTEL -status: maintenance diff --git a/orte/mca/plm/base/Makefile.am b/orte/mca/plm/base/Makefile.am index 82ffc693d6..70cba71ac0 100644 --- a/orte/mca/plm/base/Makefile.am +++ b/orte/mca/plm/base/Makefile.am @@ -10,6 +10,7 @@ # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. # Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2015 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -25,9 +26,8 @@ libmca_plm_la_SOURCES += \ base/plm_base_frame.c \ base/plm_base_select.c \ base/plm_base_receive.c \ - base/plm_base_launch_support.c \ - base/plm_base_jobid.c \ - base/plm_base_proxy.c \ - base/plm_base_orted_cmds.c + base/plm_base_launch_support.c \ + base/plm_base_jobid.c \ + base/plm_base_orted_cmds.c dist_ortedata_DATA += base/help-plm-base.txt diff --git a/orte/mca/plm/base/base.h b/orte/mca/plm/base/base.h index 7957eae112..0e98d5fdb9 100644 --- a/orte/mca/plm/base/base.h +++ b/orte/mca/plm/base/base.h @@ -10,6 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2013 Los Alamos National Security, LLC. All rights reserved. + * Copyright (c) 2015 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -63,7 +64,6 @@ ORTE_DECLSPEC void orte_plm_base_mapping_complete(int fd, short args, void *cbda ORTE_DECLSPEC void orte_plm_base_launch_apps(int fd, short args, void *cbdata); ORTE_DECLSPEC void orte_plm_base_post_launch(int fd, short args, void *cbdata); ORTE_DECLSPEC void orte_plm_base_registered(int fd, short args, void *cbdata); -ORTE_DECLSPEC int orte_plm_base_fork_hnp(void); END_C_DECLS diff --git a/orte/mca/plm/base/plm_base_proxy.c b/orte/mca/plm/base/plm_base_proxy.c deleted file mode 100644 index e70e667d6e..0000000000 --- a/orte/mca/plm/base/plm_base_proxy.c +++ /dev/null @@ -1,319 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2011-2012 Los Alamos National Security, LLC. - * All rights reserved. - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. - * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - */ - -#include "orte_config.h" -#include "orte/constants.h" - -#include "opal/dss/dss.h" -#include "opal/util/path.h" -#include "opal/mca/installdirs/installdirs.h" -#include "opal/mca/pmix/base/base.h" -#include "opal/util/argv.h" - -#include "orte/util/name_fns.h" -#include "orte/util/show_help.h" -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/oob/base/base.h" -#include "orte/mca/rml/rml.h" -#include "orte/mca/rml/rml_types.h" -#include "orte/mca/rml/base/rml_contact.h" -#include "orte/mca/routed/routed.h" -#include "orte/mca/state/state.h" -#include "orte/orted/pmix/pmix_server.h" -#include "orte/runtime/orte_globals.h" -#include "orte/runtime/orte_wait.h" - -#include "orte/mca/plm/base/base.h" -#include "orte/mca/plm/base/plm_private.h" - -#define ORTE_URI_MSG_LGTH 256 - -static void set_handler_default(int sig) -{ - struct sigaction act; - - act.sa_handler = SIG_DFL; - act.sa_flags = 0; - sigemptyset(&act.sa_mask); - - sigaction(sig, &act, (struct sigaction *)0); -} - -int orte_plm_base_fork_hnp(void) -{ - int p[2], death_pipe[2]; - char *cmd; - char **argv = NULL; - int argc; - char *param, *cptr, *pmix_uri; - sigset_t sigs; - int buffer_length, num_chars_read, chunk; - char *orted_uri; - int rc; - orte_jobid_t jobid; - - /* if we don't have any active OOB modules, then abort */ - if (0 == opal_list_get_size(&orte_oob_base.actives)) { - orte_show_help("help-plm-base.txt", "no-oob", true); - ORTE_FORCED_TERMINATE(ORTE_ERR_SILENT); - return ORTE_ERR_SILENT; - } - - /* A pipe is used to communicate between the parent and child to - indicate whether the exec ultimately succeeded or failed. The - child sets the pipe to be close-on-exec; the child only ever - writes anything to the pipe if there is an error (e.g., - executable not found, exec() fails, etc.). The parent does a - blocking read on the pipe; if the pipe closed with no data, - then the exec() succeeded. 
If the parent reads something from - the pipe, then the child was letting us know that it failed. - */ - if (pipe(p) < 0) { - ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_PIPES); - return ORTE_ERR_SYS_LIMITS_PIPES; - } - - /* we also have to give the HNP a pipe it can watch to know when - * we terminated. Since the HNP is going to be a child of us, it - * can't just use waitpid to see when we leave - so it will watch - * the pipe instead - */ - if (pipe(death_pipe) < 0) { - ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_PIPES); - return ORTE_ERR_SYS_LIMITS_PIPES; - } - - /* find the orted binary using the install_dirs support - this also - * checks to ensure that we can see this executable and it *is* executable by us - */ - cmd = opal_path_access("orted", opal_install_dirs.bindir, X_OK); - if (NULL == cmd) { - /* guess we couldn't do it - best to abort */ - ORTE_ERROR_LOG(ORTE_ERR_FILE_NOT_EXECUTABLE); - close(p[0]); - close(p[1]); - return ORTE_ERR_FILE_NOT_EXECUTABLE; - } - - /* okay, setup an appropriate argv */ - opal_argv_append(&argc, &argv, "orted"); - - /* tell the daemon it is to be the HNP */ - opal_argv_append(&argc, &argv, "--hnp"); - - /* tell the daemon to get out of our process group */ - opal_argv_append(&argc, &argv, "--set-sid"); - - /* tell the daemon to report back its uri so we can connect to it */ - opal_argv_append(&argc, &argv, "--report-uri"); - asprintf(&param, "%d", p[1]); - opal_argv_append(&argc, &argv, param); - free(param); - - /* give the daemon a pipe it can watch to tell when we have died */ - opal_argv_append(&argc, &argv, "--singleton-died-pipe"); - asprintf(&param, "%d", death_pipe[0]); - opal_argv_append(&argc, &argv, param); - free(param); - - /* add any debug flags */ - if (orte_debug_flag) { - opal_argv_append(&argc, &argv, "--debug"); - } - - if (orte_debug_daemons_flag) { - opal_argv_append(&argc, &argv, "--debug-daemons"); - } - - if (orte_debug_daemons_file_flag) { - if (!orte_debug_daemons_flag) { - opal_argv_append(&argc, &argv,
"--debug-daemons"); - } - opal_argv_append(&argc, &argv, "--debug-daemons-file"); - } - - /* indicate that it must use the novm state machine */ - opal_argv_append(&argc, &argv, "-"OPAL_MCA_CMD_LINE_ID); - opal_argv_append(&argc, &argv, "state_novm_select"); - opal_argv_append(&argc, &argv, "1"); - - /* pass it a jobid to match my job family */ - opal_argv_append(&argc, &argv, "-"OPAL_MCA_CMD_LINE_ID); - opal_argv_append(&argc, &argv, "ess_base_jobid"); - jobid = ORTE_DAEMON_JOBID(ORTE_PROC_MY_NAME->jobid); - if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&param, jobid))) { - ORTE_ERROR_LOG(rc); - free(cmd); - return rc; - } - opal_argv_append(&argc, &argv, param); - free(param); - - /* Fork off the child */ - orte_process_info.hnp_pid = fork(); - if(orte_process_info.hnp_pid < 0) { - ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN); - close(p[0]); - close(p[1]); - close(death_pipe[0]); - close(death_pipe[1]); - free(cmd); - opal_argv_free(argv); - return ORTE_ERR_SYS_LIMITS_CHILDREN; - } - - if (orte_process_info.hnp_pid == 0) { - close(p[0]); - close(death_pipe[1]); - /* I am the child - exec me */ - - /* Set signal handlers back to the default. Do this close - to the execve() because the event library may (and likely - will) reset them. If we don't do this, the event - library may have left some set that, at least on some - OS's, don't get reset via fork() or exec(). Hence, the - orted could be unkillable (for example). */ - set_handler_default(SIGTERM); - set_handler_default(SIGINT); - set_handler_default(SIGHUP); - set_handler_default(SIGPIPE); - set_handler_default(SIGCHLD); - - /* Unblock all signals, for many of the same reasons that - we set the default handlers, above. This is noticable - on Linux where the event library blocks SIGTERM, but we - don't want that blocked by the orted (or, more - specifically, we don't want it to be blocked by the - orted and then inherited by the ORTE processes that it - forks, making them unkillable by SIGTERM).
*/ - sigprocmask(0, 0, &sigs); - sigprocmask(SIG_UNBLOCK, &sigs, 0); - - execv(cmd, argv); - - /* if I get here, the execv failed! */ - orte_show_help("help-ess-base.txt", "ess-base:execv-error", - true, cmd, strerror(errno)); - exit(1); - - } else { - free(cmd); - /* I am the parent - wait to hear something back and - * report results - */ - close(p[1]); /* parent closes the write - orted will write its contact info to it*/ - close(death_pipe[0]); /* parent closes the death_pipe's read */ - opal_argv_free(argv); - - /* setup the buffer to read the HNP's uri */ - buffer_length = ORTE_URI_MSG_LGTH; - chunk = ORTE_URI_MSG_LGTH-1; - num_chars_read = 0; - orted_uri = (char*)malloc(buffer_length); - - while (chunk == (rc = read(p[0], &orted_uri[num_chars_read], chunk))) { - /* we read an entire buffer - better get more */ - num_chars_read += chunk; - buffer_length += ORTE_URI_MSG_LGTH; - orted_uri = realloc((void*)orted_uri, buffer_length); - } - num_chars_read += rc; - - if (num_chars_read <= 0) { - /* we didn't get anything back - this is bad */ - ORTE_ERROR_LOG(ORTE_ERR_HNP_COULD_NOT_START); - free(orted_uri); - return ORTE_ERR_HNP_COULD_NOT_START; - } - - /* parse the sysinfo from the returned info - must - * start from the end of the string as the uri itself - * can contain brackets */ - if (NULL == (param = strrchr(orted_uri, '['))) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - free(orted_uri); - return ORTE_ERR_COMM_FAILURE; - } - *param = '\0'; /* terminate the uri string */ - ++param; /* point to the start of the sysinfo */ - - /* find the end of the sysinfo */ - if (NULL == (cptr = strchr(param, ']'))) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - free(orted_uri); - return ORTE_ERR_COMM_FAILURE; - } - *cptr = '\0'; /* terminate the sysinfo string */ - ++cptr; /* point to the start of the pmix uri */ - - /* convert the sysinfo string */ - if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_sysinfo(&orte_local_cpu_type, - &orte_local_cpu_model, param))) { - 
ORTE_ERROR_LOG(rc); - free(orted_uri); - return rc; - } - - /* save the daemon uri - we will process it later */ - orte_process_info.my_daemon_uri = strdup(orted_uri); - /* Set the contact info in the RML - this won't actually establish - * the connection, but just tells the RML how to reach the daemon - * if/when we attempt to send to it - */ - orte_rml.set_contact_info(orte_process_info.my_daemon_uri); - if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_daemon_uri, - ORTE_PROC_MY_DAEMON, NULL))) { - ORTE_ERROR_LOG(rc); - free(orted_uri); - return rc; - } - - /* likewise, since this is also the HNP, set that uri too */ - orte_process_info.my_hnp_uri = orted_uri; - orte_rml.set_contact_info(orte_process_info.my_hnp_uri); - if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri, - ORTE_PROC_MY_HNP, NULL))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* push the pmix_uri into our environment - need to protect it */ - (void)asprintf(&pmix_uri, "PMIX_SERVER_URI=%s", cptr); - putenv(pmix_uri); - /* now re-init the pmix framework so we can connect when required */ - if (OPAL_SUCCESS != (rc = opal_pmix.init())) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* now call fence to push our own modex data into the - * newly-launched HNP in case someone else needs it */ - if (OPAL_SUCCESS != (rc = opal_pmix.fence(NULL, 0))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* all done - report success */ - return ORTE_SUCCESS; - } -} diff --git a/orte/orted/pmix/pmix_server_pub.c b/orte/orted/pmix/pmix_server_pub.c index f0a6ca4e36..08701a90e7 100644 --- a/orte/orted/pmix/pmix_server_pub.c +++ b/orte/orted/pmix/pmix_server_pub.c @@ -162,6 +162,10 @@ int pmix_server_publish_fn(opal_process_name_t *proc, 0 == strcmp(iptr->key, OPAL_PMIX_PERSISTENCE)) { continue; } + opal_output_verbose(5, orte_pmix_server_globals.output, + "%s publishing data %s of type %d from source %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), iptr->key, iptr->type, + 
ORTE_NAME_PRINT(proc)); if (OPAL_SUCCESS != (rc = opal_dss.pack(&req->msg, &iptr, 1, OPAL_VALUE))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(req); @@ -398,15 +402,15 @@ void pmix_server_keyval_client(int status, orte_process_name_t* sender, while (OPAL_SUCCESS == opal_dss.unpack(buffer, &source, &cnt, OPAL_NAME)) { pdata = OBJ_NEW(opal_pmix_pdata_t); pdata->proc = source; - opal_output_verbose(5, orte_pmix_server_globals.output, - "%s recvd lookup returned data from source %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&source)); if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &iptr, &cnt, OPAL_VALUE))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(pdata); continue; } + opal_output_verbose(5, orte_pmix_server_globals.output, + "%s recvd lookup returned data %s of type %d from source %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), iptr->key, iptr->type, + ORTE_NAME_PRINT(&source)); if (OPAL_SUCCESS != (rc = opal_value_xfer(&pdata->value, iptr))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(pdata); diff --git a/orte/runtime/orte_data_server.c b/orte/runtime/orte_data_server.c index ee02093f9f..eb5694db74 100644 --- a/orte/runtime/orte_data_server.c +++ b/orte/runtime/orte_data_server.c @@ -158,7 +158,7 @@ void orte_data_server(int status, orte_process_name_t* sender, opal_value_t *iptr, *inext; uint32_t ninfo, i; char **keys = NULL, *str; - bool ret_packed = false, wait = false; + bool ret_packed = false, wait = false, data_added; int room_number; uint32_t uid; opal_pmix_data_range_t range; @@ -229,6 +229,10 @@ void orte_data_server(int status, orte_process_name_t* sender, data->uid = iptr->data.uint32; OBJ_RELEASE(iptr); } else { + OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + "%s data server: adding %s to data from %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), iptr->key, + ORTE_NAME_PRINT(&data->owner))); opal_list_append(&data->values, &iptr->super); } } @@ -271,6 +275,10 @@ void orte_data_server(int status, orte_process_name_t* sender, ORTE_ERROR_LOG(rc); break; } + 
OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + "%s data server: adding %s data from %s to response", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), iptr->key, + ORTE_NAME_PRINT(&data->owner))); if (ORTE_SUCCESS != (rc = opal_dss.pack(reply, &iptr, 1, OPAL_VALUE))) { ORTE_ERROR_LOG(rc); break; @@ -294,9 +302,20 @@ void orte_data_server(int status, orte_process_name_t* sender, opal_list_remove_item(&pending, &req->super); OBJ_RELEASE(req); reply = NULL; + /* if the persistence is "first_read", then delete this data */ + if (OPAL_PMIX_PERSIST_FIRST_READ == data->persistence) { + OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + "%s NOT STORING DATA FROM %s AT INDEX %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&data->owner), data->index)); + opal_pointer_array_set_item(&orte_data_server_store, data->index, NULL); + OBJ_RELEASE(data); + goto release; + } } } + release: /* tell the user it was wonderful... */ ret = ORTE_SUCCESS; if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, &ret, 1, OPAL_INT))) { @@ -367,8 +386,12 @@ void orte_data_server(int status, orte_process_name_t* sender, /* cycle across the provided keys */ ret_packed = false; for (i=0; NULL != keys[i]; i++) { + OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + "%s data server: looking for %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), keys[i])); /* cycle across the stored data, looking for a match */ for (k=0; k < orte_data_server_store.size; k++) { + data_added = false; data = (orte_data_object_t*)opal_pointer_array_get_item(&orte_data_server_store, k); if (NULL == data) { continue; @@ -383,6 +406,10 @@ void orte_data_server(int status, orte_process_name_t* sender, } /* see if we have this key */ OPAL_LIST_FOREACH(iptr, &data->values, opal_value_t) { + OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + "%s COMPARING %s %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + keys[i], iptr->key)); if (0 == strcmp(iptr->key, keys[i])) { /* found it - package it for return */ if (!ret_packed) { @@ -394,11 +421,16 @@ void 
orte_data_server(int status, orte_process_name_t* sender, } ret_packed = true; } + data_added = true; if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, &data->owner, 1, OPAL_NAME))) { ORTE_ERROR_LOG(rc); opal_argv_free(keys); goto SEND_ERROR; } + OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + "%s data server: adding %s to data from %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), iptr->key, + ORTE_NAME_PRINT(&data->owner))); if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, &iptr, 1, OPAL_VALUE))) { ORTE_ERROR_LOG(rc); opal_argv_free(keys); @@ -406,6 +438,14 @@ void orte_data_server(int status, orte_process_name_t* sender, } } } + if (data_added && OPAL_PMIX_PERSIST_FIRST_READ == data->persistence) { + OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + "%s REMOVING DATA FROM %s AT INDEX %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&data->owner), data->index)); + opal_pointer_array_set_item(&orte_data_server_store, data->index, NULL); + OBJ_RELEASE(data); + } } } if (!ret_packed) { @@ -433,6 +473,7 @@ void orte_data_server(int status, orte_process_name_t* sender, opal_argv_free(keys); goto SEND_ERROR; } + opal_argv_free(keys); OPAL_OUTPUT_VERBOSE((1, orte_debug_output, "%s data server:lookup: data found",