From 6d6cebb4a77a0fde1c6fdf4ec57a21f72d1117b6 Mon Sep 17 00:00:00 2001 From: Ralph Castain <rhc@open-mpi.org> Date: Tue, 14 Nov 2006 19:34:59 +0000 Subject: [PATCH] Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things). Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it. I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn). This commit was SVN r12597. 
--- ompi/attribute/attribute_predefined.c | 4 +- ompi/communicator/comm.c | 2 +- ompi/communicator/comm_dyn.c | 60 +- ompi/communicator/communicator.h | 2 +- ompi/mca/btl/mvapi/btl_mvapi_endpoint.c | 4 +- ompi/mca/btl/openib/btl_openib_endpoint.c | 4 +- ompi/mca/btl/tcp/btl_tcp_endpoint.c | 2 +- ompi/mca/btl/udapl/btl_udapl_endpoint.c | 8 +- ompi/mca/coll/sm/coll_sm_module.c | 2 +- ompi/mca/pml/base/pml_base_module_exchange.c | 10 +- ompi/mca/pml/dr/pml_dr.c | 4 +- ompi/proc/proc.c | 26 +- ompi/runtime/ompi_mpi_abort.c | 4 +- ompi/runtime/ompi_mpi_init.c | 2 +- ompi/runtime/ompi_mpi_io.c | 8 +- orte/dss/dss_pack.c | 2 +- orte/include/orte/orte_constants.h | 4 +- orte/include/orte/orte_types.h | 1 + orte/mca/errmgr/base/errmgr_base_receive.c | 4 +- orte/mca/errmgr/hnp/errmgr_hnp.c | 17 +- orte/mca/gpr/proxy/gpr_proxy_component.c | 6 +- .../gpr_replica_deliver_notify_msg_api.c | 1 + .../functional_layer/gpr_replica_cleanup_fn.c | 4 +- .../gpr_replica_messaging_fn.c | 6 +- .../gpr_replica_trig_ops_fn.c | 29 +- orte/mca/gpr/replica/gpr_replica_component.c | 4 +- orte/mca/iof/base/iof_base_close.c | 2 +- orte/mca/iof/base/iof_base_endpoint.c | 11 +- orte/mca/iof/proxy/iof_proxy.c | 18 +- orte/mca/iof/proxy/iof_proxy_component.c | 4 +- orte/mca/iof/proxy/iof_proxy_svc.c | 6 +- orte/mca/iof/svc/iof_svc.c | 20 +- orte/mca/iof/svc/iof_svc_component.c | 4 +- orte/mca/iof/svc/iof_svc_proxy.c | 10 +- orte/mca/iof/svc/iof_svc_pub.c | 12 +- orte/mca/iof/svc/iof_svc_sub.c | 14 +- orte/mca/ns/base/Makefile.am | 9 +- orte/mca/ns/base/base.h | 202 ---- .../ns_data_type_compare_fns.c | 60 +- .../data_type_support/ns_data_type_copy_fns.c | 21 +- .../ns_data_type_packing_fns.c | 43 +- .../ns_data_type_print_fns.c | 36 +- .../ns_data_type_release_fns.c | 2 +- .../data_type_support/ns_data_type_size_fns.c | 6 +- .../ns_data_type_unpacking_fns.c | 27 +- orte/mca/ns/base/ns_base_cell_fns.c | 232 ++++ orte/mca/ns/base/ns_base_diag_fns.c | 98 ++ orte/mca/ns/base/ns_base_general_fns.c | 78 ++ 
orte/mca/ns/base/ns_base_job_fns.c | 169 +++ orte/mca/ns/base/ns_base_local_fns.c | 678 ----------- orte/mca/ns/base/ns_base_open.c | 27 +- orte/mca/ns/base/ns_base_vpid_name_fns.c | 397 ++++++ orte/mca/ns/base/ns_private.h | 285 +++++ orte/mca/ns/ns.h | 372 +++--- orte/mca/ns/ns_types.h | 54 +- orte/mca/ns/proxy/Makefile.am | 33 +- orte/mca/ns/proxy/configure.params | 3 +- orte/mca/ns/proxy/{src => }/ns_proxy.h | 43 +- orte/mca/ns/proxy/ns_proxy_cell_fns.c | 440 +++++++ .../ns/proxy/{src => }/ns_proxy_component.c | 64 +- orte/mca/ns/proxy/ns_proxy_diag_fns.c | 313 +++++ orte/mca/ns/proxy/ns_proxy_general_fns.c | 495 ++++++++ orte/mca/ns/proxy/ns_proxy_job_fns.c | 526 ++++++++ orte/mca/ns/proxy/src/Makefile.extra | 23 - orte/mca/ns/proxy/src/ns_proxy.c | 1071 ----------------- orte/mca/ns/replica/Makefile.am | 36 +- orte/mca/ns/replica/configure.params | 3 +- orte/mca/ns/replica/{src => }/ns_replica.c | 221 ---- orte/mca/ns/replica/{src => }/ns_replica.h | 140 ++- orte/mca/ns/replica/ns_replica_cell_fns.c | 297 +++++ .../ns/replica/ns_replica_class_instances.h | 172 +++ orte/mca/ns/replica/ns_replica_component.c | 309 +++++ orte/mca/ns/replica/ns_replica_diag_fns.c | 322 +++++ orte/mca/ns/replica/ns_replica_general_fns.c | 403 +++++++ orte/mca/ns/replica/ns_replica_job_fns.c | 295 +++++ orte/mca/ns/replica/ns_replica_recv.c | 457 +++++++ orte/mca/ns/replica/ns_replica_support_fns.c | 120 ++ orte/mca/ns/replica/src/Makefile.extra | 24 - .../mca/ns/replica/src/ns_replica_component.c | 702 ----------- orte/mca/odls/bproc/odls_bproc.c | 6 +- orte/mca/odls/default/odls_default_module.c | 17 +- orte/mca/oob/base/base.h | 17 +- orte/mca/oob/base/oob_base_init.c | 3 - orte/mca/oob/base/oob_base_recv.c | 4 +- orte/mca/oob/base/oob_base_recv_nb.c | 6 +- orte/mca/oob/base/oob_base_xcast.c | 2 +- orte/mca/oob/oob.h | 6 +- orte/mca/oob/oob_types.h | 19 - orte/mca/oob/tcp/oob_tcp.c | 27 +- orte/mca/oob/tcp/oob_tcp.h | 6 +- orte/mca/oob/tcp/oob_tcp_addr.c | 7 +- 
orte/mca/oob/tcp/oob_tcp_msg.c | 15 +- orte/mca/oob/tcp/oob_tcp_peer.c | 22 +- orte/mca/oob/tcp/oob_tcp_ping.c | 5 +- orte/mca/oob/tcp/oob_tcp_recv.c | 20 +- orte/mca/oob/tcp/oob_tcp_send.c | 13 +- orte/mca/pls/base/pls_base_dmn_registry_fns.c | 87 +- orte/mca/pls/base/pls_base_receive.c | 43 +- orte/mca/pls/base/pls_private.h | 2 +- orte/mca/pls/bproc/pls_bproc.c | 29 +- orte/mca/pls/bproc/pls_bproc.h | 11 +- orte/mca/pls/bproc/pls_bproc_state.c | 6 +- orte/mca/pls/cnos/pls_cnos.c | 16 +- orte/mca/pls/gridengine/pls_gridengine.h | 6 +- .../pls/gridengine/pls_gridengine_module.c | 12 +- orte/mca/pls/pls.h | 6 +- orte/mca/pls/poe/pls_poe_module.c | 12 +- orte/mca/pls/proxy/pls_proxy.c | 24 +- orte/mca/pls/proxy/pls_proxy.h | 6 +- orte/mca/pls/rsh/pls_rsh.h | 7 +- orte/mca/pls/rsh/pls_rsh_component.c | 5 + orte/mca/pls/rsh/pls_rsh_module.c | 17 +- orte/mca/pls/slurm/pls_slurm_module.c | 16 +- orte/mca/pls/tm/pls_tm_module.c | 18 +- orte/mca/ras/base/ras_base_receive.c | 4 +- orte/mca/rds/base/rds_base_receive.c | 4 +- orte/mca/rmaps/base/rmaps_base_receive.c | 4 +- .../rmgr_data_type_packing_fns.c | 9 +- orte/mca/rmgr/base/rmgr_base_attribute_fns.c | 20 + orte/mca/rmgr/base/rmgr_base_receive.c | 12 +- orte/mca/rmgr/base/rmgr_base_stage_gate.c | 24 +- orte/mca/rmgr/proxy/rmgr_proxy.c | 12 +- orte/mca/rmgr/rmgr.h | 2 +- orte/mca/rmgr/urm/rmgr_urm.c | 11 +- orte/mca/rml/base/rml_base_open.c | 2 - orte/mca/rml/rml.h | 8 - orte/mca/rml/rml_types.h | 15 - orte/mca/schema/schema_types.h | 10 + orte/mca/sds/bproc/sds_bproc_module.c | 2 +- orte/mca/sds/env/sds_env_module.c | 4 +- orte/mca/sds/pipe/sds_pipe_module.c | 4 +- orte/mca/sds/singleton/sds_singleton_module.c | 5 +- orte/mca/sds/slurm/sds_slurm_module.c | 2 +- orte/mca/smr/base/smr_base_get_proc_state.c | 5 +- orte/mca/smr/base/smr_base_set_proc_state.c | 12 +- orte/runtime/orte_init_stage1.c | 12 +- orte/runtime/orte_restart.c | 6 +- orte/runtime/orte_setup_hnp.c | 4 +- orte/test/unit/ns/Makefile | 12 + 
orte/test/unit/ns/ns_peers.c | 204 ++++ orte/test/unit/ns/ns_string_fns.c | 157 +++ orte/tools/orted/orted.c | 8 +- orte/tools/orteprobe/orteprobe.c | 2 +- orte/tools/orterun/orterun.c | 33 +- 144 files changed, 6909 insertions(+), 3847 deletions(-) create mode 100644 orte/mca/ns/base/ns_base_cell_fns.c create mode 100644 orte/mca/ns/base/ns_base_diag_fns.c create mode 100644 orte/mca/ns/base/ns_base_general_fns.c create mode 100644 orte/mca/ns/base/ns_base_job_fns.c delete mode 100644 orte/mca/ns/base/ns_base_local_fns.c create mode 100644 orte/mca/ns/base/ns_base_vpid_name_fns.c create mode 100644 orte/mca/ns/base/ns_private.h rename orte/mca/ns/proxy/{src => }/ns_proxy.h (75%) create mode 100644 orte/mca/ns/proxy/ns_proxy_cell_fns.c rename orte/mca/ns/proxy/{src => }/ns_proxy_component.c (84%) create mode 100644 orte/mca/ns/proxy/ns_proxy_diag_fns.c create mode 100644 orte/mca/ns/proxy/ns_proxy_general_fns.c create mode 100644 orte/mca/ns/proxy/ns_proxy_job_fns.c delete mode 100644 orte/mca/ns/proxy/src/Makefile.extra delete mode 100644 orte/mca/ns/proxy/src/ns_proxy.c rename orte/mca/ns/replica/{src => }/ns_replica.c (67%) rename orte/mca/ns/replica/{src => }/ns_replica.h (50%) create mode 100644 orte/mca/ns/replica/ns_replica_cell_fns.c create mode 100644 orte/mca/ns/replica/ns_replica_class_instances.h create mode 100644 orte/mca/ns/replica/ns_replica_component.c create mode 100644 orte/mca/ns/replica/ns_replica_diag_fns.c create mode 100644 orte/mca/ns/replica/ns_replica_general_fns.c create mode 100644 orte/mca/ns/replica/ns_replica_job_fns.c create mode 100644 orte/mca/ns/replica/ns_replica_recv.c create mode 100644 orte/mca/ns/replica/ns_replica_support_fns.c delete mode 100644 orte/mca/ns/replica/src/Makefile.extra delete mode 100644 orte/mca/ns/replica/src/ns_replica_component.c create mode 100644 orte/test/unit/ns/Makefile create mode 100644 orte/test/unit/ns/ns_peers.c create mode 100644 orte/test/unit/ns/ns_string_fns.c diff --git 
a/ompi/attribute/attribute_predefined.c b/ompi/attribute/attribute_predefined.c index 23226db6c5..fb1134ff75 100644 --- a/ompi/attribute/attribute_predefined.c +++ b/ompi/attribute/attribute_predefined.c @@ -307,9 +307,7 @@ void ompi_attr_create_predefined_callback( /* Set some default values */ - if (ORTE_SUCCESS != orte_ns.get_jobid(&job, orte_process_info.my_name)) { - return; - } + job = ORTE_PROC_MY_NAME->jobid; /* Query the gpr to find out how many CPUs there will be. This will only return a non-empty list in a persistent diff --git a/ompi/communicator/comm.c b/ompi/communicator/comm.c index 9febed8bab..ded4517b36 100644 --- a/ompi/communicator/comm.c +++ b/ompi/communicator/comm.c @@ -1040,7 +1040,7 @@ int ompi_comm_determine_first ( ompi_communicator_t *intercomm, int high ) theirproc = intercomm->c_remote_group->grp_proc_pointers[0]; mask = ORTE_NS_CMP_CELLID | ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID; - rc = orte_ns.compare (mask, &(ourproc->proc_name), &(theirproc->proc_name)); + rc = orte_ns.compare_fields(mask, &(ourproc->proc_name), &(theirproc->proc_name)); if ( 0 > rc ) { flag = true; } diff --git a/ompi/communicator/comm_dyn.c b/ompi/communicator/comm_dyn.c index 8118a76c58..c49d2496d5 100644 --- a/ompi/communicator/comm_dyn.c +++ b/ompi/communicator/comm_dyn.c @@ -312,7 +312,7 @@ orte_process_name_t *ompi_comm_get_rport (orte_process_name_t *port, int send_fi if (NULL == rbuf) { return NULL; } - if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer(ORTE_RML_NAME_ANY, rbuf, tag))) { + if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer(ORTE_NAME_WILDCARD, rbuf, tag))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(rbuf); return NULL; @@ -565,6 +565,16 @@ ompi_comm_start_processes(int count, char **array_of_commands, /* cleanup */ if (NULL != base_prefix) free(base_prefix); + /* tell the RTE that we want to be a child of this process' job */ + if (ORTE_SUCCESS != (rc = orte_rmgr.add_attribute(&attributes, ORTE_NS_USE_PARENT, + ORTE_JOBID, &(orte_process_info.my_name->jobid), + 
ORTE_RMGR_ATTR_OVERRIDE))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&attributes); + opal_progress_event_decrement(); + return MPI_ERR_SPAWN; + } + /* tell the RTE that we want to the children to run inside of our allocation - * don't go get one just for them */ @@ -857,7 +867,7 @@ void ompi_comm_disconnect_waitall (int count, ompi_comm_disconnect_obj **objs) #define OMPI_COMM_MAXJOBIDS 64 void ompi_comm_mark_dyncomm (ompi_communicator_t *comm) { - int i, j, numjobids=0, rc; + int i, j, numjobids=0; int size, rsize; int found; orte_jobid_t jobids[OMPI_COMM_MAXJOBIDS], thisjobid; @@ -875,40 +885,34 @@ void ompi_comm_mark_dyncomm (ompi_communicator_t *comm) of different jobids. */ grp = comm->c_local_group; for (i=0; i< size; i++) { - if (ORTE_SUCCESS != (rc = orte_ns.get_jobid(&thisjobid, &(grp->grp_proc_pointers[i]->proc_name)))) { - ORTE_ERROR_LOG(rc); - return; - } - found = 0; - for ( j=0; j<numjobids; j++) { - if ( thisjobid == jobids[j]) { - found = 1; - break; + thisjobid = grp->grp_proc_pointers[i]->proc_name.jobid; + found = 0; + for ( j=0; j<numjobids; j++) { + if (thisjobid == jobids[j]) { + found = 1; + break; + } + } + if (!found ) { + jobids[numjobids++] = thisjobid; } - } - if (!found ) { - jobids[numjobids++] = thisjobid; - } } /* if inter-comm, loop over all processes in remote_group and count number of different jobids */ grp = comm->c_remote_group; for (i=0; i< rsize; i++) { - if (ORTE_SUCCESS != (rc = orte_ns.get_jobid(&thisjobid, &(grp->grp_proc_pointers[i]->proc_name)))) { - ORTE_ERROR_LOG(rc); - return; - } - found = 0; - for ( j=0; j<numjobids; j++) { - if ( thisjobid == jobids[j]) { - found = 1; - break; + thisjobid = grp->grp_proc_pointers[i]->proc_name.jobid; + found = 0; + for ( j=0; j<numjobids; j++) { + if ( thisjobid == jobids[j]) { + found = 1; + break; + } + } + if (!found ) { + jobids[numjobids++] = thisjobid; } - } - if (!found ) { - jobids[numjobids++] = thisjobid; - } } /* if number of joibds larger than one, set the disconnect 
flag*/ diff --git a/ompi/communicator/communicator.h b/ompi/communicator/communicator.h index dc709b8826..607d00cba5 100644 --- a/ompi/communicator/communicator.h +++ b/ompi/communicator/communicator.h @@ -27,7 +27,7 @@ #include "mpi.h" #include "ompi/group/group.h" #include "ompi/mca/coll/coll.h" -#include "orte/mca/oob/oob_types.h" +#include "orte/mca/rml/rml_types.h" #include "ompi/proc/proc.h" #if defined(c_plusplus) || defined(__cplusplus) diff --git a/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c b/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c index 4f069b28a1..ca65ddb55b 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c +++ b/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c @@ -579,7 +579,7 @@ static void mca_btl_mvapi_endpoint_recv( opal_list_get_end(&mca_btl_mvapi_component.ib_procs); ib_proc = (mca_btl_mvapi_proc_t*)opal_list_get_next(ib_proc)) { - if(orte_ns.compare(ORTE_NS_CMP_ALL, &ib_proc->proc_guid, endpoint) == 0) { + if(orte_ns.compare_fields(ORTE_NS_CMP_ALL, &ib_proc->proc_guid, endpoint) == ORTE_EQUAL) { bool found = false; /* Try to get the endpoint instance of this proc */ @@ -690,7 +690,7 @@ static void mca_btl_mvapi_endpoint_recv( void mca_btl_mvapi_post_recv() { orte_rml.recv_buffer_nb( - ORTE_RML_NAME_ANY, + ORTE_NAME_WILDCARD, ORTE_RML_TAG_DYNAMIC-1, ORTE_RML_PERSISTENT, mca_btl_mvapi_endpoint_recv, diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.c b/ompi/mca/btl/openib/btl_openib_endpoint.c index cb317a9f42..49a46f2cd5 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.c +++ b/ompi/mca/btl/openib/btl_openib_endpoint.c @@ -662,7 +662,7 @@ static void mca_btl_openib_endpoint_recv( opal_list_get_end(&mca_btl_openib_component.ib_procs); ib_proc = (mca_btl_openib_proc_t*)opal_list_get_next(ib_proc)) { - if(orte_ns.compare(ORTE_NS_CMP_ALL, &ib_proc->proc_guid, endpoint) == 0) { + if(orte_ns.compare_fields(ORTE_NS_CMP_ALL, &ib_proc->proc_guid, endpoint) == ORTE_EQUAL) { bool found = false; /* Try to get the endpoint instance of this proc */ @@ -776,7 
+776,7 @@ void mca_btl_openib_post_recv() { orte_rml.recv_buffer_nb( - ORTE_RML_NAME_ANY, + ORTE_NAME_WILDCARD, ORTE_RML_TAG_DYNAMIC-1, ORTE_RML_PERSISTENT, mca_btl_openib_endpoint_recv, diff --git a/ompi/mca/btl/tcp/btl_tcp_endpoint.c b/ompi/mca/btl/tcp/btl_tcp_endpoint.c index 1a5c38267e..d7ca834001 100644 --- a/ompi/mca/btl/tcp/btl_tcp_endpoint.c +++ b/ompi/mca/btl/tcp/btl_tcp_endpoint.c @@ -307,7 +307,7 @@ bool mca_btl_tcp_endpoint_accept(mca_btl_base_endpoint_t* btl_endpoint, struct s if((btl_addr = btl_endpoint->endpoint_addr) != NULL && btl_addr->addr_inet.s_addr == addr->sin_addr.s_addr) { mca_btl_tcp_proc_t *endpoint_proc = btl_endpoint->endpoint_proc; - cmpval = orte_ns.compare(mask, + cmpval = orte_ns.compare_fields(mask, &endpoint_proc->proc_ompi->proc_name, &this_proc->proc_ompi->proc_name); if((btl_endpoint->endpoint_sd < 0) || diff --git a/ompi/mca/btl/udapl/btl_udapl_endpoint.c b/ompi/mca/btl/udapl/btl_udapl_endpoint.c index f996fe9d79..a4cac125e2 100644 --- a/ompi/mca/btl/udapl/btl_udapl_endpoint.c +++ b/ompi/mca/btl/udapl/btl_udapl_endpoint.c @@ -207,7 +207,7 @@ void mca_btl_udapl_endpoint_recv(int status, orte_process_name_t* endpoint, opal_list_get_end(&mca_btl_udapl_component.udapl_procs); proc = (mca_btl_udapl_proc_t*)opal_list_get_next(proc)) { - if(0 == orte_ns.compare(ORTE_NS_CMP_ALL, &proc->proc_guid, endpoint)) { + if(ORTE_EQUAL == orte_ns.compare_fields(ORTE_NS_CMP_ALL, &proc->proc_guid, endpoint)) { for(i = 0; i < proc->proc_endpoint_count; i++) { ep = proc->proc_endpoints[i]; @@ -231,7 +231,7 @@ void mca_btl_udapl_endpoint_recv(int status, orte_process_name_t* endpoint, void mca_btl_udapl_endpoint_post_oob_recv(void) { - orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, ORTE_RML_TAG_DYNAMIC-1, + orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DYNAMIC-1, ORTE_RML_PERSISTENT, mca_btl_udapl_endpoint_recv, NULL); } @@ -246,7 +246,7 @@ void mca_btl_udapl_endpoint_connect(mca_btl_udapl_endpoint_t* endpoint) /* Nasty test to prevent 
deadlock and unwanted connection attempts */ /* This right here is the whole point of using the ORTE/RML handshake */ if((MCA_BTL_UDAPL_CONN_EAGER == endpoint->endpoint_state && - 0 > orte_ns.compare(ORTE_NS_CMP_ALL, + 0 > orte_ns.compare_fields(ORTE_NS_CMP_ALL, &endpoint->endpoint_proc->proc_guid, &ompi_proc_local()->proc_name)) || (MCA_BTL_UDAPL_CLOSED != endpoint->endpoint_state && @@ -370,7 +370,7 @@ static int mca_btl_udapl_endpoint_finish_eager( OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); /* Only one side does dat_ep_connect() */ - if(0 < orte_ns.compare(ORTE_NS_CMP_ALL, + if(0 < orte_ns.compare_fields(ORTE_NS_CMP_ALL, &endpoint->endpoint_proc->proc_guid, &ompi_proc_local()->proc_name)) { diff --git a/ompi/mca/coll/sm/coll_sm_module.c b/ompi/mca/coll/sm/coll_sm_module.c index bbbb955a90..1a6134de57 100644 --- a/ompi/mca/coll/sm/coll_sm_module.c +++ b/ompi/mca/coll/sm/coll_sm_module.c @@ -626,7 +626,7 @@ static int bootstrap_comm(ompi_communicator_t *comm) empty_index = -1; for (i = 0; i < mca_coll_sm_component.sm_bootstrap_num_segments; ++i) { if (comm->c_contextid == bshe->smbhe_keys[i].mcsbck_cid && - 0 == orte_ns.compare(ORTE_NS_CMP_ALL, + ORTE_EQUAL == orte_ns.compare_fields(ORTE_NS_CMP_ALL, rank0, &bshe->smbhe_keys[i].mcsbck_rank0_name)) { found = true; diff --git a/ompi/mca/pml/base/pml_base_module_exchange.c b/ompi/mca/pml/base/pml_base_module_exchange.c index 48e42aa53f..f527e164cc 100644 --- a/ompi/mca/pml/base/pml_base_module_exchange.c +++ b/ompi/mca/pml/base/pml_base_module_exchange.c @@ -426,10 +426,7 @@ static int mca_pml_base_modex_subscribe(orte_process_name_t* name) OPAL_UNLOCK(&mca_pml_base_modex_lock); /* otherwise - subscribe to get this jobid's contact info */ - if (ORTE_SUCCESS != (rc = orte_ns.get_jobid(&jobid, name))) { - ORTE_ERROR_LOG(rc); - return rc; - } + jobid = name->jobid; if (ORTE_SUCCESS != (rc = orte_schema.get_std_subscription_name(&sub_name, OMPI_MODEX_SUBSCRIPTION, jobid))) { @@ -520,10 +517,7 @@ int 
mca_pml_base_modex_send( orte_byte_object_t bo; orte_data_value_t value = ORTE_DATA_VALUE_EMPTY; - if (ORTE_SUCCESS != (rc = orte_ns.get_jobid(&jobid, orte_process_info.my_name))) { - ORTE_ERROR_LOG(rc); - return rc; - } + jobid = ORTE_PROC_MY_NAME->jobid; if (ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, jobid))) { ORTE_ERROR_LOG(rc); diff --git a/ompi/mca/pml/dr/pml_dr.c b/ompi/mca/pml/dr/pml_dr.c index d3a69a575a..5081d10692 100644 --- a/ompi/mca/pml/dr/pml_dr.c +++ b/ompi/mca/pml/dr/pml_dr.c @@ -172,9 +172,9 @@ int mca_pml_dr_add_procs(ompi_proc_t** procs, size_t nprocs) processes, but will work for initial job start */ idx = ompi_pointer_array_add(&mca_pml_dr.endpoints, (void*) endpoint); - if(orte_ns.compare(ORTE_NS_CMP_ALL, + if(orte_ns.compare_fields(ORTE_NS_CMP_ALL, orte_process_info.my_name, - &(endpoint->proc_ompi->proc_name)) == 0) { + &(endpoint->proc_ompi->proc_name)) == ORTE_EQUAL) { mca_pml_dr.my_rank = idx; } endpoint->local = endpoint->dst = idx; diff --git a/ompi/proc/proc.c b/ompi/proc/proc.c index 66d3139063..dba746c458 100644 --- a/ompi/proc/proc.c +++ b/ompi/proc/proc.c @@ -104,7 +104,7 @@ void ompi_proc_destruct(ompi_proc_t* proc) int ompi_proc_init(void) { orte_process_name_t *peers; - orte_std_cntr_t i, npeers, self, num_tokens; + orte_std_cntr_t i, npeers, num_tokens; orte_jobid_t jobid; char *segment, **tokens; orte_data_value_t value = { {OBJ_CLASS(orte_data_value_t),0}, ORTE_NULL, NULL}; @@ -115,7 +115,7 @@ int ompi_proc_init(void) OBJ_CONSTRUCT(&ompi_proc_lock, opal_mutex_t); /* get all peers in this job */ - if(ORTE_SUCCESS != (rc = orte_ns.get_peers(&peers, &npeers, &self))) { + if(ORTE_SUCCESS != (rc = orte_ns.get_peers(&peers, &npeers, NULL))) { opal_output(0, "ompi_proc_init: get_peers failed with errno=%d", rc); return rc; } @@ -124,7 +124,7 @@ int ompi_proc_init(void) for( i = 0; i < npeers; i++ ) { ompi_proc_t *proc = OBJ_NEW(ompi_proc_t); proc->proc_name = peers[i]; - if( i == self ) { + if( i == 
ORTE_PROC_MY_NAME->vpid ) { ompi_proc_local_proc = proc; proc->proc_flags |= OMPI_PROC_FLAG_LOCAL; } @@ -150,10 +150,7 @@ int ompi_proc_init(void) return rc; } - if (ORTE_SUCCESS != (rc = orte_ns.get_jobid(&jobid, orte_process_info.my_name))) { - ORTE_ERROR_LOG(rc); - return rc; - } + jobid = ORTE_PROC_MY_NAME->jobid; /* find the job segment on the registry */ if (ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, jobid))) { @@ -223,7 +220,7 @@ ompi_proc_t** ompi_proc_world(size_t *size) for (proc = (ompi_proc_t*)opal_list_get_first(&ompi_proc_list); proc != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list); proc = (ompi_proc_t*)opal_list_get_next(proc)) { - if (0 == orte_ns.compare(mask, &proc->proc_name, &my_name)) { + if (ORTE_EQUAL == orte_ns.compare_fields(mask, &proc->proc_name, &my_name)) { ++count; } } @@ -239,7 +236,7 @@ ompi_proc_t** ompi_proc_world(size_t *size) for (proc = (ompi_proc_t*)opal_list_get_first(&ompi_proc_list); proc != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list); proc = (ompi_proc_t*)opal_list_get_next(proc)) { - if (0 == orte_ns.compare(mask, &proc->proc_name, &my_name)) { + if (ORTE_EQUAL == orte_ns.compare_fields(mask, &proc->proc_name, &my_name)) { procs[count++] = proc; } } @@ -298,7 +295,7 @@ ompi_proc_t * ompi_proc_find ( const orte_process_name_t * name ) for(proc = (ompi_proc_t*)opal_list_get_first(&ompi_proc_list); proc != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list); proc = (ompi_proc_t*)opal_list_get_next(proc)) { - if (0 == orte_ns.compare(mask, &proc->proc_name, name)) { + if (ORTE_EQUAL == orte_ns.compare_fields(mask, &proc->proc_name, name)) { rproc = proc; break; } @@ -319,7 +316,7 @@ ompi_proc_t * ompi_proc_find_and_add ( const orte_process_name_t * name, bool* i for(proc = (ompi_proc_t*)opal_list_get_first(&ompi_proc_list); proc != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list); proc = (ompi_proc_t*)opal_list_get_next(proc)) { - if (0 == orte_ns.compare(mask, &proc->proc_name, name)) { + if 
(ORTE_EQUAL == orte_ns.compare_fields(mask, &proc->proc_name, name)) { *isnew = false; rproc = proc; break; @@ -395,10 +392,7 @@ static int setup_registry_callback(void) orte_gpr_subscription_id_t id; orte_jobid_t jobid; - if (ORTE_SUCCESS != (rc = orte_ns.get_jobid(&jobid, &local->proc_name))) { - ORTE_ERROR_LOG(rc); - return rc; - } + jobid = local->proc_name.jobid; /* find the job segment on the registry */ if (ORTE_SUCCESS != @@ -534,7 +528,7 @@ static void callback(orte_gpr_notify_data_t *data, void *cbdata) /* find the associated proc entry and update its arch flag. If the nodename of this info is my local host, also set the LOCAL flag. */ - if (0 == orte_ns.compare(mask, &name, &proc->proc_name)) { + if (ORTE_EQUAL == orte_ns.compare_fields(mask, &name, &proc->proc_name)) { proc->proc_arch = arch; if (0 == strcmp(str, orte_system_info.nodename)) { proc->proc_flags |= OMPI_PROC_FLAG_LOCAL; diff --git a/ompi/runtime/ompi_mpi_abort.c b/ompi/runtime/ompi_mpi_abort.c index a8f872fef6..d3a8da6aae 100644 --- a/ompi/runtime/ompi_mpi_abort.c +++ b/ompi/runtime/ompi_mpi_abort.c @@ -133,7 +133,7 @@ ompi_mpi_abort(struct ompi_communicator_t* comm, /* put all the local procs in the abort list */ for (i = 0 ; i < ompi_comm_size(comm) ; ++i) { - if (0 != orte_ns.compare(ORTE_NS_CMP_ALL, + if (ORTE_EQUAL != orte_ns.compare_fields(ORTE_NS_CMP_ALL, &comm->c_local_group->grp_proc_pointers[i]->proc_name, orte_process_info.my_name)) { assert(count <= nabort_procs); @@ -147,7 +147,7 @@ ompi_mpi_abort(struct ompi_communicator_t* comm, /* if requested, kill off remote procs too */ if (kill_remote_of_intercomm) { for (i = 0 ; i < ompi_comm_remote_size(comm) ; ++i) { - if (0 != orte_ns.compare(ORTE_NS_CMP_ALL, + if (ORTE_EQUAL != orte_ns.compare_fields(ORTE_NS_CMP_ALL, &comm->c_remote_group->grp_proc_pointers[i]->proc_name, orte_process_info.my_name)) { assert(count <= nabort_procs); diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c index 11a15bfae2..c616c45e3a 
100644 --- a/ompi/runtime/ompi_mpi_init.c +++ b/ompi/runtime/ompi_mpi_init.c @@ -294,7 +294,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) } if (!set) { char *vpid; - orte_ns_base_get_vpid_string(&vpid, orte_process_info.my_name); + orte_ns.get_vpid_string(&vpid, orte_process_info.my_name); opal_show_help("help-mpi-runtime", "mpi_init:startup:paffinity-unavailable", true, vpid); diff --git a/ompi/runtime/ompi_mpi_io.c b/ompi/runtime/ompi_mpi_io.c index cfba440bb4..15e6e928f7 100644 --- a/ompi/runtime/ompi_mpi_io.c +++ b/ompi/runtime/ompi_mpi_io.c @@ -17,7 +17,7 @@ */ #include "ompi_config.h" -#include "orte/mca/oob/oob.h" +#include "orte/mca/ns/ns_types.h" #include "orte/mca/iof/iof.h" #include "ompi/constants.h" @@ -40,7 +40,7 @@ int ompi_mpi_init_io(void) close(fds[0]); rc = mca_iof.iof_publish( - MCA_OOB_NAME_SELF, + OMPI_PROC_MY_NAME, MCA_IOF_SINK, MCA_IOF_STDIN, fds[1]); @@ -56,7 +56,7 @@ int ompi_mpi_init_io(void) close(fds[1]); rc = mca_iof.iof_publish( - MCA_OOB_NAME_SELF, + OMPI_PROC_MY_NAME, MCA_IOF_SOURCE, MCA_IOF_STDOUT, fds[0]); @@ -72,7 +72,7 @@ int ompi_mpi_init_io(void) close(fds[1]); rc = mca_iof.iof_publish( - MCA_OOB_NAME_SELF, + OMPI_PROC_MY_NAME, MCA_IOF_SOURCE, MCA_IOF_STDERR, fds[0]); diff --git a/orte/dss/dss_pack.c b/orte/dss/dss_pack.c index 13c8fb3284..aa5d0784fd 100644 --- a/orte/dss/dss_pack.c +++ b/orte/dss/dss_pack.c @@ -36,7 +36,7 @@ int orte_dss_pack(orte_buffer_t *buffer, void *src, orte_std_cntr_t num_vals, int rc; /* check for error */ - if (NULL == buffer || NULL == src) { + if (NULL == buffer) { ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return ORTE_ERR_BAD_PARAM; } diff --git a/orte/include/orte/orte_constants.h b/orte/include/orte/orte_constants.h index 4c5e1fd6b8..9d03b3d642 100644 --- a/orte/include/orte/orte_constants.h +++ b/orte/include/orte/orte_constants.h @@ -29,8 +29,8 @@ extern "C" { #define ORTE_ERR_BASE OPAL_ERR_MAX /* define the results values for comparisons so we can change them in only 
one place */ -#define ORTE_VALUE1_GREATER -1 -#define ORTE_VALUE2_GREATER +1 +#define ORTE_VALUE1_GREATER +1 +#define ORTE_VALUE2_GREATER -1 #define ORTE_EQUAL 0 enum { diff --git a/orte/include/orte/orte_types.h b/orte/include/orte/orte_types.h index 133f8f4413..f816090924 100644 --- a/orte/include/orte/orte_types.h +++ b/orte/include/orte/orte_types.h @@ -38,6 +38,7 @@ typedef uint8_t orte_data_type_t; /** data type indicators used in ORTE */ typedef int32_t orte_std_cntr_t; /** standard counters used in ORTE */ #define ORTE_STD_CNTR_T ORTE_INT32 #define ORTE_STD_CNTR_MAX INT32_MAX +#define ORTE_STD_CNTR_MIN INT32_MIN #define ORTE_STD_CNTR_INVALID -1 /* define a structure to hold generic byte objects */ diff --git a/orte/mca/errmgr/base/errmgr_base_receive.c b/orte/mca/errmgr/base/errmgr_base_receive.c index b972ff9054..4800c486e1 100644 --- a/orte/mca/errmgr/base/errmgr_base_receive.c +++ b/orte/mca/errmgr/base/errmgr_base_receive.c @@ -50,7 +50,7 @@ int orte_errmgr_base_comm_start(void) return ORTE_SUCCESS; } - if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, + if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ERRMGR, ORTE_RML_PERSISTENT, orte_errmgr_base_recv, @@ -70,7 +70,7 @@ int orte_errmgr_base_comm_stop(void) return ORTE_SUCCESS; } - if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_ERRMGR))) { + if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ERRMGR))) { ORTE_ERROR_LOG(rc); } recv_issued = false; diff --git a/orte/mca/errmgr/hnp/errmgr_hnp.c b/orte/mca/errmgr/hnp/errmgr_hnp.c index 1ee8e85d5e..ab422a05f2 100644 --- a/orte/mca/errmgr/hnp/errmgr_hnp.c +++ b/orte/mca/errmgr/hnp/errmgr_hnp.c @@ -23,6 +23,7 @@ #include <stdlib.h> #include <stdarg.h> +#include "opal/class/opal_list.h" #include "opal/util/trace.h" #include "opal/util/output.h" @@ -59,6 +60,8 @@ int orte_errmgr_hnp_proc_aborted(orte_gpr_notify_message_t *msg) NULL }; 
orte_data_value_t dval = ORTE_DATA_VALUE_EMPTY; + opal_list_t attrs; + opal_list_item_t *item; int rc; OPAL_TRACE(1); @@ -79,11 +82,15 @@ int orte_errmgr_hnp_proc_aborted(orte_gpr_notify_message_t *msg) return rc; } - /* tell the pls to terminate the job */ - if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job))) { + /* tell the pls to terminate the job AND ALL ITS DESCENDANTS */ + OBJ_CONSTRUCT(&attrs, opal_list_t); + orte_rmgr.add_attribute(&attrs, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE); + if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job, &attrs))) { ORTE_ERROR_LOG(rc); return rc; } + while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item); + OBJ_DESTRUCT(&attrs); /* orterun will only wakeup when all procs report terminated. The terminate_job * function *should* have done that - however, it is possible during abnormal @@ -142,8 +149,10 @@ int orte_errmgr_hnp_incomplete_start(orte_gpr_notify_message_t *msg) return rc; } - /* tell the pls to terminate the job */ - if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job))) { + /* tell the pls to terminate the job - just kill this job, not any descendants since + * the job is just trying to start + */ + if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job, NULL))) { ORTE_ERROR_LOG(rc); } diff --git a/orte/mca/gpr/proxy/gpr_proxy_component.c b/orte/mca/gpr/proxy/gpr_proxy_component.c index 5b187d2d03..7746a4dad6 100644 --- a/orte/mca/gpr/proxy/gpr_proxy_component.c +++ b/orte/mca/gpr/proxy/gpr_proxy_component.c @@ -233,7 +233,7 @@ orte_gpr_proxy_component_init(bool *allow_multi_user_threads, bool *have_hidden_ ORTE_ERROR_LOG(ret); return NULL; } - if(ORTE_SUCCESS != (ret = orte_ns.copy_process_name(&orte_process_info.gpr_replica, &name))) { + if(ORTE_SUCCESS != (ret = orte_dss.copy((void**)&orte_process_info.gpr_replica, &name, ORTE_NAME))) { ORTE_ERROR_LOG(ret); return NULL; } @@ -299,7 +299,7 @@ int orte_gpr_proxy_module_init(void) { /* issue the non-blocking 
receive */ int rc; - rc = orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, ORTE_RML_TAG_GPR_NOTIFY, ORTE_RML_PERSISTENT, orte_gpr_proxy_notify_recv, NULL); + rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_GPR_NOTIFY, ORTE_RML_PERSISTENT, orte_gpr_proxy_notify_recv, NULL); if(rc < 0) { ORTE_ERROR_LOG(rc); return rc; @@ -359,7 +359,7 @@ int orte_gpr_proxy_finalize(void) } /* All done */ - orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_GPR_NOTIFY); + orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_GPR_NOTIFY); return ORTE_SUCCESS; } diff --git a/orte/mca/gpr/replica/api_layer/gpr_replica_deliver_notify_msg_api.c b/orte/mca/gpr/replica/api_layer/gpr_replica_deliver_notify_msg_api.c index 9c783d8f4d..61d8bfff75 100755 --- a/orte/mca/gpr/replica/api_layer/gpr_replica_deliver_notify_msg_api.c +++ b/orte/mca/gpr/replica/api_layer/gpr_replica_deliver_notify_msg_api.c @@ -30,6 +30,7 @@ #include "orte/orte_constants.h" #include "opal/util/trace.h" +#include "orte/dss/dss.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/gpr/replica/api_layer/gpr_replica_api.h" diff --git a/orte/mca/gpr/replica/functional_layer/gpr_replica_cleanup_fn.c b/orte/mca/gpr/replica/functional_layer/gpr_replica_cleanup_fn.c index f557f8b12c..1098c3f043 100644 --- a/orte/mca/gpr/replica/functional_layer/gpr_replica_cleanup_fn.c +++ b/orte/mca/gpr/replica/functional_layer/gpr_replica_cleanup_fn.c @@ -88,9 +88,7 @@ int orte_gpr_replica_cleanup_proc_fn(orte_process_name_t *proc) } /* find the job segment */ - if (ORTE_SUCCESS != orte_ns.get_jobid(&jobid, proc)) { - return ORTE_ERR_BAD_PARAM; - } + jobid = proc->jobid; if (ORTE_SUCCESS != orte_ns.convert_jobid_to_string(&jobidstring, jobid)) { return ORTE_ERR_BAD_PARAM; diff --git a/orte/mca/gpr/replica/functional_layer/gpr_replica_messaging_fn.c b/orte/mca/gpr/replica/functional_layer/gpr_replica_messaging_fn.c index a8d139da4c..774fb4ae34 100644 --- a/orte/mca/gpr/replica/functional_layer/gpr_replica_messaging_fn.c +++ 
b/orte/mca/gpr/replica/functional_layer/gpr_replica_messaging_fn.c @@ -387,9 +387,7 @@ int orte_gpr_replica_define_callback(orte_gpr_notify_msg_type_t msg_type, if (((NULL == recipient && NULL == cb->requestor) && (msg_type == cb->message->msg_type)) || (((NULL != recipient && NULL != cb->requestor) && - (0 == orte_ns.compare(ORTE_NS_CMP_ALL, - recipient, - cb->requestor))) && + (ORTE_EQUAL == orte_dss.compare(recipient, cb->requestor, ORTE_NAME))) && (msg_type == cb->message->msg_type))) { /* okay, a callback has been registered to send data to this * recipient - return this location @@ -421,7 +419,7 @@ int orte_gpr_replica_define_callback(orte_gpr_notify_msg_type_t msg_type, if (NULL == recipient) { cb->requestor = NULL; } else { - if (ORTE_SUCCESS != (rc = orte_ns.copy_process_name(&(cb->requestor), recipient))) { + if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&(cb->requestor), recipient, ORTE_NAME))) { ORTE_ERROR_LOG(rc); return rc; } diff --git a/orte/mca/gpr/replica/functional_layer/gpr_replica_trig_ops_fn.c b/orte/mca/gpr/replica/functional_layer/gpr_replica_trig_ops_fn.c index 16ee7f5b4e..7339df0415 100644 --- a/orte/mca/gpr/replica/functional_layer/gpr_replica_trig_ops_fn.c +++ b/orte/mca/gpr/replica/functional_layer/gpr_replica_trig_ops_fn.c @@ -216,8 +216,13 @@ ADDREQ: i < (sub->requestors)->size; i++) { if (NULL != reqs[i]) { j++; + if ((NULL == reqs[i]->requestor && NULL != requestor) || + (NULL != reqs[i]->requestor && NULL == requestor)) { + continue; + } if (reqs[i]->idtag == subscription->id && - 0 == orte_ns.compare(ORTE_NS_CMP_ALL, reqs[i]->requestor, requestor)) { + ((NULL == reqs[i]->requestor && NULL == requestor) || + (ORTE_EQUAL == orte_dss.compare(reqs[i]->requestor, requestor, ORTE_NAME)))) { /* found this requestor - do not add it again */ goto DONESUB; } @@ -234,8 +239,7 @@ ADDREQ: } if (NULL != requestor) { - if (ORTE_SUCCESS != (rc = orte_ns.copy_process_name(&(req->requestor), - requestor))) { + if (ORTE_SUCCESS != (rc = 
orte_dss.copy((void**)&(req->requestor), requestor, ORTE_NAME))) { ORTE_ERROR_LOG(rc); return rc; } @@ -546,8 +550,14 @@ ADDREQ: i < (trig->attached)->size; i++) { if (NULL != reqs[i]) { j++; + /* if one is NULL and the other isn't, then they can't possibly match */ + if ((NULL == reqs[i]->requestor && NULL != requestor) || + (NULL != reqs[i]->requestor && NULL == requestor)) { + continue; + } if (reqs[i]->idtag == trigger->id && - 0 == orte_ns.compare(ORTE_NS_CMP_ALL, reqs[i]->requestor, requestor)) { + ((NULL == reqs[i]->requestor && NULL == requestor) || + (ORTE_EQUAL == orte_dss.compare(reqs[i]->requestor, requestor, ORTE_NAME)))) { /* found this requestor - do not add it again */ goto DONETRIG; } @@ -562,8 +572,7 @@ ADDREQ: } if (NULL != requestor) { - if (ORTE_SUCCESS != (rc = orte_ns.copy_process_name(&(req->requestor), - requestor))) { + if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&(req->requestor), requestor, ORTE_NAME))) { ORTE_ERROR_LOG(rc); return rc; } @@ -657,8 +666,7 @@ orte_gpr_replica_remove_subscription(orte_process_name_t *requestor, if (id == reqs[k]->idtag && ((NULL == requestor && NULL == reqs[k]->requestor) || (NULL != requestor && NULL != reqs[k]->requestor && - 0 == orte_ns.compare(ORTE_NS_CMP_ALL, - reqs[k]->requestor, requestor)))) { + ORTE_EQUAL == orte_dss.compare(reqs[k]->requestor, requestor, ORTE_NAME)))) { /* this is the subscription */ sub = subs[i]; req = reqs[k]; @@ -763,8 +771,7 @@ orte_gpr_replica_remove_trigger(orte_process_name_t *requestor, if (id == reqs[k]->idtag && ((NULL == requestor && NULL == reqs[k]->requestor) || (NULL != requestor && NULL != reqs[k]->requestor && - 0 == orte_ns.compare(ORTE_NS_CMP_ALL, - reqs[k]->requestor, requestor)))) { + ORTE_EQUAL == orte_dss.compare(reqs[k]->requestor, requestor, ORTE_NAME)))) { /* this is the trigger */ trig = trigs[i]; req = reqs[k]; @@ -1330,7 +1337,7 @@ int orte_gpr_replica_purge_subscriptions(orte_process_name_t *proc) } OBJ_RELEASE(trig); } else if (NULL != proc && 
NULL != trig[i]->requestor && - 0 == orte_ns.compare(ORTE_NS_CMP_ALL, proc, trig[i]->requestor)) { + ORTE_EQUAL == orte_dss.compare(proc, trig[i]->requestor, ORTE_NAME)) { if (ORTE_SUCCESS != (rc = orte_pointer_array_set_item(orte_gpr_replica.triggers, trig[i]->index, NULL))) { ORTE_ERROR_LOG(rc); diff --git a/orte/mca/gpr/replica/gpr_replica_component.c b/orte/mca/gpr/replica/gpr_replica_component.c index 1363ef9524..c14975d484 100644 --- a/orte/mca/gpr/replica/gpr_replica_component.c +++ b/orte/mca/gpr/replica/gpr_replica_component.c @@ -311,7 +311,7 @@ int orte_gpr_replica_module_init(void) /* issue the non-blocking receive */ if (!orte_gpr_replica_globals.isolate) { int rc = orte_rml.recv_buffer_nb( - ORTE_RML_NAME_ANY, ORTE_RML_TAG_GPR, ORTE_RML_PERSISTENT, orte_gpr_replica_recv, NULL); + ORTE_NAME_WILDCARD, ORTE_RML_TAG_GPR, ORTE_RML_PERSISTENT, orte_gpr_replica_recv, NULL); if(rc < 0) { ORTE_ERROR_LOG(rc); return rc; @@ -437,7 +437,7 @@ int orte_gpr_replica_finalize(void) return ORTE_SUCCESS; } - orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_GPR); + orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_GPR); return ORTE_SUCCESS; } diff --git a/orte/mca/iof/base/iof_base_close.c b/orte/mca/iof/base/iof_base_close.c index b7da7c2230..ea42475c0c 100644 --- a/orte/mca/iof/base/iof_base_close.c +++ b/orte/mca/iof/base/iof_base_close.c @@ -59,7 +59,7 @@ int orte_iof_base_close(void) OPAL_THREAD_UNLOCK(&orte_iof_base.iof_lock); if (NULL != orte_iof_base.iof_service) { - orte_ns.free_name(&(orte_iof_base.iof_service)); + free(orte_iof_base.iof_service); } return ORTE_SUCCESS; diff --git a/orte/mca/iof/base/iof_base_endpoint.c b/orte/mca/iof/base/iof_base_endpoint.c index f3659408ea..96cf3224d8 100644 --- a/orte/mca/iof/base/iof_base_endpoint.c +++ b/orte/mca/iof/base/iof_base_endpoint.c @@ -42,7 +42,10 @@ #include <signal.h> #endif /* HAVE_SIGNAL_H */ #include "opal/util/output.h" + +#include "orte/mca/ns/ns.h" #include "orte/mca/rml/rml.h" + #include
"orte/mca/iof/base/base.h" #include "orte/mca/iof/base/iof_base_endpoint.h" #include "orte/mca/iof/base/iof_base_fragment.h" @@ -172,7 +175,7 @@ static void orte_iof_base_endpoint_read_handler(int fd, short flags, void *cbdat hdr = &frag->frag_hdr; hdr->hdr_common.hdr_type = ORTE_IOF_BASE_HDR_MSG; hdr->hdr_msg.msg_src = endpoint->ep_name; - hdr->hdr_msg.msg_proxy = *ORTE_RML_NAME_SELF; + hdr->hdr_msg.msg_proxy = *ORTE_PROC_MY_NAME; hdr->hdr_msg.msg_tag = endpoint->ep_tag; hdr->hdr_msg.msg_seq = endpoint->ep_seq; hdr->hdr_msg.msg_len = frag->frag_len; @@ -294,7 +297,7 @@ static orte_iof_base_endpoint_t* orte_iof_base_endpoint_lookup( item != opal_list_get_end(&orte_iof_base.iof_endpoints); item = opal_list_get_next(item)) { orte_iof_base_endpoint_t* endpoint = (orte_iof_base_endpoint_t*)item; - if(orte_ns.compare(ORTE_NS_CMP_ALL,proc,&endpoint->ep_name) == 0 && + if(orte_ns.compare_fields(ORTE_NS_CMP_ALL,proc,&endpoint->ep_name) == 0 && endpoint->ep_tag == tag && endpoint->ep_mode == mode) { OBJ_RETAIN(endpoint); return endpoint; @@ -428,7 +431,7 @@ int orte_iof_base_endpoint_delete( while(item != opal_list_get_end(&orte_iof_base.iof_endpoints)) { opal_list_item_t* next = opal_list_get_next(item); orte_iof_base_endpoint_t* endpoint = (orte_iof_base_endpoint_t*)item; - if(orte_ns.compare(mask,proc,&endpoint->ep_name) == 0 && + if(orte_ns.compare_fields(mask,proc,&endpoint->ep_name) == 0 && endpoint->ep_tag == tag) { OBJ_RELEASE(endpoint); opal_list_remove_item(&orte_iof_base.iof_endpoints,&endpoint->super); @@ -485,7 +488,7 @@ orte_iof_base_endpoint_t* orte_iof_base_endpoint_match( item != opal_list_get_end(&orte_iof_base.iof_endpoints); item = opal_list_get_next(item)) { orte_iof_base_endpoint_t* endpoint = (orte_iof_base_endpoint_t*)item; - if(orte_ns.compare(dst_mask,dst_name,&endpoint->ep_name) == 0) { + if(orte_ns.compare_fields(dst_mask,dst_name,&endpoint->ep_name) == 0) { if(endpoint->ep_tag == dst_tag || endpoint->ep_tag == ORTE_IOF_ANY || dst_tag == 
ORTE_IOF_ANY) { OBJ_RETAIN(endpoint); OPAL_THREAD_UNLOCK(&orte_iof_base.iof_lock); diff --git a/orte/mca/iof/proxy/iof_proxy.c b/orte/mca/iof/proxy/iof_proxy.c index 7972454d26..5d27ea1eac 100644 --- a/orte/mca/iof/proxy/iof_proxy.c +++ b/orte/mca/iof/proxy/iof_proxy.c @@ -144,7 +144,7 @@ int orte_iof_proxy_push( /* send a subscription to server on behalf of the destination */ rc = orte_iof_proxy_svc_subscribe( - ORTE_RML_NAME_SELF, + ORTE_PROC_MY_NAME, ORTE_NS_CMP_ALL, dst_tag, dst_name, @@ -156,7 +156,7 @@ int orte_iof_proxy_push( /* setup a local endpoint to reflect registration */ rc = orte_iof_base_endpoint_create( - ORTE_RML_NAME_SELF, + ORTE_PROC_MY_NAME, ORTE_IOF_SOURCE, dst_tag, fd); @@ -184,7 +184,7 @@ int orte_iof_proxy_pull( /* setup a local endpoint */ int rc; rc = orte_iof_base_endpoint_create( - ORTE_RML_NAME_SELF, + ORTE_PROC_MY_NAME, ORTE_IOF_SINK, src_tag, fd); @@ -195,7 +195,7 @@ int orte_iof_proxy_pull( /* publish this endpoint */ rc = orte_iof_proxy_svc_publish( - ORTE_RML_NAME_SELF, + ORTE_PROC_MY_NAME, src_tag); if(rc != ORTE_SUCCESS) { ORTE_ERROR_LOG(rc); @@ -207,7 +207,7 @@ int orte_iof_proxy_pull( src_name, src_mask, src_tag, - ORTE_RML_NAME_SELF, + ORTE_PROC_MY_NAME, ORTE_NS_CMP_ALL, src_tag); if(rc != ORTE_SUCCESS) { @@ -247,7 +247,7 @@ int orte_iof_proxy_subscribe( int rc; /* create a local registration to reflect the callback */ - rc = orte_iof_base_callback_create(ORTE_RML_NAME_SELF,src_tag,cbfunc,cbdata); + rc = orte_iof_base_callback_create(ORTE_PROC_MY_NAME,src_tag,cbfunc,cbdata); if(rc != ORTE_SUCCESS) return rc; @@ -256,7 +256,7 @@ int orte_iof_proxy_subscribe( src_name, src_mask, src_tag, - ORTE_RML_NAME_SELF, + ORTE_PROC_MY_NAME, ORTE_NS_CMP_ALL, src_tag); return rc; @@ -274,13 +274,13 @@ int orte_iof_proxy_unsubscribe( src_name, src_mask, src_tag, - ORTE_RML_NAME_SELF, + ORTE_PROC_MY_NAME, ORTE_NS_CMP_ALL, src_tag); if(rc != ORTE_SUCCESS) return rc; /* remove local callback */ - return 
orte_iof_base_callback_delete(ORTE_RML_NAME_SELF,src_tag); + return orte_iof_base_callback_delete(ORTE_PROC_MY_NAME,src_tag); } diff --git a/orte/mca/iof/proxy/iof_proxy_component.c b/orte/mca/iof/proxy/iof_proxy_component.c index da43b1273c..f6ac8f8f98 100644 --- a/orte/mca/iof/proxy/iof_proxy_component.c +++ b/orte/mca/iof/proxy/iof_proxy_component.c @@ -127,7 +127,7 @@ orte_iof_proxy_init(int* priority, bool *allow_multi_user_threads, bool *have_hi mca_iof_proxy_component.proxy_iov[0].iov_len = 0; rc = orte_rml.recv_nb( - ORTE_RML_NAME_ANY, + ORTE_NAME_WILDCARD, mca_iof_proxy_component.proxy_iov, 1, ORTE_RML_TAG_IOF_SVC, @@ -152,7 +152,7 @@ static int orte_iof_proxy_close(void) int rc = ORTE_SUCCESS; if (initialized) { - rc = orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_IOF_SVC); + rc = orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_IOF_SVC); } return rc; } diff --git a/orte/mca/iof/proxy/iof_proxy_svc.c b/orte/mca/iof/proxy/iof_proxy_svc.c index 00e543cc82..1456bf405a 100644 --- a/orte/mca/iof/proxy/iof_proxy_svc.c +++ b/orte/mca/iof/proxy/iof_proxy_svc.c @@ -58,7 +58,7 @@ int orte_iof_proxy_svc_publish( hdr.hdr_common.hdr_type = ORTE_IOF_BASE_HDR_PUB; hdr.hdr_common.hdr_status = 0; hdr.hdr_pub.pub_name = *name; - hdr.hdr_pub.pub_proxy = *ORTE_RML_NAME_SELF; + hdr.hdr_pub.pub_proxy = *ORTE_PROC_MY_NAME; hdr.hdr_pub.pub_mask = ORTE_NS_CMP_ALL; hdr.hdr_pub.pub_tag = tag; ORTE_IOF_BASE_HDR_PUB_NTOH(hdr.hdr_pub); @@ -96,7 +96,7 @@ int orte_iof_proxy_svc_unpublish( hdr.hdr_common.hdr_type = ORTE_IOF_BASE_HDR_PUB; hdr.hdr_common.hdr_status = 0; hdr.hdr_pub.pub_name = *name; - hdr.hdr_pub.pub_proxy = *ORTE_RML_NAME_SELF; + hdr.hdr_pub.pub_proxy = *ORTE_PROC_MY_NAME; hdr.hdr_pub.pub_mask = mask; hdr.hdr_pub.pub_tag = tag; ORTE_IOF_BASE_HDR_PUB_NTOH(hdr.hdr_pub); @@ -257,7 +257,7 @@ static void orte_iof_proxy_svc_msg( unsigned char* data) { orte_iof_base_endpoint_t* endpoint; - endpoint = orte_iof_base_endpoint_match(ORTE_RML_NAME_ANY, ORTE_NS_CMP_NONE, 
msg->msg_tag); + endpoint = orte_iof_base_endpoint_match(ORTE_NAME_WILDCARD, ORTE_NS_CMP_NONE, msg->msg_tag); if(endpoint != NULL) { orte_iof_base_endpoint_forward(endpoint,src,msg,data); OBJ_RELEASE(endpoint); diff --git a/orte/mca/iof/svc/iof_svc.c b/orte/mca/iof/svc/iof_svc.c index f60db9c878..7fbedc943c 100644 --- a/orte/mca/iof/svc/iof_svc.c +++ b/orte/mca/iof/svc/iof_svc.c @@ -80,7 +80,7 @@ int orte_iof_svc_publish( if(mode == ORTE_IOF_SINK) { rc = orte_iof_svc_pub_create( name, - ORTE_RML_NAME_SELF, + ORTE_PROC_MY_NAME, ORTE_NS_CMP_ALL, tag); } @@ -106,7 +106,7 @@ int orte_iof_svc_unpublish( int rc; rc = orte_iof_svc_pub_delete( name, - ORTE_RML_NAME_SELF, + ORTE_PROC_MY_NAME, mask, tag); if(rc != ORTE_SUCCESS) @@ -142,7 +142,7 @@ int orte_iof_svc_push( /* setup a subscription */ rc = orte_iof_svc_sub_create( - ORTE_RML_NAME_SELF, + ORTE_PROC_MY_NAME, ORTE_NS_CMP_ALL, dst_tag, dst_name, @@ -153,7 +153,7 @@ int orte_iof_svc_push( /* setup a local endpoint to reflect registration */ rc = orte_iof_base_endpoint_create( - ORTE_RML_NAME_SELF, + ORTE_PROC_MY_NAME, ORTE_IOF_SOURCE, dst_tag, fd); @@ -181,7 +181,7 @@ int orte_iof_svc_pull( /* setup a local endpoint */ rc = orte_iof_base_endpoint_create( - ORTE_RML_NAME_SELF, + ORTE_PROC_MY_NAME, ORTE_IOF_SINK, src_tag, fd); @@ -193,7 +193,7 @@ int orte_iof_svc_pull( src_name, src_mask, src_tag, - ORTE_RML_NAME_SELF, + ORTE_PROC_MY_NAME, ORTE_NS_CMP_ALL, src_tag); return rc; @@ -230,7 +230,7 @@ int orte_iof_svc_subscribe( int rc; /* create a local registration to reflect the callback */ - rc = orte_iof_base_callback_create(ORTE_RML_NAME_SELF,src_tag,cbfunc,cbdata); + rc = orte_iof_base_callback_create(ORTE_PROC_MY_NAME,src_tag,cbfunc,cbdata); if(rc != ORTE_SUCCESS) return rc; @@ -239,7 +239,7 @@ int orte_iof_svc_subscribe( src_name, src_mask, src_tag, - ORTE_RML_NAME_SELF, + ORTE_PROC_MY_NAME, ORTE_NS_CMP_ALL, src_tag); return rc; @@ -257,12 +257,12 @@ int orte_iof_svc_unsubscribe( src_name, src_mask, src_tag, - 
ORTE_RML_NAME_SELF, + ORTE_PROC_MY_NAME, ORTE_NS_CMP_ALL, src_tag); if(ORTE_SUCCESS != rc) return rc; /* cleanup any locally registered callback */ - return orte_iof_base_callback_delete(ORTE_RML_NAME_SELF,src_tag); + return orte_iof_base_callback_delete(ORTE_PROC_MY_NAME,src_tag); } diff --git a/orte/mca/iof/svc/iof_svc_component.c b/orte/mca/iof/svc/iof_svc_component.c index 16a9d980c1..121347c6f7 100644 --- a/orte/mca/iof/svc/iof_svc_component.c +++ b/orte/mca/iof/svc/iof_svc_component.c @@ -120,7 +120,7 @@ static int orte_iof_svc_close(void) OBJ_RELEASE(item); } OPAL_THREAD_UNLOCK(&mca_iof_svc_component.svc_lock); - orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_IOF_SVC); + orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_IOF_SVC); } return ORTE_SUCCESS; @@ -164,7 +164,7 @@ orte_iof_svc_init(int* priority, bool *allow_multi_user_threads, bool *have_hidd mca_iof_svc_component.svc_iov[0].iov_len = 0; rc = orte_rml.recv_nb( - ORTE_RML_NAME_ANY, + ORTE_NAME_WILDCARD, mca_iof_svc_component.svc_iov, 1, ORTE_RML_TAG_IOF_SVC, diff --git a/orte/mca/iof/svc/iof_svc_proxy.c b/orte/mca/iof/svc/iof_svc_proxy.c index 37c2003522..ac1ad8b3c1 100644 --- a/orte/mca/iof/svc/iof_svc_proxy.c +++ b/orte/mca/iof/svc/iof_svc_proxy.c @@ -151,7 +151,7 @@ static void orte_iof_svc_proxy_msg( continue; /* source match */ - if(orte_ns.compare(sub->src_mask,&sub->src_name,&hdr->msg_src) == 0) { + if(orte_ns.compare_fields(sub->src_mask,&sub->src_name,&hdr->msg_src) == 0) { if(mca_iof_svc_component.svc_debug > 1) { opal_output(0, "[%lu,%lu,%lu] orte_iof_svc_proxy_msg: tag %d sequence %d\n", ORTE_NAME_ARGS(&sub->src_name),hdr->msg_tag,hdr->msg_seq); @@ -227,7 +227,7 @@ static void orte_iof_svc_proxy_ack( orte_iof_svc_sub_t* sub = (orte_iof_svc_sub_t*)s_item; opal_list_item_t *f_item; - if (orte_ns.compare(sub->src_mask,&sub->src_name,&hdr->msg_src) != 0 || + if (orte_ns.compare_fields(sub->src_mask,&sub->src_name,&hdr->msg_src) != 0 || sub->src_tag != hdr->msg_tag) { continue; } @@ 
-238,8 +238,8 @@ static void orte_iof_svc_proxy_ack( f_item = opal_list_get_next(f_item)) { orte_iof_svc_fwd_t* fwd = (orte_iof_svc_fwd_t*)f_item; orte_iof_svc_pub_t* pub = fwd->fwd_pub; - if (orte_ns.compare(pub->pub_mask,&pub->pub_name,src) == 0 || - orte_ns.compare(ORTE_NS_CMP_ALL,&pub->pub_proxy,src) == 0) { + if (orte_ns.compare_fields(pub->pub_mask,&pub->pub_name,src) == 0 || + orte_ns.compare_fields(ORTE_NS_CMP_ALL,&pub->pub_proxy,src) == 0) { value.uval = hdr->msg_seq + hdr->msg_len; orte_hash_table_set_proc(&fwd->fwd_seq, &hdr->msg_src, &value.vval); @@ -259,7 +259,7 @@ static void orte_iof_svc_proxy_ack( */ if(seq_min == hdr->msg_seq+hdr->msg_len) { - if(orte_ns.compare(ORTE_NS_CMP_ALL,orte_process_info.my_name,&hdr->msg_src) == 0) { + if(orte_ns.compare_fields(ORTE_NS_CMP_ALL,orte_process_info.my_name,&hdr->msg_src) == 0) { orte_iof_base_endpoint_t* endpoint; /* * Local delivery diff --git a/orte/mca/iof/svc/iof_svc_pub.c b/orte/mca/iof/svc/iof_svc_pub.c index 996e1fafa7..36739f67bd 100644 --- a/orte/mca/iof/svc/iof_svc_pub.c +++ b/orte/mca/iof/svc/iof_svc_pub.c @@ -48,8 +48,8 @@ int orte_iof_svc_pub_create( item != opal_list_get_end(&mca_iof_svc_component.svc_published); item = opal_list_get_next(item)) { pub = (orte_iof_svc_pub_t*)item; - if(orte_ns.compare(pub_mask,pub_name,&pub->pub_name) == 0 && - orte_ns.compare(ORTE_NS_CMP_ALL,pub_proxy,&pub->pub_proxy) == 0 && + if(orte_ns.compare_fields(pub_mask,pub_name,&pub->pub_name) == 0 && + orte_ns.compare_fields(ORTE_NS_CMP_ALL,pub_proxy,&pub->pub_proxy) == 0 && pub_tag == pub->pub_tag) { OPAL_THREAD_UNLOCK(&mca_iof_svc_component.svc_lock); return ORTE_SUCCESS; @@ -96,8 +96,8 @@ orte_iof_svc_pub_t* orte_iof_svc_pub_lookup( item != opal_list_get_end(&mca_iof_svc_component.svc_published); item = opal_list_get_next(item)) { orte_iof_svc_pub_t* pub = (orte_iof_svc_pub_t*)item; - if (orte_ns.compare(ORTE_NS_CMP_ALL, &pub->pub_name,pub_name) == 0 && - orte_ns.compare(ORTE_NS_CMP_ALL, &pub->pub_proxy,pub_proxy) 
== 0 && + if (orte_ns.compare_fields(ORTE_NS_CMP_ALL, &pub->pub_name,pub_name) == 0 && + orte_ns.compare_fields(ORTE_NS_CMP_ALL, &pub->pub_proxy,pub_proxy) == 0 && pub->pub_mask == pub_mask && pub->pub_tag == pub_tag) { return pub; @@ -157,8 +157,8 @@ void orte_iof_svc_pub_delete_all( opal_list_item_t* p_next = opal_list_get_next(p_item); orte_iof_svc_pub_t* pub = (orte_iof_svc_pub_t*)p_item; - if (orte_ns.compare(ORTE_NS_CMP_ALL, &pub->pub_name,name) == 0 || - orte_ns.compare(ORTE_NS_CMP_ALL, &pub->pub_proxy,name) == 0) { + if (orte_ns.compare_fields(ORTE_NS_CMP_ALL, &pub->pub_name,name) == 0 || + orte_ns.compare_fields(ORTE_NS_CMP_ALL, &pub->pub_proxy,name) == 0) { opal_list_item_t* s_item; for(s_item = opal_list_get_first(&mca_iof_svc_component.svc_subscribed); diff --git a/orte/mca/iof/svc/iof_svc_sub.c b/orte/mca/iof/svc/iof_svc_sub.c index 103da39460..f2f7f39805 100644 --- a/orte/mca/iof/svc/iof_svc_sub.c +++ b/orte/mca/iof/svc/iof_svc_sub.c @@ -81,10 +81,10 @@ int orte_iof_svc_sub_create( item = opal_list_get_next(item)) { sub = (orte_iof_svc_sub_t*)item; if (sub->src_mask == src_mask && - orte_ns.compare(sub->src_mask,&sub->src_name,src_name) == 0 && + orte_ns.compare_fields(sub->src_mask,&sub->src_name,src_name) == 0 && sub->src_tag == src_tag && sub->dst_mask == dst_mask && - orte_ns.compare(sub->dst_mask,&sub->dst_name,dst_name) == 0 && + orte_ns.compare_fields(sub->dst_mask,&sub->dst_name,dst_name) == 0 && sub->dst_tag == dst_tag) { OPAL_THREAD_UNLOCK(&mca_iof_svc_component.svc_lock); return ORTE_SUCCESS; @@ -134,10 +134,10 @@ int orte_iof_svc_sub_delete( opal_list_item_t* next = opal_list_get_next(item); orte_iof_svc_sub_t* sub = (orte_iof_svc_sub_t*)item; if (sub->src_mask == src_mask && - orte_ns.compare(sub->src_mask,&sub->src_name,src_name) == 0 && + orte_ns.compare_fields(sub->src_mask,&sub->src_name,src_name) == 0 && sub->src_tag == src_tag && sub->dst_mask == dst_mask && - orte_ns.compare(sub->dst_mask,&sub->dst_name,dst_name) == 0 && + 
orte_ns.compare_fields(sub->dst_mask,&sub->dst_name,dst_name) == 0 && sub->dst_tag == dst_tag) { opal_list_remove_item(&mca_iof_svc_component.svc_subscribed, item); OBJ_RELEASE(item); @@ -159,9 +159,9 @@ int orte_iof_svc_sub_delete_all( opal_list_item_t* next = opal_list_get_next(item); orte_iof_svc_sub_t* sub = (orte_iof_svc_sub_t*)item; if ((sub->src_mask == ORTE_NS_CMP_ALL && - orte_ns.compare(ORTE_NS_CMP_ALL,&sub->src_name,name) == 0) || + orte_ns.compare_fields(ORTE_NS_CMP_ALL,&sub->src_name,name) == 0) || (sub->dst_mask == ORTE_NS_CMP_ALL && - orte_ns.compare(ORTE_NS_CMP_ALL,&sub->dst_name,name) == 0)) { + orte_ns.compare_fields(ORTE_NS_CMP_ALL,&sub->dst_name,name) == 0)) { opal_list_remove_item(&mca_iof_svc_component.svc_subscribed, item); OBJ_RELEASE(item); } @@ -283,7 +283,7 @@ bool orte_iof_svc_fwd_match( orte_iof_svc_sub_t* sub, orte_iof_svc_pub_t* pub) { - if (orte_ns.compare(sub->dst_mask,&sub->dst_name,&pub->pub_name) == 0 && + if (orte_ns.compare_fields(sub->dst_mask,&sub->dst_name,&pub->pub_name) == 0 && sub->src_tag == pub->pub_tag) { return true; } else { diff --git a/orte/mca/ns/base/Makefile.am b/orte/mca/ns/base/Makefile.am index deed6eb0e6..9d99d64481 100644 --- a/orte/mca/ns/base/Makefile.am +++ b/orte/mca/ns/base/Makefile.am @@ -17,13 +17,18 @@ # headers += \ - base/base.h + base/base.h \ + base/ns_private.h libmca_ns_la_SOURCES += \ base/ns_base_close.c \ base/ns_base_select.c \ base/ns_base_open.c \ - base/ns_base_local_fns.c \ + base/ns_base_cell_fns.c \ + base/ns_base_job_fns.c \ + base/ns_base_vpid_name_fns.c \ + base/ns_base_general_fns.c \ + base/ns_base_diag_fns.c \ base/data_type_support/ns_data_type_compare_fns.c \ base/data_type_support/ns_data_type_copy_fns.c \ base/data_type_support/ns_data_type_print_fns.c \ diff --git a/orte/mca/ns/base/base.h b/orte/mca/ns/base/base.h index 641e79a16a..262d977f18 100644 --- a/orte/mca/ns/base/base.h +++ b/orte/mca/ns/base/base.h @@ -42,38 +42,6 @@ extern "C" { #endif -/* default limits */ 
-#define ORTE_NS_ARRAY_MAX_SIZE INT_MAX -#define ORTE_NS_ARRAY_BLOCK_SIZE 512 -/* - * Internal definitions - */ -typedef uint8_t orte_ns_cmd_bitmask_t; - -/* - * packing type definitions - */ -/* CAUTION - any changes here must also change corresponding - * typedefs above - */ -#define ORTE_NS_CMD ORTE_INT8 - -/* - * define flag values for remote commands - only used internally - */ -#define ORTE_NS_CREATE_CELLID_CMD (int8_t)0x01 -#define ORTE_NS_GET_CELL_INFO_CMD (int8_t)0x02 -#define ORTE_NS_CREATE_JOBID_CMD (int8_t)0x03 -#define ORTE_NS_RESERVE_RANGE_CMD (int8_t)0x04 -#define ORTE_NS_ASSIGN_OOB_TAG_CMD (int8_t)0x08 -#define ORTE_NS_GET_JOB_PEERS_CMD (int8_t)0x0A -#define ORTE_NS_DEFINE_DATA_TYPE_CMD (int8_t)0x10 -#define ORTE_NS_CREATE_MY_NAME_CMD (int8_t)0x20 -#define ORTE_NS_DUMP_CELLS_CMD (int8_t)0x21 -#define ORTE_NS_DUMP_JOBIDS_CMD (int8_t)0x22 -#define ORTE_NS_DUMP_TAGS_CMD (int8_t)0x23 -#define ORTE_NS_DUMP_DATATYPES_CMD (int8_t)0x24 - /* * function definitions @@ -82,176 +50,6 @@ ORTE_DECLSPEC int orte_ns_base_open(void); ORTE_DECLSPEC int orte_ns_base_select(void); ORTE_DECLSPEC int orte_ns_base_close(void); - /* - * Base functions that are common to all implementations - can be overridden - */ - -ORTE_DECLSPEC int orte_ns_base_assign_cellid_to_process(orte_process_name_t* name); - -ORTE_DECLSPEC int orte_ns_base_create_process_name(orte_process_name_t **name, - orte_cellid_t cell, - orte_jobid_t job, - orte_vpid_t vpid); - -ORTE_DECLSPEC int orte_ns_base_copy_process_name(orte_process_name_t **dest, - orte_process_name_t* src); - -ORTE_DECLSPEC int orte_ns_base_convert_string_to_process_name(orte_process_name_t **name, - const char* name_string); - -ORTE_DECLSPEC int orte_ns_base_get_proc_name_string(char **name_string, - const orte_process_name_t* name); - -ORTE_DECLSPEC int orte_ns_base_get_vpid_string(char **vpid_string, const orte_process_name_t* name); - -ORTE_DECLSPEC int orte_ns_base_convert_vpid_to_string(char **vpid_string, const orte_vpid_t 
vpid); - -ORTE_DECLSPEC int orte_ns_base_convert_string_to_vpid(orte_vpid_t *vpid, const char* vpidstring); - -ORTE_DECLSPEC int orte_ns_base_get_jobid_string(char **jobid_string, const orte_process_name_t* name); - -ORTE_DECLSPEC int orte_ns_base_convert_jobid_to_string(char **jobid_string, const orte_jobid_t jobid); - -ORTE_DECLSPEC int orte_ns_base_convert_string_to_jobid(orte_jobid_t *jobid, const char* jobidstring); - -ORTE_DECLSPEC int orte_ns_base_get_cellid_string(char **cellid_string, const orte_process_name_t* name); - -ORTE_DECLSPEC int orte_ns_base_convert_string_to_cellid(orte_cellid_t *cellid, const char *cellidstring); - -ORTE_DECLSPEC int orte_ns_base_convert_cellid_to_string(char **cellid_string, const orte_cellid_t cellid); - -ORTE_DECLSPEC int orte_ns_base_get_vpid(orte_vpid_t *vpid, const orte_process_name_t* name); - -ORTE_DECLSPEC int orte_ns_base_get_jobid(orte_jobid_t *jobid, const orte_process_name_t* name); - -ORTE_DECLSPEC int orte_ns_base_get_cellid(orte_cellid_t *cellid, const orte_process_name_t* name); - -ORTE_DECLSPEC int orte_ns_base_compare(orte_ns_cmp_bitmask_t fields, - const orte_process_name_t* name1, - const orte_process_name_t* name2); - -ORTE_DECLSPEC int orte_ns_base_free_name(orte_process_name_t **name); - -ORTE_DECLSPEC int orte_ns_base_print_dump(orte_buffer_t *buffer); - - -/* not available functions */ -ORTE_DECLSPEC int orte_ns_base_module_init_not_available(void); - -ORTE_DECLSPEC int orte_ns_base_create_cellid_not_available(orte_cellid_t *cellid, - char *site, char *resource); - -ORTE_DECLSPEC int orte_ns_base_get_cell_info_not_available(orte_cellid_t cellid, - char **site, char **resource); - -ORTE_DECLSPEC int orte_ns_base_create_jobid_not_available(orte_jobid_t *jobid); - -ORTE_DECLSPEC int orte_ns_base_get_vpid_range_not_available(orte_jobid_t job, - orte_vpid_t range, - orte_vpid_t *startvpid); - -ORTE_DECLSPEC int orte_ns_base_derive_vpid(orte_vpid_t *vpid, - orte_vpid_t base_vpid, - int offset); - 
-ORTE_DECLSPEC int orte_ns_base_assign_rml_tag_not_available(orte_rml_tag_t *tag, char *name); - -ORTE_DECLSPEC int orte_ns_base_define_data_type_not_available( - const char *name, - orte_data_type_t *type); - -ORTE_DECLSPEC int orte_ns_base_create_my_name_not_available(void); - -ORTE_DECLSPEC int orte_ns_base_get_job_peers_not_available(orte_process_name_t **procs, - orte_std_cntr_t *num_procs, orte_jobid_t job); - -ORTE_DECLSPEC int orte_ns_base_dump_cells_not_available(void); -ORTE_DECLSPEC int orte_ns_base_dump_jobs_not_available(void); -ORTE_DECLSPEC int orte_ns_base_dump_tags_not_available(void); -ORTE_DECLSPEC int orte_ns_base_dump_datatypes_not_available(void); - -/* Base functions used everywhere */ -ORTE_DECLSPEC int orte_ns_base_get_peers(orte_process_name_t **procs, - orte_std_cntr_t *num_procs, orte_std_cntr_t *self); - -ORTE_DECLSPEC int orte_ns_base_pack_name(orte_buffer_t *buffer, void *src, - orte_std_cntr_t num_vals, orte_data_type_t type); - -ORTE_DECLSPEC int orte_ns_base_pack_cellid(orte_buffer_t *buffer, void *src, - orte_std_cntr_t num_vals, orte_data_type_t type); - -ORTE_DECLSPEC int orte_ns_base_pack_jobid(orte_buffer_t *buffer, void *src, - orte_std_cntr_t num_vals, orte_data_type_t type); - -ORTE_DECLSPEC int orte_ns_base_pack_vpid(orte_buffer_t *buffer, void *src, - orte_std_cntr_t num_vals, orte_data_type_t type); - -ORTE_DECLSPEC int orte_ns_base_unpack_name(orte_buffer_t *buffer, void *dest, - orte_std_cntr_t *num_vals, orte_data_type_t type); - -ORTE_DECLSPEC int orte_ns_base_unpack_cellid(orte_buffer_t *buffer, void *dest, - orte_std_cntr_t *num_vals, orte_data_type_t type); - -ORTE_DECLSPEC int orte_ns_base_unpack_jobid(orte_buffer_t *buffer, void *dest, - orte_std_cntr_t *num_vals, orte_data_type_t type); - -ORTE_DECLSPEC int orte_ns_base_unpack_vpid(orte_buffer_t *buffer, void *dest, - orte_std_cntr_t *num_vals, orte_data_type_t type); - -/* - * copy functions - */ - -int orte_ns_base_copy_name(orte_process_name_t **dest, 
orte_process_name_t *src, orte_data_type_t type); - -int orte_ns_base_copy_vpid(orte_vpid_t **dest, orte_vpid_t *src, orte_data_type_t type); - -int orte_ns_base_copy_cellid(orte_cellid_t **dest, orte_cellid_t *src, orte_data_type_t type); - -int orte_ns_base_copy_jobid(orte_jobid_t **dest, orte_jobid_t *src, orte_data_type_t type); - -/* - * compare functions - */ - -int orte_ns_base_compare_name(orte_process_name_t *value1, - orte_process_name_t *value2, - orte_data_type_t type); - - -int orte_ns_base_compare_vpid(orte_vpid_t *value1, - orte_vpid_t *value2, - orte_data_type_t type); - -int orte_ns_base_compare_jobid(orte_jobid_t *value1, - orte_jobid_t *value2, - orte_data_type_t type); - -int orte_ns_base_compare_cellid(orte_cellid_t *value1, - orte_cellid_t *value2, - orte_data_type_t type); - -/* - * size functions - */ - -int orte_ns_base_std_size(size_t *size, void *src, orte_data_type_t type); - -/* - * release functions - */ - -void orte_ns_base_std_release(orte_data_value_t *value); - -/* - * print functions - */ - -int orte_ns_base_std_print(char **output, char *prefix, void *src, orte_data_type_t type); - -int orte_ns_base_print_name(char **output, char *prefix, orte_process_name_t *name, orte_data_type_t type); - - /* * globals that might be needed */ diff --git a/orte/mca/ns/base/data_type_support/ns_data_type_compare_fns.c b/orte/mca/ns/base/data_type_support/ns_data_type_compare_fns.c index 65cceb317d..683f6f1ae7 100755 --- a/orte/mca/ns/base/data_type_support/ns_data_type_compare_fns.c +++ b/orte/mca/ns/base/data_type_support/ns_data_type_compare_fns.c @@ -25,7 +25,7 @@ #include "orte/dss/dss.h" #include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/ns/base/base.h" +#include "orte/mca/ns/base/ns_private.h" /* * NUMERIC COMPARE FUNCTIONS @@ -42,42 +42,13 @@ int orte_ns_base_compare_name(orte_process_name_t *value1, return ORTE_VALUE1_GREATER; } - /** we have to take care of the special case where one of the - * values is ORTE_NAME_WILDCARD. 
If any of the fields are wildcard, - * then we want to just ignore that one field. However, in the case - * of ORTE_NAME_WILDCARD (where ALL of the fields are wildcard), this - * would automatically result in ORTE_EQUAL for any name in the other - * value - a totally useless result. - * - * Instead, what we want to know in this case is if the value actually - * *is* ORTE_NAME_WILDCARD. So, we need to detect if one of the values - * is ORTE_NAME_WILDCARD, and then specifically check the other one - * to see if it matches - */ - if (value2->cellid == ORTE_CELLID_WILDCARD && - value2->jobid == ORTE_JOBID_WILDCARD && - value2->vpid == ORTE_VPID_WILDCARD) { - if (value1->cellid == ORTE_CELLID_WILDCARD && - value1->jobid == ORTE_JOBID_WILDCARD && - value1->vpid == ORTE_VPID_WILDCARD) { - return ORTE_EQUAL; - } else { - return ORTE_VALUE1_GREATER; - } - } else if (value1->cellid == ORTE_CELLID_WILDCARD && - value1->jobid == ORTE_JOBID_WILDCARD && - value1->vpid == ORTE_VPID_WILDCARD) { - if (value2->cellid == ORTE_CELLID_WILDCARD && - value2->jobid == ORTE_JOBID_WILDCARD && - value2->vpid == ORTE_VPID_WILDCARD) { - return ORTE_EQUAL; - } else { - return ORTE_VALUE2_GREATER; - } - } - - /** now that the special cases are done, go through the progression */ - + /* If any of the fields are wildcard, + * then we want to just ignore that one field. In the case + * of ORTE_NAME_WILDCARD (where ALL of the fields are wildcard), this + * will automatically result in ORTE_EQUAL for any name in the other + * value - a totally useless result, but consistent in behavior. 
+ */ + /** check the cellids - if one of them is WILDCARD, then ignore * this field since anything is okay */ @@ -163,3 +134,18 @@ int orte_ns_base_compare_cellid(orte_cellid_t *value1, return ORTE_EQUAL; } + +int orte_ns_base_compare_nodeid(orte_nodeid_t *value1, + orte_nodeid_t *value2, + orte_data_type_t type) +{ + /** if either value is WILDCARD, then return equal */ + if (*value1 == ORTE_NODEID_WILDCARD || + *value2 == ORTE_NODEID_WILDCARD) return ORTE_EQUAL; + + if (*value1 > *value2) return ORTE_VALUE1_GREATER; + + if (*value2 > *value1) return ORTE_VALUE2_GREATER; + + return ORTE_EQUAL; +} diff --git a/orte/mca/ns/base/data_type_support/ns_data_type_copy_fns.c b/orte/mca/ns/base/data_type_support/ns_data_type_copy_fns.c index 01fc378998..9ddfad4a42 100755 --- a/orte/mca/ns/base/data_type_support/ns_data_type_copy_fns.c +++ b/orte/mca/ns/base/data_type_support/ns_data_type_copy_fns.c @@ -23,7 +23,7 @@ #include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/ns/base/base.h" +#include "orte/mca/ns/base/ns_private.h" /* * VPID @@ -63,6 +63,25 @@ int orte_ns_base_copy_cellid(orte_cellid_t **dest, orte_cellid_t *src, orte_data return ORTE_SUCCESS; } +/* + * NODEID + */ +int orte_ns_base_copy_nodeid(orte_nodeid_t **dest, orte_nodeid_t *src, orte_data_type_t type) +{ + orte_nodeid_t *val; + + val = (orte_nodeid_t*)malloc(sizeof(orte_nodeid_t)); + if (NULL == val) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + *val = *src; + *dest = val; + + return ORTE_SUCCESS; +} + /* * JOBID */ diff --git a/orte/mca/ns/base/data_type_support/ns_data_type_packing_fns.c b/orte/mca/ns/base/data_type_support/ns_data_type_packing_fns.c index fe9b00ff65..769d67e04f 100644 --- a/orte/mca/ns/base/data_type_support/ns_data_type_packing_fns.c +++ b/orte/mca/ns/base/data_type_support/ns_data_type_packing_fns.c @@ -28,7 +28,7 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/dss/dss_internal.h" -#include "orte/mca/ns/base/base.h" +#include 
"orte/mca/ns/base/ns_private.h" /* * NAME @@ -51,11 +51,7 @@ int orte_ns_base_pack_name(orte_buffer_t *buffer, void *src, } proc = (orte_process_name_t*)src; for (i=0; i < num_vals; i++) { - if (ORTE_SUCCESS != (rc = orte_ns.get_cellid(&cellid[i], proc))) { - ORTE_ERROR_LOG(rc); - free(cellid); - return rc; - } + cellid[i] = proc->cellid; proc++; } /* now pack them in one shot */ @@ -75,11 +71,7 @@ int orte_ns_base_pack_name(orte_buffer_t *buffer, void *src, } proc = (orte_process_name_t*)src; for (i=0; i < num_vals; i++) { - if (ORTE_SUCCESS != (rc = orte_ns.get_jobid(&jobid[i], proc))) { - ORTE_ERROR_LOG(rc); - free(jobid); - return rc; - } + jobid[i] = proc->jobid; proc++; } /* now pack them in one shot */ @@ -99,11 +91,7 @@ int orte_ns_base_pack_name(orte_buffer_t *buffer, void *src, } proc = (orte_process_name_t*)src; for (i=0; i < num_vals; i++) { - if (ORTE_SUCCESS != (rc = orte_ns.get_vpid(&vpid[i], proc))) { - ORTE_ERROR_LOG(rc); - free(vpid); - return rc; - } + vpid[i] = proc->vpid; proc++; } /* now pack them in one shot */ @@ -128,13 +116,30 @@ int orte_ns_base_pack_cellid(orte_buffer_t *buffer, void *src, /* Turn around and pack the real type */ if (ORTE_SUCCESS != ( - ret = orte_dss_pack_buffer(buffer, src, num_vals, ORTE_STD_CNTR_T))) { + ret = orte_dss_pack_buffer(buffer, src, num_vals, ORTE_CELLID_T))) { ORTE_ERROR_LOG(ret); } return ret; } +/* + * NODEID + */ +int orte_ns_base_pack_nodeid(orte_buffer_t *buffer, void *src, + orte_std_cntr_t num_vals, orte_data_type_t type) +{ + int ret; + + /* Turn around and pack the real type */ + if (ORTE_SUCCESS != ( + ret = orte_dss_pack_buffer(buffer, src, num_vals, ORTE_NODEID_T))) { + ORTE_ERROR_LOG(ret); + } + + return ret; +} + /* * JOBID */ @@ -145,7 +150,7 @@ int orte_ns_base_pack_jobid(orte_buffer_t *buffer, void *src, /* Turn around and pack the real type */ if (ORTE_SUCCESS != ( - ret = orte_dss_pack_buffer(buffer, src, num_vals, ORTE_STD_CNTR_T))) { + ret = orte_dss_pack_buffer(buffer, src, num_vals, 
ORTE_JOBID_T))) { ORTE_ERROR_LOG(ret); } @@ -162,7 +167,7 @@ int orte_ns_base_pack_vpid(orte_buffer_t *buffer, void *src, /* Turn around and pack the real type */ if (ORTE_SUCCESS != ( - ret = orte_dss_pack_buffer(buffer, src, num_vals, ORTE_STD_CNTR_T))) { + ret = orte_dss_pack_buffer(buffer, src, num_vals, ORTE_VPID_T))) { ORTE_ERROR_LOG(ret); } diff --git a/orte/mca/ns/base/data_type_support/ns_data_type_print_fns.c b/orte/mca/ns/base/data_type_support/ns_data_type_print_fns.c index 65fcb263cb..552dbe9f2a 100755 --- a/orte/mca/ns/base/data_type_support/ns_data_type_print_fns.c +++ b/orte/mca/ns/base/data_type_support/ns_data_type_print_fns.c @@ -24,7 +24,7 @@ #include "orte/dss/dss.h" #include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/ns/base/base.h" +#include "orte/mca/ns/base/ns_private.h" static void orte_ns_base_quick_print(char **output, char *type_name, char *pfx, void *src, size_t src_size); @@ -49,6 +49,10 @@ int orte_ns_base_std_print(char **output, char *prefix, void *src, orte_data_typ orte_ns_base_quick_print(output, "ORTE_CELLID", prefix, src, sizeof(orte_cellid_t)); break; + case ORTE_NODEID: + orte_ns_base_quick_print(output, "ORTE_NODEID", prefix, src, sizeof(orte_nodeid_t)); + break; + default: ORTE_ERROR_LOG(ORTE_ERR_UNKNOWN_DATA_TYPE); return ORTE_ERR_UNKNOWN_DATA_TYPE; @@ -69,9 +73,9 @@ int orte_ns_base_print_name(char **output, char *prefix, orte_process_name_t *na asprintf(output, "%sData type: ORTE_PROCESS_NAME\tData Value: NULL", (NULL == prefix ? " " : prefix)); } else { - asprintf(output, "%sData type: ORTE_PROCESS_NAME\tData Value: [%lu,%lu,%lu]", - (NULL == prefix ? " " : prefix), (unsigned long)name->cellid, - (unsigned long)name->jobid, (unsigned long)name->vpid); + asprintf(output, "%sData type: ORTE_PROCESS_NAME\tData Value: [%ld,%ld,%ld]", + (NULL == prefix ? 
" " : prefix), (long)name->cellid, + (long)name->jobid, (long)name->vpid); } return ORTE_SUCCESS; @@ -80,10 +84,10 @@ int orte_ns_base_print_name(char **output, char *prefix, orte_process_name_t *na static void orte_ns_base_quick_print(char **output, char *type_name, char *prefix, void *src, size_t src_size) { - uint8_t *ui8; - uint16_t *ui16; - uint32_t *ui32; - uint64_t *ui64; + int8_t *i8; + int16_t *i16; + int32_t *i32; + int64_t *i64; char *pfx; /* set default result */ @@ -99,23 +103,23 @@ static void orte_ns_base_quick_print(char **output, char *type_name, char *prefi switch(src_size) { case 1: - ui8 = (uint8_t*)src; - asprintf(output, "%sData type: %s\tValue: %d", pfx, type_name, (int) *ui8); + i8 = (int8_t*)src; + asprintf(output, "%sData type: %s\tValue: %d", pfx, type_name, (int) *i8); break; case 2: - ui16 = (uint16_t*)src; - asprintf(output, "%sData type: %s\tValue: %d", pfx, type_name, (int) *ui16); + i16 = (int16_t*)src; + asprintf(output, "%sData type: %s\tValue: %d", pfx, type_name, (int) *i16); break; case 4: - ui32 = (uint32_t*)src; - asprintf(output, "%sData type: %s\tValue: %lu", pfx, type_name, (unsigned long) *ui32); + i32 = (int32_t*)src; + asprintf(output, "%sData type: %s\tValue: %ld", pfx, type_name, (long) *i32); break; case 8: - ui64 = (uint64_t*)src; - asprintf(output, "%sData type: %s\tValue: %lu", pfx, type_name, (unsigned long) *ui64); + i64 = (int64_t*)src; + asprintf(output, "%sData type: %s\tValue: %ld", pfx, type_name, (long) *i64); break; default: diff --git a/orte/mca/ns/base/data_type_support/ns_data_type_release_fns.c b/orte/mca/ns/base/data_type_support/ns_data_type_release_fns.c index 8fb49b3e4b..16ebe91117 100644 --- a/orte/mca/ns/base/data_type_support/ns_data_type_release_fns.c +++ b/orte/mca/ns/base/data_type_support/ns_data_type_release_fns.c @@ -24,7 +24,7 @@ #include "orte/dss/dss.h" #include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/ns/base/base.h" +#include "orte/mca/ns/base/ns_private.h" /* * STANDARD 
RELEASE FUNCTION - WORKS FOR EVERYTHING NON-STRUCTURED diff --git a/orte/mca/ns/base/data_type_support/ns_data_type_size_fns.c b/orte/mca/ns/base/data_type_support/ns_data_type_size_fns.c index 5d298e23c3..2abbe6630e 100755 --- a/orte/mca/ns/base/data_type_support/ns_data_type_size_fns.c +++ b/orte/mca/ns/base/data_type_support/ns_data_type_size_fns.c @@ -24,7 +24,7 @@ #include "orte/dss/dss.h" #include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/ns/base/base.h" +#include "orte/mca/ns/base/ns_private.h" /* * STANDARD SIZE FUNCTION - WORKS FOR EVERYTHING NON-STRUCTURED @@ -44,6 +44,10 @@ int orte_ns_base_std_size(size_t *size, void *src, orte_data_type_t type) *size = sizeof(orte_cellid_t); break; + case ORTE_NODEID: + *size = sizeof(orte_nodeid_t); + break; + case ORTE_NAME: *size = sizeof(orte_process_name_t); break; diff --git a/orte/mca/ns/base/data_type_support/ns_data_type_unpacking_fns.c b/orte/mca/ns/base/data_type_support/ns_data_type_unpacking_fns.c index 27df2c64da..9506c04652 100644 --- a/orte/mca/ns/base/data_type_support/ns_data_type_unpacking_fns.c +++ b/orte/mca/ns/base/data_type_support/ns_data_type_unpacking_fns.c @@ -26,7 +26,7 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/dss/dss_internal.h" -#include "orte/mca/ns/base/base.h" +#include "orte/mca/ns/base/ns_private.h" /* * NAME @@ -123,14 +123,29 @@ int orte_ns_base_unpack_cellid(orte_buffer_t *buffer, void *dest, int ret; /* Turn around and unpack the real type */ - if (ORTE_SUCCESS != ( - ret = orte_dss_unpack_buffer(buffer, dest, num_vals, ORTE_STD_CNTR_T))) { + if (ORTE_SUCCESS != (ret = orte_dss_unpack_buffer(buffer, dest, num_vals, ORTE_CELLID_T))) { ORTE_ERROR_LOG(ret); } return ret; } +/* + * NODEID + */ +int orte_ns_base_unpack_nodeid(orte_buffer_t *buffer, void *dest, + orte_std_cntr_t *num_vals, orte_data_type_t type) +{ + int ret; + + /* Turn around and unpack the real type */ + if (ORTE_SUCCESS != (ret = orte_dss_unpack_buffer(buffer, dest, num_vals, ORTE_NODEID_T))) { + 
ORTE_ERROR_LOG(ret); + } + + return ret; +} + /* * JOBID */ @@ -140,8 +155,7 @@ int orte_ns_base_unpack_jobid(orte_buffer_t *buffer, void *dest, int ret; /* Turn around and unpack the real type */ - if (ORTE_SUCCESS != ( - ret = orte_dss_unpack_buffer(buffer, dest, num_vals, ORTE_STD_CNTR_T))) { + if (ORTE_SUCCESS != (ret = orte_dss_unpack_buffer(buffer, dest, num_vals, ORTE_JOBID_T))) { ORTE_ERROR_LOG(ret); } @@ -157,8 +171,7 @@ int orte_ns_base_unpack_vpid(orte_buffer_t *buffer, void *dest, int ret; /* Turn around and unpack the real type */ - if (ORTE_SUCCESS != ( - ret = orte_dss_unpack_buffer(buffer, dest, num_vals, ORTE_STD_CNTR_T))) { + if (ORTE_SUCCESS != (ret = orte_dss_unpack_buffer(buffer, dest, num_vals, ORTE_VPID_T))) { ORTE_ERROR_LOG(ret); } diff --git a/orte/mca/ns/base/ns_base_cell_fns.c b/orte/mca/ns/base/ns_base_cell_fns.c new file mode 100644 index 0000000000..d4f9d8b543 --- /dev/null +++ b/orte/mca/ns/base/ns_base_cell_fns.c @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/** @file:
+ *
+ */
+
+#include "orte_config.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <stddef.h>
+#include <stdlib.h>
+#if HAVE_NETINET_IN_H
+#include <netinet/in.h>
+#endif
+
+#include "orte/orte_constants.h"
+
+#include "opal/util/output.h"
+#include "opal/util/printf.h"
+#include "opal/mca/mca.h"
+
+#include "orte/mca/schema/schema_types.h"
+#include "orte/mca/errmgr/errmgr.h"
+
+#include "orte/mca/ns/base/ns_private.h"
+
+/*
+ * "not available" functions
+ *
+ * Each one resets its output arguments, logs ORTE_ERR_UNREACH and
+ * returns that same code.
+ */
+int
+orte_ns_base_create_cellid_not_available(orte_cellid_t *cellid, char *site, char *resource)
+{
+    *cellid = ORTE_CELLID_INVALID;
+    ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
+    return ORTE_ERR_UNREACH;
+}
+
+int
+orte_ns_base_get_cell_info_not_available(orte_cellid_t cellid,
+                                         char **site, char **resource)
+{
+    *site = NULL;
+    *resource = NULL;
+    ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
+    return ORTE_ERR_UNREACH;
+}
+
+int
+orte_ns_base_create_nodeids_not_available(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes, orte_cellid_t cellid, char **nodename)
+{
+    *nodeids = NULL;
+    *nnodes = 0;
+    ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
+    return ORTE_ERR_UNREACH;
+}
+
+int
+orte_ns_base_get_node_info_not_available(char ***nodenames, orte_cellid_t cellid,
+                                         orte_std_cntr_t num_nodeids, orte_nodeid_t *nodeids)
+{
+    *nodenames = NULL;
+    ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
+    return ORTE_ERR_UNREACH;
+}
+
+
+/**** LOCAL HELPERS FOR THE STRING CONVERTERS ****/
+
+/* Duplicate a schema token into *out.  strdup() returns NULL when it
+ * cannot allocate, and that must not be reported as success.
+ */
+static int ns_base_cell_dup_token(char **out, const char *token)
+{
+    *out = strdup(token);
+    if (NULL == *out) {
+        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+        return ORTE_ERR_OUT_OF_RESOURCE;
+    }
+    return ORTE_SUCCESS;
+}
+
+/* Render a numeric id as decimal text in a newly allocated string. */
+static int ns_base_cell_print_id(char **out, long id)
+{
+    if (0 > asprintf(out, "%ld", id)) {
+        /* do not hand back whatever a failed asprintf() left behind */
+        *out = NULL;
+        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+        return ORTE_ERR_OUT_OF_RESOURCE;
+    }
+    return ORTE_SUCCESS;
+}
+
+
+/**** CELL STRING FUNCTIONS ****/
+
+/* Produce the string form of the cellid field of a process name.
+ *
+ * On success *cellid_string is a newly allocated string: the schema
+ * wildcard or invalid token for those two reserved values, otherwise the
+ * decimal cellid.  On any failure *cellid_string is NULL.
+ *
+ * Returns ORTE_SUCCESS, ORTE_ERR_BAD_PARAM (NULL name) or
+ * ORTE_ERR_OUT_OF_RESOURCE (allocation failed).
+ */
+int orte_ns_base_get_cellid_string(char **cellid_string, const orte_process_name_t* name)
+{
+    if (NULL == name) { /* got an error */
+        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
+        *cellid_string = NULL;
+        return ORTE_ERR_BAD_PARAM;
+    }
+
+    /* check for wildcard value - handle appropriately */
+    if (ORTE_CELLID_WILDCARD == name->cellid) {
+        return ns_base_cell_dup_token(cellid_string, ORTE_SCHEMA_WILDCARD_STRING);
+    }
+
+    /* check for invalid value - handle appropriately */
+    if (ORTE_CELLID_INVALID == name->cellid) {
+        return ns_base_cell_dup_token(cellid_string, ORTE_SCHEMA_INVALID_STRING);
+    }
+
+    return ns_base_cell_print_id(cellid_string, (long) name->cellid);
+}
+
+
+/* Same conversion as orte_ns_base_get_cellid_string(), starting from a
+ * bare cellid instead of a process name.
+ */
+int orte_ns_base_convert_cellid_to_string(char **cellid_string, const orte_cellid_t cellid)
+{
+    /* check for wildcard value - handle appropriately */
+    if (ORTE_CELLID_WILDCARD == cellid) {
+        return ns_base_cell_dup_token(cellid_string, ORTE_SCHEMA_WILDCARD_STRING);
+    }
+
+    /* check for invalid value - handle appropriately */
+    if (ORTE_CELLID_INVALID == cellid) {
+        return ns_base_cell_dup_token(cellid_string, ORTE_SCHEMA_INVALID_STRING);
+    }
+
+    return ns_base_cell_print_id(cellid_string, (long) cellid);
+}
+
+
+/* Parse a cellid from its string form.
+ *
+ * Accepts the schema wildcard token, the schema invalid token, or a
+ * decimal number within [ORTE_CELLID_MIN, ORTE_CELLID_MAX].  Anything
+ * else, including a NULL string or one that does not start with a
+ * number, stores ORTE_CELLID_INVALID and returns ORTE_ERR_BAD_PARAM.
+ */
+int orte_ns_base_convert_string_to_cellid(orte_cellid_t *cellid, const char *cellidstring)
+{
+    long int tmpint;
+    char *endptr;
+
+    if (NULL == cellidstring) {
+        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
+        *cellid = ORTE_CELLID_INVALID;
+        return ORTE_ERR_BAD_PARAM;
+    }
+
+    /** check for wildcard string - handle appropriately */
+    if (0 == strcmp(ORTE_SCHEMA_WILDCARD_STRING, cellidstring)) {
+        *cellid = ORTE_CELLID_WILDCARD;
+        return ORTE_SUCCESS;
+    }
+
+    /** check for invalid string - handle appropriately */
+    if (0 == strcmp(ORTE_SCHEMA_INVALID_STRING, cellidstring)) {
+        *cellid = ORTE_CELLID_INVALID;
+        return ORTE_SUCCESS;
+    }
+
+    tmpint = strtol(cellidstring, &endptr, 10);
+
+    /* strtol() returns 0 and leaves endptr at the start when it finds no
+     * digits; without the endptr test such input would become cellid 0
+     */
+    if (endptr != cellidstring &&
+        ORTE_CELLID_MAX >= tmpint && ORTE_CELLID_MIN <= tmpint) {
+        *cellid = (orte_cellid_t)tmpint;
+    } else {
+        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
+        *cellid = ORTE_CELLID_INVALID;
+        return ORTE_ERR_BAD_PARAM;
+    }
+
+    return ORTE_SUCCESS;
+}
+
+
+/**** NODEID STRING FUNCTIONS ****/
+
+/* Produce the string form of a nodeid: the schema wildcard or invalid
+ * token for those two reserved values, otherwise the decimal nodeid.
+ * *string is NULL on failure.
+ */
+int orte_ns_base_convert_nodeid_to_string(char **string, const orte_nodeid_t nodeid)
+{
+    *string = NULL;
+
+    /* check for wildcard value - handle appropriately */
+    if (ORTE_NODEID_WILDCARD == nodeid) {
+        return ns_base_cell_dup_token(string, ORTE_SCHEMA_WILDCARD_STRING);
+    }
+
+    /* check for invalid value - handle appropriately */
+    if (ORTE_NODEID_INVALID == nodeid) {
+        return ns_base_cell_dup_token(string, ORTE_SCHEMA_INVALID_STRING);
+    }
+
+    return ns_base_cell_print_id(string, (long)nodeid);
+}
+
+
+/* Parse a nodeid from its string form; same rules as
+ * orte_ns_base_convert_string_to_cellid() with the NODEID constants.
+ */
+int orte_ns_base_convert_string_to_nodeid(orte_nodeid_t *nodeid, const char* string)
+{
+    long int tmpint;
+    char *endptr;
+
+    if (NULL == string) { /* got an error */
+        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
+        *nodeid = ORTE_NODEID_INVALID;
+        return ORTE_ERR_BAD_PARAM;
+    }
+
+    /** check for wildcard character - handle appropriately */
+    if (0 == strcmp(ORTE_SCHEMA_WILDCARD_STRING, string)) {
+        *nodeid = ORTE_NODEID_WILDCARD;
+        return ORTE_SUCCESS;
+    }
+
+    /* check for invalid value */
+    if (0 == strcmp(ORTE_SCHEMA_INVALID_STRING, string)) {
+        *nodeid = ORTE_NODEID_INVALID;
+        return ORTE_SUCCESS;
+    }
+
+    tmpint = strtol(string, &endptr, 10);
+
+    /* reject input with no leading number instead of reading it as 0 */
+    if (endptr != string &&
+        ORTE_NODEID_MAX >= tmpint && ORTE_NODEID_MIN <= tmpint) {
+        *nodeid = (orte_nodeid_t)tmpint;
+    } else {
+        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
+        *nodeid = ORTE_NODEID_INVALID;
+        return ORTE_ERR_BAD_PARAM;
+    }
+
+    return ORTE_SUCCESS;
+}
+
diff --git a/orte/mca/ns/base/ns_base_diag_fns.c b/orte/mca/ns/base/ns_base_diag_fns.c
new file mode 100644
index 0000000000..734ed4cfea
--- /dev/null
+++ b/orte/mca/ns/base/ns_base_diag_fns.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation. All rights reserved.
+ * Copyright (c) 2004-2005 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation. All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/** @file:
+ *
+ */
+
+#include "orte_config.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <stddef.h>
+#include <stdlib.h>
+#if HAVE_NETINET_IN_H
+#include <netinet/in.h>
+#endif
+
+#include "orte/orte_constants.h"
+
+#include "opal/util/output.h"
+#include "opal/util/printf.h"
+#include "opal/mca/mca.h"
+
+#include "orte/dss/dss.h"
+#include "orte/mca/errmgr/errmgr.h"
+
+#include "orte/mca/ns/base/base.h"
+#include "orte/mca/ns/base/ns_private.h"
+
+/*
+ * "not available" functions
+ *
+ * The four dump entry points below take no arguments; each only logs
+ * ORTE_ERR_UNREACH and returns it.
+ */
+
+int
+orte_ns_base_dump_cells_not_available(void)
+{
+    ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
+    return ORTE_ERR_UNREACH;
+}
+
+int
+orte_ns_base_dump_jobs_not_available(void)
+{
+    ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
+    return ORTE_ERR_UNREACH;
+}
+
+int
+orte_ns_base_dump_tags_not_available(void)
+{
+    ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
+    return ORTE_ERR_UNREACH;
+}
+
+int
+orte_ns_base_dump_datatypes_not_available(void)
+{
+    ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
+    return ORTE_ERR_UNREACH;
+}
+
+/**** DIAGNOSTIC FUNCTIONS ****/
+
+/* Write every string held in a dump buffer to mca_ns_base_output.
+ *
+ * As long as orte_dss.peek() succeeds on the buffer, one item is
+ * unpacked as an ORTE_STRING, sent to the output stream and freed.
+ *
+ * Returns ORTE_SUCCESS once peek() stops succeeding, or the error code
+ * from a failed unpack (which is also logged).
+ */
+int orte_ns_base_print_dump(orte_buffer_t *buffer)
+{
+    orte_data_type_t next_type;
+    orte_std_cntr_t count;
+    char *text;
+    int rc;
+
+    /* the count is reset to 1 before every peek, as unpack may change it */
+    for (count = 1;
+         ORTE_SUCCESS == orte_dss.peek(buffer, &next_type, &count);
+         count = 1) {
+        rc = orte_dss.unpack(buffer, &text, &count, ORTE_STRING);
+        if (ORTE_SUCCESS != rc) {
+            ORTE_ERROR_LOG(rc);
+            return rc;
+        }
+        opal_output(mca_ns_base_output, "%s", text);
+        free(text);
+    }
+
+    return ORTE_SUCCESS;
+}
+
diff --git a/orte/mca/ns/base/ns_base_general_fns.c b/orte/mca/ns/base/ns_base_general_fns.c
new file mode 100644
index 0000000000..884fbeaa1b
--- /dev/null
+++ b/orte/mca/ns/base/ns_base_general_fns.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation. All rights reserved.
+ * Copyright (c) 2004-2005 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation. All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/** @file:
+ *
+ */
+
+#include "orte_config.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <stddef.h>
+#include <stdlib.h>
+#if HAVE_NETINET_IN_H
+#include <netinet/in.h>
+#endif
+
+#include "orte/orte_constants.h"
+
+#include "opal/util/output.h"
+#include "opal/util/printf.h"
+#include "opal/mca/mca.h"
+
+#include "orte/mca/errmgr/errmgr.h"
+
+#include "orte/mca/ns/base/ns_private.h"
+
+/*
+ * "not available" functions
+ *
+ * Every function in this file follows the same pattern: store a
+ * sentinel in each output argument (if any), log ORTE_ERR_UNREACH and
+ * return that code.  Input arguments are never read.
+ */
+
+/* Module init placeholder: no outputs, always ORTE_ERR_UNREACH. */
+int
+orte_ns_base_module_init_not_available(void)
+{
+    ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
+    return ORTE_ERR_UNREACH;
+}
+
+/* RML tag assignment placeholder: *tag is set to ORTE_RML_TAG_MAX. */
+int
+orte_ns_base_assign_rml_tag_not_available(orte_rml_tag_t *tag, char *name)
+{
+    *tag = ORTE_RML_TAG_MAX;
+    ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
+    return ORTE_ERR_UNREACH;
+}
+
+/* Data type definition placeholder: *type is set to ORTE_DSS_ID_MAX. */
+int
+orte_ns_base_define_data_type_not_available(
+                                const char *name,
+                                orte_data_type_t *type)
+{
+    *type = ORTE_DSS_ID_MAX;
+    ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
+    return ORTE_ERR_UNREACH;
+}
+
+/**** GET PEERS ****/
+/* Peer lookup placeholder: reports an empty result (*procs NULL,
+ * *num_procs 0) along with ORTE_ERR_UNREACH.
+ */
+int orte_ns_base_get_peers_not_available(orte_process_name_t **procs,
+                                         orte_std_cntr_t *num_procs, opal_list_t *attributes)
+{
+    *procs = NULL;
+    *num_procs = 0;
+    ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
+    return ORTE_ERR_UNREACH;
+}
diff --git a/orte/mca/ns/base/ns_base_job_fns.c b/orte/mca/ns/base/ns_base_job_fns.c
new file mode 100644
index 0000000000..f15eb7498f
--- /dev/null
+++ b/orte/mca/ns/base/ns_base_job_fns.c
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation. All rights reserved.
+ * Copyright (c) 2004-2005 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation. All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/** @file:
+ *
+ */
+
+#include "orte_config.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <stddef.h>
+#include <stdlib.h>
+#if HAVE_NETINET_IN_H
+#include <netinet/in.h>
+#endif
+
+#include "orte/orte_constants.h"
+
+#include "opal/util/output.h"
+#include "opal/util/printf.h"
+#include "opal/mca/mca.h"
+
+#include "orte/mca/schema/schema_types.h"
+#include "orte/mca/errmgr/errmgr.h"
+
+#include "orte/mca/ns/base/ns_private.h"
+
+/*
+ * "not available" functions
+ *
+ * Each one stores a sentinel in its output argument(s), logs
+ * ORTE_ERR_UNREACH and returns that code.  Input arguments are never
+ * read.
+ */
+
+/* Jobid creation placeholder: *jobid is set to ORTE_JOBID_INVALID. */
+int
+orte_ns_base_create_jobid_not_available(orte_jobid_t *jobid, opal_list_t *attrs)
+{
+    *jobid = ORTE_JOBID_INVALID;
+    ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
+    return ORTE_ERR_UNREACH;
+}
+
+/* Vpid range placeholder: *startvpid is set to ORTE_VPID_INVALID. */
+int
+orte_ns_base_get_vpid_range_not_available(orte_jobid_t job,
+                                          orte_vpid_t range,
+                                          orte_vpid_t *startvpid)
+{
+    *startvpid = ORTE_VPID_INVALID;
+    ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
+    return ORTE_ERR_UNREACH;
+}
+
+/* Descendants lookup placeholder: reports an empty list
+ * (*descendants NULL, *num_desc 0).
+ */
+int orte_ns_base_get_job_descendants_not_available(orte_jobid_t** descendants,
+                                                   orte_std_cntr_t *num_desc,
+                                                   orte_jobid_t job)
+{
+    *descendants = NULL;
+    *num_desc = 0;
+    ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
+    return ORTE_ERR_UNREACH;
+}
+
+/* Children lookup placeholder: reports an empty list
+ * (*children NULL, *num_childs 0).
+ */
+int orte_ns_base_get_job_children_not_available(orte_jobid_t** children,
+                                                orte_std_cntr_t *num_childs,
+                                                orte_jobid_t job)
+{
+    *children = NULL;
+    *num_childs = 0;
+    ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
+    return ORTE_ERR_UNREACH;
+}
+
+/* Root job lookup placeholder: *root_job is set to ORTE_JOBID_INVALID. */
+int orte_ns_base_get_root_job_not_available(orte_jobid_t *root_job, orte_jobid_t job)
+{
+    *root_job = ORTE_JOBID_INVALID;
+    ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
+    return ORTE_ERR_UNREACH;
+}
+
+/* Parent job lookup placeholder: stores ORTE_JOBID_INVALID in *parent,
+ * logs ORTE_ERR_UNREACH and returns it.  The job argument is not read.
+ */
+int orte_ns_base_get_parent_job_not_available(orte_jobid_t *parent, orte_jobid_t job)
+{
+    *parent = ORTE_JOBID_INVALID;
+    ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
+    return ORTE_ERR_UNREACH;
+}
+
+
+/**** JOB STRING FUNCTIONS ****/
+
+/* Shared jobid -> string conversion used by both public converters.
+ *
+ * The wildcard jobid becomes a copy of ORTE_SCHEMA_WILDCARD_STRING; any
+ * other value is printed as a signed decimal.  *out is NULL whenever
+ * the return value is not ORTE_SUCCESS.
+ */
+static int ns_base_jobid_to_string(char **out, orte_jobid_t jobid)
+{
+    /* check for wildcard value - handle appropriately */
+    if (ORTE_JOBID_WILDCARD == jobid) {
+        *out = strdup(ORTE_SCHEMA_WILDCARD_STRING);
+        if (NULL == *out) {
+            /* strdup() failed: do not report success with a NULL string */
+            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+            return ORTE_ERR_OUT_OF_RESOURCE;
+        }
+        return ORTE_SUCCESS;
+    }
+
+    if (0 > asprintf(out, "%ld", (long) jobid)) {
+        /* do not hand back whatever a failed asprintf() left behind */
+        *out = NULL;
+        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+        return ORTE_ERR_OUT_OF_RESOURCE;
+    }
+
+    return ORTE_SUCCESS;
+}
+
+
+/* Produce the string form of the jobid field of a process name.
+ *
+ * Returns ORTE_SUCCESS with a newly allocated *jobid_string,
+ * ORTE_ERR_BAD_PARAM for a NULL name, or ORTE_ERR_OUT_OF_RESOURCE when
+ * allocation fails; *jobid_string is NULL in both error cases.
+ */
+int orte_ns_base_get_jobid_string(char **jobid_string, const orte_process_name_t* name)
+{
+    if (NULL == name) { /* got an error */
+        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
+        *jobid_string = NULL;
+        return ORTE_ERR_BAD_PARAM;
+    }
+
+    return ns_base_jobid_to_string(jobid_string, name->jobid);
+}
+
+
+/* Produce the string form of a bare jobid; see ns_base_jobid_to_string(). */
+int orte_ns_base_convert_jobid_to_string(char **jobid_string, const orte_jobid_t jobid)
+{
+    return ns_base_jobid_to_string(jobid_string, jobid);
+}
+
+
+/* Parse a jobid from its string form.
+ *
+ * Accepts the schema wildcard token (which the converters above emit
+ * for ORTE_JOBID_WILDCARD), the schema invalid token, or a signed
+ * decimal number.  A number equal to ORTE_JOBID_INVALID is accepted and
+ * returned as such; other numbers must lie within
+ * [ORTE_JOBID_MIN, ORTE_JOBID_MAX].  Anything else, including a NULL
+ * string or one that does not start with a number, stores
+ * ORTE_JOBID_INVALID and returns ORTE_ERR_BAD_PARAM.
+ */
+int orte_ns_base_convert_string_to_jobid(orte_jobid_t *jobid, const char* jobidstring)
+{
+    long int tmpint;
+    char *endptr;
+
+    if (NULL == jobidstring) { /* got an error */
+        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
+        *jobid = ORTE_JOBID_INVALID;
+        return ORTE_ERR_BAD_PARAM;
+    }
+
+    /* check for wildcard string - without this, the text produced for a
+     * wildcard jobid would be parsed as the number 0
+     */
+    if (0 == strcmp(ORTE_SCHEMA_WILDCARD_STRING, jobidstring)) {
+        *jobid = ORTE_JOBID_WILDCARD;
+        return ORTE_SUCCESS;
+    }
+
+    /* check for invalid string - same handling as the cellid/nodeid parsers */
+    if (0 == strcmp(ORTE_SCHEMA_INVALID_STRING, jobidstring)) {
+        *jobid = ORTE_JOBID_INVALID;
+        return ORTE_SUCCESS;
+    }
+
+    /* the value is kept in a signed long, so parse it as signed */
+    tmpint = strtol(jobidstring, &endptr, 10);
+
+    /* no digits at all: strtol() returned 0 without converting anything */
+    if (endptr == jobidstring) {
+        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
+        *jobid = ORTE_JOBID_INVALID;
+        return ORTE_ERR_BAD_PARAM;
+    }
+
+    /* check for invalid value */
+    if (ORTE_JOBID_INVALID == tmpint) {
+        *jobid = ORTE_JOBID_INVALID;
+        return ORTE_SUCCESS;
+    }
+
+    if (ORTE_JOBID_MAX >= tmpint && ORTE_JOBID_MIN <= tmpint) {
+        *jobid = (orte_jobid_t)tmpint;
+    } else {
+        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
+        *jobid = ORTE_JOBID_INVALID;
+        return ORTE_ERR_BAD_PARAM;
+    }
+
+    return ORTE_SUCCESS;
+}
+
+
diff --git a/orte/mca/ns/base/ns_base_local_fns.c
b/orte/mca/ns/base/ns_base_local_fns.c deleted file mode 100644 index 80dbd28b95..0000000000 --- a/orte/mca/ns/base/ns_base_local_fns.c +++ /dev/null @@ -1,678 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** @file: - * - */ - -#include "orte_config.h" - -#include <stdio.h> -#include <string.h> -#include <stddef.h> -#include <stdlib.h> -#if HAVE_NETINET_IN_H -#include <netinet/in.h> -#endif - -#include "orte/orte_constants.h" - -#include "opal/util/output.h" -#include "opal/util/printf.h" -#include "opal/mca/mca.h" - -#include "orte/mca/errmgr/errmgr.h" - -#include "orte/mca/ns/base/base.h" - -/** - * globals - */ - - -/* - * "not available" functions - */ -int -orte_ns_base_module_init_not_available(void) -{ - ORTE_ERROR_LOG(ORTE_ERR_UNREACH); - return ORTE_ERR_UNREACH; -} - -int -orte_ns_base_create_cellid_not_available(orte_cellid_t *cellid, char *site, char *resource) -{ - *cellid = ORTE_CELLID_MAX; - ORTE_ERROR_LOG(ORTE_ERR_UNREACH); - return ORTE_ERR_UNREACH; -} - -int -orte_ns_base_get_cell_info_not_available(orte_cellid_t cellid, - char **site, char **resource) -{ - *site = NULL; - *resource = NULL; - ORTE_ERROR_LOG(ORTE_ERR_UNREACH); - return ORTE_ERR_UNREACH; -} - -int -orte_ns_base_create_jobid_not_available(orte_jobid_t *jobid) -{ - *jobid = ORTE_JOBID_MAX; - ORTE_ERROR_LOG(ORTE_ERR_UNREACH); - return ORTE_ERR_UNREACH; -} - -int -orte_ns_base_get_vpid_range_not_available(orte_jobid_t job, - orte_vpid_t 
range, - orte_vpid_t *startvpid) -{ - *startvpid = ORTE_VPID_MAX; - ORTE_ERROR_LOG(ORTE_ERR_UNREACH); - return ORTE_ERR_UNREACH; -} - -int -orte_ns_base_assign_rml_tag_not_available(orte_rml_tag_t *tag, char *name) -{ - *tag = ORTE_RML_TAG_MAX; - ORTE_ERROR_LOG(ORTE_ERR_UNREACH); - return ORTE_ERR_UNREACH; -} - -int -orte_ns_base_define_data_type_not_available( - const char *name, - orte_data_type_t *type) -{ - *type = ORTE_DSS_ID_MAX; - ORTE_ERROR_LOG(ORTE_ERR_UNREACH); - return ORTE_ERR_UNREACH; -} - -int -orte_ns_base_create_my_name_not_available(void) -{ - ORTE_ERROR_LOG(ORTE_ERR_UNREACH); - return ORTE_ERR_UNREACH; -} - -int orte_ns_base_get_job_peers_not_available(orte_process_name_t **procs, - orte_std_cntr_t *num_procs, orte_jobid_t job) -{ - *procs = NULL; - *num_procs = 0; - ORTE_ERROR_LOG(ORTE_ERR_UNREACH); - return ORTE_ERR_UNREACH; -} - -int -orte_ns_base_dump_cells_not_available(void) -{ - ORTE_ERROR_LOG(ORTE_ERR_UNREACH); - return ORTE_ERR_UNREACH; -} - -int -orte_ns_base_dump_jobs_not_available(void) -{ - ORTE_ERROR_LOG(ORTE_ERR_UNREACH); - return ORTE_ERR_UNREACH; -} - -int -orte_ns_base_dump_tags_not_available(void) -{ - ORTE_ERROR_LOG(ORTE_ERR_UNREACH); - return ORTE_ERR_UNREACH; -} - -int -orte_ns_base_dump_datatypes_not_available(void) -{ - ORTE_ERROR_LOG(ORTE_ERR_UNREACH); - return ORTE_ERR_UNREACH; -} - - - -/* - * functions - */ - -int orte_ns_base_assign_cellid_to_process(orte_process_name_t *name) -{ - if (NULL == name) { - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - - name->cellid = 0; - return ORTE_SUCCESS; -} - - -int orte_ns_base_create_process_name(orte_process_name_t **name, - orte_cellid_t cell, - orte_jobid_t job, - orte_vpid_t vpid) -{ - *name = NULL; - - if (ORTE_CELLID_MAX < cell || - ORTE_JOBID_MAX < job || - ORTE_VPID_MAX < vpid) { - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - - *name = (orte_process_name_t*)malloc(sizeof(orte_process_name_t)); - if (NULL == *name) { /* got 
an error */ - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - (*name)->cellid = cell; - (*name)->jobid = job; - (*name)->vpid = vpid; - return ORTE_SUCCESS; -} - -int orte_ns_base_derive_vpid(orte_vpid_t *vpid, orte_vpid_t base_vpid, int offset) -{ - *vpid = base_vpid + (orte_vpid_t)offset; - - return ORTE_SUCCESS; -} - - -int orte_ns_base_copy_process_name(orte_process_name_t **dest, - orte_process_name_t* src) -{ - orte_cellid_t cell; - orte_jobid_t job; - orte_vpid_t vpid; - int rc; - - *dest = NULL; - - if (NULL == src) { - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - - if (ORTE_SUCCESS != orte_ns_base_get_cellid(&cell, src)) { - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - if (ORTE_SUCCESS != orte_ns_base_get_jobid(&job, src)) { - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - if (ORTE_SUCCESS != orte_ns_base_get_vpid(&vpid, src)) { - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - - if (ORTE_SUCCESS != (rc = orte_ns_base_create_process_name(dest, cell, job, vpid))) { - ORTE_ERROR_LOG(rc); - } - - return rc; -} - -int orte_ns_base_get_proc_name_string(char **name_string, - const orte_process_name_t* name) -{ - *name_string = NULL; - - if (NULL == name) { /* got an error */ - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - - if (0 > asprintf(name_string, "%lu.%lu.%lu", ORTE_NAME_ARGS(name))) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - return ORTE_SUCCESS; -} - -int orte_ns_base_convert_string_to_process_name(orte_process_name_t **name, - const char* name_string) -{ - char *temp, *token; - orte_cellid_t cell; - orte_jobid_t job; - orte_vpid_t vpid; - unsigned long int tmpint; - int return_code=ORTE_SUCCESS; - - const char delimiters[] = "."; - - *name = NULL; - - /* check for NULL string - error */ - if (NULL == name_string) { - 
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - - temp = strdup(name_string); - token = strtok(temp, delimiters); /* get first field -> cellid */ - - /* convert to largest possible unsigned int - unsigned long long is only supported - * in C99, so we have to use unsigned long for backward compatibility - then - * check to ensure it is within range of cellid_t before casting */ - - tmpint = strtoul(token, NULL, 10); - if (ORTE_CELLID_MAX >= tmpint) { - cell = (orte_cellid_t)tmpint; - } else { - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return_code = ORTE_ERR_BAD_PARAM; - goto CLEANUP; - } - - token = strtok(NULL, delimiters); /* get second field -> jobid */ - - /* convert to largest possible unsigned int - then - * check to ensure it is within range of jobid_t before casting */ - - tmpint = strtoul(token, NULL, 10); - if (ORTE_JOBID_MAX >= tmpint) { - job = (orte_jobid_t)tmpint; - } else { - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return_code = ORTE_ERR_BAD_PARAM; - goto CLEANUP; - } - - token = strtok(NULL, delimiters); /* get third field -> vpid */ - - /* convert to largest possible unsigned int then - * check to ensure it is within range of vpid_t before casting */ - - tmpint = strtoul(token, NULL, 10); - if (ORTE_VPID_MAX >= tmpint) { - vpid = (orte_vpid_t)tmpint; - } else { - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return_code = ORTE_ERR_BAD_PARAM; - goto CLEANUP; - } - - if (ORTE_SUCCESS != (return_code = orte_ns_base_create_process_name(name, cell, job, vpid))) { - ORTE_ERROR_LOG(return_code); - } - - CLEANUP: - if (temp) { - free(temp); - } - - return return_code; -} - - -int orte_ns_base_get_vpid_string(char **vpid_string, const orte_process_name_t* name) -{ - *vpid_string = NULL; - - if (NULL == name) { /* got an error */ - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - - if (0 > asprintf(vpid_string, "%lu", (unsigned long) name->vpid)) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - 
} - - return ORTE_SUCCESS; -} - - -int orte_ns_base_convert_vpid_to_string(char **vpid_string, const orte_vpid_t vpid) -{ - *vpid_string = NULL; - - if (0 > asprintf(vpid_string, "%lu", (unsigned long) vpid)) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - return ORTE_SUCCESS; -} - - -int orte_ns_base_convert_string_to_vpid(orte_vpid_t *vpid, const char* vpidstring) -{ - unsigned long int tmpint; - - *vpid = ORTE_VPID_MAX; - - if (NULL == vpidstring) { /* got an error */ - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - - tmpint = strtoul(vpidstring, NULL, 10); - if (ORTE_VPID_MAX >= tmpint) { - *vpid = (orte_vpid_t)tmpint; - } else { - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - *vpid = ORTE_VPID_MAX; - return ORTE_ERR_BAD_PARAM; - } - - return ORTE_SUCCESS; -} - - -int orte_ns_base_get_jobid_string(char **jobid_string, const orte_process_name_t* name) -{ - *jobid_string = NULL; - - if (NULL == name) { /* got an error */ - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - - if (0 > asprintf(jobid_string, "%lu", (unsigned long) name->jobid)) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - return ORTE_SUCCESS; -} - - -int orte_ns_base_convert_jobid_to_string(char **jobid_string, const orte_jobid_t jobid) -{ - *jobid_string = NULL; - - if (0 > asprintf(jobid_string, "%lu", (unsigned long) jobid)) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - return ORTE_SUCCESS; -} - - -int orte_ns_base_convert_string_to_jobid(orte_jobid_t *jobid, const char* jobidstring) -{ - unsigned long int tmpint; - - *jobid = ORTE_JOBID_MAX; - - if (NULL == jobidstring) { /* got an error */ - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - - tmpint = strtoul(jobidstring, NULL, 10); - if (ORTE_JOBID_MAX >= tmpint) { - *jobid = (orte_jobid_t)tmpint; - } else { - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - *jobid = 
ORTE_JOBID_MAX; - return ORTE_ERR_BAD_PARAM; - } - - return ORTE_SUCCESS; -} - - -int orte_ns_base_get_cellid_string(char **cellid_string, const orte_process_name_t* name) -{ - *cellid_string = NULL; - - if (NULL == name) { /* got an error */ - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - - if (0 > asprintf(cellid_string, "%lu", (unsigned long) name->cellid)) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - return ORTE_SUCCESS; -} - - -int orte_ns_base_convert_cellid_to_string(char **cellid_string, const orte_cellid_t cellid) -{ - *cellid_string = NULL; - - if (0 > asprintf(cellid_string, "%lu", (unsigned long) cellid)) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - return ORTE_SUCCESS; -} - - -int orte_ns_base_convert_string_to_cellid(orte_cellid_t *cellid, const char *cellidstring) -{ - unsigned long int tmpint; - - *cellid = ORTE_CELLID_MAX; - - if (NULL == cellidstring) { - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - - tmpint = strtoul(cellidstring, NULL, 10); - if (ORTE_CELLID_MAX >= tmpint) { - *cellid = (orte_cellid_t)tmpint; - } else { - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - *cellid = ORTE_CELLID_MAX; - return ORTE_ERR_BAD_PARAM; - } - - return ORTE_SUCCESS; -} - - -int orte_ns_base_get_vpid(orte_vpid_t *vpid, const orte_process_name_t* name) -{ - *vpid = ORTE_VPID_MAX; - - if (NULL == name) { /* got an error */ - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - - *vpid = name->vpid; - - return ORTE_SUCCESS; -} - - -int orte_ns_base_get_jobid(orte_jobid_t *jobid, const orte_process_name_t* name) -{ - *jobid = ORTE_JOBID_MAX; - - if (NULL == name) { /* got an error */ - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - - *jobid = name->jobid; - - return ORTE_SUCCESS; -} - -int orte_ns_base_get_cellid(orte_cellid_t *cellid, const orte_process_name_t* name) -{ - *cellid = 
ORTE_CELLID_MAX; - - if (NULL == name) { /* got an error */ - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - - *cellid = name->cellid; - - return ORTE_SUCCESS; -} - - -int orte_ns_base_compare(orte_ns_cmp_bitmask_t fields, - const orte_process_name_t* name1, - const orte_process_name_t* name2) -{ - if (NULL == name1 && NULL == name2) { - return 0; - } else if (NULL == name1) { - return -1; - } else if (NULL == name2) { - return 1; - } - - if (ORTE_NS_CMP_CELLID & fields) { /* check cellid field */ - if (name1->cellid < name2->cellid) { - return -1; - } else if (name1->cellid > name2->cellid) { - return 1; - } - } - - /* get here if cellid's are equal, or cellid not being checked */ - /* now check job id */ - - if (ORTE_NS_CMP_JOBID & fields) { - if (name1->jobid < name2->jobid) { - return -1; - } else if (name1->jobid > name2->jobid) { - return 1; - } - } - - /* get here if cellid's and jobid's are equal, or neither being checked, - * or cellid not checked and jobid's equal. 
- * now check vpid - */ - - if (ORTE_NS_CMP_VPID & fields) { - if (name1->vpid < name2->vpid) { - return -1; - } else if (name1->vpid > name2->vpid) { - return 1; - } - } - - /* only way to get here is if all fields are being checked and are equal, - * or cellid not checked, but jobid and vpid equal, - * or cellid and jobid not checked, but vpid equal, - * only vpid being checked, and equal - * return that fact - */ - return 0; -} - - -int orte_ns_base_free_name(orte_process_name_t **name) -{ - if (NULL != name && NULL != *name) { - free(*name); - } - - *name = NULL; - - return ORTE_SUCCESS; -} - -int orte_ns_base_get_peers(orte_process_name_t **procs, - orte_std_cntr_t *num_procs, orte_std_cntr_t *self) -{ - orte_std_cntr_t i; - int rc; - orte_cellid_t mycellid; - orte_jobid_t myjobid; - orte_vpid_t myvpid; - - *procs = (orte_process_name_t*)malloc(orte_process_info.num_procs * - sizeof(orte_process_name_t)); - if (NULL == *procs) { - return ORTE_ERR_OUT_OF_RESOURCE; - } - - if (ORTE_SUCCESS != (rc = orte_ns.get_cellid(&mycellid, orte_process_info.my_name))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - if (ORTE_SUCCESS != orte_ns.get_jobid(&myjobid, orte_process_info.my_name)) { - ORTE_ERROR_LOG(rc); - return rc; - } - - if (ORTE_SUCCESS != orte_ns.get_vpid(&myvpid, orte_process_info.my_name)) { - ORTE_ERROR_LOG(rc); - return rc; - } - - for (i=0; i < orte_process_info.num_procs; i++) { - (*procs)[i].cellid = mycellid; - (*procs)[i].jobid = myjobid; - (*procs)[i].vpid = orte_process_info.vpid_start + i; - } - - *num_procs = orte_process_info.num_procs; - *self = (orte_std_cntr_t)(myvpid - orte_process_info.vpid_start); - - return ORTE_SUCCESS; -} - - -/* - * DIAGNOSTIC FUNCTIONS - */ -int orte_ns_base_print_dump(orte_buffer_t *buffer) -{ - char *line; - orte_std_cntr_t n; - orte_data_type_t type; - int rc; - - n = 1; - while (ORTE_SUCCESS == orte_dss.peek(buffer, &type, &n)) { - if (ORTE_SUCCESS != - (rc = orte_dss.unpack(buffer, &line, &n, ORTE_STRING))) { - 
ORTE_ERROR_LOG(rc); - return rc; - } - opal_output(mca_ns_base_output, "%s", line); - free(line); - n=1; - } - - return ORTE_SUCCESS; -} - diff --git a/orte/mca/ns/base/ns_base_open.c b/orte/mca/ns/base/ns_base_open.c index e6c49b0975..0a0f110c82 100644 --- a/orte/mca/ns/base/ns_base_open.c +++ b/orte/mca/ns/base/ns_base_open.c @@ -29,6 +29,7 @@ #include "orte/dss/dss.h" #include "orte/mca/ns/base/base.h" +#include "orte/mca/ns/base/ns_private.h" /* @@ -43,7 +44,9 @@ * globals */ -orte_process_name_t orte_name_all = {ORTE_CELLID_MAX, ORTE_JOBID_MAX, ORTE_VPID_MAX}; +orte_process_name_t orte_ns_name_wildcard = {ORTE_CELLID_WILDCARD, ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD}; +orte_process_name_t orte_ns_name_invalid = {ORTE_CELLID_INVALID, ORTE_JOBID_INVALID, ORTE_VPID_INVALID}; +orte_process_name_t orte_ns_name_my_hnp = {0, 0, 0}; /* * Global variables @@ -54,35 +57,37 @@ mca_ns_base_module_t orte_ns = { orte_ns_base_module_init_not_available, /* cell functions */ orte_ns_base_create_cellid_not_available, - orte_ns_base_get_cellid, orte_ns_base_get_cell_info_not_available, - orte_ns_base_assign_cellid_to_process, orte_ns_base_get_cellid_string, orte_ns_base_convert_cellid_to_string, orte_ns_base_convert_string_to_cellid, + /* node functions */ + orte_ns_base_create_nodeids_not_available, + orte_ns_base_get_node_info_not_available, + orte_ns_base_convert_nodeid_to_string, + orte_ns_base_convert_string_to_nodeid, /* jobid functions */ orte_ns_base_create_jobid_not_available, - orte_ns_base_get_jobid, + orte_ns_base_get_job_descendants_not_available, + orte_ns_base_get_job_children_not_available, + orte_ns_base_get_root_job_not_available, + orte_ns_base_get_parent_job_not_available, orte_ns_base_get_jobid_string, orte_ns_base_convert_jobid_to_string, orte_ns_base_convert_string_to_jobid, - /* vpid functions */ orte_ns_base_get_vpid_range_not_available, - orte_ns_base_get_vpid, + /* vpid functions */ orte_ns_base_get_vpid_string, orte_ns_base_convert_vpid_to_string, 
orte_ns_base_convert_string_to_vpid, /* name functions */ orte_ns_base_create_process_name, orte_ns_base_create_my_name_not_available, - orte_ns_base_copy_process_name, orte_ns_base_convert_string_to_process_name, - orte_ns_base_free_name, orte_ns_base_get_proc_name_string, - orte_ns_base_compare, + orte_ns_base_compare_fields, /* peer functions */ - orte_ns_base_get_peers, - orte_ns_base_get_job_peers_not_available, + orte_ns_base_get_peers_not_available, /* tag server functions */ orte_ns_base_assign_rml_tag_not_available, /* data type functions */ diff --git a/orte/mca/ns/base/ns_base_vpid_name_fns.c b/orte/mca/ns/base/ns_base_vpid_name_fns.c new file mode 100644 index 0000000000..4d5465335c --- /dev/null +++ b/orte/mca/ns/base/ns_base_vpid_name_fns.c @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** @file: + * + */ + +#include "orte_config.h" + +#include <stdio.h> +#include <string.h> +#include <stddef.h> +#include <stdlib.h> +#if HAVE_NETINET_IN_H +#include <netinet/in.h> +#endif + +#include "orte/orte_constants.h" + +#include "opal/util/output.h" +#include "opal/util/printf.h" +#include "opal/mca/mca.h" + +#include "orte/mca/schema/schema_types.h" +#include "orte/mca/errmgr/errmgr.h" + +#include "orte/mca/ns/base/ns_private.h" + +/* + * "not available" functions + */ +int +orte_ns_base_create_my_name_not_available(void) +{ + ORTE_ERROR_LOG(ORTE_ERR_UNREACH); + return ORTE_ERR_UNREACH; +} + +/**** NAME STRING FUNCTIONS ****/ + +int orte_ns_base_get_proc_name_string(char **name_string, + const orte_process_name_t* name) +{ + char *tmp, *tmp2; + + if (NULL == name) { /* got an error */ + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + return ORTE_ERR_BAD_PARAM; + } + + /* handle the more typical case where none of the fields + * contain WILDCARD or INVALID values + */ + if ((ORTE_CELLID_WILDCARD != name->cellid && ORTE_CELLID_INVALID != name->cellid) && + (ORTE_JOBID_WILDCARD != name->jobid && ORTE_JOBID_INVALID != name->jobid) && + (ORTE_VPID_WILDCARD != name->vpid && ORTE_VPID_INVALID != name->vpid)) { + if (0 > asprintf(name_string, "%ld%c%ld%c%ld", (long)name->cellid, + ORTE_SCHEMA_DELIMITER_CHAR, (long)name->jobid, + ORTE_SCHEMA_DELIMITER_CHAR, (long)name->vpid)) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + return ORTE_SUCCESS; + } + + /* okay, now handle the corner cases */ + if (ORTE_CELLID_WILDCARD == name->cellid) { + tmp = strdup(ORTE_SCHEMA_WILDCARD_STRING); + } else if (ORTE_CELLID_INVALID == name->cellid) { + tmp = strdup(ORTE_SCHEMA_INVALID_STRING); + } else { + asprintf(&tmp, "%ld", (long)name->cellid); + } + + if (ORTE_JOBID_WILDCARD == name->jobid) { + asprintf(&tmp2, "%s%c%s", tmp, ORTE_SCHEMA_DELIMITER_CHAR, + 
ORTE_SCHEMA_WILDCARD_STRING, ORTE_SCHEMA_DELIMITER_CHAR); + } else if (ORTE_JOBID_INVALID == name->jobid) { + asprintf(&tmp2, "%s%c%s", tmp, ORTE_SCHEMA_DELIMITER_CHAR, + ORTE_SCHEMA_INVALID_STRING, ORTE_SCHEMA_DELIMITER_CHAR); + } else { + asprintf(&tmp2, "%s%c%ld", tmp, ORTE_SCHEMA_DELIMITER_CHAR, + (long)name->jobid, ORTE_SCHEMA_DELIMITER_CHAR); + } + free(tmp); + + if (ORTE_VPID_WILDCARD == name->vpid) { + asprintf(name_string, "%s%c%s", tmp2, ORTE_SCHEMA_DELIMITER_CHAR, + ORTE_SCHEMA_WILDCARD_STRING); + } else if (ORTE_VPID_INVALID == name->vpid) { + asprintf(name_string, "%s%c%s", tmp2, ORTE_SCHEMA_DELIMITER_CHAR, + ORTE_SCHEMA_INVALID_STRING); + } else { + asprintf(name_string, "%s%c%ld", tmp2, ORTE_SCHEMA_DELIMITER_CHAR, + (long)name->vpid); + } + free(tmp2); + + return ORTE_SUCCESS; +} + +int orte_ns_base_convert_string_to_process_name(orte_process_name_t **name, + const char* name_string) +{ + char *temp, *token; + orte_cellid_t cell; + orte_jobid_t job; + orte_vpid_t vpid; + long int tmpint; + int return_code=ORTE_SUCCESS; + + /* check for NULL string - error */ + if (NULL == name_string) { + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + return ORTE_ERR_BAD_PARAM; + } + + temp = strdup(name_string); /** copy input string as the strtok process is destructive */ + token = strtok(temp, ORTE_SCHEMA_DELIMITER_STRING); /** get first field -> cellid */ + + /* check for error */ + if (NULL == token) { + return ORTE_ERR_BAD_PARAM; + } + + /* convert to largest possible int - then + * check to ensure it is within range of cellid_t before casting + */ + + /* first, though, check for WILDCARD character - assign + * value accordingly, if found + */ + if (0 == strcmp(token, ORTE_SCHEMA_WILDCARD_STRING)) { + cell = ORTE_CELLID_WILDCARD; + } else if (0 == strcmp(token, ORTE_SCHEMA_INVALID_STRING)) { + cell = ORTE_CELLID_INVALID; + } else { + tmpint = strtol(token, NULL, 10); + if (ORTE_CELLID_MAX >= tmpint && ORTE_CELLID_MIN <= tmpint) { + cell = (orte_cellid_t)tmpint; + } else 
{ + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + return_code = ORTE_ERR_BAD_PARAM; + goto CLEANUP; + } + } + + token = strtok(NULL, ORTE_SCHEMA_DELIMITER_STRING); /** get next field -> jobid */ + + /** convert to largest possible int - then + * check to ensure it is within range of jobid_t before casting */ + + /* check for error */ + if (NULL == token) { + return ORTE_ERR_BAD_PARAM; + } + + /** first, though, check for WILDCARD character - assign + * value accordingly, if found + */ + if (0 == strcmp(token, ORTE_SCHEMA_WILDCARD_STRING)) { + job = ORTE_JOBID_WILDCARD; + } else if (0 == strcmp(token, ORTE_SCHEMA_INVALID_STRING)) { + job = ORTE_JOBID_INVALID; + } else { + tmpint = strtol(token, NULL, 10); + if (ORTE_JOBID_MAX >= tmpint && ORTE_JOBID_MIN <= tmpint) { + job = (orte_jobid_t)tmpint; + } else { + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + return_code = ORTE_ERR_BAD_PARAM; + goto CLEANUP; + } + } + + token = strtok(NULL, ORTE_SCHEMA_DELIMITER_STRING); /** get next field -> vpid */ + + /* check for error */ + if (NULL == token) { + return ORTE_ERR_BAD_PARAM; + } + + /** convert to largest possible int then + * check to ensure it is within range of vpid_t before casting */ + + /** first, though, check for WILDCARD character - assign + * value accordingly, if found + */ + if (0 == strcmp(token, ORTE_SCHEMA_WILDCARD_STRING)) { + vpid = ORTE_VPID_WILDCARD; + } else if (0 == strcmp(token, ORTE_SCHEMA_INVALID_STRING)) { + vpid = ORTE_VPID_INVALID; + } else { + tmpint = strtol(token, NULL, 10); + if (ORTE_VPID_MAX >= tmpint && ORTE_VPID_MIN <= tmpint) { + vpid = (orte_vpid_t)tmpint; + } else { + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + return_code = ORTE_ERR_BAD_PARAM; + goto CLEANUP; + } + } + + if (ORTE_SUCCESS != (return_code = + orte_ns_base_create_process_name(name, cell, job, vpid))) { + ORTE_ERROR_LOG(return_code); + } + +CLEANUP: + free(temp); + + return return_code; +} + +/**** CREATE PROCESS NAME ****/ +int orte_ns_base_create_process_name(orte_process_name_t **name, + 
orte_cellid_t cell, + orte_jobid_t job, + orte_vpid_t vpid) +{ + *name = NULL; + + *name = (orte_process_name_t*)malloc(sizeof(orte_process_name_t)); + if (NULL == *name) { /* got an error */ + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + (*name)->cellid = cell; + (*name)->jobid = job; + (*name)->vpid = vpid; + return ORTE_SUCCESS; +} + + +/**** VPID STRING FUNCTIONS ****/ +int orte_ns_base_get_vpid_string(char **vpid_string, const orte_process_name_t* name) +{ + if (NULL == name) { /* got an error */ + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + *vpid_string = NULL; + return ORTE_ERR_BAD_PARAM; + } + + /* check for wildcard value - handle appropriately */ + if (ORTE_VPID_WILDCARD == name->vpid) { + *vpid_string = strdup(ORTE_SCHEMA_WILDCARD_STRING); + return ORTE_SUCCESS; + } + + /* check for invalid value - handle appropriately */ + if (ORTE_VPID_INVALID == name->vpid) { + *vpid_string = strdup(ORTE_SCHEMA_INVALID_STRING); + return ORTE_SUCCESS; + } + + if (0 > asprintf(vpid_string, "%ld", (long) name->vpid)) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + return ORTE_SUCCESS; +} + + +int orte_ns_base_convert_vpid_to_string(char **vpid_string, const orte_vpid_t vpid) +{ + /* check for wildcard value - handle appropriately */ + if (ORTE_VPID_WILDCARD == vpid) { + *vpid_string = strdup(ORTE_SCHEMA_WILDCARD_STRING); + return ORTE_SUCCESS; + } + + /* check for invalid value - handle appropriately */ + if (ORTE_VPID_INVALID == vpid) { + *vpid_string = strdup(ORTE_SCHEMA_INVALID_STRING); + return ORTE_SUCCESS; + } + + if (0 > asprintf(vpid_string, "%ld", (long) vpid)) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + return ORTE_SUCCESS; +} + + +int orte_ns_base_convert_string_to_vpid(orte_vpid_t *vpid, const char* vpidstring) +{ + long int tmpint; + + if (NULL == vpidstring) { /* got an error */ + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + *vpid = 
ORTE_VPID_INVALID; + return ORTE_ERR_BAD_PARAM; + } + + /** check for wildcard character - handle appropriately */ + if (0 == strcmp(ORTE_SCHEMA_WILDCARD_STRING, vpidstring)) { + *vpid = ORTE_VPID_WILDCARD; + return ORTE_SUCCESS; + } + + /* check for invalid value */ + if (0 == strcmp(ORTE_SCHEMA_INVALID_STRING, vpidstring)) { + *vpid = ORTE_VPID_INVALID; + return ORTE_SUCCESS; + } + + tmpint = strtol(vpidstring, NULL, 10); + + if (ORTE_VPID_MAX >= tmpint && ORTE_VPID_MIN <= tmpint) { + *vpid = (orte_vpid_t)tmpint; + } else { + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + *vpid = ORTE_VPID_INVALID; + return ORTE_ERR_BAD_PARAM; + } + + return ORTE_SUCCESS; +} + +/**** COMPARE NAME FIELDS ****/ +int orte_ns_base_compare_fields(orte_ns_cmp_bitmask_t fields, + const orte_process_name_t* name1, + const orte_process_name_t* name2) +{ + /* handle the NULL pointer case */ + if (NULL == name1 && NULL == name2) { + return ORTE_EQUAL; + } else if (NULL == name1) { + return ORTE_VALUE2_GREATER; + } else if (NULL == name2) { + return ORTE_VALUE1_GREATER; + } + + /* in this comparison function, we check for exact equalities. + * In the case of wildcards, we check to ensure that the fields + * actually match those values - thus, a "wildcard" in this + * function does not actually stand for a wildcard value, but + * rather a specific value + */ + if (ORTE_NS_CMP_CELLID & fields) { /* check cellid field */ + if (name1->cellid < name2->cellid) { + return ORTE_VALUE2_GREATER; + } else if (name1->cellid > name2->cellid) { + return ORTE_VALUE1_GREATER; + } + } + + /* get here if cellid's are equal, or cellid not being checked */ + /* now check job id */ + + if (ORTE_NS_CMP_JOBID & fields) { + if (name1->jobid < name2->jobid) { + return ORTE_VALUE2_GREATER; + } else if (name1->jobid > name2->jobid) { + return ORTE_VALUE1_GREATER; + } + } + + /* get here if cellid's and jobid's are equal, or neither being checked, + * or cellid not checked and jobid's equal. 
+ * now check vpid + */ + + if (ORTE_NS_CMP_VPID & fields) { + if (name1->vpid < name2->vpid) { + return ORTE_VALUE2_GREATER; + } else if (name1->vpid > name2->vpid) { + return ORTE_VALUE1_GREATER; + } + } + + /* only way to get here is if all fields are being checked and are equal, + * or cellid not checked, but jobid and vpid equal, + * or cellid and jobid not checked, but vpid equal, + * only vpid being checked, and equal + * return that fact + */ + return ORTE_EQUAL; +} diff --git a/orte/mca/ns/base/ns_private.h b/orte/mca/ns/base/ns_private.h new file mode 100644 index 0000000000..9cfadf206a --- /dev/null +++ b/orte/mca/ns/base/ns_private.h @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** @file: + */ + +#ifndef MCA_NS_PRIVATE_H +#define MCA_NS_PRIVATE_H + +/* + * includes + */ +#include "orte_config.h" +#include "orte/orte_constants.h" + +#include "opal/class/opal_list.h" +#include "opal/mca/mca.h" + +#include "orte/dss/dss_types.h" +#include "orte/mca/ns/ns_types.h" +#include "orte/mca/rml/rml_types.h" + +/* + * Global functions for MCA overall collective open and close + */ +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif + +/* default limits */ +#define ORTE_NS_ARRAY_MAX_SIZE INT_MAX +#define ORTE_NS_ARRAY_BLOCK_SIZE 512 +/* + * Internal definitions + */ +typedef uint8_t orte_ns_cmd_bitmask_t; +typedef uint8_t orte_ns_cmd_flag_t; + +/* + * packing type definitions + */ +/* CAUTION - any changes here must also change corresponding + * typedefs above and in ns_types.h + */ +#define ORTE_NS_CMD ORTE_INT8 +#define ORTE_CELLID_T ORTE_INT32 +#define ORTE_NODEID_T ORTE_INT32 +#define ORTE_JOBID_T ORTE_INT32 +#define ORTE_VPID_T ORTE_INT32 + +/* + * define flag values for remote commands - only used internally + */ +#define ORTE_NS_CREATE_CELLID_CMD (int8_t) 1 +#define ORTE_NS_GET_CELL_INFO_CMD (int8_t) 2 +#define ORTE_NS_CREATE_NODEID_CMD (int8_t) 3 +#define ORTE_NS_GET_NODE_INFO_CMD (int8_t) 4 +#define ORTE_NS_CREATE_JOBID_CMD (int8_t) 5 +#define ORTE_NS_GET_JOB_DESC_CMD (int8_t) 6 +#define ORTE_NS_GET_JOB_CHILD_CMD (int8_t) 7 +#define ORTE_NS_GET_ROOT_JOB_CMD (int8_t) 8 +#define ORTE_NS_GET_PARENT_JOB_CMD (int8_t) 9 +#define ORTE_NS_RESERVE_RANGE_CMD (int8_t) 10 +#define ORTE_NS_ASSIGN_OOB_TAG_CMD (int8_t) 11 +#define ORTE_NS_GET_PEERS_CMD (int8_t) 12 +#define ORTE_NS_DEFINE_DATA_TYPE_CMD (int8_t) 13 +#define ORTE_NS_CREATE_MY_NAME_CMD (int8_t) 14 +#define ORTE_NS_DUMP_CELLS_CMD (int8_t) 15 +#define ORTE_NS_DUMP_JOBIDS_CMD (int8_t) 16 +#define ORTE_NS_DUMP_TAGS_CMD (int8_t) 17 +#define ORTE_NS_DUMP_DATATYPES_CMD (int8_t) 18 + + +/* + * Base 
functions that are common to all implementations - can be overridden + */ + +ORTE_DECLSPEC int orte_ns_base_create_process_name(orte_process_name_t **name, + orte_cellid_t cell, + orte_jobid_t job, + orte_vpid_t vpid); + +ORTE_DECLSPEC int orte_ns_base_convert_string_to_process_name(orte_process_name_t **name, + const char* name_string); + +ORTE_DECLSPEC int orte_ns_base_get_proc_name_string(char **name_string, + const orte_process_name_t* name); + +ORTE_DECLSPEC int orte_ns_base_get_vpid_string(char **vpid_string, const orte_process_name_t* name); + +ORTE_DECLSPEC int orte_ns_base_convert_vpid_to_string(char **vpid_string, const orte_vpid_t vpid); + +ORTE_DECLSPEC int orte_ns_base_convert_string_to_vpid(orte_vpid_t *vpid, const char* vpidstring); + +ORTE_DECLSPEC int orte_ns_base_get_jobid_string(char **jobid_string, const orte_process_name_t* name); + +ORTE_DECLSPEC int orte_ns_base_convert_jobid_to_string(char **jobid_string, const orte_jobid_t jobid); + +ORTE_DECLSPEC int orte_ns_base_convert_string_to_jobid(orte_jobid_t *jobid, const char* jobidstring); + +ORTE_DECLSPEC int orte_ns_base_get_cellid_string(char **cellid_string, const orte_process_name_t* name); + +ORTE_DECLSPEC int orte_ns_base_convert_string_to_cellid(orte_cellid_t *cellid, const char *cellidstring); + +ORTE_DECLSPEC int orte_ns_base_convert_cellid_to_string(char **cellid_string, const orte_cellid_t cellid); + +ORTE_DECLSPEC int orte_ns_base_get_vpid(orte_vpid_t *vpid, const orte_process_name_t* name); + +ORTE_DECLSPEC int orte_ns_base_get_jobid(orte_jobid_t *jobid, const orte_process_name_t* name); + +ORTE_DECLSPEC int orte_ns_base_get_cellid(orte_cellid_t *cellid, const orte_process_name_t* name); + +ORTE_DECLSPEC int orte_ns_base_convert_string_to_nodeid(orte_nodeid_t *cellid, const char *string); + +ORTE_DECLSPEC int orte_ns_base_convert_nodeid_to_string(char **nodeid_string, const orte_nodeid_t nodeid); + +ORTE_DECLSPEC int orte_ns_base_compare_fields(orte_ns_cmp_bitmask_t fields, + const 
orte_process_name_t* name1, + const orte_process_name_t* name2); + +ORTE_DECLSPEC int orte_ns_base_print_dump(orte_buffer_t *buffer); + + +/* not available functions */ +ORTE_DECLSPEC int orte_ns_base_module_init_not_available(void); + +ORTE_DECLSPEC int orte_ns_base_create_cellid_not_available(orte_cellid_t *cellid, + char *site, char *resource); + +ORTE_DECLSPEC int orte_ns_base_get_cell_info_not_available(orte_cellid_t cellid, + char **site, char **resource); + +ORTE_DECLSPEC int orte_ns_base_create_nodeids_not_available(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes, + orte_cellid_t cellid, char **nodename); + +ORTE_DECLSPEC int orte_ns_base_get_node_info_not_available(char ***nodename, orte_cellid_t cellid, + orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids); + +ORTE_DECLSPEC int orte_ns_base_create_jobid_not_available(orte_jobid_t *jobid, opal_list_t *attrs); + +ORTE_DECLSPEC int orte_ns_base_get_job_descendants_not_available(orte_jobid_t** descendants, + orte_std_cntr_t *num_desc, + orte_jobid_t job); + +ORTE_DECLSPEC int orte_ns_base_get_job_children_not_available(orte_jobid_t** children, + orte_std_cntr_t *num_childs, + orte_jobid_t job); + +ORTE_DECLSPEC int orte_ns_base_get_root_job_not_available(orte_jobid_t *root_job, orte_jobid_t job); + +ORTE_DECLSPEC int orte_ns_base_get_parent_job_not_available(orte_jobid_t *parent, orte_jobid_t job); + +ORTE_DECLSPEC int orte_ns_base_get_vpid_range_not_available(orte_jobid_t job, + orte_vpid_t range, + orte_vpid_t *startvpid); + +ORTE_DECLSPEC int orte_ns_base_assign_rml_tag_not_available(orte_rml_tag_t *tag, char *name); + +ORTE_DECLSPEC int orte_ns_base_define_data_type_not_available( + const char *name, + orte_data_type_t *type); + +ORTE_DECLSPEC int orte_ns_base_create_my_name_not_available(void); + +ORTE_DECLSPEC int orte_ns_base_get_peers_not_available(orte_process_name_t **procs, + orte_std_cntr_t *num_procs, opal_list_t *attributes); + +ORTE_DECLSPEC int orte_ns_base_dump_cells_not_available(void); 
+ORTE_DECLSPEC int orte_ns_base_dump_jobs_not_available(void); +ORTE_DECLSPEC int orte_ns_base_dump_tags_not_available(void); +ORTE_DECLSPEC int orte_ns_base_dump_datatypes_not_available(void); + +/* Base functions used everywhere */ +ORTE_DECLSPEC int orte_ns_base_pack_name(orte_buffer_t *buffer, void *src, + orte_std_cntr_t num_vals, orte_data_type_t type); + +ORTE_DECLSPEC int orte_ns_base_pack_cellid(orte_buffer_t *buffer, void *src, + orte_std_cntr_t num_vals, orte_data_type_t type); + +ORTE_DECLSPEC int orte_ns_base_pack_nodeid(orte_buffer_t *buffer, void *src, + orte_std_cntr_t num_vals, orte_data_type_t type); + +ORTE_DECLSPEC int orte_ns_base_pack_jobid(orte_buffer_t *buffer, void *src, + orte_std_cntr_t num_vals, orte_data_type_t type); + +ORTE_DECLSPEC int orte_ns_base_pack_vpid(orte_buffer_t *buffer, void *src, + orte_std_cntr_t num_vals, orte_data_type_t type); + +ORTE_DECLSPEC int orte_ns_base_unpack_name(orte_buffer_t *buffer, void *dest, + orte_std_cntr_t *num_vals, orte_data_type_t type); + +ORTE_DECLSPEC int orte_ns_base_unpack_cellid(orte_buffer_t *buffer, void *dest, + orte_std_cntr_t *num_vals, orte_data_type_t type); + +ORTE_DECLSPEC int orte_ns_base_unpack_nodeid(orte_buffer_t *buffer, void *dest, + orte_std_cntr_t *num_vals, orte_data_type_t type); + +ORTE_DECLSPEC int orte_ns_base_unpack_jobid(orte_buffer_t *buffer, void *dest, + orte_std_cntr_t *num_vals, orte_data_type_t type); + +ORTE_DECLSPEC int orte_ns_base_unpack_vpid(orte_buffer_t *buffer, void *dest, + orte_std_cntr_t *num_vals, orte_data_type_t type); + +/* + * copy functions + */ + +int orte_ns_base_copy_name(orte_process_name_t **dest, orte_process_name_t *src, orte_data_type_t type); + +int orte_ns_base_copy_vpid(orte_vpid_t **dest, orte_vpid_t *src, orte_data_type_t type); + +int orte_ns_base_copy_cellid(orte_cellid_t **dest, orte_cellid_t *src, orte_data_type_t type); + +int orte_ns_base_copy_nodeid(orte_nodeid_t **dest, orte_nodeid_t *src, orte_data_type_t type); + +int 
orte_ns_base_copy_jobid(orte_jobid_t **dest, orte_jobid_t *src, orte_data_type_t type); + +/* + * compare functions + */ + +int orte_ns_base_compare_name(orte_process_name_t *value1, + orte_process_name_t *value2, + orte_data_type_t type); + + +int orte_ns_base_compare_vpid(orte_vpid_t *value1, + orte_vpid_t *value2, + orte_data_type_t type); + +int orte_ns_base_compare_jobid(orte_jobid_t *value1, + orte_jobid_t *value2, + orte_data_type_t type); + +int orte_ns_base_compare_cellid(orte_cellid_t *value1, + orte_cellid_t *value2, + orte_data_type_t type); + +int orte_ns_base_compare_nodeid(orte_nodeid_t *value1, + orte_nodeid_t *value2, + orte_data_type_t type); + +/* + * size functions + */ + +int orte_ns_base_std_size(size_t *size, void *src, orte_data_type_t type); + +/* + * release functions + */ + +void orte_ns_base_std_release(orte_data_value_t *value); + +/* + * print functions + */ + +int orte_ns_base_std_print(char **output, char *prefix, void *src, orte_data_type_t type); + +int orte_ns_base_print_name(char **output, char *prefix, orte_process_name_t *name, orte_data_type_t type); + + +/* + * external API functions will be documented in the mca/ns/ns.h file + */ + +#if defined(c_plusplus) || defined(__cplusplus) +} +#endif +#endif diff --git a/orte/mca/ns/ns.h b/orte/mca/ns/ns.h index db893efb27..4086f61e6a 100644 --- a/orte/mca/ns/ns.h +++ b/orte/mca/ns/ns.h @@ -39,7 +39,7 @@ #include "orte/dss/dss.h" #include "opal/mca/mca.h" -#include "orte/mca/oob/oob_types.h" +#include "orte/mca/rml/rml_types.h" #include "ns_types.h" @@ -59,26 +59,18 @@ typedef int (*orte_ns_base_module_init_fn_t)(void); /**** CELL FUNCTIONS ****/ /** * Create a new cell id. - * The create_cellid() function allocates a new cell id for use by the caller. - * The function checks to find the next available cell id, reserves it, and returns that - * number. No memory for names is allocated by this process. 
The range of answers is from - * 1 to MCA_NS_BASE_CELLID_MAX-1 (zero is reserved for the seed name and cannot therefore be - * allocated). + * Allocates a new cell id for use by the caller. The function returns an + * existing cellid if the specified site/resource already has been assigned + * one. * * @param site The name of the site where the cell is located. * @param resource The name of the resource associated with this cell (e.g., the name * of the cluster). - * @param cellid The numerical value of the allocated cell id. A value of - * MCA_NS_BASE_CELLID_MAX indicates - * that an error occurred - this represents a very unlikely - * event meaning that the system ran out of cell id's. This probably indicates - * an error in the calling program as the number of available cell id's is extremely large. + * @param cellid The location where the cellid is to be stored. * * @retval ORTE_SUCCESS A cellid was created and returned. * @retval ORTE_ERROR_VALUE An error code indicative of the problem. * - * @code - * new_cellid = ompi_name_server.create_cellid() * @endcode */ typedef int (*orte_ns_base_module_create_cellid_fn_t)(orte_cellid_t *cellid, @@ -97,34 +89,6 @@ typedef int (*orte_ns_base_module_create_cellid_fn_t)(orte_cellid_t *cellid, typedef int (*orte_ns_base_module_get_cell_info_fn_t)(orte_cellid_t cellid, char **site, char **resource); -/** - * Get the cell id for a process. - * The cellid designator represents the physical location of the process - it is associated with - * the hardware/system where the process is executing. Each process name contains this identifier - * so that the system can issue commands (e.g., "die") to a collection of processes that are - * executing on a common platform. - * - * Given that usage, it is necessary that the system have a way of telling a process its cellid. - * The create_cellid() function is used by the system to associate a "cellid" identifier with - * each platform. 
This function - assign_cellid_to_process() - is used to inform the process - * of its cellid. - * - * Given a process name, this function will lookup its current platform and update the name with the - * cellid. - * - * @param name Pointer to an ompi_process_name structure. The function will update the cellid - * entry in the structure. - * - * @retval ORTE_SUCCESS Update was successful. - * @retval OMPI_ERROR Update failed, most likely due to either a NULL process name pointer or the - * inability to locate the process name in the lookup table. - * - * @code - * return_value = ompi_name_server.assign_cellid_to_process(ompi_process_name_t* name); - * @endcode - */ -typedef int (*orte_ns_base_module_assign_cellid_to_process_fn_t)(orte_process_name_t* name); - /** * Get the cell id as a character string. * The get_cellid_string() function returns the cell id in a character string @@ -181,50 +145,105 @@ typedef int (*orte_ns_base_module_get_cellid_string_fn_t)(char **cellid_string, typedef int (*orte_ns_base_module_convert_string_to_cellid_fn_t)(orte_cellid_t *cellid, const char *cellidstring); -/** - * Get the cell id as a numberic value. - * The get_cellid() function returns the cell id in a numeric representation - - * i.e., in an integer form. - * - * @param *name A pointer to the name structure containing the name. - * - * @retval cellid The cell id field of the provided name. - * @retval MCA_NS_BASE_CELLID_MAX Indicates that an error occurred - in this case, that - * the name variable provided was NULL. - * - * @code - * cellid = ompi_name_server.get_cellid(&name) - * @endcode +/**** NODE FUNCTIONS ****/ +/* + * Get an array of node id's + * Given the cell and a NULL-terminated array of names of nodes within it, this function assigns an id to represent + * each node within the cell. 
*/ -typedef int (*orte_ns_base_module_get_cellid_fn_t)(orte_cellid_t *cellid, const orte_process_name_t* name); +typedef int (*orte_ns_base_module_create_nodeids_fn_t)(orte_nodeid_t **nodes, orte_std_cntr_t *nnodes, + orte_cellid_t cellid, char **nodename); + +/* + * Get node info + * Retrieve the names of an array of nodes given their cellid and nodeids. The cellid + * is required as the nodeids are only unique within a given cell. + * + * @param cellid The id of the cell containing the nodes. + * @param nodeids The ids of the nodes. + * @param nodename Returns a pointer to a NULL-terminated array of strdup'd strings containing the node names. + * @retval ORTE_SUCCESS The node names were created and returned. + * @retval ORTE_ERROR_VALUE An error code indicative of the problem. + */ +typedef int (*orte_ns_base_module_get_node_info_fn_t)(char ***nodename, orte_cellid_t cellid, + orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids); + +/* + * Convert nodeid to character string + * Returns the nodeid in a character string representation. The string is created + * by expressing the provided nodeid in decimal. Memory for the string is + * allocated by the function - releasing that allocation is the responsibility of + * the calling program. + * + * @param nodeid The nodeid to be converted. + * + * @param *nodeid_string A pointer to a character string representation of the nodeid. + * @retval ORTE_SUCCESS The string was created and returned. + * @retval ORTE_ERROR_VALUE An error code indicative of the problem. + */ +typedef int (*orte_ns_base_module_convert_nodeid_to_string_fn_t)(char **nodeid_string, const orte_nodeid_t nodeid); + +/* + * Convert a string to a nodeid. + * Converts a character string into a nodeid. The character string must be a + * decimal representation of a valid nodeid. + * + * @param nodeidstring The string to be converted. + * + * @param nodeid A pointer to a location where the resulting nodeid is to be stored. 
+ * @retval ORTE_SUCCESS The string was converted and the nodeid stored. + * @retval ORTE_ERROR_VALUE An error code indicative of the problem. + */ +typedef int (*orte_ns_base_module_convert_string_to_nodeid_fn_t)(orte_nodeid_t *nodeid, const char *nodeidstring); /**** JOB ID FUNCTIONS ****/ /** * Create a new job id. - * The create_jobid() function allocates a new job id for use by the caller. - * The function checks to find the next available job id, reserves it, and returns that - * number. No memory for names is allocated by this process. The range of answers is from - * 1 to MCA_NS_BASE_JOBID_MAX-1 (zero is reserved for the seed name and cannot therefore be - * allocated). - - * + * Allocate a new job id for use by the caller. + * * The 0 job id is reserved for daemons within the system and will not be allocated. * Developers should therefore assume that the daemon job id is automatically allocated * and proceed to request names against it. * * @param None - * @retval jobid The numerical value of the allocated job id. A value of - * MCA_NS_BASE_JOBID_MAX indicates - * that an error occurred - this represents a very unlikely - * event meaning that the system ran out of job id's. This probably indicates - * an error in the calling program as the number of available job id's is extremely large. - * - * @code - * new_jobid = ompi_name_server.create_jobid() - * @endcode + * @param jobid A pointer to the location where the jobid is to be returned. + * @param attrs A list of attributes that describe any conditions to be placed on + * the assigned jobid. For example, specifying USE_PARENT indicates that the specified + * jobid is to be identified as the parent of the new jobid. USE_ROOT indicates that + * the root of the job family of the specified jobid is to be identified as the parent. 
*/ -typedef int (*orte_ns_base_module_create_jobid_fn_t)(orte_jobid_t *jobid); +typedef int (*orte_ns_base_module_create_jobid_fn_t)(orte_jobid_t *jobid, opal_list_t *attrs); + +/* + * Get job descendants + * Given a jobid, return the array of jobids that descend from this one. + */ +typedef int (*orte_ns_base_module_get_job_descendants_fn_t)(orte_jobid_t** descendants, + orte_std_cntr_t *num_desc, + orte_jobid_t job); + +/* + * Get job children + * Given a jobid, return the array of jobids that are direct children of that job + */ +typedef int (*orte_ns_base_module_get_job_children_fn_t)(orte_jobid_t** children, + orte_std_cntr_t *num_childs, + orte_jobid_t job); + +/* + * Get root job from job family + * Given a jobid, return the jobid at the head of this job's family. If the jobid provided is the + * root for that family, that value will be returned. + */ +typedef int (*orte_ns_base_module_get_root_job_fn_t)(orte_jobid_t *root_job, orte_jobid_t job); + +/* + * Get parent jobid + * Given a jobid, return the parent job from which it descended. If the provided jobid is the + * root (i.e., has no parent), this function will return that same value. + */ +typedef int (*orte_ns_base_module_get_parent_job_fn_t)(orte_jobid_t *parent, orte_jobid_t job); /** * Reserve a range of process id's. @@ -305,23 +324,6 @@ typedef int (*orte_ns_base_module_convert_jobid_to_string_fn_t)(char **jobid_str */ typedef int (*orte_ns_base_module_convert_string_to_jobid_fn_t)(orte_jobid_t *jobid, const char* jobidstring); -/** - * Get the job id as a numeric value. - * The get_jobid() function returns the job id in a numeric representation - - * i.e., in an integer form. - * - * @param *name A pointer to the name structure containing the name. - * - * @retval jobid The job id field of the provided name. - * @retval MCA_NS_BASE_JOBID_MAX Indicates that an error occurred - in this case, that - * the name variable provided was NULL. 
- * - * @code - * jobid = ompi_name_server.get_jobid(&name) - * @endcode - */ -typedef int (*orte_ns_base_module_get_jobid_fn_t)(orte_jobid_t *jobid, const orte_process_name_t* name); - /**** NAME FUNCTIONS ****/ @@ -365,20 +367,6 @@ typedef int (*orte_ns_base_module_create_proc_name_fn_t)(orte_process_name_t **n */ typedef int (*orte_ns_base_module_create_my_name_fn_t)(void); -/** - * Make a copy of a process name. - * Given a process name, this function creates a copy of it and returns a pointer - * to the duplicate structure. - * - * @param *name Pointer to an existing process name structure. - * - * @retval *newname Pointer to the duplicate structure, with all fields transferred. - * @retval NULL Indicates an error - most likely due to a NULL process name - * pointer being supplied as input. - */ -typedef int (*orte_ns_base_module_copy_proc_name_fn_t)(orte_process_name_t **dest, - orte_process_name_t* src); - /** * Convert a string representation to a process name. * The convert_string_to_process_name() function converts a string representation of a process @@ -399,30 +387,6 @@ typedef int (*orte_ns_base_module_convert_string_to_process_name_fn_t)(orte_proc const char* name_string); -/** - * Free (release) a process name. - * The free_name() function releases the process name from the "used" list - * maintained within the name server for the jobid contained in the specified - * name. The memory for the name is also released at that time. - * - * Name values are currently \em not re-used. Hence, free-ing a name - * does not provide any noticeable benefit other than releasing the memory. In - * the future, names may be re-used if this becomes desirable. - * - * @param *name A pointer to the name structure containing the name being released. - * - * @retval ORTE_SUCCESS Indicates the release was succesfully accomplished. - * @retval OMPI_ERROR Indicates the release failed - most likely due to an - * error when free-ing the memory allocation. 
- * - * @code - * if (OMPI_ERROR == ompi_name_server.free_name(&name) { - * report error - * } - * @endcode - */ -typedef int (*orte_ns_base_module_free_name_fn_t)(orte_process_name_t **name); - /** * Get the process name as a character string. * The get_proc_name_string() function returns the entire process name in a @@ -478,9 +442,9 @@ typedef int (*orte_ns_base_module_get_proc_name_string_fn_t)(char **name_string, * result = ompi_name_server.compare(bit_mask, &name1, &name2) * @endcode */ -typedef int (*orte_ns_base_module_compare_fn_t)(orte_ns_cmp_bitmask_t fields, - const orte_process_name_t* name1, - const orte_process_name_t* name2); +typedef int (*orte_ns_base_module_compare_fields_fn_t)(orte_ns_cmp_bitmask_t fields, + const orte_process_name_t* name1, + const orte_process_name_t* name2); /**** VPID FUNCTIONS ****/ @@ -539,22 +503,7 @@ typedef int (*orte_ns_base_module_get_vpid_string_fn_t)(char **vpid_string, cons */ typedef int (*orte_ns_base_module_convert_string_to_vpid_fn_t)(orte_vpid_t *vpid, const char* vpidstring); -/** - * Get the virtual process id as a numeric value. - * The get_vpid() function returns the vpid in a numeric representation - - * i.e., in an integer form. - * - * @param *name A pointer to the name structure containing the name. - * - * @retval vpid The vpid field of the provided name. - * @retval MCA_NS_BASE_VPID_MAX Indicates that an error occurred - in this case, that - * the name variable provided was NULL. - * - * @code - * vpid = ompi_name_server.get_vpid(&name) - * @endcode - */ -typedef int (*orte_ns_base_module_get_vpid_fn_t)(orte_vpid_t *vpid, const orte_process_name_t *name); + /**** TAG SERVER ****/ /* @@ -576,24 +525,33 @@ typedef int (*orte_ns_base_module_define_data_type_fn_t)( /**** PEER RETRIEVAL ****/ -/* - * Get my peers +/** + * Get the process names of all processes in the specified conditions. 
It is + * sometimes necessary for a process to communicate to all processes of a + * given job, all processes in a given cell or on a given node, etc. The RML + * communication system utilizes the process name as its "pointer" for + * sending messages to another process. This function returns an array of + * process name pointers that contains the names of all processes that + * meet the specified combination of attributes. + * + * @param procs The location where the address of the array of pointers + * is to be stored. The function will dynamically allocate space for the + * array - the caller is responsible for releasing this space. + * @param num_procs The location where the number of entries in the + * returned array is to be stored. + * @param attributes A list of conditions to be used in defining the + * peers to be included in the returned array. This can include a + * request that all peers for the parent job be returned, for example. + * More common options would be to specify a cell or job. + * + * NOTE The combination of ORTE_CELLID_WILDCARD and ORTE_JOBID_WILDCARD + * in the attribute list will cause the function to return the names of *all* + * processes currently active in the universe. 
* - * THIS FUNCTION MAY BE ELIMINATED IN FUTURE VERSIONS TO REMOVE MULTIPLE STORAGE - * OF O(N) ARRAYS IN THE SYSTEM */ typedef int (*orte_ns_base_module_get_peers_fn_t)(orte_process_name_t **procs, - orte_std_cntr_t *num_procs, orte_std_cntr_t *self); - -/* - * Get the list of peers from a specified job - * - * THIS FUNCTION MAY BE ELIMINATED IN FUTURE VERSIONS TO REMOVE MULTIPLE STORAGE - * OF O(N) ARRAYS IN THE SYSTEM - */ -typedef int (*orte_ns_base_module_get_job_peers_fn_t)(orte_process_name_t **procs, - orte_std_cntr_t *num_procs, orte_jobid_t job); - + orte_std_cntr_t *num_procs, + opal_list_t *attributes); /* @@ -609,55 +567,57 @@ typedef int (*orte_ns_base_module_dump_datatypes_fn_t)(void); /* - * Ver 1.0.0 + * Ver 2.0 */ -struct mca_ns_base_module_1_0_0_t { +struct mca_ns_base_module_2_0_0_t { /* init */ - orte_ns_base_module_init_fn_t init; + orte_ns_base_module_init_fn_t init; /* cell functions */ - orte_ns_base_module_create_cellid_fn_t create_cellid; - orte_ns_base_module_get_cellid_fn_t get_cellid; - orte_ns_base_module_get_cell_info_fn_t get_cell_info; - orte_ns_base_module_assign_cellid_to_process_fn_t assign_cellid_to_process; - orte_ns_base_module_get_cellid_string_fn_t get_cellid_string; - orte_ns_base_module_convert_cellid_to_string_fn_t convert_cellid_to_string; - orte_ns_base_module_convert_string_to_cellid_fn_t convert_string_to_cellid; + orte_ns_base_module_create_cellid_fn_t create_cellid; + orte_ns_base_module_get_cell_info_fn_t get_cell_info; + orte_ns_base_module_get_cellid_string_fn_t get_cellid_string; + orte_ns_base_module_convert_cellid_to_string_fn_t convert_cellid_to_string; + orte_ns_base_module_convert_string_to_cellid_fn_t convert_string_to_cellid; + /** node functions */ + orte_ns_base_module_create_nodeids_fn_t create_nodeids; + orte_ns_base_module_get_node_info_fn_t get_node_info; + orte_ns_base_module_convert_nodeid_to_string_fn_t convert_nodeid_to_string; + orte_ns_base_module_convert_string_to_nodeid_fn_t 
convert_string_to_nodeid; /* jobid functions */ - orte_ns_base_module_create_jobid_fn_t create_jobid; - orte_ns_base_module_get_jobid_fn_t get_jobid; - orte_ns_base_module_get_jobid_string_fn_t get_jobid_string; - orte_ns_base_module_convert_jobid_to_string_fn_t convert_jobid_to_string; - orte_ns_base_module_convert_string_to_jobid_fn_t convert_string_to_jobid; + orte_ns_base_module_create_jobid_fn_t create_jobid; + orte_ns_base_module_get_job_descendants_fn_t get_job_descendants; + orte_ns_base_module_get_job_children_fn_t get_job_children; + orte_ns_base_module_get_root_job_fn_t get_root_job; + orte_ns_base_module_get_parent_job_fn_t get_parent_job; + orte_ns_base_module_get_jobid_string_fn_t get_jobid_string; + orte_ns_base_module_convert_jobid_to_string_fn_t convert_jobid_to_string; + orte_ns_base_module_convert_string_to_jobid_fn_t convert_string_to_jobid; + orte_ns_base_module_reserve_range_fn_t reserve_range; /* vpid functions */ - orte_ns_base_module_reserve_range_fn_t reserve_range; - orte_ns_base_module_get_vpid_fn_t get_vpid; - orte_ns_base_module_get_vpid_string_fn_t get_vpid_string; - orte_ns_base_module_convert_vpid_to_string_fn_t convert_vpid_to_string; - orte_ns_base_module_convert_string_to_vpid_fn_t convert_string_to_vpid; + orte_ns_base_module_get_vpid_string_fn_t get_vpid_string; + orte_ns_base_module_convert_vpid_to_string_fn_t convert_vpid_to_string; + orte_ns_base_module_convert_string_to_vpid_fn_t convert_string_to_vpid; /* name functions */ - orte_ns_base_module_create_proc_name_fn_t create_process_name; - orte_ns_base_module_create_my_name_fn_t create_my_name; - orte_ns_base_module_copy_proc_name_fn_t copy_process_name; + orte_ns_base_module_create_proc_name_fn_t create_process_name; + orte_ns_base_module_create_my_name_fn_t create_my_name; orte_ns_base_module_convert_string_to_process_name_fn_t convert_string_to_process_name; - orte_ns_base_module_free_name_fn_t free_name; - orte_ns_base_module_get_proc_name_string_fn_t 
get_proc_name_string; - orte_ns_base_module_compare_fn_t compare; + orte_ns_base_module_get_proc_name_string_fn_t get_proc_name_string; + orte_ns_base_module_compare_fields_fn_t compare_fields; /* peer functions */ - orte_ns_base_module_get_peers_fn_t get_peers; - orte_ns_base_module_get_job_peers_fn_t get_job_peers; + orte_ns_base_module_get_peers_fn_t get_peers; /* tag server functions */ - orte_ns_base_module_assign_rml_tag_fn_t assign_rml_tag; + orte_ns_base_module_assign_rml_tag_fn_t assign_rml_tag; /* data type functions */ - orte_ns_base_module_define_data_type_fn_t define_data_type; + orte_ns_base_module_define_data_type_fn_t define_data_type; /* diagnostic functions */ - orte_ns_base_module_dump_cells_fn_t dump_cells; - orte_ns_base_module_dump_jobs_fn_t dump_jobs; - orte_ns_base_module_dump_tags_fn_t dump_tags; - orte_ns_base_module_dump_datatypes_fn_t dump_datatypes; + orte_ns_base_module_dump_cells_fn_t dump_cells; + orte_ns_base_module_dump_jobs_fn_t dump_jobs; + orte_ns_base_module_dump_tags_fn_t dump_tags; + orte_ns_base_module_dump_datatypes_fn_t dump_datatypes; }; -typedef struct mca_ns_base_module_1_0_0_t mca_ns_base_module_1_0_0_t; -typedef mca_ns_base_module_1_0_0_t mca_ns_base_module_t; +typedef struct mca_ns_base_module_2_0_0_t mca_ns_base_module_2_0_0_t; +typedef mca_ns_base_module_2_0_0_t mca_ns_base_module_t; /* * NS Component @@ -677,26 +637,26 @@ typedef int (*mca_ns_base_component_finalize_fn_t)(void); * the standard component data structure */ -struct mca_ns_base_component_1_0_0_t { +struct mca_ns_base_component_2_0_0_t { mca_base_component_t ns_version; mca_base_component_data_1_0_0_t ns_data; mca_ns_base_component_init_fn_t ns_init; mca_ns_base_component_finalize_fn_t ns_finalize; }; -typedef struct mca_ns_base_component_1_0_0_t mca_ns_base_component_1_0_0_t; -typedef mca_ns_base_component_1_0_0_t mca_ns_base_component_t; +typedef struct mca_ns_base_component_2_0_0_t mca_ns_base_component_2_0_0_t; +typedef 
mca_ns_base_component_2_0_0_t mca_ns_base_component_t; /* - * Macro for use in components that are of type ns v1.0.0 + * Macro for use in components that are of type ns v2.0.0 */ -#define MCA_NS_BASE_VERSION_1_0_0 \ - /* ns v1.0 is chained to MCA v1.0 */ \ +#define MCA_NS_BASE_VERSION_2_0_0 \ + /* ns v2.0 is chained to MCA v1.0 */ \ MCA_BASE_VERSION_1_0_0, \ - /* ns v1.0 */ \ - "ns", 1, 0, 0 + /* ns v2.0 */ \ + "ns", 2, 0, 0 /* Global structure for accessing name server functions */ diff --git a/orte/mca/ns/ns_types.h b/orte/mca/ns/ns_types.h index 0042e0ffbf..72a9650ef3 100644 --- a/orte/mca/ns/ns_types.h +++ b/orte/mca/ns/ns_types.h @@ -47,10 +47,20 @@ extern "C" { #endif +/**** NS ATTRIBUTES ****/ +#define ORTE_NS_USE_PARENT "orte-ns-use-parent" +#define ORTE_NS_USE_ROOT "orte-ns-use-root" +#define ORTE_NS_USE_CELL "orte-ns-use-cell" +#define ORTE_NS_USE_JOBID "orte-ns-use-job" +#define ORTE_NS_USE_NODE "orte-ns-use-node" +#define ORTE_NS_INCLUDE_DESCENDANTS "orte-ns-include-desc" +#define ORTE_NS_INCLUDE_CHILDREN "orte-ns-include-child" + + #define ORTE_NAME_ARGS(n) \ - (unsigned long) ((NULL == n) ? -1 : (int32_t)(n)->cellid), \ - (unsigned long) ((NULL == n) ? -1 : (int32_t)(n)->jobid), \ - (unsigned long) ((NULL == n) ? -1 : (int32_t)(n)->vpid) + (long) ((NULL == n) ? (long)-1 : (long)(n)->cellid), \ + (long) ((NULL == n) ? (long)-1 : (long)(n)->jobid), \ + (long) ((NULL == n) ? (long)-1 : (long)(n)->vpid) /* @@ -69,18 +79,19 @@ extern "C" { /** Set the allowed range for ids in each space * * NOTE: Be sure to update the ORTE_NAME_ARGS #define (above) and all - * uses of it if these types change to be larger than (unsigned long)! + * uses of it if these types change to be larger than (long)! 
*/ typedef orte_std_cntr_t orte_jobid_t; typedef orte_std_cntr_t orte_cellid_t; +typedef orte_std_cntr_t orte_nodeid_t; typedef orte_std_cntr_t orte_vpid_t; + typedef uint8_t orte_ns_cmp_bitmask_t; /**< Bit mask for comparing process names */ -typedef uint8_t orte_ns_cmd_flag_t; struct orte_process_name_t { - orte_cellid_t cellid; /**< Cell number */ - orte_jobid_t jobid; /**< Job number */ - orte_vpid_t vpid; /**< Process number */ + orte_cellid_t cellid; /**< Cell number */ + orte_jobid_t jobid; /**< Job number */ + orte_vpid_t vpid; /**< Process number */ }; typedef struct orte_process_name_t orte_process_name_t; @@ -90,6 +101,15 @@ typedef struct orte_process_name_t orte_process_name_t; #define ORTE_CELLID_MAX ORTE_STD_CNTR_MAX #define ORTE_JOBID_MAX ORTE_STD_CNTR_MAX #define ORTE_VPID_MAX ORTE_STD_CNTR_MAX +#define ORTE_NODEID_MAX ORTE_STD_CNTR_MAX + +/* + * define minimum value for id's in any field + */ +#define ORTE_CELLID_MIN ORTE_STD_CNTR_MIN +#define ORTE_JOBID_MIN ORTE_STD_CNTR_MIN +#define ORTE_VPID_MIN ORTE_STD_CNTR_MIN +#define ORTE_NODEID_MIN ORTE_STD_CNTR_MIN /* * define invalid values @@ -97,19 +117,31 @@ typedef struct orte_process_name_t orte_process_name_t; #define ORTE_CELLID_INVALID -999 #define ORTE_JOBID_INVALID -999 #define ORTE_VPID_INVALID -999 +#define ORTE_NODEID_INVALID -999 /* - * define wildcard values + * define wildcard values (should be -1) */ #define ORTE_CELLID_WILDCARD -1 #define ORTE_JOBID_WILDCARD -1 #define ORTE_VPID_WILDCARD -1 +#define ORTE_NODEID_WILDCARD -1 -ORTE_DECLSPEC extern orte_process_name_t orte_name_all; -#define ORTE_NAME_ALL &orte_name_all +/* + * Shortcut for some commonly used names + */ + +#define ORTE_NAME_WILDCARD &orte_ns_name_wildcard +ORTE_DECLSPEC extern orte_process_name_t orte_ns_name_wildcard; /** instantiated in orte/mca/ns/base/ns_base_open.c */ + +#define ORTE_NAME_INVALID &orte_ns_name_invalid +ORTE_DECLSPEC extern orte_process_name_t orte_ns_name_invalid; /** instantiated in 
orte/mca/ns/base/ns_base_open.c */ #define ORTE_PROC_MY_NAME orte_process_info.my_name +#define ORTE_PROC_MY_HNP &orte_ns_name_my_hnp +ORTE_DECLSPEC extern orte_process_name_t orte_ns_name_my_hnp; /** instantiated in orte/mca/ns/base/ns_base_open.c */ + /** * Convert process name from host to network byte order. * diff --git a/orte/mca/ns/proxy/Makefile.am b/orte/mca/ns/proxy/Makefile.am index 3f1498cf81..143c49770b 100644 --- a/orte/mca/ns/proxy/Makefile.am +++ b/orte/mca/ns/proxy/Makefile.am @@ -16,37 +16,34 @@ # $HEADER$ # -# Use the top-level Makefile.options - - - -sources = -include src/Makefile.extra +sources = \ + ns_proxy.h \ + ns_proxy_cell_fns.c \ + ns_proxy_diag_fns.c \ + ns_proxy_general_fns.c \ + ns_proxy_job_fns.c \ + ns_proxy_component.c # Make the output library in this directory, and name it either # mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la # (for static builds). if OMPI_BUILD_ns_proxy_DSO -lib = -lib_sources = -component = mca_ns_proxy.la -component_sources = $(sources) +component_noinst = +component_install = mca_ns_proxy.la else -lib = libmca_ns_proxy.la -lib_sources = $(sources) -component = -component_sources = +component_noinst = libmca_ns_proxy.la +component_install = endif mcacomponentdir = $(libdir)/openmpi -mcacomponent_LTLIBRARIES = $(component) -mca_ns_proxy_la_SOURCES = $(component_sources) +mcacomponent_LTLIBRARIES = $(component_install) +mca_ns_proxy_la_SOURCES = $(sources) mca_ns_proxy_la_LDFLAGS = -module -avoid-version mca_ns_proxy_la_LIBADD = \ $(top_ompi_builddir)/orte/liborte.la \ $(top_ompi_builddir)/opal/libopal.la -noinst_LTLIBRARIES = $(lib) -libmca_ns_proxy_la_SOURCES = $(lib_sources) +noinst_LTLIBRARIES = $(component_noinst) +libmca_ns_proxy_la_SOURCES =$(sources) libmca_ns_proxy_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/ns/proxy/configure.params b/orte/mca/ns/proxy/configure.params index 4686b02ce4..a0ccbf55aa 100644 --- a/orte/mca/ns/proxy/configure.params +++ 
b/orte/mca/ns/proxy/configure.params @@ -19,5 +19,6 @@ # Specific to this module -PARAM_INIT_FILE=src/ns_proxy.c +PARAM_INIT_FILE=ns_proxy.c +PARAM_CONFIG_HEADER_FILE="ns_proxy.h" PARAM_CONFIG_FILES="Makefile" diff --git a/orte/mca/ns/proxy/src/ns_proxy.h b/orte/mca/ns/proxy/ns_proxy.h similarity index 75% rename from orte/mca/ns/proxy/src/ns_proxy.h rename to orte/mca/ns/proxy/ns_proxy.h index f7c4375ac7..10fea146f1 100644 --- a/orte/mca/ns/proxy/src/ns_proxy.h +++ b/orte/mca/ns/proxy/ns_proxy.h @@ -22,27 +22,20 @@ #include "orte_config.h" -#include "opal/types.h" #include "orte/orte_constants.h" + +#include "opal/types.h" #include "opal/class/opal_list.h" + #include "orte/dss/dss.h" -#include "orte/mca/ns/base/base.h" +#include "orte/mca/ns/ns.h" +#include "orte/mca/ns/base/ns_private.h" #if defined(c_plusplus) || defined(__cplusplus) extern "C" { #endif -struct orte_ns_proxy_cell_info_t { - opal_object_t super; - orte_cellid_t cellid; - char *site; - char *resource; -}; -typedef struct orte_ns_proxy_cell_info_t orte_ns_proxy_cell_info_t; - -OBJ_CLASS_DECLARATION(orte_ns_proxy_cell_info_t); - struct orte_ns_proxy_tagitem_t { opal_object_t super; orte_rml_tag_t tag; /**< OOB tag */ @@ -81,7 +74,6 @@ int orte_ns_proxy_finalize(void); */ typedef struct { size_t max_size, block_size; - orte_process_name_t *my_replica; int debug; orte_cellid_t num_cells; orte_pointer_array_t *cells; @@ -94,6 +86,12 @@ typedef struct { extern orte_ns_proxy_globals_t orte_ns_proxy; +/* + * simplifying define + */ +#define ORTE_NS_MY_REPLICA orte_process_info.ns_replica + + /* * proxy function prototypes */ @@ -101,13 +99,26 @@ int orte_ns_proxy_create_cellid(orte_cellid_t *cellid, char *site, char *resourc int orte_ns_proxy_get_cell_info(orte_cellid_t cellid, char **site, char **resource); -int orte_ns_proxy_create_jobid(orte_jobid_t *jobid); +int orte_ns_proxy_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes, + orte_cellid_t cellid, char **nodenames); + +int 
orte_ns_proxy_get_node_info(char ***nodename, orte_cellid_t cellid, orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids); + +int orte_ns_proxy_create_jobid(orte_jobid_t *jobid, opal_list_t *attrs); + +int orte_ns_proxy_get_job_descendants(orte_jobid_t** descendants, orte_std_cntr_t *ndesc, orte_jobid_t job); + +int orte_ns_proxy_get_job_children(orte_jobid_t** descendants, orte_std_cntr_t *ndesc, orte_jobid_t job); + +int orte_ns_proxy_get_root_job(orte_jobid_t *root_job, orte_jobid_t job); + +int orte_ns_proxy_get_parent_job(orte_jobid_t *parent, orte_jobid_t job); int orte_ns_proxy_reserve_range(orte_jobid_t job, orte_vpid_t range, orte_vpid_t *startvpid); -int orte_ns_proxy_get_job_peers(orte_process_name_t **procs, - orte_std_cntr_t *num_procs, orte_jobid_t job); +int orte_ns_proxy_get_peers(orte_process_name_t **procs, + orte_std_cntr_t *num_procs, opal_list_t *attrs); int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag, char *name); diff --git a/orte/mca/ns/proxy/ns_proxy_cell_fns.c b/orte/mca/ns/proxy/ns_proxy_cell_fns.c new file mode 100644 index 0000000000..f6738b9e18 --- /dev/null +++ b/orte/mca/ns/proxy/ns_proxy_cell_fns.c @@ -0,0 +1,440 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/** @file:
+ *
+ */
+
+#include "orte_config.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "orte/orte_constants.h"
+#include "orte/orte_types.h"
+
+#include "opal/mca/mca.h"
+#include "opal/util/argv.h"
+#include "opal/util/output.h"
+#include "opal/util/trace.h"
+
+#include "orte/dss/dss.h"
+#include "orte/mca/errmgr/errmgr.h"
+#include "orte/mca/rml/rml.h"
+
+#include "ns_proxy.h"
+
+/**
+ * globals
+ */
+
+/*
+ * functions
+ */
+
+/**
+ * Ask the name service replica for a cellid for the given site/resource.
+ *
+ * Packs ORTE_NS_CREATE_CELLID_CMD followed by the two strings, sends the
+ * buffer to ORTE_NS_MY_REPLICA on ORTE_RML_TAG_NS, then waits for the reply
+ * and unpacks the echoed command and the cellid.
+ *
+ * @param cellid   Location where the cellid is stored. It is preset to
+ *                 ORTE_CELLID_INVALID before the exchange starts.
+ * @param site     Site name, packed as an ORTE_STRING.
+ * @param resource Resource name, packed as an ORTE_STRING.
+ *
+ * @retval ORTE_SUCCESS              The reply was received and unpacked.
+ * @retval ORTE_ERR_OUT_OF_RESOURCE  A buffer could not be allocated.
+ * @retval ORTE_ERR_COMM_FAILURE     Send/receive failed or the reply carried
+ *                                   a different command.
+ * @retval other                     Error code returned by orte_dss pack/unpack.
+ */
+int orte_ns_proxy_create_cellid(orte_cellid_t *cellid, char *site, char *resource)
+{
+    orte_buffer_t* cmd = NULL;
+    orte_buffer_t* answer = NULL;
+    orte_ns_cmd_flag_t command;
+    orte_std_cntr_t count;
+    int rc;
+
+    OPAL_TRACE(1);
+
+    /* The function used to unlock this mutex on success without ever
+     * taking it. Take it here so the request/reply pair is serialized the
+     * same way as in the node functions below; every exit goes through
+     * "cleanup", which releases it exactly once.
+     */
+    OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);
+
+    /* set the default value of error */
+    *cellid = ORTE_CELLID_INVALID;
+
+    command = ORTE_NS_CREATE_CELLID_CMD;
+
+    cmd = OBJ_NEW(orte_buffer_t);
+    if (cmd == NULL) {
+        rc = ORTE_ERR_OUT_OF_RESOURCE;
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &command, 1, ORTE_NS_CMD))) {
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &site, 1, ORTE_STRING))) {
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &resource, 1, ORTE_STRING))) {
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) {
+        rc = ORTE_ERR_COMM_FAILURE;
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    answer = OBJ_NEW(orte_buffer_t);
+    if (answer == NULL) {
+        rc = ORTE_ERR_OUT_OF_RESOURCE;
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer, ORTE_RML_TAG_NS)) {
+        rc = ORTE_ERR_COMM_FAILURE;
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    count = 1;
+    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) {
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    /* the replica echoes the command; anything else is not our reply */
+    if (ORTE_NS_CREATE_CELLID_CMD != command) {
+        rc = ORTE_ERR_COMM_FAILURE;
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    count = 1;
+    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, cellid, &count, ORTE_CELLID))) {
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+cleanup:
+    if (NULL != cmd) {
+        OBJ_RELEASE(cmd);
+    }
+    if (NULL != answer) {
+        OBJ_RELEASE(answer);
+    }
+    OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
+    return rc;
+}
+
+
+/**
+ * Ask the name service replica for the site and resource of a cell.
+ *
+ * Packs ORTE_NS_GET_CELL_INFO_CMD and the cellid, sends the buffer to
+ * ORTE_NS_MY_REPLICA, then unpacks the echoed command followed by one
+ * ORTE_STRING into *site and one into *resource.
+ *
+ * @param cellid   The cell being queried.
+ * @param site     Location that receives the site string.
+ * @param resource Location that receives the resource string.
+ *                 NOTE(review): both strings are presumably allocated by
+ *                 the unpack routine and owned by the caller - confirm
+ *                 against the dss string unpack implementation.
+ *
+ * @retval ORTE_SUCCESS              Both strings were unpacked.
+ * @retval ORTE_ERR_OUT_OF_RESOURCE  A buffer could not be allocated.
+ * @retval ORTE_ERR_COMM_FAILURE     Send/receive failed or the reply carried
+ *                                   a different command.
+ * @retval other                     Error code returned by orte_dss pack/unpack.
+ */
+int orte_ns_proxy_get_cell_info(orte_cellid_t cellid,
+                                char **site, char **resource)
+{
+    orte_buffer_t* cmd = NULL;
+    orte_buffer_t* answer = NULL;
+    orte_ns_cmd_flag_t command;
+    orte_std_cntr_t count;
+    int rc;
+
+    OPAL_TRACE(1);
+
+    /* Every exit path of this function already unlocked the mutex, but it
+     * was never locked. Take it here; "cleanup" releases it exactly once.
+     */
+    OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);
+
+    command = ORTE_NS_GET_CELL_INFO_CMD;
+
+    cmd = OBJ_NEW(orte_buffer_t);
+    if (cmd == NULL) {
+        rc = ORTE_ERR_OUT_OF_RESOURCE;
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &command, 1, ORTE_NS_CMD))) {
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &cellid, 1, ORTE_CELLID))) {
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) {
+        rc = ORTE_ERR_COMM_FAILURE;
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    answer = OBJ_NEW(orte_buffer_t);
+    if (answer == NULL) {
+        rc = ORTE_ERR_OUT_OF_RESOURCE;
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer, ORTE_RML_TAG_NS)) {
+        rc = ORTE_ERR_COMM_FAILURE;
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    count = 1;
+    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) {
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    /* the replica echoes the command; anything else is not our reply */
+    if (ORTE_NS_GET_CELL_INFO_CMD != command) {
+        rc = ORTE_ERR_COMM_FAILURE;
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    count = 1;
+    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, site, &count, ORTE_STRING))) {
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    count = 1;
+    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, resource, &count, ORTE_STRING))) {
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+cleanup:
+    if (NULL != cmd) {
+        OBJ_RELEASE(cmd);
+    }
+    /* the success path used to return without releasing the reply buffer */
+    if (NULL != answer) {
+        OBJ_RELEASE(answer);
+    }
+    OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
+    return rc;
+}
+
+/**
+ * Ask the name service replica to assign node ids to a set of node names.
+ *
+ * Packs ORTE_NS_CREATE_NODEID_CMD, the cellid, the number of names (as
+ * counted by opal_argv_count) and the names themselves, sends the buffer
+ * to ORTE_NS_MY_REPLICA, then unpacks the echoed command, the number of
+ * ids in the reply, and the ids.
+ *
+ * @param nodeids   Receives a malloc'd array holding the returned ids; the
+ *                  caller releases it with free(). Set to NULL on failure
+ *                  or when the reply contains zero ids.
+ * @param nnodes    Receives the number of entries in *nodeids (0 on
+ *                  failure). May be NULL, in which case the count is not
+ *                  reported.
+ * @param cellid    The cell the nodes belong to.
+ * @param nodenames Argv-style (NULL-terminated) array of node names.
+ *
+ * @retval ORTE_SUCCESS              The ids were received and unpacked.
+ * @retval ORTE_ERR_OUT_OF_RESOURCE  A buffer or the id array could not be
+ *                                   allocated.
+ * @retval ORTE_ERR_COMM_FAILURE     Send/receive failed, the reply carried a
+ *                                   different command, or it announced a
+ *                                   negative number of ids.
+ * @retval other                     Error code returned by orte_dss pack/unpack.
+ */
+int orte_ns_proxy_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes,
+                                 orte_cellid_t cellid, char **nodenames)
+{
+    orte_buffer_t* cmd = NULL;
+    orte_buffer_t* answer = NULL;
+    orte_ns_cmd_flag_t command;
+    orte_std_cntr_t count, index;
+    int rc;
+
+    /* released exactly once at "cleanup"; the error paths used to return
+     * with the mutex still held
+     */
+    OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);
+
+    /* nothing is handed back unless the whole exchange succeeds */
+    *nodeids = NULL;
+    if (NULL != nnodes) {
+        *nnodes = 0;
+    }
+
+    command = ORTE_NS_CREATE_NODEID_CMD;
+
+    cmd = OBJ_NEW(orte_buffer_t);
+    if (cmd == NULL) {
+        rc = ORTE_ERR_OUT_OF_RESOURCE;
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &command, 1, ORTE_NS_CMD))) {
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &cellid, 1, ORTE_CELLID))) {
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    count = opal_argv_count(nodenames);
+    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &count, 1, ORTE_STD_CNTR))) {
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, nodenames, count, ORTE_STRING))) {
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) {
+        rc = ORTE_ERR_COMM_FAILURE;
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    answer = OBJ_NEW(orte_buffer_t);
+    if (answer == NULL) {
+        rc = ORTE_ERR_OUT_OF_RESOURCE;
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer, ORTE_RML_TAG_NS)) {
+        rc = ORTE_ERR_COMM_FAILURE;
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    count = 1;
+    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) {
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    /* the replica echoes the command; anything else is not our reply */
+    if (ORTE_NS_CREATE_NODEID_CMD != command) {
+        rc = ORTE_ERR_COMM_FAILURE;
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    count = 1;
+    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &index, &count, ORTE_STD_CNTR))) {
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    /* a negative count would turn into a huge allocation request below */
+    if (0 > index) {
+        rc = ORTE_ERR_COMM_FAILURE;
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+    /* empty reply: nothing to unpack, rc is ORTE_SUCCESS from the unpack above */
+    if (0 == index) {
+        goto cleanup;
+    }
+
+    /** allocate the space for the nodeids */
+    *nodeids = (orte_nodeid_t*)malloc((size_t)index * sizeof(orte_nodeid_t));
+    if (NULL == *nodeids) {
+        rc = ORTE_ERR_OUT_OF_RESOURCE;
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    /* unpack into the freshly allocated array, not into the caller's
+     * pointer variable
+     */
+    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, *nodeids, &index, ORTE_NODEID))) {
+        ORTE_ERROR_LOG(rc);
+        free(*nodeids);
+        *nodeids = NULL;
+        goto cleanup;
+    }
+
+    if (NULL != nnodes) {
+        *nnodes = index;
+    }
+
+cleanup:
+    if (NULL != cmd) {
+        OBJ_RELEASE(cmd);
+    }
+    if (NULL != answer) {
+        OBJ_RELEASE(answer);
+    }
+    OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
+    return rc;
+}
+
+int orte_ns_proxy_get_node_info(char ***nodenames, orte_cellid_t cellid,
+                                orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids)
+{
+    orte_buffer_t* cmd;
+    orte_buffer_t* answer;
+    orte_ns_cmd_flag_t command;
+    orte_std_cntr_t count, index;
+    int rc, ret=ORTE_SUCCESS;
+
+    OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);
+
+    command = ORTE_NS_GET_NODE_INFO_CMD;
+
+    cmd = OBJ_NEW(orte_buffer_t);
+    if (cmd == NULL) {
+        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+        OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
+        return ORTE_ERR_OUT_OF_RESOURCE;
+    }
+
+    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd,
&command, 1, ORTE_NS_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return rc; + } + + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &cellid, 1, ORTE_CELLID))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return rc; + } + + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &num_nodes, 1, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return rc; + } + + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, nodeids, num_nodes, ORTE_NODEID))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return rc; + } + + if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(cmd); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_COMM_FAILURE; + } + OBJ_RELEASE(cmd); + + answer = OBJ_NEW(orte_buffer_t); + if(answer == NULL) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer, ORTE_RML_TAG_NS)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(answer); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_COMM_FAILURE; + } + + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(answer); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return rc; + } + + if (ORTE_NS_GET_NODE_INFO_CMD != command) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(answer); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_COMM_FAILURE; + } + + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &index, &count, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(answer); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return rc; + } + + /** create the space for the nodenames */ + 
*nodenames = (char**)malloc(index * sizeof(char*)); + if (NULL == *nodenames) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, *nodenames, &index, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(answer); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return rc; + } + + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &ret, &count, ORTE_INT))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(answer); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return rc; + } + + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ret; +} diff --git a/orte/mca/ns/proxy/src/ns_proxy_component.c b/orte/mca/ns/proxy/ns_proxy_component.c similarity index 84% rename from orte/mca/ns/proxy/src/ns_proxy_component.c rename to orte/mca/ns/proxy/ns_proxy_component.c index 2ce677cf56..fc438b636b 100644 --- a/orte/mca/ns/proxy/src/ns_proxy_component.c +++ b/orte/mca/ns/proxy/ns_proxy_component.c @@ -47,7 +47,7 @@ */ mca_ns_base_component_t mca_ns_proxy_component = { { - MCA_NS_BASE_VERSION_1_0_0, + MCA_NS_BASE_VERSION_2_0_0, "proxy", /* MCA module name */ ORTE_MAJOR_VERSION, /* MCA module major version */ @@ -71,35 +71,37 @@ static mca_ns_base_module_t orte_ns_proxy_module = { orte_ns_proxy_module_init, /* cell functions */ orte_ns_proxy_create_cellid, - orte_ns_base_get_cellid, orte_ns_proxy_get_cell_info, - orte_ns_base_assign_cellid_to_process, orte_ns_base_get_cellid_string, orte_ns_base_convert_cellid_to_string, orte_ns_base_convert_string_to_cellid, + /** node functions */ + orte_ns_proxy_create_nodeids, + orte_ns_proxy_get_node_info, + orte_ns_base_convert_nodeid_to_string, + orte_ns_base_convert_string_to_nodeid, /* jobid functions */ orte_ns_proxy_create_jobid, - orte_ns_base_get_jobid, + orte_ns_proxy_get_job_descendants, + orte_ns_proxy_get_job_children, + orte_ns_proxy_get_root_job, + orte_ns_proxy_get_parent_job, orte_ns_base_get_jobid_string, 
orte_ns_base_convert_jobid_to_string, orte_ns_base_convert_string_to_jobid, - /* vpid functions */ orte_ns_proxy_reserve_range, - orte_ns_base_get_vpid, + /* vpid functions */ orte_ns_base_get_vpid_string, orte_ns_base_convert_vpid_to_string, orte_ns_base_convert_string_to_vpid, /* name functions */ orte_ns_base_create_process_name, orte_ns_proxy_create_my_name, - orte_ns_base_copy_process_name, orte_ns_base_convert_string_to_process_name, - orte_ns_base_free_name, orte_ns_base_get_proc_name_string, - orte_ns_base_compare, + orte_ns_base_compare_fields, /* peer functions */ - orte_ns_base_get_peers, - orte_ns_proxy_get_job_peers, + orte_ns_proxy_get_peers, /* tag server functions */ orte_ns_proxy_assign_rml_tag, /* data type functions */ @@ -116,31 +118,6 @@ static mca_ns_base_module_t orte_ns_proxy_module = { */ static bool initialized = false; -/* constructor - used to initialize state of cell info list instance */ -static void orte_ns_proxy_cell_info_construct(orte_ns_proxy_cell_info_t* ptr) -{ - ptr->resource = NULL; - ptr->site = NULL; -} - -/* destructor - used to free any resources held by instance */ -static void orte_ns_proxy_cell_info_destructor(orte_ns_proxy_cell_info_t* ptr) -{ - if (NULL != ptr->resource) { - free(ptr->resource); - } - if (NULL != ptr->site) { - free(ptr->site); - } -} - -/* define instance of opal_class_t */ -OBJ_CLASS_INSTANCE( - orte_ns_proxy_cell_info_t, /* type name */ - opal_object_t, /* parent "class" name */ - orte_ns_proxy_cell_info_construct, /* constructor */ - orte_ns_proxy_cell_info_destructor); /* destructor */ - /* constructor - used to initialize state of taglist instance */ static void orte_ns_proxy_tagitem_construct(orte_ns_proxy_tagitem_t* tagitem) { @@ -245,14 +222,10 @@ mca_ns_base_module_t* orte_ns_proxy_init(int *priority) ORTE_ERROR_LOG(ret); return NULL; } - if(ORTE_SUCCESS != (ret = orte_ns.copy_process_name(&orte_process_info.ns_replica, &name))) { + if(ORTE_SUCCESS != (ret = 
orte_dss.copy((void**)&orte_process_info.ns_replica, &name, ORTE_NAME))) { ORTE_ERROR_LOG(ret); return NULL; } - if (ORTE_SUCCESS != orte_ns_base_copy_process_name(&orte_ns_proxy.my_replica, - orte_process_info.ns_replica)) { /* can't operate */ - return NULL; - } /* initialize the cell info tracker */ if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_proxy.cells), @@ -315,7 +288,6 @@ int orte_ns_proxy_module_init(void) */ int orte_ns_proxy_finalize(void) { - orte_ns_proxy_cell_info_t **cptr; orte_ns_proxy_tagitem_t **tag; orte_ns_proxy_dti_t **dti; orte_std_cntr_t i; @@ -323,14 +295,6 @@ int orte_ns_proxy_finalize(void) /* free all tracking storage, but only if this component was initialized */ if (initialized) { - cptr = (orte_ns_proxy_cell_info_t**)(orte_ns_proxy.cells)->addr; - for (i=0; i < (orte_ns_proxy.cells)->size; i++) { - if (NULL != cptr[i]) { - OBJ_RELEASE(cptr[i]); - } - } - OBJ_RELEASE(orte_ns_proxy.cells); - tag = (orte_ns_proxy_tagitem_t**)(orte_ns_proxy.tags)->addr; for (i=0; i < (orte_ns_proxy.tags)->size; i++) { if (NULL != tag[i]) OBJ_RELEASE(tag[i]); diff --git a/orte/mca/ns/proxy/ns_proxy_diag_fns.c b/orte/mca/ns/proxy/ns_proxy_diag_fns.c new file mode 100644 index 0000000000..5cfce73c65 --- /dev/null +++ b/orte/mca/ns/proxy/ns_proxy_diag_fns.c @@ -0,0 +1,313 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** @file: + * + */ + +#include "orte_config.h" + +#include <string.h> + +#include "orte/orte_constants.h" +#include "orte/orte_types.h" + +#include "opal/mca/mca.h" +#include "opal/util/output.h" +#include "opal/util/trace.h" + +#include "orte/dss/dss.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/rml/rml.h" + +#include "orte/mca/ns/base/base.h" +#include "ns_proxy.h" + +/* + * DIAGNOSTIC functions + */ +int orte_ns_proxy_dump_cells(void) +{ + orte_buffer_t cmd; + orte_buffer_t answer; + orte_ns_cmd_flag_t command; + orte_std_cntr_t count; + int rc; + + command = ORTE_NS_DUMP_CELLS_CMD; + + OPAL_THREAD_LOCK(&orte_ns_proxy.mutex); + + /* dump name service replica cell tracker */ + OBJ_CONSTRUCT(&cmd, orte_buffer_t); + if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &command, 1, ORTE_NS_CMD))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + OBJ_DESTRUCT(&cmd); + return rc; + } + + if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, &cmd, ORTE_RML_TAG_NS, 0)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_DESTRUCT(&cmd); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_COMM_FAILURE; + } + OBJ_DESTRUCT(&cmd); + + OBJ_CONSTRUCT(&answer, orte_buffer_t); + if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, &answer, ORTE_RML_TAG_NS)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_DESTRUCT(&answer); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_COMM_FAILURE; + } + + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(&answer, &command, &count, ORTE_NS_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&answer); + return rc; + } + + if (ORTE_NS_DUMP_CELLS_CMD != command) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_DESTRUCT(&answer); + return ORTE_ERR_COMM_FAILURE; + } + + if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&answer))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&answer); + return rc; + } + + return ORTE_SUCCESS; +} + + +int 
orte_ns_proxy_dump_jobs(void) +{ + orte_buffer_t cmd; + orte_buffer_t answer; + orte_ns_cmd_flag_t command; + orte_std_cntr_t count; + int rc; + + command = ORTE_NS_DUMP_JOBIDS_CMD; + + OPAL_THREAD_LOCK(&orte_ns_proxy.mutex); + + /* dump name service replica jobid tracker */ + OBJ_CONSTRUCT(&cmd, orte_buffer_t); + if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &command, 1, ORTE_NS_CMD))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + OBJ_DESTRUCT(&cmd); + return rc; + } + + if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, &cmd, ORTE_RML_TAG_NS, 0)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_DESTRUCT(&cmd); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_COMM_FAILURE; + } + OBJ_DESTRUCT(&cmd); + + OBJ_CONSTRUCT(&answer, orte_buffer_t); + if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, &answer, ORTE_RML_TAG_NS)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_DESTRUCT(&answer); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_COMM_FAILURE; + } + + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(&answer, &command, &count, ORTE_NS_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&answer); + return rc; + } + + if (ORTE_NS_DUMP_JOBIDS_CMD != command) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_DESTRUCT(&answer); + return ORTE_ERR_COMM_FAILURE; + } + + if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&answer))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&answer); + return rc; + } + + return ORTE_SUCCESS; +} + + +int orte_ns_proxy_dump_tags(void) +{ + orte_buffer_t cmd; + orte_buffer_t answer; + orte_ns_cmd_flag_t command; + orte_std_cntr_t i; + orte_std_cntr_t count; + orte_rml_tag_t j; + orte_ns_proxy_tagitem_t **ptr; + int rc; + + command = ORTE_NS_DUMP_TAGS_CMD; + + OPAL_THREAD_LOCK(&orte_ns_proxy.mutex); + + /* dump name service replica tag tracker */ + OBJ_CONSTRUCT(&cmd, orte_buffer_t); + if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &command, 1, ORTE_NS_CMD))) { + ORTE_ERROR_LOG(rc); + 
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + OBJ_DESTRUCT(&cmd); + return rc; + } + + if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, &cmd, ORTE_RML_TAG_NS, 0)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_DESTRUCT(&cmd); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_COMM_FAILURE; + } + OBJ_DESTRUCT(&cmd); + + OBJ_CONSTRUCT(&answer, orte_buffer_t); + if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, &answer, ORTE_RML_TAG_NS)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_DESTRUCT(&answer); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_COMM_FAILURE; + } + + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(&answer, &command, &count, ORTE_NS_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&answer); + return rc; + } + + if (ORTE_NS_DUMP_TAGS_CMD != command) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_DESTRUCT(&answer); + return ORTE_ERR_COMM_FAILURE; + } + + if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&answer))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&answer); + return rc; + } + + /* dump local tag tracker */ + opal_output(mca_ns_base_output, "\n\n[%lu,%lu,%lu] Dump of Local Tag Tracker\n", + ORTE_NAME_ARGS(orte_process_info.my_name)); + ptr = (orte_ns_proxy_tagitem_t**)(orte_ns_proxy.tags)->addr; + for (i=0, j=0; j < orte_ns_proxy.num_tags && + i < (orte_ns_proxy.tags)->size; i++) { + if (NULL != ptr[i]) { + j++; + opal_output(mca_ns_base_output, "Num: %lu\tTag: %lu\tTag name: %s\n", + (unsigned long)j, (unsigned long)ptr[i]->tag, ptr[i]->name); + } + } + + return ORTE_SUCCESS; +} + + +int orte_ns_proxy_dump_datatypes(void) +{ + orte_buffer_t cmd; + orte_buffer_t answer; + orte_ns_cmd_flag_t command; + orte_std_cntr_t i, j; + orte_std_cntr_t count; + orte_ns_proxy_dti_t **ptr; + int rc; + + command = ORTE_NS_DUMP_DATATYPES_CMD; + + OPAL_THREAD_LOCK(&orte_ns_proxy.mutex); + + /* dump name service replica datatype tracker */ + OBJ_CONSTRUCT(&cmd, orte_buffer_t); + if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, 
&command, 1, ORTE_NS_CMD))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + OBJ_DESTRUCT(&cmd); + return rc; + } + + if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, &cmd, ORTE_RML_TAG_NS, 0)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_DESTRUCT(&cmd); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_COMM_FAILURE; + } + OBJ_DESTRUCT(&cmd); + + OBJ_CONSTRUCT(&answer, orte_buffer_t); + if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, &answer, ORTE_RML_TAG_NS)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_DESTRUCT(&answer); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_COMM_FAILURE; + } + + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(&answer, &command, &count, ORTE_NS_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&answer); + return rc; + } + + if (ORTE_NS_DUMP_DATATYPES_CMD != command) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_DESTRUCT(&answer); + return ORTE_ERR_COMM_FAILURE; + } + + if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&answer))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&answer); + return rc; + } + + /* dump local datatype tracker */ + opal_output(mca_ns_base_output, "\n\n[%lu,%lu,%lu] Dump of Local Datatype Tracker\n", + ORTE_NAME_ARGS(orte_process_info.my_name)); + ptr = (orte_ns_proxy_dti_t**)(orte_ns_proxy.dts)->addr; + for (i=0, j=0; j < orte_ns_proxy.num_dts && + i < (orte_ns_proxy.dts)->size; i++) { + if (NULL != ptr[i]) { + j++; + opal_output(mca_ns_base_output, "Num: %lu\tDatatype id: %lu\tDatatype name: %s\n", + (unsigned long)j, (unsigned long)ptr[i]->id, ptr[i]->name); + } + } + + return ORTE_SUCCESS; +} + + diff --git a/orte/mca/ns/proxy/ns_proxy_general_fns.c b/orte/mca/ns/proxy/ns_proxy_general_fns.c new file mode 100644 index 0000000000..c2250eee6a --- /dev/null +++ b/orte/mca/ns/proxy/ns_proxy_general_fns.c @@ -0,0 +1,495 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. 
All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** @file: + * + */ + +#include "orte_config.h" + +#include <string.h> + +#include "orte/orte_constants.h" +#include "orte/orte_types.h" + +#include "opal/mca/mca.h" +#include "opal/util/output.h" +#include "opal/util/trace.h" + +#include "orte/dss/dss.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/rmgr/rmgr.h" +#include "orte/mca/rml/rml.h" + +#include "ns_proxy.h" + +/* + * PEER functions + */ +int orte_ns_proxy_get_peers(orte_process_name_t **procs, + orte_std_cntr_t *num_procs, opal_list_t *attrs) +{ + orte_buffer_t* cmd; + orte_buffer_t* answer; + orte_ns_cmd_flag_t command; + orte_std_cntr_t count, nprocs, i; + orte_cellid_t *cptr; + orte_attribute_t *attr; + int rc; + + OPAL_TRACE(1); + + /* set default value */ + *procs = NULL; + *num_procs = 0; + + /* check the attributes to see if USE_JOB or USE_CELL has been set. If not, then this is + * a request for my own job peers - process that one locally + */ + + /* if the cell is given AND it matches my own, then we can process this + * quickly. Otherwise, we have to do some more work. + * + * RHC: when we go multi-cell, we need a way to find all the cells upon + * which a job is executing so we can make this work! 
+ */ + if (NULL != (attr = orte_rmgr.find_attribute(attrs, ORTE_NS_USE_CELL))) { + if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&cptr, attr->value, ORTE_CELLID))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return rc; + } + if (*cptr != ORTE_PROC_MY_NAME->cellid && *cptr != ORTE_CELLID_WILDCARD) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_NOT_IMPLEMENTED; + } + } + + if (NULL == (attr = orte_rmgr.find_attribute(attrs, ORTE_NS_USE_JOBID))) { + /* get my own job peers, assuming all are on this cell - process here + * + * RHC: This is a bad assumption. When we go multi-cell, we are going to have to process + * get peer requests solely on the HNP since we won't know the cellid otherwise + */ + *procs = (orte_process_name_t*)malloc(orte_process_info.num_procs * sizeof(orte_process_name_t)); + if (NULL == *procs) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + for (i=0; i < orte_process_info.num_procs; i++) { + (*procs)[i].cellid = ORTE_PROC_MY_NAME->cellid; + (*procs)[i].jobid = ORTE_PROC_MY_NAME->jobid; + (*procs)[i].vpid = orte_process_info.vpid_start + i; + } + + *num_procs = orte_process_info.num_procs; + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_SUCCESS; + } + + /* non-local request for peers in another job - send to replica for processing */ + if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + command = ORTE_NS_GET_PEERS_CMD; + /* pack the command */ + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) { /* got a problem */ + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return rc; + } + + /* pack the attributes */ + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, attrs, 1, 
ORTE_ATTR_LIST))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return rc; + } + + if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(cmd); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_COMM_FAILURE; + } + OBJ_RELEASE(cmd); + + if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OBJ_RELEASE(answer); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer, ORTE_RML_TAG_NS)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(answer); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_COMM_FAILURE; + } + + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(answer); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return rc; + } + + if (ORTE_NS_GET_PEERS_CMD != command) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(answer); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_COMM_FAILURE; + } + + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &nprocs, &count, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(answer); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return rc; + } + + /* allocate space for array of proc names */ + if (0 < nprocs) { + *procs = (orte_process_name_t*)malloc((nprocs) * sizeof(orte_process_name_t)); + if (NULL == *procs) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OBJ_RELEASE(answer); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, *procs, &nprocs, ORTE_NAME))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(answer); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return rc; + } + } + *num_procs = nprocs; + + OBJ_RELEASE(answer); + 
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_SUCCESS; +} + + +int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag, + char *name) +{ + orte_buffer_t* cmd; + orte_buffer_t* answer; + orte_ns_cmd_flag_t command; + orte_ns_proxy_tagitem_t* tagitem, **tags; + orte_std_cntr_t count, i; + orte_rml_tag_t j; + int rc; + + OPAL_THREAD_LOCK(&orte_ns_proxy.mutex); + + if (NULL != name) { + /* see if this name is already in list - if so, return tag */ + tags = (orte_ns_proxy_tagitem_t**)orte_ns_proxy.tags->addr; + for (i=0, j=0; j < orte_ns_proxy.num_tags && + i < (orte_ns_proxy.tags)->size; i++) { + if (NULL != tags[i]) { + j++; + if (tags[i]->name != NULL && + 0 == strcmp(name, tags[i]->name)) { /* found name on list */ + *tag = tags[i]->tag; + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_SUCCESS; + } + } + } + } + + /* okay, not on local list - so go get one from tag server */ + command = ORTE_NS_ASSIGN_OOB_TAG_CMD; + *tag = ORTE_RML_TAG_MAX; /* set the default error value */ + + if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return rc; + } + + if (NULL == name) { + name = "NULL"; + } + + if (0 > (rc = orte_dss.pack(cmd, &name, 1, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return rc; + } + + if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(cmd); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_COMM_FAILURE; + } + OBJ_RELEASE(cmd); + + if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; 
+ } + + if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer, ORTE_RML_TAG_NS)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(answer); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_COMM_FAILURE; + } + + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(answer); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return rc; + } + + if (ORTE_NS_ASSIGN_OOB_TAG_CMD != command) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(answer); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_COMM_FAILURE; + } + + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, tag, &count, ORTE_RML_TAG))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(answer); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return rc; + } + + OBJ_RELEASE(answer); + + /* add the new tag to the local list so we don't have to get it again */ + tagitem = OBJ_NEW(orte_ns_proxy_tagitem_t); + if (NULL == tagitem) { /* out of memory */ + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&i, + orte_ns_proxy.tags, tagitem))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return rc; + } + tagitem->tag = *tag; + (orte_ns_proxy.num_tags)++; + if (NULL != name) { /* provided - can look it up later */ + tagitem->name = strdup(name); + } else { + tagitem->name = NULL; + } + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + + /* all done */ + return ORTE_SUCCESS; +} + + +int orte_ns_proxy_define_data_type(const char *name, + orte_data_type_t *type) +{ + orte_buffer_t* cmd; + orte_buffer_t* answer; + orte_ns_cmd_flag_t command; + orte_ns_proxy_dti_t **dti, *dtip; + orte_std_cntr_t count, i, j; + int rc=ORTE_SUCCESS; + + if (NULL == name || 0 < *type) { + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + return ORTE_ERR_BAD_PARAM; + } + + 
OPAL_THREAD_LOCK(&orte_ns_proxy.mutex); + + /* first, check to see if name is already on local list + * if so, return id, ensure registered with dss + */ + dti = (orte_ns_proxy_dti_t**)orte_ns_proxy.dts->addr; + for (i=0, j=0; j < orte_ns_proxy.num_dts && + i < orte_ns_proxy.dts->size; i++) { + if (NULL != dti[i]) { + j++; + if (dti[i]->name != NULL && + 0 == strcmp(name, dti[i]->name)) { /* found name on list */ + *type = dti[i]->id; + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_SUCCESS; + } + } + } + + + /* okay, not on local list - so go get one from tag server */ + command = ORTE_NS_DEFINE_DATA_TYPE_CMD; + *type = ORTE_DSS_ID_MAX; /* set the default error value */ + + if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return rc; + } + + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&name, 1, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return rc; + } + + if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(cmd); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_COMM_FAILURE; + } + OBJ_RELEASE(cmd); + + if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer, ORTE_RML_TAG_NS)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(answer); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_COMM_FAILURE; + } + + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) { + 
ORTE_ERROR_LOG(rc); + OBJ_RELEASE(answer); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return rc; + } + + if (ORTE_NS_ASSIGN_OOB_TAG_CMD != command) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(answer); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_COMM_FAILURE; + } + + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, type, &count, ORTE_DATA_TYPE))) { + ORTE_ERROR_LOG(ORTE_ERR_UNPACK_FAILURE); + OBJ_RELEASE(answer); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_UNPACK_FAILURE; + } + OBJ_RELEASE(answer); + + /* add the new id to the local list so we don't have to get it again */ + dtip = OBJ_NEW(orte_ns_proxy_dti_t); + if (NULL == dtip) { /* out of memory */ + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + dtip->name = strdup(name); + if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&i, + orte_ns_proxy.dts, dtip))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return rc; + } + dtip->id = *type; + (orte_ns_proxy.num_dts)++; + + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + + /* all done */ + return rc; +} + +/* + * Take advantage of the way the RML uses the process name as its index into + * the RML communicator table. Because the RML needs a name right away, it will + * automatically assign us one when it receives a message - and it communicates + * that assignment back to us automatically. Thus, to get a name for ourselves, + * all we have to do is send a message! No response from the replica is required. 
+ */ +int orte_ns_proxy_create_my_name(void) +{ + orte_buffer_t* cmd; + orte_ns_cmd_flag_t command; + int rc; + + command = ORTE_NS_CREATE_MY_NAME_CMD; + + cmd = OBJ_NEW(orte_buffer_t); + if (cmd == NULL) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &command, 1, ORTE_NS_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + + if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(cmd); + return ORTE_ERR_COMM_FAILURE; + } + OBJ_RELEASE(cmd); + + return ORTE_SUCCESS; +} + diff --git a/orte/mca/ns/proxy/ns_proxy_job_fns.c b/orte/mca/ns/proxy/ns_proxy_job_fns.c new file mode 100644 index 0000000000..181d94ccc9 --- /dev/null +++ b/orte/mca/ns/proxy/ns_proxy_job_fns.c @@ -0,0 +1,526 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** @file: + * + */ + +#include "orte_config.h" + +#include <string.h> + +#include "orte/orte_constants.h" +#include "orte/orte_types.h" + +#include "opal/mca/mca.h" +#include "opal/util/output.h" +#include "opal/util/trace.h" + +#include "orte/dss/dss.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/rml/rml.h" + +#include "ns_proxy.h" + +/**** CREATE JOBID: send ORTE_NS_CREATE_JOBID_CMD and the attribute list to the replica, then unpack the jobid it returns into *job ****/ +int orte_ns_proxy_create_jobid(orte_jobid_t *job, opal_list_t *attrs) +{ + orte_buffer_t* cmd; + orte_buffer_t* answer; + orte_ns_cmd_flag_t command; + orte_std_cntr_t count; + int rc; + + OPAL_TRACE(1); + + /* set default value - left in place on every error return taken before the final unpack */ + *job = ORTE_JOBID_INVALID; + + if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + command = ORTE_NS_CREATE_JOBID_CMD; + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, attrs, 1, ORTE_ATTR_LIST))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + + if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(cmd); + return ORTE_ERR_COMM_FAILURE; + } + OBJ_RELEASE(cmd); + + if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + /* answer is NULL here, so there is nothing to release */ + return ORTE_ERR_OUT_OF_RESOURCE; + } + if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer, ORTE_RML_TAG_NS)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(answer); + return ORTE_ERR_COMM_FAILURE; + } + + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(answer); + return rc; + } + + if (ORTE_NS_CREATE_JOBID_CMD != command) { /* the reply does not carry the command that was sent */ + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(answer); + return
ORTE_ERR_COMM_FAILURE; + } + + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, job, &count, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(answer); + return rc; + } + + OBJ_RELEASE(answer); + return ORTE_SUCCESS; +} + + +/**** GET JOB DESCENDANTS: send ORTE_NS_GET_JOB_DESC_CMD and the jobid to the replica; the reply carries a count followed by that many jobids ****/ +int orte_ns_proxy_get_job_descendants(orte_jobid_t **descendants, orte_std_cntr_t *num_desc, orte_jobid_t job) +{ + orte_buffer_t* cmd; + orte_buffer_t* answer; + orte_ns_cmd_flag_t command; + orte_std_cntr_t count, ndesc=0; + orte_jobid_t *jobs=NULL; + int rc; + + OPAL_TRACE(1); + + /* set default response - left in place on every error return */ + *descendants = NULL; + *num_desc = 0; + + if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + command = ORTE_NS_GET_JOB_DESC_CMD; + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) { /* got a problem */ + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&job, 1, ORTE_JOBID))) { /* got a problem */ + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + + if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(cmd); + return ORTE_ERR_COMM_FAILURE; + } + OBJ_RELEASE(cmd); + + if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + /* answer is NULL here, so there is nothing to release */ + return ORTE_ERR_OUT_OF_RESOURCE; + } + if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer, ORTE_RML_TAG_NS)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(answer); + return ORTE_ERR_COMM_FAILURE; + } + + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(answer); + return rc; + } + + if (ORTE_NS_GET_JOB_DESC_CMD != command) { /* the reply does not carry the command that was sent */ + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(answer); + return ORTE_ERR_COMM_FAILURE; + } + + count = 1; + if
(ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &ndesc, &count, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(answer); + return rc; + } + + /* if there are any descendants, allocate space for them and unpack */ + if (0 < ndesc) { + jobs = (orte_jobid_t*)malloc(ndesc * sizeof(orte_jobid_t)); + if (NULL == jobs) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OBJ_RELEASE(answer); + return ORTE_ERR_OUT_OF_RESOURCE; + } + count = ndesc; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, jobs, &count, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + free(jobs); /* the array is not handed to the caller on failure, so release it here */ + OBJ_RELEASE(answer); + return rc; + } + } + + OBJ_RELEASE(answer); + + *descendants = jobs; /* ownership of the malloc'd array passes to the caller */ + *num_desc = (0 < ndesc) ? count : 0; /* with no jobids to unpack, count still holds 1 from the ndesc unpack */ + + return ORTE_SUCCESS; +} + +/**** GET JOB CHILDREN: send ORTE_NS_GET_JOB_CHILD_CMD and the jobid to the replica ****/ +int orte_ns_proxy_get_job_children(orte_jobid_t **descendants, orte_std_cntr_t *num_desc, orte_jobid_t job) +{ + orte_buffer_t* cmd; + orte_buffer_t* answer; + orte_ns_cmd_flag_t command; + orte_std_cntr_t count, ndesc=0; + orte_jobid_t *jobs=NULL; + int rc; + + OPAL_TRACE(1); + + /* set default response - left in place on every error return */ + *descendants = NULL; + *num_desc = 0; + + if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + command = ORTE_NS_GET_JOB_CHILD_CMD; + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) { /* got a problem */ + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&job, 1, ORTE_JOBID))) { /* got a problem */ + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + + if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(cmd); + return ORTE_ERR_COMM_FAILURE; + } + OBJ_RELEASE(cmd); + + if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); /* answer is NULL here, so there is nothing to release */ + return ORTE_ERR_OUT_OF_RESOURCE; + } + if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer,
ORTE_RML_TAG_NS)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(answer); + return ORTE_ERR_COMM_FAILURE; + } + + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(answer); + return rc; + } + + if (ORTE_NS_GET_JOB_CHILD_CMD != command) { /* compare against the command this function sent, not the descendants one */ + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(answer); + return ORTE_ERR_COMM_FAILURE; + } + + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &ndesc, &count, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(answer); + return rc; + } + + /* if there are any descendants, allocate space for them and unpack */ + if (0 < ndesc) { + jobs = (orte_jobid_t*)malloc(ndesc * sizeof(orte_jobid_t)); + if (NULL == jobs) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OBJ_RELEASE(answer); + return ORTE_ERR_OUT_OF_RESOURCE; + } + count = ndesc; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, jobs, &count, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + free(jobs); /* the array is not handed to the caller on failure, so release it here */ + OBJ_RELEASE(answer); + return rc; + } + } + + OBJ_RELEASE(answer); + + *descendants = jobs; /* ownership of the malloc'd array passes to the caller */ + *num_desc = (0 < ndesc) ? count : 0; /* with no jobids to unpack, count still holds 1 from the ndesc unpack */ + return ORTE_SUCCESS; +} + +int orte_ns_proxy_get_root_job(orte_jobid_t *root_job, orte_jobid_t job) +{ + orte_buffer_t* cmd; + orte_buffer_t* answer; + orte_ns_cmd_flag_t command; + orte_std_cntr_t count; + int rc; + + OPAL_TRACE(1); + + /* set default value - left in place on every error return taken before the final unpack */ + *root_job = ORTE_JOBID_INVALID; + + if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + command = ORTE_NS_GET_ROOT_JOB_CMD; + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &job, 1, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + + if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); +
OBJ_RELEASE(cmd); + return ORTE_ERR_COMM_FAILURE; + } + OBJ_RELEASE(cmd); + + if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + /* answer is NULL here, so there is nothing to release */ + return ORTE_ERR_OUT_OF_RESOURCE; + } + if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer, ORTE_RML_TAG_NS)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(answer); + return ORTE_ERR_COMM_FAILURE; + } + + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(answer); + return rc; + } + + if (ORTE_NS_GET_ROOT_JOB_CMD != command) { /* the reply does not carry the command that was sent */ + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(answer); + return ORTE_ERR_COMM_FAILURE; + } + + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, root_job, &count, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(answer); + return rc; + } + + OBJ_RELEASE(answer); + return ORTE_SUCCESS; +} + +int orte_ns_proxy_get_parent_job(orte_jobid_t *parent, orte_jobid_t job) +{ + orte_buffer_t* cmd; + orte_buffer_t* answer; + orte_ns_cmd_flag_t command; + orte_std_cntr_t count; + int rc; + + OPAL_TRACE(1); + + /* set default value - left in place on every error return taken before the final unpack */ + *parent = ORTE_JOBID_INVALID; + + if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + command = ORTE_NS_GET_PARENT_JOB_CMD; + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &job, 1, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + + if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(cmd); + return ORTE_ERR_COMM_FAILURE; + } + OBJ_RELEASE(cmd); + + if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + /* answer is NULL here, so there is nothing to release */ + return
ORTE_ERR_OUT_OF_RESOURCE; + } + if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer, ORTE_RML_TAG_NS)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(answer); + return ORTE_ERR_COMM_FAILURE; + } + + count = 1; /* one command flag is read from the front of the reply */ + if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(answer); + return rc; + } + + if (ORTE_NS_GET_PARENT_JOB_CMD != command) { /* the reply does not carry the command that was sent */ + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(answer); + return ORTE_ERR_COMM_FAILURE; + } + + count = 1; /* one jobid is read into *parent */ + if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, parent, &count, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(answer); + return rc; + } + + OBJ_RELEASE(answer); + return ORTE_SUCCESS; +} + +/**** RESERVE RANGE: pack the command, job and range, send them to the replica, and read the reply into *starting_vpid ****/ +int orte_ns_proxy_reserve_range(orte_jobid_t job, orte_vpid_t range, orte_vpid_t *starting_vpid) +{ + orte_buffer_t* cmd; + orte_buffer_t* answer; + orte_ns_cmd_flag_t command; + orte_std_cntr_t count; + int rc; + + OPAL_TRACE(1); + + /* set default return value - overwritten only by the final unpack of the reply */ + *starting_vpid = ORTE_VPID_INVALID; + + if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + command = ORTE_NS_RESERVE_RANGE_CMD; + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) { /* got a problem */ + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&job, 1, ORTE_JOBID))) { /* got a problem */ + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&range, 1, ORTE_VPID))) { /* got a problem */ + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + + if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(cmd); + return ORTE_ERR_COMM_FAILURE; + } + OBJ_RELEASE(cmd); /* the request buffer is no longer needed once it has been sent */ + + + if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); +
return ORTE_ERR_OUT_OF_RESOURCE; + } + + if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer, ORTE_RML_TAG_NS)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(answer); + return ORTE_ERR_COMM_FAILURE; + } + + count = 1; + if ((ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) || (ORTE_NS_RESERVE_RANGE_CMD != command)) { + if (ORTE_SUCCESS == rc) { rc = ORTE_ERR_COMM_FAILURE; } /* unpack worked but the reply carries the wrong command: never return success here */ + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(answer); + return rc; + } + + count = 1; /* one vpid is read into *starting_vpid */ + if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, starting_vpid, &count, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(answer); + return rc; + } + OBJ_RELEASE(answer); + return ORTE_SUCCESS; +} + diff --git a/orte/mca/ns/proxy/src/Makefile.extra b/orte/mca/ns/proxy/src/Makefile.extra deleted file mode 100644 index 8e963be1ce..0000000000 --- a/orte/mca/ns/proxy/src/Makefile.extra +++ /dev/null @@ -1,23 +0,0 @@ -# -*- makefile -*- -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -sources += \ - src/ns_proxy.h \ - src/ns_proxy.c \ - src/ns_proxy_component.c diff --git a/orte/mca/ns/proxy/src/ns_proxy.c b/orte/mca/ns/proxy/src/ns_proxy.c deleted file mode 100644 index 2cdbabfd2e..0000000000 --- a/orte/mca/ns/proxy/src/ns_proxy.c +++ /dev/null @@ -1,1071 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved.
- * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** @file: - * - */ - -#include "orte_config.h" - -#include <string.h> - -#include "orte/orte_constants.h" -#include "orte/orte_types.h" - -#include "opal/mca/mca.h" -#include "opal/util/output.h" -#include "opal/util/trace.h" - -#include "orte/dss/dss.h" -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/rml/rml.h" - -#include "ns_proxy.h" - -/** - * globals - */ - -/* - * functions - */ - -int orte_ns_proxy_create_cellid(orte_cellid_t *cellid, char *site, char *resource) -{ - orte_buffer_t* cmd; - orte_buffer_t* answer; - orte_ns_cmd_flag_t command; - orte_std_cntr_t count, index; - int rc; - orte_ns_proxy_cell_info_t *new_cell; - - OPAL_TRACE(1); - - /* set the default value of error */ - *cellid = ORTE_CELLID_MAX; - - command = ORTE_NS_CREATE_CELLID_CMD; - - cmd = OBJ_NEW(orte_buffer_t); - if (cmd == NULL) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &command, 1, ORTE_NS_CMD))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(cmd); - return rc; - } - - if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &site, 1, ORTE_STRING))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(cmd); - return rc; - } - - if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &resource, 1, ORTE_STRING))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(cmd); - return rc; - } - - if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_RELEASE(cmd); - return ORTE_ERR_COMM_FAILURE; - } - OBJ_RELEASE(cmd); - - answer = 
OBJ_NEW(orte_buffer_t); - if(answer == NULL) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, answer, ORTE_RML_TAG_NS)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_RELEASE(answer); - return ORTE_ERR_COMM_FAILURE; - } - - count = 1; - if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(answer); - return rc; - } - - if (ORTE_NS_CREATE_CELLID_CMD != command) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_RELEASE(answer); - return ORTE_ERR_COMM_FAILURE; - } - - count = 1; - if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, cellid, &count, ORTE_CELLID))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(answer); - return rc; - } - OBJ_RELEASE(answer); - - /* store the info locally for later retrieval */ - OPAL_THREAD_LOCK(&orte_ns_proxy.mutex); - new_cell = OBJ_NEW(orte_ns_proxy_cell_info_t); - if (NULL == new_cell) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_ERR_OUT_OF_RESOURCE; - } - if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&index, - orte_ns_proxy.cells, new_cell))) { - ORTE_ERROR_LOG(rc); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return rc; - } - if (NULL != site) { - new_cell->site = strdup(site); - } - if (NULL != resource) { - new_cell->resource = strdup(resource); - } - - new_cell->cellid = orte_ns_proxy.num_cells; - *cellid = new_cell->cellid; - (orte_ns_proxy.num_cells)++; - - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_SUCCESS; -} - - -int orte_ns_proxy_get_cell_info(orte_cellid_t cellid, - char **site, char **resource) -{ - orte_buffer_t* cmd; - orte_buffer_t* answer; - orte_ns_cmd_flag_t command; - orte_cellid_t j; - orte_std_cntr_t i, count, index; - orte_ns_proxy_cell_info_t **cell, *new_cell; - int rc, ret=ORTE_SUCCESS; - - OPAL_TRACE(1); - - /* see if we already have the info locally */ - 
OPAL_THREAD_LOCK(&orte_ns_proxy.mutex); - - cell = (orte_ns_proxy_cell_info_t**)(orte_ns_proxy.cells)->addr; - for (i=0, j=0; j < orte_ns_proxy.num_cells && - i < (orte_ns_proxy.cells)->size; i++) { - if (NULL != cell[i]) { - j++; - if (cellid == cell[i]->cellid) { - *site = strdup(cell[i]->site); - *resource = strdup(cell[i]->resource); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_SUCCESS; - } - } - } - - /* okay, don't have it locally - go ask for it */ - - command = ORTE_NS_GET_CELL_INFO_CMD; - - cmd = OBJ_NEW(orte_buffer_t); - if (cmd == NULL) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &command, 1, ORTE_NS_CMD))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(cmd); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return rc; - } - - if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &cellid, 1, ORTE_CELLID))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(cmd); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return rc; - } - - if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_RELEASE(cmd); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_ERR_COMM_FAILURE; - } - OBJ_RELEASE(cmd); - - answer = OBJ_NEW(orte_buffer_t); - if(answer == NULL) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, answer, ORTE_RML_TAG_NS)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_RELEASE(answer); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_ERR_COMM_FAILURE; - } - - count = 1; - if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(answer); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return rc; - } - - if (ORTE_NS_GET_CELL_INFO_CMD != command) { - 
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_RELEASE(answer); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_ERR_COMM_FAILURE; - } - - count = 1; - if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, site, &count, ORTE_STRING))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(answer); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return rc; - } - - count = 1; - if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, resource, &count, ORTE_STRING))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(answer); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return rc; - } - - count = 1; - if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &ret, &count, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(answer); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return rc; - } - - if (ORTE_SUCCESS == ret) { - /* remote operation worked - store the info locally for any subsequent requests */ - new_cell = OBJ_NEW(orte_ns_proxy_cell_info_t); - if (NULL == new_cell) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_ERR_OUT_OF_RESOURCE; - } - if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&index, - orte_ns_proxy.cells, new_cell))) { - ORTE_ERROR_LOG(rc); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return rc; - } - if (NULL != site) { - new_cell->site = strdup(*site); - } - if (NULL != resource) { - new_cell->resource = strdup(*resource); - } - - new_cell->cellid = cellid; - } - - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ret; -} - -int orte_ns_proxy_create_jobid(orte_jobid_t *job) -{ - orte_buffer_t* cmd; - orte_buffer_t* answer; - orte_ns_cmd_flag_t command; - orte_std_cntr_t count; - int rc; - - OPAL_TRACE(1); - - /* set default value */ - *job = ORTE_JOBID_MAX; - - if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - command = ORTE_NS_CREATE_JOBID_CMD; - if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) { /* got a 
problem */ - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(cmd); - return rc; - } - - if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_RELEASE(cmd); - return ORTE_ERR_COMM_FAILURE; - } - OBJ_RELEASE(cmd); - - if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - OBJ_RELEASE(answer); - return ORTE_ERR_OUT_OF_RESOURCE; - } - if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, answer, ORTE_RML_TAG_NS)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_RELEASE(answer); - return ORTE_ERR_COMM_FAILURE; - } - - count = 1; - if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(answer); - return rc; - } - - if (ORTE_NS_CREATE_JOBID_CMD != command) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_RELEASE(answer); - return ORTE_ERR_COMM_FAILURE; - } - - count = 1; - if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, job, &count, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(answer); - return rc; - } - - OBJ_RELEASE(answer); - return ORTE_SUCCESS; -} - - -int orte_ns_proxy_reserve_range(orte_jobid_t job, orte_vpid_t range, orte_vpid_t *starting_vpid) -{ - orte_buffer_t* cmd; - orte_buffer_t* answer; - orte_ns_cmd_flag_t command; - orte_std_cntr_t count; - int rc; - - OPAL_TRACE(1); - - /* set default return value */ - *starting_vpid = ORTE_VPID_MAX; - - if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - command = ORTE_NS_RESERVE_RANGE_CMD; - if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) { /* got a problem */ - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(cmd); - return rc; - } - - if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&job, 1, ORTE_JOBID))) { /* got a problem */ - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(cmd); - return rc; - } - - if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, 
(void*)&range, 1, ORTE_VPID))) { /* got a problem */ - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(cmd); - return rc; - } - - if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_RELEASE(cmd); - return ORTE_ERR_COMM_FAILURE; - } - OBJ_RELEASE(cmd); - - - if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, answer, ORTE_RML_TAG_NS)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_RELEASE(answer); - return ORTE_ERR_COMM_FAILURE; - } - - count = 1; - if ((ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) - || (ORTE_NS_RESERVE_RANGE_CMD != command)) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(answer); - return rc; - } - - count = 1; - if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, starting_vpid, &count, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(answer); - return rc; - } - OBJ_RELEASE(answer); - return ORTE_SUCCESS; -} - - -/* - * PEER functions - */ -int orte_ns_proxy_get_job_peers(orte_process_name_t **procs, - orte_std_cntr_t *num_procs, orte_jobid_t job) -{ - orte_buffer_t* cmd; - orte_buffer_t* answer; - orte_ns_cmd_flag_t command; - orte_std_cntr_t count, nprocs; - int rc; - - OPAL_TRACE_ARG1(1, job); - - /* set default value */ - *procs = NULL; - *num_procs = 0; - - if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - command = ORTE_NS_GET_JOB_PEERS_CMD; - /* pack the command */ - if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) { /* got a problem */ - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(cmd); - return rc; - } - - /* pack the jobid */ - if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &job, 1, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(cmd); - return rc; - } - - if (0 > 
orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_RELEASE(cmd); - return ORTE_ERR_COMM_FAILURE; - } - OBJ_RELEASE(cmd); - - if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - OBJ_RELEASE(answer); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, answer, ORTE_RML_TAG_NS)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_RELEASE(answer); - return ORTE_ERR_COMM_FAILURE; - } - - count = 1; - if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(answer); - return rc; - } - - if (ORTE_NS_GET_JOB_PEERS_CMD != command) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_RELEASE(answer); - return ORTE_ERR_COMM_FAILURE; - } - - count = 1; - if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &nprocs, &count, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(answer); - return rc; - } - - /* allocate space for array of proc names */ - if (0 < nprocs) { - *procs = (orte_process_name_t*)malloc((nprocs) * sizeof(orte_process_name_t)); - if (NULL == *procs) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - OBJ_RELEASE(answer); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, procs, &nprocs, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(answer); - return rc; - } - } - *num_procs = nprocs; - - OBJ_RELEASE(answer); - return ORTE_SUCCESS; -} - - -int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag, - char *name) -{ - orte_buffer_t* cmd; - orte_buffer_t* answer; - orte_ns_cmd_flag_t command; - orte_ns_proxy_tagitem_t* tagitem, **tags; - orte_std_cntr_t count, i; - orte_rml_tag_t j; - int rc; - - OPAL_THREAD_LOCK(&orte_ns_proxy.mutex); - - if (NULL != name) { - /* see if this name is already in list - if so, return tag */ - tags = (orte_ns_proxy_tagitem_t**)orte_ns_proxy.tags->addr; - for (i=0, j=0; j < 
orte_ns_proxy.num_tags && - i < (orte_ns_proxy.tags)->size; i++) { - if (NULL != tags[i]) { - j++; - if (tags[i]->name != NULL && - 0 == strcmp(name, tags[i]->name)) { /* found name on list */ - *tag = tags[i]->tag; - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_SUCCESS; - } - } - } - } - - /* okay, not on local list - so go get one from tag server */ - command = ORTE_NS_ASSIGN_OOB_TAG_CMD; - *tag = ORTE_RML_TAG_MAX; /* set the default error value */ - - if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(cmd); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return rc; - } - - if (NULL == name) { - name = "NULL"; - } - - if (0 > (rc = orte_dss.pack(cmd, &name, 1, ORTE_STRING))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(cmd); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return rc; - } - - if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_RELEASE(cmd); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_ERR_COMM_FAILURE; - } - OBJ_RELEASE(cmd); - - if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, answer, ORTE_RML_TAG_NS)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_RELEASE(answer); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_ERR_COMM_FAILURE; - } - - count = 1; - if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(answer); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return rc; - } - - if (ORTE_NS_ASSIGN_OOB_TAG_CMD != command) { - 
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_RELEASE(answer); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_ERR_COMM_FAILURE; - } - - count = 1; - if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, tag, &count, ORTE_RML_TAG))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(answer); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return rc; - } - - OBJ_RELEASE(answer); - - /* add the new tag to the local list so we don't have to get it again */ - tagitem = OBJ_NEW(orte_ns_proxy_tagitem_t); - if (NULL == tagitem) { /* out of memory */ - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_ERR_OUT_OF_RESOURCE; - } - if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&i, - orte_ns_proxy.tags, tagitem))) { - ORTE_ERROR_LOG(rc); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return rc; - } - tagitem->tag = *tag; - (orte_ns_proxy.num_tags)++; - if (NULL != name) { /* provided - can look it up later */ - tagitem->name = strdup(name); - } else { - tagitem->name = NULL; - } - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - - /* all done */ - return ORTE_SUCCESS; -} - - -int orte_ns_proxy_define_data_type(const char *name, - orte_data_type_t *type) -{ - orte_buffer_t* cmd; - orte_buffer_t* answer; - orte_ns_cmd_flag_t command; - orte_ns_proxy_dti_t **dti, *dtip; - orte_std_cntr_t count, i, j; - int rc=ORTE_SUCCESS; - - if (NULL == name || 0 < *type) { - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - - OPAL_THREAD_LOCK(&orte_ns_proxy.mutex); - - /* first, check to see if name is already on local list - * if so, return id, ensure registered with dss - */ - dti = (orte_ns_proxy_dti_t**)orte_ns_proxy.dts->addr; - for (i=0, j=0; j < orte_ns_proxy.num_dts && - i < orte_ns_proxy.dts->size; i++) { - if (NULL != dti[i]) { - j++; - if (dti[i]->name != NULL && - 0 == strcmp(name, dti[i]->name)) { /* found name on list */ - *type = dti[i]->id; - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_SUCCESS; - } - } - } - - 
- /* okay, not on local list - so go get one from tag server */ - command = ORTE_NS_DEFINE_DATA_TYPE_CMD; - *type = ORTE_DSS_ID_MAX; /* set the default error value */ - - if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(cmd); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return rc; - } - - if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&name, 1, ORTE_STRING))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(cmd); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return rc; - } - - if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_RELEASE(cmd); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_ERR_COMM_FAILURE; - } - OBJ_RELEASE(cmd); - - if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, answer, ORTE_RML_TAG_NS)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_RELEASE(answer); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_ERR_COMM_FAILURE; - } - - count = 1; - if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(answer); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return rc; - } - - if (ORTE_NS_ASSIGN_OOB_TAG_CMD != command) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_RELEASE(answer); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_ERR_COMM_FAILURE; - } - - count = 1; - if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, type, &count, ORTE_DATA_TYPE))) { - ORTE_ERROR_LOG(ORTE_ERR_UNPACK_FAILURE); - OBJ_RELEASE(answer); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return 
ORTE_ERR_UNPACK_FAILURE; - } - OBJ_RELEASE(answer); - - /* add the new id to the local list so we don't have to get it again */ - dtip = OBJ_NEW(orte_ns_proxy_dti_t); - if (NULL == dtip) { /* out of memory */ - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_ERR_OUT_OF_RESOURCE; - } - dtip->name = strdup(name); - if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&i, - orte_ns_proxy.dts, dtip))) { - ORTE_ERROR_LOG(rc); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return rc; - } - dtip->id = *type; - (orte_ns_proxy.num_dts)++; - - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - - /* all done */ - return rc; -} - -/* - * Take advantage of the way the RML uses the process name as its index into - * the RML communicator table. Because the RML needs a name right away, it will - * automatically assign us one when it receives a message - and it communicates - * that assignment back to us automatically. Thus, to get a name for ourselves, - * all we have to do is send a message! No response from the replica is required. 
- */ -int orte_ns_proxy_create_my_name(void) -{ - orte_buffer_t* cmd; - orte_ns_cmd_flag_t command; - int rc; - - command = ORTE_NS_CREATE_MY_NAME_CMD; - - cmd = OBJ_NEW(orte_buffer_t); - if (cmd == NULL) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &command, 1, ORTE_NS_CMD))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(cmd); - return rc; - } - - if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_RELEASE(cmd); - return ORTE_ERR_COMM_FAILURE; - } - OBJ_RELEASE(cmd); - - return ORTE_SUCCESS; -} - -/* - * DIAGNOSTIC functions - */ -int orte_ns_proxy_dump_cells(void) -{ - orte_buffer_t cmd; - orte_buffer_t answer; - orte_ns_cmd_flag_t command; - orte_std_cntr_t i; - orte_cellid_t j; - orte_ns_proxy_cell_info_t **ptr; - int rc; - - command = ORTE_NS_DUMP_CELLS_CMD; - - OPAL_THREAD_LOCK(&orte_ns_proxy.mutex); - - /* dump name service replica cell tracker */ - OBJ_CONSTRUCT(&cmd, orte_buffer_t); - if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &command, 1, ORTE_NS_CMD))) { - ORTE_ERROR_LOG(rc); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - OBJ_DESTRUCT(&cmd); - return rc; - } - - if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, ORTE_RML_TAG_NS, 0)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_DESTRUCT(&cmd); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_ERR_COMM_FAILURE; - } - OBJ_DESTRUCT(&cmd); - - OBJ_CONSTRUCT(&answer, orte_buffer_t); - if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, &answer, ORTE_RML_TAG_NS)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_DESTRUCT(&answer); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_ERR_COMM_FAILURE; - } - - if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&answer))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&answer); - return rc; - } - - /* dump local cell tracker */ - opal_output(mca_ns_base_output, "\n\n[%lu,%lu,%lu] Dump 
of Local Cell Tracker\n", - ORTE_NAME_ARGS(orte_process_info.my_name)); - ptr = (orte_ns_proxy_cell_info_t**)(orte_ns_proxy.cells)->addr; - for (i=0, j=0; j < orte_ns_proxy.num_cells && - i < (orte_ns_proxy.cells)->size; i++) { - if (NULL != ptr[i]) { - j++; - opal_output(mca_ns_base_output, "Num: %lu\tCell: %lu\n", - (unsigned long)j, (unsigned long)ptr[i]->cellid); - } - } - - return ORTE_SUCCESS; -} - - -int orte_ns_proxy_dump_jobs(void) -{ - orte_buffer_t cmd; - orte_buffer_t answer; - orte_ns_cmd_flag_t command; - int rc; - - command = ORTE_NS_DUMP_JOBIDS_CMD; - - OPAL_THREAD_LOCK(&orte_ns_proxy.mutex); - - /* dump name service replica jobid tracker */ - OBJ_CONSTRUCT(&cmd, orte_buffer_t); - if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &command, 1, ORTE_NS_CMD))) { - ORTE_ERROR_LOG(rc); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - OBJ_DESTRUCT(&cmd); - return rc; - } - - if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, ORTE_RML_TAG_NS, 0)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_DESTRUCT(&cmd); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_ERR_COMM_FAILURE; - } - OBJ_DESTRUCT(&cmd); - - OBJ_CONSTRUCT(&answer, orte_buffer_t); - if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, &answer, ORTE_RML_TAG_NS)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_DESTRUCT(&answer); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_ERR_COMM_FAILURE; - } - - if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&answer))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&answer); - return rc; - } - - return ORTE_SUCCESS; -} - - -int orte_ns_proxy_dump_tags(void) -{ - orte_buffer_t cmd; - orte_buffer_t answer; - orte_ns_cmd_flag_t command; - orte_std_cntr_t i; - orte_rml_tag_t j; - orte_ns_proxy_tagitem_t **ptr; - int rc; - - command = ORTE_NS_DUMP_TAGS_CMD; - - OPAL_THREAD_LOCK(&orte_ns_proxy.mutex); - - /* dump name service replica tag tracker */ - OBJ_CONSTRUCT(&cmd, orte_buffer_t); - if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &command, 
1, ORTE_NS_CMD))) { - ORTE_ERROR_LOG(rc); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - OBJ_DESTRUCT(&cmd); - return rc; - } - - if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, ORTE_RML_TAG_NS, 0)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_DESTRUCT(&cmd); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_ERR_COMM_FAILURE; - } - OBJ_DESTRUCT(&cmd); - - OBJ_CONSTRUCT(&answer, orte_buffer_t); - if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, &answer, ORTE_RML_TAG_NS)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_DESTRUCT(&answer); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_ERR_COMM_FAILURE; - } - - if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&answer))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&answer); - return rc; - } - - /* dump local tag tracker */ - opal_output(mca_ns_base_output, "\n\n[%lu,%lu,%lu] Dump of Local Tag Tracker\n", - ORTE_NAME_ARGS(orte_process_info.my_name)); - ptr = (orte_ns_proxy_tagitem_t**)(orte_ns_proxy.tags)->addr; - for (i=0, j=0; j < orte_ns_proxy.num_tags && - i < (orte_ns_proxy.tags)->size; i++) { - if (NULL != ptr[i]) { - j++; - opal_output(mca_ns_base_output, "Num: %lu\tTag: %lu\tTag name: %s\n", - (unsigned long)j, (unsigned long)ptr[i]->tag, ptr[i]->name); - } - } - - return ORTE_SUCCESS; -} - - -int orte_ns_proxy_dump_datatypes(void) -{ - orte_buffer_t cmd; - orte_buffer_t answer; - orte_ns_cmd_flag_t command; - orte_std_cntr_t i, j; - orte_ns_proxy_dti_t **ptr; - int rc; - - command = ORTE_NS_DUMP_DATATYPES_CMD; - - OPAL_THREAD_LOCK(&orte_ns_proxy.mutex); - - /* dump name service replica datatype tracker */ - OBJ_CONSTRUCT(&cmd, orte_buffer_t); - if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &command, 1, ORTE_NS_CMD))) { - ORTE_ERROR_LOG(rc); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - OBJ_DESTRUCT(&cmd); - return rc; - } - - if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, ORTE_RML_TAG_NS, 0)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_DESTRUCT(&cmd); - 
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_ERR_COMM_FAILURE; - } - OBJ_DESTRUCT(&cmd); - - OBJ_CONSTRUCT(&answer, orte_buffer_t); - if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, &answer, ORTE_RML_TAG_NS)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_DESTRUCT(&answer); - OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); - return ORTE_ERR_COMM_FAILURE; - } - - if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&answer))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&answer); - return rc; - } - - /* dump local datatype tracker */ - opal_output(mca_ns_base_output, "\n\n[%lu,%lu,%lu] Dump of Local Datatype Tracker\n", - ORTE_NAME_ARGS(orte_process_info.my_name)); - ptr = (orte_ns_proxy_dti_t**)(orte_ns_proxy.dts)->addr; - for (i=0, j=0; j < orte_ns_proxy.num_dts && - i < (orte_ns_proxy.dts)->size; i++) { - if (NULL != ptr[i]) { - j++; - opal_output(mca_ns_base_output, "Num: %lu\tDatatype id: %lu\tDatatype name: %s\n", - (unsigned long)j, (unsigned long)ptr[i]->id, ptr[i]->name); - } - } - - return ORTE_SUCCESS; -} - - diff --git a/orte/mca/ns/replica/Makefile.am b/orte/mca/ns/replica/Makefile.am index 0f19235d50..9adba2abb1 100644 --- a/orte/mca/ns/replica/Makefile.am +++ b/orte/mca/ns/replica/Makefile.am @@ -16,37 +16,37 @@ # $HEADER$ # -# Use the top-level Makefile.options - - - -sources = -include src/Makefile.extra +sources = \ + ns_replica.h \ + ns_replica_class_instances.h \ + ns_replica_cell_fns.c \ + ns_replica_diag_fns.c \ + ns_replica_general_fns.c \ + ns_replica_job_fns.c \ + ns_replica_recv.c \ + ns_replica_support_fns.c \ + ns_replica_component.c # Make the output library in this directory, and name it either # mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la # (for static builds). 
if OMPI_BUILD_ns_replica_DSO -lib = -lib_sources = -component = mca_ns_replica.la -component_sources = $(sources) +component_noinst = +component_install = mca_ns_replica.la else -lib = libmca_ns_replica.la -lib_sources = $(sources) -component = -component_sources = +component_noinst = libmca_ns_replica.la +component_install = endif mcacomponentdir = $(libdir)/openmpi -mcacomponent_LTLIBRARIES = $(component) -mca_ns_replica_la_SOURCES = $(component_sources) +mcacomponent_LTLIBRARIES = $(component_install) +mca_ns_replica_la_SOURCES = $(sources) mca_ns_replica_la_LDFLAGS = -module -avoid-version mca_ns_replica_la_LIBADD = \ $(top_ompi_builddir)/orte/liborte.la \ $(top_ompi_builddir)/opal/libopal.la -noinst_LTLIBRARIES = $(lib) -libmca_ns_replica_la_SOURCES = $(lib_sources) +noinst_LTLIBRARIES = $(component_noinst) +libmca_ns_replica_la_SOURCES =$(sources) libmca_ns_replica_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/ns/replica/configure.params b/orte/mca/ns/replica/configure.params index 33baded7ae..3d04f54b19 100644 --- a/orte/mca/ns/replica/configure.params +++ b/orte/mca/ns/replica/configure.params @@ -19,5 +19,6 @@ # Specific to this module -PARAM_INIT_FILE=src/ns_replica.c +PARAM_INIT_FILE=ns_replica.c +PARAM_CONFIG_HEADER_FILE="ns_replica.h" PARAM_CONFIG_FILES="Makefile" diff --git a/orte/mca/ns/replica/src/ns_replica.c b/orte/mca/ns/replica/ns_replica.c similarity index 67% rename from orte/mca/ns/replica/src/ns_replica.c rename to orte/mca/ns/replica/ns_replica.c index a5845c58a8..a1151af690 100644 --- a/orte/mca/ns/replica/src/ns_replica.c +++ b/orte/mca/ns/replica/ns_replica.c @@ -38,227 +38,6 @@ */ #define NS_REPLICA_MAX_STRING_SIZE 256 -/* - * functions - */ - -int orte_ns_replica_create_cellid(orte_cellid_t *cellid, char *site, char *resource) -{ - orte_ns_replica_cell_tracker_t *new_cell; - int rc; - orte_std_cntr_t index; - - OPAL_TRACE(1); - - OPAL_THREAD_LOCK(&orte_ns_replica.mutex); - - *cellid = ORTE_CELLID_MAX; - - /* check if cellid 
is available. NOTE: need to reserve - * ORTE_CELLID_MAX as an invalid value, so can't allow - * num_cells to get there - */ - if (ORTE_CELLID_MAX-2 < orte_ns_replica.num_cells) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - new_cell = OBJ_NEW(orte_ns_replica_cell_tracker_t); - if (NULL == new_cell) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); - return ORTE_ERR_OUT_OF_RESOURCE; - } - if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&index, - orte_ns_replica.cells, new_cell))) { - ORTE_ERROR_LOG(rc); - OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); - return rc; - } - new_cell->site = strdup(site); - new_cell->resource = strdup(resource); - - new_cell->cell = orte_ns_replica.num_cells; - *cellid = new_cell->cell; - (orte_ns_replica.num_cells)++; - - OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); - return ORTE_SUCCESS; -} - -int orte_ns_replica_get_cell_info(orte_cellid_t cellid, - char **site, char **resource) -{ - orte_std_cntr_t i; - orte_cellid_t j; - orte_ns_replica_cell_tracker_t **cell; - - OPAL_TRACE(1); - - OPAL_THREAD_LOCK(&orte_ns_replica.mutex); - - cell = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr; - for (i=0, j=0; j < orte_ns_replica.num_cells && - i < (orte_ns_replica.cells)->size; i++) { - if (NULL != cell[i]) { - j++; - if (cellid == cell[i]->cell) { - *site = strdup(cell[i]->site); - *resource = strdup(cell[i]->resource); - OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); - return ORTE_SUCCESS; - } - } - } - - OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); - return ORTE_ERR_NOT_FOUND; -} - - -/* - * JOBID functions - */ -int orte_ns_replica_create_jobid(orte_jobid_t *jobid) -{ - orte_ns_replica_jobid_tracker_t *ptr; - int rc; - orte_std_cntr_t index; - - OPAL_TRACE(1); - - OPAL_THREAD_LOCK(&orte_ns_replica.mutex); - - *jobid = ORTE_JOBID_MAX; - /* check if a jobid is available. 
NOTE: need to - * reserve ORTE_JOBID_MAX as an invalid value, so can't let - * num_jobids get there - */ - if (ORTE_JOBID_MAX-2 < orte_ns_replica.num_jobids) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - ptr = OBJ_NEW(orte_ns_replica_jobid_tracker_t); - if (NULL == ptr) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); - return ORTE_ERR_OUT_OF_RESOURCE; - } - if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&index, - orte_ns_replica.jobids, ptr))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(ptr); - OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); - return rc; - } - - ptr->jobid = orte_ns_replica.num_jobids; - *jobid = ptr->jobid; - (orte_ns_replica.num_jobids)++; - - OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); - return ORTE_SUCCESS; -} - - - -int orte_ns_replica_reserve_range(orte_jobid_t job, orte_vpid_t range, - orte_vpid_t *start) -{ - orte_ns_replica_jobid_tracker_t **ptr; - orte_std_cntr_t j; - orte_jobid_t k; - - OPAL_TRACE(1); - - OPAL_THREAD_LOCK(&orte_ns_replica.mutex); - - /* find the jobid */ - ptr = (orte_ns_replica_jobid_tracker_t**)(orte_ns_replica.jobids)->addr; - for (j=0, k=0; k < orte_ns_replica.num_jobids && - j < (orte_ns_replica.jobids)->size; j++) { - if (NULL != ptr[j]) { - k++; - if (job == ptr[j]->jobid) { - goto PROCESS; - } - } - } - /* didn't find the specified jobid - error */ - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); - return ORTE_ERR_NOT_FOUND; - -PROCESS: - if ((ORTE_VPID_MAX-range-(ptr[j]->next_vpid)) > 0) { - *start = ptr[j]->next_vpid; - ptr[j]->next_vpid += range; - OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); - return ORTE_SUCCESS; - } - - /* get here if the range isn't available */ - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); - return ORTE_ERR_OUT_OF_RESOURCE; -} - -int orte_ns_replica_get_job_peers(orte_process_name_t 
**procs, - orte_std_cntr_t *num_procs, orte_jobid_t job) -{ - orte_ns_replica_jobid_tracker_t **ptr; - orte_process_name_t *nptr; - orte_std_cntr_t j; - orte_jobid_t k; - - OPAL_TRACE_ARG1(1, job); - - OPAL_THREAD_LOCK(&orte_ns_replica.mutex); - - /* find the jobid */ - ptr = (orte_ns_replica_jobid_tracker_t**)(orte_ns_replica.jobids)->addr; - for (j=0, k=0; k < orte_ns_replica.num_jobids && - j < (orte_ns_replica.jobids)->size; j++) { - if (NULL != ptr[j]) { - k++; - if (job == ptr[j]->jobid) { - goto PROCESS; - } - } - } - /* didn't find the specified jobid - error */ - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); - return ORTE_ERR_NOT_FOUND; - -PROCESS: - /* the field next_vpid contains the value of the next unassigned - * vpid, so the job extends from vpid=0 to that value. create - * an array of process names containing those values - */ - *procs = (orte_process_name_t*)malloc(ptr[j]->next_vpid * sizeof(orte_process_name_t)); - if (NULL == *procs) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); - return ORTE_ERR_OUT_OF_RESOURCE; - } - nptr = *procs; - for (k=0; k < ptr[j]->next_vpid; k++) { - nptr->cellid = 0; - nptr->jobid = job; - nptr->vpid = (orte_vpid_t)k; - nptr++; - } - *num_procs = (orte_std_cntr_t)ptr[j]->next_vpid; - - OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); - return ORTE_SUCCESS; -} - - /* * DIAGNOSTIC functions */ diff --git a/orte/mca/ns/replica/src/ns_replica.h b/orte/mca/ns/replica/ns_replica.h similarity index 50% rename from orte/mca/ns/replica/src/ns_replica.h rename to orte/mca/ns/replica/ns_replica.h index 212ca9f618..a8c02d6be0 100644 --- a/orte/mca/ns/replica/src/ns_replica.h +++ b/orte/mca/ns/replica/ns_replica.h @@ -34,31 +34,54 @@ extern "C" { #endif -/* list class for tracking cellid's +/* + * globals */ +#define NS_REPLICA_MAX_STRING_SIZE 256 + + +/* class for tracking cellid's */ struct orte_ns_replica_cell_tracker_t { opal_object_t super; 
orte_cellid_t cell; char *site; char *resource; + orte_nodeid_t next_nodeid; + orte_pointer_array_t *nodeids; }; typedef struct orte_ns_replica_cell_tracker_t orte_ns_replica_cell_tracker_t; OBJ_CLASS_DECLARATION(orte_ns_replica_cell_tracker_t); -/* - * object for tracking vpids/jobids - * This structure is used to track jobid-max vpid pairs. Basically, we - * are tracking the max used vpid for each jobid that has been created. - */ -struct orte_ns_replica_jobid_tracker_t { +/* object for tracking nodeid's */ +struct orte_ns_replica_nodeid_tracker_t { opal_object_t super; - orte_jobid_t jobid; /**< Job id */ - orte_vpid_t next_vpid; + char *nodename; + orte_nodeid_t nodeid; }; -typedef struct orte_ns_replica_jobid_tracker_t orte_ns_replica_jobid_tracker_t; +typedef struct orte_ns_replica_nodeid_tracker_t orte_ns_replica_nodeid_tracker_t; + +OBJ_CLASS_DECLARATION(orte_ns_replica_nodeid_tracker_t); + + +/* + * object for tracking vpids and jobids for job families + * This structure is used to track the parent-child relationship between + * jobs. The "root" of the family is the initial parent - each child has + * a record under that parent. Any child that subsequently spawns its own + * children will form a list of jobids beneath them. + * + * each object records the jobid of the job it represents, and the next vpid + * that will be assigned when a range is requested. 
+ */ +typedef struct { + opal_list_item_t super; + orte_jobid_t jobid; + orte_vpid_t next_vpid; + opal_list_t children; +} orte_ns_replica_jobitem_t; +OBJ_CLASS_DECLARATION(orte_ns_replica_jobitem_t); -OBJ_CLASS_DECLARATION(orte_ns_replica_jobid_tracker_t); struct orte_ns_replica_tagitem_t { opal_object_t super; @@ -85,12 +108,8 @@ typedef struct { size_t max_size, block_size; orte_cellid_t num_cells; orte_pointer_array_t *cells; -#if 0 - orte_jobgrp_t num_jobgrps; - orte_pointer_array_t *jobgrps; -#endif orte_jobid_t num_jobids; - orte_pointer_array_t *jobids; + opal_list_t jobs; orte_pointer_array_t *tags; orte_rml_tag_t num_tags; orte_pointer_array_t *dts; @@ -124,38 +143,53 @@ void orte_ns_replica_recv(int status, orte_process_name_t* sender, orte_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata); /* - * Implementation of create_cellid(). + * CELL FUNCTIONS */ int orte_ns_replica_create_cellid(orte_cellid_t *cellid, char *site, char *resource); -/* - * Implementation of get_cell_info() - */ int orte_ns_replica_get_cell_info(orte_cellid_t cellid, char **site, char **resource); -/* - * Implementation of create_jobid(). 
- */ -int orte_ns_replica_create_jobid(orte_jobid_t *jobid); +int orte_ns_replica_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes, + orte_cellid_t cellid, char **nodenames); +int orte_ns_replica_get_node_info(char ***nodenames, orte_cellid_t cellid, orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids); /* - * Implementation of reserve_range() + * JOB FUNCTIONS */ +int orte_ns_replica_create_jobid(orte_jobid_t *jobid, opal_list_t *attrs); + +int orte_ns_replica_get_job_descendants(orte_jobid_t **descendants, orte_std_cntr_t *num_desc, orte_jobid_t job); + +int orte_ns_replica_get_job_children(orte_jobid_t **descendants, orte_std_cntr_t *num_desc, orte_jobid_t job); + +int orte_ns_replica_get_root_job(orte_jobid_t *root_job, orte_jobid_t job); + +int orte_ns_replica_get_parent_job(orte_jobid_t *parent, orte_jobid_t job); + int orte_ns_replica_reserve_range(orte_jobid_t job, orte_vpid_t range, orte_vpid_t *startvpid); /* - * Peer functions + * GENERAL FUNCTIONS */ -int orte_ns_replica_get_job_peers(orte_process_name_t **procs, - orte_std_cntr_t *num_procs, orte_jobid_t job); +int orte_ns_replica_get_peers(orte_process_name_t **procs, + orte_std_cntr_t *num_procs, opal_list_t *attrs); + +int orte_ns_replica_assign_rml_tag(orte_rml_tag_t *tag, + char *name); + + +int orte_ns_replica_define_data_type(const char *name, + orte_data_type_t *type); + +int orte_ns_replica_create_my_name(void); /* - * Diagnostic functions + * DIAGNOSTIC FUNCTIONS */ int orte_ns_replica_dump_cells(void); int orte_ns_replica_dump_cells_fn(orte_buffer_t *buffer); @@ -171,20 +205,46 @@ int orte_ns_replica_dump_datatypes_fn(orte_buffer_t *buffer); /* - * Implementation of assign rml tag + * INTERNAL SUPPORT FUNCTIONS */ -int orte_ns_replica_assign_rml_tag(orte_rml_tag_t *tag, - char *name); - -int orte_ns_replica_define_data_type(const char *name, - orte_data_type_t *type); - -int orte_ns_replica_create_my_name(void); - -/* - * +/* find a job's record, wherever it may be located on 
the list of job families. + * this function searches the entire list of job families, traversing the list + * of all jobs in each family, until it finds the specified job. It then returns + * a pointer to that job's info structure. It returns + * NULL (without error_logging an error) if no record is found */ +orte_ns_replica_jobitem_t* orte_ns_replica_find_job(orte_jobid_t job); + +/* find the root job for the specified job. + * this function searches the entire list of job families, traversing the list + * of all jobs in each family, until it finds the specified job. It then returns + * a pointer to the root job's info structure for that job family. It returns + * NULL (without error_logging an error) if no record is found + */ +orte_ns_replica_jobitem_t* orte_ns_replica_find_root_job(orte_jobid_t job); + +/* find a job's record on a specified root's family tree. + * this function finds the family record for the specified root job. It then + * traverses the children of that root until it finds the specified job, and then + * returns a pointer to that job's info structure. If root=jobid, then it will + * return a pointer to the root job's info structure. It returns + * NULL (without error_logging an error) if no record is found + */ +orte_ns_replica_jobitem_t* orte_ns_replica_search_job_family_tree(orte_jobid_t root, orte_jobid_t jobid); + +/* given a job's record, create a flattened list of descendants below it */ +void orte_ns_replica_construct_flattened_tree(opal_list_t *tree, orte_ns_replica_jobitem_t *ptr); + +/* search down a tree, following all the children's branches, to find the specified + * 
Return a pointer to that object, and a pointer to the parent object + * This function is called recursively, so it passes into it the ptr to the + * current object being looked at + */ +orte_ns_replica_jobitem_t *down_search(orte_ns_replica_jobitem_t *ptr, + orte_ns_replica_jobitem_t **parent_ptr, + orte_jobid_t job); + ORTE_MODULE_DECLSPEC extern mca_ns_base_component_t mca_ns_replica_component; #if defined(c_plusplus) || defined(__cplusplus) diff --git a/orte/mca/ns/replica/ns_replica_cell_fns.c b/orte/mca/ns/replica/ns_replica_cell_fns.c new file mode 100644 index 0000000000..86758af1ee --- /dev/null +++ b/orte/mca/ns/replica/ns_replica_cell_fns.c @@ -0,0 +1,297 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** @file: + * + */ +#include "orte_config.h" + +#include <stdio.h> +#include <string.h> + +#include "opal/threads/mutex.h" +#include "opal/util/argv.h" +#include "opal/util/output.h" +#include "opal/util/trace.h" + +#include "orte/dss/dss.h" +#include "orte/mca/errmgr/errmgr.h" + +#include "orte/mca/ns/base/base.h" +#include "ns_replica.h" + +/* + * functions + */ + +int orte_ns_replica_create_cellid(orte_cellid_t *cellid, char *site, char *resource) +{ + orte_ns_replica_cell_tracker_t *new_cell, **cell; + int rc; + orte_std_cntr_t i, j, index; + + OPAL_TRACE(1); + + OPAL_THREAD_LOCK(&orte_ns_replica.mutex); + + *cellid = ORTE_CELLID_INVALID; + + /* check for error */ + if (NULL == site || NULL == resource) { + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_ERR_BAD_PARAM; + } + + /* is this a known cellid? */ + cell = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr; + for (i=0, j=0; j < orte_ns_replica.num_cells && + i < (orte_ns_replica.cells)->size; i++) { + if (NULL != cell[i]) { + j++; + if (0 == strcmp(site, cell[i]->site) && + 0 == strcmp(resource, cell[i]->resource)) { + *cellid = cell[i]->cell; + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_SUCCESS; + } + } + } + + /* new cell - check if cellid is available */ + if (ORTE_CELLID_MAX-1 < orte_ns_replica.num_cells) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + new_cell = OBJ_NEW(orte_ns_replica_cell_tracker_t); + if (NULL == new_cell) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&index, + orte_ns_replica.cells, new_cell))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return rc; + } + new_cell->site = 
strdup(site); + new_cell->resource = strdup(resource); + + new_cell->cell = orte_ns_replica.num_cells; + *cellid = new_cell->cell; + (orte_ns_replica.num_cells)++; + + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_SUCCESS; +} + +int orte_ns_replica_get_cell_info(orte_cellid_t cellid, + char **site, char **resource) +{ + orte_std_cntr_t i; + orte_cellid_t j; + orte_ns_replica_cell_tracker_t **cell; + + OPAL_TRACE(1); + + OPAL_THREAD_LOCK(&orte_ns_replica.mutex); + + cell = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr; + for (i=0, j=0; j < orte_ns_replica.num_cells && + i < (orte_ns_replica.cells)->size; i++) { + if (NULL != cell[i]) { + j++; + if (cellid == cell[i]->cell) { + *site = strdup(cell[i]->site); + *resource = strdup(cell[i]->resource); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_SUCCESS; + } + } + } + + /* it isn't an error to not find the cell - so do NOT + * report it via ORTE_ERROR_LOG + */ + + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_ERR_NOT_FOUND; +} + +/* + * NODEID + */ +int orte_ns_replica_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes, + orte_cellid_t cellid, char **nodenames) +{ + orte_ns_replica_cell_tracker_t **cell, *cptr; + orte_ns_replica_nodeid_tracker_t **nodes, *node; + orte_nodeid_t *nds, nid; + orte_std_cntr_t i, j, k, m, n, num_nodes; + + OPAL_THREAD_LOCK(&orte_ns_replica.mutex); + + num_nodes = opal_argv_count(nodenames); + if (0 == num_nodes) { /** no nodenames provided - just return */ + *nodeids = NULL; + *nnodes = 0; + return ORTE_SUCCESS; + } + + nds = (orte_nodeid_t*)malloc(num_nodes * sizeof(orte_nodeid_t)); + if (NULL == nds) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + /** find the cell */ + cell = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr; + for (i=0, j=0; j < orte_ns_replica.num_cells && + i < (orte_ns_replica.cells)->size; i++) { + if (NULL != cell[i]) { + j++; + if (cellid == 
cell[i]->cell) { + /** found the specified cell - check to see if nodename has already been + * defined. if so, just return the nodeid. if not, create a new one + */ + cptr = cell[i]; + goto PROCESS; + } + } + } + /** get here if we didn't find the cell */ + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + free(nds); + *nodeids = NULL; + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_ERR_NOT_FOUND; + +PROCESS: + nodes = (orte_ns_replica_nodeid_tracker_t**)(cptr->nodeids->addr); + for (n=0; n < num_nodes; n++) { + for (k=0, m=0; m < cptr->next_nodeid && + k < (cptr->nodeids)->size; k++) { + if (NULL != nodes[k]) { + m++; + if (strcmp(nodenames[n], nodes[k]->nodename) == 0) { /** found same name */ + nid = nodes[k]->nodeid; + goto ASSIGN; + } + } + } + /** get here if we don't find this nodename - add one */ + node = OBJ_NEW(orte_ns_replica_nodeid_tracker_t); + if (NULL == node) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + free(nds); + *nodeids = NULL; + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + node->nodename = strdup(nodenames[n]); + node->nodeid = cptr->next_nodeid; + cptr->next_nodeid++; + nid = node->nodeid; + +ASSIGN: + nds[n] = nid; + } /** for n */ + + *nodeids = nds; + *nnodes = num_nodes; + + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_SUCCESS; +} + +int orte_ns_replica_get_node_info(char ***nodenames, orte_cellid_t cellid, + orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids) +{ + char **names, *nm; + orte_ns_replica_cell_tracker_t **cell, *cptr; + orte_ns_replica_nodeid_tracker_t **nodes; + orte_std_cntr_t i, j, k, m, n; + char *err_name = "NODE_NOT_FOUND" + + OPAL_THREAD_LOCK(&orte_ns_replica.mutex); + + if (0 == num_nodes) { + *nodenames = NULL; + return ORTE_SUCCESS; + } + + /** allocate an extra space for the NULL termination */ + names = (char**)malloc((num_nodes+1) * sizeof(char*)); + if (NULL == names) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + 
names[num_nodes] = NULL; /** NULL-terminate the list */ + + /** find the cell */ + cell = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr; + for (i=0, j=0; j < orte_ns_replica.num_cells && + i < (orte_ns_replica.cells)->size; i++) { + if (NULL != cell[i]) { + j++; + if (cellid == cell[i]->cell) { + /** found the specified cell - check to see if nodename has already been + * defined. if so, just return the nodeid. if not, create a new one + */ + cptr = cell[i]; + goto PROCESS; + } + } + } + /** get here if we didn't find the cell */ + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + free(names); + *nodenames = NULL; + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_ERR_NOT_FOUND; + +PROCESS: + nodes = (orte_ns_replica_nodeid_tracker_t**)(cell[i]->nodeids->addr); + for (n=0; n < num_nodes; n++) { + for (k=0, m=0; m < cell[i]->next_nodeid && + k < (cell[i]->nodeids)->size; k++) { + if (NULL != nodes[k]) { + m++; + if (nodeids[n] == nodes[k]->nodeid) { /** found it */ + nm = nodes[k]->nodename; + goto ASSIGN; + } + } + } + /** node not found - set name to error name. Can't set it to NULL since + * the list is a NULL-terminated one + */ + nm = err_name; + +ASSIGN: + names[n] = strdup(nm); + } + + *nodenames = names; + + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_SUCCESS; +} + diff --git a/orte/mca/ns/replica/ns_replica_class_instances.h b/orte/mca/ns/replica/ns_replica_class_instances.h new file mode 100644 index 0000000000..b2a5c49a5c --- /dev/null +++ b/orte/mca/ns/replica/ns_replica_class_instances.h @@ -0,0 +1,172 @@ +/* -*- C -*- +* +* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +* University Research and Technology +* Corporation. All rights reserved. +* Copyright (c) 2004-2006 The University of Tennessee and The University +* of Tennessee Research Foundation. All rights +* reserved. +* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +* University of Stuttgart. All rights reserved. 
+* Copyright (c) 2004-2005 The Regents of the University of California. +* All rights reserved. +* $COPYRIGHT$ +* +* Additional copyrights may follow +* +* $HEADER$ +* +*/ +#ifndef NS_REPLICA_CLASS_INSTANCES_H +#define NS_REPLICA_CLASS_INSTANCES_H + +#include "orte_config.h" +#include "orte/orte_types.h" +#include "orte/orte_constants.h" +#include "opal/threads/mutex.h" +#include "opal/class/opal_object.h" +#include "orte/class/orte_pointer_array.h" +#include "orte/dss/dss.h" +#include "orte/mca/oob/oob_types.h" +#include "orte/mca/ns/base/base.h" + +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif + +/*** CELLID ***/ +/* constructor - used to initialize state of cell_tracker instance */ +static void orte_ns_replica_cell_tracker_construct(orte_ns_replica_cell_tracker_t* cell_tracker) +{ + cell_tracker->cell = ORTE_CELLID_INVALID; + cell_tracker->site = NULL; + cell_tracker->resource = NULL; + + cell_tracker->next_nodeid = 0; + orte_pointer_array_init(&(cell_tracker->nodeids), + orte_ns_replica.block_size, + orte_ns_replica.max_size, + orte_ns_replica.block_size); +} + +/* destructor - used to free any resources held by instance */ +static void orte_ns_replica_cell_tracker_destructor(orte_ns_replica_cell_tracker_t* cell_tracker) +{ + orte_std_cntr_t i, j; + orte_ns_replica_nodeid_tracker_t **nodeid; + + if (NULL != cell_tracker->site) free(cell_tracker->site); + if (NULL != cell_tracker->resource) free(cell_tracker->resource); + + nodeid = (orte_ns_replica_nodeid_tracker_t**)(cell_tracker->nodeids)->addr; + + for (i=0, j=0; j < cell_tracker->next_nodeid && + i < (cell_tracker->nodeids)->size; i++) { + if (NULL != nodeid[i]) { + j++; + OBJ_RELEASE(nodeid[i]); + } + } + OBJ_RELEASE(cell_tracker->nodeids); +} + +/* define instance of opal_class_t */ +OBJ_CLASS_INSTANCE(orte_ns_replica_cell_tracker_t, /* type name */ + opal_object_t, /* parent "class" name */ + orte_ns_replica_cell_tracker_construct, /* constructor */ + 
orte_ns_replica_cell_tracker_destructor); /* destructor */ + + +/** NODEID */ +static void orte_ns_replica_nodeid_tracker_construct(orte_ns_replica_nodeid_tracker_t *ptr) +{ + ptr->nodeid = ORTE_NODEID_INVALID; + ptr->nodename = NULL; +} + +static void orte_ns_replica_nodeid_tracker_destructor(orte_ns_replica_nodeid_tracker_t *ptr) +{ + if (NULL != ptr->nodename) free(ptr->nodename); +} + +OBJ_CLASS_INSTANCE(orte_ns_replica_nodeid_tracker_t, /* type name */ + opal_object_t, /* parent "class" name */ + orte_ns_replica_nodeid_tracker_construct, /* constructor */ + orte_ns_replica_nodeid_tracker_destructor); /* destructor */ + + +/*** JOBITEM ***/ +/* constructor - used to initialize state of jobitem instance */ +static void orte_ns_replica_jobitem_construct(orte_ns_replica_jobitem_t *ptr) +{ + ptr->jobid = ORTE_JOBID_INVALID; + ptr->next_vpid = 0; + OBJ_CONSTRUCT(&ptr->children, opal_list_t); +} + +/* destructor - used to free any resources held by instance */ +static void orte_ns_replica_jobitem_destructor(orte_ns_replica_jobitem_t *ptr){ + opal_list_item_t *item; + + while (NULL != (item = opal_list_remove_first(&ptr->children))) { + OBJ_RELEASE(item); + } + OBJ_DESTRUCT(&ptr->children); +} + +/* define instance of opal_class_t */ +OBJ_CLASS_INSTANCE(orte_ns_replica_jobitem_t, /* type name */ + opal_list_item_t, /* parent "class" name */ + orte_ns_replica_jobitem_construct, /* constructor */ + orte_ns_replica_jobitem_destructor); /* destructor */ + + +/*** RML TAG ***/ +/* constructor - used to initialize state of taglist instance */ +static void orte_ns_replica_tagitem_construct(orte_ns_replica_tagitem_t* tagitem) +{ + tagitem->tag = ORTE_RML_TAG_MAX; + tagitem->name = NULL; +} + +/* destructor - used to free any resources held by instance */ +static void orte_ns_replica_tagitem_destructor(orte_ns_replica_tagitem_t* tagitem) +{ + if (NULL != tagitem->name) { + free(tagitem->name); + } +} + +/* define instance of opal_class_t */ 
+OBJ_CLASS_INSTANCE(orte_ns_replica_tagitem_t, /* type name */ + opal_object_t, /* parent "class" name */ + orte_ns_replica_tagitem_construct, /* constructor */ + orte_ns_replica_tagitem_destructor); /* destructor */ + + +/*** DATA TYPE ***/ +/* constructor - used to initialize state of dtilist instance */ +static void orte_ns_replica_dti_construct(orte_ns_replica_dti_t* dti) +{ + dti->id = ORTE_DSS_ID_MAX; + dti->name = NULL; +} + +/* destructor - used to free any resources held by instance */ +static void orte_ns_replica_dti_destructor(orte_ns_replica_dti_t* dti) +{ + if (NULL != dti->name) { + free(dti->name); + } +} + +/* define instance of opal_class_t */ +OBJ_CLASS_INSTANCE(orte_ns_replica_dti_t, /* type name */ + opal_object_t, /* parent "class" name */ + orte_ns_replica_dti_construct, /* constructor */ + orte_ns_replica_dti_destructor); /* destructor */ + +#if defined(c_plusplus) || defined(__cplusplus) +} +#endif +#endif diff --git a/orte/mca/ns/replica/ns_replica_component.c b/orte/mca/ns/replica/ns_replica_component.c new file mode 100644 index 0000000000..eb2359b1e4 --- /dev/null +++ b/orte/mca/ns/replica/ns_replica_component.c @@ -0,0 +1,309 @@ +/* -*- C -*- +* +* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +* University Research and Technology +* Corporation. All rights reserved. +* Copyright (c) 2004-2005 The University of Tennessee and The University +* of Tennessee Research Foundation. All rights +* reserved. +* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +* University of Stuttgart. All rights reserved. +* Copyright (c) 2004-2005 The Regents of the University of California. +* All rights reserved. +* $COPYRIGHT$ +* +* Additional copyrights may follow +* +* $HEADER$ +*/ +/** @file: +* +* The Open MPI Name Server +* +* The Open MPI Name Server provides unique name ranges for processes +* within the universe. Each universe will have one name server +* running within the seed daemon. 
This is done to prevent the +* inadvertent duplication of names. +*/ + +/* + * includes + */ +#include "orte_config.h" + +#include "orte/orte_constants.h" +#include "orte/orte_types.h" + +#include "opal/threads/mutex.h" +#include "opal/class/opal_list.h" +#include "opal/util/output.h" + +#include "opal/mca/mca.h" +#include "opal/mca/base/mca_base_param.h" + +#include "orte/util/proc_info.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/rml/rml.h" + +#include "orte/mca/ns/base/ns_private.h" +#include "ns_replica.h" + + +/* + * Struct of function pointers that need to be initialized + */ +mca_ns_base_component_t mca_ns_replica_component = { +{ + MCA_NS_BASE_VERSION_2_0_0, + + "replica", /* MCA module name */ + ORTE_MAJOR_VERSION, /* MCA module major version */ + ORTE_MINOR_VERSION, /* MCA module minor version */ + ORTE_RELEASE_VERSION, /* MCA module release version */ + orte_ns_replica_open, /* module open */ + orte_ns_replica_close /* module close */ +}, +{ + false /* checkpoint / restart */ +}, +orte_ns_replica_init, /* module init */ +orte_ns_replica_finalize /* module shutdown */ +}; + +/* + * setup the function pointers for the module + */ +static mca_ns_base_module_t orte_ns_replica_module = { + /* init */ + orte_ns_replica_module_init, + /* cell functions */ + orte_ns_replica_create_cellid, + orte_ns_replica_get_cell_info, + orte_ns_base_get_cellid_string, + orte_ns_base_convert_cellid_to_string, + orte_ns_base_convert_string_to_cellid, + /** node functions */ + orte_ns_replica_create_nodeids, + orte_ns_replica_get_node_info, + orte_ns_base_convert_nodeid_to_string, + orte_ns_base_convert_string_to_nodeid, + /* jobid functions */ + orte_ns_replica_create_jobid, + orte_ns_replica_get_job_descendants, + orte_ns_replica_get_job_children, + orte_ns_replica_get_root_job, + orte_ns_replica_get_parent_job, + orte_ns_base_get_jobid_string, + orte_ns_base_convert_jobid_to_string, + orte_ns_base_convert_string_to_jobid, + orte_ns_replica_reserve_range, + /* 
vpid functions */ + orte_ns_base_get_vpid_string, + orte_ns_base_convert_vpid_to_string, + orte_ns_base_convert_string_to_vpid, + /* name functions */ + orte_ns_base_create_process_name, + orte_ns_replica_create_my_name, + orte_ns_base_convert_string_to_process_name, + orte_ns_base_get_proc_name_string, + orte_ns_base_compare_fields, + /* peer functions */ + orte_ns_replica_get_peers, + /* tag server functions */ + orte_ns_replica_assign_rml_tag, + /* data type functions */ + orte_ns_replica_define_data_type, + /* diagnostic functions */ + orte_ns_replica_dump_cells, + orte_ns_replica_dump_jobs, + orte_ns_replica_dump_tags, + orte_ns_replica_dump_datatypes +}; + +/* + * Whether or not we allowed this component to be selected + */ +static bool initialized = false; + +/* + * class instantiations + */ +#include "ns_replica_class_instances.h" + +/* + * globals needed within replica component + */ +orte_ns_replica_globals_t orte_ns_replica; + +/* + * component open - register the ns/replica MCA parameters (debug, isolate, + * maxsize, blocksize) and cache their values in orte_ns_replica + */ +int orte_ns_replica_open(void) +{ + int id, param; + + id = mca_base_param_register_int("ns", "replica", "debug", NULL, (int)false); + mca_base_param_lookup_int(id, &orte_ns_replica.debug); + + id = mca_base_param_register_int("ns", "replica", "isolate", NULL, (int)false); + mca_base_param_lookup_int(id, &param); /* read into an int first, then normalize to bool below */ + if (param) { + orte_ns_replica.isolate = true; + } else { + orte_ns_replica.isolate = false; + } + + id = mca_base_param_register_int("ns", "replica", "maxsize", NULL, + ORTE_NS_ARRAY_MAX_SIZE); + mca_base_param_lookup_int(id, &param); + orte_ns_replica.max_size = (size_t)param; + + id = mca_base_param_register_int("ns", "replica", "blocksize", NULL, + ORTE_NS_ARRAY_BLOCK_SIZE); + mca_base_param_lookup_int(id, &param); + orte_ns_replica.block_size = (size_t)param; + + return ORTE_SUCCESS; +} + +/* + * component close - nothing to do, just holding the place + */ +int orte_ns_replica_close(void) +{ + 
return ORTE_SUCCESS; +} + +mca_ns_base_module_t* orte_ns_replica_init(int *priority) +{ + int rc; + + /* If we are to host a replica, then we want to be selected, so do all the + setup and return the module */ + + if (NULL == orte_process_info.ns_replica_uri) { + + /* Return a module (choose an arbitrary, positive priority -- + it's only relevant compared to other ns components). If + we're not the seed, then we don't want to be selected, so + return NULL. */ + + *priority = 50; + + /* initialize the cell info tracker */ + if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.cells), + (orte_std_cntr_t)orte_ns_replica.block_size, + (orte_std_cntr_t)orte_ns_replica.max_size, + (orte_std_cntr_t)orte_ns_replica.block_size))) { + ORTE_ERROR_LOG(rc); + return NULL; + } + orte_ns_replica.num_cells = 0; + + /* initialize the job tracking system */ + OBJ_CONSTRUCT(&orte_ns_replica.jobs, opal_list_t); + orte_ns_replica.num_jobids = 0; + + /* initialize the taglist */ + + if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.tags), + (orte_std_cntr_t)orte_ns_replica.block_size, + (orte_std_cntr_t)orte_ns_replica.max_size, + (orte_std_cntr_t)orte_ns_replica.block_size))) { + ORTE_ERROR_LOG(rc); + return NULL; + } + orte_ns_replica.num_tags = 0; + + /* initialize the dtlist */ + + if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.dts), + (orte_std_cntr_t)orte_ns_replica.block_size, + (orte_std_cntr_t)orte_ns_replica.max_size, + (orte_std_cntr_t)orte_ns_replica.block_size))) { + ORTE_ERROR_LOG(rc); + return NULL; + } + orte_ns_replica.num_dts = 0; + + /* setup the thread lock */ + OBJ_CONSTRUCT(&orte_ns_replica.mutex, opal_mutex_t); + + /* Return the module */ + + initialized = true; + return &orte_ns_replica_module; + } else { + return NULL; + } +} + +int orte_ns_replica_module_init(void) +{ + int rc; + if (orte_ns_replica.isolate) { + return ORTE_SUCCESS; + } + + /* issue non-blocking receive for call_back function */ + rc = 
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_NS, ORTE_RML_PERSISTENT, orte_ns_replica_recv, NULL); + if(rc < 0) { + ORTE_ERROR_LOG(rc); + return rc; + } + return ORTE_SUCCESS; +} + + +/* + * finalize routine + */ +int orte_ns_replica_finalize(void) +{ + orte_ns_replica_cell_tracker_t **cptr; + opal_list_item_t *item; + orte_ns_replica_tagitem_t **tag; + orte_ns_replica_dti_t **dti; + orte_std_cntr_t i; + + /* free all tracking storage, but only if this component was initialized */ + + if (initialized) { + cptr = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr; + for (i=0; i < (orte_ns_replica.cells)->size; i++) { + if (NULL != cptr[i]) { + OBJ_RELEASE(cptr[i]); + } + } + OBJ_RELEASE(orte_ns_replica.cells); + + while (NULL != (item = opal_list_remove_first(&orte_ns_replica.jobs))) { + OBJ_RELEASE(item); + } + OBJ_DESTRUCT(&orte_ns_replica.jobs); + + tag = (orte_ns_replica_tagitem_t**)(orte_ns_replica.tags)->addr; + for (i=0; i < (orte_ns_replica.tags)->size; i++) { + if (NULL != tag[i]) OBJ_RELEASE(tag[i]); + } + OBJ_RELEASE(orte_ns_replica.tags); + + dti = (orte_ns_replica_dti_t**)(orte_ns_replica.dts)->addr; + for (i=0; i < (orte_ns_replica.dts)->size; i++) { + if (NULL != dti[i]) OBJ_RELEASE(dti[i]); + } + OBJ_RELEASE(orte_ns_replica.dts); + + initialized = false; + } + + /* All done */ + if (orte_ns_replica.isolate) { + return ORTE_SUCCESS; + } + + orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_NS); + return ORTE_SUCCESS; +} + diff --git a/orte/mca/ns/replica/ns_replica_diag_fns.c b/orte/mca/ns/replica/ns_replica_diag_fns.c new file mode 100644 index 0000000000..94bd9ff103 --- /dev/null +++ b/orte/mca/ns/replica/ns_replica_diag_fns.c @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. 
All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** @file: + * + */ +#include "orte_config.h" + +#include <stdio.h> +#include <string.h> + +#include "opal/threads/mutex.h" +#include "opal/util/output.h" +#include "opal/util/trace.h" + +#include "orte/dss/dss.h" +#include "orte/mca/errmgr/errmgr.h" + +#include "orte/mca/ns/base/base.h" +#include "orte/mca/ns/base/ns_private.h" +#include "ns_replica.h" + +/* + * DIAGNOSTIC functions + */ +int orte_ns_replica_dump_cells(void) +{ + orte_buffer_t buffer; + int rc; + + OBJ_CONSTRUCT(&buffer, orte_buffer_t); + if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_cells_fn(&buffer))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&buffer); + return rc; + } + + OBJ_DESTRUCT(&buffer); + return ORTE_SUCCESS; +} + +int orte_ns_replica_dump_cells_fn(orte_buffer_t *buffer) +{ + orte_std_cntr_t i; + orte_cellid_t j; + orte_ns_replica_cell_tracker_t **cell; + char tmp_out[NS_REPLICA_MAX_STRING_SIZE], *tmp; + int rc; + + OPAL_THREAD_LOCK(&orte_ns_replica.mutex); + + tmp = tmp_out; + snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Dump of Name Service Cell Tracker\n"); + if (ORTE_SUCCESS != (rc = orte_dss.pack(buffer, &tmp, 1, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return rc; + } + cell = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr; + for (i=0, j=0; j < orte_ns_replica.num_cells && + i < (orte_ns_replica.cells)->size; i++) { + if (NULL != cell[i]) { + j++; + snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Num: %lu\tCell: %lu\n", + (unsigned long)j, (unsigned long)cell[i]->cell); + if (ORTE_SUCCESS != (rc = 
orte_dss.pack(buffer, &tmp, 1, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return rc; + } + snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "\tSite: %s\n\tResource: %s\n", + cell[i]->site, cell[i]->resource); + if (ORTE_SUCCESS != (rc = orte_dss.pack(buffer, &tmp, 1, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return rc; + } + } + } + + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + + return ORTE_SUCCESS; +} + +/* Pack the jobid tracker dump into a local buffer and print it via orte_ns_base_print_dump, holding the replica mutex throughout */ +int orte_ns_replica_dump_jobs(void) +{ + orte_buffer_t buffer; + int rc; + + OPAL_THREAD_LOCK(&orte_ns_replica.mutex); + OBJ_CONSTRUCT(&buffer, orte_buffer_t); + + if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_jobs_fn(&buffer))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&buffer); /* the buffer may already hold packed strings */ + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); /* never return with the mutex held */ + return rc; + } + + if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&buffer); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return rc; + } + OBJ_DESTRUCT(&buffer); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_SUCCESS; +} + +/* Recursively pack one line per descendant of ptr into buffer; each generation is indented by extending prefix */ +static int dump_child_jobs(orte_ns_replica_jobitem_t *ptr, char *prefix, orte_buffer_t *buffer) +{ + opal_list_item_t *item; + orte_ns_replica_jobitem_t *child; + char *tmp, *pfx; + int rc = ORTE_SUCCESS; /* returned unchanged when ptr has no children */ + + asprintf(&pfx, "%s ", prefix); + + /* print out the children's info */ + for (item = opal_list_get_first(&ptr->children); + item != opal_list_get_end(&ptr->children); + item = opal_list_get_next(item)) { + child = (orte_ns_replica_jobitem_t*)item; + asprintf(&tmp, "%sChild jobid: %ld Next vpid: %ld Num direct children: %ld\n", + pfx, (long)child->jobid, (long)child->next_vpid, (long)opal_list_get_size(&child->children)); + rc = orte_dss.pack(buffer, &tmp, 1, ORTE_STRING); + free(tmp); /* released whether or not the pack succeeded */ + if (ORTE_SUCCESS == rc) { + rc = dump_child_jobs(child, pfx, buffer); + } + if (ORTE_SUCCESS != rc) { + ORTE_ERROR_LOG(rc); + break; /* leave through the single cleanup point so pfx is freed */ + } + } + free(pfx); + + return rc; +} + +int orte_ns_replica_dump_jobs_fn(orte_buffer_t *buffer) +{ + orte_ns_replica_jobitem_t 
*root; + opal_list_item_t *item; + char *tmp; + int rc; + char *prefix = " "; + + asprintf(&tmp, "Dump of Name Service Jobid Tracker\n"); + if (ORTE_SUCCESS != (rc = orte_dss.pack(buffer, &tmp, 1, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } + free(tmp); + + for (item = opal_list_get_first(&orte_ns_replica.jobs); + item != opal_list_get_end(&orte_ns_replica.jobs); + item = opal_list_get_next(item)) { + root = (orte_ns_replica_jobitem_t*)item; + asprintf(&tmp, " Data for job family with root %ld\n", (long)root->jobid); + if (ORTE_SUCCESS != (rc = orte_dss.pack(buffer, &tmp, 1, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } + free(tmp); + asprintf(&tmp, "%sNext vpid: %ld Num direct children: %ld\n", + prefix, (long)root->next_vpid, (long)opal_list_get_size(&root->children)); + if (ORTE_SUCCESS != (rc = orte_dss.pack(buffer, &tmp, 1, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } + free(tmp); + if (ORTE_SUCCESS != (rc = dump_child_jobs(root, prefix, buffer))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + + return ORTE_SUCCESS; +} + + +int orte_ns_replica_dump_tags(void) +{ + orte_buffer_t buffer; + int rc; + + OBJ_CONSTRUCT(&buffer, orte_buffer_t); + if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_tags_fn(&buffer))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&buffer); + return rc; + } + + OBJ_DESTRUCT(&buffer); + return ORTE_SUCCESS; +} + + +int orte_ns_replica_dump_tags_fn(orte_buffer_t *buffer) +{ + orte_std_cntr_t i; + orte_rml_tag_t j; + orte_ns_replica_tagitem_t **ptr; + char tmp_out[NS_REPLICA_MAX_STRING_SIZE], *tmp; + int rc; + + OPAL_THREAD_LOCK(&orte_ns_replica.mutex); + + tmp = tmp_out; + snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Dump of Name Service RML Tag Tracker\n"); + if (ORTE_SUCCESS != (rc = orte_dss.pack(buffer, &tmp, 1, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return rc; + } 
+ ptr = (orte_ns_replica_tagitem_t**)(orte_ns_replica.tags)->addr; + for (i=0, j=0; j < orte_ns_replica.num_tags && + i < (orte_ns_replica.tags)->size; i++) { + if (NULL != ptr[i]) { + j++; + snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Num: %lu\tTag id: %lu\tName: %s\n", + (unsigned long)j, (unsigned long)ptr[i]->tag, ptr[i]->name); + if (ORTE_SUCCESS != (rc = orte_dss.pack(buffer, &tmp, 1, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return rc; + } + } + } + + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + + return ORTE_SUCCESS; +} + + +int orte_ns_replica_dump_datatypes(void) +{ + orte_buffer_t buffer; + int rc; + + OBJ_CONSTRUCT(&buffer, orte_buffer_t); + if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_datatypes_fn(&buffer))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&buffer); + return rc; + } + + OBJ_DESTRUCT(&buffer); + return ORTE_SUCCESS; +} + +int orte_ns_replica_dump_datatypes_fn(orte_buffer_t *buffer) +{ + orte_std_cntr_t i, j; + orte_ns_replica_dti_t **ptr; + char tmp_out[NS_REPLICA_MAX_STRING_SIZE], *tmp; + int rc; + + OPAL_THREAD_LOCK(&orte_ns_replica.mutex); + + tmp = tmp_out; + snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Dump of Name Service Datatype Tracker\n"); + if (ORTE_SUCCESS != (rc = orte_dss.pack(buffer, &tmp, 1, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return rc; + } + ptr = (orte_ns_replica_dti_t**)(orte_ns_replica.dts)->addr; + for (i=0, j=0; j < orte_ns_replica.num_dts && + i < (orte_ns_replica.dts)->size; i++) { + if (NULL != ptr[i]) { + j++; + snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Num: %lu\tDatatype id: %lu\tName: %s\n", + (unsigned long)j, (unsigned long)ptr[i]->id, ptr[i]->name); + if (ORTE_SUCCESS != (rc = orte_dss.pack(buffer, &tmp, 1, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return rc; + } + } + } 
+ + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + + return ORTE_SUCCESS; +} + diff --git a/orte/mca/ns/replica/ns_replica_general_fns.c b/orte/mca/ns/replica/ns_replica_general_fns.c new file mode 100644 index 0000000000..b56f107e29 --- /dev/null +++ b/orte/mca/ns/replica/ns_replica_general_fns.c @@ -0,0 +1,403 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** @file: + * + */ +#include "orte_config.h" + +#include <stdio.h> +#include <string.h> + +#include "opal/threads/mutex.h" +#include "opal/util/output.h" +#include "opal/util/trace.h" + +#include "orte/dss/dss.h" +#include "orte/mca/rmgr/rmgr.h" +#include "orte/mca/errmgr/errmgr.h" + +#include "orte/mca/ns/base/ns_private.h" +#include "ns_replica.h" + + +/*** GET PEERS ***/ +int orte_ns_replica_get_peers(orte_process_name_t **procs, + orte_std_cntr_t *num_procs, opal_list_t *attrs) +{ + orte_std_cntr_t i, isave, npeers; + orte_jobid_t *jptr; + orte_cellid_t *cptr; + orte_attribute_t *attr; + orte_ns_replica_jobitem_t *job_info, *child; + opal_list_item_t *item; + opal_list_t peerlist; + int rc; + + OPAL_TRACE(1); + + OPAL_THREAD_LOCK(&orte_ns_replica.mutex); + + /* set default value */ + *procs = NULL; + *num_procs = 0; + + /* check the attributes to see if USE_JOB or USE_CELL has been set. If not, then this is + * a request for my own job peers - process that one locally + */ + + /* if the cell is given AND it matches my own, then we can process this + * quickly. 
Otherwise, we have to do some more work. + * + * RHC: when we go multi-cell, we need a way to find all the cells upon + * which a job is executing so we can make this work! + */ + if (NULL != (attr = orte_rmgr.find_attribute(attrs, ORTE_NS_USE_CELL))) { + if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&cptr, attr->value, ORTE_CELLID))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return rc; + } + if (*cptr != ORTE_PROC_MY_NAME->cellid && *cptr != ORTE_CELLID_WILDCARD) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_ERR_NOT_IMPLEMENTED; + } + } + + if (NULL == (attr = orte_rmgr.find_attribute(attrs, ORTE_NS_USE_JOBID))) { + /* get my own job peers, assuming all are on this cell */ + *procs = (orte_process_name_t*)malloc(orte_process_info.num_procs * sizeof(orte_process_name_t)); + if (NULL == *procs) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + for (i=0; i < orte_process_info.num_procs; i++) { + (*procs)[i].cellid = ORTE_PROC_MY_NAME->cellid; + (*procs)[i].jobid = ORTE_PROC_MY_NAME->jobid; + (*procs)[i].vpid = orte_process_info.vpid_start + i; + } + + *num_procs = orte_process_info.num_procs; + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_SUCCESS; + } + + /* we get here if the job attribute was passed to us - use that jobid */ + if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&jptr, attr->value, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return rc; + } + + /* look up this job's record on the tracking database */ + if (NULL == (job_info = orte_ns_replica_find_job(*jptr))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_ERR_NOT_FOUND; + } + + if (NULL != (attr = orte_rmgr.find_attribute(attrs, ORTE_NS_INCLUDE_DESCENDANTS))) { + /* we want the peers from this job AND ALL of its descendants - 
start by constructing + * a flattened list of the descendant jobs + */ + OBJ_CONSTRUCT(&peerlist, opal_list_t); + child = OBJ_NEW(orte_ns_replica_jobitem_t); + child->jobid = job_info->jobid; + child->next_vpid = job_info->next_vpid; + opal_list_append(&peerlist, &child->super); /* add the current job to the list */ + orte_ns_replica_construct_flattened_tree(&peerlist, job_info); + + i = opal_list_get_size(&peerlist); + if (0 < i) { + npeers = 0; + for (item = opal_list_get_first(&peerlist); + item != opal_list_get_end(&peerlist); + item = opal_list_get_next(item)) { + child = (orte_ns_replica_jobitem_t*)item; + npeers += child->next_vpid; + } + if (0 >= npeers) { + *num_procs = npeers; + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_SUCCESS; + } + + *procs = (orte_process_name_t*)malloc(npeers * sizeof(orte_process_name_t)); + if (NULL == *procs) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + /* populate it from the list */ + isave = 0; + while (NULL != (item = opal_list_remove_first(&peerlist))) { + child = (orte_ns_replica_jobitem_t*)item; + for (i=0; i < child->next_vpid; i++) { + (*procs)[i+isave].cellid = ORTE_PROC_MY_NAME->cellid; + (*procs)[i+isave].jobid = child->jobid; + (*procs)[i+isave].vpid = i; + } + isave += child->next_vpid; + } + } + *num_procs = npeers; + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_SUCCESS; + } + + if (NULL != (attr = orte_rmgr.find_attribute(attrs, ORTE_NS_INCLUDE_CHILDREN))) { + /* we want the peers from this job AND ONLY its immediate children */ + + /* determine the number of peers we are going to have */ + npeers = job_info->next_vpid; + for (item = opal_list_get_first(&job_info->children); + item != opal_list_get_end(&job_info->children); + item = opal_list_get_next(item)) { + child = (orte_ns_replica_jobitem_t*)item; + npeers += child->next_vpid; + } + + /* create the array */ + if (0 < npeers) { + *procs = 
(orte_process_name_t*)malloc(npeers * sizeof(orte_process_name_t)); + if (NULL == *procs) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + /* populate it, starting with the specified job followed by its children */ + for (i=0; i < job_info->next_vpid; i++) { + (*procs)[i].cellid = ORTE_PROC_MY_NAME->cellid; + (*procs)[i].jobid = *jptr; + (*procs)[i].vpid = i; + } + isave = job_info->next_vpid; + for (item = opal_list_get_first(&job_info->children); + item != opal_list_get_end(&job_info->children); + item = opal_list_get_next(item)) { + child = (orte_ns_replica_jobitem_t*)item; + for (i=0; i < child->next_vpid; i++) { + (*procs)[i+isave].cellid = ORTE_PROC_MY_NAME->cellid; + (*procs)[i+isave].jobid = child->jobid; + (*procs)[i+isave].vpid = i; + } + isave += child->next_vpid; + } + } + *num_procs = npeers; + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_SUCCESS; + } + + /* get here if we want just the peers for the specified job */ + + /* create the array of peers */ + if (0 < job_info->next_vpid) { + *procs = (orte_process_name_t*)malloc(job_info->next_vpid * sizeof(orte_process_name_t)); + if (NULL == *procs) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + for (i=0; i < job_info->next_vpid; i++) { + (*procs)[i].cellid = ORTE_PROC_MY_NAME->cellid; + (*procs)[i].jobid = *jptr; + (*procs)[i].vpid = i; + } + } + + *num_procs = job_info->next_vpid; + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_SUCCESS; +} + + + +/* + * TAG SERVER functions + */ +int orte_ns_replica_assign_rml_tag(orte_rml_tag_t *tag, + char *name) +{ + orte_ns_replica_tagitem_t *tagitem, **tags; + orte_std_cntr_t i; + orte_rml_tag_t j; + int rc; + + OPAL_THREAD_LOCK(&orte_ns_replica.mutex); + + if (NULL != name) { + /* see if this name is already in list - if so, return tag */ + tags = 
(orte_ns_replica_tagitem_t**)orte_ns_replica.tags->addr; + for (i=0, j=0; j < orte_ns_replica.num_tags && + i < (orte_ns_replica.tags)->size; i++) { + if (NULL != tags[i]) { + j++; + if (tags[i]->name != NULL && + 0 == strcmp(name, tags[i]->name)) { /* found name on list */ + *tag = tags[i]->tag; + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_SUCCESS; + } + } + } + } + + /* not in list or not provided, so allocate next tag */ + *tag = ORTE_RML_TAG_MAX; + + /* check if tag is available - need to do this since the tag type + * is probably not going to be a orte_std_cntr_t, so we cannot just rely + * on the pointer_array's size limits to protect us. NOTE: need to + * reserve ORTE_RML_TAG_MAX as an invalid value, so can't let + * num_tags get there + */ + if (ORTE_RML_TAG_MAX-2 < orte_ns_replica.num_tags) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + tagitem = OBJ_NEW(orte_ns_replica_tagitem_t); + if (NULL == tagitem) { /* out of memory */ + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&i, + orte_ns_replica.tags, tagitem))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return rc; + } + tagitem->tag = orte_ns_replica.num_tags + ORTE_RML_TAG_DYNAMIC; + (orte_ns_replica.num_tags)++; + if (NULL != name) { /* provided - can look it up later */ + tagitem->name = strdup(name); + } else { + tagitem->name = NULL; + } + + *tag = tagitem->tag; + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_SUCCESS; +} + + +/* + * DATA TYPE SERVER functions + */ +int orte_ns_replica_define_data_type(const char *name, + orte_data_type_t *type) +{ + orte_ns_replica_dti_t **dti, *dtip; + orte_std_cntr_t i, j; + int rc; + + if (NULL == name || 0 < *type) { + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + return ORTE_ERR_BAD_PARAM; + } + + 
OPAL_THREAD_LOCK(&orte_ns_replica.mutex); + + dti = (orte_ns_replica_dti_t**)orte_ns_replica.dts->addr; + for (i=0, j=0; j < orte_ns_replica.num_dts && + i < orte_ns_replica.dts->size; i++) { + if (NULL != dti[i]) { + j++; + if (dti[i]->name != NULL && + 0 == strcmp(name, dti[i]->name)) { /* found name on list */ + *type = dti[i]->id; + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_SUCCESS; + } + } + } + + /* not in list or not provided, so allocate next id */ + *type = ORTE_DSS_ID_MAX; + + /* check if id is available - need to do this since the data type + * is probably not going to be a orte_std_cntr_t, so we cannot just rely + * on the pointer_array's size limits to protect us. + */ + if (ORTE_DSS_ID_MAX-2 < orte_ns_replica.num_dts) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + dtip = OBJ_NEW(orte_ns_replica_dti_t); + if (NULL == dtip) { /* out of memory */ + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + dtip->name = strdup(name); + if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&i, + orte_ns_replica.dts, dtip))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return rc; + } + dtip->id = orte_ns_replica.num_dts; + (orte_ns_replica.num_dts)++; + + *type = dtip->id; + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_SUCCESS; +} + +/* + * NAME functions + */ +int orte_ns_replica_create_my_name(void) +{ + orte_jobid_t jobid; + orte_vpid_t vpid; + opal_list_t attrs; + int rc; + + OBJ_CONSTRUCT(&attrs, opal_list_t); + if (ORTE_SUCCESS != (rc = orte_ns.create_jobid(&jobid, &attrs))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&attrs); + return rc; + } + OBJ_DESTRUCT(&attrs); + + if (ORTE_SUCCESS != (rc = orte_ns.reserve_range(jobid, 1, &vpid))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + if (ORTE_SUCCESS != (rc = 
orte_ns.create_process_name(&(orte_process_info.my_name),
+                                                0, jobid, vpid))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+
+    return ORTE_SUCCESS;
+}
+
diff --git a/orte/mca/ns/replica/ns_replica_job_fns.c b/orte/mca/ns/replica/ns_replica_job_fns.c
new file mode 100644
index 0000000000..cabd7ba590
--- /dev/null
+++ b/orte/mca/ns/replica/ns_replica_job_fns.c
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation. All rights reserved.
+ * Copyright (c) 2004-2005 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation. All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/** @file:
+ *
+ */
+#include "orte_config.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "opal/threads/mutex.h"
+#include "opal/util/output.h"
+#include "opal/util/trace.h"
+
+#include "orte/dss/dss.h"
+#include "orte/mca/errmgr/errmgr.h"
+#include "orte/mca/rmgr/rmgr.h"
+
+#include "ns_replica.h"
+
+/*
+ * JOBID functions
+ */
+
+/*
+ * Create a new jobid and record it in the job-family tree.
+ *
+ * jobid  (OUT) receives the new jobid; it is ORTE_JOBID_INVALID on any failure
+ * attrs  (IN)  searched for ORTE_NS_USE_PARENT (the new job becomes a child
+ *              of the jobid carried by the attribute) or, failing that,
+ *              ORTE_NS_USE_ROOT (the new job becomes a child of the root of
+ *              that jobid's family). With neither attribute the new job
+ *              starts a new family.
+ *
+ * Returns ORTE_SUCCESS, the error from orte_dss.get, ORTE_ERR_NOT_FOUND if
+ * the referenced job is unknown, or ORTE_ERR_OUT_OF_RESOURCE.
+ *
+ * Every exit goes through the UNLOCK label so the replica mutex is never
+ * left held on an error path.
+ */
+int orte_ns_replica_create_jobid(orte_jobid_t *jobid, opal_list_t *attrs)
+{
+    orte_ns_replica_jobitem_t *child, *parent, *root;
+    orte_jobid_t parent_job=ORTE_JOBID_INVALID, *jptr;
+    orte_attribute_t *attr;
+    int rc = ORTE_SUCCESS;
+
+    OPAL_TRACE(1);
+
+    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
+
+    *jobid = ORTE_JOBID_INVALID;
+
+    /* check for attributes */
+    if (NULL != (attr = orte_rmgr.find_attribute(attrs, ORTE_NS_USE_PARENT))) {
+        /* declares the specified jobid to be the parent of the new one */
+        if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&jptr, attr->value, ORTE_JOBID))) {
+            ORTE_ERROR_LOG(rc);
+            goto UNLOCK;
+        }
+        parent_job = *jptr;
+    } else if (NULL != (attr = orte_rmgr.find_attribute(attrs, ORTE_NS_USE_ROOT))) {
+        /* use the root of the specified job as the parent of the new one */
+        if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&jptr, attr->value, ORTE_JOBID))) {
+            ORTE_ERROR_LOG(rc);
+            goto UNLOCK;
+        }
+        if (NULL == (root = orte_ns_replica_find_root_job(*jptr))) {
+            rc = ORTE_ERR_NOT_FOUND;
+            ORTE_ERROR_LOG(rc);
+            goto UNLOCK;
+        }
+        parent_job = root->jobid;
+    }
+
+    /* if the parent jobid is INVALID, then this is the root of a new
+     * job family - create it
+     */
+    if (ORTE_JOBID_INVALID == parent_job) {
+        root = OBJ_NEW(orte_ns_replica_jobitem_t);
+        if (NULL == root) {
+            rc = ORTE_ERR_OUT_OF_RESOURCE;
+            ORTE_ERROR_LOG(rc);
+            goto UNLOCK;
+        }
+        root->jobid = orte_ns_replica.num_jobids;
+        opal_list_append(&orte_ns_replica.jobs, &root->super);
+        *jobid = root->jobid;
+        (orte_ns_replica.num_jobids)++;
+        goto UNLOCK;
+    }
+
+    /* if the parent jobid is not INVALID, then the request is for a
+     * new child for this parent. Find the job's record
+     */
+    if (NULL == (parent = orte_ns_replica_find_job(parent_job))) {
+        rc = ORTE_ERR_NOT_FOUND;
+        ORTE_ERROR_LOG(rc);
+        goto UNLOCK;
+    }
+
+    /* add this new job to the parent's list of children */
+    child = OBJ_NEW(orte_ns_replica_jobitem_t);
+    if (NULL == child) {
+        rc = ORTE_ERR_OUT_OF_RESOURCE;
+        ORTE_ERROR_LOG(rc);
+        goto UNLOCK;
+    }
+    opal_list_append(&parent->children, &child->super);
+    child->jobid = orte_ns_replica.num_jobids;
+    *jobid = child->jobid;
+    (orte_ns_replica.num_jobids)++;
+
+UNLOCK:
+    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
+    return rc;
+}
+
+
+/*
+ * Return a malloc'd array holding "job" followed by every job that
+ * descends from it.
+ *
+ * descendants (OUT) the array (caller frees it); NULL on failure
+ * num_desc    (OUT) number of entries in the array; 0 on failure
+ * job         (IN)  job whose family subtree is wanted
+ *
+ * Returns ORTE_SUCCESS, ORTE_ERR_NOT_FOUND or ORTE_ERR_OUT_OF_RESOURCE.
+ */
+int orte_ns_replica_get_job_descendants(orte_jobid_t **descendants, orte_std_cntr_t *num_desc, orte_jobid_t job)
+{
+    orte_std_cntr_t i, num;
+    orte_ns_replica_jobitem_t *ptr, *newptr;
+    orte_jobid_t *descs;
+    opal_list_t desc_list;
+    opal_list_item_t *item;
+    int rc = ORTE_SUCCESS;
+
+    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
+
+    /* default values */
+    *descendants = NULL;
+    *num_desc = 0;
+
+    /* find this job's record on the tree */
+    if (NULL == (ptr = orte_ns_replica_find_job(job))) {
+        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
+        return ORTE_ERR_NOT_FOUND;
+    }
+
+    /* construct a flattened list of its descendants - including ourself */
+    OBJ_CONSTRUCT(&desc_list, opal_list_t);
+    newptr = OBJ_NEW(orte_ns_replica_jobitem_t);
+    if (NULL == newptr) {
+        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+        OBJ_DESTRUCT(&desc_list);
+        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
+        return ORTE_ERR_OUT_OF_RESOURCE;
+    }
+    newptr->jobid = job;
+    opal_list_append(&desc_list, &newptr->super);
+
+    orte_ns_replica_construct_flattened_tree(&desc_list, ptr);
+
+    /* count number of entries */
+    num = opal_list_get_size(&desc_list);
+
+    /* allocate memory for the array */
+    descs = (orte_jobid_t*)malloc(num * sizeof(orte_jobid_t));
+    if (NULL == descs) {
+        rc = ORTE_ERR_OUT_OF_RESOURCE;
+        ORTE_ERROR_LOG(rc);
+    }
+
+    /* drain the temporary list - this also runs when the allocation
+     * failed, so that the list entries are released rather than leaked
+     */
+    i = 0;
+    while (NULL != (item = opal_list_remove_first(&desc_list))) {
+        ptr = (orte_ns_replica_jobitem_t*)item;
+        if (NULL != descs) {
+            descs[i++] = ptr->jobid;
+        }
+        OBJ_RELEASE(ptr);
+    }
+    OBJ_DESTRUCT(&desc_list);
+
+    if (ORTE_SUCCESS == rc) {
+        *descendants = descs;
+        *num_desc = num;
+    }
+
+    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
+    return rc;
+}
+
+/*
+ * Return a malloc'd array holding "job" followed by its immediate children.
+ *
+ * children   (OUT) the array (caller frees it); NULL on failure
+ * num_childs (OUT) number of entries in the array; 0 on failure
+ * job        (IN)  job whose direct children are wanted
+ *
+ * Returns ORTE_SUCCESS, ORTE_ERR_NOT_FOUND or ORTE_ERR_OUT_OF_RESOURCE.
+ */
+int orte_ns_replica_get_job_children(orte_jobid_t **children, orte_std_cntr_t *num_childs, orte_jobid_t job)
+{
+    orte_std_cntr_t i, num;
+    orte_ns_replica_jobitem_t *ptr, *newptr;
+    orte_jobid_t *descs;
+    opal_list_item_t *item;
+
+    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
+
+    /* default values */
+    *children = NULL;
+    *num_childs = 0;
+
+    /* find this job's record on the tree */
+    if (NULL == (ptr = orte_ns_replica_find_job(job))) {
+        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
+        return ORTE_ERR_NOT_FOUND;
+    }
+
+    /* count number of entries in our direct children - include ourselves */
+    num = 1 + opal_list_get_size(&ptr->children);
+
+    /* allocate memory for the array */
+    descs = (orte_jobid_t*)malloc(num * sizeof(orte_jobid_t));
+    if (NULL == descs) {
+        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
+        return ORTE_ERR_OUT_OF_RESOURCE;
+    }
+
+    /* now fill in the array - put ourselves first */
+    descs[0] = job;
+    i = 1;
+    for (item = opal_list_get_first(&ptr->children);
+         item != opal_list_get_end(&ptr->children);
+         item = opal_list_get_next(item)) {
+        newptr = (orte_ns_replica_jobitem_t*)item;
+        descs[i++] = newptr->jobid;
+    }
+
+    *children = descs;
+    *num_childs = num;
+
+    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
+    return ORTE_SUCCESS;
+}
+
+/*
+ * Report the jobid at the top of the family that contains "job".
+ * Returns ORTE_ERR_NOT_FOUND (leaving *root_job untouched) when the job
+ * is not on any family tree.
+ */
+int orte_ns_replica_get_root_job(orte_jobid_t *root_job, orte_jobid_t job)
+{
+    orte_ns_replica_jobitem_t *root;
+
+    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
+
+    if (NULL == (root = orte_ns_replica_find_root_job(job))) {
+        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
+        return ORTE_ERR_NOT_FOUND;
+    }
+
+    *root_job = root->jobid;
+
+    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
+    return ORTE_SUCCESS;
+}
+
+
+/*
+ * Report the jobid of the job that directly spawned "job".
+ *
+ * parent_job (OUT) the parent's jobid; ORTE_JOBID_INVALID when the job is
+ *                  unknown or is itself the root of its family
+ * job        (IN)  job whose parent is wanted
+ *
+ * Returns ORTE_SUCCESS when the job was found and ORTE_ERR_NOT_FOUND
+ * (without logging) when it was not.
+ *
+ * NOTE(review): reporting ORTE_JOBID_INVALID with ORTE_SUCCESS for a root
+ * job replaces a read through an uninitialized pointer - confirm this is
+ * the sentinel callers expect for "no parent".
+ */
+int orte_ns_replica_get_parent_job(orte_jobid_t *parent_job, orte_jobid_t job)
+{
+    opal_list_item_t *item;
+    orte_ns_replica_jobitem_t *root, *parent;
+    int rc = ORTE_ERR_NOT_FOUND;
+
+    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
+
+    /* don't report an error if not found, just return invalid */
+    *parent_job = ORTE_JOBID_INVALID;
+
+    /* find this job's parent object */
+    for (item = opal_list_get_first(&orte_ns_replica.jobs);
+         item != opal_list_get_end(&orte_ns_replica.jobs);
+         item = opal_list_get_next(item)) {
+        root = (orte_ns_replica_jobitem_t*)item;
+        /* down_search() stores a parent only when it steps below the node
+         * it was handed, so a match on the family root leaves this NULL
+         */
+        parent = NULL;
+        if (NULL != down_search(root, &parent, job)) {
+            if (NULL != parent) {
+                *parent_job = parent->jobid;
+            }
+            rc = ORTE_SUCCESS;
+            break;
+        }
+    }
+
+    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
+    return rc;
+}
+
+
+/*
+ * Reserve "range" consecutive vpids in the given job.
+ *
+ * job    (IN)  job whose vpid space is used
+ * range  (IN)  number of vpids wanted
+ * start  (OUT) first vpid of the reserved range; written only on success
+ *
+ * Returns ORTE_SUCCESS, ORTE_ERR_NOT_FOUND when the job is unknown, or
+ * ORTE_ERR_OUT_OF_RESOURCE when the range does not fit below ORTE_VPID_MAX.
+ */
+int orte_ns_replica_reserve_range(orte_jobid_t job, orte_vpid_t range,
+                                  orte_vpid_t *start)
+{
+    orte_ns_replica_jobitem_t *ptr;
+
+    OPAL_TRACE(1);
+
+    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
+
+    /* find the job's record */
+    if (NULL == (ptr = orte_ns_replica_find_job(job))) {
+        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
+        return ORTE_ERR_NOT_FOUND;
+    }
+
+    /* same test as "ORTE_VPID_MAX - range - next_vpid > 0", arranged so
+     * that no intermediate result can wrap should orte_vpid_t be unsigned
+     */
+    if (range < (ORTE_VPID_MAX - ptr->next_vpid)) {
+        *start = ptr->next_vpid;
+        ptr->next_vpid += range;
+        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
+        return ORTE_SUCCESS;
+    }
+
+    /* get here if the range isn't available */
+    ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
+    return ORTE_ERR_OUT_OF_RESOURCE;
+}
diff --git a/orte/mca/ns/replica/ns_replica_recv.c b/orte/mca/ns/replica/ns_replica_recv.c
new file mode 100644
index 0000000000..559ce1b235
--- /dev/null
+++ b/orte/mca/ns/replica/ns_replica_recv.c
@@ -0,0 +1,457 @@
+/* -*- C -*-
+*
+* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+*                         University Research and Technology
+*                         Corporation.
All rights reserved.
+* Copyright (c) 2004-2005 The University of Tennessee and The University
+*                         of Tennessee Research Foundation. All rights
+*                         reserved.
+* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+*                         University of Stuttgart. All rights reserved.
+* Copyright (c) 2004-2005 The Regents of the University of California.
+*                         All rights reserved.
+* $COPYRIGHT$
+*
+* Additional copyrights may follow
+*
+* $HEADER$
+*/
+/** @file:
+*
+* The Open MPI Name Server
+*
+*/
+
+/*
+ * includes
+ */
+#include "orte_config.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "orte/orte_constants.h"
+#include "orte/orte_types.h"
+
+#include "opal/threads/mutex.h"
+#include "opal/class/opal_list.h"
+#include "opal/util/output.h"
+
+#include "opal/mca/mca.h"
+#include "opal/mca/base/mca_base_param.h"
+
+#include "orte/util/proc_info.h"
+#include "orte/mca/errmgr/errmgr.h"
+#include "orte/mca/rml/rml.h"
+
+#include "orte/mca/ns/base/ns_private.h"
+#include "ns_replica.h"
+
+
+/*
+ * handle message from proxies
+ * NOTE: The incoming buffer "buffer" is OBJ_RELEASED by the calling program.
+ * DO NOT RELEASE THIS BUFFER IN THIS CODE
+ *
+ * The first item in "buffer" is the command. The reply always starts with
+ * that same command, followed by the command-specific results, and is sent
+ * back to "sender" on "tag". When a request cannot be served, the reply is
+ * instead the command followed by the error code (see RETURN_ERROR).
+ *
+ * NOTE(review): the strings filled in by orte_dss.unpack (site, resource,
+ * tagname) and the array returned by orte_ns_replica_get_peers are not
+ * released here; their ownership is not visible from this file - confirm
+ * whether they have to be freed by this function.
+ */
+
+void orte_ns_replica_recv(int status, orte_process_name_t* sender,
+                          orte_buffer_t* buffer, orte_rml_tag_t tag,
+                          void* cbdata)
+{
+    orte_buffer_t answer, error_answer;
+    orte_ns_cmd_flag_t command = 0; /* defined value for the error reply if the unpack fails */
+    opal_list_t attrs;
+    orte_cellid_t cell;
+    orte_jobid_t job, root, *descendants = NULL;
+    orte_vpid_t startvpid, range;
+    char *tagname, *site, *resource;
+    orte_rml_tag_t oob_tag;
+    orte_data_type_t type;
+    orte_std_cntr_t count, nprocs, nret;
+    orte_process_name_t *procs;
+    int rc=ORTE_SUCCESS, ret;
+
+    /* construct the reply before anything can fail: every exit path,
+     * including the error exits below, destructs it at CLEANUP
+     */
+    OBJ_CONSTRUCT(&answer, orte_buffer_t);
+
+    count = 1;
+    if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &command, &count, ORTE_NS_CMD))) {
+        ORTE_ERROR_LOG(rc);
+        rc = ORTE_ERR_BAD_PARAM;
+        goto RETURN_ERROR;
+    }
+
+    if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, &command, 1, ORTE_NS_CMD))) {
+        ORTE_ERROR_LOG(rc);
+        goto RETURN_ERROR;
+    }
+
+    switch (command) {
+        case ORTE_NS_CREATE_CELLID_CMD:
+            count = 1;
+            if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &site, &count, ORTE_STRING))) {
+                ORTE_ERROR_LOG(rc);
+                rc = ORTE_ERR_BAD_PARAM;
+                goto RETURN_ERROR;
+            }
+
+            count = 1;
+            if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &resource, &count, ORTE_STRING))) {
+                ORTE_ERROR_LOG(rc);
+                rc = ORTE_ERR_BAD_PARAM;
+                goto RETURN_ERROR;
+            }
+
+            rc = orte_ns_replica_create_cellid(&cell, site, resource);
+
+            if (ORTE_SUCCESS != (ret = orte_dss.pack(&answer, &cell, 1, ORTE_CELLID))) {
+                ORTE_ERROR_LOG(ret);
+                rc = ret;
+                goto RETURN_ERROR;
+            }
+
+            if (0 > (rc = orte_rml.send_buffer(sender, &answer, tag, 0))) {
+                ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
+                goto RETURN_ERROR;
+            }
+            break;
+
+        case ORTE_NS_GET_CELL_INFO_CMD:
+            count = 1;
+            if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &cell, &count, ORTE_CELLID))) {
+                ORTE_ERROR_LOG(rc);
+                rc = ORTE_ERR_BAD_PARAM;
+                goto RETURN_ERROR;
+            }
+
+            site = NULL;
+            resource = NULL;
+            rc = orte_ns_replica_get_cell_info(cell, &site, &resource);
+
+            if (ORTE_SUCCESS != (ret = orte_dss.pack(&answer, &site, 1, ORTE_STRING))) {
+                ORTE_ERROR_LOG(ret);
+                rc = ret;
+                goto RETURN_ERROR;
+            }
+
+            if (ORTE_SUCCESS != (ret = orte_dss.pack(&answer, &resource, 1, ORTE_STRING))) {
+                ORTE_ERROR_LOG(ret);
+                rc = ret;
+                goto RETURN_ERROR;
+            }
+
+            if (0 > (rc = orte_rml.send_buffer(sender, &answer, tag, 0))) {
+                ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
+                goto RETURN_ERROR;
+            }
+            break;
+
+        case ORTE_NS_CREATE_NODEID_CMD:
+        case ORTE_NS_GET_NODE_INFO_CMD:
+            /* tell the requester why, instead of replying with ORTE_SUCCESS */
+            rc = ORTE_ERR_NOT_IMPLEMENTED;
+            ORTE_ERROR_LOG(rc);
+            goto RETURN_ERROR;
+
+        case ORTE_NS_CREATE_JOBID_CMD:
+            /* get the list of attributes */
+            OBJ_CONSTRUCT(&attrs, opal_list_t);
+            count = 1;
+            if(ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &attrs, &count, ORTE_ATTR_LIST))) {
+                ORTE_ERROR_LOG(rc);
+                OBJ_DESTRUCT(&attrs);
+                goto RETURN_ERROR;
+            }
+
+            if (ORTE_SUCCESS != (rc = orte_ns_replica_create_jobid(&job, &attrs))) {
+                ORTE_ERROR_LOG(rc);
+                OBJ_DESTRUCT(&attrs);
+                goto RETURN_ERROR;
+            }
+            OBJ_DESTRUCT(&attrs);
+
+            if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&job, 1, ORTE_JOBID))) {
+                ORTE_ERROR_LOG(rc);
+                goto RETURN_ERROR;
+            }
+
+            if (0 > (rc = orte_rml.send_buffer(sender, &answer, tag, 0))) {
+                ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
+                goto RETURN_ERROR;
+            }
+            break;
+
+        case ORTE_NS_GET_JOB_DESC_CMD:
+            count = 1;
+            if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, (void*)&job, &count, ORTE_JOBID))) {
+                ORTE_ERROR_LOG(rc);
+                goto RETURN_ERROR;
+            }
+
+            /* the returned array is malloc'd - it is freed at CLEANUP */
+            if (ORTE_SUCCESS != (rc = orte_ns_replica_get_job_descendants(&descendants, &nret, job))) {
+                ORTE_ERROR_LOG(rc);
+                goto RETURN_ERROR;
+            }
+
+            if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&nret, 1, ORTE_STD_CNTR))) {
+                ORTE_ERROR_LOG(rc);
+                goto RETURN_ERROR;
+            }
+
+            if (0 < nret) {
+                if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)descendants, nret, ORTE_JOBID))) {
+                    ORTE_ERROR_LOG(rc);
+                    goto RETURN_ERROR;
+                }
+            }
+
+            if (0 > (rc = orte_rml.send_buffer(sender, &answer, tag, 0))) {
+                ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
+                goto RETURN_ERROR;
+            }
+            break;
+
+        case ORTE_NS_GET_JOB_CHILD_CMD:
+            count = 1;
+            if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, (void*)&job, &count, ORTE_JOBID))) {
+                ORTE_ERROR_LOG(rc);
+                goto RETURN_ERROR;
+            }
+
+            /* the returned array is malloc'd - it is freed at CLEANUP */
+            if (ORTE_SUCCESS != (rc = orte_ns_replica_get_job_children(&descendants, &nret, job))) {
+                ORTE_ERROR_LOG(rc);
+                goto RETURN_ERROR;
+            }
+
+            if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&nret, 1, ORTE_STD_CNTR))) {
+                ORTE_ERROR_LOG(rc);
+                goto RETURN_ERROR;
+            }
+
+            if (0 < nret) {
+                if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)descendants, nret, ORTE_JOBID))) {
+                    ORTE_ERROR_LOG(rc);
+                    goto RETURN_ERROR;
+                }
+            }
+
+            if (0 > (rc = orte_rml.send_buffer(sender, &answer, tag, 0))) {
+                ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
+                goto RETURN_ERROR;
+            }
+            break;
+
+        case ORTE_NS_GET_ROOT_JOB_CMD:
+            count = 1;
+            if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, (void*)&job, &count, ORTE_JOBID))) {
+                ORTE_ERROR_LOG(rc);
+                goto RETURN_ERROR;
+            }
+
+            if (ORTE_SUCCESS != (rc = orte_ns_replica_get_root_job(&root, job))) {
+                ORTE_ERROR_LOG(rc);
+                goto RETURN_ERROR;
+            }
+
+            if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&root, 1, ORTE_JOBID))) {
+                ORTE_ERROR_LOG(rc);
+                goto RETURN_ERROR;
+            }
+
+            if (0 > (rc = orte_rml.send_buffer(sender, &answer, tag, 0))) {
+                ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
+                goto RETURN_ERROR;
+            }
+            break;
+
+        case ORTE_NS_GET_PARENT_JOB_CMD:
+            count = 1;
+            if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, (void*)&job, &count, ORTE_JOBID))) {
+                ORTE_ERROR_LOG(rc);
+                goto RETURN_ERROR;
+            }
+
+            /* "root" is reused here to carry the parent's jobid */
+            if (ORTE_SUCCESS != (rc = orte_ns_replica_get_parent_job(&root, job))) {
+                ORTE_ERROR_LOG(rc);
+                goto RETURN_ERROR;
+            }
+
+            if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&root, 1, ORTE_JOBID))) {
+                ORTE_ERROR_LOG(rc);
+                goto RETURN_ERROR;
+            }
+
+            if (0 > (rc = orte_rml.send_buffer(sender, &answer, tag, 0))) {
+                ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
+                goto RETURN_ERROR;
+            }
+            break;
+
+        case ORTE_NS_RESERVE_RANGE_CMD:
+            count = 1;
+            if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, (void*)&job, &count, ORTE_JOBID))) {
+                ORTE_ERROR_LOG(rc);
+                goto RETURN_ERROR;
+            }
+
+            count = 1;
+            if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, (void*)&range, &count, ORTE_VPID))) {
+                ORTE_ERROR_LOG(rc);
+                goto RETURN_ERROR;
+            }
+
+            if (ORTE_SUCCESS != (rc = orte_ns_replica_reserve_range(job, range, &startvpid))) {
+                ORTE_ERROR_LOG(rc);
+                goto RETURN_ERROR;
+            }
+
+            if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&startvpid, 1, ORTE_VPID))) {
+                ORTE_ERROR_LOG(rc);
+                goto RETURN_ERROR;
+            }
+
+            if (0 > (rc = orte_rml.send_buffer(sender, &answer, tag, 0))) {
+                ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
+                goto RETURN_ERROR;
+            }
+            break;
+
+        case ORTE_NS_ASSIGN_OOB_TAG_CMD:
+            count = 1;
+            if (0 > orte_dss.unpack(buffer, &tagname, &count, ORTE_STRING)) {
+                rc = ORTE_ERR_UNPACK_FAILURE;
+                ORTE_ERROR_LOG(rc);
+                goto RETURN_ERROR;
+            }
+
+            /* the literal string "NULL" stands for "no name given"; compare
+             * the whole string so that a real name which merely starts with
+             * "NULL" is still registered under its own name
+             */
+            if (0 == strcmp(tagname, "NULL")) {
+                if (ORTE_SUCCESS != (rc = orte_ns_replica_assign_rml_tag(&oob_tag, NULL))) {
+                    goto RETURN_ERROR;
+                }
+            } else {
+                if (ORTE_SUCCESS != (rc = orte_ns_replica_assign_rml_tag(&oob_tag, tagname))) {
+                    goto RETURN_ERROR;
+                }
+            }
+
+            if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&oob_tag, 1, ORTE_RML_TAG))) {
+                goto RETURN_ERROR;
+            }
+
+            if (0 > (rc = orte_rml.send_buffer(sender, &answer, tag, 0))) {
+                ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
+                goto RETURN_ERROR;
+            }
+            break;
+
+        case ORTE_NS_DEFINE_DATA_TYPE_CMD:
+            count = 1;
+            if (0 > orte_dss.unpack(buffer, &tagname, &count, ORTE_STRING)) {
+                rc = ORTE_ERR_UNPACK_FAILURE;
+                ORTE_ERROR_LOG(rc);
+                goto RETURN_ERROR;
+            }
+
+            /* orte_ns_replica_define_data_type reads *type and rejects a
+             * positive value, so it must not be handed an indeterminate one
+             */
+            type = 0;
+            if (ORTE_SUCCESS != (rc = orte_ns_replica_define_data_type(tagname, &type))) {
+                goto RETURN_ERROR;
+            }
+
+            if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&type, 1, ORTE_DATA_TYPE))) {
+                goto RETURN_ERROR;
+            }
+
+            if (0 > (rc = orte_rml.send_buffer(sender, &answer, tag, 0))) {
+                ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
+                goto RETURN_ERROR;
+            }
+            break;
+
+        case ORTE_NS_CREATE_MY_NAME_CMD:
+            /* ignore this command */
+            break;
+
+        case ORTE_NS_GET_PEERS_CMD:
+            /* get the list of attributes */
+            OBJ_CONSTRUCT(&attrs, opal_list_t);
+            count = 1;
+            if(ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &attrs, &count, ORTE_ATTR_LIST))) {
+                ORTE_ERROR_LOG(rc);
+                OBJ_DESTRUCT(&attrs);
+                goto RETURN_ERROR;
+            }
+
+            /* process the request */
+            if (ORTE_SUCCESS != (rc = orte_ns_replica_get_peers(&procs, &nprocs, &attrs))) {
+                ORTE_ERROR_LOG(rc);
+                OBJ_DESTRUCT(&attrs);
+                goto RETURN_ERROR;
+            }
+            OBJ_DESTRUCT(&attrs);
+
+            /* pack the answer */
+            if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, &nprocs, 1, ORTE_STD_CNTR))) {
+                ORTE_ERROR_LOG(rc);
+                goto RETURN_ERROR;
+            }
+
+            if (nprocs > 0) {
+                if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, procs, nprocs, ORTE_NAME))) {
+                    ORTE_ERROR_LOG(rc);
+                    goto RETURN_ERROR;
+                }
+            }
+
+            if (0 > (rc = orte_rml.send_buffer(sender, &answer, tag, 0))) {
+                ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
+                goto RETURN_ERROR;
+            }
+            break;
+
+        case ORTE_NS_DUMP_CELLS_CMD:
+            if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_cells_fn(&answer))) {
+                ORTE_ERROR_LOG(rc);
+                goto RETURN_ERROR;
+            }
+            if (0 > (rc = orte_rml.send_buffer(sender, &answer, tag, 0))) {
+                ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
+                goto RETURN_ERROR;
+            }
+            break;
+
+        case ORTE_NS_DUMP_JOBIDS_CMD:
+            if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_jobs_fn(&answer))) {
+                ORTE_ERROR_LOG(rc);
+                goto RETURN_ERROR;
+            }
+            if (0 > (rc = orte_rml.send_buffer(sender, &answer, tag, 0))) {
+                ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
+                goto RETURN_ERROR;
+            }
+            break;
+
+        case ORTE_NS_DUMP_TAGS_CMD:
+            if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_tags_fn(&answer))) {
+                ORTE_ERROR_LOG(rc);
+                goto RETURN_ERROR;
+            }
+            if (0 > (rc = orte_rml.send_buffer(sender, &answer, tag, 0))) {
+                ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
+                goto RETURN_ERROR;
+            }
+            break;
+
+        case ORTE_NS_DUMP_DATATYPES_CMD:
+            if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_datatypes_fn(&answer))) {
+                ORTE_ERROR_LOG(rc);
+                goto RETURN_ERROR;
+            }
+            if (0 > (rc = orte_rml.send_buffer(sender, &answer, tag, 0))) {
+                ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
+                goto RETURN_ERROR;
+            }
+            break;
+
+        default:
+            /* unknown command - do not answer it with ORTE_SUCCESS */
+            rc = ORTE_ERR_BAD_PARAM;
+            goto RETURN_ERROR;
+    }
+    goto CLEANUP;
+
+RETURN_ERROR:
+    /* the error reply is the command followed by the error code; it then
+     * falls through to the common cleanup
+     */
+    OBJ_CONSTRUCT(&error_answer, orte_buffer_t);
+    orte_dss.pack(&error_answer, (void*)&command, 1, ORTE_NS_CMD);
+    orte_dss.pack(&error_answer, (void*)&rc, 1, ORTE_INT32);
+    orte_rml.send_buffer(sender, &error_answer, tag, 0);
+    OBJ_DESTRUCT(&error_answer);
+
+CLEANUP:
+    /* cleanup */
+    free(descendants); /* still NULL unless a descendant/child query filled it */
+    OBJ_DESTRUCT(&answer);
+}
+
diff --git a/orte/mca/ns/replica/ns_replica_support_fns.c b/orte/mca/ns/replica/ns_replica_support_fns.c
new file mode 100644
index 0000000000..fa2c2da7fc
--- /dev/null
+++ b/orte/mca/ns/replica/ns_replica_support_fns.c
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation. All rights reserved.
+ * Copyright (c) 2004-2005 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation. All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/** @file:
+ *
+ */
+#include "orte_config.h"
+
+#include <stdio.h>
+#include <string.h>
+
+#include "opal/util/output.h"
+#include "opal/util/trace.h"
+
+#include "orte/mca/errmgr/errmgr.h"
+
+#include "ns_replica.h"
+
+orte_ns_replica_jobitem_t *down_search(orte_ns_replica_jobitem_t *ptr,
+                                       orte_ns_replica_jobitem_t **parent_ptr,
+                                       orte_jobid_t job)
+{
+    opal_list_item_t *item;
+    orte_ns_replica_jobitem_t *ptr2, *ptr3;
+
+    /* check if this is the specified job */
+    if (ptr->jobid == job) {
+        return ptr;
+    }
+
+    /* otherwise, look at the children of this ptr.
call ourselves
+     * to check each one
+     */
+    for (item = opal_list_get_first(&ptr->children);
+         item != opal_list_get_end(&ptr->children);
+         item = opal_list_get_next(item)) {
+        ptr2 = (orte_ns_replica_jobitem_t*)item;
+        *parent_ptr = ptr;
+        if (NULL != (ptr3 = down_search(ptr2, parent_ptr, job))) {
+            return ptr3;
+        }
+    }
+
+    return NULL;
+}
+
+/* find a job's record, wherever it is on the tree
+ *
+ * Every job family on orte_ns_replica.jobs is searched in turn. Returns
+ * the job's record, or NULL (without logging an error) when the job is
+ * not on any tree. This function takes no lock itself.
+ */
+orte_ns_replica_jobitem_t* orte_ns_replica_find_job(orte_jobid_t job)
+{
+    opal_list_item_t *item;
+    orte_ns_replica_jobitem_t *root, *ptr, *parent = NULL;
+
+    for (item = opal_list_get_first(&orte_ns_replica.jobs);
+         item != opal_list_get_end(&orte_ns_replica.jobs);
+         item = opal_list_get_next(item)) {
+        root = (orte_ns_replica_jobitem_t*)item;
+        /* "parent" only satisfies down_search's signature - it is not used */
+        if (NULL != (ptr = down_search(root, &parent, job))) {
+            return ptr;
+        }
+    }
+
+    /* don't report an error if not found, just return NULL */
+    return NULL;
+}
+
+/* given a jobid, find its root job's object
+ *
+ * Returns the record at the top of the family that contains "job" (which
+ * is the job's own record when the job is itself a root), or NULL (without
+ * logging an error) when the job is not on any tree.
+ */
+orte_ns_replica_jobitem_t* orte_ns_replica_find_root_job(orte_jobid_t job)
+{
+    opal_list_item_t *item;
+    orte_ns_replica_jobitem_t *root, *parent = NULL;
+
+    for (item = opal_list_get_first(&orte_ns_replica.jobs);
+         item != opal_list_get_end(&orte_ns_replica.jobs);
+         item = opal_list_get_next(item)) {
+        root = (orte_ns_replica_jobitem_t*)item;
+
+        /* only whether the job lives in this family matters here */
+        if (NULL != down_search(root, &parent, job)) {
+            return root;
+        }
+    }
+
+    /* don't report an error if not found, just return NULL */
+    return NULL;
+}
+
+/* given a job's record, append a copy of every descendant below it to
+ * "tree", parents ahead of their own children.
+ *
+ * The starting record itself is NOT appended - a caller that wants it on
+ * the list has to add it before calling. Each appended entry is a newly
+ * created object carrying the descendant's jobid and next_vpid; the caller
+ * owns the entries and must release them.
+ *
+ * If an entry cannot be allocated the error is logged and the walk of the
+ * current level stops, so the list may then be incomplete.
+ */
+void orte_ns_replica_construct_flattened_tree(opal_list_t *tree, orte_ns_replica_jobitem_t *ptr)
+{
+    orte_ns_replica_jobitem_t *job, *newjob;
+    opal_list_item_t *item;
+
+    for (item = opal_list_get_first(&ptr->children);
+         item != opal_list_get_end(&ptr->children);
+         item = opal_list_get_next(item)) {
+        job = (orte_ns_replica_jobitem_t*)item;
+
+        newjob = OBJ_NEW(orte_ns_replica_jobitem_t);
+        if (NULL == newjob) { /* out of memory - do not write through NULL */
+            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+            return;
+        }
+        newjob->jobid = job->jobid;
+        newjob->next_vpid = job->next_vpid;
+        opal_list_append(tree, &newjob->super);
+
+        orte_ns_replica_construct_flattened_tree(tree, job); /* get anyone below this one */
+    }
+}
diff --git a/orte/mca/ns/replica/src/Makefile.extra b/orte/mca/ns/replica/src/Makefile.extra
deleted file mode 100644
index 3c3478e9c5..0000000000
--- a/orte/mca/ns/replica/src/Makefile.extra
+++ /dev/null
@@ -1,24 +0,0 @@
-# -*- makefile -*-
-#
-# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
-#                         University Research and Technology
-#                         Corporation. All rights reserved.
-# Copyright (c) 2004-2005 The University of Tennessee and The University
-#                         of Tennessee Research Foundation. All rights
-#                         reserved.
-# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
-#                         University of Stuttgart. All rights reserved.
-# Copyright (c) 2004-2005 The Regents of the University of California.
-#                         All rights reserved.
-# $COPYRIGHT$
-#
-# Additional copyrights may follow
-#
-# $HEADER$
-#
-
-sources += \
-    src/ns_replica.h \
-    src/ns_replica.c \
-    src/ns_replica_component.c
-
diff --git a/orte/mca/ns/replica/src/ns_replica_component.c b/orte/mca/ns/replica/src/ns_replica_component.c
deleted file mode 100644
index b126bf36fa..0000000000
--- a/orte/mca/ns/replica/src/ns_replica_component.c
+++ /dev/null
@@ -1,702 +0,0 @@
-/* -*- C -*-
- *
- * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
- *                         University Research and Technology
- *                         Corporation. All rights reserved.
- * Copyright (c) 2004-2005 The University of Tennessee and The University
- *                         of Tennessee Research Foundation. All rights
- *                         reserved.
- * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
- *                         University of Stuttgart. All rights reserved.
- * Copyright (c) 2004-2005 The Regents of the University of California.
- *                         All rights reserved.
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** @file: - * - * The Open MPI Name Server - * - * The Open MPI Name Server provides unique name ranges for processes - * within the universe. Each universe will have one name server - * running within the seed daemon. This is done to prevent the - * inadvertent duplication of names. - */ - -/* - * includes - */ -#include "orte_config.h" - -#include "orte/orte_constants.h" -#include "orte/orte_types.h" - -#include "opal/threads/mutex.h" -#include "orte/util/proc_info.h" -#include "opal/util/output.h" - -#include "opal/mca/mca.h" -#include "opal/mca/base/mca_base_param.h" -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/rml/rml.h" -#include "ns_replica.h" - - -/* - * Struct of function pointers that need to be initialized - */ -mca_ns_base_component_t mca_ns_replica_component = { - { - MCA_NS_BASE_VERSION_1_0_0, - - "replica", /* MCA module name */ - ORTE_MAJOR_VERSION, /* MCA module major version */ - ORTE_MINOR_VERSION, /* MCA module minor version */ - ORTE_RELEASE_VERSION, /* MCA module release version */ - orte_ns_replica_open, /* module open */ - orte_ns_replica_close /* module close */ - }, - { - false /* checkpoint / restart */ - }, - orte_ns_replica_init, /* module init */ - orte_ns_replica_finalize /* module shutdown */ -}; - -/* - * setup the function pointers for the module - */ -static mca_ns_base_module_t orte_ns_replica_module = { - /* init */ - orte_ns_replica_module_init, - /* cell functions */ - orte_ns_replica_create_cellid, - orte_ns_base_get_cellid, - orte_ns_replica_get_cell_info, - orte_ns_base_assign_cellid_to_process, - orte_ns_base_get_cellid_string, - orte_ns_base_convert_cellid_to_string, - orte_ns_base_convert_string_to_cellid, - /* jobid functions */ - orte_ns_replica_create_jobid, - orte_ns_base_get_jobid, - orte_ns_base_get_jobid_string, - orte_ns_base_convert_jobid_to_string, - orte_ns_base_convert_string_to_jobid, - /* vpid functions */ - 
orte_ns_replica_reserve_range, - orte_ns_base_get_vpid, - orte_ns_base_get_vpid_string, - orte_ns_base_convert_vpid_to_string, - orte_ns_base_convert_string_to_vpid, - /* name functions */ - orte_ns_base_create_process_name, - orte_ns_replica_create_my_name, - orte_ns_base_copy_process_name, - orte_ns_base_convert_string_to_process_name, - orte_ns_base_free_name, - orte_ns_base_get_proc_name_string, - orte_ns_base_compare, - /* peer functions */ - orte_ns_base_get_peers, - orte_ns_replica_get_job_peers, - /* tag server functions */ - orte_ns_replica_assign_rml_tag, - /* data type functions */ - orte_ns_replica_define_data_type, - /* diagnostic functions */ - orte_ns_replica_dump_cells, - orte_ns_replica_dump_jobs, - orte_ns_replica_dump_tags, - orte_ns_replica_dump_datatypes -}; - -/* - * Whether or not we allowed this component to be selected - */ -static bool initialized = false; - - -/* constructor - used to initialize state of cell_tracker instance */ -static void orte_ns_replica_cell_tracker_construct(orte_ns_replica_cell_tracker_t* cell_tracker) -{ - cell_tracker->cell = 0; - cell_tracker->site = NULL; - cell_tracker->resource = NULL; -} - -/* destructor - used to free any resources held by instance */ -static void orte_ns_replica_cell_tracker_destructor(orte_ns_replica_cell_tracker_t* cell_tracker) -{ - if (NULL != cell_tracker->site) free(cell_tracker->site); - if (NULL != cell_tracker->resource) free(cell_tracker->resource); -} - -/* define instance of opal_class_t */ -OBJ_CLASS_INSTANCE( - orte_ns_replica_cell_tracker_t, /* type name */ - opal_object_t, /* parent "class" name */ - orte_ns_replica_cell_tracker_construct, /* constructor */ - orte_ns_replica_cell_tracker_destructor); /* destructor */ - - -/* constructor - used to initialize state of jobid_tracker instance */ -static void orte_ns_replica_jobid_tracker_construct(orte_ns_replica_jobid_tracker_t* jobid_tracker) -{ - jobid_tracker->jobid = ORTE_JOBID_MAX; - jobid_tracker->next_vpid = 0; -} - -/* 
destructor - used to free any resources held by instance */ -static void orte_ns_replica_jobid_tracker_destructor(orte_ns_replica_jobid_tracker_t* jobid_tracker){ -} - -/* define instance of opal_class_t */ -OBJ_CLASS_INSTANCE( - orte_ns_replica_jobid_tracker_t, /* type name */ - opal_object_t, /* parent "class" name */ - orte_ns_replica_jobid_tracker_construct, /* constructor */ - orte_ns_replica_jobid_tracker_destructor); /* destructor */ - - -/* constructor - used to initialize state of taglist instance */ -static void orte_ns_replica_tagitem_construct(orte_ns_replica_tagitem_t* tagitem) -{ - tagitem->tag = ORTE_RML_TAG_MAX; - tagitem->name = NULL; -} - -/* destructor - used to free any resources held by instance */ -static void orte_ns_replica_tagitem_destructor(orte_ns_replica_tagitem_t* tagitem) -{ - if (NULL != tagitem->name) { - free(tagitem->name); - } -} - -/* define instance of opal_class_t */ -OBJ_CLASS_INSTANCE( - orte_ns_replica_tagitem_t, /* type name */ - opal_object_t, /* parent "class" name */ - orte_ns_replica_tagitem_construct, /* constructor */ - orte_ns_replica_tagitem_destructor); /* destructor */ - - -/* constructor - used to initialize state of dtilist instance */ -static void orte_ns_replica_dti_construct(orte_ns_replica_dti_t* dti) -{ - dti->id = ORTE_DSS_ID_MAX; - dti->name = NULL; -} - -/* destructor - used to free any resources held by instance */ -static void orte_ns_replica_dti_destructor(orte_ns_replica_dti_t* dti) -{ - if (NULL != dti->name) { - free(dti->name); - } -} - -/* define instance of opal_class_t */ -OBJ_CLASS_INSTANCE( - orte_ns_replica_dti_t, /* type name */ - opal_object_t, /* parent "class" name */ - orte_ns_replica_dti_construct, /* constructor */ - orte_ns_replica_dti_destructor); /* destructor */ - -/* - * globals needed within replica component - */ -orte_ns_replica_globals_t orte_ns_replica; - -/* - * don't really need this function - could just put NULL in the above structure - * Just holding the place in case 
we decide there is something we need to do - */ -int orte_ns_replica_open(void) -{ - int id, param; - - id = mca_base_param_register_int("ns", "replica", "debug", NULL, (int)false); - mca_base_param_lookup_int(id, &orte_ns_replica.debug); - - id = mca_base_param_register_int("ns", "replica", "isolate", NULL, (int)false); - mca_base_param_lookup_int(id, &param); - if (param) { - orte_ns_replica.isolate = true; - } else { - orte_ns_replica.isolate = false; - } - - id = mca_base_param_register_int("ns", "replica", "maxsize", NULL, - ORTE_NS_ARRAY_MAX_SIZE); - mca_base_param_lookup_int(id, &param); - orte_ns_replica.max_size = (size_t)param; - - id = mca_base_param_register_int("ns", "replica", "blocksize", NULL, - ORTE_NS_ARRAY_BLOCK_SIZE); - mca_base_param_lookup_int(id, &param); - orte_ns_replica.block_size = (size_t)param; - - return ORTE_SUCCESS; -} - -/* - * ditto for this one - */ -int orte_ns_replica_close(void) -{ - return ORTE_SUCCESS; -} - -mca_ns_base_module_t* orte_ns_replica_init(int *priority) -{ - int rc; - - /* If we are to host a replica, then we want to be selected, so do all the - setup and return the module */ - - if (NULL == orte_process_info.ns_replica_uri) { - - /* Return a module (choose an arbitrary, positive priority -- - it's only relevant compared to other ns components). If - we're not the seed, then we don't want to be selected, so - return NULL. 
*/ - - *priority = 50; - - /* initialize the cell info tracker */ - if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.cells), - (orte_std_cntr_t)orte_ns_replica.block_size, - (orte_std_cntr_t)orte_ns_replica.max_size, - (orte_std_cntr_t)orte_ns_replica.block_size))) { - ORTE_ERROR_LOG(rc); - return NULL; - } - orte_ns_replica.num_cells = 0; - - /* initialize the job id tracker */ - if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.jobids), - (orte_std_cntr_t)orte_ns_replica.block_size, - (orte_std_cntr_t)orte_ns_replica.max_size, - (orte_std_cntr_t)orte_ns_replica.block_size))) { - ORTE_ERROR_LOG(rc); - return NULL; - } - orte_ns_replica.num_jobids = 0; - - /* initialize the taglist */ - - if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.tags), - (orte_std_cntr_t)orte_ns_replica.block_size, - (orte_std_cntr_t)orte_ns_replica.max_size, - (orte_std_cntr_t)orte_ns_replica.block_size))) { - ORTE_ERROR_LOG(rc); - return NULL; - } - orte_ns_replica.num_tags = 0; - - /* initialize the dtlist */ - - if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.dts), - (orte_std_cntr_t)orte_ns_replica.block_size, - (orte_std_cntr_t)orte_ns_replica.max_size, - (orte_std_cntr_t)orte_ns_replica.block_size))) { - ORTE_ERROR_LOG(rc); - return NULL; - } - orte_ns_replica.num_dts = 0; - - /* setup the thread lock */ - OBJ_CONSTRUCT(&orte_ns_replica.mutex, opal_mutex_t); - - /* Return the module */ - - initialized = true; - return &orte_ns_replica_module; - } else { - return NULL; - } -} - -int orte_ns_replica_module_init(void) -{ - int rc; - if (orte_ns_replica.isolate) { - return ORTE_SUCCESS; - } - - /* issue non-blocking receive for call_back function */ - rc = orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, ORTE_RML_TAG_NS, ORTE_RML_PERSISTENT, orte_ns_replica_recv, NULL); - if(rc < 0) { - ORTE_ERROR_LOG(rc); - return rc; - } - return ORTE_SUCCESS; -} - - -/* - * finalize routine - */ -int orte_ns_replica_finalize(void) 
-{ - orte_ns_replica_cell_tracker_t **cptr; - orte_ns_replica_jobid_tracker_t **jptr; - orte_ns_replica_tagitem_t **tag; - orte_ns_replica_dti_t **dti; - orte_std_cntr_t i; - - /* free all tracking storage, but only if this component was initialized */ - - if (initialized) { - cptr = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr; - for (i=0; i < (orte_ns_replica.cells)->size; i++) { - if (NULL != cptr[i]) { - OBJ_RELEASE(cptr[i]); - } - } - OBJ_RELEASE(orte_ns_replica.cells); - - jptr = (orte_ns_replica_jobid_tracker_t**)(orte_ns_replica.jobids)->addr; - for (i=0; i < (orte_ns_replica.jobids)->size; i++) { - if (NULL != jptr[i]) { - OBJ_RELEASE(jptr[i]); - } - } - OBJ_RELEASE(orte_ns_replica.jobids); - - tag = (orte_ns_replica_tagitem_t**)(orte_ns_replica.tags)->addr; - for (i=0; i < (orte_ns_replica.tags)->size; i++) { - if (NULL != tag[i]) OBJ_RELEASE(tag[i]); - } - OBJ_RELEASE(orte_ns_replica.tags); - - dti = (orte_ns_replica_dti_t**)(orte_ns_replica.dts)->addr; - for (i=0; i < (orte_ns_replica.dts)->size; i++) { - if (NULL != dti[i]) OBJ_RELEASE(dti[i]); - } - OBJ_RELEASE(orte_ns_replica.dts); - - initialized = false; - } - - /* All done */ - if (orte_ns_replica.isolate) { - return ORTE_SUCCESS; - } - - orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_NS); - return ORTE_SUCCESS; -} - - -/* - * handle message from proxies - * NOTE: The incoming buffer "buffer" is OBJ_RELEASED by the calling program. 
- * DO NOT RELEASE THIS BUFFER IN THIS CODE - */ - -void orte_ns_replica_recv(int status, orte_process_name_t* sender, - orte_buffer_t* buffer, orte_rml_tag_t tag, - void* cbdata) -{ - orte_buffer_t answer, error_answer; - orte_ns_cmd_flag_t command; - orte_cellid_t cell; - orte_jobid_t job; - orte_vpid_t startvpid, range; - char *tagname, *site, *resource; - orte_rml_tag_t oob_tag; - orte_data_type_t type; - orte_std_cntr_t count, nprocs; - orte_process_name_t *procs; - int rc=ORTE_SUCCESS, ret; - - count = 1; - if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &command, &count, ORTE_NS_CMD))) { - ORTE_ERROR_LOG(rc); - rc = ORTE_ERR_BAD_PARAM; - goto RETURN_ERROR; - } - - OBJ_CONSTRUCT(&answer, orte_buffer_t); - - switch (command) { - case ORTE_NS_CREATE_CELLID_CMD: - if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, &command, 1, ORTE_NS_CMD))) { - ORTE_ERROR_LOG(rc); - goto RETURN_ERROR; - } - - count = 1; - if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &site, &count, ORTE_STRING))) { - ORTE_ERROR_LOG(rc); - rc = ORTE_ERR_BAD_PARAM; - goto RETURN_ERROR; - } - - count = 1; - if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &resource, &count, ORTE_STRING))) { - ORTE_ERROR_LOG(rc); - rc = ORTE_ERR_BAD_PARAM; - goto RETURN_ERROR; - } - - rc = orte_ns_replica_create_cellid(&cell, site, resource); - - if (ORTE_SUCCESS != (ret = orte_dss.pack(&answer, &cell, 1, ORTE_CELLID))) { - ORTE_ERROR_LOG(ret); - goto RETURN_ERROR; - } - if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - goto RETURN_ERROR; - } - break; - - case ORTE_NS_GET_CELL_INFO_CMD: - if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, &command, 1, ORTE_NS_CMD))) { - ORTE_ERROR_LOG(rc); - goto RETURN_ERROR; - } - - count = 1; - if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &cell, &count, ORTE_CELLID))) { - ORTE_ERROR_LOG(rc); - rc = ORTE_ERR_BAD_PARAM; - goto RETURN_ERROR; - } - - site = NULL; - resource = NULL; - rc = orte_ns_replica_get_cell_info(cell, 
&site, &resource); - - if (ORTE_SUCCESS != (ret = orte_dss.pack(&answer, &site, 1, ORTE_STRING))) { - ORTE_ERROR_LOG(ret); - goto RETURN_ERROR; - } - - if (ORTE_SUCCESS != (ret = orte_dss.pack(&answer, &resource, 1, ORTE_STRING))) { - ORTE_ERROR_LOG(ret); - goto RETURN_ERROR; - } - - if (ORTE_SUCCESS != (ret = orte_dss.pack(&answer, &rc, 1, ORTE_INT))) { - ORTE_ERROR_LOG(ret); - goto RETURN_ERROR; - } - - if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - goto RETURN_ERROR; - } - break; - - case ORTE_NS_CREATE_JOBID_CMD: - if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&command, 1, ORTE_NS_CMD))) { - ORTE_ERROR_LOG(rc); - goto RETURN_ERROR; - } - - if (ORTE_SUCCESS != (rc = orte_ns_replica_create_jobid(&job))) { - ORTE_ERROR_LOG(rc); - goto RETURN_ERROR; - } - - if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&job, 1, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - goto RETURN_ERROR; - } - - if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - goto RETURN_ERROR; - } - break; - - case ORTE_NS_RESERVE_RANGE_CMD: - count = 1; - if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, (void*)&job, &count, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - goto RETURN_ERROR; - } - - count = 1; - if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, (void*)&range, &count, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - goto RETURN_ERROR; - } - - if (ORTE_SUCCESS != (rc = orte_ns_replica_reserve_range(job, range, &startvpid))) { - ORTE_ERROR_LOG(rc); - goto RETURN_ERROR; - } - - if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&command, 1, ORTE_NS_CMD))) { - ORTE_ERROR_LOG(rc); - goto RETURN_ERROR; - } - - if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&startvpid, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - goto RETURN_ERROR; - } - - if (0 > (rc = orte_rml.send_buffer(sender, &answer, tag, 0))) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - goto RETURN_ERROR; - } - break; - - case 
ORTE_NS_ASSIGN_OOB_TAG_CMD: - count = 1; - if (0 > orte_dss.unpack(buffer, &tagname, &count, ORTE_STRING)) { - rc = ORTE_ERR_UNPACK_FAILURE; - goto RETURN_ERROR; - } - - if (0 == strncmp(tagname, "NULL", 4)) { - if (ORTE_SUCCESS != (rc = orte_ns_replica_assign_rml_tag(&oob_tag, NULL))) { - goto RETURN_ERROR; - } - } else { - if (ORTE_SUCCESS != (rc = orte_ns_replica_assign_rml_tag(&oob_tag, tagname))) { - goto RETURN_ERROR; - } - } - - if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&command, 1, ORTE_NS_CMD))) { - goto RETURN_ERROR; - } - - if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&oob_tag, 1, ORTE_RML_TAG))) { - goto RETURN_ERROR; - } - - if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - goto RETURN_ERROR; - } - break; - - case ORTE_NS_DEFINE_DATA_TYPE_CMD: - count = 1; - if (0 > orte_dss.unpack(buffer, &tagname, &count, ORTE_STRING)) { - rc = ORTE_ERR_UNPACK_FAILURE; - goto RETURN_ERROR; - } - - if (ORTE_SUCCESS != (rc = orte_ns_replica_define_data_type(tagname, &type))) { - goto RETURN_ERROR; - } - - if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&command, 1, ORTE_NS_CMD))) { - goto RETURN_ERROR; - } - - if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&type, 1, ORTE_DATA_TYPE))) { - goto RETURN_ERROR; - } - - if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - goto RETURN_ERROR; - } - break; - - case ORTE_NS_CREATE_MY_NAME_CMD: - /* ignore this command */ - break; - - case ORTE_NS_GET_JOB_PEERS_CMD: - /* unpack the jobid */ - count = 1; - if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &job, &count, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - goto RETURN_ERROR; - } - /* process the request */ - if (ORTE_SUCCESS != (rc = orte_ns_replica_get_job_peers(&procs, &nprocs, job))) { - ORTE_ERROR_LOG(rc); - goto RETURN_ERROR; - } - - /* pack the answer */ - if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, &nprocs, 1, ORTE_STD_CNTR))) { - 
ORTE_ERROR_LOG(rc); - goto RETURN_ERROR; - } - - if (nprocs > 0) { - if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, &procs, nprocs, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - goto RETURN_ERROR; - } - } - break; - - case ORTE_NS_DUMP_CELLS_CMD: - if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_cells_fn(&answer))) { - ORTE_ERROR_LOG(rc); - goto RETURN_ERROR; - } - if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - goto RETURN_ERROR; - } - break; - - case ORTE_NS_DUMP_JOBIDS_CMD: - if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_jobs_fn(&answer))) { - ORTE_ERROR_LOG(rc); - goto RETURN_ERROR; - } - if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - goto RETURN_ERROR; - } - break; - - case ORTE_NS_DUMP_TAGS_CMD: - if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_tags_fn(&answer))) { - ORTE_ERROR_LOG(rc); - goto RETURN_ERROR; - } - if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - goto RETURN_ERROR; - } - break; - - case ORTE_NS_DUMP_DATATYPES_CMD: - if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_datatypes_fn(&answer))) { - ORTE_ERROR_LOG(rc); - goto RETURN_ERROR; - } - if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - goto RETURN_ERROR; - } - break; - - default: - goto RETURN_ERROR; - } - goto CLEANUP; - -RETURN_ERROR: - OBJ_CONSTRUCT(&error_answer, orte_buffer_t); - orte_dss.pack(&error_answer, (void*)&command, 1, ORTE_NS_CMD); - orte_dss.pack(&error_answer, (void*)&rc, 1, ORTE_INT32); - orte_rml.send_buffer(sender, &error_answer, tag, 0); - OBJ_DESTRUCT(&error_answer); - -CLEANUP: - /* cleanup */ - OBJ_DESTRUCT(&answer); -} - diff --git a/orte/mca/odls/bproc/odls_bproc.c b/orte/mca/odls/bproc/odls_bproc.c index 1e3feb40dc..f029b013f2 100644 --- a/orte/mca/odls/bproc/odls_bproc.c +++ b/orte/mca/odls/bproc/odls_bproc.c @@ -119,7 +119,7 @@ static char * return NULL; } - rc = 
orte_ns_base_convert_jobid_to_string(&job, jobid); + rc = orte_ns.convert_jobid_to_string(&job, jobid); if(ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); return NULL; @@ -634,7 +634,7 @@ orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data, char **base_env if(ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); } - rc = mca_oob_send_packed_nb(ORTE_RML_NAME_SEED, ack, ORTE_RML_TAG_BPROC, 0, + rc = mca_oob_send_packed_nb(ORTE_PROC_MY_HNP, ack, ORTE_RML_TAG_BPROC, 0, odls_bproc_send_cb, NULL); if (0 > rc) { ORTE_ERROR_LOG(rc); @@ -666,7 +666,7 @@ int orte_odls_bproc_kill_local_procs(orte_jobid_t job, bool set_state) * @param signal The signal to send * @retval ORTE_SUCCESS */ -int orte_odls_bproc_signal_local_procs(orte_process_name_t* proc, int32_t signal) +int orte_odls_bproc_signal_local_procs(const orte_process_name_t* proc, int32_t signal) { orte_iof.iof_flush(); return ORTE_SUCCESS; diff --git a/orte/mca/odls/default/odls_default_module.c b/orte/mca/odls/default/odls_default_module.c index 5724e6714b..d6526ea0d1 100644 --- a/orte/mca/odls/default/odls_default_module.c +++ b/orte/mca/odls/default/odls_default_module.c @@ -372,6 +372,8 @@ static void odls_default_wait_local_proc(pid_t pid, int status, void* cbdata) struct stat buf; int rc; + opal_output(orte_odls_globals.output, "odls: child process terminated"); + /* since we are going to be working with the global list of * children, we need to protect that list from modification * by other threads. This will also be used to protect us @@ -384,6 +386,8 @@ static void odls_default_wait_local_proc(pid_t pid, int status, void* cbdata) item != opal_list_get_end(&orte_odls_default.children); item = opal_list_get_next(item)) { child = (orte_odls_child_t*)item; + opal_output(orte_odls_globals.output, "odls: checking child [%ld,%ld,%ld] alive %s", + ORTE_NAME_ARGS(child->name), (child->alive ? 
"true" : "dead")); if (child->alive && pid == child->pid) { /* found it */ goto GOTCHILD; } @@ -398,8 +402,14 @@ static void odls_default_wait_local_proc(pid_t pid, int status, void* cbdata) return; GOTCHILD: + opal_output(orte_odls_globals.output, "odls: flushing output for [%ld,%ld,%ld]", + ORTE_NAME_ARGS(child->name)); + orte_iof.iof_flush(); + opal_output(orte_odls_globals.output, "odls: output for [%ld,%ld,%ld] flushed", + ORTE_NAME_ARGS(child->name)); + /* determine the state of this process */ aborted = false; if(WIFEXITED(status)) { @@ -426,6 +436,9 @@ GOTCHILD: job, vpid, "abort", NULL ); free(job); free(vpid); + opal_output(orte_odls_globals.output, "odls: stat'ing file %s for [%ld,%ld,%ld]", + abort_file, ORTE_NAME_ARGS(child->name)); + if (0 == stat(abort_file, &buf)) { /* the abort file must exist - there is nothing in it we need. It's * meer existence indicates that an abnormal termination occurred @@ -435,14 +448,14 @@ GOTCHILD: aborted = true; free(abort_file); } else { - opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] died naturally", + opal_output(orte_odls_globals.output, "odls: child process [%ld,%ld,%ld] terminated normally", ORTE_NAME_ARGS(child->name)); } } else { /* the process was terminated with a signal! 
That's definitely * abnormal, so indicate that condition */ - opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] died by signal", + opal_output(orte_odls_globals.output, "odls: child process [%ld,%ld,%ld] terminated with signal", ORTE_NAME_ARGS(child->name)); aborted = true; } diff --git a/orte/mca/oob/base/base.h b/orte/mca/oob/base/base.h index 2adfcd82b4..8a1ac8468a 100644 --- a/orte/mca/oob/base/base.h +++ b/orte/mca/oob/base/base.h @@ -48,13 +48,6 @@ extern "C" { */ ORTE_DECLSPEC extern bool orte_oob_base_timing; -/* - * Well known address - */ - -ORTE_DECLSPEC extern orte_process_name_t mca_oob_name_any; -ORTE_DECLSPEC extern orte_process_name_t mca_oob_name_seed; - /* * OOB API */ @@ -187,7 +180,7 @@ ORTE_DECLSPEC int mca_oob_send_packed( /** * Similiar to unix readv(2) * -* @param peer (IN/OUT) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive. In the +* @param peer (IN/OUT) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive. In the * case of a wildcard receive, will be modified to return the matched peer name. * @param msg (IN) Array of iovecs describing user buffers and lengths. * @param count (IN) Number of elements in iovec array. @@ -223,7 +216,7 @@ ORTE_DECLSPEC int mca_oob_recv( /** * Similiar to unix read(2) * -* @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive. +* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive. * @param buf (OUT) Array of iovecs describing user buffers and lengths. * @param tag (IN/OUT) User defined tag for matching send/recv. * @return OMPI error code (<0) on error or number of bytes actually received. @@ -338,7 +331,7 @@ ORTE_DECLSPEC int mca_oob_send_packed_nb( /** * Non-blocking version of mca_oob_recv(). * -* @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive. +* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive. 
* @param msg (IN) Array of iovecs describing user buffers and lengths. * @param count (IN) Number of elements in iovec array. * @param tag (IN) User defined tag for matching send/recv. @@ -363,7 +356,7 @@ ORTE_DECLSPEC int mca_oob_recv_nb( /** * Routine to cancel pending non-blocking recvs. * -* @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive. +* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive. * @param tag (IN) User defined tag for matching send/recv. * @return OMPI error code (<0) on error or number of bytes actually received. */ @@ -375,7 +368,7 @@ ORTE_DECLSPEC int mca_oob_recv_cancel( /** * Non-blocking version of mca_oob_recv_packed(). * -* @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive. +* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive. * @param buffer (IN) Array of iovecs describing user buffers and lengths. * @param count (IN) Number of elements in iovec array. * @param tag (IN) User defined tag for matching send/recv. diff --git a/orte/mca/oob/base/oob_base_init.c b/orte/mca/oob/base/oob_base_init.c index c48db9d3ab..b717d37edb 100644 --- a/orte/mca/oob/base/oob_base_init.c +++ b/orte/mca/oob/base/oob_base_init.c @@ -48,9 +48,6 @@ OBJ_CLASS_INSTANCE( NULL ); -orte_process_name_t mca_oob_name_seed = { 0, 0, 0 }; -orte_process_name_t mca_oob_name_any = { ORTE_CELLID_MAX, ORTE_JOBID_MAX, ORTE_VPID_MAX }; - /** * Parse contact info string into process name and list of uri strings. */ diff --git a/orte/mca/oob/base/oob_base_recv.c b/orte/mca/oob/base/oob_base_recv.c index fa8ace5aa3..6066c34386 100644 --- a/orte/mca/oob/base/oob_base_recv.c +++ b/orte/mca/oob/base/oob_base_recv.c @@ -34,7 +34,7 @@ /* * Similiar to unix recv(2) * -* @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive. 
+* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive. * @param msg (IN) Array of iovecs describing user buffers and lengths. * @param types (IN) Parallel array to iovecs describing data type of each iovec element. * @param count (IN) Number of elements in iovec array. @@ -51,7 +51,7 @@ int mca_oob_recv(orte_process_name_t* peer, struct iovec *msg, int count, int ta /* * Similiar to unix recv(2) * -* @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive. +* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive. * @param buffer (OUT) Buffer that the OOB creates to recv this message... * @param tag (IN) User defined tag for matching send/recv. * iovec array without removing the message from the queue. diff --git a/orte/mca/oob/base/oob_base_recv_nb.c b/orte/mca/oob/base/oob_base_recv_nb.c index 3fa5456218..b898cfbddf 100644 --- a/orte/mca/oob/base/oob_base_recv_nb.c +++ b/orte/mca/oob/base/oob_base_recv_nb.c @@ -53,7 +53,7 @@ static void mca_oob_recv_callback( /* * Non-blocking version of mca_oob_recv_nb(). * - * @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive. + * @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive. * @param msg (IN) Array of iovecs describing user buffers and lengths. * @param count (IN) Number of elements in iovec array. * @param flags (IN) May be MCA_OOB_PEEK to return up to size bytes of msg w/out removing it from the queue, @@ -71,7 +71,7 @@ int mca_oob_recv_nb(orte_process_name_t* peer, struct iovec* msg, int count, int /* * Cancel non-blocking recv.j * - * @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive. + * @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive. * @param tag (IN) User defined tag for message matching. * @return OMPI success or error code (<0) on error. 
*/ @@ -84,7 +84,7 @@ int mca_oob_recv_cancel(orte_process_name_t* peer, int tag) /** * Non-blocking version of mca_oob_recv_packed(). * -* @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive. +* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive. * @param buffer (IN) Array of iovecs describing user buffers and lengths. * @param count (IN) Number of elements in iovec array. * @param tag (IN) User defined tag for matching send/recv. diff --git a/orte/mca/oob/base/oob_base_xcast.c b/orte/mca/oob/base/oob_base_xcast.c index a8475b4988..01ec3f5b5f 100644 --- a/orte/mca/oob/base/oob_base_xcast.c +++ b/orte/mca/oob/base/oob_base_xcast.c @@ -149,7 +149,7 @@ int mca_oob_xcast( orte_gpr_notify_message_t *msg; OBJ_CONSTRUCT(&rbuf, orte_buffer_t); - rc = mca_oob_recv_packed(MCA_OOB_NAME_ANY, &rbuf, tag); + rc = mca_oob_recv_packed(ORTE_NAME_WILDCARD, &rbuf, tag); if(rc < 0) { OBJ_DESTRUCT(&rbuf); return rc; diff --git a/orte/mca/oob/oob.h b/orte/mca/oob/oob.h index f3abd671b5..c61ddbe278 100644 --- a/orte/mca/oob/oob.h +++ b/orte/mca/oob/oob.h @@ -110,7 +110,7 @@ typedef int (*mca_oob_base_module_send_fn_t)( /** * Implementation of mca_oob_recv(). * -* @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive. +* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive. * @param msg (IN) Array of iovecs describing user buffers and lengths. * @param types (IN) Parallel array to iovecs describing data type of each iovec element. * @param count (IN) Number of elements in iovec array. @@ -153,7 +153,7 @@ typedef int (*mca_oob_base_module_send_nb_fn_t)( /** * Implementation of mca_oob_recv_nb(). * -* @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive. +* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive. * @param msg (IN) Array of iovecs describing user buffers and lengths. 
* @param count (IN) Number of elements in iovec array. * @param tag (IN) User defined tag for matching send/recv. @@ -175,7 +175,7 @@ typedef int (*mca_oob_base_module_recv_nb_fn_t)( /** * Implementation of mca_oob_recv_cancel(). * -* @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive. +* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive. * @param tag (IN) User defined tag for matching send/recv. * @return OMPI error code (<0) on error or number of bytes actually received. */ diff --git a/orte/mca/oob/oob_types.h b/orte/mca/oob/oob_types.h index f0e6d85e76..ae1c62135b 100644 --- a/orte/mca/oob/oob_types.h +++ b/orte/mca/oob/oob_types.h @@ -25,24 +25,5 @@ #include "orte_config.h" #include "orte/orte_constants.h" -#include <limits.h> - -#include "orte/mca/rml/rml_types.h" -/* - * Other constants - */ - -/** - * The wildcard for receives from any peer. - */ -#define MCA_OOB_NAME_ANY &mca_oob_name_any -/** - * Process name of self - */ -#define MCA_OOB_NAME_SELF orte_process_info.my_name -/** - * Process name of seed - */ -#define MCA_OOB_NAME_SEED &mca_oob_name_seed #endif /* MCA_OOB_TYPES_H */ diff --git a/orte/mca/oob/tcp/oob_tcp.c b/orte/mca/oob/tcp/oob_tcp.c index f65bde51f9..4f9e09433a 100644 --- a/orte/mca/oob/tcp/oob_tcp.c +++ b/orte/mca/oob/tcp/oob_tcp.c @@ -690,20 +690,18 @@ static void mca_oob_tcp_recv_connect(int sd, mca_oob_tcp_hdr_t* hdr) } } - /* check for wildcard name - if this is true - we allocate a name from the name server + /* check for invalid name - if this is true - we allocate a name from the name server * and return to the peer */ - cmpval = orte_ns.compare(ORTE_NS_CMP_ALL, &hdr->msg_src, MCA_OOB_NAME_ANY); - if (cmpval == 0) { - if (ORTE_SUCCESS != orte_ns.create_jobid(&hdr->msg_src.jobid)) { + cmpval = orte_ns.compare_fields(ORTE_NS_CMP_ALL, &hdr->msg_src, ORTE_NAME_INVALID); + if (cmpval == ORTE_EQUAL) { + if (ORTE_SUCCESS != orte_ns.create_jobid(&hdr->msg_src.jobid, 
NULL)) { return; } if (ORTE_SUCCESS != orte_ns.reserve_range(hdr->msg_src.jobid, 1, &hdr->msg_src.vpid)) { return; } - if (ORTE_SUCCESS != orte_ns.assign_cellid_to_process(&hdr->msg_src)) { - return; - } + hdr->msg_src.cellid = ORTE_PROC_MY_NAME->cellid; } /* lookup the corresponding process */ @@ -1049,12 +1047,8 @@ int mca_oob_tcp_init(void) #endif /* get my jobid */ - if (ORTE_SUCCESS != (rc = orte_ns.get_jobid(&jobid, - orte_process_info.my_name))) { - ORTE_ERROR_LOG(rc); - return rc; - } - + jobid = ORTE_PROC_MY_NAME->jobid; + /* create a listen socket */ if (OOB_TCP_EVENT == mca_oob_tcp_component.tcp_listen_type) { if(mca_oob_tcp_create_listen() != ORTE_SUCCESS) { @@ -1286,12 +1280,17 @@ int mca_oob_tcp_fini(void) * Note that the definition of < or > is somewhat arbitrary - * just needs to be consistently applied to maintain an ordering * when process names are used as indices. +* +* Currently, this function is ONLY used in one place - in oob_tcp_send.c to +* determine if the recipient of the message-to-be-sent is ourselves. Hence, +* this comparison is okay to be LITERAL and can/should use the ns.compare_fields +* function */ int mca_oob_tcp_process_name_compare(const orte_process_name_t* n1, const orte_process_name_t* n2) { - return orte_ns.compare(ORTE_NS_CMP_ALL, n1, n2); + return orte_ns.compare_fields(ORTE_NS_CMP_ALL, n1, n2); } diff --git a/orte/mca/oob/tcp/oob_tcp.h b/orte/mca/oob/tcp/oob_tcp.h index 7900f982df..add933b1ad 100644 --- a/orte/mca/oob/tcp/oob_tcp.h +++ b/orte/mca/oob/tcp/oob_tcp.h @@ -119,7 +119,7 @@ int mca_oob_tcp_send( /** * Similiar to unix readv(2) * - * @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive. + * @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive. * @param msg (IN) Array of iovecs describing user buffers and lengths. * @param count (IN) Number of elements in iovec array. * @param tag (IN) User defined tag for matching send/recv. 
@@ -166,7 +166,7 @@ int mca_oob_tcp_send_nb( /** * Non-blocking version of mca_oob_recv(). * - * @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive. + * @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive. * @param msg (IN) Array of iovecs describing user buffers and lengths. * @param count (IN) Number of elements in iovec array. * @param tag (IN) User defined tag for matching send/recv. @@ -188,7 +188,7 @@ int mca_oob_tcp_recv_nb( /** * Cancel non-blocking receive. * - * @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive. + * @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive. * @param tag (IN) User defined tag for matching send/recv. * @return OMPI error code (<0) on error or number of bytes actually received. */ diff --git a/orte/mca/oob/tcp/oob_tcp_addr.c b/orte/mca/oob/tcp/oob_tcp_addr.c index 6857bea799..8a2061f609 100644 --- a/orte/mca/oob/tcp/oob_tcp_addr.c +++ b/orte/mca/oob/tcp/oob_tcp_addr.c @@ -32,6 +32,11 @@ #include <string.h> #include "orte/orte_constants.h" #include "opal/util/if.h" + +#include "orte/mca/ns/ns_types.h" +#include "orte/util/proc_info.h" +#include "orte/dss/dss.h" + #include "oob_tcp.h" #include "oob_tcp_addr.h" @@ -65,7 +70,7 @@ int mca_oob_tcp_addr_pack(orte_buffer_t* buffer) int i; int rc; - rc = orte_dss.pack(buffer, orte_process_info.my_name, 1, ORTE_NAME); + rc = orte_dss.pack(buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME); if(rc != ORTE_SUCCESS) return rc; diff --git a/orte/mca/oob/tcp/oob_tcp_msg.c b/orte/mca/oob/tcp/oob_tcp_msg.c index c02d403e68..62352c8eea 100644 --- a/orte/mca/oob/tcp/oob_tcp_msg.c +++ b/orte/mca/oob/tcp/oob_tcp_msg.c @@ -404,7 +404,9 @@ void mca_oob_tcp_msg_recv_complete(mca_oob_tcp_msg_t* msg, mca_oob_tcp_peer_t* p } /** - * Process an ident message. + * Process an ident message. 
In this case, we insist that the two process names + * exactly match - hence, we use the orte_ns.compare_fields function, which + * checks each field in a literal manner (i.e., no wildcards). */ static void mca_oob_tcp_msg_ident(mca_oob_tcp_msg_t* msg, mca_oob_tcp_peer_t* peer) @@ -412,7 +414,7 @@ static void mca_oob_tcp_msg_ident(mca_oob_tcp_msg_t* msg, mca_oob_tcp_peer_t* pe orte_process_name_t src = msg->msg_hdr.msg_src; OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock); - if (orte_ns.compare(ORTE_NS_CMP_ALL, &peer->peer_name, &src) != 0) { + if (orte_ns.compare_fields(ORTE_NS_CMP_ALL, &peer->peer_name, &src) != ORTE_EQUAL) { orte_hash_table_remove_proc(&mca_oob_tcp_component.tcp_peers, &peer->peer_name); peer->peer_name = src; orte_hash_table_set_proc(&mca_oob_tcp_component.tcp_peers, &peer->peer_name, peer); @@ -558,9 +560,7 @@ mca_oob_tcp_msg_t* mca_oob_tcp_msg_match_recv(orte_process_name_t* name, int tag msg != (mca_oob_tcp_msg_t*) opal_list_get_end(&mca_oob_tcp_component.tcp_msg_recv); msg = (mca_oob_tcp_msg_t*) opal_list_get_next(msg)) { - int cmpval1 = orte_ns.compare(ORTE_NS_CMP_ALL, name, MCA_OOB_NAME_ANY); - int cmpval2 = orte_ns.compare(ORTE_NS_CMP_ALL, name, &msg->msg_peer); - if((0 == cmpval1) || (0 == cmpval2)) { + if(ORTE_EQUAL == orte_dss.compare(name, &msg->msg_peer, ORTE_NAME)) { if (tag == msg->msg_hdr.msg_tag) { return msg; } @@ -585,10 +585,7 @@ mca_oob_tcp_msg_t* mca_oob_tcp_msg_match_post(orte_process_name_t* name, int tag msg != (mca_oob_tcp_msg_t*) opal_list_get_end(&mca_oob_tcp_component.tcp_msg_post); msg = (mca_oob_tcp_msg_t*) opal_list_get_next(msg)) { - int cmpval1 = orte_ns.compare(ORTE_NS_CMP_ALL, &msg->msg_peer, MCA_OOB_NAME_ANY); - int cmpval2 = orte_ns.compare(ORTE_NS_CMP_ALL, name, &msg->msg_peer); - - if((0 == cmpval1) || (0 == cmpval2)) { + if(ORTE_EQUAL == orte_dss.compare(name, &msg->msg_peer, ORTE_NAME)) { if (msg->msg_hdr.msg_tag == tag) { if((msg->msg_flags & MCA_OOB_PERSISTENT) == 0) { 
opal_list_remove_item(&mca_oob_tcp_component.tcp_msg_post, &msg->super.super); diff --git a/orte/mca/oob/tcp/oob_tcp_peer.c b/orte/mca/oob/tcp/oob_tcp_peer.c index 0fe1be9f73..34e88ff62e 100644 --- a/orte/mca/oob/tcp/oob_tcp_peer.c +++ b/orte/mca/oob/tcp/oob_tcp_peer.c @@ -503,7 +503,7 @@ void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t* peer) } /* if we lose the connection to the seed - abort */ - if(memcmp(&peer->peer_name,&mca_oob_name_seed,sizeof(mca_oob_name_seed)) == 0) { + if(memcmp(&peer->peer_name,ORTE_PROC_MY_HNP,sizeof(orte_process_name_t)) == 0) { /* If we are not already inside orte_finalize, then call abort */ if (ORTE_UNIVERSE_STATE_FINALIZE > orte_universe_info.state) { /* Should free the peer lock before we abort so we don't @@ -554,7 +554,7 @@ static int mca_oob_tcp_peer_send_connect_ack(mca_oob_tcp_peer_t* peer) mca_oob_tcp_hdr_t hdr; memset(&hdr,0,sizeof(hdr)); if (NULL == orte_process_info.my_name) { /* my name isn't defined yet */ - hdr.msg_src = *MCA_OOB_NAME_ANY; + hdr.msg_src = *ORTE_NAME_INVALID; } else { hdr.msg_src = *(orte_process_info.my_name); } @@ -597,11 +597,13 @@ static int mca_oob_tcp_peer_recv_connect_ack(mca_oob_tcp_peer_t* peer) return ORTE_ERR_UNREACH; } - /* if we have a wildcard name - use the name returned by the peer */ + /* if we have an invalid name or do not have one assigned at all - use the name returned by the peer. 
+ * This needs to be a LITERAL comparison - we do NOT want wildcard values to return EQUAL + */ if(orte_process_info.my_name == NULL) { orte_ns.create_process_name(&orte_process_info.my_name, hdr.msg_dst.cellid, hdr.msg_dst.jobid, hdr.msg_dst.vpid); - } else if(orte_ns.compare(ORTE_NS_CMP_ALL, orte_process_info.my_name, &mca_oob_name_any) == 0) { + } else if (orte_ns.compare_fields(ORTE_NS_CMP_ALL, orte_process_info.my_name, ORTE_NAME_INVALID) == ORTE_EQUAL) { *orte_process_info.my_name = hdr.msg_dst; } @@ -876,18 +878,24 @@ static void mca_oob_tcp_peer_dump(mca_oob_tcp_peer_t* peer, const char* msg) /* - * Accept incoming connection - if not already connected. + * Accept incoming connection - if not already connected. We compare the name of the + * peer to our own name using the ns.compare_fields function as we want this to be + * a LITERAL comparison - i.e., there is no occasion when the peer's name should + * be a wildcard value. + * + * To avoid competing reciprocal connection attempts, we only accept connections from + * processes whose names are "greater" than our own. 
*/ bool mca_oob_tcp_peer_accept(mca_oob_tcp_peer_t* peer, int sd) { int cmpval; OPAL_THREAD_LOCK(&peer->peer_lock); - cmpval = orte_ns.compare(ORTE_NS_CMP_ALL, &peer->peer_name, orte_process_info.my_name); + cmpval = orte_ns.compare_fields(ORTE_NS_CMP_ALL, &peer->peer_name, orte_process_info.my_name); if ((peer->peer_state == MCA_OOB_TCP_CLOSED) || (peer->peer_state == MCA_OOB_TCP_RESOLVE) || (peer->peer_state != MCA_OOB_TCP_CONNECTED && - cmpval < 0)) { + cmpval == ORTE_VALUE1_GREATER)) { if(peer->peer_state != MCA_OOB_TCP_CLOSED) { mca_oob_tcp_peer_close(peer); diff --git a/orte/mca/oob/tcp/oob_tcp_ping.c b/orte/mca/oob/tcp/oob_tcp_ping.c index 6a89a8d4a4..48a79f1bfa 100644 --- a/orte/mca/oob/tcp/oob_tcp_ping.c +++ b/orte/mca/oob/tcp/oob_tcp_ping.c @@ -47,7 +47,10 @@ #include <signal.h> #endif #include "opal/event/event.h" + #include "orte/mca/ns/ns_types.h" +#include "orte/util/proc_info.h" + #include "orte/mca/oob/tcp/oob_tcp.h" /* @@ -148,7 +151,7 @@ int mca_oob_tcp_ping( if(orte_process_info.my_name != NULL) { hdr.msg_src = *orte_process_info.my_name; } else { - hdr.msg_src = mca_oob_name_any; + hdr.msg_src = *ORTE_NAME_INVALID; } hdr.msg_dst = *name; hdr.msg_type = MCA_OOB_TCP_PROBE; diff --git a/orte/mca/oob/tcp/oob_tcp_recv.c b/orte/mca/oob/tcp/oob_tcp_recv.c index 382dd76c37..7d6c31e281 100644 --- a/orte/mca/oob/tcp/oob_tcp_recv.c +++ b/orte/mca/oob/tcp/oob_tcp_recv.c @@ -24,7 +24,7 @@ /* * Similiar to unix readv(2) * - * @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive. + * @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive. * @param msg (IN) Array of iovecs describing user buffers and lengths. * @param types (IN) Parallel array to iovecs describing data type of each iovec element. * @param count (IN) Number of elements in iovec array. 
@@ -114,7 +114,7 @@ int mca_oob_tcp_recv( msg->msg_hdr.msg_type = MCA_OOB_TCP_DATA; msg->msg_hdr.msg_src = *peer; if (NULL == orte_process_info.my_name) { - msg->msg_hdr.msg_dst = *MCA_OOB_NAME_ANY; + msg->msg_hdr.msg_dst = *ORTE_NAME_INVALID; } else { msg->msg_hdr.msg_dst = *orte_process_info.my_name; } @@ -202,7 +202,7 @@ static void mca_oob_tcp_msg_matched(mca_oob_tcp_msg_t* msg, mca_oob_tcp_msg_t* m /* * Non-blocking version of mca_oob_recv(). * - * @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive. + * @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive. * @param msg (IN) Array of iovecs describing user buffers and lengths. * @param count (IN) Number of elements in iovec array. * @param tag (IN) User supplied tag for matching send/recv. @@ -242,7 +242,11 @@ int mca_oob_tcp_recv_nb( } /* fill in the header */ - msg->msg_hdr.msg_src = *orte_process_info.my_name; + if (NULL == orte_process_info.my_name) { + msg->msg_hdr.msg_src = *ORTE_NAME_INVALID; + } else { + msg->msg_hdr.msg_src = *orte_process_info.my_name; + } msg->msg_hdr.msg_dst = *peer; msg->msg_hdr.msg_size = size; msg->msg_hdr.msg_tag = tag; @@ -286,7 +290,7 @@ int mca_oob_tcp_recv_nb( /* * Cancel non-blocking recv. * - * @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive. + * @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive. * @param tag (IN) User supplied tag for matching send/recv. * @return OMPI error code (<0) on error or number of bytes actually received. 
*/ @@ -295,7 +299,7 @@ int mca_oob_tcp_recv_cancel( orte_process_name_t* name, int tag) { - int matched = 0, cmpval1, cmpval2; + int matched = 0; opal_list_item_t *item, *next; /* wait for any previously matched messages to be processed */ @@ -317,9 +321,7 @@ int mca_oob_tcp_recv_cancel( mca_oob_tcp_msg_t* msg = (mca_oob_tcp_msg_t*)item; next = opal_list_get_next(item); - cmpval1 = orte_ns.compare(ORTE_NS_CMP_ALL, name, MCA_OOB_NAME_ANY); - cmpval2 = orte_ns.compare(ORTE_NS_CMP_ALL, &msg->msg_peer, name); - if ((0 == cmpval1) || (0 == cmpval2)) { + if (ORTE_EQUAL == orte_dss.compare(name, &msg->msg_peer, ORTE_NAME)) { if (msg->msg_hdr.msg_tag == tag) { opal_list_remove_item(&mca_oob_tcp_component.tcp_msg_post, &msg->super.super); MCA_OOB_TCP_MSG_RETURN(msg); diff --git a/orte/mca/oob/tcp/oob_tcp_send.c b/orte/mca/oob/tcp/oob_tcp_send.c index 0b57f9c93e..6e11513668 100644 --- a/orte/mca/oob/tcp/oob_tcp_send.c +++ b/orte/mca/oob/tcp/oob_tcp_send.c @@ -18,6 +18,7 @@ #include "orte_config.h" #include "orte/mca/ns/ns_types.h" +#include "orte/util/proc_info.h" #include "orte/mca/oob/tcp/oob_tcp.h" @@ -119,7 +120,7 @@ int mca_oob_tcp_send( msg->msg_hdr.msg_size = size; msg->msg_hdr.msg_tag = tag; if (NULL == orte_process_info.my_name) { - msg->msg_hdr.msg_src = *MCA_OOB_NAME_ANY; + msg->msg_hdr.msg_src = *ORTE_NAME_INVALID; } else { msg->msg_hdr.msg_src = *orte_process_info.my_name; } @@ -144,7 +145,7 @@ int mca_oob_tcp_send( msg->msg_peer = peer->peer_name; if (NULL != name && NULL != orte_process_info.my_name && - 0 == mca_oob_tcp_process_name_compare(name, orte_process_info.my_name)) { /* local delivery */ + ORTE_EQUAL == mca_oob_tcp_process_name_compare(name, orte_process_info.my_name)) { /* local delivery */ return mca_oob_tcp_send_self(peer,msg,iov,count); } @@ -206,7 +207,11 @@ int mca_oob_tcp_send_nb( msg->msg_hdr.msg_type = MCA_OOB_TCP_DATA; msg->msg_hdr.msg_size = size; msg->msg_hdr.msg_tag = tag; - msg->msg_hdr.msg_src = *orte_process_info.my_name; + if (NULL 
== orte_process_info.my_name) { + msg->msg_hdr.msg_src = *ORTE_NAME_INVALID; + } else { + msg->msg_hdr.msg_src = *orte_process_info.my_name; + } msg->msg_hdr.msg_dst = *name; /* create one additional iovect that will hold the size of the message */ @@ -227,7 +232,7 @@ int mca_oob_tcp_send_nb( msg->msg_complete = false; msg->msg_peer = peer->peer_name; - if (0 == mca_oob_tcp_process_name_compare(name, orte_process_info.my_name)) { /* local delivery */ + if (ORTE_EQUAL == mca_oob_tcp_process_name_compare(name, orte_process_info.my_name)) { /* local delivery */ return mca_oob_tcp_send_self(peer,msg,iov,count); } diff --git a/orte/mca/pls/base/pls_base_dmn_registry_fns.c b/orte/mca/pls/base/pls_base_dmn_registry_fns.c index 44c75a5150..829ccd9610 100644 --- a/orte/mca/pls/base/pls_base_dmn_registry_fns.c +++ b/orte/mca/pls/base/pls_base_dmn_registry_fns.c @@ -28,6 +28,7 @@ #include "orte/mca/ns/ns.h" #include "orte/mca/gpr/gpr.h" #include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/rmgr/rmgr.h" #include "orte/mca/pls/base/pls_private.h" @@ -128,10 +129,7 @@ CLEANUP: return rc; } -/* - * Retrieve a list of the active daemons for a job - */ -int orte_pls_base_get_active_daemons(opal_list_t *daemons, orte_jobid_t job) +static int get_daemons(opal_list_t *daemons, orte_jobid_t job) { orte_gpr_value_t **values; orte_gpr_keyval_t *kv; @@ -149,7 +147,7 @@ int orte_pls_base_get_active_daemons(opal_list_t *daemons, orte_jobid_t job) orte_pls_daemon_info_t *dmn; bool found_name, found_node, found_cell; int rc; - + /* setup the key */ if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&jobid_string, job))) { ORTE_ERROR_LOG(rc); @@ -172,11 +170,11 @@ int orte_pls_base_get_active_daemons(opal_list_t *daemons, orte_jobid_t job) /* loop through the answers and construct the list */ for (i=0; i < cnt; i++) { /* for systems such as bproc, the node segment holds containers - * for nodes that we may not have launched upon. 
Each container - * will send us back a value object, so we have to ensure here - * that we only create daemon objects on the list for those nodes - * that DO provide a valid object - */ + * for nodes that we may not have launched upon. Each container + * will send us back a value object, so we have to ensure here + * that we only create daemon objects on the list for those nodes + * that DO provide a valid object + */ found_name = found_node = found_cell = false; for (j=0; j < values[i]->cnt; j++) { kv = values[i]->keyvals[j]; @@ -204,10 +202,10 @@ int orte_pls_base_get_active_daemons(opal_list_t *daemons, orte_jobid_t job) found_cell = true; continue; } - } + } /* if we found everything, then this is a valid entry - create - * it and add it to the list - */ + * it and add it to the list + */ if (found_name && found_node && found_cell) { dmn = OBJ_NEW(orte_pls_daemon_info_t); if (NULL == dmn) { @@ -230,14 +228,59 @@ int orte_pls_base_get_active_daemons(opal_list_t *daemons, orte_jobid_t job) } OBJ_RELEASE(values[i]); } - + CLEANUP: for (i=0; i < cnt; i++) { if (NULL != values[i]) OBJ_RELEASE(values[i]); } if (NULL != values) free(values); free(keys[0]); + + return rc; +} +/* + * Retrieve a list of the active daemons for a job + */ +int orte_pls_base_get_active_daemons(opal_list_t *daemons, orte_jobid_t job, opal_list_t *attrs) +{ + orte_jobid_t *jobs; + orte_std_cntr_t njobs, i; + bool allocated; + int rc; + + if (NULL != orte_rmgr.find_attribute(attrs, ORTE_NS_INCLUDE_DESCENDANTS)) { + /* need to include all descendants in list */ + if (ORTE_SUCCESS != (rc = orte_ns.get_job_descendants(&jobs, &njobs, job))) { + ORTE_ERROR_LOG(rc); + return rc; + } + allocated = true; + } else if (NULL != orte_rmgr.find_attribute(attrs, ORTE_NS_INCLUDE_CHILDREN)) { + /* just include the direct children of the job */ + if (ORTE_SUCCESS != (rc = orte_ns.get_job_children(&jobs, &njobs, job))) { + ORTE_ERROR_LOG(rc); + return rc; + } + allocated = true; + } else { + /* just want 
daemons for this one job */ + jobs = &job; + njobs = 1; + allocated = false; + } + + /* loop through all the jobs and get their info */ + for (i=0; i < njobs; i++) { + if (ORTE_SUCCESS != (rc = get_daemons(daemons, jobs[i]))) { + ORTE_ERROR_LOG(rc); + goto CLEANUP; + } + } + +CLEANUP: + if (allocated) free(jobs); + return rc; } @@ -246,19 +289,9 @@ CLEANUP: */ int orte_pls_base_remove_daemon(orte_pls_daemon_info_t *info) { - opal_list_t daemons; - int rc; - - OBJ_CONSTRUCT(&daemons, opal_list_t); - - /* We actually don't want to do this - instead, we need to do a registry - * delete function call targeting this entry + /* We need to do a registry + * delete function call targeting the entry */ - if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, info->active_job))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* find this item in the list */ + return ORTE_SUCCESS; } diff --git a/orte/mca/pls/base/pls_base_receive.c b/orte/mca/pls/base/pls_base_receive.c index 29cbb7d07d..810233564b 100644 --- a/orte/mca/pls/base/pls_base_receive.c +++ b/orte/mca/pls/base/pls_base_receive.c @@ -51,7 +51,7 @@ int orte_pls_base_comm_start(void) return ORTE_SUCCESS; } - if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, + if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS, ORTE_RML_PERSISTENT, orte_pls_base_recv, @@ -72,7 +72,7 @@ int orte_pls_base_comm_stop(void) return ORTE_SUCCESS; } - if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_PLS))) { + if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS))) { ORTE_ERROR_LOG(rc); } recv_issued = false; @@ -97,6 +97,8 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender, orte_jobid_t job; orte_process_name_t *name; int32_t signal; + opal_list_t attrs; + opal_list_item_t *item; int rc; count = 1; @@ -130,10 +132,21 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender, 
ORTE_ERROR_LOG(rc); goto SEND_ANSWER; } + + OBJ_CONSTRUCT(&attrs, opal_list_t); + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &attrs, &count, ORTE_ATTR_LIST))) { + ORTE_ERROR_LOG(rc); + goto SEND_ANSWER; + } - if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job))) { + + if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job, &attrs))) { ORTE_ERROR_LOG(rc); } + + while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item); + OBJ_DESTRUCT(&attrs); break; case ORTE_PLS_TERMINATE_ORTEDS_CMD: @@ -143,9 +156,19 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender, goto SEND_ANSWER; } - if (ORTE_SUCCESS != (rc = orte_pls.terminate_orteds(job))) { + OBJ_CONSTRUCT(&attrs, opal_list_t); + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &attrs, &count, ORTE_ATTR_LIST))) { + ORTE_ERROR_LOG(rc); + goto SEND_ANSWER; + } + + if (ORTE_SUCCESS != (rc = orte_pls.terminate_orteds(job, &attrs))) { ORTE_ERROR_LOG(rc); } + + while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item); + OBJ_DESTRUCT(&attrs); break; case ORTE_PLS_SIGNAL_JOB_CMD: @@ -161,9 +184,19 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender, goto SEND_ANSWER; } - if (ORTE_SUCCESS != (rc = orte_pls.signal_job(job, signal))) { + OBJ_CONSTRUCT(&attrs, opal_list_t); + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &attrs, &count, ORTE_ATTR_LIST))) { + ORTE_ERROR_LOG(rc); + goto SEND_ANSWER; + } + + if (ORTE_SUCCESS != (rc = orte_pls.signal_job(job, signal, &attrs))) { ORTE_ERROR_LOG(rc); } + + while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item); + OBJ_DESTRUCT(&attrs); break; case ORTE_PLS_TERMINATE_PROC_CMD: diff --git a/orte/mca/pls/base/pls_private.h b/orte/mca/pls/base/pls_private.h index f2c971ae55..34b0b3ec96 100644 --- a/orte/mca/pls/base/pls_private.h +++ b/orte/mca/pls/base/pls_private.h @@ -79,7 +79,7 @@ extern "C" { int orte_pls_base_orted_signal_local_procs(opal_list_t *daemons, 
int32_t signal); int orte_pls_base_orted_add_local_procs(opal_list_t *daemons, orte_gpr_notify_data_t *ndat); - int orte_pls_base_get_active_daemons(opal_list_t *daemons, orte_jobid_t job); + int orte_pls_base_get_active_daemons(opal_list_t *daemons, orte_jobid_t job, opal_list_t *attrs); int orte_pls_base_store_active_daemons(opal_list_t *daemons); int orte_pls_base_remove_daemon(orte_pls_daemon_info_t *info); diff --git a/orte/mca/pls/bproc/pls_bproc.c b/orte/mca/pls/bproc/pls_bproc.c index 9fa58e65cf..5953169d04 100644 --- a/orte/mca/pls/bproc/pls_bproc.c +++ b/orte/mca/pls/bproc/pls_bproc.c @@ -42,6 +42,7 @@ #endif /* HAVE_STRING_H */ #include "opal/install_dirs.h" +#include "opal/class/opal_list.h" #include "opal/event/event.h" #include "opal/mca/base/mca_base_param.h" #include "opal/util/argv.h" @@ -342,7 +343,7 @@ static void orte_pls_bproc_waitpid_daemon_cb(pid_t wpid, int status, void *data) if(ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); } - rc = mca_oob_send_packed(ORTE_RML_NAME_SELF, &ack, ORTE_RML_TAG_BPROC, 0); + rc = mca_oob_send_packed(ORTE_PROC_MY_NAME, &ack, ORTE_RML_TAG_BPROC, 0); if(0 > rc) { ORTE_ERROR_LOG(rc); } @@ -429,8 +430,7 @@ static void orte_pls_bproc_setup_env(char *** env) /* ns replica contact info */ if(NULL == orte_process_info.ns_replica) { - orte_ns.copy_process_name(&orte_process_info.ns_replica, - orte_process_info.my_name); + orte_dss.copy((void**)&orte_process_info.ns_replica, orte_process_info.my_name, ORTE_NAME); orte_process_info.ns_replica_uri = orte_rml.get_uri(); } var = mca_base_param_environ_variable("ns","replica","uri"); @@ -451,8 +451,7 @@ static void orte_pls_bproc_setup_env(char *** env) /* gpr replica contact info */ if(NULL == orte_process_info.gpr_replica) { - orte_ns.copy_process_name(&orte_process_info.gpr_replica, - orte_process_info.my_name); + orte_dss.copy((void**)&orte_process_info.gpr_replica, orte_process_info.my_name, ORTE_NAME); orte_process_info.gpr_replica_uri = orte_rml.get_uri(); } var = 
mca_base_param_environ_variable("gpr","replica","uri"); @@ -832,13 +831,13 @@ orte_pls_bproc_check_node_state(orte_gpr_notify_data_t *notify_data, orte_schema.extract_jobid_from_segment_name(&jobid, value->tokens[0]); printf("killing jobid %d\n", jobid); if(jobid != 0) - orte_pls_bproc_terminate_job(jobid); + orte_pls_bproc_terminate_job(jobid, NULL); } /* * and kill everyone else */ printf("and go bye-bye...\n"); - orte_pls_bproc_terminate_job(0); + orte_pls_bproc_terminate_job(0, NULL); /* shouldn't ever get here.. */ exit(1); } @@ -1240,7 +1239,7 @@ int orte_pls_bproc_launch(orte_jobid_t jobid) { for(j = 0; j < num_daemons; j++) { orte_buffer_t ack; OBJ_CONSTRUCT(&ack, orte_buffer_t); - rc = mca_oob_recv_packed(ORTE_RML_NAME_ANY, &ack, ORTE_RML_TAG_BPROC); + rc = mca_oob_recv_packed(ORTE_NAME_WILDCARD, &ack, ORTE_RML_TAG_BPROC); if(0 > rc) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&ack); @@ -1265,7 +1264,7 @@ int orte_pls_bproc_launch(orte_jobid_t jobid) { } rc = ORTE_ERROR; ORTE_ERROR_LOG(rc); - orte_pls_bproc_terminate_job(jobid); + orte_pls_bproc_terminate_job(jobid, NULL); goto cleanup; } } @@ -1307,7 +1306,7 @@ cleanup: /** * Terminate all processes associated with this job */ -int orte_pls_bproc_terminate_job(orte_jobid_t jobid) { +int orte_pls_bproc_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) { pid_t* pids; orte_std_cntr_t i, num_pids; int rc; @@ -1319,7 +1318,7 @@ int orte_pls_bproc_terminate_job(orte_jobid_t jobid) { } /* kill application process */ - if(ORTE_SUCCESS != (rc = orte_pls_bproc_get_proc_pids(jobid, &pids, &num_pids))) + if(ORTE_SUCCESS != (rc = orte_pls_bproc_get_proc_pids(jobid, &pids, &num_pids, attrs))) return rc; for(i=0; i<num_pids; i++) { if(mca_pls_bproc_component.debug) { @@ -1337,7 +1336,7 @@ int orte_pls_bproc_terminate_job(orte_jobid_t jobid) { /** * Terminate the orteds for a given job */ -int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid) +int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) { int 
rc; opal_list_t daemons; @@ -1347,7 +1346,7 @@ int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid) /* construct the list of active daemons on this job */ OBJ_CONSTRUCT(&daemons, opal_list_t); - if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid, attrs))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } @@ -1394,7 +1393,7 @@ int orte_pls_bproc_terminate_proc(const orte_process_name_t* proc_name) { /** * Signal all processes associated with this job */ -int orte_pls_bproc_signal_job(orte_jobid_t jobid, int32_t signal) { +int orte_pls_bproc_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs) { pid_t* pids; orte_std_cntr_t i, num_pids; int rc; @@ -1402,7 +1401,7 @@ int orte_pls_bproc_signal_job(orte_jobid_t jobid, int32_t signal) { OPAL_TRACE(1); /* signal application process */ - if(ORTE_SUCCESS != (rc = orte_pls_bproc_get_proc_pids(jobid, &pids, &num_pids))) + if(ORTE_SUCCESS != (rc = orte_pls_bproc_get_proc_pids(jobid, &pids, &num_pids, attrs))) return rc; for(i=0; i<num_pids; i++) { if(mca_pls_bproc_component.debug) { diff --git a/orte/mca/pls/bproc/pls_bproc.h b/orte/mca/pls/bproc/pls_bproc.h index 001b636656..55e7635f8d 100644 --- a/orte/mca/pls/bproc/pls_bproc.h +++ b/orte/mca/pls/bproc/pls_bproc.h @@ -72,10 +72,10 @@ int orte_pls_bproc_finalize(void); * Interface */ int orte_pls_bproc_launch(orte_jobid_t); -int orte_pls_bproc_terminate_job(orte_jobid_t); +int orte_pls_bproc_terminate_job(orte_jobid_t, opal_list_t*); int orte_pls_bproc_terminate_proc(const orte_process_name_t* proc_name); -int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid); -int orte_pls_bproc_signal_job(orte_jobid_t, int32_t); +int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid, opal_list_t*); +int orte_pls_bproc_signal_job(orte_jobid_t, int32_t, opal_list_t*); int orte_pls_bproc_signal_proc(const orte_process_name_t* proc_name, int32_t); /* Utility routine to get/set 
process pid */ @@ -84,7 +84,10 @@ ORTE_DECLSPEC int orte_pls_bproc_get_proc_pid(const orte_process_name_t*, pid_t* /** * Utility routine to retreive all process pids w/in a specified job. */ -ORTE_DECLSPEC int orte_pls_bproc_get_proc_pids(orte_jobid_t jobid, pid_t** pids, orte_std_cntr_t* num_pids); +ORTE_DECLSPEC int orte_pls_bproc_get_proc_pids(orte_jobid_t jobid, pid_t** pids, + orte_std_cntr_t* num_pids, + opal_list_t *attrs); + /** * Utility routine to get/set daemon pid */ diff --git a/orte/mca/pls/bproc/pls_bproc_state.c b/orte/mca/pls/bproc/pls_bproc_state.c index 74671971ad..2196be2cdc 100644 --- a/orte/mca/pls/bproc/pls_bproc_state.c +++ b/orte/mca/pls/bproc/pls_bproc_state.c @@ -173,7 +173,7 @@ cleanup: /** * Retrieve all process pids for the specified job. */ -int orte_pls_bproc_get_proc_pids(orte_jobid_t jobid, pid_t **pids, orte_std_cntr_t* num_pids) +int orte_pls_bproc_get_proc_pids(orte_jobid_t jobid, pid_t **pids, orte_std_cntr_t* num_pids, opal_list_t *attrs) { char *segment; char *keys[2]; @@ -352,7 +352,7 @@ int orte_pls_bproc_comm_start(void) return ORTE_SUCCESS; } - if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, + if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_BPROC_ABORT, ORTE_RML_PERSISTENT, orte_pls_bproc_recv, @@ -373,7 +373,7 @@ int orte_pls_bproc_comm_stop(void) return ORTE_SUCCESS; } - if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_BPROC_ABORT))) { + if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_BPROC_ABORT))) { ORTE_ERROR_LOG(rc); } mca_pls_bproc_component.recv_issued = false; diff --git a/orte/mca/pls/cnos/pls_cnos.c b/orte/mca/pls/cnos/pls_cnos.c index 6e8bd8611d..7c3a10ce7a 100644 --- a/orte/mca/pls/cnos/pls_cnos.c +++ b/orte/mca/pls/cnos/pls_cnos.c @@ -38,10 +38,10 @@ static int orte_pls_cnos_launch_job(orte_jobid_t jobid); -static int orte_pls_cnos_terminate_job(orte_jobid_t jobid); -static int 
orte_pls_cnos_terminate_orteds(orte_jobid_t jobid); +static int orte_pls_cnos_terminate_job(orte_jobid_t jobid, opal_list_t *attrs); +static int orte_pls_cnos_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs); static int orte_pls_cnos_terminate_proc(const orte_process_name_t* proc_name); -static int orte_pls_cnos_signal_job(orte_jobid_t jobid, int32_t signal); +static int orte_pls_cnos_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs); static int orte_pls_cnos_signal_proc(const orte_process_name_t* proc_name, int32_t signal); static int orte_pls_cnos_finalize(void); @@ -68,12 +68,12 @@ static int orte_pls_cnos_launch_job(orte_jobid_t jobid) extern int killrank(rank_t RANK, int SIG); #endif -static int orte_pls_cnos_terminate_job(orte_jobid_t jobid) +static int orte_pls_cnos_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) { #ifdef HAVE_KILLRANK orte_jobid_t my_jobid; - orte_ns.get_jobid(&my_jobid, orte_process_info.my_name); + my_jobid = orte_process_info.my_name->jobid; /* make sure it's my job */ if (jobid == my_jobid) { @@ -89,12 +89,12 @@ static int orte_pls_cnos_terminate_job(orte_jobid_t jobid) } -static int orte_pls_cnos_terminate_orteds(orte_jobid_t jobid) +static int orte_pls_cnos_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) { #ifdef HAVE_KILLRANK orte_jobid_t my_jobid; - orte_ns.get_jobid(&my_jobid, orte_process_info.my_name); + my_jobid = orte_process_info.my_name->jobid; /* make sure it's my job */ if (jobid == my_jobid) { @@ -136,7 +136,7 @@ static int orte_pls_cnos_terminate_proc(const orte_process_name_t* proc_name) } -static int orte_pls_cnos_signal_job(orte_jobid_t jobid, int32_t signal) +static int orte_pls_cnos_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs) { return ORTE_ERR_NOT_SUPPORTED; } diff --git a/orte/mca/pls/gridengine/pls_gridengine.h b/orte/mca/pls/gridengine/pls_gridengine.h index efdfbd9ee5..e141261809 100644 --- a/orte/mca/pls/gridengine/pls_gridengine.h +++ 
b/orte/mca/pls/gridengine/pls_gridengine.h @@ -113,10 +113,10 @@ int orte_pls_gridengine_finalize(void); * Interface */ int orte_pls_gridengine_launch_job(orte_jobid_t); -int orte_pls_gridengine_terminate_job(orte_jobid_t); -int orte_pls_gridengine_terminate_orteds(orte_jobid_t); +int orte_pls_gridengine_terminate_job(orte_jobid_t, opal_list_t *attrs); +int orte_pls_gridengine_terminate_orteds(orte_jobid_t, opal_list_t *attrs); int orte_pls_gridengine_terminate_proc(const orte_process_name_t*); -int orte_pls_gridengine_signal_job(orte_jobid_t, int32_t); +int orte_pls_gridengine_signal_job(orte_jobid_t, int32_t, opal_list_t *attrs); int orte_pls_gridengine_signal_proc(const orte_process_name_t*, int32_t); /** diff --git a/orte/mca/pls/gridengine/pls_gridengine_module.c b/orte/mca/pls/gridengine/pls_gridengine_module.c index 8f6f2f6293..0cab075ec3 100644 --- a/orte/mca/pls/gridengine/pls_gridengine_module.c +++ b/orte/mca/pls/gridengine/pls_gridengine_module.c @@ -759,7 +759,7 @@ static int update_slot_keyval(orte_ras_node_t* ras_node, int* slot_cnt) /** * Query the registry for all nodes participating in the job */ -int orte_pls_gridengine_terminate_job(orte_jobid_t jobid) +int orte_pls_gridengine_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -767,7 +767,7 @@ int orte_pls_gridengine_terminate_job(orte_jobid_t jobid) /* construct the list of active daemons on this job */ OBJ_CONSTRUCT(&daemons, opal_list_t); - if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid, attrs))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } @@ -794,7 +794,7 @@ int orte_pls_gridengine_terminate_proc(const orte_process_name_t* proc) /** * Terminate the orteds for a given job */ -int orte_pls_gridengine_terminate_orteds(orte_jobid_t jobid) +int orte_pls_gridengine_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ 
-802,7 +802,7 @@ int orte_pls_gridengine_terminate_orteds(orte_jobid_t jobid) /* construct the list of active daemons on this job */ OBJ_CONSTRUCT(&daemons, opal_list_t); - if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid, attrs))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } @@ -823,7 +823,7 @@ CLEANUP: /** * Signal all processes associated with this job */ -int orte_pls_gridengine_signal_job(orte_jobid_t jobid, int32_t signal) +int orte_pls_gridengine_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -831,7 +831,7 @@ int orte_pls_gridengine_signal_job(orte_jobid_t jobid, int32_t signal) /* construct the list of active daemons on this job */ OBJ_CONSTRUCT(&daemons, opal_list_t); - if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid, attrs))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&daemons); return rc; diff --git a/orte/mca/pls/pls.h b/orte/mca/pls/pls.h index cef06ccb7a..2968ad7f49 100644 --- a/orte/mca/pls/pls.h +++ b/orte/mca/pls/pls.h @@ -202,12 +202,12 @@ typedef int (*orte_pls_base_module_launch_job_fn_t)(orte_jobid_t); * Terminate any processes launched for the respective jobid by * this component. */ -typedef int (*orte_pls_base_module_terminate_job_fn_t)(orte_jobid_t); +typedef int (*orte_pls_base_module_terminate_job_fn_t)(orte_jobid_t, opal_list_t *attrs); /** * Terminate the daemons associated with this jobid */ -typedef int (*orte_pls_base_module_terminate_orteds_fn_t)(orte_jobid_t); +typedef int (*orte_pls_base_module_terminate_orteds_fn_t)(orte_jobid_t, opal_list_t *attrs); /** * Terminate a specific process. @@ -218,7 +218,7 @@ typedef int (*orte_pls_base_module_terminate_proc_fn_t)(const orte_process_name_ * Signal any processes launched for the respective jobid by * this component. 
*/ -typedef int (*orte_pls_base_module_signal_job_fn_t)(orte_jobid_t, int32_t); +typedef int (*orte_pls_base_module_signal_job_fn_t)(orte_jobid_t, int32_t, opal_list_t *attrs); /** * Signal a specific process. diff --git a/orte/mca/pls/poe/pls_poe_module.c b/orte/mca/pls/poe/pls_poe_module.c index 4fc4adb57b..82e73f2b5f 100644 --- a/orte/mca/pls/poe/pls_poe_module.c +++ b/orte/mca/pls/poe/pls_poe_module.c @@ -60,10 +60,10 @@ extern char **environ; * Local functions */ static int pls_poe_launch_job(orte_jobid_t jobid); -static int pls_poe_terminate_job(orte_jobid_t jobid); -static int pls_poe_terminate_orteds(orte_jobid_t jobid); +static int pls_poe_terminate_job(orte_jobid_t jobid, opal_list_t *attrs); +static int pls_poe_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs); static int pls_poe_terminate_proc(const orte_process_name_t *name); -static int pls_poe_signal_job(orte_jobid_t jobid, int32_t signal); +static int pls_poe_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs); static int pls_poe_signal_proc(const orte_process_name_t *name, int32_t signal); static int pls_poe_finalize(void); @@ -601,7 +601,7 @@ static int pls_poe_launch_job(orte_jobid_t jobid) return ORTE_ERR_NOT_IMPLEMENTED; } -static int pls_poe_terminate_job(orte_jobid_t jobid) +static int pls_poe_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) { return ORTE_ERR_NOT_IMPLEMENTED; } @@ -612,12 +612,12 @@ static int pls_poe_terminate_proc(const orte_process_name_t *name) return ORTE_ERR_NOT_IMPLEMENTED; } -static int pls_poe_terminate_orteds(orte_jobid_t jobid) +static int pls_poe_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) { return ORTE_ERR_NOT_IMPLEMENTED; } -static int pls_poe_signal_job(orte_jobid_t jobid, int32_t signal) +static int pls_poe_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs) { return ORTE_ERR_NOT_IMPLEMENTED; } diff --git a/orte/mca/pls/proxy/pls_proxy.c b/orte/mca/pls/proxy/pls_proxy.c index 1a0470dd2d..bdc6603fef 100644 
--- a/orte/mca/pls/proxy/pls_proxy.c +++ b/orte/mca/pls/proxy/pls_proxy.c @@ -110,7 +110,7 @@ int orte_pls_proxy_launch(orte_jobid_t job) return ORTE_SUCCESS; } -int orte_pls_proxy_terminate_job(orte_jobid_t job) +int orte_pls_proxy_terminate_job(orte_jobid_t job, opal_list_t *attrs) { orte_buffer_t* cmd; orte_buffer_t* answer; @@ -140,6 +140,12 @@ int orte_pls_proxy_terminate_job(orte_jobid_t job) return rc; } + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, attrs, 1, ORTE_ATTR_LIST))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + if (0 > orte_rml.send_buffer(orte_pls_proxy_replica, cmd, ORTE_RML_TAG_PLS, 0)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); OBJ_RELEASE(cmd); @@ -176,7 +182,7 @@ int orte_pls_proxy_terminate_job(orte_jobid_t job) return ORTE_SUCCESS; } -int orte_pls_proxy_terminate_orteds(orte_jobid_t job) +int orte_pls_proxy_terminate_orteds(orte_jobid_t job, opal_list_t *attrs) { orte_buffer_t* cmd; orte_buffer_t* answer; @@ -206,6 +212,12 @@ int orte_pls_proxy_terminate_orteds(orte_jobid_t job) return rc; } + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, attrs, 1, ORTE_ATTR_LIST))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + if (0 > orte_rml.send_buffer(orte_pls_proxy_replica, cmd, ORTE_RML_TAG_PLS, 0)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); OBJ_RELEASE(cmd); @@ -242,7 +254,7 @@ int orte_pls_proxy_terminate_orteds(orte_jobid_t job) return ORTE_SUCCESS; } -int orte_pls_proxy_signal_job(orte_jobid_t job, int32_t signal) +int orte_pls_proxy_signal_job(orte_jobid_t job, int32_t signal, opal_list_t *attrs) { orte_buffer_t* cmd; orte_buffer_t* answer; @@ -278,6 +290,12 @@ int orte_pls_proxy_signal_job(orte_jobid_t job, int32_t signal) return rc; } + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, attrs, 1, ORTE_ATTR_LIST))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + if (0 > orte_rml.send_buffer(orte_pls_proxy_replica, cmd, ORTE_RML_TAG_PLS, 0)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); OBJ_RELEASE(cmd); diff 
--git a/orte/mca/pls/proxy/pls_proxy.h b/orte/mca/pls/proxy/pls_proxy.h index df47d732fb..10db57fa4f 100644 --- a/orte/mca/pls/proxy/pls_proxy.h +++ b/orte/mca/pls/proxy/pls_proxy.h @@ -48,10 +48,10 @@ int orte_pls_proxy_finalize(void); * proxy function prototypes */ int orte_pls_proxy_launch(orte_jobid_t job); -int orte_pls_proxy_terminate_job(orte_jobid_t job); -int orte_pls_proxy_terminate_orteds(orte_jobid_t job); +int orte_pls_proxy_terminate_job(orte_jobid_t job, opal_list_t *attrs); +int orte_pls_proxy_terminate_orteds(orte_jobid_t job, opal_list_t *attrs); int orte_pls_proxy_terminate_proc(const orte_process_name_t* name); -int orte_pls_proxy_signal_job(orte_jobid_t job, int32_t signal); +int orte_pls_proxy_signal_job(orte_jobid_t job, int32_t signal, opal_list_t *attrs); int orte_pls_proxy_signal_proc(const orte_process_name_t* name, int32_t signal); diff --git a/orte/mca/pls/rsh/pls_rsh.h b/orte/mca/pls/rsh/pls_rsh.h index 7770e83444..554161d9e2 100644 --- a/orte/mca/pls/rsh/pls_rsh.h +++ b/orte/mca/pls/rsh/pls_rsh.h @@ -49,10 +49,10 @@ int orte_pls_rsh_finalize(void); * Interface */ int orte_pls_rsh_launch(orte_jobid_t); -int orte_pls_rsh_terminate_job(orte_jobid_t); -int orte_pls_rsh_terminate_orteds(orte_jobid_t); +int orte_pls_rsh_terminate_job(orte_jobid_t, opal_list_t*); +int orte_pls_rsh_terminate_orteds(orte_jobid_t, opal_list_t*); int orte_pls_rsh_terminate_proc(const orte_process_name_t* proc_name); -int orte_pls_rsh_signal_job(orte_jobid_t, int32_t); +int orte_pls_rsh_signal_job(orte_jobid_t, int32_t, opal_list_t*); int orte_pls_rsh_signal_proc(const orte_process_name_t* proc_name, int32_t); /** @@ -66,6 +66,7 @@ struct orte_pls_rsh_component_t { bool timing; bool reap; bool assume_same_shell; + bool force_rsh; int delay; int priority; char *agent_param; diff --git a/orte/mca/pls/rsh/pls_rsh_component.c b/orte/mca/pls/rsh/pls_rsh_component.c index dbe5a68c04..095fe2e002 100644 --- a/orte/mca/pls/rsh/pls_rsh_component.c +++ 
b/orte/mca/pls/rsh/pls_rsh_component.c @@ -134,6 +134,11 @@ int orte_pls_rsh_component_open(void) } mca_pls_rsh_component.num_concurrent = tmp; + mca_base_param_reg_int(c, "force_rsh", + "Force the launcher to always use rsh, even for local daemons", + false, false, false, &tmp); + mca_pls_rsh_component.force_rsh = OPAL_INT_TO_BOOL(tmp); + if (mca_pls_rsh_component.debug == 0) { mca_base_param_reg_int_name("orte", "debug", "Whether or not to enable debugging output for all ORTE components (0 or 1)", diff --git a/orte/mca/pls/rsh/pls_rsh_module.c b/orte/mca/pls/rsh/pls_rsh_module.c index f04b72ee88..ad0ab530b8 100644 --- a/orte/mca/pls/rsh/pls_rsh_module.c +++ b/orte/mca/pls/rsh/pls_rsh_module.c @@ -832,8 +832,9 @@ int orte_pls_rsh_launch(orte_jobid_t jobid) * current nodename, which must be local. If that doesn't * match, check using ifislocal(). */ - if (0 == strcmp(rmaps_node->nodename, orte_system_info.nodename) || - opal_ifislocal(rmaps_node->nodename)) { + if (!mca_pls_rsh_component.force_rsh && + (0 == strcmp(rmaps_node->nodename, orte_system_info.nodename) || + opal_ifislocal(rmaps_node->nodename))) { if (mca_pls_rsh_component.debug) { opal_output(0, "pls:rsh: %s is a LOCAL node\n", rmaps_node->nodename); @@ -1103,7 +1104,7 @@ cleanup: /** * Terminate all processes for a given job */ -int orte_pls_rsh_terminate_job(orte_jobid_t jobid) +int orte_pls_rsh_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -1113,7 +1114,7 @@ int orte_pls_rsh_terminate_job(orte_jobid_t jobid) /* construct the list of active daemons on this job */ OBJ_CONSTRUCT(&daemons, opal_list_t); - if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid, attrs))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } @@ -1135,7 +1136,7 @@ CLEANUP: /** * Terminate the orteds for a given job */ -int orte_pls_rsh_terminate_orteds(orte_jobid_t jobid) +int 
orte_pls_rsh_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -1145,7 +1146,7 @@ int orte_pls_rsh_terminate_orteds(orte_jobid_t jobid) /* construct the list of active daemons on this job */ OBJ_CONSTRUCT(&daemons, opal_list_t); - if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid, attrs))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } @@ -1173,7 +1174,7 @@ int orte_pls_rsh_terminate_proc(const orte_process_name_t* proc) return ORTE_ERR_NOT_IMPLEMENTED; } -int orte_pls_rsh_signal_job(orte_jobid_t jobid, int32_t signal) +int orte_pls_rsh_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -1183,7 +1184,7 @@ int orte_pls_rsh_signal_job(orte_jobid_t jobid, int32_t signal) /* construct the list of active daemons on this job */ OBJ_CONSTRUCT(&daemons, opal_list_t); - if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid, attrs))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&daemons); return rc; diff --git a/orte/mca/pls/slurm/pls_slurm_module.c b/orte/mca/pls/slurm/pls_slurm_module.c index 575440699d..31f187c5b5 100644 --- a/orte/mca/pls/slurm/pls_slurm_module.c +++ b/orte/mca/pls/slurm/pls_slurm_module.c @@ -73,10 +73,10 @@ * Local functions */ static int pls_slurm_launch_job(orte_jobid_t jobid); -static int pls_slurm_terminate_job(orte_jobid_t jobid); -static int pls_slurm_terminate_orteds(orte_jobid_t jobid); +static int pls_slurm_terminate_job(orte_jobid_t jobid, opal_list_t *attrs); +static int pls_slurm_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs); static int pls_slurm_terminate_proc(const orte_process_name_t *name); -static int pls_slurm_signal_job(orte_jobid_t jobid, int32_t signal); +static int pls_slurm_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs); 
static int pls_slurm_signal_proc(const orte_process_name_t *name, int32_t signal); static int pls_slurm_finalize(void); @@ -421,7 +421,7 @@ cleanup: } -static int pls_slurm_terminate_job(orte_jobid_t jobid) +static int pls_slurm_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -429,7 +429,7 @@ static int pls_slurm_terminate_job(orte_jobid_t jobid) /* construct the list of active daemons on this job */ OBJ_CONSTRUCT(&daemons, opal_list_t); - if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid, attrs))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } @@ -452,7 +452,7 @@ CLEANUP: /** * Terminate the orteds for a given job */ -static int pls_slurm_terminate_orteds(orte_jobid_t jobid) +static int pls_slurm_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -460,7 +460,7 @@ static int pls_slurm_terminate_orteds(orte_jobid_t jobid) /* construct the list of active daemons on this job */ OBJ_CONSTRUCT(&daemons, opal_list_t); - if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid, attrs))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } @@ -493,7 +493,7 @@ static int pls_slurm_terminate_proc(const orte_process_name_t *name) /** * Signal all the processes in the child srun by sending the signal directly to it */ -static int pls_slurm_signal_job(orte_jobid_t jobid, int32_t signal) +static int pls_slurm_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs) { if (0 != srun_pid) { kill(srun_pid, (int)signal); diff --git a/orte/mca/pls/tm/pls_tm_module.c b/orte/mca/pls/tm/pls_tm_module.c index 3c022097f4..ea969daaf4 100644 --- a/orte/mca/pls/tm/pls_tm_module.c +++ b/orte/mca/pls/tm/pls_tm_module.c @@ -79,10 +79,10 @@ * Local functions */ static int pls_tm_launch_job(orte_jobid_t jobid); -static int 
pls_tm_terminate_job(orte_jobid_t jobid); -static int pls_tm_terminate_orteds(orte_jobid_t jobid); +static int pls_tm_terminate_job(orte_jobid_t jobid, opal_list_t *attrs); +static int pls_tm_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs); static int pls_tm_terminate_proc(const orte_process_name_t *name); -static int pls_tm_signal_job(orte_jobid_t jobid, int32_t signal); +static int pls_tm_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs); static int pls_tm_signal_proc(const orte_process_name_t *name, int32_t signal); static int pls_tm_finalize(void); @@ -545,7 +545,7 @@ static int pls_tm_launch_job(orte_jobid_t jobid) } -static int pls_tm_terminate_job(orte_jobid_t jobid) +static int pls_tm_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -553,7 +553,7 @@ static int pls_tm_terminate_job(orte_jobid_t jobid) /* construct the list of active daemons on this job */ OBJ_CONSTRUCT(&daemons, opal_list_t); - if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid, attrs))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } @@ -576,7 +576,7 @@ CLEANUP: /** * Terminate the orteds for a given job */ -int pls_tm_terminate_orteds(orte_jobid_t jobid) +int pls_tm_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -584,7 +584,7 @@ int pls_tm_terminate_orteds(orte_jobid_t jobid) /* construct the list of active daemons on this job */ OBJ_CONSTRUCT(&daemons, opal_list_t); - if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid, attrs))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } @@ -614,7 +614,7 @@ static int pls_tm_terminate_proc(const orte_process_name_t *name) } -static int pls_tm_signal_job(orte_jobid_t jobid, int32_t signal) +static int pls_tm_signal_job(orte_jobid_t jobid, int32_t signal, 
opal_list_t *attrs) { int rc; opal_list_t daemons; @@ -622,7 +622,7 @@ static int pls_tm_signal_job(orte_jobid_t jobid, int32_t signal) /* construct the list of active daemons on this job */ OBJ_CONSTRUCT(&daemons, opal_list_t); - if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid))) { + if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid, attrs))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&daemons); return rc; diff --git a/orte/mca/ras/base/ras_base_receive.c b/orte/mca/ras/base/ras_base_receive.c index e0e725e923..a24abafd6d 100644 --- a/orte/mca/ras/base/ras_base_receive.c +++ b/orte/mca/ras/base/ras_base_receive.c @@ -50,7 +50,7 @@ int orte_ras_base_comm_start(void) return ORTE_SUCCESS; } - if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, + if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_RAS, ORTE_RML_PERSISTENT, orte_ras_base_recv, @@ -71,7 +71,7 @@ int orte_ras_base_comm_stop(void) return ORTE_SUCCESS; } - if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_RAS))) { + if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_RAS))) { ORTE_ERROR_LOG(rc); } recv_issued = false; diff --git a/orte/mca/rds/base/rds_base_receive.c b/orte/mca/rds/base/rds_base_receive.c index 4ab39fdf39..da03378bc4 100644 --- a/orte/mca/rds/base/rds_base_receive.c +++ b/orte/mca/rds/base/rds_base_receive.c @@ -49,7 +49,7 @@ int orte_rds_base_comm_start(void) return ORTE_SUCCESS; } - if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, + if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_RDS, ORTE_RML_PERSISTENT, orte_rds_base_recv, @@ -70,7 +70,7 @@ int orte_rds_base_comm_stop(void) return ORTE_SUCCESS; } - if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_RDS))) { + if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_RDS))) { 
ORTE_ERROR_LOG(rc); } recv_issued = false; diff --git a/orte/mca/rmaps/base/rmaps_base_receive.c b/orte/mca/rmaps/base/rmaps_base_receive.c index 45c197eda3..6b70e9dd47 100644 --- a/orte/mca/rmaps/base/rmaps_base_receive.c +++ b/orte/mca/rmaps/base/rmaps_base_receive.c @@ -50,7 +50,7 @@ int orte_rmaps_base_comm_start(void) return ORTE_SUCCESS; } - if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, + if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_RMAPS, ORTE_RML_PERSISTENT, orte_rmaps_base_recv, @@ -71,7 +71,7 @@ int orte_rmaps_base_comm_stop(void) return ORTE_SUCCESS; } - if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_RMAPS))) { + if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_RMAPS))) { ORTE_ERROR_LOG(rc); } recv_issued = false; diff --git a/orte/mca/rmgr/base/data_type_support/rmgr_data_type_packing_fns.c b/orte/mca/rmgr/base/data_type_support/rmgr_data_type_packing_fns.c index 2b7528a6a5..2d18892008 100644 --- a/orte/mca/rmgr/base/data_type_support/rmgr_data_type_packing_fns.c +++ b/orte/mca/rmgr/base/data_type_support/rmgr_data_type_packing_fns.c @@ -214,8 +214,13 @@ int orte_rmgr_base_pack_attr_list(orte_buffer_t *buffer, void *src, opal_list_item_t *item; orte_std_cntr_t num_attr; - /* get the number of attributes and pack it */ - num_attr = (orte_std_cntr_t)opal_list_get_size(attrs); + /* if the list is NULL, we have zero attributes */ + if (NULL == src) { + num_attr = 0; + } else { + /* get the number of attributes */ + num_attr = (orte_std_cntr_t)opal_list_get_size(attrs); + } if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, (void*)&num_attr, 1, ORTE_STD_CNTR))) { ORTE_ERROR_LOG(rc); return rc; diff --git a/orte/mca/rmgr/base/rmgr_base_attribute_fns.c b/orte/mca/rmgr/base/rmgr_base_attribute_fns.c index a12f65d756..bc6c838059 100644 --- a/orte/mca/rmgr/base/rmgr_base_attribute_fns.c +++ b/orte/mca/rmgr/base/rmgr_base_attribute_fns.c 
@@ -43,6 +43,11 @@ orte_attribute_t* orte_rmgr_base_find_attribute(opal_list_t* attr_list, char* ke opal_list_item_t *item; orte_attribute_t *kval; + if (NULL == attr_list) { + /* if the list is NULL, then by definition we couldn't find it! */ + return NULL; + } + for (item = opal_list_get_first(attr_list); item != opal_list_get_end(attr_list); item = opal_list_get_next(item)) { @@ -69,6 +74,11 @@ int orte_rmgr_base_add_attribute(opal_list_t* attr_list, char* key, orte_gpr_keyval_t *kval; orte_attribute_t *attr; + /* protect against NULL case */ + if (NULL == attr_list) { + return ORTE_ERR_BAD_PARAM; + } + /* see if this attribute is already present */ if (NULL != (attr = orte_rmgr_base_find_attribute(attr_list, key))) { /** found it - do we want to replace this value? */ @@ -108,6 +118,11 @@ int orte_rmgr_base_merge_attributes(opal_list_t* target, opal_list_t* source, bo opal_list_item_t *item; orte_attribute_t *attr; + /* protect against NULL cases */ + if (NULL == target || NULL == source) { + return ORTE_ERR_BAD_PARAM; + } + /* Since the add_attribute function takes care of the override issue, we just * need to cycle through the source list and "add" everything to the target */ @@ -136,6 +151,11 @@ int orte_rmgr_base_delete_attribute(opal_list_t* attr_list, char* key) opal_list_item_t *item; orte_attribute_t *kval; + /* protect against the NULL case */ + if (NULL == attr_list) { + return ORTE_SUCCESS; + } + for (item = opal_list_get_first(attr_list); item != opal_list_get_end(attr_list); item = opal_list_get_next(item)) { diff --git a/orte/mca/rmgr/base/rmgr_base_receive.c b/orte/mca/rmgr/base/rmgr_base_receive.c index c05acdbc50..2137a7c54f 100644 --- a/orte/mca/rmgr/base/rmgr_base_receive.c +++ b/orte/mca/rmgr/base/rmgr_base_receive.c @@ -51,7 +51,7 @@ int orte_rmgr_base_comm_start(void) return ORTE_SUCCESS; } - if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, + if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, 
ORTE_RML_TAG_RMGR, ORTE_RML_PERSISTENT, orte_rmgr_base_recv, @@ -71,7 +71,7 @@ int orte_rmgr_base_comm_stop(void) return ORTE_SUCCESS; } - if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_RMGR))) { + if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_RMGR))) { ORTE_ERROR_LOG(rc); } recv_issued = false; @@ -148,7 +148,7 @@ void orte_rmgr_base_recv(int status, orte_process_name_t* sender, count = 1; if(ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &attrs, &count, ORTE_ATTR_LIST))) { ORTE_ERROR_LOG(rc); - goto CLEANUP_SPAWN; + goto SEND_ANSWER; } /* process the request */ @@ -156,7 +156,11 @@ void orte_rmgr_base_recv(int status, orte_process_name_t* sender, ORTE_ERROR_LOG(rc); goto CLEANUP_SPAWN; } - + while (NULL != (item = opal_list_remove_first(&attrs))) { + OBJ_RELEASE(item); + } + OBJ_DESTRUCT(&attrs); + /* return the new jobid */ if(ORTE_SUCCESS != (rc = orte_dss.pack(&answer, &job, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); diff --git a/orte/mca/rmgr/base/rmgr_base_stage_gate.c b/orte/mca/rmgr/base/rmgr_base_stage_gate.c index 5654ffd62d..813fca199d 100644 --- a/orte/mca/rmgr/base/rmgr_base_stage_gate.c +++ b/orte/mca/rmgr/base/rmgr_base_stage_gate.c @@ -62,6 +62,8 @@ int orte_rmgr_base_proc_stage_gate_mgr(orte_gpr_notify_message_t *msg) orte_std_cntr_t n=0; int rc; orte_jobid_t job; + opal_list_t attrs; + opal_list_item_t *item; OPAL_TRACE(1); @@ -88,13 +90,21 @@ int orte_rmgr_base_proc_stage_gate_mgr(orte_gpr_notify_message_t *msg) OPAL_TRACE_ARG1(1, job); /* need the list of peers for this job so we can send them the xcast. 
- * obtain this list from the name service's get_job_peers function - */ - if (ORTE_SUCCESS != (rc = orte_ns.get_job_peers(&recipients, &n, job))) { + * obtain this list from the name service's get_job_peers function + */ + OBJ_CONSTRUCT(&attrs, opal_list_t); + if (ORTE_SUCCESS != (rc = orte_rmgr.add_attribute(&attrs, ORTE_NS_USE_JOBID, ORTE_JOBID, + &job, ORTE_RMGR_ATTR_OVERRIDE))) { ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&attrs); return rc; } - + + if (ORTE_SUCCESS != (rc = orte_ns.get_peers(&recipients, &n, &attrs))) { + ORTE_ERROR_LOG(rc); + goto CLEANUP; + } + /* set the job state to the appropriate level */ if (orte_schema.check_std_trigger_name(msg->target, ORTE_ALL_LAUNCHED_TRIGGER)) { if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(job, ORTE_JOB_STATE_LAUNCHED))) { @@ -146,5 +156,11 @@ int orte_rmgr_base_proc_stage_gate_mgr(orte_gpr_notify_message_t *msg) CLEANUP: if (NULL != recipients) free(recipients); + + while (NULL != (item = opal_list_remove_first(&attrs))) { + OBJ_RELEASE(item); + } + OBJ_DESTRUCT(&attrs); + return rc; } diff --git a/orte/mca/rmgr/proxy/rmgr_proxy.c b/orte/mca/rmgr/proxy/rmgr_proxy.c index 45a28d88aa..0e1186c828 100644 --- a/orte/mca/rmgr/proxy/rmgr_proxy.c +++ b/orte/mca/rmgr/proxy/rmgr_proxy.c @@ -122,7 +122,7 @@ static int orte_rmgr_proxy_setup_job(orte_app_context_t** app_context, return rc; } - /* pack any attributes */ + /* pack the attributes */ if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, attrs, 1, ORTE_ATTR_LIST))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&cmd); @@ -130,7 +130,7 @@ static int orte_rmgr_proxy_setup_job(orte_app_context_t** app_context, } /* send the command */ - if(0 > (rc = orte_rml.send_buffer(ORTE_RML_NAME_SEED, &cmd, ORTE_RML_TAG_RMGR, 0))) { + if(0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &cmd, ORTE_RML_TAG_RMGR, 0))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&cmd); return rc; @@ -139,7 +139,7 @@ static int orte_rmgr_proxy_setup_job(orte_app_context_t** app_context, /* wait for response */ OBJ_CONSTRUCT(&rsp, 
orte_buffer_t); - if(0 > (rc = orte_rml.recv_buffer(ORTE_RML_NAME_SEED, &rsp, ORTE_RML_TAG_RMGR))) { + if(0 > (rc = orte_rml.recv_buffer(ORTE_PROC_MY_HNP, &rsp, ORTE_RML_TAG_RMGR))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&rsp); return rc; @@ -197,7 +197,7 @@ static int orte_rmgr_proxy_setup_stage_gates(orte_jobid_t jobid) } /* send the command */ - if(0 > (rc = orte_rml.send_buffer(ORTE_RML_NAME_SEED, &cmd, ORTE_RML_TAG_RMGR, 0))) { + if(0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &cmd, ORTE_RML_TAG_RMGR, 0))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&cmd); return rc; @@ -206,7 +206,7 @@ static int orte_rmgr_proxy_setup_stage_gates(orte_jobid_t jobid) /* wait for response */ OBJ_CONSTRUCT(&rsp, orte_buffer_t); - if(0 > (rc = orte_rml.recv_buffer(ORTE_RML_NAME_SEED, &rsp, ORTE_RML_TAG_RMGR))) { + if(0 > (rc = orte_rml.recv_buffer(ORTE_PROC_MY_HNP, &rsp, ORTE_RML_TAG_RMGR))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&rsp); return rc; @@ -398,7 +398,7 @@ static int orte_rmgr_proxy_spawn_job( */ if (flags & ORTE_RMGR_SETUP) { if (ORTE_SUCCESS != - (rc = orte_rmgr_proxy_setup_job(app_context,num_context,jobid, attributes))) { + (rc = orte_rmgr_proxy_setup_job(app_context, num_context, jobid, attributes))) { ORTE_ERROR_LOG(rc); return rc; } diff --git a/orte/mca/rmgr/rmgr.h b/orte/mca/rmgr/rmgr.h index 4089bfc38d..b14b56a294 100644 --- a/orte/mca/rmgr/rmgr.h +++ b/orte/mca/rmgr/rmgr.h @@ -66,7 +66,7 @@ typedef int (*orte_rmgr_base_module_setup_job_fn_t)( orte_app_context_t** app_context, orte_std_cntr_t num_context, orte_jobid_t *jobid, - opal_list_t *attributes); + opal_list_t *attrs); /* * Callback function for resource manager diff --git a/orte/mca/rmgr/urm/rmgr_urm.c b/orte/mca/rmgr/urm/rmgr_urm.c index ea4b456c5f..1c0fc59465 100644 --- a/orte/mca/rmgr/urm/rmgr_urm.c +++ b/orte/mca/rmgr/urm/rmgr_urm.c @@ -133,7 +133,7 @@ static int orte_rmgr_urm_setup_job(orte_app_context_t** app_context, *jobid = *jptr; } else { /* allocate a jobid */ - if (ORTE_SUCCESS != (rc = 
orte_ns.create_jobid(jobid))) { + if (ORTE_SUCCESS != (rc = orte_ns.create_jobid(jobid, attrs))) { ORTE_ERROR_LOG(rc); return rc; } @@ -349,15 +349,12 @@ static int orte_rmgr_urm_spawn_job( * Initialize job segment and allocate resources */ /* JJH Insert C/N mapping stuff here */ - /* Only do this step if we have been asked to do it via the ORTE_RMGR_SPAWN_FLOW - * attribute, or if no flow flags were provided - * If the jobid = ORTE_JOBID_INVALID, then we need to - * get one assigned to us. Otherwise, we are entering - * with a valid jobid, so no need to get one + /* Only do this step if we have been asked to do it via the + * ORTE_RMGR_SPAWN_FLOW attribute */ if (flags & ORTE_RMGR_SETUP) { if (ORTE_SUCCESS != - (rc = orte_rmgr_urm_setup_job(app_context,num_context,jobid,attributes))) { + (rc = orte_rmgr_urm_setup_job(app_context, num_context, jobid, attributes))) { ORTE_ERROR_LOG(rc); return rc; } diff --git a/orte/mca/rml/base/rml_base_open.c b/orte/mca/rml/base/rml_base_open.c index 65067144cc..7406929aff 100644 --- a/orte/mca/rml/base/rml_base_open.c +++ b/orte/mca/rml/base/rml_base_open.c @@ -43,8 +43,6 @@ orte_rml_base_t orte_rml_base; orte_rml_module_t orte_rml; -orte_process_name_t orte_rml_name_any = { ORTE_CELLID_MAX, ORTE_JOBID_MAX, ORTE_VPID_MAX }; -orte_process_name_t orte_rml_name_seed = { 0, 0, 0 }; /** * Function for finding and opening either all MCA components, or the one diff --git a/orte/mca/rml/rml.h b/orte/mca/rml/rml.h index dfca8306fb..d4366a50de 100644 --- a/orte/mca/rml/rml.h +++ b/orte/mca/rml/rml.h @@ -41,14 +41,6 @@ extern "C" { #endif -/* - * Well known addresses - */ - -ORTE_DECLSPEC extern orte_process_name_t orte_rml_name_any; -ORTE_DECLSPEC extern orte_process_name_t orte_rml_name_seed; - - /* * RML Module function prototypes. 
*/ diff --git a/orte/mca/rml/rml_types.h b/orte/mca/rml/rml_types.h index 2dc7a362d9..81bb9c4756 100644 --- a/orte/mca/rml/rml_types.h +++ b/orte/mca/rml/rml_types.h @@ -73,20 +73,5 @@ typedef uint32_t orte_rml_tag_t; #define ORTE_RML_PERSISTENT 0x08 /**< posted non-blocking recv is persistent */ #define ORTE_RML_NON_PERSISTENT 0x00 -/** - * The wildcard for receives from any peer. - */ -#define ORTE_RML_NAME_ANY &orte_rml_name_any - -/** - * Process name of seed - */ -#define ORTE_RML_NAME_SEED &orte_rml_name_seed - -/** - * Process name of self - */ -#define ORTE_RML_NAME_SELF orte_process_info.my_name - #endif /* RML_TYPES */ diff --git a/orte/mca/schema/schema_types.h b/orte/mca/schema/schema_types.h index dc837ca56a..b32a0eabaf 100644 --- a/orte/mca/schema/schema_types.h +++ b/orte/mca/schema/schema_types.h @@ -23,6 +23,16 @@ #include "orte/mca/ns/ns_types.h" #include "opal/util/error.h" +/** +* Standard characters used in ORTE + */ +#define ORTE_SCHEMA_DELIMITER_CHAR '.' +#define ORTE_SCHEMA_DELIMITER_STRING "." 
+#define ORTE_SCHEMA_WILDCARD_CHAR '*' +#define ORTE_SCHEMA_WILDCARD_STRING "*" +#define ORTE_SCHEMA_INVALID_CHAR '$' +#define ORTE_SCHEMA_INVALID_STRING "$" + /* * Standard names used across the system */ diff --git a/orte/mca/sds/bproc/sds_bproc_module.c b/orte/mca/sds/bproc/sds_bproc_module.c index e5bb0637b0..7c5fd7bc4e 100644 --- a/orte/mca/sds/bproc/sds_bproc_module.c +++ b/orte/mca/sds/bproc/sds_bproc_module.c @@ -52,7 +52,7 @@ int orte_sds_bproc_set_name(void) id = mca_base_param_register_string("ns", "nds", "name", NULL, NULL); mca_base_param_lookup_string(id, &name_string); if(name_string != NULL) { - if (ORTE_SUCCESS != (rc = orte_ns_base_convert_string_to_process_name( + if (ORTE_SUCCESS != (rc = orte_ns.convert_string_to_process_name( &(orte_process_info.my_name), name_string))) { ORTE_ERROR_LOG(rc); diff --git a/orte/mca/sds/env/sds_env_module.c b/orte/mca/sds/env/sds_env_module.c index 08104432e3..7b5b388898 100644 --- a/orte/mca/sds/env/sds_env_module.c +++ b/orte/mca/sds/env/sds_env_module.c @@ -28,7 +28,7 @@ #include "opal/mca/base/mca_base_param.h" #include "orte/mca/ns/ns.h" #include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/ns/base/base.h" +#include "orte/mca/ns/ns.h" orte_sds_base_module_t orte_sds_env_module = { @@ -50,7 +50,7 @@ orte_sds_env_set_name(void) mca_base_param_lookup_string(id, &name_string); if(name_string != NULL) { - if (ORTE_SUCCESS != (rc = orte_ns_base_convert_string_to_process_name( + if (ORTE_SUCCESS != (rc = orte_ns.convert_string_to_process_name( &(orte_process_info.my_name), name_string))) { ORTE_ERROR_LOG(rc); diff --git a/orte/mca/sds/pipe/sds_pipe_module.c b/orte/mca/sds/pipe/sds_pipe_module.c index 821226667e..9c0b8e4d46 100644 --- a/orte/mca/sds/pipe/sds_pipe_module.c +++ b/orte/mca/sds/pipe/sds_pipe_module.c @@ -29,6 +29,8 @@ #include "orte/util/proc_info.h" #include "opal/util/output.h" #include "opal/mca/base/mca_base_param.h" + +#include "orte/dss/dss.h" #include "orte/mca/errmgr/errmgr.h" #include 
"orte/mca/ns/ns.h" #include "orte/mca/ns/base/base.h" @@ -59,7 +61,7 @@ orte_sds_pipe_set_name(void) ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } - if(ORTE_SUCCESS != (rc = orte_ns.copy_process_name(&orte_process_info.my_name, &name))) { + if(ORTE_SUCCESS != (rc = orte_dss.copy((void**)&orte_process_info.my_name, &name, ORTE_NAME))) { ORTE_ERROR_LOG(rc); return rc; } diff --git a/orte/mca/sds/singleton/sds_singleton_module.c b/orte/mca/sds/singleton/sds_singleton_module.c index c6982b7013..6f481a2999 100644 --- a/orte/mca/sds/singleton/sds_singleton_module.c +++ b/orte/mca/sds/singleton/sds_singleton_module.c @@ -48,10 +48,7 @@ orte_sds_singleton_set_name(void) return rc; } - if (ORTE_SUCCESS != (rc = orte_ns.get_vpid(&vpid, orte_process_info.my_name))) { - ORTE_ERROR_LOG(rc); - return rc; - } + vpid = ORTE_PROC_MY_NAME->vpid; orte_process_info.num_procs = 1; orte_process_info.vpid_start = vpid; diff --git a/orte/mca/sds/slurm/sds_slurm_module.c b/orte/mca/sds/slurm/sds_slurm_module.c index d441c03267..5432516bc8 100644 --- a/orte/mca/sds/slurm/sds_slurm_module.c +++ b/orte/mca/sds/slurm/sds_slurm_module.c @@ -68,7 +68,7 @@ orte_sds_slurm_set_name(void) if(name_string != NULL) { if (ORTE_SUCCESS != - (rc = orte_ns_base_convert_string_to_process_name(&(orte_process_info.my_name), + (rc = orte_ns.convert_string_to_process_name(&(orte_process_info.my_name), name_string))) { ORTE_ERROR_LOG(rc); free(name_string); diff --git a/orte/mca/smr/base/smr_base_get_proc_state.c b/orte/mca/smr/base/smr_base_get_proc_state.c index 035978bbf5..e730bdcb33 100644 --- a/orte/mca/smr/base/smr_base_get_proc_state.c +++ b/orte/mca/smr/base/smr_base_get_proc_state.c @@ -47,10 +47,7 @@ int orte_smr_base_get_proc_state(orte_proc_state_t *state, orte_jobid_t jobid; bool found1=false, found2=false; - if (ORTE_SUCCESS != (rc = orte_ns.get_jobid(&jobid, proc))) { - ORTE_ERROR_LOG(rc); - return rc; - } + jobid = proc->jobid; if (ORTE_SUCCESS != (rc = 
orte_schema.get_job_segment_name(&segment, jobid))) { ORTE_ERROR_LOG(rc); diff --git a/orte/mca/smr/base/smr_base_set_proc_state.c b/orte/mca/smr/base/smr_base_set_proc_state.c index d7b8c98181..58fef4be4a 100644 --- a/orte/mca/smr/base/smr_base_set_proc_state.c +++ b/orte/mca/smr/base/smr_base_set_proc_state.c @@ -44,10 +44,7 @@ int orte_smr_base_set_proc_state(orte_process_name_t *proc, orte_exit_code_t exit_code; char *segment; - if (ORTE_SUCCESS != (rc = orte_ns.get_jobid(&jobid, proc))) { - ORTE_ERROR_LOG(rc); - return rc; - } + jobid = proc->jobid; if (ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, jobid))) { ORTE_ERROR_LOG(rc); @@ -61,12 +58,7 @@ int orte_smr_base_set_proc_state(orte_process_name_t *proc, return rc; } - if (ORTE_SUCCESS != (rc = orte_ns.get_vpid(&vpid, proc))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(value); - free(segment); - return rc; - } + vpid = proc->vpid; if (ORTE_VPID_MAX != vpid) { /* check for wildcard case - leave tokens alone if so */ if (ORTE_SUCCESS != (rc = orte_schema.get_proc_tokens(&(value->tokens), &(value->num_tokens), proc))) { diff --git a/orte/runtime/orte_init_stage1.c b/orte/runtime/orte_init_stage1.c index c1b4cb1800..f7f161edf9 100644 --- a/orte/runtime/orte_init_stage1.c +++ b/orte/runtime/orte_init_stage1.c @@ -371,11 +371,7 @@ int orte_init_stage1(bool infrastructure) char *site, *resource; orte_app_context_t *app; - if (ORTE_SUCCESS != (ret = orte_ns.get_jobid(&my_jobid, orte_process_info.my_name))) { - ORTE_ERROR_LOG(ret); - error = "orte_ns.get_jobid for singleton/seed"; - goto error; - } + my_jobid = ORTE_PROC_MY_NAME->jobid; /* If there is no existing cellid, create one */ my_cellid = 0; /* JJH Assertion/Repair until cellid's are fixed */ @@ -399,11 +395,7 @@ int orte_init_stage1(bool infrastructure) goto error; } - if (ORTE_SUCCESS != (ret = orte_ns.get_cellid(&my_cellid, orte_process_info.my_name))) { - ORTE_ERROR_LOG(ret); - error = "orte_ns.get_cellid for singleton/seed"; - goto error; - 
} + my_cellid = ORTE_PROC_MY_NAME->cellid; /* set the rest of the infrastructure */ app = OBJ_NEW(orte_app_context_t); diff --git a/orte/runtime/orte_restart.c b/orte/runtime/orte_restart.c index cd2ab1ac76..5e805854a4 100644 --- a/orte/runtime/orte_restart.c +++ b/orte/runtime/orte_restart.c @@ -33,6 +33,8 @@ #include "opal/mca/mca.h" #include "opal/mca/base/base.h" #include "opal/mca/base/mca_base_param.h" + +#include "orte/dss/dss.h" #include "orte/mca/iof/base/base.h" #include "orte/mca/rml/base/base.h" #include "orte/mca/errmgr/base/base.h" @@ -60,11 +62,11 @@ int orte_restart(orte_process_name_t *name, const char* uri) orte_process_name_t* old_name; orte_process_name_t* new_name; - if (ORTE_SUCCESS != (rc = orte_ns.copy_process_name(&old_name, orte_process_info.my_name))) { + if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&old_name, orte_process_info.my_name, ORTE_NAME))) { ORTE_ERROR_LOG(rc); return rc; } - if (ORTE_SUCCESS != (rc = orte_ns.copy_process_name(&new_name, name))) { + if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&new_name, name, ORTE_NAME))) { ORTE_ERROR_LOG(rc); return rc; } diff --git a/orte/runtime/orte_setup_hnp.c b/orte/runtime/orte_setup_hnp.c index 222d590b9c..d54d5b2a5a 100644 --- a/orte/runtime/orte_setup_hnp.c +++ b/orte/runtime/orte_setup_hnp.c @@ -333,7 +333,7 @@ MOVEON: OBJ_CONSTRUCT(&orte_setup_hnp_condition, opal_condition_t); /* get a jobid for the probe */ - rc = orte_ns.create_jobid(&jobid); + rc = orte_ns.create_jobid(&jobid, NULL); if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); return rc; @@ -451,7 +451,7 @@ MOVEON: } /* issue the non-blocking recv to get the probe's findings */ - rc = orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, ORTE_RML_TAG_PROBE, + rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PROBE, 0, orte_setup_hnp_recv, NULL); if(rc < 0) { ORTE_ERROR_LOG(rc); diff --git a/orte/test/unit/ns/Makefile b/orte/test/unit/ns/Makefile new file mode 100644 index 0000000000..51e5d404f4 --- /dev/null +++ 
b/orte/test/unit/ns/Makefile @@ -0,0 +1,12 @@ +PROGS = ns_string_fns ns_peers + +all: $(PROGS) + +CC = ortecc +CFLAGS = -g +CXX = ortec++ +CXXFLAGS = -g +FFLAGS = -g + +clean: + rm -f $(PROGS) *~ diff --git a/orte/test/unit/ns/ns_peers.c b/orte/test/unit/ns/ns_peers.c new file mode 100644 index 0000000000..0dafc419c1 --- /dev/null +++ b/orte/test/unit/ns/ns_peers.c @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "orte/orte_constants.h" + +#include <stdio.h> +#include <string.h> + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/rmgr/rmgr.h" + +#include "orte/mca/ns/base/base.h" + +int main(int argc, char **argv) +{ + orte_process_name_t *test_name; + orte_process_name_t *peers, *jptr; + orte_jobid_t parent, jobs[5], *jobdesc, root; + orte_std_cntr_t j, num_jobs, npeers; + orte_vpid_t vpid, vpids[5]; + int i, rc; + opal_list_t attrs; + opal_list_item_t *item; + orte_attribute_t *attr; + + if (ORTE_SUCCESS != orte_init(true)) { + fprintf(stderr, "failed to start ORTE\n"); + exit (1); + } + + /* create parent jobid */ + if (ORTE_SUCCESS != (rc = orte_ns.create_jobid(&parent, NULL))) { /* got error */ + fprintf(stderr, "create parent jobid: error with error %s\n", ORTE_ERROR_NAME(rc)); + goto FINALIZE; + } else { + fprintf(stderr, "parent jobid created: %lu\n", (unsigned long) parent); + } + + /* get range of vpids */ + if (ORTE_SUCCESS != (rc = 
orte_ns.reserve_range(parent, 25, &vpid))) { /* got error */ + fprintf(stderr, "reserve range: error with error %s\n", + ORTE_ERROR_NAME(rc)); + goto FINALIZE; + } else { + fprintf(stderr, "range reserved: %lu\n", (unsigned long) vpid); + } + + OBJ_CONSTRUCT(&attrs, opal_list_t); + + for (i=0; i<5; i++) { /* loop through several vpid ranges */ + if (0 == i) { + orte_rmgr.add_attribute(&attrs, ORTE_NS_USE_PARENT, ORTE_JOBID, &parent, ORTE_RMGR_ATTR_OVERRIDE); + } else if (2 == i) { + orte_rmgr.add_attribute(&attrs, ORTE_NS_USE_ROOT, ORTE_JOBID, &parent, ORTE_RMGR_ATTR_OVERRIDE); + } else { + orte_rmgr.add_attribute(&attrs, ORTE_NS_USE_PARENT, ORTE_JOBID, &jobs[i-1], ORTE_RMGR_ATTR_OVERRIDE); + } + + if (ORTE_SUCCESS != (rc = orte_ns.create_jobid(&jobs[i], &attrs))) { + ORTE_ERROR_LOG(rc); + goto FINALIZE; + } + fprintf(stderr, "create jobid on step %d: jobid %ld\n", i, (long)jobs[i]); + + /* get range of vpids */ + if (ORTE_SUCCESS != (rc = orte_ns.reserve_range(jobs[i], 10*(i+1), &vpids[i]))) { /* got error */ + fprintf(stderr, "reserve range: error with error %s\n", + ORTE_ERROR_NAME(rc)); + goto FINALIZE; + } else { + fprintf(stderr, "range reserved: %lu\n", (unsigned long) vpids[i]); + } + + orte_rmgr.delete_attribute(&attrs, ORTE_NS_USE_PARENT); + orte_rmgr.delete_attribute(&attrs, ORTE_NS_USE_ROOT); + } + while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item); + OBJ_DESTRUCT(&attrs); + + orte_ns.dump_jobs(); + + /*** DESCENDANTS ***/ + num_jobs = 0; + if (ORTE_SUCCESS != (rc = orte_ns.get_job_descendants(&jobdesc, &num_jobs, parent))) { + fprintf(stderr, "get job descendants: failed with error %s\n", ORTE_ERROR_NAME(rc)); + goto FINALIZE; + } + for (j=0; j < num_jobs; j++) { + fprintf(stderr, "job descendants: job %ld\n", jobdesc[j]); + } + free(jobdesc); + + + /*** ROOT JOB ***/ + if (ORTE_SUCCESS != (rc = orte_ns.get_root_job(&root, jobs[4]))) { + fprintf(stderr, "get root job: failed with error %s\n", ORTE_ERROR_NAME(rc)); + goto 
FINALIZE; + } + fprintf(stderr, "got root job for job %ld - root was %ld\n", (long)jobs[4], root); + + + /*** PARENT JOB ***/ + if (ORTE_SUCCESS != (rc = orte_ns.get_parent_job(&root, jobs[4]))) { + fprintf(stderr, "get parent job: failed with error %s\n", ORTE_ERROR_NAME(rc)); + goto FINALIZE; + } + fprintf(stderr, "got parent job for job %ld - parent was %ld\n", (long)jobs[4], root); + + + /*** PEERS FUNCTIONS ***/ + if (ORTE_SUCCESS != (rc = orte_ns.get_peers(&peers, &npeers, NULL))) { + fprintf(stderr, "get peers local: failed with error %s\n", ORTE_ERROR_NAME(rc)); + goto FINALIZE; + } + + jptr = peers; + for (j=0; j < npeers; j++) { + fprintf(stderr, "get peers local: peer %ld, %ld, %ld\n", ORTE_NAME_ARGS(jptr)); + jptr++; + } + free(peers); + + OBJ_CONSTRUCT(&attrs, opal_list_t); + orte_rmgr.add_attribute(&attrs, ORTE_NS_USE_JOBID, ORTE_JOBID, &jobs[1], ORTE_RMGR_ATTR_OVERRIDE); + + if (ORTE_SUCCESS != (rc = orte_ns.get_peers(&peers, &npeers, &attrs))) { + fprintf(stderr, "get peers for job %ld: failed with error %s\n", (long)jobs[1], ORTE_ERROR_NAME(rc)); + goto FINALIZE; + } + + jptr = peers; + for (j=0; j < npeers; j++) { + fprintf(stderr, "get peers for job %ld: peer %ld, %ld, %ld\n", (long)jobs[1], ORTE_NAME_ARGS(jptr)); + jptr++; + } + if (NULL != peers) free(peers); + while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item); + OBJ_DESTRUCT(&attrs); + + OBJ_CONSTRUCT(&attrs, opal_list_t); + orte_rmgr.add_attribute(&attrs, ORTE_NS_USE_JOBID, ORTE_JOBID, &parent, ORTE_RMGR_ATTR_OVERRIDE); + orte_rmgr.add_attribute(&attrs, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE); + + if (ORTE_SUCCESS != (rc = orte_ns.get_peers(&peers, &npeers, &attrs))) { + fprintf(stderr, "get peers with descendants for job %ld: failed with error %s\n", (long)parent, ORTE_ERROR_NAME(rc)); + goto FINALIZE; + } + + jptr = peers; + for (j=0; j < npeers; j++) { + fprintf(stderr, "get peers with descendants for job %ld: peer %ld, %ld, %ld\n", 
(long)parent, ORTE_NAME_ARGS(jptr)); + jptr++; + } + if (NULL != peers) free(peers); + while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item); + OBJ_DESTRUCT(&attrs); + + OBJ_CONSTRUCT(&attrs, opal_list_t); + orte_rmgr.add_attribute(&attrs, ORTE_NS_USE_JOBID, ORTE_JOBID, &parent, ORTE_RMGR_ATTR_OVERRIDE); + orte_rmgr.add_attribute(&attrs, ORTE_NS_INCLUDE_CHILDREN, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE); + + if (ORTE_SUCCESS != (rc = orte_ns.get_peers(&peers, &npeers, &attrs))) { + fprintf(stderr, "get peers job with children only: failed with error %s\n", ORTE_ERROR_NAME(rc)); + goto FINALIZE; + } + + jptr = peers; + for (j=0; j < npeers; j++) { + fprintf(stderr, "get peers with children only for job %ld: peer %ld, %ld, %ld\n", (long)parent, ORTE_NAME_ARGS(jptr)); + jptr++; + } + if (NULL != peers) free(peers); + while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item); + OBJ_DESTRUCT(&attrs); + +FINALIZE: + /* finalize and see if memory cleared */ + orte_ns_base_close(); + + orte_proc_info_finalize(); + mca_base_close(); + opal_malloc_finalize(); + opal_output_finalize(); + + fclose( stderr ); + + return(0); +} diff --git a/orte/test/unit/ns/ns_string_fns.c b/orte/test/unit/ns/ns_string_fns.c new file mode 100644 index 0000000000..8e206dd956 --- /dev/null +++ b/orte/test/unit/ns/ns_string_fns.c @@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation. All rights reserved.
+ * Copyright (c) 2004-2005 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation. All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "orte_config.h"
+#include "orte/orte_constants.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "orte/mca/errmgr/errmgr.h"
+
+#include "orte/mca/ns/base/base.h"
+
+/*
+ * Unit test driver for the name service string functions.
+ *
+ * Creates process names directly and from a string, creates a cellid and
+ * reads back its info, then for each of 10 new jobids reserves 5 vpid
+ * ranges and prints the name, vpid, jobid and cellid strings of a process
+ * name built from each range. All output goes to stderr.
+ *
+ * Returns 0 when every call succeeds; exits with status 1 at the first
+ * failing call.
+ */
+int main(int argc, char **argv)
+{
+    orte_process_name_t *test_name;
+    orte_cellid_t cell;
+    orte_jobid_t job;
+    orte_vpid_t vpid;
+    int i, j, rc;
+    char *tmp, *site, *resource;
+
+    if (ORTE_SUCCESS != orte_init(true)) {
+        fprintf(stderr, "failed to start ORTE\n");
+        exit (1);
+    }
+
+    /* create a name */
+    if (ORTE_SUCCESS != (rc = orte_ns.create_process_name(&test_name, 0, 1, 1))) { /* got error */
+        fprintf(stderr, "create process name failed with error %s\n",
+                ORTE_ERROR_NAME(rc));
+        exit(1);
+    } else {
+        fprintf(stderr, "got process name: %ld %ld %ld\n", ORTE_NAME_ARGS(test_name));
+    }
+    free(test_name);
+
+    /* convert a string to a name */
+    tmp = strdup("1234.5678.0010");
+    if (ORTE_SUCCESS != (rc = orte_ns.convert_string_to_process_name(&test_name, tmp))) { /* got error */
+        fprintf(stderr, "convert string to process name failed with error %s\n",
+                ORTE_ERROR_NAME(rc));
+        exit(1);
+    } else {
+        fprintf(stderr, "got process name: %ld %ld %ld\n", ORTE_NAME_ARGS(test_name));
+    }
+    free(tmp);
+    free(test_name);
+
+    /* create a cellid */
+    if (ORTE_SUCCESS != (rc = orte_ns.create_cellid(&cell, "dummy-site", "dummy-resource"))) { /* got error */
+        fprintf(stderr, "create cellid: error with error %s\n", ORTE_ERROR_NAME(rc));
+        exit(1);
+    } else {
+        fprintf(stderr, "cellid created: %lu\n", (unsigned long) cell);
+    }
+
+    /* get cellid info
+     * NOTE(review): site and resource are never released in this test -
+     * confirm whether get_cell_info returns strings the caller must free */
+    if (ORTE_SUCCESS != (rc = orte_ns.get_cell_info(cell, &site, &resource))) { /* got error */
+        fprintf(stderr, "get_cell_info: error with error %s\n", ORTE_ERROR_NAME(rc));
+        exit(1);
+    } else {
+        fprintf(stderr, "get_cell_info: %lu %s %s\n", (unsigned long) cell, site, resource);
+    }
+
+    for (i=0; i<10; i++) { /* loop through */
+        /* create jobid */
+        if (ORTE_SUCCESS != (rc = orte_ns.create_jobid(&job, NULL))) { /* got error */
+            fprintf(stderr, "create jobid: error with error %s\n", ORTE_ERROR_NAME(rc));
+            exit(1);
+        } else {
+            fprintf(stderr, "jobid created: %lu\n", (unsigned long) job);
+        }
+
+        for (j=0; j<5; j++) { /* loop through several vpid ranges */
+            /* get range of vpids */
+            if (ORTE_SUCCESS != (rc = orte_ns.reserve_range(job, 250, &vpid))) { /* got error */
+                fprintf(stderr, "reserve range: error with error %s\n",
+                        ORTE_ERROR_NAME(rc));
+                exit(1);
+            } else {
+                fprintf(stderr, "range reserved: %lu\n",
+                        (unsigned long) vpid);
+            }
+
+            /* create a name */
+            if (ORTE_SUCCESS != (rc = orte_ns.create_process_name(&test_name, (orte_cellid_t)i,
+                                                                  job, vpid))) {
+                fprintf(stderr, "test_ns_replica: failed to create proc name after vpid range with error %s\n",
+                        ORTE_ERROR_NAME(rc));
+                exit(1);
+            }
+
+            /* get and print its string values */
+            if (ORTE_SUCCESS != (rc = orte_ns.get_proc_name_string(&tmp, test_name))) {
+                fprintf(stderr, "test_ns_replica: failed to get proc_name_string with error %s\n",
+                        ORTE_ERROR_NAME(rc));
+                exit(1);
+            } else {
+                fprintf(stderr, "(%d) strings: name - %s\n", i, tmp);
+            }
+            free(tmp);
+            if (ORTE_SUCCESS != (rc = orte_ns.get_vpid_string(&tmp, test_name))) {
+                fprintf(stderr, "test_ns_replica: failed to get vpid_string with error %s\n",
+                        ORTE_ERROR_NAME(rc));
+                exit(1);
+            } else {
+                fprintf(stderr, "(%d) strings: vpid - %s\n", i, tmp);
+            }
+            free(tmp);
+            if (ORTE_SUCCESS != (rc = orte_ns.get_jobid_string(&tmp, test_name))) {
+                fprintf(stderr, "test_ns_replica: failed to get jobid_string with error %s\n",
+                        ORTE_ERROR_NAME(rc));
+                exit(1);
+            } else {
+                fprintf(stderr, "(%d) strings: jobid - %s\n", i, tmp);
+            }
+            free(tmp);
+            if (ORTE_SUCCESS != (rc = orte_ns.get_cellid_string(&tmp, test_name))) {
+                fprintf(stderr, "test_ns_replica: failed to get cellid_string with error %s\n",
+                        ORTE_ERROR_NAME(rc));
+                exit(1);
+            } else {
+                fprintf(stderr, "(%d) strings: cellid - %s\n", i, tmp);
+            }
+            free(tmp);
+
+            /* release this pass's name the same way the names created
+             * earlier in this test are released, so one is not leaked
+             * for every reserved range */
+            free(test_name);
+        }
+    }
+
+    /* finalize and see if memory cleared */
+    orte_ns_base_close();
+
+    orte_proc_info_finalize();
+    mca_base_close();
+    opal_malloc_finalize();
+    opal_output_finalize();
+
+    fclose( stderr );
+
+    return(0);
+}
diff --git a/orte/tools/orted/orted.c b/orte/tools/orted/orted.c index 347aadfe95..2105207d79 100644 --- a/orte/tools/orted/orted.c +++ b/orte/tools/orted/orted.c @@ -379,12 +379,12 @@ int main(int argc, char *argv[]) OBJ_CONSTRUCT(&orted_globals.condition, opal_condition_t); /* register the daemon main receive functions */ - ret = orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, ORTE_RML_TAG_PLS_ORTED, ORTE_RML_NON_PERSISTENT, orte_daemon_recv_pls, NULL); + ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED, ORTE_RML_NON_PERSISTENT, orte_daemon_recv_pls, NULL); if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) { ORTE_ERROR_LOG(ret); return ret; } - ret = orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, ORTE_RML_TAG_DAEMON, ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL); + ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON, ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL); if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) { ORTE_ERROR_LOG(ret); return ret; @@ -693,7 +693,7 @@ static void orte_daemon_recv_pls(int status, orte_process_name_t* sender, OPAL_THREAD_UNLOCK(&orted_globals.mutex); /* reissue the non-blocking receive */ - ret = orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, ORTE_RML_TAG_PLS_ORTED, ORTE_RML_NON_PERSISTENT, orte_daemon_recv_pls, NULL); + ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED, ORTE_RML_NON_PERSISTENT, orte_daemon_recv_pls, NULL); if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) { ORTE_ERROR_LOG(ret); } @@ -789,7 +789,7 @@ DONE: OPAL_THREAD_UNLOCK(&orted_globals.mutex); /* reissue the non-blocking receive */ - ret = orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, ORTE_RML_TAG_DAEMON, ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL); + ret =
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON, ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL); if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) { ORTE_ERROR_LOG(ret); } diff --git a/orte/tools/orteprobe/orteprobe.c b/orte/tools/orteprobe/orteprobe.c index 09576826c6..920af1bf92 100644 --- a/orte/tools/orteprobe/orteprobe.c +++ b/orte/tools/orteprobe/orteprobe.c @@ -175,7 +175,7 @@ int main(int argc, char *argv[]) * Attempt to parse the probe's name and save in proc_info */ if (orteprobe_globals.name_string) { - ret = orte_ns_base_convert_string_to_process_name( + ret = orte_ns.convert_string_to_process_name( &orte_process_info.my_name, orteprobe_globals.name_string); if(ORTE_SUCCESS != ret) { fprintf(stderr, "orteprobe: Couldn't convert environmental string to probe's process name\n"); diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index b031f802c2..3adde8e905 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -301,6 +301,7 @@ int orterun(int argc, char *argv[]) orte_proc_state_t cb_states; orte_job_state_t exit_state; opal_list_t attributes; + opal_list_item_t *item; /* Setup MCA params */ @@ -458,6 +459,9 @@ int orterun(int argc, char *argv[]) cb_states = ORTE_PROC_STATE_TERMINATED | ORTE_PROC_STATE_AT_STG1; rc = orte_rmgr.spawn_job(apps, num_apps, &jobid, 0, NULL, job_state_callback, cb_states, &attributes); + while (NULL != (item = opal_list_remove_first(&attributes))) OBJ_RELEASE(item); + OBJ_DESTRUCT(&attributes); + if (ORTE_SUCCESS != rc) { /* JMS show_help */ opal_output(0, "%s: spawn failed with errno=%d\n", orterun_basename, rc); @@ -505,11 +509,17 @@ int orterun(int argc, char *argv[]) /* the job is complete - now tell the orteds that it is * okay to finalize and exit, we are done with them + * be sure to include any descendants so nothing is + * left hanging */ - if (ORTE_SUCCESS != (ret = orte_pls.terminate_orteds(jobid))) { + OBJ_CONSTRUCT(&attributes, opal_list_t); 
+ orte_rmgr.add_attribute(&attributes, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE); + if (ORTE_SUCCESS != (ret = orte_pls.terminate_orteds(jobid, &attributes))) { opal_show_help("help-orterun.txt", "orterun:daemon-die", false, orterun_basename, NULL, NULL, ret); } + while (NULL != (item = opal_list_remove_first(&attributes))) OBJ_RELEASE(item); + OBJ_DESTRUCT(&attributes); OPAL_THREAD_UNLOCK(&orterun_globals.lock); } @@ -744,6 +754,8 @@ static void abort_signal_callback(int fd, short flags, void *arg) int ret; struct timeval tv = { 1, 0 }; opal_event_t* event; + opal_list_t attrs; + opal_list_item_t *item; static int signalled = 0; @@ -757,10 +769,15 @@ static void abort_signal_callback(int fd, short flags, void *arg) } /* terminate the job - this will also wakeup orterun so - * it can kill all the orteds + * it can kill all the orteds. Be sure to kill all the job's + * descendants, if any, so nothing is left hanging */ if (jobid != ORTE_JOBID_INVALID) { - ret = orte_pls.terminate_job(jobid); + OBJ_CONSTRUCT(&attrs, opal_list_t); + orte_rmgr.add_attribute(&attrs, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE); + ret = orte_pls.terminate_job(jobid, &attrs); + while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item); + OBJ_DESTRUCT(&attrs); if (ORTE_SUCCESS != ret) { jobid = ORTE_JOBID_INVALID; } @@ -787,6 +804,8 @@ static void signal_forward_callback(int fd, short event, void *arg) { struct opal_event *signal = (struct opal_event*)arg; int signum, ret; + opal_list_t attrs; + opal_list_item_t *item; OPAL_TRACE(1); @@ -796,11 +815,15 @@ static void signal_forward_callback(int fd, short event, void *arg) orterun_basename, signum); } - /** send the signal out to the processes */ - if (ORTE_SUCCESS != (ret = orte_pls.signal_job(jobid, signum))) { + /** send the signal out to the processes, including any descendants */ + OBJ_CONSTRUCT(&attrs, opal_list_t); + orte_rmgr.add_attribute(&attrs, 
ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE); + if (ORTE_SUCCESS != (ret = orte_pls.signal_job(jobid, signum, &attrs))) { fprintf(stderr, "Signal %d could not be sent to the job (returned %d)", signum, ret); } + while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item); + OBJ_DESTRUCT(&attrs); }