From 96c778656ad6376d45ab0961d8f1422381440ead Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Fri, 15 Jun 2012 10:15:07 +0000 Subject: [PATCH] Improve launch performance on clusters that use dedicated nodes by instructing the orteds to use the same port as the HNP, thus allowing them to "rollup" their initial callback via the routed network. This substantially reduces the HNP bottleneck and the number of ports opened by the HNP. Restore enable-static-ports option by default - the Cray will have to disable it to get around their library issues, but that's just a warning problem as opposed to blocking the build. This commit was SVN r26606. --- orte/config/orte_configure_options.m4 | 10 +- orte/mca/ess/base/ess_base_std_orted.c | 33 +- orte/mca/ess/slurm/ess_slurm_module.c | 15 +- orte/mca/grpcomm/base/Makefile.am | 5 +- orte/mca/grpcomm/base/base.h | 3 + orte/mca/grpcomm/base/grpcomm_base_receive.c | 10 + orte/mca/grpcomm/base/grpcomm_base_rollup.c | 127 +++++++ orte/mca/grpcomm/grpcomm.h | 1 - orte/mca/oob/tcp/oob_tcp.c | 15 +- orte/mca/plm/base/plm_base_launch_support.c | 345 ++++++++++--------- orte/mca/rml/rml_types.h | 1 + orte/mca/routed/debruijn/routed_debruijn.c | 6 +- orte/orted/orted_main.c | 48 ++- orte/runtime/orte_globals.c | 1 + orte/runtime/orte_globals.h | 1 + orte/runtime/orte_mca_params.c | 5 + 16 files changed, 417 insertions(+), 209 deletions(-) create mode 100644 orte/mca/grpcomm/base/grpcomm_base_rollup.c diff --git a/orte/config/orte_configure_options.m4 b/orte/config/orte_configure_options.m4 index 4387f8cc08..7f950e5a3e 100644 --- a/orte/config/orte_configure_options.m4 +++ b/orte/config/orte_configure_options.m4 @@ -137,13 +137,13 @@ AC_DEFINE_UNQUOTED([ORTE_ENABLE_PROGRESS_THREADS], AC_MSG_CHECKING([if want orte static ports]) AC_ARG_ENABLE([orte-static-ports], [AC_HELP_STRING([--enable-orte-static-ports], - [Enable orte static ports for tcp oob. (default: disabled)])]) -if test "$enable_orte_static_ports" = "yes"; then - AC_MSG_RESULT([yes]) - orte_enable_static_ports=1 -else + [Enable orte static ports for tcp oob. (default: enabled)])]) +if test "$enable_orte_static_ports" = "no"; then AC_MSG_RESULT([no]) orte_enable_static_ports=0 +else + AC_MSG_RESULT([yes]) + orte_enable_static_ports=1 fi AC_DEFINE_UNQUOTED([ORTE_ENABLE_STATIC_PORTS], [$orte_enable_static_ports], diff --git a/orte/mca/ess/base/ess_base_std_orted.c b/orte/mca/ess/base/ess_base_std_orted.c index d86bdd49fe..d9a971bc8c 100644 --- a/orte/mca/ess/base/ess_base_std_orted.c +++ b/orte/mca/ess/base/ess_base_std_orted.c @@ -304,13 +304,18 @@ int orte_ess_base_orted_setup(char **hosts) goto error; } #if ORTE_ENABLE_STATIC_PORTS - /* if we are using static ports, then we need to setup + /* if we are using static ports or a common port, then we need to setup * the daemon info so the RML can function properly * without requiring a wireup stage. This must be done * after we enable_comm as that function determines our * own port, which we need in order to construct the nidmap */ - if (orte_static_ports) { + if (orte_static_ports || orte_use_common_port) { + /* define the routing tree so we know the pattern + * if we are trying to setup common or static ports + */ + orte_routed.update_routing_plan(); + if (ORTE_SUCCESS != (ret = orte_util_setup_local_nidmap_entries())) { ORTE_ERROR_LOG(ret); error = "orte_util_nidmap_init"; @@ -364,8 +369,8 @@ int orte_ess_base_orted_setup(char **hosts) goto error; } /* Once the session directory location has been established, set - the opal_output env file location to be in the - proc-specific session directory. */ + the opal_output env file location to be in the + proc-specific session directory. */ opal_output_set_output_file_info(orte_process_info.proc_session_dir, "output-", NULL, NULL); @@ -411,9 +416,9 @@ int orte_ess_base_orted_setup(char **hosts) /* setup the global job and node arrays */ orte_job_data = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data, - 1, - ORTE_GLOBAL_ARRAY_MAX_SIZE, - 1))) { + 1, + ORTE_GLOBAL_ARRAY_MAX_SIZE, + 1))) { ORTE_ERROR_LOG(ret); error = "setup job array"; goto error; @@ -421,18 +426,18 @@ int orte_ess_base_orted_setup(char **hosts) orte_node_pool = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool, - ORTE_GLOBAL_ARRAY_BLOCK_SIZE, - ORTE_GLOBAL_ARRAY_MAX_SIZE, - ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) { + ORTE_GLOBAL_ARRAY_BLOCK_SIZE, + ORTE_GLOBAL_ARRAY_MAX_SIZE, + ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) { ORTE_ERROR_LOG(ret); error = "setup node array"; goto error; } orte_node_topologies = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies, - ORTE_GLOBAL_ARRAY_BLOCK_SIZE, - ORTE_GLOBAL_ARRAY_MAX_SIZE, - ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) { + ORTE_GLOBAL_ARRAY_BLOCK_SIZE, + ORTE_GLOBAL_ARRAY_MAX_SIZE, + ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) { ORTE_ERROR_LOG(ret); error = "setup node topologies array"; goto error; @@ -580,7 +585,7 @@ int orte_ess_base_orted_setup(char **hosts) return ORTE_SUCCESS; -error: + error: orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); diff --git a/orte/mca/ess/slurm/ess_slurm_module.c b/orte/mca/ess/slurm/ess_slurm_module.c index 74c242a71c..4960df9272 100644 --- a/orte/mca/ess/slurm/ess_slurm_module.c +++ b/orte/mca/ess/slurm/ess_slurm_module.c @@ -72,6 +72,7 @@ static int rte_init(void) { int ret; char *error = NULL; + char **hosts = NULL; /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { @@ -86,11 +87,23 @@ static int rte_init(void) * default procedure */ if (ORTE_PROC_IS_DAEMON) { - if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(NULL))) { + if (NULL != orte_node_regex) { + /* extract the nodes */ + if (ORTE_SUCCESS != (ret = + orte_regex_extract_node_names(orte_node_regex, &hosts)) || + NULL == hosts) { + error = "orte_regex_extract_node_names"; + goto error; + } + } + if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_orted_setup"; goto error; } + if (NULL != hosts) { + opal_argv_free(hosts); + } return ORTE_SUCCESS; } diff --git a/orte/mca/grpcomm/base/Makefile.am b/orte/mca/grpcomm/base/Makefile.am index 185558af83..e6e98e5166 100644 --- a/orte/mca/grpcomm/base/Makefile.am +++ b/orte/mca/grpcomm/base/Makefile.am @@ -9,7 +9,7 @@ # University of Stuttgart. All rights reserved. # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. -# Copyright (c) 2011 Los Alamos National Security, LLC. +# Copyright (c) 2011-2012 Los Alamos National Security, LLC. # All rights reserved. # $COPYRIGHT$ # @@ -31,5 +31,6 @@ if !ORTE_DISABLE_FULL_SUPPORT libmca_grpcomm_la_SOURCES += \ base/grpcomm_base_modex.c \ base/grpcomm_base_receive.c \ - base/grpcomm_base_xcast.c + base/grpcomm_base_xcast.c \ + base/grpcomm_base_rollup.c endif diff --git a/orte/mca/grpcomm/base/base.h b/orte/mca/grpcomm/base/base.h index 4ca7d47344..35dd0f0284 100644 --- a/orte/mca/grpcomm/base/base.h +++ b/orte/mca/grpcomm/base/base.h @@ -74,6 +74,9 @@ ORTE_DECLSPEC orte_grpcomm_coll_id_t orte_grpcomm_base_get_coll_id(void); ORTE_DECLSPEC void orte_grpcomm_base_pack_collective(opal_buffer_t *relay, orte_grpcomm_collective_t *coll, orte_grpcomm_internal_stage_t stg); +ORTE_DECLSPEC void orte_grpcomm_base_rollup_recv(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata); /* modex support */ ORTE_DECLSPEC int orte_grpcomm_base_set_proc_attr(const char* project, diff --git a/orte/mca/grpcomm/base/grpcomm_base_receive.c b/orte/mca/grpcomm/base/grpcomm_base_receive.c index 41d917e44d..b1fd8ecb55 100644 --- a/orte/mca/grpcomm/base/grpcomm_base_receive.c +++ b/orte/mca/grpcomm/base/grpcomm_base_receive.c @@ -93,6 +93,16 @@ int orte_grpcomm_base_comm_start(void) recv_issued = false; return rc; } + if (ORTE_PROC_IS_DAEMON) { + if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, + ORTE_RML_TAG_ROLLUP, + ORTE_RML_PERSISTENT, + orte_grpcomm_base_rollup_recv, NULL))) { + ORTE_ERROR_LOG(rc); + recv_issued = false; + return rc; + } + } if (ORTE_PROC_IS_HNP) { if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_COLL_ID_REQ, diff --git a/orte/mca/grpcomm/base/grpcomm_base_rollup.c b/orte/mca/grpcomm/base/grpcomm_base_rollup.c new file mode 100644 index 0000000000..77d9e31136 --- /dev/null +++ b/orte/mca/grpcomm/base/grpcomm_base_rollup.c @@ -0,0 +1,127 @@ +/* -*- C -*- + * + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** @file: + * + */ + +/* + * includes + */ +#include "orte_config.h" + + +#include "opal/dss/dss.h" + +#include "orte/util/proc_info.h" +#include "orte/util/error_strings.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/ess/ess.h" +#include "orte/mca/odls/base/base.h" +#include "orte/mca/rml/rml.h" +#include "orte/mca/routed/routed.h" +#include "orte/util/name_fns.h" +#include "orte/runtime/orte_globals.h" + +#include "orte/mca/grpcomm/grpcomm_types.h" +#include "orte/mca/grpcomm/grpcomm.h" +#include "orte/mca/grpcomm/base/base.h" + +/* function to cleanup collective once completed */ +static void rml_send_callback(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) +{ + orte_grpcomm_collective_t *coll; + + /* remove this collective from our list */ + coll = (orte_grpcomm_collective_t*)opal_list_remove_first(&orte_grpcomm_base.active_colls); + + /* release it */ + OBJ_RELEASE(coll); + + /* release our buffer */ + OBJ_RELEASE(buffer); +} + +void orte_grpcomm_base_rollup_recv(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) +{ + int ret; + orte_grpcomm_collective_t *coll; + bool done = false; + opal_buffer_t *relay; + + OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, + "%s grpcomm:rollup:recv from sender %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(sender))); + + /* lookup the collective - can only be one on the list at this time */ + coll = (orte_grpcomm_collective_t*)opal_list_get_first(&orte_grpcomm_base.active_colls); + + /* flag that we received a bucket */ + if (sender->vpid != ORTE_PROC_MY_NAME->vpid) { + coll->num_peer_buckets++; + } + + /* transfer the data */ + opal_dss.copy_payload(&coll->buffer, buffer); + + /* if list is empty, then we can just send our data along */ + if (opal_list_is_empty(&coll->targets)) { + done = true; + } else if (coll->num_peer_buckets == opal_list_get_size(&coll->targets)) { + done = true; + } else { + /* check for a wildcard */ + orte_namelist_t *nm; + nm = (orte_namelist_t*)opal_list_get_first(&coll->targets); + if (ORTE_VPID_WILDCARD == nm->name.vpid && + coll->num_peer_buckets == orte_process_info.num_procs) { + done = true; + } + } + + if (done) { + /* send the message to my parent */ + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s grpcomm:rollup: sending rollup msg to %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(ORTE_PROC_MY_PARENT))); + relay = OBJ_NEW(opal_buffer_t); + opal_dss.copy_payload(relay, &coll->buffer); + /* if my parent is the HNP, send it to the final destination */ + if (ORTE_PROC_MY_PARENT->vpid == ORTE_PROC_MY_HNP->vpid) { + if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, relay, + ORTE_RML_TAG_ORTED_CALLBACK, 0, + rml_send_callback, NULL))) { + ORTE_ERROR_LOG(ret); + } + } else { + if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_PARENT, relay, + ORTE_RML_TAG_ROLLUP, 0, + rml_send_callback, NULL))) { + ORTE_ERROR_LOG(ret); + } + } + } +} diff --git a/orte/mca/grpcomm/grpcomm.h b/orte/mca/grpcomm/grpcomm.h index 15f6978958..2b8e10f386 100644 --- a/orte/mca/grpcomm/grpcomm.h +++ b/orte/mca/grpcomm/grpcomm.h @@ -70,7 +70,6 @@ typedef int (*orte_grpcomm_base_module_allgather_fn_t)(orte_grpcomm_collective_t /* barrier function */ typedef int (*orte_grpcomm_base_module_barrier_fn_t)(orte_grpcomm_collective_t *coll); - /** DATA EXCHANGE FUNCTIONS - SEE ompi/runtime/ompi_module_exchange.h FOR A DESCRIPTION * OF HOW THIS ALL WORKS */ diff --git a/orte/mca/oob/tcp/oob_tcp.c b/orte/mca/oob/tcp/oob_tcp.c index 71c9c69e52..0706586134 100644 --- a/orte/mca/oob/tcp/oob_tcp.c +++ b/orte/mca/oob/tcp/oob_tcp.c @@ -651,7 +651,15 @@ mca_oob_tcp_create_listen(int *target_sd, unsigned short *target_port, uint16_t port in the range. Otherwise, tcp_port_min will be 0, which means "pick any port" */ if (AF_INET == af_family) { - if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) { + if (ORTE_PROC_IS_DAEMON && orte_use_common_port) { + /* use the same port as the HNP */ + char *ptr, *portptr; + portptr = strdup(orte_process_info.my_hnp_uri); + ptr = strrchr(portptr, ':'); + ptr++; + opal_argv_append_nosize(&ports, ptr); + free(portptr); + } else if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) { if (NULL != mca_oob_tcp_component.tcp4_static_ports) { /* if static ports were provided, the daemon takes the * first entry in the list @@ -1647,10 +1655,8 @@ int mca_oob_tcp_resolve(mca_oob_tcp_peer_t* peer) * if we are trying to talk to a process on our own node, try * looking for the loopback interface before giving up */ -#if OPAL_WANT_IPV6 goto unlock; } -#else if (0 == strcasecmp(host, orte_process_info.nodename) || 0 == strncasecmp(host, orte_process_info.nodename, strlen(host)) || opal_ifislocal(host)) { @@ -1677,7 +1683,6 @@ int mca_oob_tcp_resolve(mca_oob_tcp_peer_t* peer) haddr = inet_ntoa(*(struct in_addr*)h->h_addr_list[0]); } proceed: -#endif /* we can't know which af_family we are using, so for now, let's * just look to see which static port family was provided */ @@ -1738,7 +1743,7 @@ int mca_oob_tcp_resolve(mca_oob_tcp_peer_t* peer) } } -unlock: + unlock: #endif return rc; } diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index 2c72c666f1..1a4c2af5be 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -518,141 +518,149 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender, char *nodename; orte_node_t *node; orte_job_t *jdata; + orte_process_name_t dname; /* get the daemon job, if necessary */ if (NULL == jdatorted) { jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); } - /* unpack its contact info */ + /* multiple daemons could be in this buffer, so unpack until we exhaust the data */ idx = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &rml_uri, &idx, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - orted_failed_launch = true; - goto CLEANUP; - } - - /* set the contact info into the hash table */ - if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(rml_uri))) { - ORTE_ERROR_LOG(rc); - orted_failed_launch = true; - goto CLEANUP; - } - - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:orted_report_launch from daemon %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(sender))); - - /* update state and record for this daemon contact info */ - if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(jdatorted->procs, sender->vpid))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - orted_failed_launch = true; - goto CLEANUP; - } - daemon->state = ORTE_PROC_STATE_RUNNING; - daemon->rml_uri = rml_uri; - - /* unpack the node name */ - idx = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &nodename, &idx, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - orted_failed_launch = true; - goto CLEANUP; - } - - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:orted_report_launch from daemon %s on node %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(sender), nodename)); - - /* look this node up, if necessary */ - if (!orte_plm_globals.daemon_nodes_assigned_at_launch) { - if (!orte_have_fqdn_allocation) { - /* remove any domain info */ - if (NULL != (ptr = strchr(nodename, '.'))) { - *ptr = '\0'; - ptr = strdup(nodename); - free(nodename); - nodename = ptr; - } - } - if (orte_plm_globals.strip_prefix_from_node_names) { - /* remove all leading characters and zeroes */ - ptr = nodename; - while (idx < (int)strlen(nodename) && - (isalpha(nodename[idx]) || '0' == nodename[idx])) { - idx++; - } - if (idx == (int)strlen(nodename)) { - ptr = strdup(nodename); - } else { - ptr = strdup(&nodename[idx]); - } - free(nodename); - nodename = ptr; - } - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:orted_report_launch attempting to assign daemon %s to node %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(sender), nodename)); - for (idx=0; idx < orte_node_pool->size; idx++) { - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, idx))) { - continue; - } - if (node->location_verified) { - /* already assigned */ - continue; - } - if (0 == strcmp(nodename, node->name)) { - /* flag that we verified the location */ - node->location_verified = true; - if (node == daemon->node) { - /* it wound up right where it should */ - break; - } - /* remove the prior association */ - if (NULL != daemon->node) { - OBJ_RELEASE(daemon->node); - } - if (NULL != node->daemon) { - OBJ_RELEASE(node->daemon); - } - /* associate this daemon with the node */ - node->daemon = daemon; - OBJ_RETAIN(daemon); - /* associate this node with the daemon */ - daemon->node = node; - daemon->nodename = node->name; - OBJ_RETAIN(node); - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:orted_report_launch assigning daemon %s to node %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&daemon->name), node->name)); - break; - } - } - } - -#if OPAL_HAVE_HWLOC - /* store the local resources for that node */ - { - hwloc_topology_t topo, t; - int i; - bool found; - - idx=1; - node = daemon->node; - if (NULL == node) { - /* this shouldn't happen - it indicates an error in the - * prior node matching logic, so report it and error out - */ - orte_show_help("help-plm-base.txt", "daemon-no-assigned-node", true, - ORTE_NAME_PRINT(&daemon->name), nodename); + while (OPAL_SUCCESS == (rc = opal_dss.unpack(buffer, &dname, &idx, ORTE_NAME))) { + /* unpack its contact info */ + idx = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &rml_uri, &idx, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); orted_failed_launch = true; goto CLEANUP; } - if (OPAL_SUCCESS == opal_dss.unpack(buffer, &topo, &idx, OPAL_HWLOC_TOPO)) { + + /* set the contact info into the hash table */ + if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(rml_uri))) { + ORTE_ERROR_LOG(rc); + orted_failed_launch = true; + goto CLEANUP; + } + + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, + "%s plm:base:orted_report_launch from daemon %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&dname))); + + /* update state and record for this daemon contact info */ + if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(jdatorted->procs, dname.vpid))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + orted_failed_launch = true; + goto CLEANUP; + } + daemon->state = ORTE_PROC_STATE_RUNNING; + daemon->rml_uri = rml_uri; + + /* unpack the node name */ + idx = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &nodename, &idx, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + orted_failed_launch = true; + goto CLEANUP; + } + + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, + "%s plm:base:orted_report_launch from daemon %s on node %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&dname), nodename)); + + /* look this node up, if necessary */ + if (!orte_plm_globals.daemon_nodes_assigned_at_launch) { + if (!orte_have_fqdn_allocation) { + /* remove any domain info */ + if (NULL != (ptr = strchr(nodename, '.'))) { + *ptr = '\0'; + ptr = strdup(nodename); + free(nodename); + nodename = ptr; + } + } + if (orte_plm_globals.strip_prefix_from_node_names) { + /* remove all leading characters and zeroes */ + ptr = nodename; + while (idx < (int)strlen(nodename) && + (isalpha(nodename[idx]) || '0' == nodename[idx])) { + idx++; + } + if (idx == (int)strlen(nodename)) { + ptr = strdup(nodename); + } else { + ptr = strdup(&nodename[idx]); + } + free(nodename); + nodename = ptr; + } + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, + "%s plm:base:orted_report_launch attempting to assign daemon %s to node %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&dname), nodename)); + for (idx=0; idx < orte_node_pool->size; idx++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, idx))) { + continue; + } + if (node->location_verified) { + /* already assigned */ + continue; + } + if (0 == strcmp(nodename, node->name)) { + /* flag that we verified the location */ + node->location_verified = true; + if (node == daemon->node) { + /* it wound up right where it should */ + break; + } + /* remove the prior association */ + if (NULL != daemon->node) { + OBJ_RELEASE(daemon->node); + } + if (NULL != node->daemon) { + OBJ_RELEASE(node->daemon); + } + /* associate this daemon with the node */ + node->daemon = daemon; + OBJ_RETAIN(daemon); + /* associate this node with the daemon */ + daemon->node = node; + daemon->nodename = node->name; + OBJ_RETAIN(node); + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, + "%s plm:base:orted_report_launch assigning daemon %s to node %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&daemon->name), node->name)); + break; + } + } + } + +#if OPAL_HAVE_HWLOC + /* store the local resources for that node */ + if (1 == dname.vpid || orte_hetero_nodes) { + hwloc_topology_t topo, t; + int i; + bool found; + + idx=1; + node = daemon->node; + if (NULL == node) { + /* this shouldn't happen - it indicates an error in the + * prior node matching logic, so report it and error out + */ + orte_show_help("help-plm-base.txt", "daemon-no-assigned-node", true, + ORTE_NAME_PRINT(&daemon->name), nodename); + orted_failed_launch = true; + goto CLEANUP; + } + if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &topo, &idx, OPAL_HWLOC_TOPO))) { + ORTE_ERROR_LOG(rc); + orted_failed_launch = true; + goto CLEANUP; + } OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, "%s RECEIVED TOPOLOGY FROM NODE %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodename)); @@ -681,44 +689,48 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender, OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, "%s NEW TOPOLOGY - ADDING", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - + opal_pointer_array_add(orte_node_topologies, topo); node->topology = topo; } } - } #endif - - CLEANUP: - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:orted_report_launch %s for daemon %s at contact %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - orted_failed_launch ? "failed" : "completed", - ORTE_NAME_PRINT(sender), - (NULL == daemon) ? "UNKNOWN" : daemon->rml_uri)); - - if (orted_failed_launch) { - ORTE_ACTIVATE_JOB_STATE(jdatorted, ORTE_JOB_STATE_FAILED_TO_START); - } else { - jdatorted->num_reported++; - if (jdatorted->num_procs == jdatorted->num_reported) { - jdatorted->state = ORTE_JOB_STATE_DAEMONS_REPORTED; - /* activate the daemons_reported state for all jobs - * whose daemons were launched - */ - for (idx=1; idx < orte_job_data->size; idx++) { - if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, idx))) { - continue; - } - if (ORTE_JOB_STATE_DAEMONS_LAUNCHED == jdata->state) { - ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); + + CLEANUP: + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, + "%s plm:base:orted_report_launch %s for daemon %s at contact %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + orted_failed_launch ? "failed" : "completed", + ORTE_NAME_PRINT(&dname), + (NULL == daemon) ? "UNKNOWN" : daemon->rml_uri)); + + if (orted_failed_launch) { + ORTE_ACTIVATE_JOB_STATE(jdatorted, ORTE_JOB_STATE_FAILED_TO_START); + return; + } else { + jdatorted->num_reported++; + if (jdatorted->num_procs == jdatorted->num_reported) { + jdatorted->state = ORTE_JOB_STATE_DAEMONS_REPORTED; + /* activate the daemons_reported state for all jobs + * whose daemons were launched + */ + for (idx=1; idx < orte_job_data->size; idx++) { + if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, idx))) { + continue; + } + if (ORTE_JOB_STATE_DAEMONS_LAUNCHED == jdata->state) { + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); + } } } } + idx = 1; } - - /* if a tree-launch is underway, send the cmd back */ - if (NULL != orte_tree_launch_cmd) { + if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { + ORTE_ERROR_LOG(rc); + ORTE_ACTIVATE_JOB_STATE(jdatorted, ORTE_JOB_STATE_FAILED_TO_START); + } else if (NULL != orte_tree_launch_cmd) { + /* if a tree-launch is underway, send the cmd back */ OBJ_RETAIN(orte_tree_launch_cmd); orte_rml.send_buffer_nb(sender, orte_tree_launch_cmd, ORTE_RML_TAG_DAEMON, 0, @@ -770,24 +782,16 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv, /* check for debug flags */ if (orte_debug_flag) { - opal_argv_append(argc, argv, "-mca"); - opal_argv_append(argc, argv, "orte_debug"); - opal_argv_append(argc, argv, "1"); + opal_argv_append(argc, argv, "--debug"); } if (orte_debug_daemons_flag) { - opal_argv_append(argc, argv, "-mca"); - opal_argv_append(argc, argv, "orte_debug_daemons"); - opal_argv_append(argc, argv, "1"); + opal_argv_append(argc, argv, "--debug-daemons"); } if (orte_debug_daemons_file_flag) { - opal_argv_append(argc, argv, "-mca"); - opal_argv_append(argc, argv, "orte_debug_daemons_file"); - opal_argv_append(argc, argv, "1"); + opal_argv_append(argc, argv, "--debug-daemons-file"); } if (orted_spin_flag) { - opal_argv_append(argc, argv, "-mca"); - opal_argv_append(argc, argv, "orte_daemon_spin"); - opal_argv_append(argc, argv, "1"); + opal_argv_append(argc, argv, "--spin"); } #if OPAL_HAVE_HWLOC if (opal_hwloc_report_bindings) { @@ -880,8 +884,8 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv, opal_argv_append(argc, argv, param); free(param); - /* if given and we have static ports, pass the node list */ - if (orte_static_ports && NULL != nodes) { + /* if given and we have static ports or are using a common port, pass the node list */ + if ((orte_static_ports || orte_use_common_port) && NULL != nodes) { /* convert the nodes to a regex */ if (ORTE_SUCCESS != (rc = orte_regex_create(nodes, ¶m))) { ORTE_ERROR_LOG(rc); @@ -893,6 +897,11 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv, free(param); } + /* if we want to use a common port, tell the daemon to do so */ + if (orte_use_common_port) { + opal_argv_append(argc, argv, "--use-common-port"); + } + /* pass along any cmd line MCA params provided to mpirun, * being sure to "purge" any that would cause problems * on backend nodes diff --git a/orte/mca/rml/rml_types.h b/orte/mca/rml/rml_types.h index 2df4003c43..d76ca21587 100644 --- a/orte/mca/rml/rml_types.h +++ b/orte/mca/rml/rml_types.h @@ -61,6 +61,7 @@ BEGIN_C_DECLS #define ORTE_RML_TAG_WIREUP 8 #define ORTE_RML_TAG_RML_INFO_UPDATE 9 #define ORTE_RML_TAG_ORTED_CALLBACK 10 +#define ORTE_RML_TAG_ROLLUP 11 #define ORTE_RML_TAG_REPORT_REMOTE_LAUNCH 12 #define ORTE_RML_TAG_CKPT 13 diff --git a/orte/mca/routed/debruijn/routed_debruijn.c b/orte/mca/routed/debruijn/routed_debruijn.c index 69916061d3..be8f4702c4 100644 --- a/orte/mca/routed/debruijn/routed_debruijn.c +++ b/orte/mca/routed/debruijn/routed_debruijn.c @@ -385,7 +385,7 @@ static orte_process_name_t get_route(orte_process_name_t *target) /* THIS CAME FROM OUR OWN JOB FAMILY... */ if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target)) { - if (!hnp_direct || orte_static_ports) { + if (!hnp_direct || orte_static_ports || orte_use_common_port) { OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, "%s routing to the HNP through my parent %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -489,8 +489,8 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat) return rc; } - /* if we are using static ports, set my lifeline to point at my parent */ - if (orte_static_ports) { + /* if we are using static ports or a common port, set my lifeline to point at my parent */ + if (orte_static_ports || orte_use_common_port) { lifeline = ORTE_PROC_MY_PARENT; } else { /* set our lifeline to the HNP - we will abort if that connection is lost */ diff --git a/orte/orted/orted_main.c b/orte/orted/orted_main.c index f27a2500a3..4115700f8b 100644 --- a/orte/orted/orted_main.c +++ b/orte/orted/orted_main.c @@ -168,6 +168,10 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = { NULL, OPAL_CMD_LINE_TYPE_STRING, "URI for the parent if tree launch is enabled."}, + { "orte", "use", "common_port", '\0', NULL, "use-common-port", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Use the same port as the HNP."}, + { NULL, NULL, NULL, '\0', NULL, "set-sid", 0, &orted_globals.set_sid, OPAL_CMD_LINE_TYPE_BOOL, "Direct the orted to separate from the current session"}, @@ -685,6 +689,12 @@ int orte_daemon(int argc, char *argv[]) */ buffer = OBJ_NEW(opal_buffer_t); + /* insert our name for rollup purposes */ + if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) { + ORTE_ERROR_LOG(ret); + OBJ_RELEASE(buffer); + goto DONE; + } /* for now, always include our contact info, even if we are using * static ports. Eventually, this will be removed */ @@ -708,16 +718,34 @@ int orte_daemon(int argc, char *argv[]) } #endif - /* send to the HNP's callback - this will flow up the routing - * tree if static ports are enabled - */ - if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buffer, - ORTE_RML_TAG_ORTED_CALLBACK, 0, - rml_cbfunc, NULL))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(buffer); - goto DONE; - } + if (orte_static_ports || orte_use_common_port) { + /* use the rollup collective to send our data to the HNP + * so we minimize the HNP bottleneck + */ + orte_grpcomm_collective_t *coll; + coll = OBJ_NEW(orte_grpcomm_collective_t); + /* get the list of contributors we need from the routed module */ + orte_routed.get_routing_list(ORTE_GRPCOMM_COLL_PEERS, coll); + /* add the collective to our list */ + opal_list_append(&orte_grpcomm_base.active_colls, &coll->super); + /* send the buffer to ourselves to start the collective */ + if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_NAME, buffer, + ORTE_RML_TAG_ROLLUP, 0, + rml_cbfunc, NULL))) { + ORTE_ERROR_LOG(ret); + OBJ_RELEASE(buffer); + goto DONE; + } + } else { + /* send directly to the HNP's callback */ + if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buffer, + ORTE_RML_TAG_ORTED_CALLBACK, 0, + rml_cbfunc, NULL))) { + ORTE_ERROR_LOG(ret); + OBJ_RELEASE(buffer); + goto DONE; + } + } } if (orte_debug_daemons_flag) { diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index bc467fec46..f89d8eaa09 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -74,6 +74,7 @@ char *orte_basename = NULL; bool orte_static_ports = false; char *orte_oob_static_ports = NULL; bool orte_standalone_operation = false; +bool orte_use_common_port = false; bool orte_keep_fqdn_hostnames = false; bool orte_have_fqdn_allocation = false; diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index ade2a3c3fc..e594e86b6a 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -610,6 +610,7 @@ ORTE_DECLSPEC extern char *orte_basename; ORTE_DECLSPEC extern bool orte_static_ports; ORTE_DECLSPEC extern char *orte_oob_static_ports; ORTE_DECLSPEC extern bool orte_standalone_operation; +ORTE_DECLSPEC extern bool orte_use_common_port; ORTE_DECLSPEC extern bool orte_keep_fqdn_hostnames; ORTE_DECLSPEC extern bool orte_have_fqdn_allocation; diff --git a/orte/runtime/orte_mca_params.c b/orte/runtime/orte_mca_params.c index d69a8c49ad..66154ba57a 100644 --- a/orte/runtime/orte_mca_params.c +++ b/orte/runtime/orte_mca_params.c @@ -547,6 +547,11 @@ int orte_register_params(void) "Maximum size of virtual machine - used to subdivide allocation", false, false, -1, &orte_max_vm_size); + mca_base_param_reg_int_name("orte", "use_common_port", + "Daemons use same port as HNP", + false, false, (int)false, &value); + orte_use_common_port = OPAL_INT_TO_BOOL(value); + #endif /* ORTE_DISABLE_FULL_SUPPORT */ return ORTE_SUCCESS;