From 641e17f26c2dd16fd846403c6a69026ed0eb67f0 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Thu, 1 Dec 2011 14:24:43 +0000 Subject: [PATCH] A better way of handling fqdn allocations. Prior method was wrong as it equated "node1" with "node10", which definitely caused problems. Detect the addition of fqdn nodes in the allocation. If not found, then strip all incoming hostnames from daemons of any domain info when matching those names against the names in the node pool. Leave some protection and "live" diagnostic output in place so we can continue to detect problems across all environments. This commit was SVN r25557. --- orte/mca/plm/base/plm_base_launch_support.c | 41 ++++----------------- orte/mca/ras/base/ras_base_node.c | 15 ++++++++ orte/runtime/orte_globals.c | 1 + orte/runtime/orte_globals.h | 1 + 4 files changed, 25 insertions(+), 33 deletions(-) diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index cfa9bb9deb..a5ab6d892c 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -336,7 +336,7 @@ static void process_orted_launch_report(int fd, short event, void *data) orte_message_event_t *mev = (orte_message_event_t*)data; opal_buffer_t *buffer = mev->buffer; orte_process_name_t peer; - char *rml_uri = NULL; + char *rml_uri = NULL, *ptr; int rc, idx; struct timeval recvtime; long secs, usecs; @@ -344,7 +344,6 @@ static void process_orted_launch_report(int fd, short event, void *data) int64_t startsec, startusec; orte_proc_t *daemon=NULL; char *nodename; - size_t len, len2; orte_node_t *node; /* see if we need to timestamp this receipt */ @@ -467,30 +466,19 @@ static void process_orted_launch_report(int fd, short event, void *data) /* look this node up, if necessary */ if (!orte_plm_globals.daemon_nodes_assigned_at_launch) { - if (NULL == nodename) { - /* it is permissible to transmit a NULL string, but - * that would be a problem here - */ - opal_output(0, "%s NULL nodename returned by daemon %s - cannot process", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&daemon->name)); - rc = ORTE_ERR_FATAL; - orted_failed_launch = true; - goto CLEANUP; + if (!orte_have_fqdn_allocation) { + /* remove any domain info */ + if (NULL != (ptr = strchr(nodename, '.'))) { + *ptr = '\0'; + } } - len = strlen(nodename); for (idx=0; idx < orte_node_pool->size; idx++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, idx))) { continue; } if (NULL != node->daemon) { /* already known */ - if (len <= strlen(node->name)) { - len2 = len; - } else { - len2 = strlen(node->name); - } - if (0 == strncmp(nodename, node->name, len2)) { + if (0 == strcmp(nodename, node->name)) { /* this shouldn't happen, but protect against it just in case */ opal_output(0, "%s Node %s already has daemon %s assigned to it - assigning daemon %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -513,20 +501,7 @@ static void process_orted_launch_report(int fd, short event, void *data) } continue; } - if (NULL == node->name) { - /* this shouldn't happen */ - opal_output(0, "%s NULL nodename detected during daemon callback - cannot process", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - rc = ORTE_ERR_FATAL; - orted_failed_launch = true; - goto CLEANUP; - } - if (len <= strlen(node->name)) { - len2 = len; - } else { - len2 = strlen(node->name); - } - if (0 == strncmp(nodename, node->name, len2)) { + if (0 == strcmp(nodename, node->name)) { /* associate this daemon with the node */ node->daemon = daemon; OBJ_RETAIN(daemon); diff --git a/orte/mca/ras/base/ras_base_node.c b/orte/mca/ras/base/ras_base_node.c index 3505039f0f..5afb4a5e19 100644 --- a/orte/mca/ras/base/ras_base_node.c +++ b/orte/mca/ras/base/ras_base_node.c @@ -65,6 +65,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata) orte_std_cntr_t num_nodes; int rc, i; orte_node_t *node, *hnp_node; + char *ptr; /* get the number of nodes */ num_nodes = (orte_std_cntr_t)opal_list_get_size(nodes); @@ -154,6 +155,20 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata) } /* update the total slots in the job */ jdata->total_slots_alloc += node->slots; + /* check if we have fqdn names in the allocation */ + if (NULL != strchr(node->name, '.')) { + orte_have_fqdn_allocation = true; + } + } + } + + /* if we didn't find any fqdn names in the allocation, then + * ensure we don't have any domain info in the node record + * for the hnp + */ + if (!orte_have_fqdn_allocation) { + if (NULL != (ptr = strchr(hnp_node->name, '.'))) { + *ptr = '\0'; } } diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index 396cdac9e0..302c7fd874 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -67,6 +67,7 @@ char *orte_oob_static_ports = NULL; bool orte_standalone_operation = false; bool orte_keep_fqdn_hostnames = false; +bool orte_have_fqdn_allocation = false; bool orte_show_resolved_nodenames; int orted_debug_failure; int orted_debug_failure_delay; diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index 3466fdd6f7..6144d66ff9 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -535,6 +535,7 @@ ORTE_DECLSPEC extern char *orte_oob_static_ports; ORTE_DECLSPEC extern bool orte_standalone_operation; ORTE_DECLSPEC extern bool orte_keep_fqdn_hostnames; +ORTE_DECLSPEC extern bool orte_have_fqdn_allocation; ORTE_DECLSPEC extern bool orte_show_resolved_nodenames; ORTE_DECLSPEC extern int orted_debug_failure; ORTE_DECLSPEC extern int orted_debug_failure_delay;