1
1

A better way of handling FQDN allocations. The prior method was wrong: it compared names only over the length of the shorter one, so it equated "node1" with "node10", which definitely caused problems.

Detect whether any FQDN node names are present in the allocation. If none are found, strip any domain info from the hostnames reported by daemons before matching those names against the names in the node pool.

Leave some protection and "live" diagnostic output in place so we can continue to detect problems across all environments.

This commit was SVN r25557.
Этот коммит содержится в:
Ralph Castain 2011-12-01 14:24:43 +00:00
родитель 512aea79bc
Коммит 641e17f26c
4 изменённых файлов: 25 добавлений и 33 удалений

Просмотреть файл

@ -336,7 +336,7 @@ static void process_orted_launch_report(int fd, short event, void *data)
orte_message_event_t *mev = (orte_message_event_t*)data; orte_message_event_t *mev = (orte_message_event_t*)data;
opal_buffer_t *buffer = mev->buffer; opal_buffer_t *buffer = mev->buffer;
orte_process_name_t peer; orte_process_name_t peer;
char *rml_uri = NULL; char *rml_uri = NULL, *ptr;
int rc, idx; int rc, idx;
struct timeval recvtime; struct timeval recvtime;
long secs, usecs; long secs, usecs;
@ -344,7 +344,6 @@ static void process_orted_launch_report(int fd, short event, void *data)
int64_t startsec, startusec; int64_t startsec, startusec;
orte_proc_t *daemon=NULL; orte_proc_t *daemon=NULL;
char *nodename; char *nodename;
size_t len, len2;
orte_node_t *node; orte_node_t *node;
/* see if we need to timestamp this receipt */ /* see if we need to timestamp this receipt */
@ -467,30 +466,19 @@ static void process_orted_launch_report(int fd, short event, void *data)
/* look this node up, if necessary */ /* look this node up, if necessary */
if (!orte_plm_globals.daemon_nodes_assigned_at_launch) { if (!orte_plm_globals.daemon_nodes_assigned_at_launch) {
if (NULL == nodename) { if (!orte_have_fqdn_allocation) {
/* it is permissible to transmit a NULL string, but /* remove any domain info */
* that would be a problem here if (NULL != (ptr = strchr(nodename, '.'))) {
*/ *ptr = '\0';
opal_output(0, "%s NULL nodename returned by daemon %s - cannot process", }
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&daemon->name));
rc = ORTE_ERR_FATAL;
orted_failed_launch = true;
goto CLEANUP;
} }
len = strlen(nodename);
for (idx=0; idx < orte_node_pool->size; idx++) { for (idx=0; idx < orte_node_pool->size; idx++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, idx))) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, idx))) {
continue; continue;
} }
if (NULL != node->daemon) { if (NULL != node->daemon) {
/* already known */ /* already known */
if (len <= strlen(node->name)) { if (0 == strcmp(nodename, node->name)) {
len2 = len;
} else {
len2 = strlen(node->name);
}
if (0 == strncmp(nodename, node->name, len2)) {
/* this shouldn't happen, but protect against it just in case */ /* this shouldn't happen, but protect against it just in case */
opal_output(0, "%s Node %s already has daemon %s assigned to it - assigning daemon %s", opal_output(0, "%s Node %s already has daemon %s assigned to it - assigning daemon %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -513,20 +501,7 @@ static void process_orted_launch_report(int fd, short event, void *data)
} }
continue; continue;
} }
if (NULL == node->name) { if (0 == strcmp(nodename, node->name)) {
/* this shouldn't happen */
opal_output(0, "%s NULL nodename detected during daemon callback - cannot process",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
rc = ORTE_ERR_FATAL;
orted_failed_launch = true;
goto CLEANUP;
}
if (len <= strlen(node->name)) {
len2 = len;
} else {
len2 = strlen(node->name);
}
if (0 == strncmp(nodename, node->name, len2)) {
/* associate this daemon with the node */ /* associate this daemon with the node */
node->daemon = daemon; node->daemon = daemon;
OBJ_RETAIN(daemon); OBJ_RETAIN(daemon);

Просмотреть файл

@ -65,6 +65,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
orte_std_cntr_t num_nodes; orte_std_cntr_t num_nodes;
int rc, i; int rc, i;
orte_node_t *node, *hnp_node; orte_node_t *node, *hnp_node;
char *ptr;
/* get the number of nodes */ /* get the number of nodes */
num_nodes = (orte_std_cntr_t)opal_list_get_size(nodes); num_nodes = (orte_std_cntr_t)opal_list_get_size(nodes);
@ -154,6 +155,20 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
} }
/* update the total slots in the job */ /* update the total slots in the job */
jdata->total_slots_alloc += node->slots; jdata->total_slots_alloc += node->slots;
/* check if we have fqdn names in the allocation */
if (NULL != strchr(node->name, '.')) {
orte_have_fqdn_allocation = true;
}
}
}
/* if we didn't find any fqdn names in the allocation, then
* ensure we don't have any domain info in the node record
* for the hnp
*/
if (!orte_have_fqdn_allocation) {
if (NULL != (ptr = strchr(hnp_node->name, '.'))) {
*ptr = '\0';
} }
} }

Просмотреть файл

@ -67,6 +67,7 @@ char *orte_oob_static_ports = NULL;
bool orte_standalone_operation = false; bool orte_standalone_operation = false;
bool orte_keep_fqdn_hostnames = false; bool orte_keep_fqdn_hostnames = false;
bool orte_have_fqdn_allocation = false;
bool orte_show_resolved_nodenames; bool orte_show_resolved_nodenames;
int orted_debug_failure; int orted_debug_failure;
int orted_debug_failure_delay; int orted_debug_failure_delay;

Просмотреть файл

@ -535,6 +535,7 @@ ORTE_DECLSPEC extern char *orte_oob_static_ports;
ORTE_DECLSPEC extern bool orte_standalone_operation; ORTE_DECLSPEC extern bool orte_standalone_operation;
ORTE_DECLSPEC extern bool orte_keep_fqdn_hostnames; ORTE_DECLSPEC extern bool orte_keep_fqdn_hostnames;
ORTE_DECLSPEC extern bool orte_have_fqdn_allocation;
ORTE_DECLSPEC extern bool orte_show_resolved_nodenames; ORTE_DECLSPEC extern bool orte_show_resolved_nodenames;
ORTE_DECLSPEC extern int orted_debug_failure; ORTE_DECLSPEC extern int orted_debug_failure;
ORTE_DECLSPEC extern int orted_debug_failure_delay; ORTE_DECLSPEC extern int orted_debug_failure_delay;