1
1

During a comm_spawn, we want to respect the original -host directives and leave the daemon virtual machine unchanged unless the comm_spawn command explicitly directs us to expand it. Without this restriction, daemons are automatically launched on every node in the allocation.

cmr=v1.8.2:reviewer=rhc:subject=respect vm boundaries during comm_spawn

This commit was SVN r31578.
Этот коммит содержится в:
Ralph Castain 2014-04-30 22:26:18 +00:00
родитель d40112a012
Коммит 238ecea311
5 изменённых файлов: 83 добавлений и 3 удалений

Просмотреть файл

@ -1342,6 +1342,52 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
map = daemons->map;
/* if this is a dynamic spawn, then we don't make any changes to
* the virtual machine unless specifically requested to do so
*/
if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
OBJ_CONSTRUCT(&nodes, opal_list_t);
for (i=1; i < orte_node_pool->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
continue;
}
/* only add in nodes marked as "added" */
if (ORTE_NODE_STATE_ADDED != node->state) {
OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
"%s plm_base:setup_vm NODE %s WAS NOT ADDED",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name));
continue;
}
OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
"%s plm_base:setup_vm ADDING NODE %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name));
/* retain a copy for our use in case the item gets
* destructed along the way
*/
OBJ_RETAIN(node);
opal_list_append(&nodes, &node->super);
/* reset the state so it can be used for mapping */
node->state = ORTE_NODE_STATE_UP;
}
map->num_new_daemons = 0;
/* if we didn't get anything, then there is nothing else to
* do as no other daemons are to be launched
*/
if (0 == opal_list_get_size(&nodes)) {
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
"%s plm:base:setup_vm no new daemons required",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
OBJ_DESTRUCT(&nodes);
/* mark that the daemons have reported so we can proceed */
daemons->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
daemons->updated = false;
return ORTE_SUCCESS;
}
/* if we got some new nodes to launch, we need to handle it */
goto process;
}
/* if we are not working with a virtual machine, then we
* look across all jobs and ensure that the "VM" contains

Просмотреть файл

@ -206,7 +206,7 @@ typedef int8_t orte_node_state_t;
#define ORTE_NODE_STATE_REBOOT 4 // Node is rebooting
#define ORTE_NODE_STATE_DO_NOT_USE 5 // Node is up, but not available for use for the next mapping
#define ORTE_NODE_STATE_NOT_INCLUDED 6 // Node is up, but not part of the node pool for jobs
#define ORTE_NODE_STATE_ADDED 7 // Node was dynamically added to pool
/* Define a boundary so that external developers
* have a starting point for defining their own

Просмотреть файл

@ -44,6 +44,7 @@
#include "orte/util/dash_host/dash_host.h"
#include "orte/util/proc_info.h"
#include "orte/util/comm/comm.h"
#include "orte/util/error_strings.h"
#include "orte/mca/state/state.h"
#include "orte/runtime/orte_quit.h"
@ -76,9 +77,10 @@ void orte_ras_base_display_alloc(void)
(NULL == alloc->name) ? "UNKNOWN" : alloc->name,
(int)alloc->slots, (int)alloc->slots_max, (int)alloc->slots_inuse);
} else {
asprintf(&tmp2, "\t%s: slots=%d max_slots=%d slots_inuse=%d\n",
asprintf(&tmp2, "\t%s: slots=%d max_slots=%d slots_inuse=%d state=%s\n",
(NULL == alloc->name) ? "UNKNOWN" : alloc->name,
(int)alloc->slots, (int)alloc->slots_max, (int)alloc->slots_inuse);
(int)alloc->slots, (int)alloc->slots_max, (int)alloc->slots_inuse,
orte_node_state_to_str(alloc->state));
}
if (NULL == tmp) {
tmp = tmp2;
@ -444,6 +446,7 @@ int orte_ras_base_add_hosts(orte_job_t *jdata)
opal_list_t nodes;
int i;
orte_app_context_t *app;
orte_node_t *node;
/* construct a list to hold the results */
OBJ_CONSTRUCT(&nodes, opal_list_t);
@ -518,6 +521,10 @@ int orte_ras_base_add_hosts(orte_job_t *jdata)
/* if something was found, we add that to our global pool */
if (!opal_list_is_empty(&nodes)) {
/* mark all the nodes as "added" */
OPAL_LIST_FOREACH(node, &nodes, orte_node_t) {
node->state = ORTE_NODE_STATE_ADDED;
}
/* store the results in the global resource pool - this removes the
* list items
*/

Просмотреть файл

@ -395,3 +395,27 @@ const char *orte_proc_state_to_str(orte_proc_state_t state)
return "UNKNOWN STATE!";
}
}
/*
 * Translate a node state code into a short printable label.
 *
 * state: one of the ORTE_NODE_STATE_* values.
 *
 * Returns a pointer to a constant string literal naming the state; any
 * value that does not match a known state yields "UNKNOWN STATE!".
 * The returned string is a literal and must not be modified or freed.
 */
const char *orte_node_state_to_str(orte_node_state_t state)
{
    /* start with the fallback label and override it on a match */
    const char *label = "UNKNOWN STATE!";

    switch (state) {
    case ORTE_NODE_STATE_UNDEF:
        label = "UNDEF";
        break;
    case ORTE_NODE_STATE_UNKNOWN:
        label = "UNKNOWN";
        break;
    case ORTE_NODE_STATE_DOWN:
        label = "DOWN";
        break;
    case ORTE_NODE_STATE_UP:
        label = "UP";
        break;
    case ORTE_NODE_STATE_REBOOT:
        label = "REBOOT";
        break;
    case ORTE_NODE_STATE_DO_NOT_USE:
        label = "DO_NOT_USE";
        break;
    case ORTE_NODE_STATE_NOT_INCLUDED:
        label = "NOT_INCLUDED";
        break;
    case ORTE_NODE_STATE_ADDED:
        label = "ADDED";
        break;
    default:
        /* unrecognized value: keep the fallback label */
        break;
    }

    return label;
}

Просмотреть файл

@ -12,6 +12,7 @@
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights reserved
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -42,5 +43,7 @@ ORTE_DECLSPEC const char *orte_app_ctx_state_to_str(orte_app_state_t state);
ORTE_DECLSPEC const char *orte_proc_state_to_str(orte_proc_state_t state);
ORTE_DECLSPEC const char *orte_node_state_to_str(orte_node_state_t state);
END_C_DECLS
#endif