When handling a comm_spawn, we want to respect the original -host directives and not expand the daemon virtual machine unless the comm_spawn command explicitly directs us to do so. Without this check, daemons would automatically be launched on every node in the allocation.
cmr=v1.8.2:reviewer=rhc:subject=respect vm boundaries during comm_spawn
This commit was SVN r31578.
Этот коммит содержится в:
родитель
d40112a012
Коммит
238ecea311
@ -1342,6 +1342,52 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
map = daemons->map;
|
||||
|
||||
/* if this is a dynamic spawn, then we don't make any changes to
|
||||
* the virtual machine unless specifically requested to do so
|
||||
*/
|
||||
if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
|
||||
OBJ_CONSTRUCT(&nodes, opal_list_t);
|
||||
for (i=1; i < orte_node_pool->size; i++) {
|
||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
|
||||
continue;
|
||||
}
|
||||
/* only add in nodes marked as "added" */
|
||||
if (ORTE_NODE_STATE_ADDED != node->state) {
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
|
||||
"%s plm_base:setup_vm NODE %s WAS NOT ADDED",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name));
|
||||
continue;
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
|
||||
"%s plm_base:setup_vm ADDING NODE %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name));
|
||||
/* retain a copy for our use in case the item gets
|
||||
* destructed along the way
|
||||
*/
|
||||
OBJ_RETAIN(node);
|
||||
opal_list_append(&nodes, &node->super);
|
||||
/* reset the state so it can be used for mapping */
|
||||
node->state = ORTE_NODE_STATE_UP;
|
||||
}
|
||||
map->num_new_daemons = 0;
|
||||
/* if we didn't get anything, then there is nothing else to
|
||||
* do as no other daemons are to be launched
|
||||
*/
|
||||
if (0 == opal_list_get_size(&nodes)) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
||||
"%s plm:base:setup_vm no new daemons required",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
OBJ_DESTRUCT(&nodes);
|
||||
/* mark that the daemons have reported so we can proceed */
|
||||
daemons->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
|
||||
daemons->updated = false;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
/* if we got some new nodes to launch, we need to handle it */
|
||||
goto process;
|
||||
}
|
||||
|
||||
/* if we are not working with a virtual machine, then we
|
||||
* look across all jobs and ensure that the "VM" contains
|
||||
|
@ -206,7 +206,7 @@ typedef int8_t orte_node_state_t;
|
||||
#define ORTE_NODE_STATE_REBOOT 4 // Node is rebooting
|
||||
#define ORTE_NODE_STATE_DO_NOT_USE 5 // Node is up, but not available for use for the next mapping
|
||||
#define ORTE_NODE_STATE_NOT_INCLUDED 6 // Node is up, but not part of the node pool for jobs
|
||||
|
||||
#define ORTE_NODE_STATE_ADDED 7 // Node was dynamically added to pool
|
||||
|
||||
/* Define a boundary so that external developers
|
||||
* have a starting point for defining their own
|
||||
|
@ -44,6 +44,7 @@
|
||||
#include "orte/util/dash_host/dash_host.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/comm/comm.h"
|
||||
#include "orte/util/error_strings.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
|
||||
@ -76,9 +77,10 @@ void orte_ras_base_display_alloc(void)
|
||||
(NULL == alloc->name) ? "UNKNOWN" : alloc->name,
|
||||
(int)alloc->slots, (int)alloc->slots_max, (int)alloc->slots_inuse);
|
||||
} else {
|
||||
asprintf(&tmp2, "\t%s: slots=%d max_slots=%d slots_inuse=%d\n",
|
||||
asprintf(&tmp2, "\t%s: slots=%d max_slots=%d slots_inuse=%d state=%s\n",
|
||||
(NULL == alloc->name) ? "UNKNOWN" : alloc->name,
|
||||
(int)alloc->slots, (int)alloc->slots_max, (int)alloc->slots_inuse);
|
||||
(int)alloc->slots, (int)alloc->slots_max, (int)alloc->slots_inuse,
|
||||
orte_node_state_to_str(alloc->state));
|
||||
}
|
||||
if (NULL == tmp) {
|
||||
tmp = tmp2;
|
||||
@ -444,6 +446,7 @@ int orte_ras_base_add_hosts(orte_job_t *jdata)
|
||||
opal_list_t nodes;
|
||||
int i;
|
||||
orte_app_context_t *app;
|
||||
orte_node_t *node;
|
||||
|
||||
/* construct a list to hold the results */
|
||||
OBJ_CONSTRUCT(&nodes, opal_list_t);
|
||||
@ -518,6 +521,10 @@ int orte_ras_base_add_hosts(orte_job_t *jdata)
|
||||
|
||||
/* if something was found, we add that to our global pool */
|
||||
if (!opal_list_is_empty(&nodes)) {
|
||||
/* mark all the nodes as "added" */
|
||||
OPAL_LIST_FOREACH(node, &nodes, orte_node_t) {
|
||||
node->state = ORTE_NODE_STATE_ADDED;
|
||||
}
|
||||
/* store the results in the global resource pool - this removes the
|
||||
* list items
|
||||
*/
|
||||
|
@ -395,3 +395,27 @@ const char *orte_proc_state_to_str(orte_proc_state_t state)
|
||||
return "UNKNOWN STATE!";
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Translate a node state code into a short, human-readable label.
 *
 * @param state  the orte_node_state_t value to describe
 *
 * @return a pointer to a constant string literal naming the state.
 *         Any value that does not match one of the ORTE_NODE_STATE_*
 *         constants handled below yields "UNKNOWN STATE!". The result
 *         is a string literal, so the caller must neither modify nor
 *         free it.
 */
const char *orte_node_state_to_str(orte_node_state_t state)
{
    /* start from the fallback label; a recognized state overrides it */
    const char *label = "UNKNOWN STATE!";

    switch (state) {
    case ORTE_NODE_STATE_UNDEF:
        label = "UNDEF";
        break;
    case ORTE_NODE_STATE_UNKNOWN:
        label = "UNKNOWN";
        break;
    case ORTE_NODE_STATE_DOWN:
        label = "DOWN";
        break;
    case ORTE_NODE_STATE_UP:
        label = "UP";
        break;
    case ORTE_NODE_STATE_REBOOT:
        label = "REBOOT";
        break;
    case ORTE_NODE_STATE_DO_NOT_USE:
        label = "DO_NOT_USE";
        break;
    case ORTE_NODE_STATE_NOT_INCLUDED:
        label = "NOT_INCLUDED";
        break;
    case ORTE_NODE_STATE_ADDED:
        label = "ADDED";
        break;
    default:
        /* unrecognized value: keep the fallback label */
        break;
    }

    return label;
}
|
||||
|
@ -12,6 +12,7 @@
|
||||
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights reserved
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -42,5 +43,7 @@ ORTE_DECLSPEC const char *orte_app_ctx_state_to_str(orte_app_state_t state);
|
||||
|
||||
ORTE_DECLSPEC const char *orte_proc_state_to_str(orte_proc_state_t state);
|
||||
|
||||
ORTE_DECLSPEC const char *orte_node_state_to_str(orte_node_state_t state);
|
||||
|
||||
END_C_DECLS
|
||||
#endif
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user