1
1

Add debug output and handle the use case where someone (a) uses a hostfile while in a managed allocation to sub-allocate runs, and (b) includes the HNP's node in one of those hostfiles.

cmr:v1.7

This commit was SVN r28203.
Этот коммит содержится в:
Ralph Castain 2013-03-22 00:53:33 +00:00
родитель 6c3f986d79
Коммит 2f43989d22
2 изменённых файлов: 65 добавлений и 5 удалений

Просмотреть файл

@ -1246,14 +1246,20 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
}
/* ignore nodes that are marked as do-not-use for this mapping */
if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
"NODE %s IS MARKED NO_USE", node->name));
/* reset the state so it can be used another time */
node->state = ORTE_NODE_STATE_UP;
continue;
}
if (ORTE_NODE_STATE_DOWN == node->state) {
OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
"NODE %s IS MARKED DOWN", node->name));
continue;
}
if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) {
OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
"NODE %s IS MARKED NO_INCLUDE", node->name));
/* not to be used */
continue;
}
@ -1388,14 +1394,20 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
/* have a match - now see if we want this node */
/* ignore nodes that are marked as do-not-use for this mapping */
if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
"NODE %s IS MARKED NO_USE", node->name));
/* reset the state so it can be used another time */
node->state = ORTE_NODE_STATE_UP;
break;
}
if (ORTE_NODE_STATE_DOWN == node->state) {
OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
"NODE %s IS MARKED DOWN", node->name));
break;
}
if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) {
OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
"NODE %s IS MARKED NO_INCLUDE", node->name));
break;
}
/* if this node is us, ignore it */
@ -1430,21 +1442,25 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
goto process;
}
/* construct a list of available nodes - don't need ours as
* we already exist
*/
/* construct a list of available nodes */
for (i=1; i < orte_node_pool->size; i++) {
if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
/* ignore nodes that are marked as do-not-use for this mapping */
if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
"NODE %s IS MARKED NO_USE", node->name));
/* reset the state so it can be used another time */
node->state = ORTE_NODE_STATE_UP;
continue;
}
if (ORTE_NODE_STATE_DOWN == node->state) {
OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
"NODE %s IS MARKED DOWN", node->name));
continue;
}
if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) {
OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
"NODE %s IS MARKED NO_INCLUDE", node->name));
/* not to be used */
continue;
}
@ -1461,13 +1477,14 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
}
/* if we didn't get anything, then we are the only node in the
* allocation - so there is nothing else to do as no other
* system - so there is nothing else to do as no other
* daemons are to be launched
*/
if (0 == opal_list_get_size(&nodes)) {
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:setup_vm only HNP in allocation",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* cleanup */
OBJ_DESTRUCT(&nodes);
/* mark that the daemons have reported so we can proceed */
daemons->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
@ -1475,7 +1492,17 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
return ORTE_SUCCESS;
}
/* filter across the union of all app_context specs */
/* filter across the union of all app_context specs - if the HNP
* was allocated, then we have to include
* ourselves in case someone has specified a -host or hostfile
* that includes the head node. We will remove ourselves later
* as we clearly already exist
*/
if (orte_hnp_is_allocated) {
node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
OBJ_RETAIN(node);
opal_list_append(&nodes, &node->super);
}
for (i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
@ -1507,6 +1534,18 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
}
}
/* ensure we are not on the list */
for (item = opal_list_get_first(&nodes);
item != opal_list_get_end(&nodes);
item = opal_list_get_next(item)) {
node = (orte_node_t*)item;
if (0 == node->index) {
opal_list_remove_item(&nodes, item);
OBJ_RELEASE(item);
break;
}
}
/* if we didn't get anything, then we are the only node in the
* allocation - so there is nothing else to do as no other
* daemons are to be launched

Просмотреть файл

@ -255,18 +255,27 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
continue;
}
if (0 != strcmp(node->name, nptr->name)) {
OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
"NODE %s DOESNT MATCH NODE %s",
node->name, nptr->name));
continue;
}
/* ignore nodes that are marked as do-not-use for this mapping */
if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
"NODE %s IS MARKED NO_USE", node->name));
/* reset the state so it can be used another time */
node->state = ORTE_NODE_STATE_UP;
continue;
}
if (ORTE_NODE_STATE_DOWN == node->state) {
OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
"NODE %s IS DOWN", node->name));
continue;
}
if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) {
OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
"NODE %s IS MARKED NO_INCLUDE", node->name));
/* not to be used */
continue;
}
@ -274,6 +283,8 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
* unless we are mapping prior to launching the vm
*/
if (NULL == node->daemon && !novm) {
OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
"NODE %s HAS NO DAEMON", node->name));
continue;
}
/* retain a copy for our use in case the item gets
@ -327,6 +338,8 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(policy) & ORTE_MAPPING_NO_USE_LOCAL)) {
if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0))) {
if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
"HNP IS MARKED NO_USE"));
/* clear this for future use, but don't include it */
node->state = ORTE_NODE_STATE_UP;
} else if (ORTE_NODE_STATE_NOT_INCLUDED != node->state) {
@ -359,14 +372,20 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
/* ignore nodes that are marked as do-not-use for this mapping */
if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
"NODE %s IS MARKED NO_USE", node->name));
/* reset the state so it can be used another time */
node->state = ORTE_NODE_STATE_UP;
continue;
}
if (ORTE_NODE_STATE_DOWN == node->state) {
OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
"NODE %s IS MARKED DOWN", node->name));
continue;
}
if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) {
OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
"NODE %s IS MARKED NO_INCLUDE", node->name));
/* not to be used */
continue;
}
@ -374,6 +393,8 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
* unless we are mapping prior to launching the vm
*/
if (NULL == node->daemon && !novm) {
OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
"NODE %s HAS NO DAEMON", node->name));
continue;
}
/* retain a copy for our use in case the item gets