Back out some prior commits. These commits fixed bproc so it would run, but broke several other things (singleton comm_spawn and hostfile operations have been identified so far). Since bproc is the culprit here, let's leave bproc broken for now - I'll work on a fix for that environment that doesn't impact everything else.
This commit was SVN r12648.
Этот коммит содержится в:
родитель
20d5c35f43
Коммит
6fca1431f3
@ -27,7 +27,6 @@
|
|||||||
#include "orte/dss/dss.h"
|
#include "orte/dss/dss.h"
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/mca/rmgr/rmgr.h"
|
#include "orte/mca/rmgr/rmgr.h"
|
||||||
#include "orte/mca/gpr/gpr.h"
|
|
||||||
|
|
||||||
#include "orte/mca/ras/base/proxy/ras_base_proxy.h"
|
#include "orte/mca/ras/base/proxy/ras_base_proxy.h"
|
||||||
#include "orte/mca/ras/base/ras_private.h"
|
#include "orte/mca/ras/base/ras_private.h"
|
||||||
@ -129,11 +128,23 @@ int orte_ras_base_allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
|||||||
* want to allocate new nodes. Otherwise allocate all the existing nodes to
|
* want to allocate new nodes. Otherwise allocate all the existing nodes to
|
||||||
* our job */
|
* our job */
|
||||||
OBJ_CONSTRUCT(&nodes, opal_list_t);
|
OBJ_CONSTRUCT(&nodes, opal_list_t);
|
||||||
|
/* See if there are any nodes already on the registry. Most of the time
|
||||||
|
* these would have been put there by the RDS reading the hostfile. */
|
||||||
|
if (ORTE_SUCCESS != (ret = orte_ras_base_node_query(&nodes))) {
|
||||||
|
OBJ_DESTRUCT(&nodes);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
/* If there are any nodes at all, allocate them all to this job */
|
||||||
|
if (!opal_list_is_empty(&nodes)) {
|
||||||
|
opal_output(orte_ras_base.ras_output,
|
||||||
|
"orte:ras:base:allocate: reallocating nodes that are already on registry");
|
||||||
|
ret = orte_ras_base_allocate_nodes(jobid, &nodes);
|
||||||
|
OBJ_DESTRUCT(&nodes);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
/* Run the RAS components from highest to lowest priority (they are already sorted).
|
/* there were no nodes already on the registry, so get them from the
|
||||||
* Stop when the node segment is no longer empty. This ensures we go through the
|
* RAS components */
|
||||||
* allocator components at least once
|
|
||||||
*/
|
|
||||||
|
|
||||||
/* If no components are available, then return an error */
|
/* If no components are available, then return an error */
|
||||||
if (opal_list_is_empty(&orte_ras_base.ras_available)) {
|
if (opal_list_is_empty(&orte_ras_base.ras_available)) {
|
||||||
|
@ -434,6 +434,7 @@ int orte_init_stage1(bool infrastructure)
|
|||||||
orte_rds_cell_desc_t *rds_item;
|
orte_rds_cell_desc_t *rds_item;
|
||||||
orte_rds_cell_attr_t *new_attr;
|
orte_rds_cell_attr_t *new_attr;
|
||||||
orte_ras_node_t *ras_item;
|
orte_ras_node_t *ras_item;
|
||||||
|
opal_list_t attrs;
|
||||||
|
|
||||||
OBJ_CONSTRUCT(&single_host, opal_list_t);
|
OBJ_CONSTRUCT(&single_host, opal_list_t);
|
||||||
OBJ_CONSTRUCT(&rds_single_host, opal_list_t);
|
OBJ_CONSTRUCT(&rds_single_host, opal_list_t);
|
||||||
@ -523,11 +524,21 @@ int orte_init_stage1(bool infrastructure)
|
|||||||
goto error;;
|
goto error;;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ORTE_SUCCESS != (ret = orte_ras_base_allocate_nodes(my_jobid, &single_host))) {
|
/* JMS: Same as above -- fix this after 1.0: force a
|
||||||
|
selection so that orte_ras has initialized pointers in
|
||||||
|
case anywhere else tries to use it. This may end up
|
||||||
|
putting a bunch more nodes on the node segment - e.g.,
|
||||||
|
if you're in a SLURM allocation and you "./a.out",
|
||||||
|
you'll end up with the localhost *and* all the other
|
||||||
|
nodes in your allocation on the node segment -- which
|
||||||
|
is probably fine */
|
||||||
|
OBJ_CONSTRUCT(&attrs, opal_list_t);
|
||||||
|
if (ORTE_SUCCESS != (ret = orte_ras.allocate_job(my_jobid, &attrs))) {
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
error = "allocate for a singleton";
|
error = "allocate for a singleton";
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
OBJ_DESTRUCT(&attrs);
|
||||||
|
|
||||||
OBJ_DESTRUCT(&single_host);
|
OBJ_DESTRUCT(&single_host);
|
||||||
OBJ_DESTRUCT(&rds_single_host);
|
OBJ_DESTRUCT(&rds_single_host);
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user