Back out some prior commits. These commits fixed bproc so it would run, but broke several other things (singleton comm_spawn and hostfile operations have been identified so far). Since bproc is the culprit here, let's leave bproc broken for now; I'll work on a fix for that environment that doesn't impact everything else.
This commit was SVN r12648.
This commit is contained in:
parent 20d5c35f43
commit 6fca1431f3
@@ -27,7 +27,6 @@
 #include "orte/dss/dss.h"
 #include "orte/mca/errmgr/errmgr.h"
 #include "orte/mca/rmgr/rmgr.h"
 #include "orte/mca/gpr/gpr.h"

 #include "orte/mca/ras/base/proxy/ras_base_proxy.h"
 #include "orte/mca/ras/base/ras_private.h"
@@ -129,11 +128,23 @@ int orte_ras_base_allocate(orte_jobid_t jobid, opal_list_t *attributes)
     * want to allocate new nodes. Otherwise allocate all the existing nodes to
     * our job */
    OBJ_CONSTRUCT(&nodes, opal_list_t);

    /* See if there are any nodes already on the registry. Most of the time
     * these would have been put there by the RDS reading the hostfile. */
    if (ORTE_SUCCESS != (ret = orte_ras_base_node_query(&nodes))) {
        OBJ_DESTRUCT(&nodes);
        return ret;
    }

    /* If there are any nodes at all, allocate them all to this job */
    if (!opal_list_is_empty(&nodes)) {
        opal_output(orte_ras_base.ras_output,
                    "orte:ras:base:allocate: reallocating nodes that are already on registry");
        ret = orte_ras_base_allocate_nodes(jobid, &nodes);
        OBJ_DESTRUCT(&nodes);
        return ret;
    }

    /* Run the RAS components from highest to lowest priority (they are already sorted).
     * Stop when the node segment is no longer empty. This ensures we go through the
     * allocator components at least once.
     */
    /* there were no nodes already on the registry, so get them from the
     * RAS components */

    /* If no components are available, then return an error */
    if (opal_list_is_empty(&orte_ras_base.ras_available)) {
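For context, the control flow restored by this hunk in orte_ras_base_allocate() is: reuse any nodes already on the registry (typically placed there by the RDS reading a hostfile), and fall through to the RAS components only when the node segment is empty. Below is a minimal, self-contained C sketch of that fallback pattern; the node_t list and both helper functions are hypothetical stand-ins, not ORTE's real opal_list_t or registry API.

/*
 * Hypothetical, simplified sketch of the restored allocation fallback.
 * ORTE's real code uses opal_list_t, orte_ras_base_node_query(), and
 * the sorted list of RAS components; everything here is a stand-in.
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct node {
    const char *name;
    struct node *next;
} node_t;

/* Stand-in for orte_ras_base_node_query(): reads the registry's node
 * segment, which the RDS typically populates from a hostfile. */
static int query_registry_nodes(node_t **nodes)
{
    node_t *n = malloc(sizeof(*n));
    if (NULL == n) {
        return -1;
    }
    n->name = "node0";   /* pretend the RDS stored one hostfile entry */
    n->next = NULL;
    *nodes = n;
    return 0;
}

/* Stand-in for running the RAS components in priority order. */
static int run_ras_components(int jobid)
{
    printf("job %d: registry empty, invoking RAS components\n", jobid);
    return 0;
}

static int allocate(int jobid)
{
    node_t *nodes = NULL;

    if (0 != query_registry_nodes(&nodes)) {
        return -1;
    }
    if (NULL != nodes) {
        /* Nodes already on the registry (e.g., from a hostfile):
         * reallocate them to this job instead of discovering new ones. */
        printf("job %d: reallocating registry nodes (first: %s)\n",
               jobid, nodes->name);
        while (NULL != nodes) {
            node_t *next = nodes->next;
            free(nodes);
            nodes = next;
        }
        return 0;
    }
    return run_ras_components(jobid);
}

int main(void)
{
    return allocate(1);
}

Compiled and run as-is, the sketch takes the "reallocating" branch, which corresponds to the hostfile path that the backed-out commits had broken.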
@@ -434,6 +434,7 @@ int orte_init_stage1(bool infrastructure)
    orte_rds_cell_desc_t *rds_item;
    orte_rds_cell_attr_t *new_attr;
    orte_ras_node_t *ras_item;
    opal_list_t attrs;

    OBJ_CONSTRUCT(&single_host, opal_list_t);
    OBJ_CONSTRUCT(&rds_single_host, opal_list_t);
@@ -523,11 +524,21 @@ int orte_init_stage1(bool infrastructure)
        goto error;
    }

    if (ORTE_SUCCESS != (ret = orte_ras_base_allocate_nodes(my_jobid, &single_host))) {
        /* JMS: Same as above -- fix this after 1.0: force a
           selection so that orte_ras has initialized pointers in
           case anywhere else tries to use it. This may end up
           putting a bunch more nodes on the node segment - e.g.,
           if you're in a SLURM allocation and you "./a.out",
           you'll end up with the localhost *and* all the other
           nodes in your allocation on the node segment -- which
           is probably fine */
        OBJ_CONSTRUCT(&attrs, opal_list_t);
        if (ORTE_SUCCESS != (ret = orte_ras.allocate_job(my_jobid, &attrs))) {
            ORTE_ERROR_LOG(ret);
            error = "allocate for a singleton";
            goto error;
        }
        OBJ_DESTRUCT(&attrs);

        OBJ_DESTRUCT(&single_host);
        OBJ_DESTRUCT(&rds_single_host);
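The second hunk restores the singleton fallback: try to allocate just the local host, and if that fails, force a full allocation so that orte_ras ends up with initialized pointers. A hedged C sketch of that two-step fallback follows; both function names are hypothetical stand-ins for the ORTE calls visible in the diff.

/*
 * Hypothetical sketch of the restored singleton path. The helpers are
 * stand-ins: allocate_single_host() for orte_ras_base_allocate_nodes(),
 * allocate_full_job() for orte_ras.allocate_job().
 */
#include <stdio.h>

/* Try to allocate only the local host; return nonzero here so the
 * fallback below is exercised. */
static int allocate_single_host(int jobid)
{
    (void)jobid;
    return -1;
}

/* Force a full allocation; under SLURM this would place all allocated
 * nodes, not just the localhost, on the node segment. */
static int allocate_full_job(int jobid)
{
    printf("job %d: full allocation (may include every node)\n", jobid);
    return 0;
}

static int singleton_init(int jobid)
{
    if (0 != allocate_single_host(jobid)) {
        if (0 != allocate_full_job(jobid)) {
            fprintf(stderr, "allocate for a singleton failed\n");
            return -1;
        }
    }
    return 0;
}

int main(void)
{
    return singleton_init(42);
}

As the JMS comment in the diff notes, the full-allocation fallback may put every node of a batch allocation on the node segment rather than just the localhost, which the original author judged acceptable.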