Round and round the mulberry bush we go...
Fix comm_spawn by singletons. orte_init does some voodoo to let the system know about localhost when we are a singleton. This includes allocating it so that any comm_spawn'd children can use their parent's "allocation". Unfortunately, the fix that bproc needs (due to that smr filling up the node segment!) causes the singleton startup to fail. The fix is to just have the singleton startup force an allocation of its localhost. The only issue here is: what happens if we are in a persistent universe? The singleton will now overwrite any prior info on slots used on localhost by other jobs (it won't affect anything else). The answer, of course, is to do something more intelligent - look up localhost on the registry and just update its info instead of overwriting it. Something for another day (or month... or year). This commit was SVN r12644.
Этот коммит содержится в:
родитель
761d4fab25
Коммит
9f3dcd147a
@ -429,7 +429,6 @@ int orte_init_stage1(bool infrastructure)
|
|||||||
orte_rds_cell_desc_t *rds_item;
|
orte_rds_cell_desc_t *rds_item;
|
||||||
orte_rds_cell_attr_t *new_attr;
|
orte_rds_cell_attr_t *new_attr;
|
||||||
orte_ras_node_t *ras_item;
|
orte_ras_node_t *ras_item;
|
||||||
opal_list_t attrs;
|
|
||||||
|
|
||||||
OBJ_CONSTRUCT(&single_host, opal_list_t);
|
OBJ_CONSTRUCT(&single_host, opal_list_t);
|
||||||
OBJ_CONSTRUCT(&rds_single_host, opal_list_t);
|
OBJ_CONSTRUCT(&rds_single_host, opal_list_t);
|
||||||
@ -519,21 +518,11 @@ int orte_init_stage1(bool infrastructure)
|
|||||||
goto error;;
|
goto error;;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* JMS: Same as above -- fix this after 1.0: force a
|
if (ORTE_SUCCESS != (ret = orte_ras_base_allocate_nodes(my_jobid, &single_host))) {
|
||||||
selection so that orte_ras has initialized pointers in
|
|
||||||
case anywhere else tries to use it. This may end up
|
|
||||||
putting a bunch more nodes on the node segment - e.g.,
|
|
||||||
if you're in a SLURM allocation and you "./a.out",
|
|
||||||
you'll end up with the localhost *and* all the other
|
|
||||||
nodes in your allocation on the node segment -- which
|
|
||||||
is probably fine */
|
|
||||||
OBJ_CONSTRUCT(&attrs, opal_list_t);
|
|
||||||
if (ORTE_SUCCESS != (ret = orte_ras.allocate_job(my_jobid, &attrs))) {
|
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
error = "allocate for a singleton";
|
error = "allocate for a singleton";
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
OBJ_DESTRUCT(&attrs);
|
|
||||||
|
|
||||||
OBJ_DESTRUCT(&single_host);
|
OBJ_DESTRUCT(&single_host);
|
||||||
OBJ_DESTRUCT(&rds_single_host);
|
OBJ_DESTRUCT(&rds_single_host);
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user