Fix the allocator to make bproc happy.
We were burned again by the fact that the bproc state monitor creates entries on the node segment for *all* the nodes in the cluster when it is opened during orte_init. As a result, the bjs allocator was never called, and the system merrily assumed that *all* nodes in the cluster had been allocated to it. To fix this, I removed a test that had been inserted into the allocation procedure to check for a non-empty node segment. That test was an old artifact: the RAS components already know that they must not overwrite any existing node segment entries (at least, bproc does; I will check the others - for now, I just want to save the bproc fix on this machine). This commit was SVN r12640.
Этот коммит содержится в:
родитель
922f335678
Коммит
a30c65ca24
@ -27,6 +27,7 @@
|
||||
#include "orte/dss/dss.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rmgr/rmgr.h"
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
|
||||
#include "orte/mca/ras/base/proxy/ras_base_proxy.h"
|
||||
#include "orte/mca/ras/base/ras_private.h"
|
||||
@ -128,23 +129,11 @@ int orte_ras_base_allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
* want to allocate new nodes. Otherwise allocate all the existing nodes to
|
||||
* our job */
|
||||
OBJ_CONSTRUCT(&nodes, opal_list_t);
|
||||
/* See if there are any nodes already on the registry. Most of the time
|
||||
* these would have been put there by the RDS reading the hostfile. */
|
||||
if (ORTE_SUCCESS != (ret = orte_ras_base_node_query(&nodes))) {
|
||||
OBJ_DESTRUCT(&nodes);
|
||||
return ret;
|
||||
}
|
||||
/* If there are any nodes at all, allocate them all to this job */
|
||||
if (!opal_list_is_empty(&nodes)) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"orte:ras:base:allocate: reallocating nodes that are already on registry");
|
||||
ret = orte_ras_base_allocate_nodes(jobid, &nodes);
|
||||
OBJ_DESTRUCT(&nodes);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* there were no nodes already on the registry, so get them from the
|
||||
* RAS components */
|
||||
/* Run the RAS components from highest to lowest priority (they are already sorted).
|
||||
* Stop when the node segment is no longer empty. This ensures we go through the
|
||||
* allocator components at least once
|
||||
*/
|
||||
|
||||
/* If no components are available, then return an error */
|
||||
if (opal_list_is_empty(&orte_ras_base.ras_available)) {
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user