Extend the daemon reuse functionality to *most* of the other environments.
Note that Bproc won't support this operation, so the --reuse-daemons directive is simply ignored there. I don't understand the POE and XGrid environments well enough to attempt the necessary modifications. Also note that XGrid support is currently broken on the trunk: I don't know that code well enough to make the required changes to the XGrid PLS component, so it won't compile at the moment. I'm hoping Brian has a few minutes to fix it after SC. This commit was SVN r12614.
Parent: 044898f4bf
Commit: c1813e5c5a
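
For orientation before the per-file hunks: every launcher touched below (gridengine, rsh, slurm, tm) gains essentially the same logic. The following is a condensed sketch of that shared pattern, reusing the names visible in the hunks (orte_rmaps.get_job_map, orte_pls_base.reuse_daemons, orte_pls_base_launch_on_existing_daemons); the wrapper function itself and the orte_job_map_t declaration are illustrative assumptions, not part of this commit:

/* hypothetical condensation of the per-launcher pattern below */
static int launch_job_sketch(orte_jobid_t jobid)
{
    orte_job_map_t *map;   /* assumed type returned by orte_rmaps.get_job_map */
    orte_std_cntr_t num_nodes;
    int rc;

    /* get the map of nodes/procs for this job */
    rc = orte_rmaps.get_job_map(&map, jobid);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* if the user requested daemon reuse, first place procs on any
     * existing, re-usable daemons; judging by the num_nodes checks in
     * the hunks, this also removes the handled nodes from the map */
    if (orte_pls_base.reuse_daemons) {
        if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map, jobid))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(map);
            return rc;
        }
    }

    /* any nodes still in the map need a freshly spawned daemon; if none
     * remain, the whole job went onto existing daemons and we are done */
    num_nodes = (orte_std_cntr_t)opal_list_get_size(&map->nodes);
    if (0 == num_nodes) {
        OBJ_RELEASE(map);
        return ORTE_SUCCESS;
    }

    /* ... environment-specific daemon spawn continues here ... */
    OBJ_RELEASE(map);
    return ORTE_SUCCESS;
}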
@@ -149,8 +149,19 @@ static int get_daemons(opal_list_t *daemons, orte_jobid_t job)
     orte_pls_daemon_info_t *dmn, *dmn2;
     bool found_name, found_node, found_cell;
     opal_list_item_t *item;
+    bool check_dups;
     int rc;
 
+    /* check the list to see if there is anything already on it. If there is, then
+     * we will need to check for duplicate entries before we add something. If not,
+     * then this can go a lot faster
+     */
+    if (0 < opal_list_get_size(daemons)) {
+        check_dups = true;
+    } else {
+        check_dups = false;
+    }
+
     /* setup the key */
     if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&jobid_string, job))) {
         ORTE_ERROR_LOG(rc);
@@ -208,15 +219,17 @@ static int get_daemons(opal_list_t *daemons, orte_jobid_t job)
         }
         /* if we found everything, then this is a valid entry */
         if (found_name && found_node && found_cell) {
-            /* see if this daemon is already on the list - if so, then we don't add it */
-            for (item = opal_list_get_first(daemons);
-                 item != opal_list_get_end(daemons);
-                 item = opal_list_get_next(item)) {
-                dmn2 = (orte_pls_daemon_info_t*)item;
-
-                if (ORTE_EQUAL == orte_dss.compare(dmn2->name, name, ORTE_NAME)) {
-                    /* already on list - ignore it */
-                    goto MOVEON;
+            if (check_dups) {
+                /* see if this daemon is already on the list - if so, then we don't add it */
+                for (item = opal_list_get_first(daemons);
+                     item != opal_list_get_end(daemons);
+                     item = opal_list_get_next(item)) {
+                    dmn2 = (orte_pls_daemon_info_t*)item;
+
+                    if (ORTE_EQUAL == orte_dss.compare(dmn2->name, name, ORTE_NAME)) {
+                        /* already on list - ignore it */
+                        goto MOVEON;
+                    }
                 }
             }
             dmn = OBJ_NEW(orte_pls_daemon_info_t);
@@ -316,22 +329,40 @@ int orte_pls_base_remove_daemon(orte_pls_daemon_info_t *info)
 int orte_pls_base_check_avail_daemons(opal_list_t *daemons,
                                       orte_jobid_t job)
 {
-    orte_jobid_t parent;
+    orte_jobid_t root, *descendants;
+    orte_std_cntr_t i, ndesc;
     int rc;
 
-    /* check for daemons belonging to the parent job */
-    if (ORTE_SUCCESS != (rc = orte_ns.get_parent_job(&parent, job))) {
+    /* check for daemons belonging to any job in this job's family.
+     * Since the jobs in any family must exit together, it is reasonable
+     * for us to reuse any daemons that were spawned by any member
+     * of our extended family. We can find all of our family members
+     * by first finding our root job, and then getting all of its
+     * descendants
+     */
+    if (ORTE_SUCCESS != (rc = orte_ns.get_root_job(&root, job))) {
         ORTE_ERROR_LOG(rc);
         return rc;
     }
 
-    if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(daemons, parent, NULL))) {
+    if (ORTE_SUCCESS != (rc = orte_ns.get_job_descendants(&descendants, &ndesc, root))) {
         ORTE_ERROR_LOG(rc);
         return rc;
     }
 
+    /* loop through the descendants, adding to the daemon list as we go */
+    for (i=0; i < ndesc; i++) {
+        if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(daemons, descendants[i], NULL))) {
+            ORTE_ERROR_LOG(rc);
+            free(descendants);
+            return rc;
+        }
+    }
+    free(descendants);  /* all done with these */
+
     /* now add in any persistent daemons - they are tagged as bootproxies
-     * for jobid = 0 */
+     * for jobid = 0
+     */
     if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(daemons, 0, NULL))) {
         ORTE_ERROR_LOG(rc);
         return rc;
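
A side effect of the family-wide search above: the daemon list is now filled by repeated calls to orte_pls_base_get_active_daemons (one per descendant job, plus the persistent jobid-0 daemons), so the same daemon can be reported more than once. That is exactly what the new check_dups path in get_daemons guards against. A minimal caller-side sketch, assuming the usual OPAL object lifecycle; the variable names are illustrative:

/* illustrative caller, not from this commit: collect every daemon
 * already available to this job's family */
opal_list_t avail;
int rc;

OBJ_CONSTRUCT(&avail, opal_list_t);
if (ORTE_SUCCESS != (rc = orte_pls_base_check_avail_daemons(&avail, jobid))) {
    ORTE_ERROR_LOG(rc);
}
/* ... decide whether/where to reuse; release of the list items omitted ... */
OBJ_DESTRUCT(&avail);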
@@ -85,6 +85,7 @@
 #include "orte/mca/smr/smr.h"
 
 #include "orte/mca/pls/pls.h"
 #include "orte/mca/pls/base/base.h"
+#include "orte/mca/pls/base/pls_private.h"
 #include "orte/mca/pls/gridengine/pls_gridengine.h"
 
@@ -226,19 +227,33 @@ int orte_pls_gridengine_launch_job(orte_jobid_t jobid)
     rc = orte_rmaps.get_job_map(&map, jobid);
     if (ORTE_SUCCESS != rc) {
         ORTE_ERROR_LOG(rc);
-        goto cleanup;
+        OBJ_DESTRUCT(&daemons);
+        return rc;
     }
 
+    /* if the user requested that we re-use daemons,
+     * launch the procs on any existing, re-usable daemons
+     */
+    if (orte_pls_base.reuse_daemons) {
+        if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map, jobid))) {
+            ORTE_ERROR_LOG(rc);
+            OBJ_RELEASE(map);
+            OBJ_DESTRUCT(&daemons);
+            return rc;
+        }
+    }
+
     num_nodes = (orte_std_cntr_t)opal_list_get_size(&map->nodes);
 
+    if (num_nodes == 0) {
+        /* job must have been launched on existing daemons - just return */
+        OBJ_RELEASE(map);
+        OBJ_DESTRUCT(&daemons);
+        return ORTE_SUCCESS;
+    }
+
     /*
      * Allocate a range of vpids for the daemons.
      */
-    if (num_nodes == 0) {
-        rc = ORTE_ERR_BAD_PARAM;
-        ORTE_ERROR_LOG(rc);
-        goto cleanup;
-    }
     rc = orte_ns.reserve_range(0, num_nodes, &vpid);
     if (ORTE_SUCCESS != rc) {
         ORTE_ERROR_LOG(rc);
@@ -479,23 +479,29 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
     */
     rc = orte_rmaps.get_job_map(&map, jobid);
     if (ORTE_SUCCESS != rc) {
-        goto cleanup;
+        ORTE_ERROR_LOG(rc);
+        OBJ_DESTRUCT(&active_daemons);
+        return rc;
     }
 
     /* if the user requested that we re-use daemons,
-     * launch the procs on any existing, re-usable daemons */
+     * launch the procs on any existing, re-usable daemons
+     */
     if (orte_pls_base.reuse_daemons) {
         if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map, jobid))) {
             ORTE_ERROR_LOG(rc);
-            goto cleanup;
+            OBJ_RELEASE(map);
+            OBJ_DESTRUCT(&active_daemons);
+            return rc;
         }
     }
 
     num_nodes = (orte_std_cntr_t)opal_list_get_size(&map->nodes);
-    if (0 >= num_nodes) {
+    if (0 == num_nodes) {
         /* nothing left to do - just return */
-        rc = ORTE_SUCCESS;
-        goto cleanup;
+        OBJ_RELEASE(map);
+        OBJ_DESTRUCT(&active_daemons);
+        return ORTE_SUCCESS;
     }
 
     if (mca_pls_rsh_component.debug_daemons &&
@@ -65,6 +65,7 @@
 #include "orte/mca/rmaps/rmaps.h"
 
 #include "orte/mca/pls/pls.h"
 #include "orte/mca/pls/base/base.h"
+#include "orte/mca/pls/base/pls_private.h"
 #include "pls_slurm.h"
 
@@ -156,15 +157,34 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
     */
     rc = orte_rmaps.get_job_map(&map, jobid);
     if (ORTE_SUCCESS != rc) {
-        goto cleanup;
+        ORTE_ERROR_LOG(rc);
+        OBJ_DESTRUCT(&daemons);
+        return rc;
     }
 
+    /* if the user requested that we re-use daemons,
+     * launch the procs on any existing, re-usable daemons
+     */
+    if (orte_pls_base.reuse_daemons) {
+        if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map, jobid))) {
+            ORTE_ERROR_LOG(rc);
+            OBJ_RELEASE(map);
+            OBJ_DESTRUCT(&daemons);
+            return rc;
+        }
+    }
+
     /*
      * Allocate a range of vpids for the daemons.
      */
     num_nodes = opal_list_get_size(&map->nodes);
     if (num_nodes == 0) {
-        return ORTE_ERR_BAD_PARAM;
+        /* nothing further to do - job must have been launched
+         * on existing daemons, so we can just return
+         */
+        OBJ_RELEASE(map);
+        OBJ_DESTRUCT(&daemons);
+        return ORTE_SUCCESS;
     }
     rc = orte_ns.reserve_range(0, num_nodes, &vpid);
     if (ORTE_SUCCESS != rc) {
@@ -69,7 +69,7 @@
 #include "orte/mca/rml/rml.h"
 #include "orte/mca/ns/ns.h"
-
+
 #include "orte/mca/pls/base/base.h"
 #include "orte/mca/pls/base/pls_private.h"
 #include "pls_tm.h"
 
@@ -164,17 +164,31 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
     */
     rc = orte_rmaps.get_job_map(&map, jobid);
     if (ORTE_SUCCESS != rc) {
-        goto cleanup;
+        ORTE_ERROR_LOG(rc);
+        return rc;
     }
 
+    /* if the user requested that we re-use daemons,
+     * launch the procs on any existing, re-usable daemons
+     */
+    if (orte_pls_base.reuse_daemons) {
+        if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map, jobid))) {
+            ORTE_ERROR_LOG(rc);
+            OBJ_RELEASE(map);
+            return rc;
+        }
+    }
+
     num_nodes = opal_list_get_size(&map->nodes);
 
+    if (0 == num_nodes) {
+        /* must have been launched on existing daemons - just return */
+        OBJ_RELEASE(map);
+        return ORTE_SUCCESS;
+    }
+
     /*
      * Allocate a range of vpids for the daemons.
      */
-    if (0 == num_nodes) {
-        return ORTE_ERR_BAD_PARAM;
-    }
     rc = orte_ns.reserve_range(0, num_nodes, &vpid);
     if (ORTE_SUCCESS != rc) {
         goto cleanup;
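
Usage note: per the commit message, this behavior is requested via the --reuse-daemons directive (and simply ignored under Bproc). Presumably an mpirun invocation along the lines of mpirun --reuse-daemons ... is what ends up setting orte_pls_base.reuse_daemons, but that command-line plumbing is not part of this diff.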