
Extend the daemon reuse functionality to *most* of the other environments.

Note that Bproc won't support this operation, so we just ignore the --reuse-daemons directive.

I'm afraid I don't understand the POE and XGrid environments well enough to attempt the necessary modifications.

Also, please note that XGrid support has been broken on the trunk. I don't understand the code syntax well enough to make the required changes to that PLS component, so it won't compile at the moment. I'm hoping Brian has a few minutes to fix it after SC.

This commit was SVN r12614.
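For orientation, each of the launchers touched below now follows the same three-step pattern: get the job map, place procs on any existing re-usable daemons, and only spawn new daemons for whatever nodes remain in the map. Below is a minimal composite sketch of that pattern, not verbatim code from this commit: the function and field names (orte_pls_base.reuse_daemons, orte_pls_base_launch_on_existing_daemons, orte_rmaps.get_job_map) are taken from the hunks, while the header paths and the orte_job_map_t type name are assumptions.

/* Composite sketch of the daemon-reuse pattern; header paths and the
 * map type name are assumptions, the rest mirrors the hunks below. */
#include "orte/mca/errmgr/errmgr.h"      /* assumed: provides ORTE_ERROR_LOG */
#include "orte/mca/rmaps/rmaps.h"
#include "orte/mca/pls/base/base.h"
#include "orte/mca/pls/base/pls_private.h"

static int launch_job_with_reuse(orte_jobid_t jobid)
{
    orte_job_map_t *map;          /* assumed name of the rmaps map type */
    orte_std_cntr_t num_nodes;
    int rc;

    /* get the map of where this job's procs should run */
    rc = orte_rmaps.get_job_map(&map, jobid);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* if the user asked for --reuse-daemons, place procs on any
     * existing, re-usable daemons first */
    if (orte_pls_base.reuse_daemons) {
        if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map, jobid))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(map);
            return rc;
        }
    }

    /* nodes still left in the map need fresh daemons; if none remain,
     * the entire job went onto existing daemons and we are done */
    num_nodes = (orte_std_cntr_t)opal_list_get_size(&map->nodes);
    if (0 == num_nodes) {
        OBJ_RELEASE(map);
        return ORTE_SUCCESS;
    }

    /* ...launcher-specific work continues here: reserve a vpid range,
     * build the daemon command line, and spawn onto the remaining nodes... */
    OBJ_RELEASE(map);
    return ORTE_SUCCESS;
}

Bproc is the one launcher that skips this block entirely, since (as noted above) it cannot reuse its daemons and simply ignores the directive.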
This commit is contained in:
Ralph Castain 2006-11-16 15:11:45 +00:00
parent 044898f4bf
commit c1813e5c5a
5 changed files with 121 additions and 35 deletions

View file

@@ -149,8 +149,19 @@ static int get_daemons(opal_list_t *daemons, orte_jobid_t job)
orte_pls_daemon_info_t *dmn, *dmn2;
bool found_name, found_node, found_cell;
opal_list_item_t *item;
bool check_dups;
int rc;
/* check the list to see if there is anything already on it. If there is, then
* we will need to check for duplicate entries before we add something. If not,
* then this can go a lot faster
*/
if (0 < opal_list_get_size(daemons)) {
check_dups = true;
} else {
check_dups = false;
}
/* setup the key */
if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&jobid_string, job))) {
ORTE_ERROR_LOG(rc);
@@ -208,15 +219,17 @@ static int get_daemons(opal_list_t *daemons, orte_jobid_t job)
}
/* if we found everything, then this is a valid entry */
if (found_name && found_node && found_cell) {
/* see if this daemon is already on the list - if so, then we don't add it */
for (item = opal_list_get_first(daemons);
item != opal_list_get_end(daemons);
item = opal_list_get_next(item)) {
dmn2 = (orte_pls_daemon_info_t*)item;
if (ORTE_EQUAL == orte_dss.compare(dmn2->name, name, ORTE_NAME)) {
/* already on list - ignore it */
goto MOVEON;
if (check_dups) {
/* see if this daemon is already on the list - if so, then we don't add it */
for (item = opal_list_get_first(daemons);
item != opal_list_get_end(daemons);
item = opal_list_get_next(item)) {
dmn2 = (orte_pls_daemon_info_t*)item;
if (ORTE_EQUAL == orte_dss.compare(dmn2->name, name, ORTE_NAME)) {
/* already on list - ignore it */
goto MOVEON;
}
}
}
dmn = OBJ_NEW(orte_pls_daemon_info_t);
@@ -316,22 +329,40 @@ int orte_pls_base_remove_daemon(orte_pls_daemon_info_t *info)
int orte_pls_base_check_avail_daemons(opal_list_t *daemons,
orte_jobid_t job)
{
orte_jobid_t parent;
orte_jobid_t root, *descendants;
orte_std_cntr_t i, ndesc;
int rc;
/* check for daemons belonging to the parent job */
if (ORTE_SUCCESS != (rc = orte_ns.get_parent_job(&parent, job))) {
/* check for daemons belonging to any job in this job's family.
* Since the jobs in any family must exit together, it is reasonable
* for us to reuse any daemons that were spawned by any member
* of our extended family. We can find all of our family members
* by first finding our root job, and then getting all of its
* descendants
*/
if (ORTE_SUCCESS != (rc = orte_ns.get_root_job(&root, job))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(daemons, parent, NULL))) {
if (ORTE_SUCCESS != (rc = orte_ns.get_job_descendants(&descendants, &ndesc, root))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* loop through the descendants, adding to the daemon list as we go */
for (i=0; i < ndesc; i++) {
if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(daemons, descendants[i], NULL))) {
ORTE_ERROR_LOG(rc);
free(descendants);
return rc;
}
}
free(descendants); /* all done with these */
/* now add in any persistent daemons - they are tagged as bootproxies
* for jobid = 0 */
* for jobid = 0
*/
if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(daemons, 0, NULL))) {
ORTE_ERROR_LOG(rc);
return rc;

View file

@@ -85,6 +85,7 @@
#include "orte/mca/smr/smr.h"
#include "orte/mca/pls/pls.h"
#include "orte/mca/pls/base/base.h"
#include "orte/mca/pls/base/pls_private.h"
#include "orte/mca/pls/gridengine/pls_gridengine.h"
@@ -226,19 +227,33 @@ int orte_pls_gridengine_launch_job(orte_jobid_t jobid)
rc = orte_rmaps.get_job_map(&map, jobid);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
OBJ_DESTRUCT(&daemons);
return rc;
}
/* if the user requested that we re-use daemons,
* launch the procs on any existing, re-usable daemons
*/
if (orte_pls_base.reuse_daemons) {
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map, jobid))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(map);
OBJ_DESTRUCT(&daemons);
return rc;
}
}
num_nodes = (orte_std_cntr_t)opal_list_get_size(&map->nodes);
if (num_nodes == 0) {
/* job must have been launched on existing daemons - just return */
OBJ_RELEASE(map);
OBJ_DESTRUCT(&daemons);
return ORTE_SUCCESS;
}
/*
* Allocate a range of vpids for the daemons.
*/
if (num_nodes == 0) {
rc = ORTE_ERR_BAD_PARAM;
ORTE_ERROR_LOG(rc);
goto cleanup;
}
rc = orte_ns.reserve_range(0, num_nodes, &vpid);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);

View file

@@ -479,23 +479,29 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
*/
rc = orte_rmaps.get_job_map(&map, jobid);
if (ORTE_SUCCESS != rc) {
goto cleanup;
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&active_daemons);
return rc;
}
/* if the user requested that we re-use daemons,
* launch the procs on any existing, re-usable daemons */
* launch the procs on any existing, re-usable daemons
*/
if (orte_pls_base.reuse_daemons) {
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map, jobid))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
OBJ_RELEASE(map);
OBJ_DESTRUCT(&active_daemons);
return rc;
}
}
num_nodes = (orte_std_cntr_t)opal_list_get_size(&map->nodes);
if (0 >= num_nodes) {
if (0 == num_nodes) {
/* nothing left to do - just return */
rc = ORTE_SUCCESS;
goto cleanup;
OBJ_RELEASE(map);
OBJ_DESTRUCT(&active_daemons);
return ORTE_SUCCESS;
}
if (mca_pls_rsh_component.debug_daemons &&

View file

@@ -65,6 +65,7 @@
#include "orte/mca/rmaps/rmaps.h"
#include "orte/mca/pls/pls.h"
#include "orte/mca/pls/base/base.h"
#include "orte/mca/pls/base/pls_private.h"
#include "pls_slurm.h"
@@ -156,15 +157,34 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
*/
rc = orte_rmaps.get_job_map(&map, jobid);
if (ORTE_SUCCESS != rc) {
goto cleanup;
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&daemons);
return rc;
}
/* if the user requested that we re-use daemons,
* launch the procs on any existing, re-usable daemons
*/
if (orte_pls_base.reuse_daemons) {
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map, jobid))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(map);
OBJ_DESTRUCT(&daemons);
return rc;
}
}
/*
* Allocate a range of vpids for the daemons.
*/
num_nodes = opal_list_get_size(&map->nodes);
if (num_nodes == 0) {
return ORTE_ERR_BAD_PARAM;
/* nothing further to do - job must have been launched
* on existing daemons, so we can just return
*/
OBJ_RELEASE(map);
OBJ_DESTRUCT(&daemons);
return ORTE_SUCCESS;
}
rc = orte_ns.reserve_range(0, num_nodes, &vpid);
if (ORTE_SUCCESS != rc) {

View file

@@ -69,7 +69,7 @@
#include "orte/mca/rml/rml.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/pls/base/base.h"
#include "orte/mca/pls/base/pls_private.h"
#include "pls_tm.h"
@@ -164,17 +164,31 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
*/
rc = orte_rmaps.get_job_map(&map, jobid);
if (ORTE_SUCCESS != rc) {
goto cleanup;
ORTE_ERROR_LOG(rc);
return rc;
}
/* if the user requested that we re-use daemons,
* launch the procs on any existing, re-usable daemons
*/
if (orte_pls_base.reuse_daemons) {
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map, jobid))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(map);
return rc;
}
}
num_nodes = opal_list_get_size(&map->nodes);
if (0 == num_nodes) {
/* must have been launched on existing daemons - just return */
OBJ_RELEASE(map);
return ORTE_SUCCESS;
}
/*
* Allocate a range of vpids for the daemons.
*/
if (0 == num_nodes) {
return ORTE_ERR_BAD_PARAM;
}
rc = orte_ns.reserve_range(0, num_nodes, &vpid);
if (ORTE_SUCCESS != rc) {
goto cleanup;