
Extend the daemon reuse functionality to *most* of the other environments.

Note that Bproc won't support this operation, so we just ignore the --reuse-daemons directive.

I'm afraid I don't understand the POE and XGrid environments well enough to attempt the necessary modifications.

Also, please note that XGrid support has been broken on the trunk. I don't understand the code syntax well enough to make the required changes to that PLS component, so it won't compile at the moment. I'm hoping Brian has a few minutes to fix it after SC.

This commit was SVN r12614.
Ralph Castain 2006-11-16 15:11:45 +00:00
parent 044898f4bf
Commit c1813e5c5a
5 changed files with 121 additions and 35 deletions
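
For reference, the gridengine, rsh, slurm, and tm launchers all pick up the same reuse check. The sketch below is distilled from the diffs that follow and is not verbatim code: the wrapper name launch_with_reuse and the orte_job_map_t declaration are illustrative assumptions, while the calls (orte_rmaps.get_job_map, orte_pls_base_launch_on_existing_daemons, opal_list_get_size) and the overall flow come straight from the diffs.

    /* A minimal sketch of the reuse path each launcher now follows.
     * The function name is hypothetical; cleanup of each component's
     * local daemon list is omitted here but present in the real code. */
    static int launch_with_reuse(orte_jobid_t jobid)
    {
        orte_job_map_t *map;        /* assumed map type returned by orte_rmaps.get_job_map */
        orte_std_cntr_t num_nodes;
        int rc;

        /* get the map of where the procs are supposed to go */
        if (ORTE_SUCCESS != (rc = orte_rmaps.get_job_map(&map, jobid))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* if --reuse-daemons was given, place as many procs as possible
         * on daemons that are already running */
        if (orte_pls_base.reuse_daemons) {
            if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map, jobid))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(map);
                return rc;
            }
        }

        /* any nodes still in the map need a fresh daemon; if none remain,
         * the job went entirely onto existing daemons and we are done */
        num_nodes = (orte_std_cntr_t)opal_list_get_size(&map->nodes);
        if (0 == num_nodes) {
            OBJ_RELEASE(map);
            return ORTE_SUCCESS;
        }

        /* ...otherwise fall through to the component's normal daemon launch... */
        return ORTE_SUCCESS;
    }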


@@ -149,8 +149,19 @@ static int get_daemons(opal_list_t *daemons, orte_jobid_t job)
     orte_pls_daemon_info_t *dmn, *dmn2;
     bool found_name, found_node, found_cell;
     opal_list_item_t *item;
+    bool check_dups;
     int rc;

+    /* check the list to see if there is anything already on it. If there is, then
+     * we will need to check for duplicate entries before we add something. If not,
+     * then this can go a lot faster
+     */
+    if (0 < opal_list_get_size(daemons)) {
+        check_dups = true;
+    } else {
+        check_dups = false;
+    }
+
     /* setup the key */
     if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&jobid_string, job))) {
         ORTE_ERROR_LOG(rc);
@@ -208,6 +219,7 @@ static int get_daemons(opal_list_t *daemons, orte_jobid_t job)
         }
         /* if we found everything, then this is a valid entry */
         if (found_name && found_node && found_cell) {
+            if (check_dups) {
             /* see if this daemon is already on the list - if so, then we don't add it */
             for (item = opal_list_get_first(daemons);
                  item != opal_list_get_end(daemons);
@@ -219,6 +231,7 @@ static int get_daemons(opal_list_t *daemons, orte_jobid_t job)
                     goto MOVEON;
                 }
             }
+            }
             dmn = OBJ_NEW(orte_pls_daemon_info_t);
             if (NULL == dmn) {
                 ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
@@ -316,22 +329,40 @@ int orte_pls_base_remove_daemon(orte_pls_daemon_info_t *info)
 int orte_pls_base_check_avail_daemons(opal_list_t *daemons,
                                       orte_jobid_t job)
 {
-    orte_jobid_t parent;
+    orte_jobid_t root, *descendants;
+    orte_std_cntr_t i, ndesc;
     int rc;

-    /* check for daemons belonging to the parent job */
-    if (ORTE_SUCCESS != (rc = orte_ns.get_parent_job(&parent, job))) {
+    /* check for daemons belonging to any job in this job's family.
+     * Since the jobs in any family must exit together, it is reasonable
+     * for us to reuse any daemons that were spawned by any member
+     * of our extended family. We can find all of our family members
+     * by first finding our root job, and then getting all of its
+     * descendants
+     */
+    if (ORTE_SUCCESS != (rc = orte_ns.get_root_job(&root, job))) {
         ORTE_ERROR_LOG(rc);
         return rc;
     }
-    if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(daemons, parent, NULL))) {
+    if (ORTE_SUCCESS != (rc = orte_ns.get_job_descendants(&descendants, &ndesc, root))) {
         ORTE_ERROR_LOG(rc);
         return rc;
     }
+
+    /* loop through the descendants, adding to the daemon list as we go */
+    for (i=0; i < ndesc; i++) {
+        if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(daemons, descendants[i], NULL))) {
+            ORTE_ERROR_LOG(rc);
+            free(descendants);
+            return rc;
+        }
+    }
+    free(descendants);  /* all done with these */
+
     /* now add in any persistent daemons - they are tagged as bootproxies
-     * for jobid = 0 */
+     * for jobid = 0
+     */
     if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(daemons, 0, NULL))) {
         ORTE_ERROR_LOG(rc);
         return rc;


@@ -85,6 +85,7 @@
 #include "orte/mca/smr/smr.h"
 #include "orte/mca/pls/pls.h"
+#include "orte/mca/pls/base/base.h"
 #include "orte/mca/pls/base/pls_private.h"
 #include "orte/mca/pls/gridengine/pls_gridengine.h"
@@ -226,19 +227,33 @@ int orte_pls_gridengine_launch_job(orte_jobid_t jobid)
     rc = orte_rmaps.get_job_map(&map, jobid);
     if (ORTE_SUCCESS != rc) {
         ORTE_ERROR_LOG(rc);
-        goto cleanup;
+        OBJ_DESTRUCT(&daemons);
+        return rc;
+    }
+
+    /* if the user requested that we re-use daemons,
+     * launch the procs on any existing, re-usable daemons
+     */
+    if (orte_pls_base.reuse_daemons) {
+        if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map, jobid))) {
+            ORTE_ERROR_LOG(rc);
+            OBJ_RELEASE(map);
+            OBJ_DESTRUCT(&daemons);
+            return rc;
+        }
     }

     num_nodes = (orte_std_cntr_t)opal_list_get_size(&map->nodes);
+    if (num_nodes == 0) {
+        /* job must have been launched on existing daemons - just return */
+        OBJ_RELEASE(map);
+        OBJ_DESTRUCT(&daemons);
+        return ORTE_SUCCESS;
+    }

     /*
      * Allocate a range of vpids for the daemons.
      */
-    if (num_nodes == 0) {
-        rc = ORTE_ERR_BAD_PARAM;
-        ORTE_ERROR_LOG(rc);
-        goto cleanup;
-    }
     rc = orte_ns.reserve_range(0, num_nodes, &vpid);
     if (ORTE_SUCCESS != rc) {
         ORTE_ERROR_LOG(rc);


@@ -479,23 +479,29 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
      */
     rc = orte_rmaps.get_job_map(&map, jobid);
     if (ORTE_SUCCESS != rc) {
-        goto cleanup;
+        ORTE_ERROR_LOG(rc);
+        OBJ_DESTRUCT(&active_daemons);
+        return rc;
     }

     /* if the user requested that we re-use daemons,
-     * launch the procs on any existing, re-usable daemons */
+     * launch the procs on any existing, re-usable daemons
+     */
     if (orte_pls_base.reuse_daemons) {
         if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map, jobid))) {
             ORTE_ERROR_LOG(rc);
-            goto cleanup;
+            OBJ_RELEASE(map);
+            OBJ_DESTRUCT(&active_daemons);
+            return rc;
         }
     }

     num_nodes = (orte_std_cntr_t)opal_list_get_size(&map->nodes);
-    if (0 >= num_nodes) {
+    if (0 == num_nodes) {
         /* nothing left to do - just return */
-        rc = ORTE_SUCCESS;
-        goto cleanup;
+        OBJ_RELEASE(map);
+        OBJ_DESTRUCT(&active_daemons);
+        return ORTE_SUCCESS;
     }

     if (mca_pls_rsh_component.debug_daemons &&


@@ -65,6 +65,7 @@
 #include "orte/mca/rmaps/rmaps.h"
 #include "orte/mca/pls/pls.h"
+#include "orte/mca/pls/base/base.h"
 #include "orte/mca/pls/base/pls_private.h"
 #include "pls_slurm.h"
@@ -156,7 +157,21 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
      */
     rc = orte_rmaps.get_job_map(&map, jobid);
     if (ORTE_SUCCESS != rc) {
-        goto cleanup;
+        ORTE_ERROR_LOG(rc);
+        OBJ_DESTRUCT(&daemons);
+        return rc;
+    }
+
+    /* if the user requested that we re-use daemons,
+     * launch the procs on any existing, re-usable daemons
+     */
+    if (orte_pls_base.reuse_daemons) {
+        if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map, jobid))) {
+            ORTE_ERROR_LOG(rc);
+            OBJ_RELEASE(map);
+            OBJ_DESTRUCT(&daemons);
+            return rc;
+        }
     }

     /*
@@ -164,7 +179,12 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
      */
     num_nodes = opal_list_get_size(&map->nodes);
     if (num_nodes == 0) {
-        return ORTE_ERR_BAD_PARAM;
+        /* nothing further to do - job must have been launched
+         * on existing daemons, so we can just return
+         */
+        OBJ_RELEASE(map);
+        OBJ_DESTRUCT(&daemons);
+        return ORTE_SUCCESS;
     }

     rc = orte_ns.reserve_range(0, num_nodes, &vpid);
     if (ORTE_SUCCESS != rc) {


@@ -69,7 +69,7 @@
 #include "orte/mca/rml/rml.h"
 #include "orte/mca/ns/ns.h"
+#include "orte/mca/pls/base/base.h"
 #include "orte/mca/pls/base/pls_private.h"
 #include "pls_tm.h"
@@ -164,17 +164,31 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
      */
     rc = orte_rmaps.get_job_map(&map, jobid);
     if (ORTE_SUCCESS != rc) {
-        goto cleanup;
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+
+    /* if the user requested that we re-use daemons,
+     * launch the procs on any existing, re-usable daemons
+     */
+    if (orte_pls_base.reuse_daemons) {
+        if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map, jobid))) {
+            ORTE_ERROR_LOG(rc);
+            OBJ_RELEASE(map);
+            return rc;
+        }
     }

     num_nodes = opal_list_get_size(&map->nodes);
+    if (0 == num_nodes) {
+        /* must have been launched on existing daemons - just return */
+        OBJ_RELEASE(map);
+        return ORTE_SUCCESS;
+    }

     /*
      * Allocate a range of vpids for the daemons.
      */
-    if (0 == num_nodes) {
-        return ORTE_ERR_BAD_PARAM;
-    }
     rc = orte_ns.reserve_range(0, num_nodes, &vpid);
     if (ORTE_SUCCESS != rc) {
         goto cleanup;