diff --git a/orte/mca/pls/base/pls_base_dmn_registry_fns.c b/orte/mca/pls/base/pls_base_dmn_registry_fns.c
index b882e316a7..52f587bc6a 100644
--- a/orte/mca/pls/base/pls_base_dmn_registry_fns.c
+++ b/orte/mca/pls/base/pls_base_dmn_registry_fns.c
@@ -149,8 +149,19 @@ static int get_daemons(opal_list_t *daemons, orte_jobid_t job)
     orte_pls_daemon_info_t *dmn, *dmn2;
     bool found_name, found_node, found_cell;
     opal_list_item_t *item;
+    bool check_dups;
     int rc;
 
+    /* check the list to see if there is anything already on it. If there is, then
+     * we will need to check for duplicate entries before we add something. If not,
+     * then this can go a lot faster
+     */
+    if (0 < opal_list_get_size(daemons)) {
+        check_dups = true;
+    } else {
+        check_dups = false;
+    }
+
     /* setup the key */
     if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&jobid_string, job))) {
         ORTE_ERROR_LOG(rc);
@@ -208,15 +219,17 @@ static int get_daemons(opal_list_t *daemons, orte_jobid_t job)
         }
         /* if we found everything, then this is a valid entry */
         if (found_name && found_node && found_cell) {
-            /* see if this daemon is already on the list - if so, then we don't add it */
-            for (item = opal_list_get_first(daemons);
-                 item != opal_list_get_end(daemons);
-                 item = opal_list_get_next(item)) {
-                dmn2 = (orte_pls_daemon_info_t*)item;
-
-                if (ORTE_EQUAL == orte_dss.compare(dmn2->name, name, ORTE_NAME)) {
-                    /* already on list - ignore it */
-                    goto MOVEON;
+            if (check_dups) {
+                /* see if this daemon is already on the list - if so, then we don't add it */
+                for (item = opal_list_get_first(daemons);
+                     item != opal_list_get_end(daemons);
+                     item = opal_list_get_next(item)) {
+                    dmn2 = (orte_pls_daemon_info_t*)item;
+
+                    if (ORTE_EQUAL == orte_dss.compare(dmn2->name, name, ORTE_NAME)) {
+                        /* already on list - ignore it */
+                        goto MOVEON;
+                    }
                 }
             }
             dmn = OBJ_NEW(orte_pls_daemon_info_t);
@@ -316,22 +329,40 @@ int orte_pls_base_remove_daemon(orte_pls_daemon_info_t *info)
 
 int orte_pls_base_check_avail_daemons(opal_list_t *daemons, orte_jobid_t job)
 {
-    orte_jobid_t parent;
+    orte_jobid_t root, *descendants;
+    orte_std_cntr_t i, ndesc;
     int rc;
 
-    /* check for daemons belonging to the parent job */
-    if (ORTE_SUCCESS != (rc = orte_ns.get_parent_job(&parent, job))) {
+    /* check for daemons belonging to any job in this job's family.
+     * Since the jobs in any family must exit together, it is reasonable
+     * for us to reuse any daemons that were spawned by any member
+     * of our extended family. We can find all of our family members
+     * by first finding our root job, and then getting all of its
+     * descendants
+     */
+    if (ORTE_SUCCESS != (rc = orte_ns.get_root_job(&root, job))) {
         ORTE_ERROR_LOG(rc);
         return rc;
     }
-    if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(daemons, parent, NULL))) {
+    if (ORTE_SUCCESS != (rc = orte_ns.get_job_descendants(&descendants, &ndesc, root))) {
         ORTE_ERROR_LOG(rc);
         return rc;
     }
+    /* loop through the descendants, adding to the daemon list as we go */
+    for (i=0; i < ndesc; i++) {
+        if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(daemons, descendants[i], NULL))) {
+            ORTE_ERROR_LOG(rc);
+            free(descendants);
+            return rc;
+        }
+    }
+    free(descendants);  /* all done with these */
+
     /* now add in any persistent daemons - they are tagged as bootproxies
-     * for jobid = 0 */
+     * for jobid = 0
+     */
    if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(daemons, 0, NULL))) {
         ORTE_ERROR_LOG(rc);
         return rc;
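For readers outside the ORTE tree, here is a minimal standalone C sketch of the two ideas in the file above: gather daemons from every descendant job of a family's root, and skip the duplicate scan whenever the output list starts out empty. The list type and every name here (daemon_t, add_daemon, the hard-coded descendant table) are hypothetical stand-ins for opal_list_t, orte_pls_daemon_info_t, and the registry lookups, not the real API.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct daemon {
    int name;               /* stand-in for the daemon's process name */
    struct daemon *next;
} daemon_t;

/* add 'name' unless it is already present; the scan only runs when
 * check_dups is set, mirroring the fast path the patch introduces */
static void add_daemon(daemon_t **list, int name, bool check_dups)
{
    if (check_dups) {
        for (daemon_t *d = *list; NULL != d; d = d->next) {
            if (d->name == name) {
                return;     /* already on list - ignore it */
            }
        }
    }
    daemon_t *d = malloc(sizeof(*d));
    d->name = name;
    d->next = *list;
    *list = d;
}

int main(void)
{
    /* pretend the root job has three descendant jobs, each of which
     * contributed these daemons (jobs 0 and 1 share daemon 7) */
    int descendants[][2] = { {5, 7}, {7, 9}, {11, 13} };
    int ndesc = 3;
    daemon_t *daemons = NULL;

    /* an empty output list means the first batch cannot collide */
    bool check_dups = (NULL != daemons);
    for (int i = 0; i < ndesc; i++) {
        for (int j = 0; j < 2; j++) {
            add_daemon(&daemons, descendants[i][j], check_dups);
        }
        check_dups = (NULL != daemons);  /* list is non-empty from now on */
    }

    for (daemon_t *d = daemons; NULL != d; d = d->next) {
        printf("daemon %d\n", d->name);
    }
    while (daemons) {       /* release the list */
        daemon_t *next = daemons->next;
        free(daemons);
        daemons = next;
    }
    return 0;
}

Running this prints each daemon once; daemon 7 is added by the first job and skipped when the second job offers it again.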
diff --git a/orte/mca/pls/gridengine/pls_gridengine_module.c b/orte/mca/pls/gridengine/pls_gridengine_module.c
index 0cab075ec3..99bbf1c06d 100644
--- a/orte/mca/pls/gridengine/pls_gridengine_module.c
+++ b/orte/mca/pls/gridengine/pls_gridengine_module.c
@@ -85,6 +85,7 @@
 #include "orte/mca/smr/smr.h"
 
 #include "orte/mca/pls/pls.h"
+#include "orte/mca/pls/base/base.h"
 #include "orte/mca/pls/base/pls_private.h"
 #include "orte/mca/pls/gridengine/pls_gridengine.h"
 
@@ -226,19 +227,33 @@ int orte_pls_gridengine_launch_job(orte_jobid_t jobid)
     rc = orte_rmaps.get_job_map(&map, jobid);
     if (ORTE_SUCCESS != rc) {
         ORTE_ERROR_LOG(rc);
-        goto cleanup;
+        OBJ_DESTRUCT(&daemons);
+        return rc;
     }
 
+    /* if the user requested that we re-use daemons,
+     * launch the procs on any existing, re-usable daemons
+     */
+    if (orte_pls_base.reuse_daemons) {
+        if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map, jobid))) {
+            ORTE_ERROR_LOG(rc);
+            OBJ_RELEASE(map);
+            OBJ_DESTRUCT(&daemons);
+            return rc;
+        }
+    }
+
     num_nodes = (orte_std_cntr_t)opal_list_get_size(&map->nodes);
-
+    if (num_nodes == 0) {
+        /* job must have been launched on existing daemons - just return */
+        OBJ_RELEASE(map);
+        OBJ_DESTRUCT(&daemons);
+        return ORTE_SUCCESS;
+    }
+
     /*
      * Allocate a range of vpids for the daemons.
      */
-    if (num_nodes == 0) {
-        rc = ORTE_ERR_BAD_PARAM;
-        ORTE_ERROR_LOG(rc);
-        goto cleanup;
-    }
     rc = orte_ns.reserve_range(0, num_nodes, &vpid);
     if (ORTE_SUCCESS != rc) {
         ORTE_ERROR_LOG(rc);
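The gridengine hunk above, and the rsh, slurm, and tm hunks below, all adopt the same control flow: place procs on re-usable daemons first, then treat an empty node map as success rather than ORTE_ERR_BAD_PARAM, since it simply means every proc landed on an existing daemon. The following is a standalone sketch of that flow under stated assumptions; the stub names (reuse_daemons, launch_on_existing_daemons, nodes_remaining) are hypothetical stand-ins for orte_pls_base.reuse_daemons, orte_pls_base_launch_on_existing_daemons, and the node-map size, not the ORTE calls.

#include <stdio.h>

#define SKETCH_SUCCESS 0

static int reuse_daemons = 1;      /* stand-in for the reuse_daemons flag */

/* pretend every node already hosts a re-usable daemon */
static int launch_on_existing_daemons(int *nodes_remaining)
{
    *nodes_remaining = 0;
    return SKETCH_SUCCESS;
}

static int launch_job(void)
{
    int rc, nodes_remaining = 2;   /* stand-in for the node-map size */

    /* first try to place procs on daemons that already exist */
    if (reuse_daemons) {
        rc = launch_on_existing_daemons(&nodes_remaining);
        if (SKETCH_SUCCESS != rc) {
            return rc;             /* real code releases its objects here */
        }
    }
    if (0 == nodes_remaining) {
        /* fully launched on existing daemons - nothing left to do */
        return SKETCH_SUCCESS;
    }
    /* otherwise fall through and spawn new daemons on what remains */
    printf("spawning daemons on %d node(s)\n", nodes_remaining);
    return SKETCH_SUCCESS;
}

int main(void)
{
    return launch_job();
}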
diff --git a/orte/mca/pls/rsh/pls_rsh_module.c b/orte/mca/pls/rsh/pls_rsh_module.c
index 35da010565..999560a966 100644
--- a/orte/mca/pls/rsh/pls_rsh_module.c
+++ b/orte/mca/pls/rsh/pls_rsh_module.c
@@ -479,23 +479,29 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
      */
     rc = orte_rmaps.get_job_map(&map, jobid);
     if (ORTE_SUCCESS != rc) {
-        goto cleanup;
+        ORTE_ERROR_LOG(rc);
+        OBJ_DESTRUCT(&active_daemons);
+        return rc;
     }
 
     /* if the user requested that we re-use daemons,
-     * launch the procs on any existing, re-usable daemons */
+     * launch the procs on any existing, re-usable daemons
+     */
     if (orte_pls_base.reuse_daemons) {
         if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map, jobid))) {
             ORTE_ERROR_LOG(rc);
-            goto cleanup;
+            OBJ_RELEASE(map);
+            OBJ_DESTRUCT(&active_daemons);
+            return rc;
         }
     }
 
     num_nodes = (orte_std_cntr_t)opal_list_get_size(&map->nodes);
-    if (0 >= num_nodes) {
+    if (0 == num_nodes) {
         /* nothing left to do - just return */
-        rc = ORTE_SUCCESS;
-        goto cleanup;
+        OBJ_RELEASE(map);
+        OBJ_DESTRUCT(&active_daemons);
+        return ORTE_SUCCESS;
     }
 
     if (mca_pls_rsh_component.debug_daemons &&
diff --git a/orte/mca/pls/slurm/pls_slurm_module.c b/orte/mca/pls/slurm/pls_slurm_module.c
index 31f187c5b5..e267bd1f2c 100644
--- a/orte/mca/pls/slurm/pls_slurm_module.c
+++ b/orte/mca/pls/slurm/pls_slurm_module.c
@@ -65,6 +65,7 @@
 #include "orte/mca/rmaps/rmaps.h"
 
 #include "orte/mca/pls/pls.h"
+#include "orte/mca/pls/base/base.h"
 #include "orte/mca/pls/base/pls_private.h"
 #include "pls_slurm.h"
 
@@ -156,15 +157,34 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
      */
     rc = orte_rmaps.get_job_map(&map, jobid);
     if (ORTE_SUCCESS != rc) {
-        goto cleanup;
+        ORTE_ERROR_LOG(rc);
+        OBJ_DESTRUCT(&daemons);
+        return rc;
     }
 
+    /* if the user requested that we re-use daemons,
+     * launch the procs on any existing, re-usable daemons
+     */
+    if (orte_pls_base.reuse_daemons) {
+        if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map, jobid))) {
+            ORTE_ERROR_LOG(rc);
+            OBJ_RELEASE(map);
+            OBJ_DESTRUCT(&daemons);
+            return rc;
+        }
+    }
+
     /*
      * Allocate a range of vpids for the daemons.
      */
     num_nodes = opal_list_get_size(&map->nodes);
     if (num_nodes == 0) {
-        return ORTE_ERR_BAD_PARAM;
+        /* nothing further to do - job must have been launched
+         * on existing daemons, so we can just return
+         */
+        OBJ_RELEASE(map);
+        OBJ_DESTRUCT(&daemons);
+        return ORTE_SUCCESS;
     }
 
     rc = orte_ns.reserve_range(0, num_nodes, &vpid);
     if (ORTE_SUCCESS != rc) {
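These launcher hunks also replace goto cleanup on early exits with explicit release-and-return, so each exit path tears down exactly what has been acquired so far instead of jumping to a shared label that assumed a launch had happened. A standalone sketch of that error-path style, using plain malloc/free as hypothetical stand-ins for the OBJ_RELEASE/OBJ_DESTRUCT calls:

#include <stdio.h>
#include <stdlib.h>

static int get_map(char **map)
{
    *map = malloc(16);
    return (NULL == *map) ? -1 : 0;
}

static int place_procs(const char *map)
{
    (void)map;
    return 0;    /* flip to -1 to exercise the error path */
}

static int launch(void)
{
    char *daemon_list = malloc(16);   /* acquired first */
    char *map = NULL;
    int rc;

    if (NULL == daemon_list) {
        return -1;                    /* nothing acquired yet */
    }
    if (0 != (rc = get_map(&map))) {
        free(daemon_list);            /* release only what we hold */
        return rc;
    }
    if (0 != (rc = place_procs(map))) {
        free(map);                    /* both resources held here */
        free(daemon_list);
        return rc;
    }
    printf("launched\n");
    free(map);
    free(daemon_list);
    return 0;
}

int main(void)
{
    return launch();
}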
diff --git a/orte/mca/pls/tm/pls_tm_module.c b/orte/mca/pls/tm/pls_tm_module.c
index ea969daaf4..09f4a50f64 100644
--- a/orte/mca/pls/tm/pls_tm_module.c
+++ b/orte/mca/pls/tm/pls_tm_module.c
@@ -69,7 +69,7 @@
 #include "orte/mca/rml/rml.h"
 #include "orte/mca/ns/ns.h"
 
-
+#include "orte/mca/pls/base/base.h"
 #include "orte/mca/pls/base/pls_private.h"
 #include "pls_tm.h"
 
@@ -164,17 +164,31 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
      */
     rc = orte_rmaps.get_job_map(&map, jobid);
     if (ORTE_SUCCESS != rc) {
-        goto cleanup;
+        ORTE_ERROR_LOG(rc);
+        return rc;
     }
 
+    /* if the user requested that we re-use daemons,
+     * launch the procs on any existing, re-usable daemons
+     */
+    if (orte_pls_base.reuse_daemons) {
+        if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map, jobid))) {
+            ORTE_ERROR_LOG(rc);
+            OBJ_RELEASE(map);
+            return rc;
+        }
+    }
+
     num_nodes = opal_list_get_size(&map->nodes);
-
+    if (0 == num_nodes) {
+        /* must have been launched on existing daemons - just return */
+        OBJ_RELEASE(map);
+        return ORTE_SUCCESS;
+    }
+
     /*
      * Allocate a range of vpids for the daemons.
      */
-    if (0 == num_nodes) {
-        return ORTE_ERR_BAD_PARAM;
-    }
     rc = orte_ns.reserve_range(0, num_nodes, &vpid);
     if (ORTE_SUCCESS != rc) {
         goto cleanup;