From 8014e3429efd999c9e3c2eaff25a1cd5bfb3e273 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 8 Aug 2011 06:05:23 +0000 Subject: [PATCH] Don't double-count procs as they are launched This commit was SVN r25011. --- orte/mca/odls/base/odls_base_default_fns.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index bfc7f99b68..42c4a83cdc 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -1245,7 +1245,7 @@ static void timer_cb(int fd, short event, void *cbdata) time_is_up = true; } -static int compute_num_procs_alive(void) +static int compute_num_procs_alive(orte_jobid_t job) { opal_list_item_t *item; orte_odls_child_t *child; @@ -1255,9 +1255,13 @@ static int compute_num_procs_alive(void) item != opal_list_get_end (&orte_local_children); item = opal_list_get_next(item)) { child = (orte_odls_child_t*)item; - if (child->alive) { - num_procs_alive++; + if (!child->alive) { + continue; } + if (job == child->name->jobid) { + continue; + } + num_procs_alive++; } return num_procs_alive; } @@ -1393,7 +1397,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, /* compute the number of local procs alive or about to be launched * as part of this job */ - total_num_local_procs = compute_num_procs_alive() + jobdat->num_local_procs; + total_num_local_procs = compute_num_procs_alive(job) + jobdat->num_local_procs; /* get the number of local processors */ if (ORTE_SUCCESS != (rc = opal_paffinity_base_get_processor_info(&num_processors))) { /* if we cannot find the number of local processors, we have no choice @@ -1423,7 +1427,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, OBJ_CONSTRUCT(&alert, opal_buffer_t); /* compute the total number of local procs currently alive and about to be launched */ - total_num_local_procs = compute_num_procs_alive() + jobdat->num_local_procs; + total_num_local_procs = compute_num_procs_alive(job) + jobdat->num_local_procs; for (j=0; j < jobdat->apps.size; j++) { if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, j))) { @@ -1452,7 +1456,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, /* wait */ ORTE_PROGRESSED_WAIT(time_is_up, 0, 1); /* recompute the num local procs */ - total_num_local_procs = compute_num_procs_alive() + jobdat->num_local_procs; + total_num_local_procs = compute_num_procs_alive(job) + jobdat->num_local_procs; /* see if we still have a problem */ OPAL_OUTPUT_VERBOSE((10, orte_odls_globals.output, "%s rechecking limit on num procs %d #children needed %d", @@ -1617,7 +1621,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, /* wait */ ORTE_PROGRESSED_WAIT(time_is_up, 0, 1); /* recompute the num procs alive */ - total_num_local_procs = compute_num_procs_alive() + jobdat->num_local_procs; + total_num_local_procs = compute_num_procs_alive(job) + jobdat->num_local_procs; /* see if we still have a problem */ limit = 4*total_num_local_procs + 6; OPAL_OUTPUT_VERBOSE((10, orte_odls_globals.output,