diff --git a/orte/mca/errmgr/default/errmgr_default.c b/orte/mca/errmgr/default/errmgr_default.c index 211226b017..748c108d86 100644 --- a/orte/mca/errmgr/default/errmgr_default.c +++ b/orte/mca/errmgr/default/errmgr_default.c @@ -77,11 +77,11 @@ void orte_errmgr_default_proc_aborted(orte_process_name_t *name, int exit_code) */ jobs = (orte_job_t**)orte_job_data->addr; for (i=1; i < orte_job_data->size; i++) { - /* the array is left justfied, so we can quit once - * we see a NULL + /* the array may have holes in it as we are recovering + * jobids as they complete, so check everything */ if (NULL == jobs[i]) { - break; + continue; } if (ORTE_JOB_STATE_ABORTED != jobs[i]->state && ORTE_JOB_STATE_ABORTED_BY_SIG != jobs[i]->state && diff --git a/orte/mca/ess/base/ess_base_std_prolog.c b/orte/mca/ess/base/ess_base_std_prolog.c index e0910ec495..bd1fb617a2 100644 --- a/orte/mca/ess/base/ess_base_std_prolog.c +++ b/orte/mca/ess/base/ess_base_std_prolog.c @@ -46,17 +46,6 @@ int orte_ess_base_std_prolog(void) goto error; } - /* if I'm the HNP, make sure that the daemon flag is NOT set so that - * components unique to non-HNP orteds can be selected and init - * my basic storage elements - */ - if (orte_process_info.hnp) { - if (ORTE_SUCCESS != (ret = orte_hnp_globals_init())) { - error = "orte_hnp_globals_init"; - goto error; - } - } - /* * Internal startup */ diff --git a/orte/mca/ess/hnp/ess_hnp_module.c b/orte/mca/ess/hnp/ess_hnp_module.c index d92ce09da3..69aaa34eea 100644 --- a/orte/mca/ess/hnp/ess_hnp_module.c +++ b/orte/mca/ess/hnp/ess_hnp_module.c @@ -327,11 +327,32 @@ static int rte_init(char flags) } free(contact_path); + /* setup the global job and node arrays */ + orte_job_data = OBJ_NEW(opal_pointer_array_t); + if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data, + 1, + ORTE_GLOBAL_ARRAY_MAX_SIZE, + 1))) { + ORTE_ERROR_LOG(ret); + error = "setup job array"; + goto error; + } + + orte_node_pool = OBJ_NEW(opal_pointer_array_t); + if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool, + ORTE_GLOBAL_ARRAY_BLOCK_SIZE, + ORTE_GLOBAL_ARRAY_MAX_SIZE, + ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) { + ORTE_ERROR_LOG(ret); + error = "setup node array"; + goto error; + } + /* Setup the job data object for the daemons */ /* create and store the job data object */ jdata = OBJ_NEW(orte_job_t); jdata->jobid = ORTE_PROC_MY_NAME->jobid; - opal_pointer_array_add(orte_job_data, jdata); + opal_pointer_array_set_item(orte_job_data, 0, jdata); /* create and store a node object where we are */ node = OBJ_NEW(orte_node_t); @@ -465,6 +486,7 @@ static int rte_finalize(void) { char *contact_path; opal_list_item_t *item; + int i; /* remove my contact info file */ contact_path = opal_os_path(false, orte_process_info.top_session_dir, @@ -517,6 +539,24 @@ static int rte_finalize(void) } OBJ_DESTRUCT(&orte_local_jobdata); + /* cleanup the job and node info arrays */ + if (NULL != orte_node_pool) { + for (i=0; i < orte_node_pool->size; i++) { + if (NULL != orte_node_pool->addr[i]) { + OBJ_RELEASE(orte_node_pool->addr[i]); + } + } + OBJ_RELEASE(orte_node_pool); + } + if (NULL != orte_job_data) { + for (i=0; i < orte_job_data->size; i++) { + if (NULL != orte_job_data->addr[i]) { + OBJ_RELEASE(orte_job_data->addr[i]); + } + } + OBJ_RELEASE(orte_job_data); + } + /* finalize the session directory tree */ orte_session_dir_finalize(ORTE_PROC_MY_NAME); diff --git a/orte/mca/ess/slurmd/ess_slurmd_component.c b/orte/mca/ess/slurmd/ess_slurmd_component.c index 0e89e5a207..7f5f26b45e 100644 --- 
a/orte/mca/ess/slurmd/ess_slurmd_component.c +++ b/orte/mca/ess/slurmd/ess_slurmd_component.c @@ -76,6 +76,7 @@ int orte_ess_slurmd_component_query(mca_base_module_t **module, int *priority) if (orte_process_info.mpi_proc && NULL != getenv("SLURM_JOBID") && + NULL != getenv("SLURM_STEPID") && NULL == orte_process_info.my_hnp_uri) { *priority = 30; *module = (mca_base_module_t *)&orte_ess_slurmd_module; diff --git a/orte/mca/ess/slurmd/ess_slurmd_module.c b/orte/mca/ess/slurmd/ess_slurmd_module.c index 92cdfca09f..c024fb4be1 100644 --- a/orte/mca/ess/slurmd/ess_slurmd_module.c +++ b/orte/mca/ess/slurmd/ess_slurmd_module.c @@ -88,6 +88,9 @@ orte_ess_base_module_t orte_ess_slurmd_module = { NULL /* ft_event */ }; +/* Local globals */ +static bool app_init_complete; + /**** MODULE FUNCTIONS ****/ static int rte_init(char flags) @@ -110,6 +113,9 @@ static int rte_init(char flags) int *ppn; bool block=false, cyclic=false; + /* init flag */ + app_init_complete = false; + /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { error = "orte_ess_base_std_prolog"; @@ -331,6 +337,9 @@ static int rte_init(char flags) goto error; } + /* flag that we completed init */ + app_init_complete = true; + return ORTE_SUCCESS; error: @@ -345,12 +354,17 @@ static int rte_finalize(void) { int ret; - /* use the default procedure to finish */ - if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) { - ORTE_ERROR_LOG(ret); + if (app_init_complete) { + /* use the default procedure to finish */ + if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) { + ORTE_ERROR_LOG(ret); + } } - /* deconstruct my nidmap and jobmap arrays */ + /* deconstruct my nidmap and jobmap arrays - this + * function protects itself from being called + * before things were initialized + */ orte_util_nidmap_finalize(); return ret; diff --git a/orte/mca/plm/base/plm_base_jobid.c b/orte/mca/plm/base/plm_base_jobid.c index 7e20327e31..66084c3887 100644 --- a/orte/mca/plm/base/plm_base_jobid.c +++ b/orte/mca/plm/base/plm_base_jobid.c @@ -63,7 +63,7 @@ int orte_plm_base_set_hnp_name(void) ORTE_PROC_MY_NAME->jobid = 0xffff0000 & ((uint32_t)jobfam << 16); ORTE_PROC_MY_NAME->vpid = 0; - orte_plm_globals.next_jobid = ORTE_PROC_MY_NAME->jobid + 1; + orte_plm_globals.next_jobid = 1; /* copy it to the HNP field */ ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid; @@ -78,12 +78,38 @@ int orte_plm_base_set_hnp_name(void) */ int orte_plm_base_create_jobid(orte_jobid_t *jobid) { - if (ORTE_JOBID_MAX-1 < orte_plm_globals.next_jobid) { +#if 0 + orte_job_t **jobs; + int32_t j; + + /* RHC: WHILE ORTE CAN NOW HANDLE RECYCLING OF JOBID'S, + * THE MPI LAYER CANNOT SINCE THERE IS NO WAY TO + * UPDATE THE OMPI_PROC_T LIST AND/OR THE BTL'S + */ + + /* see if there is a prior + * jobid that has completed and can be re-used. 
It can + * never be 0 as that belongs to the HNP and its daemons + */ + jobs = (orte_job_t**)orte_job_data->addr; + for (j=1; j < orte_job_data->size; j++) { + if (NULL == jobs[j]) { + /* this local jobid is available - reuse it */ + *jobid = ORTE_CONSTRUCT_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid, j); + return ORTE_SUCCESS; + } + } +#endif + + if (UINT16_MAX == orte_plm_globals.next_jobid) { + /* if we get here, then no local jobids are available */ ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); *jobid = ORTE_JOBID_INVALID; return ORTE_ERR_OUT_OF_RESOURCE; } - *jobid = orte_plm_globals.next_jobid++; + /* take the next jobid */ + *jobid = ORTE_CONSTRUCT_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid, orte_plm_globals.next_jobid); + orte_plm_globals.next_jobid++; return ORTE_SUCCESS; } diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index 18248f6070..e9b446b827 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -66,6 +66,7 @@ int orte_plm_base_setup_job(orte_job_t *jdata) { orte_job_t *jdatorted; int rc; + int32_t ljob; OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, "%s plm:base:setup_job for job %s", @@ -73,7 +74,8 @@ int orte_plm_base_setup_job(orte_job_t *jdata) ORTE_JOBID_PRINT(jdata->jobid))); /* insert the job object into the global pool */ - opal_pointer_array_add(orte_job_data, jdata); + ljob = ORTE_LOCAL_JOBID(jdata->jobid); + opal_pointer_array_set_item(orte_job_data, ljob, jdata); if (ORTE_SUCCESS != (rc = orte_ras.allocate(jdata))) { ORTE_ERROR_LOG(rc); @@ -1135,9 +1137,13 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv, void orte_plm_base_check_job_completed(orte_job_t *jdata) { orte_proc_t **procs; - orte_vpid_t i; + int i; orte_std_cntr_t j; orte_job_t **jobs; + orte_node_t **nodes; + orte_job_map_t *map; + orte_std_cntr_t index; + bool one_still_alive; /* if the incoming job data pointer is NULL, then all we can do * is check all jobs for complete @@ -1167,7 +1173,13 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata) * as abnormally terminated, then do not update its state */ if (jdata->state < ORTE_JOB_STATE_TERMINATED) { - for (i=0; i < jdata->num_procs; i++) { + for (i=0; i < jdata->procs->size; i++) { + /* the proc array may no longer be left justified, so + * we need to check everything + */ + if (NULL == procs[i]) { + continue; + } if (ORTE_PROC_STATE_FAILED_TO_START == procs[i]->state) { jdata->state = ORTE_JOB_STATE_FAILED_TO_START; if (!jdata->abort) { @@ -1217,25 +1229,6 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata) } } - /* Release the resources used by this job. */ - if( NULL != jdata->map ) { - int i, index; - orte_node_t* daemon; - orte_proc_t* proc; - orte_job_map_t* map; - - map = jdata->map; - for( index = 0; index < map->num_nodes; index++ ) { - daemon = (orte_node_t *) opal_pointer_array_get_item( map->nodes, index ); - for( i = 0; i < (int)daemon->num_procs; i++ ) { - proc = (orte_proc_t *) opal_pointer_array_get_item(daemon->procs, i); - if( (NULL != proc) && (proc->name.jobid == jdata->jobid) ) { - daemon->slots_inuse--; - } - } - } - } - /* check the resulting job state and notify the appropriate places */ if (ORTE_JOB_STATE_FAILED_TO_START == jdata->state) { @@ -1299,14 +1292,65 @@ CHECK_ALL_JOBS: return; } } + /* Release the resources used by this job. 
Since some errmgr's may want + * to continue using resources allocated to the job as part of their + * fault recovery procedure, we only do this once the job is "complete". + * Note that an aborted/killed job -is- flagged as complete and will + * therefore have its resources released. + */ + if( NULL != jdata->map ) { + map = jdata->map; + nodes = (orte_node_t**)map->nodes->addr; + for( index = 0; index < map->nodes->size; index++ ) { + if (NULL == nodes[index]) { + /* the nodes in a map are left-justfied and + * there are no holes in the array + */ + break; + } + procs = (orte_proc_t**)nodes[index]->procs->addr; + for( i = 0; i < nodes[index]->procs->size; i++ ) { + if (NULL == procs[i]) { + /* there can be holes in the proc array since + * we are cleaning up as we go + */ + continue; + } + if(procs[i]->name.jobid == jdata->jobid) { + nodes[index]->slots_inuse--; + nodes[index]->num_procs--; + /* release this object, ensuring that the + * pointer array internal accounting + * is maintained! + */ + OBJ_RELEASE(procs[i]); + opal_pointer_array_set_item(nodes[index]->procs, i, NULL); + } + } + } + } + + /* now check to see if all jobs are done - release this jdata + * object when we find it + */ jobs = (orte_job_t**)orte_job_data->addr; - for (j=0; j < orte_job_data->size; j++) { + one_still_alive = false; + for (j=1; j < orte_job_data->size; j++) { if (NULL == jobs[j]) { - /* the jobs are left-justified in the array, so - * if we find a NULL, that means we are past all - * the jobs so we can just quit the loop + /* since we are releasing jdata objects as we + * go, we can no longer assume that the job_data + * array is left justified */ - break; + continue; + } + if (NULL != jdata && jobs[j]->jobid == jdata->jobid) { + /* release this object, ensuring that the + * pointer array internal accounting + * is maintained! + */ + OBJ_RELEASE(jdata); + opal_pointer_array_set_item(orte_job_data, j, NULL); + continue; } /* if the job is flagged to not be monitored, skip it */ if (ORTE_JOB_CONTROL_DO_NOT_MONITOR & jobs[j]->controls) { @@ -1315,20 +1359,33 @@ CHECK_ALL_JOBS: /* when checking for job termination, we must be sure to NOT check * our own job as it - rather obviously - has NOT terminated! 
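A rough sketch of the jobid layout behind the preceding hunks (next_jobid narrowed to 16 bits, jobs stored at their local-jobid index in orte_job_data). The real ORTE_CONSTRUCT_LOCAL_JOBID / ORTE_LOCAL_JOBID macros are defined in the ORTE headers; the masks and helper names below are assumptions, shown only to explain why the local id can double as the array index and why slot 0 stays with the HNP and its daemons.

#include <stdint.h>

/* Illustrative sketch only - the masks here are an assumed layout,
 * not the authoritative ORTE macro definitions.
 */
#define EXAMPLE_JOB_FAMILY_MASK   0xffff0000u   /* upper 16 bits: HNP job family */
#define EXAMPLE_LOCAL_JOBID_MASK  0x0000ffffu   /* lower 16 bits: local jobid    */

static inline uint32_t example_construct_local_jobid(uint32_t family_jobid, uint16_t local)
{
    /* keep the HNP's family bits, substitute the local id */
    return (family_jobid & EXAMPLE_JOB_FAMILY_MASK) | (uint32_t)local;
}

static inline uint16_t example_local_jobid(uint32_t jobid)
{
    /* recover the local id for use as an index into orte_job_data */
    return (uint16_t)(jobid & EXAMPLE_LOCAL_JOBID_MASK);
}

Under this assumed layout, example_local_jobid(example_construct_local_jobid(hnp_jobid, 3)) == 3, which is the slot used by opal_pointer_array_set_item(orte_job_data, ljob, jdata) above; local id 0 belongs to the HNP/daemon job, so user jobs start at 1 and the UINT16_MAX test bounds the counter.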
*/ - if (ORTE_PROC_MY_NAME->jobid != jobs[j]->jobid && - jobs[j]->num_terminated < jobs[j]->num_procs) { - /* we have at least one job that is not done yet */ + if (jobs[j]->num_terminated < jobs[j]->num_procs) { + /* we have at least one job that is not done yet - we cannot + * just return, though, as we need to ensure we cleanout the + * job data for the job that just completed + */ OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, "%s plm:base:check_job_completed job %s is not terminated", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jobs[j]->jobid))); - return; + one_still_alive = true; } } + /* if a job is still alive, we just return */ + if (one_still_alive) { + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, + "%s plm:base:check_job_completed at least one job is not terminated", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + return; + } /* if we get here, then all jobs are done, so wakeup */ OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, "%s plm:base:check_job_completed all jobs terminated - waking up", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + /* set the exit status to 0 - this will only happen if it + * wasn't already set by an error condition + */ + ORTE_UPDATE_EXIT_STATUS(0); orte_trigger_event(&orte_exit); } diff --git a/orte/mca/plm/base/plm_private.h b/orte/mca/plm/base/plm_private.h index c0a48f769f..4519ba979a 100644 --- a/orte/mca/plm/base/plm_private.h +++ b/orte/mca/plm/base/plm_private.h @@ -52,7 +52,7 @@ typedef struct { /* orted cmd cond */ opal_condition_t orted_cmd_cond; /* next jobid */ - orte_jobid_t next_jobid; + uint16_t next_jobid; /* time when daemons started launch */ struct timeval daemonlaunchstart; /* rsh launch agent path */ diff --git a/orte/mca/plm/slurm/plm_slurm_module.c b/orte/mca/plm/slurm/plm_slurm_module.c index ce5e0ecfd2..22e07f9a57 100644 --- a/orte/mca/plm/slurm/plm_slurm_module.c +++ b/orte/mca/plm/slurm/plm_slurm_module.c @@ -102,9 +102,10 @@ orte_plm_base_module_1_0_0_t orte_plm_slurm_module = { /* * Local variables */ -static pid_t srun_pid = 0; +static pid_t primary_srun_pid = 0; +static bool primary_pid_set = false; static orte_jobid_t active_job = ORTE_JOBID_INVALID; -static bool failed_launch; +static bool launching_daemons; static bool local_launch_available = false; /** @@ -153,7 +154,8 @@ static int plm_slurm_launch_job(orte_job_t *jdata) struct timeval launchstart, launchstop; int proc_vpid_index; orte_jobid_t failed_job; - + bool failed_launch=true; + if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) { /* if this is a request to launch a local slave, * then we will not be launching an orted - we will @@ -187,7 +189,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata) } /* indicate the state of the launch */ - failed_launch = true; + launching_daemons = true; /* create a jobid for this job */ if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(&jdata->jobid))) { @@ -398,6 +400,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata) launch_apps: /* get here if daemons launch okay - any failures now by apps */ + launching_daemons = false; failed_job = active_job; if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(active_job))) { OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, @@ -465,19 +468,31 @@ static int plm_slurm_terminate_job(orte_jobid_t jobid) static int plm_slurm_terminate_orteds(void) { int rc; + orte_job_t *jdata; - /* deregister the waitpid callback to ensure we don't make it look like - * srun failed when it didn't. 
Since the srun may have already completed, - * do NOT ERROR_LOG any return code to avoid confusing, duplicate error - * messages + /* tell them to die without sending a reply - we will rely on the + * waitpid to tell us when they have exited! */ - orte_wait_cb_cancel(srun_pid); - - /* tell them to die! */ - if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_WITH_REPLY_CMD))) { + if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_NO_REPLY_CMD))) { ORTE_ERROR_LOG(rc); } + /* check to see if the primary pid is set. If not, this indicates + * that we never launched any additional daemons, so we cannot + * not wait for a waitpid to fire and tell us it's okay to + * exit. Instead, we simply trigger an exit for ourselves + */ + if (!primary_pid_set) { + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, + "%s plm:slurm: primary daemons complete!", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); + jdata->state = ORTE_JOB_STATE_TERMINATED; + /* need to set the #terminated value to avoid an incorrect error msg */ + jdata->num_terminated = jdata->num_procs; + orte_trigger_event(&orteds_exit); + } + return rc; } @@ -512,40 +527,63 @@ static int plm_slurm_finalize(void) static void srun_wait_cb(pid_t pid, int status, void* cbdata){ + orte_job_t *jdata; + /* According to the SLURM folks, srun always returns the highest exit - code of our remote processes. Thus, a non-zero exit status doesn't - necessarily mean that srun failed - it could be that an orted returned - a non-zero exit status. Of course, that means the orted failed(!), so - the end result is the same - the job didn't start. - - As a result, we really can't do much with the exit status itself - it - could be something in errno (if srun itself failed), or it could be - something returned by an orted, or it could be something returned by - the OS (e.g., couldn't find the orted binary). Somebody is welcome - to sort out all the options and pretty-print a better error message. For - now, though, the only thing that really matters is that - srun failed. Report the error and make sure that orterun - wakes up - otherwise, do nothing! + code of our remote processes. Thus, a non-zero exit status doesn't + necessarily mean that srun failed - it could be that an orted returned + a non-zero exit status. Of course, that means the orted failed(!), so + the end result is the same - the job didn't start. - Unfortunately, the pid returned here is the srun pid, not the pid of - the proc that actually died! So, to avoid confusion, just use -1 as the - pid so nobody thinks this is real - */ + As a result, we really can't do much with the exit status itself - it + could be something in errno (if srun itself failed), or it could be + something returned by an orted, or it could be something returned by + the OS (e.g., couldn't find the orted binary). Somebody is welcome + to sort out all the options and pretty-print a better error message. For + now, though, the only thing that really matters is that + srun failed. Report the error and make sure that orterun + wakes up - otherwise, do nothing! + + Unfortunately, the pid returned here is the srun pid, not the pid of + the proc that actually died! 
So, to avoid confusion, just use -1 as the + pid so nobody thinks this is real + */ - if (0 != status) { - if (failed_launch) { - /* report that the daemon has failed so we can exit - */ - orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1, status, ORTE_JOB_STATE_FAILED_TO_START); - - } else { + /* if we are in the launch phase, then any termination is bad */ + if (launching_daemons) { + /* report that one or more daemons failed to launch so we can exit */ + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, + "%s plm:slurm: daemon failed during launch", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1, status, ORTE_JOB_STATE_FAILED_TO_START); + } else { + /* if this is after launch, then we need to abort only if the status + * returned is non-zero - i.e., if the orteds exited with an error + */ + if (0 != status) { /* an orted must have died unexpectedly after launch - report * that the daemon has failed so we exit */ + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, + "%s plm:slurm: daemon failed while running", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1, status, ORTE_JOB_STATE_ABORTED); } + /* otherwise, check to see if this is the primary pid */ + if (primary_srun_pid == pid) { + /* in this case, we just want to fire the proper trigger so + * mpirun can exit + */ + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, + "%s plm:slurm: primary daemons complete!", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); + jdata->state = ORTE_JOB_STATE_TERMINATED; + /* need to set the #terminated value to avoid an incorrect error msg */ + jdata->num_terminated = jdata->num_procs; + orte_trigger_event(&orteds_exit); + } } - } @@ -553,6 +591,7 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env, char *prefix) { int fd; + int srun_pid; char *exec_argv = opal_path_findv(argv[0], 0, env, NULL); if (NULL == exec_argv) { @@ -653,6 +692,14 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env, sides of the fork... */ setpgid(srun_pid, srun_pid); + /* if this is the primary launch - i.e., not a comm_spawn of a + * child job - then save the pid + */ + if (!primary_pid_set) { + primary_srun_pid = srun_pid; + primary_pid_set = true; + } + /* setup the waitpid so we can find out if srun succeeds! 
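The srun_wait_cb above receives the status from the waitpid machinery. Assuming that value is a standard waitpid() status word (an assumption - the comment only states that srun propagates the highest remote exit code), a sketch of how it could be decoded for a more descriptive log message follows; the helper name is illustrative and not part of the patch.

#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>

/* Sketch: decode a waitpid()-style status for logging. Even when srun
 * exits non-zero, the failure may belong to an orted rather than to
 * srun itself, as discussed in the comment above.
 */
static void example_describe_srun_status(pid_t pid, int status)
{
    if (WIFEXITED(status)) {
        fprintf(stderr, "srun pid %ld exited with status %d\n",
                (long)pid, WEXITSTATUS(status));
    } else if (WIFSIGNALED(status)) {
        fprintf(stderr, "srun pid %ld terminated by signal %d\n",
                (long)pid, WTERMSIG(status));
    }
}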
*/ orte_wait_cb(srun_pid, srun_wait_cb, NULL); free(exec_argv); diff --git a/orte/mca/rmaps/base/rmaps_base_support_fns.c b/orte/mca/rmaps/base/rmaps_base_support_fns.c index a86c7d94c7..2a5331c8e2 100644 --- a/orte/mca/rmaps/base/rmaps_base_support_fns.c +++ b/orte/mca/rmaps/base/rmaps_base_support_fns.c @@ -352,7 +352,7 @@ int orte_rmaps_base_claim_slot(orte_job_t *jdata, int orte_rmaps_base_compute_usage(orte_job_t *jdata) { orte_std_cntr_t i; - orte_vpid_t j, k; + int j, k; orte_node_t **nodes; orte_proc_t **procs, *psave, *psave2; orte_vpid_t minv, minv2; @@ -378,13 +378,24 @@ int orte_rmaps_base_compute_usage(orte_job_t *jdata) procs = (orte_proc_t**)nodes[i]->procs->addr; local_rank = 0; - for (k=0; k < nodes[i]->num_procs; k++) { + /* the node map may have holes in it, so cycle + * all the way through and avoid the holes + */ + for (k=0; k < nodes[i]->procs->size; k++) { + /* if this proc is NULL, skip it */ + if (NULL == procs[k]) { + continue; + } minv = ORTE_VPID_MAX; minv2 = ORTE_VPID_MAX; psave = NULL; psave2 = NULL; /* find the minimum vpid proc */ - for (j=0; j < nodes[i]->num_procs; j++) { + for (j=0; j < nodes[i]->procs->size; j++) { + /* if this proc is NULL, skip it */ + if (NULL == procs[j]) { + continue; + } if (procs[j]->name.jobid == jdata->jobid && ORTE_LOCAL_RANK_MAX == procs[j]->local_rank && procs[j]->name.vpid < minv) { diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index b3cc130195..8499eb22d4 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -407,31 +407,6 @@ int orte_dt_init(void) } #if !ORTE_DISABLE_FULL_SUPPORT -int orte_hnp_globals_init(void) -{ - int rc; - - orte_job_data = OBJ_NEW(opal_pointer_array_t); - if (ORTE_SUCCESS != (rc = opal_pointer_array_init(orte_job_data, - 1, - ORTE_GLOBAL_ARRAY_MAX_SIZE, - 1))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - orte_node_pool = OBJ_NEW(opal_pointer_array_t); - if (ORTE_SUCCESS != (rc = opal_pointer_array_init(orte_node_pool, - ORTE_GLOBAL_ARRAY_BLOCK_SIZE, - ORTE_GLOBAL_ARRAY_MAX_SIZE, - ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - return ORTE_SUCCESS; -} - orte_job_t* orte_get_job_data_object(orte_jobid_t job) { @@ -575,20 +550,22 @@ static void orte_job_construct(orte_job_t* job) static void orte_job_destruct(orte_job_t* job) { orte_std_cntr_t i; - orte_vpid_t j; + int n; for (i=0; i < job->num_apps; i++) { if (NULL != job->apps->addr[i]) OBJ_RELEASE(job->apps->addr[i]); } OBJ_RELEASE(job->apps); - for (j=0; j < job->num_procs; j++) { - if (NULL != job->procs->addr[j]) OBJ_RELEASE(job->procs->addr[j]); + if (NULL != job->map) OBJ_RELEASE(job->map); + + for (n=0; n < job->procs->size; n++) { + if (NULL != job->procs->addr[n]) { + OBJ_RELEASE(job->procs->addr[n]); + } } OBJ_RELEASE(job->procs); - if (NULL != job->map) OBJ_RELEASE(job->map); - #if OPAL_ENABLE_FT == 1 if (NULL != job->ckpt_snapshot_ref) { free(job->ckpt_snapshot_ref); @@ -635,7 +612,7 @@ static void orte_node_construct(orte_node_t* node) static void orte_node_destruct(orte_node_t* node) { - orte_vpid_t i; + int i; if (NULL != node->name) { free(node->name); @@ -645,10 +622,16 @@ static void orte_node_destruct(orte_node_t* node) opal_argv_free(node->alias); } - if (NULL != node->daemon) OBJ_RELEASE(node->daemon); + if (NULL != node->daemon) { + node->daemon->node = NULL; + OBJ_RELEASE(node->daemon); + } - for (i=0; i < node->num_procs; i++) { - if (NULL != node->procs->addr[i]) OBJ_RELEASE(node->procs->addr[i]); + for (i=0; i < node->procs->size; i++) { + if (NULL 
!= node->procs->addr[i]) { + ((orte_proc_t*)(node->procs->addr[i]))->node = NULL; + OBJ_RELEASE(node->procs->addr[i]); + } } OBJ_RELEASE(node->procs); diff --git a/orte/runtime/runtime_internals.h b/orte/runtime/runtime_internals.h index a9327ce02c..eb64b5171a 100644 --- a/orte/runtime/runtime_internals.h +++ b/orte/runtime/runtime_internals.h @@ -29,16 +29,6 @@ BEGIN_C_DECLS - -#if !ORTE_DISABLE_FULL_SUPPORT -/** - * Initialize global storage for HNPs - */ -ORTE_DECLSPEC int orte_hnp_globals_init(void); - -#endif /* !ORTE_DISABLE_FULL_SUPPORT */ - - /** * Init the ORTE datatype support */ diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index 76269fb812..f9a6b287ee 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -787,7 +787,6 @@ DONE: static void job_completed(int trigpipe, short event, void *arg) { int rc; - orte_job_state_t exit_state; orte_job_t *daemons; /* if the abort exit event is set, delete it */ @@ -796,8 +795,6 @@ static void job_completed(int trigpipe, short event, void *arg) free(abort_exit_event); } - exit_state = jdata->state; - /* if we never launched, just skip this part to avoid * meaningless error messages */ @@ -806,7 +803,7 @@ static void job_completed(int trigpipe, short event, void *arg) goto DONE; } - if (ORTE_JOB_STATE_TERMINATED != exit_state) { + if (0 != orte_exit_status) { /* abnormal termination of some kind */ dump_aborted_procs(); /* If we showed more abort messages than were allowed, @@ -991,8 +988,8 @@ static void dump_aborted_procs(void) jobs = (orte_job_t**)orte_job_data->addr; for (n=1; n < orte_job_data->size; n++) { if (NULL == jobs[n]) { - /* the array is left-justified, so we can quit on the first NULL */ - return; + /* the array is no longer left-justified, so we have to continue */ + continue; } if (ORTE_JOB_STATE_UNDEF != jobs[n]->state && ORTE_JOB_STATE_INIT != jobs[n]->state && diff --git a/orte/util/nidmap.c b/orte/util/nidmap.c index 84232e9d5c..7717a6e5a4 100644 --- a/orte/util/nidmap.c +++ b/orte/util/nidmap.c @@ -828,7 +828,14 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr) jobs = (orte_job_t**)orte_job_data->addr; /* for each job... */ - for (j=0; j < orte_job_data->size && NULL != jobs[j]; j++) { + for (j=1; j < orte_job_data->size; j++) { + /* the job array is no longer left-justified and may + * have holes in it as we recover resources at job + * completion + */ + if (NULL == jobs[j]) { + continue; + } jdata = jobs[j]; /* pack the jobid */ if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jdata->jobid, 1, ORTE_JOBID))) {
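A pattern that recurs throughout this patch (errmgr, plm, rmaps, orterun, nidmap) is that orte_job_data and the per-node proc arrays are no longer left-justified: slots are released and set to NULL as jobs complete, so every loop must walk the full array size and skip holes rather than stopping at the first NULL. A minimal sketch of the two idioms, using the opal_pointer_array calls that appear in the diff (the helper names and header path are assumptions, not part of the patch):

#include "opal/class/opal_pointer_array.h"   /* assumed header location */

/* Sketch: walk a pointer array that may contain holes; "array" stands
 * in for orte_job_data or a node's procs array.
 */
static void example_walk_sparse_array(opal_pointer_array_t *array)
{
    int i;
    void *item;

    for (i = 0; i < array->size; i++) {
        item = opal_pointer_array_get_item(array, i);
        if (NULL == item) {
            continue;   /* hole left by a completed job or proc */
        }
        /* ... operate on item ... */
    }
}

/* Sketch: release a slot while keeping the array's internal accounting
 * consistent, as the patch does for both procs and jobs.
 */
static void example_release_slot(opal_pointer_array_t *array, int i)
{
    opal_object_t *item = (opal_object_t*)opal_pointer_array_get_item(array, i);

    if (NULL != item) {
        OBJ_RELEASE(item);
        opal_pointer_array_set_item(array, i, NULL);
    }
}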