From d653cf284762f56e65968f30b5d3c2cacf7ed657 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 16 Feb 2016 06:51:20 -0800 Subject: [PATCH] Convert the orte_job_data pointer array to a hash table so it doesn't grow forever as we run lots and lots of jobs in the persistent DVM. --- opal/mca/pmix/pmix112/Makefile.am | 2 +- .../default_orted/errmgr_default_orted.c | 3 +- orte/mca/ess/base/ess_base_std_orted.c | 11 ++-- orte/mca/ess/hnp/ess_hnp_module.c | 11 ++-- orte/mca/odls/base/odls_base_default_fns.c | 54 ++++++++----------- orte/mca/plm/base/plm_base_jobid.c | 22 +------- orte/mca/plm/base/plm_base_launch_support.c | 11 ++-- orte/mca/state/base/state_base_fns.c | 38 ++++++------- orte/orted/orted_comm.c | 28 +++++----- orte/orted/orted_main.c | 4 +- orte/runtime/orte_globals.c | 32 +++-------- orte/runtime/orte_globals.h | 2 +- orte/runtime/orte_quit.c | 22 ++++---- orte/tools/orte-dvm/orte-dvm.c | 1 - orte/tools/orterun/orterun.c | 4 +- 15 files changed, 93 insertions(+), 152 deletions(-) diff --git a/opal/mca/pmix/pmix112/Makefile.am b/opal/mca/pmix/pmix112/Makefile.am index e4dcaa34a1..a1e8ca037f 100644 --- a/opal/mca/pmix/pmix112/Makefile.am +++ b/opal/mca/pmix/pmix112/Makefile.am @@ -1,5 +1,5 @@ # -# Copyright (c) 2014-2015 Intel, Inc. All rights reserved. +# Copyright (c) 2014-2016 Intel, Inc. All rights reserved. # Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2015 Research Organization for Information Science # and Technology (RIST). All rights reserved. diff --git a/orte/mca/errmgr/default_orted/errmgr_default_orted.c b/orte/mca/errmgr/default_orted/errmgr_default_orted.c index a686484a02..618e74e8e1 100644 --- a/orte/mca/errmgr/default_orted/errmgr_default_orted.c +++ b/orte/mca/errmgr/default_orted/errmgr_default_orted.c @@ -8,7 +8,7 @@ * reserved. * Copyright (c) 2011-2013 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2014 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -584,7 +584,6 @@ static void proc_errors(int fd, short args, void *cbdata) orte_session_dir_cleanup(jdata->jobid); /* remove this job from our local job data since it is complete */ - opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), NULL); OBJ_RELEASE(jdata); /* send it */ diff --git a/orte/mca/ess/base/ess_base_std_orted.c b/orte/mca/ess/base/ess_base_std_orted.c index 4beadbf000..a456ddf398 100644 --- a/orte/mca/ess/base/ess_base_std_orted.c +++ b/orte/mca/ess/base/ess_base_std_orted.c @@ -14,7 +14,7 @@ * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -301,11 +301,8 @@ int orte_ess_base_orted_setup(char **hosts) } } /* setup the global job and node arrays */ - orte_job_data = OBJ_NEW(opal_pointer_array_t); - if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data, - 1, - ORTE_GLOBAL_ARRAY_MAX_SIZE, - 1))) { + orte_job_data = OBJ_NEW(opal_hash_table_t); + if (ORTE_SUCCESS != (ret = opal_hash_table_init(orte_job_data, 128))) { ORTE_ERROR_LOG(ret); error = "setup job array"; goto error; @@ -332,7 +329,7 @@ int orte_ess_base_orted_setup(char **hosts) /* create and store the job data object */ jdata = OBJ_NEW(orte_job_t); jdata->jobid = ORTE_PROC_MY_NAME->jobid; - opal_pointer_array_set_item(orte_job_data, 0, jdata); + opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata); /* every job requires at least one app */ app = OBJ_NEW(orte_app_context_t); opal_pointer_array_set_item(jdata->apps, 0, app); diff --git a/orte/mca/ess/hnp/ess_hnp_module.c b/orte/mca/ess/hnp/ess_hnp_module.c index b125610e03..c6f472a6c5 100644 --- a/orte/mca/ess/hnp/ess_hnp_module.c +++ b/orte/mca/ess/hnp/ess_hnp_module.c @@ -13,7 +13,7 @@ * Copyright (c) 2011-2014 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -357,11 +357,8 @@ static int rte_init(void) goto error; } /* setup the global job and node arrays */ - orte_job_data = OBJ_NEW(opal_pointer_array_t); - if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data, - 1, - ORTE_GLOBAL_ARRAY_MAX_SIZE, - 1))) { + orte_job_data = OBJ_NEW(opal_hash_table_t); + if (ORTE_SUCCESS != (ret = opal_hash_table_init(orte_job_data, 128))) { ORTE_ERROR_LOG(ret); error = "setup job array"; goto error; @@ -388,7 +385,7 @@ static int rte_init(void) /* create and store the job data object */ jdata = OBJ_NEW(orte_job_t); jdata->jobid = ORTE_PROC_MY_NAME->jobid; - opal_pointer_array_set_item(orte_job_data, 0, jdata); + opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata); /* mark that the daemons have reported as we are the * only ones in the system right now, and we definitely * are running! diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index c5fa5bffcf..4fa97b711f 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -101,7 +101,7 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data, orte_jobid_t job) { - int rc, i; + int rc; orte_job_t *jdata=NULL, *jptr; orte_job_map_t *map=NULL; opal_buffer_t *wireup, jobdata; @@ -191,33 +191,29 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data, * properly work should a proc from one of the other jobs * interact with this one */ if (orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCHED_DAEMONS, NULL, OPAL_BOOL)) { + void *nptr; + uint32_t key; OBJ_CONSTRUCT(&jobdata, opal_buffer_t); numjobs = 0; - for (i=0; i < orte_job_data->size; i++) { - if (NULL == (jptr = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) { - continue; + rc = opal_hash_table_get_first_key_uint32(orte_job_data, &key, (void **)&jptr, &nptr); + while (OPAL_SUCCESS == rc) { + if (NULL != jptr && jptr != jdata && + ORTE_PROC_MY_NAME->jobid != jptr->jobid) { + /* pack the job struct */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(&jobdata, &jptr, 1, ORTE_JOB))) { + ORTE_ERROR_LOG(rc); + return rc; + } + ++numjobs; } - if (ORTE_JOB_STATE_UNTERMINATED < jptr->state) { - /* job already terminated - ignore it */ - continue; - } - if (jptr == jdata) { - /* ignore the job we are looking at - we'll get it separately */ - continue; - } - /* pack the job struct */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&jobdata, &jptr, 1, ORTE_JOB))) { - ORTE_ERROR_LOG(rc); - return rc; - } - ++numjobs; + rc = opal_hash_table_get_next_key_uint32(orte_job_data, &key, (void **)&jptr, nptr, &nptr); + } + /* pack the number of jobs */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &numjobs, 1, OPAL_INT32))) { + ORTE_ERROR_LOG(rc); + return rc; } if (0 < numjobs) { - /* pack the number of jobs */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &numjobs, 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return rc; - } /* pack the jobdata buffer */ wireup = &jobdata; if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &wireup, 1, OPAL_BUFFER))) { @@ -302,7 +298,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, /* check to see if we already have this one */ if (NULL == orte_get_job_data_object(jdata->jobid)) { /* nope - add it */ - opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), jdata); + opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata); /* connect each proc to its node object */ for (j=0; j < jdata->procs->size; j++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) { @@ -401,16 +397,10 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, } } goto COMPLETE; + } else { + opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata); } - if (NULL != orte_get_job_data_object(*job)) { - opal_output(0, "ERROR - JOB ALREADY EXISTS"); - /* setup job object for this job */ - rc = ORTE_ERR_FATAL; - goto REPORT_ERROR; - } - opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), jdata); - /* ensure the map object is present */ if (NULL == jdata->map) { jdata->map = OBJ_NEW(orte_job_map_t); diff --git a/orte/mca/plm/base/plm_base_jobid.c b/orte/mca/plm/base/plm_base_jobid.c index 6964b6a688..de75c135ba 100644 --- a/orte/mca/plm/base/plm_base_jobid.c +++ b/orte/mca/plm/base/plm_base_jobid.c @@ -9,6 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2016 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -76,27 +77,6 @@ int orte_plm_base_set_hnp_name(void) */ int orte_plm_base_create_jobid(orte_job_t *jdata) { -#if 0 - int32_t j; - - /* RHC: WHILE ORTE CAN NOW HANDLE RECYCLING OF JOBID'S, - * THE MPI LAYER CANNOT SINCE THERE IS NO WAY TO - * UPDATE THE OMPI_PROC_T LIST AND/OR THE BTL'S - */ - - /* see if there is a prior - * jobid that has completed and can be re-used. It can - * never be 0 as that belongs to the HNP and its daemons - */ - for (j=1; j < orte_job_data->size; j++) { - if (NULL == opal_pointer_array_get_item(orte_job_data, j)) { - /* this local jobid is available - reuse it */ - jdata->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid, j); - return ORTE_SUCCESS; - } - } -#endif - if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) { /* this job is being restarted - do not assign it * a new jobid diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index 655cf3c82f..c8d99c0fff 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -289,7 +289,7 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata) * the orte_rmaps_base_setup_virtual_machine routine to * search all apps for any hosts to be used by the vm */ - opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(caddy->jdata->jobid), caddy->jdata); + opal_hash_table_set_value_uint32(orte_job_data, caddy->jdata->jobid, caddy->jdata); } /* if job recovery is not enabled, set it to default */ @@ -1098,18 +1098,19 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender, jdatorted->num_reported, jdatorted->num_procs)); if (jdatorted->num_procs == jdatorted->num_reported) { bool dvm = true; + uint32_t key; + void *nptr; jdatorted->state = ORTE_JOB_STATE_DAEMONS_REPORTED; /* activate the daemons_reported state for all jobs * whose daemons were launched */ - for (idx=1; idx < orte_job_data->size; idx++) { - if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, idx))) { - continue; - } + rc = opal_hash_table_get_first_key_uint32(orte_job_data, &key, (void **)&jdata, &nptr); + while (OPAL_SUCCESS == rc) { dvm = false; if (ORTE_JOB_STATE_DAEMONS_LAUNCHED == jdata->state) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); } + rc = opal_hash_table_get_next_key_uint32(orte_job_data, &key, (void **)&jdata, nptr, &nptr); } if (dvm) { /* must be launching a DVM - activate the state */ diff --git a/orte/mca/state/base/state_base_fns.c b/orte/mca/state/base/state_base_fns.c index 5eb79e053c..ee82b7971f 100644 --- a/orte/mca/state/base/state_base_fns.c +++ b/orte/mca/state/base/state_base_fns.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2011-2012 Los Alamos National Security, LLC. - * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -522,13 +522,13 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata) /* update the proc state */ ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE); pdata->state = state; - if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) { + if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) { /* Clean up the session directory as if we were the process * itself. This covers the case where the process died abnormally * and didn't cleanup its own session directory. */ orte_session_dir_finalize(proc); - } + } /* if we are trying to terminate and our routes are * gone, then terminate ourselves IF no local procs * remain (might be some from another job) @@ -551,11 +551,11 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata) } /* return the allocated slot for reuse */ cleanup_node(pdata); - /* track job status */ - jdata->num_terminated++; - if (jdata->num_terminated == jdata->num_procs) { + /* track job status */ + jdata->num_terminated++; + if (jdata->num_terminated == jdata->num_procs) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); - } + } } cleanup: @@ -577,6 +577,8 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata) bool one_still_alive; orte_vpid_t lowest=0; int32_t i32, *i32ptr; + uint32_t u32; + void *nptr; opal_output_verbose(2, orte_state_base_framework.framework_output, "%s state:base:check_job_complete on job %s", @@ -722,13 +724,11 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata) * object when we find it */ one_still_alive = false; - for (j=1; j < orte_job_data->size; j++) { - if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, j))) { - /* since we are releasing jdata objects as we - * go, we can no longer assume that the job_data - * array is left justified - */ - continue; + j = opal_hash_table_get_first_key_uint32(orte_job_data, &u32, (void **)&job, &nptr); + while (OPAL_SUCCESS == j) { + /* skip the daemon job */ + if (job->jobid == ORTE_PROC_MY_NAME->jobid) { + goto next; } /* if this is the job we are checking AND it normally terminated, * then activate the "notify_completed" state - this will release @@ -762,20 +762,19 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata) /* this was a debugger daemon. notify that a debugger has detached */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DEBUGGER_DETACH); } - opal_pointer_array_set_item(orte_job_data, j, NULL); /* ensure the array has a NULL */ OBJ_RELEASE(jdata); } } - continue; + goto next; } /* if the job is flagged to not be monitored, skip it */ if (ORTE_FLAG_TEST(job, ORTE_JOB_FLAG_DO_NOT_MONITOR)) { - continue; + goto next; } /* when checking for job termination, we must be sure to NOT check * our own job as it - rather obviously - has NOT terminated! */ - if (job->num_terminated < job->num_procs) { + if (ORTE_JOB_STATE_NOTIFIED != job->state) { /* we have at least one job that is not done yet - we cannot * just return, though, as we need to ensure we cleanout the * job data for the job that just completed @@ -795,7 +794,10 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata) job->num_terminated, job->num_procs, (NULL == jdata) ? "UNKNOWN" : orte_job_state_to_str(jdata->state) )); } + next: + j = opal_hash_table_get_next_key_uint32(orte_job_data, &u32, (void **)&job, nptr, &nptr); } + /* if a job is still alive, we just return */ if (one_still_alive) { OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, diff --git a/orte/orted/orted_comm.c b/orte/orted/orted_comm.c index 66fdca5dab..adf62431f8 100644 --- a/orte/orted/orted_comm.c +++ b/orte/orted/orted_comm.c @@ -454,7 +454,7 @@ void orte_daemon_recv(int status, orte_process_name_t* sender, break; } /* store it on the global job data pool */ - opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), jdata); + opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata); /* before we launch it, tell the IOF to forward all output exclusively * to the requestor */ { @@ -590,7 +590,7 @@ void orte_daemon_recv(int status, orte_process_name_t* sender, } } else { /* if we are the HNP, process the request */ - int32_t i, num_jobs; + int32_t rc, num_jobs; orte_job_t *jobdat; /* unpack the jobid */ @@ -628,17 +628,9 @@ void orte_daemon_recv(int status, orte_process_name_t* sender, } } } else { - /* since the job array is no longer - * left-justified and may have holes, we have - * to cnt the number of jobs. Be sure to include the daemon - * job - the user can slice that info out if they don't care - */ - num_jobs = 0; - for (i=0; i < orte_job_data->size; i++) { - if (NULL != opal_pointer_array_get_item(orte_job_data, i)) { - num_jobs++; - } - } + uint32_t u32; + void *nptr; + num_jobs = opal_hash_table_get_size(orte_job_data); /* pack the number of jobs */ if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_jobs, 1, OPAL_INT32))) { ORTE_ERROR_LOG(ret); @@ -646,14 +638,18 @@ void orte_daemon_recv(int status, orte_process_name_t* sender, goto CLEANUP; } /* now pack the data, one at a time */ - for (i=0; i < orte_job_data->size; i++) { - if (NULL != (jobdat = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) { - if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jobdat, 1, ORTE_JOB))) { + rc = opal_hash_table_get_first_key_uint32(orte_job_data, &u32, (void **)&jobdat, &nptr); + while (OPAL_SUCCESS == rc) { + if (NULL != jobdat) { + /* pack the job struct */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, &jobdat, 1, ORTE_JOB))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(answer); goto CLEANUP; } + ++num_jobs; } + rc = opal_hash_table_get_next_key_uint32(orte_job_data, &u32, (void **)&jobdat, nptr, &nptr); } } if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL, diff --git a/orte/orted/orted_main.c b/orte/orted/orted_main.c index bd1dc19425..b514b1c843 100644 --- a/orte/orted/orted_main.c +++ b/orte/orted/orted_main.c @@ -532,7 +532,6 @@ int orte_daemon(int argc, char *argv[]) orte_node_t *node; orte_app_context_t *app; char *tmp, *nptr, *sysinfo; - int32_t ljob; char **singenv=NULL; /* setup the singleton's job */ @@ -540,8 +539,7 @@ int orte_daemon(int argc, char *argv[]) /* default to ompi for now */ opal_argv_append_nosize(&jdata->personality, "ompi"); orte_plm_base_create_jobid(jdata); - ljob = ORTE_LOCAL_JOBID(jdata->jobid); - opal_pointer_array_set_item(orte_job_data, ljob, jdata); + opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata); /* must create a map for it (even though it has no * info in it) so that the job info will be picked diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index b5836c6296..ffd8eeaa8c 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -130,7 +130,7 @@ orte_timer_t *orte_mpiexec_timeout = NULL; opal_buffer_t *orte_tree_launch_cmd = NULL; /* global arrays for data storage */ -opal_pointer_array_t *orte_job_data = NULL; +opal_hash_table_t *orte_job_data = NULL; opal_pointer_array_t *orte_node_pool = NULL; opal_pointer_array_t *orte_node_topologies = NULL; opal_pointer_array_t *orte_local_children = NULL; @@ -416,22 +416,16 @@ int orte_dt_init(void) orte_job_t* orte_get_job_data_object(orte_jobid_t job) { - int32_t ljob; + orte_job_t *jdata; /* if the job data wasn't setup, we cannot provide the data */ if (NULL == orte_job_data) { return NULL; } - /* the job is indexed by its local jobid, so we can - * just look it up here. it is not an error for this - * to not be found - could just be - * a race condition whereby the job has already been - * removed from the array. The get_item function - * will just return NULL in that case. - */ - ljob = ORTE_LOCAL_JOBID(job); - return (orte_job_t*)opal_pointer_array_get_item(orte_job_data, ljob); + jdata = NULL; + opal_hash_table_get_value_uint32(orte_job_data, job, (void**)&jdata); + return jdata; } orte_proc_t* orte_get_proc_object(orte_process_name_t *proc) @@ -667,7 +661,6 @@ static void orte_job_destruct(orte_job_t* job) { orte_proc_t *proc; orte_app_context_t *app; - orte_job_t *jdata; int n; orte_timer_t *evtimer; @@ -724,18 +717,9 @@ static void orte_job_destruct(orte_job_t* job) /* release the attributes */ OPAL_LIST_DESTRUCT(&job->attributes); - /* find the job in the global array */ - if (NULL != orte_job_data && ORTE_JOBID_INVALID != job->jobid) { - for (n=0; n < orte_job_data->size; n++) { - if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, n))) { - continue; - } - if (jdata->jobid == job->jobid) { - /* set the entry to NULL */ - opal_pointer_array_set_item(orte_job_data, n, NULL); - break; - } - } + if (ORTE_JOBID_INVALID != job->jobid) { + /* remove the job from the global array */ + opal_hash_table_remove_value_uint32(orte_job_data, job->jobid); } } diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index ecaf224db1..7cdc4e81da 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -515,7 +515,7 @@ ORTE_DECLSPEC extern orte_timer_t *orte_mpiexec_timeout; ORTE_DECLSPEC extern opal_buffer_t *orte_tree_launch_cmd; /* global arrays for data storage */ -ORTE_DECLSPEC extern opal_pointer_array_t *orte_job_data; +ORTE_DECLSPEC extern opal_hash_table_t *orte_job_data; ORTE_DECLSPEC extern opal_pointer_array_t *orte_node_pool; ORTE_DECLSPEC extern opal_pointer_array_t *orte_node_topologies; ORTE_DECLSPEC extern opal_pointer_array_t *orte_local_children; diff --git a/orte/runtime/orte_quit.c b/orte/runtime/orte_quit.c index d1824044cc..ca383ac71d 100644 --- a/orte/runtime/orte_quit.c +++ b/orte/runtime/orte_quit.c @@ -15,7 +15,7 @@ * Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2012 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2014 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -339,17 +339,15 @@ static void dump_aborted_procs(void) orte_proc_t *proc, *pptr; orte_app_context_t *approc; orte_node_t *node; + uint32_t key; + void *nptr; - /* find the job that caused the problem - be sure to start the loop - * at 1 as the daemons are in 0 and will clearly be "running", so no - * point in checking them - */ - for (n=1; n < orte_job_data->size; n++) { - if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, n))) { - /* the array is no longer left-justified, so we have to continue */ - continue; + /* find the job that caused the problem */ + n = opal_hash_table_get_first_key_uint32(orte_job_data, &key, (void **)&job, &nptr); + while (OPAL_SUCCESS == n) { + if (job->jobid == ORTE_PROC_MY_NAME->jobid) { + goto next; } - if (ORTE_JOB_STATE_UNDEF != job->state && ORTE_JOB_STATE_INIT != job->state && ORTE_JOB_STATE_RUNNING != job->state && @@ -378,7 +376,7 @@ static void dump_aborted_procs(void) proc = NULL; if (!orte_get_attribute(&job->attributes, ORTE_JOB_ABORTED_PROC, (void**)&proc, OPAL_PTR) || NULL == proc) { - continue; + goto next; } approc = (orte_app_context_t*)opal_pointer_array_get_item(job->apps, proc->app_idx); @@ -387,5 +385,7 @@ static void dump_aborted_procs(void) break; } } + next: + n = opal_hash_table_get_next_key_uint32(orte_job_data, &key, (void **)&job, nptr, &nptr); } } diff --git a/orte/tools/orte-dvm/orte-dvm.c b/orte/tools/orte-dvm/orte-dvm.c index c77b533d70..479aed125b 100644 --- a/orte/tools/orte-dvm/orte-dvm.c +++ b/orte/tools/orte-dvm/orte-dvm.c @@ -477,7 +477,6 @@ static void send_callback(int status, orte_process_name_t *peer, OBJ_RELEASE(buffer); /* cleanup the job object */ - opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), NULL); OBJ_RELEASE(jdata); } diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index 6a1f203aac..69373cf961 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -2407,7 +2407,6 @@ static void setup_debugger_job(void) { orte_job_t *debugger; orte_app_context_t *app; - int32_t ljob; orte_proc_t *proc; int i, rc; orte_node_t *node; @@ -2429,8 +2428,7 @@ static void setup_debugger_job(void) /* dont push stdin */ debugger->stdin_target = ORTE_VPID_INVALID; /* add it to the global job pool */ - ljob = ORTE_LOCAL_JOBID(debugger->jobid); - opal_pointer_array_set_item(orte_job_data, ljob, debugger); + opal_hash_table_set_value_uint32(orte_job_data, debugger->jobid, debugger); /* create an app_context for the debugger daemon */ app = OBJ_NEW(orte_app_context_t); if (NULL != orte_debugger_test_daemon) {