Convert the orte_job_data pointer array to a hash table so it doesn't grow forever as we run lots and lots of jobs in the persistent DVM.
This commit is contained in:
parent 309e23ab3a
commit d653cf2847
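The diff below repeats one pattern throughout: every indexed scan of the old pointer array becomes a first-key/next-key walk of the hash table, keyed by the full 32-bit jobid rather than ORTE_LOCAL_JOBID. A minimal sketch of that iteration pattern, using only the opal_hash_table calls that appear in the hunks (visit_all_jobs is a hypothetical helper, not part of this commit):

    #include "opal/class/opal_hash_table.h"
    #include "orte/runtime/orte_globals.h"

    /* walk every orte_job_t stored in the global job-data hash table */
    static void visit_all_jobs(opal_hash_table_t *job_data)
    {
        uint32_t key;       /* receives each jobid key */
        void *nptr;         /* opaque iterator position */
        orte_job_t *jdata;
        int rc;

        rc = opal_hash_table_get_first_key_uint32(job_data, &key,
                                                  (void **)&jdata, &nptr);
        while (OPAL_SUCCESS == rc) {
            /* ... inspect or pack jdata here ... */
            rc = opal_hash_table_get_next_key_uint32(job_data, &key,
                                                     (void **)&jdata, nptr, &nptr);
        }
    }

Unlike the pointer array, the table contains no holes, so the NULL checks and "array is no longer left-justified" workarounds in the old loops disappear.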
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
+# Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
 # Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
 # Copyright (c) 2015 Research Organization for Information Science
 #                    and Technology (RIST). All rights reserved.

@@ -8,7 +8,7 @@
 *                         reserved.
 * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
 *                         All rights reserved.
-* Copyright (c) 2014      Intel, Inc. All rights reserved.
+* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow

@@ -584,7 +584,6 @@ static void proc_errors(int fd, short args, void *cbdata)
         orte_session_dir_cleanup(jdata->jobid);

         /* remove this job from our local job data since it is complete */
-        opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), NULL);
         OBJ_RELEASE(jdata);

     /* send it */

@@ -14,7 +14,7 @@
 * Copyright (c) 2011      Cisco Systems, Inc. All rights reserved.
 * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
 *                         reserved.
-* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
+* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow

@@ -301,11 +301,8 @@ int orte_ess_base_orted_setup(char **hosts)
         }
     }
     /* setup the global job and node arrays */
-    orte_job_data = OBJ_NEW(opal_pointer_array_t);
-    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data,
-                                                       1,
-                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
-                                                       1))) {
+    orte_job_data = OBJ_NEW(opal_hash_table_t);
+    if (ORTE_SUCCESS != (ret = opal_hash_table_init(orte_job_data, 128))) {
         ORTE_ERROR_LOG(ret);
         error = "setup job array";
         goto error;

@@ -332,7 +329,7 @@ int orte_ess_base_orted_setup(char **hosts)
     /* create and store the job data object */
     jdata = OBJ_NEW(orte_job_t);
     jdata->jobid = ORTE_PROC_MY_NAME->jobid;
-    opal_pointer_array_set_item(orte_job_data, 0, jdata);
+    opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);
     /* every job requires at least one app */
     app = OBJ_NEW(orte_app_context_t);
     opal_pointer_array_set_item(jdata->apps, 0, app);

@@ -13,7 +13,7 @@
 * Copyright (c) 2011-2014 Cisco Systems, Inc. All rights reserved.
 * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
 *                         reserved.
-* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
+* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow

@@ -357,11 +357,8 @@ static int rte_init(void)
         goto error;
     }
     /* setup the global job and node arrays */
-    orte_job_data = OBJ_NEW(opal_pointer_array_t);
-    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data,
-                                                       1,
-                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
-                                                       1))) {
+    orte_job_data = OBJ_NEW(opal_hash_table_t);
+    if (ORTE_SUCCESS != (ret = opal_hash_table_init(orte_job_data, 128))) {
         ORTE_ERROR_LOG(ret);
         error = "setup job array";
         goto error;

@@ -388,7 +385,7 @@ static int rte_init(void)
     /* create and store the job data object */
     jdata = OBJ_NEW(orte_job_t);
     jdata->jobid = ORTE_PROC_MY_NAME->jobid;
-    opal_pointer_array_set_item(orte_job_data, 0, jdata);
+    opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);
     /* mark that the daemons have reported as we are the
      * only ones in the system right now, and we definitely
      * are running!

@@ -101,7 +101,7 @@
 int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
                                               orte_jobid_t job)
 {
-    int rc, i;
+    int rc;
     orte_job_t *jdata=NULL, *jptr;
     orte_job_map_t *map=NULL;
     opal_buffer_t *wireup, jobdata;

@@ -191,33 +191,29 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
      * properly work should a proc from one of the other jobs
      * interact with this one */
     if (orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCHED_DAEMONS, NULL, OPAL_BOOL)) {
+        void *nptr;
+        uint32_t key;
         OBJ_CONSTRUCT(&jobdata, opal_buffer_t);
         numjobs = 0;
-        for (i=0; i < orte_job_data->size; i++) {
-            if (NULL == (jptr = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) {
-                continue;
-            }
-            if (ORTE_JOB_STATE_UNTERMINATED < jptr->state) {
-                /* job already terminated - ignore it */
-                continue;
-            }
-            if (jptr == jdata) {
-                /* ignore the job we are looking at - we'll get it separately */
-                continue;
-            }
-            /* pack the job struct */
-            if (ORTE_SUCCESS != (rc = opal_dss.pack(&jobdata, &jptr, 1, ORTE_JOB))) {
-                ORTE_ERROR_LOG(rc);
-                return rc;
-            }
-            ++numjobs;
-        }
-        /* pack the number of jobs */
-        if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &numjobs, 1, OPAL_INT32))) {
-            ORTE_ERROR_LOG(rc);
-            return rc;
-        }
+        rc = opal_hash_table_get_first_key_uint32(orte_job_data, &key, (void **)&jptr, &nptr);
+        while (OPAL_SUCCESS == rc) {
+            if (NULL != jptr && jptr != jdata &&
+                ORTE_PROC_MY_NAME->jobid != jptr->jobid) {
+                /* pack the job struct */
+                if (ORTE_SUCCESS != (rc = opal_dss.pack(&jobdata, &jptr, 1, ORTE_JOB))) {
+                    ORTE_ERROR_LOG(rc);
+                    return rc;
+                }
+                ++numjobs;
+            }
+            rc = opal_hash_table_get_next_key_uint32(orte_job_data, &key, (void **)&jptr, nptr, &nptr);
+        }
         if (0 < numjobs) {
+            /* pack the number of jobs */
+            if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &numjobs, 1, OPAL_INT32))) {
+                ORTE_ERROR_LOG(rc);
+                return rc;
+            }
             /* pack the jobdata buffer */
             wireup = &jobdata;
             if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &wireup, 1, OPAL_BUFFER))) {

@@ -302,7 +298,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
         /* check to see if we already have this one */
         if (NULL == orte_get_job_data_object(jdata->jobid)) {
             /* nope - add it */
-            opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), jdata);
+            opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);
             /* connect each proc to its node object */
             for (j=0; j < jdata->procs->size; j++) {
                 if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {

@@ -401,16 +397,10 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
             }
         }
         goto COMPLETE;
-    }
-
-    /* setup job object for this job */
-    if (NULL != orte_get_job_data_object(*job)) {
-        opal_output(0, "ERROR - JOB ALREADY EXISTS");
-        rc = ORTE_ERR_FATAL;
-        goto REPORT_ERROR;
-    }
-    opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), jdata);
+    } else {
+        opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);
+    }

     /* ensure the map object is present */
     if (NULL == jdata->map) {
         jdata->map = OBJ_NEW(orte_job_map_t);

@@ -9,6 +9,7 @@
 *                         University of Stuttgart. All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
+* Copyright (c) 2016      Intel, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow

@@ -76,27 +77,6 @@ int orte_plm_base_set_hnp_name(void)
  */
 int orte_plm_base_create_jobid(orte_job_t *jdata)
 {
-#if 0
-    int32_t j;
-
-    /* RHC: WHILE ORTE CAN NOW HANDLE RECYCLING OF JOBID'S,
-     * THE MPI LAYER CANNOT SINCE THERE IS NO WAY TO
-     * UPDATE THE OMPI_PROC_T LIST AND/OR THE BTL'S
-     */
-
-    /* see if there is a prior
-     * jobid that has completed and can be re-used. It can
-     * never be 0 as that belongs to the HNP and its daemons
-     */
-    for (j=1; j < orte_job_data->size; j++) {
-        if (NULL == opal_pointer_array_get_item(orte_job_data, j)) {
-            /* this local jobid is available - reuse it */
-            jdata->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid, j);
-            return ORTE_SUCCESS;
-        }
-    }
-#endif
-
     if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
         /* this job is being restarted - do not assign it
          * a new jobid

@@ -289,7 +289,7 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata)
          * the orte_rmaps_base_setup_virtual_machine routine to
          * search all apps for any hosts to be used by the vm
          */
-        opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(caddy->jdata->jobid), caddy->jdata);
+        opal_hash_table_set_value_uint32(orte_job_data, caddy->jdata->jobid, caddy->jdata);
     }

     /* if job recovery is not enabled, set it to default */

@@ -1098,18 +1098,19 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
                          jdatorted->num_reported, jdatorted->num_procs));
     if (jdatorted->num_procs == jdatorted->num_reported) {
+        bool dvm = true;
+        uint32_t key;
+        void *nptr;
         jdatorted->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
         /* activate the daemons_reported state for all jobs
          * whose daemons were launched
          */
-        for (idx=1; idx < orte_job_data->size; idx++) {
-            if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, idx))) {
-                continue;
-            }
+        rc = opal_hash_table_get_first_key_uint32(orte_job_data, &key, (void **)&jdata, &nptr);
+        while (OPAL_SUCCESS == rc) {
+            dvm = false;
             if (ORTE_JOB_STATE_DAEMONS_LAUNCHED == jdata->state) {
                 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
             }
+            rc = opal_hash_table_get_next_key_uint32(orte_job_data, &key, (void **)&jdata, nptr, &nptr);
         }
         if (dvm) {
             /* must be launching a DVM - activate the state */

@@ -1,6 +1,6 @@
 /*
 * Copyright (c) 2011-2012 Los Alamos National Security, LLC.
-* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
+* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow

@@ -522,13 +522,13 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
         /* update the proc state */
         ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE);
         pdata->state = state;
-	if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) {
+        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) {
             /* Clean up the session directory as if we were the process
              * itself. This covers the case where the process died abnormally
              * and didn't cleanup its own session directory.
              */
             orte_session_dir_finalize(proc);
         }
     }
     /* if we are trying to terminate and our routes are
      * gone, then terminate ourselves IF no local procs
      * remain (might be some from another job)

@@ -551,11 +551,11 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
             }
             /* return the allocated slot for reuse */
             cleanup_node(pdata);
-        /* track job status */
-        jdata->num_terminated++;
-        if (jdata->num_terminated == jdata->num_procs) {
+            /* track job status */
+            jdata->num_terminated++;
+            if (jdata->num_terminated == jdata->num_procs) {
                 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
             }
         }
     }

 cleanup:

@@ -577,6 +577,8 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
     bool one_still_alive;
     orte_vpid_t lowest=0;
     int32_t i32, *i32ptr;
+    uint32_t u32;
+    void *nptr;

     opal_output_verbose(2, orte_state_base_framework.framework_output,
                         "%s state:base:check_job_complete on job %s",

@@ -722,13 +724,11 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
      * object when we find it
      */
     one_still_alive = false;
-    for (j=1; j < orte_job_data->size; j++) {
-        if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, j))) {
-            /* since we are releasing jdata objects as we
-             * go, we can no longer assume that the job_data
-             * array is left justified
-             */
-            continue;
-        }
+    j = opal_hash_table_get_first_key_uint32(orte_job_data, &u32, (void **)&job, &nptr);
+    while (OPAL_SUCCESS == j) {
+        /* skip the daemon job */
+        if (job->jobid == ORTE_PROC_MY_NAME->jobid) {
+            goto next;
+        }
         /* if this is the job we are checking AND it normally terminated,
          * then activate the "notify_completed" state - this will release

@@ -762,20 +762,19 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
                     /* this was a debugger daemon. notify that a debugger has detached */
                     ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DEBUGGER_DETACH);
                 }
-                opal_pointer_array_set_item(orte_job_data, j, NULL); /* ensure the array has a NULL */
                 OBJ_RELEASE(jdata);
             }
         }
-        continue;
+        goto next;
     }
     /* if the job is flagged to not be monitored, skip it */
     if (ORTE_FLAG_TEST(job, ORTE_JOB_FLAG_DO_NOT_MONITOR)) {
-        continue;
+        goto next;
     }
     /* when checking for job termination, we must be sure to NOT check
      * our own job as it - rather obviously - has NOT terminated!
      */
     if (job->num_terminated < job->num_procs) {
         if (ORTE_JOB_STATE_NOTIFIED != job->state) {
             /* we have at least one job that is not done yet - we cannot
              * just return, though, as we need to ensure we cleanout the
              * job data for the job that just completed

@@ -795,7 +794,10 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
                              job->num_terminated, job->num_procs,
                              (NULL == jdata) ? "UNKNOWN" : orte_job_state_to_str(jdata->state) ));
         }
+    next:
+        j = opal_hash_table_get_next_key_uint32(orte_job_data, &u32, (void **)&job, nptr, &nptr);
     }

     /* if a job is still alive, we just return */
     if (one_still_alive) {
         OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,

@@ -454,7 +454,7 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
             break;
         }
         /* store it on the global job data pool */
-        opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), jdata);
+        opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);
         /* before we launch it, tell the IOF to forward all output exclusively
          * to the requestor */
         {

@@ -590,7 +590,7 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
         }
     } else {
         /* if we are the HNP, process the request */
-        int32_t i, num_jobs;
+        int32_t rc, num_jobs;
         orte_job_t *jobdat;

         /* unpack the jobid */

@@ -628,17 +628,9 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
             }
         }
     } else {
-        /* since the job array is no longer
-         * left-justified and may have holes, we have
-         * to cnt the number of jobs. Be sure to include the daemon
-         * job - the user can slice that info out if they don't care
-         */
-        num_jobs = 0;
-        for (i=0; i < orte_job_data->size; i++) {
-            if (NULL != opal_pointer_array_get_item(orte_job_data, i)) {
-                num_jobs++;
-            }
-        }
+        uint32_t u32;
+        void *nptr;
+        num_jobs = opal_hash_table_get_size(orte_job_data);
         /* pack the number of jobs */
         if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_jobs, 1, OPAL_INT32))) {
             ORTE_ERROR_LOG(ret);

@@ -646,14 +638,18 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
             goto CLEANUP;
         }
         /* now pack the data, one at a time */
-        for (i=0; i < orte_job_data->size; i++) {
-            if (NULL != (jobdat = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) {
-                if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jobdat, 1, ORTE_JOB))) {
-                    ORTE_ERROR_LOG(ret);
-                    OBJ_RELEASE(answer);
-                    goto CLEANUP;
-                }
-            }
-        }
+        rc = opal_hash_table_get_first_key_uint32(orte_job_data, &u32, (void **)&jobdat, &nptr);
+        while (OPAL_SUCCESS == rc) {
+            if (NULL != jobdat) {
+                /* pack the job struct */
+                if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, &jobdat, 1, ORTE_JOB))) {
+                    ORTE_ERROR_LOG(ret);
+                    OBJ_RELEASE(answer);
+                    goto CLEANUP;
+                }
+                ++num_jobs;
+            }
+            rc = opal_hash_table_get_next_key_uint32(orte_job_data, &u32, (void **)&jobdat, nptr, &nptr);
+        }
         if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL,

@@ -532,7 +532,6 @@ int orte_daemon(int argc, char *argv[])
         orte_node_t *node;
         orte_app_context_t *app;
         char *tmp, *nptr, *sysinfo;
-        int32_t ljob;
         char **singenv=NULL;

         /* setup the singleton's job */

@@ -540,8 +539,7 @@ int orte_daemon(int argc, char *argv[])
         /* default to ompi for now */
         opal_argv_append_nosize(&jdata->personality, "ompi");
         orte_plm_base_create_jobid(jdata);
-        ljob = ORTE_LOCAL_JOBID(jdata->jobid);
-        opal_pointer_array_set_item(orte_job_data, ljob, jdata);
+        opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);

         /* must create a map for it (even though it has no
          * info in it) so that the job info will be picked

@@ -130,7 +130,7 @@ orte_timer_t *orte_mpiexec_timeout = NULL;
 opal_buffer_t *orte_tree_launch_cmd = NULL;

 /* global arrays for data storage */
-opal_pointer_array_t *orte_job_data = NULL;
+opal_hash_table_t *orte_job_data = NULL;
 opal_pointer_array_t *orte_node_pool = NULL;
 opal_pointer_array_t *orte_node_topologies = NULL;
 opal_pointer_array_t *orte_local_children = NULL;

@@ -416,22 +416,16 @@ int orte_dt_init(void)

 orte_job_t* orte_get_job_data_object(orte_jobid_t job)
 {
-    int32_t ljob;
+    orte_job_t *jdata;

     /* if the job data wasn't setup, we cannot provide the data */
     if (NULL == orte_job_data) {
         return NULL;
     }

-    /* the job is indexed by its local jobid, so we can
-     * just look it up here. it is not an error for this
-     * to not be found - could just be
-     * a race condition whereby the job has already been
-     * removed from the array. The get_item function
-     * will just return NULL in that case.
-     */
-    ljob = ORTE_LOCAL_JOBID(job);
-    return (orte_job_t*)opal_pointer_array_get_item(orte_job_data, ljob);
+    jdata = NULL;
+    opal_hash_table_get_value_uint32(orte_job_data, job, (void**)&jdata);
+    return jdata;
 }

 orte_proc_t* orte_get_proc_object(orte_process_name_t *proc)

@@ -667,7 +661,6 @@ static void orte_job_destruct(orte_job_t* job)
 {
     orte_proc_t *proc;
     orte_app_context_t *app;
-    orte_job_t *jdata;
     int n;
     orte_timer_t *evtimer;

@@ -724,18 +717,9 @@ static void orte_job_destruct(orte_job_t* job)
     /* release the attributes */
     OPAL_LIST_DESTRUCT(&job->attributes);

-    /* find the job in the global array */
-    if (NULL != orte_job_data && ORTE_JOBID_INVALID != job->jobid) {
-        for (n=0; n < orte_job_data->size; n++) {
-            if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, n))) {
-                continue;
-            }
-            if (jdata->jobid == job->jobid) {
-                /* set the entry to NULL */
-                opal_pointer_array_set_item(orte_job_data, n, NULL);
-                break;
-            }
-        }
-    }
+    if (ORTE_JOBID_INVALID != job->jobid) {
+        /* remove the job from the global array */
+        opal_hash_table_remove_value_uint32(orte_job_data, job->jobid);
+    }

@@ -515,7 +515,7 @@ ORTE_DECLSPEC extern orte_timer_t *orte_mpiexec_timeout;
 ORTE_DECLSPEC extern opal_buffer_t *orte_tree_launch_cmd;

 /* global arrays for data storage */
-ORTE_DECLSPEC extern opal_pointer_array_t *orte_job_data;
+ORTE_DECLSPEC extern opal_hash_table_t *orte_job_data;
 ORTE_DECLSPEC extern opal_pointer_array_t *orte_node_pool;
 ORTE_DECLSPEC extern opal_pointer_array_t *orte_node_topologies;
 ORTE_DECLSPEC extern opal_pointer_array_t *orte_local_children;

@@ -15,7 +15,7 @@
 * Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
 *                         reserved.
 * Copyright (c) 2012      Oak Ridge National Labs. All rights reserved.
-* Copyright (c) 2014      Intel, Inc. All rights reserved.
+* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow

@@ -339,17 +339,15 @@ static void dump_aborted_procs(void)
     orte_proc_t *proc, *pptr;
     orte_app_context_t *approc;
     orte_node_t *node;
+    uint32_t key;
+    void *nptr;

-    /* find the job that caused the problem - be sure to start the loop
-     * at 1 as the daemons are in 0 and will clearly be "running", so no
-     * point in checking them
-     */
-    for (n=1; n < orte_job_data->size; n++) {
-        if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, n))) {
-            /* the array is no longer left-justified, so we have to continue */
-            continue;
-        }
+    /* find the job that caused the problem */
+    n = opal_hash_table_get_first_key_uint32(orte_job_data, &key, (void **)&job, &nptr);
+    while (OPAL_SUCCESS == n) {
+        if (job->jobid == ORTE_PROC_MY_NAME->jobid) {
+            goto next;
+        }

         if (ORTE_JOB_STATE_UNDEF != job->state &&
             ORTE_JOB_STATE_INIT != job->state &&
             ORTE_JOB_STATE_RUNNING != job->state &&

@@ -378,7 +376,7 @@ static void dump_aborted_procs(void)
         proc = NULL;
         if (!orte_get_attribute(&job->attributes, ORTE_JOB_ABORTED_PROC, (void**)&proc, OPAL_PTR) ||
             NULL == proc) {
-            continue;
+            goto next;
         }

         approc = (orte_app_context_t*)opal_pointer_array_get_item(job->apps, proc->app_idx);

@@ -387,5 +385,7 @@ static void dump_aborted_procs(void)
             break;
         }
     }
+  next:
+    n = opal_hash_table_get_next_key_uint32(orte_job_data, &key, (void **)&job, nptr, &nptr);
     }
 }

@@ -477,7 +477,6 @@ static void send_callback(int status, orte_process_name_t *peer,

     OBJ_RELEASE(buffer);
     /* cleanup the job object */
-    opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), NULL);
     OBJ_RELEASE(jdata);
 }

@@ -2407,7 +2407,6 @@ static void setup_debugger_job(void)
 {
     orte_job_t *debugger;
     orte_app_context_t *app;
-    int32_t ljob;
     orte_proc_t *proc;
     int i, rc;
     orte_node_t *node;

@@ -2429,8 +2428,7 @@ static void setup_debugger_job(void)
     /* dont push stdin */
     debugger->stdin_target = ORTE_VPID_INVALID;
     /* add it to the global job pool */
-    ljob = ORTE_LOCAL_JOBID(debugger->jobid);
-    opal_pointer_array_set_item(orte_job_data, ljob, debugger);
+    opal_hash_table_set_value_uint32(orte_job_data, debugger->jobid, debugger);
     /* create an app_context for the debugger daemon */
    app = OBJ_NEW(orte_app_context_t);
    if (NULL != orte_debugger_test_daemon) {
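
Taken together, store, lookup, and removal now all key on the complete jobid. A short round-trip sketch under the same assumptions (the calls are the ones used in the hunks above; the sequence itself is illustrative, not code from the commit):

    orte_job_t *jdata = OBJ_NEW(orte_job_t);
    orte_plm_base_create_jobid(jdata);

    /* store the job under its full jobid */
    opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);

    /* look it up - found stays NULL if the job was already removed */
    orte_job_t *found = NULL;
    opal_hash_table_get_value_uint32(orte_job_data, jdata->jobid, (void **)&found);

    /* removal is handled once, inside orte_job_destruct */
    opal_hash_table_remove_value_uint32(orte_job_data, jdata->jobid);

Because removal moved into the destructor, dropping the last reference with OBJ_RELEASE(jdata) is enough to clear the table entry, which is why the explicit set_item(..., NULL) calls in proc_errors and send_callback are simply deleted.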