Don't know how we got away with this for so long, but we really shouldn't be referencing pointer array objects directly.
Also, fix an error in mpirx debugger module - the pointer array object is the pointer to the object itself, not the object "super" like in an opal_list. This commit was SVN r24894.
Этот коммит содержится в:
родитель
fbda6aaf89
Коммит
8d1b31b887
@ -207,7 +207,7 @@ void init_before_spawn(orte_job_t *jdata)
|
||||
}
|
||||
opal_argv_append_nosize(&app->argv, app->app);
|
||||
build_debugger_args(app);
|
||||
opal_pointer_array_add(orte_debugger_daemon->apps, &app->super);
|
||||
opal_pointer_array_add(orte_debugger_daemon->apps, app);
|
||||
orte_debugger_daemon->num_apps = 1;
|
||||
}
|
||||
return;
|
||||
@ -318,7 +318,7 @@ static void attach_debugger(int fd, short event, void *arg)
|
||||
}
|
||||
opal_argv_append_nosize(&app->argv, app->app);
|
||||
build_debugger_args(app);
|
||||
opal_pointer_array_add(jdata->apps, &app->super);
|
||||
opal_pointer_array_add(jdata->apps, app);
|
||||
jdata->num_apps = 1;
|
||||
/* setup the mapping policy to bynode so we get one
|
||||
* daemon on each node
|
||||
|
@ -340,7 +340,7 @@ static int update_state(orte_jobid_t job,
|
||||
/* kill this proc */
|
||||
killprocs(proc->jobid, proc->vpid, proc->epoch);
|
||||
}
|
||||
app = jobdat->apps[child->app_idx];
|
||||
app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, child->app_idx);
|
||||
if( jobdat->enable_recovery && child->restarts < app->max_restarts ) {
|
||||
child->restarts++;
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
@ -378,7 +378,7 @@ static int update_state(orte_jobid_t job,
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
|
||||
/* see if this child has reached its local restart limit */
|
||||
app = jobdat->apps[child->app_idx];
|
||||
app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, child->app_idx);
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s CHECKING RESTARTS %d VS MAX %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
|
@ -43,7 +43,6 @@
|
||||
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
#include "opal/util/os_path.h"
|
||||
#include "opal/util/path.h"
|
||||
#include "opal/util/sys_limits.h"
|
||||
@ -106,6 +105,7 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
|
||||
orte_vpid_t i;
|
||||
int j;
|
||||
orte_daemon_cmd_flag_t command;
|
||||
orte_app_context_t *app;
|
||||
|
||||
if (NULL != orte_debugger_daemon && ORTE_JOBID_INVALID == job) {
|
||||
/* all we are doing is launching debugger daemons */
|
||||
@ -198,9 +198,7 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
|
||||
}
|
||||
|
||||
/* are we co-locating debugger daemons? */
|
||||
if (NULL != orte_debugger_daemon) {
|
||||
orte_app_context_t **apps;
|
||||
|
||||
if (NULL != orte_debugger_daemon) {
|
||||
/* flag that we are */
|
||||
flag = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &flag, 1, OPAL_INT8))) {
|
||||
@ -213,8 +211,8 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
|
||||
return rc;
|
||||
}
|
||||
/* pack the executable name */
|
||||
apps = (orte_app_context_t**)orte_debugger_daemon->apps->addr;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, apps, 1, ORTE_APP_CONTEXT))) {
|
||||
app = (orte_app_context_t*)opal_pointer_array_get_item(orte_debugger_daemon->apps, 0);
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &app, 1, ORTE_APP_CONTEXT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
@ -315,12 +313,14 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the app_contexts for this job - we already checked early on that
|
||||
* there must be at least one, so don't bother checking here again
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, jdata->apps->addr, jdata->num_apps, ORTE_APP_CONTEXT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
/* pack the app_contexts for this job */
|
||||
for (j=0; j < jdata->apps->size; j++) {
|
||||
if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, j))) {
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &app, 1, ORTE_APP_CONTEXT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* encode the pidmap */
|
||||
@ -485,7 +485,8 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
||||
orte_jobid_t debugger;
|
||||
bool add_child;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
orte_app_context_t *app;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:constructing child list",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
@ -518,17 +519,13 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
||||
orte_odls_globals.debugger->num_local_procs = 1;
|
||||
opal_list_append(&orte_local_jobdata, &(orte_odls_globals.debugger)->super);
|
||||
/* retrieve the info */
|
||||
orte_odls_globals.debugger->apps = (orte_app_context_t**)malloc(sizeof(orte_app_context_t*));
|
||||
if (NULL == orte_odls_globals.debugger->apps) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
goto REPORT_ERROR;
|
||||
}
|
||||
cnt = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, orte_odls_globals.debugger->apps,
|
||||
&cnt, ORTE_APP_CONTEXT))) {
|
||||
app = NULL;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &app, &cnt, ORTE_APP_CONTEXT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto REPORT_ERROR;
|
||||
}
|
||||
opal_pointer_array_add(&orte_odls_globals.debugger->apps, app);
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &(orte_odls_globals.debugger->controls), &cnt, ORTE_JOB_CONTROL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -668,22 +665,14 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:construct_child_list unpacking %ld app_contexts",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)jobdat->num_apps));
|
||||
|
||||
/* allocate space and unpack the app_contexts for this job - the HNP checked
|
||||
* that there must be at least one, so don't bother checking here again
|
||||
*/
|
||||
if (NULL != jobdat->apps) {
|
||||
free(jobdat->apps);
|
||||
}
|
||||
jobdat->apps = (orte_app_context_t**)malloc(jobdat->num_apps * sizeof(orte_app_context_t*));
|
||||
if (NULL == jobdat->apps) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
goto REPORT_ERROR;
|
||||
}
|
||||
cnt = jobdat->num_apps;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, jobdat->apps, &cnt, ORTE_APP_CONTEXT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto REPORT_ERROR;
|
||||
for (j=0; j < jobdat->num_apps; j++) {
|
||||
cnt = 1;
|
||||
app = NULL;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &app, &cnt, ORTE_APP_CONTEXT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto REPORT_ERROR;
|
||||
}
|
||||
opal_pointer_array_set_item(&jobdat->apps, app->idx, app);
|
||||
}
|
||||
|
||||
/* unpack the pidmap byte object */
|
||||
@ -781,7 +770,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
if (OPAL_EQUAL ==
|
||||
orte_util_compare_name_fields(mask, child->name, &proc)) {
|
||||
orte_util_compare_name_fields(mask, child->name, &proc)) {
|
||||
/* do not duplicate this child on the list! */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"proc %s is on list and is %s",
|
||||
@ -791,7 +780,8 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
||||
child->restarts = restarts[j];
|
||||
child->do_not_barrier = true;
|
||||
/* mark that this app_context is being used on this node */
|
||||
jobdat->apps[app_idx[j]]->used_on_node = true;
|
||||
app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, app_idx[j]);
|
||||
app->used_on_node = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -817,11 +807,12 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
||||
if (ORTE_JOB_STATE_RESTART == jobdat->state) {
|
||||
child->do_not_barrier = true;
|
||||
}
|
||||
if (NULL != slot_str && NULL != slot_str[j]) {
|
||||
if (NULL != slot_str && NULL != slot_str[j]) {
|
||||
child->slot_list = strdup(slot_str[j]);
|
||||
}
|
||||
/* mark that this app_context is being used on this node */
|
||||
jobdat->apps[app_idx[j]]->used_on_node = true;
|
||||
app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, app_idx[j]);
|
||||
app->used_on_node = true;
|
||||
/* protect operation on the global list of children */
|
||||
OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
|
||||
opal_list_append(&orte_local_children, &child->super);
|
||||
@ -837,7 +828,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
||||
opal_condition_broadcast(&jobdat->cond);
|
||||
OPAL_THREAD_UNLOCK(&jobdat->lock);
|
||||
|
||||
done:
|
||||
done:
|
||||
if (NULL != app_idx) {
|
||||
free(app_idx);
|
||||
app_idx = NULL;
|
||||
@ -856,7 +847,7 @@ done:
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
||||
REPORT_ERROR:
|
||||
REPORT_ERROR:
|
||||
/* we have to report an error back to the HNP so we don't just
|
||||
* hang. Although there shouldn't be any errors once this is
|
||||
* all debugged, it is still good practice to have a way
|
||||
@ -1253,8 +1244,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
orte_odls_base_fork_local_proc_fn_t fork_local)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
orte_app_context_t *app, **apps;
|
||||
orte_app_idx_t i, num_apps;
|
||||
orte_app_context_t *app, *dbg;
|
||||
orte_odls_child_t *child=NULL;
|
||||
int num_processors;
|
||||
bool oversubscribed;
|
||||
@ -1265,7 +1255,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
orte_odls_job_t *jobdat;
|
||||
char basedir[MAXPATHLEN];
|
||||
char **argvsav=NULL;
|
||||
int inm;
|
||||
int inm, j;
|
||||
opal_event_t *delay;
|
||||
int num_procs_alive = 0;
|
||||
orte_nid_t *nid;
|
||||
@ -1305,8 +1295,6 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
apps = jobdat->apps;
|
||||
num_apps = jobdat->num_apps;
|
||||
|
||||
/* see if the mapper thinks we are oversubscribed */
|
||||
oversubscribed = false;
|
||||
@ -1344,10 +1332,13 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
/* Now we preload any files that are needed. This is done on a per
|
||||
* app context basis
|
||||
*/
|
||||
for (i=0; i < num_apps; i++) {
|
||||
if(apps[i]->used_on_node &&
|
||||
(apps[i]->preload_binary || NULL != apps[i]->preload_files)) {
|
||||
if( ORTE_SUCCESS != (rc = orte_odls_base_preload_files_app_context(apps[i])) ) {
|
||||
for (j=0; j < jobdat->apps.size; j++) {
|
||||
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, j))) {
|
||||
continue;
|
||||
}
|
||||
if(app->used_on_node &&
|
||||
(app->preload_binary || NULL != app->preload_files)) {
|
||||
if( ORTE_SUCCESS != (rc = orte_odls_base_preload_files_app_context(app)) ) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
/* JJH: Do not fail here, instead try to execute without the preloaded options*/
|
||||
}
|
||||
@ -1355,8 +1346,11 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
for (i=0; i < num_apps; i++) {
|
||||
orte_sstore.fetch_app_deps(apps[i]);
|
||||
for (j=0; j < jobdat->apps.size; j++) {
|
||||
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, j))) {
|
||||
continue;
|
||||
}
|
||||
orte_sstore.fetch_app_deps(app);
|
||||
}
|
||||
orte_sstore.wait_all_deps();
|
||||
#endif
|
||||
@ -1415,8 +1409,10 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
/* setup to report the proc state to the HNP */
|
||||
OBJ_CONSTRUCT(&alert, opal_buffer_t);
|
||||
|
||||
for (i=0; i < num_apps; i++) {
|
||||
app = apps[i];
|
||||
for (j=0; j < jobdat->apps.size; j++) {
|
||||
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, j))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* if this app isn't being used on our node, skip it */
|
||||
if (!app->used_on_node) {
|
||||
@ -1494,7 +1490,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_odls_child_t*)item;
|
||||
if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name->jobid), ORTE_JOBID) &&
|
||||
i == child->app_idx) {
|
||||
j == (int)child->app_idx) {
|
||||
child->exit_code = rc;
|
||||
}
|
||||
}
|
||||
@ -1523,7 +1519,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_odls_child_t*)item;
|
||||
if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name->jobid), ORTE_JOBID) &&
|
||||
i == child->app_idx) {
|
||||
j == (int)child->app_idx) {
|
||||
child->exit_code = rc;
|
||||
}
|
||||
}
|
||||
@ -1543,7 +1539,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
ORTE_NAME_PRINT(child->name)));
|
||||
|
||||
/* does this child belong to this app? */
|
||||
if (i != child->app_idx) {
|
||||
if (j != (int)child->app_idx) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -1839,20 +1835,16 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
if (NULL != orte_odls_globals.debugger &&
|
||||
!orte_odls_globals.debugger_launched &&
|
||||
0 < opal_list_get_size(&orte_local_children)) {
|
||||
dbg = (orte_app_context_t*)opal_pointer_array_get_item(&orte_odls_globals.debugger->apps, 0);
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:launch forking debugger %s with %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
orte_odls_globals.debugger->apps[0]->app,
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), dbg->app,
|
||||
(ORTE_JOB_CONTROL_FORWARD_OUTPUT & orte_odls_globals.debugger->controls) ? "output forwarded" : "no output"));
|
||||
|
||||
odls_base_default_setup_fork(orte_odls_globals.debugger->apps[0],
|
||||
1, orte_process_info.num_procs,
|
||||
odls_base_default_setup_fork(dbg, 1, orte_process_info.num_procs,
|
||||
orte_process_info.num_procs,
|
||||
orte_process_info.num_procs, false,
|
||||
&(orte_odls_globals.debugger->apps[0]->env));
|
||||
fork_local(orte_odls_globals.debugger->apps[0], NULL,
|
||||
orte_odls_globals.debugger->apps[0]->env,
|
||||
orte_odls_globals.debugger);
|
||||
orte_process_info.num_procs, false, &dbg->env);
|
||||
fork_local(dbg, NULL, dbg->env, orte_odls_globals.debugger);
|
||||
orte_odls_globals.debugger_launched = true;
|
||||
if (ORTE_SUCCESS != (rc = orte_errmgr.update_state(orte_odls_globals.debugger->jobid,
|
||||
ORTE_JOB_STATE_RUNNING,
|
||||
@ -1925,7 +1917,7 @@ int orte_odls_base_default_deliver_message(orte_jobid_t job, opal_buffer_t *buff
|
||||
|
||||
/* if so, send the message */
|
||||
rc = orte_rml.send_buffer(child->name, buffer, tag, 0);
|
||||
if (rc < 0 && OPAL_SOS_GET_ERROR_CODE(rc) != ORTE_ERR_ADDRESSEE_UNKNOWN) {
|
||||
if (rc < 0 && rc != ORTE_ERR_ADDRESSEE_UNKNOWN) {
|
||||
/* ignore if the addressee is unknown as a race condition could
|
||||
* have allowed the child to exit before we send it a barrier
|
||||
* due to the vagaries of the event library.
|
||||
@ -2912,7 +2904,7 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
|
||||
rc = orte_errmgr.update_state(ORTE_JOBID_INVALID, ORTE_JOB_STATE_UNDEF,
|
||||
child->name, child->state, child->pid,
|
||||
child->exit_code);
|
||||
if (ORTE_ERR_SILENT == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
if (ORTE_ERR_SILENT == rc) {
|
||||
/* all procs are complete - we are done */
|
||||
break;
|
||||
}
|
||||
@ -3044,7 +3036,7 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child,
|
||||
free(child->rml_uri);
|
||||
child->rml_uri = NULL;
|
||||
}
|
||||
app = jobdat->apps[child->app_idx];
|
||||
app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, child->app_idx);
|
||||
|
||||
/* reset envars to match this child */
|
||||
if (ORTE_SUCCESS != (rc = setup_child(child, jobdat, &app->env))) {
|
||||
|
@ -315,7 +315,8 @@ static void orte_odls_job_constructor(orte_odls_job_t *ptr)
|
||||
ptr->name = NULL;
|
||||
ptr->state = ORTE_JOB_STATE_UNDEF;
|
||||
ptr->launch_msg_processed = false;
|
||||
ptr->apps = NULL;
|
||||
OBJ_CONSTRUCT(&ptr->apps, opal_pointer_array_t);
|
||||
opal_pointer_array_init(&ptr->apps, 2, INT_MAX, 2);
|
||||
ptr->num_apps = 0;
|
||||
ptr->policy = 0;
|
||||
ptr->cpus_per_rank = 1;
|
||||
@ -336,8 +337,9 @@ static void orte_odls_job_constructor(orte_odls_job_t *ptr)
|
||||
}
|
||||
static void orte_odls_job_destructor(orte_odls_job_t *ptr)
|
||||
{
|
||||
orte_app_idx_t i;
|
||||
|
||||
int i;
|
||||
orte_app_context_t *app;
|
||||
|
||||
OBJ_DESTRUCT(&ptr->lock);
|
||||
OBJ_DESTRUCT(&ptr->cond);
|
||||
if (NULL != ptr->instance) {
|
||||
@ -346,15 +348,12 @@ static void orte_odls_job_destructor(orte_odls_job_t *ptr)
|
||||
if (NULL != ptr->name) {
|
||||
free(ptr->name);
|
||||
}
|
||||
if (NULL != ptr->apps) {
|
||||
for (i=0; i < ptr->num_apps; i++) {
|
||||
OBJ_RELEASE(ptr->apps[i]);
|
||||
}
|
||||
if (NULL != ptr->apps) {
|
||||
free(ptr->apps);
|
||||
for (i=0; i < ptr->apps.size; i++) {
|
||||
if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(&ptr->apps, i))) {
|
||||
OBJ_RELEASE(app);
|
||||
}
|
||||
OBJ_DESTRUCT(&ptr->apps);
|
||||
}
|
||||
|
||||
if (NULL != ptr->pmap && NULL != ptr->pmap->bytes) {
|
||||
free(ptr->pmap->bytes);
|
||||
free(ptr->pmap);
|
||||
|
@ -134,7 +134,7 @@ typedef struct orte_odls_job_t {
|
||||
char *instance; /* keep handy for scheduler restart */
|
||||
char *name; /* keep handy for scheduler restart */
|
||||
bool launch_msg_processed; /* launch msg has been fully processed */
|
||||
orte_app_context_t **apps; /* app_contexts for this job */
|
||||
opal_pointer_array_t apps; /* app_contexts for this job */
|
||||
orte_app_idx_t num_apps; /* number of app_contexts */
|
||||
orte_mapping_policy_t policy; /* mapping policy */
|
||||
int16_t cpus_per_rank; /* number of cpus/rank */
|
||||
|
@ -138,7 +138,7 @@ static void start(orte_jobid_t jobid)
|
||||
mca_base_component_t *c = &mca_sensor_file_component.super.base_version;
|
||||
opal_list_item_t *item;
|
||||
orte_odls_job_t *jobdat;
|
||||
orte_app_context_t *app;
|
||||
orte_app_context_t *app, *aptr;
|
||||
int rc, tmp;
|
||||
char *filename;
|
||||
file_tracker_t *ft;
|
||||
@ -159,8 +159,15 @@ static void start(orte_jobid_t jobid)
|
||||
item = opal_list_get_end(&orte_local_jobdata)) {
|
||||
jobdat = (orte_odls_job_t*)item;
|
||||
if (jobid == jobdat->jobid || ORTE_JOBID_WILDCARD == jobid) {
|
||||
/* must be at least one app_context, so use the first */
|
||||
if (NULL == (app = jobdat->apps[0])) {
|
||||
/* must be at least one app_context, so use the first one found */
|
||||
app = NULL;
|
||||
for (tmp=0; tmp < jobdat->apps.size; tmp++) {
|
||||
if (NULL != (aptr = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, tmp))) {
|
||||
app = aptr;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (NULL == app) {
|
||||
/* got a problem */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
continue;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user