Коммит
41d7a5c7d9
@ -633,6 +633,7 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
|
||||
orte_proc_t *child = cd->child;
|
||||
char **env = NULL, **argv = NULL, *cmd = NULL;
|
||||
int rc, i;
|
||||
bool found;
|
||||
|
||||
/* thread-protect common values */
|
||||
env = opal_argv_copy(app->env);
|
||||
@ -667,6 +668,7 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
|
||||
opal_list_item_t *nmitem;
|
||||
orte_namelist_t *nm;
|
||||
/* see if this rank is one of those requested */
|
||||
found = false;
|
||||
for (nmitem = opal_list_get_first(&orte_odls_globals.xterm_ranks);
|
||||
nmitem != opal_list_get_end(&orte_odls_globals.xterm_ranks);
|
||||
nmitem = opal_list_get_next(nmitem)) {
|
||||
@ -685,6 +687,7 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
|
||||
}
|
||||
/* use the xterm cmd as the app string */
|
||||
cmd = strdup(orte_odls_globals.xtermcmd[0]);
|
||||
found = true;
|
||||
break;
|
||||
} else if (jobdat->num_procs <= nm->name.vpid) { /* check for bozo case */
|
||||
/* can't be done! */
|
||||
@ -695,6 +698,10 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
|
||||
goto errorout;
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
cmd = strdup(app->app);
|
||||
argv = opal_argv_copy(app->argv);
|
||||
}
|
||||
} else if (NULL != orte_fork_agent) {
|
||||
/* we were given a fork agent - use it */
|
||||
argv = opal_argv_copy(orte_fork_agent);
|
||||
@ -794,7 +801,6 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
||||
orte_proc_t *child=NULL;
|
||||
int rc=ORTE_SUCCESS;
|
||||
char basedir[MAXPATHLEN];
|
||||
char **argvsav=NULL;
|
||||
int j, idx;
|
||||
int total_num_local_procs = 0;
|
||||
orte_odls_launch_local_t *caddy = (orte_odls_launch_local_t*)cbdata;
|
||||
@ -881,15 +887,19 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
||||
if (opal_sys_limits.num_files < limit) {
|
||||
if (2 < caddy->retries) {
|
||||
/* tried enough - give up */
|
||||
child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
|
||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
|
||||
for (idx=0; idx < orte_local_children->size; idx++) {
|
||||
if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
|
||||
continue;
|
||||
}
|
||||
if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID)) {
|
||||
child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
|
||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
|
||||
}
|
||||
}
|
||||
goto ERROR_OUT;
|
||||
}
|
||||
/* don't have enough - wait a little time */
|
||||
ORTE_DETECT_TIMEOUT(1000, 1000, -1, timer_cb, caddy);
|
||||
if (NULL != argvsav) {
|
||||
opal_argv_free(argvsav);
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
@ -1200,7 +1200,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
||||
}
|
||||
}
|
||||
/* we need mpirun to be the first node on this list */
|
||||
if (0 != strcmp(nodelist[0], orte_process_info.nodename)) {
|
||||
if (NULL == nodelist || 0 != strcmp(nodelist[0], orte_process_info.nodename)) {
|
||||
opal_argv_prepend_nosize(&nodelist, orte_process_info.nodename);
|
||||
}
|
||||
nlistflat = opal_argv_join(nodelist, ',');
|
||||
|
@ -467,13 +467,16 @@ void orte_rmaps_base_display_map(orte_job_t *jdata)
|
||||
continue;
|
||||
}
|
||||
memset(tmp1, 0, 1024);
|
||||
orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_BOUND, (void**)&bd, OPAL_PTR);
|
||||
if (NULL == bd) {
|
||||
(void)strncpy(tmp1, "UNBOUND", strlen("UNBOUND"));
|
||||
} else {
|
||||
if (OPAL_ERR_NOT_BOUND == opal_hwloc_base_cset2mapstr(tmp1, sizeof(tmp1), node->topology->topo, bd->cpuset)) {
|
||||
if (orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_BOUND, (void**)&bd, OPAL_PTR)) {
|
||||
if (NULL == bd) {
|
||||
(void)strncpy(tmp1, "UNBOUND", strlen("UNBOUND"));
|
||||
} else {
|
||||
if (OPAL_ERR_NOT_BOUND == opal_hwloc_base_cset2mapstr(tmp1, sizeof(tmp1), node->topology->topo, bd->cpuset)) {
|
||||
(void)strncpy(tmp1, "UNBOUND", strlen("UNBOUND"));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
(void)strncpy(tmp1, "UNBOUND", strlen("UNBOUND"));
|
||||
}
|
||||
opal_output(orte_clean_output, "\t\t<process rank=%s app_idx=%ld local_rank=%lu node_rank=%lu binding=%s>",
|
||||
ORTE_VPID_PRINT(proc->name.vpid), (long)proc->app_idx,
|
||||
@ -488,29 +491,33 @@ void orte_rmaps_base_display_map(orte_job_t *jdata)
|
||||
node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, 0);
|
||||
p0 = (orte_proc_t*)opal_pointer_array_get_item(node->procs, 0);
|
||||
p0bitmap = NULL;
|
||||
orte_get_attribute(&p0->attributes, ORTE_PROC_CPU_BITMAP, (void**)&p0bitmap, OPAL_STRING);
|
||||
opal_output(orte_clean_output, "\t<locality>");
|
||||
for (j=1; j < node->procs->size; j++) {
|
||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
|
||||
continue;
|
||||
if (orte_get_attribute(&p0->attributes, ORTE_PROC_CPU_BITMAP, (void**)&p0bitmap, OPAL_STRING) &&
|
||||
NULL != p0bitmap) {
|
||||
opal_output(orte_clean_output, "\t<locality>");
|
||||
for (j=1; j < node->procs->size; j++) {
|
||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
|
||||
continue;
|
||||
}
|
||||
procbitmap = NULL;
|
||||
if (orte_get_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, (void**)&procbitmap, OPAL_STRING) &&
|
||||
NULL != procbitmap) {
|
||||
locality = opal_hwloc_base_get_relative_locality(node->topology->topo,
|
||||
p0bitmap,
|
||||
procbitmap);
|
||||
opal_output(orte_clean_output, "\t\t<rank=%s rank=%s locality=%s>",
|
||||
ORTE_VPID_PRINT(p0->name.vpid),
|
||||
ORTE_VPID_PRINT(proc->name.vpid),
|
||||
opal_hwloc_base_print_locality(locality));
|
||||
}
|
||||
}
|
||||
opal_output(orte_clean_output, "\t</locality>\n</map>");
|
||||
fflush(stderr);
|
||||
if (NULL != p0bitmap) {
|
||||
free(p0bitmap);
|
||||
}
|
||||
if (NULL != procbitmap) {
|
||||
free(procbitmap);
|
||||
}
|
||||
procbitmap = NULL;
|
||||
orte_get_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, (void**)&procbitmap, OPAL_STRING);
|
||||
locality = opal_hwloc_base_get_relative_locality(node->topology->topo,
|
||||
p0bitmap,
|
||||
procbitmap);
|
||||
opal_output(orte_clean_output, "\t\t<rank=%s rank=%s locality=%s>",
|
||||
ORTE_VPID_PRINT(p0->name.vpid),
|
||||
ORTE_VPID_PRINT(proc->name.vpid),
|
||||
opal_hwloc_base_print_locality(locality));
|
||||
}
|
||||
opal_output(orte_clean_output, "\t</locality>\n</map>");
|
||||
fflush(stderr);
|
||||
if (NULL != p0bitmap) {
|
||||
free(p0bitmap);
|
||||
}
|
||||
if (NULL != procbitmap) {
|
||||
free(procbitmap);
|
||||
}
|
||||
} else {
|
||||
opal_output(orte_clean_output, " Data for JOB %s offset %s", ORTE_JOBID_PRINT(jdata->jobid), ORTE_VPID_PRINT(jdata->offset));
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user