Ensure we get a non-zero exit status when we can't find the specified fork agent. Output a better error message, and ensure the problem is not reported more than once.
This commit was SVN r26191.
Этот коммит содержится в:
родитель
6dc44dc4b8
Коммит
ca3ff58c76
@ -73,3 +73,11 @@ This is an error; your job will now abort.
|
|||||||
Local host: %s
|
Local host: %s
|
||||||
Application name: %s
|
Application name: %s
|
||||||
Action requested: %s %s
|
Action requested: %s %s
|
||||||
|
#
|
||||||
|
[orte-odls-base:fork-agent-not-found]
|
||||||
|
The specified fork agent was not found:
|
||||||
|
|
||||||
|
Node: %s
|
||||||
|
Fork agent: %s
|
||||||
|
|
||||||
|
The application cannot be launched.
|
||||||
|
@ -1529,6 +1529,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
|||||||
"orte-odls-base:xterm-rank-out-of-bounds",
|
"orte-odls-base:xterm-rank-out-of-bounds",
|
||||||
true, nm->name.vpid, jobdat->num_procs);
|
true, nm->name.vpid, jobdat->num_procs);
|
||||||
rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
|
rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
|
||||||
|
child->exit_code = ORTE_ERR_SILENT;
|
||||||
goto CLEANUP;
|
goto CLEANUP;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1551,8 +1552,11 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
|||||||
free(app->app);
|
free(app->app);
|
||||||
app->app = opal_path_findv(orte_fork_agent[0], X_OK, orte_launch_environ, NULL);
|
app->app = opal_path_findv(orte_fork_agent[0], X_OK, orte_launch_environ, NULL);
|
||||||
if (NULL == app->app) {
|
if (NULL == app->app) {
|
||||||
opal_output(0, "%s CANNOT FIND FORK AGENT %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_fork_agent[0]);
|
orte_show_help("help-orte-odls-base.txt",
|
||||||
rc = ORTE_ERR_NOT_FOUND;
|
"orte-odls-base:fork-agent-not-found",
|
||||||
|
true, orte_process_info.nodename, orte_fork_agent[0]);
|
||||||
|
rc = ORTE_ERR_SILENT;
|
||||||
|
child->exit_code = ORTE_ERR_SILENT;
|
||||||
goto CLEANUP;
|
goto CLEANUP;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -231,121 +231,124 @@ static void dump_aborted_procs(void)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
switch (proc->exit_code) {
|
switch (proc->exit_code) {
|
||||||
case ORTE_ERR_SYS_LIMITS_PIPES:
|
case ORTE_ERR_SILENT:
|
||||||
orte_show_help("help-orterun.txt", "orterun:sys-limit-pipe", true,
|
/* say nothing - it was already reported */
|
||||||
orte_basename, proc->node->name,
|
break;
|
||||||
(unsigned long)proc->name.vpid);
|
case ORTE_ERR_SYS_LIMITS_PIPES:
|
||||||
break;
|
orte_show_help("help-orterun.txt", "orterun:sys-limit-pipe", true,
|
||||||
case ORTE_ERR_PIPE_SETUP_FAILURE:
|
orte_basename, proc->node->name,
|
||||||
orte_show_help("help-orterun.txt", "orterun:pipe-setup-failure", true,
|
(unsigned long)proc->name.vpid);
|
||||||
orte_basename, proc->node->name,
|
break;
|
||||||
(unsigned long)proc->name.vpid);
|
case ORTE_ERR_PIPE_SETUP_FAILURE:
|
||||||
break;
|
orte_show_help("help-orterun.txt", "orterun:pipe-setup-failure", true,
|
||||||
case ORTE_ERR_SYS_LIMITS_CHILDREN:
|
orte_basename, proc->node->name,
|
||||||
orte_show_help("help-orterun.txt", "orterun:sys-limit-children", true,
|
(unsigned long)proc->name.vpid);
|
||||||
orte_basename, proc->node->name,
|
break;
|
||||||
(unsigned long)proc->name.vpid);
|
case ORTE_ERR_SYS_LIMITS_CHILDREN:
|
||||||
break;
|
orte_show_help("help-orterun.txt", "orterun:sys-limit-children", true,
|
||||||
case ORTE_ERR_FAILED_GET_TERM_ATTRS:
|
orte_basename, proc->node->name,
|
||||||
orte_show_help("help-orterun.txt", "orterun:failed-term-attrs", true,
|
(unsigned long)proc->name.vpid);
|
||||||
orte_basename, proc->node->name,
|
break;
|
||||||
(unsigned long)proc->name.vpid);
|
case ORTE_ERR_FAILED_GET_TERM_ATTRS:
|
||||||
break;
|
orte_show_help("help-orterun.txt", "orterun:failed-term-attrs", true,
|
||||||
case ORTE_ERR_WDIR_NOT_FOUND:
|
orte_basename, proc->node->name,
|
||||||
orte_show_help("help-orterun.txt", "orterun:wdir-not-found", true,
|
(unsigned long)proc->name.vpid);
|
||||||
orte_basename, approc->cwd,
|
break;
|
||||||
proc->node->name, (unsigned long)proc->name.vpid);
|
case ORTE_ERR_WDIR_NOT_FOUND:
|
||||||
break;
|
orte_show_help("help-orterun.txt", "orterun:wdir-not-found", true,
|
||||||
case ORTE_ERR_EXE_NOT_FOUND:
|
orte_basename, approc->cwd,
|
||||||
orte_show_help("help-orterun.txt", "orterun:exe-not-found", true,
|
proc->node->name, (unsigned long)proc->name.vpid);
|
||||||
orte_basename,
|
break;
|
||||||
(unsigned long)proc->name.vpid,
|
case ORTE_ERR_EXE_NOT_FOUND:
|
||||||
orte_basename,
|
orte_show_help("help-orterun.txt", "orterun:exe-not-found", true,
|
||||||
orte_basename,
|
orte_basename,
|
||||||
proc->node->name,
|
(unsigned long)proc->name.vpid,
|
||||||
approc->app);
|
orte_basename,
|
||||||
break;
|
orte_basename,
|
||||||
case ORTE_ERR_EXE_NOT_ACCESSIBLE:
|
proc->node->name,
|
||||||
orte_show_help("help-orterun.txt", "orterun:exe-not-accessible", true,
|
approc->app);
|
||||||
orte_basename, approc->app, proc->node->name,
|
break;
|
||||||
(unsigned long)proc->name.vpid);
|
case ORTE_ERR_EXE_NOT_ACCESSIBLE:
|
||||||
break;
|
orte_show_help("help-orterun.txt", "orterun:exe-not-accessible", true,
|
||||||
case ORTE_ERR_MULTIPLE_AFFINITIES:
|
orte_basename, approc->app, proc->node->name,
|
||||||
orte_show_help("help-orterun.txt",
|
(unsigned long)proc->name.vpid);
|
||||||
"orterun:multiple-paffinity-schemes", true, NULL);
|
break;
|
||||||
break;
|
case ORTE_ERR_MULTIPLE_AFFINITIES:
|
||||||
case ORTE_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED:
|
orte_show_help("help-orterun.txt",
|
||||||
orte_show_help("help-orterun.txt",
|
"orterun:multiple-paffinity-schemes", true, NULL);
|
||||||
"orterun:topo-not-supported",
|
break;
|
||||||
true, orte_process_info.nodename, "rankfile containing a slot_list of ",
|
case ORTE_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED:
|
||||||
NULL, approc->app);
|
orte_show_help("help-orterun.txt",
|
||||||
break;
|
"orterun:topo-not-supported",
|
||||||
case ORTE_ERR_INVALID_NODE_RANK:
|
true, orte_process_info.nodename, "rankfile containing a slot_list of ",
|
||||||
orte_show_help("help-orterun.txt",
|
NULL, approc->app);
|
||||||
"orterun:invalid-node-rank", true);
|
break;
|
||||||
break;
|
case ORTE_ERR_INVALID_NODE_RANK:
|
||||||
case ORTE_ERR_INVALID_LOCAL_RANK:
|
orte_show_help("help-orterun.txt",
|
||||||
orte_show_help("help-orterun.txt",
|
"orterun:invalid-node-rank", true);
|
||||||
"orterun:invalid-local-rank", true);
|
break;
|
||||||
break;
|
case ORTE_ERR_INVALID_LOCAL_RANK:
|
||||||
case ORTE_ERR_NOT_ENOUGH_CORES:
|
orte_show_help("help-orterun.txt",
|
||||||
orte_show_help("help-orterun.txt",
|
"orterun:invalid-local-rank", true);
|
||||||
"orterun:not-enough-resources", true,
|
break;
|
||||||
"sockets", node->name,
|
case ORTE_ERR_NOT_ENOUGH_CORES:
|
||||||
"bind-to-core", approc->app);
|
orte_show_help("help-orterun.txt",
|
||||||
break;
|
"orterun:not-enough-resources", true,
|
||||||
case ORTE_ERR_TOPO_CORE_NOT_SUPPORTED:
|
"sockets", node->name,
|
||||||
orte_show_help("help-orterun.txt",
|
"bind-to-core", approc->app);
|
||||||
"orterun:topo-not-supported",
|
break;
|
||||||
true, node->name, "bind-to-core", "",
|
case ORTE_ERR_TOPO_CORE_NOT_SUPPORTED:
|
||||||
approc->app);
|
orte_show_help("help-orterun.txt",
|
||||||
break;
|
"orterun:topo-not-supported",
|
||||||
case ORTE_ERR_INVALID_PHYS_CPU:
|
true, node->name, "bind-to-core", "",
|
||||||
orte_show_help("help-orterun.txt",
|
approc->app);
|
||||||
"orterun:invalid-phys-cpu", true);
|
break;
|
||||||
break;
|
case ORTE_ERR_INVALID_PHYS_CPU:
|
||||||
case ORTE_ERR_NOT_ENOUGH_SOCKETS:
|
orte_show_help("help-orterun.txt",
|
||||||
orte_show_help("help-orterun.txt",
|
"orterun:invalid-phys-cpu", true);
|
||||||
"orterun:not-enough-resources", true,
|
break;
|
||||||
"sockets", node->name,
|
case ORTE_ERR_NOT_ENOUGH_SOCKETS:
|
||||||
"bind-to-socket", approc->app);
|
orte_show_help("help-orterun.txt",
|
||||||
break;
|
"orterun:not-enough-resources", true,
|
||||||
case ORTE_ERR_TOPO_SOCKET_NOT_SUPPORTED:
|
"sockets", node->name,
|
||||||
orte_show_help("help-orterun.txt",
|
"bind-to-socket", approc->app);
|
||||||
"orterun:topo-not-supported",
|
break;
|
||||||
true, node->name, "bind-to-socket", "",
|
case ORTE_ERR_TOPO_SOCKET_NOT_SUPPORTED:
|
||||||
approc->app);
|
orte_show_help("help-orterun.txt",
|
||||||
break;
|
"orterun:topo-not-supported",
|
||||||
case ORTE_ERR_MODULE_NOT_FOUND:
|
true, node->name, "bind-to-socket", "",
|
||||||
orte_show_help("help-orterun.txt",
|
approc->app);
|
||||||
"orterun:paffinity-missing-module",
|
break;
|
||||||
true, node->name);
|
case ORTE_ERR_MODULE_NOT_FOUND:
|
||||||
break;
|
orte_show_help("help-orterun.txt",
|
||||||
case ORTE_ERR_SLOT_LIST_RANGE:
|
"orterun:paffinity-missing-module",
|
||||||
orte_show_help("help-orterun.txt",
|
true, node->name);
|
||||||
"orterun:invalid-slot-list-range",
|
break;
|
||||||
true, node->name, NULL);
|
case ORTE_ERR_SLOT_LIST_RANGE:
|
||||||
break;
|
orte_show_help("help-orterun.txt",
|
||||||
case ORTE_ERR_PIPE_READ_FAILURE:
|
"orterun:invalid-slot-list-range",
|
||||||
orte_show_help("help-orterun.txt", "orterun:pipe-read-failure", true,
|
true, node->name, NULL);
|
||||||
orte_basename, node->name, (unsigned long)proc->name.vpid);
|
break;
|
||||||
break;
|
case ORTE_ERR_PIPE_READ_FAILURE:
|
||||||
case ORTE_ERR_SOCKET_NOT_AVAILABLE:
|
orte_show_help("help-orterun.txt", "orterun:pipe-read-failure", true,
|
||||||
orte_show_help("help-orterun.txt", "orterun:proc-socket-not-avail", true,
|
orte_basename, node->name, (unsigned long)proc->name.vpid);
|
||||||
|
break;
|
||||||
|
case ORTE_ERR_SOCKET_NOT_AVAILABLE:
|
||||||
|
orte_show_help("help-orterun.txt", "orterun:proc-socket-not-avail", true,
|
||||||
|
orte_basename, ORTE_ERROR_NAME(proc->exit_code), node->name,
|
||||||
|
(unsigned long)proc->name.vpid);
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
if (0 != proc->exit_code) {
|
||||||
|
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start", true,
|
||||||
orte_basename, ORTE_ERROR_NAME(proc->exit_code), node->name,
|
orte_basename, ORTE_ERROR_NAME(proc->exit_code), node->name,
|
||||||
(unsigned long)proc->name.vpid);
|
(unsigned long)proc->name.vpid);
|
||||||
break;
|
} else {
|
||||||
|
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status", true,
|
||||||
default:
|
orte_basename, node->name);
|
||||||
if (0 != proc->exit_code) {
|
}
|
||||||
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start", true,
|
break;
|
||||||
orte_basename, ORTE_ERROR_NAME(proc->exit_code), node->name,
|
|
||||||
(unsigned long)proc->name.vpid);
|
|
||||||
} else {
|
|
||||||
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status", true,
|
|
||||||
orte_basename, node->name);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
} else if (ORTE_JOB_STATE_ABORTED == job->state) {
|
} else if (ORTE_JOB_STATE_ABORTED == job->state) {
|
||||||
if (NULL == proc) {
|
if (NULL == proc) {
|
||||||
@ -391,17 +394,17 @@ static void dump_aborted_procs(void)
|
|||||||
ORTE_NAME_PRINT(&proc->name), node->name);
|
ORTE_NAME_PRINT(&proc->name), node->name);
|
||||||
} else if (ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED == job->state) {
|
} else if (ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED == job->state) {
|
||||||
switch (proc->exit_code) {
|
switch (proc->exit_code) {
|
||||||
case ORTE_ERR_MEM_LIMIT_EXCEEDED:
|
case ORTE_ERR_MEM_LIMIT_EXCEEDED:
|
||||||
orte_show_help("help-orterun.txt", "orterun:proc-mem-exceeded", true,
|
orte_show_help("help-orterun.txt", "orterun:proc-mem-exceeded", true,
|
||||||
ORTE_NAME_PRINT(&proc->name), node->name);
|
ORTE_NAME_PRINT(&proc->name), node->name);
|
||||||
break;
|
break;
|
||||||
case ORTE_ERR_PROC_STALLED:
|
case ORTE_ERR_PROC_STALLED:
|
||||||
orte_show_help("help-orterun.txt", "orterun:proc-stalled", true);
|
orte_show_help("help-orterun.txt", "orterun:proc-stalled", true);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
orte_show_help("help-orterun.txt", "orterun:proc-sensor-exceeded", true);
|
orte_show_help("help-orterun.txt", "orterun:proc-sensor-exceeded", true);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
} else if (ORTE_JOB_STATE_CALLED_ABORT == job->state) {
|
} else if (ORTE_JOB_STATE_CALLED_ABORT == job->state) {
|
||||||
orte_show_help("help-orterun.txt", "orterun:proc-called-abort", true,
|
orte_show_help("help-orterun.txt", "orterun:proc-called-abort", true,
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user