1
1

Ensure we get a non-zero exit status when we can't find the specified fork agent. Output a better error message, and ensure we don't report the problem multiple times.

This commit was SVN r26191.
Этот коммит содержится в:
Ralph Castain 2012-03-24 00:49:38 +00:00
родитель 6dc44dc4b8
Коммит ca3ff58c76
3 изменённых файлов: 140 добавлений и 125 удалений

Просмотреть файл

@ -73,3 +73,11 @@ This is an error; your job will now abort.
Local host: %s Local host: %s
Application name: %s Application name: %s
Action requested: %s %s Action requested: %s %s
#
[orte-odls-base:fork-agent-not-found]
The specified fork agent was not found:
Node: %s
Fork agent: %s
The application cannot be launched.

Просмотреть файл

@ -1529,6 +1529,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
"orte-odls-base:xterm-rank-out-of-bounds", "orte-odls-base:xterm-rank-out-of-bounds",
true, nm->name.vpid, jobdat->num_procs); true, nm->name.vpid, jobdat->num_procs);
rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS; rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
child->exit_code = ORTE_ERR_SILENT;
goto CLEANUP; goto CLEANUP;
} }
@ -1551,8 +1552,11 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
free(app->app); free(app->app);
app->app = opal_path_findv(orte_fork_agent[0], X_OK, orte_launch_environ, NULL); app->app = opal_path_findv(orte_fork_agent[0], X_OK, orte_launch_environ, NULL);
if (NULL == app->app) { if (NULL == app->app) {
opal_output(0, "%s CANNOT FIND FORK AGENT %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_fork_agent[0]); orte_show_help("help-orte-odls-base.txt",
rc = ORTE_ERR_NOT_FOUND; "orte-odls-base:fork-agent-not-found",
true, orte_process_info.nodename, orte_fork_agent[0]);
rc = ORTE_ERR_SILENT;
child->exit_code = ORTE_ERR_SILENT;
goto CLEANUP; goto CLEANUP;
} }
} }

Просмотреть файл

@ -231,121 +231,124 @@ static void dump_aborted_procs(void)
return; return;
} }
switch (proc->exit_code) { switch (proc->exit_code) {
case ORTE_ERR_SYS_LIMITS_PIPES: case ORTE_ERR_SILENT:
orte_show_help("help-orterun.txt", "orterun:sys-limit-pipe", true, /* say nothing - it was already reported */
orte_basename, proc->node->name, break;
(unsigned long)proc->name.vpid); case ORTE_ERR_SYS_LIMITS_PIPES:
break; orte_show_help("help-orterun.txt", "orterun:sys-limit-pipe", true,
case ORTE_ERR_PIPE_SETUP_FAILURE: orte_basename, proc->node->name,
orte_show_help("help-orterun.txt", "orterun:pipe-setup-failure", true, (unsigned long)proc->name.vpid);
orte_basename, proc->node->name, break;
(unsigned long)proc->name.vpid); case ORTE_ERR_PIPE_SETUP_FAILURE:
break; orte_show_help("help-orterun.txt", "orterun:pipe-setup-failure", true,
case ORTE_ERR_SYS_LIMITS_CHILDREN: orte_basename, proc->node->name,
orte_show_help("help-orterun.txt", "orterun:sys-limit-children", true, (unsigned long)proc->name.vpid);
orte_basename, proc->node->name, break;
(unsigned long)proc->name.vpid); case ORTE_ERR_SYS_LIMITS_CHILDREN:
break; orte_show_help("help-orterun.txt", "orterun:sys-limit-children", true,
case ORTE_ERR_FAILED_GET_TERM_ATTRS: orte_basename, proc->node->name,
orte_show_help("help-orterun.txt", "orterun:failed-term-attrs", true, (unsigned long)proc->name.vpid);
orte_basename, proc->node->name, break;
(unsigned long)proc->name.vpid); case ORTE_ERR_FAILED_GET_TERM_ATTRS:
break; orte_show_help("help-orterun.txt", "orterun:failed-term-attrs", true,
case ORTE_ERR_WDIR_NOT_FOUND: orte_basename, proc->node->name,
orte_show_help("help-orterun.txt", "orterun:wdir-not-found", true, (unsigned long)proc->name.vpid);
orte_basename, approc->cwd, break;
proc->node->name, (unsigned long)proc->name.vpid); case ORTE_ERR_WDIR_NOT_FOUND:
break; orte_show_help("help-orterun.txt", "orterun:wdir-not-found", true,
case ORTE_ERR_EXE_NOT_FOUND: orte_basename, approc->cwd,
orte_show_help("help-orterun.txt", "orterun:exe-not-found", true, proc->node->name, (unsigned long)proc->name.vpid);
orte_basename, break;
(unsigned long)proc->name.vpid, case ORTE_ERR_EXE_NOT_FOUND:
orte_basename, orte_show_help("help-orterun.txt", "orterun:exe-not-found", true,
orte_basename, orte_basename,
proc->node->name, (unsigned long)proc->name.vpid,
approc->app); orte_basename,
break; orte_basename,
case ORTE_ERR_EXE_NOT_ACCESSIBLE: proc->node->name,
orte_show_help("help-orterun.txt", "orterun:exe-not-accessible", true, approc->app);
orte_basename, approc->app, proc->node->name, break;
(unsigned long)proc->name.vpid); case ORTE_ERR_EXE_NOT_ACCESSIBLE:
break; orte_show_help("help-orterun.txt", "orterun:exe-not-accessible", true,
case ORTE_ERR_MULTIPLE_AFFINITIES: orte_basename, approc->app, proc->node->name,
orte_show_help("help-orterun.txt", (unsigned long)proc->name.vpid);
"orterun:multiple-paffinity-schemes", true, NULL); break;
break; case ORTE_ERR_MULTIPLE_AFFINITIES:
case ORTE_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED: orte_show_help("help-orterun.txt",
orte_show_help("help-orterun.txt", "orterun:multiple-paffinity-schemes", true, NULL);
"orterun:topo-not-supported", break;
true, orte_process_info.nodename, "rankfile containing a slot_list of ", case ORTE_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED:
NULL, approc->app); orte_show_help("help-orterun.txt",
break; "orterun:topo-not-supported",
case ORTE_ERR_INVALID_NODE_RANK: true, orte_process_info.nodename, "rankfile containing a slot_list of ",
orte_show_help("help-orterun.txt", NULL, approc->app);
"orterun:invalid-node-rank", true); break;
break; case ORTE_ERR_INVALID_NODE_RANK:
case ORTE_ERR_INVALID_LOCAL_RANK: orte_show_help("help-orterun.txt",
orte_show_help("help-orterun.txt", "orterun:invalid-node-rank", true);
"orterun:invalid-local-rank", true); break;
break; case ORTE_ERR_INVALID_LOCAL_RANK:
case ORTE_ERR_NOT_ENOUGH_CORES: orte_show_help("help-orterun.txt",
orte_show_help("help-orterun.txt", "orterun:invalid-local-rank", true);
"orterun:not-enough-resources", true, break;
"sockets", node->name, case ORTE_ERR_NOT_ENOUGH_CORES:
"bind-to-core", approc->app); orte_show_help("help-orterun.txt",
break; "orterun:not-enough-resources", true,
case ORTE_ERR_TOPO_CORE_NOT_SUPPORTED: "sockets", node->name,
orte_show_help("help-orterun.txt", "bind-to-core", approc->app);
"orterun:topo-not-supported", break;
true, node->name, "bind-to-core", "", case ORTE_ERR_TOPO_CORE_NOT_SUPPORTED:
approc->app); orte_show_help("help-orterun.txt",
break; "orterun:topo-not-supported",
case ORTE_ERR_INVALID_PHYS_CPU: true, node->name, "bind-to-core", "",
orte_show_help("help-orterun.txt", approc->app);
"orterun:invalid-phys-cpu", true); break;
break; case ORTE_ERR_INVALID_PHYS_CPU:
case ORTE_ERR_NOT_ENOUGH_SOCKETS: orte_show_help("help-orterun.txt",
orte_show_help("help-orterun.txt", "orterun:invalid-phys-cpu", true);
"orterun:not-enough-resources", true, break;
"sockets", node->name, case ORTE_ERR_NOT_ENOUGH_SOCKETS:
"bind-to-socket", approc->app); orte_show_help("help-orterun.txt",
break; "orterun:not-enough-resources", true,
case ORTE_ERR_TOPO_SOCKET_NOT_SUPPORTED: "sockets", node->name,
orte_show_help("help-orterun.txt", "bind-to-socket", approc->app);
"orterun:topo-not-supported", break;
true, node->name, "bind-to-socket", "", case ORTE_ERR_TOPO_SOCKET_NOT_SUPPORTED:
approc->app); orte_show_help("help-orterun.txt",
break; "orterun:topo-not-supported",
case ORTE_ERR_MODULE_NOT_FOUND: true, node->name, "bind-to-socket", "",
orte_show_help("help-orterun.txt", approc->app);
"orterun:paffinity-missing-module", break;
true, node->name); case ORTE_ERR_MODULE_NOT_FOUND:
break; orte_show_help("help-orterun.txt",
case ORTE_ERR_SLOT_LIST_RANGE: "orterun:paffinity-missing-module",
orte_show_help("help-orterun.txt", true, node->name);
"orterun:invalid-slot-list-range", break;
true, node->name, NULL); case ORTE_ERR_SLOT_LIST_RANGE:
break; orte_show_help("help-orterun.txt",
case ORTE_ERR_PIPE_READ_FAILURE: "orterun:invalid-slot-list-range",
orte_show_help("help-orterun.txt", "orterun:pipe-read-failure", true, true, node->name, NULL);
orte_basename, node->name, (unsigned long)proc->name.vpid); break;
break; case ORTE_ERR_PIPE_READ_FAILURE:
case ORTE_ERR_SOCKET_NOT_AVAILABLE: orte_show_help("help-orterun.txt", "orterun:pipe-read-failure", true,
orte_show_help("help-orterun.txt", "orterun:proc-socket-not-avail", true, orte_basename, node->name, (unsigned long)proc->name.vpid);
break;
case ORTE_ERR_SOCKET_NOT_AVAILABLE:
orte_show_help("help-orterun.txt", "orterun:proc-socket-not-avail", true,
orte_basename, ORTE_ERROR_NAME(proc->exit_code), node->name,
(unsigned long)proc->name.vpid);
break;
default:
if (0 != proc->exit_code) {
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start", true,
orte_basename, ORTE_ERROR_NAME(proc->exit_code), node->name, orte_basename, ORTE_ERROR_NAME(proc->exit_code), node->name,
(unsigned long)proc->name.vpid); (unsigned long)proc->name.vpid);
break; } else {
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status", true,
default: orte_basename, node->name);
if (0 != proc->exit_code) { }
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start", true, break;
orte_basename, ORTE_ERROR_NAME(proc->exit_code), node->name,
(unsigned long)proc->name.vpid);
} else {
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status", true,
orte_basename, node->name);
}
break;
} }
} else if (ORTE_JOB_STATE_ABORTED == job->state) { } else if (ORTE_JOB_STATE_ABORTED == job->state) {
if (NULL == proc) { if (NULL == proc) {
@ -391,17 +394,17 @@ static void dump_aborted_procs(void)
ORTE_NAME_PRINT(&proc->name), node->name); ORTE_NAME_PRINT(&proc->name), node->name);
} else if (ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED == job->state) { } else if (ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED == job->state) {
switch (proc->exit_code) { switch (proc->exit_code) {
case ORTE_ERR_MEM_LIMIT_EXCEEDED: case ORTE_ERR_MEM_LIMIT_EXCEEDED:
orte_show_help("help-orterun.txt", "orterun:proc-mem-exceeded", true, orte_show_help("help-orterun.txt", "orterun:proc-mem-exceeded", true,
ORTE_NAME_PRINT(&proc->name), node->name); ORTE_NAME_PRINT(&proc->name), node->name);
break; break;
case ORTE_ERR_PROC_STALLED: case ORTE_ERR_PROC_STALLED:
orte_show_help("help-orterun.txt", "orterun:proc-stalled", true); orte_show_help("help-orterun.txt", "orterun:proc-stalled", true);
break; break;
default: default:
orte_show_help("help-orterun.txt", "orterun:proc-sensor-exceeded", true); orte_show_help("help-orterun.txt", "orterun:proc-sensor-exceeded", true);
break; break;
} }
} else if (ORTE_JOB_STATE_CALLED_ABORT == job->state) { } else if (ORTE_JOB_STATE_CALLED_ABORT == job->state) {
orte_show_help("help-orterun.txt", "orterun:proc-called-abort", true, orte_show_help("help-orterun.txt", "orterun:proc-called-abort", true,