1
1

Provide a little more help in the error messages when an executable isn't found, so that we have a better idea of where we were looking for it. Don't double-report such errors. Ensure that ORTE_ERROR_NAME doesn't get a NULL back for the string name of an error code, as that might cause some systems to segfault.

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Ralph Castain 2017-03-17 09:54:37 -07:00
родитель 45b46dc446
Коммит dc85e7fde7
7 изменённых файлов: 33 добавлений и 33 удалений

Просмотреть файл

@ -6,6 +6,7 @@
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2014 Research Organization for Information Science
# and Technology (RIST). All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -46,6 +47,7 @@ Will continue attempting to launch the process.
The xterm option was asked to display a rank that is larger
than the number of procs in the job:
Node: %s
Rank: %d
Num procs: %d

Просмотреть файл

@ -634,21 +634,24 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
char **env = NULL, **argv = NULL, *cmd = NULL;
int rc, i;
bool found;
orte_proc_state_t state;
/* thread-protect common values */
env = opal_argv_copy(app->env);
/* setup the pmix environment */
if (OPAL_SUCCESS != (rc = opal_pmix.server_setup_fork(&child->name, &env))) {
ORTE_ERROR_LOG(rc);
goto errorout;
}
/* ensure we clear any prior info regarding state or exit status in
* case this is a restart
*/
child->exit_code = 0;
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_WAITPID);
/* setup the pmix environment */
if (OPAL_SUCCESS != (rc = opal_pmix.server_setup_fork(&child->name, &env))) {
ORTE_ERROR_LOG(rc);
state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
goto errorout;
}
/* if we are not forwarding output for this job, then
* flag iof as complete
*/
@ -693,8 +696,9 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
/* can't be done! */
orte_show_help("help-orte-odls-base.txt",
"orte-odls-base:xterm-rank-out-of-bounds",
true, nm->name.vpid, jobdat->num_procs);
child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
true, orte_process_info.nodename,
nm->name.vpid, jobdat->num_procs);
state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
goto errorout;
}
}
@ -717,7 +721,7 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
orte_show_help("help-orte-odls-base.txt",
"orte-odls-base:fork-agent-not-found",
true, orte_process_info.nodename, orte_fork_agent[0]);
child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
goto errorout;
}
} else {
@ -730,7 +734,7 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
*/
if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app, &env))) {
ORTE_ERROR_LOG(rc);
child->exit_code = rc;
state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
goto errorout;
}
@ -754,17 +758,8 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
}
if (ORTE_SUCCESS != (rc = cd->fork_local(child, cmd, argv, env, jobdat, cd->opts))) {
child->exit_code = rc; /* error message already output */
goto errorout;
}
if (ORTE_SUCCESS != rc) {
/* do NOT ERROR_LOG this error - it generates
* a message/node as most errors will be common
* across the entire cluster. Instead, we let orterun
* output a consolidated error message for us
*/
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
child->exit_code = rc; /* error message already output */
/* error message already output */
state = ORTE_PROC_STATE_FAILED_TO_START;
goto errorout;
}
@ -782,7 +777,8 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
return;
errorout:
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_START);
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
ORTE_ACTIVATE_PROC_STATE(&child->name, state);
if (NULL != env) {
opal_argv_free(env);
}

Просмотреть файл

@ -12,6 +12,7 @@
# All rights reserved.
# Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
# Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -29,6 +30,7 @@ having specified a directory for your application. Your job will now
abort.
Local host: %s
Working dir: %s
Application name: %s
Error: %s
#

Просмотреть файл

@ -328,6 +328,7 @@ static int do_child(orte_proc_t *child,
int i;
sigset_t sigs;
long fd, fdmax = sysconf(_SC_OPEN_MAX);
char dir[MAXPATHLEN];
#if HAVE_SETPGID
/* Set a new process group for this child, so that any
@ -425,9 +426,10 @@ static int do_child(orte_proc_t *child,
/* Exec the new executable */
execve(app, argv, environ_copy);
getcwd(dir, sizeof(dir));
send_error_show_help(write_fd, 1,
"help-orte-odls-default.txt", "execve error",
orte_process_info.nodename, app, strerror(errno));
orte_process_info.nodename, dir, app, strerror(errno));
/* Does not return */
}

Просмотреть файл

@ -15,7 +15,7 @@
* Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2012 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -258,8 +258,8 @@ int orte_print_aborted_job(orte_job_t *job,
default:
if (0 != proc->exit_code) {
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start", true,
orte_basename, ORTE_ERROR_NAME(proc->exit_code), node->name,
(unsigned long)proc->name.vpid);
orte_basename, proc->exit_code, ORTE_ERROR_NAME(proc->exit_code),
node->name, (unsigned long)proc->name.vpid);
} else {
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status", true,
orte_basename, node->name);

Просмотреть файл

@ -12,6 +12,7 @@
# All rights reserved.
# Copyright (c) 2007-2016 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2012 Oak Ridge National Labs. All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -296,6 +297,7 @@ while attempting to start process rank %lu.
%s was unable to start the specified application as it encountered an
error:
Error code: %d
Error name: %s
Node: %s

Просмотреть файл

@ -89,7 +89,7 @@ int orte_err2str(int errnum, const char **errmsg)
if (orte_report_silent_errors) {
retval = "Silent error";
} else {
retval = NULL;
retval = "";
}
break;
case ORTE_ERR_ADDRESSEE_UNKNOWN:
@ -174,7 +174,7 @@ int orte_err2str(int errnum, const char **errmsg)
if (orte_report_silent_errors) {
retval = "Next option";
} else {
retval = NULL;
retval = "";
}
break;
case ORTE_ERR_SENSOR_LIMIT_EXCEEDED:
@ -244,11 +244,7 @@ int orte_err2str(int errnum, const char **errmsg)
retval = "Partial success";
break;
default:
if (orte_report_silent_errors) {
retval = "Unknown error";
} else {
retval = NULL;
}
retval = "Unknown error";
}
*errmsg = retval;