diff --git a/orte/mca/odls/base/help-orte-odls-base.txt b/orte/mca/odls/base/help-orte-odls-base.txt index cde63e5cfd..29c83dbb1b 100644 --- a/orte/mca/odls/base/help-orte-odls-base.txt +++ b/orte/mca/odls/base/help-orte-odls-base.txt @@ -6,6 +6,7 @@ # Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2014 Research Organization for Information Science # and Technology (RIST). All rights reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -46,6 +47,7 @@ Will continue attempting to launch the process. The xterm option was asked to display a rank that is larger than the number of procs in the job: +Node: %s Rank: %d Num procs: %d diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index 4f6ad2c95c..93c7c58a85 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -634,21 +634,24 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata) char **env = NULL, **argv = NULL, *cmd = NULL; int rc, i; bool found; + orte_proc_state_t state; /* thread-protect common values */ env = opal_argv_copy(app->env); - /* setup the pmix environment */ - if (OPAL_SUCCESS != (rc = opal_pmix.server_setup_fork(&child->name, &env))) { - ORTE_ERROR_LOG(rc); - goto errorout; - } - /* ensure we clear any prior info regarding state or exit status in * case this is a restart */ child->exit_code = 0; ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_WAITPID); + + /* setup the pmix environment */ + if (OPAL_SUCCESS != (rc = opal_pmix.server_setup_fork(&child->name, &env))) { + ORTE_ERROR_LOG(rc); + state = ORTE_PROC_STATE_FAILED_TO_LAUNCH; + goto errorout; + } + /* if we are not forwarding output for this job, then * flag iof as complete */ @@ -693,8 +696,9 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata) /* can't be done! */ orte_show_help("help-orte-odls-base.txt", "orte-odls-base:xterm-rank-out-of-bounds", - true, nm->name.vpid, jobdat->num_procs); - child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH; + true, orte_process_info.nodename, + nm->name.vpid, jobdat->num_procs); + state = ORTE_PROC_STATE_FAILED_TO_LAUNCH; goto errorout; } } @@ -717,7 +721,7 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata) orte_show_help("help-orte-odls-base.txt", "orte-odls-base:fork-agent-not-found", true, orte_process_info.nodename, orte_fork_agent[0]); - child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH; + state = ORTE_PROC_STATE_FAILED_TO_LAUNCH; goto errorout; } } else { @@ -730,7 +734,7 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata) */ if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app, &env))) { ORTE_ERROR_LOG(rc); - child->exit_code = rc; + state = ORTE_PROC_STATE_FAILED_TO_LAUNCH; goto errorout; } @@ -754,17 +758,8 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata) } if (ORTE_SUCCESS != (rc = cd->fork_local(child, cmd, argv, env, jobdat, cd->opts))) { - child->exit_code = rc; /* error message already output */ - goto errorout; - } - if (ORTE_SUCCESS != rc) { - /* do NOT ERROR_LOG this error - it generates - * a message/node as most errors will be common - * across the entire cluster. Instead, we let orterun - * output a consolidated error message for us - */ - ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE); - child->exit_code = rc; /* error message already output */ + /* error message already output */ + state = ORTE_PROC_STATE_FAILED_TO_START; goto errorout; } @@ -782,7 +777,8 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata) return; errorout: - ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_START); + ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE); + ORTE_ACTIVATE_PROC_STATE(&child->name, state); if (NULL != env) { opal_argv_free(env); } diff --git a/orte/mca/odls/default/help-orte-odls-default.txt b/orte/mca/odls/default/help-orte-odls-default.txt index 0e5d526e13..06181b7c96 100644 --- a/orte/mca/odls/default/help-orte-odls-default.txt +++ b/orte/mca/odls/default/help-orte-odls-default.txt @@ -12,6 +12,7 @@ # All rights reserved. # Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. # Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -29,6 +30,7 @@ having specified a directory for your application. Your job will now abort. Local host: %s + Working dir: %s Application name: %s Error: %s # diff --git a/orte/mca/odls/default/odls_default_module.c b/orte/mca/odls/default/odls_default_module.c index 0e1683e1c9..ecdbb41fe5 100644 --- a/orte/mca/odls/default/odls_default_module.c +++ b/orte/mca/odls/default/odls_default_module.c @@ -328,6 +328,7 @@ static int do_child(orte_proc_t *child, int i; sigset_t sigs; long fd, fdmax = sysconf(_SC_OPEN_MAX); + char dir[MAXPATHLEN]; #if HAVE_SETPGID /* Set a new process group for this child, so that any @@ -425,9 +426,10 @@ static int do_child(orte_proc_t *child, /* Exec the new executable */ execve(app, argv, environ_copy); + getcwd(dir, sizeof(dir)); send_error_show_help(write_fd, 1, "help-orte-odls-default.txt", "execve error", - orte_process_info.nodename, app, strerror(errno)); + orte_process_info.nodename, dir, app, strerror(errno)); /* Does not return */ } diff --git a/orte/runtime/orte_quit.c b/orte/runtime/orte_quit.c index ca383ac71d..240ce9dbd2 100644 --- a/orte/runtime/orte_quit.c +++ b/orte/runtime/orte_quit.c @@ -15,7 +15,7 @@ * Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2012 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -258,8 +258,8 @@ int orte_print_aborted_job(orte_job_t *job, default: if (0 != proc->exit_code) { orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start", true, - orte_basename, ORTE_ERROR_NAME(proc->exit_code), node->name, - (unsigned long)proc->name.vpid); + orte_basename, proc->exit_code, ORTE_ERROR_NAME(proc->exit_code), + node->name, (unsigned long)proc->name.vpid); } else { orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status", true, orte_basename, node->name); diff --git a/orte/tools/orterun/help-orterun.txt b/orte/tools/orterun/help-orterun.txt index c7aca563d2..ff49f2e786 100644 --- a/orte/tools/orterun/help-orterun.txt +++ b/orte/tools/orterun/help-orterun.txt @@ -12,6 +12,7 @@ # All rights reserved. # Copyright (c) 2007-2016 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2012 Oak Ridge National Labs. All rights reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -296,6 +297,7 @@ while attempting to start process rank %lu. %s was unable to start the specified application as it encountered an error: +Error code: %d Error name: %s Node: %s diff --git a/orte/util/error_strings.c b/orte/util/error_strings.c index 3e9c2239b5..801373cb66 100644 --- a/orte/util/error_strings.c +++ b/orte/util/error_strings.c @@ -89,7 +89,7 @@ int orte_err2str(int errnum, const char **errmsg) if (orte_report_silent_errors) { retval = "Silent error"; } else { - retval = NULL; + retval = ""; } break; case ORTE_ERR_ADDRESSEE_UNKNOWN: @@ -174,7 +174,7 @@ int orte_err2str(int errnum, const char **errmsg) if (orte_report_silent_errors) { retval = "Next option"; } else { - retval = NULL; + retval = ""; } break; case ORTE_ERR_SENSOR_LIMIT_EXCEEDED: @@ -244,11 +244,7 @@ int orte_err2str(int errnum, const char **errmsg) retval = "Partial success"; break; default: - if (orte_report_silent_errors) { - retval = "Unknown error"; - } else { - retval = NULL; - } + retval = "Unknown error"; } *errmsg = retval;