* Add useful error messages to bootproxy for when it fails
* don't push pcmclient env variables to clients, since they don't have useful information in them * add check to comm_spawn code to make sure we can spawn This commit was SVN r2993.
Этот коммит содержится в:
родитель
fc0797ab9d
Коммит
96b3c503f9
@ -273,7 +273,12 @@ int ompi_comm_start_processes (char *command, char **argv, int maxprocs,
|
||||
new_jobid = ompi_name_server.create_jobid();
|
||||
|
||||
/* get the spawn handle to start spawning stuff */
|
||||
spawn_handle = ompi_rte_get_spawn_handle(OMPI_RTE_SPAWN_FROM_MPI, true);
|
||||
spawn_handle =
|
||||
ompi_rte_get_spawn_handle(OMPI_RTE_SPAWN_FROM_MPI|OMPI_RTE_SPAWN_HIGH_QOS, true);
|
||||
if (NULL == spawn_handle) {
|
||||
printf("show_help: get_spawn_handle failed\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* BWB - fix jobid, procs, and nodes */
|
||||
nodelist = ompi_rte_allocate_resources(spawn_handle, new_jobid, 0, maxprocs);
|
||||
|
@ -68,7 +68,9 @@ mca_pcm_base_build_base_env(char **in_env, int *envc, char ***out_envp)
|
||||
int ret;
|
||||
|
||||
for (i = 0 ; in_env[i] != NULL ; ++i) {
|
||||
if (0 == strncmp("OMPI_", in_env[i], strlen("OMPI_"))) {
|
||||
if ((0 == strncmp("OMPI_", in_env[i], strlen("OMPI_"))) &&
|
||||
(0 != strncmp("OMPI_MCA_pcmclient", in_env[i],
|
||||
strlen("OMPI_MCA_pcmclient")))) {
|
||||
ret = ompi_argv_append(envc, &env, in_env[i]);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
ompi_argv_free(env);
|
||||
|
@ -7,6 +7,10 @@ include $(top_srcdir)/config/Makefile.options
|
||||
|
||||
libs = $(top_builddir)/src/libmpi.la
|
||||
|
||||
EXTRA_DIST = $(pkgdata_DATA)
|
||||
|
||||
pkgdata_DATA = help-bootproxy.txt
|
||||
|
||||
bin_PROGRAMS = mca_pcm_rsh_bootproxy
|
||||
|
||||
mca_pcm_rsh_bootproxy_SOURCES = \
|
||||
|
@ -16,6 +16,7 @@
|
||||
#include "runtime/runtime.h"
|
||||
#include "mca/pcm/base/base.h"
|
||||
#include "class/ompi_list.h"
|
||||
#include "util/show_help.h"
|
||||
|
||||
|
||||
struct pid_item_t {
|
||||
@ -107,7 +108,8 @@ main(int argc, char *argv[])
|
||||
ret = mca_pcm_base_recv_schedule(stdin, &jobid, sched,
|
||||
&fork_num_procs);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
fprintf(stderr, "Failure in receiving schedule information\n");
|
||||
ompi_show_help("help-bootproxy.txt", "could-not-receive-schedule",
|
||||
true, ret);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
@ -130,7 +132,8 @@ main(int argc, char *argv[])
|
||||
if (sched->cwd != NULL) {
|
||||
ret = chdir(sched->cwd);
|
||||
if (ret != 0) {
|
||||
perror("chdir");
|
||||
ompi_show_help("help-bootproxy.txt", "could-not-chdir",
|
||||
true, sched->cwd, strerror(errno));
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
@ -142,7 +145,9 @@ main(int argc, char *argv[])
|
||||
pid = fork();
|
||||
if (pid < 0) {
|
||||
/* error :( */
|
||||
perror("fork");
|
||||
ompi_show_help("help-bootproxy.txt", "could-not-fork",
|
||||
true, sched->argv[0], strerror(errno));
|
||||
exit(errno);
|
||||
} else if (pid == 0) {
|
||||
/* child */
|
||||
|
||||
@ -158,7 +163,10 @@ main(int argc, char *argv[])
|
||||
}
|
||||
|
||||
execvp(sched->argv[0], sched->argv);
|
||||
perror("exec");
|
||||
ompi_show_help("help-bootproxy.txt", "could-not-exec",
|
||||
true, sched->argv[0], sched->cwd,
|
||||
strerror(errno));
|
||||
exit(errno);
|
||||
} else {
|
||||
/* parent */
|
||||
|
||||
|
53
src/tools/bootproxy/help-bootproxy.txt
Обычный файл
53
src/tools/bootproxy/help-bootproxy.txt
Обычный файл
@ -0,0 +1,53 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English bootproxy help file for Open MPI.
|
||||
#
|
||||
[usage]
|
||||
Usage: %s --local_offset <vpid> --global_start_vpid <vpid>
|
||||
--num_procs <num> [--high-qos]
|
||||
|
||||
--local_offset <vpid> Starting vpid from which to assign for all
|
||||
processes started by this bootproxy
|
||||
--global_start_vpid <vpid> Starting vpid for the call to
|
||||
spawn_procs that resulted in this
|
||||
bootproxy being launched.
|
||||
--num_procs <num> Total number of processes started in the
|
||||
call to spawn_procs that resulted in
|
||||
this bootproxy being launched
|
||||
--high-qos Do not exit until all processes started
|
||||
have exited. Do not close
|
||||
std{in,out,err}. If one process exits
|
||||
with non-zero status, kill the remaining
|
||||
started processes.
|
||||
[could-not-receive-schedule]
|
||||
Open MPI was unable to start a process on one or more nodes using the
|
||||
bootproxy startup mechanism. The bootproxy client was able to
|
||||
start but was not able to receive the scheduling information from the
|
||||
starting process. The error returned was %d.
|
||||
[could-not-chdir]
|
||||
Open MPI was unable to start a process on one or more nodes using the
|
||||
bootproxy startup mechanism. The bootproxy client was able to
|
||||
start and receive startup information, but was unable to set the
|
||||
current working directory of the new process.
|
||||
|
||||
Current working directory was: %s
|
||||
The error from chdir was: %s
|
||||
[could-not-fork]
|
||||
Open MPI was unable to start a process on one or more nodes using the
|
||||
bootproxy startup mechanism. The bootproxy client was able to
|
||||
start and receive startup information, but was unable to fork the new
|
||||
process.
|
||||
|
||||
I was trying to start: %s
|
||||
The error from fork was: %s
|
||||
[could-not-exec]
|
||||
Open MPI was unable to start a process on one or more nodes using the
|
||||
bootproxy startup mechanism. The bootproxy client was able to
|
||||
start and receive startup information, but was unable to exec the new
|
||||
process.
|
||||
|
||||
I was trying to start: %s
|
||||
Current working directory was: %s
|
||||
The error from exec was: %s
|
Загрузка…
Ссылка в новой задаче
Block a user