diff --git a/src/mca/pcm/rsh/Makefile.am b/src/mca/pcm/rsh/Makefile.am index 1d8afcf74a..e88686a38d 100644 --- a/src/mca/pcm/rsh/Makefile.am +++ b/src/mca/pcm/rsh/Makefile.am @@ -10,6 +10,10 @@ include $(top_ompi_srcdir)/config/Makefile.options # mca__.la (for DSO builds) or libmca__.la # (for static builds). +EXTRA_DIST = $(pkgdata_DATA) + +pkgdata_DATA = help-mca-pcm-rsh.txt + sources = \ pcm_rsh.h \ pcm_rsh_allocate.c \ diff --git a/src/mca/pcm/rsh/help-mca-pcm-rsh.txt b/src/mca/pcm/rsh/help-mca-pcm-rsh.txt new file mode 100644 index 0000000000..41a6cf3848 --- /dev/null +++ b/src/mca/pcm/rsh/help-mca-pcm-rsh.txt @@ -0,0 +1,34 @@ +# -*- text -*- +# +# $HEADER$ +# +# This is the US/English RSH PCM help file for Open MPI. +# +[spawn:rsh-failed] +Open MPI was unable to start a process on one or more nodes using the +rsh startup mechanism. The failure occurred while trying to execute +%s to start the bootproxy on %s. The full command executed was: + + %s + +It is likely that there is some output above this error message that +will give more information than appears here. The best information +available is that the error was %s. +[spawn:stderr-output] +Open MPI was unable to start a process on one or more nodes using the +rsh startup mechanism. Output was received on stderr of the startup +command (%s). This is usually a sign that an error has occurred. +The full command executed was: + + %s +[spawn:no-process-record] +Open MPI was notified that process %d exited with status %d. +Unfortunately, that process could not be found in our internal +tables. This most likely means that an error in the RSH PCM of Open +MPI has occurred. Please let the developers know how you came across +this message so that it can be fixed. +[spawn:application-send] +Open MPI was unable to start a process on one or more nodes using the +rsh startup mechanism. Open MPI was able to start the bootproxy on +the remote node, but was unable to send the application schema to the +remote bootproxy. 
The error was %s. diff --git a/src/mca/pcm/rsh/pcm_rsh_spawn.c b/src/mca/pcm/rsh/pcm_rsh_spawn.c index 7d3ad1bd8d..9c2a00ba90 100644 --- a/src/mca/pcm/rsh/pcm_rsh_spawn.c +++ b/src/mca/pcm/rsh/pcm_rsh_spawn.c @@ -33,7 +33,7 @@ #include "util/numtostr.h" #include "mca/ns/base/base.h" #include "util/proc_info.h" - +#include "util/show_help.h" /* * Internal constants @@ -240,13 +240,14 @@ internal_need_profile(mca_pcm_rsh_module_t *me, sizeof(shellpath) - 1, stderr_is_error)) { if (errno == EFAULT) { - /* BWB - show_help */ - printf("show_help: something on stderr: %s %s %s", - start_node->hostname, cmd0, printable); + ompi_show_help("help-mca-pcm-rsh.txt", + "spawn:stderr-output", true, + cmd0, start_node->hostname, printable, + strerror(errno)); } else { - /* BWB - show_help */ - printf("show_help: fail to rsh: %s %s %s", - start_node->hostname, cmd0, printable); + ompi_show_help("help-mca-pcm-rsh.txt", + "spawn:rsh-failed", true, + cmd0, printable); } ret = OMPI_ERROR; @@ -336,7 +337,7 @@ internal_spawn_proc(mca_pcm_rsh_module_t *me, /* add the start of .profile thing if required */ if (needs_profile) { - ompi_argv_append(&cmdc, &cmdv, "( ! [ -e ./.profile] || . ./.profile;"); + ompi_argv_append(&cmdc, &cmdv, "( ! [ -e ./.profile ] || . 
./.profile;"); } /* build the command to start */ @@ -405,22 +406,24 @@ internal_spawn_proc(mca_pcm_rsh_module_t *me, } else { /* parent */ - if (close(kidstdin[0])) { - kill(pid, SIGTERM); - ret = OMPI_ERROR; - goto proc_cleanup; - } + close(kidstdin[0]); /* send our stuff down the wire */ fp = fdopen(kidstdin[1], "a"); if (fp == NULL) { - /* BWB - fix me */ - perror("fdopen"); - abort(); + ompi_show_help("help-mca-pcm-rsh.txt", + "spawn:application-send", true, + strerror(errno)); + ret = OMPI_ERROR; + kill(pid, SIGTERM); + goto proc_cleanup; } ret = mca_pcm_base_send_schedule(fp, jobid, sched, start_node->count); fclose(fp); if (OMPI_SUCCESS != ret) { + ompi_show_help("help-mca-pcm-rsh.txt", + "spawn:application-send", true, + strerror(errno)); kill(pid, SIGTERM); goto proc_cleanup; } @@ -470,27 +473,26 @@ internal_wait_cb(pid_t pid, int status, void *data) mca_ns_base_vpid_t lower = 0; mca_ns_base_vpid_t i = 0; int ret; - char *test; - ompi_process_name_t *proc_name; + char *proc_name; - printf("pcm_rsh was notified that process %d exited with status %d\n", - pid, status); + ompi_output_verbose(10, mca_pcm_rsh_output, + "process %d exited with status %d", pid, status); ret = mca_pcm_base_get_job_info(pid, &jobid, &lower, &upper); if (ret != OMPI_SUCCESS) { - printf("Unfortunately, we could not find the associated job info\n"); - } else { - printf(" It appears that this starter was assocated with jobid %d\n" - " vpids %d to %d\n\n", - jobid, lower, upper); + ompi_show_help("help-mca-pcm-rsh.txt", + "spawn:no-process-record", true, pid, status); + return; } /* unregister all the procs */ for (i = lower ; i <= upper ; ++i) { - test = ns_base_get_proc_name_string(ns_base_create_process_name(0, jobid, i)); - ompi_registry.rte_unregister(test); + proc_name = + ns_base_get_proc_name_string( + ns_base_create_process_name(0, jobid, i)); + ompi_registry.rte_unregister(proc_name); } - /* bwb - fix me - should only remove this range */ + /* BWB - fix me - should only 
remove this range */ mca_pcm_base_remove_job(jobid); }