1
1

* add abillity to notify mpirun that a process has died from rms. This just

prints a warning now, since the infrastructure to do the notify doesn't
  work yet.

This commit was SVN r2879.
Этот коммит содержится в:
Brian Barrett 2004-09-29 20:40:29 +00:00
родитель 0fb5abc6ab
Коммит d3adf8d816

Просмотреть файл

@ -22,8 +22,11 @@
#include "mca/ns/base/base.h"
#include "util/argv.h"
#include "util/numtostr.h"
#include "runtime/ompi_rte_wait.h"
static void internal_wait_cb(pid_t pid, int status, void *data);
/* ok, this is fairly simple in the RMS world */
ompi_list_t *
@ -159,6 +162,12 @@ mca_pcm_rms_spawn_procs(struct mca_pcm_base_module_1_0_0_t* me,
printf("show_help: unable to record child pid\n");
kill(child, SIGKILL);
}
ret = ompi_rte_wait_cb(child, internal_wait_cb, NULL);
if (OMPI_SUCCESS != ret) {
/* BWB - show_help */
printf("show_help: unable to register callback\n");
kill(child, SIGKILL);
}
return OMPI_SUCCESS;
}
@ -216,3 +225,38 @@ mca_pcm_rms_deallocate_resources(struct mca_pcm_base_module_1_0_0_t* me,
return OMPI_SUCCESS;
}
static void
internal_wait_cb(pid_t pid, int status, void *data)
{
mca_ns_base_jobid_t jobid = 0;
mca_ns_base_vpid_t upper = 0;
mca_ns_base_vpid_t lower = 0;
mca_ns_base_vpid_t i = 0;
int ret;
char *test;
ompi_process_name_t *proc_name;
printf("pcm_rms was notified that process %d exited with status %d\n",
pid, status);
ret = mca_pcm_base_get_job_info(pid, &jobid, &lower, &upper);
if (ret != OMPI_SUCCESS) {
printf("Unfortunately, we could not find the associated job info\n");
} else {
printf(" It appears that this starter was assocated with jobid %d\n"
" vpids %d to %d\n\n",
jobid, lower, upper);
}
/* unregister all the procs */
#if 0
/* BWB - fix me when deadlock in gpr is fixed */
for (i = lower ; i <= upper ; ++i) {
test = ns_base_get_proc_name_string(ns_base_create_process_name(0, jobid, i));
ompi_registry.rte_unregister(test);
}
#endif
}