* Clean up of the rms pcm (still some to go on job killing)
- use show_help() instead of printf()
- shut up the job reaping code
- add the ability to specify partition and extra prun args (fixing bug #1023)

This commit was SVN r3212.
Этот коммит содержится в:
родитель
d23227214a
Коммит
d5431d95fd
@ -4,6 +4,10 @@
|
|||||||
|
|
||||||
include $(top_ompi_srcdir)/config/Makefile.options
|
include $(top_ompi_srcdir)/config/Makefile.options
|
||||||
|
|
||||||
|
EXTRA_DIST = $(pkgdata_DATA)
|
||||||
|
|
||||||
|
pkgdata_DATA = help-mca-pcm-rms.txt
|
||||||
|
|
||||||
noinst_LTLIBRARIES = libmca_pcm_rms.la
|
noinst_LTLIBRARIES = libmca_pcm_rms.la
|
||||||
libmca_pcm_rms_la_SOURCES = \
|
libmca_pcm_rms_la_SOURCES = \
|
||||||
pcm_rms.c \
|
pcm_rms.c \
|
||||||
|
@ -16,6 +16,7 @@
|
|||||||
#include "include/constants.h"
|
#include "include/constants.h"
|
||||||
#include "mca/pcm/pcm.h"
|
#include "mca/pcm/pcm.h"
|
||||||
#include "mca/pcm/base/base.h"
|
#include "mca/pcm/base/base.h"
|
||||||
|
#include "runtime/runtime.h"
|
||||||
#include "event/event.h"
|
#include "event/event.h"
|
||||||
#include "class/ompi_list.h"
|
#include "class/ompi_list.h"
|
||||||
#include "mca/ns/ns.h"
|
#include "mca/ns/ns.h"
|
||||||
@ -23,6 +24,7 @@
|
|||||||
#include "util/argv.h"
|
#include "util/argv.h"
|
||||||
#include "util/numtostr.h"
|
#include "util/numtostr.h"
|
||||||
#include "runtime/ompi_rte_wait.h"
|
#include "runtime/ompi_rte_wait.h"
|
||||||
|
#include "util/show_help.h"
|
||||||
|
|
||||||
|
|
||||||
static void internal_wait_cb(pid_t pid, int status, void *data);
|
static void internal_wait_cb(pid_t pid, int status, void *data);
|
||||||
@ -65,9 +67,10 @@ mca_pcm_rms_allocate_resources(struct mca_pcm_base_module_1_0_0_t* me,
|
|||||||
|
|
||||||
|
|
||||||
int
|
int
|
||||||
mca_pcm_rms_spawn_procs(struct mca_pcm_base_module_1_0_0_t* me,
|
mca_pcm_rms_spawn_procs(struct mca_pcm_base_module_1_0_0_t* me_super,
|
||||||
mca_ns_base_jobid_t jobid, ompi_list_t *schedlist)
|
mca_ns_base_jobid_t jobid, ompi_list_t *schedlist)
|
||||||
{
|
{
|
||||||
|
mca_pcm_rms_module_t *me = (mca_pcm_rms_module_t*) me_super;
|
||||||
ompi_rte_node_allocation_t *nodes;
|
ompi_rte_node_allocation_t *nodes;
|
||||||
ompi_rte_node_schedule_t *sched;
|
ompi_rte_node_schedule_t *sched;
|
||||||
char **argv = NULL;
|
char **argv = NULL;
|
||||||
@ -77,17 +80,17 @@ mca_pcm_rms_spawn_procs(struct mca_pcm_base_module_1_0_0_t* me,
|
|||||||
int ret;
|
int ret;
|
||||||
char *tmp;
|
char *tmp;
|
||||||
pid_t child;
|
pid_t child;
|
||||||
|
char **prun_args;
|
||||||
|
char *printable;
|
||||||
|
|
||||||
/* quick sanity check */
|
/* quick sanity check */
|
||||||
if (ompi_list_get_size(schedlist) > 1) {
|
if (ompi_list_get_size(schedlist) > 1) {
|
||||||
/* BWB: show_help */
|
ompi_show_help("help-mca-pcm-rms.txt", "spawn:multiple-apps", true);
|
||||||
printf("RMS pcm can not cope with multiple schedlist items at this time\n");
|
|
||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
}
|
}
|
||||||
sched = (ompi_rte_node_schedule_t*) ompi_list_get_first(schedlist);
|
sched = (ompi_rte_node_schedule_t*) ompi_list_get_first(schedlist);
|
||||||
if (ompi_list_get_size(sched->nodelist) > 1) {
|
if (ompi_list_get_size(sched->nodelist) > 1) {
|
||||||
/* BWB: show_help */
|
ompi_show_help("help-mca-pcm-rms.txt", "spawn:multiple-nodelists", true);
|
||||||
printf("RMS pcm can not cope with multiple nodelists at this time\n");
|
|
||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -113,16 +116,36 @@ mca_pcm_rms_spawn_procs(struct mca_pcm_base_module_1_0_0_t* me,
|
|||||||
free(num);
|
free(num);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (NULL != me->partition) {
|
||||||
|
ompi_argv_append(&argc, &argv, "-p");
|
||||||
|
ompi_argv_append(&argc, &argv, me->partition);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (NULL != me->prun_args) {
|
||||||
|
prun_args = ompi_argv_split(me->prun_args, ' ');
|
||||||
|
if (NULL != prun_args) {
|
||||||
|
for (i = 0 ; prun_args[i] != NULL ; ++i) {
|
||||||
|
ompi_argv_append(&argc, &argv, prun_args[i]);
|
||||||
|
}
|
||||||
|
ompi_argv_free(prun_args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* copy over the command line arguments */
|
/* copy over the command line arguments */
|
||||||
for (i = 0 ; i < sched->argc ; ++i) {
|
for (i = 0 ; i < sched->argc ; ++i) {
|
||||||
ompi_argv_append(&argc, &argv, (sched->argv)[i]);
|
ompi_argv_append(&argc, &argv, (sched->argv)[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
printable = ompi_argv_join(argv, ' ');
|
||||||
|
ompi_output_verbose(5, mca_pcm_base_output,
|
||||||
|
"attempting to execute: %s", printable);
|
||||||
|
free(printable);
|
||||||
|
|
||||||
/* ok, fork! */
|
/* ok, fork! */
|
||||||
child = fork();
|
child = fork();
|
||||||
if (child < 0) {
|
if (child < 0) {
|
||||||
/* show_help */
|
ompi_show_help("help-mca-pcm-rms.txt", "spawn:fork-failure", true,
|
||||||
printf("RMS pcm unable to fork\n");
|
strerror(errno));
|
||||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||||
} else if (0 == child) {
|
} else if (0 == child) {
|
||||||
/* set up environment */
|
/* set up environment */
|
||||||
@ -142,13 +165,15 @@ mca_pcm_rms_spawn_procs(struct mca_pcm_base_module_1_0_0_t* me,
|
|||||||
/* set cwd */
|
/* set cwd */
|
||||||
ret = chdir(sched->cwd);
|
ret = chdir(sched->cwd);
|
||||||
if (0 != ret) {
|
if (0 != ret) {
|
||||||
/* BWB show_help */
|
ompi_show_help("help-mca-pcm-rms.txt", "spawn:chdir", true,
|
||||||
printf("RMS pcm can not chdir to %s\n", sched->cwd);
|
sched->cwd, strerror(errno));
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* go, go, go! */
|
/* go, go, go! */
|
||||||
ret = execvp(argv[0], argv);
|
ret = execvp(argv[0], argv);
|
||||||
|
ompi_show_help("help-mca-pcm-rms.txt", "spawn:exec-prun", true,
|
||||||
|
argv[0], strerror(errno));
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -158,15 +183,13 @@ mca_pcm_rms_spawn_procs(struct mca_pcm_base_module_1_0_0_t* me,
|
|||||||
nodes->count :
|
nodes->count :
|
||||||
nodes->nodes * nodes->count);
|
nodes->nodes * nodes->count);
|
||||||
if (OMPI_SUCCESS != ret) {
|
if (OMPI_SUCCESS != ret) {
|
||||||
/* BWB show_help */
|
|
||||||
printf("show_help: unable to record child pid\n");
|
|
||||||
kill(child, SIGKILL);
|
kill(child, SIGKILL);
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
ret = ompi_rte_wait_cb(child, internal_wait_cb, NULL);
|
ret = ompi_rte_wait_cb(child, internal_wait_cb, NULL);
|
||||||
if (OMPI_SUCCESS != ret) {
|
if (OMPI_SUCCESS != ret) {
|
||||||
/* BWB - show_help */
|
|
||||||
printf("show_help: unable to register callback\n");
|
|
||||||
kill(child, SIGKILL);
|
kill(child, SIGKILL);
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
@ -196,8 +219,8 @@ mca_pcm_rms_kill_job(struct mca_pcm_base_module_1_0_0_t* me,
|
|||||||
mca_ns_base_jobid_t jobid, int flags)
|
mca_ns_base_jobid_t jobid, int flags)
|
||||||
{
|
{
|
||||||
pid_t *doomed;
|
pid_t *doomed;
|
||||||
size_t doomed_len;
|
size_t doomed_len, i;
|
||||||
int ret, i;
|
int ret;
|
||||||
|
|
||||||
ret = mca_pcm_base_get_started_pid_list(jobid, &doomed, &doomed_len, true);
|
ret = mca_pcm_base_get_started_pid_list(jobid, &doomed, &doomed_len, true);
|
||||||
if (OMPI_SUCCESS != ret) return ret;
|
if (OMPI_SUCCESS != ret) return ret;
|
||||||
@ -236,27 +259,25 @@ internal_wait_cb(pid_t pid, int status, void *data)
|
|||||||
mca_ns_base_vpid_t lower = 0;
|
mca_ns_base_vpid_t lower = 0;
|
||||||
mca_ns_base_vpid_t i = 0;
|
mca_ns_base_vpid_t i = 0;
|
||||||
int ret;
|
int ret;
|
||||||
char *test;
|
char *proc_name;
|
||||||
ompi_process_name_t *proc_name;
|
|
||||||
|
|
||||||
printf("pcm_rms was notified that process %d exited with status %d\n",
|
ompi_output_verbose(10, mca_pcm_base_output,
|
||||||
pid, status);
|
"process %d exited with status %d", pid, status);
|
||||||
|
|
||||||
ret = mca_pcm_base_get_job_info(pid, &jobid, &lower, &upper);
|
ret = mca_pcm_base_get_job_info(pid, &jobid, &lower, &upper);
|
||||||
if (ret != OMPI_SUCCESS) {
|
if (ret != OMPI_SUCCESS) {
|
||||||
printf("Unfortunately, we could not find the associated job info\n");
|
ompi_show_help("help-mca-pcm-rms.txt",
|
||||||
} else {
|
"spawn:no-process-record", true, pid, status);
|
||||||
printf(" It appears that this starter was assocated with jobid %d\n"
|
return;
|
||||||
" vpids %d to %d\n\n",
|
|
||||||
jobid, lower, upper);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* unregister all the procs */
|
/* unregister all the procs */
|
||||||
#if 0
|
|
||||||
/* BWB - fix me when deadlock in gpr is fixed */
|
|
||||||
for (i = lower ; i <= upper ; ++i) {
|
for (i = lower ; i <= upper ; ++i) {
|
||||||
test = ns_base_get_proc_name_string(ns_base_create_process_name(0, jobid, i));
|
proc_name = ns_base_get_proc_name_string(
|
||||||
ompi_registry.rte_unregister(test);
|
ns_base_create_process_name(0, jobid, i));
|
||||||
|
ompi_registry.rte_unregister(proc_name);
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
/* BWB - fix me - should only remove this range */
|
||||||
|
mca_pcm_base_remove_job(jobid);
|
||||||
}
|
}
|
||||||
|
@ -54,13 +54,17 @@ extern "C" {
|
|||||||
mca_ns_base_jobid_t jobid,
|
mca_ns_base_jobid_t jobid,
|
||||||
ompi_list_t *nodelist);
|
ompi_list_t *nodelist);
|
||||||
|
|
||||||
|
struct mca_pcm_rms_module_t {
|
||||||
|
mca_pcm_base_module_t super;
|
||||||
|
|
||||||
|
char *partition;
|
||||||
|
char *prun_args;
|
||||||
|
int constraints;
|
||||||
|
};
|
||||||
|
typedef struct mca_pcm_rms_module_t mca_pcm_rms_module_t;
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/*
|
|
||||||
* Module variables
|
|
||||||
*/
|
|
||||||
extern int mca_pcm_rms_output;
|
|
||||||
|
|
||||||
#endif /* MCA_PCM_RMS_H_ */
|
#endif /* MCA_PCM_RMS_H_ */
|
||||||
|
@ -50,27 +50,14 @@ mca_pcm_base_component_1_0_0_t mca_pcm_rms_component = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/* need to create output stream to dump in file */
|
|
||||||
ompi_output_stream_t mca_pcm_rms_output_stream = {
|
|
||||||
false, /* lds_is_debugging BWB - change me for release */
|
|
||||||
0, /* lds_verbose_level */
|
|
||||||
false, /* lds_want_syslog */
|
|
||||||
0, /* lds_syslog_priority */
|
|
||||||
NULL, /* lds_syslog_ident */
|
|
||||||
"pcm: rms: ", /* lds_prefix */
|
|
||||||
true, /* lds_want_stdout */
|
|
||||||
false, /* lds_want_stderr */
|
|
||||||
true, /* lds_want_file */
|
|
||||||
true, /* lds_want_file_append */
|
|
||||||
"pcm_rms" /* lds_file_suffix */
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Module variables handles
|
* Module variables handles
|
||||||
*/
|
*/
|
||||||
static int mca_pcm_rms_param_priority;
|
static int mca_pcm_rms_param_priority;
|
||||||
static int mca_pcm_rms_param_debug;
|
static int mca_pcm_rms_param_prun_args;
|
||||||
|
static int mca_pcm_rms_param_partition;
|
||||||
|
static int mca_pcm_rms_param_prun_args;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Component variables. All of these are shared among the module
|
* Component variables. All of these are shared among the module
|
||||||
@ -83,13 +70,14 @@ int mca_pcm_rms_output = -1;
|
|||||||
int
|
int
|
||||||
mca_pcm_rms_component_open(void)
|
mca_pcm_rms_component_open(void)
|
||||||
{
|
{
|
||||||
mca_pcm_rms_param_debug =
|
|
||||||
mca_base_param_register_int("pcm", "rms", "debug", NULL, 100);
|
|
||||||
|
|
||||||
mca_pcm_rms_param_priority =
|
mca_pcm_rms_param_priority =
|
||||||
mca_base_param_register_int("pcm", "rms", "priority", NULL, 5);
|
mca_base_param_register_int("pcm", "rms", "priority", NULL, 5);
|
||||||
|
|
||||||
mca_pcm_rms_output = ompi_output_open(&mca_pcm_rms_output_stream);
|
mca_pcm_rms_param_partition =
|
||||||
|
mca_base_param_register_string("pcm", "rms", "patition", NULL, NULL);
|
||||||
|
|
||||||
|
mca_pcm_rms_param_prun_args =
|
||||||
|
mca_base_param_register_string("pcm", "rms", "prun_args", NULL, NULL);
|
||||||
|
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
@ -107,14 +95,9 @@ mca_pcm_rms_init(int *priority,
|
|||||||
bool have_threads,
|
bool have_threads,
|
||||||
int constraints)
|
int constraints)
|
||||||
{
|
{
|
||||||
int debug;
|
|
||||||
char *prun;
|
char *prun;
|
||||||
int num_cpus;
|
int num_cpus;
|
||||||
mca_pcm_base_module_t *me;
|
mca_pcm_rms_module_t *me;
|
||||||
|
|
||||||
/* debugging gorp */
|
|
||||||
mca_base_param_lookup_int(mca_pcm_rms_param_debug, &debug);
|
|
||||||
ompi_output_set_verbosity(mca_pcm_rms_output, debug);
|
|
||||||
|
|
||||||
/* get our priority - if 0, we don't run */
|
/* get our priority - if 0, we don't run */
|
||||||
mca_base_param_lookup_int(mca_pcm_rms_param_priority, priority);
|
mca_base_param_lookup_int(mca_pcm_rms_param_priority, priority);
|
||||||
@ -127,7 +110,6 @@ mca_pcm_rms_init(int *priority,
|
|||||||
if (0 != (constraints & OMPI_RTE_SPAWN_FROM_MPI)) return NULL;
|
if (0 != (constraints & OMPI_RTE_SPAWN_FROM_MPI)) return NULL;
|
||||||
|
|
||||||
/* see if we are an RMS system */
|
/* see if we are an RMS system */
|
||||||
/* BWB - is there a better way to do this */
|
|
||||||
num_cpus = rms_numCpus(NULL);
|
num_cpus = rms_numCpus(NULL);
|
||||||
if (num_cpus <= 0) return NULL;
|
if (num_cpus <= 0) return NULL;
|
||||||
|
|
||||||
@ -136,26 +118,37 @@ mca_pcm_rms_init(int *priority,
|
|||||||
free(prun);
|
free(prun);
|
||||||
|
|
||||||
/* ok, now let's try to fire up */
|
/* ok, now let's try to fire up */
|
||||||
me = malloc(sizeof(mca_pcm_base_module_t));
|
me = malloc(sizeof(mca_pcm_rms_module_t));
|
||||||
if (NULL == me) return NULL;
|
if (NULL == me) return NULL;
|
||||||
|
|
||||||
me->pcm_allocate_resources = mca_pcm_rms_allocate_resources;
|
me->super.pcm_allocate_resources = mca_pcm_rms_allocate_resources;
|
||||||
me->pcm_spawn_procs = mca_pcm_rms_spawn_procs;
|
me->super.pcm_spawn_procs = mca_pcm_rms_spawn_procs;
|
||||||
me->pcm_kill_proc = mca_pcm_rms_kill_proc;
|
me->super.pcm_kill_proc = mca_pcm_rms_kill_proc;
|
||||||
me->pcm_kill_job = mca_pcm_rms_kill_job;
|
me->super.pcm_kill_job = mca_pcm_rms_kill_job;
|
||||||
me->pcm_deallocate_resources = mca_pcm_rms_deallocate_resources;
|
me->super.pcm_deallocate_resources = mca_pcm_rms_deallocate_resources;
|
||||||
me->pcm_finalize = mca_pcm_rms_finalize;
|
me->super.pcm_finalize = mca_pcm_rms_finalize;
|
||||||
|
|
||||||
|
mca_base_param_lookup_string(mca_pcm_rms_param_partition,
|
||||||
|
&(me->partition));
|
||||||
|
|
||||||
|
mca_base_param_lookup_string(mca_pcm_rms_param_prun_args,
|
||||||
|
&(me->prun_args));
|
||||||
|
|
||||||
return me;
|
return me;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
int
|
int
|
||||||
mca_pcm_rms_finalize(struct mca_pcm_base_module_1_0_0_t* me)
|
mca_pcm_rms_finalize(struct mca_pcm_base_module_1_0_0_t* me_super)
|
||||||
{
|
{
|
||||||
if (mca_pcm_rms_output > 0) {
|
mca_pcm_rms_module_t *me = (mca_pcm_rms_module_t*) me_super;
|
||||||
ompi_output_close(mca_pcm_rms_output);
|
|
||||||
}
|
if (NULL == me) return OMPI_ERR_BAD_PARAM;
|
||||||
|
|
||||||
|
if (NULL != me->partition) free(me->partition);
|
||||||
|
if (NULL != me->prun_args) free(me->prun_args);
|
||||||
|
|
||||||
|
free(me);
|
||||||
|
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user