* Make sure that the env pcmclient does something rational even if the
  vpid_start variable isn't set
* "working" mpirun with rsh pcm.  Still no job monitoring or cleanup (as
  those aren't really available for rsh - cleanup will mostly work in the
  long term)
* minor fixes for pcm_base_comm functions to properly deal with empty
  environments

This commit was SVN r2250.
This commit is contained in:
parent b64dc67d7a
commit b7aeaae3a3
@@ -1 +1 @@
-localhost
+localhost count=200
@@ -40,11 +40,15 @@ mca_pcm_base_send_schedule(FILE *fp,
     }
 
     /* ENV - since we don't have a envc, must create ourselves...*/
-    for (envc = 0 ; (sched->env)[envc] != NULL ; ++envc) ;
-    fprintf(fp, "%d\n", envc);
-    for (i = 0 ; i < envc ; ++i) {
-        fprintf(fp, "%d %s\n", (int) strlen((sched->env)[i]),
-                (sched->env)[i]);
+    if (sched->env == NULL) {
+        fprintf(fp, "%d\n", 0);
+    } else {
+        for (envc = 0 ; (sched->env)[envc] != NULL ; ++envc) ;
+        fprintf(fp, "%d\n", envc);
+        for (i = 0 ; i < envc ; ++i) {
+            fprintf(fp, "%d %s\n", (int) strlen((sched->env)[i]),
+                    (sched->env)[i]);
+        }
     }
 
     /* CWD */
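
The hunk above makes the writer emit a 0 count for an empty (NULL) environment instead of dereferencing it - the pcm_base_comm fix called out in the commit message.  For orientation, a minimal sketch of a reader for this counted-string format, assuming exactly the layout the fprintf calls produce ("count\n" then "len string\n" per entry); the function name is illustrative, not the real receive-side API:

/* Hedged sketch: parse the counted-string environment block written by
 * mca_pcm_base_send_schedule above.  Error paths leak the partial
 * result; acceptable for a sketch, not for production code. */
#include <stdio.h>
#include <stdlib.h>

static char **read_env_block(FILE *fp)
{
    int envc, len, i;
    char **env;

    if (fscanf(fp, "%d", &envc) != 1 || envc < 0) return NULL;

    env = calloc(envc + 1, sizeof(char*));   /* NULL-terminated, like environ */
    if (env == NULL) return NULL;

    for (i = 0 ; i < envc ; ++i) {
        if (fscanf(fp, "%d", &len) != 1 || len < 0) return NULL;
        fgetc(fp);                           /* skip the separating space */
        env[i] = malloc(len + 1);
        if (env[i] == NULL) return NULL;
        if (fread(env[i], 1, len, fp) != (size_t) len) return NULL;
        env[i][len] = '\0';
    }
    return env;                              /* env[envc] is already NULL */
}
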
@@ -25,13 +25,19 @@
 #include "runtime/runtime_types.h"
 #include "util/output.h"
 #include "util/argv.h"
+#include "util/numtostr.h"
 
+#if 1
 #define BOOTAGENT "mca_pcm_rsh_bootproxy"
+#else
+#define BOOTAGENT "cat"
+#endif
 #define PRS_BUFSIZE 1024
 
 static int internal_spawn_proc(int jobid, ompi_rte_node_schedule_t *sched,
-                               ompi_list_t *nodelist);
+                               ompi_list_t *nodelist,
+                               int my_start_vpid, int global_start_vpid,
+                               int num_procs);
 
 
 bool
@@ -49,14 +55,36 @@ mca_pcm_rsh_spawn_procs(int jobid, ompi_list_t *schedlist)
 {
     ompi_list_item_t *sched_item, *node_item;
     ompi_rte_node_schedule_t *sched;
+    ompi_rte_node_allocation_t *node;
     ompi_list_t launch;
     ompi_list_t done;
     int ret, i;
     int width = 1;
+    int local_start_vpid = 0;
+    int global_start_vpid = 0;
+    int num_procs = 0;
 
     OBJ_CONSTRUCT(&launch, ompi_list_t);
     OBJ_CONSTRUCT(&done, ompi_list_t);
 
+    for (sched_item = ompi_list_get_first(schedlist) ;
+         sched_item != ompi_list_get_end(schedlist) ;
+         sched_item = ompi_list_get_next(sched_item)) {
+        sched = (ompi_rte_node_schedule_t*) sched_item;
+
+        for (node_item = ompi_list_get_first(sched->nodelist) ;
+             node_item != ompi_list_get_end(sched->nodelist) ;
+             node_item = ompi_list_get_next(node_item)) {
+            node = (ompi_rte_node_allocation_t*) node_item;
+            num_procs += node->count;
+        }
+    }
+
+    /* BWB - make sure vpids are reserved */
+    local_start_vpid = global_start_vpid;
+
     for (sched_item = ompi_list_get_first(schedlist) ;
          sched_item != ompi_list_get_end(schedlist) ;
          sched_item = ompi_list_get_next(sched_item)) {
@@ -74,7 +102,7 @@ mca_pcm_rsh_spawn_procs(int jobid, ompi_list_t *schedlist)
             /* find enough entries for this slice to go */
             for (i = 0 ;
                  i < width && node_item != ompi_list_get_end(sched->nodelist) ;
-                 node_item = ompi_list_get_next(node_item)) { }
+                 node_item = ompi_list_get_next(node_item), ++i) { }
             /* if we don't have anyone, get us out of here.. */
             if (i == 0) {
                 continue;
@@ -88,7 +116,9 @@ mca_pcm_rsh_spawn_procs(int jobid, ompi_list_t *schedlist)
 
             /* do the launch to the first node in the list, passing
                him the rest of the list */
-            ret = internal_spawn_proc(jobid, sched, &launch);
+            ret = internal_spawn_proc(jobid, sched, &launch,
+                                      local_start_vpid, global_start_vpid,
+                                      num_procs);
             if (OMPI_SUCCESS != ret) {
                 /* well, crap!  put ourselves back together, I guess.
                    Should call killjob */
@@ -98,6 +128,8 @@ mca_pcm_rsh_spawn_procs(int jobid, ompi_list_t *schedlist)
                                   &done);
                 return ret;
             }
+            local_start_vpid +=
+                ((ompi_rte_node_allocation_t*) ompi_list_get_first(&launch))->count;
 
             /* copy the list over to the done part */
             ompi_list_join(&done, ompi_list_get_end(&done), &launch);
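
Worked example of the vpid bookkeeping above, assuming two launch slices with counts 2 and 3: the first bootproxy is invoked with --local_start_vpid 0 and numbers its children 0-1, then local_start_vpid advances by the launched node's count to 2, so the second bootproxy numbers its children 2-4.  --global_start_vpid stays at the same base for every slice, so all processes agree on where the job's vpid space starts.
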
@@ -219,7 +251,8 @@ cleanup:
 
 static int
 internal_spawn_proc(int jobid, ompi_rte_node_schedule_t *sched,
-                    ompi_list_t *nodelist)
+                    ompi_list_t *nodelist, int my_start_vpid,
+                    int global_start_vpid, int num_procs)
 {
     int kidstdin[2];            /* child stdin pipe */
     bool needs_profile = false;
@@ -235,6 +268,7 @@ internal_spawn_proc(int jobid, ompi_rte_node_schedule_t *sched,
     FILE *fp;
     int status;                 /* exit status */
     int i;
+    char *tmp;
 
     start_node = (ompi_rte_node_allocation_t*) ompi_list_get_first(nodelist);
 
@@ -270,8 +304,24 @@ internal_spawn_proc(int jobid, ompi_rte_node_schedule_t *sched,
 
     /* build the command to start */
     ompi_argv_append(&cmdc, &cmdv, BOOTAGENT);
+    /* BWB - turn on debugging for now */
+    ompi_argv_append(&cmdc, &cmdv, "-v");
+
+    /* starting vpid for launchee's procs */
+    tmp = ltostr(my_start_vpid);
+    ompi_argv_append(&cmdc, &cmdv, "--local_start_vpid");
+    ompi_argv_append(&cmdc, &cmdv, tmp);
+    free(tmp);
+
+    /* global starting vpid for this pcm spawn */
+    tmp = ltostr(global_start_vpid);
+    ompi_argv_append(&cmdc, &cmdv, "--global_start_vpid");
+    ompi_argv_append(&cmdc, &cmdv, tmp);
+    free(tmp);
+
+    /* number of procs in this pcm spawn */
+    tmp = ltostr(num_procs);
+    ompi_argv_append(&cmdc, &cmdv, "--num_procs");
+    ompi_argv_append(&cmdc, &cmdv, tmp);
+    free(tmp);
 
     /* add the end of the .profile thing if required */
     if (needs_profile) {
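
With the arguments added above, the remote command handed to rsh for each slice looks roughly like the following (illustrative values; the real invocation is built as an argv array rather than a shell string, and BOOTAGENT expands to mca_pcm_rsh_bootproxy):

    mca_pcm_rsh_bootproxy -v --local_start_vpid 0 --global_start_vpid 0 --num_procs 5
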
@@ -293,7 +343,7 @@ internal_spawn_proc(int jobid, ompi_rte_node_schedule_t *sched,
     } else if (pid == 0) {
         /* child */
 
-        if ((dup2(kidstdin[1], 0) < 0)) {
+        if ((dup2(kidstdin[0], 0) < 0)) {
             perror(cmdv[0]);
             exit(errno);
         }
@@ -314,14 +364,17 @@ internal_spawn_proc(int jobid, ompi_rte_node_schedule_t *sched,
     } else {
         /* parent */
 
-        if (close(kidstdin[1])) {
+#if 0
+        if (close(kidstdin[0])) {
             kill(pid, SIGTERM);
             ret = OMPI_ERROR;
             goto proc_cleanup;
         }
+#endif
 
         /* send our stuff down the wire */
-        fp = fdopen(kidstdin[0], "w");
+        fp = fdopen(kidstdin[1], "a");
         if (fp == NULL) { perror("fdopen"); abort(); }
         ret = mca_pcm_base_send_schedule(fp, jobid, sched, nodelist);
         fclose(fp);
         if (OMPI_SUCCESS != ret) {
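
The two one-line fixes above both come down to pipe(2) conventions: index 0 is the read end and index 1 the write end, so the child must dup the read end onto stdin and the parent must write into the write end.  A minimal standalone sketch of the intended wiring (plain POSIX, not the OMPI code), where the parent plays the role of the schedule sender and the child reads on stdin:

/* Standalone sketch: parent writes, child reads on stdin. */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/wait.h>

int main(void)
{
    int fds[2];                 /* fds[0] = read end, fds[1] = write end */
    pid_t pid;
    FILE *fp;

    if (pipe(fds) < 0) { perror("pipe"); return 1; }

    pid = fork();
    if (pid < 0) { perror("fork"); return 1; }

    if (pid == 0) {
        /* child: make the read end our stdin, then exec */
        if (dup2(fds[0], 0) < 0) { perror("dup2"); _exit(1); }
        close(fds[0]);
        close(fds[1]);          /* don't hold the write end open */
        execlp("cat", "cat", (char *) NULL);
        perror("exec");
        _exit(1);
    }

    /* parent: close the unused read end, send data, close for EOF */
    close(fds[0]);
    fp = fdopen(fds[1], "w");
    if (fp == NULL) { perror("fdopen"); return 1; }
    fprintf(fp, "hello down the wire\n");
    fclose(fp);

    waitpid(pid, NULL, 0);
    return 0;
}
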
src/mca/pcmclient/env/pcmclient_env_component.c
@@ -70,7 +70,7 @@ mca_pcmclient_env_open(void)
     param_procid = mca_base_param_register_int("pcmclient", "env", "procid",
                                                NULL, -1);
     param_vpid_start = mca_base_param_register_int("pcmclient", "env",
-                                                   "vpid_start", NULL, -1);
+                                                   "vpid_start", NULL, 0);
     param_num_procs = mca_base_param_register_int("pcmclient", "env",
                                                   "num_procs", NULL, -1);
 
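
This is the fix for the first item in the commit message: registering vpid_start with a default of 0 instead of -1 means a process started without OMPI_MCA_pcmclient_env_vpid_start in its environment still gets a usable vpid base.  A hedged sketch of the defaulting pattern, using plain getenv rather than the mca_base_param machinery the component actually goes through:

/* Sketch only: the real component registers its default through
 * mca_base_param_register_int(..., "vpid_start", NULL, 0) above. */
#include <stdlib.h>

static int env_int_or_default(const char *name, int dflt)
{
    const char *val = getenv(name);
    return (val != NULL) ? atoi(val) : dflt;
}

/* e.g.: vpid_start = env_int_or_default("OMPI_MCA_pcmclient_env_vpid_start", 0); */
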
@@ -12,18 +12,62 @@
 #include <unistd.h>
 #include <stdlib.h>
 
+static void
+show_usage(char *myname)
+{
+    printf("usage: %s --local_start_vpid [vpid] --global_start_vpid [vpid]\n"
+           "       --num_procs [num]\n\n", myname);
+}
+
+
 int
 main(int argc, char *argv[])
 {
     ompi_rte_node_schedule_t *sched;
     ompi_rte_node_allocation_t *nodelist;
+    ompi_rte_node_allocation_t *node;
     pid_t pid;
     int i;
     int ret;
     int jobid;
+    ompi_cmd_line_t *cmd_line = NULL;
+    int local_vpid_start, global_vpid_start;
+    int cellid = 0;
+    int num_procs;
+    char *env_buf;
 
     ompi_init(argc, argv);
+    cmd_line = ompi_cmd_line_create();
+    ompi_cmd_line_make_opt(cmd_line, '\0', "local_start_vpid", 1,
+                           "starting vpid to use when launching");
+    ompi_cmd_line_make_opt(cmd_line, '\0', "global_start_vpid", 1,
+                           "starting vpid to use when launching");
+    ompi_cmd_line_make_opt(cmd_line, '\0', "num_procs", 1,
+                           "number of procs in job");
+
+    if (OMPI_SUCCESS != ompi_cmd_line_parse(cmd_line, false, argc, argv)) {
+        show_usage(argv[0]);
+        exit(1);
+    }
+
+    if (!ompi_cmd_line_is_taken(cmd_line, "local_start_vpid")) {
+        show_usage(argv[0]);
+        exit(1);
+    }
+    local_vpid_start =
+        atoi(ompi_cmd_line_get_param(cmd_line, "local_start_vpid", 0, 0));
+
+    if (!ompi_cmd_line_is_taken(cmd_line, "global_start_vpid")) {
+        show_usage(argv[0]);
+        exit(1);
+    }
+    global_vpid_start =
+        atoi(ompi_cmd_line_get_param(cmd_line, "global_start_vpid", 0, 0));
+
+    if (!ompi_cmd_line_is_taken(cmd_line, "num_procs")) {
+        show_usage(argv[0]);
+        exit(1);
+    }
+    num_procs = atoi(ompi_cmd_line_get_param(cmd_line, "num_procs", 0, 0));
 
     sched = OBJ_NEW(ompi_rte_node_schedule_t);
 
@@ -39,11 +83,24 @@ main(int argc, char *argv[])
     if (ompi_list_get_size(sched->nodelist) > 1) {
         fprintf(stderr, "Received more than one node - ignoring extra info\n");
     }
+    if (ompi_list_get_size(sched->nodelist) < 1) {
+        fprintf(stderr, "Received less than one node\n");
+    }
 
     /* fill our environment */
     for (i = 0 ; sched->env[i] != NULL ; ++i) {
         putenv(sched->env[i]);
     }
+    /* constant pcmclient info */
+    asprintf(&env_buf, "OMPI_MCA_pcmclient_env_cellid=%d", cellid);
+    putenv(env_buf);
+    asprintf(&env_buf, "OMPI_MCA_pcmclient_env_jobid=%d", jobid);
+    putenv(env_buf);
+    asprintf(&env_buf, "OMPI_MCA_pcmclient_env_num_procs=%d", num_procs);
+    putenv(env_buf);
+    asprintf(&env_buf, "OMPI_MCA_pcmclient_env_vpid_start=%d",
+             global_vpid_start);
+    putenv(env_buf);
 
     /* get in the right place */
     if (sched->cwd != NULL) {
@@ -54,15 +111,24 @@ main(int argc, char *argv[])
         }
     }
 
+    node = (ompi_rte_node_allocation_t*) ompi_list_get_first(sched->nodelist);
     /* let's go! - if we are the parent, don't stick around... */
-    pid = fork();
-    if (pid < 0) {
-        /* error :( */
-        perror("fork");
-    } else if (pid == 0) {
-        /* child */
-        execvp(sched->argv[0], sched->argv);
-        perror("exec");
+    for (i = 0 ; i < node->count ; ++i) {
+        pid = fork();
+        if (pid < 0) {
+            /* error :( */
+            perror("fork");
+        } else if (pid == 0) {
+            /* do the putenv here so that we don't look like we have a
+               giant memory leak */
+            asprintf(&env_buf, "OMPI_MCA_pcmclient_env_procid=%d",
+                     local_vpid_start + i);
+            putenv(env_buf);
+
+            /* child */
+            execvp(sched->argv[0], sched->argv);
+            perror("exec");
+        }
     }
 
     OBJ_RELEASE(sched);
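
The hunk above replaces the bootproxy's single fork/exec with a loop that forks node->count children, each exporting its own OMPI_MCA_pcmclient_env_procid before exec.  A self-contained sketch of the same launch pattern (plain POSIX; PROCID and launch_n are illustrative names, not the OMPI ones):

/* Standalone sketch of the per-child fork/exec loop: each child gets a
 * distinct id in its environment before exec'ing the application. */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/wait.h>

static void launch_n(int n, int vpid_base, char *const argv[])
{
    pid_t pid;
    char *buf;
    int i;

    for (i = 0 ; i < n ; ++i) {
        pid = fork();
        if (pid < 0) {
            perror("fork");
        } else if (pid == 0) {
            /* putenv after fork, so each child sees a distinct value
               and the parent's environment is never touched */
            buf = malloc(32);
            if (buf != NULL) {
                snprintf(buf, 32, "PROCID=%d", vpid_base + i);
                putenv(buf);
            }
            execvp(argv[0], argv);
            perror("exec");
            _exit(1);
        }
    }
    while (wait(NULL) > 0) ;    /* reap all children; the real bootproxy
                                   monitors them elsewhere */
}
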
@@ -6,6 +6,7 @@
 #include "ompi_config.h"
 
 #include "mca/ns/ns.h"
+#include "mca/pcm/base/base.h"
 #include "runtime/runtime.h"
 #include "mca/base/base.h"
 #include "util/cmd_line.h"
@@ -15,6 +16,8 @@
 #include <unistd.h>
 #include <sys/param.h>
 
+extern char** environ;
+
 static long num_running_procs;
 
 static int
@@ -36,9 +39,8 @@ main(int argc, char *argv[])
     ompi_list_t *nodelist = NULL;
     ompi_list_t schedlist;
     mca_ns_base_jobid_t new_jobid;
-    int num_procs;
+    int num_procs = 1;
     ompi_rte_node_schedule_t *sched;
     ompi_list_item_t *nodeitem;
     char cwd[MAXPATHLEN];
 
     /*
@@ -111,8 +113,8 @@ main(int argc, char *argv[])
     new_jobid = getpid();
 
     /* BWB - fix jobid, procs, and nodes */
-    nodelist = ompi_rte_allocate_resources(0, 0, 2);
-    if (NULL != nodelist) {
+    nodelist = ompi_rte_allocate_resources(new_jobid, 0, num_procs);
+    if (NULL == nodelist) {
         /* BWB show_help */
         printf("show_help: ompi_rte_allocate_resources failed\n");
         return -1;
@@ -123,11 +125,18 @@ main(int argc, char *argv[])
      */
     OBJ_CONSTRUCT(&schedlist, ompi_list_t);
     sched = OBJ_NEW(ompi_rte_node_schedule_t);
-    OBJ_CONSTRUCT(&(sched->nodelist), ompi_list_t);
     ompi_list_append(&schedlist, (ompi_list_item_t*) sched);
     ompi_cmd_line_get_tail(cmd_line, &(sched->argc), &(sched->argv));
+    sched->env = NULL;
+    mca_pcm_base_build_base_env(environ, &(sched->env));
+    getcwd(cwd, MAXPATHLEN);
+    sched->cwd = strdup(cwd);
+    sched->nodelist = nodelist;
+
+    if (sched->argc == 0) {
+        printf("no app to start\n");
+        return 1;
+    }
 
     /*
      * register the monitor
@@ -157,7 +166,7 @@ main(int argc, char *argv[])
     mca_base_close();
     ompi_finalize();
 
-    OBJ_DESTRUCT(&sched);
+    OBJ_DESTRUCT(&schedlist);
 
     return 0;
 }