
* Make sure that the env pcmclient does something rational even if the
  vpid_start variable isn't set
* "working" mpirun with rsh pcm.  Still no job monitoring or cleanup (as
  those aren't really available for rsh - cleanup will mostly work in
  the long term)
* minor fixes for pcm_base_comm functions to properly deal with empty
  environments

This commit was SVN r2250.
This commit is contained in:
Brian Barrett 2004-08-20 18:48:57 +00:00
parent b64dc67d7a
commit b7aeaae3a3
6 changed files with 165 additions and 33 deletions
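The thread running through all six files is vpid bookkeeping: mpirun counts the processes to start, the rsh launcher hands each node a --local_start_vpid, and the bootproxy forks that node's children with consecutive procids. A toy, stand-alone C sketch of that bookkeeping (not Open MPI code; the counts are invented):

#include <stdio.h>

/* Toy illustration of the vpid bookkeeping in the diffs below: each launched
 * node gets a contiguous block of vpids starting at local_start_vpid, and the
 * children forked on that node take local_start_vpid + i as their procid. */
static void assign_vpids(const int *counts, int num_nodes)
{
    int local_start_vpid = 0;   /* advances by each node's process count */
    int node, i;

    for (node = 0 ; node < num_nodes ; ++node) {
        for (i = 0 ; i < counts[node] ; ++i) {
            printf("node %d, child %d -> procid %d\n",
                   node, i, local_start_vpid + i);
        }
        local_start_vpid += counts[node];
    }
}

int main(void)
{
    int counts[2] = { 2, 3 };   /* hypothetical allocation: 2 + 3 slots */
    assign_vpids(counts, 2);    /* prints procids 0..1 and 2..4 */
    return 0;
}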

View file

@@ -1 +1 @@
localhost
localhost count=200

View file

@@ -40,11 +40,15 @@ mca_pcm_base_send_schedule(FILE *fp,
}
/* ENV - since we don't have a envc, must create ourselves...*/
for (envc = 0 ; (sched->env)[envc] != NULL ; ++envc) ;
fprintf(fp, "%d\n", envc);
for (i = 0 ; i < envc ; ++i) {
fprintf(fp, "%d %s\n", (int) strlen((sched->env)[i]),
(sched->env)[i]);
if (sched->env == NULL) {
fprintf(fp, "%d\n", 0);
} else {
for (envc = 0 ; (sched->env)[envc] != NULL ; ++envc) ;
fprintf(fp, "%d\n", envc);
for (i = 0 ; i < envc ; ++i) {
fprintf(fp, "%d %s\n", (int) strlen((sched->env)[i]),
(sched->env)[i]);
}
}
/* CWD */
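Stated outside the diff, the fix above is: the schedule's environment is a NULL-terminated vector that may be absent altogether, so the writer must emit a zero count instead of walking a NULL pointer. A minimal stand-alone sketch of the same wire format (count line, then one "length entry" line per string), outside of OMPI:

#include <stdio.h>
#include <string.h>

/* Emit "<count>\n" followed by "<len> <entry>\n" per environment string,
 * treating a NULL vector as an empty environment. */
static void send_env(FILE *fp, char **env)
{
    int envc, i;

    if (env == NULL) {
        fprintf(fp, "%d\n", 0);
        return;
    }
    for (envc = 0 ; env[envc] != NULL ; ++envc) ;    /* count the entries */
    fprintf(fp, "%d\n", envc);
    for (i = 0 ; i < envc ; ++i) {
        fprintf(fp, "%d %s\n", (int) strlen(env[i]), env[i]);
    }
}

int main(void)
{
    char *env[] = { "OMPI_MCA_pcmclient_env_jobid=42", NULL };
    send_env(stdout, env);    /* one entry: count line, then "len entry" */
    send_env(stdout, NULL);   /* empty environment: just "0" */
    return 0;
}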

View file

@@ -25,13 +25,19 @@
#include "runtime/runtime_types.h"
#include "util/output.h"
#include "util/argv.h"
#include "util/numtostr.h"
#if 1
#define BOOTAGENT "mca_pcm_rsh_bootproxy"
#else
#define BOOTAGENT "cat"
#endif
#define PRS_BUFSIZE 1024
static int internal_spawn_proc(int jobid, ompi_rte_node_schedule_t *sched,
ompi_list_t *nodelist);
ompi_list_t *nodelist,
int my_start_vpid, int global_start_vpid,
int num_procs);
bool
@@ -49,14 +55,36 @@ mca_pcm_rsh_spawn_procs(int jobid, ompi_list_t *schedlist)
{
ompi_list_item_t *sched_item, *node_item;
ompi_rte_node_schedule_t *sched;
ompi_rte_node_allocation_t *node;
ompi_list_t launch;
ompi_list_t done;
int ret, i;
int width = 1;
int local_start_vpid = 0;
int global_start_vpid = 0;
int num_procs = 0;
OBJ_CONSTRUCT(&launch, ompi_list_t);
OBJ_CONSTRUCT(&done, ompi_list_t);
for (sched_item = ompi_list_get_first(schedlist) ;
sched_item != ompi_list_get_end(schedlist) ;
sched_item = ompi_list_get_next(sched_item)) {
sched = (ompi_rte_node_schedule_t*) sched_item;
for (node_item = ompi_list_get_first(sched->nodelist) ;
node_item != ompi_list_get_end(sched->nodelist) ;
node_item = ompi_list_get_next(node_item)) {
node = (ompi_rte_node_allocation_t*) node_item;
num_procs += node->count;
}
}
/* BWB - make sure vpids are reserved */
local_start_vpid = global_start_vpid;
for (sched_item = ompi_list_get_first(schedlist) ;
sched_item != ompi_list_get_end(schedlist) ;
sched_item = ompi_list_get_next(sched_item)) {
@@ -74,7 +102,7 @@ mca_pcm_rsh_spawn_procs(int jobid, ompi_list_t *schedlist)
/* find enough entries for this slice to go */
for (i = 0 ;
i < width && node_item != ompi_list_get_end(sched->nodelist) ;
node_item = ompi_list_get_next(node_item)) { }
node_item = ompi_list_get_next(node_item), ++i) { }
/* if we don't have anyone, get us out of here.. */
if (i == 0) {
continue;
@@ -88,7 +116,9 @@ mca_pcm_rsh_spawn_procs(int jobid, ompi_list_t *schedlist)
/* do the launch to the first node in the list, passing
him the rest of the list */
ret = internal_spawn_proc(jobid, sched, &launch);
ret = internal_spawn_proc(jobid, sched, &launch,
local_start_vpid, global_start_vpid,
num_procs);
if (OMPI_SUCCESS != ret) {
/* well, crap! put ourselves back together, I guess.
Should call killjob */
@@ -98,6 +128,8 @@ mca_pcm_rsh_spawn_procs(int jobid, ompi_list_t *schedlist)
&done);
return ret;
}
local_start_vpid +=
((ompi_rte_node_allocation_t*) ompi_list_get_first(&launch))->count;
/* copy the list over to the done part */
ompi_list_join(&done, ompi_list_get_end(&done), &launch);
@@ -219,7 +251,8 @@ cleanup:
static int
internal_spawn_proc(int jobid, ompi_rte_node_schedule_t *sched,
ompi_list_t *nodelist)
ompi_list_t *nodelist, int my_start_vpid,
int global_start_vpid, int num_procs)
{
int kidstdin[2]; /* child stdin pipe */
bool needs_profile = false;
@@ -235,6 +268,7 @@ internal_spawn_proc(int jobid, ompi_rte_node_schedule_t *sched,
FILE *fp;
int status; /* exit status */
int i;
char *tmp;
start_node = (ompi_rte_node_allocation_t*) ompi_list_get_first(nodelist);
@@ -270,8 +304,24 @@ internal_spawn_proc(int jobid, ompi_rte_node_schedule_t *sched,
/* build the command to start */
ompi_argv_append(&cmdc, &cmdv, BOOTAGENT);
/* BWB - turn on debugging for now */
ompi_argv_append(&cmdc, &cmdv, "-v");
/* starting vpid for launchee's procs */
tmp = ltostr(my_start_vpid);
ompi_argv_append(&cmdc, &cmdv, "--local_start_vpid");
ompi_argv_append(&cmdc, &cmdv, tmp);
free(tmp);
/* global starting vpid for this pcm spawn */
tmp = ltostr(global_start_vpid);
ompi_argv_append(&cmdc, &cmdv, "--global_start_vpid");
ompi_argv_append(&cmdc, &cmdv, tmp);
free(tmp);
/* number of procs in this pcm spawn */
tmp = ltostr(num_procs);
ompi_argv_append(&cmdc, &cmdv, "--num_procs");
ompi_argv_append(&cmdc, &cmdv, tmp);
free(tmp);
/* add the end of the .profile thing if required */
if (needs_profile) {
@@ -293,7 +343,7 @@ internal_spawn_proc(int jobid, ompi_rte_node_schedule_t *sched,
} else if (pid == 0) {
/* child */
if ((dup2(kidstdin[1], 0) < 0)) {
if ((dup2(kidstdin[0], 0) < 0)) {
perror(cmdv[0]);
exit(errno);
}
@@ -314,14 +364,17 @@ internal_spawn_proc(int jobid, ompi_rte_node_schedule_t *sched,
} else {
/* parent */
if (close(kidstdin[1])) {
#if 0
if (close(kidstdin[0])) {
kill(pid, SIGTERM);
ret = OMPI_ERROR;
goto proc_cleanup;
}
#endif
/* send our stuff down the wire */
fp = fdopen(kidstdin[0], "w");
fp = fdopen(kidstdin[1], "a");
if (fp == NULL) { perror("fdopen"); abort(); }
ret = mca_pcm_base_send_schedule(fp, jobid, sched, nodelist);
fclose(fp);
if (OMPI_SUCCESS != ret) {
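The stdin plumbing corrected above follows the usual pipe convention: element 0 of the pipe is the read end and element 1 the write end, so the child must dup the read end onto fd 0 before exec'ing while the parent writes the schedule into the write end through a stdio stream. A minimal stand-alone sketch of that pattern, with cat standing in for the rsh/boot-agent command:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

/* pipefd[0] is the read end, pipefd[1] the write end: the child puts the read
 * end on stdin and execs, the parent streams data into the write end. */
int main(void)
{
    int pipefd[2];
    pid_t pid;
    FILE *fp;

    if (pipe(pipefd) < 0) { perror("pipe"); return 1; }

    pid = fork();
    if (pid < 0) {
        perror("fork");
        return 1;
    } else if (pid == 0) {
        /* child: read end becomes stdin, write end is unused */
        if (dup2(pipefd[0], 0) < 0) { perror("dup2"); exit(errno); }
        close(pipefd[1]);
        execlp("cat", "cat", (char *) NULL);   /* stand-in for the boot agent */
        perror("exec");
        exit(errno);
    }

    /* parent: close the read end, send data down the write end */
    close(pipefd[0]);
    fp = fdopen(pipefd[1], "w");
    if (fp == NULL) { perror("fdopen"); abort(); }
    fprintf(fp, "hello from the parent\n");
    fclose(fp);                                /* child sees EOF on stdin */

    waitpid(pid, NULL, 0);
    return 0;
}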

View file

@@ -70,7 +70,7 @@ mca_pcmclient_env_open(void)
param_procid = mca_base_param_register_int("pcmclient", "env", "procid",
NULL, -1);
param_vpid_start = mca_base_param_register_int("pcmclient", "env",
"vpid_start", NULL, -1);
"vpid_start", NULL, 0);
param_num_procs = mca_base_param_register_int("pcmclient", "env",
"num_procs", NULL, -1);

View file

@@ -12,18 +12,62 @@
#include <unistd.h>
#include <stdlib.h>
static void
show_usage(char *myname)
{
printf("usage: %s --local_start_vpid [vpid] --global_start_vpid [vpid]\n"
" --num_procs [num]\n\n", myname);
}
int
main(int argc, char *argv[])
{
ompi_rte_node_schedule_t *sched;
ompi_rte_node_allocation_t *nodelist;
ompi_rte_node_allocation_t *node;
pid_t pid;
int i;
int ret;
int jobid;
ompi_cmd_line_t *cmd_line = NULL;
int local_vpid_start, global_vpid_start;
int cellid = 0;
int num_procs;
char *env_buf;
ompi_init(argc, argv);
cmd_line = ompi_cmd_line_create();
ompi_cmd_line_make_opt(cmd_line, '\0', "local_start_vpid", 1,
"starting vpid to use when launching");
ompi_cmd_line_make_opt(cmd_line, '\0', "global_start_vpid", 1,
"starting vpid to use when launching");
ompi_cmd_line_make_opt(cmd_line, '\0', "num_procs", 1,
"number of procs in job");
if (OMPI_SUCCESS != ompi_cmd_line_parse(cmd_line, false, argc, argv)) {
show_usage(argv[0]);
exit(1);
}
if (!ompi_cmd_line_is_taken(cmd_line, "local_start_vpid")) {
show_usage(argv[0]);
exit(1);
}
local_vpid_start =
atoi(ompi_cmd_line_get_param(cmd_line, "local_start_vpid", 0, 0));
if (!ompi_cmd_line_is_taken(cmd_line, "global_start_vpid")) {
show_usage(argv[0]);
exit(1);
}
global_vpid_start =
atoi(ompi_cmd_line_get_param(cmd_line, "global_start_vpid", 0, 0));
if (!ompi_cmd_line_is_taken(cmd_line, "num_procs")) {
show_usage(argv[0]);
exit(1);
}
num_procs = atoi(ompi_cmd_line_get_param(cmd_line, "num_procs", 0, 0));
sched = OBJ_NEW(ompi_rte_node_schedule_t);
@@ -39,11 +83,24 @@ main(int argc, char *argv[])
if (ompi_list_get_size(sched->nodelist) > 1) {
fprintf(stderr, "Received more than one node - ignoring extra info\n");
}
if (ompi_list_get_size(sched->nodelist) < 1) {
fprintf(stderr, "Received less than one node\n");
}
/* fill our environment */
for (i = 0 ; sched->env[i] != NULL ; ++i) {
putenv(sched->env[i]);
}
/* constant pcmclient info */
asprintf(&env_buf, "OMPI_MCA_pcmclient_env_cellid=%d", cellid);
putenv(env_buf);
asprintf(&env_buf, "OMPI_MCA_pcmclient_env_jobid=%d", jobid);
putenv(env_buf);
asprintf(&env_buf, "OMPI_MCA_pcmclient_env_num_procs=%d", num_procs);
putenv(env_buf);
asprintf(&env_buf, "OMPI_MCA_pcmclient_env_vpid_start=%d",
global_vpid_start);
putenv(env_buf);
/* get in the right place */
if (sched->cwd != NULL) {
@@ -54,15 +111,24 @@ main(int argc, char *argv[])
}
}
node = (ompi_rte_node_allocation_t*) ompi_list_get_first(sched->nodelist);
/* let's go! - if we are the parent, don't stick around... */
pid = fork();
if (pid < 0) {
/* error :( */
perror("fork");
} else if (pid == 0) {
/* child */
execvp(sched->argv[0], sched->argv);
perror("exec");
for (i = 0 ; i < node->count ; ++i) {
pid = fork();
if (pid < 0) {
/* error :( */
perror("fork");
} else if (pid == 0) {
/* do the putenv here so that we don't look like we have a
giant memory leak */
asprintf(&env_buf, "OMPI_MCA_pcmclient_env_procid=%d",
local_vpid_start + i);
putenv(env_buf);
/* child */
execvp(sched->argv[0], sched->argv);
perror("exec");
}
}
OBJ_RELEASE(sched);
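For reference, the command line this bootproxy parses is the one internal_spawn_proc() assembles in the rsh hunks above. A small illustration; the flag names are the real ones from that code, the values are invented:

#include <stdio.h>

/* Example of the argv the rsh launcher builds for the boot agent; only the
 * flag names are taken from the spawn code above, the numbers are made up. */
int main(void)
{
    char *bootproxy_argv[] = {
        "mca_pcm_rsh_bootproxy", "-v",
        "--local_start_vpid",  "0",
        "--global_start_vpid", "0",
        "--num_procs",         "2",
        NULL
    };
    int i;

    for (i = 0 ; bootproxy_argv[i] != NULL ; ++i) {
        printf("%s ", bootproxy_argv[i]);
    }
    printf("\n");
    return 0;
}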

View file

@@ -6,6 +6,7 @@
#include "ompi_config.h"
#include "mca/ns/ns.h"
#include "mca/pcm/base/base.h"
#include "runtime/runtime.h"
#include "mca/base/base.h"
#include "util/cmd_line.h"
@@ -15,6 +16,8 @@
#include <unistd.h>
#include <sys/param.h>
extern char** environ;
static long num_running_procs;
static int
@@ -36,9 +39,8 @@ main(int argc, char *argv[])
ompi_list_t *nodelist = NULL;
ompi_list_t schedlist;
mca_ns_base_jobid_t new_jobid;
int num_procs;
int num_procs = 1;
ompi_rte_node_schedule_t *sched;
ompi_list_item_t *nodeitem;
char cwd[MAXPATHLEN];
/*
@@ -111,8 +113,8 @@ main(int argc, char *argv[])
new_jobid = getpid();
/* BWB - fix jobid, procs, and nodes */
nodelist = ompi_rte_allocate_resources(0, 0, 2);
if (NULL != nodelist) {
nodelist = ompi_rte_allocate_resources(new_jobid, 0, num_procs);
if (NULL == nodelist) {
/* BWB show_help */
printf("show_help: ompi_rte_allocate_resources failed\n");
return -1;
@@ -123,11 +125,18 @@ main(int argc, char *argv[])
*/
OBJ_CONSTRUCT(&schedlist, ompi_list_t);
sched = OBJ_NEW(ompi_rte_node_schedule_t);
OBJ_CONSTRUCT(&(sched->nodelist), ompi_list_t);
ompi_list_append(&schedlist, (ompi_list_item_t*) sched);
ompi_cmd_line_get_tail(cmd_line, &(sched->argc), &(sched->argv));
sched->env = NULL;
mca_pcm_base_build_base_env(environ, &(sched->env));
getcwd(cwd, MAXPATHLEN);
sched->cwd = strdup(cwd);
sched->nodelist = nodelist;
if (sched->argc == 0) {
printf("no app to start\n");
return 1;
}
/*
* register the monitor
@@ -157,7 +166,7 @@ main(int argc, char *argv[])
mca_base_close();
ompi_finalize();
OBJ_DESTRUCT(&sched);
OBJ_DESTRUCT(&schedlist);
return 0;
}
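One loose end worth noting: mca_pcm_base_build_base_env() copies part of mpirun's environment into the schedule so the bootproxy can recreate it on the remote node, and its exact selection rule is not visible in this diff. The toy stand-in below filters on an OMPI_MCA_ prefix purely as an assumption for illustration; what matters is that the result may legitimately be NULL, which is exactly the empty-environment case mca_pcm_base_send_schedule now tolerates.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

extern char **environ;

/* Toy stand-in for mca_pcm_base_build_base_env(): copy selected variables into
 * a freshly allocated NULL-terminated vector.  Keeping only OMPI_MCA_* entries
 * is an assumed filter, not necessarily what the real helper does. */
static char **build_env_sketch(char **src)
{
    char **dst = NULL;
    int count = 0, i;

    for (i = 0 ; src != NULL && src[i] != NULL ; ++i) {
        if (0 != strncmp(src[i], "OMPI_MCA_", 9)) continue;
        dst = realloc(dst, (count + 2) * sizeof(char *));
        dst[count++] = strdup(src[i]);
        dst[count] = NULL;             /* keep the vector NULL-terminated */
    }
    return dst;                        /* NULL means an empty environment */
}

int main(void)
{
    char **env = build_env_sketch(environ);
    int i;

    for (i = 0 ; env != NULL && env[i] != NULL ; ++i) {
        printf("%s\n", env[i]);
    }
    return 0;
}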