
* More changes for the rsh pcm:

- make sure to pass jobid to the spawned process
  - update test case and bootproxy to pass/receive jobid
  - work on list splitting code for rsh spawn_procs()

This commit was SVN r2212.
This commit is contained in:
Brian Barrett 2004-08-18 20:34:20 +00:00
parent aaca5f3bf6
commit 964fd9c758
8 changed files with 114 additions and 31 deletions
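
For reference, a minimal sketch of the call pattern this commit establishes (a hypothetical caller; stream setup and error handling elided, signatures as in the diffs below):

    int jobid = 123;   /* chosen by the spawning side */
    int recv_jobid;
    ompi_rte_node_schedule_t *sched = OBJ_NEW(ompi_rte_node_schedule_t);

    /* spawner: the jobid now travels with the schedule */
    ret = mca_pcm_base_send_schedule(fp, jobid, sched, sched->nodelist);

    /* bootproxy: the jobid comes back out of the stream */
    ret = mca_pcm_base_recv_schedule(fp, &recv_jobid, sched, sched->nodelist);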

View File

@@ -28,10 +28,12 @@ extern "C" {
     int mca_pcm_base_send_schedule(FILE *fd,
+                                   int jobid,
                                    ompi_rte_node_schedule_t *sched,
                                    ompi_list_t *nodelist);
 
     int mca_pcm_base_recv_schedule(FILE *fd,
+                                   int *jobid,
                                    ompi_rte_node_schedule_t *sched,
                                    ompi_list_t *nodelist);

View File

@@ -17,6 +17,7 @@
 int
 mca_pcm_base_send_schedule(FILE *fp,
+                           int jobid,
                            ompi_rte_node_schedule_t *sched,
                            ompi_list_t *nodelist)
 {
@@ -28,6 +29,9 @@ mca_pcm_base_send_schedule(FILE *fp,
     fprintf(fp, START_KEY);
     fprintf(fp, "%d\n", PROTOCOL_VERSION);
 
+    /* JOBID */
+    fprintf(fp, "%d\n", jobid);
+
     /* ARGC */
     fprintf(fp, "%d\n", sched->argc);
     for (i = 0 ; i < sched->argc ; ++i) {
@@ -130,14 +134,27 @@ get_key(FILE *fp, const char *key)
 }
 
+static int
+get_int(FILE *fp, int *num)
+{
+    int ret;
+
+    ret = fscanf(fp, "%d\n", num);
+    if (ret != 1) return OMPI_ERROR;
+
+    return OMPI_SUCCESS;
+}
+
 static int
 get_check_version(FILE *fp)
 {
     int ret;
     int ver;
 
-    ret = fscanf(fp, "%d\n", &ver);
-    if (ret != 1) return OMPI_ERROR;
+    ret = get_int(fp, &ver);
+    if (OMPI_SUCCESS != ret) return ret;
 
     if (ver != PROTOCOL_VERSION) return OMPI_ERROR;
     return OMPI_SUCCESS;
@@ -152,8 +169,8 @@ get_string(FILE *fp, char **strp)
     char *str;
     size_t str_read;
 
-    ret = fscanf(fp, "%d ", &len);
-    if (ret != 1) return OMPI_ERROR;
+    ret = get_int(fp, &len);
+    if (OMPI_SUCCESS != ret) return ret;
 
     str = (char*) malloc(sizeof(char) * (len + 2));
     if (NULL == str) return OMPI_ERROR;
@@ -344,6 +361,7 @@ get_nodelist(FILE *fp, ompi_list_t *nodelist)
 int
 mca_pcm_base_recv_schedule(FILE *fp,
+                           int *jobid,
                            ompi_rte_node_schedule_t *sched,
                            ompi_list_t *nodelist)
 {
@@ -354,23 +372,27 @@ mca_pcm_base_recv_schedule(FILE *fp,
     if (OMPI_SUCCESS != ret) return ret;
 
     /* check our version */
-    get_check_version(fp);
+    ret = get_check_version(fp);
+    if (OMPI_SUCCESS != ret) return ret;
+
+    /* get our jobid */
+    ret = get_int(fp, jobid);
+    if (OMPI_SUCCESS != ret) return ret;
 
     /* get argc */
-    get_argv_array(fp, &(sched->argc), &(sched->argv));
+    ret = get_argv_array(fp, &(sched->argc), &(sched->argv));
+    if (OMPI_SUCCESS != ret) return ret;
 
     /* get env */
-    get_argv_array(fp, &val, &(sched->env));
+    ret = get_argv_array(fp, &val, &(sched->env));
+    if (OMPI_SUCCESS != ret) return ret;
 
     /* get cwd */
-    get_string(fp, &(sched->cwd));
+    ret = get_string(fp, &(sched->cwd));
+    if (OMPI_SUCCESS != ret) return ret;
 
     /* get node list */
-    get_nodelist(fp, nodelist);
+    ret = get_nodelist(fp, nodelist);
+    if (OMPI_SUCCESS != ret) return ret;
 
     /* make sure we have our end */
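
With the new field, the stream written by mca_pcm_base_send_schedule gains one line between the protocol version and argc. A rough annotation, assuming START_KEY expands to the @MCA_PCM@ marker seen in the test fixtures at the end of this commit:

    @MCA_PCM@          start key
    1                  PROTOCOL_VERSION
    123                jobid            <- the new field
    1                  argc
    12 ./sched_comm    argv[0], length-prefixed (get_string reads "%d " then the bytes)
    ...                env count/strings, cwd, node list, and end key follow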

View File

@@ -64,7 +64,7 @@ ompi_output_stream_t mca_pcm_rsh_output_stream = {
     false, /* lds_want_syslog */
     0, /* lds_syslog_priority */
     NULL, /* lds_syslog_ident */
-    "pcm_rsh", /* lds_prefix */
+    "pcm_rsh: ", /* lds_prefix */
     true, /* lds_want_stdout */
     false, /* lds_want_stderr */
     true, /* lds_want_file */
@@ -76,7 +76,6 @@ ompi_output_stream_t mca_pcm_rsh_output_stream = {
 /*
  * Module variables handles
  */
-static int mca_pcm_rsh_param_no_n;
 static int mca_pcm_rsh_param_no_profile;
 static int mca_pcm_rsh_param_fast;
 static int mca_pcm_rsh_param_ignore_stderr;
@@ -87,14 +86,18 @@ static int mca_pcm_rsh_param_debug;
 /*
  * Module variables
  */
-int mca_pcm_rsh_no_n;
+/* should we avoid running .profile, even if the shell says we should */
 int mca_pcm_rsh_no_profile;
+/* should we assume same shell on remote as locally? */
 int mca_pcm_rsh_fast;
+/* should we ignore things on stderr? */
 int mca_pcm_rsh_ignore_stderr;
+/* how should we fire procs up on the remote side? */
 char *mca_pcm_rsh_agent;
 int mca_pcm_rsh_output = 0;
-mca_llm_base_module_t mca_pcm_rsh_llm;
+static mca_llm_base_module_t mca_pcm_rsh_llm;
 
 int
 mca_pcm_rsh_component_open(void)
@@ -106,8 +109,6 @@ mca_pcm_rsh_component_open(void)
     mca_base_param_register_string("pcm", "rsh", "agent", NULL,
                                    "ssh");
-    mca_pcm_rsh_param_no_n =
-        mca_base_param_register_int("pcm", "rsh", "no_n", NULL, 0);
     mca_pcm_rsh_param_no_profile =
         mca_base_param_register_int("pcm", "rsh", "no_profile", NULL, 0);
     mca_pcm_rsh_param_fast =
@@ -143,8 +144,6 @@ mca_pcm_rsh_init(int *priority,
     mca_base_param_lookup_int(mca_pcm_rsh_param_priority, priority);
-    mca_base_param_lookup_int(mca_pcm_rsh_param_no_n,
-                              &mca_pcm_rsh_no_n);
     mca_base_param_lookup_int(mca_pcm_rsh_param_no_profile,
                               &mca_pcm_rsh_no_profile);
     mca_base_param_lookup_int(mca_pcm_rsh_param_fast,
@@ -186,6 +185,10 @@ mca_pcm_rsh_finalize(void)
         ompi_output_close(mca_pcm_rsh_output);
     }
 
+    if (NULL == mca_pcm_rsh_1_0_0.pcm_allocate_resources) {
+        mca_pcm_rsh_1_0_0.pcm_allocate_resources = NULL;
+        mca_pcm_rsh_1_0_0.pcm_deallocate_resources = NULL;
+    }
+
     return OMPI_SUCCESS;
 }
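
The no_n removal above deletes both halves of the component's usual two-step MCA parameter pattern: register at open time, look up by handle at init time. The pattern in isolation, using the surviving fast parameter as the example (default value assumed to be 0, matching its siblings):

    /* component open: register the parameter, keep the handle */
    mca_pcm_rsh_param_fast =
        mca_base_param_register_int("pcm", "rsh", "fast", NULL, 0);

    /* module init: resolve the handle into the module variable */
    mca_base_param_lookup_int(mca_pcm_rsh_param_fast, &mca_pcm_rsh_fast);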

View File

@@ -11,6 +11,10 @@
 #include "runtime/runtime_types.h"
 
+static int internal_spawn_proc(int jobid, ompi_rte_node_schedule_t *sched,
+                               ompi_list_t *nodelist);
+
 bool
 mca_pcm_rsh_can_spawn(void)
 {
@@ -28,27 +32,75 @@ mca_pcm_rsh_spawn_procs(int jobid, ompi_list_t *schedlist)
     ompi_rte_node_schedule_t *sched;
     ompi_rte_node_allocation_t *node;
+    ompi_list_t launch;
+    ompi_list_t done;
+    int ret, i;
+    int width = 1;
+
+    OBJ_CONSTRUCT(&launch, ompi_list_t);
+    OBJ_CONSTRUCT(&done, ompi_list_t);
 
     for (sched_item = ompi_list_get_first(schedlist) ;
          sched_item != ompi_list_get_end(schedlist) ;
         sched_item = ompi_list_get_next(sched_item)) {
         sched = (ompi_rte_node_schedule_t*) sched_item;
 
-        /* when we start doing tree based, more logic here... */
-        for (node_item = ompi_list_get_first(sched->nodelist) ;
-             node_item != ompi_list_get_end(sched->nodelist) ;
-             node_item = ompi_list_get_next(node_item)) {
-            node = (ompi_rte_node_allocation_t*) node_item;
-
-            /*
-             * make sure I'm the first node in the list and then start our
-             * deal.  We rsh me just like everyone else so that we don't
-             * have any unexpected environment oddities...
-             */
+        /* BWB - do front of list check! */
+        node_item = ompi_list_get_first(sched->nodelist);
 
-            /* we don't need to push nodes down to the compute places,
-               so don't do it... */
-        }
+        while (node_item != ompi_list_get_end(sched->nodelist)) {
+            /* find enough entries for this slice to go */
+            for (i = 0 ;
+                 i < width && node_item != ompi_list_get_end(sched->nodelist) ;
+                 node_item = ompi_list_get_next(node_item), ++i) { }
+            /* if we don't have anyone, get us out of here.. */
+            if (i == 0) {
+                continue;
+            }
+
+            /* make a launch list */
+            ompi_list_splice(&launch, ompi_list_get_end(&launch),
+                             sched->nodelist,
+                             ompi_list_get_first(sched->nodelist),
+                             node_item);
+
+            /* do the launch to the first node in the list, passing
+               him the rest of the list */
+            ret = internal_spawn_proc(jobid, sched, &launch);
+            if (OMPI_SUCCESS != ret) {
+                /* well, crap!  put ourselves back together, I guess.
+                   Should call killjob */
+                ompi_list_join(&done, ompi_list_get_end(&done), &launch);
+                ompi_list_join(sched->nodelist,
+                               ompi_list_get_first(sched->nodelist),
+                               &done);
+                return ret;
+            }
+
+            /* copy the list over to the done part */
+            ompi_list_join(&done, ompi_list_get_end(&done), &launch);
+        }
     }
 
+    OBJ_DESTRUCT(&done);
+    OBJ_DESTRUCT(&launch);
+
     return OMPI_SUCCESS;
 }
+
+
+static int
+internal_spawn_proc(int jobid, ompi_rte_node_schedule_t *sched,
+                    ompi_list_t *nodelist)
+{
+    /* ok, we rsh to the first guy in the list, then pass the whole
+       nodelist */
+
+    return OMPI_SUCCESS;
+}
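
A rough trace of the list splitting above for width == 1 and a three-node schedule, with the splice/join semantics inferred from how the code uses them (both move a range of items between lists rather than copying):

    /* start:   sched->nodelist = [n0, n1, n2], launch = [], done = []
     * slice:   node_item advances one entry past the slice
     * splice:  sched->nodelist = [n1, n2], launch = [n0]
     * spawn:   internal_spawn_proc(jobid, sched, &launch) rshes to n0,
     *          handing it the rest of its slice
     * join:    launch = [], done = [n0]; repeat for n1, then n2
     *
     * on failure, the two joins rebuild sched->nodelist from
     * done + launch so the caller gets its node list back intact */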

View File

@@ -21,6 +21,7 @@ main(int argc, char *argv[])
     pid_t pid;
     int i;
     int ret;
+    int jobid;
 
     ompi_init(argc, argv);
@@ -30,7 +31,7 @@ main(int argc, char *argv[])
     sched = OBJ_NEW(ompi_rte_node_schedule_t);
 
     /* recv_schedule wants an already initialized ompi_list_t */
-    ret = mca_pcm_base_recv_schedule(stdin, sched,
+    ret = mca_pcm_base_recv_schedule(stdin, &jobid, sched,
                                      sched->nodelist);
     if (ret != OMPI_SUCCESS) {
         fprintf(stderr, "Failure in receiving schedule information\n");

View File

@@ -29,6 +29,7 @@ main(int argc, char *argv[])
     FILE *test2_out=NULL;   /* output file for second test */
     FILE *test2_in = NULL;
     int result;             /* result of system call */
+    int jobid = 123;
 
     test_init("sched_comm_t");
@@ -50,7 +51,7 @@ main(int argc, char *argv[])
     schedout->env = env;
     schedout->cwd = "/foo/bar/baz";
 
-    result = mca_pcm_base_send_schedule(test1_out, schedout,
+    result = mca_pcm_base_send_schedule(test1_out, jobid, schedout,
                                         schedout->nodelist);
     if (result != OMPI_SUCCESS) {
         test_failure("send_schedule failed");
@@ -73,13 +74,13 @@ main(int argc, char *argv[])
     test2_in = fopen("./test1_out", "r");
 
-    result = mca_pcm_base_recv_schedule(test2_in, schedin,
+    result = mca_pcm_base_recv_schedule(test2_in, &jobid, schedin,
                                         schedin->nodelist);
     if (result != OMPI_SUCCESS) {
         test_failure("recv_schedule failed");
         exit(1);
     }
 
-    mca_pcm_base_send_schedule(test2_out, schedin, schedin->nodelist);
+    result = mca_pcm_base_send_schedule(test2_out, jobid, schedin, schedin->nodelist);
     if (result != OMPI_SUCCESS) {
         test_failure("send_schedule (2) failed");
         exit(1);

View File

@@ -1,5 +1,6 @@
 @MCA_PCM@
 1
+123
 1
 12 ./sched_comm
 3

View File

@@ -1,5 +1,6 @@
 @MCA_PCM@
 1
+123
 1
 12 ./sched_comm
 3