* More changes for the rsh pcm:
- make sure to pass jobid to the spawned process - update test case and bootproxy to pass/receive jobid - work on list splitting code for rsh spawn_procs() This commit was SVN r2212.
Этот коммит содержится в:
родитель
aaca5f3bf6
Коммит
964fd9c758
@ -28,10 +28,12 @@ extern "C" {
|
||||
|
||||
|
||||
int mca_pcm_base_send_schedule(FILE *fd,
|
||||
int jobid,
|
||||
ompi_rte_node_schedule_t *sched,
|
||||
ompi_list_t *nodelist);
|
||||
|
||||
int mca_pcm_base_recv_schedule(FILE *fd,
|
||||
int *jobid,
|
||||
ompi_rte_node_schedule_t *sched,
|
||||
ompi_list_t *nodelist);
|
||||
|
||||
|
@ -17,6 +17,7 @@
|
||||
|
||||
int
|
||||
mca_pcm_base_send_schedule(FILE *fp,
|
||||
int jobid,
|
||||
ompi_rte_node_schedule_t *sched,
|
||||
ompi_list_t *nodelist)
|
||||
{
|
||||
@ -28,6 +29,9 @@ mca_pcm_base_send_schedule(FILE *fp,
|
||||
fprintf(fp, START_KEY);
|
||||
fprintf(fp, "%d\n", PROTOCOL_VERSION);
|
||||
|
||||
/* JOBID */
|
||||
fprintf(fp, "%d\n", jobid);
|
||||
|
||||
/* ARGC */
|
||||
fprintf(fp, "%d\n", sched->argc);
|
||||
for (i = 0 ; i < sched->argc ; ++i) {
|
||||
@ -130,14 +134,27 @@ get_key(FILE *fp, const char *key)
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
get_int(FILE *fp, int *num)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = fscanf(fp, "%d\n", num);
|
||||
if (ret != 1) return OMPI_ERROR;
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
get_check_version(FILE *fp)
|
||||
{
|
||||
int ret;
|
||||
int ver;
|
||||
|
||||
ret = fscanf(fp, "%d\n", &ver);
|
||||
if (ret != 1) return OMPI_ERROR;
|
||||
ret = get_int(fp, &ver);
|
||||
if (OMPI_SUCCESS != ret) return ret;
|
||||
|
||||
if (ver != PROTOCOL_VERSION) return OMPI_ERROR;
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
@ -152,8 +169,8 @@ get_string(FILE *fp, char **strp)
|
||||
char *str;
|
||||
size_t str_read;;
|
||||
|
||||
ret = fscanf(fp, "%d ", &len);
|
||||
if (ret != 1) return OMPI_ERROR;
|
||||
ret = get_int(fp, &len);
|
||||
if (OMPI_SUCCESS != ret) return ret;
|
||||
|
||||
str = (char*) malloc(sizeof(char) * (len + 2));
|
||||
if (NULL == str) return OMPI_ERROR;
|
||||
@ -344,6 +361,7 @@ get_nodelist(FILE *fp, ompi_list_t *nodelist)
|
||||
|
||||
int
|
||||
mca_pcm_base_recv_schedule(FILE *fp,
|
||||
int *jobid,
|
||||
ompi_rte_node_schedule_t *sched,
|
||||
ompi_list_t *nodelist)
|
||||
{
|
||||
@ -354,23 +372,27 @@ mca_pcm_base_recv_schedule(FILE *fp,
|
||||
if (OMPI_SUCCESS != ret) return ret;
|
||||
|
||||
/* check our version */
|
||||
get_check_version(fp);
|
||||
ret = get_check_version(fp);
|
||||
if (OMPI_SUCCESS != ret) return ret;
|
||||
|
||||
/* get our jobid */
|
||||
ret = get_int(fp, jobid);
|
||||
if (OMPI_SUCCESS != ret) return ret;
|
||||
|
||||
/* get argc */
|
||||
get_argv_array(fp, &(sched->argc), &(sched->argv));
|
||||
ret = get_argv_array(fp, &(sched->argc), &(sched->argv));
|
||||
if (OMPI_SUCCESS != ret) return ret;
|
||||
|
||||
/* get env */
|
||||
get_argv_array(fp, &val, &(sched->env));
|
||||
ret = get_argv_array(fp, &val, &(sched->env));
|
||||
if (OMPI_SUCCESS != ret) return ret;
|
||||
|
||||
/* get cwd */
|
||||
get_string(fp, &(sched->cwd));
|
||||
ret = get_string(fp, &(sched->cwd));
|
||||
if (OMPI_SUCCESS != ret) return ret;
|
||||
|
||||
/* get node list */
|
||||
get_nodelist(fp, nodelist);
|
||||
ret = get_nodelist(fp, nodelist);
|
||||
if (OMPI_SUCCESS != ret) return ret;
|
||||
|
||||
/* make sure we have our end */
|
||||
|
@ -64,7 +64,7 @@ ompi_output_stream_t mca_pcm_rsh_output_stream = {
|
||||
false, /* lds_want_syslog */
|
||||
0, /* lds_syslog_priority */
|
||||
NULL, /* lds_syslog_ident */
|
||||
"pcm_rsh", /* lds_prefix */
|
||||
"pcm_rsh: ", /* lds_prefix */
|
||||
true, /* lds_want_stdout */
|
||||
false, /* lds_want_stderr */
|
||||
true, /* lds_want_file */
|
||||
@ -76,7 +76,6 @@ ompi_output_stream_t mca_pcm_rsh_output_stream = {
|
||||
/*
|
||||
* Module variables handles
|
||||
*/
|
||||
static int mca_pcm_rsh_param_no_n;
|
||||
static int mca_pcm_rsh_param_no_profile;
|
||||
static int mca_pcm_rsh_param_fast;
|
||||
static int mca_pcm_rsh_param_ignore_stderr;
|
||||
@ -87,14 +86,18 @@ static int mca_pcm_rsh_param_debug;
|
||||
/*
|
||||
* Module variables
|
||||
*/
|
||||
int mca_pcm_rsh_no_n;
|
||||
/* should we avoid running .profile, even if the shell says we should */
|
||||
int mca_pcm_rsh_no_profile;
|
||||
/* should we assume same shell on remote as locally? */
|
||||
int mca_pcm_rsh_fast;
|
||||
/* should we ignore things on stderr? */
|
||||
int mca_pcm_rsh_ignore_stderr;
|
||||
/* how should we fire procs up on the remote side? */
|
||||
char *mca_pcm_rsh_agent;
|
||||
|
||||
int mca_pcm_rsh_output = 0;
|
||||
mca_llm_base_module_t mca_pcm_rsh_llm;
|
||||
|
||||
static mca_llm_base_module_t mca_pcm_rsh_llm;
|
||||
|
||||
int
|
||||
mca_pcm_rsh_component_open(void)
|
||||
@ -106,8 +109,6 @@ mca_pcm_rsh_component_open(void)
|
||||
mca_base_param_register_string("pcm", "rsh", "agent", NULL,
|
||||
"ssh");
|
||||
|
||||
mca_pcm_rsh_param_no_n =
|
||||
mca_base_param_register_int("pcm", "rsh", "no_n", NULL, 0);
|
||||
mca_pcm_rsh_param_no_profile =
|
||||
mca_base_param_register_int("pcm", "rsh", "no_profile", NULL, 0);
|
||||
mca_pcm_rsh_param_fast =
|
||||
@ -143,8 +144,6 @@ mca_pcm_rsh_init(int *priority,
|
||||
|
||||
mca_base_param_lookup_int(mca_pcm_rsh_param_priority, priority);
|
||||
|
||||
mca_base_param_lookup_int(mca_pcm_rsh_param_no_n,
|
||||
&mca_pcm_rsh_no_n);
|
||||
mca_base_param_lookup_int(mca_pcm_rsh_param_no_profile,
|
||||
&mca_pcm_rsh_no_profile);
|
||||
mca_base_param_lookup_int(mca_pcm_rsh_param_fast,
|
||||
@ -186,6 +185,10 @@ mca_pcm_rsh_finalize(void)
|
||||
ompi_output_close(mca_pcm_rsh_output);
|
||||
}
|
||||
|
||||
if (NULL == mca_pcm_rsh_1_0_0.pcm_allocate_resources) {
|
||||
mca_pcm_rsh_1_0_0.pcm_allocate_resources = NULL;
|
||||
mca_pcm_rsh_1_0_0.pcm_deallocate_resources = NULL;
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -11,6 +11,10 @@
|
||||
#include "runtime/runtime_types.h"
|
||||
|
||||
|
||||
static int internal_spawn_proc(int jobid, ompi_rte_node_schedule_t *sched,
|
||||
ompi_list_t *nodelist);
|
||||
|
||||
|
||||
bool
|
||||
mca_pcm_rsh_can_spawn(void)
|
||||
{
|
||||
@ -28,27 +32,75 @@ mca_pcm_rsh_spawn_procs(int jobid, ompi_list_t *schedlist)
|
||||
ompi_rte_node_schedule_t *sched;
|
||||
ompi_rte_node_allocation_t *node;
|
||||
ompi_list_t launch;
|
||||
ompi_list_t done;
|
||||
int ret, i;
|
||||
int width = 1;
|
||||
|
||||
OBJ_CONSTRUCT(&launch, ompi_list_t);
|
||||
OBJ_CONSTRUCT(&done, ompi_list_t);
|
||||
|
||||
for (sched_item = ompi_list_get_first(schedlist) ;
|
||||
sched_item != ompi_list_get_end(schedlist) ;
|
||||
sched_item = ompi_list_get_next(sched_item)) {
|
||||
sched = (ompi_rte_node_schedule_t*) sched_item;
|
||||
|
||||
/* when we start doing tree based, more logic here... */
|
||||
for (node_item = ompi_list_get_first(sched->nodelist) ;
|
||||
node_item != ompi_list_get_end(sched->nodelist) ;
|
||||
node_item = ompi_list_get_next(node_item)) {
|
||||
node = (ompi_rte_node_allocation_t*) node_item;
|
||||
/*
|
||||
* make sure I'm the first node in the list and then start our
|
||||
* deal. We rsh me just like everyone else so that we don't
|
||||
* have any unexpected environment oddities...
|
||||
*/
|
||||
/* BWB - do front of list check! */
|
||||
node_item = ompi_list_get_first(sched->nodelist);
|
||||
|
||||
/* we don't need to push nodes down to the compute places,
|
||||
so don't do it... */
|
||||
while (node_item != ompi_list_get_end(sched->nodelist)) {
|
||||
/* find enough entries for this slice to go */
|
||||
for (i = 0 ;
|
||||
i < width && node_item != ompi_list_get_end(sched->nodelist) ;
|
||||
node_item = ompi_list_get_next(node_item)) { }
|
||||
/* if we don't have anyone, get us out of here.. */
|
||||
if (i == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* make a launch list */
|
||||
ompi_list_splice(&launch, ompi_list_get_end(&launch),
|
||||
sched->nodelist,
|
||||
ompi_list_get_first(sched->nodelist),
|
||||
node_item);
|
||||
|
||||
/* do the launch to the first node in the list, passing
|
||||
him the rest of the list */
|
||||
ret = internal_spawn_proc(jobid, sched, &launch);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
/* well, crap! put ourselves back together, I guess.
|
||||
Should call killjob */
|
||||
ompi_list_join(&done, ompi_list_get_end(&done), &launch);
|
||||
ompi_list_join(sched->nodelist,
|
||||
ompi_list_get_first(sched->nodelist),
|
||||
&done);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* copy the list over to the done part */
|
||||
ompi_list_join(&done, ompi_list_get_end(&done), &launch);
|
||||
}
|
||||
}
|
||||
|
||||
OBJ_DESTRUCT(&done);
|
||||
OBJ_DESTRUCT(&launch);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
internal_spawn_proc(int jobid, ompi_rte_node_schedule_t *sched,
|
||||
ompi_list_t *nodelist)
|
||||
{
|
||||
/* ok, we rsh to the first guy in the list, then pass the whole
|
||||
nodelist */
|
||||
|
||||
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
@ -21,6 +21,7 @@ main(int argc, char *argv[])
|
||||
pid_t pid;
|
||||
int i;
|
||||
int ret;
|
||||
int jobid;
|
||||
|
||||
ompi_init(argc, argv);
|
||||
|
||||
@ -30,7 +31,7 @@ main(int argc, char *argv[])
|
||||
sched = OBJ_NEW(ompi_rte_node_schedule_t);
|
||||
|
||||
/* recv_schedule wants an already initialized ompi_list_t */
|
||||
ret = mca_pcm_base_recv_schedule(stdin, sched,
|
||||
ret = mca_pcm_base_recv_schedule(stdin, &jobid, sched,
|
||||
sched->nodelist);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
fprintf(stderr, "Failure in receiving schedule information\n");
|
||||
|
@ -29,6 +29,7 @@ main(int argc, char *argv[])
|
||||
FILE *test2_out=NULL; /* output file for second test */
|
||||
FILE *test2_in = NULL;
|
||||
int result; /* result of system call */
|
||||
int jobid = 123;
|
||||
|
||||
test_init("sched_comm_t");
|
||||
|
||||
@ -50,7 +51,7 @@ main(int argc, char *argv[])
|
||||
schedout->env = env;
|
||||
schedout->cwd = "/foo/bar/baz";
|
||||
|
||||
result = mca_pcm_base_send_schedule(test1_out, schedout,
|
||||
result = mca_pcm_base_send_schedule(test1_out, jobid, schedout,
|
||||
schedout->nodelist);
|
||||
if (result != OMPI_SUCCESS) {
|
||||
test_failure("send_schedule failed");
|
||||
@ -73,13 +74,13 @@ main(int argc, char *argv[])
|
||||
|
||||
test2_in = fopen("./test1_out", "r");
|
||||
|
||||
result = mca_pcm_base_recv_schedule(test2_in, schedin,
|
||||
result = mca_pcm_base_recv_schedule(test2_in, &jobid, schedin,
|
||||
schedin->nodelist);
|
||||
if (result != OMPI_SUCCESS) {
|
||||
test_failure("recv_schedule failed");
|
||||
exit(1);
|
||||
}
|
||||
mca_pcm_base_send_schedule(test2_out, schedin, schedin->nodelist);
|
||||
mca_pcm_base_send_schedule(test2_out, jobid, schedin, schedin->nodelist);
|
||||
if (result != OMPI_SUCCESS) {
|
||||
test_failure("send_schedule (2) failed");
|
||||
exit(1);
|
||||
|
@ -1,5 +1,6 @@
|
||||
@MCA_PCM@
|
||||
1
|
||||
123
|
||||
1
|
||||
12 ./sched_comm
|
||||
3
|
||||
|
@ -1,5 +1,6 @@
|
||||
@MCA_PCM@
|
||||
1
|
||||
123
|
||||
1
|
||||
12 ./sched_comm
|
||||
3
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user