* More changes for the rsh pcm:
- make sure to pass jobid to the spawned process - update test case and bootproxy to pass/receive jobid - work on list splitting code for rsh spawn_procs() This commit was SVN r2212.
Этот коммит содержится в:
родитель
aaca5f3bf6
Коммит
964fd9c758
@ -28,10 +28,12 @@ extern "C" {
|
|||||||
|
|
||||||
|
|
||||||
int mca_pcm_base_send_schedule(FILE *fd,
|
int mca_pcm_base_send_schedule(FILE *fd,
|
||||||
|
int jobid,
|
||||||
ompi_rte_node_schedule_t *sched,
|
ompi_rte_node_schedule_t *sched,
|
||||||
ompi_list_t *nodelist);
|
ompi_list_t *nodelist);
|
||||||
|
|
||||||
int mca_pcm_base_recv_schedule(FILE *fd,
|
int mca_pcm_base_recv_schedule(FILE *fd,
|
||||||
|
int *jobid,
|
||||||
ompi_rte_node_schedule_t *sched,
|
ompi_rte_node_schedule_t *sched,
|
||||||
ompi_list_t *nodelist);
|
ompi_list_t *nodelist);
|
||||||
|
|
||||||
|
@ -17,6 +17,7 @@
|
|||||||
|
|
||||||
int
|
int
|
||||||
mca_pcm_base_send_schedule(FILE *fp,
|
mca_pcm_base_send_schedule(FILE *fp,
|
||||||
|
int jobid,
|
||||||
ompi_rte_node_schedule_t *sched,
|
ompi_rte_node_schedule_t *sched,
|
||||||
ompi_list_t *nodelist)
|
ompi_list_t *nodelist)
|
||||||
{
|
{
|
||||||
@ -28,6 +29,9 @@ mca_pcm_base_send_schedule(FILE *fp,
|
|||||||
fprintf(fp, START_KEY);
|
fprintf(fp, START_KEY);
|
||||||
fprintf(fp, "%d\n", PROTOCOL_VERSION);
|
fprintf(fp, "%d\n", PROTOCOL_VERSION);
|
||||||
|
|
||||||
|
/* JOBID */
|
||||||
|
fprintf(fp, "%d\n", jobid);
|
||||||
|
|
||||||
/* ARGC */
|
/* ARGC */
|
||||||
fprintf(fp, "%d\n", sched->argc);
|
fprintf(fp, "%d\n", sched->argc);
|
||||||
for (i = 0 ; i < sched->argc ; ++i) {
|
for (i = 0 ; i < sched->argc ; ++i) {
|
||||||
@ -130,14 +134,27 @@ get_key(FILE *fp, const char *key)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static int
|
||||||
|
get_int(FILE *fp, int *num)
|
||||||
|
{
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
ret = fscanf(fp, "%d\n", num);
|
||||||
|
if (ret != 1) return OMPI_ERROR;
|
||||||
|
|
||||||
|
return OMPI_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
static int
|
static int
|
||||||
get_check_version(FILE *fp)
|
get_check_version(FILE *fp)
|
||||||
{
|
{
|
||||||
int ret;
|
int ret;
|
||||||
int ver;
|
int ver;
|
||||||
|
|
||||||
ret = fscanf(fp, "%d\n", &ver);
|
ret = get_int(fp, &ver);
|
||||||
if (ret != 1) return OMPI_ERROR;
|
if (OMPI_SUCCESS != ret) return ret;
|
||||||
|
|
||||||
if (ver != PROTOCOL_VERSION) return OMPI_ERROR;
|
if (ver != PROTOCOL_VERSION) return OMPI_ERROR;
|
||||||
|
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
@ -152,8 +169,8 @@ get_string(FILE *fp, char **strp)
|
|||||||
char *str;
|
char *str;
|
||||||
size_t str_read;;
|
size_t str_read;;
|
||||||
|
|
||||||
ret = fscanf(fp, "%d ", &len);
|
ret = get_int(fp, &len);
|
||||||
if (ret != 1) return OMPI_ERROR;
|
if (OMPI_SUCCESS != ret) return ret;
|
||||||
|
|
||||||
str = (char*) malloc(sizeof(char) * (len + 2));
|
str = (char*) malloc(sizeof(char) * (len + 2));
|
||||||
if (NULL == str) return OMPI_ERROR;
|
if (NULL == str) return OMPI_ERROR;
|
||||||
@ -344,6 +361,7 @@ get_nodelist(FILE *fp, ompi_list_t *nodelist)
|
|||||||
|
|
||||||
int
|
int
|
||||||
mca_pcm_base_recv_schedule(FILE *fp,
|
mca_pcm_base_recv_schedule(FILE *fp,
|
||||||
|
int *jobid,
|
||||||
ompi_rte_node_schedule_t *sched,
|
ompi_rte_node_schedule_t *sched,
|
||||||
ompi_list_t *nodelist)
|
ompi_list_t *nodelist)
|
||||||
{
|
{
|
||||||
@ -354,23 +372,27 @@ mca_pcm_base_recv_schedule(FILE *fp,
|
|||||||
if (OMPI_SUCCESS != ret) return ret;
|
if (OMPI_SUCCESS != ret) return ret;
|
||||||
|
|
||||||
/* check our version */
|
/* check our version */
|
||||||
get_check_version(fp);
|
ret = get_check_version(fp);
|
||||||
|
if (OMPI_SUCCESS != ret) return ret;
|
||||||
|
|
||||||
|
/* get our jobid */
|
||||||
|
ret = get_int(fp, jobid);
|
||||||
if (OMPI_SUCCESS != ret) return ret;
|
if (OMPI_SUCCESS != ret) return ret;
|
||||||
|
|
||||||
/* get argc */
|
/* get argc */
|
||||||
get_argv_array(fp, &(sched->argc), &(sched->argv));
|
ret = get_argv_array(fp, &(sched->argc), &(sched->argv));
|
||||||
if (OMPI_SUCCESS != ret) return ret;
|
if (OMPI_SUCCESS != ret) return ret;
|
||||||
|
|
||||||
/* get env */
|
/* get env */
|
||||||
get_argv_array(fp, &val, &(sched->env));
|
ret = get_argv_array(fp, &val, &(sched->env));
|
||||||
if (OMPI_SUCCESS != ret) return ret;
|
if (OMPI_SUCCESS != ret) return ret;
|
||||||
|
|
||||||
/* get cwd */
|
/* get cwd */
|
||||||
get_string(fp, &(sched->cwd));
|
ret = get_string(fp, &(sched->cwd));
|
||||||
if (OMPI_SUCCESS != ret) return ret;
|
if (OMPI_SUCCESS != ret) return ret;
|
||||||
|
|
||||||
/* get node list */
|
/* get node list */
|
||||||
get_nodelist(fp, nodelist);
|
ret = get_nodelist(fp, nodelist);
|
||||||
if (OMPI_SUCCESS != ret) return ret;
|
if (OMPI_SUCCESS != ret) return ret;
|
||||||
|
|
||||||
/* make sure we have our end */
|
/* make sure we have our end */
|
||||||
|
@ -64,7 +64,7 @@ ompi_output_stream_t mca_pcm_rsh_output_stream = {
|
|||||||
false, /* lds_want_syslog */
|
false, /* lds_want_syslog */
|
||||||
0, /* lds_syslog_priority */
|
0, /* lds_syslog_priority */
|
||||||
NULL, /* lds_syslog_ident */
|
NULL, /* lds_syslog_ident */
|
||||||
"pcm_rsh", /* lds_prefix */
|
"pcm_rsh: ", /* lds_prefix */
|
||||||
true, /* lds_want_stdout */
|
true, /* lds_want_stdout */
|
||||||
false, /* lds_want_stderr */
|
false, /* lds_want_stderr */
|
||||||
true, /* lds_want_file */
|
true, /* lds_want_file */
|
||||||
@ -76,7 +76,6 @@ ompi_output_stream_t mca_pcm_rsh_output_stream = {
|
|||||||
/*
|
/*
|
||||||
* Module variables handles
|
* Module variables handles
|
||||||
*/
|
*/
|
||||||
static int mca_pcm_rsh_param_no_n;
|
|
||||||
static int mca_pcm_rsh_param_no_profile;
|
static int mca_pcm_rsh_param_no_profile;
|
||||||
static int mca_pcm_rsh_param_fast;
|
static int mca_pcm_rsh_param_fast;
|
||||||
static int mca_pcm_rsh_param_ignore_stderr;
|
static int mca_pcm_rsh_param_ignore_stderr;
|
||||||
@ -87,14 +86,18 @@ static int mca_pcm_rsh_param_debug;
|
|||||||
/*
|
/*
|
||||||
* Module variables
|
* Module variables
|
||||||
*/
|
*/
|
||||||
int mca_pcm_rsh_no_n;
|
/* should we avoid running .profile, even if the shell says we should */
|
||||||
int mca_pcm_rsh_no_profile;
|
int mca_pcm_rsh_no_profile;
|
||||||
|
/* should we assume same shell on remote as locally? */
|
||||||
int mca_pcm_rsh_fast;
|
int mca_pcm_rsh_fast;
|
||||||
|
/* should we ignore things on stderr? */
|
||||||
int mca_pcm_rsh_ignore_stderr;
|
int mca_pcm_rsh_ignore_stderr;
|
||||||
|
/* how should we fire procs up on the remote side? */
|
||||||
char *mca_pcm_rsh_agent;
|
char *mca_pcm_rsh_agent;
|
||||||
|
|
||||||
int mca_pcm_rsh_output = 0;
|
int mca_pcm_rsh_output = 0;
|
||||||
mca_llm_base_module_t mca_pcm_rsh_llm;
|
|
||||||
|
static mca_llm_base_module_t mca_pcm_rsh_llm;
|
||||||
|
|
||||||
int
|
int
|
||||||
mca_pcm_rsh_component_open(void)
|
mca_pcm_rsh_component_open(void)
|
||||||
@ -106,8 +109,6 @@ mca_pcm_rsh_component_open(void)
|
|||||||
mca_base_param_register_string("pcm", "rsh", "agent", NULL,
|
mca_base_param_register_string("pcm", "rsh", "agent", NULL,
|
||||||
"ssh");
|
"ssh");
|
||||||
|
|
||||||
mca_pcm_rsh_param_no_n =
|
|
||||||
mca_base_param_register_int("pcm", "rsh", "no_n", NULL, 0);
|
|
||||||
mca_pcm_rsh_param_no_profile =
|
mca_pcm_rsh_param_no_profile =
|
||||||
mca_base_param_register_int("pcm", "rsh", "no_profile", NULL, 0);
|
mca_base_param_register_int("pcm", "rsh", "no_profile", NULL, 0);
|
||||||
mca_pcm_rsh_param_fast =
|
mca_pcm_rsh_param_fast =
|
||||||
@ -143,8 +144,6 @@ mca_pcm_rsh_init(int *priority,
|
|||||||
|
|
||||||
mca_base_param_lookup_int(mca_pcm_rsh_param_priority, priority);
|
mca_base_param_lookup_int(mca_pcm_rsh_param_priority, priority);
|
||||||
|
|
||||||
mca_base_param_lookup_int(mca_pcm_rsh_param_no_n,
|
|
||||||
&mca_pcm_rsh_no_n);
|
|
||||||
mca_base_param_lookup_int(mca_pcm_rsh_param_no_profile,
|
mca_base_param_lookup_int(mca_pcm_rsh_param_no_profile,
|
||||||
&mca_pcm_rsh_no_profile);
|
&mca_pcm_rsh_no_profile);
|
||||||
mca_base_param_lookup_int(mca_pcm_rsh_param_fast,
|
mca_base_param_lookup_int(mca_pcm_rsh_param_fast,
|
||||||
@ -186,6 +185,10 @@ mca_pcm_rsh_finalize(void)
|
|||||||
ompi_output_close(mca_pcm_rsh_output);
|
ompi_output_close(mca_pcm_rsh_output);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (NULL == mca_pcm_rsh_1_0_0.pcm_allocate_resources) {
|
||||||
|
mca_pcm_rsh_1_0_0.pcm_allocate_resources = NULL;
|
||||||
|
mca_pcm_rsh_1_0_0.pcm_deallocate_resources = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -11,6 +11,10 @@
|
|||||||
#include "runtime/runtime_types.h"
|
#include "runtime/runtime_types.h"
|
||||||
|
|
||||||
|
|
||||||
|
static int internal_spawn_proc(int jobid, ompi_rte_node_schedule_t *sched,
|
||||||
|
ompi_list_t *nodelist);
|
||||||
|
|
||||||
|
|
||||||
bool
|
bool
|
||||||
mca_pcm_rsh_can_spawn(void)
|
mca_pcm_rsh_can_spawn(void)
|
||||||
{
|
{
|
||||||
@ -28,27 +32,75 @@ mca_pcm_rsh_spawn_procs(int jobid, ompi_list_t *schedlist)
|
|||||||
ompi_rte_node_schedule_t *sched;
|
ompi_rte_node_schedule_t *sched;
|
||||||
ompi_rte_node_allocation_t *node;
|
ompi_rte_node_allocation_t *node;
|
||||||
ompi_list_t launch;
|
ompi_list_t launch;
|
||||||
|
ompi_list_t done;
|
||||||
|
int ret, i;
|
||||||
|
int width = 1;
|
||||||
|
|
||||||
OBJ_CONSTRUCT(&launch, ompi_list_t);
|
OBJ_CONSTRUCT(&launch, ompi_list_t);
|
||||||
|
OBJ_CONSTRUCT(&done, ompi_list_t);
|
||||||
|
|
||||||
for (sched_item = ompi_list_get_first(schedlist) ;
|
for (sched_item = ompi_list_get_first(schedlist) ;
|
||||||
sched_item != ompi_list_get_end(schedlist) ;
|
sched_item != ompi_list_get_end(schedlist) ;
|
||||||
sched_item = ompi_list_get_next(sched_item)) {
|
sched_item = ompi_list_get_next(sched_item)) {
|
||||||
sched = (ompi_rte_node_schedule_t*) sched_item;
|
sched = (ompi_rte_node_schedule_t*) sched_item;
|
||||||
|
|
||||||
/* when we start doing tree based, more logic here... */
|
/*
|
||||||
for (node_item = ompi_list_get_first(sched->nodelist) ;
|
* make sure I'm the first node in the list and then start our
|
||||||
node_item != ompi_list_get_end(sched->nodelist) ;
|
* deal. We rsh me just like everyone else so that we don't
|
||||||
node_item = ompi_list_get_next(node_item)) {
|
* have any unexpected environment oddities...
|
||||||
node = (ompi_rte_node_allocation_t*) node_item;
|
*/
|
||||||
|
/* BWB - do front of list check! */
|
||||||
|
node_item = ompi_list_get_first(sched->nodelist);
|
||||||
|
|
||||||
/* we don't need to push nodes down to the compute places,
|
while (node_item != ompi_list_get_end(sched->nodelist)) {
|
||||||
so don't do it... */
|
/* find enough entries for this slice to go */
|
||||||
|
for (i = 0 ;
|
||||||
}
|
i < width && node_item != ompi_list_get_end(sched->nodelist) ;
|
||||||
|
node_item = ompi_list_get_next(node_item)) { }
|
||||||
|
/* if we don't have anyone, get us out of here.. */
|
||||||
|
if (i == 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* make a launch list */
|
||||||
|
ompi_list_splice(&launch, ompi_list_get_end(&launch),
|
||||||
|
sched->nodelist,
|
||||||
|
ompi_list_get_first(sched->nodelist),
|
||||||
|
node_item);
|
||||||
|
|
||||||
|
/* do the launch to the first node in the list, passing
|
||||||
|
him the rest of the list */
|
||||||
|
ret = internal_spawn_proc(jobid, sched, &launch);
|
||||||
|
if (OMPI_SUCCESS != ret) {
|
||||||
|
/* well, crap! put ourselves back together, I guess.
|
||||||
|
Should call killjob */
|
||||||
|
ompi_list_join(&done, ompi_list_get_end(&done), &launch);
|
||||||
|
ompi_list_join(sched->nodelist,
|
||||||
|
ompi_list_get_first(sched->nodelist),
|
||||||
|
&done);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* copy the list over to the done part */
|
||||||
|
ompi_list_join(&done, ompi_list_get_end(&done), &launch);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
OBJ_DESTRUCT(&done);
|
||||||
OBJ_DESTRUCT(&launch);
|
OBJ_DESTRUCT(&launch);
|
||||||
|
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static int
|
||||||
|
internal_spawn_proc(int jobid, ompi_rte_node_schedule_t *sched,
|
||||||
|
ompi_list_t *nodelist)
|
||||||
|
{
|
||||||
|
/* ok, we rsh to the first guy in the list, then pass the whole
|
||||||
|
nodelist */
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
return OMPI_SUCCESS;
|
||||||
|
}
|
||||||
|
@ -21,6 +21,7 @@ main(int argc, char *argv[])
|
|||||||
pid_t pid;
|
pid_t pid;
|
||||||
int i;
|
int i;
|
||||||
int ret;
|
int ret;
|
||||||
|
int jobid;
|
||||||
|
|
||||||
ompi_init(argc, argv);
|
ompi_init(argc, argv);
|
||||||
|
|
||||||
@ -30,7 +31,7 @@ main(int argc, char *argv[])
|
|||||||
sched = OBJ_NEW(ompi_rte_node_schedule_t);
|
sched = OBJ_NEW(ompi_rte_node_schedule_t);
|
||||||
|
|
||||||
/* recv_schedule wants an already initialized ompi_list_t */
|
/* recv_schedule wants an already initialized ompi_list_t */
|
||||||
ret = mca_pcm_base_recv_schedule(stdin, sched,
|
ret = mca_pcm_base_recv_schedule(stdin, &jobid, sched,
|
||||||
sched->nodelist);
|
sched->nodelist);
|
||||||
if (ret != OMPI_SUCCESS) {
|
if (ret != OMPI_SUCCESS) {
|
||||||
fprintf(stderr, "Failure in receiving schedule information\n");
|
fprintf(stderr, "Failure in receiving schedule information\n");
|
||||||
|
@ -29,6 +29,7 @@ main(int argc, char *argv[])
|
|||||||
FILE *test2_out=NULL; /* output file for second test */
|
FILE *test2_out=NULL; /* output file for second test */
|
||||||
FILE *test2_in = NULL;
|
FILE *test2_in = NULL;
|
||||||
int result; /* result of system call */
|
int result; /* result of system call */
|
||||||
|
int jobid = 123;
|
||||||
|
|
||||||
test_init("sched_comm_t");
|
test_init("sched_comm_t");
|
||||||
|
|
||||||
@ -50,7 +51,7 @@ main(int argc, char *argv[])
|
|||||||
schedout->env = env;
|
schedout->env = env;
|
||||||
schedout->cwd = "/foo/bar/baz";
|
schedout->cwd = "/foo/bar/baz";
|
||||||
|
|
||||||
result = mca_pcm_base_send_schedule(test1_out, schedout,
|
result = mca_pcm_base_send_schedule(test1_out, jobid, schedout,
|
||||||
schedout->nodelist);
|
schedout->nodelist);
|
||||||
if (result != OMPI_SUCCESS) {
|
if (result != OMPI_SUCCESS) {
|
||||||
test_failure("send_schedule failed");
|
test_failure("send_schedule failed");
|
||||||
@ -73,13 +74,13 @@ main(int argc, char *argv[])
|
|||||||
|
|
||||||
test2_in = fopen("./test1_out", "r");
|
test2_in = fopen("./test1_out", "r");
|
||||||
|
|
||||||
result = mca_pcm_base_recv_schedule(test2_in, schedin,
|
result = mca_pcm_base_recv_schedule(test2_in, &jobid, schedin,
|
||||||
schedin->nodelist);
|
schedin->nodelist);
|
||||||
if (result != OMPI_SUCCESS) {
|
if (result != OMPI_SUCCESS) {
|
||||||
test_failure("recv_schedule failed");
|
test_failure("recv_schedule failed");
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
mca_pcm_base_send_schedule(test2_out, schedin, schedin->nodelist);
|
mca_pcm_base_send_schedule(test2_out, jobid, schedin, schedin->nodelist);
|
||||||
if (result != OMPI_SUCCESS) {
|
if (result != OMPI_SUCCESS) {
|
||||||
test_failure("send_schedule (2) failed");
|
test_failure("send_schedule (2) failed");
|
||||||
exit(1);
|
exit(1);
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
@MCA_PCM@
|
@MCA_PCM@
|
||||||
1
|
1
|
||||||
|
123
|
||||||
1
|
1
|
||||||
12 ./sched_comm
|
12 ./sched_comm
|
||||||
3
|
3
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
@MCA_PCM@
|
@MCA_PCM@
|
||||||
1
|
1
|
||||||
|
123
|
||||||
1
|
1
|
||||||
12 ./sched_comm
|
12 ./sched_comm
|
||||||
3
|
3
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user