* some more (mostly untested, but compiling) code for the cofs pcm module
* some comments on what needs to be done in the MPI_INIT C interface. More coming tomorrow.... This commit was SVN r328.
Этот коммит содержится в:
родитель
e4ac0bc98a
Коммит
3fc789e72e
@ -21,8 +21,9 @@
|
||||
|
||||
static int handle_new_count = 0;
|
||||
|
||||
|
||||
int
|
||||
mca_pcm_cofs_query_get_nodes(lam_pcm_node_t ** nodes, size_t * nodes_len,
|
||||
mca_pcm_cofs_query_get_nodes(mca_pcm_rte_node_t **nodes, size_t * nodes_len,
|
||||
int *available_procs)
|
||||
{
|
||||
*nodes = NULL;
|
||||
@ -40,6 +41,7 @@ mca_pcm_cofs_handle_new(lam_job_handle_t parent)
|
||||
char *ret;
|
||||
size_t ret_len;
|
||||
|
||||
/* should really make this a file lookup kind of thing */
|
||||
pid = getpid();
|
||||
|
||||
ret_len = sizeof(pid_t) * 8 + strlen("pcm_cofs_job_handle") + sizeof(int) * 8 + 5;
|
||||
@ -65,7 +67,10 @@ mca_pcm_cofs_handle_get(void)
|
||||
void
|
||||
mca_pcm_cofs_handle_free(lam_job_handle_t * job_handle)
|
||||
{
|
||||
if (*job_handle != NULL) {
|
||||
if (*job_handle == mca_pcm_cofs_my_handle) {
|
||||
printf("WARNING: attempting to free static internal job handle!\n");
|
||||
printf(" Did you perhaps try to free the return from handle_get()?\n");
|
||||
} else if (*job_handle != NULL) {
|
||||
LAM_FREE(*job_handle);
|
||||
*job_handle = NULL;
|
||||
}
|
||||
@ -75,30 +80,38 @@ mca_pcm_cofs_handle_free(lam_job_handle_t * job_handle)
|
||||
int
|
||||
mca_pcm_cofs_job_can_spawn(lam_job_handle_t job_handle)
|
||||
{
|
||||
#if 1
|
||||
/* Currently, have not coded up spawning support. Need to do
|
||||
so soon */
|
||||
return NULL;
|
||||
#else
|
||||
if (job_handle != NULL) {
|
||||
return LAM_ERR_NOT_SUPPORTED;
|
||||
} else {
|
||||
return LAM_SUCCESS;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
mca_pcm_cofs_job_set_arguments(lam_job_handle_t job_handle,
|
||||
lam_pcm_control_args_t * opts,
|
||||
mca_pcm_control_args_t * opts,
|
||||
size_t opts_len)
|
||||
{
|
||||
/* need to implement, but not needed to get INIT going */
|
||||
return LAM_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
mca_pcm_cofs_job_launch_procs(lam_job_handle_t job_handle,
|
||||
lam_pcm_node_t * nodes,
|
||||
mca_pcm_rte_node_t *nodes,
|
||||
size_t nodes_len, const char *file,
|
||||
int argc, const char *argv[],
|
||||
const char *env[])
|
||||
{
|
||||
/* need to implement, but not needed to get INIT going */
|
||||
return LAM_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
@ -106,6 +119,7 @@ mca_pcm_cofs_job_launch_procs(lam_job_handle_t job_handle,
|
||||
int
|
||||
mca_pcm_cofs_job_rendezvous(lam_job_handle_t job_handle)
|
||||
{
|
||||
/* need to implement, but not needed to get INIT going */
|
||||
return LAM_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
@ -113,6 +127,7 @@ mca_pcm_cofs_job_rendezvous(lam_job_handle_t job_handle)
|
||||
int
|
||||
mca_pcm_cofs_job_wait(lam_job_handle_t job_handle)
|
||||
{
|
||||
/* need to implement, but not needed to get INIT going */
|
||||
return LAM_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
@ -121,6 +136,7 @@ int
|
||||
mca_pcm_cofs_job_running(lam_job_handle_t job_handle,
|
||||
int *running)
|
||||
{
|
||||
/* need to implement, but not needed to get INIT going */
|
||||
return LAM_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
@ -129,6 +145,7 @@ int
|
||||
mca_pcm_cofs_job_list_running(lam_job_handle_t ** handles,
|
||||
size_t handles_len)
|
||||
{
|
||||
/* need to implement, but not needed to get INIT going */
|
||||
return LAM_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
@ -136,21 +153,46 @@ mca_pcm_cofs_job_list_running(lam_job_handle_t ** handles,
|
||||
int
|
||||
mca_pcm_cofs_proc_startup(void)
|
||||
{
|
||||
return LAM_ERR_NOT_IMPLEMENTED;
|
||||
int i;
|
||||
|
||||
if (mca_pcm_cofs_nprocs == 0) {
|
||||
/* well, this really shouldn't happen - we know we have at least ourselves */
|
||||
return LAM_ERR_FATAL;
|
||||
}
|
||||
|
||||
mca_pcm_cofs_procs = LAM_MALLOC(sizeof(mca_pcm_proc_t) * mca_pcm_cofs_nprocs);
|
||||
if (mca_pcm_cofs_procs == NULL) {
|
||||
return LAM_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
for (i = 0 ; i < mca_pcm_cofs_nprocs ; ++i) {
|
||||
/* for now, assume everyone in the same job :( */
|
||||
mca_pcm_cofs_procs[i].job_handle = mca_pcm_cofs_handle_get();
|
||||
mca_pcm_cofs_procs[i].vpid = i;
|
||||
}
|
||||
|
||||
return LAM_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
mca_pcm_cofs_proc_get_peers(void)
|
||||
mca_pcm_cofs_proc_get_peers(mca_pcm_proc_t **procs, size_t *nprocs)
|
||||
{
|
||||
return LAM_ERR_NOT_IMPLEMENTED;
|
||||
if (mca_pcm_cofs_procs == NULL) {
|
||||
return LAM_ERROR;
|
||||
}
|
||||
|
||||
*procs = mca_pcm_cofs_procs;
|
||||
*nprocs = mca_pcm_cofs_nprocs;
|
||||
|
||||
return LAM_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
mca_pcm_proc_t*
|
||||
mca_pcm_cofs_proc_get_me(void)
|
||||
{
|
||||
return LAM_ERR_NOT_IMPLEMENTED;
|
||||
return &(mca_pcm_cofs_procs[mca_pcm_cofs_my_vpid]);
|
||||
}
|
||||
|
||||
|
||||
|
@ -25,7 +25,7 @@ int mca_pcm_cofs_finalize(void);
|
||||
/*
|
||||
* "Action" functions
|
||||
*/
|
||||
int mca_pcm_cofs_query_get_nodes(lam_pcm_node_t **nodes, size_t *nodes_len,
|
||||
int mca_pcm_cofs_query_get_nodes(mca_pcm_rte_node_t **nodes, size_t *nodes_len,
|
||||
int *available_procs);
|
||||
|
||||
lam_job_handle_t mca_pcm_cofs_handle_new(lam_job_handle_t parent);
|
||||
@ -34,10 +34,10 @@ void mca_pcm_cofs_handle_free(lam_job_handle_t *job_handle);
|
||||
|
||||
int mca_pcm_cofs_job_can_spawn(lam_job_handle_t job_handle);
|
||||
int mca_pcm_cofs_job_set_arguments(lam_job_handle_t job_handle,
|
||||
lam_pcm_control_args_t* opts,
|
||||
mca_pcm_control_args_t* opts,
|
||||
size_t opts_len);
|
||||
int mca_pcm_cofs_job_launch_procs(lam_job_handle_t job_handle,
|
||||
lam_pcm_node_t *nodes,
|
||||
mca_pcm_rte_node_t *nodes,
|
||||
size_t nodes_len, const char* file,
|
||||
int argc, const char* argv[],
|
||||
const char *env[]);
|
||||
@ -49,10 +49,15 @@ int mca_pcm_cofs_job_list_running(lam_job_handle_t **handles,
|
||||
size_t handles_len);
|
||||
|
||||
int mca_pcm_cofs_proc_startup(void);
|
||||
int mca_pcm_cofs_proc_get_peers(void);
|
||||
int mca_pcm_cofs_proc_get_me(void);
|
||||
int mca_pcm_cofs_proc_get_peers(mca_pcm_proc_t **procs, size_t *nprocs);
|
||||
mca_pcm_proc_t* mca_pcm_cofs_proc_get_me(void);
|
||||
int mca_pcm_cofs_proc_get_parent(void);
|
||||
|
||||
extern char mca_pcm_cofs_comm_loc[LAM_PATH_MAX]; /* location for file drop-off */
|
||||
|
||||
extern int mca_pcm_cofs_my_vpid;
|
||||
extern char *mca_pcm_cofs_my_handle;
|
||||
|
||||
extern mca_pcm_proc_t *mca_pcm_cofs_procs;
|
||||
extern size_t mca_pcm_cofs_nprocs;
|
||||
|
||||
|
@ -7,10 +7,11 @@
|
||||
#include "lam_config.h"
|
||||
|
||||
#include "lam/constants.h"
|
||||
#include "lam/types.h"
|
||||
#include "lam/util/malloc.h"
|
||||
#include "mca/mca.h"
|
||||
#include "mca/lam/pcm/pcm.h"
|
||||
#include "mca/lam/pcm/cofs/src/pcm_cofs.h"
|
||||
#include "lam/types.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
@ -83,7 +84,29 @@ mca_pcm_cofs_close(void)
|
||||
int
|
||||
mca_pcm_cofs_query(int *priority)
|
||||
{
|
||||
char *test_ret;
|
||||
|
||||
*priority = 0;
|
||||
|
||||
/* BWB - remove printfs once things settle down some... */
|
||||
test_ret = getenv("MCA_common_lam_cofs_my_vpid");
|
||||
if (test_ret == NULL) {
|
||||
printf("COFS PCM will not be running because MCA_common_lam_cofs_my_vpid not set\n");
|
||||
return LAM_ERROR;
|
||||
}
|
||||
|
||||
test_ret = getenv("MCA_common_lam_cofs_job_handle");
|
||||
if (test_ret == NULL) {
|
||||
printf("COFS PCM will not be running because MCA_common_lam_cofs_job_handle not set\n");
|
||||
return LAM_ERROR;
|
||||
}
|
||||
|
||||
test_ret = getenv("MCA_common_lam_cofs_num_procs");
|
||||
if (test_ret == NULL) {
|
||||
printf("COFS PCM will not be running because MCA_common_lam_cofs_num_procs not set\n");
|
||||
return LAM_ERROR;
|
||||
}
|
||||
|
||||
return LAM_SUCCESS;
|
||||
}
|
||||
|
||||
@ -113,30 +136,41 @@ mca_pcm_cofs_init(void)
|
||||
/*
|
||||
* See if we can write in our directory...
|
||||
*/
|
||||
tmp = malloc(strlen(mca_pcm_cofs_comm_loc) + 5);
|
||||
tmp = LAM_MALLOC(strlen(mca_pcm_cofs_comm_loc) + 5);
|
||||
if (tmp == NULL) return NULL;
|
||||
sprintf(tmp, "%s/me", mca_pcm_cofs_comm_loc);
|
||||
fp = fopen(tmp, "w");
|
||||
if (fp == NULL) {
|
||||
printf("pcm_cofs can not write in communication dir\n");
|
||||
free(tmp);
|
||||
LAM_FREE(tmp);
|
||||
return NULL;
|
||||
}
|
||||
fclose(fp);
|
||||
unlink(tmp);
|
||||
free(tmp);
|
||||
LAM_FREE(tmp);
|
||||
|
||||
/*
|
||||
* BWB - fix me, make register the "right" way...
|
||||
*/
|
||||
/* find our vpid */
|
||||
tmp = getenv("MCA_PCM_BASE_VPID");
|
||||
tmp = getenv("MCA_common_lam_cofs_my_vpid");
|
||||
if (tmp == NULL) {
|
||||
printf("pcm_cofs can not find vpid\n");
|
||||
return NULL;
|
||||
}
|
||||
mca_pcm_cofs_my_vpid = atoi(tmp);
|
||||
|
||||
mca_pcm_cofs_my_handle = getenv("MCA_common_lam_cofs_job_handle");
|
||||
|
||||
mca_pcm_cofs_procs = NULL;
|
||||
|
||||
tmp = getenv("MCA_common_lam_cofs_num_procs");
|
||||
if (tmp == NULL) {
|
||||
printf("pcm_cofs can not find nprocs\n");
|
||||
return NULL;
|
||||
}
|
||||
mca_pcm_cofs_nprocs = atoi(tmp);
|
||||
|
||||
return &mca_pcm_cofs_1_0_0;
|
||||
}
|
||||
|
||||
@ -144,5 +178,12 @@ mca_pcm_cofs_init(void)
|
||||
int
|
||||
mca_pcm_cofs_finalize(void)
|
||||
{
|
||||
if (mca_pcm_cofs_procs != NULL) {
|
||||
LAM_FREE(mca_pcm_cofs_procs);
|
||||
mca_pcm_cofs_procs = NULL;
|
||||
mca_pcm_cofs_nprocs = 0;
|
||||
}
|
||||
|
||||
return LAM_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -65,15 +65,23 @@
|
||||
#define LAM_PCM_PROC_MPIAPP 1
|
||||
#define LAM_PCM_PROC_OTHER 2
|
||||
|
||||
typedef struct lam_pcm_node {
|
||||
struct mca_pcm_rte_node_t {
|
||||
int32_t node_num;
|
||||
int32_t num_procs;
|
||||
} lam_pcm_node_t;
|
||||
};
|
||||
typedef struct mca_pcm_rte_node_t mca_pcm_rte_node_t;
|
||||
|
||||
typedef struct lam_pcm_control_args {
|
||||
struct mca_pcm_control_args_t {
|
||||
char* request;
|
||||
char* value;
|
||||
} lam_pcm_control_args_t;
|
||||
};
|
||||
typedef struct mca_pcm_control_args_t mca_pcm_control_args_t;
|
||||
|
||||
struct mca_pcm_proc_t {
|
||||
lam_job_handle_t job_handle;
|
||||
int vpid;
|
||||
};
|
||||
typedef struct mca_pcm_proc_t mca_pcm_proc_t;
|
||||
|
||||
/*
|
||||
* functions every module must provide
|
||||
@ -83,7 +91,7 @@ typedef int (*mca_pcm_base_query_fn_t)(int *priority);
|
||||
typedef struct mca_pcm_1_0_0_t* (*mca_pcm_base_init_fn_t)(void);
|
||||
|
||||
/**
|
||||
* \func lam_pcm_query_get_nodes
|
||||
* \func mca_pcm_query_get_nodes
|
||||
*
|
||||
* Get list of nodes available for execution
|
||||
*
|
||||
@ -104,7 +112,7 @@ typedef struct mca_pcm_1_0_0_t* (*mca_pcm_base_init_fn_t)(void);
|
||||
* In the case where both are available, available_procs will be
|
||||
* equal to the sum of nodes[0...n].num_procs.
|
||||
*/
|
||||
typedef int (*mca_pcm_base_query_get_nodes_fn_t)(lam_pcm_node_t **nodes,
|
||||
typedef int (*mca_pcm_base_query_get_nodes_fn_t)(mca_pcm_rte_node_t **nodes,
|
||||
size_t *nodes_len,
|
||||
int *available_procs);
|
||||
|
||||
@ -126,7 +134,7 @@ typedef int (*mca_pcm_base_query_get_nodes_fn_t)(lam_pcm_node_t **nodes,
|
||||
* process tree (spawn, etc.) if the user really wants such
|
||||
* information. For mpirun, it should just be NULL.
|
||||
*
|
||||
* \warning The handle must be released using lam_pcm_handle_free
|
||||
* \warning The handle must be released using mca_pcm_handle_free
|
||||
*/
|
||||
typedef lam_job_handle_t (*mca_pcm_base_handle_new_fn_t)(lam_job_handle_t parent);
|
||||
|
||||
@ -139,7 +147,7 @@ typedef lam_job_handle_t (*mca_pcm_base_handle_new_fn_t)(lam_job_handle_t parent
|
||||
*
|
||||
* Return the parallel job handle for the currently running process
|
||||
*
|
||||
* \warning The handle must be released using lam_pcm_handle_free
|
||||
* \warning The handle must be released using mca_pcm_handle_free
|
||||
*/
|
||||
typedef lam_job_handle_t (*mca_pcm_base_handle_get_fn_t)(void);
|
||||
|
||||
@ -149,8 +157,8 @@ typedef lam_job_handle_t (*mca_pcm_base_handle_get_fn_t)(void);
|
||||
*
|
||||
* @param job_handle Pointer to a lam_job_handle_t
|
||||
*
|
||||
* Free a job handle returned by lam_pcm_handle_new or
|
||||
* lam_pcm_handle_get.
|
||||
* Free a job handle returned by mca_pcm_handle_new or
|
||||
* mca_pcm_handle_get.
|
||||
*/
|
||||
typedef void (*mca_pcm_base_handle_free_fn_t)(lam_job_handle_t *job_handle);
|
||||
|
||||
@ -192,7 +200,7 @@ typedef int (*mca_pcm_base_job_can_spawn_fn_t)(lam_job_handle_t job_handle);
|
||||
* job handle.
|
||||
*/
|
||||
typedef int (*mca_pcm_base_job_set_arguments_fn_t)(lam_job_handle_t job_handle,
|
||||
lam_pcm_control_args_t* opts,
|
||||
mca_pcm_control_args_t* opts,
|
||||
size_t opts_len);
|
||||
|
||||
|
||||
@ -222,7 +230,7 @@ typedef int (*mca_pcm_base_job_set_arguments_fn_t)(lam_job_handle_t job_handle,
|
||||
* support spawning of new applications from
|
||||
*/
|
||||
typedef int (*mca_pcm_base_job_launch_procs_fn_t)(lam_job_handle_t job_handle,
|
||||
lam_pcm_node_t *nodes,
|
||||
mca_pcm_rte_node_t *nodes,
|
||||
size_t nodes_len, const char* file,
|
||||
int argc, const char* argv[],
|
||||
const char *env[]);
|
||||
@ -243,6 +251,8 @@ typedef int (*mca_pcm_base_job_launch_procs_fn_t)(lam_job_handle_t job_handle,
|
||||
* this all along and didn't bother to tell you. When this function
|
||||
* returns, it is safe to assume that all rendezvous is complete
|
||||
* (i.e., you can exit and not mess anything up).
|
||||
*
|
||||
* This function only needs to be called by the launching procs.
|
||||
*/
|
||||
typedef int (*mca_pcm_base_job_rendezvous_fn_t)(lam_job_handle_t job_handle);
|
||||
|
||||
@ -320,16 +330,19 @@ typedef int (*mca_pcm_base_proc_startup_fn_t)(void);
|
||||
/**
|
||||
* Get peers list
|
||||
*
|
||||
* @retval LAM_ERR_NOT_IMPLEMENTED Function not implemented
|
||||
* @param procs Ordered array of lam_proc_t entries describing the job peers
|
||||
*
|
||||
* @retval LAM_SUCCESS success
|
||||
* @retval LAM_ERROR Unknown error
|
||||
*
|
||||
* Get list of peers in the parallel job. Should not require any
|
||||
* communication with other nodes (communication with processes on
|
||||
* this node are allowed).
|
||||
*
|
||||
* \warning This function is not implemented and its argument list
|
||||
* will obviously change in the very near future.
|
||||
* may change in the very near future.
|
||||
*/
|
||||
typedef int (*mca_pcm_base_proc_get_peers_fn_t)(void);
|
||||
typedef int (*mca_pcm_base_proc_get_peers_fn_t)(mca_pcm_proc_t **procs, size_t *nprocs);
|
||||
|
||||
|
||||
/**
|
||||
@ -339,10 +352,8 @@ typedef int (*mca_pcm_base_proc_get_peers_fn_t)(void);
|
||||
*
|
||||
* Get my entry in the peers list
|
||||
*
|
||||
* \warning This function is not implemented and its argument list
|
||||
* will obviously change in the very near future.
|
||||
*/
|
||||
typedef int (*mca_pcm_base_proc_get_me_fn_t)(void);
|
||||
typedef mca_pcm_proc_t* (*mca_pcm_base_proc_get_me_fn_t)(void);
|
||||
|
||||
/**
|
||||
* Get my entry in the peers list
|
||||
|
@ -15,5 +15,62 @@
|
||||
int
|
||||
MPI_Init(int *argc, char ***argv)
|
||||
{
|
||||
#if 0
|
||||
/*
|
||||
* BWB - this comment should be removed at some point in the very near future
|
||||
*
|
||||
* This #if 0'ed out block of code is a rough approximation of what
|
||||
* should happen to get this parallel job bootstrapped and ready to
|
||||
* run. There are probably some bugs in the OOB and PCM interfaces
|
||||
* that are going to make this really interesting (sorry :( ), but I
|
||||
* think it should work once the MPI modules are written...
|
||||
*/
|
||||
|
||||
/* Do the "right" MCA query and init functions to fire up the
|
||||
* run-time environment interfaces. I'm not exactly sure what these
|
||||
* calls will be (since they are in the base functions, right?), but
|
||||
* do them here
|
||||
*
|
||||
* Order is:
|
||||
* 1) PCM
|
||||
* 2) OOB
|
||||
* 3) Registry
|
||||
*
|
||||
* Don't forget to close down in the reverse order at end of the day
|
||||
* - even the silly COFS implementations are going to leak resources
|
||||
* like crazy if you don't.
|
||||
*
|
||||
* The OOB system may not actually be usable until the end of
|
||||
* pcm_proc_startup, but must be initialized here.
|
||||
*/
|
||||
|
||||
/* Do the client side of the rendezvous with our launcher (or
|
||||
* whatever is needed for our RTE to figure out how to talk with our
|
||||
* peers and all that).
|
||||
*/
|
||||
ret = mca_pcm.pcm_proc_startup();
|
||||
if (ret != MPI_SUCCESS) printf("oops!\n");
|
||||
|
||||
/* at this point, we can use the OOB interface directly if we really
|
||||
need to, but is a bit tricky since we don't have a peers list
|
||||
yet. */
|
||||
mca_pcm.get_peers(&procs, &nprocs);
|
||||
|
||||
/* get a pointer to me */
|
||||
my_proc = mca_pcm.get_me();
|
||||
|
||||
/* get my parents. need to think about how to do this - i don't
|
||||
* think this is what we want at all... We can probably ignore
|
||||
* this for a little while since we don't have a run time
|
||||
* environment that supports spawn just yet, but something to
|
||||
* remember...
|
||||
*/
|
||||
mca_pcm.get_parent(&pprocs, &npprocs);
|
||||
|
||||
/* we should have enough information by now to start running the PML
|
||||
* and PTL interfaces, right?
|
||||
*/
|
||||
#endif
|
||||
|
||||
return MPI_SUCCESS;
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user