1
1

* some more (mostly untested, but compiling) code for the cofs pcm module

* some comments on what needs to be done in the MPI_INIT C interface.  More
  coming tomorrow....

This commit was SVN r328.
Этот коммит содержится в:
Brian Barrett 2004-01-13 09:36:40 +00:00
родитель e4ac0bc98a
Коммит 3fc789e72e
5 изменённых файлов: 193 добавлений и 37 удалений

Просмотреть файл

@ -21,8 +21,9 @@
static int handle_new_count = 0;
int
mca_pcm_cofs_query_get_nodes(lam_pcm_node_t ** nodes, size_t * nodes_len,
mca_pcm_cofs_query_get_nodes(mca_pcm_rte_node_t **nodes, size_t * nodes_len,
int *available_procs)
{
*nodes = NULL;
@ -40,6 +41,7 @@ mca_pcm_cofs_handle_new(lam_job_handle_t parent)
char *ret;
size_t ret_len;
/* should really make this a file lookup kind of thing */
pid = getpid();
ret_len = sizeof(pid_t) * 8 + strlen("pcm_cofs_job_handle") + sizeof(int) * 8 + 5;
@ -65,7 +67,10 @@ mca_pcm_cofs_handle_get(void)
void
mca_pcm_cofs_handle_free(lam_job_handle_t * job_handle)
{
if (*job_handle != NULL) {
if (*job_handle == mca_pcm_cofs_my_handle) {
printf("WARNING: attempting to free static internal job handle!\n");
printf(" Did you perhaps try to free the return from handle_get()?\n");
} else if (*job_handle != NULL) {
LAM_FREE(*job_handle);
*job_handle = NULL;
}
@ -75,30 +80,38 @@ mca_pcm_cofs_handle_free(lam_job_handle_t * job_handle)
/*
 * Report whether processes can be spawned from the given job.
 *
 * Spawning support has not been written for the COFS module, so the
 * active branch answers LAM_ERR_NOT_SUPPORTED for every job_handle.
 * The #else branch is compiled out.
 *
 * The result is an int status code; a pointer constant such as NULL
 * must not be returned from here.
 */
int
mca_pcm_cofs_job_can_spawn(lam_job_handle_t job_handle)
{
#if 1
    /* Currently, have not coded up spawning support.  Need to do
       so soon */
    return LAM_ERR_NOT_SUPPORTED;
#else
    if (job_handle != NULL) {
        return LAM_ERR_NOT_SUPPORTED;
    } else {
        return LAM_SUCCESS;
    }
#endif
}
int
mca_pcm_cofs_job_set_arguments(lam_job_handle_t job_handle,
lam_pcm_control_args_t * opts,
mca_pcm_control_args_t * opts,
size_t opts_len)
{
/* need to implement, but not needed to get INIT going */
return LAM_ERR_NOT_IMPLEMENTED;
}
int
mca_pcm_cofs_job_launch_procs(lam_job_handle_t job_handle,
lam_pcm_node_t * nodes,
mca_pcm_rte_node_t *nodes,
size_t nodes_len, const char *file,
int argc, const char *argv[],
const char *env[])
{
/* need to implement, but not needed to get INIT going */
return LAM_ERR_NOT_IMPLEMENTED;
}
@ -106,6 +119,7 @@ mca_pcm_cofs_job_launch_procs(lam_job_handle_t job_handle,
/*
 * Stub: job_handle is ignored and LAM_ERR_NOT_IMPLEMENTED is always
 * returned.
 */
int
mca_pcm_cofs_job_rendezvous(lam_job_handle_t job_handle)
{
    (void) job_handle;  /* unused until this entry point is written */

    /* getting INIT going does not depend on this call yet */
    return LAM_ERR_NOT_IMPLEMENTED;
}
@ -113,6 +127,7 @@ mca_pcm_cofs_job_rendezvous(lam_job_handle_t job_handle)
/*
 * Stub: job_handle is ignored and LAM_ERR_NOT_IMPLEMENTED is always
 * returned.
 */
int
mca_pcm_cofs_job_wait(lam_job_handle_t job_handle)
{
    (void) job_handle;  /* unused until this entry point is written */

    /* getting INIT going does not depend on this call yet */
    return LAM_ERR_NOT_IMPLEMENTED;
}
@ -121,6 +136,7 @@ int
mca_pcm_cofs_job_running(lam_job_handle_t job_handle,
int *running)
{
/* need to implement, but not needed to get INIT going */
return LAM_ERR_NOT_IMPLEMENTED;
}
@ -129,6 +145,7 @@ int
mca_pcm_cofs_job_list_running(lam_job_handle_t ** handles,
size_t handles_len)
{
/* need to implement, but not needed to get INIT going */
return LAM_ERR_NOT_IMPLEMENTED;
}
@ -136,21 +153,46 @@ mca_pcm_cofs_job_list_running(lam_job_handle_t ** handles,
int
mca_pcm_cofs_proc_startup(void)
{
return LAM_ERR_NOT_IMPLEMENTED;
int i;
if (mca_pcm_cofs_nprocs == 0) {
/* well, this really shouldn't happen - we know we have at least ourselves */
return LAM_ERR_FATAL;
}
mca_pcm_cofs_procs = LAM_MALLOC(sizeof(mca_pcm_proc_t) * mca_pcm_cofs_nprocs);
if (mca_pcm_cofs_procs == NULL) {
return LAM_ERR_OUT_OF_RESOURCE;
}
for (i = 0 ; i < mca_pcm_cofs_nprocs ; ++i) {
/* for now, assume everyone in the same job :( */
mca_pcm_cofs_procs[i].job_handle = mca_pcm_cofs_handle_get();
mca_pcm_cofs_procs[i].vpid = i;
}
return LAM_SUCCESS;
}
int
mca_pcm_cofs_proc_get_peers(void)
mca_pcm_cofs_proc_get_peers(mca_pcm_proc_t **procs, size_t *nprocs)
{
return LAM_ERR_NOT_IMPLEMENTED;
if (mca_pcm_cofs_procs == NULL) {
return LAM_ERROR;
}
*procs = mca_pcm_cofs_procs;
*nprocs = mca_pcm_cofs_nprocs;
return LAM_SUCCESS;
}
int
mca_pcm_proc_t*
mca_pcm_cofs_proc_get_me(void)
{
return LAM_ERR_NOT_IMPLEMENTED;
return &(mca_pcm_cofs_procs[mca_pcm_cofs_my_vpid]);
}

Просмотреть файл

@ -25,7 +25,7 @@ int mca_pcm_cofs_finalize(void);
/*
* "Action" functions
*/
int mca_pcm_cofs_query_get_nodes(lam_pcm_node_t **nodes, size_t *nodes_len,
int mca_pcm_cofs_query_get_nodes(mca_pcm_rte_node_t **nodes, size_t *nodes_len,
int *available_procs);
lam_job_handle_t mca_pcm_cofs_handle_new(lam_job_handle_t parent);
@ -34,10 +34,10 @@ void mca_pcm_cofs_handle_free(lam_job_handle_t *job_handle);
int mca_pcm_cofs_job_can_spawn(lam_job_handle_t job_handle);
int mca_pcm_cofs_job_set_arguments(lam_job_handle_t job_handle,
lam_pcm_control_args_t* opts,
mca_pcm_control_args_t* opts,
size_t opts_len);
int mca_pcm_cofs_job_launch_procs(lam_job_handle_t job_handle,
lam_pcm_node_t *nodes,
mca_pcm_rte_node_t *nodes,
size_t nodes_len, const char* file,
int argc, const char* argv[],
const char *env[]);
@ -49,10 +49,15 @@ int mca_pcm_cofs_job_list_running(lam_job_handle_t **handles,
size_t handles_len);
int mca_pcm_cofs_proc_startup(void);
int mca_pcm_cofs_proc_get_peers(void);
int mca_pcm_cofs_proc_get_me(void);
int mca_pcm_cofs_proc_get_peers(mca_pcm_proc_t **procs, size_t *nprocs);
mca_pcm_proc_t* mca_pcm_cofs_proc_get_me(void);
int mca_pcm_cofs_proc_get_parent(void);
extern char mca_pcm_cofs_comm_loc[LAM_PATH_MAX]; /* location for file drop-off */
extern int mca_pcm_cofs_my_vpid;
extern char *mca_pcm_cofs_my_handle;
extern mca_pcm_proc_t *mca_pcm_cofs_procs;
extern size_t mca_pcm_cofs_nprocs;

Просмотреть файл

@ -7,10 +7,11 @@
#include "lam_config.h"
#include "lam/constants.h"
#include "lam/types.h"
#include "lam/util/malloc.h"
#include "mca/mca.h"
#include "mca/lam/pcm/pcm.h"
#include "mca/lam/pcm/cofs/src/pcm_cofs.h"
#include "lam/types.h"
#include <stdio.h>
#include <stdlib.h>
@ -83,7 +84,29 @@ mca_pcm_cofs_close(void)
/*
 * Decide whether the COFS PCM module can be used in this process.
 *
 * *priority is always set to 0.  Three environment variables must be
 * present; the first one found missing is reported on stdout and
 * LAM_ERROR is returned.  Only presence is checked here - the values
 * are not inspected.  Returns LAM_SUCCESS when all three are set.
 */
int
mca_pcm_cofs_query(int *priority)
{
    *priority = 0;

    /* BWB - remove printfs once things settle down some... */

    if (getenv("MCA_common_lam_cofs_my_vpid") == NULL) {
        printf("COFS PCM will not be running because MCA_common_lam_cofs_my_vpid not set\n");
        return LAM_ERROR;
    }

    if (getenv("MCA_common_lam_cofs_job_handle") == NULL) {
        printf("COFS PCM will not be running because MCA_common_lam_cofs_job_handle not set\n");
        return LAM_ERROR;
    }

    if (getenv("MCA_common_lam_cofs_num_procs") == NULL) {
        printf("COFS PCM will not be running because MCA_common_lam_cofs_num_procs not set\n");
        return LAM_ERROR;
    }

    return LAM_SUCCESS;
}
@ -113,30 +136,41 @@ mca_pcm_cofs_init(void)
/*
* See if we can write in our directory...
*/
tmp = malloc(strlen(mca_pcm_cofs_comm_loc) + 5);
tmp = LAM_MALLOC(strlen(mca_pcm_cofs_comm_loc) + 5);
if (tmp == NULL) return NULL;
sprintf(tmp, "%s/me", mca_pcm_cofs_comm_loc);
fp = fopen(tmp, "w");
if (fp == NULL) {
printf("pcm_cofs can not write in communication dir\n");
free(tmp);
LAM_FREE(tmp);
return NULL;
}
fclose(fp);
unlink(tmp);
free(tmp);
LAM_FREE(tmp);
/*
* BWB - fix me, make register the "right" way...
*/
/* find our vpid */
tmp = getenv("MCA_PCM_BASE_VPID");
tmp = getenv("MCA_common_lam_cofs_my_vpid");
if (tmp == NULL) {
printf("pcm_cofs can not find vpid\n");
return NULL;
}
mca_pcm_cofs_my_vpid = atoi(tmp);
mca_pcm_cofs_my_handle = getenv("MCA_common_lam_cofs_job_handle");
mca_pcm_cofs_procs = NULL;
tmp = getenv("MCA_common_lam_cofs_num_procs");
if (tmp == NULL) {
printf("pcm_cofs can not find nprocs\n");
return NULL;
}
mca_pcm_cofs_nprocs = atoi(tmp);
return &mca_pcm_cofs_1_0_0;
}
@ -144,5 +178,12 @@ mca_pcm_cofs_init(void)
/*
 * Release the module's peer table, if one was allocated.
 *
 * When mca_pcm_cofs_procs is non-NULL it is handed to LAM_FREE, then
 * the pointer is reset to NULL and the process count to 0.  Always
 * returns LAM_SUCCESS.
 */
int
mca_pcm_cofs_finalize(void)
{
    /* nothing was allocated - nothing to tear down */
    if (mca_pcm_cofs_procs == NULL) {
        return LAM_SUCCESS;
    }

    LAM_FREE(mca_pcm_cofs_procs);
    mca_pcm_cofs_procs = NULL;
    mca_pcm_cofs_nprocs = 0;

    return LAM_SUCCESS;
}

Просмотреть файл

@ -65,15 +65,23 @@
#define LAM_PCM_PROC_MPIAPP 1
#define LAM_PCM_PROC_OTHER 2
typedef struct lam_pcm_node {
struct mca_pcm_rte_node_t {
int32_t node_num;
int32_t num_procs;
} lam_pcm_node_t;
};
typedef struct mca_pcm_rte_node_t mca_pcm_rte_node_t;
typedef struct lam_pcm_control_args {
struct mca_pcm_control_args_t {
char* request;
char* value;
} lam_pcm_control_args_t;
};
typedef struct mca_pcm_control_args_t mca_pcm_control_args_t;
struct mca_pcm_proc_t {
lam_job_handle_t job_handle;
int vpid;
};
typedef struct mca_pcm_proc_t mca_pcm_proc_t;
/*
* functions every module must provide
@ -83,7 +91,7 @@ typedef int (*mca_pcm_base_query_fn_t)(int *priority);
typedef struct mca_pcm_1_0_0_t* (*mca_pcm_base_init_fn_t)(void);
/**
* \func lam_pcm_query_get_nodes
* \func mca_pcm_query_get_nodes
*
* Get list of nodes available for execution
*
@ -104,7 +112,7 @@ typedef struct mca_pcm_1_0_0_t* (*mca_pcm_base_init_fn_t)(void);
* In the case where both are available, available_procs will be
* equal to the sum of nodes[0...n].num_procs.
*/
typedef int (*mca_pcm_base_query_get_nodes_fn_t)(lam_pcm_node_t **nodes,
typedef int (*mca_pcm_base_query_get_nodes_fn_t)(mca_pcm_rte_node_t **nodes,
size_t *nodes_len,
int *available_procs);
@ -126,7 +134,7 @@ typedef int (*mca_pcm_base_query_get_nodes_fn_t)(lam_pcm_node_t **nodes,
* process tree (spawn, etc.) if the user really wants such
* information. For mpirun, it should just be NULL.
*
* \warning The handle must be released using lam_pcm_handle_free
* \warning The handle must be released using mca_pcm_handle_free
*/
typedef lam_job_handle_t (*mca_pcm_base_handle_new_fn_t)(lam_job_handle_t parent);
@ -139,7 +147,7 @@ typedef lam_job_handle_t (*mca_pcm_base_handle_new_fn_t)(lam_job_handle_t parent
*
* Return the parallel job handle for the currently running process
*
* \warning The handle must be released using lam_pcm_handle_free
* \warning The handle must be released using mca_pcm_handle_free
*/
typedef lam_job_handle_t (*mca_pcm_base_handle_get_fn_t)(void);
@ -149,8 +157,8 @@ typedef lam_job_handle_t (*mca_pcm_base_handle_get_fn_t)(void);
*
* @param job_handle Pointer to a lam_job_handle_t
*
* Free a job handle returned by lam_pcm_handle_new or
* lam_pcm_handle_get.
* Free a job handle returned by mca_pcm_handle_new or
* mca_pcm_handle_get.
*/
typedef void (*mca_pcm_base_handle_free_fn_t)(lam_job_handle_t *job_handle);
@ -192,7 +200,7 @@ typedef int (*mca_pcm_base_job_can_spawn_fn_t)(lam_job_handle_t job_handle);
* job handle.
*/
typedef int (*mca_pcm_base_job_set_arguments_fn_t)(lam_job_handle_t job_handle,
lam_pcm_control_args_t* opts,
mca_pcm_control_args_t* opts,
size_t opts_len);
@ -222,7 +230,7 @@ typedef int (*mca_pcm_base_job_set_arguments_fn_t)(lam_job_handle_t job_handle,
* support spawning of new applications from
*/
typedef int (*mca_pcm_base_job_launch_procs_fn_t)(lam_job_handle_t job_handle,
lam_pcm_node_t *nodes,
mca_pcm_rte_node_t *nodes,
size_t nodes_len, const char* file,
int argc, const char* argv[],
const char *env[]);
@ -243,6 +251,8 @@ typedef int (*mca_pcm_base_job_launch_procs_fn_t)(lam_job_handle_t job_handle,
* this all along and didn't bother to tell you. When this function
* returns, it is safe to assume that all rendezvous is complete
* (i.e., you can exit and not mess anything up).
*
* This function only needs to be called by the launching procs.
*/
typedef int (*mca_pcm_base_job_rendezvous_fn_t)(lam_job_handle_t job_handle);
@ -320,16 +330,19 @@ typedef int (*mca_pcm_base_proc_startup_fn_t)(void);
/**
* Get peers list
*
* @retval LAM_ERR_NOT_IMPLEMENTED Function not implemented
* @param procs Ordered array of mca_pcm_proc_t entries describing the job peers
*
* @retval LAM_SUCCESS success
* @retval LAM_ERROR Unknown error
*
* Get list of peers in the parallel job. Should not require any
* communication with other nodes (communication with processes on
* this node are allowed).
*
* \warning This function is not implemented and its argument list
* will obviously change in the very near future.
* may change in the very near future.
*/
typedef int (*mca_pcm_base_proc_get_peers_fn_t)(void);
typedef int (*mca_pcm_base_proc_get_peers_fn_t)(mca_pcm_proc_t **procs, size_t *nprocs);
/**
@ -339,10 +352,8 @@ typedef int (*mca_pcm_base_proc_get_peers_fn_t)(void);
*
* Get my entry in the peers list
*
* \warning This function is not implemented and its argument list
* will obviously change in the very near future.
*/
typedef int (*mca_pcm_base_proc_get_me_fn_t)(void);
typedef mca_pcm_proc_t* (*mca_pcm_base_proc_get_me_fn_t)(void);
/**
* Get my entry in the peers list

Просмотреть файл

@ -15,5 +15,62 @@
/*
 * MPI_Init - currently a placeholder.
 *
 * The only statement compiled in is the final return of MPI_SUCCESS;
 * argc and argv are not used.  Everything between #if 0 and #endif is
 * a compiled-out sketch of the intended bootstrap sequence and refers
 * to variables (ret, procs, nprocs, my_proc, pprocs, npprocs) that are
 * not declared in this function.
 */
int
MPI_Init(int *argc, char ***argv)
{
#if 0
/*
* BWB - this comment should be removed at some point in the very near future
*
* This #if 0'ed out block of code is a rough approximation of what
* should happen to get this parallel job bootstrapped and ready to
* run. There are probably some bugs in the OOB and PCM interfaces
* that are going to make this really interesting (sorry :( ), but I
* think it should work once the MPI modules are written...
*/
/* Do the "right" MCA query and init functions to fire up the
* run-time environment interfaces. I'm not exactly sure what these
* calls will be (since they are in the base functions, right?), but
* do them here
*
* Order is:
* 1) PCM
* 2) OOB
* 3) Registry
*
* Don't forget to close down in the reverse order at end of the day
* - even the silly COFS implementations are going to leak resources
* like crazy if you don't.
*
* The OOB system may not actually be usable until the end of
* pcm_proc_startup, but must be initialized here.
*/
/* Do the client side of the rendezvous with our launcher (or
* whatever is needed for our RTE to figure out how to talk with our
* peers and all that).
*/
ret = mca_pcm.pcm_proc_startup();
if (ret != MPI_SUCCESS) printf("oops!\n");
/* at this point, we can use the OOB interface directly if we really
need to, but it is a bit tricky since we don't have a peers list
yet. */
mca_pcm.get_peers(&procs, &nprocs);
/* get a pointer to me */
my_proc = mca_pcm.get_me();
/* get my parents. need to think about how to do this - I don't
* think this is what we want at all... We can probably ignore
* this for a little while since we don't have a run time
* environment that supports spawn just yet, but something to
* remember...
*/
mca_pcm.get_parent(&pprocs, &npprocs);
/* we should have enough information by now to start running the PML
* and PTL interfaces, right?
*/
#endif
return MPI_SUCCESS;
}