* Rework the pcm/llm interface to be friendlier to non-rsh starters. Push the
  host / cpu information down into a handle that need not exist when the llm
  isn't being used. Fix all the test cases and whatnot to match.

This commit was SVN r2490.
Parent commit: 910f282c3f
Commit: 1a100e65c1
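To see the shape of the new interface at a glance, here is a minimal sketch of how the
host / cpu information now hangs off an opaque handle instead of living on the public
allocation container. It only uses types and fields introduced in the hunks below;
some_host is a hypothetical, already-built hostfile node, and the literal numbers are
made up:

    ompi_rte_node_allocation_t *node = OBJ_NEW(ompi_rte_node_allocation_t);
    mca_llm_base_hostfile_data_t *data = OBJ_NEW(mca_llm_base_hostfile_data_t);

    node->nodes = 2;     /* two nodes in this block of resources      */
    node->count = 4;     /* four process slots on each of those nodes */
    node->start = 0;     /* this block starts at slot 0 of the job    */
    node->data  = (ompi_rte_node_allocation_data_t*) data;  /* llm-private handle */

    /* the hostnames live on the llm-private handle, not on the public
       container, so a non-hostfile llm never has to fabricate them */
    ompi_list_append(data->hostlist, (ompi_list_item_t*) some_host);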
@@ -17,36 +17,99 @@
 #include "mca/llm/base/base.h"
 #include "class/ompi_value_array.h"
 
 
+/**
+ * Container for per-node hostfile-specific data
+ */
+struct mca_llm_base_hostfile_node_t {
+    /** make us an instance of list item */
+    ompi_list_item_t super;
+    /** hostname for this node.  Can be used as generic description
+        field if hostnames aren't used on this platform */
+    char hostname[MAXHOSTNAMELEN];
+    /** number of MPI processes Open MPI can start on this host */
+    int count;
+    /** generic key=value storage mechanism */
+    ompi_list_t *info;
+};
+/** shorten ompi_rte_base_hostfile_node_t declarations */
+typedef struct mca_llm_base_hostfile_node_t mca_llm_base_hostfile_node_t;
+/** create the required instance information */
+OBJ_CLASS_DECLARATION(mca_llm_base_hostfile_node_t);
+
+
+/**
+ * extra data for the \c ompi_rte_node_allocation_t structures when
+ * using the \c mca_llm_base_* functions.
+ */
+struct mca_llm_base_hostfile_data_t {
+    /** make ourselves an instance of the data base class */
+    ompi_rte_node_allocation_data_t super;
+    /** keep a list of the hosts allocated to this description */
+    ompi_list_t *hostlist;
+};
+/** shorten ompi_rte_base_hostfile_data_t declarations */
+typedef struct mca_llm_base_hostfile_data_t mca_llm_base_hostfile_data_t;
+/** create the required instance information */
+OBJ_CLASS_DECLARATION(mca_llm_base_hostfile_data_t);
+
+
 #if defined(c_plusplus) || defined(__cplusplus)
 extern "C" {
 #endif
 
+    /**
+     * Do all the pre-use setup code.  This should only be called by
+     * unit tests or mca_llm_base_open.  In other words, you probably
+     * don't want to call this function.
+     */
+    void mca_llm_base_setup(void);
+
+
     /**
      * Parse input file and return a list of host entries
+     *
+     * \return ompi_list_t containing a list of
+     *         mca_llm_base_hostfile_node_t information.
      */
     ompi_list_t *mca_llm_base_parse_hostfile(const char* filename);
 
-    /**
-     * Rearrage the provide hostlist to meet the requirements of
-     * nodes / procs
-     */
-    int mca_llm_base_map_resources(int nodes,
-                                   int procs,
-                                   ompi_list_t *hostlist);
-
     /**
      * Remove duplicate host entries from the list, editing
      * the count as appropriate and merging key=value pairs.
      *
+     * \param hostlist An ompi_list_t containing
+     *                 mca_llm_base_hostfile_node_t instances.
+     *
      * \note If the same key is used with different values, the hosts
      * are considered different.
      */
     int mca_llm_base_collapse_resources(ompi_list_t *hostlist);
 
 
     /**
-     * Deallocate resources allocated by parse hostfile
+     * Rearrage the provide hostlist to meet the requirements of
+     * nodes / procs.
+     *
+     * \param hostlist An ompi_list_t containing
+     *                 mca_llm_base_hostfile_node_t instances.
      */
-    int mca_llm_base_deallocate(ompi_list_t *nodelist);
+    int mca_llm_base_map_resources(int nodes,
+                                   int procs,
+                                   ompi_list_t *hostlist);
+
+
+    /**
+     * Take a prepped (including mapped) list of
+     * mca_llm_base_hostfile_node_t instances and wrap it in an
+     * ompi_node_allocation_t list.
+     */
+    ompi_list_t *mca_llm_base_create_node_allocation(ompi_list_t *hostlist);
+
+    /**
+     * free a list of mca_llm_base_hostfile_node_t instances
+     */
+    void mca_llm_base_deallocate(ompi_list_t *hostlist);
+
 #if defined(c_plusplus) || defined(__cplusplus)
 }
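For reference, a hedged sketch of how one hostfile entry maps onto the
mca_llm_base_hostfile_node_t declared above. The hostname and count are invented for
illustration; OBJ_NEW and the list append mirror what the parser and open hunks below do:

    ompi_list_t *hostlist = OBJ_NEW(ompi_list_t);
    mca_llm_base_hostfile_node_t *host;

    host = OBJ_NEW(mca_llm_base_hostfile_node_t);   /* constructor creates host->info   */
    strncpy(host->hostname, "node17", sizeof(host->hostname));
    host->count = 4;                                /* up to 4 MPI processes on this host */
    ompi_list_append(hostlist, (ompi_list_item_t*) host);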
@@ -13,7 +13,7 @@
 
 static
 bool
-has_conflicts(ompi_rte_node_allocation_t *a, ompi_rte_node_allocation_t *b)
+has_conflicts(mca_llm_base_hostfile_node_t *a, mca_llm_base_hostfile_node_t *b)
 {
     ompi_rte_valuepair_t *a_val, *b_val;
     ompi_list_item_t *a_item, *b_item;
@@ -43,7 +43,8 @@ has_conflicts(ompi_rte_node_allocation_t *a, ompi_rte_node_allocation_t *b)
 
 static
 void
-keyval_merge(ompi_rte_node_allocation_t *new, ompi_rte_node_allocation_t *old)
+keyval_merge(mca_llm_base_hostfile_node_t *new,
+             mca_llm_base_hostfile_node_t *old)
 {
     ompi_list_item_t *old_item;
 
@@ -56,19 +57,19 @@ keyval_merge(ompi_rte_node_allocation_t *new, ompi_rte_node_allocation_t *old)
 int
 mca_llm_base_collapse_resources(ompi_list_t *hostlist)
 {
-    ompi_rte_node_allocation_t *curr_node, *check_node;
+    mca_llm_base_hostfile_node_t *curr_node, *check_node;
     ompi_list_item_t *curr_nodeitem, *check_nodeitem, *tmp;
 
     for (curr_nodeitem = ompi_list_get_first(hostlist) ;
          curr_nodeitem != ompi_list_get_end(hostlist) ;
          curr_nodeitem = ompi_list_get_next(curr_nodeitem)) {
 
-        curr_node = (ompi_rte_node_allocation_t*) curr_nodeitem;
+        curr_node = (mca_llm_base_hostfile_node_t*) curr_nodeitem;
         for (check_nodeitem = ompi_list_get_next(curr_nodeitem) ;
              check_nodeitem != ompi_list_get_end(hostlist) ;
              check_nodeitem = ompi_list_get_next(check_nodeitem)) {
 
-            check_node = (ompi_rte_node_allocation_t*) check_nodeitem;
+            check_node = (mca_llm_base_hostfile_node_t*) check_nodeitem;
 
             if ((strcmp(curr_node->hostname, check_node->hostname) == 0) &&
                 (!has_conflicts(curr_node, check_node))) {
@@ -16,7 +16,7 @@ mca_llm_base_map_resources(int nodes,
                            int procs,
                            ompi_list_t *hostlist)
 {
-    ompi_rte_node_allocation_t *node;
+    mca_llm_base_hostfile_node_t *node;
     ompi_list_item_t *nodeitem, *tmp;
 
     if (NULL == hostlist) {
@@ -35,7 +35,7 @@ mca_llm_base_map_resources(int nodes,
     for (nodeitem = ompi_list_get_first(hostlist);
          nodeitem != ompi_list_get_end(hostlist);
          nodeitem = ompi_list_get_next(nodeitem)) {
-        node = (ompi_rte_node_allocation_t*) nodeitem;
+        node = (mca_llm_base_hostfile_node_t*) nodeitem;
 
         if (alloc_procs >= procs) {
             /* we've allocated enough - release this guy from the
@@ -61,7 +61,7 @@ mca_llm_base_map_resources(int nodes,
     for (nodeitem = ompi_list_get_first(hostlist);
          nodeitem != ompi_list_get_end(hostlist);
          nodeitem = ompi_list_get_next(nodeitem)) {
-        node = (ompi_rte_node_allocation_t*) nodeitem;
+        node = (mca_llm_base_hostfile_node_t*) nodeitem;
         node->count = 1;
     }
 
@@ -8,6 +8,7 @@
 #include "mca/base/base.h"
 #include "mca/llm/llm.h"
 #include "mca/llm/base/base.h"
+#include "mca/llm/base/base_internal.h"
 #include "runtime/runtime_types.h"
 
 
@@ -54,3 +55,67 @@ int mca_llm_base_open(void)
     /* All done */
     return OMPI_SUCCESS;
 }
+
+
+/*
+ * Object maintenance code
+ */
+
+/** constructor for \c mca_llm_base_hostfile_data_t */
+static
+void
+llm_base_int_hostfile_data_construct(ompi_object_t *obj)
+{
+    mca_llm_base_hostfile_data_t *data = (mca_llm_base_hostfile_data_t*) obj;
+    data->hostlist = OBJ_NEW(ompi_list_t);
+}
+
+
+/** destructor for \c mca_llm_base_hostfile_data_t */
+static
+void
+llm_base_int_hostfile_data_destruct(ompi_object_t *obj)
+{
+    mca_llm_base_hostfile_data_t *data = (mca_llm_base_hostfile_data_t*) obj;
+    mca_llm_base_deallocate(data->hostlist);
+}
+
+
+/** constructor for \c mca_llm_base_hostfile_node_t */
+static
+void
+llm_base_int_hostfile_node_construct(ompi_object_t *obj)
+{
+    mca_llm_base_hostfile_node_t *node = (mca_llm_base_hostfile_node_t*) obj;
+    node->info = OBJ_NEW(ompi_list_t);
+}
+
+
+/** destructor for \c mca_llm_base_hostfile_node_t */
+static
+void
+llm_base_int_hostfile_node_destruct(ompi_object_t *obj)
+{
+    mca_llm_base_hostfile_node_t *node = (mca_llm_base_hostfile_node_t*) obj;
+    ompi_list_item_t *item;
+
+    if (NULL == node->info) return;
+
+    while (NULL != (item = ompi_list_remove_first(node->info))) {
+        OBJ_RELEASE(item);
+    }
+
+    OBJ_RELEASE(node->info);
+}
+
+
+/** create instance information for \c mca_llm_base_hostfile_data_t */
+OBJ_CLASS_INSTANCE(mca_llm_base_hostfile_data_t,
+                   ompi_rte_node_allocation_data_t,
+                   llm_base_int_hostfile_data_construct,
+                   llm_base_int_hostfile_data_destruct);
+/** create instance information for \c mca_llm_base_hostfile_node_t */
+OBJ_CLASS_INSTANCE(mca_llm_base_hostfile_node_t,
+                   ompi_list_item_t,
+                   llm_base_int_hostfile_node_construct,
+                   llm_base_int_hostfile_node_destruct);
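Because of the constructors and destructors registered above, object lifetime now does
most of the cleanup. A short usage sketch under the assumption that host is a
mca_llm_base_hostfile_node_t built as in the parser code below:

    mca_llm_base_hostfile_data_t *data = OBJ_NEW(mca_llm_base_hostfile_data_t);
    /* the constructor has already created data->hostlist for us */
    ompi_list_append(data->hostlist, (ompi_list_item_t*) host);

    /* releasing the data object runs the destructor, which calls
       mca_llm_base_deallocate() on the host list and so frees every host */
    OBJ_RELEASE(data);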
@@ -16,7 +16,7 @@
 #include "runtime/runtime_types.h"
 
 static void parse_error(void);
-static int parse_keyval(int, ompi_rte_node_allocation_t*);
+static int parse_keyval(int, mca_llm_base_hostfile_node_t*);
 
 static void
 parse_error()
@@ -27,7 +27,7 @@ parse_error()
 
 static
 int
-parse_keyval(int first, ompi_rte_node_allocation_t *node)
+parse_keyval(int first, mca_llm_base_hostfile_node_t *node)
 {
     int val;
     char *key, *value;
@@ -89,7 +89,7 @@ parse_count(void)
 
 static
 int
-parse_line(int first, ompi_rte_node_allocation_t *node)
+parse_line(int first, mca_llm_base_hostfile_node_t *node)
 {
     int val;
     int ret;
@@ -139,7 +139,7 @@ parse_line(int first, ompi_rte_node_allocation_t *node)
 ompi_list_t *
 mca_llm_base_parse_hostfile(const char *hostfile)
 {
-    ompi_rte_node_allocation_t *newnode;
+    mca_llm_base_hostfile_node_t *newnode;
     ompi_list_t *list;
     int val, ret;
 
@@ -169,7 +169,7 @@ mca_llm_base_parse_hostfile(const char *hostfile)
         break;
 
     case MCA_LLM_BASE_STRING:
-        newnode = OBJ_NEW(ompi_rte_node_allocation_t);
+        newnode = OBJ_NEW(mca_llm_base_hostfile_node_t);
         ret = parse_line(val, newnode);
         if (OMPI_SUCCESS != ret) {
             OBJ_RELEASE(newnode);
@@ -9,19 +9,91 @@
 #include "mca/llm/base/base.h"
 #include "mca/llm/base/base_internal.h"
 
-int
-mca_llm_base_deallocate(ompi_list_t *nodelist)
+void
+mca_llm_base_deallocate(ompi_list_t *hostlist)
 {
-    ompi_rte_node_allocation_t *node;
     ompi_list_item_t *item;
 
-    while (NULL != (item = ompi_list_remove_first(nodelist))) {
-        node = (ompi_rte_node_allocation_t*) item;
-        OBJ_RELEASE(node);
+    if (NULL == hostlist) return;
+    while (NULL != (item = ompi_list_remove_first(hostlist))) {
+        OBJ_RELEASE(item);
     }
 
-    OBJ_RELEASE(nodelist);
+    OBJ_RELEASE(hostlist);
 
-    return OMPI_SUCCESS;
 }
+
+
+static
+ompi_rte_node_allocation_t*
+get_allocation_for_size(int count, ompi_list_t *nodelist)
+{
+    ompi_list_item_t *nodeitem;
+    ompi_rte_node_allocation_t *node;
+    mca_llm_base_hostfile_data_t *data;
+
+    for (nodeitem = ompi_list_get_first(nodelist) ;
+         nodeitem != ompi_list_get_end(nodelist) ;
+         nodeitem = ompi_list_get_next(nodeitem) ) {
+        node = (ompi_rte_node_allocation_t*) nodeitem;
+
+        if (node->count == count) {
+            return node;
+        }
+    }
+
+    /* no joy... make one and put it in the list */
+    node = OBJ_NEW(ompi_rte_node_allocation_t);
+    printf("setting node's count as: %d\n", count);
+    node->count = count;
+    ompi_list_append(nodelist, (ompi_list_item_t*) node);
+
+    data = OBJ_NEW(mca_llm_base_hostfile_data_t);
+    node->data = (ompi_rte_node_allocation_data_t*) data;
+
+    return node;
+}
+
+
+ompi_list_t*
+mca_llm_base_create_node_allocation(ompi_list_t *hostlist)
+{
+    ompi_list_t *nodelist;
+    mca_llm_base_hostfile_node_t *host;
+    mca_llm_base_hostfile_data_t *data;
+    ompi_rte_node_allocation_t *node;
+    ompi_list_item_t *hostitem, *nodeitem;
+    int start_count = 0;
+
+    nodelist = OBJ_NEW(ompi_list_t);
+
+    /* This is going to be slow as molasses in January in
+     * Alaska.  Iterate through the list of hosts and group them in
+     * ompi_rte_node_allocation_t structures.  Then take those and
+     * iterate through, setting the start numbers.  So nothing too
+     * horrible, right?
+     */
+
+    /* on with the partitioning */
+    while (NULL != (hostitem = ompi_list_remove_first(hostlist))) {
+        host = (mca_llm_base_hostfile_node_t*) hostitem;
+        node = get_allocation_for_size(host->count, nodelist);
+        data = (mca_llm_base_hostfile_data_t*) node->data;
+        node->nodes++;
+
+        ompi_list_append(data->hostlist, (ompi_list_item_t*) host);
+    }
+
+    /* and fix the start numbers */
+    start_count = 0;
+    for (nodeitem = ompi_list_get_first(nodelist) ;
+         nodeitem != ompi_list_get_end(nodelist) ;
+         nodeitem = ompi_list_get_next(nodeitem) ) {
+        node = (ompi_rte_node_allocation_t*) nodeitem;
+        node->start = start_count;
+        start_count += (node->nodes * node->count);
+    }
+
+    return nodelist;
+}
@@ -17,25 +17,32 @@ extern char *mca_llm_hostfile_filename;
 ompi_list_t*
 mca_llm_hostfile_allocate_resources(int jobid, int nodes, int procs)
 {
+    ompi_list_t *hostlist = NULL;
     ompi_list_t *nodelist = NULL;
     ompi_list_item_t *nodeitem;
     int ret;
 
     /* start by getting the full list of available resources */
-    nodelist = mca_llm_base_parse_hostfile(mca_llm_hostfile_filename);
-    if (NULL == nodelist) {
+    hostlist = mca_llm_base_parse_hostfile(mca_llm_hostfile_filename);
+    if (NULL == hostlist) {
         return NULL;
     }
 
-    ret = mca_llm_base_collapse_resources(nodelist);
+    ret = mca_llm_base_collapse_resources(hostlist);
     if (OMPI_SUCCESS != ret) {
-        mca_llm_base_deallocate(nodelist);
+        mca_llm_base_deallocate(hostlist);
         return NULL;
     }
 
-    ret = mca_llm_base_map_resources(nodes, procs, nodelist);
+    ret = mca_llm_base_map_resources(nodes, procs, hostlist);
     if (OMPI_SUCCESS != ret) {
-        mca_llm_base_deallocate(nodelist);
+        mca_llm_base_deallocate(hostlist);
+        return NULL;
+    }
+
+    nodelist = mca_llm_base_create_node_allocation(hostlist);
+    if (OMPI_SUCCESS != ret) {
+        mca_llm_base_deallocate(hostlist);
         return NULL;
     }
 
@@ -14,6 +14,19 @@ int
 mca_llm_hostfile_deallocate_resources(int jobid,
                                       ompi_list_t *nodelist)
 {
-    mca_llm_base_deallocate(nodelist);
+    ompi_list_item_t *item;
+
+    /* pop off all the ompi_ret_node_allocatoin_t instances and free
+     * them.  Their destructors will kill the
+     * mca_llm_base_hostfile_data_t, who's destructor will kill the
+     * mca_llm_base_hostfile_node_t instances associated with the
+     * node_allocation.  In other words, everything goes "bye-bye"
+     */
+    while (NULL != (item = ompi_list_remove_first(nodelist))) {
+        OBJ_RELEASE(item);
+    }
+
+    OBJ_RELEASE(nodelist);
+
     return OMPI_SUCCESS;
 }
@@ -97,22 +97,20 @@ typedef mca_llm_base_component_1_0_0_t mca_llm_base_component_t;
  * called once per jobid.
  *
  * @param jobid (IN) Jobid with which to associate the given resources.
- * @param nodes (IN) Number of nodes to try to allocate.  If 0,
- *                   the LLM will try to allocate <code>procs</code>
- *                   processes on as many nodes as are needed.  If non-zero,
- *                   will try to fairly distribute <code>procs</code>
- *                   processes over the nodes.  If <code>procs</code> is 0,
- *                   will attempt to allocate all cpus on
- *                   <code>nodes</code> nodes
+ * @param nodes (IN) Number of ndoes to try to allocate.  If 0, the
+ *                   allocator will try to allocate \c procs processes
+ *                   on as many nodes as are needed.  If non-zero,
+ *                   will try to allocate \c procs process slots
+ *                   per node.
  * @param procs (IN) Number of processors to try to allocate.  See the note
  *                   for <code>nodes</code> for usage.
  * @param nodelist (OUT) List of <code>ompi_rte_node_allocation_t</code>s
  *                   describing the allocated resources.
  *
- * @warning The type for jobid will change in the near future
  */
 typedef ompi_list_t *
-(*mca_llm_base_allocate_resources_fn_t)(int jobid, int nodes,int procs);
+(*mca_llm_base_allocate_resources_fn_t)(mca_ns_base_jobid_t jobid,
+                                        int nodes,int procs);
 
 
 /**
@@ -123,10 +121,8 @@ typedef ompi_list_t*
  * @param jobid (IN) Jobid associated with the resources to be freed.
  * @param nodes (IN) Nodelist from associated allocate_resource call.
  *                   All associated memory will be freed as appropriate.
- *
- * @warning The type for jobid will change in the near future.
  */
-typedef int (*mca_llm_base_deallocate_resources_fn_t)(int jobid,
+typedef int (*mca_llm_base_deallocate_resources_fn_t)(mca_ns_base_jobid_t jobid,
                                                        ompi_list_t *nodelist);
 
 
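The hostfile component's implementation of the allocate entry point (shown earlier in
this diff) is a straightforward pipeline over these base functions. A condensed sketch,
with error handling omitted, where nodes and procs are the caller's arguments:

    ompi_list_t *hostlist, *nodelist;

    hostlist = mca_llm_base_parse_hostfile(mca_llm_hostfile_filename);
    mca_llm_base_collapse_resources(hostlist);           /* merge duplicate hosts     */
    mca_llm_base_map_resources(nodes, procs, hostlist);  /* trim to the request       */
    nodelist = mca_llm_base_create_node_allocation(hostlist); /* wrap for the caller  */
    return nodelist;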
@@ -11,7 +11,7 @@
 #include "include/types.h"
 #include "mca/mca.h"
 #include "mca/pcm/pcm.h"
+#include "mca/llm/base/base_internal.h"
 
 /*
  * Global functions for MCA overall collective open and close
@@ -30,19 +30,19 @@ extern "C" {
     int mca_pcm_base_send_schedule(FILE *fd,
                                    int jobid,
                                    ompi_rte_node_schedule_t *sched,
-                                   ompi_list_t *nodelist);
+                                   int num_procs);
 
     int mca_pcm_base_recv_schedule(FILE *fd,
                                    int *jobid,
                                    ompi_rte_node_schedule_t *sched,
-                                   ompi_list_t *nodelist);
+                                   int *num_procs);
 
     int mca_pcm_base_build_base_env(char **in_env, char ***out_envp);
 
     int mca_pcm_base_ioexecvp(char **cmdv, int showout, char *outbuff,
                               int outbuffsize, int stderr_is_err);
 
-    char* mca_pcm_base_get_username(ompi_rte_node_allocation_t *node);
+    char* mca_pcm_base_get_username(mca_llm_base_hostfile_node_t *node);
 
 #if defined(c_plusplus) || defined(__cplusplus)
 }
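With the node list argument replaced by a plain process count, a round trip through the
schedule protocol now looks roughly like the updated sched_comm unit test further down.
A sketch, assuming out_fp/in_fp are already-open FILE pointers and sched is a populated
ompi_rte_node_schedule_t:

    int jobid = 123;
    int out_num_procs = 5;
    int in_num_procs = 0;

    mca_pcm_base_send_schedule(out_fp, jobid, sched, out_num_procs);
    /* ... on the receiving side ... */
    mca_pcm_base_recv_schedule(in_fp, &jobid, sched, &in_num_procs);
    /* in_num_procs now carries the 5 that was written above */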
@ -9,6 +9,7 @@
|
|||||||
#include "include/constants.h"
|
#include "include/constants.h"
|
||||||
#include "class/ompi_list.h"
|
#include "class/ompi_list.h"
|
||||||
#include "mca/pcm/base/base.h"
|
#include "mca/pcm/base/base.h"
|
||||||
|
#include "mca/llm/base/base_internal.h"
|
||||||
|
|
||||||
#define START_KEY "@MCA_PCM@\n"
|
#define START_KEY "@MCA_PCM@\n"
|
||||||
#define END_KEY "@MCA_PCM_END@\n"
|
#define END_KEY "@MCA_PCM_END@\n"
|
||||||
@ -20,12 +21,11 @@ int
|
|||||||
mca_pcm_base_send_schedule(FILE *fp,
|
mca_pcm_base_send_schedule(FILE *fp,
|
||||||
int jobid,
|
int jobid,
|
||||||
ompi_rte_node_schedule_t *sched,
|
ompi_rte_node_schedule_t *sched,
|
||||||
ompi_list_t *nodelist)
|
int num_procs)
|
||||||
{
|
{
|
||||||
int i, envc;
|
int i, envc;
|
||||||
ompi_list_item_t *node_item, *info_item;
|
ompi_list_item_t *node_item;
|
||||||
ompi_rte_node_allocation_t *node;
|
mca_llm_base_hostfile_data_t *node;
|
||||||
ompi_rte_valuepair_t *valpair;
|
|
||||||
|
|
||||||
fprintf(fp, START_KEY);
|
fprintf(fp, START_KEY);
|
||||||
fprintf(fp, "%d\n", PROTOCOL_VERSION);
|
fprintf(fp, "%d\n", PROTOCOL_VERSION);
|
||||||
@ -57,30 +57,8 @@ mca_pcm_base_send_schedule(FILE *fp,
|
|||||||
(strlen(sched->cwd) > 0) ? sched->cwd : "");
|
(strlen(sched->cwd) > 0) ? sched->cwd : "");
|
||||||
fflush(fp);
|
fflush(fp);
|
||||||
|
|
||||||
/* NODE LIST */
|
/* number of processes to start */
|
||||||
fprintf(fp, "%d\n", (int) ompi_list_get_size(nodelist));
|
fprintf(fp, "%d\n", num_procs);
|
||||||
for (node_item = ompi_list_get_first(nodelist) ;
|
|
||||||
node_item != ompi_list_get_end(nodelist) ;
|
|
||||||
node_item = ompi_list_get_next(node_item)) {
|
|
||||||
node = (ompi_rte_node_allocation_t*) node_item;
|
|
||||||
|
|
||||||
fprintf(fp, NODE_KEY);
|
|
||||||
fprintf(fp, "%d %s\n", (int) strlen(node->hostname),
|
|
||||||
node->hostname);
|
|
||||||
fprintf(fp, "%d\n", node->count);
|
|
||||||
|
|
||||||
/* INFO */
|
|
||||||
fprintf(fp, "%d\n", (int) ompi_list_get_size(node->info));
|
|
||||||
for (info_item = ompi_list_get_first(node->info) ;
|
|
||||||
info_item != ompi_list_get_end(node->info) ;
|
|
||||||
info_item = ompi_list_get_next(info_item)) {
|
|
||||||
valpair = (ompi_rte_valuepair_t*) info_item;
|
|
||||||
|
|
||||||
fprintf(fp, "%d %d %s %s\n",
|
|
||||||
(int) strlen(valpair->key), (int) strlen(valpair->value),
|
|
||||||
valpair->key, valpair->value);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* so we've basically ignored the fact we might error out up until
|
* so we've basically ignored the fact we might error out up until
|
||||||
@ -236,139 +214,11 @@ get_argv_array(FILE *fp, int *argcp, char ***argvp)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static int
|
|
||||||
get_keyval(FILE *fp, char **keyp, char **valp)
|
|
||||||
{
|
|
||||||
int ret;
|
|
||||||
char *key, *val;
|
|
||||||
int keylen, vallen;
|
|
||||||
size_t str_read;;
|
|
||||||
|
|
||||||
ret = fscanf(fp, "%d %d ", &keylen, &vallen);
|
|
||||||
if (ret != 2) return OMPI_ERROR;
|
|
||||||
|
|
||||||
key = (char*) malloc(sizeof(char) * (keylen + 2));
|
|
||||||
if (NULL == key) return OMPI_ERROR;
|
|
||||||
|
|
||||||
val = (char*) malloc(sizeof(char) * (vallen + 2));
|
|
||||||
if (NULL == val) {
|
|
||||||
free(key);
|
|
||||||
return OMPI_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* get the key */
|
|
||||||
str_read = fread(key, keylen, 1, fp);
|
|
||||||
if (str_read != 1) {
|
|
||||||
free(key);
|
|
||||||
free(val);
|
|
||||||
return OMPI_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* get the space */
|
|
||||||
ret = fgetc(fp);
|
|
||||||
if (ret != ' ') {
|
|
||||||
free(key);
|
|
||||||
free(val);
|
|
||||||
return OMPI_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* get the value */
|
|
||||||
str_read = fread(val, vallen, 1, fp);
|
|
||||||
if (str_read != 1) {
|
|
||||||
free(key);
|
|
||||||
free(val);
|
|
||||||
return OMPI_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* get the end of line newline */
|
|
||||||
ret = fgetc(fp);
|
|
||||||
if (ret != '\n') {
|
|
||||||
free(key);
|
|
||||||
free(val);
|
|
||||||
return OMPI_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
return OMPI_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static int
|
|
||||||
get_nodeinfo(FILE *fp, ompi_list_t *info)
|
|
||||||
{
|
|
||||||
ompi_rte_valuepair_t *newinfo;
|
|
||||||
int ret;
|
|
||||||
int info_len;
|
|
||||||
int i;
|
|
||||||
|
|
||||||
ret = fscanf(fp, "%d\n", &info_len);
|
|
||||||
if (ret != 1) return OMPI_ERROR;
|
|
||||||
|
|
||||||
for (i = 0 ; i < info_len ; ++i) {
|
|
||||||
ret = get_keyval(fp, &(newinfo->key), &(newinfo->value));
|
|
||||||
if (OMPI_SUCCESS != ret) {
|
|
||||||
OBJ_RELEASE(newinfo);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
ompi_list_append(info, (ompi_list_item_t*) newinfo);
|
|
||||||
}
|
|
||||||
|
|
||||||
return OMPI_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static int
|
|
||||||
get_nodelist(FILE *fp, ompi_list_t *nodelist)
|
|
||||||
{
|
|
||||||
int nodelist_len;
|
|
||||||
int ret;
|
|
||||||
ompi_rte_node_allocation_t *newnode;
|
|
||||||
int i;
|
|
||||||
char *tmpstr;
|
|
||||||
|
|
||||||
ret = fscanf(fp, "%d\n", &nodelist_len);
|
|
||||||
if (ret != 1) return OMPI_ERROR;
|
|
||||||
|
|
||||||
for (i = 0 ; i < nodelist_len ; ++i) {
|
|
||||||
/* make sure we have a key */
|
|
||||||
ret = get_key(fp, NODE_KEY);
|
|
||||||
if (OMPI_SUCCESS != ret) return ret;
|
|
||||||
|
|
||||||
/* create the node */
|
|
||||||
newnode = OBJ_NEW(ompi_rte_node_allocation_t);
|
|
||||||
/* fill in fields */
|
|
||||||
ret = get_string(fp, &tmpstr);
|
|
||||||
if (OMPI_SUCCESS != ret) {
|
|
||||||
OBJ_RELEASE(newnode);
|
|
||||||
return OMPI_ERROR;
|
|
||||||
}
|
|
||||||
strncpy(newnode->hostname, tmpstr, sizeof(newnode->hostname));
|
|
||||||
free(tmpstr);
|
|
||||||
|
|
||||||
ret = fscanf(fp, "%d\n", &(newnode->count));
|
|
||||||
if (ret != 1) {
|
|
||||||
OBJ_RELEASE(newnode);
|
|
||||||
return OMPI_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
ret = get_nodeinfo(fp, newnode->info);
|
|
||||||
if (OMPI_SUCCESS != ret) {
|
|
||||||
OBJ_RELEASE(newnode);
|
|
||||||
return OMPI_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
ompi_list_append(nodelist, (ompi_list_item_t*) newnode);
|
|
||||||
}
|
|
||||||
|
|
||||||
return OMPI_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
int
|
int
|
||||||
mca_pcm_base_recv_schedule(FILE *fp,
|
mca_pcm_base_recv_schedule(FILE *fp,
|
||||||
int *jobid,
|
int *jobid,
|
||||||
ompi_rte_node_schedule_t *sched,
|
ompi_rte_node_schedule_t *sched,
|
||||||
ompi_list_t *nodelist)
|
int *num_procs)
|
||||||
{
|
{
|
||||||
int ret, val;
|
int ret, val;
|
||||||
|
|
||||||
@ -396,9 +246,8 @@ mca_pcm_base_recv_schedule(FILE *fp,
|
|||||||
ret = get_string(fp, &(sched->cwd));
|
ret = get_string(fp, &(sched->cwd));
|
||||||
if (OMPI_SUCCESS != ret) return ret;
|
if (OMPI_SUCCESS != ret) return ret;
|
||||||
|
|
||||||
/* get node list */
|
/* get num procs */
|
||||||
ret = get_nodelist(fp, nodelist);
|
ret = get_int(fp, num_procs);
|
||||||
if (OMPI_SUCCESS != ret) return ret;
|
|
||||||
|
|
||||||
/* make sure we have our end */
|
/* make sure we have our end */
|
||||||
ret = get_key(fp, END_KEY);
|
ret = get_key(fp, END_KEY);
|
||||||
|
@ -13,8 +13,8 @@
|
|||||||
#include "util/argv.h"
|
#include "util/argv.h"
|
||||||
#include "runtime/runtime_types.h"
|
#include "runtime/runtime_types.h"
|
||||||
#include "mca/pcm/base/base.h"
|
#include "mca/pcm/base/base.h"
|
||||||
#include "mca/pcm/base/base.h"
|
#include "mca/llm/base/base.h"
|
||||||
|
#include "mca/llm/base/base_internal.h"
|
||||||
|
|
||||||
char *
|
char *
|
||||||
mca_pcm_base_no_unique_name(void)
|
mca_pcm_base_no_unique_name(void)
|
||||||
@ -48,13 +48,13 @@ mca_pcm_base_build_base_env(char **in_env, char ***out_envp)
|
|||||||
|
|
||||||
|
|
||||||
char *
|
char *
|
||||||
mca_pcm_base_get_username(ompi_rte_node_allocation_t *node)
|
mca_pcm_base_get_username(mca_llm_base_hostfile_node_t *host)
|
||||||
{
|
{
|
||||||
ompi_list_item_t *item;
|
ompi_list_item_t *item;
|
||||||
ompi_rte_valuepair_t *valpair;
|
ompi_rte_valuepair_t *valpair;
|
||||||
|
|
||||||
for (item = ompi_list_get_first(node->info) ;
|
for (item = ompi_list_get_first(host->info) ;
|
||||||
item != ompi_list_get_end(node->info) ;
|
item != ompi_list_get_end(host->info) ;
|
||||||
item = ompi_list_get_next(item)) {
|
item = ompi_list_get_next(item)) {
|
||||||
valpair = (ompi_rte_valuepair_t*) item;
|
valpair = (ompi_rte_valuepair_t*) item;
|
||||||
if (0 == strcmp("user", valpair->key)) return valpair->value;
|
if (0 == strcmp("user", valpair->key)) return valpair->value;
|
||||||
|
@ -37,7 +37,7 @@
|
|||||||
#define PRS_BUFSIZE 1024
|
#define PRS_BUFSIZE 1024
|
||||||
|
|
||||||
static int internal_spawn_proc(int jobid, ompi_rte_node_schedule_t *sched,
|
static int internal_spawn_proc(int jobid, ompi_rte_node_schedule_t *sched,
|
||||||
ompi_list_t *nodelist,
|
ompi_list_t *hostlist,
|
||||||
int my_start_vpid, int global_start_vpid,
|
int my_start_vpid, int global_start_vpid,
|
||||||
int num_procs);
|
int num_procs);
|
||||||
|
|
||||||
@ -55,9 +55,11 @@ mca_pcm_rsh_can_spawn(void)
|
|||||||
int
|
int
|
||||||
mca_pcm_rsh_spawn_procs(int jobid, ompi_list_t *schedlist)
|
mca_pcm_rsh_spawn_procs(int jobid, ompi_list_t *schedlist)
|
||||||
{
|
{
|
||||||
ompi_list_item_t *sched_item, *node_item;
|
ompi_list_item_t *sched_item, *node_item, *host_item;
|
||||||
ompi_rte_node_schedule_t *sched;
|
ompi_rte_node_schedule_t *sched;
|
||||||
ompi_rte_node_allocation_t *node;
|
ompi_rte_node_allocation_t *node;
|
||||||
|
mca_llm_base_hostfile_data_t *data;
|
||||||
|
mca_llm_base_hostfile_node_t *host;
|
||||||
ompi_list_t launch;
|
ompi_list_t launch;
|
||||||
ompi_list_t done;
|
ompi_list_t done;
|
||||||
int ret, i;
|
int ret, i;
|
||||||
@ -65,12 +67,11 @@ mca_pcm_rsh_spawn_procs(int jobid, ompi_list_t *schedlist)
|
|||||||
int local_start_vpid = 0;
|
int local_start_vpid = 0;
|
||||||
int global_start_vpid = 0;
|
int global_start_vpid = 0;
|
||||||
int num_procs = 0;
|
int num_procs = 0;
|
||||||
|
int tmp_count;
|
||||||
|
|
||||||
OBJ_CONSTRUCT(&launch, ompi_list_t);
|
OBJ_CONSTRUCT(&launch, ompi_list_t);
|
||||||
OBJ_CONSTRUCT(&done, ompi_list_t);
|
OBJ_CONSTRUCT(&done, ompi_list_t);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
for (sched_item = ompi_list_get_first(schedlist) ;
|
for (sched_item = ompi_list_get_first(schedlist) ;
|
||||||
sched_item != ompi_list_get_end(schedlist) ;
|
sched_item != ompi_list_get_end(schedlist) ;
|
||||||
sched_item = ompi_list_get_next(sched_item)) {
|
sched_item = ompi_list_get_next(sched_item)) {
|
||||||
@ -80,9 +81,13 @@ mca_pcm_rsh_spawn_procs(int jobid, ompi_list_t *schedlist)
|
|||||||
node_item != ompi_list_get_end(sched->nodelist) ;
|
node_item != ompi_list_get_end(sched->nodelist) ;
|
||||||
node_item = ompi_list_get_next(node_item)) {
|
node_item = ompi_list_get_next(node_item)) {
|
||||||
node = (ompi_rte_node_allocation_t*) node_item;
|
node = (ompi_rte_node_allocation_t*) node_item;
|
||||||
|
if (node->nodes > 0) {
|
||||||
|
num_procs += (node->count * node->nodes);
|
||||||
|
} else {
|
||||||
num_procs += node->count;
|
num_procs += node->count;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* BWB - make sure vpids are reserved */
|
/* BWB - make sure vpids are reserved */
|
||||||
local_start_vpid = global_start_vpid;
|
local_start_vpid = global_start_vpid;
|
||||||
@ -92,19 +97,30 @@ mca_pcm_rsh_spawn_procs(int jobid, ompi_list_t *schedlist)
|
|||||||
sched_item = ompi_list_get_next(sched_item)) {
|
sched_item = ompi_list_get_next(sched_item)) {
|
||||||
sched = (ompi_rte_node_schedule_t*) sched_item;
|
sched = (ompi_rte_node_schedule_t*) sched_item;
|
||||||
|
|
||||||
|
for (node_item = ompi_list_get_first(sched->nodelist) ;
|
||||||
|
node_item != ompi_list_get_end(sched->nodelist) ;
|
||||||
|
node_item = ompi_list_get_next(node_item) ) {
|
||||||
|
node = (ompi_rte_node_allocation_t*) node_item;
|
||||||
|
data = (mca_llm_base_hostfile_data_t*) node->data;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* make sure I'm the first node in the list and then start our
|
* make sure I'm the first node in the list and then start
|
||||||
* deal. We rsh me just like everyone else so that we don't
|
* our deal. We rsh me just like everyone else so that we
|
||||||
* have any unexpected environment oddities...
|
* don't have any unexpected environment oddities...
|
||||||
*/
|
*/
|
||||||
/* BWB - do front of list check! */
|
/* BWB - do front of list check! */
|
||||||
node_item = ompi_list_get_first(sched->nodelist);
|
host_item = ompi_list_get_first(data->hostlist);
|
||||||
|
|
||||||
while (node_item != ompi_list_get_end(sched->nodelist)) {
|
while (host_item != ompi_list_get_end(data->hostlist)) {
|
||||||
/* find enough entries for this slice to go */
|
/* find enough entries for this slice to go */
|
||||||
|
tmp_count = 0;
|
||||||
for (i = 0 ;
|
for (i = 0 ;
|
||||||
i < width && node_item != ompi_list_get_end(sched->nodelist) ;
|
i < width &&
|
||||||
node_item = ompi_list_get_next(node_item), ++i) { }
|
host_item != ompi_list_get_end(data->hostlist) ;
|
||||||
|
host_item = ompi_list_get_next(host_item), ++i) {
|
||||||
|
host = (mca_llm_base_hostfile_node_t*) host_item;
|
||||||
|
tmp_count += host->count;
|
||||||
|
}
|
||||||
/* if we don't have anyone, get us out of here.. */
|
/* if we don't have anyone, get us out of here.. */
|
||||||
if (i == 0) {
|
if (i == 0) {
|
||||||
continue;
|
continue;
|
||||||
@ -112,9 +128,9 @@ mca_pcm_rsh_spawn_procs(int jobid, ompi_list_t *schedlist)
|
|||||||
|
|
||||||
/* make a launch list */
|
/* make a launch list */
|
||||||
ompi_list_splice(&launch, ompi_list_get_end(&launch),
|
ompi_list_splice(&launch, ompi_list_get_end(&launch),
|
||||||
sched->nodelist,
|
data->hostlist,
|
||||||
ompi_list_get_first(sched->nodelist),
|
ompi_list_get_first(data->hostlist),
|
||||||
node_item);
|
host_item);
|
||||||
|
|
||||||
/* do the launch to the first node in the list, passing
|
/* do the launch to the first node in the list, passing
|
||||||
him the rest of the list */
|
him the rest of the list */
|
||||||
@ -125,17 +141,21 @@ mca_pcm_rsh_spawn_procs(int jobid, ompi_list_t *schedlist)
|
|||||||
/* well, crap! put ourselves back together, I guess.
|
/* well, crap! put ourselves back together, I guess.
|
||||||
Should call killjob */
|
Should call killjob */
|
||||||
ompi_list_join(&done, ompi_list_get_end(&done), &launch);
|
ompi_list_join(&done, ompi_list_get_end(&done), &launch);
|
||||||
ompi_list_join(sched->nodelist,
|
ompi_list_join(data->hostlist,
|
||||||
ompi_list_get_first(sched->nodelist),
|
ompi_list_get_first(data->hostlist),
|
||||||
&done);
|
&done);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
local_start_vpid +=
|
local_start_vpid += tmp_count;
|
||||||
((ompi_rte_node_allocation_t*) ompi_list_get_first(&launch))->count;
|
|
||||||
|
|
||||||
/* copy the list over to the done part */
|
/* copy the list over to the done part */
|
||||||
ompi_list_join(&done, ompi_list_get_end(&done), &launch);
|
ompi_list_join(&done, ompi_list_get_end(&done), &launch);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* put the list back where we found it... */
|
||||||
|
ompi_list_join(data->hostlist, ompi_list_get_end(data->hostlist),
|
||||||
|
&done);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
OBJ_DESTRUCT(&done);
|
OBJ_DESTRUCT(&done);
|
||||||
@ -146,7 +166,7 @@ mca_pcm_rsh_spawn_procs(int jobid, ompi_list_t *schedlist)
|
|||||||
|
|
||||||
|
|
||||||
static int
|
static int
|
||||||
internal_need_profile(ompi_rte_node_allocation_t *start_node,
|
internal_need_profile(mca_llm_base_hostfile_node_t *start_node,
|
||||||
int stderr_is_error, bool *needs_profile)
|
int stderr_is_error, bool *needs_profile)
|
||||||
{
|
{
|
||||||
struct passwd *p;
|
struct passwd *p;
|
||||||
@ -253,12 +273,12 @@ cleanup:
|
|||||||
|
|
||||||
static int
|
static int
|
||||||
internal_spawn_proc(int jobid, ompi_rte_node_schedule_t *sched,
|
internal_spawn_proc(int jobid, ompi_rte_node_schedule_t *sched,
|
||||||
ompi_list_t *nodelist, int my_start_vpid,
|
ompi_list_t *hostlist, int my_start_vpid,
|
||||||
int global_start_vpid, int num_procs)
|
int global_start_vpid, int num_procs)
|
||||||
{
|
{
|
||||||
int kidstdin[2]; /* child stdin pipe */
|
int kidstdin[2]; /* child stdin pipe */
|
||||||
bool needs_profile = false;
|
bool needs_profile = false;
|
||||||
ompi_rte_node_allocation_t *start_node;
|
mca_llm_base_hostfile_node_t *start_node;
|
||||||
char** cmdv = NULL;
|
char** cmdv = NULL;
|
||||||
char *cmd0 = NULL;
|
char *cmd0 = NULL;
|
||||||
int cmdc = 0;
|
int cmdc = 0;
|
||||||
@ -272,7 +292,7 @@ internal_spawn_proc(int jobid, ompi_rte_node_schedule_t *sched,
|
|||||||
int i;
|
int i;
|
||||||
char *tmp;
|
char *tmp;
|
||||||
|
|
||||||
start_node = (ompi_rte_node_allocation_t*) ompi_list_get_first(nodelist);
|
start_node = (mca_llm_base_hostfile_node_t*) ompi_list_get_first(hostlist);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Check to see if we need to do the .profile thing
|
* Check to see if we need to do the .profile thing
|
||||||
@ -306,7 +326,7 @@ internal_spawn_proc(int jobid, ompi_rte_node_schedule_t *sched,
|
|||||||
|
|
||||||
/* build the command to start */
|
/* build the command to start */
|
||||||
ompi_argv_append(&cmdc, &cmdv, BOOTAGENT);
|
ompi_argv_append(&cmdc, &cmdv, BOOTAGENT);
|
||||||
|
#if 1
|
||||||
/* starting vpid for launchee's procs */
|
/* starting vpid for launchee's procs */
|
||||||
tmp = ltostr(my_start_vpid);
|
tmp = ltostr(my_start_vpid);
|
||||||
ompi_argv_append(&cmdc, &cmdv, "--local_start_vpid");
|
ompi_argv_append(&cmdc, &cmdv, "--local_start_vpid");
|
||||||
@ -324,7 +344,7 @@ internal_spawn_proc(int jobid, ompi_rte_node_schedule_t *sched,
|
|||||||
ompi_argv_append(&cmdc, &cmdv, "--num_procs");
|
ompi_argv_append(&cmdc, &cmdv, "--num_procs");
|
||||||
ompi_argv_append(&cmdc, &cmdv, tmp);
|
ompi_argv_append(&cmdc, &cmdv, tmp);
|
||||||
free(tmp);
|
free(tmp);
|
||||||
|
#endif
|
||||||
/* add the end of the .profile thing if required */
|
/* add the end of the .profile thing if required */
|
||||||
if (needs_profile) {
|
if (needs_profile) {
|
||||||
ompi_argv_append(&cmdc, &cmdv, ")");
|
ompi_argv_append(&cmdc, &cmdv, ")");
|
||||||
@ -377,7 +397,7 @@ internal_spawn_proc(int jobid, ompi_rte_node_schedule_t *sched,
|
|||||||
/* send our stuff down the wire */
|
/* send our stuff down the wire */
|
||||||
fp = fdopen(kidstdin[1], "a");
|
fp = fdopen(kidstdin[1], "a");
|
||||||
if (fp == NULL) { perror("fdopen"); abort(); }
|
if (fp == NULL) { perror("fdopen"); abort(); }
|
||||||
ret = mca_pcm_base_send_schedule(fp, jobid, sched, nodelist);
|
ret = mca_pcm_base_send_schedule(fp, jobid, sched, start_node->count);
|
||||||
fclose(fp);
|
fclose(fp);
|
||||||
if (OMPI_SUCCESS != ret) {
|
if (OMPI_SUCCESS != ret) {
|
||||||
kill(pid, SIGTERM);
|
kill(pid, SIGTERM);
|
||||||
|
@@ -302,6 +302,8 @@ ompi_rte_int_node_schedule_destruct(ompi_object_t *obj)
     ompi_rte_node_allocation_t *node;
     ompi_list_item_t *item;
 
+    if (NULL == sched->nodelist) return;
+
     while (NULL != (item = ompi_list_remove_first(sched->nodelist))) {
         node = (ompi_rte_node_allocation_t*) item;
         OBJ_RELEASE(node);
@@ -317,7 +319,10 @@ void
 ompi_rte_int_node_allocation_construct(ompi_object_t *obj)
 {
     ompi_rte_node_allocation_t *node = (ompi_rte_node_allocation_t*) obj;
-    node->info = OBJ_NEW(ompi_list_t);
+    node->start = 0;
+    node->nodes = 0;
+    node->count = 0;
+    node->data = NULL;
 }
 
 
@@ -327,15 +332,10 @@ void
 ompi_rte_int_node_allocation_destruct(ompi_object_t *obj)
 {
     ompi_rte_node_allocation_t *node = (ompi_rte_node_allocation_t*) obj;
-    ompi_rte_valuepair_t *valpair;
-    ompi_list_item_t *item;
 
-    while (NULL != (item = ompi_list_remove_first(node->info))) {
-        valpair = (ompi_rte_valuepair_t*) item;
-        OBJ_RELEASE(valpair);
-    }
+    if (NULL == node->data) return;
 
-    OBJ_RELEASE(node->info);
+    OBJ_RELEASE(node->data);
 }
 
 
@@ -372,3 +372,6 @@ OBJ_CLASS_INSTANCE(ompi_rte_node_allocation_t, ompi_list_item_t,
 OBJ_CLASS_INSTANCE(ompi_rte_valuepair_t, ompi_list_item_t,
                    ompi_rte_int_valuepair_construct,
                    ompi_rte_int_valuepair_destruct);
+/** create instance information for \c ompi_rte_node_allocation_data_t */
+OBJ_CLASS_INSTANCE(ompi_rte_node_allocation_data_t, ompi_object_t,
+                   NULL, NULL);
@@ -12,7 +12,7 @@
 extern mca_pcm_base_module_t mca_pcm;
 
 ompi_list_t*
-ompi_rte_allocate_resources(int jobid, int nodes, int procs)
+ompi_rte_allocate_resources(mca_ns_base_jobid_t jobid, int nodes, int procs)
 {
     if (NULL == mca_pcm.pcm_allocate_resources) {
         return NULL;
@@ -23,7 +23,7 @@ ompi_rte_allocate_resources(int jobid, int nodes, int procs)
 
 
 int
-ompi_rte_deallocate_resources(int jobid, ompi_list_t *nodelist)
+ompi_rte_deallocate_resources(mca_ns_base_jobid_t jobid, ompi_list_t *nodelist)
 {
     if (NULL == mca_pcm.pcm_deallocate_resources) {
         return OMPI_ERROR;
@@ -25,7 +25,7 @@ ompi_rte_can_spawn(void)
 
 
 int
-ompi_rte_spawn_procs(int jobid, ompi_list_t *schedule_list)
+ompi_rte_spawn_procs(mca_ns_base_jobid_t jobid, ompi_list_t *schedule_list)
 {
     if (NULL == mca_pcm.pcm_spawn_procs) {
         return OMPI_ERROR;
@@ -69,7 +69,7 @@ ompi_rte_kill_proc(ompi_process_name_t *name, int flags)
 
 
 int
-ompi_rte_kill_job(int jobid, int flags)
+ompi_rte_kill_job(mca_ns_base_jobid_t jobid, int flags)
 {
     if (NULL == mca_pcm.pcm_kill_job) {
         return OMPI_ERROR;
@@ -14,6 +14,7 @@
 #include "ompi_config.h"
 
 #include "runtime/runtime_types.h"
+#include "mca/ns/ns.h"
 
 /* For backwards compatibility.  If you only need MPI stuff, please include
    mpiruntime/mpiruntime.h directly */
@ -110,21 +111,23 @@ extern "C" {
|
|||||||
* once per jobid.
|
* once per jobid.
|
||||||
*
|
*
|
||||||
* @param jobid (IN) Jobid with which to associate the given resources.
|
* @param jobid (IN) Jobid with which to associate the given resources.
|
||||||
* @param nodes (IN) Number of nodes to try to allocate. If 0, the
|
* @param nodes (IN) Number of ndoes to try to allocate. If 0, the
|
||||||
* LLM will try to allocate <code>procs</code>
|
* allocator will try to allocate \c procs processes
|
||||||
* processes on as many nodes as are needed. If
|
* on as many nodes as are needed. If non-zero,
|
||||||
* non-zero, will try to fairly distribute
|
* will try to allocate \c procs process slots
|
||||||
* <code>procs</code> processes over the nodes.
|
* per node.
|
||||||
* If <code>procs</code> is 0, will attempt to
|
|
||||||
* allocate all cpus on <code>nodes</code> nodes
|
|
||||||
* @param procs (IN) Number of processors to try to allocate. See the note
|
* @param procs (IN) Number of processors to try to allocate. See the note
|
||||||
* for <code>nodes</code> for usage.
|
* for <code>nodes</code> for usage.
|
||||||
* @return List of <code>ompi_rte_node_allocation_t</code>s
|
* @return List of <code>ompi_rte_node_allocation_t</code>s
|
||||||
* describing the allocated resources.
|
* describing the allocated resources.
|
||||||
*
|
*
|
||||||
* @warning The type for jobid will change in the near future
|
* @note In the future, a more complex resource allocation
|
||||||
|
* function may be added, which allows for complicated
|
||||||
|
* resource requests. This function will continue to exist
|
||||||
|
* as a special case of that function.
|
||||||
*/
|
*/
|
||||||
ompi_list_t* ompi_rte_allocate_resources(int jobid, int nodes, int procs);
|
ompi_list_t* ompi_rte_allocate_resources(mca_ns_base_jobid_t jobid,
|
||||||
|
int nodes, int procs);
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -144,9 +147,9 @@ extern "C" {
|
|||||||
* of \c mca_pcm_base_schedule_t structures, which give both process
|
* of \c mca_pcm_base_schedule_t structures, which give both process
|
||||||
* and location information.
|
* and location information.
|
||||||
*
|
*
|
||||||
* @warning Parameter list will probably change in the near future.
|
|
||||||
*/
|
*/
|
||||||
int ompi_rte_spawn_procs(int jobid, ompi_list_t *schedule_list);
|
int ompi_rte_spawn_procs(mca_ns_base_jobid_t jobid,
|
||||||
|
ompi_list_t *schedule_list);
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -210,7 +213,7 @@ extern "C" {
|
|||||||
* future compatibility. Will be used to specify how to kill
|
* future compatibility. Will be used to specify how to kill
|
||||||
* processes (0 will be same as a "kill <pid>"
|
* processes (0 will be same as a "kill <pid>"
|
||||||
*/
|
*/
|
||||||
int ompi_rte_kill_job(int jobid, int flags);
|
int ompi_rte_kill_job(mca_ns_base_jobid_t jobid, int flags);
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -221,10 +224,9 @@ extern "C" {
|
|||||||
* @param jobid (IN) Jobid associated with the resources to be freed.
|
* @param jobid (IN) Jobid associated with the resources to be freed.
|
||||||
* @param nodes (IN) Nodelist from associated allocate_resource call.
|
* @param nodes (IN) Nodelist from associated allocate_resource call.
|
||||||
* All associated memory will be freed as appropriate.
|
* All associated memory will be freed as appropriate.
|
||||||
*
|
|
||||||
* @warning The type for jobid will change in the near future.
|
|
||||||
*/
|
*/
|
||||||
int ompi_rte_deallocate_resources(int jobid, ompi_list_t *nodelist);
|
int ompi_rte_deallocate_resources(mca_ns_base_jobid_t jobid,
|
||||||
|
ompi_list_t *nodelist);
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@@ -59,22 +59,58 @@ OBJ_CLASS_DECLARATION(ompi_rte_node_schedule_t);
 
 
 /**
- * Node
+ * Base container for node-related information
  *
- * Container for allocation and deallocation of resources used to
- * launch parallel jobs.
+ * Base container type for holding llm/pcm private information on a \c
+ * ompi_rte_node_allocation_t container.
+ */
+struct ompi_rte_node_allocation_data_t {
+    /** make us an instance of object so our constructors go boom */
+    ompi_object_t super;
+};
+/** shorten ompi_rte_node_allocation_data_t declarations */
+typedef struct ompi_rte_node_allocation_data_t ompi_rte_node_allocation_data_t;
+/** create the required instance information */
+OBJ_CLASS_DECLARATION(ompi_rte_node_allocation_data_t);
+
+
+/**
+ * Resource allocation container
  *
+ * Container for passing information between the resource allocator,
+ * the resource/job mapper, and the job starter portions of the
+ * run-time environment.
+ *
+ * \c count has a strange meaning.  If \c nodes is 0, \c count is the
+ * total number of cpus available in this block of resources.  If \c
+ * nodes is non-zero, \c count is the number of cpus available per
+ * node.
+ *
+ * \c start provides an integer number of where in the job the
+ * resource is available.  If you had two node_allocation_t elements
+ * returned from a call to allocate resources, one with
+ * nodes=4,count=2 and one with nodes=2,count=4, start would be 0 for
+ * the first element and 8 for the second.
+ *
+ * The contents of the structure (with the exception of \c data) may
+ * be examined by the process mapping functions.  However, the fields
+ * should be considered read-only.  The \c data field may contain
+ * private data that reflects the status of the \c nodes and \c count
+ * fields.  The \c ompi_rte_node_* functions are available for
+ * manipulating \c ompi_rte_node_allocation_t structures.
  */
 struct ompi_rte_node_allocation_t {
     /** make us an instance of list item */
     ompi_list_item_t super;
-    /** hostname for this node.  Can be used as generic description
-        field if hostnames aren't used on this platform */
-    char hostname[MAXHOSTNAMELEN];
-    /** number of MPI processes Open MPI can start on this host */
+    /** start of allocation numbers for this block of nodes */
+    int start;
+    /** number of nodes in this allocation - 0 means unknown */
+    int nodes;
+    /** number of "process slots" (places to start a process) that
+        are allocated as part of this block of processes */
     int count;
-    /** generic key=value storage mechanism */
-    ompi_list_t *info;
+    /** data store for use by the Open MPI run-time environment */
+    ompi_rte_node_allocation_data_t *data;
 };
 /** shorten ompi_rte_allocation_t declarations */
 typedef struct ompi_rte_node_allocation_t ompi_rte_node_allocation_t;
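To make the start/nodes/count bookkeeping concrete, the numbers in the comment above fall
out of the loop that mca_llm_base_create_node_allocation (earlier in this diff) runs over
the allocation list; nodelist, nodeitem, and node are the same variables used there:

    int start_count = 0;
    for (nodeitem = ompi_list_get_first(nodelist) ;
         nodeitem != ompi_list_get_end(nodelist) ;
         nodeitem = ompi_list_get_next(nodeitem)) {
        node = (ompi_rte_node_allocation_t*) nodeitem;
        node->start = start_count;                 /* first block gets 0           */
        start_count += node->nodes * node->count;  /* 4*2 = 8, so the second block
                                                      (nodes=2,count=4) starts at 8 */
    }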
@ -25,7 +25,6 @@ int
|
|||||||
main(int argc, char *argv[])
|
main(int argc, char *argv[])
|
||||||
{
|
{
|
||||||
ompi_rte_node_schedule_t *sched;
|
ompi_rte_node_schedule_t *sched;
|
||||||
ompi_rte_node_allocation_t *node;
|
|
||||||
pid_t pid;
|
pid_t pid;
|
||||||
int i;
|
int i;
|
||||||
int ret;
|
int ret;
|
||||||
@ -33,7 +32,8 @@ main(int argc, char *argv[])
|
|||||||
ompi_cmd_line_t *cmd_line = NULL;
|
ompi_cmd_line_t *cmd_line = NULL;
|
||||||
int local_vpid_start, global_vpid_start;
|
int local_vpid_start, global_vpid_start;
|
||||||
int cellid = 0;
|
int cellid = 0;
|
||||||
int num_procs;
|
int total_num_procs;
|
||||||
|
int fork_num_procs;
|
||||||
char *env_buf;
|
char *env_buf;
|
||||||
|
|
||||||
ompi_init(argc, argv);
|
ompi_init(argc, argv);
|
||||||
@ -68,26 +68,18 @@ main(int argc, char *argv[])
|
|||||||
show_usage(argv[0]);
|
show_usage(argv[0]);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
num_procs = atoi(ompi_cmd_line_get_param(cmd_line, "num_procs", 0, 0));
|
total_num_procs = atoi(ompi_cmd_line_get_param(cmd_line, "num_procs", 0, 0));
|
||||||
|
|
||||||
sched = OBJ_NEW(ompi_rte_node_schedule_t);
|
sched = OBJ_NEW(ompi_rte_node_schedule_t);
|
||||||
|
|
||||||
/* recv_schedule wants an already initialized ompi_list_t */
|
/* recv_schedule wants an already initialized ompi_list_t */
|
||||||
ret = mca_pcm_base_recv_schedule(stdin, &jobid, sched,
|
ret = mca_pcm_base_recv_schedule(stdin, &jobid, sched,
|
||||||
sched->nodelist);
|
&fork_num_procs);
|
||||||
if (ret != OMPI_SUCCESS) {
|
if (ret != OMPI_SUCCESS) {
|
||||||
fprintf(stderr, "Failure in receiving schedule information\n");
|
fprintf(stderr, "Failure in receiving schedule information\n");
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* sanity check */
|
|
||||||
if (ompi_list_get_size(sched->nodelist) > 1) {
|
|
||||||
fprintf(stderr, "Received more than one node - ignoring extra info\n");
|
|
||||||
}
|
|
||||||
if (ompi_list_get_size(sched->nodelist) < 1) {
|
|
||||||
fprintf(stderr, "Received less than one node\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
/* fill our environment */
|
/* fill our environment */
|
||||||
for (i = 0 ; sched->env[i] != NULL ; ++i) {
|
for (i = 0 ; sched->env[i] != NULL ; ++i) {
|
||||||
putenv(sched->env[i]);
|
putenv(sched->env[i]);
|
||||||
@ -97,7 +89,7 @@ main(int argc, char *argv[])
|
|||||||
putenv(env_buf);
|
putenv(env_buf);
|
||||||
asprintf(&env_buf, "OMPI_MCA_pcmclient_env_jobid=%d", jobid);
|
asprintf(&env_buf, "OMPI_MCA_pcmclient_env_jobid=%d", jobid);
|
||||||
putenv(env_buf);
|
putenv(env_buf);
|
||||||
asprintf(&env_buf, "OMPI_MCA_pcmclient_env_num_procs=%d", num_procs);
|
asprintf(&env_buf, "OMPI_MCA_pcmclient_env_num_procs=%d", total_num_procs);
|
||||||
putenv(env_buf);
|
putenv(env_buf);
|
||||||
asprintf(&env_buf, "OMPI_MCA_pcmclient_env_vpid_start=%d",
|
asprintf(&env_buf, "OMPI_MCA_pcmclient_env_vpid_start=%d",
|
||||||
global_vpid_start);
|
global_vpid_start);
|
||||||
@ -112,9 +104,8 @@ main(int argc, char *argv[])
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
node = (ompi_rte_node_allocation_t*) ompi_list_get_first(sched->nodelist);
|
|
||||||
/* let's go! - if we are the parent, don't stick around... */
|
/* let's go! - if we are the parent, don't stick around... */
|
||||||
for (i = 0 ; i < node->count ; ++i) {
|
for (i = 0 ; i < fork_num_procs ; ++i) {
|
||||||
pid = fork();
|
pid = fork();
|
||||||
if (pid < 0) {
|
if (pid < 0) {
|
||||||
/* error :( */
|
/* error :( */
|
||||||
|
@ -21,7 +21,7 @@ int
|
|||||||
main(int argc, char *argv[])
|
main(int argc, char *argv[])
|
||||||
{
|
{
|
||||||
ompi_list_t *hostlist;
|
ompi_list_t *hostlist;
|
||||||
ompi_rte_node_allocation_t *node;
|
mca_llm_base_hostfile_node_t *node;
|
||||||
ompi_rte_valuepair_t *valpair;
|
ompi_rte_valuepair_t *valpair;
|
||||||
ompi_list_item_t *nodeitem, *valpairitem;
|
ompi_list_item_t *nodeitem, *valpairitem;
|
||||||
FILE *test1_out=NULL; /* output file for first test */
|
FILE *test1_out=NULL; /* output file for first test */
|
||||||
@ -55,7 +55,7 @@ main(int argc, char *argv[])
|
|||||||
nodeitem != ompi_list_get_end(hostlist);
|
nodeitem != ompi_list_get_end(hostlist);
|
||||||
nodeitem = ompi_list_get_next(nodeitem)) {
|
nodeitem = ompi_list_get_next(nodeitem)) {
|
||||||
|
|
||||||
node = (ompi_rte_node_allocation_t*) nodeitem;
|
node = (mca_llm_base_hostfile_node_t*) nodeitem;
|
||||||
fprintf(test1_out, "\t%s %d\n", node->hostname, node->count);
|
fprintf(test1_out, "\t%s %d\n", node->hostname, node->count);
|
||||||
|
|
||||||
for (valpairitem = ompi_list_get_first(node->info);
|
for (valpairitem = ompi_list_get_first(node->info);
|
||||||
@ -84,7 +84,7 @@ main(int argc, char *argv[])
|
|||||||
nodeitem != ompi_list_get_end(hostlist);
|
nodeitem != ompi_list_get_end(hostlist);
|
||||||
nodeitem = ompi_list_get_next(nodeitem)) {
|
nodeitem = ompi_list_get_next(nodeitem)) {
|
||||||
|
|
||||||
node = (ompi_rte_node_allocation_t*) nodeitem;
|
node = (mca_llm_base_hostfile_node_t*) nodeitem;
|
||||||
fprintf(test2_out, "\t%s %d\n", node->hostname, node->count);
|
fprintf(test2_out, "\t%s %d\n", node->hostname, node->count);
|
||||||
|
|
||||||
for (valpairitem = ompi_list_get_first(node->info);
|
for (valpairitem = ompi_list_get_first(node->info);
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
#include "mca/pcm/base/base.h"
|
#include "mca/pcm/base/base.h"
|
||||||
|
#include "util/argv.h"
|
||||||
|
|
||||||
char *env[] = {
|
char *env[] = {
|
||||||
"ENV0=",
|
"ENV0=",
|
||||||
|
@ -30,6 +30,8 @@ main(int argc, char *argv[])
|
|||||||
FILE *test2_in = NULL;
|
FILE *test2_in = NULL;
|
||||||
int result; /* result of system call */
|
int result; /* result of system call */
|
||||||
int jobid = 123;
|
int jobid = 123;
|
||||||
|
int out_num_procs = 5;
|
||||||
|
int in_num_procs = 5;
|
||||||
|
|
||||||
test_init("sched_comm_t");
|
test_init("sched_comm_t");
|
||||||
|
|
||||||
@ -52,7 +54,7 @@ main(int argc, char *argv[])
|
|||||||
schedout->cwd = "/foo/bar/baz";
|
schedout->cwd = "/foo/bar/baz";
|
||||||
|
|
||||||
result = mca_pcm_base_send_schedule(test1_out, jobid, schedout,
|
result = mca_pcm_base_send_schedule(test1_out, jobid, schedout,
|
||||||
schedout->nodelist);
|
out_num_procs);
|
||||||
if (result != OMPI_SUCCESS) {
|
if (result != OMPI_SUCCESS) {
|
||||||
test_failure("send_schedule failed");
|
test_failure("send_schedule failed");
|
||||||
exit(1);
|
exit(1);
|
||||||
@ -75,12 +77,12 @@ main(int argc, char *argv[])
|
|||||||
test2_in = fopen("./test1_out", "r");
|
test2_in = fopen("./test1_out", "r");
|
||||||
|
|
||||||
result = mca_pcm_base_recv_schedule(test2_in, &jobid, schedin,
|
result = mca_pcm_base_recv_schedule(test2_in, &jobid, schedin,
|
||||||
schedin->nodelist);
|
&in_num_procs);
|
||||||
if (result != OMPI_SUCCESS) {
|
if (result != OMPI_SUCCESS) {
|
||||||
test_failure("recv_schedule failed");
|
test_failure("recv_schedule failed");
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
mca_pcm_base_send_schedule(test2_out, jobid, schedin, schedin->nodelist);
|
mca_pcm_base_send_schedule(test2_out, jobid, schedin, in_num_procs);
|
||||||
if (result != OMPI_SUCCESS) {
|
if (result != OMPI_SUCCESS) {
|
||||||
test_failure("send_schedule (2) failed");
|
test_failure("send_schedule (2) failed");
|
||||||
exit(1);
|
exit(1);
|
||||||
|
@ -2,11 +2,11 @@
|
|||||||
1
|
1
|
||||||
123
|
123
|
||||||
1
|
1
|
||||||
12 ./sched_comm
|
73 /Users/brbarret/research/ompi/nodelist/test/mca/pcm/base/.libs/sched_comm
|
||||||
3
|
3
|
||||||
19 ENV1=blah blah blah
|
19 ENV1=blah blah blah
|
||||||
19 ENV2=foo bar is fun
|
19 ENV2=foo bar is fun
|
||||||
8 ENV3=123
|
8 ENV3=123
|
||||||
12 /foo/bar/baz
|
12 /foo/bar/baz
|
||||||
0
|
5
|
||||||
@MCA_PCM_END@
|
@MCA_PCM_END@
|
||||||
|
@ -2,11 +2,11 @@
|
|||||||
1
|
1
|
||||||
123
|
123
|
||||||
1
|
1
|
||||||
12 ./sched_comm
|
73 /Users/brbarret/research/ompi/nodelist/test/mca/pcm/base/.libs/sched_comm
|
||||||
3
|
3
|
||||||
19 ENV1=blah blah blah
|
19 ENV1=blah blah blah
|
||||||
19 ENV2=foo bar is fun
|
19 ENV2=foo bar is fun
|
||||||
8 ENV3=123
|
8 ENV3=123
|
||||||
12 /foo/bar/baz
|
12 /foo/bar/baz
|
||||||
0
|
5
|
||||||
@MCA_PCM_END@
|
@MCA_PCM_END@
|
||||||
|