Fixed a number of issues related to resource allocation:
- Simplified the logic of the ras modules by moving the attribute handling into the base allocation function. This allows us to decide how to allocate based on the situation, and solves some of the allocation problems we were having with comm_spawn. - Moved the proxy component into the base. This was done because we always want to call the proxy functions if we are not on an HNP, regardless of the attributes passed. - Got rid of the hostfile component. What little logic was in it was moved into the base to deal with other circumstances. The hostfile information is currently being propagated into the registry by the RDS, so we just use what is already in the registry. - Renamed some slurm functions so that they have the proper prefix. Not strictly necessary as they were static, but it makes debugging much easier. - Fixed a buglet in the round_robin rmaps where we would return an error when really no error occurred. I tried to make proper corrections to all the ras modules, but I cannot test all of them. This commit was SVN r12202.
Этот коммит содержится в:
родитель
ab196c3121
Коммит
ade94b523b
@ -18,7 +18,8 @@
|
||||
|
||||
headers += \
|
||||
base/base.h \
|
||||
base/ras_private.h
|
||||
base/ras_private.h \
|
||||
base/proxy/ras_base_proxy.h
|
||||
|
||||
libmca_ras_la_SOURCES += \
|
||||
base/ras_base_alloc.c \
|
||||
@ -35,4 +36,6 @@ libmca_ras_la_SOURCES += \
|
||||
base/data_type_support/ras_data_type_print_fns.c \
|
||||
base/data_type_support/ras_data_type_release_fns.c \
|
||||
base/data_type_support/ras_data_type_size_fns.c \
|
||||
base/data_type_support/ras_data_type_unpacking_fns.c
|
||||
base/data_type_support/ras_data_type_unpacking_fns.c \
|
||||
base/proxy/ras_base_proxy.c \
|
||||
base/proxy/ras_base_proxy_component.c
|
||||
|
@ -76,6 +76,7 @@ typedef struct orte_ras_base_t {
|
||||
int ras_output;
|
||||
opal_list_t ras_opened;
|
||||
bool ras_opened_valid;
|
||||
bool ras_using_proxy;
|
||||
opal_list_t ras_available;
|
||||
bool ras_available_valid;
|
||||
orte_std_cntr_t ras_num_nodes;
|
||||
|
@ -15,9 +15,6 @@
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/** @file:
|
||||
*
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
@ -31,17 +28,9 @@
|
||||
#include "orte/mca/rml/rml.h"
|
||||
|
||||
#include "orte/mca/ras/base/ras_private.h"
|
||||
#include "ras_proxy.h"
|
||||
#include "ras_base_proxy.h"
|
||||
|
||||
/**
|
||||
* globals
|
||||
*/
|
||||
|
||||
/*
|
||||
* functions
|
||||
*/
|
||||
|
||||
int orte_ras_proxy_allocate(orte_jobid_t job, opal_list_t *attributes)
|
||||
int orte_ras_base_proxy_allocate(orte_jobid_t job, opal_list_t *attributes)
|
||||
{
|
||||
orte_buffer_t* cmd;
|
||||
orte_buffer_t* answer;
|
||||
@ -75,7 +64,7 @@ int orte_ras_proxy_allocate(orte_jobid_t job, opal_list_t *attributes)
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (0 > orte_rml.send_buffer(orte_ras_proxy_replica, cmd, ORTE_RML_TAG_RAS, 0)) {
|
||||
if (0 > orte_rml.send_buffer(orte_ras_base_proxy_replica, cmd, ORTE_RML_TAG_RAS, 0)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_RELEASE(cmd);
|
||||
return ORTE_ERR_COMM_FAILURE;
|
||||
@ -88,7 +77,7 @@ int orte_ras_proxy_allocate(orte_jobid_t job, opal_list_t *attributes)
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
if (0 > orte_rml.recv_buffer(orte_ras_proxy_replica, answer, ORTE_RML_TAG_RAS)) {
|
||||
if (0 > orte_rml.recv_buffer(orte_ras_base_proxy_replica, answer, ORTE_RML_TAG_RAS)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_RELEASE(answer);
|
||||
return ORTE_ERR_COMM_FAILURE;
|
||||
@ -111,7 +100,7 @@ int orte_ras_proxy_allocate(orte_jobid_t job, opal_list_t *attributes)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_ras_proxy_deallocate(orte_jobid_t job)
|
||||
int orte_ras_base_proxy_deallocate(orte_jobid_t job)
|
||||
{
|
||||
orte_buffer_t* cmd;
|
||||
orte_buffer_t* answer;
|
||||
@ -139,7 +128,7 @@ int orte_ras_proxy_deallocate(orte_jobid_t job)
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (0 > orte_rml.send_buffer(orte_ras_proxy_replica, cmd, ORTE_RML_TAG_RAS, 0)) {
|
||||
if (0 > orte_rml.send_buffer(orte_ras_base_proxy_replica, cmd, ORTE_RML_TAG_RAS, 0)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_RELEASE(cmd);
|
||||
return ORTE_ERR_COMM_FAILURE;
|
||||
@ -152,7 +141,7 @@ int orte_ras_proxy_deallocate(orte_jobid_t job)
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
if (0 > orte_rml.recv_buffer(orte_ras_proxy_replica, answer, ORTE_RML_TAG_RAS)) {
|
||||
if (0 > orte_rml.recv_buffer(orte_ras_base_proxy_replica, answer, ORTE_RML_TAG_RAS)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_RELEASE(answer);
|
||||
return ORTE_ERR_COMM_FAILURE;
|
@ -1,5 +1,4 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
@ -29,26 +28,28 @@ extern "C" {
|
||||
#endif
|
||||
|
||||
/* my replica */
|
||||
extern orte_process_name_t *orte_ras_proxy_replica;
|
||||
extern orte_process_name_t *orte_ras_base_proxy_replica;
|
||||
|
||||
extern orte_ras_base_module_t orte_ras_base_proxy_module;
|
||||
|
||||
/*
|
||||
* Module open / close
|
||||
*/
|
||||
int orte_ras_proxy_open(void);
|
||||
int orte_ras_proxy_close(void);
|
||||
int orte_ras_base_proxy_open(void);
|
||||
int orte_ras_base_proxy_close(void);
|
||||
|
||||
|
||||
/*
|
||||
* Startup / Shutdown
|
||||
*/
|
||||
orte_ras_base_module_t* orte_ras_proxy_init(int* priority);
|
||||
int orte_ras_proxy_finalize(void);
|
||||
orte_ras_base_module_t* orte_ras_base_proxy_init(int* priority);
|
||||
int orte_ras_base_proxy_finalize(void);
|
||||
|
||||
/*
|
||||
* proxy function prototypes
|
||||
*/
|
||||
int orte_ras_proxy_allocate(orte_jobid_t job, opal_list_t *attributes);
|
||||
int orte_ras_proxy_deallocate(orte_jobid_t job);
|
||||
int orte_ras_base_proxy_allocate(orte_jobid_t job, opal_list_t *attributes);
|
||||
int orte_ras_base_proxy_deallocate(orte_jobid_t job);
|
||||
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
@ -1,5 +1,4 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
@ -16,13 +15,6 @@
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/** @file:
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* includes
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/orte_constants.h"
|
||||
@ -33,39 +25,19 @@
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
|
||||
#include "orte/mca/ras/base/ras_private.h"
|
||||
#include "ras_proxy.h"
|
||||
|
||||
/*
|
||||
* Struct of function pointers that need to be initialized
|
||||
*/
|
||||
orte_ras_base_component_t mca_ras_proxy_component = {
|
||||
{
|
||||
ORTE_RAS_BASE_VERSION_1_3_0,
|
||||
|
||||
"proxy", /* MCA module name */
|
||||
ORTE_MAJOR_VERSION, /* MCA module major version */
|
||||
ORTE_MINOR_VERSION, /* MCA module minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA module release version */
|
||||
orte_ras_proxy_open, /* module open */
|
||||
orte_ras_proxy_close /* module close */
|
||||
},
|
||||
{
|
||||
false /* checkpoint / restart */
|
||||
},
|
||||
orte_ras_proxy_init /* module init */
|
||||
};
|
||||
#include "ras_base_proxy.h"
|
||||
|
||||
/*
|
||||
* setup the function pointers for the module
|
||||
*/
|
||||
static orte_ras_base_module_t orte_ras_proxy_module = {
|
||||
orte_ras_proxy_allocate,
|
||||
orte_ras_base_module_t orte_ras_base_proxy_module = {
|
||||
orte_ras_base_proxy_allocate,
|
||||
orte_ras_base_node_insert,
|
||||
orte_ras_base_node_query,
|
||||
orte_ras_base_node_query_alloc,
|
||||
orte_ras_base_node_lookup,
|
||||
orte_ras_proxy_deallocate,
|
||||
orte_ras_proxy_finalize
|
||||
orte_ras_base_proxy_deallocate,
|
||||
orte_ras_base_proxy_finalize
|
||||
};
|
||||
|
||||
/*
|
||||
@ -74,12 +46,12 @@ static orte_ras_base_module_t orte_ras_proxy_module = {
|
||||
static bool initialized = false;
|
||||
|
||||
/* the name of our replica */
|
||||
orte_process_name_t *orte_ras_proxy_replica;
|
||||
orte_process_name_t *orte_ras_base_proxy_replica;
|
||||
|
||||
/*
|
||||
* Not much to do here.
|
||||
*/
|
||||
int orte_ras_proxy_open(void)
|
||||
int orte_ras_base_proxy_open(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
@ -87,33 +59,28 @@ int orte_ras_proxy_open(void)
|
||||
/*
|
||||
* ditto for this one
|
||||
*/
|
||||
int orte_ras_proxy_close(void)
|
||||
int orte_ras_base_proxy_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
orte_ras_base_module_t* orte_ras_proxy_init(int* priority)
|
||||
orte_ras_base_module_t* orte_ras_base_proxy_init(int* priority)
|
||||
{
|
||||
/* If we are an HNP, then don't pick us */
|
||||
if (orte_process_info.seed) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* define the replica for us to use - for now, just point
|
||||
* to the name service replica
|
||||
*/
|
||||
orte_ras_proxy_replica = orte_process_info.ns_replica;
|
||||
orte_ras_base_proxy_replica = orte_process_info.ns_replica;
|
||||
|
||||
initialized = true;
|
||||
*priority = 1;
|
||||
return &orte_ras_proxy_module;
|
||||
|
||||
return &orte_ras_base_proxy_module;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* finalize routine
|
||||
*/
|
||||
int orte_ras_proxy_finalize(void)
|
||||
int orte_ras_base_proxy_finalize(void)
|
||||
{
|
||||
initialized = false;
|
||||
|
||||
@ -121,3 +88,4 @@ int orte_ras_proxy_finalize(void)
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -28,6 +28,7 @@
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rmgr/rmgr.h"
|
||||
|
||||
#include "orte/mca/ras/base/proxy/ras_base_proxy.h"
|
||||
#include "orte/mca/ras/base/ras_private.h"
|
||||
|
||||
/*
|
||||
@ -39,56 +40,168 @@ int orte_ras_base_allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
int ret;
|
||||
opal_list_item_t *item;
|
||||
orte_ras_base_cmp_t *cmp;
|
||||
opal_list_t nodes;
|
||||
orte_attribute_t * attr;
|
||||
orte_jobid_t * jptr;
|
||||
|
||||
/* If no components are available, then return an error */
|
||||
if (opal_list_is_empty(&orte_ras_base.ras_available)) {
|
||||
/* so there are a lot of possibilities here */
|
||||
/* 1: we are not on the head node, so use the proxy component */
|
||||
if (!orte_process_info.seed) {
|
||||
return orte_ras_base_proxy_allocate(jobid, attributes);
|
||||
}
|
||||
|
||||
/* 2: either no attributes were passed, or ORTE_RAS_INITIAL_ALLOCATION
|
||||
* was passed. This means that if the node segment is empty, we
|
||||
* want to allocate new nodes. Otherwise allocate all the existing nodes to
|
||||
* our job */
|
||||
if(NULL == attributes || opal_list_is_empty(attributes) ||
|
||||
NULL != (attr = orte_rmgr.find_attribute(attributes, ORTE_RAS_INITIAL_ALLOCATION))) {
|
||||
OBJ_CONSTRUCT(&nodes, opal_list_t);
|
||||
/* See if there are any nodes already on the registry. Most of the time
|
||||
* these would have been put there by the RDS reading the hostfile. */
|
||||
if (ORTE_SUCCESS != (ret = orte_ras_base_node_query(&nodes))) {
|
||||
OBJ_DESTRUCT(&nodes);
|
||||
return ret;
|
||||
}
|
||||
/* If there are any nodes at all, allocate them all to this job */
|
||||
if (!opal_list_is_empty(&nodes)) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"orte:ras:base:allocate: reallocating nodes that are already on registry");
|
||||
ret = orte_ras_base_allocate_nodes(jobid, &nodes);
|
||||
OBJ_DESTRUCT(&nodes);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* there were no nodes already on the registry, so get them from the
|
||||
* RAS components */
|
||||
|
||||
/* If no components are available, then return an error */
|
||||
if (opal_list_is_empty(&orte_ras_base.ras_available)) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"orte:ras:base:allocate: no components available!");
|
||||
ret = ORTE_ERR_NOT_FOUND;
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Otherwise, go through the [already sorted in priority order]
|
||||
* list and initialize them until one of them puts something on
|
||||
* the node segment */
|
||||
for (item = opal_list_get_first(&orte_ras_base.ras_available);
|
||||
item != opal_list_get_end(&orte_ras_base.ras_available);
|
||||
item = opal_list_get_next(item)) {
|
||||
cmp = (orte_ras_base_cmp_t *) item;
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"orte:ras:base:allocate: attemping to allocate using module: %s",
|
||||
cmp->component->ras_version.mca_component_name);
|
||||
|
||||
if (NULL != cmp->module->allocate_job) {
|
||||
ret = cmp->module->allocate_job(jobid, attributes);
|
||||
if (ORTE_SUCCESS == ret) {
|
||||
bool empty;
|
||||
|
||||
if (ORTE_SUCCESS !=
|
||||
(ret = orte_ras_base_node_segment_empty(&empty))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* If this module put something on the node segment,
|
||||
we're done */
|
||||
|
||||
if (!empty) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"orte:ras:base:allocate: found good module: %s",
|
||||
cmp->component->ras_version.mca_component_name);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* We didn't find anyone who put anything on the node segment */
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"orte:ras:base:select: no components available!");
|
||||
"orte:ras:base:allocate: no module put anything in the node segment");
|
||||
ret = ORTE_ERR_NOT_FOUND;
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Otherwise, go through the [already sorted in priority order]
|
||||
list and initialize them until one of them puts something on
|
||||
the node segment */
|
||||
|
||||
for (item = opal_list_get_first(&orte_ras_base.ras_available);
|
||||
item != opal_list_get_end(&orte_ras_base.ras_available);
|
||||
item = opal_list_get_next(item)) {
|
||||
cmp = (orte_ras_base_cmp_t *) item;
|
||||
/* Case 3: We want to use our parent's allocation. This can occur if we
|
||||
* are doing a dynamic process spawn and don't want to go through
|
||||
* the allocators again. */
|
||||
if (NULL != (attr = orte_rmgr.find_attribute(attributes, ORTE_RAS_USE_PARENT_ALLOCATION))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"orte:ras:base:allocate: attemping to allocate using module: %s",
|
||||
cmp->component->ras_version.mca_component_name);
|
||||
"orte:ras:base:allocate: reallocating parent's allocation as our own");
|
||||
/* attribute was given - just reallocate to the new jobid */
|
||||
if (ORTE_SUCCESS != (ret = orte_dss.get((void**)&jptr, attr->value, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_ras_base_reallocate(*jptr, jobid))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
if (NULL != cmp->module->allocate_job) {
|
||||
ret = cmp->module->allocate_job(jobid, attributes);
|
||||
if (ORTE_SUCCESS == ret) {
|
||||
bool empty;
|
||||
/* Case 4: We want to use a new allocation. This can happen if we
|
||||
* are spawning a new process that does not want to use its parent's
|
||||
* allocation. */
|
||||
if (NULL != (attr = orte_rmgr.find_attribute(attributes, ORTE_RAS_USE_NEW_ALLOCATION))) {
|
||||
/* If no components are available, then return an error */
|
||||
if (opal_list_is_empty(&orte_ras_base.ras_available)) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"orte:ras:base:allocate: no components available!");
|
||||
ret = ORTE_ERR_NOT_FOUND;
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS !=
|
||||
(ret = orte_ras_base_node_segment_empty(&empty))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
/* Otherwise, go through the [already sorted in priority order]
|
||||
* list and initialize them until one of them puts something on
|
||||
* the node segment */
|
||||
for (item = opal_list_get_first(&orte_ras_base.ras_available);
|
||||
item != opal_list_get_end(&orte_ras_base.ras_available);
|
||||
item = opal_list_get_next(item)) {
|
||||
cmp = (orte_ras_base_cmp_t *) item;
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"orte:ras:base:allocate: attemping to allocate using module: %s",
|
||||
cmp->component->ras_version.mca_component_name);
|
||||
|
||||
/* If this module put something on the node segment,
|
||||
we're done */
|
||||
if (NULL != cmp->module->allocate_job) {
|
||||
ret = cmp->module->allocate_job(jobid, attributes);
|
||||
if (ORTE_SUCCESS == ret) {
|
||||
bool empty;
|
||||
|
||||
if (!empty) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"orte:ras:base:allocate: found good module: %s",
|
||||
cmp->component->ras_version.mca_component_name);
|
||||
return ORTE_SUCCESS;
|
||||
if (ORTE_SUCCESS !=
|
||||
(ret = orte_ras_base_node_segment_empty(&empty))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* If this module put something on the node segment,
|
||||
we're done */
|
||||
|
||||
if (!empty) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"orte:ras:base:allocate: found good module: %s",
|
||||
cmp->component->ras_version.mca_component_name);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* We didn't find anyone who put anything on the node segment */
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"orte:ras:base:allocate: no module put anything in the node segment");
|
||||
ret = ORTE_ERR_NOT_FOUND;
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* We didn't find anyone who put anything on the node segment */
|
||||
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"orte:ras:base:allocate: no module put anything in the node segment");
|
||||
/* none of the above cases fit. This is not a good thing... */
|
||||
ret = ORTE_ERR_NOT_FOUND;
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
@ -96,6 +209,10 @@ int orte_ras_base_allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
|
||||
int orte_ras_base_deallocate(orte_jobid_t job)
|
||||
{
|
||||
/* if we are not a HNP, then use proxy */
|
||||
if (!orte_process_info.seed) {
|
||||
return orte_ras_base_proxy_deallocate(job);
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
@ -110,6 +227,7 @@ int orte_ras_base_reallocate(orte_jobid_t parent_jobid,
|
||||
opal_list_t current_alloc;
|
||||
opal_list_item_t *item;
|
||||
int rc;
|
||||
|
||||
|
||||
OBJ_CONSTRUCT(&current_alloc, opal_list_t);
|
||||
|
||||
|
@ -30,6 +30,7 @@
|
||||
|
||||
#include "orte/mca/ras/base/ras_private.h"
|
||||
#include "orte/mca/ras/base/base.h"
|
||||
#include "orte/mca/ras/base/proxy/ras_base_proxy.h"
|
||||
|
||||
|
||||
/*
|
||||
@ -92,6 +93,7 @@ int orte_ras_base_open(void)
|
||||
/* Defaults */
|
||||
|
||||
orte_ras_base.ras_opened_valid = false;
|
||||
orte_ras_base.ras_using_proxy = false;
|
||||
orte_ras_base.ras_available_valid = false;
|
||||
|
||||
/** register the base system types with the DSS */
|
||||
@ -138,6 +140,15 @@ int orte_ras_base_open(void)
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* if we are not on a HNP, select the proxy 'module' */
|
||||
if (!orte_process_info.seed) {
|
||||
orte_ras = orte_ras_base_proxy_module;
|
||||
/* initialize the module */
|
||||
orte_ras_base_proxy_init(&rc);
|
||||
orte_ras_base.ras_using_proxy = true;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* All done */
|
||||
|
||||
orte_ras_base.ras_opened_valid = true;
|
||||
|
@ -243,34 +243,16 @@ static int orte_ras_bjs_allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
opal_list_item_t* item;
|
||||
int rc;
|
||||
orte_app_context_t **context = NULL;
|
||||
orte_std_cntr_t i, num_context;
|
||||
orte_jobid_t *jptr;
|
||||
orte_attribute_t *attr;
|
||||
orte_std_cntr_t i, num_context = 0;
|
||||
|
||||
/* check the attributes to see if we are supposed to use the parent
|
||||
* jobid's allocation. This can occur if we are doing a dynamic
|
||||
* process spawn and don't want to go through the allocator again
|
||||
*/
|
||||
if (NULL != (attr = orte_rmgr.find_attribute(attributes, ORTE_RAS_USE_PARENT_ALLOCATION))) {
|
||||
/* attribute was given - just reallocate to the new jobid */
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&jptr, attr->value, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_ras_base_reallocate(*jptr, jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
OBJ_CONSTRUCT(&nodes, opal_list_t);
|
||||
|
||||
rc = orte_rmgr.get_app_context(jobid, &context, &num_context);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&nodes, opal_list_t);
|
||||
if(ORTE_SUCCESS != (rc = orte_ras_bjs_discover(&nodes, context, num_context))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
|
@ -75,40 +75,8 @@ static int orte_ras_dash_host_allocate(orte_jobid_t jobid, opal_list_t *attribut
|
||||
int rc;
|
||||
char **mapped_nodes = NULL, **mini_map;
|
||||
orte_ras_node_t *node;
|
||||
bool empty;
|
||||
orte_jobid_t *jptr;
|
||||
orte_attribute_t *attr;
|
||||
|
||||
/* check the attributes to see if we are supposed to use the parent
|
||||
* jobid's allocation. This can occur if we are doing a dynamic
|
||||
* process spawn and don't want to go through the allocator again
|
||||
*/
|
||||
if (NULL != (attr = orte_rmgr.find_attribute(attributes, ORTE_RAS_USE_PARENT_ALLOCATION))) {
|
||||
/* attribute was given - just reallocate to the new jobid */
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&jptr, attr->value, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_ras_base_reallocate(*jptr, jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* If the node segment is not empty, do nothing */
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_ras_base_node_segment_empty(&empty))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (!empty) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"orte:ras:dash_host: node segment not empty; not doing anything");
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* Otherwise, get the context */
|
||||
/* Get the context */
|
||||
|
||||
rc = orte_rmgr.get_app_context(jobid, &context, &num_context);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
|
@ -77,36 +77,17 @@ static int orte_ras_gridengine_allocate(orte_jobid_t jobid, opal_list_t *attribu
|
||||
opal_list_item_t* item;
|
||||
int rc;
|
||||
orte_app_context_t **context = NULL;
|
||||
orte_std_cntr_t i, num_context;
|
||||
orte_jobid_t *jptr;
|
||||
orte_attribute_t *attr;
|
||||
|
||||
/* check the attributes to see if we are supposed to use the parent
|
||||
* jobid's allocation. This can occur if we are doing a dynamic
|
||||
* process spawn and don't want to go through the allocator again
|
||||
*/
|
||||
if (NULL != (attr = orte_rmgr.find_attribute(attributes, ORTE_RAS_USE_PARENT_ALLOCATION))) {
|
||||
/* attribute was given - just reallocate to the new jobid */
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&jptr, attr->value, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_ras_base_reallocate(*jptr, jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
orte_std_cntr_t i, num_context = 0;
|
||||
|
||||
OBJ_CONSTRUCT(&nodes, opal_list_t);
|
||||
/* get the context */
|
||||
rc = orte_rmgr.get_app_context(jobid, &context, &num_context);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* construct a node object and pass to discover to gather valid nodes */
|
||||
OBJ_CONSTRUCT(&nodes, opal_list_t);
|
||||
if(ORTE_SUCCESS != (rc =
|
||||
orte_ras_gridengine_discover(&nodes, context, num_context))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
|
@ -1,47 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if OMPI_BUILD_ras_hostfile_DSO
|
||||
component_noinst =
|
||||
component_install = mca_ras_hostfile.la
|
||||
else
|
||||
component_noinst = libmca_ras_hostfile.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
hostfile_SOURCES = \
|
||||
ras_hostfile.h \
|
||||
ras_hostfile_module.c \
|
||||
ras_hostfile_component.c
|
||||
|
||||
mcacomponentdir = $(libdir)/openmpi
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_ras_hostfile_la_SOURCES = $(hostfile_SOURCES)
|
||||
mca_ras_hostfile_la_LIBADD = \
|
||||
$(top_ompi_builddir)/orte/liborte.la \
|
||||
$(top_ompi_builddir)/opal/libopal.la
|
||||
mca_ras_hostfile_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_ras_hostfile_la_SOURCES = $(hostfile_SOURCES)
|
||||
libmca_ras_hostfile_la_LIBADD =
|
||||
libmca_ras_hostfile_la_LDFLAGS = -module -avoid-version
|
@ -1,23 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Specific to this module
|
||||
|
||||
PARAM_INIT_FILE=ras_hostfile_component.c
|
||||
PARAM_CONFIG_FILES="Makefile"
|
@ -1,59 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* Resource Allocation (hostfile)
|
||||
*/
|
||||
#ifndef ORTE_RAS_HOSTFILE_H
|
||||
#define ORTE_RAS_HOSTFILE_H
|
||||
|
||||
#include "orte/mca/ras/ras.h"
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/**
|
||||
* hostfile-specific RAS component struct
|
||||
*/
|
||||
struct orte_ras_hostfile_component_t {
|
||||
/** Base RAS component */
|
||||
orte_ras_base_component_t super;
|
||||
/** What's the priority of this component */
|
||||
int priority;
|
||||
};
|
||||
/**
|
||||
* Convenience typedef
|
||||
*/
|
||||
typedef struct orte_ras_hostfile_component_t orte_ras_hostfile_component_t;
|
||||
|
||||
/**
|
||||
* Component export structure
|
||||
*/
|
||||
ORTE_MODULE_DECLSPEC extern orte_ras_hostfile_component_t mca_ras_hostfile_component;
|
||||
|
||||
/**
|
||||
* Module init function
|
||||
*/
|
||||
orte_ras_base_module_t *orte_ras_hostfile_init(int* priority);
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
@ -1,77 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/mca/ras/hostfile/ras_hostfile.h"
|
||||
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static int orte_ras_hostfile_open(void);
|
||||
|
||||
|
||||
orte_ras_hostfile_component_t mca_ras_hostfile_component = {
|
||||
{
|
||||
/* First, the mca_base_component_t struct containing meta
|
||||
information about the component itself */
|
||||
|
||||
{
|
||||
/* Indicate that we are a ras v1.3.0 component (which also
|
||||
implies a specific MCA version) */
|
||||
|
||||
ORTE_RAS_BASE_VERSION_1_3_0,
|
||||
|
||||
"hostfile", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
orte_ras_hostfile_open, /* component open */
|
||||
NULL
|
||||
},
|
||||
|
||||
/* Next the MCA v1.0.0 component meta data */
|
||||
{
|
||||
/* Whether the component is checkpointable or not */
|
||||
false
|
||||
},
|
||||
|
||||
orte_ras_hostfile_init
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* component open function
|
||||
*/
|
||||
static int orte_ras_hostfile_open(void)
|
||||
{
|
||||
mca_base_param_reg_int(&mca_ras_hostfile_component.super.ras_version,
|
||||
"priority",
|
||||
"Selection priority for the hostfile RAS component",
|
||||
false, false, 10,
|
||||
&mca_ras_hostfile_component.priority);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -1,159 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/orte_types.h"
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/argv.h"
|
||||
|
||||
#include "orte/dss/dss.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/mca/rmgr/rmgr.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "orte/mca/ras/base/ras_private.h"
|
||||
#include "orte/mca/ras/hostfile/ras_hostfile.h"
|
||||
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static int orte_ras_hostfile_allocate(orte_jobid_t jobid, opal_list_t *attributes);
|
||||
static int orte_ras_hostfile_deallocate(orte_jobid_t jobid);
|
||||
static int orte_ras_hostfile_finalize(void);
|
||||
|
||||
|
||||
/*
|
||||
* Local variables
|
||||
*/
|
||||
/*
 * Function table handed out by orte_ras_hostfile_init().
 *
 * Only the first and the last two entries are hostfile-specific functions
 * (allocate, deallocate, finalize); the four entries in between reuse the
 * orte_ras_base_node_* implementations unchanged.
 *
 * NOTE(review): the entries are positional, so their order must match the
 * field order of orte_ras_base_module_t -- that struct is not visible here.
 */
orte_ras_base_module_t orte_ras_hostfile_module = {
|
||||
orte_ras_hostfile_allocate,
|
||||
orte_ras_base_node_insert,
|
||||
orte_ras_base_node_query,
|
||||
orte_ras_base_node_query_alloc,
|
||||
orte_ras_base_node_lookup,
|
||||
orte_ras_hostfile_deallocate,
|
||||
orte_ras_hostfile_finalize
|
||||
};
|
||||
|
||||
|
||||
/*
 * Component init/selection hook.
 *
 * priority: output; written only on the success path, where it receives
 *           mca_ras_hostfile_component.priority.
 *
 * Returns NULL (leaving *priority untouched) when orte_process_info.seed is
 * false, otherwise a pointer to the static orte_ras_hostfile_module table.
 */
orte_ras_base_module_t *orte_ras_hostfile_init(int* priority)
|
||||
{
|
||||
/* if we are not an HNP, then we must not be selected */
|
||||
if (!orte_process_info.seed) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
*priority = mca_ras_hostfile_component.priority;
|
||||
return &orte_ras_hostfile_module;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* THIS FUNCTION NEEDS TO CHANGE POST-1.0.
|
||||
*
|
||||
* After 1.0, this function, and the rds/hostfile need to change to
|
||||
* clean up properly. They're not "broken" right now, so we're not
|
||||
* fixing them. But they're implemented wrong, so they should be
|
||||
* adapted to the model that they're supposed to implement, not the
|
||||
* workarounds that they currently have. The end result will be much,
|
||||
* much cleaner.
|
||||
*
|
||||
* Specifically, the rds/hostfile currently puts all of its nodes on
|
||||
* the resource segment *and* the node segment. It should not. It
|
||||
* should only put its nodes on the resource segment, appropriately
|
||||
* tagged that they came from a hostfile. The ras/hostfile should
|
||||
* then examine the resources segment and pull out all nodes that came
|
||||
* from a hostfile and put them on the nodes segment.
|
||||
*/
|
||||
/*
 * Allocate nodes to a job.
 *
 * jobid:      job that should receive the allocation.
 * attributes: attribute list searched for ORTE_RAS_USE_PARENT_ALLOCATION.
 *
 * Flow:
 *   1. If ORTE_RAS_USE_PARENT_ALLOCATION is present, extract the jobid it
 *      carries and call orte_ras_base_reallocate() from that job to this
 *      one, then return.
 *   2. Otherwise query the nodes already allocated to jobid. If that list
 *      is empty, query every node and, if any were found, hand them all to
 *      orte_ras_base_allocate_nodes().
 *
 * Returns ORTE_SUCCESS, or the error code of whichever call failed. When no
 * nodes are found at all, rc still holds ORTE_SUCCESS from the last query
 * and that is what is returned.
 */
static int orte_ras_hostfile_allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
{
|
||||
opal_list_t nodes;
|
||||
opal_list_item_t* item;
|
||||
int rc;
|
||||
orte_jobid_t *jptr;
|
||||
orte_attribute_t *attr;
|
||||
|
||||
OBJ_CONSTRUCT(&nodes, opal_list_t);
|
||||
|
||||
/* check the attributes to see if we are supposed to use the parent
|
||||
* jobid's allocation. This can occur if we are doing a dynamic
|
||||
* process spawn and don't want to go through the allocator again
|
||||
*/
|
||||
if (NULL != (attr = orte_rmgr.find_attribute(attributes, ORTE_RAS_USE_PARENT_ALLOCATION))) {
|
||||
/* attribute was given - just reallocate to the new jobid */
/* NOTE(review): the three returns in this branch do not pass through the
 * cleanup label, so "nodes" (constructed above) is never handed to
 * OBJ_DESTRUCT on these paths.
 */
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&jptr, attr->value, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_ras_base_reallocate(*jptr, jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* Query for all nodes in the node segment that have been
|
||||
allocated to this job */
|
||||
if (ORTE_SUCCESS != (rc = orte_ras_base_node_query_alloc(&nodes, jobid))) {
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* If no nodes are allocated to this job yet, then query for *all* nodes */
|
||||
if (opal_list_is_empty(&nodes)) {
|
||||
if (ORTE_SUCCESS != (rc = orte_ras_base_node_query(&nodes))) {
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* If there are any nodes at all, allocate them all to this job */
|
||||
if (!opal_list_is_empty(&nodes)) {
|
||||
rc = orte_ras_base_allocate_nodes(jobid, &nodes);
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
/* Common exit: release every item still on the local list, destruct the
 * list itself, and return whatever rc was last set to.
 */
cleanup:
|
||||
while (NULL != (item = opal_list_remove_first(&nodes))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&nodes);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Deallocate hook for the hostfile module.
 *
 * jobid is not used: the function only writes a diagnostic message to the
 * orte_ras_base.ras_output stream and always returns ORTE_SUCCESS.
 */
static int orte_ras_hostfile_deallocate(orte_jobid_t jobid)
|
||||
{
|
||||
/* Nothing to do */
|
||||
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:hostfile:deallocate: success (nothing to do)");
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Finalize hook for the hostfile module.
 *
 * Only writes a diagnostic message to the orte_ras_base.ras_output stream;
 * nothing is released here. Always returns ORTE_SUCCESS.
 */
static int orte_ras_hostfile_finalize(void)
|
||||
{
|
||||
/* Nothing to do */
|
||||
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:hostfile:finalize: success (nothing to do)");
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -83,7 +83,7 @@ orte_ras_base_module_t orte_ras_loadleveler_module = {
|
||||
*/
|
||||
static int orte_ras_loadleveler_allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
{
|
||||
int i, rc, ret;
|
||||
int i, ret;
|
||||
opal_list_t nodes_list;
|
||||
opal_list_item_t* item;
|
||||
orte_ras_node_t* node;
|
||||
@ -92,29 +92,13 @@ static int orte_ras_loadleveler_allocate(orte_jobid_t jobid, opal_list_t *attrib
|
||||
orte_jobid_t *jptr;
|
||||
orte_attribute_t *attr;
|
||||
|
||||
/* check the attributes to see if we are supposed to use the parent
|
||||
* jobid's allocation. This can occur if we are doing a dynamic
|
||||
* process spawn and don't want to go through the allocator again
|
||||
*/
|
||||
if (NULL != (attr = orte_rmgr.find_attribute(attributes, ORTE_RAS_USE_PARENT_ALLOCATION))) {
|
||||
/* attribute was given - just reallocate to the new jobid */
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&jptr, attr->value, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_ras_base_reallocate(*jptr, jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
rc = orte_ras_loadleveler_get_hostlist(&num_hosts, &hostlist);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
return rc;
|
||||
OBJ_CONSTRUCT(&nodes_list, opal_list_t);
|
||||
|
||||
ret = orte_ras_loadleveler_get_hostlist(&num_hosts, &hostlist);
|
||||
if(ORTE_SUCCESS != ret) {
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&nodes_list, opal_list_t);
|
||||
for (i = 0; i < num_hosts; i++) {
|
||||
/* check for duplicated nodes */
|
||||
for (item = opal_list_get_first(&nodes_list);
|
||||
@ -131,7 +115,8 @@ static int orte_ras_loadleveler_allocate(orte_jobid_t jobid, opal_list_t *attrib
|
||||
/* we did not find a duplicate, so add a new item to the list */
|
||||
node = OBJ_NEW(orte_ras_node_t);
|
||||
if (NULL == node) {
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
ret = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto cleanup;
|
||||
}
|
||||
node->node_name = strdup(hostlist[i]);
|
||||
node->node_arch = orte_ras_loadleveler_get_host_arch(hostlist[i]);
|
||||
@ -145,14 +130,15 @@ static int orte_ras_loadleveler_allocate(orte_jobid_t jobid, opal_list_t *attrib
|
||||
}
|
||||
ret = orte_ras_base_node_insert(&nodes_list);
|
||||
ret = orte_ras_base_allocate_nodes(jobid, &nodes_list);
|
||||
|
||||
|
||||
cleanup:
|
||||
while (NULL != (item = opal_list_remove_first(&nodes_list))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&nodes_list);
|
||||
opal_argv_free(hostlist);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -74,26 +74,7 @@ static int orte_ras_localhost_allocate(orte_jobid_t jobid, opal_list_t *attribut
|
||||
opal_list_t nodes;
|
||||
orte_ras_node_t *node;
|
||||
opal_list_item_t *item;
|
||||
orte_attribute_t *attr;
|
||||
orte_jobid_t *jptr;
|
||||
|
||||
/* check the attributes to see if we are supposed to use the parent
|
||||
* jobid's allocation. This can occur if we are doing a dynamic
|
||||
* process spawn and don't want to go through the allocator again
|
||||
*/
|
||||
if (NULL != (attr = orte_rmgr.find_attribute(attributes, ORTE_RAS_USE_PARENT_ALLOCATION))) {
|
||||
/* attribute was given - just reallocate to the new jobid */
|
||||
if (ORTE_SUCCESS != (ret = orte_dss.get((void**)&jptr, attr->value, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_ras_base_reallocate(*jptr, jobid))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* If the node segment is not empty, do nothing */
|
||||
|
||||
if (ORTE_SUCCESS != (ret = orte_ras_base_node_segment_empty(&empty))) {
|
||||
|
@ -1,51 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Use the top-level Makefile.options
|
||||
|
||||
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if OMPI_BUILD_ras_proxy_DSO
|
||||
component_noinst =
|
||||
component_install = mca_ras_proxy.la
|
||||
else
|
||||
component_noinst = libmca_ras_proxy.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
proxy_SOURCES = \
|
||||
ras_proxy.c \
|
||||
ras_proxy.h \
|
||||
ras_proxy_component.c
|
||||
|
||||
mcacomponentdir = $(libdir)/openmpi
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_ras_proxy_la_SOURCES = $(proxy_SOURCES)
|
||||
mca_ras_proxy_la_LIBADD = \
|
||||
$(top_ompi_builddir)/orte/liborte.la \
|
||||
$(top_ompi_builddir)/opal/libopal.la
|
||||
mca_ras_proxy_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_ras_proxy_la_SOURCES = $(proxy_SOURCES)
|
||||
libmca_ras_proxy_la_LIBADD =
|
||||
libmca_ras_proxy_la_LDFLAGS = -module -avoid-version
|
@ -1,23 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Specific to this module
|
||||
|
||||
PARAM_INIT_FILE=ras_proxy.c
|
||||
PARAM_CONFIG_FILES="Makefile"
|
@ -33,6 +33,8 @@ extern "C" {
|
||||
* RAS Attributes
|
||||
*/
|
||||
#define ORTE_RAS_USE_PARENT_ALLOCATION "orte-use-parent-alloc"
|
||||
#define ORTE_RAS_USE_NEW_ALLOCATION "orte-use-new-alloc"
|
||||
#define ORTE_RAS_INITIAL_ALLOCATION "orte-initial-alloc"
|
||||
|
||||
|
||||
/**
|
||||
|
@ -37,13 +37,13 @@
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static int allocate(orte_jobid_t jobid, opal_list_t *attributes);
|
||||
static int deallocate(orte_jobid_t jobid);
|
||||
static int finalize(void);
|
||||
static int orte_ras_slurm_allocate(orte_jobid_t jobid, opal_list_t *attributes);
|
||||
static int orte_ras_slurm_deallocate(orte_jobid_t jobid);
|
||||
static int orte_ras_slurm_finalize(void);
|
||||
|
||||
static int discover(char *regexp, opal_list_t *nodelist);
|
||||
static int parse_ranges(char *base, char *ranges, char ***nodelist);
|
||||
static int parse_range(char *base, char *range, char ***nodelist);
|
||||
static int orte_ras_slurm_discover(char *regexp, opal_list_t *nodelist);
|
||||
static int orte_ras_slurm_parse_ranges(char *base, char *ranges, char ***nodelist);
|
||||
static int orte_ras_slurm_parse_range(char *base, char *range, char ***nodelist);
|
||||
|
||||
|
||||
|
||||
@ -51,13 +51,13 @@ static int parse_range(char *base, char *range, char ***nodelist);
|
||||
* Global variable
|
||||
*/
|
||||
orte_ras_base_module_t orte_ras_slurm_module = {
|
||||
allocate,
|
||||
orte_ras_slurm_allocate,
|
||||
orte_ras_base_node_insert,
|
||||
orte_ras_base_node_query,
|
||||
orte_ras_base_node_query_alloc,
|
||||
orte_ras_base_node_lookup,
|
||||
deallocate,
|
||||
finalize
|
||||
orte_ras_slurm_deallocate,
|
||||
orte_ras_slurm_finalize
|
||||
};
|
||||
|
||||
/**
|
||||
@ -65,32 +65,15 @@ orte_ras_base_module_t orte_ras_slurm_module = {
|
||||
* requested number of nodes/process slots to the job.
|
||||
*
|
||||
*/
|
||||
static int allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
static int orte_ras_slurm_allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
{
|
||||
int ret;
|
||||
char *slurm_node_str;
|
||||
opal_list_t nodes;
|
||||
opal_list_item_t* item;
|
||||
orte_jobid_t *jptr;
|
||||
orte_attribute_t *attr;
|
||||
|
||||
/* check the attributes to see if we are supposed to use the parent
|
||||
* jobid's allocation. This can occur if we are doing a dynamic
|
||||
* process spawn and don't want to go through the allocator again
|
||||
*/
|
||||
if (NULL != (attr = orte_rmgr.find_attribute(attributes, ORTE_RAS_USE_PARENT_ALLOCATION))) {
|
||||
/* attribute was given - just reallocate to the new jobid */
|
||||
if (ORTE_SUCCESS != (ret = orte_dss.get((void**)&jptr, attr->value, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_ras_base_reallocate(*jptr, jobid))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&nodes, opal_list_t);
|
||||
|
||||
slurm_node_str = getenv("SLURM_NODELIST");
|
||||
if (NULL == slurm_node_str) {
|
||||
opal_show_help("help-ras-slurm.txt", "env-var-not-found", 1,
|
||||
@ -98,8 +81,7 @@ static int allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&nodes, opal_list_t);
|
||||
if (ORTE_SUCCESS != (ret = discover(slurm_node_str, &nodes))) {
|
||||
if (ORTE_SUCCESS != (ret = orte_ras_slurm_discover(slurm_node_str, &nodes))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:slurm:allocate: discover failed!");
|
||||
return ret;
|
||||
@ -127,7 +109,7 @@ static int allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
/*
|
||||
* There's really nothing to do here
|
||||
*/
|
||||
static int deallocate(orte_jobid_t jobid)
|
||||
static int orte_ras_slurm_deallocate(orte_jobid_t jobid)
|
||||
{
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:slurm:deallocate: success (nothing to do)");
|
||||
@ -138,7 +120,7 @@ static int deallocate(orte_jobid_t jobid)
|
||||
/*
|
||||
* There's really nothing to do here
|
||||
*/
|
||||
static int finalize(void)
|
||||
static int orte_ras_slurm_finalize(void)
|
||||
{
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:slurm:finalize: success (nothing to do)");
|
||||
@ -155,7 +137,7 @@ static int finalize(void)
|
||||
* - check for additional nodes that have already been allocated
|
||||
*/
|
||||
|
||||
static int discover(char *regexp, opal_list_t* nodelist)
|
||||
static int orte_ras_slurm_discover(char *regexp, opal_list_t* nodelist)
|
||||
{
|
||||
int i, j, len, ret, count, reps;
|
||||
char *base, **names = NULL;
|
||||
@ -211,7 +193,7 @@ static int discover(char *regexp, opal_list_t* nodelist)
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
ret = parse_ranges(base, base + i + 1, &names);
|
||||
ret = orte_ras_slurm_parse_ranges(base, base + i + 1, &names);
|
||||
}
|
||||
|
||||
/* Find the number of slots per node */
|
||||
@ -295,7 +277,7 @@ static int discover(char *regexp, opal_list_t* nodelist)
|
||||
/*
|
||||
* Parse one or more ranges in a set
|
||||
*/
|
||||
static int parse_ranges(char *base, char *ranges, char ***names)
|
||||
static int orte_ras_slurm_parse_ranges(char *base, char *ranges, char ***names)
|
||||
{
|
||||
int i, len, ret;
|
||||
char *start, *orig;
|
||||
@ -306,7 +288,7 @@ static int parse_ranges(char *base, char *ranges, char ***names)
|
||||
for (orig = start = ranges, i = 0; i < len; ++i) {
|
||||
if (',' == ranges[i]) {
|
||||
ranges[i] = '\0';
|
||||
if (ORTE_SUCCESS != (ret = parse_range(base, start, names))) {
|
||||
if (ORTE_SUCCESS != (ret = orte_ras_slurm_parse_range(base, start, names))) {
|
||||
return ret;
|
||||
}
|
||||
start = ranges + i + 1;
|
||||
@ -319,7 +301,7 @@ static int parse_ranges(char *base, char *ranges, char ***names)
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:slurm:allocate:discover: parse range %s (2)",
|
||||
start);
|
||||
if (ORTE_SUCCESS != (ret = parse_range(base, start, names))) {
|
||||
if (ORTE_SUCCESS != (ret = orte_ras_slurm_parse_range(base, start, names))) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
@ -333,7 +315,7 @@ static int parse_ranges(char *base, char *ranges, char ***names)
|
||||
/*
|
||||
* Parse a single range in a set
|
||||
*/
|
||||
static int parse_range(char *base, char *range, char ***names)
|
||||
static int orte_ras_slurm_parse_range(char *base, char *range, char ***names)
|
||||
{
|
||||
char *str, temp1[BUFSIZ], temp2[BUFSIZ];
|
||||
size_t i, j, start, end;
|
||||
|
@ -74,26 +74,7 @@ static int allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
opal_list_t nodes;
|
||||
opal_list_item_t* item;
|
||||
struct tm_roots root;
|
||||
orte_jobid_t *jptr;
|
||||
orte_attribute_t *attr;
|
||||
|
||||
/* check the attributes to see if we are supposed to use the parent
|
||||
* jobid's allocation. This can occur if we are doing a dynamic
|
||||
* process spawn and don't want to go through the allocator again
|
||||
*/
|
||||
if (NULL != (attr = orte_rmgr.find_attribute(attributes, ORTE_RAS_USE_PARENT_ALLOCATION))) {
|
||||
/* attribute was given - just reallocate to the new jobid */
|
||||
if (ORTE_SUCCESS != (ret = orte_dss.get((void**)&jptr, attr->value, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_ras_base_reallocate(*jptr, jobid))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* Open up our connection to tm */
|
||||
|
||||
ret = tm_init(NULL, &root);
|
||||
|
@ -71,27 +71,9 @@ static int allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
int ret;
|
||||
opal_list_t nodes;
|
||||
opal_list_item_t* item;
|
||||
orte_jobid_t *jptr;
|
||||
orte_attribute_t *attr;
|
||||
|
||||
/* check the attributes to see if we are supposed to use the parent
|
||||
* jobid's allocation. This can occur if we are doing a dynamic
|
||||
* process spawn and don't want to go through the allocator again
|
||||
*/
|
||||
if (NULL != (attr = orte_rmgr.find_attribute(attributes, ORTE_RAS_USE_PARENT_ALLOCATION))) {
|
||||
/* attribute was given - just reallocate to the new jobid */
|
||||
if (ORTE_SUCCESS != (ret = orte_dss.get((void**)&jptr, attr->value, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_ras_base_reallocate(*jptr, jobid))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&nodes, opal_list_t);
|
||||
|
||||
if (ORTE_SUCCESS != (ret = discover(jobid, &nodes))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:xgrid:allocate: discover failed!");
|
||||
|
@ -117,8 +117,15 @@ static int map_app_by_node(
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(map, node, jobid, vpid_start + num_alloc, app->idx,
|
||||
nodes, max_used_nodes,
|
||||
mca_rmaps_round_robin_component.oversubscribe))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
|
||||
* really isn't an error - we just need to break from the loop
|
||||
* since the node is fully used up. For now, just don't report
|
||||
* an error
|
||||
*/
|
||||
if (ORTE_ERR_NODE_FULLY_USED != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
||||
++num_alloc;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user