openmpi/orte/mca/ras/tm/ras_tm_module.c
Ralph Castain (f4a458532b)

This doesn't totally resolve the comm_spawn problem, but it helps a little. I'll continue working on it and hope to resolve it completely shortly. The issue primarily centers on where to start mapping the child job's processes, and how to deal with oversubscription that might result. At the moment, I am trying to resolve the first issue first (hey, that even sounds right!).
This change does a couple of things:

1. Since the USE_PARENT_ALLOC attribute is a directive regarding the allocation of resources to a job, it more properly should be an attribute of the RAS. Change the name to reflect that and move the attribute define to the ras_types.h file.

2. Add the attributes list to the RMAPS map_job interface. This provides us with the desired flexibility to dynamically specify directives for mapping. In the absence of any attribute-based directive, the system will default to the values provided in the MCA parameters (either from the environment or the command-line interface). A caller-side sketch of this attribute usage follows below.
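As a caller-side illustration of item 2, here is a minimal sketch of how a dynamic spawn might attach the ORTE_RAS_USE_PARENT_ALLOCATION directive before requesting resources, so that the RAS (for example, the allocate() function in this file) reuses the parent job's allocation instead of re-running the allocator. The orte_rmgr.add_attribute helper and its exact signature are assumptions for illustration only and are not defined by this commit:

    /* SKETCH ONLY -- helper name/signature assumed, not part of this commit */
    opal_list_t attrs;
    orte_jobid_t parent_jobid = 0;   /* hypothetical: jobid of the spawning (parent) job */

    OBJ_CONSTRUCT(&attrs, opal_list_t);
    /* attach the parent jobid under the ORTE_RAS_USE_PARENT_ALLOCATION key;
     * a RAS such as allocate() below finds it and simply reallocates the
     * parent's nodes to the new jobid rather than querying the allocator again */
    orte_rmgr.add_attribute(&attrs, ORTE_RAS_USE_PARENT_ALLOCATION,
                            ORTE_JOBID, &parent_jobid, true);

The RAS and RMAPS then consult this list via orte_rmgr.find_attribute(), as allocate() does in the file below, falling back to the MCA parameter values when no directive is present.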

This commit was SVN r12164.
2006-10-18 14:01:44 +00:00

319 lines
9.4 KiB
C

/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2006      Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "orte/orte_types.h"
#include <errno.h>
#include <unistd.h>
#include <string.h>
#include "tm.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "orte/dss/dss.h"
#include "orte/mca/rmgr/rmgr.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ras/base/ras_private.h"
#include "ras_tm.h"
/*
* Local functions
*/
static int allocate(orte_jobid_t jobid, opal_list_t *attributes);
static int deallocate(orte_jobid_t jobid);
static int finalize(void);
static int discover(opal_list_t* nodelist);
static int get_tm_hostname(tm_node_id node, char **hostname, char **arch);

/*
 * Global variable
 */
orte_ras_base_module_t orte_ras_tm_module = {
    allocate,                        /* allocate resources for a job */
    orte_ras_base_node_insert,       /* insert nodes into the registry */
    orte_ras_base_node_query,        /* query all known nodes */
    orte_ras_base_node_query_alloc,  /* query nodes allocated to a job */
    orte_ras_base_node_lookup,       /* look up nodes from the registry */
    deallocate,                      /* deallocate (no-op for TM) */
    finalize                         /* finalize (no-op for TM) */
};

#include "orte/mca/gpr/gpr.h"

/**
 * Discover available (pre-allocated) nodes.  Allocate the
 * requested number of nodes/process slots to the job.
 */
static int allocate(orte_jobid_t jobid, opal_list_t *attributes)
{
    int ret;
    opal_list_t nodes;
    opal_list_item_t* item;
    struct tm_roots root;
    orte_jobid_t *jptr;
    orte_attribute_t *attr;

    /* check the attributes to see if we are supposed to use the parent
     * jobid's allocation. This can occur if we are doing a dynamic
     * process spawn and don't want to go through the allocator again
     */
    if (NULL != (attr = orte_rmgr.find_attribute(attributes, ORTE_RAS_USE_PARENT_ALLOCATION))) {
        /* attribute was given - just reallocate to the new jobid */
        if (ORTE_SUCCESS != (ret = orte_dss.get((void**)&jptr, attr->value, ORTE_JOBID))) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
        if (ORTE_SUCCESS != (ret = orte_ras_base_reallocate(*jptr, jobid))) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
        return ORTE_SUCCESS;
    }

    /* Open up our connection to tm */
    ret = tm_init(NULL, &root);
    if (TM_SUCCESS != ret) {
        opal_output(orte_ras_base.ras_output,
                    "ras:tm:allocate: tm_init failed!");
        return ORTE_ERR_RESOURCE_BUSY;
    }

    OBJ_CONSTRUCT(&nodes, opal_list_t);
    if (ORTE_SUCCESS != (ret = discover(&nodes))) {
        opal_output(orte_ras_base.ras_output,
                    "ras:tm:allocate: discover failed!");
        tm_finalize();
        return ret;
    }
    ret = orte_ras_base_allocate_nodes(jobid, &nodes);

    while (NULL != (item = opal_list_remove_first(&nodes))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&nodes);

    /* All done */
    if (ORTE_SUCCESS == ret) {
        opal_output(orte_ras_base.ras_output,
                    "ras:tm:allocate: success");
    } else {
        opal_output(orte_ras_base.ras_output,
                    "ras:tm:allocate: failure (base_allocate_nodes=%d)", ret);
    }
    tm_finalize();
    return ret;
}

/*
 * There's really nothing to do here
 */
static int deallocate(orte_jobid_t jobid)
{
    opal_output(orte_ras_base.ras_output,
                "ras:tm:deallocate: success (nothing to do)");
    return ORTE_SUCCESS;
}


/*
 * There's really nothing to do here
 */
static int finalize(void)
{
    opal_output(orte_ras_base.ras_output,
                "ras:tm:finalize: success (nothing to do)");
    return ORTE_SUCCESS;
}

/**
 * Discover the available resources.  We obtain the node list directly
 * from TM (and therefore have no need to validate it), so we ignore the
 * hostfile and any other user-specified parameters.  In particular,
 * there is no need to:
 *
 * - validate any nodes specified via hostfile/command line
 * - check for additional nodes that have already been allocated
 */
static int discover(opal_list_t* nodelist)
{
    int i, ret, num_node_ids;
    orte_ras_node_t *node;
    opal_list_item_t* item;
    opal_list_t new_nodes;
    tm_node_id *tm_node_ids;
    char *hostname, *arch;

    /* Ignore anything that the user already specified -- we're
       getting nodes only from TM. */

    /* TM "nodes" may actually correspond to PBS "VCPUs", which means
       there may be multiple "TM nodes" that correspond to the same
       physical node.  This doesn't really affect what we're doing
       here (we actually ignore the fact that they're duplicates --
       slightly inefficient, but no big deal); just mentioned for
       completeness... */
    ret = tm_nodeinfo(&tm_node_ids, &num_node_ids);
    if (ret != TM_SUCCESS) {
        opal_output(orte_ras_base.ras_output,
                    "ras:tm:allocate:discover: tm_nodeinfo failed");
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* Iterate through all the nodes and make an entry for each.  TM
       node ID's will never be duplicated, but they may end up
       resolving to the same hostname (i.e., vcpu's on a single
       host). */
    OBJ_CONSTRUCT(&new_nodes, opal_list_t);
    for (i = 0; i < num_node_ids; ++i) {
        get_tm_hostname(tm_node_ids[i], &hostname, &arch);
        opal_output(orte_ras_base.ras_output,
                    "ras:tm:allocate:discover: got hostname %s", hostname);

        /* Remember that TM may list the same node more than once.  So
           we have to check for duplicates. */
        for (item = opal_list_get_first(&new_nodes);
             opal_list_get_end(&new_nodes) != item;
             item = opal_list_get_next(item)) {
            node = (orte_ras_node_t*) item;
            if (0 == strcmp(node->node_name, hostname)) {
                ++node->node_slots;
                opal_output(orte_ras_base.ras_output,
                            "ras:tm:allocate:discover: found -- bumped slots to %d",
                            node->node_slots);
                break;
            }
        }

        /* Did we find it? */
        if (opal_list_get_end(&new_nodes) == item) {

            /* Nope -- didn't find it, so add a new item to the list */
            opal_output(orte_ras_base.ras_output,
                        "ras:tm:allocate:discover: not found -- added to list");
            node = OBJ_NEW(orte_ras_node_t);
            node->node_name = hostname;
            node->node_arch = arch;
            node->node_state = ORTE_NODE_STATE_UP;
            node->node_cellid = 0;
            node->node_slots_inuse = 0;
            node->node_slots_max = 0;
            node->node_slots = 1;
            opal_list_append(&new_nodes, &node->super);
        } else {

            /* Yes, so we need to free the hostname (and the arch string)
               that came back from get_tm_hostname() */
            free(hostname);
            free(arch);
        }
    }

    /* Add these nodes to the registry, and return all the values */
    opal_output(orte_ras_base.ras_output,
                "ras:tm:allocate:discover: done -- adding to registry");
    ret = orte_ras_base_node_insert(&new_nodes);
    for (item = opal_list_remove_first(&new_nodes);
         NULL != item; item = opal_list_remove_first(&new_nodes)) {
        if (ORTE_SUCCESS == ret) {
            opal_list_append(nodelist, item);
        } else {
            OBJ_RELEASE(item);
        }
    }

    /* All done */
    if (ORTE_SUCCESS == ret) {
        opal_output(orte_ras_base.ras_output,
                    "ras:tm:allocate:discover: success");
    } else {
        opal_output(orte_ras_base.ras_output,
                    "ras:tm:allocate:discover: failed (rc=%d)", ret);
    }
    OBJ_DESTRUCT(&new_nodes);
    return ret;
}

/*
 * For a given TM node ID, get the string hostname corresponding to
 * it.
 */
static int get_tm_hostname(tm_node_id node, char **hostname, char **arch)
{
    int ret, local_errno;
    tm_event_t event;
    char buffer[256];
    char **argv;

    /* Get the info string corresponding to this TM node ID */
    ret = tm_rescinfo(node, buffer, sizeof(buffer) - 1, &event);
    if (TM_SUCCESS != ret) {
        opal_output(orte_ras_base.ras_output,
                    "ras:tm:hostname: tm_rescinfo failed");
        return ORTE_ERROR;
    }

    /* Now wait for that event to happen */
    ret = tm_poll(TM_NULL_EVENT, &event, 1, &local_errno);
    if (TM_SUCCESS != ret) {
        return ORTE_ERROR;
    }

    /* According to the TM man page, we get back a space-separated
       string array.  The hostname is the second item.  Use a cheap
       trick to get it. */
    opal_output(orte_ras_base.ras_output,
                "ras:tm:hostname: got back %s", buffer);
    buffer[sizeof(buffer) - 1] = '\0';
    argv = opal_argv_split(buffer, ' ');
    if (NULL == argv) {
        return ORTE_ERROR;
    }
    *hostname = strdup(argv[1]);
    *arch = strdup(buffer);
    opal_argv_free(argv);

    /* All done */
    opal_output(orte_ras_base.ras_output,
                "ras:tm:hostname: got hostname %s", *hostname);
    return ORTE_SUCCESS;
}