9613b3176c
After much work by Jeff and myself, and quite a lot of discussion, it has become clear that we simply cannot resolve the infinite loops caused by RML-involved subsystems calling orte_output. The original rationale for the change to orte_output has also been reduced by shifting the output of XML-formatted vs human readable messages to an alternative approach. I have globally replaced the orte_output/ORTE_OUTPUT calls in the code base, as well as the corresponding .h file name. I have test compiled and run this on the various environments within my reach, so hopefully this will prove minimally disruptive. This commit was SVN r18619.
179 строки
5.4 KiB
C
179 строки
5.4 KiB
C
/*
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2006-2007 Sun Microsystems, Inc. All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
/**
|
|
* @file:
|
|
* Resource Allocation for Grid Engine
|
|
*/
|
|
#include "orte_config.h"
|
|
#include "orte/constants.h"
|
|
|
|
#include <errno.h>
|
|
#include <unistd.h>
|
|
#include <string.h>
|
|
|
|
#include "opal/util/argv.h"
|
|
#include "orte/util/show_help.h"
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
#include "orte/runtime/orte_globals.h"
|
|
#include "orte/mca/ras/base/ras_private.h"
|
|
#include "orte/mca/ras/gridengine/ras_gridengine.h"
|
|
|
|
/*
|
|
* Local functions
|
|
*/
|
|
static int orte_ras_gridengine_allocate(opal_list_t *nodes);
|
|
static int orte_ras_gridengine_finalize(void);
|
|
#if 0
|
|
static int get_slot_count(char* node_name, int* slot_cnt);
|
|
#endif
|
|
|
|
/*
|
|
* Global variable
|
|
*/
|
|
orte_ras_base_module_t orte_ras_gridengine_module = {
|
|
orte_ras_gridengine_allocate,
|
|
orte_ras_gridengine_finalize
|
|
};
|
|
|
|
/**
|
|
* Discover available (pre-allocated) nodes. Allocate the
|
|
* requested number of nodes/process slots to the job.
|
|
*
|
|
*/
|
|
static int orte_ras_gridengine_allocate(opal_list_t *nodelist)
|
|
{
|
|
char *pe_hostfile = getenv("PE_HOSTFILE");
|
|
char *job_id = getenv("JOB_ID");
|
|
char buf[1024], *tok, *num, *queue, *arch, *ptr;
|
|
int rc;
|
|
FILE *fp;
|
|
orte_node_t *node;
|
|
|
|
/* show the Grid Engine's JOB_ID */
|
|
if (mca_ras_gridengine_component.show_jobid ||
|
|
mca_ras_gridengine_component.verbose != -1) {
|
|
opal_output(0, "ras:gridengine: JOB_ID: %s", job_id);
|
|
}
|
|
|
|
/* check the PE_HOSTFILE before continuing on */
|
|
if (!(fp = fopen(pe_hostfile, "r"))) {
|
|
orte_show_help("help-ras-gridengine.txt", "cannot-read-pe-hostfile",
|
|
true, pe_hostfile, strerror(errno));
|
|
rc = ORTE_ERROR;
|
|
ORTE_ERROR_LOG(rc);
|
|
goto cleanup;
|
|
}
|
|
|
|
/* parse the pe_hostfile for hostname, slots, etc, then compare the
|
|
* current node with a list of hosts in the nodelist, if the current
|
|
* node is not found in nodelist, add it in */
|
|
while (fgets(buf, sizeof(buf), fp)) {
|
|
ptr = strtok_r(buf, " \n", &tok);
|
|
num = strtok_r(NULL, " \n", &tok);
|
|
queue = strtok_r(NULL, " \n", &tok);
|
|
arch = strtok_r(NULL, " \n", &tok);
|
|
|
|
/* create a new node entry */
|
|
node = OBJ_NEW(orte_node_t);
|
|
if (NULL == node) {
|
|
fclose(fp);
|
|
return ORTE_ERR_OUT_OF_RESOURCE;
|
|
}
|
|
node->name = strdup(ptr);
|
|
node->state = ORTE_NODE_STATE_UP;
|
|
node->slots_inuse = 0;
|
|
node->slots_max = 0;
|
|
node->slots = (int)strtol(num, (char **)NULL, 10);
|
|
opal_output(mca_ras_gridengine_component.verbose,
|
|
"ras:gridengine: %s: PE_HOSTFILE shows slots=%d",
|
|
node->name, node->slots);
|
|
opal_list_append(nodelist, &node->super);
|
|
|
|
} /* finished reading the $PE_HOSTFILE */
|
|
|
|
cleanup:
|
|
fclose(fp);
|
|
|
|
/* in gridengine, if we didn't find anything, then something
|
|
* is wrong. The user may not have indicated this was a parallel
|
|
* job, or may not have an allocation at all. In any case, this
|
|
* is considered an unrecoverable error and we need to report it
|
|
*/
|
|
if (opal_list_is_empty(nodelist)) {
|
|
orte_show_help("help-ras-gridengine.txt", "no-nodes-found", true);
|
|
return ORTE_ERR_NOT_FOUND;
|
|
}
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
#if 0
|
|
/**
|
|
* This function is not used currently, but may be used eventually.
|
|
* Parse the PE_HOSTFILE to determine the number of process
|
|
* slots/processors available on the node.
|
|
*/
|
|
static int get_slot_count(char* node_name, int* slot_cnt)
|
|
{
|
|
char buf[1024], *tok, *name, *num, *queue, *arch;
|
|
char *pe_hostfile = getenv("PE_HOSTFILE");
|
|
FILE *fp;
|
|
|
|
/* check the PE_HOSTFILE before continuing on */
|
|
if (!(fp = fopen(pe_hostfile, "r"))) {
|
|
orte_show_help("help-ras-gridengine.txt", "cannot-read-pe-hostfile",
|
|
true, pe_hostfile, strerror(errno));
|
|
ORTE_ERROR_LOG(ORTE_ERROR);
|
|
return(ORTE_ERROR);
|
|
}
|
|
|
|
while (fgets(buf, sizeof(buf), fp)) {
|
|
name = strtok_r(buf, " \n", &tok);
|
|
num = strtok_r(NULL, " \n", &tok);
|
|
queue = strtok_r(NULL, " \n", &tok);
|
|
arch = strtok_r(NULL, " \n", &tok);
|
|
|
|
if(strcmp(node_name,name) == 0) {
|
|
*slot_cnt = (int) strtol(num, (char **)NULL, 10);
|
|
opal_output(mca_ras_gridengine_component.verbose,
|
|
"ras:gridengine: %s: PE_HOSTFILE shows slots=%d",
|
|
node_name, *slot_cnt);
|
|
fclose(fp);
|
|
return ORTE_SUCCESS;
|
|
}
|
|
}
|
|
|
|
/* when there is no match */
|
|
fclose(fp);
|
|
return ORTE_ERROR;
|
|
}
|
|
#endif
|
|
|
|
/**
|
|
* finalize
|
|
*/
|
|
static int orte_ras_gridengine_finalize(void)
|
|
{
|
|
/* Nothing to do */
|
|
opal_output(mca_ras_gridengine_component.verbose,
|
|
"ras:gridengine:finalize: success (nothing to do)");
|
|
return ORTE_SUCCESS;
|
|
}
|