0b9552cd4e
This commit was SVN r22870.
334 строки
11 KiB
C
334 строки
11 KiB
C
/*
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
|
|
#include "orte_config.h"
|
|
#include "orte/constants.h"
|
|
|
|
#ifdef HAVE_STRING_H
|
|
#include <string.h>
|
|
#endif
|
|
|
|
#if !ORTE_DISABLE_FULL_SUPPORT
|
|
#include "opal/mca/mca.h"
|
|
#include "opal/mca/base/base.h"
|
|
#include "opal/mca/base/mca_base_param.h"
|
|
#include "opal/mca/paffinity/base/base.h"
|
|
#include "opal/mca/sysinfo/sysinfo.h"
|
|
#include "opal/util/output.h"
|
|
#include "opal/util/path.h"
|
|
#include "opal/util/argv.h"
|
|
#include "opal/threads/mutex.h"
|
|
#include "opal/threads/condition.h"
|
|
|
|
#include "orte/mca/plm/plm_types.h"
|
|
#include "orte/util/name_fns.h"
|
|
#include "orte/runtime/orte_globals.h"
|
|
#include "orte/util/show_help.h"
|
|
#include "orte/util/parse_options.h"
|
|
|
|
#include "orte/mca/odls/base/odls_private.h"
|
|
|
|
#endif
|
|
|
|
#include "orte/mca/odls/base/base.h"
|
|
|
|
|
|
/*
|
|
* The following file was created by configure. It contains extern
|
|
* statements and the definition of an array of pointers to each
|
|
* component's public mca_base_component_t struct.
|
|
*/
|
|
|
|
#include "orte/mca/odls/base/static-components.h"
|
|
|
|
#if ORTE_DISABLE_FULL_SUPPORT
|
|
/* have to include a bogus function here so that
|
|
* the build system sees at least one function
|
|
* in the library
|
|
*/
|
|
int orte_odls_base_open(void)
|
|
{
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
#else
|
|
|
|
/*
|
|
* Instantiate globals
|
|
*/
|
|
orte_odls_base_module_t orte_odls;
|
|
|
|
/* instance the child list object */
|
|
static void orte_odls_child_constructor(orte_odls_child_t *ptr)
|
|
{
|
|
ptr->name = NULL;
|
|
ptr->restarts = 0;
|
|
ptr->pid = 0;
|
|
ptr->app_idx = 0;
|
|
ptr->alive = false;
|
|
ptr->coll_recvd = false;
|
|
/* set the default state to "failed to start" so
|
|
* we can correctly report should something
|
|
* go wrong during launch
|
|
*/
|
|
ptr->state = ORTE_PROC_STATE_FAILED_TO_START;
|
|
ptr->exit_code = 0;
|
|
ptr->init_recvd = false;
|
|
ptr->fini_recvd = false;
|
|
ptr->rml_uri = NULL;
|
|
ptr->slot_list = NULL;
|
|
ptr->waitpid_recvd = false;
|
|
ptr->iof_complete = false;
|
|
ptr->do_not_barrier = false;
|
|
}
|
|
/*
 * Destructor for orte_odls_child_t.
 *
 * Frees the name, rml_uri and slot_list members.  free() accepts a
 * NULL pointer, so no explicit NULL checks are required.
 */
static void orte_odls_child_destructor(orte_odls_child_t *child)
{
    free(child->name);
    free(child->rml_uri);
    free(child->slot_list);
}
|
|
/* Class instance for orte_odls_child_t: parent class is
 * opal_list_item_t, using the constructor/destructor pair above.
 */
OBJ_CLASS_INSTANCE(orte_odls_child_t, opal_list_item_t,
                   orte_odls_child_constructor, orte_odls_child_destructor);
|
|
|
|
/*
 * Constructor for orte_odls_job_t.
 *
 * Constructs the embedded OPAL objects (lock, condition variable and
 * the two collection buffers) and sets every other field to its
 * initial value.
 */
static void orte_odls_job_constructor(orte_odls_job_t *job)
{
    /* synchronization objects */
    OBJ_CONSTRUCT(&job->lock, opal_mutex_t);
    OBJ_CONSTRUCT(&job->cond, opal_condition_t);

    /* job identity and state */
    job->jobid = ORTE_JOBID_INVALID;
    job->state = ORTE_JOB_STATE_UNDEF;
    job->launch_msg_processed = false;

    /* application contexts */
    job->apps = NULL;
    job->num_apps = 0;

    /* placement-related settings */
    job->policy = 0;
    job->cpus_per_rank = 1;
    job->stride = 1;
    job->controls = 0;
    job->stdin_target = ORTE_VPID_INVALID;

    /* counters */
    job->total_slots_alloc = 0;
    job->num_procs = 0;
    job->num_local_procs = 0;

    /* heap data released by the destructor */
    job->regexp = NULL;
    job->pmap = NULL;

    /* collective bookkeeping */
    OBJ_CONSTRUCT(&job->collection_bucket, opal_buffer_t);
    OBJ_CONSTRUCT(&job->local_collection, opal_buffer_t);
    job->collective_type = ORTE_GRPCOMM_COLL_NONE;
    job->num_contributors = 0;
    job->num_participating = -1;
    job->num_collected = 0;
}
|
|
/*
 * Destructor for orte_odls_job_t.
 *
 * Destructs the embedded lock, condition variable and collection
 * buffers, releases every app context in the apps array and frees the
 * array itself, then frees the regexp string and the pmap byte object.
 */
static void orte_odls_job_destructor(orte_odls_job_t *ptr)
{
    orte_app_idx_t i;

    OBJ_DESTRUCT(&ptr->lock);
    OBJ_DESTRUCT(&ptr->cond);

    if (NULL != ptr->apps) {
        for (i = 0; i < ptr->num_apps; i++) {
            OBJ_RELEASE(ptr->apps[i]);
        }
        free(ptr->apps);
    }

    if (NULL != ptr->regexp) {
        free(ptr->regexp);
    }

    /* Free the pmap container even when it carries no payload;
     * tying both frees to a non-NULL bytes pointer would leak the
     * container whenever bytes is NULL.
     */
    if (NULL != ptr->pmap) {
        if (NULL != ptr->pmap->bytes) {
            free(ptr->pmap->bytes);
        }
        free(ptr->pmap);
    }

    OBJ_DESTRUCT(&ptr->collection_bucket);
    OBJ_DESTRUCT(&ptr->local_collection);
}
|
|
/* Class instance for orte_odls_job_t: parent class is
 * opal_list_item_t, using the constructor/destructor pair above.
 */
OBJ_CLASS_INSTANCE(orte_odls_job_t, opal_list_item_t,
                   orte_odls_job_constructor, orte_odls_job_destructor);
|
|
|
|
/*
|
|
* Framework global variables
|
|
*/
|
|
orte_odls_base_t orte_odls_base;
|
|
orte_odls_globals_t orte_odls_globals;
|
|
|
|
/**
|
|
* Function for finding and opening either all MCA components, or the one
|
|
* that was specifically requested via a MCA parameter.
|
|
*/
|
|
int orte_odls_base_open(void)
|
|
{
|
|
char **ranks=NULL, *tmp;
|
|
int i, rank, sock, core;
|
|
orte_namelist_t *nm;
|
|
bool xterm_hold;
|
|
char *keys[] = {
|
|
OPAL_SYSINFO_CPU_TYPE,
|
|
OPAL_SYSINFO_CPU_MODEL,
|
|
OPAL_SYSINFO_NUM_CPUS,
|
|
OPAL_SYSINFO_MEM_SIZE,
|
|
NULL
|
|
};
|
|
opal_list_item_t *item;
|
|
opal_sysinfo_value_t *info;
|
|
|
|
/* Debugging / verbose output. Always have stream open, with
|
|
verbose set by the mca open system... */
|
|
orte_odls_globals.output = opal_output_open(NULL);
|
|
|
|
mca_base_param_reg_int_name("odls", "base_sigkill_timeout",
|
|
"Time to wait for a process to die after issuing a kill signal to it",
|
|
false, false, 1, &orte_odls_globals.timeout_before_sigkill);
|
|
|
|
/* initialize ODLS globals */
|
|
OBJ_CONSTRUCT(&orte_odls_globals.mutex, opal_mutex_t);
|
|
OBJ_CONSTRUCT(&orte_odls_globals.cond, opal_condition_t);
|
|
OBJ_CONSTRUCT(&orte_odls_globals.xterm_ranks, opal_list_t);
|
|
orte_odls_globals.xtermcmd = NULL;
|
|
orte_odls_globals.dmap = NULL;
|
|
orte_odls_globals.debugger = NULL;
|
|
orte_odls_globals.debugger_launched = false;
|
|
OBJ_CONSTRUCT(&orte_odls_globals.sysinfo, opal_list_t);
|
|
|
|
/* get any external processor bindings */
|
|
OPAL_PAFFINITY_CPU_ZERO(orte_odls_globals.my_cores);
|
|
orte_odls_globals.bound = false;
|
|
orte_odls_globals.num_processors = 0;
|
|
OBJ_CONSTRUCT(&orte_odls_globals.sockets, opal_bitmap_t);
|
|
opal_bitmap_init(&orte_odls_globals.sockets, 16);
|
|
/* default the number of sockets to those found during startup */
|
|
orte_odls_globals.num_sockets = orte_default_num_sockets_per_board;
|
|
/* see if paffinity is supported */
|
|
if (ORTE_SUCCESS == opal_paffinity_base_get(&orte_odls_globals.my_cores)) {
|
|
/* get the number of local processors */
|
|
opal_paffinity_base_get_processor_info(&orte_odls_globals.num_processors);
|
|
/* determine if we are bound */
|
|
OPAL_PAFFINITY_PROCESS_IS_BOUND(orte_odls_globals.my_cores, &orte_odls_globals.bound);
|
|
/* if we are bound, determine the number of sockets - and which ones - that are available to us */
|
|
if (orte_odls_globals.bound) {
|
|
for (i=0; i < orte_odls_globals.num_processors; i++) {
|
|
if (OPAL_PAFFINITY_CPU_ISSET(i, orte_odls_globals.my_cores)) {
|
|
opal_paffinity_base_get_map_to_socket_core(i, &sock, &core);
|
|
opal_bitmap_set_bit(&orte_odls_globals.sockets, sock);
|
|
}
|
|
}
|
|
/* determine how many sockets we have available to us */
|
|
orte_odls_globals.num_sockets = 0;
|
|
for (i=0; i < opal_bitmap_size(&orte_odls_globals.sockets); i++) {
|
|
if (opal_bitmap_is_set_bit(&orte_odls_globals.sockets, i)) {
|
|
orte_odls_globals.num_sockets++;
|
|
}
|
|
}
|
|
if (ORTE_PROC_IS_HNP && orte_report_bindings) {
|
|
opal_output(0, "System has detected external process binding to cores %04lx",
|
|
orte_odls_globals.my_cores.bitmask[0]);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* check if the user requested that we display output in xterms */
|
|
if (NULL != orte_xterm) {
|
|
/* construct a list of ranks to be displayed */
|
|
xterm_hold = false;
|
|
orte_util_parse_range_options(orte_xterm, &ranks);
|
|
for (i=0; i < opal_argv_count(ranks); i++) {
|
|
if (0 == strcmp(ranks[i], "BANG")) {
|
|
xterm_hold = true;
|
|
continue;
|
|
}
|
|
nm = OBJ_NEW(orte_namelist_t);
|
|
rank = strtol(ranks[i], NULL, 10);
|
|
if (-1 == rank) {
|
|
/* wildcard */
|
|
nm->name.vpid = ORTE_VPID_WILDCARD;
|
|
} else if (rank < 0) {
|
|
/* error out on bozo case */
|
|
orte_show_help("help-odls-base.txt",
|
|
"orte-odls-base:xterm-neg-rank",
|
|
true, rank);
|
|
return ORTE_ERROR;
|
|
} else {
|
|
/* we can't check here if the rank is out of
|
|
* range as we don't yet know how many ranks
|
|
* will be in the job - we'll check later
|
|
*/
|
|
nm->name.vpid = rank;
|
|
}
|
|
opal_list_append(&orte_odls_globals.xterm_ranks, &nm->item);
|
|
}
|
|
opal_argv_free(ranks);
|
|
/* construct the xtermcmd */
|
|
orte_odls_globals.xtermcmd = NULL;
|
|
tmp = opal_find_absolute_path("xterm");
|
|
if (NULL == tmp) {
|
|
return ORTE_ERROR;
|
|
}
|
|
opal_argv_append_nosize(&orte_odls_globals.xtermcmd, tmp);
|
|
free(tmp);
|
|
opal_argv_append_nosize(&orte_odls_globals.xtermcmd, "-T");
|
|
opal_argv_append_nosize(&orte_odls_globals.xtermcmd, "save");
|
|
if (xterm_hold) {
|
|
opal_argv_append_nosize(&orte_odls_globals.xtermcmd, "-hold");
|
|
}
|
|
opal_argv_append_nosize(&orte_odls_globals.xtermcmd, "-e");
|
|
}
|
|
|
|
/* collect the system info */
|
|
if (NULL != opal_sysinfo.query) {
|
|
/* get and store our local resources */
|
|
opal_sysinfo.query(keys, &orte_odls_globals.sysinfo);
|
|
/* find our cpu model and save it for later */
|
|
for (item = opal_list_get_first(&orte_odls_globals.sysinfo);
|
|
item != opal_list_get_end(&orte_odls_globals.sysinfo);
|
|
item = opal_list_get_next(item)) {
|
|
info = (opal_sysinfo_value_t*)item;
|
|
|
|
if (0 == strcmp(info->key, OPAL_SYSINFO_CPU_MODEL)) {
|
|
orte_local_cpu_model = strdup(info->data.str);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Open up all available components */
|
|
if (ORTE_SUCCESS !=
|
|
mca_base_components_open("odls", orte_odls_globals.output,
|
|
mca_odls_base_static_components,
|
|
&orte_odls_base.available_components, true)) {
|
|
return ORTE_ERROR;
|
|
}
|
|
|
|
/* are there components available for use ? -
|
|
* orte_odls_base.available_components is always initialized */
|
|
if(0 < opal_list_get_size(&(orte_odls_base.available_components))) {
|
|
orte_odls_base.components_available = true;
|
|
} else {
|
|
orte_odls_base.components_available = false;
|
|
}
|
|
|
|
/* All done */
|
|
|
|
return ORTE_SUCCESS;
|
|
}
|
|
#endif
|