1
1

Add a new mapper component that sequentially maps ranks-to-hosts according to the ordering in the hostfile.

Not functional yet - still under development. Just placeholding for now to clear a backlog

This commit was SVN r18062.
Этот коммит содержится в:
Ralph Castain 2008-04-01 20:03:49 +00:00
родитель ee5b96269e
Коммит 51533c9340
8 изменённых файлов: 562 добавлений и 0 удалений

45
orte/mca/rmaps/seq/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,45 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
dist_pkgdata_DATA = help-orte-rmaps-seq.txt
sources = \
rmaps_seq.c \
rmaps_seq.h \
rmaps_seq_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_rmaps_seq_DSO
component_noinst =
component_install = mca_rmaps_seq.la
else
component_noinst = libmca_rmaps_seq.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_rmaps_seq_la_SOURCES = $(sources)
mca_rmaps_seq_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_rmaps_seq_la_SOURCES =$(sources)
libmca_rmaps_seq_la_LDFLAGS = -module -avoid-version

24
orte/mca/rmaps/seq/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,24 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
PARAM_CONFIG_FILES="Makefile"

53
orte/mca/rmaps/seq/help-orte-rmaps-seq.txt Обычный файл
Просмотреть файл

@ -0,0 +1,53 @@
# -*- text -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for Open RTE's orterun.
#
[orte-rmaps-rr:alloc-error]
There are not enough slots available in the system to satisfy the %d slots
that were requested by the application:
%s
Either request fewer slots for your application, or make more slots available
for use.
[orte-rmaps-rr:multi-apps-and-zero-np]
RMAPS found multiple applications to be launched, with
at least one that failed to specify the number of processes to execute.
When specifying multiple applications, you must specify how many processes
of each to launch via the -np argument.
[orte-rmaps-rr:per-node-and-too-many-procs]
There are not enough nodes in your allocation to satisfy your request to launch
%d processes on a per-node basis - only %d nodes were available.
Either request fewer processes, or obtain a larger allocation.
[orte-rmaps-rr:n-per-node-and-too-many-procs]
There are not enough nodes in your allocation to satisfy your request to launch
%d processes on a %d per-node basis - only %d nodes with a total of %d slots were available.
Either request fewer processes, or obtain a larger allocation.
[orte-rmaps-rr:n-per-node-and-not-enough-slots]
There are not enough slots on the nodes in your allocation to satisfy your request to launch on a %d process-per-node basis - only %d slots/node were available.
Either request fewer processes/node, or obtain a larger allocation.
[orte-rmaps-rr:no-np-and-user-map]
You have specified a rank-to-node/slot mapping, but failed to provide
the number of processes to be executed. For some reason, this information
could not be obtained from the mapping you provided, so we cannot continue
with executing the specified application.

252
orte/mca/rmaps/seq/rmaps_seq.c Обычный файл
Просмотреть файл

@ -0,0 +1,252 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/output.h"
#include "opal/util/trace.h"
#include "opal/util/show_help.h"
#include "opal/util/argv.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/base/rmaps_private.h"
#include "orte/mca/rmaps/base/base.h"
#include "rmaps_seq.h"
/*
* Sequentially map the ranks according to the placement in the
* specified hostfile
*/
static int orte_rmaps_seq_map(orte_job_t *jdata)
{
#if 0
orte_job_map_t *map;
orte_app_context_t *app, **apps;
orte_std_cntr_t i;
opal_list_t node_list, procs;
opal_list_item_t *item;
orte_node_t *node;
orte_vpid_t vpid_start;
orte_std_cntr_t num_nodes, num_slots;
int rc;
orte_std_cntr_t slots_per_node;
OPAL_TRACE(1);
/* conveniece def */
map = jdata->map;
apps = (orte_app_context_t**)jdata->apps->addr;
/* start at the beginning... */
vpid_start = 0;
/* cycle through the app_contexts, mapping them sequentially */
for(i=0; i < jdata->num_apps; i++) {
app = apps[i];
/* if the number of processes wasn't specified, then we know there can be only
* one app_context allowed in the launch, and that we are to launch it across
* all available slots. We'll double-check the single app_context rule first
*/
if (0 == app->num_procs && 1 < jdata->num_apps) {
opal_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:multi-apps-and-zero-np",
true, jdata->num_apps, NULL);
rc = ORTE_ERR_SILENT;
goto error;
}
/* for each app_context, we have to get an ordered list of nodes
* from the specified hostfile
*/
OBJ_CONSTRUCT(&node_list, opal_list_t);
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
map->no_use_local))) {
ORTE_ERROR_LOG(rc);
goto error;
}
num_nodes = (orte_std_cntr_t)opal_list_get_size(&node_list);
/* if a bookmark exists from some prior mapping, set us to start there */
if (NULL != jdata->bookmark) {
cur_node_item = NULL;
/* find this node on the list */
for (item = opal_list_get_first(&node_list);
item != opal_list_get_end(&node_list);
item = opal_list_get_next(item)) {
node = (orte_node_t*)item;
if (node->index == jdata->bookmark->index) {
cur_node_item = item;
break;
}
}
/* see if we found it - if not, just start at the beginning */
if (NULL == cur_node_item) {
cur_node_item = opal_list_get_first(&node_list);
}
} else {
/* if no bookmark, then just start at the beginning of the list */
cur_node_item = opal_list_get_first(&node_list);
}
if (map->pernode && map->npernode == 1) {
/* there are three use-cases that we need to deal with:
* (a) if -np was not provided, then we just use the number of nodes
* (b) if -np was provided AND #procs > #nodes, then error out
* (c) if -np was provided AND #procs <= #nodes, then launch
* the specified #procs one/node. In this case, we just
* leave app->num_procs alone
*/
if (0 == app->num_procs) {
app->num_procs = num_nodes;
} else if (app->num_procs > num_nodes) {
opal_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:per-node-and-too-many-procs",
true, app->num_procs, num_nodes, NULL);
rc = ORTE_ERR_SILENT;
goto error;
}
} else if (map->pernode && map->npernode > 1) {
/* first, let's check to see if there are enough slots/node to
* meet the request - error out if not
*/
slots_per_node = num_slots / num_nodes;
if (map->npernode > slots_per_node) {
opal_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:n-per-node-and-not-enough-slots",
true, map->npernode, slots_per_node, NULL);
rc = ORTE_ERR_SILENT;
goto error;
}
/* there are three use-cases that we need to deal with:
* (a) if -np was not provided, then we just use the n/node * #nodes
* (b) if -np was provided AND #procs > (n/node * #nodes), then error out
* (c) if -np was provided AND #procs <= (n/node * #nodes), then launch
* the specified #procs n/node. In this case, we just
* leave app->num_procs alone
*/
if (0 == app->num_procs) {
/* set the num_procs to equal the specified num/node * the number of nodes */
app->num_procs = map->npernode * num_nodes;
} else if (app->num_procs > (map->npernode * num_nodes)) {
opal_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:n-per-node-and-too-many-procs",
true, app->num_procs, map->npernode, num_nodes, num_slots, NULL);
rc = ORTE_ERR_SILENT;
goto error;
}
} else if (0 == app->num_procs) {
/** set the num_procs to equal the number of slots on these mapped nodes - if
user has specified "-bynode", then set it to the number of nodes
*/
if (map->policy == ORTE_RMAPS_BYNODE) {
app->num_procs = num_nodes;
} else if (map->policy == ORTE_RMAPS_BYSLOT) {
app->num_procs = num_slots;
} else if (map->policy == ORTE_RMAPS_BYUSER) {
/* we can't handle this - it should have been set when we got
* the map info. If it wasn't, then we can only error out
*/
opal_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:no-np-and-user-map",
true, app->num_procs, map->npernode, num_nodes, num_slots, NULL);
rc = ORTE_ERR_SILENT;
goto error;
}
}
/** track the total number of processes we mapped */
jdata->num_procs += app->num_procs;
/* Make assignments */
if (map->policy == ORTE_RMAPS_BYUSER) {
rc = map_app_by_user_map(app, jdata, vpid_start, &node_list, &procs);
} else if (map->policy == ORTE_RMAPS_BYNODE) {
rc = map_app_by_node(app, jdata, vpid_start, &node_list);
} else {
rc = map_app_by_slot(app, jdata, vpid_start, &node_list);
}
/* update the starting vpid for the next app_context */
vpid_start += app->num_procs;
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto error;
}
/* save the bookmark */
jdata->bookmark = (orte_node_t*)cur_node_item;
/* cleanup the node list - it can differ from one app_context
* to another, so we have to get it every time
*/
while(NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&node_list);
}
/* compute and save convenience values */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_usage(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* define the daemons that we will use for this job */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(map))) {
ORTE_ERROR_LOG(rc);
return rc;
}
return ORTE_SUCCESS;
#if 0
if(!opal_list_is_empty(&procs)){
if(ORTE_SUCCESS != (rc = orte_rmaps_base_rearrange_map(app, map, &procs))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
}
#endif
error:
while(NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&node_list);
return rc;
#endif
return ORTE_SUCCESS;
}
orte_rmaps_base_module_t orte_rmaps_seq_module = {
orte_rmaps_seq_map
};

40
orte/mca/rmaps/seq/rmaps_seq.h Обычный файл
Просмотреть файл

@ -0,0 +1,40 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Resource Mapping
*/
#ifndef ORTE_RMAPS_SEQ_H
#define ORTE_RMAPS_SEQ_H
#include "orte/mca/rmaps/rmaps.h"
BEGIN_C_DECLS
/**
* RMGR Component
*/
ORTE_MODULE_DECLSPEC extern orte_rmaps_base_component_t mca_rmaps_seq_component;
extern orte_rmaps_base_module_t orte_rmaps_seq_module;
END_C_DECLS
#endif

92
orte/mca/rmaps/seq/rmaps_seq_component.c Обычный файл
Просмотреть файл

@ -0,0 +1,92 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/output.h"
#include "orte/mca/rmaps/rmaps.h"
#include "rmaps_seq.h"
/*
* Local functions
*/
static int orte_rmaps_seq_open(void);
static int orte_rmaps_seq_close(void);
static orte_rmaps_base_module_t* orte_rmaps_seq_init(int* priority);
orte_rmaps_base_component_t mca_rmaps_seq_component = {
{
/* Indicate that we are a rmaps v1.3.0 component (which also
implies a specific MCA version) */
ORTE_RMAPS_BASE_VERSION_1_3_0,
"seq", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_rmaps_seq_open, /* component open */
orte_rmaps_seq_close /* component close */
},
/* Next the MCA v1.0.0 component meta data */
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
orte_rmaps_seq_init
};
/**
* component open/close/init function
*/
static int orte_rmaps_seq_open(void)
{
return ORTE_SUCCESS;
}
static orte_rmaps_base_module_t*
orte_rmaps_seq_init(int *priority)
{
/* the RMAPS framework is -only- opened on HNP's,
* so no need to check for that here
*/
*priority = 0; /* only select if specified */
return &orte_rmaps_seq_module;
}
/**
* Close all subsystems.
*/
static int orte_rmaps_seq_close(void)
{
return ORTE_SUCCESS;
}

Просмотреть файл

@ -531,3 +531,55 @@ int orte_util_filter_hostfile_nodes(opal_list_t *nodes,
return ORTE_SUCCESS;
}
int orte_util_get_ordered_host_list(opal_list_t *nodes,
bool *override_oversubscribed,
char *hostfile)
{
opal_list_t exclude;
opal_list_item_t *item, *itm;
int rc;
OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
"%s hostfile: creating ordered list of hosts from hostfile %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostfile));
OBJ_CONSTRUCT(&exclude, opal_list_t);
/* parse the hostfile and add the contents to the list */
if (ORTE_SUCCESS != (rc = hostfile_parse(hostfile, nodes, &exclude))) {
goto cleanup;
}
/* remove from the list of nodes those that are in the exclude list */
while(NULL != (item = opal_list_remove_first(&exclude))) {
orte_node_t *exnode = (orte_node_t*)item;
/* check for matches on nodes */
for (itm = opal_list_get_first(nodes);
itm != opal_list_get_end(nodes);
itm = opal_list_get_next(itm)) {
orte_node_t *node=(orte_node_t*)itm;
if (0 == strcmp(exnode->name, node->name)) {
/* match - remove it */
opal_list_remove_item(nodes, itm);
OBJ_RELEASE(itm);
break;
}
}
OBJ_RELEASE(item);
}
/* indicate that ORTE should override any oversubscribed conditions
* based on local hardware limits since the user (a) might not have
* provided us any info on the #slots for a node, and (b) the user
* might have been wrong! If we don't check the number of local physical
* processors, then we could be too aggressive on our sched_yield setting
* and cause performance problems.
*/
*override_oversubscribed = true;
cleanup:
OBJ_DESTRUCT(&exclude);
return rc;
}

Просмотреть файл

@ -37,6 +37,10 @@ ORTE_DECLSPEC int orte_util_add_hostfile_nodes(opal_list_t *nodes,
ORTE_DECLSPEC int orte_util_filter_hostfile_nodes(opal_list_t *nodes,
char *hostfile);
ORTE_DECLSPEC int orte_util_get_ordered_host_list(opal_list_t *nodes,
bool *override_oversubscribed,
char *hostfile);
END_C_DECLS
#endif