openmpi/orte/mca/ras/base/base.h

/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University.
 *                         All rights reserved.
 * Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
 *                         All rights reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 * 
 * Additional copyrights may follow
 * 
 * $HEADER$
 */
/** @file
 * Internal declarations for the base of the ORTE ras framework.
 */

#ifndef ORTE_MCA_RAS_BASE_H
#define ORTE_MCA_RAS_BASE_H

/*
 * includes
 */
#include "orte_config.h"
#include "include/orte_constants.h"
#include "opal/class/opal_list.h"
#include "mca/ras/ras.h"


/*
 * Global functions for MCA overall ras open and close
 */
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif

/*
 * Internal definitions
 */

/**
 * Record tying one ras component to its module and its priority.
 *
 * The record embeds an opal_list_item_t as its first member, and a class
 * declaration for this type is provided in this header.
 */
typedef struct orte_ras_base_cmp_t {
    opal_list_item_t           super;      /**< Base object (list item) */
    orte_ras_base_component_t *component;  /**< The ras component */
    orte_ras_base_module_t    *module;     /**< The ras module */
    int                        priority;   /**< Priority of this component */
} orte_ras_base_cmp_t;


/*
 * function definitions
 */
/**
 * Open the ras base.
 *
 * NOTE(review): the int result is presumably one of the ORTE status codes
 * from include/orte_constants.h -- confirm against the implementation.
 */
ORTE_DECLSPEC int orte_ras_base_open(void);

/**
 * Finalize the ras base.
 *
 * Per the change log for SVN r5392, close was split into finalize/close so
 * that rmgr can finalize all sub-components before entering close.
 */
ORTE_DECLSPEC int orte_ras_base_finalize(void);

/**
 * Close the ras base (see orte_ras_base_finalize() for the finalize/close
 * split).
 */
ORTE_DECLSPEC int orte_ras_base_close(void);

/**
 * Select a ras module.
 *
 * NOTE(review): the const char* argument is unnamed in this prototype, so
 * its meaning (and whether NULL is accepted) cannot be determined from this
 * header -- confirm against the implementation and consider naming it.
 *
 * @return Pointer to the selected module.
 *         NOTE(review): presumably NULL when nothing is selected -- confirm.
 */
ORTE_DECLSPEC orte_ras_base_module_t* orte_ras_base_select(const char*);

/**
 * Allocate resources for a job.
 *
 * @param job  Id of the job to allocate for.
 */
ORTE_DECLSPEC int orte_ras_base_allocate(orte_jobid_t job);

/**
 * Deallocate the resources of a job.
 *
 * @param job  Id of the job to deallocate.
 */
ORTE_DECLSPEC int orte_ras_base_deallocate(orte_jobid_t job);

/**
 * Allocate nodes to a job using the "by node" policy.
 *
 * Per the change log for SVN r5894: one process per node in round-robin
 * fashion until all slots on the nodes are taken; if processes remain,
 * continue until max-slots is reached.
 * NOTE(review): taken from the commit description, not from the
 * implementation -- confirm.
 *
 * @param jobid  Id of the job (named "job" in the prototypes above).
 * @param nodes  List of nodes.
 *               NOTE(review): element type and ownership are not visible
 *               in this header -- confirm.
 */
ORTE_DECLSPEC int orte_ras_base_allocate_nodes_by_node(orte_jobid_t jobid, 
                                                       opal_list_t* nodes);

/**
 * Allocate nodes to a job using the "by slot" policy.
 *
 * Per the change log for SVN r5894: greedily take all available slots on a
 * node before moving to the next node; if processes remain, oversubscribe
 * the nodes round-robin until each node's max_slots is reached.
 * NOTE(review): taken from the commit description, not from the
 * implementation -- confirm.
 *
 * @param jobid  Id of the job (named "job" in the prototypes above).
 * @param nodes  List of nodes.
 *               NOTE(review): element type and ownership are not visible
 *               in this header -- confirm.
 */
ORTE_DECLSPEC int orte_ras_base_allocate_nodes_by_slot(orte_jobid_t jobid, 
                                                       opal_list_t* nodes);

/*
 * globals that might be needed
 */


/**
 * Global state of the ras base.
 */
typedef struct orte_ras_base_t {
    /** NOTE(review): presumably the output stream id used for ras
     *  diagnostics -- confirm */
    int ras_output;
    /** NOTE(review): presumably the list of opened ras components --
     *  confirm */
    opal_list_t ras_opened;
    /** NOTE(review): presumably the list of ras components available for
     *  selection; items look like orte_ras_base_cmp_t -- confirm */
    opal_list_t ras_available;
    /** NOTE(review): a node count; what exactly is counted is not visible
     *  in this header -- confirm */
    size_t ras_num_nodes;
} orte_ras_base_t;
 
/** Global instance of the ras base state; only declared here, the
 *  definition lives outside this header. */
ORTE_DECLSPEC extern orte_ras_base_t orte_ras_base;

/** Class declaration */
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(orte_ras_base_cmp_t);


/*
 * external API functions will be documented in the mca/ras/ras.h file
 */

#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif
* results from initial merge of the tim branch into the trunk. Compiles and ompi_info works, but that's all that has been tested. This commit was SVN r4827. 2005-03-14 20:57:21 +00:00			`/*`
			`* Copyright (c) 2004-2005 The Trustees of Indiana University.`
			`* All rights reserved.`
			`* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.`
			`* All rights reserved.`
			`* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,`
			`* University of Stuttgart. All rights reserved.`
Add UC copyright This commit was SVN r5009. 2005-03-24 12:43:37 +00:00			`* Copyright (c) 2004-2005 The Regents of the University of California.`
			`* All rights reserved.`
* results from initial merge of the tim branch into the trunk. Compiles and ompi_info works, but that's all that has been tested. This commit was SVN r4827. 2005-03-14 20:57:21 +00:00			`* $COPYRIGHT$`
			`*`
			`* Additional copyrights may follow`
			`*`
			`* $HEADER$`
			`*/`
			`/** @file:`
			`*/`

			`#ifndef ORTE_MCA_RAS_BASE_H`
			`#define ORTE_MCA_RAS_BASE_H`

			`/*`
			`* includes`
			`*/`
			`#include "orte_config.h"`
			`#include "include/orte_constants.h"`
* rename ompi_list to opal_list This commit was SVN r6322. 2005-07-03 16:22:16 +00:00			`#include "opal/class/opal_list.h"`
* results from initial merge of the tim branch into the trunk. Compiles and ompi_info works, but that's all that has been tested. This commit was SVN r4827. 2005-03-14 20:57:21 +00:00			`#include "mca/ras/ras.h"`


			`/*`
			`* Global functions for MCA overall collective open and close`
			`*/`
			`#if defined(c_plusplus) \|\| defined(__cplusplus)`
			`extern "C" {`
			`#endif`

			`/*`
			`* Internal definitions`
			`*/`

			`struct orte_ras_base_cmp_t {`
			`/** Base object */`
* rename ompi_list to opal_list This commit was SVN r6322. 2005-07-03 16:22:16 +00:00			`opal_list_item_t super;`
* results from initial merge of the tim branch into the trunk. Compiles and ompi_info works, but that's all that has been tested. This commit was SVN r4827. 2005-03-14 20:57:21 +00:00			`/** ras component */`
			`orte_ras_base_component_t *component;`
			`/** ras module */`
			`orte_ras_base_module_t* module;`
			`/** This component's priority */`
			`int priority;`
			`};`
			`typedef struct orte_ras_base_cmp_t orte_ras_base_cmp_t;`


			`/*`
			`* function definitions`
			`*/`
			`ORTE_DECLSPEC int orte_ras_base_open(void);`
split close into finalize/close so that rmgr can finalize all sub-components prior to entering close. moved pls logic to wait on children from close to finalize. This commit was SVN r5392. 2005-04-15 17:04:57 +00:00			`ORTE_DECLSPEC int orte_ras_base_finalize(void);`
* results from initial merge of the tim branch into the trunk. Compiles and ompi_info works, but that's all that has been tested. This commit was SVN r4827. 2005-03-14 20:57:21 +00:00			`ORTE_DECLSPEC int orte_ras_base_close(void);`
			`ORTE_DECLSPEC orte_ras_base_module_t* orte_ras_base_select(const char*);`
			`ORTE_DECLSPEC int orte_ras_base_allocate(orte_jobid_t job);`
			`ORTE_DECLSPEC int orte_ras_base_deallocate(orte_jobid_t job);`
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894. 
2005-05-31 16:36:53 +00:00			`ORTE_DECLSPEC int orte_ras_base_allocate_nodes_by_node(orte_jobid_t jobid,`
* rename ompi_list to opal_list This commit was SVN r6322. 2005-07-03 16:22:16 +00:00			`opal_list_t* nodes);`
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894. 
2005-05-31 16:36:53 +00:00			`ORTE_DECLSPEC int orte_ras_base_allocate_nodes_by_slot(orte_jobid_t jobid,`
* rename ompi_list to opal_list This commit was SVN r6322. 2005-07-03 16:22:16 +00:00			`opal_list_t* nodes);`
* results from initial merge of the tim branch into the trunk. Compiles and ompi_info works, but that's all that has been tested. This commit was SVN r4827. 2005-03-14 20:57:21 +00:00
			`/*`
			`* globals that might be needed`
			`*/`


			`typedef struct orte_ras_base_t {`
			`int ras_output;`
* rename ompi_list to opal_list This commit was SVN r6322. 2005-07-03 16:22:16 +00:00			`opal_list_t ras_opened;`
			`opal_list_t ras_available;`
* results from initial merge of the tim branch into the trunk. Compiles and ompi_info works, but that's all that has been tested. This commit was SVN r4827. 2005-03-14 20:57:21 +00:00			`size_t ras_num_nodes;`
			`} orte_ras_base_t;`

			`ORTE_DECLSPEC extern orte_ras_base_t orte_ras_base;`

			`/** Class declaration */`
			`OMPI_DECLSPEC OBJ_CLASS_DECLARATION(orte_ras_base_cmp_t);`


			`/*`
			`* external API functions will be documented in the mca/ns/ns.h file`
			`*/`

			`#if defined(c_plusplus) \|\| defined(__cplusplus)`
			`}`
			`#endif`
			`#endif`