1
1

392 строки
12 KiB
C
Исходник Обычный вид История

/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include <errno.h>
#include <unistd.h>
#include <string.h>
#include "include/orte_constants.h"
#include "include/orte_types.h"
#include "opal/util/output.h"
#include "mca/ns/ns.h"
#include "mca/gpr/gpr.h"
#include "mca/rmaps/base/base.h"
#include "mca/rmgr/base/base.h"
#include "mca/rmaps/base/rmaps_base_map.h"
#include "mca/ras/base/ras_base_node.h"
#include "rmaps_rr.h"
/*
 * Local variable
 *
 * File-scope cursor into an opal list; starts out as NULL.
 * NOTE(review): presumably remembers the node list item at which the
 * round-robin mapping should resume on the next call -- the code that
 * reads and updates it is outside this excerpt, so confirm there.
 */
static opal_list_item_t *cur_node_item = NULL;
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
static int claim_slot(orte_rmaps_base_map_t *map,
Quite a range of small changes. ns_replica.c - Removed the error logging since I use this function in orte_init_stage1 to check if we have created a cellid yet or not. ras_types.h & rase_base_node.h - This was an empty file. moved the orte_ras_node_t from base/ras_base_node.h to this file. - Changed the name of orte_ras_base_node_t to orte_ras_node_t to match the naming mechanisms in place. ras.h - Exposed 2 functions: - node_insert: This takes a list of orte_ras_base_node_t's and places them in the Node Segment of the GPR. This is to be used in orte_init_stage1 for singleton processes, and the hostfile parsing (see rds_hostfile.c). This just puts in the appropriate API interface to keep from calling the orte_ras_base_node_insert function directly. - node_query: This is used in hostfile parsing. This just puts in the appropriate API interface to keep from calling the orte_ras_base_node_query function directly. - Touched all of the implemented components to add reference to these new function pointers ras_base_select.c & ras_base_open.c - Add and set the global module reference rds.h - Exposed 1 function: - store_resource: This stores a list of rds_cell_desc_t's to the Resource Segment. This is used in conjunction with the orte_ras.node_insert function in both the orte_init_stage1 for singleton processes and rds_hostfile.c rds_base_select.c & rds_base_open.c - Add and set the global module reference rds_hostfile.c - Added functionality to create a new cellid for each hostfile, placing each entry in the hostfile into the same cellid. Currently this is commented out with the cellid hard coded to 0, with the intention of taking this out once ORTE is able to handle multiple cellid's - Instead of just adding hosts to the Node Segment via a direct call to the ras_base_node_insert() function. 
First add the hosts to the Resource Segment of the GPR using the orte_rds.store_resource() function then use the API version of orte_ras.node_insert() to store the hosts on the Node Segment. - Add 1 new function pointer to module as required by the API. rds_hostfile_component.c - Converted this to use the new MCA parameter registration orte_init_stage1.c - It is possible that a cellid was not created yet for the current environment. So I put in some logic to test if the cellid 0 existed. If it does then continue, otherwise create the cellid so we can properly interact with the GPR via the RDS. - For the singleton case we insert some 'dummy' data into the GPR. The RAS matches this logic, so I took out the duplicate GPR put logic, and replaced it with a call to the orte_ras.node_insert() function. - Further before calling orte_ras.node_insert() in the singleton case, we also call orte_rds.store_resource() to add the singleton node to the Resource Segment. Console: - Added a bunch of new functions. Still experimenting with many aspects of the implementation. This is a checkpoint, and has very limited functionality. - Should not be considered stable at the moment. This commit was SVN r6813.
2005-08-11 19:51:50 +00:00
orte_ras_node_t *current_node,
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
orte_jobid_t jobid, orte_vpid_t vpid, int proc_index)
{
orte_rmaps_base_proc_t *proc;
orte_process_name_t *proc_name;
orte_rmaps_base_node_t *rmaps_node;
int rc;
/* create objects */
rmaps_node = OBJ_NEW(orte_rmaps_base_node_t);
if (NULL == rmaps_node) {
return ORTE_ERR_OUT_OF_RESOURCE;
}
rmaps_node->node_name = strdup(current_node->node_name);
proc = OBJ_NEW(orte_rmaps_base_proc_t);
if (NULL == proc) {
OBJ_RELEASE(rmaps_node);
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* create the process name as an offset from the vpid-start */
rc = orte_ns.create_process_name(&proc_name, current_node->node_cellid,
jobid, vpid);
if (rc != ORTE_SUCCESS) {
OBJ_RELEASE(proc);
OBJ_RELEASE(rmaps_node);
return rc;
}
proc->proc_node = rmaps_node;
proc->proc_name = *proc_name;
proc->proc_rank = vpid;
orte_ns.free_name(&proc_name);
OBJ_RETAIN(proc); /* bump reference count for the node */
opal_list_append(&rmaps_node->node_procs, &proc->super);
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
map->procs[proc_index] = proc;
/* Save this node on the map */
opal_list_append(&map->nodes, &rmaps_node->super);
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
/* Decrease the number of slots available for allocation
on this node */
--current_node->node_slots_alloc;
return ORTE_SUCCESS;
}
/*
* Create a default mapping for the application, scheduling round
* robin by node.
*
* NOTE: This function assumes that the allocator has already set up
* the list of nodes such that the sum of the node_slots_alloc fields
* from all entries will be the total number of processes in all the
* apps.
*/
static int map_app_by_node(
orte_app_context_t* app,
orte_rmaps_base_map_t* map,
orte_jobid_t jobid,
orte_vpid_t vpid_start,
int rank,
opal_list_t* nodes)
{
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
int rc;
size_t num_alloc = 0;
size_t proc_index = 0;
opal_list_item_t *start, *next;
Quite a range of small changes. ns_replica.c - Removed the error logging since I use this function in orte_init_stage1 to check if we have created a cellid yet or not. ras_types.h & rase_base_node.h - This was an empty file. moved the orte_ras_node_t from base/ras_base_node.h to this file. - Changed the name of orte_ras_base_node_t to orte_ras_node_t to match the naming mechanisms in place. ras.h - Exposed 2 functions: - node_insert: This takes a list of orte_ras_base_node_t's and places them in the Node Segment of the GPR. This is to be used in orte_init_stage1 for singleton processes, and the hostfile parsing (see rds_hostfile.c). This just puts in the appropriate API interface to keep from calling the orte_ras_base_node_insert function directly. - node_query: This is used in hostfile parsing. This just puts in the appropriate API interface to keep from calling the orte_ras_base_node_query function directly. - Touched all of the implemented components to add reference to these new function pointers ras_base_select.c & ras_base_open.c - Add and set the global module reference rds.h - Exposed 1 function: - store_resource: This stores a list of rds_cell_desc_t's to the Resource Segment. This is used in conjunction with the orte_ras.node_insert function in both the orte_init_stage1 for singleton processes and rds_hostfile.c rds_base_select.c & rds_base_open.c - Add and set the global module reference rds_hostfile.c - Added functionality to create a new cellid for each hostfile, placing each entry in the hostfile into the same cellid. Currently this is commented out with the cellid hard coded to 0, with the intention of taking this out once ORTE is able to handle multiple cellid's - Instead of just adding hosts to the Node Segment via a direct call to the ras_base_node_insert() function. 
First add the hosts to the Resource Segment of the GPR using the orte_rds.store_resource() function then use the API version of orte_ras.node_insert() to store the hosts on the Node Segment. - Add 1 new function pointer to module as required by the API. rds_hostfile_component.c - Converted this to use the new MCA parameter registration orte_init_stage1.c - It is possible that a cellid was not created yet for the current environment. So I put in some logic to test if the cellid 0 existed. If it does then continue, otherwise create the cellid so we can properly interact with the GPR via the RDS. - For the singleton case we insert some 'dummy' data into the GPR. The RAS matches this logic, so I took out the duplicate GPR put logic, and replaced it with a call to the orte_ras.node_insert() function. - Further before calling orte_ras.node_insert() in the singleton case, we also call orte_rds.store_resource() to add the singleton node to the Resource Segment. Console: - Added a bunch of new functions. Still experimenting with many aspects of the implementation. This is a checkpoint, and has very limited functionality. - Should not be considered stable at the moment. This commit was SVN r6813.
2005-08-11 19:51:50 +00:00
orte_ras_node_t *node;
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
bool did_alloc;
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
/* Note that cur_node_item already points to the right place in
the node list to start looking (i.e., if this is the first time
through, it'll point to the first item. If this is not the
first time through -- i.e., we have multiple app contexts --
it'll point to where we left off last time). If we're at the
end, bounce back to the front (as would happen in the loop
below).
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
But do a bozo check to ensure that we don't have an empty node
list. */
if (0 == opal_list_get_size(nodes)) {
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
return ORTE_ERR_TEMP_OUT_OF_RESOURCE;
} else if (opal_list_get_end(nodes) == cur_node_item) {
cur_node_item = opal_list_get_first(nodes);
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
}
start = cur_node_item;
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
/* This loop continues until all procs have been mapped or we run
out of resources. There are two definitions of "run out of
resources":
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
1. All nodes have node_slots processes mapped to them
2. All nodes have node_slots_max processes mapped to them
We first map until condition #1 is met. If there are still
processes that haven't been mapped yet, then we continue until
condition #2 is met. If we still have processes that haven't
been mapped yet, then it's an "out of resources" error. */
did_alloc = false;
while (num_alloc < app->num_procs) {
Quite a range of small changes. ns_replica.c - Removed the error logging since I use this function in orte_init_stage1 to check if we have created a cellid yet or not. ras_types.h & rase_base_node.h - This was an empty file. moved the orte_ras_node_t from base/ras_base_node.h to this file. - Changed the name of orte_ras_base_node_t to orte_ras_node_t to match the naming mechanisms in place. ras.h - Exposed 2 functions: - node_insert: This takes a list of orte_ras_base_node_t's and places them in the Node Segment of the GPR. This is to be used in orte_init_stage1 for singleton processes, and the hostfile parsing (see rds_hostfile.c). This just puts in the appropriate API interface to keep from calling the orte_ras_base_node_insert function directly. - node_query: This is used in hostfile parsing. This just puts in the appropriate API interface to keep from calling the orte_ras_base_node_query function directly. - Touched all of the implemented components to add reference to these new function pointers ras_base_select.c & ras_base_open.c - Add and set the global module reference rds.h - Exposed 1 function: - store_resource: This stores a list of rds_cell_desc_t's to the Resource Segment. This is used in conjunction with the orte_ras.node_insert function in both the orte_init_stage1 for singleton processes and rds_hostfile.c rds_base_select.c & rds_base_open.c - Add and set the global module reference rds_hostfile.c - Added functionality to create a new cellid for each hostfile, placing each entry in the hostfile into the same cellid. Currently this is commented out with the cellid hard coded to 0, with the intention of taking this out once ORTE is able to handle multiple cellid's - Instead of just adding hosts to the Node Segment via a direct call to the ras_base_node_insert() function. 
First add the hosts to the Resource Segment of the GPR using the orte_rds.store_resource() function then use the API version of orte_ras.node_insert() to store the hosts on the Node Segment. - Add 1 new function pointer to module as required by the API. rds_hostfile_component.c - Converted this to use the new MCA parameter registration orte_init_stage1.c - It is possible that a cellid was not created yet for the current environment. So I put in some logic to test if the cellid 0 existed. If it does then continue, otherwise create the cellid so we can properly interact with the GPR via the RDS. - For the singleton case we insert some 'dummy' data into the GPR. The RAS matches this logic, so I took out the duplicate GPR put logic, and replaced it with a call to the orte_ras.node_insert() function. - Further before calling orte_ras.node_insert() in the singleton case, we also call orte_rds.store_resource() to add the singleton node to the Resource Segment. Console: - Added a bunch of new functions. Still experimenting with many aspects of the implementation. This is a checkpoint, and has very limited functionality. - Should not be considered stable at the moment. This commit was SVN r6813.
2005-08-11 19:51:50 +00:00
node = (orte_ras_node_t*) cur_node_item;
next = opal_list_get_next(cur_node_item);
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
/* If we have an available slot on this node, claim it */
if (node->node_slots_alloc > 0) {
fflush(stdout);
rc = claim_slot(map, node, jobid, vpid_start + rank, proc_index);
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
if (ORTE_SUCCESS != rc) {
return rc;
}
if (node->node_slots_alloc == 0) {
opal_list_remove_item(nodes, (opal_list_item_t*)node);
OBJ_RELEASE(node);
}
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
++rank;
++proc_index;
/* Save the fact that we successfully allocated a process
to a node in this round */
did_alloc = true;
/* Increase the number of procs allocated and see if we're
done */
++num_alloc;
}
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
/* Move on to the next node */
cur_node_item = next;
if (opal_list_get_end(nodes) == cur_node_item) {
cur_node_item = opal_list_get_first(nodes);
}
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
/* Are we done? */
if (num_alloc == app->num_procs) {
break;
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
}
/* Double check that the list is not empty */
if (opal_list_get_end(nodes) == cur_node_item) {
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
return ORTE_ERR_TEMP_OUT_OF_RESOURCE;
}
/* If we looped around without allocating any new processes,
then we're full */
if (start == cur_node_item) {
if (!did_alloc) {
return ORTE_ERR_TEMP_OUT_OF_RESOURCE;
}
}
}
map->num_procs = num_alloc;
return ORTE_SUCCESS;
}
/*
* Create a default mapping for the application, scheduling processes
* round-robin by slot.
*
* NOTE: This function assumes that the allocator has already set up
* the list of nodes such that the sum of the node_slots_alloc fields
* across all entries equals the total number of processes in all the
* apps.
*/
static int map_app_by_slot(
orte_app_context_t* app,
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
orte_rmaps_base_map_t* map,
orte_jobid_t jobid,
orte_vpid_t vpid_start,
int rank,
opal_list_t* nodes)
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
{
int rc;
size_t num_alloc = 0;
size_t proc_index = 0;
opal_list_item_t *next;
Quite a range of small changes. ns_replica.c - Removed the error logging since I use this function in orte_init_stage1 to check if we have created a cellid yet or not. ras_types.h & rase_base_node.h - This was an empty file. moved the orte_ras_node_t from base/ras_base_node.h to this file. - Changed the name of orte_ras_base_node_t to orte_ras_node_t to match the naming mechanisms in place. ras.h - Exposed 2 functions: - node_insert: This takes a list of orte_ras_base_node_t's and places them in the Node Segment of the GPR. This is to be used in orte_init_stage1 for singleton processes, and the hostfile parsing (see rds_hostfile.c). This just puts in the appropriate API interface to keep from calling the orte_ras_base_node_insert function directly. - node_query: This is used in hostfile parsing. This just puts in the appropriate API interface to keep from calling the orte_ras_base_node_query function directly. - Touched all of the implemented components to add reference to these new function pointers ras_base_select.c & ras_base_open.c - Add and set the global module reference rds.h - Exposed 1 function: - store_resource: This stores a list of rds_cell_desc_t's to the Resource Segment. This is used in conjunction with the orte_ras.node_insert function in both the orte_init_stage1 for singleton processes and rds_hostfile.c rds_base_select.c & rds_base_open.c - Add and set the global module reference rds_hostfile.c - Added functionality to create a new cellid for each hostfile, placing each entry in the hostfile into the same cellid. Currently this is commented out with the cellid hard coded to 0, with the intention of taking this out once ORTE is able to handle multiple cellid's - Instead of just adding hosts to the Node Segment via a direct call to the ras_base_node_insert() function. 
First add the hosts to the Resource Segment of the GPR using the orte_rds.store_resource() function then use the API version of orte_ras.node_insert() to store the hosts on the Node Segment. - Add 1 new function pointer to module as required by the API. rds_hostfile_component.c - Converted this to use the new MCA parameter registration orte_init_stage1.c - It is possible that a cellid was not created yet for the current environment. So I put in some logic to test if the cellid 0 existed. If it does then continue, otherwise create the cellid so we can properly interact with the GPR via the RDS. - For the singleton case we insert some 'dummy' data into the GPR. The RAS matches this logic, so I took out the duplicate GPR put logic, and replaced it with a call to the orte_ras.node_insert() function. - Further before calling orte_ras.node_insert() in the singleton case, we also call orte_rds.store_resource() to add the singleton node to the Resource Segment. Console: - Added a bunch of new functions. Still experimenting with many aspects of the implementation. This is a checkpoint, and has very limited functionality. - Should not be considered stable at the moment. This commit was SVN r6813.
2005-08-11 19:51:50 +00:00
orte_ras_node_t *node;
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
/* Note that cur_node_item already points to the right place in
the node list to start looking: if this is the first time
through, it'll point to the first item; if this is not the
first time through -- i.e., we have multiple app contexts --
it'll point to where we left off last time. If we're at the
end, bounce back to the front (as would happen in the loop
below).
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
But do a bozo check to ensure that we don't have an empty node
list. */
if (0 == opal_list_get_size(nodes)) {
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
return ORTE_ERR_TEMP_OUT_OF_RESOURCE;
} else if (opal_list_get_end(nodes) == cur_node_item) {
cur_node_item = opal_list_get_first(nodes);
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
}
/* Go through all nodes and take up to node_slots_alloc slots and
map it to this job */
while (opal_list_get_end(nodes) != cur_node_item &&
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
num_alloc < app->num_procs) {
Quite a range of small changes. ns_replica.c - Removed the error logging since I use this function in orte_init_stage1 to check if we have created a cellid yet or not. ras_types.h & rase_base_node.h - This was an empty file. moved the orte_ras_node_t from base/ras_base_node.h to this file. - Changed the name of orte_ras_base_node_t to orte_ras_node_t to match the naming mechanisms in place. ras.h - Exposed 2 functions: - node_insert: This takes a list of orte_ras_base_node_t's and places them in the Node Segment of the GPR. This is to be used in orte_init_stage1 for singleton processes, and the hostfile parsing (see rds_hostfile.c). This just puts in the appropriate API interface to keep from calling the orte_ras_base_node_insert function directly. - node_query: This is used in hostfile parsing. This just puts in the appropriate API interface to keep from calling the orte_ras_base_node_query function directly. - Touched all of the implemented components to add reference to these new function pointers ras_base_select.c & ras_base_open.c - Add and set the global module reference rds.h - Exposed 1 function: - store_resource: This stores a list of rds_cell_desc_t's to the Resource Segment. This is used in conjunction with the orte_ras.node_insert function in both the orte_init_stage1 for singleton processes and rds_hostfile.c rds_base_select.c & rds_base_open.c - Add and set the global module reference rds_hostfile.c - Added functionality to create a new cellid for each hostfile, placing each entry in the hostfile into the same cellid. Currently this is commented out with the cellid hard coded to 0, with the intention of taking this out once ORTE is able to handle multiple cellid's - Instead of just adding hosts to the Node Segment via a direct call to the ras_base_node_insert() function. 
First add the hosts to the Resource Segment of the GPR using the orte_rds.store_resource() function then use the API version of orte_ras.node_insert() to store the hosts on the Node Segment. - Add 1 new function pointer to module as required by the API. rds_hostfile_component.c - Converted this to use the new MCA parameter registration orte_init_stage1.c - It is possible that a cellid was not created yet for the current environment. So I put in some logic to test if the cellid 0 existed. If it does then continue, otherwise create the cellid so we can properly interact with the GPR via the RDS. - For the singleton case we insert some 'dummy' data into the GPR. The RAS matches this logic, so I took out the duplicate GPR put logic, and replaced it with a call to the orte_ras.node_insert() function. - Further before calling orte_ras.node_insert() in the singleton case, we also call orte_rds.store_resource() to add the singleton node to the Resource Segment. Console: - Added a bunch of new functions. Still experimenting with many aspects of the implementation. This is a checkpoint, and has very limited functionality. - Should not be considered stable at the moment. This commit was SVN r6813.
2005-08-11 19:51:50 +00:00
node = (orte_ras_node_t*) cur_node_item;
next = opal_list_get_next(cur_node_item);
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
/* If we have available slots on this node, claim it */
while (node->node_slots_alloc > 0 &&
num_alloc < app->num_procs) {
fflush(stdout);
rc = claim_slot(map, node, jobid, vpid_start + rank, proc_index);
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
if (ORTE_SUCCESS != rc) {
return rc;
}
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
++rank;
++proc_index;
/* Increase the number of procs allocated and see if we're
done */
++num_alloc;
}
if (node->node_slots_alloc == 0) {
opal_list_remove_item(nodes, (opal_list_item_t*)node);
OBJ_RELEASE(node);
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
/* Move on to the next node since we have allocated all of
this node's slots */
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
cur_node_item = next;
}
}
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
/* Did we allocate everything? */
if (num_alloc < app->num_procs) {
return ORTE_ERR_OUT_OF_RESOURCE;
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
}
map->num_procs = num_alloc;
return ORTE_SUCCESS;
}
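/*
* Create a default mapping for the job. The placement policy is taken
* from the component's schedule_policy parameter: "node" selects by-node
* mapping; any other value selects by-slot mapping.
*/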
static int orte_rmaps_rr_map(orte_jobid_t jobid)
{
orte_app_context_t** context;
size_t i, num_context;
opal_list_t nodes;
opal_list_t mapping;
opal_list_item_t* item;
orte_vpid_t vpid_start;
size_t num_procs = 0;
int rank = 0;
int rc = ORTE_SUCCESS;
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
bool bynode = true;
/* query for the application context and allocated nodes */
if(ORTE_SUCCESS != (rc = orte_rmgr_base_get_app_context(jobid, &context, &num_context))) {
return rc;
}
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
/* which policy should we use? */
if (0 == strcmp(mca_rmaps_round_robin_component.schedule_policy, "node")) {
bynode = true;
} else {
bynode = false;
}
/* total number of procs required */
for(i=0; i<num_context; i++) {
orte_app_context_t* app = context[i];
num_procs += app->num_procs;
}
/* allocate a vpid range for the job */
if(ORTE_SUCCESS != (rc = orte_ns.reserve_range(jobid, num_procs, &vpid_start))) {
return rc;
}
/* query for all nodes allocated to this job */
OBJ_CONSTRUCT(&nodes, opal_list_t);
if(ORTE_SUCCESS != (rc = orte_ras_base_node_query_alloc(&nodes, jobid))) {
OBJ_DESTRUCT(&nodes);
return rc;
}
/* construct a default mapping */
OBJ_CONSTRUCT(&mapping, opal_list_t);
cur_node_item = opal_list_get_first(&nodes);
for(i=0; i<num_context; i++) {
orte_app_context_t* app = context[i];
orte_rmaps_base_map_t* map = OBJ_NEW(orte_rmaps_base_map_t);
if(NULL == map) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
opal_list_append(&mapping, &map->super);
map->app = app;
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
map->procs = malloc(sizeof(orte_rmaps_base_proc_t*) * app->num_procs);
if(NULL == map->procs) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
(copied from a mail that has a lengthy description of this commit) I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be. I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs. The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles. I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs). There's still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time. My patches below are for what I considered "essential", and do the following: - honor the "slots" and "max-slots" tokens in the hostfile (and all their synonyms), meaning that we allocate/map until we fill slots, and if there are still more processes to allocate/map, we keep going until we fill max-slots (i.e., only oversubscribe a node if we have to). - offer two different algorithms, currently supported by two new options to orterun. Remember that there are two parts here -- slot allocation and process mapping. Slot allocation controls how many processes we'll be running on a node. After that decision has been made, process mapping effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed. Some of the examples given below don't make sense unless you remember that there is a difference between the two (which makes total sense, but you have to think about it in terms of both things): 1. "-bynode": allocates/maps one process per node in a round-robin fashion until all slots on the node are taken. If we still have more processes after all slots are taken, then keep going until all max-slots are taken. 
Examples: - The hostfile: eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -bynode -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 2 vogon: MCW ranks 1, 3, 4, 5 - orterun -bynode -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4 vogon: MCW ranks 1, 3, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until each node's max_slots is hit, of course) - orterun -bynode -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 2, 4, 6 vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11 2. "-byslot" (this is the default if you don't specify -bynode): greedily takes all available slots on a node for a job before moving on to the next node. If we still have processes to allocate/schedule, then oversubscribe all nodes equally (i.e., go round robin on all nodes until each node's max_slots is hit). Examples: - The hostfile eddie slots=2 max-slots=4 vogon slots=4 max-slots=8 - orterun -np 6 -hostfile hostfile a.out eddie: MCW ranks 0, 1 vogon: MCW ranks 2, 3, 4, 5 - orterun -np 8 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2 vogon: MCW ranks 3, 4, 5, 6, 7 -> the algorithm oversubscribes all nodes "equally" (until max_slots is hit) - orterun -np 12 -hostfile hostfile a.out eddie: MCW ranks 0, 1, 2, 3 vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11 The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious). Consider the following allocation example: - The hostfile eddie count=4 vogon count=4 earth count=4 deep-thought count=4 - orterun -np 8 -hostfile hostfile a.out eddie: 4 slots will be allocated vogon: 4 slots will be allocated earth: no slots allocated deep-thought: no slots allocated - orterun -bynode -np 8 -hostfile hostfile a.out eddie: 2 slots will be allocated vogon: 2 slots will be allocated earth: 2 slots will be allocated deep-thought: 2 slots will be allocated This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
if (bynode) {
rc = map_app_by_node(app, map, jobid, vpid_start, rank, &nodes);
} else {
rc = map_app_by_slot(app, map, jobid, vpid_start, rank, &nodes);
}
if (ORTE_SUCCESS != rc) {
goto cleanup;
}
rank += app->num_procs;
}
/* save mapping to the registry */
if(ORTE_SUCCESS != (rc = orte_rmaps_base_set_map(jobid, &mapping))) {
goto cleanup;
}
/* save vpid start/range on the job segment */
rc = orte_rmaps_base_set_vpid_range(jobid,vpid_start,num_procs);
cleanup:
while(NULL != (item = opal_list_remove_first(&nodes))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&nodes);
while(NULL != (item = opal_list_remove_first(&mapping))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&mapping);
return rc;
}
/*
 * Finalize function of the round-robin mapper.
 *
 * Takes no arguments and performs no work of its own; it simply reports
 * success to the caller.
 *
 * Returns: ORTE_SUCCESS, unconditionally.
 */
static int orte_rmaps_rr_finalize(void)
{
    const int status = ORTE_SUCCESS;

    return status;
}
/*
 * Round-robin RMAPS module instance.
 *
 * The initializer is positional: it supplies orte_rmaps_rr_map first and
 * orte_rmaps_rr_finalize second, so which member each one fills is fixed
 * by the field order in the declaration of orte_rmaps_base_module_t.
 * Keep the order here in sync with that declaration.
 */
orte_rmaps_base_module_t orte_rmaps_round_robin_module = {
orte_rmaps_rr_map,       /* mapping entry point */
orte_rmaps_rr_finalize   /* finalize entry point; always returns ORTE_SUCCESS */
};