openmpi/orte/mca/ras/base/ras_base_alloc.c

/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University.
 *                         All rights reserved.
 * Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
 *                         All rights reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 * 
 * Additional copyrights may follow
 * 
 * $HEADER$
 */


#include "orte_config.h"
#include "include/orte_constants.h"

#include "mca/mca.h"
#include "mca/base/base.h"
#include "mca/ras/base/base.h"
#include "mca/ras/base/ras_base_node.h"
#include "mca/rmgr/base/base.h"
#include "mca/errmgr/errmgr.h"


/*
 * Allocate one process per node on a round-robin basis, looping back
 * around to the beginning as necessary
 */
int orte_ras_base_allocate_nodes_by_node(orte_jobid_t jobid, 
                                         ompi_list_t* nodes)
{
    ompi_list_t allocated;
    ompi_list_item_t* item;
    size_t num_requested = 0;
    size_t num_allocated = 0;
    size_t num_constrained = 0;
    size_t slots;
    bool oversubscribe = false;
    int rc;

    /* query for the number of process slots required */
    if (ORTE_SUCCESS != 
       (rc = orte_rmgr_base_get_job_slots(jobid, &num_requested))) {
        return rc;
    }

    OBJ_CONSTRUCT(&allocated, ompi_list_t);
    num_allocated = 0;

    /* This loop continues until all procs have been allocated or we run
       out of resources.  There are two definitions of "run out of
       resources":

       1. All nodes have node_slots processes allocated to them
       2. All nodes have node_slots_max processes allocated to them

       We first map until condition #1 is met.  If there are still
       processes that haven't been allocated yet, then we continue
       until condition #2 is met.  If we still have processes that
       haven't been allocated yet, then it's an "out of resources"
       error. */
    while (num_allocated < num_requested) {
        num_constrained = 0;

        /* loop over all nodes until either all processes are
           allocated or they all become constrained */
        for (item = ompi_list_get_first(nodes);
             item != ompi_list_get_end(nodes) && num_allocated < num_requested;
             item = ompi_list_get_next(item)) {
            orte_ras_base_node_t* node = (orte_ras_base_node_t*)item;
            
            /* are any slots available? */
            slots = (oversubscribe ? node->node_slots_max : node->node_slots);
            if (node->node_slots_inuse < slots ||
                (oversubscribe && 0 == slots)) {
                ++num_allocated;
                ++node->node_slots_inuse; /* running total */
                ++node->node_slots_alloc; /* this job */
            } else {
                ++num_constrained;
            }
        }

        /* if all nodes are constrained:
           - if this is the first time through the loop, then set
             "oversubscribe" to true, and we'll now start obeying
             node_slots_max instead of node_slots
           - if this is the second time through the loop, then all
             nodes are full to the max, and therefore we can't do
             anything more -- we're out of resources */
        if (ompi_list_get_size(nodes) == num_constrained) {
            if (oversubscribe) {
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            } else {
                oversubscribe = true;
            }
        }
    }

    /* move all nodes w/ allocations to the allocated list */
    item = ompi_list_get_first(nodes);
    while(item != ompi_list_get_end(nodes)) {
        orte_ras_base_node_t* node = (orte_ras_base_node_t*)item;
        ompi_list_item_t* next = ompi_list_get_next(item);
        if(node->node_slots_alloc) {
            ompi_list_remove_item(nodes, item);
            ompi_list_append(&allocated, item);
        }
        item = next;
    }

    rc = orte_ras_base_node_assign(&allocated, jobid);
    if(ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }
cleanup:

    while(NULL != (item = ompi_list_remove_first(&allocated))) 
        ompi_list_append(nodes, item);
    OBJ_DESTRUCT(&allocated);
    return rc;
}


/*
 * Allocate processes to nodes, using all available slots on a node.
 */
int orte_ras_base_allocate_nodes_by_slot(orte_jobid_t jobid, 
                                         ompi_list_t* nodes)
{
    ompi_list_t allocated;
    ompi_list_item_t* item;
    size_t num_requested = 0;
    size_t num_allocated = 0;
    size_t num_constrained = 0;
    size_t available;
    int rc;

    /* query for the number of process slots required */
    if (ORTE_SUCCESS != 
       (rc = orte_rmgr_base_get_job_slots(jobid, &num_requested))) {
        return rc;
    }

    OBJ_CONSTRUCT(&allocated, ompi_list_t);
    num_allocated = 0;

    /* In the first pass, just grab all available slots (i.e., stay <=
       node_slots) greedily off each node */
    for (item = ompi_list_get_first(nodes);
         item != ompi_list_get_end(nodes) && num_allocated < num_requested;
         item = ompi_list_get_next(item)) {
        orte_ras_base_node_t* node = (orte_ras_base_node_t*)item;
        
        /* are any slots available? */
        if (node->node_slots_inuse < node->node_slots) {
            available = node->node_slots - node->node_slots_inuse;
            if (num_requested - num_allocated < available) {
                node->node_slots_inuse +=
                    (num_requested - num_allocated); /* running total */
                node->node_slots_alloc +=
                    (num_requested - num_allocated); /* this job */
                num_allocated = num_requested;
            } else {
                num_allocated += available;
                node->node_slots_inuse += available; /* running total */
                node->node_slots_alloc += available; /* this job */
            }
        }
    }
    
    /* If we're not done, then we're in an oversubscribing situation.
       Switch to a round-robin-by-node policy -- take one slot from
       each node until we hit node_slots_max or we have no more
       resources; whichever occurs first. */
    while (num_allocated < num_requested) {
        num_constrained = 0;

        /* loop over all nodes until either all processes are
           allocated or they all become constrained */
        for (item = ompi_list_get_first(nodes);
             item != ompi_list_get_end(nodes) && num_allocated < num_requested;
             item = ompi_list_get_next(item)) {
            orte_ras_base_node_t* node = (orte_ras_base_node_t*)item;
            
            /* are any slots available? */
            if (node->node_slots_inuse < node->node_slots_max ||
                0 == node->node_slots_max) {
                ++num_allocated;
                ++node->node_slots_inuse; /* running total */
                ++node->node_slots_alloc; /* this job */
            } else {
                ++num_constrained;
            }
        }

        /* if all nodes are constrained, then we're out of resources
           -- thanks for playing */
        if (ompi_list_get_size(nodes) == num_constrained) {
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
    }

    /* move all nodes w/ allocations to the allocated list */
    item = ompi_list_get_first(nodes);
    while(item != ompi_list_get_end(nodes)) {
        orte_ras_base_node_t* node = (orte_ras_base_node_t*)item;
        ompi_list_item_t* next = ompi_list_get_next(item);
        if(node->node_slots_alloc) {
            ompi_list_remove_item(nodes, item);
            ompi_list_append(&allocated, item);
        }
        item = next;
    }

    rc = orte_ras_base_node_assign(&allocated, jobid);
    if(ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }

cleanup:

    while(NULL != (item = ompi_list_remove_first(&allocated))) 
        ompi_list_append(nodes, item);
    OBJ_DESTRUCT(&allocated);
    return rc;
}