diff --git a/orte/include/orte/orte_constants.h b/orte/include/orte/orte_constants.h index 38c941523b..670f7980fb 100644 --- a/orte/include/orte/orte_constants.h +++ b/orte/include/orte/orte_constants.h @@ -85,7 +85,8 @@ enum { ORTE_ERR_OPERATION_UNSUPPORTED = (ORTE_ERR_BASE - 22), ORTE_ERR_PROC_STATE_MISSING = (ORTE_ERR_BASE - 23), ORTE_ERR_PROC_EXIT_STATUS_MISSING = (ORTE_ERR_BASE - 24), - ORTE_ERR_INDETERMINATE_STATE_INFO = (ORTE_ERR_BASE - 25) + ORTE_ERR_INDETERMINATE_STATE_INFO = (ORTE_ERR_BASE - 25), + ORTE_ERR_NODE_FULLY_USED = (ORTE_ERR_BASE - 26) }; #define ORTE_ERR_MAX (ORTE_ERR_BASE - 100) diff --git a/orte/include/orte/orte_types.h b/orte/include/orte/orte_types.h index f911e0db02..8bbfac76bf 100644 --- a/orte/include/orte/orte_types.h +++ b/orte/include/orte/orte_types.h @@ -89,8 +89,9 @@ typedef uint8_t orte_data_type_t ; /* Resource Manager types */ #define ORTE_APP_CONTEXT (orte_data_type_t) 42 /**< argv and enviro arrays */ #define ORTE_APP_CONTEXT_MAP (orte_data_type_t) 43 /**< application context mapping array */ +#define ORTE_RAS_NODE (orte_data_type_t) 44 /**< node information */ /* DAEMON communication type */ -#define ORTE_DAEMON_CMD (orte_data_type_t) 44 /**< command flag for communicating with the daemon */ +#define ORTE_DAEMON_CMD (orte_data_type_t) 45 /**< command flag for communicating with the daemon */ /* define the starting point for dynamically assigning data types */ #define ORTE_DSS_ID_DYNAMIC 60 diff --git a/orte/mca/ras/base/Makefile.am b/orte/mca/ras/base/Makefile.am index 62793a0402..b660bd627f 100644 --- a/orte/mca/ras/base/Makefile.am +++ b/orte/mca/ras/base/Makefile.am @@ -27,4 +27,11 @@ libmca_ras_la_SOURCES += \ base/ras_base_find_available.c \ base/ras_base_node.h \ base/ras_base_node.c \ - base/ras_base_open.c + base/ras_base_open.c \ + base/data_type_support/ras_data_type_compare_fns.c \ + base/data_type_support/ras_data_type_copy_fns.c \ + base/data_type_support/ras_data_type_packing_fns.c \ + base/data_type_support/ras_data_type_print_fns.c \ + base/data_type_support/ras_data_type_release_fns.c \ + base/data_type_support/ras_data_type_size_fns.c \ + base/data_type_support/ras_data_type_unpacking_fns.c diff --git a/orte/mca/ras/base/base.h b/orte/mca/ras/base/base.h index 601dd74496..8e1e5f0601 100644 --- a/orte/mca/ras/base/base.h +++ b/orte/mca/ras/base/base.h @@ -26,7 +26,11 @@ */ #include "orte_config.h" #include "orte/orte_constants.h" +#include "orte/orte_types.h" + #include "opal/class/opal_list.h" + +#include "orte/dss/dss_types.h" #include "orte/mca/ras/ras.h" @@ -87,6 +91,18 @@ ORTE_DECLSPEC extern orte_ras_base_t orte_ras_base; OMPI_DECLSPEC OBJ_CLASS_DECLARATION(orte_ras_base_cmp_t); +/** Local data type functions */ +int orte_ras_base_copy_node(orte_ras_node_t **dest, orte_ras_node_t *src, orte_data_type_t type); +int orte_ras_base_compare_node(orte_ras_node_t *value1, orte_ras_node_t *value2, orte_data_type_t type); +int orte_ras_base_pack_node(orte_buffer_t *buffer, void *src, + size_t num_vals, orte_data_type_t type); +int orte_ras_base_print_node(char **output, char *prefix, orte_ras_node_t *src, orte_data_type_t type); +void orte_ras_base_std_obj_release(orte_data_value_t *value); +int orte_ras_base_size_node(size_t *size, orte_ras_node_t *src, orte_data_type_t type); +int orte_ras_base_unpack_node(orte_buffer_t *buffer, void *dest, + size_t *num_vals, orte_data_type_t type); + + /* * external API functions will be documented in the mca/ns/ns.h file */ diff --git 
a/orte/mca/ras/base/data_type_support/ras_data_type_compare_fns.c b/orte/mca/ras/base/data_type_support/ras_data_type_compare_fns.c new file mode 100644 index 0000000000..bd81a83bc0 --- /dev/null +++ b/orte/mca/ras/base/data_type_support/ras_data_type_compare_fns.c @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" + +#include <sys/types.h> +#if HAVE_NETINET_IN_H +#include <netinet/in.h> +#endif + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/dss/dss_internal.h" + +#include "orte/mca/ras/base/base.h" + +/** + * RAS NODE + */ +int orte_ras_base_compare_node(orte_ras_node_t *value1, orte_ras_node_t *value2, orte_data_type_t type) +{ + int test; + + if (value1->node_cellid > value2->node_cellid) return ORTE_VALUE1_GREATER; + if (value2->node_cellid > value1->node_cellid) return ORTE_VALUE2_GREATER; + + /** same cell - check node names */ + test = strcmp(value1->node_name, value2->node_name); + if (0 == test) return ORTE_EQUAL; + if (0 < test) return ORTE_VALUE2_GREATER; + + return ORTE_VALUE1_GREATER; +} diff --git a/orte/mca/ras/base/data_type_support/ras_data_type_copy_fns.c b/orte/mca/ras/base/data_type_support/ras_data_type_copy_fns.c new file mode 100644 index 0000000000..d7db293cd7 --- /dev/null +++ b/orte/mca/ras/base/data_type_support/ras_data_type_copy_fns.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" + +#include <sys/types.h> +#if HAVE_NETINET_IN_H +#include <netinet/in.h> +#endif + +#include "opal/util/argv.h" +#include "opal/class/opal_list.h" + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/dss/dss_internal.h" + +#include "orte/mca/ras/base/base.h" + +/** + * RAS NODE + */ +int orte_ras_base_copy_node(orte_ras_node_t **dest, orte_ras_node_t *src, orte_data_type_t type) +{ + /* create the new object */ + *dest = OBJ_NEW(orte_ras_node_t); + if (NULL == *dest) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + /* copy data into it */ + if (NULL != src->node_name) (*dest)->node_name = strdup(src->node_name); + if (NULL != src->node_arch) (*dest)->node_arch = strdup(src->node_arch); + (*dest)->node_cellid = src->node_cellid; + (*dest)->node_state = src->node_state; + (*dest)->node_slots = src->node_slots; + (*dest)->node_slots_inuse = src->node_slots_inuse; + (*dest)->node_slots_alloc = src->node_slots_alloc; + (*dest)->node_slots_max = src->node_slots_max; + if (NULL != src->node_username) (*dest)->node_username = strdup(src->node_username); + (*dest)->node_launched = src->node_launched; + + return ORTE_SUCCESS; +} + diff --git a/orte/mca/ras/base/data_type_support/ras_data_type_packing_fns.c b/orte/mca/ras/base/data_type_support/ras_data_type_packing_fns.c new file mode 100644 index 0000000000..ea0c4b55fb --- /dev/null +++ b/orte/mca/ras/base/data_type_support/ras_data_type_packing_fns.c @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" + +#include <sys/types.h> +#if HAVE_NETINET_IN_H +#include <netinet/in.h> +#endif + +#include "opal/util/argv.h" + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/dss/dss_internal.h" + +#include "orte/mca/ras/base/base.h" + +/* + * RAS NODE + */ +int orte_ras_base_pack_node(orte_buffer_t *buffer, void *src, + size_t num_vals, orte_data_type_t type) +{ + int rc; + size_t i; + orte_ras_node_t **nodes; + + /* array of pointers to orte_ras_node_t objects - need to pack the objects a set of fields at a time */ + nodes = (orte_ras_node_t**) src; + + for (i=0; i < num_vals; i++) { + /* pack the node name */ + if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, + (void*)(&(nodes[i]->node_name)), 1, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* pack the arch */ + if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, + (void*)(&(nodes[i]->node_arch)), 1, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* pack the cellid */ + if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, + (void*)(&(nodes[i]->node_cellid)), 1, ORTE_CELLID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* pack the state */ + if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, + (void*)(&(nodes[i]->node_state)), 1, ORTE_NODE_STATE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* pack the number of slots */ + if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, + (void*)(&(nodes[i]->node_slots)), 1, ORTE_SIZE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* pack the number of slots in use */ + if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, + (void*)(&(nodes[i]->node_slots_inuse)), 1, ORTE_SIZE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* pack the number of slots allocated */ + if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, + (void*)(&(nodes[i]->node_slots_alloc)), 1, ORTE_SIZE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* pack the max number of slots */ + if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, + (void*)(&(nodes[i]->node_slots_max)), 1, ORTE_SIZE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* pack the username */ + if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, + (void*)(&(nodes[i]->node_username)), 1, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* pack the node launched flag */ + if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, + (void*)(&(nodes[i]->node_launched)), 1, ORTE_INT))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + + return ORTE_SUCCESS; +} diff --git a/orte/mca/ras/base/data_type_support/ras_data_type_print_fns.c b/orte/mca/ras/base/data_type_support/ras_data_type_print_fns.c new file mode 100644 index 0000000000..57dc1e7299 --- /dev/null +++ b/orte/mca/ras/base/data_type_support/ras_data_type_print_fns.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" + +#include <sys/types.h> +#if HAVE_NETINET_IN_H +#include <netinet/in.h> +#endif + +#include "opal/util/argv.h" +#include "opal/class/opal_list.h" + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/dss/dss_internal.h" + +#include "orte/mca/ras/base/base.h" + +/* + * RAS NODE + */ +int orte_ras_base_print_node(char **output, char *prefix, orte_ras_node_t *src, orte_data_type_t type) +{ + char *tmp, *tmp2, *pfx2; + size_t j; + + /* set default result */ + *output = NULL; + + /* protect against NULL prefix */ + if (NULL == prefix) { + asprintf(&pfx2, " "); + } else { + asprintf(&pfx2, "%s", prefix); + } + + asprintf(&tmp, "%sData for node: cellid: %lu\tName: %s", + pfx2, (unsigned long)src->node_cellid, src->node_name); + + asprintf(&tmp2, "%s\n%s\tArch: %s\tState: %lu", tmp, pfx2, + src->node_arch, (unsigned long)src->node_state); + free(tmp); + tmp = tmp2; + + asprintf(&tmp2, "%s\n%s\tNum slots: %lu\tSlots in use: %lu", tmp, pfx2, + (unsigned long)src->node_slots, (unsigned long)src->node_slots_inuse); + free(tmp); + tmp = tmp2; + + asprintf(&tmp2, "%s\n%s\tNum slots allocated: %lu\tMax slots: %lu", tmp, pfx2, + (unsigned long)src->node_slots_alloc, (unsigned long)src->node_slots_max); + free(tmp); + tmp = tmp2; + + asprintf(&tmp2, "%s\n%s\tUsername on node: %s\tLaunched?: %lu", tmp, pfx2, + src->node_username, (unsigned long)src->node_launched); + free(tmp); + tmp = tmp2; + + /* set the return */ + *output = tmp; + + free(pfx2); + return ORTE_SUCCESS; +} + diff --git a/orte/mca/ras/base/data_type_support/ras_data_type_release_fns.c b/orte/mca/ras/base/data_type_support/ras_data_type_release_fns.c new file mode 100644 index 0000000000..e0c9ba072a --- /dev/null +++ b/orte/mca/ras/base/data_type_support/ras_data_type_release_fns.c @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" + +#include <sys/types.h> +#if HAVE_NETINET_IN_H +#include <netinet/in.h> +#endif + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/dss/dss_internal.h" + +#include "orte/mca/ras/base/base.h" + +/* + * STANDARD OBJECT RELEASE + */ +void orte_ras_base_std_obj_release(orte_data_value_t *value) +{ + OBJ_RELEASE(value->data); +} + diff --git a/orte/mca/ras/base/data_type_support/ras_data_type_size_fns.c b/orte/mca/ras/base/data_type_support/ras_data_type_size_fns.c new file mode 100644 index 0000000000..cd29272d7d --- /dev/null +++ b/orte/mca/ras/base/data_type_support/ras_data_type_size_fns.c @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" + +#include <sys/types.h> +#if HAVE_NETINET_IN_H +#include <netinet/in.h> +#endif + +#include "opal/util/argv.h" +#include "opal/class/opal_list.h" + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/dss/dss_internal.h" + +#include "orte/mca/ras/base/base.h" + +/* + * RAS NODE + */ +int orte_ras_base_size_node(size_t *size, orte_ras_node_t *src, orte_data_type_t type) +{ + /* account for the object itself */ + *size = sizeof(orte_ras_node_t); + + /* if src is NULL, then that's all we wanted */ + if (NULL == src) return ORTE_SUCCESS; + + if (NULL != src->node_name) { + *size += strlen(src->node_name); + } + + if (NULL != src->node_arch) { + *size += strlen(src->node_arch); + } + + if (NULL != src->node_username) { + *size += strlen(src->node_username); + } + + return ORTE_SUCCESS; +} diff --git a/orte/mca/ras/base/data_type_support/ras_data_type_unpacking_fns.c b/orte/mca/ras/base/data_type_support/ras_data_type_unpacking_fns.c new file mode 100644 index 0000000000..6b87ef5bae --- /dev/null +++ b/orte/mca/ras/base/data_type_support/ras_data_type_unpacking_fns.c @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "orte/orte_constants.h" +#include "orte/orte_types.h" + +#include <sys/types.h> +#if HAVE_NETINET_IN_H +#include <netinet/in.h> +#endif + +#include "opal/class/opal_list.h" + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/dss/dss_internal.h" + +#include "orte/mca/ras/base/base.h" + +/* + * RAS NODE + */ +int orte_ras_base_unpack_node(orte_buffer_t *buffer, void *dest, + size_t *num_vals, orte_data_type_t type) +{ + int rc; + size_t i, n; + orte_ras_node_t **nodes; + + /* unpack into array of ras_node objects */ + nodes = (orte_ras_node_t**) dest; + for (i=0; i < *num_vals; i++) { + + /* create the ras_node object */ + nodes[i] = OBJ_NEW(orte_ras_node_t); + if (NULL == nodes[i]) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + /* unpack the node name */ + n = 1; + if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer, + &(nodes[i]->node_name), &n, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* unpack the arch */ + n = 1; + if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer, + (&(nodes[i]->node_arch)), &n, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* unpack the cellid */ + n = 1; + if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer, + (&(nodes[i]->node_cellid)), &n, ORTE_CELLID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* unpack the state */ + n = 1; + if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer, + (&(nodes[i]->node_state)), &n, ORTE_NODE_STATE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* unpack the number of slots */ + n = 1; + if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer, + (&(nodes[i]->node_slots)), &n, ORTE_SIZE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* unpack the number of slots in use */ + if (ORTE_SUCCESS 
!= (rc = orte_dss_unpack_buffer(buffer, + (&(nodes[i]->node_slots_inuse)), &n, ORTE_SIZE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* unpack the number of slots allocated */ + if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer, + (&(nodes[i]->node_slots_alloc)), &n, ORTE_SIZE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* unpack the max number of slots */ + if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer, + (&(nodes[i]->node_slots_max)), &n, ORTE_SIZE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* unpack the username */ + n = 1; + if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer, + (&(nodes[i]->node_username)), &n, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* unpack the node launched flag */ + if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer, + (&(nodes[i]->node_launched)), &n, ORTE_INT))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + + return ORTE_SUCCESS; +} + diff --git a/orte/mca/ras/base/ras_base_node.c b/orte/mca/ras/base/ras_base_node.c index 5b5b7c87de..765c14a35c 100644 --- a/orte/mca/ras/base/ras_base_node.c +++ b/orte/mca/ras/base/ras_base_node.c @@ -134,12 +134,20 @@ int orte_ras_base_node_query(opal_list_t* nodes) node->node_slots = *sptr; continue; } + if(strcmp(keyval->key, ORTE_NODE_SLOTS_IN_USE_KEY) == 0) { + if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, keyval->value, ORTE_SIZE))) { + ORTE_ERROR_LOG(rc); + continue; + } + node->node_slots_inuse = *sptr; + continue; + } if(strncmp(keyval->key, ORTE_NODE_SLOTS_ALLOC_KEY, strlen(ORTE_NODE_SLOTS_ALLOC_KEY)) == 0) { if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, keyval->value, ORTE_SIZE))) { ORTE_ERROR_LOG(rc); continue; } - node->node_slots_inuse += *sptr; + node->node_slots_alloc += *sptr; continue; } if(strcmp(keyval->key, ORTE_NODE_SLOTS_MAX_KEY) == 0) { @@ -188,6 +196,7 @@ int orte_ras_base_node_query_alloc(opal_list_t* nodes, orte_jobid_t jobid) ORTE_NODE_ARCH_KEY, ORTE_NODE_STATE_KEY, ORTE_NODE_SLOTS_KEY, + ORTE_NODE_SLOTS_IN_USE_KEY, ORTE_NODE_SLOTS_ALLOC_KEY, ORTE_NODE_SLOTS_MAX_KEY, ORTE_NODE_USERNAME_KEY, @@ -200,15 +209,15 @@ int orte_ras_base_node_query_alloc(opal_list_t* nodes, orte_jobid_t jobid) size_t *sptr; orte_node_state_t *nsptr; orte_cellid_t *cptr; - int rc; + int rc, alloc_key_posn=5; if(ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&jobid_str, jobid))) { ORTE_ERROR_LOG(rc); return rc; } - asprintf(&keys[4], "%s-%s", ORTE_NODE_SLOTS_ALLOC_KEY, jobid_str); - keys_len = strlen(keys[4]); + asprintf(&keys[alloc_key_posn], "%s-%s", ORTE_NODE_SLOTS_ALLOC_KEY, jobid_str); + keys_len = strlen(keys[alloc_key_posn]); free(jobid_str); /* query selected node entries */ @@ -235,7 +244,7 @@ int orte_ras_base_node_query_alloc(opal_list_t* nodes, orte_jobid_t jobid) for (k = 0; k < value->cnt; k++) { orte_gpr_keyval_t* keyval = value->keyvals[k]; - if(0 == strcmp(keyval->key, keys[4])) { + if(0 == strcmp(keyval->key, keys[alloc_key_posn])) { found = true; break; } @@ -283,12 +292,19 @@ int orte_ras_base_node_query_alloc(opal_list_t* nodes, orte_jobid_t jobid) node->node_slots = *sptr; continue; } - if(strncmp(keyval->key, keys[4], keys_len) == 0) { + if(strcmp(keyval->key, ORTE_NODE_SLOTS_IN_USE_KEY) == 0) { + if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, keyval->value, ORTE_SIZE))) { + ORTE_ERROR_LOG(rc); + continue; + } + node->node_slots_inuse = *sptr; + continue; + } + if(strncmp(keyval->key, keys[alloc_key_posn], keys_len) == 0) { if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, keyval->value, ORTE_SIZE))) { 
ORTE_ERROR_LOG(rc); continue; } - node->node_slots_inuse += *sptr; node->node_slots_alloc += *sptr; continue; } @@ -319,16 +335,11 @@ int orte_ras_base_node_query_alloc(opal_list_t* nodes, orte_jobid_t jobid) continue; } } - /* in case we get back more than we asked for */ - if(node->node_slots_inuse == 0) { - OBJ_RELEASE(node); - } else { - opal_list_append(nodes, &node->super); - } + opal_list_append(nodes, &node->super); OBJ_RELEASE(value); } - free (keys[4]); + free (keys[alloc_key_posn]); if (NULL != values) free(values); return ORTE_SUCCESS; } @@ -412,12 +423,20 @@ orte_ras_node_t* orte_ras_base_node_lookup(orte_cellid_t cellid, const char* nod node->node_slots = *sptr; continue; } + if(strcmp(keyval->key, ORTE_NODE_SLOTS_IN_USE_KEY) == 0) { + if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, keyval->value, ORTE_SIZE))) { + ORTE_ERROR_LOG(rc); + continue; + } + node->node_slots_inuse = *sptr; + continue; + } if(strncmp(keyval->key, ORTE_NODE_SLOTS_ALLOC_KEY, strlen(ORTE_NODE_SLOTS_ALLOC_KEY)) == 0) { if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, keyval->value, ORTE_SIZE))) { ORTE_ERROR_LOG(rc); continue; } - node->node_slots_inuse += *sptr; + node->node_slots_alloc += *sptr; continue; } if(strcmp(keyval->key, ORTE_NODE_SLOTS_MAX_KEY) == 0) { @@ -473,6 +492,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes) ORTE_NODE_STATE_KEY, ORTE_CELLID_KEY, ORTE_NODE_SLOTS_KEY, + ORTE_NODE_SLOTS_IN_USE_KEY, ORTE_NODE_SLOTS_MAX_KEY, ORTE_NODE_USERNAME_KEY }; @@ -483,6 +503,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes) ORTE_CELLID, ORTE_SIZE, ORTE_SIZE, + ORTE_SIZE, ORTE_STRING }; orte_ras_node_t* node; @@ -502,7 +523,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes) for (i=0; i < num_values; i++) { if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&(values[i]), ORTE_GPR_OVERWRITE | ORTE_GPR_TOKENS_AND, - ORTE_NODE_SEGMENT, 7, 0))) { + ORTE_NODE_SEGMENT, 8, 0))) { ORTE_ERROR_LOG(rc); for (j=0; j < i; j++) { OBJ_RELEASE(values[j]); @@ -547,6 +568,12 @@ int orte_ras_base_node_insert(opal_list_t* nodes) goto cleanup; } + ++j; + if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(values[i]->keyvals[j]), keys[j], types[j], &(node->node_slots_inuse)))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + ++j; if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(values[i]->keyvals[j]), keys[j], types[j], &(node->node_slots_max)))) { ORTE_ERROR_LOG(rc); diff --git a/orte/mca/ras/base/ras_base_open.c b/orte/mca/ras/base/ras_base_open.c index bb63578068..861b12c3c9 100644 --- a/orte/mca/ras/base/ras_base_open.c +++ b/orte/mca/ras/base/ras_base_open.c @@ -24,6 +24,9 @@ #include "opal/mca/base/base.h" #include "opal/mca/base/mca_base_param.h" #include "opal/util/output.h" + +#include "orte/dss/dss.h" +#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/ras/base/base.h" @@ -49,7 +52,8 @@ orte_ras_base_t orte_ras_base; */ int orte_ras_base_open(void) { - int value; + int value, rc; + orte_data_type_t tmp; /* Debugging / verbose output */ @@ -68,6 +72,21 @@ int orte_ras_base_open(void) orte_ras_base.ras_opened_valid = false; orte_ras_base.ras_available_valid = false; + /** register the base system types with the DSS */ + tmp = ORTE_RAS_NODE; + if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_ras_base_pack_node, + orte_ras_base_unpack_node, + (orte_dss_copy_fn_t)orte_ras_base_copy_node, + (orte_dss_compare_fn_t)orte_ras_base_compare_node, + (orte_dss_size_fn_t)orte_ras_base_size_node, + (orte_dss_print_fn_t)orte_ras_base_print_node, + (orte_dss_release_fn_t)orte_ras_base_std_obj_release, + 
ORTE_DSS_STRUCTURED, + "ORTE_RAS_NODE", &tmp))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* Open up all available components */ if (ORTE_SUCCESS != diff --git a/orte/mca/rmaps/base/Makefile.am b/orte/mca/rmaps/base/Makefile.am index 23f4ddd92b..d84d7afea0 100644 --- a/orte/mca/rmaps/base/Makefile.am +++ b/orte/mca/rmaps/base/Makefile.am @@ -18,11 +18,13 @@ headers += \ base/base.h \ + base/rmaps_base_node.h \ base/rmaps_base_map.h libmca_rmaps_la_SOURCES += \ base/rmaps_base_close.c \ base/rmaps_base_map.h \ base/rmaps_base_map.c \ + base/rmaps_base_node.c \ base/rmaps_base_open.c \ base/rmaps_base_select.c diff --git a/orte/mca/rmaps/base/base.h b/orte/mca/rmaps/base/base.h index e6131ea3a3..447104ccd5 100644 --- a/orte/mca/rmaps/base/base.h +++ b/orte/mca/rmaps/base/base.h @@ -53,6 +53,8 @@ extern "C" { opal_list_t rmaps_opened; /** Sorted list of available components (highest priority first) */ opal_list_t rmaps_available; + /** whether or not we allow oversubscription of nodes */ + bool oversubscribe; } orte_rmaps_base_t; /** diff --git a/orte/mca/rmaps/base/rmaps_base_node.c b/orte/mca/rmaps/base/rmaps_base_node.c new file mode 100644 index 0000000000..8b0ddc18e4 --- /dev/null +++ b/orte/mca/rmaps/base/rmaps_base_node.c @@ -0,0 +1,374 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "orte/orte_constants.h" + +#include <string.h> + +#include "opal/util/output.h" +#include "opal/util/argv.h" +#include "opal/mca/mca.h" +#include "opal/mca/base/base.h" +#include "opal/mca/base/mca_base_param.h" +#include "opal/util/if.h" +#include "opal/util/show_help.h" + +#include "orte/util/sys_info.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/soh/soh_types.h" +#include "orte/mca/ras/base/ras_base_node.h" +#include "orte/mca/gpr/gpr.h" +#include "orte/mca/ns/ns.h" + +#include "orte/mca/rmaps/base/base.h" +#include "orte/mca/rmaps/base/rmaps_base_map.h" +#include "orte/mca/rmaps/base/rmaps_base_node.h" + + +/* + * A sanity check to ensure that all of the requested nodes are actually + * allocated to this application. + */ +static bool are_all_mapped_valid(char **mapping, + int num_mapped, + opal_list_t* nodes) +{ + opal_list_item_t *item; + int i; + bool matched; + + for (i = 0; i < num_mapped; ++i) { + matched = false; + + for(item = opal_list_get_first(nodes); + item != opal_list_get_end(nodes); + item = opal_list_get_next(item) ) { + if( 0 == strcmp( ((orte_ras_node_t*) item)->node_name, mapping[i]) ) { + matched = true; + break; + } + } + + /* If we find one requested resource that is not allocated, + * then return an error */ + if(!matched) { + return false; + } + } + + return true; +} + +/* + * Check whether the node in question is in the current mapping. 
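 * Returns true if the node's name matches one of the host names in the user-specified mapping array. 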
+ */ +static bool is_mapped(opal_list_item_t *item, + char **mapping, + int num_mapped) +{ + int i; + + for ( i = 0; i < num_mapped; ++i) { + if ( 0 == strcmp( ((orte_ras_node_t*) item)->node_name, mapping[i])){ + return true; + } + } + + return false; +} + +/* + * Query the registry for all nodes allocated to a specified job + */ +int orte_rmaps_base_get_target_nodes(opal_list_t* nodes, orte_jobid_t jobid) +{ + opal_list_item_t *item, *next; + orte_ras_node_t *node; + int id, rc, nolocal; + + if(ORTE_SUCCESS != (rc = orte_ras_base_node_query_alloc(nodes, jobid))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* If the "no local" option was set, then remove the local node + from the list */ + + id = mca_base_param_find("rmaps", NULL, "base_schedule_local"); + mca_base_param_lookup_int(id, &nolocal); + if (0 == nolocal) { + for (item = opal_list_get_first(nodes); + item != opal_list_get_end(nodes); + item = opal_list_get_next(item) ) { + if (0 == strcmp(((orte_ras_node_t *) item)->node_name, + orte_system_info.nodename) || + opal_ifislocal(((orte_ras_node_t *) item)->node_name)) { + opal_list_remove_item(nodes, item); + break; + } + } + } + + /** remove all nodes that are already at max usage */ + item = opal_list_get_first(nodes); + while (item != opal_list_get_end(nodes)) { + + /** save the next pointer in case we remove this node */ + next = opal_list_get_next(item); + + /** check to see if this node is fully used - remove if so */ + node = (orte_ras_node_t*)item; + if (0 != node->node_slots_max && node->node_slots_inuse > node->node_slots_max) { + opal_list_remove_item(nodes, item); + } + + /** go on to next item */ + item = next; + } + + /* Sanity check to make sure we have been allocated nodes */ + if (0 == opal_list_get_size(nodes)) { + ORTE_ERROR_LOG(ORTE_ERR_TEMP_OUT_OF_RESOURCE); + return ORTE_ERR_TEMP_OUT_OF_RESOURCE; + } + + return ORTE_SUCCESS; +} + + +/* + * Create a sub-list of nodes to be used for user-specified mappings + */ +int orte_rmaps_base_get_mapped_targets(opal_list_t *mapped_node_list, + orte_app_context_t *app, + opal_list_t *master_node_list) +{ + orte_app_context_map_t** loc_map = app->map_data; + opal_list_item_t *item; + orte_ras_node_t *node, *new_node; + char **mapped_nodes = NULL; + int num_mapped_nodes = 0; + size_t j, k; + int rc; + + /* Accumulate all of the host name mappings */ + for(k = 0; k < app->num_map; ++k) { + if ( ORTE_APP_CONTEXT_MAP_HOSTNAME == loc_map[k]->map_type ) { + if(mapped_nodes == NULL) { + mapped_nodes = opal_argv_split(loc_map[k]->map_data, ','); + num_mapped_nodes = opal_argv_count(mapped_nodes); + } + else { /* Append to the existing mapping */ + char ** mini_map = opal_argv_split(loc_map[k]->map_data, ','); + size_t mini_num_map = opal_argv_count(mini_map); + for (j = 0; j < mini_num_map; ++j) { + rc = opal_argv_append(&num_mapped_nodes, &mapped_nodes, mini_map[j]); + if (OPAL_SUCCESS != rc) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + opal_argv_free(mini_map); + } + } + } + + /** check to see that all the nodes in the specified mapping have been allocated + * for our use - if not, then that's an error + */ + if( !are_all_mapped_valid(mapped_nodes, num_mapped_nodes, master_node_list) ) { + opal_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:not-all-mapped-alloc", + true, app->app, opal_argv_join(mapped_nodes, ',')); + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + /** setup the working node list to include only those nodes that were specified + * in this mapping. 
We don't need to worry about nodes being fully used or not + * since the master list only includes nodes that aren't. + */ + for (item = opal_list_get_first(master_node_list); + item != opal_list_get_end(master_node_list); + item = opal_list_get_next(item) ) { + node = (orte_ras_node_t*)item; + + if( is_mapped(item, mapped_nodes, num_mapped_nodes) ) { + /** we can't just add this item to the mapped_node_list as it cannot be + * on two lists at the same time, so we need to copy it first + */ + if (ORTE_SUCCESS != (rc = orte_dss.copy(&new_node, node, ORTE_RAS_NODE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + opal_list_append(mapped_node_list, &new_node->super); + } + } + + /** check that anything is left! */ + if (0 == opal_list_get_size(mapped_node_list)) { + opal_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:alloc-error", + true, app->num_procs, app->app); + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + return ORTE_SUCCESS; +} + + +/* + * Claim a slot for a specified job on a node + */ +int orte_rmaps_base_claim_slot(orte_rmaps_base_map_t *map, + orte_ras_node_t *current_node, + orte_jobid_t jobid, orte_vpid_t vpid, + int proc_index, + opal_list_t *nodes, + opal_list_t *fully_used_nodes) +{ + orte_rmaps_base_proc_t *proc; + orte_process_name_t *proc_name; + orte_rmaps_base_node_t *rmaps_node; + int rc; + + /* create objects */ + rmaps_node = OBJ_NEW(orte_rmaps_base_node_t); + if (NULL == rmaps_node) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + OBJ_RETAIN(current_node); + rmaps_node->node = current_node; + proc = OBJ_NEW(orte_rmaps_base_proc_t); + if (NULL == proc) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OBJ_RELEASE(rmaps_node); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + /* create the process name as an offset from the vpid-start */ + rc = orte_ns.create_process_name(&proc_name, current_node->node_cellid, + jobid, vpid); + if (rc != ORTE_SUCCESS) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(proc); + OBJ_RELEASE(rmaps_node); + return rc; + } + proc->proc_node = rmaps_node; + proc->proc_name = *proc_name; + proc->proc_rank = vpid; + orte_ns.free_name(&proc_name); + OBJ_RETAIN(proc); /* bump reference count for the node */ + opal_list_append(&rmaps_node->node_procs, &proc->super); + map->procs[proc_index] = proc; + + /* Save this node on the map */ + opal_list_append(&map->nodes, &rmaps_node->super); + + /* Be sure to demarcate this slot as claimed for the node */ + current_node->node_slots_inuse++; + + /* Remove this node if it has reached its max number of allocatable slots OR it has + * reached the soft limit AND we are in a "no oversubscribe" state + */ + if ((0 != current_node->node_slots_max && + current_node->node_slots_inuse >= current_node->node_slots_max) || + (!orte_rmaps_base.oversubscribe && + current_node->node_slots_inuse >= current_node->node_slots)) { + opal_list_remove_item(nodes, (opal_list_item_t*)current_node); + /* add it to the list of fully used nodes */ + opal_list_append(fully_used_nodes, &current_node->super); + /** now return the proper code so the caller knows we removed the node! 
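 * Note that ORTE_ERR_NODE_FULLY_USED is informational, not fatal - the slot was still claimed successfully; the node simply cannot accept more processes. 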
*/ + return ORTE_ERR_NODE_FULLY_USED; + } + + return ORTE_SUCCESS; +} + + +/* + * Update the node allocations stored in the registry + */ +int orte_rmaps_base_update_node_usage(opal_list_t *nodes) +{ + opal_list_item_t* item; + orte_gpr_value_t **values; + int rc; + size_t num_values, i, j; + orte_ras_node_t* node; + + num_values = opal_list_get_size(nodes); + if (0 >= num_values) { + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + return ORTE_ERR_BAD_PARAM; + } + + values = (orte_gpr_value_t**)malloc(num_values * sizeof(orte_gpr_value_t*)); + if (NULL == values) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + for (i=0; i < num_values; i++) { + if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&(values[i]), + ORTE_GPR_OVERWRITE | ORTE_GPR_TOKENS_AND, + ORTE_NODE_SEGMENT, 1, 0))) { + ORTE_ERROR_LOG(rc); + for (j=0; j < i; j++) { + OBJ_RELEASE(values[j]); + } + free(values); + return rc; + } + } + + for(i=0, item = opal_list_get_first(nodes); + i < num_values && item != opal_list_get_end(nodes); + i++, item = opal_list_get_next(item)) { + node = (orte_ras_node_t*)item; + + if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(values[i]->keyvals[0]), ORTE_NODE_SLOTS_IN_USE_KEY, + ORTE_SIZE, &(node->node_slots_inuse)))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + + /* setup index/keys for this node */ + rc = orte_schema.get_node_tokens(&(values[i]->tokens), &(values[i]->num_tokens), node->node_cellid, node->node_name); + if (ORTE_SUCCESS != rc) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + } + + /* try the insert */ + if (ORTE_SUCCESS != (rc = orte_gpr.put(num_values, values))) { + ORTE_ERROR_LOG(rc); + } + +cleanup: + for (j=0; j < num_values; j++) { + OBJ_RELEASE(values[j]); + } + if (NULL != values) free(values); + + return rc; +} diff --git a/orte/mca/rmaps/base/rmaps_base_node.h b/orte/mca/rmaps/base/rmaps_base_node.h new file mode 100644 index 0000000000..1241872a16 --- /dev/null +++ b/orte/mca/rmaps/base/rmaps_base_node.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** @file: + * RMAPS framework base functionality. 
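 * Declares the node-list helpers shared by RMAPS components: target node discovery, user-specified mapping support, slot claiming, and registry usage updates. 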
+ */ + +#ifndef ORTE_RMAPS_BASE_NODE_H +#define ORTE_RMAPS_BASE_NODE_H + +/* + * includes + */ +#include "orte_config.h" +#include "orte/orte_constants.h" + +#ifdef HAVE_SYS_TYPES_H +#include <sys/types.h> +#endif + +#include "opal/class/opal_list.h" +#include "orte/mca/ns/ns_types.h" +#include "orte/mca/rmaps/rmaps.h" + + +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif + +/** + * RMAPS + */ + +int orte_rmaps_base_get_target_nodes(opal_list_t* node_list, orte_jobid_t jobid); +int orte_rmaps_base_update_node_usage(opal_list_t *nodes); +int orte_rmaps_base_get_mapped_targets(opal_list_t *mapped_node_list, + orte_app_context_t *app, + opal_list_t *master_node_list); +int orte_rmaps_base_claim_slot(orte_rmaps_base_map_t *map, + orte_ras_node_t *current_node, + orte_jobid_t jobid, orte_vpid_t vpid, + int proc_index, + opal_list_t *nodes, + opal_list_t *fully_used_nodes); + +#if defined(c_plusplus) || defined(__cplusplus) +} +#endif +#endif diff --git a/orte/mca/rmaps/base/rmaps_base_open.c b/orte/mca/rmaps/base/rmaps_base_open.c index 1d184fcd52..dc0cd60a7d 100644 --- a/orte/mca/rmaps/base/rmaps_base_open.c +++ b/orte/mca/rmaps/base/rmaps_base_open.c @@ -92,6 +92,18 @@ int orte_rmaps_base_open(void) "If nonzero, allow scheduling MPI applications on the same node as mpirun (default). If zero, do not schedule any MPI applications on the same node as mpirun", false, false, 1, &value); + /* Should we oversubscribe or not? */ + + mca_base_param_reg_int_name("rmaps", "base_no_oversubscribe", + "If nonzero, then do not allow oversubscription of nodes - mpirun will return an error if there aren't enough nodes to launch all processes without oversubscribing", + false, false, 0, &value); + if (0 == value) { + orte_rmaps_base.oversubscribe = true; /** default condition */ + } else { + orte_rmaps_base.oversubscribe = false; + } + + /* Open up all the components that we can find */ if (ORTE_SUCCESS != diff --git a/orte/mca/rmaps/round_robin/rmaps_rr.c b/orte/mca/rmaps/round_robin/rmaps_rr.c index c7ba88b16b..356cf42e40 100644 --- a/orte/mca/rmaps/round_robin/rmaps_rr.c +++ b/orte/mca/rmaps/round_robin/rmaps_rr.c @@ -18,6 +18,9 @@ */ #include "orte_config.h" +#include "orte/orte_constants.h" +#include "orte/orte_types.h" + #include <string.h> #ifdef HAVE_UNISTD_H #include <unistd.h> #endif @@ -30,16 +33,14 @@ #include "opal/util/output.h" #include "opal/util/show_help.h" #include "opal/util/argv.h" -#include "opal/util/if.h" -#include "orte/orte_constants.h" -#include "orte/orte_types.h" -#include "orte/util/sys_info.h" + +#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/ns/ns.h" #include "orte/mca/gpr/gpr.h" #include "orte/mca/rmaps/base/base.h" #include "orte/mca/rmgr/base/base.h" #include "orte/mca/rmaps/base/rmaps_base_map.h" -#include "orte/mca/ras/base/ras_base_node.h" +#include "orte/mca/rmaps/base/rmaps_base_node.h" #include "rmaps_rr.h" @@ -47,164 +48,7 @@ * Local variable */ static opal_list_item_t *cur_node_item = NULL; - -/* - * A sanity check to ensure that all of the requested nodes are actually - * allocated to this application. 
- */ -static bool are_all_mapped_valid(char **mapping, - int num_mapped, - opal_list_t* nodes) -{ - opal_list_item_t *item; - int i; - bool matched; - - for (i = 0; i < num_mapped; ++i) { - matched = false; - - for(item = opal_list_get_first(nodes); - item != opal_list_get_end(nodes); - item = opal_list_get_next(item) ) { - if( 0 == strcmp( ((orte_ras_node_t*) item)->node_name, mapping[i]) ) { - matched = true; - break; - } - } - - /* If we find one requested resource that is not allocated, - * then return an error */ - if(!matched) { - return false; - } - } - - return true; -} - -/* - * If the node in question is in the current mapping. - */ -static bool is_mapped(opal_list_item_t *item, - char **mapping, - int num_mapped, - opal_list_t* nodes) -{ - int i; - - for ( i = 0; i < num_mapped; ++i) { - if ( 0 == strcmp( ((orte_ras_node_t*) item)->node_name, mapping[i])){ - return true; - } - } - - return false; -} - -/* - * Return a point to the next node allocated, included in the mapping. - */ -static opal_list_item_t* get_next_mapped(opal_list_item_t *node_item, - char **mapping, - int num_mapped, - opal_list_t* nodes) -{ - opal_list_item_t *item, *initial_item = NULL; - - /* Wrap around to beginning if we are at the end of the list */ - if (opal_list_get_end(nodes) == opal_list_get_next(node_item)) { - item = opal_list_get_first(nodes); - } - else { - item = opal_list_get_next(node_item); - } - - do { - /* See if current node is in the mapping and contains slots */ - if( is_mapped(item, mapping, num_mapped, nodes) ) { - return item; - } - - /* - * We just rechecked the current item and concluded that - * it wasn't in the list, thus the list contains no matches - * in this mapping. Return an error. - */ - if(node_item == item){ - return NULL; - } - - /* save the node we started with */ - if( NULL == initial_item ) { - initial_item = item; - } - - /* Access next item in Round Robin Manner */ - if (opal_list_get_end(nodes) == opal_list_get_next(item)) { - item = opal_list_get_first(nodes); - } - else { - item = opal_list_get_next(item); - } - - /* Check to make sure we didn't loop back around without - * finding a node in the mapping */ - if( initial_item == item) { - return NULL; - } - - } while( true ); -} - -static int claim_slot(orte_rmaps_base_map_t *map, - orte_ras_node_t *current_node, - orte_jobid_t jobid, - orte_vpid_t vpid, - int proc_index) -{ - orte_rmaps_base_proc_t *proc; - orte_process_name_t *proc_name; - orte_rmaps_base_node_t *rmaps_node; - int rc; - - /* create objects */ - rmaps_node = OBJ_NEW(orte_rmaps_base_node_t); - if (NULL == rmaps_node) { - return ORTE_ERR_OUT_OF_RESOURCE; - } - - OBJ_RETAIN(current_node); - rmaps_node->node = current_node; - proc = OBJ_NEW(orte_rmaps_base_proc_t); - if (NULL == proc) { - OBJ_RELEASE(rmaps_node); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - /* create the process name as an offset from the vpid-start */ - rc = orte_ns.create_process_name(&proc_name, current_node->node_cellid, - jobid, vpid); - if (rc != ORTE_SUCCESS) { - OBJ_RELEASE(proc); - OBJ_RELEASE(rmaps_node); - return rc; - } - proc->proc_node = rmaps_node; - proc->proc_name = *proc_name; - proc->proc_rank = vpid; - orte_ns.free_name(&proc_name); - OBJ_RETAIN(proc); /* bump reference count for the node */ - opal_list_append(&rmaps_node->node_procs, &proc->super); - map->procs[proc_index] = proc; - - /* Save this node on the map */ - opal_list_append(&map->nodes, &rmaps_node->super); - - /* Be sure to demarcate this slot claim for the node */ - current_node->node_slots_inuse++; - 
- return ORTE_SUCCESS; -} +static opal_list_t fully_used_nodes; /* @@ -218,77 +62,73 @@ static int map_app_by_node( orte_vpid_t vpid_start, int rank, opal_list_t* nodes, - char **mapped_nodes, - int num_mapped_nodes) + opal_list_t* max_used_nodes) { int rc = ORTE_SUCCESS; size_t num_alloc = 0; size_t proc_index = 0; opal_list_item_t *next; orte_ras_node_t *node; + /* This loop continues until all procs have been mapped or we run out of resources. We determine that we have "run out of resources" when all nodes have node_slots_max processes mapped to them, - thus there are no free slots for a process to be mapped. - If we still have processes that haven't been mapped yet, then it's an - "out of resources" error. */ + thus there are no free slots for a process to be mapped, or we have + hit the soft limit on all nodes and are in a "no oversubscribe" state. + If we still have processes that haven't been mapped yet, then it's an + "out of resources" error. + + In this scenario, we rely on the claim_slot function to handle the + oversubscribed case. The claim_slot function will leave a node on the + list until it either reaches node_slots_max OR reaches node_slots (the + soft limit) and the "no_oversubscribe" flag has been set - at which point + the node will be removed to prevent any more processes from being mapped to + it. Since we are taking one slot from each node as we cycle through the + list, oversubscription is automatically taken care of via this logic. + */ + while (num_alloc < app->num_procs) { - node = (orte_ras_node_t*) cur_node_item; - /* Find the next node we can use before claiming slots, since - * we may need to prune the nodes list removing overused nodes */ - if ( 0 < app->num_map ) { - next = get_next_mapped(cur_node_item, mapped_nodes, num_mapped_nodes, nodes); - if (NULL == next ) { - /* Not allocated anything */ - opal_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:no-mapped-node", - true, app->app, opal_argv_join(mapped_nodes, ',')); - rc = ORTE_ERR_OUT_OF_RESOURCE; - goto cleanup; - } + /** see if any nodes remain unused and available. We need to do this check + * each time since we may remove nodes from the list (as they become fully + * used) as we cycle through the loop */ + if(0 >= opal_list_get_size(nodes) ) { + /* No more nodes to allocate :( */ + opal_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:alloc-error", + true, app->num_procs, app->app); + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + /* Save the next node we can use before claiming slots, since + * we may need to prune the nodes list removing overused nodes. 
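 * (orte_rmaps_base_claim_slot removes a node from this list once it becomes fully used.) 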
+ * Wrap around to beginning if we are at the end of the list */ + if (opal_list_get_end(nodes) == opal_list_get_next(cur_node_item)) { + next = opal_list_get_first(nodes); } else { - if (opal_list_get_end(nodes) == opal_list_get_next(cur_node_item)) { - next = opal_list_get_first(nodes); - } - else { - next = opal_list_get_next(cur_node_item); - } + next = opal_list_get_next(cur_node_item); + } + + /* Allocate a slot on this node */ + node = (orte_ras_node_t*) cur_node_item; + if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(map, node, jobid, vpid_start + rank, proc_index, + nodes, max_used_nodes))) { + ORTE_ERROR_LOG(rc); + return rc; } - /* Remove this node if it has reached its max number of allocatable slots */ - if( 0 != node->node_slots_max && - node->node_slots_inuse > node->node_slots_max) { - opal_list_remove_item(nodes, (opal_list_item_t*)node); - if(0 >= opal_list_get_size(nodes) ) { - /* No more nodes to allocate :( */ - opal_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:alloc-error", - true, app->num_procs, app->app); - rc = ORTE_ERR_OUT_OF_RESOURCE; - goto cleanup; - } - } - /* Allocate this node */ - else { - rc = claim_slot(map, node, jobid, vpid_start + rank, proc_index); - if (ORTE_SUCCESS != rc) { - goto cleanup; - } - - ++rank; - ++proc_index; - ++num_alloc; - } + ++rank; + ++proc_index; + ++num_alloc; cur_node_item = next; } map->num_procs = num_alloc; - cleanup: - - return rc; + return ORTE_SUCCESS; } @@ -303,129 +143,123 @@ static int map_app_by_slot( orte_vpid_t vpid_start, int rank, opal_list_t* nodes, - char **mapped_nodes, - int num_mapped_nodes) + opal_list_t* max_used_nodes) { int rc = ORTE_SUCCESS; - size_t i; + size_t i, num_slots_to_take; size_t num_alloc = 0; size_t proc_index = 0; orte_ras_node_t *node; - opal_list_item_t *start, *next; - bool oversubscribe; + opal_list_item_t *next; /* This loop continues until all procs have been mapped or we run out of resources. We determine that we have "run out of - resources" when all nodes have node_slots_max processes mapped to them, - thus there are no free slots for a process to be mapped. - If we still have processes that haven't been mapped yet, then it's an + resources" when either all nodes have node_slots_max processes mapped to them, + (thus there are no free slots for a process to be mapped), OR all nodes + have reached their soft limit and the user directed us to "no oversubscribe". + If we still have processes that haven't been mapped yet, then it's an "out of resources" error. */ num_alloc = 0; - start = cur_node_item; - oversubscribe = false; + while ( num_alloc < app->num_procs) { - node = (orte_ras_node_t*) cur_node_item; + + /** see if any nodes remain unused and available. 
We need to do this check + * each time since we may remove nodes from the list (as they become fully + * used) as we cycle through the loop */ + if(0 >= opal_list_get_size(nodes) ) { + /* No more nodes to allocate :( */ + opal_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:alloc-error", + true, app->num_procs, app->app); + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } - /* Find the next node we can use before claiming slots, since - * we may need to prune the nodes list removing over used nodes */ - if ( 0 < app->num_map ) { - next = get_next_mapped(cur_node_item, mapped_nodes, num_mapped_nodes, nodes); - if (NULL == next ) { - /* Not allocated anything */ - opal_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:no-mapped-node", - true, app->app, opal_argv_join(mapped_nodes, ',')); - rc = ORTE_ERR_OUT_OF_RESOURCE; - goto cleanup; - } + /* Save the next node we can use before claiming slots, since + * we may need to prune the nodes list removing overused nodes. + * Wrap around to beginning if we are at the end of the list */ + if (opal_list_get_end(nodes) == opal_list_get_next(cur_node_item)) { + next = opal_list_get_first(nodes); } else { - if (opal_list_get_end(nodes) == opal_list_get_next(cur_node_item)) { - next = opal_list_get_first(nodes); - } - else { - next = opal_list_get_next(cur_node_item); - } + next = opal_list_get_next(cur_node_item); } - + + /** declare a shorter name for convenience in the code below */ + node = (orte_ras_node_t*) cur_node_item; + /* If we have available slots on this node, claim all of them * If node_slots == 0, assume 1 slot for that node. - * JJH - is this assumption fully justified? */ - for( i = 0; i < ((node->node_slots == 0) ? 1 : node->node_slots); ++i) { - /* If we are not oversubscribing, and this node is full, skip it. */ - if( !oversubscribe && - 0 != node->node_slots && - node->node_slots_inuse > node->node_slots) { - break; - } - /* If this node has reached its max number of slots, - * take it out of the list, and skip it */ - else if( 0 != node->node_slots_max && - node->node_slots_inuse > node->node_slots_max){ - opal_list_remove_item(nodes, (opal_list_item_t*)node); - if( 0 >= opal_list_get_size(nodes) ) { - /* No more nodes to allocate */ - opal_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:alloc-error", - true, app->num_procs, app->app); - rc = ORTE_ERR_OUT_OF_RESOURCE; - goto cleanup; + * JJH - is this assumption fully justified? + * + * If we are now oversubscribing the nodes, then we still take + * a full node_slots from each node until either everything is done, + * or all nodes have hit their hard limit. This preserves the ratio + * of processes between the nodes (e.g., if one node has twice as + * many processes as another before oversubscribing, it will continue + * to do so after oversubscribing). + */ + num_slots_to_take = (node->node_slots == 0) ? 1 : node->node_slots; + + for( i = 0; i < num_slots_to_take; ++i) { + if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(map, node, jobid, vpid_start + rank, proc_index, + nodes, max_used_nodes))) { + /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this + * really isn't an error - we just need to break from the loop + * since the node is fully used up. 
For now, just don't report + * an error + */ + if (ORTE_ERR_NODE_FULLY_USED != rc) { + ORTE_ERROR_LOG(rc); + return rc; } - break; } - rc = claim_slot(map, node, jobid, vpid_start + rank, proc_index); - if (ORTE_SUCCESS != rc) { - goto cleanup; - } - - /* Increase the number of procs allocated */ + /* Record the number of procs allocated */ ++num_alloc; ++rank; ++proc_index; - if(num_alloc >= app->num_procs) { + + /** if all the procs have been mapped OR we have fully used up this node, then + * break from the loop + */ + if(num_alloc >= app->num_procs || ORTE_ERR_NODE_FULLY_USED == rc) { break; } } cur_node_item = next; - /* Since we have now looped back around, go ahead and oversubscribe nodes */ - if(start == cur_node_item) { - oversubscribe = true; - } } map->num_procs = num_alloc; - cleanup: - - return rc; + return ORTE_SUCCESS; } /* - * Create a default mapping for the job. + * Create a round-robin mapping for the job. */ static int orte_rmaps_rr_map(orte_jobid_t jobid) { orte_app_context_t** context, *app; orte_rmaps_base_map_t* map; - size_t i, j, k, num_context; - opal_list_t nodes; + size_t i, num_context; + opal_list_t master_node_list, mapped_node_list, max_used_nodes, *working_node_list; opal_list_t mapping; - opal_list_item_t* item; + opal_list_item_t *item, *item2; + orte_ras_node_t *node, *node2; orte_vpid_t vpid_start; size_t num_procs = 0; int rank = 0; - int rc = ORTE_SUCCESS; + int rc; bool bynode = true; - char **mapped_nodes = NULL; - int num_mapped_nodes = 0; - int id, value; /* query for the application context and allocated nodes */ if(ORTE_SUCCESS != (rc = orte_rmgr_base_get_app_context(jobid, &context, &num_context))) { + ORTE_ERROR_LOG(rc); return rc; } @@ -435,38 +269,18 @@ static int orte_rmaps_rr_map(orte_jobid_t jobid) } else { bynode = false; } - - /* query for all nodes allocated to this job */ - OBJ_CONSTRUCT(&nodes, opal_list_t); - if(ORTE_SUCCESS != (rc = orte_ras_base_node_query_alloc(&nodes, jobid))) { - OBJ_DESTRUCT(&nodes); + + /* query for all nodes allocated to this job - this will become our master list of + * nodes. 
From this, we will construct a working list of nodes based on any specified + * mappings from the user + */ + OBJ_CONSTRUCT(&master_node_list, opal_list_t); + if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&master_node_list, jobid))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&master_node_list); return rc; } - /* If the "no local" option was set, then remove the local node - from the list */ - - id = mca_base_param_find("rmaps", NULL, "base_schedule_local"); - mca_base_param_lookup_int(id, &value); - if (0 == value) { - for (item = opal_list_get_first(&nodes); - item != opal_list_get_end(&nodes); - item = opal_list_get_next(item) ) { - if (0 == strcmp(((orte_ras_node_t *) item)->node_name, - orte_system_info.nodename) || - opal_ifislocal(((orte_ras_node_t *) item)->node_name)) { - opal_list_remove_item(&nodes, item); - break; - } - } - } - - /* Sanity check to make sure we have been allocated nodes */ - if (0 == opal_list_get_size(&nodes)) { - OBJ_DESTRUCT(&nodes); - return ORTE_ERR_TEMP_OUT_OF_RESOURCE; - } - /* Total number of procs required * DEVEL NOTE: Need to extend this when implementing C/N notation * Will need to set the app->num_procs approprately before this, @@ -479,110 +293,167 @@ static int orte_rmaps_rr_map(orte_jobid_t jobid) /* allocate a vpid range for the job */ if(ORTE_SUCCESS != (rc = orte_ns.reserve_range(jobid, num_procs, &vpid_start))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&master_node_list); return rc; } - - /* construct a default mapping by application */ + /* construct a mapping for the job - the list will hold mappings for each + * application context + */ OBJ_CONSTRUCT(&mapping, opal_list_t); - cur_node_item = opal_list_get_first(&nodes); + + /** initialize the cur_node_item to point to the first node in the list */ + cur_node_item = opal_list_get_first(&master_node_list); + + /** construct the list to hold any nodes that get fully used during this + * mapping. We need to keep a record of these so we can update their + * information on the registry when we are done, but we want to remove + * them from our master_node_list as we go so we don't keep checking to + * see if we can still map something onto them. + */ + OBJ_CONSTRUCT(&fully_used_nodes, opal_list_t); + + /** construct an intermediate list that will hold the nodes that are fully + * used during any one pass through the mapper (i.e., for each app_context). + * we will join the results together to form the fully_used_nodes list. This + * allows us to more efficiently handle the cases where users specify + * the proc-to-node mapping themselves. + */ + OBJ_CONSTRUCT(&max_used_nodes, opal_list_t); + + /** construct a list to hold any nodes involved in a user-specified mapping */ + OBJ_CONSTRUCT(&mapped_node_list, opal_list_t); + for(i=0; isuper); map->app = app; map->procs = malloc(sizeof(orte_rmaps_base_proc_t*) * app->num_procs); if(NULL == map->procs) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); rc = ORTE_ERR_OUT_OF_RESOURCE; goto cleanup; } - - /* Extract the requested mapping for this application */ - /* Note that cur_node_item already points to the Right place in - the node list to start looking (i.e., if this is the first time - through, it'll point to the first item. If this is not the - first time through -- i.e., we have multiple app contexts -- - it'll point to where we left off last time.). 
@@ -479,110 +293,167 @@ static int orte_rmaps_rr_map(orte_jobid_t jobid)

     /* allocate a vpid range for the job */
     if(ORTE_SUCCESS != (rc = orte_ns.reserve_range(jobid, num_procs, &vpid_start))) {
+        ORTE_ERROR_LOG(rc);
+        OBJ_DESTRUCT(&master_node_list);
         return rc;
     }
-
-    /* construct a default mapping by application */
+    /* construct a mapping for the job - the list will hold mappings for each
+     * application context
+     */
     OBJ_CONSTRUCT(&mapping, opal_list_t);
-    cur_node_item = opal_list_get_first(&nodes);
+
+    /** initialize the cur_node_item to point to the first node in the list */
+    cur_node_item = opal_list_get_first(&master_node_list);
+
+    /** construct the list to hold any nodes that get fully used during this
+     * mapping.  We need to keep a record of these so we can update their
+     * information on the registry when we are done, but we want to remove
+     * them from our master_node_list as we go so we don't keep checking to
+     * see if we can still map something onto them.
+     */
+    OBJ_CONSTRUCT(&fully_used_nodes, opal_list_t);
+
+    /** construct an intermediate list that will hold the nodes that are fully
+     * used during any one pass through the mapper (i.e., for each app_context).
+     * we will join the results together to form the fully_used_nodes list.  This
+     * allows us to more efficiently handle the cases where users specify
+     * the proc-to-node mapping themselves.
+     */
+    OBJ_CONSTRUCT(&max_used_nodes, opal_list_t);
+
+    /** construct a list to hold any nodes involved in a user-specified mapping */
+    OBJ_CONSTRUCT(&mapped_node_list, opal_list_t);
+
     for(i=0; i < num_context; i++) {
         app = context[i];

         map = OBJ_NEW(orte_rmaps_base_map_t);
         if(NULL == map) {
             rc = ORTE_ERR_OUT_OF_RESOURCE;
             goto cleanup;
         }
         opal_list_append(&mapping, &map->super);

         map->app = app;
         map->procs = malloc(sizeof(orte_rmaps_base_proc_t*) * app->num_procs);
         if(NULL == map->procs) {
+            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
             rc = ORTE_ERR_OUT_OF_RESOURCE;
             goto cleanup;
         }
-
-        /* Extract the requested mapping for this application */
-
-        /* Note that cur_node_item already points to the Right place in
-           the node list to start looking (i.e., if this is the first time
-           through, it'll point to the first item.  If this is not the
-           first time through -- i.e., we have multiple app contexts --
-           it'll point to where we left off last time.).  If we're at the
-           end, bounce back to the front (as would happen in the loop
-           below)
-
-           But do a bozo check to ensure that we don't have a empty
-           node list.*/
-        if (0 == opal_list_get_size(&nodes)) {
-            rc = ORTE_ERR_TEMP_OUT_OF_RESOURCE;
-            goto cleanup;
-        } else if (opal_list_get_end(&nodes) == cur_node_item) {
-            cur_node_item = opal_list_get_first(&nodes);
-        }
-
-        /* If this application has a mapping then
-         * - if the current node is in the mapping, use it
-         * - ow get the next node in that mapping.
-         */
         if ( 0 < app->num_map ) {
-            orte_app_context_map_t** loc_map = app->map_data;
-
-            /* Accumulate all of the host name mappings */
-            for(k = 0; k < app->num_map; ++k) {
-                if ( ORTE_APP_CONTEXT_MAP_HOSTNAME == loc_map[k]->map_type ) {
-                    if(mapped_nodes == NULL) {
-                        mapped_nodes = opal_argv_split(loc_map[k]->map_data, ',');
-                        num_mapped_nodes = opal_argv_count(mapped_nodes);
-                    }
-                    else { /* Append to the existing mapping */
-                        char ** mini_map = opal_argv_split(loc_map[k]->map_data, ',');
-                        size_t mini_num_map = opal_argv_count(mini_map);
-                        for (j = 0; j < mini_num_map; ++j) {
-                            rc = opal_argv_append(&num_mapped_nodes, &mapped_nodes, mini_map[j]);
-                            if (OPAL_SUCCESS != rc) {
-                                goto cleanup;
-                            }
-                        }
-                        opal_argv_free(mini_map);
-                    }
-                }
-            }
-
-            if( !are_all_mapped_valid(mapped_nodes, num_mapped_nodes, &nodes) ) {
-                opal_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:not-all-mapped-alloc",
-                               true, app->app, opal_argv_join(mapped_nodes, ','));
-                rc = ORTE_ERR_OUT_OF_RESOURCE;
+            /** If the user has specified a mapping for this app_context, then we
+             * create a working node list that contains only those nodes.
+             */
+            if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_mapped_targets(&mapped_node_list, app, &master_node_list))) {
+                ORTE_ERROR_LOG(rc);
                 goto cleanup;
             }
-
-            /* If the current node is not in the current mapping
-             * Then get the next node that is in the mapping */
-            if( !is_mapped(cur_node_item, mapped_nodes, num_mapped_nodes, &nodes) ) {
-                cur_node_item = get_next_mapped(cur_node_item, mapped_nodes, num_mapped_nodes, &nodes);
-                if( NULL == cur_node_item) {
-                    opal_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:no-mapped-node",
-                                   true, app->app, opal_argv_join(mapped_nodes, ','));
-                    rc = ORTE_ERR_OUT_OF_RESOURCE;
-                    goto cleanup;
-                }
-            }
+            working_node_list = &mapped_node_list;
+
+            /* Set cur_node_item to point to the first node in the specified list to be used */
+            cur_node_item = opal_list_get_first(working_node_list);
+        }
+        else {
+            /** no mapping was specified, so we are going to just use everything that was
+             * allocated to us.  We don't need to update cur_node_item in this case since it
+             * is always pointing to something in the master_node_list - we'll just pick up
+             * from wherever we last stopped.
+             */
+            working_node_list = &master_node_list;
         }
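orte_rmaps_base_get_mapped_targets() lives elsewhere in this change set, so only its role is visible here: reduce the allocation to the hosts the user named, failing if a named host was never allocated. As a rough mental model only - the signature and names below are invented for illustration:

#include <stdio.h>
#include <string.h>

/* hypothetical stand-in for orte_rmaps_base_get_mapped_targets(): pick out
 * of the allocation only the hosts the user named for this app context */
static int get_mapped_targets(const char **alloc, int nalloc,
                              const char **wanted, int nwanted,
                              const char **out)
{
    int n = 0;
    for (int i = 0; i < nwanted; i++) {
        int found = 0;
        for (int j = 0; j < nalloc; j++) {
            if (0 == strcmp(wanted[i], alloc[j])) {
                out[n++] = alloc[j];
                found = 1;
                break;
            }
        }
        if (!found) return -1;   /* named host was never allocated - error out */
    }
    return n;
}

int main(void)
{
    const char *alloc[]  = { "n0", "n1", "n2" };
    const char *wanted[] = { "n2", "n0" };
    const char *out[3];
    int n = get_mapped_targets(alloc, 3, wanted, 2, out);
    for (int i = 0; i < n; i++) printf("mapped target: %s\n", out[i]);
    return 0;
}

The real function also has to carry slot-usage data along with each node, which is why the patch later syncs the mapped list back into the master list.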

         /* Make assignments */
         if (bynode) {
-            rc = map_app_by_node(app, map, jobid, vpid_start, rank, &nodes, mapped_nodes, num_mapped_nodes);
+            rc = map_app_by_node(app, map, jobid, vpid_start, rank, working_node_list, &max_used_nodes);
         } else {
-            rc = map_app_by_slot(app, map, jobid, vpid_start, rank, &nodes, mapped_nodes, num_mapped_nodes);
+            rc = map_app_by_slot(app, map, jobid, vpid_start, rank, working_node_list, &max_used_nodes);
         }
+
+        if (ORTE_SUCCESS != rc) {
+            ORTE_ERROR_LOG(rc);
             goto cleanup;
         }

         rank += app->num_procs;
-        opal_argv_free(mapped_nodes);
-        mapped_nodes = NULL;
+
+        /** cleanup the mapped_node_list, if necessary */
+        if (0 < app->num_map) {
+            /** before we get rid of the mapped_node_list, we first need to update
+             * corresponding entries in the master_node_list so we accurately
+             * track the usage of slots.  Also, any node that was "used up" will have
+             * been removed from the mapped_node_list - we now also must ensure that
+             * such a node is removed from the master_node_list.
+             *
+             * Clearly, there will be a performance penalty in doing all these
+             * operations to maintain data integrity.  However, the case where
+             * someone maps processes this specifically is considered the
+             * atypical one, so penalizing it may not be a major issue.
+             *
+             * Still, some effort to improve the efficiency of this process
+             * may be in order for the future.
+             */
+            while (NULL != (item = opal_list_remove_first(&mapped_node_list))) {
+                node = (orte_ras_node_t*)item;
+
+                /** if the node was still on the mapped_node_list, then it hasn't
+                 * been moved to the fully_used_node list - find it on the
+                 * master_node_list and update the slots_inuse count there
+                 */
+                for (item2 = opal_list_get_first(&master_node_list);
+                     item2 != opal_list_get_end(&master_node_list);
+                     item2 = opal_list_get_next(item2) ) {
+                    node2 = (orte_ras_node_t*)item2;
+
+                    if (0 == strcmp(node2->node_name, node->node_name)) {
+                        node2->node_slots_inuse = node->node_slots_inuse;
+                        break;
+                    }
+                }
+                OBJ_RELEASE(item);
+            }
+
+            /** that updated everything that wasn't fully used up while
+             * processing the specific map.  Now we have to ensure that
+             * any nodes that were used up (and hence, transferred to the
+             * max_used_node list) are removed from the master_node_list
+             * No really nice way to do this - we just have to run through
+             * the two lists and remove any duplicates.
+             */
+            while (NULL != (item = opal_list_remove_first(&max_used_nodes))) {
+                node = (orte_ras_node_t*)item;
+
+                for (item2 = opal_list_get_first(&master_node_list);
+                     item2 != opal_list_get_end(&master_node_list);
+                     item2 = opal_list_get_next(item2) ) {
+                    node2 = (orte_ras_node_t*)item2;
+
+                    /** if we have a match, then remove the entry from the
+                     * master_node_list
+                     */
+                    if (0 == strcmp(node2->node_name, node->node_name)) {
+                        opal_list_remove_item(&master_node_list, item2);
+                        break;
+                    }
+                }
+
+                /** now put that node on the fully_used_nodes list */
+                opal_list_append(&fully_used_nodes, &node->super);
+            }
+
+        } else {
+            /** this mapping wasn't specified, so all we have to do is add any nodes
+             * that were used up in the mapping to the fully_used_nodes list - they
+             * were already removed from the master_node_list when we did the mapping.
+             */
+            opal_list_join(&fully_used_nodes, opal_list_get_end(&fully_used_nodes), &max_used_nodes);
+        }
+    }

     /* save mapping to the registry */
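The write-back above walks both lists and matches on node_name, so it is quadratic in the worst case, as the in-code comment concedes for the user-specified-mapping path. Stripped to its essentials, with plain arrays instead of opal_list_t and illustrative names:

#include <stdio.h>
#include <string.h>

typedef struct { const char *name; int slots_inuse; } node_t;

/* the patch's "walk both lists and match on node_name" update: copy usage
 * counts from the working copies back into the master records so the
 * registry later sees accurate totals */
static void sync_usage(node_t *master, int nmaster,
                       const node_t *worked, int nworked)
{
    for (int i = 0; i < nworked; i++) {
        for (int j = 0; j < nmaster; j++) {
            if (0 == strcmp(worked[i].name, master[j].name)) {
                master[j].slots_inuse = worked[i].slots_inuse;
                break;
            }
        }
    }
}

int main(void)
{
    node_t master[] = { {"n0", 0}, {"n1", 0} };
    node_t worked[] = { {"n1", 2} };
    sync_usage(master, 2, worked, 1);
    for (int j = 0; j < 2; j++)
        printf("%s: %d slots in use\n", master[j].name, master[j].slots_inuse);
    return 0;
}

Keying the match on node_name alone assumes names are unique across cells; matching on the cell id as well would make the comparison stricter.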
@@ -591,23 +462,36 @@ static int orte_rmaps_rr_map(orte_jobid_t jobid)
     }

     /* save vpid start/range on the job segment */
-    rc = orte_rmaps_base_set_vpid_range(jobid,vpid_start,num_procs);
+    if (ORTE_SUCCESS != (rc = orte_rmaps_base_set_vpid_range(jobid,vpid_start,num_procs))) {
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    /** join the master_node_list and fully_used_list so that all info gets updated */
+    opal_list_join(&master_node_list, opal_list_get_end(&master_node_list), &fully_used_nodes);
+
+    /** save the modified node information so we can start from the right
+     * place next time through
+     */
+    if (ORTE_SUCCESS != (rc = orte_rmaps_base_update_node_usage(&master_node_list))) {
+        ORTE_ERROR_LOG(rc);
+    }
+
 cleanup:
-    while(NULL != (item = opal_list_remove_first(&nodes))) {
+    while(NULL != (item = opal_list_remove_first(&master_node_list))) {
         OBJ_RELEASE(item);
     }
-    OBJ_DESTRUCT(&nodes);
+    OBJ_DESTRUCT(&master_node_list);

     while(NULL != (item = opal_list_remove_first(&mapping))) {
         OBJ_RELEASE(item);
     }
     OBJ_DESTRUCT(&mapping);
-
-    if( NULL != mapped_nodes ) {
-        opal_argv_free(mapped_nodes);
-    }
+    OBJ_DESTRUCT(&max_used_nodes);
+    OBJ_DESTRUCT(&fully_used_nodes);
+    OBJ_DESTRUCT(&mapped_node_list);

     return rc;
 }
diff --git a/orte/mca/schema/schema_types.h b/orte/mca/schema/schema_types.h
index 08da1b9bc4..b47d71277a 100644
--- a/orte/mca/schema/schema_types.h
+++ b/orte/mca/schema/schema_types.h
@@ -57,6 +57,7 @@
 #define ORTE_NODE_STATE_KEY "orte-node-state"
 #define ORTE_NODE_SLOTS_KEY "orte-node-slots"
 #define ORTE_NODE_SLOTS_ALLOC_KEY "orte-node-slots-alloc"
+#define ORTE_NODE_SLOTS_IN_USE_KEY "orte-node-slots-in-use"
 #define ORTE_NODE_SLOTS_MAX_KEY "orte-node-slots-max"
 #define ORTE_NODE_ALLOC_KEY "orte-node-alloc"
 #define ORTE_NODE_BOOTPROXY_KEY "orte-node-bootproxy"
diff --git a/orte/test/system/Makefile b/orte/test/system/Makefile
index 0f193575a2..efe9051f6f 100644
--- a/orte/test/system/Makefile
+++ b/orte/test/system/Makefile
@@ -1,4 +1,4 @@
-PROGS = no_op mpi_no_op hello hello_null sigusr_trap
+PROGS = no_op mpi_no_op hello hello_null sigusr_trap hello_nodename

 all: $(PROGS)

diff --git a/orte/test/system/hello_nodename.c b/orte/test/system/hello_nodename.c
new file mode 100644
index 0000000000..33d44cbd4f
--- /dev/null
+++ b/orte/test/system/hello_nodename.c
@@ -0,0 +1,26 @@
+/* -*- C -*-
+ *
+ * $HEADER$
+ *
+ * The most basic of MPI applications
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include "mpi.h"
+
+int main(int argc, char* argv[])
+{
+    int rank, size;
+    char hostname[512];
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    gethostname(hostname, 512);
+    printf("Hello, World, I am %d of %d on host %s\n", rank, size, hostname);
+
+    MPI_Finalize();
+    return 0;
+}
diff --git a/orte/util/error_strings.c b/orte/util/error_strings.c
index f38d7e66dd..0f9f89a7fe 100644
--- a/orte/util/error_strings.c
+++ b/orte/util/error_strings.c
@@ -103,6 +103,9 @@ orte_err2str(int errnum)
     case ORTE_ERR_INDETERMINATE_STATE_INFO:
         retval = "Request for state returned multiple responses";
         break;
+    case ORTE_ERR_NODE_FULLY_USED:
+        retval = "All the slots on a given node have been used";
+        break;
     default:
         retval = NULL;
     }
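One caveat on the new hello_nodename test: POSIX does not require gethostname() to null-terminate the buffer on truncation. A defensive variant of the same program, offered here as a suggestion rather than as part of the patch:

#include <stdio.h>
#include <unistd.h>
#include "mpi.h"

int main(int argc, char* argv[])
{
    int rank, size;
    char hostname[512];

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* gethostname() may leave the buffer unterminated on truncation,
     * so reserve the last byte and terminate explicitly */
    if (0 != gethostname(hostname, sizeof(hostname) - 1)) {
        hostname[0] = '\0';
    }
    hostname[sizeof(hostname) - 1] = '\0';

    printf("Hello, World, I am %d of %d on host %s\n", rank, size, hostname);

    MPI_Finalize();
    return 0;
}

Built against an installed tree, something like mpicc hello_nodename.c -o hello_nodename followed by mpirun -np 4 hello_nodename should print one line per rank; comparing runs with and without --bynode should visibly change which hostnames repeat, which is a quick sanity check on the reworked mapper.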