From a056fdea2d83029cd955ead158303ecdf78a805d Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Fri, 12 Jan 2018 10:09:35 +0900 Subject: [PATCH 1/5] regx/fwd: correctly handle node names with multiple sets of digits Refs. open-mpi/ompi#4689 Signed-off-by: Gilles Gouaillardet --- orte/mca/regx/fwd/regx_fwd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/orte/mca/regx/fwd/regx_fwd.c b/orte/mca/regx/fwd/regx_fwd.c index 87250172a1..311f20051a 100644 --- a/orte/mca/regx/fwd/regx_fwd.c +++ b/orte/mca/regx/fwd/regx_fwd.c @@ -170,14 +170,12 @@ static int nidmap_create(opal_pointer_array_t *pool, char **regex) len = strlen(node); startnum = -1; memset(prefix, 0, ORTE_MAX_NODE_PREFIX); - numdigits = 0; for (i=0, j=0; i < len; i++) { /* valid hostname characters are ascii letters, digits and the '-' character. */ if (isdigit(node[i])) { /* count the size of the numeric field - but don't * add the digits to the prefix */ - numdigits++; if (startnum < 0) { /* okay, this defines end of the prefix */ startnum = i; @@ -204,8 +202,10 @@ static int nidmap_create(opal_pointer_array_t *pool, char **regex) nodenum = strtol(&node[startnum], &sfx, 10); if (NULL != sfx) { suffix = strdup(sfx); + numdigits = (int)(sfx - &node[startnum]); } else { suffix = NULL; + numdigits = (int)strlen(&node[startnum]); } /* is this node name already on our list? */ found = false; From 0c686f01e5963bea54666ca829dd882bde51f95f Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Fri, 12 Jan 2018 09:15:00 +0900 Subject: [PATCH 2/5] regx: add the extract_node_names callback typedef int (*orte_regx_base_module_extract_node_names_fn_t)(char *regexp, char ***names); among other things, that will make testing way easier.
Signed-off-by: Gilles Gouaillardet --- orte/mca/regx/fwd/regx_fwd.c | 5 ++++- orte/mca/regx/regx.h | 4 ++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/orte/mca/regx/fwd/regx_fwd.c b/orte/mca/regx/fwd/regx_fwd.c index 311f20051a..81b5e6eaed 100644 --- a/orte/mca/regx/fwd/regx_fwd.c +++ b/orte/mca/regx/fwd/regx_fwd.c @@ -1,5 +1,7 @@ /* * Copyright (c) 2016-2018 Intel, Inc. All rights reserved. + * Copyright (c) 2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -33,6 +35,7 @@ static int nidmap_create(opal_pointer_array_t *pool, char **regex); static int nidmap_parse(char *regex); +static int extract_node_names(char *regex, char ***node_names); static int encode_nodemap(opal_buffer_t *buffer); static int decode_daemon_nodemap(opal_buffer_t *buffer); static int generate_ppn(orte_job_t *jdata, char **ppn); @@ -41,6 +44,7 @@ static int parse_ppn(orte_job_t *jdata, char *ppn); orte_regx_base_module_t orte_regx_fwd_module = { .nidmap_create = nidmap_create, .nidmap_parse = nidmap_parse, + .extract_node_names = extract_node_names, .encode_nodemap = encode_nodemap, .decode_daemon_nodemap = decode_daemon_nodemap, .generate_ppn = generate_ppn, @@ -98,7 +102,6 @@ OBJ_CLASS_INSTANCE(orte_regex_node_t, opal_list_item_t, orte_regex_node_construct, orte_regex_node_destruct); -static int extract_node_names(char *regexp, char ***names); static int nidmap_create(opal_pointer_array_t *pool, char **regex) { diff --git a/orte/mca/regx/regx.h b/orte/mca/regx/regx.h index 592ea6a6ea..2d3630e0b6 100644 --- a/orte/mca/regx/regx.h +++ b/orte/mca/regx/regx.h @@ -3,6 +3,8 @@ * Copyright (c) 2015-2018 Intel, Inc. All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -59,6 +61,7 @@ typedef int (*orte_regx_base_module_init_fn_t)(void); typedef int (*orte_regx_base_module_nidmap_create_fn_t)(opal_pointer_array_t *pool, char **regex); typedef int (*orte_regx_base_module_nidmap_parse_fn_t)(char *regex); +typedef int (*orte_regx_base_module_extract_node_names_fn_t)(char *regexp, char ***names); /* create a regular expression describing the nodes in the * allocation */ @@ -87,6 +90,7 @@ typedef struct { orte_regx_base_module_init_fn_t init; orte_regx_base_module_nidmap_create_fn_t nidmap_create; orte_regx_base_module_nidmap_parse_fn_t nidmap_parse; + orte_regx_base_module_extract_node_names_fn_t extract_node_names; orte_regx_base_module_encode_nodemap_fn_t encode_nodemap; orte_regx_base_module_decode_daemon_nodemap_fn_t decode_daemon_nodemap; orte_regx_base_module_build_daemon_nidmap_fn_t build_daemon_nidmap; From c2a358ff452f843ee3a309a7c2fee2fc3a0ba85d Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Fri, 12 Jan 2018 11:44:27 +0900 Subject: [PATCH 3/5] regx: move most functions from the fwd component to base Signed-off-by: Gilles Gouaillardet --- orte/mca/regx/base/Makefile.am | 3 + orte/mca/regx/base/base.h | 34 + orte/mca/regx/base/regx_base_default_fns.c | 1281 ++++++++++++++++++++ orte/mca/regx/fwd/regx_fwd.c | 1278 +------------------ 4 files changed, 1324 insertions(+), 1272 deletions(-) create mode 100644 orte/mca/regx/base/regx_base_default_fns.c diff --git a/orte/mca/regx/base/Makefile.am b/orte/mca/regx/base/Makefile.am index e0237653f4..cee4dd7ceb 100644 --- a/orte/mca/regx/base/Makefile.am +++ b/orte/mca/regx/base/Makefile.am @@ -1,5 +1,7 @@ # # Copyright (c) 2015-2018 Intel, Inc. All rights reserved. +# Copyright (c) 2018 Research Organization for Information Science +# and Technology (RIST). All rights reserved. 
# $COPYRIGHT$ # # Additional copyrights may follow @@ -11,5 +13,6 @@ headers += \ base/base.h libmca_regx_la_SOURCES += \ + base/regx_base_default_fns.c \ base/regx_base_frame.c \ base/regx_base_select.c diff --git a/orte/mca/regx/base/base.h b/orte/mca/regx/base/base.h index 29c9a286b0..a1d34e67c7 100644 --- a/orte/mca/regx/base/base.h +++ b/orte/mca/regx/base/base.h @@ -35,6 +35,40 @@ ORTE_DECLSPEC extern mca_base_framework_t orte_regx_base_framework; /* select all components */ ORTE_DECLSPEC int orte_regx_base_select(void); +/* + * common stuff + */ +typedef struct { + opal_list_item_t super; + int vpid; + int cnt; + int slots; + orte_topology_t *t; +} orte_regex_range_t; + +OBJ_CLASS_DECLARATION(orte_regex_range_t); + +typedef struct { + /* list object */ + opal_list_item_t super; + char *prefix; + char *suffix; + int num_digits; + opal_list_t ranges; +} orte_regex_node_t; END_C_DECLS +OBJ_CLASS_DECLARATION(orte_regex_node_t); + +ORTE_DECLSPEC extern int orte_regx_base_nidmap_parse(char *regex); + +ORTE_DECLSPEC extern int orte_regx_base_encode_nodemap(opal_buffer_t *buffer); + +ORTE_DECLSPEC int orte_regx_base_decode_daemon_nodemap(opal_buffer_t *buffer); + +ORTE_DECLSPEC int orte_regx_base_generate_ppn(orte_job_t *jdata, char **ppn); + +ORTE_DECLSPEC int orte_regx_base_parse_ppn(orte_job_t *jdata, char *regex); + +ORTE_DECLSPEC int orte_regx_base_extract_node_names(char *regexp, char ***names); #endif diff --git a/orte/mca/regx/base/regx_base_default_fns.c b/orte/mca/regx/base/regx_base_default_fns.c new file mode 100644 index 0000000000..6b70f78cad --- /dev/null +++ b/orte/mca/regx/base/regx_base_default_fns.c @@ -0,0 +1,1281 @@ +/* + * Copyright (c) 2016-2018 Intel, Inc. All rights reserved. + * Copyright (c) 2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + */ + +#include "orte_config.h" +#include "orte/types.h" +#include "opal/types.h" + +#ifdef HAVE_UNISTD_H +#include +#endif +#include + +#include "opal/util/argv.h" +#include "opal/util/basename.h" +#include "opal/util/opal_environ.h" + +#include "orte/runtime/orte_globals.h" +#include "orte/util/name_fns.h" +#include "orte/util/show_help.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/rmaps/base/base.h" +#include "orte/mca/routed/routed.h" +#include "orte/mca/regx/base/base.h" + +static void range_construct(orte_regex_range_t *ptr) +{ + ptr->vpid = 0; + ptr->cnt = 0; +} +OBJ_CLASS_INSTANCE(orte_regex_range_t, + opal_list_item_t, + range_construct, NULL); + +static void orte_regex_node_construct(orte_regex_node_t *ptr) +{ + ptr->prefix = NULL; + ptr->suffix = NULL; + ptr->num_digits = 0; + OBJ_CONSTRUCT(&ptr->ranges, opal_list_t); +} + +static void orte_regex_node_destruct(orte_regex_node_t *ptr) +{ + opal_list_item_t *item; + + if (NULL != ptr->prefix) { + free(ptr->prefix); + } + if (NULL != ptr->suffix) { + free(ptr->suffix); + } + + while (NULL != (item = opal_list_remove_first(&ptr->ranges))) { + OBJ_RELEASE(item); + } + OBJ_DESTRUCT(&ptr->ranges); +} + +OBJ_CLASS_INSTANCE(orte_regex_node_t, + opal_list_item_t, + orte_regex_node_construct, + orte_regex_node_destruct); + +int orte_regx_base_nidmap_parse(char *regex) +{ + char *nodelist, *vpids, *ptr; + char **nodes, **dvpids; + int rc, n, cnt; + orte_regex_range_t *rng; + opal_list_t dids; + orte_job_t *daemons; + orte_node_t *nd; + orte_proc_t *proc; + + /* if we are the HNP, we don't need to parse this */ + if (ORTE_PROC_IS_HNP) { + return ORTE_SUCCESS; + } + + /* split the regex into its node and vpid parts */ + nodelist = regex; + vpids = strchr(regex, '@'); + if (NULL == vpids) { + /* indicates the regex got mangled somewhere */ + return ORTE_ERR_BAD_PARAM; + } + *vpids = '\0'; // terminate the nodelist string + 
++vpids; // step over the separator + if (NULL == vpids || '\0' == *vpids) { + /* indicates the regex got mangled somewhere */ + return ORTE_ERR_BAD_PARAM; + } + + /* decompress the nodes regex */ + nodes = NULL; + if (ORTE_SUCCESS != (rc = orte_regx.extract_node_names(nodelist, &nodes))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + if (NULL == nodes) { + /* should not happen */ + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + + /* decompress the vpids */ + OBJ_CONSTRUCT(&dids, opal_list_t); + dvpids = opal_argv_split(vpids, ','); + for (n=0; NULL != dvpids[n]; n++) { + rng = OBJ_NEW(orte_regex_range_t); + opal_list_append(&dids, &rng->super); + /* check for a count */ + if (NULL != (ptr = strchr(dvpids[n], '('))) { + dvpids[n][strlen(dvpids[n])-1] = '\0'; // remove trailing paren + *ptr = '\0'; + ++ptr; + rng->cnt = strtoul(ptr, NULL, 10); + } else { + rng->cnt = 1; + } + /* convert the number */ + rng->vpid = strtoul(dvpids[n], NULL, 10); + } + opal_argv_free(dvpids); + + /* get the daemon job object */ + daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); + + /* create the node pool array - this will include + * _all_ nodes known to the allocation */ + rng = (orte_regex_range_t*)opal_list_get_first(&dids); + cnt = 0; + for (n=0; NULL != nodes[n]; n++) { + nd = OBJ_NEW(orte_node_t); + nd->name = nodes[n]; + opal_pointer_array_set_item(orte_node_pool, n, nd); + /* see if it has a daemon on it */ + if (-1 != rng->vpid) { + /* we have a daemon, so let's create the tracker for it */ + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, rng->vpid+cnt))) { + proc = OBJ_NEW(orte_proc_t); + proc->name.jobid = ORTE_PROC_MY_NAME->jobid; + proc->name.vpid = rng->vpid + cnt; + proc->state = ORTE_PROC_STATE_RUNNING; + ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_ALIVE); + daemons->num_procs++; + opal_pointer_array_set_item(daemons->procs, proc->name.vpid, proc); + } + nd->index = proc->name.vpid; + OBJ_RETAIN(nd); + proc->node 
= nd; + OBJ_RETAIN(proc); + nd->daemon = proc; + } + ++cnt; + if (rng->cnt <= cnt) { + rng = (orte_regex_range_t*)opal_list_get_next(&rng->super); + if (NULL == rng) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + cnt = 0; + } + } + + /* update num procs */ + if (orte_process_info.num_procs != daemons->num_procs) { + orte_process_info.num_procs = daemons->num_procs; + /* need to update the routing plan */ + orte_routed.update_routing_plan(NULL); + } + + if (orte_process_info.max_procs < orte_process_info.num_procs) { + orte_process_info.max_procs = orte_process_info.num_procs; + } + + if (0 < opal_output_get_verbosity(orte_regx_base_framework.framework_output)) { + int i; + for (i=0; i < orte_node_pool->size; i++) { + if (NULL == (nd = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { + continue; + } + opal_output(0, "%s node[%d].name %s daemon %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i, + (NULL == nd->name) ? "NULL" : nd->name, + (NULL == nd->daemon) ? 
"NONE" : ORTE_VPID_PRINT(nd->daemon->name.vpid)); + } + } + + return ORTE_SUCCESS; +} + +int orte_regx_base_encode_nodemap(opal_buffer_t *buffer) +{ + int n; + bool test; + orte_regex_range_t *rng, *slt, *tp, *flg; + opal_list_t slots, topos, flags; + opal_list_item_t *item; + char *tmp, *tmp2; + orte_node_t *nptr; + int rc; + uint8_t ui8; + orte_topology_t *ortetopo; + + /* setup the list of results */ + OBJ_CONSTRUCT(&slots, opal_list_t); + OBJ_CONSTRUCT(&topos, opal_list_t); + OBJ_CONSTRUCT(&flags, opal_list_t); + + slt = NULL; + tp = NULL; + flg = NULL; + + /* pack a flag indicating if the HNP was included in the allocation */ + if (orte_hnp_is_allocated) { + ui8 = 1; + } else { + ui8 = 0; + } + if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &ui8, 1, OPAL_UINT8))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* pack a flag indicating if we are in a managed allocation */ + if (orte_managed_allocation) { + ui8 = 1; + } else { + ui8 = 0; + } + if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &ui8, 1, OPAL_UINT8))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* handle the topologies - as the most common case by far + * is to have homogeneous topologies, we only send them + * if something is different. We know that the HNP is + * the first topology, and that any differing topology + * on the compute nodes must follow. 
So send the topologies + * if and only if: + * + * (a) the HNP is being used to house application procs and + * there is more than one topology on our list; or + * + * (b) the HNP is not being used, but there are more than + * two topologies on our list, thus indicating that + * there are multiple topologies on the compute nodes + */ + nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); + if (!orte_hnp_is_allocated || (ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL)) { + /* assign a NULL topology so we still account for our presence, + * but don't cause us to send topology info when not needed */ + tp = OBJ_NEW(orte_regex_range_t); + tp->t = NULL; + tp->cnt = 1; + } else { + /* there is always one topology - our own - so start with it */ + tp = OBJ_NEW(orte_regex_range_t); + tp->t = nptr->topology; + tp->cnt = 1; + } + opal_list_append(&topos, &tp->super); + + opal_output_verbose(5, orte_regx_base_framework.framework_output, + "%s STARTING WITH TOPOLOGY FOR NODE %s: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + nptr->name, (NULL == tp->t) ? 
"NULL" : tp->t->sig); + + /* likewise, we have slots */ + slt = OBJ_NEW(orte_regex_range_t); + slt->slots = nptr->slots; + slt->cnt = 1; + opal_list_append(&slots, &slt->super); + + /* and flags */ + flg = OBJ_NEW(orte_regex_range_t); + if (ORTE_FLAG_TEST(nptr, ORTE_NODE_FLAG_SLOTS_GIVEN)) { + flg->slots = 1; + } else { + flg->slots = 0; + } + flg->cnt = 1; + opal_list_append(&flags, &flg->super); + + for (n=1; n < orte_node_pool->size; n++) { + if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n))) { + continue; + } + /* check the #slots */ + /* is this the next in line */ + if (nptr->slots == slt->slots) { + slt->cnt++; + } else { + /* need to start another range */ + slt = OBJ_NEW(orte_regex_range_t); + slt->slots = nptr->slots; + slt->cnt = 1; + opal_list_append(&slots, &slt->super); + } + /* check the topologies */ + if (NULL != tp->t && NULL == nptr->topology) { + /* we don't know this topology, likely because + * we don't have a daemon on the node */ + tp = OBJ_NEW(orte_regex_range_t); + tp->t = NULL; + tp->cnt = 1; + opal_output_verbose(5, orte_regx_base_framework.framework_output, + "%s ADD TOPOLOGY FOR NODE %s: NULL", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nptr->name); + opal_list_append(&topos, &tp->super); + } else { + /* is this the next in line */ + if (tp->t == nptr->topology) { + tp->cnt++; + opal_output_verbose(5, orte_regx_base_framework.framework_output, + "%s CONTINUE TOPOLOGY RANGE (%d) WITH NODE %s: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + tp->cnt, nptr->name, + (NULL == tp->t) ? 
"N/A" : tp->t->sig); + } else { + /* need to start another range */ + tp = OBJ_NEW(orte_regex_range_t); + tp->t = nptr->topology; + tp->cnt = 1; + opal_output_verbose(5, orte_regx_base_framework.framework_output, + "%s STARTING NEW TOPOLOGY RANGE WITH NODE %s: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + nptr->name, tp->t->sig); + opal_list_append(&topos, &tp->super); + } + } + /* check the flags */ + test = ORTE_FLAG_TEST(nptr, ORTE_NODE_FLAG_SLOTS_GIVEN); + /* is this the next in line */ + if ((test && 1 == flg->slots) || + (!test && 0 == flg->slots)) { + flg->cnt++; + } else { + /* need to start another range */ + flg = OBJ_NEW(orte_regex_range_t); + if (test) { + flg->slots = 1; + } else { + flg->slots = 0; + } + flg->cnt = 1; + opal_list_append(&flags, &flg->super); + } + } + + /* pass #slots on each node */ + tmp = NULL; + while (NULL != (item = opal_list_remove_first(&slots))) { + rng = (orte_regex_range_t*)item; + if (NULL == tmp) { + asprintf(&tmp, "%d[%d]", rng->cnt, rng->slots); + } else { + asprintf(&tmp2, "%s,%d[%d]", tmp, rng->cnt, rng->slots); + free(tmp); + tmp = tmp2; + } + OBJ_RELEASE(rng); + } + OPAL_LIST_DESTRUCT(&slots); + opal_output_verbose(1, orte_regx_base_framework.framework_output, + "%s SLOT ASSIGNMENTS: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp); + /* pack the string */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if (NULL != tmp) { + free(tmp); + } + + /* do the same to pass the flags for each node */ + tmp = NULL; + while (NULL != (item = opal_list_remove_first(&flags))) { + rng = (orte_regex_range_t*)item; + if (NULL == tmp) { + asprintf(&tmp, "%d[%d]", rng->cnt, rng->slots); + } else { + asprintf(&tmp2, "%s,%d[%d]", tmp, rng->cnt, rng->slots); + free(tmp); + tmp = tmp2; + } + OBJ_RELEASE(rng); + } + OPAL_LIST_DESTRUCT(&flags); + + /* pack the string */ + opal_output_verbose(1, orte_regx_base_framework.framework_output, + "%s FLAG ASSIGNMENTS: %s", + 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp); + if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if (NULL != tmp) { + free(tmp); + } + + /* don't try to be cute - there aren't going to be that many + * topologies, so just scan the list and see if they are the + * same, excluding any NULL values */ + ortetopo = NULL; + test = false; + OPAL_LIST_FOREACH(rng, &topos, orte_regex_range_t) { + if (NULL == rng->t) { + continue; + } + if (NULL == ortetopo) { + ortetopo = rng->t; + } else if (0 != strcmp(ortetopo->sig, rng->t->sig)) { + /* we have a difference, so send them */ + test = true; + } + } + tmp = NULL; + if (test) { + opal_buffer_t bucket, *bptr; + OBJ_CONSTRUCT(&bucket, opal_buffer_t); + while (NULL != (item = opal_list_remove_first(&topos))) { + rng = (orte_regex_range_t*)item; + opal_output_verbose(5, orte_regx_base_framework.framework_output, + "%s PASSING TOPOLOGY %s RANGE %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == rng->t) ? 
"NULL" : rng->t->sig, rng->cnt); + if (NULL == tmp) { + asprintf(&tmp, "%d", rng->cnt); + } else { + asprintf(&tmp2, "%s,%d", tmp, rng->cnt); + free(tmp); + tmp = tmp2; + } + if (NULL == rng->t) { + /* need to account for NULL topology */ + opal_output_verbose(1, orte_regx_base_framework.framework_output, + "%s PACKING NULL TOPOLOGY", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + tmp2 = NULL; + if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &tmp2, 1, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(rng); + OPAL_LIST_DESTRUCT(&topos); + OBJ_DESTRUCT(&bucket); + free(tmp); + return rc; + } + } else { + opal_output_verbose(1, orte_regx_base_framework.framework_output, + "%s PACKING TOPOLOGY: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rng->t->sig); + /* pack this topology string */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &rng->t->sig, 1, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(rng); + OPAL_LIST_DESTRUCT(&topos); + OBJ_DESTRUCT(&bucket); + free(tmp); + return rc; + } + /* pack the topology itself */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &rng->t->topo, 1, OPAL_HWLOC_TOPO))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(rng); + OPAL_LIST_DESTRUCT(&topos); + OBJ_DESTRUCT(&bucket); + free(tmp); + return rc; + } + } + OBJ_RELEASE(rng); + } + OPAL_LIST_DESTRUCT(&topos); + /* pack the string */ + opal_output_verbose(1, orte_regx_base_framework.framework_output, + "%s TOPOLOGY ASSIGNMENTS: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp); + if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&bucket); + free(tmp); + return rc; + } + free(tmp); + + /* now pack the topologies */ + bptr = &bucket; + if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &bptr, 1, OPAL_BUFFER))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&bucket); + return rc; + } + OBJ_DESTRUCT(&bucket); + } else { + opal_output_verbose(1, orte_regx_base_framework.framework_output, + "%s NOT PASSING TOPOLOGIES", + 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + /* need to pack the NULL just to terminate the region */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + + return ORTE_SUCCESS; +} + +int orte_regx_base_decode_daemon_nodemap(opal_buffer_t *buffer) +{ + int n, nn, rc, cnt, offset; + orte_node_t *node; + char *slots=NULL, *topos=NULL, *flags=NULL; + char *rmndr, **tmp; + opal_list_t slts, flgs;; + opal_buffer_t *bptr=NULL; + orte_topology_t *t2; + orte_regex_range_t *rng, *srng, *frng; + uint8_t ui8; + + OBJ_CONSTRUCT(&slts, opal_list_t); + OBJ_CONSTRUCT(&flgs, opal_list_t); + + /* unpack the flag indicating if the HNP was allocated */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &ui8, &n, OPAL_UINT8))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + if (0 == ui8) { + orte_hnp_is_allocated = false; + } else { + orte_hnp_is_allocated = true; + } + + /* unpack the flag indicating we are in a managed allocation */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &ui8, &n, OPAL_UINT8))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + if (0 == ui8) { + orte_managed_allocation = false; + } else { + orte_managed_allocation = true; + } + + /* unpack the slots regex */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &slots, &n, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + /* this is not allowed to be NULL */ + if (NULL == slots) { + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + rc = ORTE_ERR_BAD_PARAM; + goto cleanup; + } + + /* unpack the flags regex */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &flags, &n, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + /* this is not allowed to be NULL */ + if (NULL == flags) { + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + rc = ORTE_ERR_BAD_PARAM; + goto cleanup; + } + + /* unpack the topos regex - this may not have been + * provided (e.g., for a homogeneous machine) */ + n = 1; + if (ORTE_SUCCESS != (rc = 
opal_dss.unpack(buffer, &topos, &n, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + if (NULL != topos) { + /* need to unpack the topologies */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &bptr, &n, OPAL_BUFFER))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + } + + /* if we are the HNP, then we just discard these strings as we already + * have a complete picture - but we needed to unpack them in order to + * maintain sync in the unpacking order */ + if (ORTE_PROC_IS_HNP) { + rc = ORTE_SUCCESS; + goto cleanup; + } + + /* decompress the slots */ + tmp = opal_argv_split(slots, ','); + for (n=0; NULL != tmp[n]; n++) { + rng = OBJ_NEW(orte_regex_range_t); + opal_list_append(&slts, &rng->super); + /* find the '[' as that delimits the value */ + rmndr = strchr(tmp[n], '['); + if (NULL == rmndr) { + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + rc = ORTE_ERR_BAD_PARAM; + opal_argv_free(tmp); + goto cleanup; + } + *rmndr = '\0'; + ++rmndr; + /* convert that number as this is the number of + * slots for this range */ + rng->slots = strtoul(rmndr, NULL, 10); + /* convert the initial number as that is the cnt */ + rng->cnt = strtoul(tmp[n], NULL, 10); + } + opal_argv_free(tmp); + + /* decompress the flags */ + tmp = opal_argv_split(flags, ','); + for (n=0; NULL != tmp[n]; n++) { + rng = OBJ_NEW(orte_regex_range_t); + opal_list_append(&flgs, &rng->super); + /* find the '[' as that delimits the value */ + rmndr = strchr(tmp[n], '['); + if (NULL == rmndr) { + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + opal_argv_free(tmp); + rc = ORTE_ERR_BAD_PARAM; + goto cleanup; + } + *rmndr = '\0'; + ++rmndr; + /* check the value - it is just one character */ + if ('1' == *rmndr) { + rng->slots = 1; + } else { + rng->slots = 0; + } + /* convert the initial number as that is the cnt */ + rng->cnt = strtoul(tmp[n], NULL, 10); + } + opal_argv_free(tmp); + free(flags); + + /* update the node array */ + srng = (orte_regex_range_t*)opal_list_get_first(&slts); + frng = 
(orte_regex_range_t*)opal_list_get_first(&flgs); + for (n=0; n < orte_node_pool->size; n++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n))) { + continue; + } + /* set the number of slots */ + node->slots = srng->slots; + srng->cnt--; + if (0 == srng->cnt) { + srng = (orte_regex_range_t*)opal_list_get_next(&srng->super); + } + /* set the flags */ + if (0 == frng->slots) { + ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_SLOTS_GIVEN); + } else { + ORTE_FLAG_SET(node, ORTE_NODE_FLAG_SLOTS_GIVEN); + } + frng->cnt--; + if (0 == frng->cnt) { + frng = (orte_regex_range_t*)opal_list_get_next(&frng->super); + } + } + + /* if no topology info was passed, then everyone shares our topology */ + if (NULL == bptr) { + /* our topology is first in the array */ + t2 = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0); + opal_output_verbose(1, orte_regx_base_framework.framework_output, + "%s ASSIGNING ALL TOPOLOGIES TO: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), t2->sig); + for (n=0; n < orte_node_pool->size; n++) { + if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n))) { + if (NULL == node->topology) { + OBJ_RETAIN(t2); + node->topology = t2; + } + } + } + } else { + char *sig; + hwloc_topology_t topo; + /* decompress the topology regex */ + tmp = opal_argv_split(topos, ','); + /* there must be a topology definition for each range */ + offset = 0; + for (nn=0; NULL != tmp[nn]; nn++) { + cnt = strtoul(tmp[nn], NULL, 10); + /* unpack the signature */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(bptr, &sig, &n, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + opal_argv_free(tmp); + OBJ_RELEASE(bptr); + goto cleanup; + } + if (NULL == sig) { + /* the nodes in this range have not reported a topology, + * so skip them */ + offset += cnt; + continue; + } + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(bptr, &topo, &n, OPAL_HWLOC_TOPO))) { + ORTE_ERROR_LOG(rc); + opal_argv_free(tmp); + OBJ_RELEASE(bptr); + 
free(sig); + goto cleanup; + } + /* see if we already have this topology - could be an update */ + t2 = NULL; + for (n=0; n < orte_node_topologies->size; n++) { + if (NULL == (t2 = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, n))) { + continue; + } + if (0 == strcmp(t2->sig, sig)) { + /* found a match */ + free(sig); + opal_hwloc_base_free_topology(topo); + sig = NULL; + break; + } + } + if (NULL != sig || NULL == t2) { + /* new topology - record it */ + t2 = OBJ_NEW(orte_topology_t); + t2->sig = sig; + t2->topo = topo; + opal_pointer_array_add(orte_node_topologies, t2); + } + /* point each of the nodes in this range to this topology */ + n=0; + while (n < cnt && (n+offset) < orte_node_pool->size) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n+offset))) { + continue; + } + opal_output_verbose(1, orte_regx_base_framework.framework_output, + "%s ASSIGNING NODE %s WITH TOPO: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + node->name, t2->sig); + if (NULL == node->topology) { + OBJ_RETAIN(t2); + node->topology = t2; + } + ++n; + } + offset += cnt; + } + OBJ_RELEASE(bptr); + opal_argv_free(tmp); + } + + cleanup: + OPAL_LIST_DESTRUCT(&slts); + OPAL_LIST_DESTRUCT(&flgs); + return rc; +} + +int orte_regx_base_generate_ppn(orte_job_t *jdata, char **ppn) +{ + orte_nidmap_regex_t *prng, **actives; + opal_list_t *prk; + orte_node_t *nptr; + orte_proc_t *proc; + size_t n; + int *cnt, i, k; + char *tmp2, *ptmp, **cache = NULL; + + /* create an array of lists to handle the number of app_contexts in this job */ + prk = (opal_list_t*)malloc(jdata->num_apps * sizeof(opal_list_t)); + cnt = (int*)malloc(jdata->num_apps * sizeof(int)); + actives = (orte_nidmap_regex_t**)malloc(jdata->num_apps * sizeof(orte_nidmap_regex_t*)); + for (n=0; n < jdata->num_apps; n++) { + OBJ_CONSTRUCT(&prk[n], opal_list_t); + actives[n] = NULL; + } + + /* we provide a complete map in the regex, with an entry for every + * node in the pool */ + for 
(i=0; i < orte_node_pool->size; i++) { + if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { + continue; + } + /* if a daemon has been assigned, then count how many procs + * for each app_context from the specified job are assigned to this node */ + memset(cnt, 0, jdata->num_apps * sizeof(int)); + if (NULL != nptr->daemon) { + for (k=0; k < nptr->procs->size; k++) { + if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(nptr->procs, k))) { + if (proc->name.jobid == jdata->jobid) { + ++cnt[proc->app_idx]; + } + } + } + } + /* track the #procs on this node */ + for (n=0; n < jdata->num_apps; n++) { + if (NULL == actives[n]) { + /* just starting */ + actives[n] = OBJ_NEW(orte_nidmap_regex_t); + actives[n]->nprocs = cnt[n]; + actives[n]->cnt = 1; + opal_list_append(&prk[n], &actives[n]->super); + } else { + /* is this the next in line */ + if (cnt[n] == actives[n]->nprocs) { + actives[n]->cnt++; + } else { + /* need to start another range */ + actives[n] = OBJ_NEW(orte_nidmap_regex_t); + actives[n]->nprocs = cnt[n]; + actives[n]->cnt = 1; + opal_list_append(&prk[n], &actives[n]->super); + } + } + } + } + + /* construct the regex from the found ranges for each app_context */ + ptmp = NULL; + for (n=0; n < jdata->num_apps; n++) { + OPAL_LIST_FOREACH(prng, &prk[n], orte_nidmap_regex_t) { + if (1 < prng->cnt) { + if (NULL == ptmp) { + asprintf(&ptmp, "%u(%u)", prng->nprocs, prng->cnt); + } else { + asprintf(&tmp2, "%s,%u(%u)", ptmp, prng->nprocs, prng->cnt); + free(ptmp); + ptmp = tmp2; + } + } else { + if (NULL == ptmp) { + asprintf(&ptmp, "%u", prng->nprocs); + } else { + asprintf(&tmp2, "%s,%u", ptmp, prng->nprocs); + free(ptmp); + ptmp = tmp2; + } + } + } + OPAL_LIST_DESTRUCT(&prk[n]); // releases all the actives objects + if (NULL != ptmp) { + opal_argv_append_nosize(&cache, ptmp); + free(ptmp); + ptmp = NULL; + } + } + free(prk); + free(cnt); + free(actives); + + *ppn = opal_argv_join(cache, '@'); + opal_argv_free(cache); + + 
return ORTE_SUCCESS; +} + /* Decode a procs-per-node regex for jdata. The string is split on '@' into one section per app_context, and each section on ',' into entries written as nprocs or nprocs(cnt). The entries are applied in order to the non-NULL slots of orte_node_pool, cnt nodes per entry, and nprocs orte_proc_t objects are created on each of those nodes. Returns ORTE_SUCCESS, or ORTE_ERR_NOT_FOUND if opal_list_get_next() returns NULL once an entry has been used up. */ +int orte_regx_base_parse_ppn(orte_job_t *jdata, char *regex) +{ + orte_node_t *node; + orte_proc_t *proc; + int n, k, m, cnt; + char **tmp, *ptr, **ppn; + orte_nidmap_regex_t *rng; + opal_list_t trk; + int rc = ORTE_SUCCESS; + + /* split the regex by app_context */ + tmp = opal_argv_split(regex, '@'); /* NOTE(review): tmp is indexed below without a NULL check */ + + /* for each app_context, set the ppn */ + for (n=0; NULL != tmp[n]; n++) { + ppn = opal_argv_split(tmp[n], ','); + /* decompress the ppn */ + OBJ_CONSTRUCT(&trk, opal_list_t); + for (m=0; NULL != ppn[m]; m++) { + rng = OBJ_NEW(orte_nidmap_regex_t); + opal_list_append(&trk, &rng->super); + /* check for a count */ + if (NULL != (ptr = strchr(ppn[m], '('))) { + ppn[m][strlen(ppn[m])-1] = '\0'; // remove trailing paren + *ptr = '\0'; + ++ptr; + rng->cnt = strtoul(ptr, NULL, 10); + } else { + rng->cnt = 1; + } + /* convert the number */ + rng->nprocs = strtoul(ppn[m], NULL, 10); + } + opal_argv_free(ppn); + + /* cycle thru our node pool and add the indicated number of procs + * to each node */ + rng = (orte_nidmap_regex_t*)opal_list_get_first(&trk); + cnt = 0; + for (m=0; m < orte_node_pool->size; m++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, m))) { + continue; + } + /* see if it has any procs for this job and app_context */ + if (0 < rng->nprocs) { + /* add this node to the job map if it isn't already there */ + if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) { + OBJ_RETAIN(node); + ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED); + opal_pointer_array_add(jdata->map->nodes, node); + } + /* create a proc object for each one */ + for (k=0; k < rng->nprocs; k++) { + proc = OBJ_NEW(orte_proc_t); + proc->name.jobid = jdata->jobid; + /* leave the vpid undefined as this will be determined + * later when we do the overall ranking */ + proc->app_idx = n; + proc->parent = node->daemon->name.vpid; /* NOTE(review): node->daemon is dereferenced without a NULL check */ + OBJ_RETAIN(node); + proc->node = node; + /* flag the proc as ready for launch */ + proc->state = 
ORTE_PROC_STATE_INIT; + opal_pointer_array_add(node->procs, proc); + /* we will add the proc to the jdata array when we + * compute its rank */ + } + node->num_procs += rng->nprocs; + } + ++cnt; /* cnt counts the nodes already covered by the current entry */ + if (rng->cnt <= cnt) { + rng = (orte_nidmap_regex_t*)opal_list_get_next(&rng->super); + if (NULL == rng) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + opal_argv_free(tmp); + rc = ORTE_ERR_NOT_FOUND; + goto complete; /* NOTE(review): this exit skips the OPAL_LIST_DESTRUCT(&trk) below */ + } + cnt = 0; + } + } + OPAL_LIST_DESTRUCT(&trk); + } + opal_argv_free(tmp); + + complete: + /* reset any node map flags we used so the next job will start clean */ + for (n=0; n < jdata->map->nodes->size; n++) { + if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, n))) { + ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); + } + } + + return rc; +} + + +static int regex_parse_node_range(char *base, char *range, int num_digits, char *suffix, char ***names); + +/* + * Parse one or more ranges in a set + * + * @param base The base text of the node name + * @param *ranges A pointer to a range. This can contain multiple ranges + * (i.e. 
"1-3,10" or "5" or "9,0100-0130,250") + * @param ***names An argv array to add the newly discovered nodes to + */ +static int regex_parse_node_ranges(char *base, char *ranges, int num_digits, char *suffix, char ***names) +{ /* base, num_digits, suffix and names are handed unchanged to regex_parse_node_range() for every piece; the first non-success result is logged and returned */ + int i, len, ret; + char *start, *orig; + + /* Look for commas, the separator between ranges; each comma is overwritten with '\0', so the ranges buffer is split in place */ + + len = strlen(ranges); + for (orig = start = ranges, i = 0; i < len; ++i) { + if (',' == ranges[i]) { + ranges[i] = '\0'; + ret = regex_parse_node_range(base, start, num_digits, suffix, names); + if (ORTE_SUCCESS != ret) { + ORTE_ERROR_LOG(ret); + return ret; + } + start = ranges + i + 1; /* the next piece begins just past the comma */ + } + } + + /* Pick up the last range, if it exists (start == orig + len when the input was empty or ended with a comma) */ + + if (start < orig + len) { + + OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + "%s regex:parse:ranges: parse range %s (2)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), start)); + + ret = regex_parse_node_range(base, start, num_digits, suffix, names); + if (ORTE_SUCCESS != ret) { + ORTE_ERROR_LOG(ret); + return ret; + } + } + + /* All done */ + return ORTE_SUCCESS; +} + + +/* + * Parse a single range in a set and add the full names of the nodes + * found to the names argv + * + * @param base The base text of the node name + * @param *ranges A pointer to a single range. (i.e. 
"1-3" or "5") + * @param ***names An argv array to add the newly discovered nodes to + */ +static int regex_parse_node_range(char *base, char *range, int num_digits, char *suffix, char ***names) +{ /* Each generated name is base, then the number zero-padded to num_digits, then suffix when it is non-NULL. Returns ORTE_ERROR for a NULL base or range, ORTE_ERR_NOT_FOUND when no first number is found or when characters follow it but no second number does, ORTE_ERR_OUT_OF_RESOURCE when malloc fails, or the error from opal_argv_append_nosize(). */ + char *str, tmp[132]; + size_t i, k, start, end; + size_t base_len, len; + bool found; + int ret; + + if (NULL == base || NULL == range) { + return ORTE_ERROR; + } + + len = strlen(range); + base_len = strlen(base); + /* Silence compiler warnings; start and end are always assigned + properly, below */ + start = end = 0; + + /* Look for the beginning of the first number */ + + for (found = false, i = 0; i < len; ++i) { + if (isdigit((int) range[i])) { + if (!found) { + start = atoi(range + i); + found = true; + break; + } + } + } + if (!found) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + + /* Look for the end of the first number */ + + for (found = false; i < len; ++i) { + if (!isdigit(range[i])) { + break; + } + } + + /* Was there no range, just a single number? */ + + if (i >= len) { + end = start; + found = true; + } else { + /* Nope, there was a range. 
Look for the beginning of the second + * number + */ + for (; i < len; ++i) { + if (isdigit(range[i])) { + end = strtol(range + i, NULL, 10); + found = true; + break; + } + } + } + if (!found) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + + /* Make strings for all values in the range; str is one scratch buffer reused for every value */ + + len = base_len + num_digits + 32; + if (NULL != suffix) { + len += strlen(suffix); + } + str = (char *) malloc(len); + if (NULL == str) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + for (i = start; i <= end; ++i) { /* emits nothing when end < start */ + memset(str, 0, len); + strcpy(str, base); + /* we need to zero-pad the digits */ + for (k=0; k < (size_t)num_digits; k++) { + str[k+base_len] = '0'; + } + memset(tmp, 0, 132); + snprintf(tmp, 132, "%lu", (unsigned long)i); + for (k=0; k < strlen(tmp); k++) { + str[base_len + num_digits - k - 1] = tmp[strlen(tmp)-k-1]; /* digits are copied right to left into the padded field. NOTE(review): if tmp is longer than num_digits the extra digits land at indexes below base_len, i.e. over the copied base, and the size_t index wraps once they outnumber base_len */ + } + /* if there is a suffix, add it */ + if (NULL != suffix) { + strcat(str, suffix); + } + ret = opal_argv_append_nosize(names, str); + if(ORTE_SUCCESS != ret) { + ORTE_ERROR_LOG(ret); + free(str); + return ret; + } + } + free(str); + + /* All done */ + return ORTE_SUCCESS; +} + +static int regex_parse_node_range(char *base, char *range, int num_digits, char *suffix, char ***names); + +int orte_regx_base_extract_node_names(char *regexp, char ***names) +{ + int i, j, k, len, ret; + char *base; + char *orig, *suffix; + bool found_range = false; + bool more_to_come = false; + int num_digits; + + if (NULL == regexp) { + *names = NULL; + return ORTE_SUCCESS; + } + + orig = base = strdup(regexp); + if (NULL == base) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + "%s regex:extract:nodenames: checking nodelist: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + regexp)); + + do { + /* Find the base */ + len = strlen(base); + for (i = 0; i <= len; ++i) { + if (base[i] == '[') { + /* we found a range. 
this gets dealt with below */ + base[i] = '\0'; + found_range = true; + break; + } + if (base[i] == ',') { + /* we found a singleton node, and there are more to come */ + base[i] = '\0'; + found_range = false; + more_to_come = true; + break; + } + if (base[i] == '\0') { + /* we found a singleton node */ + found_range = false; + more_to_come = false; + break; + } + } + if (i == 0 && !found_range) { + /* we found a special character at the beginning of the string */ + orte_show_help("help-regex.txt", "regex:special-char", true, regexp); + free(orig); + return ORTE_ERR_BAD_PARAM; + } + + if (found_range) { + /* If we found a range, get the number of digits in the numbers */ + i++; /* step over the [ */ + for (j=i; j < len; j++) { + if (base[j] == ':') { + base[j] = '\0'; + break; + } + } + if (j >= len) { + /* we didn't find the number of digits */ + orte_show_help("help-regex.txt", "regex:num-digits-missing", true, regexp); + free(orig); + return ORTE_ERR_BAD_PARAM; + } + num_digits = strtol(&base[i], NULL, 10); + i = j + 1; /* step over the : */ + /* now find the end of the range */ + for (j = i; j < len; ++j) { + if (base[j] == ']') { + base[j] = '\0'; + break; + } + } + if (j >= len) { + /* we didn't find the end of the range */ + orte_show_help("help-regex.txt", "regex:end-range-missing", true, regexp); + free(orig); + return ORTE_ERR_BAD_PARAM; + } + /* check for a suffix */ + if (j+1 < len && base[j+1] != ',') { + /* find the next comma, if present */ + for (k=j+1; k < len && base[k] != ','; k++); + if (k < len) { + base[k] = '\0'; + } + suffix = strdup(&base[j+1]); + if (k < len) { + base[k] = ','; + } + j = k-1; + } else { + suffix = NULL; + } + OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + "%s regex:extract:nodenames: parsing range %s %s %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + base, base + i, suffix)); + + ret = regex_parse_node_ranges(base, base + i, num_digits, suffix, names); + if (NULL != suffix) { + free(suffix); + } + if (ORTE_SUCCESS != ret) { + 
orte_show_help("help-regex.txt", "regex:bad-value", true, regexp); + free(orig); + return ret; + } + if (j+1 < len && base[j + 1] == ',') { + more_to_come = true; + base = &base[j + 2]; + } else { + more_to_come = false; + } + } else { + /* If we didn't find a range, just add the node */ + if(ORTE_SUCCESS != (ret = opal_argv_append_nosize(names, base))) { + ORTE_ERROR_LOG(ret); + free(orig); + return ret; + } + /* step over the comma */ + i++; + /* set base equal to the (possible) next base to look at */ + base = &base[i]; + } + } while(more_to_come); + + free(orig); + + /* All done */ + return ret; +} diff --git a/orte/mca/regx/fwd/regx_fwd.c b/orte/mca/regx/fwd/regx_fwd.c index 81b5e6eaed..893b96e0ae 100644 --- a/orte/mca/regx/fwd/regx_fwd.c +++ b/orte/mca/regx/fwd/regx_fwd.c @@ -34,75 +34,17 @@ #include "regx_fwd.h" static int nidmap_create(opal_pointer_array_t *pool, char **regex); -static int nidmap_parse(char *regex); -static int extract_node_names(char *regex, char ***node_names); -static int encode_nodemap(opal_buffer_t *buffer); -static int decode_daemon_nodemap(opal_buffer_t *buffer); -static int generate_ppn(orte_job_t *jdata, char **ppn); -static int parse_ppn(orte_job_t *jdata, char *ppn); orte_regx_base_module_t orte_regx_fwd_module = { .nidmap_create = nidmap_create, - .nidmap_parse = nidmap_parse, - .extract_node_names = extract_node_names, - .encode_nodemap = encode_nodemap, - .decode_daemon_nodemap = decode_daemon_nodemap, - .generate_ppn = generate_ppn, - .parse_ppn = parse_ppn + .nidmap_parse = orte_regx_base_nidmap_parse, + .extract_node_names = orte_regx_base_extract_node_names, + .encode_nodemap = orte_regx_base_encode_nodemap, + .decode_daemon_nodemap = orte_regx_base_decode_daemon_nodemap, + .generate_ppn = orte_regx_base_generate_ppn, + .parse_ppn = orte_regx_base_parse_ppn }; -typedef struct { - opal_list_item_t super; - int vpid; - int cnt; - int slots; - orte_topology_t *t; -} orte_regex_range_t; -static void 
range_construct(orte_regex_range_t *ptr) -{ - ptr->vpid = 0; - ptr->cnt = 0; -} -OBJ_CLASS_INSTANCE(orte_regex_range_t, - opal_list_item_t, - range_construct, NULL); - -typedef struct { - /* list object */ - opal_list_item_t super; - char *prefix; - char *suffix; - int num_digits; - opal_list_t ranges; -} orte_regex_node_t; -static void orte_regex_node_construct(orte_regex_node_t *ptr) -{ - ptr->prefix = NULL; - ptr->suffix = NULL; - ptr->num_digits = 0; - OBJ_CONSTRUCT(&ptr->ranges, opal_list_t); -} -static void orte_regex_node_destruct(orte_regex_node_t *ptr) -{ - opal_list_item_t *item; - - if (NULL != ptr->prefix) { - free(ptr->prefix); - } - if (NULL != ptr->suffix) { - free(ptr->suffix); - } - - while (NULL != (item = opal_list_remove_first(&ptr->ranges))) { - OBJ_RELEASE(item); - } - OBJ_DESTRUCT(&ptr->ranges); -} -OBJ_CLASS_INSTANCE(orte_regex_node_t, - opal_list_item_t, - orte_regex_node_construct, - orte_regex_node_destruct); - static int nidmap_create(opal_pointer_array_t *pool, char **regex) { char *node; @@ -373,1211 +315,3 @@ static int nidmap_create(opal_pointer_array_t *pool, char **regex) *regex = tmp2; return ORTE_SUCCESS; } - -static int nidmap_parse(char *regex) -{ - char *nodelist, *vpids, *ptr; - char **nodes, **dvpids; - int rc, n, cnt; - orte_regex_range_t *rng; - opal_list_t dids; - orte_job_t *daemons; - orte_node_t *nd; - orte_proc_t *proc; - - /* if we are the HNP, we don't need to parse this */ - if (ORTE_PROC_IS_HNP) { - return ORTE_SUCCESS; - } - - /* split the regex into its node and vpid parts */ - nodelist = regex; - vpids = strchr(regex, '@'); - if (NULL == vpids) { - /* indicates the regex got mangled somewhere */ - return ORTE_ERR_BAD_PARAM; - } - *vpids = '\0'; // terminate the nodelist string - ++vpids; // step over the separator - if (NULL == vpids || '\0' == *vpids) { - /* indicates the regex got mangled somewhere */ - return ORTE_ERR_BAD_PARAM; - } - - /* decompress the nodes regex */ - nodes = NULL; - if (ORTE_SUCCESS != 
(rc = extract_node_names(nodelist, &nodes))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - if (NULL == nodes) { - /* should not happen */ - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - - /* decompress the vpids */ - OBJ_CONSTRUCT(&dids, opal_list_t); - dvpids = opal_argv_split(vpids, ','); - for (n=0; NULL != dvpids[n]; n++) { - rng = OBJ_NEW(orte_regex_range_t); - opal_list_append(&dids, &rng->super); - /* check for a count */ - if (NULL != (ptr = strchr(dvpids[n], '('))) { - dvpids[n][strlen(dvpids[n])-1] = '\0'; // remove trailing paren - *ptr = '\0'; - ++ptr; - rng->cnt = strtoul(ptr, NULL, 10); - } else { - rng->cnt = 1; - } - /* convert the number */ - rng->vpid = strtoul(dvpids[n], NULL, 10); - } - opal_argv_free(dvpids); - - /* get the daemon job object */ - daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); - - /* create the node pool array - this will include - * _all_ nodes known to the allocation */ - rng = (orte_regex_range_t*)opal_list_get_first(&dids); - cnt = 0; - for (n=0; NULL != nodes[n]; n++) { - nd = OBJ_NEW(orte_node_t); - nd->name = nodes[n]; - opal_pointer_array_set_item(orte_node_pool, n, nd); - /* see if it has a daemon on it */ - if (-1 != rng->vpid) { - /* we have a daemon, so let's create the tracker for it */ - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, rng->vpid+cnt))) { - proc = OBJ_NEW(orte_proc_t); - proc->name.jobid = ORTE_PROC_MY_NAME->jobid; - proc->name.vpid = rng->vpid + cnt; - proc->state = ORTE_PROC_STATE_RUNNING; - ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_ALIVE); - daemons->num_procs++; - opal_pointer_array_set_item(daemons->procs, proc->name.vpid, proc); - } - nd->index = proc->name.vpid; - OBJ_RETAIN(nd); - proc->node = nd; - OBJ_RETAIN(proc); - nd->daemon = proc; - } - ++cnt; - if (rng->cnt <= cnt) { - rng = (orte_regex_range_t*)opal_list_get_next(&rng->super); - if (NULL == rng) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - cnt 
= 0; - } - } - - /* update num procs */ - if (orte_process_info.num_procs != daemons->num_procs) { - orte_process_info.num_procs = daemons->num_procs; - /* need to update the routing plan */ - orte_routed.update_routing_plan(NULL); - } - - if (orte_process_info.max_procs < orte_process_info.num_procs) { - orte_process_info.max_procs = orte_process_info.num_procs; - } - - if (0 < opal_output_get_verbosity(orte_regx_base_framework.framework_output)) { - int i; - for (i=0; i < orte_node_pool->size; i++) { - if (NULL == (nd = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { - continue; - } - opal_output(0, "%s node[%d].name %s daemon %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i, - (NULL == nd->name) ? "NULL" : nd->name, - (NULL == nd->daemon) ? "NONE" : ORTE_VPID_PRINT(nd->daemon->name.vpid)); - } - } - - return ORTE_SUCCESS; -} - -static int encode_nodemap(opal_buffer_t *buffer) -{ - int n; - bool test; - orte_regex_range_t *rng, *slt, *tp, *flg; - opal_list_t slots, topos, flags; - opal_list_item_t *item; - char *tmp, *tmp2; - orte_node_t *nptr; - int rc; - uint8_t ui8; - orte_topology_t *ortetopo; - - /* setup the list of results */ - OBJ_CONSTRUCT(&slots, opal_list_t); - OBJ_CONSTRUCT(&topos, opal_list_t); - OBJ_CONSTRUCT(&flags, opal_list_t); - - slt = NULL; - tp = NULL; - flg = NULL; - - /* pack a flag indicating if the HNP was included in the allocation */ - if (orte_hnp_is_allocated) { - ui8 = 1; - } else { - ui8 = 0; - } - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &ui8, 1, OPAL_UINT8))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* pack a flag indicating if we are in a managed allocation */ - if (orte_managed_allocation) { - ui8 = 1; - } else { - ui8 = 0; - } - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &ui8, 1, OPAL_UINT8))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* handle the topologies - as the most common case by far - * is to have homogeneous topologies, we only send them - * if something is different. 
We know that the HNP is - * the first topology, and that any differing topology - * on the compute nodes must follow. So send the topologies - * if and only if: - * - * (a) the HNP is being used to house application procs and - * there is more than one topology on our list; or - * - * (b) the HNP is not being used, but there are more than - * two topologies on our list, thus indicating that - * there are multiple topologies on the compute nodes - */ - nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); - if (!orte_hnp_is_allocated || (ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL)) { - /* assign a NULL topology so we still account for our presence, - * but don't cause us to send topology info when not needed */ - tp = OBJ_NEW(orte_regex_range_t); - tp->t = NULL; - tp->cnt = 1; - } else { - /* there is always one topology - our own - so start with it */ - tp = OBJ_NEW(orte_regex_range_t); - tp->t = nptr->topology; - tp->cnt = 1; - } - opal_list_append(&topos, &tp->super); - - opal_output_verbose(5, orte_regx_base_framework.framework_output, - "%s STARTING WITH TOPOLOGY FOR NODE %s: %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - nptr->name, (NULL == tp->t) ? 
"NULL" : tp->t->sig); - - /* likewise, we have slots */ - slt = OBJ_NEW(orte_regex_range_t); - slt->slots = nptr->slots; - slt->cnt = 1; - opal_list_append(&slots, &slt->super); - - /* and flags */ - flg = OBJ_NEW(orte_regex_range_t); - if (ORTE_FLAG_TEST(nptr, ORTE_NODE_FLAG_SLOTS_GIVEN)) { - flg->slots = 1; - } else { - flg->slots = 0; - } - flg->cnt = 1; - opal_list_append(&flags, &flg->super); - - for (n=1; n < orte_node_pool->size; n++) { - if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n))) { - continue; - } - /* check the #slots */ - /* is this the next in line */ - if (nptr->slots == slt->slots) { - slt->cnt++; - } else { - /* need to start another range */ - slt = OBJ_NEW(orte_regex_range_t); - slt->slots = nptr->slots; - slt->cnt = 1; - opal_list_append(&slots, &slt->super); - } - /* check the topologies */ - if (NULL != tp->t && NULL == nptr->topology) { - /* we don't know this topology, likely because - * we don't have a daemon on the node */ - tp = OBJ_NEW(orte_regex_range_t); - tp->t = NULL; - tp->cnt = 1; - opal_output_verbose(5, orte_regx_base_framework.framework_output, - "%s ADD TOPOLOGY FOR NODE %s: NULL", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nptr->name); - opal_list_append(&topos, &tp->super); - } else { - /* is this the next in line */ - if (tp->t == nptr->topology) { - tp->cnt++; - opal_output_verbose(5, orte_regx_base_framework.framework_output, - "%s CONTINUE TOPOLOGY RANGE (%d) WITH NODE %s: %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - tp->cnt, nptr->name, - (NULL == tp->t) ? 
"N/A" : tp->t->sig); - } else { - /* need to start another range */ - tp = OBJ_NEW(orte_regex_range_t); - tp->t = nptr->topology; - tp->cnt = 1; - opal_output_verbose(5, orte_regx_base_framework.framework_output, - "%s STARTING NEW TOPOLOGY RANGE WITH NODE %s: %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - nptr->name, tp->t->sig); - opal_list_append(&topos, &tp->super); - } - } - /* check the flags */ - test = ORTE_FLAG_TEST(nptr, ORTE_NODE_FLAG_SLOTS_GIVEN); - /* is this the next in line */ - if ((test && 1 == flg->slots) || - (!test && 0 == flg->slots)) { - flg->cnt++; - } else { - /* need to start another range */ - flg = OBJ_NEW(orte_regex_range_t); - if (test) { - flg->slots = 1; - } else { - flg->slots = 0; - } - flg->cnt = 1; - opal_list_append(&flags, &flg->super); - } - } - - /* pass #slots on each node */ - tmp = NULL; - while (NULL != (item = opal_list_remove_first(&slots))) { - rng = (orte_regex_range_t*)item; - if (NULL == tmp) { - asprintf(&tmp, "%d[%d]", rng->cnt, rng->slots); - } else { - asprintf(&tmp2, "%s,%d[%d]", tmp, rng->cnt, rng->slots); - free(tmp); - tmp = tmp2; - } - OBJ_RELEASE(rng); - } - OPAL_LIST_DESTRUCT(&slots); - opal_output_verbose(1, orte_regx_base_framework.framework_output, - "%s SLOT ASSIGNMENTS: %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp); - /* pack the string */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - return rc; - } - if (NULL != tmp) { - free(tmp); - } - - /* do the same to pass the flags for each node */ - tmp = NULL; - while (NULL != (item = opal_list_remove_first(&flags))) { - rng = (orte_regex_range_t*)item; - if (NULL == tmp) { - asprintf(&tmp, "%d[%d]", rng->cnt, rng->slots); - } else { - asprintf(&tmp2, "%s,%d[%d]", tmp, rng->cnt, rng->slots); - free(tmp); - tmp = tmp2; - } - OBJ_RELEASE(rng); - } - OPAL_LIST_DESTRUCT(&flags); - - /* pack the string */ - opal_output_verbose(1, orte_regx_base_framework.framework_output, - "%s FLAG ASSIGNMENTS: %s", - 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp); - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - return rc; - } - if (NULL != tmp) { - free(tmp); - } - - /* don't try to be cute - there aren't going to be that many - * topologies, so just scan the list and see if they are the - * same, excluding any NULL values */ - ortetopo = NULL; - test = false; - OPAL_LIST_FOREACH(rng, &topos, orte_regex_range_t) { - if (NULL == rng->t) { - continue; - } - if (NULL == ortetopo) { - ortetopo = rng->t; - } else if (0 != strcmp(ortetopo->sig, rng->t->sig)) { - /* we have a difference, so send them */ - test = true; - } - } - tmp = NULL; - if (test) { - opal_buffer_t bucket, *bptr; - OBJ_CONSTRUCT(&bucket, opal_buffer_t); - while (NULL != (item = opal_list_remove_first(&topos))) { - rng = (orte_regex_range_t*)item; - opal_output_verbose(5, orte_regx_base_framework.framework_output, - "%s PASSING TOPOLOGY %s RANGE %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (NULL == rng->t) ? 
"NULL" : rng->t->sig, rng->cnt); - if (NULL == tmp) { - asprintf(&tmp, "%d", rng->cnt); - } else { - asprintf(&tmp2, "%s,%d", tmp, rng->cnt); - free(tmp); - tmp = tmp2; - } - if (NULL == rng->t) { - /* need to account for NULL topology */ - opal_output_verbose(1, orte_regx_base_framework.framework_output, - "%s PACKING NULL TOPOLOGY", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - tmp2 = NULL; - if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &tmp2, 1, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(rng); - OPAL_LIST_DESTRUCT(&topos); - OBJ_DESTRUCT(&bucket); - free(tmp); - return rc; - } - } else { - opal_output_verbose(1, orte_regx_base_framework.framework_output, - "%s PACKING TOPOLOGY: %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rng->t->sig); - /* pack this topology string */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &rng->t->sig, 1, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(rng); - OPAL_LIST_DESTRUCT(&topos); - OBJ_DESTRUCT(&bucket); - free(tmp); - return rc; - } - /* pack the topology itself */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &rng->t->topo, 1, OPAL_HWLOC_TOPO))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(rng); - OPAL_LIST_DESTRUCT(&topos); - OBJ_DESTRUCT(&bucket); - free(tmp); - return rc; - } - } - OBJ_RELEASE(rng); - } - OPAL_LIST_DESTRUCT(&topos); - /* pack the string */ - opal_output_verbose(1, orte_regx_base_framework.framework_output, - "%s TOPOLOGY ASSIGNMENTS: %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp); - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&bucket); - free(tmp); - return rc; - } - free(tmp); - - /* now pack the topologies */ - bptr = &bucket; - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &bptr, 1, OPAL_BUFFER))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&bucket); - return rc; - } - OBJ_DESTRUCT(&bucket); - } else { - opal_output_verbose(1, orte_regx_base_framework.framework_output, - "%s NOT PASSING TOPOLOGIES", - 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - /* need to pack the NULL just to terminate the region */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - - return ORTE_SUCCESS; -} - -static int decode_daemon_nodemap(opal_buffer_t *buffer) -{ - int n, nn, rc, cnt, offset; - orte_node_t *node; - char *slots=NULL, *topos=NULL, *flags=NULL; - char *rmndr, **tmp; - opal_list_t slts, flgs;; - opal_buffer_t *bptr=NULL; - orte_topology_t *t2; - orte_regex_range_t *rng, *srng, *frng; - uint8_t ui8; - - OBJ_CONSTRUCT(&slts, opal_list_t); - OBJ_CONSTRUCT(&flgs, opal_list_t); - - /* unpack the flag indicating if the HNP was allocated */ - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &ui8, &n, OPAL_UINT8))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - if (0 == ui8) { - orte_hnp_is_allocated = false; - } else { - orte_hnp_is_allocated = true; - } - - /* unpack the flag indicating we are in a managed allocation */ - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &ui8, &n, OPAL_UINT8))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - if (0 == ui8) { - orte_managed_allocation = false; - } else { - orte_managed_allocation = true; - } - - /* unpack the slots regex */ - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &slots, &n, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - /* this is not allowed to be NULL */ - if (NULL == slots) { - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - rc = ORTE_ERR_BAD_PARAM; - goto cleanup; - } - - /* unpack the flags regex */ - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &flags, &n, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - /* this is not allowed to be NULL */ - if (NULL == flags) { - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - rc = ORTE_ERR_BAD_PARAM; - goto cleanup; - } - - /* unpack the topos regex - this may not have been - * provided (e.g., for a homogeneous machine) */ - n = 1; - if (ORTE_SUCCESS != (rc = 
opal_dss.unpack(buffer, &topos, &n, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - if (NULL != topos) { - /* need to unpack the topologies */ - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &bptr, &n, OPAL_BUFFER))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - } - - /* if we are the HNP, then we just discard these strings as we already - * have a complete picture - but we needed to unpack them in order to - * maintain sync in the unpacking order */ - if (ORTE_PROC_IS_HNP) { - rc = ORTE_SUCCESS; - goto cleanup; - } - - /* decompress the slots */ - tmp = opal_argv_split(slots, ','); - for (n=0; NULL != tmp[n]; n++) { - rng = OBJ_NEW(orte_regex_range_t); - opal_list_append(&slts, &rng->super); - /* find the '[' as that delimits the value */ - rmndr = strchr(tmp[n], '['); - if (NULL == rmndr) { - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - rc = ORTE_ERR_BAD_PARAM; - opal_argv_free(tmp); - goto cleanup; - } - *rmndr = '\0'; - ++rmndr; - /* convert that number as this is the number of - * slots for this range */ - rng->slots = strtoul(rmndr, NULL, 10); - /* convert the initial number as that is the cnt */ - rng->cnt = strtoul(tmp[n], NULL, 10); - } - opal_argv_free(tmp); - - /* decompress the flags */ - tmp = opal_argv_split(flags, ','); - for (n=0; NULL != tmp[n]; n++) { - rng = OBJ_NEW(orte_regex_range_t); - opal_list_append(&flgs, &rng->super); - /* find the '[' as that delimits the value */ - rmndr = strchr(tmp[n], '['); - if (NULL == rmndr) { - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - opal_argv_free(tmp); - rc = ORTE_ERR_BAD_PARAM; - goto cleanup; - } - *rmndr = '\0'; - ++rmndr; - /* check the value - it is just one character */ - if ('1' == *rmndr) { - rng->slots = 1; - } else { - rng->slots = 0; - } - /* convert the initial number as that is the cnt */ - rng->cnt = strtoul(tmp[n], NULL, 10); - } - opal_argv_free(tmp); - free(flags); - - /* update the node array */ - srng = (orte_regex_range_t*)opal_list_get_first(&slts); - frng = 
(orte_regex_range_t*)opal_list_get_first(&flgs); - for (n=0; n < orte_node_pool->size; n++) { - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n))) { - continue; - } - /* set the number of slots */ - node->slots = srng->slots; - srng->cnt--; - if (0 == srng->cnt) { - srng = (orte_regex_range_t*)opal_list_get_next(&srng->super); - } - /* set the flags */ - if (0 == frng->slots) { - ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_SLOTS_GIVEN); - } else { - ORTE_FLAG_SET(node, ORTE_NODE_FLAG_SLOTS_GIVEN); - } - frng->cnt--; - if (0 == frng->cnt) { - frng = (orte_regex_range_t*)opal_list_get_next(&frng->super); - } - } - - /* if no topology info was passed, then everyone shares our topology */ - if (NULL == bptr) { - /* our topology is first in the array */ - t2 = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0); - opal_output_verbose(1, orte_regx_base_framework.framework_output, - "%s ASSIGNING ALL TOPOLOGIES TO: %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), t2->sig); - for (n=0; n < orte_node_pool->size; n++) { - if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n))) { - if (NULL == node->topology) { - OBJ_RETAIN(t2); - node->topology = t2; - } - } - } - } else { - char *sig; - hwloc_topology_t topo; - /* decompress the topology regex */ - tmp = opal_argv_split(topos, ','); - /* there must be a topology definition for each range */ - offset = 0; - for (nn=0; NULL != tmp[nn]; nn++) { - cnt = strtoul(tmp[nn], NULL, 10); - /* unpack the signature */ - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(bptr, &sig, &n, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - opal_argv_free(tmp); - OBJ_RELEASE(bptr); - goto cleanup; - } - if (NULL == sig) { - /* the nodes in this range have not reported a topology, - * so skip them */ - offset += cnt; - continue; - } - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(bptr, &topo, &n, OPAL_HWLOC_TOPO))) { - ORTE_ERROR_LOG(rc); - opal_argv_free(tmp); - OBJ_RELEASE(bptr); - 
free(sig); - goto cleanup; - } - /* see if we already have this topology - could be an update */ - t2 = NULL; - for (n=0; n < orte_node_topologies->size; n++) { - if (NULL == (t2 = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, n))) { - continue; - } - if (0 == strcmp(t2->sig, sig)) { - /* found a match */ - free(sig); - opal_hwloc_base_free_topology(topo); - sig = NULL; - break; - } - } - if (NULL != sig || NULL == t2) { - /* new topology - record it */ - t2 = OBJ_NEW(orte_topology_t); - t2->sig = sig; - t2->topo = topo; - opal_pointer_array_add(orte_node_topologies, t2); - } - /* point each of the nodes in this range to this topology */ - n=0; - while (n < cnt && (n+offset) < orte_node_pool->size) { - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n+offset))) { - continue; - } - opal_output_verbose(1, orte_regx_base_framework.framework_output, - "%s ASSIGNING NODE %s WITH TOPO: %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - node->name, t2->sig); - if (NULL == node->topology) { - OBJ_RETAIN(t2); - node->topology = t2; - } - ++n; - } - offset += cnt; - } - OBJ_RELEASE(bptr); - opal_argv_free(tmp); - } - - cleanup: - OPAL_LIST_DESTRUCT(&slts); - OPAL_LIST_DESTRUCT(&flgs); - return rc; -} -static int generate_ppn(orte_job_t *jdata, char **ppn) -{ - orte_nidmap_regex_t *prng, **actives; - opal_list_t *prk; - orte_node_t *nptr; - orte_proc_t *proc; - size_t n; - int *cnt, i, k; - char *tmp2, *ptmp, **cache = NULL; - - /* create an array of lists to handle the number of app_contexts in this job */ - prk = (opal_list_t*)malloc(jdata->num_apps * sizeof(opal_list_t)); - cnt = (int*)malloc(jdata->num_apps * sizeof(int)); - actives = (orte_nidmap_regex_t**)malloc(jdata->num_apps * sizeof(orte_nidmap_regex_t*)); - for (n=0; n < jdata->num_apps; n++) { - OBJ_CONSTRUCT(&prk[n], opal_list_t); - actives[n] = NULL; - } - - /* we provide a complete map in the regex, with an entry for every - * node in the pool */ - for (i=0; i < 
orte_node_pool->size; i++) { - if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { - continue; - } - /* if a daemon has been assigned, then count how many procs - * for each app_context from the specified job are assigned to this node */ - memset(cnt, 0, jdata->num_apps * sizeof(int)); - if (NULL != nptr->daemon) { - for (k=0; k < nptr->procs->size; k++) { - if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(nptr->procs, k))) { - if (proc->name.jobid == jdata->jobid) { - ++cnt[proc->app_idx]; - } - } - } - } - /* track the #procs on this node */ - for (n=0; n < jdata->num_apps; n++) { - if (NULL == actives[n]) { - /* just starting */ - actives[n] = OBJ_NEW(orte_nidmap_regex_t); - actives[n]->nprocs = cnt[n]; - actives[n]->cnt = 1; - opal_list_append(&prk[n], &actives[n]->super); - } else { - /* is this the next in line */ - if (cnt[n] == actives[n]->nprocs) { - actives[n]->cnt++; - } else { - /* need to start another range */ - actives[n] = OBJ_NEW(orte_nidmap_regex_t); - actives[n]->nprocs = cnt[n]; - actives[n]->cnt = 1; - opal_list_append(&prk[n], &actives[n]->super); - } - } - } - } - - /* construct the regex from the found ranges for each app_context */ - ptmp = NULL; - for (n=0; n < jdata->num_apps; n++) { - OPAL_LIST_FOREACH(prng, &prk[n], orte_nidmap_regex_t) { - if (1 < prng->cnt) { - if (NULL == ptmp) { - asprintf(&ptmp, "%u(%u)", prng->nprocs, prng->cnt); - } else { - asprintf(&tmp2, "%s,%u(%u)", ptmp, prng->nprocs, prng->cnt); - free(ptmp); - ptmp = tmp2; - } - } else { - if (NULL == ptmp) { - asprintf(&ptmp, "%u", prng->nprocs); - } else { - asprintf(&tmp2, "%s,%u", ptmp, prng->nprocs); - free(ptmp); - ptmp = tmp2; - } - } - } - OPAL_LIST_DESTRUCT(&prk[n]); // releases all the actives objects - if (NULL != ptmp) { - opal_argv_append_nosize(&cache, ptmp); - free(ptmp); - ptmp = NULL; - } - } - free(prk); - free(cnt); - free(actives); - - *ppn = opal_argv_join(cache, '@'); - opal_argv_free(cache); - - return 
ORTE_SUCCESS; -} - -static int parse_ppn(orte_job_t *jdata, char *regex) -{ - orte_node_t *node; - orte_proc_t *proc; - int n, k, m, cnt; - char **tmp, *ptr, **ppn; - orte_nidmap_regex_t *rng; - opal_list_t trk; - int rc = ORTE_SUCCESS; - - /* split the regex by app_context */ - tmp = opal_argv_split(regex, '@'); - - /* for each app_context, set the ppn */ - for (n=0; NULL != tmp[n]; n++) { - ppn = opal_argv_split(tmp[n], ','); - /* decompress the ppn */ - OBJ_CONSTRUCT(&trk, opal_list_t); - for (m=0; NULL != ppn[m]; m++) { - rng = OBJ_NEW(orte_nidmap_regex_t); - opal_list_append(&trk, &rng->super); - /* check for a count */ - if (NULL != (ptr = strchr(ppn[m], '('))) { - ppn[m][strlen(ppn[m])-1] = '\0'; // remove trailing paren - *ptr = '\0'; - ++ptr; - rng->cnt = strtoul(ptr, NULL, 10); - } else { - rng->cnt = 1; - } - /* convert the number */ - rng->nprocs = strtoul(ppn[m], NULL, 10); - } - opal_argv_free(ppn); - - /* cycle thru our node pool and add the indicated number of procs - * to each node */ - rng = (orte_nidmap_regex_t*)opal_list_get_first(&trk); - cnt = 0; - for (m=0; m < orte_node_pool->size; m++) { - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, m))) { - continue; - } - /* see if it has any procs for this job and app_context */ - if (0 < rng->nprocs) { - /* add this node to the job map if it isn't already there */ - if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) { - OBJ_RETAIN(node); - ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED); - opal_pointer_array_add(jdata->map->nodes, node); - } - /* create a proc object for each one */ - for (k=0; k < rng->nprocs; k++) { - proc = OBJ_NEW(orte_proc_t); - proc->name.jobid = jdata->jobid; - /* leave the vpid undefined as this will be determined - * later when we do the overall ranking */ - proc->app_idx = n; - proc->parent = node->daemon->name.vpid; - OBJ_RETAIN(node); - proc->node = node; - /* flag the proc as ready for launch */ - proc->state = ORTE_PROC_STATE_INIT; - 
opal_pointer_array_add(node->procs, proc); - /* we will add the proc to the jdata array when we - * compute its rank */ - } - node->num_procs += rng->nprocs; - } - ++cnt; - if (rng->cnt <= cnt) { - rng = (orte_nidmap_regex_t*)opal_list_get_next(&rng->super); - if (NULL == rng) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - opal_argv_free(tmp); - rc = ORTE_ERR_NOT_FOUND; - goto complete; - } - cnt = 0; - } - } - OPAL_LIST_DESTRUCT(&trk); - } - opal_argv_free(tmp); - - complete: - /* reset any node map flags we used so the next job will start clean */ - for (n=0; n < jdata->map->nodes->size; n++) { - if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, n))) { - ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); - } - } - - return rc; -} - - -static int regex_parse_node_ranges(char *base, char *ranges, int num_digits, char *suffix, char ***names); -static int regex_parse_node_range(char *base, char *range, int num_digits, char *suffix, char ***names); - -static int extract_node_names(char *regexp, char ***names) -{ - int i, j, k, len, ret; - char *base; - char *orig, *suffix; - bool found_range = false; - bool more_to_come = false; - int num_digits; - - if (NULL == regexp) { - *names = NULL; - return ORTE_SUCCESS; - } - - orig = base = strdup(regexp); - if (NULL == base) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, - "%s regex:extract:nodenames: checking nodelist: %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - regexp)); - - do { - /* Find the base */ - len = strlen(base); - for (i = 0; i <= len; ++i) { - if (base[i] == '[') { - /* we found a range. 
this gets dealt with below */ - base[i] = '\0'; - found_range = true; - break; - } - if (base[i] == ',') { - /* we found a singleton node, and there are more to come */ - base[i] = '\0'; - found_range = false; - more_to_come = true; - break; - } - if (base[i] == '\0') { - /* we found a singleton node */ - found_range = false; - more_to_come = false; - break; - } - } - if (i == 0 && !found_range) { - /* we found a special character at the beginning of the string */ - orte_show_help("help-regex.txt", "regex:special-char", true, regexp); - free(orig); - return ORTE_ERR_BAD_PARAM; - } - - if (found_range) { - /* If we found a range, get the number of digits in the numbers */ - i++; /* step over the [ */ - for (j=i; j < len; j++) { - if (base[j] == ':') { - base[j] = '\0'; - break; - } - } - if (j >= len) { - /* we didn't find the number of digits */ - orte_show_help("help-regex.txt", "regex:num-digits-missing", true, regexp); - free(orig); - return ORTE_ERR_BAD_PARAM; - } - num_digits = strtol(&base[i], NULL, 10); - i = j + 1; /* step over the : */ - /* now find the end of the range */ - for (j = i; j < len; ++j) { - if (base[j] == ']') { - base[j] = '\0'; - break; - } - } - if (j >= len) { - /* we didn't find the end of the range */ - orte_show_help("help-regex.txt", "regex:end-range-missing", true, regexp); - free(orig); - return ORTE_ERR_BAD_PARAM; - } - /* check for a suffix */ - if (j+1 < len && base[j+1] != ',') { - /* find the next comma, if present */ - for (k=j+1; k < len && base[k] != ','; k++); - if (k < len) { - base[k] = '\0'; - } - suffix = strdup(&base[j+1]); - if (k < len) { - base[k] = ','; - } - j = k-1; - } else { - suffix = NULL; - } - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, - "%s regex:extract:nodenames: parsing range %s %s %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - base, base + i, suffix)); - - ret = regex_parse_node_ranges(base, base + i, num_digits, suffix, names); - if (NULL != suffix) { - free(suffix); - } - if (ORTE_SUCCESS != ret) { - 
orte_show_help("help-regex.txt", "regex:bad-value", true, regexp); - free(orig); - return ret; - } - if (j+1 < len && base[j + 1] == ',') { - more_to_come = true; - base = &base[j + 2]; - } else { - more_to_come = false; - } - } else { - /* If we didn't find a range, just add the node */ - if(ORTE_SUCCESS != (ret = opal_argv_append_nosize(names, base))) { - ORTE_ERROR_LOG(ret); - free(orig); - return ret; - } - /* step over the comma */ - i++; - /* set base equal to the (possible) next base to look at */ - base = &base[i]; - } - } while(more_to_come); - - free(orig); - - /* All done */ - return ret; -} - -/* - * Parse one or more ranges in a set - * - * @param base The base text of the node name - * @param *ranges A pointer to a range. This can contain multiple ranges - * (i.e. "1-3,10" or "5" or "9,0100-0130,250") - * @param ***names An argv array to add the newly discovered nodes to - */ -static int regex_parse_node_ranges(char *base, char *ranges, int num_digits, char *suffix, char ***names) -{ - int i, len, ret; - char *start, *orig; - - /* Look for commas, the separator between ranges */ - - len = strlen(ranges); - for (orig = start = ranges, i = 0; i < len; ++i) { - if (',' == ranges[i]) { - ranges[i] = '\0'; - ret = regex_parse_node_range(base, start, num_digits, suffix, names); - if (ORTE_SUCCESS != ret) { - ORTE_ERROR_LOG(ret); - return ret; - } - start = ranges + i + 1; - } - } - - /* Pick up the last range, if it exists */ - - if (start < orig + len) { - - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, - "%s regex:parse:ranges: parse range %s (2)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), start)); - - ret = regex_parse_node_range(base, start, num_digits, suffix, names); - if (ORTE_SUCCESS != ret) { - ORTE_ERROR_LOG(ret); - return ret; - } - } - - /* All done */ - return ORTE_SUCCESS; -} - - -/* - * Parse a single range in a set and add the full names of the nodes - * found to the names argv - * - * @param base The base text of the node name - * @param *ranges A 
pointer to a single range. (i.e. "1-3" or "5") - * @param ***names An argv array to add the newly discovered nodes to - */ -static int regex_parse_node_range(char *base, char *range, int num_digits, char *suffix, char ***names) -{ - char *str, tmp[132]; - size_t i, k, start, end; - size_t base_len, len; - bool found; - int ret; - - if (NULL == base || NULL == range) { - return ORTE_ERROR; - } - - len = strlen(range); - base_len = strlen(base); - /* Silence compiler warnings; start and end are always assigned - properly, below */ - start = end = 0; - - /* Look for the beginning of the first number */ - - for (found = false, i = 0; i < len; ++i) { - if (isdigit((int) range[i])) { - if (!found) { - start = atoi(range + i); - found = true; - break; - } - } - } - if (!found) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - - /* Look for the end of the first number */ - - for (found = false; i < len; ++i) { - if (!isdigit(range[i])) { - break; - } - } - - /* Was there no range, just a single number? */ - - if (i >= len) { - end = start; - found = true; - } else { - /* Nope, there was a range. 
Look for the beginning of the second - * number - */ - for (; i < len; ++i) { - if (isdigit(range[i])) { - end = strtol(range + i, NULL, 10); - found = true; - break; - } - } - } - if (!found) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - - /* Make strings for all values in the range */ - - len = base_len + num_digits + 32; - if (NULL != suffix) { - len += strlen(suffix); - } - str = (char *) malloc(len); - if (NULL == str) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - for (i = start; i <= end; ++i) { - memset(str, 0, len); - strcpy(str, base); - /* we need to zero-pad the digits */ - for (k=0; k < (size_t)num_digits; k++) { - str[k+base_len] = '0'; - } - memset(tmp, 0, 132); - snprintf(tmp, 132, "%lu", (unsigned long)i); - for (k=0; k < strlen(tmp); k++) { - str[base_len + num_digits - k - 1] = tmp[strlen(tmp)-k-1]; - } - /* if there is a suffix, add it */ - if (NULL != suffix) { - strcat(str, suffix); - } - ret = opal_argv_append_nosize(names, str); - if(ORTE_SUCCESS != ret) { - ORTE_ERROR_LOG(ret); - free(str); - return ret; - } - } - free(str); - - /* All done */ - return ORTE_SUCCESS; -} From 4130c93976f28b82e18cdf52a29d800010344dea Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Fri, 12 Jan 2018 10:08:53 +0900 Subject: [PATCH 4/5] regx/reverse: add the reverse component Search for the digits to be compressed from the end of the node names. For example, if the nodelist is c712f6n01,c712f6n02,c712f6n03 the regx/fwd component generates c[3:712]f6n01,c[3:712]f6n02,c[3:712]f6n03@(3) whereas the regx/reverse component generates c712f6n[2:1-3]@0(3), which is a better fit here. Josh Hursey authored the changes and must be credited.
Signed-off-by: Gilles Gouaillardet --- orte/mca/regx/reverse/regx_reverse.c | 319 ++++++++++++++++++++++++--- 1 file changed, 288 insertions(+), 31 deletions(-) diff --git a/orte/mca/regx/reverse/regx_reverse.c b/orte/mca/regx/reverse/regx_reverse.c index 1a8b5f2199..b94a9be353 100644 --- a/orte/mca/regx/reverse/regx_reverse.c +++ b/orte/mca/regx/reverse/regx_reverse.c @@ -1,5 +1,8 @@ /* * Copyright (c) 2016-2018 Intel, Inc. All rights reserved. + * Copyright (c) 2018 IBM Corporation. All rights reserved. + * Copyright (c) 2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -23,52 +26,306 @@ #include "orte/runtime/orte_globals.h" #include "orte/util/name_fns.h" +#include "orte/util/show_help.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/rmaps/base/base.h" +#include "orte/mca/routed/routed.h" #include "orte/mca/regx/base/base.h" #include "regx_reverse.h" static int nidmap_create(opal_pointer_array_t *pool, char **regex); -static int nidmap_parse(char *regex); -static int encode_nodemap(opal_buffer_t *buffer); -static int decode_daemon_nodemap(opal_buffer_t *buffer); -static int generate_ppn(orte_job_t *jdata, char **ppn); -static int parse_ppn(orte_job_t *jdata, char *ppn); orte_regx_base_module_t orte_regx_reverse_module = { .nidmap_create = nidmap_create, - .nidmap_parse = nidmap_parse, - .encode_nodemap = encode_nodemap, - .decode_daemon_nodemap = decode_daemon_nodemap, - .generate_ppn = generate_ppn, - .parse_ppn = parse_ppn + .nidmap_parse = orte_regx_base_nidmap_parse, + .extract_node_names = orte_regx_base_extract_node_names, + .encode_nodemap = orte_regx_base_encode_nodemap, + .decode_daemon_nodemap = orte_regx_base_decode_daemon_nodemap, + .generate_ppn = orte_regx_base_generate_ppn, + .parse_ppn = orte_regx_base_parse_ppn }; static int nidmap_create(opal_pointer_array_t *pool, char **regex) { - return ORTE_ERR_NOT_IMPLEMENTED; -} + char *node; 
+ char prefix[ORTE_MAX_NODE_PREFIX]; + int i, j, n, len, startnum, nodenum, numdigits; + bool found; + char *suffix, *sfx, *nodenames; + orte_regex_node_t *ndreg; + orte_regex_range_t *range, *rng; + opal_list_t nodenms, dvpids; + opal_list_item_t *item, *itm2; + char **regexargs = NULL, *tmp, *tmp2; + orte_node_t *nptr; + orte_vpid_t vpid; -static int nidmap_parse(char *regex) -{ - return ORTE_ERR_NOT_IMPLEMENTED; -} + OBJ_CONSTRUCT(&nodenms, opal_list_t); + OBJ_CONSTRUCT(&dvpids, opal_list_t); -static int encode_nodemap(opal_buffer_t *buffer) -{ - return ORTE_ERR_NOT_IMPLEMENTED; -} + rng = NULL; + for (n=0; n < pool->size; n++) { + if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(pool, n))) { + continue; + } + /* if no daemon has been assigned, then this node is not being used */ + if (NULL == nptr->daemon) { + vpid = -1; // indicates no daemon assigned + } else { + vpid = nptr->daemon->name.vpid; + } + /* deal with the daemon vpid - see if it is next in the + * current range */ + if (NULL == rng) { + /* just starting */ + rng = OBJ_NEW(orte_regex_range_t); + rng->vpid = vpid; + rng->cnt = 1; + opal_list_append(&dvpids, &rng->super); + } else if (UINT32_MAX == vpid) { + if (-1 == rng->vpid) { + rng->cnt++; + } else { + /* need to start another range */ + rng = OBJ_NEW(orte_regex_range_t); + rng->vpid = vpid; + rng->cnt = 1; + opal_list_append(&dvpids, &rng->super); + } + } else if (-1 == rng->vpid) { + /* need to start another range */ + rng = OBJ_NEW(orte_regex_range_t); + rng->vpid = vpid; + rng->cnt = 1; + opal_list_append(&dvpids, &rng->super); + } else { + /* is this the next in line */ + if (vpid == (orte_vpid_t)(rng->vpid + rng->cnt)) { + rng->cnt++; + } else { + /* need to start another range */ + rng = OBJ_NEW(orte_regex_range_t); + rng->vpid = vpid; + rng->cnt = 1; + opal_list_append(&dvpids, &rng->super); + } + } + node = nptr->name; + opal_output_verbose(5, orte_regx_base_framework.framework_output, + "%s PROCESS NODE <%s>", + 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + node); + /* determine this node's prefix by looking for first digit char */ + len = strlen(node); + startnum = -1; + memset(prefix, 0, ORTE_MAX_NODE_PREFIX); + numdigits = 0; -static int decode_daemon_nodemap(opal_buffer_t *buffer) -{ - return ORTE_ERR_NOT_IMPLEMENTED; -} + /* Valid hostname characters are: + * - ascii letters, digits, and the '-' character. + * Determine the prefix in reverse to better support hostnames like: + * c712f6n01, c699c086 where there are sets of digits, and the lowest + * set changes most frequently. + */ + startnum = -1; + memset(prefix, 0, ORTE_MAX_NODE_PREFIX); + numdigits = 0; + for (i=len-1; i >= 0; i--) { + // Count all of the digits + if( isdigit(node[i]) ) { + numdigits++; + continue; + } + else { + // At this point everything at and above position 'i' is prefix. + for( j = 0; j <= i; ++j) { + prefix[j] = node[j]; + } + startnum = j; + break; + } + } -static int generate_ppn(orte_job_t *jdata, char **ppn) -{ - return ORTE_ERR_NOT_IMPLEMENTED; -} + opal_output_verbose(5, orte_regx_base_framework.framework_output, + "%s PROCESS NODE <%s> : reverse / prefix \"%s\" / numdigits %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + node, prefix, numdigits); -static int parse_ppn(orte_job_t *jdata, char *ppn) -{ - return ORTE_ERR_NOT_IMPLEMENTED; + if (startnum < 0) { + /* can't compress this name - just add it to the list */ + ndreg = OBJ_NEW(orte_regex_node_t); + ndreg->prefix = strdup(node); + opal_list_append(&nodenms, &ndreg->super); + continue; + } + /* convert the digits and get any suffix */ + nodenum = strtol(&node[startnum], &sfx, 10); + if (NULL != sfx) { + suffix = strdup(sfx); + } else { + suffix = NULL; + } + /* is this node name already on our list? 
*/ + found = false; + for (item = opal_list_get_first(&nodenms); + !found && item != opal_list_get_end(&nodenms); + item = opal_list_get_next(item)) { + ndreg = (orte_regex_node_t*)item; + if (0 < strlen(prefix) && NULL == ndreg->prefix) { + continue; + } + if (0 == strlen(prefix) && NULL != ndreg->prefix) { + continue; + } + if (0 < strlen(prefix) && NULL != ndreg->prefix + && 0 != strcmp(prefix, ndreg->prefix)) { + continue; + } + if (NULL == suffix && NULL != ndreg->suffix) { + continue; + } + if (NULL != suffix && NULL == ndreg->suffix) { + continue; + } + if (NULL != suffix && NULL != ndreg->suffix && + 0 != strcmp(suffix, ndreg->suffix)) { + continue; + } + if (numdigits != ndreg->num_digits) { + continue; + } + /* found a match - flag it */ + found = true; + /* get the last range on this nodeid - we do this + * to preserve order + */ + range = (orte_regex_range_t*)opal_list_get_last(&ndreg->ranges); + if (NULL == range) { + /* first range for this nodeid */ + range = OBJ_NEW(orte_regex_range_t); + range->vpid = nodenum; + range->cnt = 1; + opal_list_append(&ndreg->ranges, &range->super); + break; + } + /* see if the node number is out of sequence */ + if (nodenum != (range->vpid + range->cnt)) { + /* start a new range */ + range = OBJ_NEW(orte_regex_range_t); + range->vpid = nodenum; + range->cnt = 1; + opal_list_append(&ndreg->ranges, &range->super); + break; + } + /* everything matches - just increment the cnt */ + range->cnt++; + break; + } + if (!found) { + /* need to add it */ + ndreg = OBJ_NEW(orte_regex_node_t); + if (0 < strlen(prefix)) { + ndreg->prefix = strdup(prefix); + } + if (NULL != suffix) { + ndreg->suffix = strdup(suffix); + } + ndreg->num_digits = numdigits; + opal_list_append(&nodenms, &ndreg->super); + /* record the first range for this nodeid - we took + * care of names we can't compress above + */ + range = OBJ_NEW(orte_regex_range_t); + range->vpid = nodenum; + range->cnt = 1; + opal_list_append(&ndreg->ranges, &range->super); + } + 
if (NULL != suffix) { + free(suffix); + } + } + /* begin constructing the regular expression */ + while (NULL != (item = opal_list_remove_first(&nodenms))) { + ndreg = (orte_regex_node_t*)item; + + /* if no ranges, then just add the name */ + if (0 == opal_list_get_size(&ndreg->ranges)) { + if (NULL != ndreg->prefix) { + /* solitary node */ + asprintf(&tmp, "%s", ndreg->prefix); + opal_argv_append_nosize(&regexargs, tmp); + free(tmp); + } + OBJ_RELEASE(ndreg); + continue; + } + /* start the regex for this nodeid with the prefix */ + if (NULL != ndreg->prefix) { + asprintf(&tmp, "%s[%d:", ndreg->prefix, ndreg->num_digits); + } else { + asprintf(&tmp, "[%d:", ndreg->num_digits); + } + /* add the ranges */ + while (NULL != (itm2 = opal_list_remove_first(&ndreg->ranges))) { + range = (orte_regex_range_t*)itm2; + if (1 == range->cnt) { + asprintf(&tmp2, "%s%u,", tmp, range->vpid); + } else { + asprintf(&tmp2, "%s%u-%u,", tmp, range->vpid, range->vpid + range->cnt - 1); + } + free(tmp); + tmp = tmp2; + OBJ_RELEASE(range); + } + /* replace the final comma */ + tmp[strlen(tmp)-1] = ']'; + if (NULL != ndreg->suffix) { + /* add in the suffix, if provided */ + asprintf(&tmp2, "%s%s", tmp, ndreg->suffix); + free(tmp); + tmp = tmp2; + } + opal_argv_append_nosize(&regexargs, tmp); + free(tmp); + OBJ_RELEASE(ndreg); + } + + /* assemble final result */ + nodenames = opal_argv_join(regexargs, ','); + /* cleanup */ + opal_argv_free(regexargs); + OBJ_DESTRUCT(&nodenms); + + /* do the same for the vpids */ + tmp = NULL; + while (NULL != (item = opal_list_remove_first(&dvpids))) { + rng = (orte_regex_range_t*)item; + if (1 < rng->cnt) { + if (NULL == tmp) { + asprintf(&tmp, "%u(%u)", rng->vpid, rng->cnt); + } else { + asprintf(&tmp2, "%s,%u(%u)", tmp, rng->vpid, rng->cnt); + free(tmp); + tmp = tmp2; + } + } else { + if (NULL == tmp) { + asprintf(&tmp, "%u", rng->vpid); + } else { + asprintf(&tmp2, "%s,%u", tmp, rng->vpid); + free(tmp); + tmp = tmp2; + } + } + OBJ_RELEASE(rng); + } +
OPAL_LIST_DESTRUCT(&dvpids); + + /* now concatenate the results into one string */ + asprintf(&tmp2, "%s@%s", nodenames, tmp); + free(nodenames); + free(tmp); + *regex = tmp2; + return ORTE_SUCCESS; } From c988011afd8140d3501ad125dc39952067a9b807 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Fri, 12 Jan 2018 09:14:29 +0900 Subject: [PATCH 5/5] test/util: test the regx framework Signed-off-by: Gilles Gouaillardet --- test/util/Makefile.am | 11 +- test/util/orte_nidmap.c | 240 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 250 insertions(+), 1 deletion(-) create mode 100644 test/util/orte_nidmap.c diff --git a/test/util/Makefile.am b/test/util/Makefile.am index e498e3218d..75f18339ae 100644 --- a/test/util/Makefile.am +++ b/test/util/Makefile.am @@ -12,6 +12,8 @@ # Copyright (c) 2012 Los Alamos National Security, LLC. All rights # reserved. # Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2018 Research Organization for Information Science +# and Technology (RIST). All rights reserved. 
# $COPYRIGHT$ # # Additional copyrights may follow @@ -30,7 +32,8 @@ AM_CPPFLAGS = -I$(top_srcdir)/test/support # opal_os_create_dirpath \ # opal_argv \ # opal_basename \ -# opal_path_nfs +# opal_path_nfs \ +# orte_nidmap check_PROGRAMS = \ @@ -119,6 +122,12 @@ opal_path_nfs_DEPENDENCIES = $(opal_path_nfs_LDADD) # $(top_builddir)/test/support/libsupport.a #orte_universe_setup_file_io_DEPENDENCIES = $(orte_universe_setup_file_io_LDADD) +#orte_nidmap_SOURCES = orte_nidmap.c +#orte_nidmap_LDADD = \ +# $(top_builddir)/orte/libopen-rte.la \ +# $(top_builddir)/test/support/libsupport.a +#orte_nidmap_DEPENDENCIES = $(orte_nidmap_LDADD) + bipartite_graph_SOURCES = bipartite_graph.c bipartite_graph_LDADD = \ $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la \ diff --git a/test/util/orte_nidmap.c b/test/util/orte_nidmap.c new file mode 100644 index 0000000000..f91b9564b4 --- /dev/null +++ b/test/util/orte_nidmap.c @@ -0,0 +1,240 @@ +/* + * Copyright (c) 2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#ifdef HAVE_UNISTD_H +#include <unistd.h> +#endif /* HAVE_UNISTD_H */ +#ifdef HAVE_SYS_PARAM_H +#include <sys/param.h> +#endif /* HAVE_SYS_PARAM_H */ +#include <assert.h> + +#include "support.h" +#include "opal/runtime/opal.h" +#include "orte/include/orte/frameworks.h" +#include "orte/constants.h" +#include "orte/runtime/orte_globals.h" +#include "orte/mca/regx/regx.h" +#include "orte/mca/regx/base/base.h" + +static void check (const char *value, const char * expected) { + size_t sz = strlen(expected); + assert(strlen(value) >= sz); + assert(0 == strncmp(value, expected, sz-1)); + assert('@' == value[sz] || '\0' == value[sz]); +} + +int main(int argc, char* argv[]) +{ + char * regex; + char ** nodes; + opal_pointer_array_t pool; + orte_node_t * node; + orte_proc_info(); /* initialize proc info structure */ + + test_init("orte_nidmap"); + + opal_init(&argc, &argv); + orte_init(&argc, &argv, ORTE_PROC_TYPE_NONE); + + if (ORTE_SUCCESS != mca_base_framework_open(&orte_regx_base_framework, 0)) { + return -1; + } + if (ORTE_SUCCESS != orte_regx_base_select()) { + return -1; + } + + OBJ_CONSTRUCT(&pool, opal_pointer_array_t); + orte_node_pool = OBJ_NEW(opal_pointer_array_t); + orte_job_data = OBJ_NEW(opal_hash_table_t); + orte_job_t *jdata = OBJ_NEW(orte_job_t); + jdata->jobid = 1; + orte_process_info.my_name.jobid = 1; + opal_hash_table_init(orte_job_data, 1); + opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata); + + + node = OBJ_NEW(orte_node_t); + node->daemon = OBJ_NEW(orte_proc_t); + node->daemon->name.jobid = 1; + node->daemon->name.vpid = 0; + node->name = "n0"; + opal_pointer_array_add(&pool, node); + + orte_regx.nidmap_create(&pool, &regex); + printf ("regex for n0 is %s\n", regex); + + nodes = NULL; + orte_regx.extract_node_names(regex, &nodes); + + check(nodes[0], "n0"); + assert(NULL == nodes[1]); + + + node = OBJ_NEW(orte_node_t); +
node->daemon = OBJ_NEW(orte_proc_t); + node->daemon->name.vpid = 1; + node->name = "n1"; + opal_pointer_array_add(&pool, node); + + regex = NULL; + orte_regx.nidmap_create(&pool, &regex); + printf ("regex for n0,n1 is %s\n", regex); + + nodes = NULL; + orte_regx.extract_node_names(regex, &nodes); + + check(nodes[0], "n0"); + check(nodes[1], "n1"); + assert(NULL == nodes[2]); + + node = opal_pointer_array_get_item(&pool, 0); + node->name = "n-0"; + node = opal_pointer_array_get_item(&pool, 1); + node->name = "n-1"; + + regex = NULL; + orte_regx.nidmap_create(&pool, &regex); + printf ("regex for n-0,n-1 is %s\n", regex); + + nodes = NULL; + orte_regx.extract_node_names(regex, &nodes); + + check(nodes[0], "n-0"); + check(nodes[1], "n-1"); + assert(NULL == nodes[2]); + + + node = opal_pointer_array_get_item(&pool, 0); + node->name = "n-000"; + node = opal_pointer_array_get_item(&pool, 1); + node->name = "n-001"; + + regex = NULL; + orte_regx.nidmap_create(&pool, &regex); + printf ("regex for n-000,n-001 is %s\n", regex); + + nodes = NULL; + orte_regx.extract_node_names(regex, &nodes); + + check(nodes[0], "n-000"); + check(nodes[1], "n-001"); + assert(NULL == nodes[2]); + + + node = opal_pointer_array_get_item(&pool, 0); + node->name = "n9"; + node = opal_pointer_array_get_item(&pool, 1); + node->name = "n10"; + + regex = NULL; + orte_regx.nidmap_create(&pool, &regex); + printf ("regex for n9,n10 is %s\n", regex); + + nodes = NULL; + orte_regx.extract_node_names(regex, &nodes); + + check(nodes[0], "n9"); + check(nodes[1], "n10"); + assert(NULL == nodes[2]); + + + node = opal_pointer_array_get_item(&pool, 0); + node->name = "n99"; + node = opal_pointer_array_get_item(&pool, 1); + node->name = "n100"; + + regex = NULL; + orte_regx.nidmap_create(&pool, &regex); + printf ("regex for n99,n100 is %s\n", regex); + + nodes = NULL; + orte_regx.extract_node_names(regex, &nodes); + + check(nodes[0], "n99"); + check(nodes[1], "n100"); + assert(NULL == nodes[2]); + + + node =
opal_pointer_array_get_item(&pool, 0); + node->name = "c712f6n01"; + node = opal_pointer_array_get_item(&pool, 1); + node->name = "c712f6n02"; + node = OBJ_NEW(orte_node_t); + node->daemon = OBJ_NEW(orte_proc_t); + node->daemon->name.vpid = 2; + node->name = "c712f6n03"; + opal_pointer_array_add(&pool, node); + + regex = NULL; + orte_regx.nidmap_create(&pool, &regex); + printf ("regex for c712f6n01,c712f6n02,c712f6n03 is %s\n", regex); + + nodes = NULL; + orte_regx.extract_node_names(regex, &nodes); + + check(nodes[0], "c712f6n01"); + check(nodes[1], "c712f6n02"); + check(nodes[2], "c712f6n03"); + assert(NULL == nodes[3]); + + + node = opal_pointer_array_get_item(&pool, 0); + node->name = "n01c712"; + node = opal_pointer_array_get_item(&pool, 1); + node->name = "n02c712"; + node = opal_pointer_array_get_item(&pool, 2); + node->name = "n03c712"; + + regex = NULL; + orte_regx.nidmap_create(&pool, &regex); + printf ("regex for n01c712,n02c712,n03c712 is %s\n", regex); + + nodes = NULL; + orte_regx.extract_node_names(regex, &nodes); + + check(nodes[0], "n01c712"); + check(nodes[1], "n02c712"); + check(nodes[2], "n03c712"); + assert(NULL == nodes[3]); + + + node = opal_pointer_array_get_item(&pool, 0); + node->name = "c8n"; + node = opal_pointer_array_get_item(&pool, 1); + node->name = "c9n"; + node = opal_pointer_array_get_item(&pool, 2); + node->name = "c10n"; + + regex = NULL; + orte_regx.nidmap_create(&pool, &regex); + printf ("regex for c8n,c9n,c10n is %s\n", regex); + + nodes = NULL; + orte_regx.extract_node_names(regex, &nodes); + + check(nodes[0], "c8n"); + check(nodes[1], "c9n"); + check(nodes[2], "c10n"); + assert(NULL == nodes[3]); + + + + /* clean up */ + orte_proc_info_finalize(); + + test_finalize(); + return 0; +}