From 911961ee21f4cef8cfef1befab1e9b962dd3d5ae Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Wed, 10 May 2017 11:26:42 -0700 Subject: [PATCH 1/2] Sigh - remove debug Signed-off-by: Ralph Castain --- orte/mca/rmaps/round_robin/rmaps_rr.c | 7 ------- orte/mca/rmaps/round_robin/rmaps_rr_mappers.c | 9 --------- 2 files changed, 16 deletions(-) diff --git a/orte/mca/rmaps/round_robin/rmaps_rr.c b/orte/mca/rmaps/round_robin/rmaps_rr.c index 06b621383c..a764e0243f 100644 --- a/orte/mca/rmaps/round_robin/rmaps_rr.c +++ b/orte/mca/rmaps/round_robin/rmaps_rr.c @@ -108,7 +108,6 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:multi-apps-and-zero-np", true, jdata->num_apps, NULL); rc = ORTE_ERR_SILENT; - opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__); goto error; } @@ -119,7 +118,6 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app, jdata->map->mapping, initial_map, false))) { ORTE_ERROR_LOG(rc); - opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__); goto error; } /* flag that all subsequent requests should not reset the node->mapped flag */ @@ -238,12 +236,10 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) true, "mapping", orte_rmaps_base_print_mapping(jdata->map->mapping)); rc = ORTE_ERR_SILENT; - opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__); goto error; } if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); - opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__); goto error; } @@ -253,7 +249,6 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) { ORTE_ERROR_LOG(rc); - opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__); return rc; } @@ -275,7 +270,6 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) return ORTE_SUCCESS; error: - opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__); while(NULL != (item = opal_list_remove_first(&node_list))) { OBJ_RELEASE(item); } @@ -287,4 +281,3 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) orte_rmaps_base_module_t orte_rmaps_round_robin_module = { orte_rmaps_rr_map }; - diff --git a/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c b/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c index 8c2c9925e4..c0b08e2a03 100644 --- a/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c +++ b/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c @@ -493,7 +493,6 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata, orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, app->num_procs, app->app, orte_process_info.nodename); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_SILENT; } } @@ -511,7 +510,6 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata, if (NULL == node->topology || NULL == node->topology->topo) { orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing", true, node->name); - opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_SILENT; } start = 0; @@ -550,7 +548,6 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata, /* add this node to the map, if reqd */ if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) { if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) { - opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); ORTE_ERROR_LOG(idx); return idx; } @@ -569,18 +566,15 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata, /* get the hwloc object */ if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, target, cache_level, (i+start) % nobjs, OPAL_HWLOC_AVAILABLE))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_NOT_FOUND; } if (orte_rmaps_base.cpus_per_rank > (int)opal_hwloc_base_get_npus(node->topology->topo, obj)) { orte_show_help("help-orte-rmaps-base.txt", "mapping-too-low", true, orte_rmaps_base.cpus_per_rank, opal_hwloc_base_get_npus(node->topology->topo, obj), orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); - opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_SILENT; } if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) { - opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_OUT_OF_RESOURCE; } nprocs_mapped++; @@ -607,14 +601,12 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata, orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, app->num_procs, app->app, orte_process_info.nodename); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_SILENT; } else if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { /* if we were explicitly told not to oversubscribe, then don't */ orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, app->num_procs, app->app, orte_process_info.nodename); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_SILENT; } } @@ -629,7 +621,6 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata, if (nprocs_mapped < app->num_procs) { /* usually means there were no objects of the requested type */ - opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_NOT_FOUND; } From 55f4b825af506dad3caf7eeacf9b7ab1782e3fc3 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Wed, 10 May 2017 12:40:02 -0700 Subject: [PATCH 2/2] Add verbose output to nidmap code for debugging as this is a new, and sometimes fragile, feature Signed-off-by: Ralph Castain --- orte/runtime/orte_init.c | 4 +++- orte/util/nidmap.c | 43 ++++++++++++++++++++++++++++++++++++++++ orte/util/nidmap.h | 2 ++ 3 files changed, 48 insertions(+), 1 deletion(-) diff --git a/orte/runtime/orte_init.c b/orte/runtime/orte_init.c index 4a885f1088..827c268230 100644 --- a/orte/runtime/orte_init.c +++ b/orte/runtime/orte_init.c @@ -13,7 +13,7 @@ * reserved. * Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007-2008 Sun Microsystems, Inc. All rights reserved. - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * @@ -48,6 +48,7 @@ #include "orte/mca/schizo/base/base.h" #include "orte/util/listener.h" #include "orte/util/name_fns.h" +#include "orte/util/nidmap.h" #include "orte/util/proc_info.h" #include "orte/util/error_strings.h" #include "orte/orted/pmix/pmix_server.h" @@ -216,6 +217,7 @@ int orte_init(int* pargc, char*** pargv, orte_proc_type_t flags) if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) { /* let the pmix server register params */ pmix_server_register_params(); + orte_util_nidmap_init(); OPAL_TIMING_ENV_NEXT(tmng, "pmix_server_register_params"); } diff --git a/orte/util/nidmap.c b/orte/util/nidmap.c index ef7509e2a8..1243e1dd8f 100644 --- a/orte/util/nidmap.c +++ b/orte/util/nidmap.c @@ -74,6 +74,27 @@ #include "orte/util/nidmap.h" +static int orte_nidmap_verbosity = -1; +static int orte_nidmap_output = -1; + +void orte_util_nidmap_init(void) +{ + orte_nidmap_verbosity = -1; + (void) mca_base_var_register ("orte", "orte", NULL, "nidmap_verbose", + "Verbosity level for ORTE debug messages in the nidmap utilities", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL, + &orte_nidmap_verbosity); + + /* set default output */ + orte_nidmap_output = opal_output_open(NULL); + + /* open up the verbose output for debugging */ + if (0 < orte_nidmap_verbosity) { + opal_output_set_verbosity(orte_nidmap_output, orte_nidmap_verbosity); + } +} + int orte_util_build_daemon_nidmap(void) { int i; @@ -585,6 +606,9 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) OBJ_RELEASE(rng); } OPAL_LIST_DESTRUCT(&slots); + opal_output_verbose(1, orte_nidmap_output, + "%s SLOT ASSIGNMENTS: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp); /* pack the string */ if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) { ORTE_ERROR_LOG(rc); @@ -610,6 +634,9 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) OPAL_LIST_DESTRUCT(&flags); /* pack the string */ + opal_output_verbose(1, orte_nidmap_output, + "%s FLAG ASSIGNMENTS: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp); if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) { ORTE_ERROR_LOG(rc); return rc; @@ -652,6 +679,9 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) } if (NULL == rng->t) { /* need to account for NULL topology */ + opal_output_verbose(1, orte_nidmap_output, + "%s PACKING NULL TOPOLOGY", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); tmp2 = NULL; if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &tmp2, 1, OPAL_STRING))) { ORTE_ERROR_LOG(rc); @@ -662,6 +692,9 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) return rc; } } else { + opal_output_verbose(1, orte_nidmap_output, + "%s PACKING TOPOLOGY: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rng->t->sig); /* pack this topology string */ if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &rng->t->sig, 1, OPAL_STRING))) { ORTE_ERROR_LOG(rc); @@ -685,6 +718,9 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) } OPAL_LIST_DESTRUCT(&topos); /* pack the string */ + opal_output_verbose(1, orte_nidmap_output, + "%s TOPOLOGY ASSIGNMENTS: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp); if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&bucket); @@ -1011,6 +1047,9 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer) if (NULL == bptr) { /* our topology is first in the array */ t2 = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0); + opal_output_verbose(1, orte_nidmap_output, + "%s ASSIGNING ALL TOPOLOGIES TO: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), t2->sig); for (n=0; n < orte_node_pool->size; n++) { if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n))) { if (NULL == node->topology) { @@ -1077,6 +1116,10 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer) if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n+offset))) { continue; } + opal_output_verbose(1, orte_nidmap_output, + "%s ASSIGNING NODE %s WITH TOPO: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + node->name, t2->sig); if (NULL == node->topology) { OBJ_RETAIN(t2); node->topology = t2; diff --git a/orte/util/nidmap.h b/orte/util/nidmap.h index 521cc352c0..3acc29b927 100644 --- a/orte/util/nidmap.h +++ b/orte/util/nidmap.h @@ -44,6 +44,8 @@ BEGIN_C_DECLS #define ORTE_NON_CONTIG_NODE_CMD 0x02 +ORTE_DECLSPEC void orte_util_nidmap_init(void); + ORTE_DECLSPEC int orte_util_nidmap_create(char **regex); ORTE_DECLSPEC int orte_util_nidmap_parse(char *regex);