From 3ad3d4e3e79d005375bb7c56279d9813de928993 Mon Sep 17 00:00:00 2001 From: Joshua Hursey Date: Tue, 21 Mar 2017 14:47:15 -0500 Subject: [PATCH 1/2] opal_info: Add ability to report load failures * Add a path for failed component load information to be reported up. * This allows ompi_info to display this information inline to make it easier for folks to see if the component is present but failed for some reason. Most likely a missing library, but could be a libnl conflict. * Add MCA parameter to enable this feature: - `mca_base_component_track_load_errors` takes a boolean - Default: `false` Signed-off-by: Joshua Hursey --- opal/mca/base/base.h | 2 + opal/mca/base/mca_base_component_repository.c | 32 +++++++++++++++ opal/mca/base/mca_base_component_repository.h | 12 ++++++ opal/mca/base/mca_base_framework.c | 6 +++ opal/mca/base/mca_base_framework.h | 3 ++ opal/mca/base/mca_base_open.c | 9 +++++ opal/runtime/opal_info_support.c | 40 +++++++++++++++++++ opal/runtime/opal_info_support.h | 2 + 8 files changed, 106 insertions(+) diff --git a/opal/mca/base/base.h b/opal/mca/base/base.h index 1fdcbd899d..5c29c0039b 100644 --- a/opal/mca/base/base.h +++ b/opal/mca/base/base.h @@ -15,6 +15,7 @@ * reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2017 IBM Corporation. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -68,6 +69,7 @@ OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_base_component_priority_list_item_t); */ OPAL_DECLSPEC extern char *mca_base_component_path; OPAL_DECLSPEC extern bool mca_base_component_show_load_errors; +OPAL_DECLSPEC extern bool mca_base_component_track_load_errors; OPAL_DECLSPEC extern bool mca_base_component_disable_dlopen; OPAL_DECLSPEC extern char *mca_base_system_default_path; OPAL_DECLSPEC extern char *mca_base_user_default_path; diff --git a/opal/mca/base/mca_base_component_repository.c b/opal/mca/base/mca_base_component_repository.c index f1497f6836..b34f19eea0 100644 --- a/opal/mca/base/mca_base_component_repository.c +++ b/opal/mca/base/mca_base_component_repository.c @@ -15,6 +15,7 @@ * reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2017 IBM Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -55,6 +56,29 @@ OBJ_CLASS_INSTANCE(mca_base_component_repository_item_t, opal_list_item_t, #endif /* OPAL_HAVE_DL_SUPPORT */ +static void clf_constructor(opal_object_t *obj); +static void clf_destructor(opal_object_t *obj); + +OBJ_CLASS_INSTANCE(mca_base_failed_component_t, opal_list_item_t, + clf_constructor, clf_destructor); + + +static void clf_constructor(opal_object_t *obj) +{ /* Constructor for a failure record: it starts with no repository item and no error text attached. */ + mca_base_failed_component_t *cli = (mca_base_failed_component_t *) obj; + cli->comp = NULL; + cli->error_msg = NULL; +} + +static void clf_destructor(opal_object_t *obj) +{ /* Destructor for a failure record: the repository item pointer is only cleared (nothing is released through it), and the error text is freed if one was set. */ + mca_base_failed_component_t *cli = (mca_base_failed_component_t *) obj; + cli->comp = NULL; + if( NULL != cli->error_msg ) { + free(cli->error_msg); + cli->error_msg = NULL; + } +} /* * Private variables @@ -408,6 +432,14 @@ int mca_base_component_repository_open (mca_base_framework_t *framework, } opal_output_verbose(vl, 0, "mca_base_component_repository_open: unable to open %s: %s (ignored)", ri->ri_base, err_msg); + + if( 
mca_base_component_track_load_errors ) { + mca_base_failed_component_t *f_comp = OBJ_NEW(mca_base_failed_component_t); + f_comp->comp = ri; /* the pointer is stored as-is; no retain is taken on ri here */ + asprintf(&(f_comp->error_msg), "%s", err_msg); /* NOTE(review): the asprintf() return value is discarded, so a failed allocation goes unnoticed and error_msg is not guaranteed to be set -- consider checking it. */ + opal_list_append(&framework->framework_failed_components, &f_comp->super); + } + return OPAL_ERR_BAD_PARAM; } diff --git a/opal/mca/base/mca_base_component_repository.h b/opal/mca/base/mca_base_component_repository.h index 290c83c83c..08babe7051 100644 --- a/opal/mca/base/mca_base_component_repository.h +++ b/opal/mca/base/mca_base_component_repository.h @@ -13,6 +13,7 @@ * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2017 IBM Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -60,6 +61,17 @@ typedef struct mca_base_component_repository_item_t mca_base_component_repositor OBJ_CLASS_DECLARATION(mca_base_component_repository_item_t); +/* + * Structure to track information about why a component failed to load. + */ +struct mca_base_failed_component_t { + opal_list_item_t super; /* list linkage, so records can be appended to an opal_list_t */ + mca_base_component_repository_item_t *comp; /* repository item of the component that could not be opened */ + char *error_msg; /* heap-allocated copy of the open error text; freed by the class destructor */ +}; +typedef struct mca_base_failed_component_t mca_base_failed_component_t; +OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_base_failed_component_t); + /** * @brief initialize the component repository * diff --git a/opal/mca/base/mca_base_framework.c b/opal/mca/base/mca_base_framework.c index a1e49e4d5b..9bd968319e 100644 --- a/opal/mca/base/mca_base_framework.c +++ b/opal/mca/base/mca_base_framework.c @@ -3,6 +3,7 @@ * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2017 IBM Corporation. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -66,6 +67,7 @@ int mca_base_framework_register (struct mca_base_framework_t *framework, } OBJ_CONSTRUCT(&framework->framework_components, opal_list_t); + OBJ_CONSTRUCT(&framework->framework_failed_components, opal_list_t); if (framework->framework_flags & MCA_BASE_FRAMEWORK_FLAG_NO_DSO) { flags |= MCA_BASE_REGISTER_STATIC_ONLY; @@ -228,12 +230,16 @@ int mca_base_framework_close (struct mca_base_framework_t *framework) { framework->framework_output); OBJ_RELEASE(item); } + while (NULL != (item = opal_list_remove_first (&framework->framework_failed_components))) { + OBJ_RELEASE(item); + } ret = OPAL_SUCCESS; } framework->framework_flags &= ~(MCA_BASE_FRAMEWORK_FLAG_REGISTERED | MCA_BASE_FRAMEWORK_FLAG_OPEN); OBJ_DESTRUCT(&framework->framework_components); + OBJ_DESTRUCT(&framework->framework_failed_components); framework_close_output (framework); diff --git a/opal/mca/base/mca_base_framework.h b/opal/mca/base/mca_base_framework.h index c5009ac382..46dfc1de22 100644 --- a/opal/mca/base/mca_base_framework.h +++ b/opal/mca/base/mca_base_framework.h @@ -2,6 +2,7 @@ /* * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2017 IBM Corporation. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -154,6 +155,8 @@ typedef struct mca_base_framework_t { /** List of selected components (filled in by mca_base_framework_register() or mca_base_framework_open() */ opal_list_t framework_components; + /** List of components that failed to load */ + opal_list_t framework_failed_components; } mca_base_framework_t; diff --git a/opal/mca/base/mca_base_open.c b/opal/mca/base/mca_base_open.c index 0e7144ac1a..c615af5b6c 100644 --- a/opal/mca/base/mca_base_open.c +++ b/opal/mca/base/mca_base_open.c @@ -49,6 +49,7 @@ int mca_base_opened = 0; char *mca_base_system_default_path = NULL; char *mca_base_user_default_path = NULL; bool mca_base_component_show_load_errors = true; +bool mca_base_component_track_load_errors = false; bool mca_base_component_disable_dlopen = false; static char *mca_base_verbose = NULL; @@ -111,6 +112,14 @@ int mca_base_open(void) (void) mca_base_var_register_synonym(var_id, "opal", "mca", NULL, "component_show_load_errors", MCA_BASE_VAR_SYN_FLAG_DEPRECATED); + mca_base_component_track_load_errors = false; + var_id = mca_base_var_register("opal", "mca", "base", "component_track_load_errors", + "Whether to track errors for components that failed to load or not", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_base_component_track_load_errors); + mca_base_component_disable_dlopen = false; var_id = mca_base_var_register("opal", "mca", "base", "component_disable_dlopen", "Whether to attempt to disable opening dynamic components or not", diff --git a/opal/runtime/opal_info_support.c b/opal/runtime/opal_info_support.c index 832dda6d0f..7c02af6a8d 100644 --- a/opal/runtime/opal_info_support.c +++ b/opal/runtime/opal_info_support.c @@ -15,6 +15,7 @@ * reserved. * Copyright (c) 2011-2012 University of Houston. All rights reserved. * Copyright (c) 2016 Intel, Inc. All rights reserved. + * Copyright (c) 2017 IBM Corporation. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -50,6 +51,7 @@ #include "opal/mca/installdirs/installdirs.h" #include "opal/runtime/opal_info_support.h" +#include "opal/mca/base/mca_base_component_repository.h" const char *opal_info_path_prefix = "prefix"; const char *opal_info_path_bindir = "bindir"; @@ -109,6 +111,9 @@ OBJ_CLASS_INSTANCE(opal_info_component_map_t, component_map_construct, component_map_destruct); +static void opal_info_show_failed_component(const mca_base_component_repository_item_t* ri, + const char *error_msg); + int opal_info_init(int argc, char **argv, opal_cmd_line_t *opal_info_cmd_line) { @@ -245,6 +250,7 @@ static int info_register_framework (mca_base_framework_t *framework, opal_pointe map = OBJ_NEW(opal_info_component_map_t); map->type = strdup(framework->framework_name); map->components = &framework->framework_components; + map->failed_components = &framework->framework_failed_components; opal_pointer_array_add(component_map, map); } @@ -1012,6 +1018,7 @@ void opal_info_show_component_version(opal_pointer_array_t *mca_types, bool want_all_types = false; bool found; mca_base_component_list_item_t *cli; + mca_base_failed_component_t *cli_failed; int j; char *pos; opal_info_component_map_t *map; @@ -1057,6 +1064,15 @@ void opal_info_show_component_version(opal_pointer_array_t *mca_types, } } + /* Also walk this map's failed_components list and report every entry whose name matches (or all of them when all components were requested). 
*/ + OPAL_LIST_FOREACH(cli_failed, map->failed_components, mca_base_failed_component_t) { + mca_base_component_repository_item_t *ri = cli_failed->comp; + if (want_all_components || + 0 == strcmp(component_name, ri->ri_name) ) { + opal_info_show_failed_component(ri, cli_failed->error_msg); + } + } + if (!want_all_types) { break; } @@ -1065,6 +1081,30 @@ void opal_info_show_component_version(opal_pointer_array_t *mca_types, } +static void opal_info_show_failed_component(const mca_base_component_repository_item_t* ri, + const char *error_msg) +{ /* Print one failed component through opal_info_out(). In pretty mode the label is built from ri_type and the value combines ri_name, a failed-to-load marker and error_msg; otherwise a colon-separated key is built from ri_type and ri_name and the value is error_msg alone. Both temporary strings are freed before returning. */ + char *message, *content; /* NOTE(review): both start uninitialized and none of the asprintf() calls below have their return value checked; if one fails, the pointer is not guaranteed to be set before it is printed and freed. */ + + if (opal_info_pretty) { + asprintf(&message, "MCA %s", ri->ri_type); + asprintf(&content, "%s (failed to load) %s", ri->ri_name, error_msg); + + opal_info_out(message, NULL, content); + + free(message); + free(content); + } else { + asprintf(&message, "mca:%s:%s:failed", ri->ri_type, ri->ri_name); + asprintf(&content, "%s", error_msg); + + opal_info_out(NULL, message, content); + + free(message); + free(content); + } +} + /* * Given a component, display its relevant version(s) */ diff --git a/opal/runtime/opal_info_support.h b/opal/runtime/opal_info_support.h index 6128397143..db68e6c497 100644 --- a/opal/runtime/opal_info_support.h +++ b/opal/runtime/opal_info_support.h @@ -2,6 +2,7 @@ * Copyright (c) 2012-2013 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. +* Copyright (c) 2017 IBM Corporation. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -50,6 +51,7 @@ typedef struct { opal_list_item_t super; char *type; opal_list_t *components; + opal_list_t *failed_components; } opal_info_component_map_t; OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_info_component_map_t); From 742d452c621f2d59252d668ccca7f4106b3d6038 Mon Sep 17 00:00:00 2001 From: Joshua Hursey Date: Tue, 21 Mar 2017 14:48:22 -0500 Subject: [PATCH 2/2] opal_info: Add --show-failed CLI option * `ompi_info --show-failed` will include the failed components along with information about why they failed. Signed-off-by: Joshua Hursey --- opal/runtime/opal_info_support.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/opal/runtime/opal_info_support.c b/opal/runtime/opal_info_support.c index 7c02af6a8d..9f73697539 100644 --- a/opal/runtime/opal_info_support.c +++ b/opal/runtime/opal_info_support.c @@ -162,6 +162,8 @@ int opal_info_init(int argc, char **argv, "Show only variables with at most this level (1-9)"); opal_cmd_line_make_opt3(opal_info_cmd_line, 's', NULL, "selected-only", 0, "Show only variables from selected components"); + opal_cmd_line_make_opt3(opal_info_cmd_line, '\0', NULL, "show-failed", 0, + "Show the components that failed to load along with the reason why they failed."); /* set our threading level */ opal_set_using_threads(false); @@ -228,6 +230,10 @@ int opal_info_init(int argc, char **argv, opal_info_register_flags = MCA_BASE_REGISTER_DEFAULT; } + if( opal_cmd_line_is_taken(opal_info_cmd_line, "show-failed") ) { + mca_base_component_track_load_errors = true; + } + return OPAL_SUCCESS; }