opal_info: Add ability to report load failures
* Add a path for failed component load information to be reported up.
* This allows ompi_info to display this information inline, making it easier for folks to see when a component is present but failed to load for some reason. The most likely cause is a missing library, but it could also be a libnl conflict.
* Add an MCA parameter to enable this feature:
  - `mca_base_component_track_load_errors` takes a boolean
  - Default: `false`

Signed-off-by: Joshua Hursey <jhursey@us.ibm.com>
Этот коммит содержится в:
родитель
539f71d0cc
Коммит
3ad3d4e3e7
@ -15,6 +15,7 @@
|
||||
* reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -68,6 +69,7 @@ OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_base_component_priority_list_item_t);
|
||||
*/
|
||||
OPAL_DECLSPEC extern char *mca_base_component_path;
|
||||
OPAL_DECLSPEC extern bool mca_base_component_show_load_errors;
|
||||
OPAL_DECLSPEC extern bool mca_base_component_track_load_errors;
|
||||
OPAL_DECLSPEC extern bool mca_base_component_disable_dlopen;
|
||||
OPAL_DECLSPEC extern char *mca_base_system_default_path;
|
||||
OPAL_DECLSPEC extern char *mca_base_user_default_path;
|
||||
|
@ -15,6 +15,7 @@
|
||||
* reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -55,6 +56,29 @@ OBJ_CLASS_INSTANCE(mca_base_component_repository_item_t, opal_list_item_t,
|
||||
|
||||
#endif /* OPAL_HAVE_DL_SUPPORT */
|
||||
|
||||
/*
 * Forward declarations of the constructor/destructor for
 * mca_base_failed_component_t, followed by the class instance definition
 * that names opal_list_item_t as the parent class and passes those two
 * functions as the construct/destruct hooks.
 */
static void clf_constructor(opal_object_t *obj);
|
||||
static void clf_destructor(opal_object_t *obj);
|
||||
|
||||
OBJ_CLASS_INSTANCE(mca_base_failed_component_t, opal_list_item_t,
|
||||
clf_constructor, clf_destructor);
|
||||
|
||||
|
||||
/*
 * Constructor for mca_base_failed_component_t.
 *
 * Puts a freshly created object into its empty state: no associated
 * repository item and no error text.
 */
static void clf_constructor(opal_object_t *obj)
{
    mca_base_failed_component_t *failed = (mca_base_failed_component_t *) obj;

    failed->error_msg = NULL;
    failed->comp = NULL;
}
|
||||
|
||||
/*
 * Destructor for mca_base_failed_component_t.
 *
 * Clears the repository item pointer (the item itself is not released
 * here) and frees the error text held by the object.
 */
static void clf_destructor(opal_object_t *obj)
{
    mca_base_failed_component_t *failed = (mca_base_failed_component_t *) obj;

    failed->comp = NULL;

    /* free(NULL) is a no-op, so no explicit NULL test is needed */
    free(failed->error_msg);
    failed->error_msg = NULL;
}
|
||||
|
||||
/*
|
||||
* Private variables
|
||||
@ -408,6 +432,14 @@ int mca_base_component_repository_open (mca_base_framework_t *framework,
|
||||
}
|
||||
opal_output_verbose(vl, 0, "mca_base_component_repository_open: unable to open %s: %s (ignored)",
|
||||
ri->ri_base, err_msg);
|
||||
|
||||
if( mca_base_component_track_load_errors ) {
|
||||
mca_base_failed_component_t *f_comp = OBJ_NEW(mca_base_failed_component_t);
|
||||
f_comp->comp = ri;
|
||||
asprintf(&(f_comp->error_msg), "%s", err_msg);
|
||||
opal_list_append(&framework->framework_failed_components, &f_comp->super);
|
||||
}
|
||||
|
||||
return OPAL_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
|
@ -13,6 +13,7 @@
|
||||
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -60,6 +61,17 @@ typedef struct mca_base_component_repository_item_t mca_base_component_repositor
|
||||
|
||||
OBJ_CLASS_DECLARATION(mca_base_component_repository_item_t);
|
||||
|
||||
/*
|
||||
* Structure to track information about why a component failed to load.
|
||||
*/
|
||||
struct mca_base_failed_component_t {
|
||||
opal_list_item_t super; /* list linkage; used to append the entry to a framework's framework_failed_components list */
|
||||
mca_base_component_repository_item_t *comp; /* repository item that could not be opened; only cleared, never released, by the destructor */
|
||||
char *error_msg; /* heap-allocated copy of the load error text; freed by the destructor */
|
||||
};
|
||||
typedef struct mca_base_failed_component_t mca_base_failed_component_t;
|
||||
OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_base_failed_component_t);
|
||||
|
||||
/**
|
||||
* @brief initialize the component repository
|
||||
*
|
||||
|
@ -3,6 +3,7 @@
|
||||
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -66,6 +67,7 @@ int mca_base_framework_register (struct mca_base_framework_t *framework,
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&framework->framework_components, opal_list_t);
|
||||
OBJ_CONSTRUCT(&framework->framework_failed_components, opal_list_t);
|
||||
|
||||
if (framework->framework_flags & MCA_BASE_FRAMEWORK_FLAG_NO_DSO) {
|
||||
flags |= MCA_BASE_REGISTER_STATIC_ONLY;
|
||||
@ -228,12 +230,16 @@ int mca_base_framework_close (struct mca_base_framework_t *framework) {
|
||||
framework->framework_output);
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
while (NULL != (item = opal_list_remove_first (&framework->framework_failed_components))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
ret = OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
framework->framework_flags &= ~(MCA_BASE_FRAMEWORK_FLAG_REGISTERED | MCA_BASE_FRAMEWORK_FLAG_OPEN);
|
||||
|
||||
OBJ_DESTRUCT(&framework->framework_components);
|
||||
OBJ_DESTRUCT(&framework->framework_failed_components);
|
||||
|
||||
framework_close_output (framework);
|
||||
|
||||
|
@ -2,6 +2,7 @@
|
||||
/*
|
||||
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -154,6 +155,8 @@ typedef struct mca_base_framework_t {
|
||||
/** List of selected components (filled in by mca_base_framework_register()
|
||||
or mca_base_framework_open() */
|
||||
opal_list_t framework_components;
|
||||
/** List of components that failed to load */
|
||||
opal_list_t framework_failed_components;
|
||||
} mca_base_framework_t;
|
||||
|
||||
|
||||
|
@ -49,6 +49,7 @@ int mca_base_opened = 0;
|
||||
char *mca_base_system_default_path = NULL;
|
||||
char *mca_base_user_default_path = NULL;
|
||||
bool mca_base_component_show_load_errors = true;
|
||||
bool mca_base_component_track_load_errors = false;
|
||||
bool mca_base_component_disable_dlopen = false;
|
||||
|
||||
static char *mca_base_verbose = NULL;
|
||||
@ -111,6 +112,14 @@ int mca_base_open(void)
|
||||
(void) mca_base_var_register_synonym(var_id, "opal", "mca", NULL, "component_show_load_errors",
|
||||
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
|
||||
|
||||
mca_base_component_track_load_errors = false;
|
||||
var_id = mca_base_var_register("opal", "mca", "base", "component_track_load_errors",
|
||||
"Whether to track errors for components that failed to load or not",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_base_component_track_load_errors);
|
||||
|
||||
mca_base_component_disable_dlopen = false;
|
||||
var_id = mca_base_var_register("opal", "mca", "base", "component_disable_dlopen",
|
||||
"Whether to attempt to disable opening dynamic components or not",
|
||||
|
@ -15,6 +15,7 @@
|
||||
* reserved.
|
||||
* Copyright (c) 2011-2012 University of Houston. All rights reserved.
|
||||
* Copyright (c) 2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -50,6 +51,7 @@
|
||||
#include "opal/mca/installdirs/installdirs.h"
|
||||
|
||||
#include "opal/runtime/opal_info_support.h"
|
||||
#include "opal/mca/base/mca_base_component_repository.h"
|
||||
|
||||
const char *opal_info_path_prefix = "prefix";
|
||||
const char *opal_info_path_bindir = "bindir";
|
||||
@ -109,6 +111,9 @@ OBJ_CLASS_INSTANCE(opal_info_component_map_t,
|
||||
component_map_construct,
|
||||
component_map_destruct);
|
||||
|
||||
static void opal_info_show_failed_component(const mca_base_component_repository_item_t* ri,
|
||||
const char *error_msg);
|
||||
|
||||
int opal_info_init(int argc, char **argv,
|
||||
opal_cmd_line_t *opal_info_cmd_line)
|
||||
{
|
||||
@ -245,6 +250,7 @@ static int info_register_framework (mca_base_framework_t *framework, opal_pointe
|
||||
map = OBJ_NEW(opal_info_component_map_t);
|
||||
map->type = strdup(framework->framework_name);
|
||||
map->components = &framework->framework_components;
|
||||
map->failed_components = &framework->framework_failed_components;
|
||||
opal_pointer_array_add(component_map, map);
|
||||
}
|
||||
|
||||
@ -1012,6 +1018,7 @@ void opal_info_show_component_version(opal_pointer_array_t *mca_types,
|
||||
bool want_all_types = false;
|
||||
bool found;
|
||||
mca_base_component_list_item_t *cli;
|
||||
mca_base_failed_component_t *cli_failed;
|
||||
int j;
|
||||
char *pos;
|
||||
opal_info_component_map_t *map;
|
||||
@ -1057,6 +1064,15 @@ void opal_info_show_component_version(opal_pointer_array_t *mca_types,
|
||||
}
|
||||
}
|
||||
|
||||
/* found it! */
|
||||
OPAL_LIST_FOREACH(cli_failed, map->failed_components, mca_base_failed_component_t) {
|
||||
mca_base_component_repository_item_t *ri = cli_failed->comp;
|
||||
if (want_all_components ||
|
||||
0 == strcmp(component_name, ri->ri_name) ) {
|
||||
opal_info_show_failed_component(ri, cli_failed->error_msg);
|
||||
}
|
||||
}
|
||||
|
||||
if (!want_all_types) {
|
||||
break;
|
||||
}
|
||||
@ -1065,6 +1081,30 @@ void opal_info_show_component_version(opal_pointer_array_t *mca_types,
|
||||
}
|
||||
|
||||
|
||||
static void opal_info_show_failed_component(const mca_base_component_repository_item_t* ri,
|
||||
const char *error_msg)
|
||||
{
|
||||
char *message, *content;
|
||||
|
||||
if (opal_info_pretty) {
|
||||
asprintf(&message, "MCA %s", ri->ri_type);
|
||||
asprintf(&content, "%s (failed to load) %s", ri->ri_name, error_msg);
|
||||
|
||||
opal_info_out(message, NULL, content);
|
||||
|
||||
free(message);
|
||||
free(content);
|
||||
} else {
|
||||
asprintf(&message, "mca:%s:%s:failed", ri->ri_type, ri->ri_name);
|
||||
asprintf(&content, "%s", error_msg);
|
||||
|
||||
opal_info_out(NULL, message, content);
|
||||
|
||||
free(message);
|
||||
free(content);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Given a component, display its relevant version(s)
|
||||
*/
|
||||
|
@ -2,6 +2,7 @@
|
||||
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -50,6 +51,7 @@ typedef struct {
|
||||
opal_list_item_t super;
|
||||
char *type;
|
||||
opal_list_t *components;
|
||||
opal_list_t *failed_components;
|
||||
} opal_info_component_map_t;
|
||||
OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_info_component_map_t);
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user