1
1

opal_info: Add ability to report load failures

* Add a path for reporting information about components that failed to load.
 * This allows ompi_info to display this information inline, making it
   easier to see when a component is present but failed to load. The
   most likely cause is a missing library, but it could also be a libnl
   conflict.
 * Add MCA parameter to enable this feature:
   - `mca_base_component_track_load_errors` takes a boolean
   - Default: `false`

Signed-off-by: Joshua Hursey <jhursey@us.ibm.com>
Этот коммит содержится в:
Joshua Hursey 2017-03-21 14:47:15 -05:00
родитель 539f71d0cc
Коммит 3ad3d4e3e7
8 изменённых файлов: 106 добавлений и 0 удалений

Просмотреть файл

@ -15,6 +15,7 @@
* reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2017 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -68,6 +69,7 @@ OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_base_component_priority_list_item_t);
*/
OPAL_DECLSPEC extern char *mca_base_component_path;
OPAL_DECLSPEC extern bool mca_base_component_show_load_errors;
OPAL_DECLSPEC extern bool mca_base_component_track_load_errors;
OPAL_DECLSPEC extern bool mca_base_component_disable_dlopen;
OPAL_DECLSPEC extern char *mca_base_system_default_path;
OPAL_DECLSPEC extern char *mca_base_user_default_path;

Просмотреть файл

@ -15,6 +15,7 @@
* reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2017 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -55,6 +56,29 @@ OBJ_CLASS_INSTANCE(mca_base_component_repository_item_t, opal_list_item_t,
#endif /* OPAL_HAVE_DL_SUPPORT */
static void clf_constructor(opal_object_t *obj);
static void clf_destructor(opal_object_t *obj);
OBJ_CLASS_INSTANCE(mca_base_failed_component_t, opal_list_item_t,
clf_constructor, clf_destructor);
/*
 * Constructor for mca_base_failed_component_t: a freshly created entry
 * refers to no repository item and carries no error message.
 */
static void clf_constructor(opal_object_t *obj)
{
    mca_base_failed_component_t *failed = (mca_base_failed_component_t *) obj;

    failed->error_msg = NULL;
    failed->comp = NULL;
}
/*
 * Destructor for mca_base_failed_component_t: frees the error message
 * string and resets both fields.  The comp pointer is only set to NULL;
 * nothing is released through it here.
 */
static void clf_destructor(opal_object_t *obj)
{
    mca_base_failed_component_t *failed = (mca_base_failed_component_t *) obj;

    /* free(NULL) is a no-op, so the message can be released unconditionally */
    free(failed->error_msg);
    failed->error_msg = NULL;
    failed->comp = NULL;
}
/*
* Private variables
@ -408,6 +432,14 @@ int mca_base_component_repository_open (mca_base_framework_t *framework,
}
opal_output_verbose(vl, 0, "mca_base_component_repository_open: unable to open %s: %s (ignored)",
ri->ri_base, err_msg);
/* Remember why this component failed to load so that it can be reported
 * later; only done when the user asked for load errors to be tracked. */
if( mca_base_component_track_load_errors ) {
    mca_base_failed_component_t *f_comp = OBJ_NEW(mca_base_failed_component_t);
    if( NULL != f_comp ) {
        f_comp->comp = ri;
        if( 0 > asprintf(&(f_comp->error_msg), "%s", err_msg) ) {
            /* asprintf() leaves the output pointer undefined on failure.
             * Reset it before releasing the entry so the class destructor
             * does not free() a garbage pointer, and do not record an
             * entry that has no message. */
            f_comp->error_msg = NULL;
            OBJ_RELEASE(f_comp);
        } else {
            opal_list_append(&framework->framework_failed_components, &f_comp->super);
        }
    }
}
return OPAL_ERR_BAD_PARAM;
}

Просмотреть файл

@ -13,6 +13,7 @@
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2017 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -60,6 +61,17 @@ typedef struct mca_base_component_repository_item_t mca_base_component_repositor
OBJ_CLASS_DECLARATION(mca_base_component_repository_item_t);
/*
* Structure to track information about why a component failed to load.
*/
struct mca_base_failed_component_t {
/** List item base; entries are linked into a list through this member */
opal_list_item_t super;
/** Repository item of the component that failed to load */
mca_base_component_repository_item_t *comp;
/** Heap-allocated text describing the load failure, or NULL;
 *  released with free() by the class destructor */
char *error_msg;
};
typedef struct mca_base_failed_component_t mca_base_failed_component_t;
OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_base_failed_component_t);
/**
* @brief initialize the component repository
*

Просмотреть файл

@ -3,6 +3,7 @@
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2017 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -66,6 +67,7 @@ int mca_base_framework_register (struct mca_base_framework_t *framework,
}
OBJ_CONSTRUCT(&framework->framework_components, opal_list_t);
OBJ_CONSTRUCT(&framework->framework_failed_components, opal_list_t);
if (framework->framework_flags & MCA_BASE_FRAMEWORK_FLAG_NO_DSO) {
flags |= MCA_BASE_REGISTER_STATIC_ONLY;
@ -228,12 +230,16 @@ int mca_base_framework_close (struct mca_base_framework_t *framework) {
framework->framework_output);
OBJ_RELEASE(item);
}
while (NULL != (item = opal_list_remove_first (&framework->framework_failed_components))) {
OBJ_RELEASE(item);
}
ret = OPAL_SUCCESS;
}
framework->framework_flags &= ~(MCA_BASE_FRAMEWORK_FLAG_REGISTERED | MCA_BASE_FRAMEWORK_FLAG_OPEN);
OBJ_DESTRUCT(&framework->framework_components);
OBJ_DESTRUCT(&framework->framework_failed_components);
framework_close_output (framework);

Просмотреть файл

@ -2,6 +2,7 @@
/*
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2017 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -154,6 +155,8 @@ typedef struct mca_base_framework_t {
/** List of selected components (filled in by mca_base_framework_register()
or mca_base_framework_open() */
opal_list_t framework_components;
/** List of components that failed to load */
opal_list_t framework_failed_components;
} mca_base_framework_t;

Просмотреть файл

@ -49,6 +49,7 @@ int mca_base_opened = 0;
char *mca_base_system_default_path = NULL;
char *mca_base_user_default_path = NULL;
bool mca_base_component_show_load_errors = true;
bool mca_base_component_track_load_errors = false;
bool mca_base_component_disable_dlopen = false;
static char *mca_base_verbose = NULL;
@ -111,6 +112,14 @@ int mca_base_open(void)
(void) mca_base_var_register_synonym(var_id, "opal", "mca", NULL, "component_show_load_errors",
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
mca_base_component_track_load_errors = false;
var_id = mca_base_var_register("opal", "mca", "base", "component_track_load_errors",
"Whether to track errors for components that failed to load or not",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_base_component_track_load_errors);
mca_base_component_disable_dlopen = false;
var_id = mca_base_var_register("opal", "mca", "base", "component_disable_dlopen",
"Whether to attempt to disable opening dynamic components or not",

Просмотреть файл

@ -15,6 +15,7 @@
* reserved.
* Copyright (c) 2011-2012 University of Houston. All rights reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* Copyright (c) 2017 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -50,6 +51,7 @@
#include "opal/mca/installdirs/installdirs.h"
#include "opal/runtime/opal_info_support.h"
#include "opal/mca/base/mca_base_component_repository.h"
const char *opal_info_path_prefix = "prefix";
const char *opal_info_path_bindir = "bindir";
@ -109,6 +111,9 @@ OBJ_CLASS_INSTANCE(opal_info_component_map_t,
component_map_construct,
component_map_destruct);
static void opal_info_show_failed_component(const mca_base_component_repository_item_t* ri,
const char *error_msg);
int opal_info_init(int argc, char **argv,
opal_cmd_line_t *opal_info_cmd_line)
{
@ -245,6 +250,7 @@ static int info_register_framework (mca_base_framework_t *framework, opal_pointe
map = OBJ_NEW(opal_info_component_map_t);
map->type = strdup(framework->framework_name);
map->components = &framework->framework_components;
map->failed_components = &framework->framework_failed_components;
opal_pointer_array_add(component_map, map);
}
@ -1012,6 +1018,7 @@ void opal_info_show_component_version(opal_pointer_array_t *mca_types,
bool want_all_types = false;
bool found;
mca_base_component_list_item_t *cli;
mca_base_failed_component_t *cli_failed;
int j;
char *pos;
opal_info_component_map_t *map;
@ -1057,6 +1064,15 @@ void opal_info_show_component_version(opal_pointer_array_t *mca_types,
}
}
/* found it! */
OPAL_LIST_FOREACH(cli_failed, map->failed_components, mca_base_failed_component_t) {
mca_base_component_repository_item_t *ri = cli_failed->comp;
if (want_all_components ||
0 == strcmp(component_name, ri->ri_name) ) {
opal_info_show_failed_component(ri, cli_failed->error_msg);
}
}
if (!want_all_types) {
break;
}
@ -1065,6 +1081,30 @@ void opal_info_show_component_version(opal_pointer_array_t *mca_types,
}
/*
 * Print one entry describing a component that failed to load.
 *
 * ri         repository item of the failed component; its ri_type and
 *            ri_name fields label the entry
 * error_msg  text describing the load failure; may be NULL, in which
 *            case a placeholder is printed instead
 *
 * When opal_info_pretty is set, the entry is handed to opal_info_out() as
 * the pretty message "MCA <type>" with the value
 * "<name> (failed to load) <error>".  Otherwise it is handed over as the
 * plain message "mca:<type>:<name>:failed" with the error text as value.
 * Nothing is printed if the strings cannot be allocated.
 */
static void opal_info_show_failed_component(const mca_base_component_repository_item_t* ri,
                                            const char *error_msg)
{
    char *message = NULL;
    char *content = NULL;
    int rc_message, rc_content;
    /* Passing NULL for a %s conversion is undefined behavior */
    const char *reason = (NULL != error_msg) ? error_msg : "(unknown error)";

    if (opal_info_pretty) {
        rc_message = asprintf(&message, "MCA %s", ri->ri_type);
        rc_content = asprintf(&content, "%s (failed to load) %s", ri->ri_name, reason);
    } else {
        rc_message = asprintf(&message, "mca:%s:%s:failed", ri->ri_type, ri->ri_name);
        rc_content = asprintf(&content, "%s", reason);
    }

    /* asprintf() leaves its output pointer undefined on failure; reset it
     * so the pointers are safe to test and to free() below. */
    if (0 > rc_message) {
        message = NULL;
    }
    if (0 > rc_content) {
        content = NULL;
    }

    if (NULL != message && NULL != content) {
        if (opal_info_pretty) {
            opal_info_out(message, NULL, content);
        } else {
            opal_info_out(NULL, message, content);
        }
    }

    free(message);
    free(content);
}
/*
* Given a component, display its relevant version(s)
*/

Просмотреть файл

@ -2,6 +2,7 @@
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2017 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -50,6 +51,7 @@ typedef struct {
opal_list_item_t super;
char *type;
opal_list_t *components;
opal_list_t *failed_components;
} opal_info_component_map_t;
OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_info_component_map_t);