From e773c17cf3ccc97c1d6a37deecf4211174a1897c Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Sun, 2 Oct 2016 16:02:23 -0700 Subject: [PATCH] Put show_help thru the PMIx "log" API. This pushes the show_help output from apps into the pmix thread, thus avoiding conflicts in the RML thread, which should help with thread lock situations. --- opal/mca/pmix/pmix.h | 10 ++++ opal/mca/pmix/pmix3x/pmix3x.c | 66 +++++++++++++++++++++ opal/mca/pmix/pmix_types.h | 4 ++ orte/orted/pmix/pmix_server_gen.c | 17 ++++-- orte/util/show_help.c | 95 ++++++++++++++++++++++--------- 5 files changed, 162 insertions(+), 30 deletions(-) diff --git a/opal/mca/pmix/pmix.h b/opal/mca/pmix/pmix.h index 153339e6e5..28da8fb916 100644 --- a/opal/mca/pmix/pmix.h +++ b/opal/mca/pmix/pmix.h @@ -786,6 +786,14 @@ typedef const char* (*opal_pmix_base_module_get_nspace_fn_t)(opal_jobid_t jobid) /* register a jobid-to-nspace pair */ typedef void (*opal_pmix_base_module_register_jobid_fn_t)(opal_jobid_t jobid, const char *nspace); +/* query information from the system */ +typedef void (*opal_pmix_base_module_query_fn_t)(opal_list_t *queries, + opal_pmix_info_cbfunc_t cbfunc, void *cbdata); + +/* log data to the system */ +typedef void (*opal_pmix_base_log_fn_t)(opal_list_t *info, + opal_pmix_op_cbfunc_t cbfunc, void *cbdata); + /* * the standard public API data structure */ @@ -815,6 +823,8 @@ typedef struct { opal_pmix_base_module_disconnect_nb_fn_t disconnect_nb; opal_pmix_base_module_resolve_peers_fn_t resolve_peers; opal_pmix_base_module_resolve_nodes_fn_t resolve_nodes; + opal_pmix_base_module_query_fn_t query; + opal_pmix_base_log_fn_t log; /* server APIs */ opal_pmix_base_module_server_init_fn_t server_init; opal_pmix_base_module_server_finalize_fn_t server_finalize; diff --git a/opal/mca/pmix/pmix3x/pmix3x.c b/opal/mca/pmix/pmix3x/pmix3x.c index 53e4814f6a..371fcf6253 100644 --- a/opal/mca/pmix/pmix3x/pmix3x.c +++ b/opal/mca/pmix/pmix3x/pmix3x.c @@ -40,6 +40,7 @@ #include 
"opal/mca/pmix/pmix_types.h" #include +#include /**** C.O.M.M.O.N I.N.T.E.R.F.A.C.E.S ****/ @@ -61,6 +62,10 @@ static int notify_event(int status, opal_pmix_data_range_t range, opal_list_t *info, opal_pmix_op_cbfunc_t cbfunc, void *cbdata); +static void pmix3x_query(opal_list_t *queries, + opal_pmix_info_cbfunc_t cbfunc, void *cbdata); +static void pmix3x_log(opal_list_t *info, + opal_pmix_op_cbfunc_t cbfunc, void *cbdata); const opal_pmix_base_module_t opal_pmix_pmix3x_module = { /* client APIs */ @@ -88,6 +93,8 @@ const opal_pmix_base_module_t opal_pmix_pmix3x_module = { .disconnect_nb = pmix3x_disconnectnb, .resolve_peers = pmix3x_resolve_peers, .resolve_nodes = pmix3x_resolve_nodes, + .query = pmix3x_query, + .log = pmix3x_log, /* server APIs */ .server_init = pmix3x_server_init, .server_finalize = pmix3x_server_finalize, @@ -1293,6 +1300,65 @@ static int notify_event(int status, return OPAL_SUCCESS; } +static void pmix3x_query(opal_list_t *queries, + opal_pmix_info_cbfunc_t cbfunc, void *cbdata) +{ + if (NULL != cbfunc) { + cbfunc(OPAL_ERR_NOT_SUPPORTED, NULL, cbdata, NULL, NULL); + } + return; +} + +static void opcbfunc(pmix_status_t status, void *cbdata) +{ + pmix3x_opcaddy_t *op = (pmix3x_opcaddy_t*)cbdata; + + if (NULL != op->opcbfunc) { + op->opcbfunc(pmix3x_convert_rc(status), op->cbdata); + } + OBJ_RELEASE(op); +} + +static void pmix3x_log(opal_list_t *info, + opal_pmix_op_cbfunc_t cbfunc, void *cbdata) +{ + int rc; + opal_value_t *ival; + size_t n, ninfo; + pmix3x_opcaddy_t *cd; + + /* bozo check */ + if (NULL == info || 0 == (ninfo = opal_list_get_size(info))) { + rc = OPAL_ERR_BAD_PARAM; + goto CLEANUP; + } + + /* setup the operation */ + cd = OBJ_NEW(pmix3x_opcaddy_t); + cd->opcbfunc = cbfunc; + cd->cbdata = cbdata; + cd->ninfo = ninfo; + + /* convert the list to an array of info objects */ + PMIX_INFO_CREATE(cd->info, cd->ninfo); + n=0; + OPAL_LIST_FOREACH(ival, info, opal_value_t) { + (void)strncpy(cd->info[n].key, ival->key, PMIX_MAX_KEYLEN); + 
pmix3x_value_load(&cd->info[n].value, ival); + ++n; + } + + /* pass it down */ + PMIx_Log_nb(cd->info, cd->ninfo, NULL, 0, + opcbfunc, cd); + return; + + CLEANUP: + if (NULL != cbfunc) { + cbfunc(rc, cbdata); + } +} + /**** INSTANTIATE INTERNAL CLASSES ****/ OBJ_CLASS_INSTANCE(opal_pmix3x_jobid_trkr_t, opal_list_item_t, diff --git a/opal/mca/pmix/pmix_types.h b/opal/mca/pmix/pmix_types.h index af34e4d25a..f3edda91c9 100644 --- a/opal/mca/pmix/pmix_types.h +++ b/opal/mca/pmix/pmix_types.h @@ -201,6 +201,10 @@ BEGIN_C_DECLS // procs in job on same node #define OPAL_PMIX_QUERY_AUTHORIZATIONS "pmix.qry.auths" // return operations tool is authorized to perform" +/* log attributes */ +#define OPAL_PMIX_LOG_STDERR "pmix.log.stderr" // (bool) log data to stderr +#define OPAL_PMIX_LOG_STDOUT "pmix.log.stdout" // (bool) log data to stdout +#define OPAL_PMIX_LOG_SYSLOG "pmix.log.syslog" // (bool) log data to syslog - defaults to ERROR priority /* define a scope for data "put" by PMI per the following: * diff --git a/orte/orted/pmix/pmix_server_gen.c b/orte/orted/pmix/pmix_server_gen.c index eec3438d1e..dec71546cf 100644 --- a/orte/orted/pmix/pmix_server_gen.c +++ b/orte/orted/pmix/pmix_server_gen.c @@ -481,18 +481,27 @@ void pmix_server_log_fn(opal_process_name_t *requestor, void *cbdata) { opal_value_t *val; + opal_buffer_t *buf; + int rc; /* for now, we only support logging show_help messages */ OPAL_LIST_FOREACH(val, info, opal_value_t) { /* we ignore the key as irrelevant - we only want to - * pull out the string value */ - if (OPAL_STRING != val->type) { + * pull out the blob */ + if (OPAL_BYTE_OBJECT != val->type) { continue; } - opal_output(0, "SHOWHELP: %s", val->data.string); + buf = OBJ_NEW(opal_buffer_t); + opal_dss.load(buf, val->data.bo.bytes, val->data.bo.size); + val->data.bo.bytes = NULL; + if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf, + ORTE_RML_TAG_SHOW_HELP, + orte_rml_send_callback, NULL))) { + ORTE_ERROR_LOG(rc); + 
OBJ_RELEASE(buf); + } } if (NULL != cbfunc) { cbfunc(OPAL_SUCCESS, cbdata); } } - diff --git a/orte/util/show_help.c b/orte/util/show_help.c index 9ee695c2e4..13886009ed 100644 --- a/orte/util/show_help.c +++ b/orte/util/show_help.c @@ -30,6 +30,7 @@ #include "opal/util/output.h" #include "opal/dss/dss.h" #include "opal/mca/event/event.h" +#include "opal/mca/pmix/pmix.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/rml/rml.h" @@ -602,11 +603,23 @@ int orte_show_help(const char *filename, const char *topic, return rc; } +static void cbfunc(int status, void *cbdata) +{ + volatile bool *active = (volatile bool*)cbdata; + *active = false; +} + int orte_show_help_norender(const char *filename, const char *topic, bool want_error_header, const char *output) { int rc = ORTE_SUCCESS; int8_t have_output = 1; + opal_buffer_t *buf; + bool am_inside = false; + opal_list_t info; + opal_value_t *kv; + volatile bool active; + struct timespec tp; if (!ready) { /* if we are finalizing, then we have no way to process @@ -628,39 +641,44 @@ int orte_show_help_norender(const char *filename, const char *topic, * mode, then all we can do is process this locally */ if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_TOOL || - orte_standalone_operation || - NULL == orte_rml.send_buffer_nb || - NULL == orte_routed.get_route || - NULL == orte_process_info.my_hnp_uri) { + orte_standalone_operation) { rc = show_help(filename, topic, output, ORTE_PROC_MY_NAME); + goto CLEANUP; + } else if (ORTE_PROC_IS_DAEMON) { + if (NULL == orte_rml.send_buffer_nb || + NULL == orte_routed.get_route || + NULL == orte_process_info.my_hnp_uri) { + rc = show_help(filename, topic, output, ORTE_PROC_MY_NAME); + goto CLEANUP; + } } /* otherwise, we relay the output message to * the HNP for processing */ - else { - opal_buffer_t *buf; - static bool am_inside = false; - /* JMS Note that we *may* have a recursion situation here where - the RML could call show_help. 
Need to think about this - properly, but put a safeguard in here for sure for the time - being. */ - if (am_inside) { - rc = show_help(filename, topic, output, ORTE_PROC_MY_NAME); - } else { - am_inside = true; + /* JMS Note that we *may* have a recursion situation here where + the RML could call show_help. Need to think about this + properly, but put a safeguard in here for sure for the time + being. */ + if (am_inside) { + rc = show_help(filename, topic, output, ORTE_PROC_MY_NAME); + } else { + am_inside = true; - /* build the message to the HNP */ - buf = OBJ_NEW(opal_buffer_t); - /* pack the filename of the show_help text file */ - opal_dss.pack(buf, &filename, 1, OPAL_STRING); - /* pack the topic tag */ - opal_dss.pack(buf, &topic, 1, OPAL_STRING); - /* pack the flag that we have a string */ - opal_dss.pack(buf, &have_output, 1, OPAL_INT8); - /* pack the resulting string */ - opal_dss.pack(buf, &output, 1, OPAL_STRING); + /* build the message to the HNP */ + buf = OBJ_NEW(opal_buffer_t); + /* pack the filename of the show_help text file */ + opal_dss.pack(buf, &filename, 1, OPAL_STRING); + /* pack the topic tag */ + opal_dss.pack(buf, &topic, 1, OPAL_STRING); + /* pack the flag that we have a string */ + opal_dss.pack(buf, &have_output, 1, OPAL_INT8); + /* pack the resulting string */ + opal_dss.pack(buf, &output, 1, OPAL_STRING); + + /* if we are a daemon, then send it via RML to the HNP */ + if (ORTE_PROC_IS_DAEMON) { /* send it to the HNP */ if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf, ORTE_RML_TAG_SHOW_HELP, @@ -672,8 +690,33 @@ int orte_show_help_norender(const char *filename, const char *topic, } else { rc = ORTE_SUCCESS; } - am_inside = false; + } else { + /* if we are not a daemon (i.e., we are an app) and if PMIx + * support for "log" is available, then use that channel */ + if (NULL != opal_pmix.log) { + OBJ_CONSTRUCT(&info, opal_list_t); + kv = OBJ_NEW(opal_value_t), + kv->key = strdup(OPAL_PMIX_LOG_STDERR); + kv->type = 
OPAL_BYTE_OBJECT; + opal_dss.unload(buf, (void**)&kv->data.bo.bytes, &kv->data.bo.size); + opal_list_append(&info, &kv->super); + active = true; + tp.tv_sec = 0; + tp.tv_nsec = 1000000; + opal_pmix.log(&info, cbfunc, (void*)&active); + while (active) { + nanosleep(&tp, NULL); + } + OBJ_RELEASE(buf); + kv->data.bo.bytes = NULL; + OPAL_LIST_DESTRUCT(&info); + rc = ORTE_SUCCESS; + goto CLEANUP; + } else { + rc = show_help(filename, topic, output, ORTE_PROC_MY_NAME); + } } + am_inside = false; } CLEANUP: