1
1
openmpi/orte/util/show_help.c

803 строки
26 KiB
C
Исходник Обычный вид История

/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
2015-06-23 20:59:57 -07:00
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008-2018 Cisco Systems, Inc. All rights reserved.
2015-06-23 20:59:57 -07:00
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
As per the RFC, bring in the ORTE async progress code and the rewrite of OOB: *** THIS RFC INCLUDES A MINOR CHANGE TO THE MPI-RTE INTERFACE *** Note: during the course of this work, it was necessary to completely separate the MPI and RTE progress engines. There were multiple places in the MPI layer where ORTE_WAIT_FOR_COMPLETION was being used. A new OMPI_WAIT_FOR_COMPLETION macro was created (defined in ompi/mca/rte/rte.h) that simply cycles across opal_progress until the provided flag becomes false. Places where the MPI layer blocked waiting for RTE to complete an event have been modified to use this macro. *************************************************************************************** I am reissuing this RFC because of the time that has passed since its original release. Since its initial release and review, I have debugged it further to ensure it fully supports tests like loop_spawn. It therefore seems ready for merge back to the trunk. Given its prior review, I have set the timeout for one week. The code is in https://bitbucket.org/rhc/ompi-oob2 WHAT: Rewrite of ORTE OOB WHY: Support asynchronous progress and a host of other features WHEN: Wed, August 21 SYNOPSIS: The current OOB has served us well, but a number of limitations have been identified over the years. Specifically: * it is only progressed when called via opal_progress, which can lead to hangs or recursive calls into libevent (which is not supported by that code) * we've had issues when multiple NICs are available as the code doesn't "shift" messages between transports - thus, all nodes had to be available via the same TCP interface. 
* the OOB "unloads" incoming opal_buffer_t objects during the transmission, thus preventing use of OBJ_RETAIN in the code when repeatedly sending the same message to multiple recipients * there is no failover mechanism across NICs - if the selected NIC (or its attached switch) fails, we are forced to abort * only one transport (i.e., component) can be "active" The revised OOB resolves these problems: * async progress is used for all application processes, with the progress thread blocking in the event library * each available TCP NIC is supported by its own TCP module. The ability to asynchronously progress each module independently is provided, but not enabled by default (a runtime MCA parameter turns it "on") * multi-address TCP NICs (e.g., a NIC with both an IPv4 and IPv6 address, or with virtual interfaces) are supported - reachability is determined by comparing the contact info for a peer against all addresses within the range covered by the address/mask pairs for the NIC. * a message that arrives on one TCP NIC is automatically shifted to whatever NIC that is connected to the next "hop" if that peer cannot be reached by the incoming NIC. If no TCP module will reach the peer, then the OOB attempts to send the message via all other available components - if none can reach the peer, then an "error" is reported back to the RML, which then calls the errmgr for instructions. * opal_buffer_t now conforms to standard object rules re OBJ_RETAIN as we no longer "unload" the incoming object * NIC failure is reported to the TCP component, which then tries to resend the message across any other available TCP NIC. If that doesn't work, then the message is given back to the OOB base to try using other components. 
If all that fails, then the error is reported to the RML, which reports to the errmgr for instructions * obviously from the above, multiple OOB components (e.g., TCP and UD) can be active in parallel * the matching code has been moved to the RML (and out of the OOB/TCP component) so it is independent of transport * routing is done by the individual OOB modules (as opposed to the RML). Thus, both routed and non-routed transports can simultaneously be active * all blocking send/recv APIs have been removed. Everything operates asynchronously. KNOWN LIMITATIONS: * although provision is made for component failover as described above, the code for doing so has not been fully implemented yet. At the moment, if all connections for a given peer fail, the errmgr is notified of a "lost connection", which by default results in termination of the job if it was a lifeline * the IPv6 code is present and compiles, but is not complete. Since the current IPv6 support in the OOB doesn't work anyway, I don't consider this a blocker * routing is performed at the individual module level, yet the active routed component is selected on a global basis. We probably should update that to reflect that different transports may need/choose to route in different ways * obviously, not every error path has been tested nor necessarily covered * determining abnormal termination is more challenging than in the old code as we now potentially have multiple ways of connecting to a process. Ideally, we would declare "connection failed" when *all* transports can no longer reach the process, but that requires some additional (possibly complex) code. For now, the code replicates the old behavior only somewhat modified - i.e., if a module sees its connection fail, it checks to see if it is a lifeline. If so, it notifies the errmgr that the lifeline is lost - otherwise, it notifies the errmgr that a non-lifeline connection was lost. 
* reachability is determined solely on the basis of a shared subnet address/mask - more sophisticated algorithms (e.g., the one used in the tcp btl) are required to handle routing via gateways * the RML needs to assign sequence numbers to each message on a per-peer basis. The receiving RML will then deliver messages in order, thus preventing out-of-order messaging in the case where messages travel across different transports or a message needs to be redirected/resent due to failure of a NIC This commit was SVN r29058.
2013-08-22 16:37:40 +00:00
* All rights reserved.
* Copyright (c) 2016-2019 Intel, Inc. All rights reserved.
* Copyright (c) 2017 IBM Corporation. All rights reserved.
* $COPYRIGHT$
2015-06-23 20:59:57 -07:00
*
* Additional copyrights may follow
2015-06-23 20:59:57 -07:00
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/types.h"
#include "orte/constants.h"
#include <stdio.h>
#include <string.h>
#include <time.h>
#include "opal/util/show_help.h"
#include "opal/util/output.h"
#include "opal/util/printf.h"
#include "opal/dss/dss.h"
Update libevent to the 2.0 series, currently at 2.0.7rc. We will update to their final release when it becomes available. Currently known errors exist in unused portions of the libevent code. This revision passes the IBM test suite on a Linux machine and on a standalone Mac. This is a fairly intrusive change, but outside of the moving of opal/event to opal/mca/event, the only changes involved (a) changing all calls to opal_event functions to reflect the new framework instead, and (b) ensuring that all opal_event_t objects are properly constructed since they are now true opal_objects. Note: Shiqing has just returned from vacation and has not yet had a chance to complete the Windows integration. Thus, this commit almost certainly breaks Windows support on the trunk. However, I want this to have a chance to soak for as long as possible before I become less available a week from today (going to be at a class for 5 days, and thus will only be sparingly available) so we can find and fix any problems. Biggest change is moving the libevent code from opal/event to a new opal/mca/event framework. This was done to make it much easier to update libevent in the future. New versions can be inserted as a new component and tested in parallel with the current version until validated, then we can remove the earlier version if we so choose. This is a statically built framework ala installdirs, so only one component will build at a time. There is no selection logic - the sole compiled component simply loads its function pointers into the opal_event struct. I have gone thru the code base and converted all the libevent calls I could find. However, I cannot compile nor test every environment. It is therefore quite likely that errors remain in the system. Please keep an eye open for two things: 1. compile-time errors: these will be obvious as calls to the old functions (e.g., opal_evtimer_new) must be replaced by the new framework APIs (e.g., opal_event.evtimer_new) 2. 
run-time errors: these will likely show up as segfaults due to missing constructors on opal_event_t objects. It appears that it became a typical practice for people to "init" an opal_event_t by simply using memset to zero it out. This will no longer work - you must either OBJ_NEW or OBJ_CONSTRUCT an opal_event_t. I tried to catch these cases, but may have missed some. Believe me, you'll know when you hit it. There is also the issue of the new libevent "no recursion" behavior. As I described on a recent email, we will have to discuss this and figure out what, if anything, we need to do. This commit was SVN r23925.
2010-10-24 18:35:54 +00:00
#include "opal/mca/event/event.h"
#include "opal/mca/pmix/pmix.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/routed/routed.h"
#include "orte/util/name_fns.h"
#include "orte/util/proc_info.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/show_help.h"
/* When true, show_help() tracks (filename, topic) tuples and summarizes
 * duplicates instead of printing every copy */
bool orte_help_want_aggregate = false;

/* Handle of the output stream opened in orte_show_help_init() and used
 * to emit help messages */
static int orte_help_output;

/*
 * Local variable to know whether aggregated show_help is available or
 * not
 */
static bool ready = false;
/*
 * Same for systems with or without full ORTE support
 */
bool orte_show_help_is_available(void)
{
    /* Exposed through a function rather than the flag itself so that a
       more elaborate check can be substituted later without touching
       any caller. */
    const bool available = ready;

    return available;
}
/* List items for holding (filename, topic) tuples.  Besides the tuple
   itself, each item records which processes reported it and how many
   reports are still waiting to be summarized. */
typedef struct {
/* Base class: lets the item be linked into an opal_list_t */
opal_list_item_t super;
/* The filename (heap copy, freed by the destructor) */
char *tli_filename;
/* The topic (heap copy, freed by the destructor) */
char *tli_topic;
/* List of process names that have displayed this (filename, topic);
   show_help() appends one orte_namelist_t per report */
opal_list_t tli_processes;
/* Time this message was displayed (initialized to the construction
   time by the constructor) */
time_t tli_time_displayed;
/* Count of processes since last display (i.e., "new" processes
that have showed this message that have not yet been output) */
int tli_count_since_last_display;
/* Do we want to display these?  Cleared by show_help() when a
   suppress request (no output string) arrives for this tuple. */
bool tli_display;
} tuple_list_item_t;
/* Constructor and destructor for tuple_list_item_t; both are handed to
   the class instance declared right below */
static void tuple_list_item_constructor(tuple_list_item_t *obj);
static void tuple_list_item_destructor(tuple_list_item_t *obj);
/* Declare tuple_list_item_t as a class derived from opal_list_item_t */
static OBJ_CLASS_INSTANCE(tuple_list_item_t, opal_list_item_t,
tuple_list_item_constructor,
tuple_list_item_destructor);
/* List of (filename, topic) tuples that have already been displayed */
static opal_list_t abd_tuples;
/* How long to wait between displaying duplicate show_help notices
   (seconds, microseconds) */
static struct timeval show_help_interval = { 5, 0 };
/* Timer for displaying duplicate help message notices */
/* Time at which messages / duplicate summaries were last displayed */
static time_t show_help_time_last_displayed = 0;
/* True while show_help_timer_event has been armed and has not yet run */
static bool show_help_timer_set = false;
/* Event whose callback is show_accumulated_duplicates() */
static opal_event_t show_help_timer_event;
/* Value of opal_show_help saved by orte_show_help_init() and restored
   by orte_show_help_finalize() */
static opal_show_help_fn_t save_help = NULL;
/* Constructor: the item starts with no filename/topic, an empty list of
   reporting processes, the current time as its display time, nothing
   pending and display enabled. */
static void tuple_list_item_constructor(tuple_list_item_t *obj)
{
    /* The (filename, topic) strings are filled in by the creator */
    obj->tli_topic = NULL;
    obj->tli_filename = NULL;

    /* Bookkeeping for duplicate aggregation */
    obj->tli_display = true;
    obj->tli_count_since_last_display = 0;
    obj->tli_time_displayed = time(NULL);

    /* Embedded list of process names */
    OBJ_CONSTRUCT(&(obj->tli_processes), opal_list_t);
}
/* Destructor: frees the filename/topic copies and releases every entry
   still held in the list of reporting processes. */
static void tuple_list_item_destructor(tuple_list_item_t *obj)
{
    opal_list_item_t *item, *next;

    /* free(NULL) is a no-op, so no guards are needed */
    free(obj->tli_filename);
    obj->tli_filename = NULL;
    free(obj->tli_topic);
    obj->tli_topic = NULL;

    /* Fetch the successor before unlinking the current item, since the
       item is gone once it has been released */
    for (item = opal_list_get_first(&(obj->tli_processes));
         opal_list_get_end(&(obj->tli_processes)) != item;
         item = next) {
        next = opal_list_get_next(item);
        opal_list_remove_item(&(obj->tli_processes), item);
        OBJ_RELEASE(item);
    }
}
/*
 * Dealing with special characters in xml output.
 *
 * Wraps the input in <stderr>...</stderr> tags and escapes it:
 *   '&' -> "&amp;",  '<' -> "&lt;",  '>' -> "&gt;",
 *   bytes below 32 or above 127 -> "&#NNN;" (three decimal digits).
 * Every '\n' (after being escaped) closes the current element with
 * "</stderr>\n"; a new "<stderr>" is opened if more input follows.
 * Input that does not end in '\n' gets a final "</stderr>\n".
 *
 * @param input  NUL-terminated string to format (not modified)
 * @return       newly malloc'ed, NUL-terminated string.  If the
 *               allocation fails, the error is logged and the *input
 *               pointer itself* is returned so the message still gets
 *               out; callers must therefore free the result only when
 *               it differs from input.
 *
 * The buffer is sized from a first pass over the input, so the copy
 * loop can never run past it (the previous fixed len+1024 padding
 * could be overrun by inputs with many escaped characters).
 */
static char* xml_format(unsigned char *input)
{
    static const char starttag[] = "<stderr>";
    static const char endtag[] = "</stderr>";
    const size_t starttaglen = sizeof(starttag) - 1;
    const size_t endtaglen = sizeof(endtag) - 1;
    size_t len, need, i, k;
    unsigned char c;
    char *output;
    bool endtagged = false;

    len = strlen((char*)input);

    /* Pass 1: upper bound on the space required.  Start with the
       leading tag, the trailing tag, its newline and the NUL. */
    need = starttaglen + endtaglen + 2;
    for (i = 0; i < len; i++) {
        c = input[i];
        if ('&' == c) {
            need += 5;                  /* "&amp;" */
        } else if ('<' == c || '>' == c) {
            need += 4;                  /* "&lt;" or "&gt;" */
        } else if (c < 32 || c > 127) {
            need += 6;                  /* "&#NNN;" */
            if ('\n' == c) {
                /* end tag, newline and (possibly) a new start tag */
                need += endtaglen + 1 + starttaglen;
            }
        } else {
            need += 1;
        }
    }

    output = (char*)malloc(need);
    if (NULL == output) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return (char*)input; /* default to no xml formatting */
    }

    /* Pass 2: start with the tag, then copy/escape the input */
    memcpy(output, starttag, starttaglen);
    k = starttaglen;

    for (i = 0; i < len; i++) {
        c = input[i];
        if ('&' == c) {
            memcpy(output + k, "&amp;", 5);
            k += 5;
        } else if ('<' == c) {
            memcpy(output + k, "&lt;", 4);
            k += 4;
        } else if ('>' == c) {
            memcpy(output + k, "&gt;", 4);
            k += 4;
        } else if (c < 32 || c > 127) {
            /* this is a non-printable character, so escape it too */
            k += (size_t)snprintf(output + k, need - k, "&#%03d;", (int)c);
            /* if this was a \n, then we also need to break the line
               with the end tag */
            if ('\n' == c) {
                memcpy(output + k, endtag, endtaglen);
                k += endtaglen;
                output[k++] = '\n';
                /* if this isn't the end of the input buffer, add a
                   new start tag */
                if (i + 1 < len) {
                    memcpy(output + k, starttag, starttaglen);
                    k += starttaglen;
                    endtagged = false;
                } else {
                    endtagged = true;
                }
            }
        } else {
            output[k++] = (char)c;
        }
    }

    if (!endtagged) {
        /* need to add an endtag */
        memcpy(output + k, endtag, endtaglen);
        k += endtaglen;
        output[k++] = '\n';
    }
    output[k] = '\0';

    return output;
}
/*
 * Returns ORTE_SUCCESS if the strings match; ORTE_ERROR otherwise.
 *
 * Two strings match when they are identical, or - if at least one of
 * them contains a '*' - when the parts preceding the first '*' (the
 * whole string if it has none) agree over the length of the shorter
 * part.  An empty part (e.g. a leading '*') matches anything.
 *
 * A NULL argument never matches.  The comparison works directly on the
 * arguments; no temporary copies are allocated, so it cannot fail for
 * lack of memory.
 */
static int match(const char *a, const char *b)
{
    const char *star_a, *star_b;
    size_t len_a, len_b, min;

    if (NULL == a || NULL == b) {
        return ORTE_ERROR;
    }

    /* Check straight string match first */
    if (0 == strcmp(a, b)) {
        return ORTE_SUCCESS;
    }

    star_a = strchr(a, '*');
    star_b = strchr(b, '*');
    if (NULL == star_a && NULL == star_b) {
        /* No wildcard and not identical: no match */
        return ORTE_ERROR;
    }

    /* Length of each string up to (not including) its first '*' */
    len_a = (NULL != star_a) ? (size_t)(star_a - a) : strlen(a);
    len_b = (NULL != star_b) ? (size_t)(star_b - b) : strlen(b);
    min = (len_a < len_b) ? len_a : len_b;

    if (0 == min || 0 == strncmp(a, b, min)) {
        return ORTE_SUCCESS;
    }

    /* No match */
    return ORTE_ERROR;
}
/*
 * Check to see if a given (filename, topic) tuple has been displayed
 * already.  Return ORTE_SUCCESS if so, or ORTE_ERR_NOT_FOUND if not.
 *
 * On both of those return values, *tli is set to the
 * tuple_list_item_t representing this (filename, topic) entry in the
 * list of "already been displayed tuples" (if it wasn't in the list
 * already, this function creates a new entry in the list and returns
 * it).
 *
 * If a new entry cannot be allocated, ORTE_ERR_OUT_OF_RESOURCE is
 * returned, *tli is set to NULL and the list is left unchanged.
 *
 * Note that a list is not an overly-efficient mechanism for this kind
 * of data.  The assumption is that there will only be a small number
 * of (filename, topic) tuples displayed so the storage required will
 * be fairly small, and linear searches will be fast enough.
 */
static int get_tli(const char *filename, const char *topic,
                   tuple_list_item_t **tli)
{
    opal_list_item_t *item;
    tuple_list_item_t *entry;

    /* Search the list for a duplicate. */
    for (item = opal_list_get_first(&abd_tuples);
         opal_list_get_end(&abd_tuples) != item;
         item = opal_list_get_next(item)) {
        (*tli) = (tuple_list_item_t*) item;
        if (ORTE_SUCCESS == match((*tli)->tli_filename, filename) &&
            ORTE_SUCCESS == match((*tli)->tli_topic, topic)) {
            return ORTE_SUCCESS;
        }
    }

    /* Nope, we didn't find it -- make a new one */
    *tli = NULL;
    entry = OBJ_NEW(tuple_list_item_t);
    if (NULL == entry) {
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    entry->tli_filename = strdup(filename);
    entry->tli_topic = strdup(topic);
    if (NULL == entry->tli_filename || NULL == entry->tli_topic) {
        /* Never store an entry with a missing string: later searches
           hand these strings to match().  The destructor frees
           whichever copy did succeed. */
        OBJ_RELEASE(entry);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    opal_list_append(&abd_tuples, &(entry->super));
    *tli = entry;
    return ORTE_ERR_NOT_FOUND;
}
static void show_accumulated_duplicates(int fd, short event, void *context)
{
opal_list_item_t *item;
time_t now = time(NULL);
tuple_list_item_t *tli;
char *tmp, *output;
/* Loop through all the messages we've displayed and see if any
processes have sent duplicates that have not yet been displayed
yet */
2015-06-23 20:59:57 -07:00
for (item = opal_list_get_first(&abd_tuples);
opal_list_get_end(&abd_tuples) != item;
item = opal_list_get_next(item)) {
tli = (tuple_list_item_t*) item;
2015-06-23 20:59:57 -07:00
if (tli->tli_display &&
tli->tli_count_since_last_display > 0) {
static bool first = true;
if (orte_xml_output) {
opal_asprintf(&tmp, "%d more process%s sent help message %s / %s",
tli->tli_count_since_last_display,
(tli->tli_count_since_last_display > 1) ? "es have" : " has",
tli->tli_filename, tli->tli_topic);
output = xml_format((unsigned char*)tmp);
free(tmp);
fprintf(orte_xml_fp, "%s", output);
free(output);
} else {
opal_output(0, "%d more process%s sent help message %s / %s",
tli->tli_count_since_last_display,
(tli->tli_count_since_last_display > 1) ? "es have" : " has",
tli->tli_filename, tli->tli_topic);
}
tli->tli_count_since_last_display = 0;
if (first) {
if (orte_xml_output) {
fprintf(orte_xml_fp, "<stderr>Set MCA parameter \"orte_base_help_aggregate\" to 0 to see all help / error messages</stderr>\n");
fflush(orte_xml_fp);
} else {
opal_output(0, "Set MCA parameter \"orte_base_help_aggregate\" to 0 to see all help / error messages");
}
first = false;
}
}
}
show_help_time_last_displayed = now;
show_help_timer_set = false;
}
/*
 * Process one help message locally.
 *
 * @param filename  help file the message came from
 * @param topic     topic within that file
 * @param output    rendered message text, or NULL for a control
 *                  message asking to suppress this (filename, topic)
 * @param sender    name of the process that raised the message
 * @return          ORTE_SUCCESS, or the error reported while looking
 *                  up / recording the tuple
 *
 * When aggregation is enabled, the first message of a tuple is printed
 * immediately and later duplicates are only counted and summarized
 * periodically; otherwise every message is printed.
 */
static int show_help(const char *filename, const char *topic,
                     const char *output, orte_process_name_t *sender)
{
    int rc;
    tuple_list_item_t *tli = NULL;
    orte_namelist_t *pnli;
    time_t now = time(NULL);

    /* If we're aggregating, check for duplicates.  Otherwise, don't
       track duplicates at all and always display the message. */
    if (orte_help_want_aggregate) {
        rc = get_tli(filename, topic, &tli);
    } else {
        rc = ORTE_ERR_NOT_FOUND;
    }

    /* If there's no output string (i.e., this is a control message
       asking us to suppress), then skip to the end. */
    if (NULL == output) {
        if (NULL == tli) {
            /* No tuple to mark: either we are not tracking tuples at
               all (nothing to suppress), or the lookup failed. */
            if (orte_help_want_aggregate) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            return ORTE_SUCCESS;
        }
        tli->tli_display = false;
        goto after_output;
    }

    /* Was it already displayed? */
    if (ORTE_SUCCESS == rc) {
        /* Yes.  But do we want to print anything?  That's complicated.

           We always show the first message of a given (filename,
           topic) tuple as soon as it arrives.  But we don't want to
           show duplicate notices often, because we could get overrun
           with them.  So we want to gather them up and say "We got N
           duplicates" every once in a while.

           And keep in mind that at termination, we'll unconditionally
           show all accumulated duplicate notices.

           A simple scheme is as follows:
           - when the first of a (filename, topic) tuple arrives
             - print the message
             - if a timer is not set, set T=now
           - when a duplicate (filename, topic) tuple arrives
             - if now>(T+5) and timer is not set (due to
               non-pre-emptiveness of our libevent, a timer *could* be
               set!)
               - print all accumulated duplicates
               - reset T=now
             - else if a timer was not set, set the timer for T+5
             - else if a timer was set, do nothing (just wait)
           - set T=now when the timer expires
        */
        ++tli->tli_count_since_last_display;
        /* The "5" of the scheme above is show_help_interval */
        if (now > show_help_time_last_displayed + show_help_interval.tv_sec &&
            !show_help_timer_set) {
            show_accumulated_duplicates(0, 0, NULL);
        } else if (!show_help_timer_set) {
            opal_event_evtimer_set(orte_event_base, &show_help_timer_event,
                                   show_accumulated_duplicates, NULL);
            opal_event_evtimer_add(&show_help_timer_event, &show_help_interval);
            show_help_timer_set = true;
        }
    }
    /* Not already displayed */
    else if (ORTE_ERR_NOT_FOUND == rc) {
        if (orte_xml_output) {
            char *tmp;
            tmp = xml_format((unsigned char*)output);
            fprintf(orte_xml_fp, "%s", tmp);
            fflush(orte_xml_fp);
            /* xml_format() hands back its argument when it cannot
               allocate; the caller owns output, so only free a
               separately allocated result */
            if (tmp != output) {
                free(tmp);
            }
        } else {
            opal_output(orte_help_output, "%s", output);
        }
        if (!show_help_timer_set) {
            show_help_time_last_displayed = now;
        }
    }
    /* Some other error occurred */
    else {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

 after_output:
    /* If we're aggregating, add this process name to the list */
    if (orte_help_want_aggregate && NULL != tli) {
        pnli = OBJ_NEW(orte_namelist_t);
        if (NULL == pnli) {
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        pnli->name = *sender;
        opal_list_append(&(tli->tli_processes), &(pnli->super));
    }
    return ORTE_SUCCESS;
}
/* Note that this function is called from ess/hnp, so don't make it
   static */
/*
 * Receive callback for relayed help messages: unpacks the filename,
 * the topic, the "have output" flag and (if flagged) the rendered
 * text from the buffer, then hands them to show_help() on behalf of
 * the sender.  status, tag and cbdata are not used.  Any unpack error
 * is logged and the message is dropped.
 */
void orte_show_help_recv(int status, orte_process_name_t* sender,
                         opal_buffer_t *buffer, orte_rml_tag_t tag,
                         void* cbdata)
{
    char *output=NULL;
    char *filename=NULL, *topic=NULL;
    int32_t n;
    int8_t have_output;
    int rc;

    OPAL_OUTPUT_VERBOSE((5, orte_debug_output,
                         "%s got show_help from %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(sender)));

    /* unpack the filename of the show_help text file */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &filename, &n, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    /* unpack the topic tag */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &topic, &n, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    /* unpack the flag */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &have_output, &n, OPAL_INT8))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* If we have an output string, unpack it */
    if (have_output) {
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &output, &n, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
    }

    /* Send it to show_help (output stays NULL for a suppress request) */
    rc = show_help(filename, topic, output, sender);

 cleanup:
    /* The unpacked strings belong to us; free(NULL) is a no-op */
    free(output);
    free(filename);
    free(topic);
}
int orte_show_help_init(void)
{
2016-10-07 19:23:52 -07:00
opal_output_stream_t lds;
OPAL_OUTPUT_VERBOSE((5, orte_debug_output, "orte_show_help init"));
/* Show help duplicate detection */
if (ready) {
return ORTE_SUCCESS;
}
OBJ_CONSTRUCT(&abd_tuples, opal_list_t);
2015-06-23 20:59:57 -07:00
2016-10-07 19:23:52 -07:00
/* create an output stream for us */
OBJ_CONSTRUCT(&lds, opal_output_stream_t);
lds.lds_want_stderr = true;
orte_help_output = opal_output_open(&lds);
OBJ_DESTRUCT(&lds);
save_help = opal_show_help;
opal_show_help = orte_show_help;
ready = true;
return ORTE_SUCCESS;
}
void orte_show_help_finalize(void)
{
if (!ready) {
return;
}
ready = false;
2016-10-07 19:23:52 -07:00
opal_output_close(orte_help_output);
opal_show_help = save_help;
save_help = NULL;
/* Shutdown show_help, showing final messages */
if (ORTE_PROC_IS_HNP) {
show_accumulated_duplicates(0, 0, NULL);
OBJ_DESTRUCT(&abd_tuples);
if (show_help_timer_set) {
opal_event_evtimer_del(&show_help_timer_event);
}
2015-06-23 20:59:57 -07:00
/* cancel the recv */
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_SHOW_HELP);
return;
}
}
2015-06-23 20:59:57 -07:00
int orte_show_help(const char *filename, const char *topic,
int want_error_header, ...)
{
int rc = ORTE_SUCCESS;
va_list arglist;
char *output;
2015-06-23 20:59:57 -07:00
if (orte_execute_quiet) {
return ORTE_SUCCESS;
}
2015-06-23 20:59:57 -07:00
va_start(arglist, want_error_header);
2015-06-23 20:59:57 -07:00
output = opal_show_help_vstring(filename, topic, want_error_header,
arglist);
va_end(arglist);
/* If nothing came back, there's nothing to do */
if (NULL == output) {
return ORTE_SUCCESS;
}
rc = orte_show_help_norender(filename, topic, want_error_header, output);
free(output);
return rc;
}
/* Completion callback: cbdata points at the caller's "still active"
   flag, which is cleared here.  status is not examined. */
static void cbfunc(int status, void *cbdata)
{
    *(volatile bool*)cbdata = false;
}
2015-06-23 20:59:57 -07:00
int orte_show_help_norender(const char *filename, const char *topic,
int want_error_header, const char *output)
{
int rc = ORTE_SUCCESS;
int8_t have_output = 1;
opal_buffer_t *buf;
bool am_inside = false;
opal_list_t info;
opal_value_t *kv;
volatile bool active;
struct timespec tp;
if (!ready) {
/* if we are finalizing, then we have no way to process
* this through the orte_show_help system - just drop it to
* stderr; that's at least better than not showing it.
*
* If we are not finalizing, then this is probably a show_help
* stemming from either a cmd-line request to display the usage
* message, or a show_help related to a user error. In either case,
* we can't do anything but just print to stderr.
*/
fprintf(stderr, "%s", output);
goto CLEANUP;
}
2015-06-23 20:59:57 -07:00
/* if we are the HNP, or the RML has not yet been setup,
* or ROUTED has not been setup,
* or we weren't given an HNP, or we are running in standalone
* mode, then all we can do is process this locally
*/
if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_TOOL ||
orte_standalone_operation) {
rc = show_help(filename, topic, output, ORTE_PROC_MY_NAME);
goto CLEANUP;
} else if (ORTE_PROC_IS_DAEMON) {
if (NULL == orte_rml.send_buffer_nb ||
NULL == orte_routed.get_route ||
NULL == orte_process_info.my_hnp_uri) {
rc = show_help(filename, topic, output, ORTE_PROC_MY_NAME);
goto CLEANUP;
}
}
2015-06-23 20:59:57 -07:00
/* otherwise, we relay the output message to
* the HNP for processing
*/
/* JMS Note that we *may* have a recursion situation here where
the RML could call show_help. Need to think about this
properly, but put a safeguard in here for sure for the time
being. */
if (am_inside) {
rc = show_help(filename, topic, output, ORTE_PROC_MY_NAME);
} else {
am_inside = true;
/* build the message to the HNP */
buf = OBJ_NEW(opal_buffer_t);
/* pack the filename of the show_help text file */
opal_dss.pack(buf, &filename, 1, OPAL_STRING);
/* pack the topic tag */
opal_dss.pack(buf, &topic, 1, OPAL_STRING);
/* pack the flag that we have a string */
opal_dss.pack(buf, &have_output, 1, OPAL_INT8);
/* pack the resulting string */
opal_dss.pack(buf, &output, 1, OPAL_STRING);
/* if we are a daemon, then send it via RML to the HNP */
if (ORTE_PROC_IS_DAEMON) {
/* send it to the HNP */
if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
ORTE_RML_TAG_SHOW_HELP,
orte_rml_send_callback, NULL))) {
OBJ_RELEASE(buf);
/* okay, that didn't work, output locally */
opal_output(orte_help_output, "%s", output);
} else {
rc = ORTE_SUCCESS;
}
} else {
/* if we are not a daemon (i.e., we are an app) and if PMIx
* support for "log" is available, then use that channel */
if (NULL != opal_pmix.log) {
OBJ_CONSTRUCT(&info, opal_list_t);
kv = OBJ_NEW(opal_value_t),
kv->key = strdup(OPAL_PMIX_LOG_MSG);
kv->type = OPAL_BYTE_OBJECT;
opal_dss.unload(buf, (void**)&kv->data.bo.bytes, &kv->data.bo.size);
opal_list_append(&info, &kv->super);
active = true;
tp.tv_sec = 0;
tp.tv_nsec = 1000000;
opal_pmix.log(&info, cbfunc, (void*)&active);
while (active) {
nanosleep(&tp, NULL);
}
OBJ_RELEASE(buf);
kv->data.bo.bytes = NULL;
OPAL_LIST_DESTRUCT(&info);
rc = ORTE_SUCCESS;
goto CLEANUP;
} else {
rc = show_help(filename, topic, output, ORTE_PROC_MY_NAME);
}
}
am_inside = false;
}
2015-06-23 20:59:57 -07:00
CLEANUP:
return rc;
}
int orte_show_help_suppress(const char *filename, const char *topic)
{
int rc = ORTE_SUCCESS;
int8_t have_output = 0;
2015-06-23 20:59:57 -07:00
if (orte_execute_quiet) {
return ORTE_SUCCESS;
}
2015-06-23 20:59:57 -07:00
if (!ready) {
/* If we are finalizing, then we have no way to process this
through the orte_show_help system - just drop it. */
return ORTE_SUCCESS;
}
2015-06-23 20:59:57 -07:00
/* If we are the HNP, or the RML has not yet been setup, or ROUTED
has not been setup, or we weren't given an HNP, then all we can
do is process this locally. */
if (ORTE_PROC_IS_HNP ||
NULL == orte_rml.send_buffer_nb ||
NULL == orte_routed.get_route ||
NULL == orte_process_info.my_hnp_uri) {
rc = show_help(filename, topic, NULL, ORTE_PROC_MY_NAME);
}
2015-06-23 20:59:57 -07:00
/* otherwise, we relay the output message to
* the HNP for processing
*/
else {
opal_buffer_t *buf;
static bool am_inside = false;
/* JMS Note that we *may* have a recursion situation here where
the RML could call show_help. Need to think about this
properly, but put a safeguard in here for sure for the time
being. */
if (am_inside) {
rc = show_help(filename, topic, NULL, ORTE_PROC_MY_NAME);
} else {
am_inside = true;
2015-06-23 20:59:57 -07:00
/* build the message to the HNP */
buf = OBJ_NEW(opal_buffer_t);
/* pack the filename of the show_help text file */
opal_dss.pack(buf, &filename, 1, OPAL_STRING);
/* pack the topic tag */
opal_dss.pack(buf, &topic, 1, OPAL_STRING);
/* pack the flag that we DO NOT have a string */
opal_dss.pack(buf, &have_output, 1, OPAL_INT8);
/* send it to the HNP */
if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
ORTE_RML_TAG_SHOW_HELP,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
/* okay, that didn't work, just process locally error, just ignore return */
show_help(filename, topic, NULL, ORTE_PROC_MY_NAME);
}
am_inside = false;
}
}
2015-06-23 20:59:57 -07:00
return ORTE_SUCCESS;
}