diff --git a/orte/mca/iof/base/base.h b/orte/mca/iof/base/base.h index fa3095d4a0..a0df4e4b32 100644 --- a/orte/mca/iof/base/base.h +++ b/orte/mca/iof/base/base.h @@ -71,15 +71,6 @@ typedef struct { } orte_iof_write_event_t; ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_iof_write_event_t); -struct orte_iof_base_t { - int iof_output; - opal_list_t iof_components_opened; - opal_mutex_t iof_write_output_lock; - orte_iof_write_event_t iof_write_stdout; - orte_iof_write_event_t iof_write_stderr; -}; -typedef struct orte_iof_base_t orte_iof_base_t; - typedef struct { opal_list_item_t super; orte_process_name_t name; @@ -122,14 +113,25 @@ typedef struct { } orte_iof_write_output_t; ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_iof_write_output_t); +/* the iof globals struct */ +struct orte_iof_base_t { + int iof_output; + opal_list_t iof_components_opened; + opal_mutex_t iof_write_output_lock; + orte_iof_sink_t *iof_write_stdout; + orte_iof_sink_t *iof_write_stderr; +}; +typedef struct orte_iof_base_t orte_iof_base_t; + + #if OMPI_ENABLE_DEBUG #define ORTE_IOF_SINK_DEFINE(snk, nm, fid, tg, wrthndlr, eplist) \ do { \ orte_iof_sink_t *ep; \ OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, \ - "defining endpoint: %s %d", \ - __FILE__, __LINE__)); \ + "defining endpt: file %s line %d fd %d",\ + __FILE__, __LINE__, (fid))); \ ep = OBJ_NEW(orte_iof_sink_t); \ ep->name.jobid = (nm)->jobid; \ ep->name.vpid = (nm)->vpid; \ @@ -138,9 +140,11 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_iof_write_output_t); ep->wev->fd = (fid); \ opal_event_set(&(ep->wev->ev), ep->wev->fd, \ OPAL_EV_WRITE, \ - wrthndlr, ep) ; \ + wrthndlr, ep); \ + } \ + if (NULL != (eplist)) { \ + opal_list_append((eplist), &ep->super); \ } \ - opal_list_append((eplist), &ep->super); \ *(snk) = ep; \ ep->file = strdup(__FILE__); \ ep->line = __LINE__; \ diff --git a/orte/mca/iof/base/iof_base_close.c b/orte/mca/iof/base/iof_base_close.c index dc8ff63ff4..ef7f617889 100644 --- a/orte/mca/iof/base/iof_base_close.c +++ 
b/orte/mca/iof/base/iof_base_close.c @@ -36,6 +36,7 @@ int orte_iof_base_close(void) bool dump; opal_list_item_t *item; orte_iof_write_output_t *output; + orte_iof_write_event_t *wev; int num_written; /* shutdown any remaining opened components */ @@ -48,13 +49,14 @@ int orte_iof_base_close(void) OPAL_THREAD_LOCK(&orte_iof_base.iof_write_output_lock); if (!orte_process_info.daemon) { /* check if anything is still trying to be written out */ - if (!opal_list_is_empty(&orte_iof_base.iof_write_stdout.outputs)) { + wev = orte_iof_base.iof_write_stdout->wev; + if (!opal_list_is_empty(&wev->outputs)) { dump = false; /* make one last attempt to write this out */ - while (NULL != (item = opal_list_remove_first(&orte_iof_base.iof_write_stdout.outputs))) { + while (NULL != (item = opal_list_remove_first(&wev->outputs))) { output = (orte_iof_write_output_t*)item; if (!dump) { - num_written = write(orte_iof_base.iof_write_stdout.fd, output->data, output->numbytes); + num_written = write(wev->fd, output->data, output->numbytes); if (num_written < output->numbytes) { /* don't retry - just cleanout the list and dump it */ dump = true; @@ -63,14 +65,15 @@ int orte_iof_base_close(void) OBJ_RELEASE(output); } } - OBJ_DESTRUCT(&orte_iof_base.iof_write_stdout); - if (!opal_list_is_empty(&orte_iof_base.iof_write_stderr.outputs)) { + OBJ_RELEASE(orte_iof_base.iof_write_stdout); + wev = orte_iof_base.iof_write_stderr->wev; + if (!opal_list_is_empty(&wev->outputs)) { dump = false; /* make one last attempt to write this out */ - while (NULL != (item = opal_list_remove_first(&orte_iof_base.iof_write_stderr.outputs))) { + while (NULL != (item = opal_list_remove_first(&wev->outputs))) { output = (orte_iof_write_output_t*)item; if (!dump) { - num_written = write(orte_iof_base.iof_write_stderr.fd, output->data, output->numbytes); + num_written = write(wev->fd, output->data, output->numbytes); if (num_written < output->numbytes) { /* don't retry - just cleanout the list and dump it */ dump = 
true; @@ -79,7 +82,7 @@ int orte_iof_base_close(void) OBJ_RELEASE(output); } } - OBJ_DESTRUCT(&orte_iof_base.iof_write_stderr); + OBJ_RELEASE(orte_iof_base.iof_write_stderr); } OPAL_THREAD_UNLOCK(&orte_iof_base.iof_write_output_lock); diff --git a/orte/mca/iof/base/iof_base_open.c b/orte/mca/iof/base/iof_base_open.c index 4fc1e4bf95..7f2d5a0b41 100644 --- a/orte/mca/iof/base/iof_base_open.c +++ b/orte/mca/iof/base/iof_base_open.c @@ -25,6 +25,8 @@ #include "opal/mca/mca.h" #include "opal/mca/base/base.h" #include "opal/mca/base/mca_base_param.h" +#include "opal/util/os_dirpath.h" +#include "opal/util/basename.h" #include "orte/util/show_help.h" #include "orte/util/proc_info.h" @@ -84,6 +86,8 @@ OBJ_CLASS_INSTANCE(orte_iof_proc_t, static void orte_iof_base_sink_construct(orte_iof_sink_t* ptr) { + ptr->daemon.jobid = ORTE_JOBID_INVALID; + ptr->daemon.vpid = ORTE_VPID_INVALID; ptr->wev = OBJ_NEW(orte_iof_write_event_t); } static void orte_iof_base_sink_destruct(orte_iof_sink_t* ptr) @@ -163,31 +167,38 @@ orte_iof_base_t orte_iof_base; */ int orte_iof_base_open(void) { + int rc; + /* Initialize globals */ OBJ_CONSTRUCT(&orte_iof_base.iof_components_opened, opal_list_t); OBJ_CONSTRUCT(&orte_iof_base.iof_write_output_lock, opal_mutex_t); + /* did the user request we print output to files? */ + if (NULL != orte_output_filename) { + /* we will setup the files themselves as needed in the iof + * module. 
For now, let's see if the filename contains a + * path, or just a name + */ + char *path; + path = opal_dirname(orte_output_filename); + if (0 != strcmp(path, orte_output_filename)) { + /* there is a path in this name - ensure that the directory + * exists, and create it if not + */ + if (ORTE_SUCCESS != (rc = opal_os_dirpath_create(path, S_IRWXU))) { + return rc; + } + } + } + /* daemons do not need to do this as they do not write out stdout/err */ if (!orte_process_info.daemon) { /* setup the stdout event */ - OBJ_CONSTRUCT(&orte_iof_base.iof_write_stdout, orte_iof_write_event_t); - orte_iof_base.iof_write_stdout.fd = 1; - /* create the write event, but don't add it until we need it */ - opal_event_set(&orte_iof_base.iof_write_stdout.ev, - orte_iof_base.iof_write_stdout.fd, - OPAL_EV_WRITE, - orte_iof_base_write_handler, - &orte_iof_base.iof_write_stdout); - + ORTE_IOF_SINK_DEFINE(&orte_iof_base.iof_write_stdout, ORTE_PROC_MY_NAME, + 1, ORTE_IOF_STDOUT, orte_iof_base_write_handler, NULL); /* setup the stderr event */ - OBJ_CONSTRUCT(&orte_iof_base.iof_write_stderr, orte_iof_write_event_t); - orte_iof_base.iof_write_stderr.fd = 2; - /* create the write event, but don't add it until we need it */ - opal_event_set(&orte_iof_base.iof_write_stderr.ev, - orte_iof_base.iof_write_stderr.fd, - OPAL_EV_WRITE, - orte_iof_base_write_handler, - &orte_iof_base.iof_write_stderr); + ORTE_IOF_SINK_DEFINE(&orte_iof_base.iof_write_stderr, ORTE_PROC_MY_NAME, + 2, ORTE_IOF_STDERR, orte_iof_base_write_handler, NULL); /* do NOT set these file descriptors to non-blocking. 
If we do so, * we set the file descriptor to non-blocking for everyone that has * that file descriptor, which includes everyone else in our shell diff --git a/orte/mca/iof/base/iof_base_output.c b/orte/mca/iof/base/iof_base_output.c index 59b7144736..5ca1ddb58a 100644 --- a/orte/mca/iof/base/iof_base_output.c +++ b/orte/mca/iof/base/iof_base_output.c @@ -30,6 +30,9 @@ #ifdef HAVE_UNISTD_H #include #endif +#ifdef HAVE_TIME_H +#include +#endif #include #include "orte/util/name_fns.h" @@ -47,17 +50,26 @@ int orte_iof_base_write_output(orte_process_name_t *name, orte_iof_tag_t stream, int i, j, k, starttaglen, endtaglen, num_buffered; OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, - "%s write:output setting up to write %d bytes to %s of %s", + "%s write:output setting up to write %d bytes to %s for %s on fd %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes, (ORTE_IOF_STDIN & stream) ? "stdin" : ((ORTE_IOF_STDOUT & stream) ? "stdout" : ((ORTE_IOF_STDERR & stream) ? "stderr" : "stddiag")), - ORTE_NAME_PRINT(name))); + ORTE_NAME_PRINT(name), channel->fd)); /* setup output object */ output = OBJ_NEW(orte_iof_write_output_t); /* write output data to the corresponding tag */ if (ORTE_IOF_STDIN & stream) { - suffix = NULL; + /* copy over the data to be written */ + if (0 < numbytes) { + /* don't copy 0 bytes - we just need to pass + * the zero bytes so the fd can be closed + * after it writes everything out + */ + memcpy(output->data, data, numbytes); + } + output->numbytes = numbytes; + goto process; } else if (ORTE_IOF_STDOUT & stream) { /* write the bytes to stdout */ suffix = "stdout"; @@ -74,59 +86,92 @@ int orte_iof_base_write_output(orte_process_name_t *name, orte_iof_tag_t stream, "%s stream %0x", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), stream)); return ORTE_ERR_VALUE_OUT_OF_BOUNDS; } - - /* see if data is to be tagged */ - if (orte_tag_output && NULL != suffix) { - /* if this is to be xml tagged, create a tag with the correct syntax */ - if (orte_xml_output) { - 
snprintf(starttag, ORTE_IOF_BASE_TAG_MAX, "<%s rank=\"%s\">", suffix, ORTE_VPID_PRINT(name->vpid)); - snprintf(endtag, ORTE_IOF_BASE_TAG_MAX, "", suffix); - } else { - snprintf(starttag, ORTE_IOF_BASE_TAG_MAX, "[%s,%s]<%s>", - ORTE_LOCAL_JOBID_PRINT(name->jobid), - ORTE_VPID_PRINT(name->vpid), suffix); - memset(endtag, '\0', ORTE_IOF_BASE_TAG_MAX); - } - starttaglen = strlen(starttag); - endtaglen = strlen(endtag); - /* start with the tag */ - for (j=0, k=0; j < starttaglen && k < ORTE_IOF_BASE_TAGGED_OUT_MAX; j++) { - output->data[k++] = starttag[j]; - } - /* cycle through the data looking for - * and replace those with the tag - */ - for (i=0; i < numbytes && k < ORTE_IOF_BASE_TAGGED_OUT_MAX; i++) { - if ('\n' == data[i]) { - /* we need to break the line with the end tag */ - for (j=0; j < endtaglen && k < ORTE_IOF_BASE_TAGGED_OUT_MAX; j++) { - output->data[k++] = endtag[j]; - } - /* move the over */ - output->data[k++] = '\n'; - /* if this isn't the end of the line, add a new start tag */ - if (i < numbytes-1) { - for (j=0; j < starttaglen && k < ORTE_IOF_BASE_TAGGED_OUT_MAX; j++) { - output->data[k++] = starttag[j]; - } - } - } else { - output->data[k++] = data[i]; - } - } - output->numbytes = k; - } else { - /* copy over the data to be written */ - if (0 < numbytes) { - /* don't copy 0 bytes - we just need to pass - * the zero bytes so the fd can be closed - * after it writes everything out - */ - memcpy(output->data, data, numbytes); - } - output->numbytes = numbytes; + + /* if this is to be xml tagged, create a tag with the correct syntax - we do not allow + * timestamping of xml output + */ + if (orte_xml_output) { + snprintf(starttag, ORTE_IOF_BASE_TAG_MAX, "<%s rank=\"%s\">", suffix, ORTE_VPID_PRINT(name->vpid)); + snprintf(endtag, ORTE_IOF_BASE_TAG_MAX, "", suffix); + goto construct; } + /* if we are to timestamp output, start the tag with that */ + if (orte_timestamp_output) { + time_t mytime; + char *cptr; + /* get the timestamp */ + time(&mytime); + 
cptr = ctime(&mytime); + cptr[strlen(cptr)-1] = '\0'; /* remove trailing newline */ + + if (orte_tag_output) { + /* if we want it tagged as well, use both */ + snprintf(starttag, ORTE_IOF_BASE_TAG_MAX, "%s[%s,%s]<%s>:", + cptr, ORTE_LOCAL_JOBID_PRINT(name->jobid), + ORTE_VPID_PRINT(name->vpid), suffix); + } else { + /* only use timestamp */ + snprintf(starttag, ORTE_IOF_BASE_TAG_MAX, "%s<%s>:", cptr, suffix); + } + /* no endtag for this option */ + memset(endtag, '\0', ORTE_IOF_BASE_TAG_MAX); + goto construct; + } + + if (orte_tag_output) { + snprintf(starttag, ORTE_IOF_BASE_TAG_MAX, "[%s,%s]<%s>:", + ORTE_LOCAL_JOBID_PRINT(name->jobid), + ORTE_VPID_PRINT(name->vpid), suffix); + /* no endtag for this option */ + memset(endtag, '\0', ORTE_IOF_BASE_TAG_MAX); + goto construct; + } + + /* if we get here, then the data is not to be tagged - just copy it + * and move on to processing + */ + if (0 < numbytes) { + /* don't copy 0 bytes - we just need to pass + * the zero bytes so the fd can be closed + * after it writes everything out + */ + memcpy(output->data, data, numbytes); + } + output->numbytes = numbytes; + goto process; + +construct: + starttaglen = strlen(starttag); + endtaglen = strlen(endtag); + /* start with the tag */ + for (j=0, k=0; j < starttaglen && k < ORTE_IOF_BASE_TAGGED_OUT_MAX; j++) { + output->data[k++] = starttag[j]; + } + /* cycle through the data looking for + * and replace those with the tag + */ + for (i=0; i < numbytes && k < ORTE_IOF_BASE_TAGGED_OUT_MAX; i++) { + if ('\n' == data[i]) { + /* we need to break the line with the end tag */ + for (j=0; j < endtaglen && k < ORTE_IOF_BASE_TAGGED_OUT_MAX; j++) { + output->data[k++] = endtag[j]; + } + /* move the over */ + output->data[k++] = '\n'; + /* if this isn't the end of the line, add a new start tag */ + if (i < numbytes-1) { + for (j=0; j < starttaglen && k < ORTE_IOF_BASE_TAGGED_OUT_MAX; j++) { + output->data[k++] = starttag[j]; + } + } + } else { + output->data[k++] = data[i]; + } + } + 
output->numbytes = k; + +process: /* lock us up to protect global operations */ OPAL_THREAD_LOCK(&orte_iof_base.iof_write_output_lock); @@ -154,7 +199,8 @@ int orte_iof_base_write_output(orte_process_name_t *name, orte_iof_tag_t stream, void orte_iof_base_write_handler(int fd, short event, void *cbdata) { - orte_iof_write_event_t *wev = (orte_iof_write_event_t*)cbdata; + orte_iof_sink_t *sink = (orte_iof_sink_t*)cbdata; + orte_iof_write_event_t *wev = sink->wev; opal_list_item_t *item; orte_iof_write_output_t *output; int num_written; diff --git a/orte/mca/iof/hnp/iof_hnp.c b/orte/mca/iof/hnp/iof_hnp.c index a319ce5910..18b082d0d6 100644 --- a/orte/mca/iof/hnp/iof_hnp.c +++ b/orte/mca/iof/hnp/iof_hnp.c @@ -41,6 +41,8 @@ #include "orte/mca/oob/base/base.h" #include "orte/runtime/orte_globals.h" #include "orte/mca/errmgr/errmgr.h" +#include "orte/util/name_fns.h" +#include "orte/mca/odls/odls_types.h" #include "orte/mca/iof/base/base.h" #include "iof_hnp.h" @@ -99,6 +101,10 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag, orte_iof_proc_t *proct; opal_list_item_t *item; int flags; + char *outfile; + int fdout; + orte_odls_job_t *jobdat; + int np, numdigs; int rc; /* don't do this if the dst vpid is invalid or the fd is negative! 
*/ @@ -138,18 +144,64 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag, proct->name.jobid = dst_name->jobid; proct->name.vpid = dst_name->vpid; opal_list_append(&mca_iof_hnp_component.procs, &proct->super); + /* see if we are to output to a file */ + if (NULL != orte_output_filename) { + /* get the local jobdata for this proc */ + for (item = opal_list_get_first(&orte_local_jobdata); + item != opal_list_get_end(&orte_local_jobdata); + item = opal_list_get_next(item)) { + jobdat = (orte_odls_job_t*)item; + if (jobdat->jobid == proct->name.jobid) { + break; + } + } + np = jobdat->num_procs / 10; + /* determine the number of digits required for max vpid */ + numdigs = 1; + while (np > 0) { + numdigs++; + np = np / 10; + } + /* construct the filename */ + asprintf(&outfile, "%s.%*0lu", orte_output_filename, numdigs, (unsigned long)proct->name.vpid); + /* create the file */ + fdout = open(outfile, O_CREAT|O_RDWR|O_TRUNC, 0644); + free(outfile); + if (fdout < 0) { + /* couldn't be opened */ + ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE); + return ORTE_ERR_FILE_OPEN_FAILURE; + } + /* define a sink to that file descriptor */ + ORTE_IOF_SINK_DEFINE(&sink, dst_name, fdout, ORTE_IOF_STDOUTALL, + orte_iof_base_write_handler, + &mca_iof_hnp_component.sinks); + } SETUP: /* define a read event and activate it */ if (src_tag & ORTE_IOF_STDOUT) { ORTE_IOF_READ_EVENT(&proct->revstdout, dst_name, fd, ORTE_IOF_STDOUT, - orte_iof_hnp_read_local_handler, true); + orte_iof_hnp_read_local_handler, false); } else if (src_tag & ORTE_IOF_STDERR) { ORTE_IOF_READ_EVENT(&proct->revstderr, dst_name, fd, ORTE_IOF_STDERR, - orte_iof_hnp_read_local_handler, true); + orte_iof_hnp_read_local_handler, false); } else if (src_tag & ORTE_IOF_STDDIAG) { ORTE_IOF_READ_EVENT(&proct->revstddiag, dst_name, fd, ORTE_IOF_STDDIAG, - orte_iof_hnp_read_local_handler, true); + orte_iof_hnp_read_local_handler, false); + } + /* if -all- of the readevents for this proc have been 
defined, then + * activate them. Otherwise, we can think that the proc is complete + * because one of the readevents fires -prior- to all of them having + * been defined! + */ + if (NULL != proct->revstdout && NULL != proct->revstderr && NULL != proct->revstddiag) { + proct->revstdout->active = true; + opal_event_add(&(proct->revstdout->ev), 0); + proct->revstderr->active = true; + opal_event_add(&(proct->revstderr->ev), 0); + proct->revstddiag->active = true; + opal_event_add(&(proct->revstddiag->ev), 0); } return ORTE_SUCCESS; } diff --git a/orte/mca/iof/hnp/iof_hnp_component.c b/orte/mca/iof/hnp/iof_hnp_component.c index d49e74603b..2f4d202a14 100644 --- a/orte/mca/iof/hnp/iof_hnp_component.c +++ b/orte/mca/iof/hnp/iof_hnp_component.c @@ -109,6 +109,7 @@ static int orte_iof_hnp_close(void) } OBJ_DESTRUCT(&mca_iof_hnp_component.procs); orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_IOF_HNP); + /* release and cleanup the lock */ OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock); OBJ_DESTRUCT(&mca_iof_hnp_component.lock); } @@ -156,7 +157,7 @@ static int orte_iof_hnp_query(mca_base_module_t **module, int *priority) OBJ_CONSTRUCT(&mca_iof_hnp_component.sinks, opal_list_t); OBJ_CONSTRUCT(&mca_iof_hnp_component.procs, opal_list_t); mca_iof_hnp_component.stdinev = NULL; - + /* we must be selected */ *priority = 100; *module = (mca_base_module_t *) &orte_iof_hnp_module; diff --git a/orte/mca/iof/hnp/iof_hnp_read.c b/orte/mca/iof/hnp/iof_hnp_read.c index df07deb8b0..d7196a5e8b 100644 --- a/orte/mca/iof/hnp/iof_hnp_read.c +++ b/orte/mca/iof/hnp/iof_hnp_read.c @@ -203,7 +203,11 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata) item != opal_list_get_end(&mca_iof_hnp_component.sinks); item = opal_list_get_next(item)) { orte_iof_sink_t *sink = (orte_iof_sink_t*)item; - if (sink->tag & rev->tag && + /* if the target isn't set, then this sink is for another purpose - ignore it */ + if (ORTE_JOBID_INVALID == sink->daemon.jobid) { + continue; + } + 
if ((sink->tag & rev->tag) && sink->name.jobid == rev->name.jobid && (ORTE_VPID_WILDCARD == sink->name.vpid || sink->name.vpid == rev->name.vpid)) { /* need to send the data to the remote endpoint - if @@ -275,17 +279,45 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata) break; } } - - } else { - if (ORTE_IOF_STDOUT & rev->tag) { - orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, &orte_iof_base.iof_write_stdout); - } else { - orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, &orte_iof_base.iof_write_stderr); - - } - /* re-add the event */ - opal_event_add(&rev->ev, 0); + OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock); + return; } + + /* see if the user wanted the output directed to files */ + if (NULL != orte_output_filename) { + /* find the sink for this rank */ + for (item = opal_list_get_first(&mca_iof_hnp_component.sinks); + item != opal_list_get_end(&mca_iof_hnp_component.sinks); + item = opal_list_get_next(item)) { + orte_iof_sink_t *sink = (orte_iof_sink_t*)item; + /* if the target is set, then this sink is for another purpose - ignore it */ + if (ORTE_JOBID_INVALID != sink->daemon.jobid) { + continue; + } + /* if this sink isn't for output, ignore it */ + if (ORTE_IOF_STDIN & sink->tag) { + continue; + } + /* is this the desired proc? 
*/ + if (sink->name.jobid == rev->name.jobid && + sink->name.vpid == rev->name.vpid) { + /* output to the corresponding file */ + orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, sink->wev); + /* done */ + break; + } + } + } else { + /* output this to our local output */ + if (ORTE_IOF_STDOUT & rev->tag) { + orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, orte_iof_base.iof_write_stdout->wev); + } else { + orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, orte_iof_base.iof_write_stderr->wev); + } + } + + /* re-add the event */ + opal_event_add(&rev->ev, 0); OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock); return; diff --git a/orte/mca/iof/hnp/iof_hnp_receive.c b/orte/mca/iof/hnp/iof_hnp_receive.c index 895da896ca..313d6f20f4 100644 --- a/orte/mca/iof/hnp/iof_hnp_receive.c +++ b/orte/mca/iof/hnp/iof_hnp_receive.c @@ -27,6 +27,13 @@ #ifdef HAVE_STRING_H #include #endif /* HAVE_STRING_H */ +#ifdef HAVE_FCNTL_H +#include +#else +#ifdef HAVE_SYS_FCNTL_H +#include +#endif +#endif #include "orte/util/show_help.h" @@ -129,9 +136,12 @@ static void process_msg(int fd, short event, void *cbdata) while (item != opal_list_get_end(&mca_iof_hnp_component.sinks)) { next = opal_list_get_next(item); sink = (orte_iof_sink_t*)item; - + /* if the target isn't set, then this sink is for another purpose - ignore it */ + if (ORTE_JOBID_INVALID == sink->daemon.jobid) { + continue; + } /* if this sink is the designated one, then remove it from list */ - if (stream & sink->tag && + if ((stream & sink->tag) && sink->name.jobid == origin.jobid && (ORTE_VPID_WILDCARD == sink->name.vpid || ORTE_VPID_WILDCARD == origin.vpid || @@ -161,19 +171,23 @@ static void process_msg(int fd, short event, void *cbdata) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes, ORTE_NAME_PRINT(&origin))); - /* write the output locally */ + /* output this to our local output */ if (ORTE_IOF_STDOUT & stream) { - orte_iof_base_write_output(&origin, stream, data, numbytes, 
&orte_iof_base.iof_write_stdout); + orte_iof_base_write_output(&origin, stream, data, numbytes, orte_iof_base.iof_write_stdout->wev); } else { - orte_iof_base_write_output(&origin, stream, data, numbytes, &orte_iof_base.iof_write_stderr); + orte_iof_base_write_output(&origin, stream, data, numbytes, orte_iof_base.iof_write_stderr->wev); } /* cycle through the endpoints to see if someone else wants a copy */ for (item = opal_list_get_first(&mca_iof_hnp_component.sinks); item != opal_list_get_end(&mca_iof_hnp_component.sinks); item = opal_list_get_next(item)) { - orte_iof_sink_t* sink = (orte_iof_sink_t*)item; - if (stream & sink->tag && + sink = (orte_iof_sink_t*)item; + /* if the target isn't set, then this sink is for another purpose - ignore it */ + if (ORTE_JOBID_INVALID == sink->daemon.jobid) { + continue; + } + if ((stream & sink->tag) && sink->name.jobid == origin.jobid && (ORTE_VPID_WILDCARD == sink->name.vpid || ORTE_VPID_WILDCARD == origin.vpid || diff --git a/orte/mca/iof/iof_types.h b/orte/mca/iof/iof_types.h index ce4a86ea0f..de68166742 100644 --- a/orte/mca/iof/iof_types.h +++ b/orte/mca/iof/iof_types.h @@ -37,6 +37,8 @@ typedef uint8_t orte_iof_tag_t; #define ORTE_IOF_STDOUT 0x02 #define ORTE_IOF_STDERR 0x04 #define ORTE_IOF_STDDIAG 0x08 +#define ORTE_IOF_STDOUTALL 0x0e + /* flow control flags */ #define ORTE_IOF_XON 0x10 #define ORTE_IOF_XOFF 0x20 diff --git a/orte/mca/iof/orted/iof_orted.c b/orte/mca/iof/orted/iof_orted.c index 33d2ea6aac..97c5ff2adb 100644 --- a/orte/mca/iof/orted/iof_orted.c +++ b/orte/mca/iof/orted/iof_orted.c @@ -42,6 +42,7 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/util/name_fns.h" #include "orte/runtime/orte_globals.h" +#include "orte/mca/odls/odls_types.h" #include "orte/mca/iof/iof.h" #include "orte/mca/iof/base/base.h" @@ -91,6 +92,11 @@ static int orted_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_ta int flags; opal_list_item_t *item; orte_iof_proc_t *proct; + orte_iof_sink_t *sink; + char 
*outfile; + int fdout; + orte_odls_job_t *jobdat; + int np, numdigs; OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, "%s iof:orted pushing fd %d for process %s", @@ -124,20 +130,65 @@ static int orted_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_ta proct->name.jobid = dst_name->jobid; proct->name.vpid = dst_name->vpid; opal_list_append(&mca_iof_orted_component.procs, &proct->super); + /* see if we are to output to a file */ + if (NULL != orte_output_filename) { + /* get the local jobdata for this proc */ + for (item = opal_list_get_first(&orte_local_jobdata); + item != opal_list_get_end(&orte_local_jobdata); + item = opal_list_get_next(item)) { + jobdat = (orte_odls_job_t*)item; + if (jobdat->jobid == proct->name.jobid) { + break; + } + } + np = jobdat->num_procs / 10; + /* determine the number of digits required for max vpid */ + numdigs = 1; + while (np > 0) { + numdigs++; + np = np / 10; + } + /* construct the filename */ + asprintf(&outfile, "%s.%*0lu", orte_output_filename, numdigs, (unsigned long)proct->name.vpid); + /* create the file */ + fdout = open(outfile, O_CREAT|O_RDWR|O_TRUNC, 0644); + free(outfile); + if (fdout < 0) { + /* couldn't be opened */ + ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE); + return ORTE_ERR_FILE_OPEN_FAILURE; + } + /* define a sink to that file descriptor */ + ORTE_IOF_SINK_DEFINE(&sink, dst_name, fdout, ORTE_IOF_STDOUTALL, + orte_iof_base_write_handler, + &mca_iof_orted_component.sinks); + } SETUP: /* define a read event and activate it */ if (src_tag & ORTE_IOF_STDOUT) { ORTE_IOF_READ_EVENT(&proct->revstdout, dst_name, fd, ORTE_IOF_STDOUT, - orte_iof_orted_read_handler, true); + orte_iof_orted_read_handler, false); } else if (src_tag & ORTE_IOF_STDERR) { ORTE_IOF_READ_EVENT(&proct->revstderr, dst_name, fd, ORTE_IOF_STDERR, - orte_iof_orted_read_handler, true); + orte_iof_orted_read_handler, false); } else if (src_tag & ORTE_IOF_STDDIAG) { ORTE_IOF_READ_EVENT(&proct->revstddiag, dst_name, fd, ORTE_IOF_STDDIAG, - 
orte_iof_orted_read_handler, true); + orte_iof_orted_read_handler, false); + } + /* if -all- of the readevents for this proc have been defined, then + * activate them. Otherwise, we can think that the proc is complete + * because one of the readevents fires -prior- to all of them having + * been defined! + */ + if (NULL != proct->revstdout && NULL != proct->revstderr && NULL != proct->revstddiag) { + proct->revstdout->active = true; + opal_event_add(&(proct->revstdout->ev), 0); + proct->revstderr->active = true; + opal_event_add(&(proct->revstderr->ev), 0); + proct->revstddiag->active = true; + opal_event_add(&(proct->revstddiag->ev), 0); } - return ORTE_SUCCESS; } diff --git a/orte/mca/iof/orted/iof_orted_read.c b/orte/mca/iof/orted/iof_orted_read.c index 5a09170e55..ae7d4c595e 100644 --- a/orte/mca/iof/orted/iof_orted_read.c +++ b/orte/mca/iof/orted/iof_orted_read.c @@ -103,6 +103,33 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata) goto CLEAN_RETURN; } + /* see if the user wanted the output directed to files */ + if (NULL != orte_output_filename) { + /* find the sink for this rank */ + for (item = opal_list_get_first(&mca_iof_orted_component.sinks); + item != opal_list_get_end(&mca_iof_orted_component.sinks); + item = opal_list_get_next(item)) { + orte_iof_sink_t *sink = (orte_iof_sink_t*)item; + /* if the target is set, then this sink is for another purpose - ignore it */ + if (ORTE_JOBID_INVALID != sink->daemon.jobid) { + continue; + } + /* if this sink isn't for output, ignore it */ + if (ORTE_IOF_STDIN & sink->tag) { + continue; + } + /* is this the desired proc? 
*/ + if (sink->name.jobid == rev->name.jobid && + sink->name.vpid == rev->name.vpid) { + /* output to the corresponding file */ + orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, sink->wev); + /* done */ + break; + } + } + goto RESTART; + } + /* prep the buffer */ buf = OBJ_NEW(opal_buffer_t); @@ -134,6 +161,7 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata) orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf, ORTE_RML_TAG_IOF_HNP, 0, send_cb, NULL); +RESTART: /* re-add the event */ opal_event_add(&rev->ev, 0); diff --git a/orte/mca/iof/tool/iof_tool_receive.c b/orte/mca/iof/tool/iof_tool_receive.c index 7c583f40f6..607104f4c1 100644 --- a/orte/mca/iof/tool/iof_tool_receive.c +++ b/orte/mca/iof/tool/iof_tool_receive.c @@ -96,9 +96,9 @@ static void process_msg(int fd, short event, void *cbdata) if (0 < numbytes) { /* write the output locally */ if (ORTE_IOF_STDOUT & stream) { - orte_iof_base_write_output(&origin, stream, data, numbytes, &orte_iof_base.iof_write_stdout); + orte_iof_base_write_output(&origin, stream, data, numbytes, orte_iof_base.iof_write_stdout->wev); } else { - orte_iof_base_write_output(&origin, stream, data, numbytes, &orte_iof_base.iof_write_stderr); + orte_iof_base_write_output(&origin, stream, data, numbytes, orte_iof_base.iof_write_stderr->wev); } } diff --git a/orte/mca/odls/base/help-orte-odls-base.txt b/orte/mca/odls/base/help-orte-odls-base.txt index 582e5ab859..4b6b4369b4 100644 --- a/orte/mca/odls/base/help-orte-odls-base.txt +++ b/orte/mca/odls/base/help-orte-odls-base.txt @@ -31,3 +31,21 @@ Fileset: %s Will continue attempting to launch the process. +# +[orte-odls-base:xterm-neg-rank] +The xterm option was given a negative rank to display: + +Rank: %d + +Note that a value of -1 represents "all", but all other values +must range from 0 to #procs-1. 
+# +[orte-odls-base:xterm-rank-out-of-bounds] +The xterm option was asked to display a rank that is larger +than the number of procs in the job: + +Rank: %d +#procs: %d + +Note that ranks start with 0, not 1, and must be specified +accordingly. diff --git a/orte/mca/odls/base/odls_base_close.c b/orte/mca/odls/base/odls_base_close.c index ba8ea65b10..17e07ed8f7 100644 --- a/orte/mca/odls/base/odls_base_close.c +++ b/orte/mca/odls/base/odls_base_close.c @@ -23,6 +23,7 @@ #include "opal/mca/mca.h" #include "opal/mca/base/base.h" +#include "opal/class/opal_list.h" #include "orte/mca/odls/odls.h" #include "orte/mca/odls/base/base.h" @@ -31,9 +32,15 @@ int orte_odls_base_close(void) { + opal_list_item_t *item; + /* cleanup ODLS globals */ OBJ_DESTRUCT(&orte_odls_globals.mutex); OBJ_DESTRUCT(&orte_odls_globals.cond); + while (NULL != (item = opal_list_remove_first(&orte_odls_globals.xterm_ranks))) { + OBJ_RELEASE(item); + } + OBJ_DESTRUCT(&orte_odls_globals.xterm_ranks); if (NULL != orte_odls_globals.dmap && NULL != orte_odls_globals.dmap->bytes) { free(orte_odls_globals.dmap->bytes); free(orte_odls_globals.dmap); diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index 31a48b5326..bcc5feed63 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -904,6 +904,8 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, char dir[MAXPATHLEN]; char **argvptr; char *full_search; + char **argvsav=NULL; + int inm; /* protect operations involving the global list of children */ OPAL_THREAD_LOCK(&orte_odls_globals.mutex); @@ -1141,7 +1143,57 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, continue; } - /* setup the rest of the environment with the proc-specific items - these + /* did the user request we display output in xterms? 
*/ + if (NULL != orte_xterm) { + opal_list_item_t *nmitem; + orte_namelist_t *nm; + /* see if this rank is one of those requested */ + for (nmitem = opal_list_get_first(&orte_odls_globals.xterm_ranks); + nmitem != opal_list_get_end(&orte_odls_globals.xterm_ranks); + nmitem = opal_list_get_next(nmitem)) { + nm = (orte_namelist_t*)nmitem; + /* check for bozo case */ + if (jobdat->num_procs <= nm->name.vpid) { + /* can't be done! */ + orte_show_help("help-odls-base.txt", + "orte-odls-base:xterm-rank-out-of-bounds", + true, nm->name.vpid, jobdat->num_procs); + rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS; + goto CLEANUP; + } + if (ORTE_VPID_WILDCARD == nm->name.vpid || + child->name->vpid == nm->name.vpid) { + /* we want this one - modify the app's command to include + * the orte xterm cmd. Need to be careful, though, that we + * don't modify the app for ALL ranks that use it! So we + * will create a copy of the argv so we can restore it later + */ + argvsav = opal_argv_copy(app->argv); + /* free the argv */ + opal_argv_free(app->argv); + app->argv = NULL; + /* now create a new one that starts with the xtermcmd */ + for (inm=0; inm < opal_argv_count(orte_odls_globals.xtermcmd); inm++) { + opal_argv_append_nosize(&app->argv, orte_odls_globals.xtermcmd[inm]); + } + /* insert the rank into the correct place as a window title */ + free(app->argv[2]); + asprintf(&app->argv[2], "Rank %s", ORTE_VPID_PRINT(child->name->vpid)); + /* add back the original argv */ + for (inm=0; inm < opal_argv_count(argvsav); inm++) { + opal_argv_append_nosize(&app->argv, argvsav[inm]); + } + /* the app exe name itself is in the argvsav array, so + * we can recover it from there later + */ + free(app->app); + app->app = strdup(orte_odls_globals.xtermcmd[0]); + break; + } + } + } + + /* setup the rest of the environment with the proc-specific items - these * will be overwritten for each child */ if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&job_str, child->name->jobid))) { @@ -1168,6 +1220,7 
@@ int orte_odls_base_default_launch_local(orte_jobid_t job, } opal_setenv(param, vpid_str, true, &app->env); free(param); + /* although the vpid IS the process' rank within the job, users * would appreciate being given a public environmental variable * that also represents this value - something MPI specific - so @@ -1179,7 +1232,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, opal_setenv("OMPI_COMM_WORLD_RANK", vpid_str, true, &app->env); free(vpid_str); /* done with this now */ - /* users would appreciate being given a public environmental variable + /* users would appreciate being given a public environmental variable * that also represents the local rank value - something MPI specific - so * do that here. * @@ -1195,7 +1248,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", value, true, &app->env); free(value); - param = mca_base_param_environ_variable("opal", NULL, "paffinity_base_slot_list"); + param = mca_base_param_environ_variable("opal", NULL, "paffinity_base_slot_list"); if ( NULL != child->slot_list ) { asprintf(&value, "%s", child->slot_list); opal_setenv(param, value, true, &app->env); @@ -1205,7 +1258,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, } free(param); - /* if we are timing things, record when we are going to launch this proc */ + /* if we are timing things, record when we are going to launch this proc */ if (orte_timing) { gettimeofday(&child->starttime, NULL); } @@ -1266,6 +1319,17 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, } /* move to next processor */ proc_rank++; + /* reset the exe name, if necessary */ + if (NULL != argvsav) { + /* release the current argv array */ + opal_argv_free(app->argv); + /* restore the original one */ + app->argv = argvsav; + argvsav = NULL; + /* the app exe name itself is now in the argv[0] posn */ + free(app->app); + app->app = strdup(app->argv[0]); + } } /* complete launching all children for this app */ 
/* reset our working directory back to our default location - if we * don't do this, then we will be looking for relative paths starting diff --git a/orte/mca/odls/base/odls_base_open.c b/orte/mca/odls/base/odls_base_open.c index 0a02a1a61d..ac0978fbb7 100644 --- a/orte/mca/odls/base/odls_base_open.c +++ b/orte/mca/odls/base/odls_base_open.c @@ -25,6 +25,7 @@ #include "opal/mca/base/base.h" #include "opal/mca/base/mca_base_param.h" #include "opal/util/trace.h" +#include "opal/util/path.h" #include "opal/util/argv.h" #include "opal/class/opal_value_array.h" #include "opal/class/opal_pointer_array.h" @@ -35,6 +36,7 @@ #include "orte/util/name_fns.h" #include "orte/runtime/orte_globals.h" #include "orte/util/show_help.h" +#include "orte/util/parse_options.h" #include "orte/mca/odls/base/odls_private.h" @@ -155,6 +157,10 @@ orte_odls_globals_t orte_odls_globals; */ int orte_odls_base_open(void) { + char **ranks=NULL, *tmp; + int i, rank; + orte_namelist_t *nm; + /* Debugging / verbose output. Always have stream open, with verbose set by the mca open system... 
*/ orte_odls_globals.output = opal_output_open(NULL); @@ -166,10 +172,51 @@ int orte_odls_base_open(void) /* initialize ODLS globals */ OBJ_CONSTRUCT(&orte_odls_globals.mutex, opal_mutex_t); OBJ_CONSTRUCT(&orte_odls_globals.cond, opal_condition_t); + OBJ_CONSTRUCT(&orte_odls_globals.xterm_ranks, opal_list_t); + orte_odls_globals.xtermcmd = NULL; orte_odls_globals.dmap = NULL; orte_odls_globals.debugger = NULL; orte_odls_globals.debugger_launched = false; + /* check if the user requested that we display output in xterms */ + if (NULL != orte_xterm) { + /* construct a list of ranks to be displayed */ + orte_util_parse_range_options(orte_xterm, &ranks); + for (i=0; i < opal_argv_count(ranks); i++) { + nm = OBJ_NEW(orte_namelist_t); + rank = strtol(ranks[i], NULL, 10); + if (-1 == rank) { + /* wildcard */ + nm->name.vpid = ORTE_VPID_WILDCARD; + } else if (rank < 0) { + /* error out on bozo case */ + orte_show_help("help-odls-base.txt", + "orte-odls-base:xterm-neg-rank", + true, rank); + return ORTE_ERROR; + } else { + /* we can't check here if the rank is out of + * range as we don't yet know how many ranks + * will be in the job - we'll check later + */ + nm->name.vpid = rank; + } + opal_list_append(&orte_odls_globals.xterm_ranks, &nm->item); + } + opal_argv_free(ranks); + /* construct the xtermcmd */ + orte_odls_globals.xtermcmd = NULL; + tmp = opal_find_absolute_path("xterm"); + if (NULL == tmp) { + return ORTE_ERROR; + } + opal_argv_append_nosize(&orte_odls_globals.xtermcmd, tmp); + free(tmp); + opal_argv_append_nosize(&orte_odls_globals.xtermcmd, "-T"); + opal_argv_append_nosize(&orte_odls_globals.xtermcmd, "save"); + opal_argv_append_nosize(&orte_odls_globals.xtermcmd, "-e"); + } + /* Open up all available components */ if (ORTE_SUCCESS != diff --git a/orte/mca/odls/base/odls_private.h b/orte/mca/odls/base/odls_private.h index fdb76afd93..01e54912dd 100644 --- a/orte/mca/odls/base/odls_private.h +++ b/orte/mca/odls/base/odls_private.h @@ -63,6 +63,10 @@ typedef 
struct { orte_odls_job_t *debugger; /* debugger launched */ bool debugger_launched; + /* list of ranks to be displayed on separate xterms */ + opal_list_t xterm_ranks; + /* the xterm cmd to be used */ + char **xtermcmd; } orte_odls_globals_t; ORTE_DECLSPEC extern orte_odls_globals_t orte_odls_globals; diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index 9b738ba4d1..d75274d35e 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -55,7 +55,6 @@ bool orte_do_not_launch = false; bool orted_spin_flag = false; bool orte_static_ports = false; bool orte_keep_fqdn_hostnames = false; -bool orte_tag_output; bool orte_show_resolved_nodenames; int orted_debug_failure; int orted_debug_failure_delay; @@ -110,6 +109,13 @@ opal_list_t orte_local_children; /* list of job data for local children on a daemon */ opal_list_t orte_local_jobdata; +/* IOF controls */ +bool orte_tag_output; +bool orte_timestamp_output; +char *orte_output_filename; +/* generate new xterm windows to display output from specified ranks */ +char *orte_xterm; + /* whether or not to forward SIGTSTP and SIGCONT signals */ bool orte_forward_job_control; diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index 4c3f55e2a2..d86c66d63f 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -427,7 +427,6 @@ ORTE_DECLSPEC extern bool orted_spin_flag; ORTE_DECLSPEC extern bool orte_static_ports; ORTE_DECLSPEC extern int32_t orte_contiguous_nodes; ORTE_DECLSPEC extern bool orte_keep_fqdn_hostnames; -ORTE_DECLSPEC extern bool orte_tag_output; ORTE_DECLSPEC extern bool orte_show_resolved_nodenames; ORTE_DECLSPEC extern int orted_debug_failure; ORTE_DECLSPEC extern int orted_debug_failure_delay; @@ -485,6 +484,12 @@ ORTE_DECLSPEC extern opal_list_t orte_local_jobdata; /* whether or not to forward SIGTSTP and SIGCONT signals */ ORTE_DECLSPEC extern bool orte_forward_job_control; +/* IOF controls */ +ORTE_DECLSPEC extern bool 
orte_tag_output; +ORTE_DECLSPEC extern bool orte_timestamp_output; +ORTE_DECLSPEC extern char *orte_output_filename; +/* generate new xterm windows to display output from specified ranks */ +ORTE_DECLSPEC extern char *orte_xterm; #endif /* ORTE_DISABLE_FULL_SUPPORT */ diff --git a/orte/runtime/orte_mca_params.c b/orte/runtime/orte_mca_params.c index 58b579c05e..1edb9473d7 100644 --- a/orte/runtime/orte_mca_params.c +++ b/orte/runtime/orte_mca_params.c @@ -211,6 +211,7 @@ int orte_register_params(void) "Number of nodes after which contiguous nodename encoding will automatically be used [default: INT_MAX]", false, false, INT32_MAX, &orte_contiguous_nodes); + /* whether to tag output */ mca_base_param_reg_int_name("orte", "tag_output", "Tag all output with [job,rank] (default: false)", false, false, (int) false, &value); @@ -224,7 +225,18 @@ int orte_register_params(void) if (orte_xml_output) { orte_tag_output = true; } - + + /* whether to timestamp output */ + mca_base_param_reg_int_name("orte", "timestamp_output", + "Timestamp all application process output (default: false)", + false, false, (int) false, &value); + orte_timestamp_output = OPAL_INT_TO_BOOL(value); + + /* redirect output into files */ + mca_base_param_reg_string_name("orte", "output_filename", + "Redirect output from application processes into filename.rank [default: NULL]", + false, false, NULL, &orte_output_filename); + mca_base_param_reg_int_name("orte", "show_resolved_nodenames", "Display any node names that are resolved to a different name (default: false)", false, false, (int) false, &value); @@ -246,6 +258,11 @@ int orte_register_params(void) false, false, (int)false, &value); orte_allocation_required = OPAL_INT_TO_BOOL(value); + /* generate new terminal windows to display output from specified ranks */ + mca_base_param_reg_string_name("orte", "xterm", + "Create a new xterm window and display output from the specified ranks there [default: none]", + false, false, NULL, &orte_xterm); + /* 
whether or not to forward SIGTSTP and SIGCONT signals */ mca_base_param_reg_int_name("orte", "forward_job_control", "Forward SIGTSTP (after converting to SIGSTOP) and SIGCONT signals to the application procs [default: no]", diff --git a/orte/tools/orterun/orterun.1in b/orte/tools/orterun/orterun.1in index e951a123b1..1ea603d57f 100644 --- a/orte/tools/orterun/orterun.1in +++ b/orte/tools/orterun/orterun.1in @@ -265,6 +265,14 @@ is 10 seconds. . . .TP +.B -output-filename\fR,\fP --output-filename \fR<filename>\fP +Redirect the stdout, stderr, and stddiag of all ranks to a rank-unique version of +the specified filename. Any directories in the filename will automatically be created. +Each output file will be named filename.rank, where the rank will be left-filled with +zeros for correct ordering in listings. +. +. +.TP .B -path\fR,\fP --path \fR<path>\fP <path> that will be used when attempting to locate the requested executables. This is used prior to using the local PATH setting. @@ -341,11 +349,16 @@ indicating that no ranks are to receive stdin. . .TP .B -tag-output\fR,\fP --tag-output -Tag each line output to stdout, stderr, and stddiag with \fB[jobid, rank]\fP indicating the process jobid +Tag each line of output to stdout, stderr, and stddiag with \fB[jobid, rank]\fP indicating the process jobid and rank that generated the output, and the channel which generated it. . . .TP +.B -timestamp-output\fR,\fP --timestamp-output +Timestamp each line of output to stdout, stderr, and stddiag. +. +. +.TP .B --tmpdir \fR<dir>\fP Set the root for the session directory tree for mpirun only. . @@ -377,7 +390,10 @@ See the "Current Working Directory" section for notes on relative paths. .B Note: If the \fI-wdir\fP option appears both on the command line and in an application context, the context will take precedence over the command -line. +line. Relative paths are converted to absolute paths on the node where +mpirun is executed.
Thus, if the path to the desired wdir is different +on the backend nodes, then it must be specified as an absolute path that +is correct for the backend node. . . .TP @@ -396,6 +412,20 @@ then use \fI-x\fP to export (not define) them. Provide all output to stdout, stderr, and stddiag in an xml format. . . +.TP +.B -xterm\fR,\fP --xterm \fR<ranks>\fP +Display the specified ranks in separate xterm windows. The ranks are specified +as a comma-separated list of ranges, with a -1 indicating all. A separate +window will be created for each specified rank. +.B Note: +In some environments, xterm may require that the executable be in the user's +path, or be specified in absolute or relative terms. Thus, it may be necessary +to specify a local executable as "./foo" instead of just "foo". If xterm fails to +find the executable, mpirun will hang, but still respond correctly to a ctrl-c. +If this happens, please check that the executable is being specified correctly +and try again. +. +. .P The following options are useful for developers; they are not generally useful to most ORTE and/or MPI users: diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index ea17c65d4a..6111962d73 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -63,6 +63,7 @@ #include "opal/version.h" #include "opal/runtime/opal.h" +#include "opal/util/os_dirpath.h" #include "opal/util/os_path.h" #include "opal/util/path.h" #include "opal/class/opal_pointer_array.h" @@ -73,6 +74,7 @@ #include "orte/util/session_dir.h" #include "orte/util/name_fns.h" #include "orte/util/hnp_contact.h" +#include "orte/util/parse_options.h" #include "orte/mca/odls/odls.h" #include "orte/mca/plm/plm.h" @@ -162,7 +164,16 @@ static opal_cmd_line_init_t cmd_line_init[] = { { "orte", "tag", "output", '\0', "tag-output", "tag-output", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, "Tag all output with [job,rank]" }, - + { "orte", "timestamp", "output", '\0', "timestamp-output", "timestamp-output", 0, +
NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Timestamp all application process output" }, + { "orte", "output", "filename", '\0', "output-filename", "output-filename", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Redirect output from application processes into filename.rank" }, + { "orte", "xterm", NULL, '\0', "xterm", "xterm", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Create a new xterm window and display output from the specified ranks there" }, + /* select stdin option */ { NULL, NULL, NULL, '\0', "stdin", "stdin", 1, &orterun_globals.stdin_target, OPAL_CMD_LINE_TYPE_STRING, diff --git a/orte/util/proc_info.c b/orte/util/proc_info.c index a5af8b5511..9607fc8d4e 100644 --- a/orte/util/proc_info.c +++ b/orte/util/proc_info.c @@ -64,7 +64,6 @@ ORTE_DECLSPEC orte_proc_info_t orte_process_info = { /* .sock_stderr = */ NULL }; -#define ORTE_MAX_HOSTNAME_SIZE 512 static bool init=false; int orte_proc_info(void) diff --git a/orte/util/proc_info.h b/orte/util/proc_info.h index 5d1c240591..90a411d3c9 100644 --- a/orte/util/proc_info.h +++ b/orte/util/proc_info.h @@ -37,6 +37,8 @@ BEGIN_C_DECLS +#define ORTE_MAX_HOSTNAME_SIZE 512 + /** * Process information structure *