Fix a race condition in the IOF and add some new user-requested features:
1. fix a race condition whereby a proc's output could trigger an event prior to the other outputs being set up, thus causing the IOF to declare the proc "terminated" too early. This was really rare, but could happen. 2. add a new "timestamp-output" option that timestamps each line of output 3. add a new "output-filename" option that redirects each proc's output to a separate rank-named file. 4. add a new "xterm" option that redirects the output of the specified ranks to a separate xterm window. This commit was SVN r20392.
Этот коммит содержится в:
родитель
0704b98668
Коммит
2966206f58
@ -71,15 +71,6 @@ typedef struct {
|
||||
} orte_iof_write_event_t;
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_iof_write_event_t);
|
||||
|
||||
struct orte_iof_base_t {
|
||||
int iof_output;
|
||||
opal_list_t iof_components_opened;
|
||||
opal_mutex_t iof_write_output_lock;
|
||||
orte_iof_write_event_t iof_write_stdout;
|
||||
orte_iof_write_event_t iof_write_stderr;
|
||||
};
|
||||
typedef struct orte_iof_base_t orte_iof_base_t;
|
||||
|
||||
typedef struct {
|
||||
opal_list_item_t super;
|
||||
orte_process_name_t name;
|
||||
@ -122,14 +113,25 @@ typedef struct {
|
||||
} orte_iof_write_output_t;
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_iof_write_output_t);
|
||||
|
||||
/* the iof globals struct
 *
 * Global state of the IOF base framework. The stdout/stderr members
 * are pointers to sink objects; the base open code creates them with
 * ORTE_IOF_SINK_DEFINE and the base close code releases them with
 * OBJ_RELEASE, in both cases only when the process is not a daemon.
 */
|
||||
struct orte_iof_base_t {
|
||||
int iof_output; /* output stream id passed to OPAL_OUTPUT_VERBOSE by the IOF code */
|
||||
opal_list_t iof_components_opened; /* list constructed in orte_iof_base_open */
|
||||
opal_mutex_t iof_write_output_lock; /* taken around queued-output handling in write_output and close */
|
||||
orte_iof_sink_t *iof_write_stdout; /* sink defined on fd 1 with tag ORTE_IOF_STDOUT */
|
||||
orte_iof_sink_t *iof_write_stderr; /* sink defined on fd 2 with tag ORTE_IOF_STDERR */
|
||||
};
|
||||
typedef struct orte_iof_base_t orte_iof_base_t;
|
||||
|
||||
|
||||
#if OMPI_ENABLE_DEBUG
|
||||
|
||||
#define ORTE_IOF_SINK_DEFINE(snk, nm, fid, tg, wrthndlr, eplist) \
|
||||
do { \
|
||||
orte_iof_sink_t *ep; \
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, \
|
||||
"defining endpoint: %s %d", \
|
||||
__FILE__, __LINE__)); \
|
||||
"defining endpt: file %s line %d fd %d",\
|
||||
__FILE__, __LINE__, (fid))); \
|
||||
ep = OBJ_NEW(orte_iof_sink_t); \
|
||||
ep->name.jobid = (nm)->jobid; \
|
||||
ep->name.vpid = (nm)->vpid; \
|
||||
@ -138,9 +140,11 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_iof_write_output_t);
|
||||
ep->wev->fd = (fid); \
|
||||
opal_event_set(&(ep->wev->ev), ep->wev->fd, \
|
||||
OPAL_EV_WRITE, \
|
||||
wrthndlr, ep) ; \
|
||||
wrthndlr, ep); \
|
||||
} \
|
||||
if (NULL != (eplist)) { \
|
||||
opal_list_append((eplist), &ep->super); \
|
||||
} \
|
||||
opal_list_append((eplist), &ep->super); \
|
||||
*(snk) = ep; \
|
||||
ep->file = strdup(__FILE__); \
|
||||
ep->line = __LINE__; \
|
||||
|
@ -36,6 +36,7 @@ int orte_iof_base_close(void)
|
||||
bool dump;
|
||||
opal_list_item_t *item;
|
||||
orte_iof_write_output_t *output;
|
||||
orte_iof_write_event_t *wev;
|
||||
int num_written;
|
||||
|
||||
/* shutdown any remaining opened components */
|
||||
@ -48,13 +49,14 @@ int orte_iof_base_close(void)
|
||||
OPAL_THREAD_LOCK(&orte_iof_base.iof_write_output_lock);
|
||||
if (!orte_process_info.daemon) {
|
||||
/* check if anything is still trying to be written out */
|
||||
if (!opal_list_is_empty(&orte_iof_base.iof_write_stdout.outputs)) {
|
||||
wev = orte_iof_base.iof_write_stdout->wev;
|
||||
if (!opal_list_is_empty(&wev->outputs)) {
|
||||
dump = false;
|
||||
/* make one last attempt to write this out */
|
||||
while (NULL != (item = opal_list_remove_first(&orte_iof_base.iof_write_stdout.outputs))) {
|
||||
while (NULL != (item = opal_list_remove_first(&wev->outputs))) {
|
||||
output = (orte_iof_write_output_t*)item;
|
||||
if (!dump) {
|
||||
num_written = write(orte_iof_base.iof_write_stdout.fd, output->data, output->numbytes);
|
||||
num_written = write(wev->fd, output->data, output->numbytes);
|
||||
if (num_written < output->numbytes) {
|
||||
/* don't retry - just cleanout the list and dump it */
|
||||
dump = true;
|
||||
@ -63,14 +65,15 @@ int orte_iof_base_close(void)
|
||||
OBJ_RELEASE(output);
|
||||
}
|
||||
}
|
||||
OBJ_DESTRUCT(&orte_iof_base.iof_write_stdout);
|
||||
if (!opal_list_is_empty(&orte_iof_base.iof_write_stderr.outputs)) {
|
||||
OBJ_RELEASE(orte_iof_base.iof_write_stdout);
|
||||
wev = orte_iof_base.iof_write_stderr->wev;
|
||||
if (!opal_list_is_empty(&wev->outputs)) {
|
||||
dump = false;
|
||||
/* make one last attempt to write this out */
|
||||
while (NULL != (item = opal_list_remove_first(&orte_iof_base.iof_write_stderr.outputs))) {
|
||||
while (NULL != (item = opal_list_remove_first(&wev->outputs))) {
|
||||
output = (orte_iof_write_output_t*)item;
|
||||
if (!dump) {
|
||||
num_written = write(orte_iof_base.iof_write_stderr.fd, output->data, output->numbytes);
|
||||
num_written = write(wev->fd, output->data, output->numbytes);
|
||||
if (num_written < output->numbytes) {
|
||||
/* don't retry - just cleanout the list and dump it */
|
||||
dump = true;
|
||||
@ -79,7 +82,7 @@ int orte_iof_base_close(void)
|
||||
OBJ_RELEASE(output);
|
||||
}
|
||||
}
|
||||
OBJ_DESTRUCT(&orte_iof_base.iof_write_stderr);
|
||||
OBJ_RELEASE(orte_iof_base.iof_write_stderr);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&orte_iof_base.iof_write_output_lock);
|
||||
|
||||
|
@ -25,6 +25,8 @@
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/util/os_dirpath.h"
|
||||
#include "opal/util/basename.h"
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
@ -84,6 +86,8 @@ OBJ_CLASS_INSTANCE(orte_iof_proc_t,
|
||||
|
||||
/* Constructor for orte_iof_sink_t objects.
 *
 * Marks the sink's daemon name as invalid and allocates the write
 * event the sink owns. The read and message handlers compare
 * daemon.jobid against ORTE_JOBID_INVALID to tell sinks that have a
 * target daemon set from sinks that do not.
 */
static void orte_iof_base_sink_construct(orte_iof_sink_t* ptr)
|
||||
{
|
||||
/* no target daemon until a caller sets one */
ptr->daemon.jobid = ORTE_JOBID_INVALID;
|
||||
ptr->daemon.vpid = ORTE_VPID_INVALID;
|
||||
/* every sink carries its own write event object */
ptr->wev = OBJ_NEW(orte_iof_write_event_t);
|
||||
}
|
||||
static void orte_iof_base_sink_destruct(orte_iof_sink_t* ptr)
|
||||
@ -163,31 +167,38 @@ orte_iof_base_t orte_iof_base;
|
||||
*/
|
||||
int orte_iof_base_open(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
/* Initialize globals */
|
||||
OBJ_CONSTRUCT(&orte_iof_base.iof_components_opened, opal_list_t);
|
||||
OBJ_CONSTRUCT(&orte_iof_base.iof_write_output_lock, opal_mutex_t);
|
||||
|
||||
/* did the user request we print output to files? */
|
||||
if (NULL != orte_output_filename) {
|
||||
/* we will setup the files themselves as needed in the iof
|
||||
* module. For now, let's see if the filename contains a
|
||||
* path, or just a name
|
||||
*/
|
||||
char *path;
|
||||
path = opal_dirname(orte_output_filename);
|
||||
if (0 != strcmp(path, orte_output_filename)) {
|
||||
/* there is a path in this name - ensure that the directory
|
||||
* exists, and create it if not
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = opal_os_dirpath_create(path, S_IRWXU))) {
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* daemons do not need to do this as they do not write out stdout/err */
|
||||
if (!orte_process_info.daemon) {
|
||||
/* setup the stdout event */
|
||||
OBJ_CONSTRUCT(&orte_iof_base.iof_write_stdout, orte_iof_write_event_t);
|
||||
orte_iof_base.iof_write_stdout.fd = 1;
|
||||
/* create the write event, but don't add it until we need it */
|
||||
opal_event_set(&orte_iof_base.iof_write_stdout.ev,
|
||||
orte_iof_base.iof_write_stdout.fd,
|
||||
OPAL_EV_WRITE,
|
||||
orte_iof_base_write_handler,
|
||||
&orte_iof_base.iof_write_stdout);
|
||||
|
||||
ORTE_IOF_SINK_DEFINE(&orte_iof_base.iof_write_stdout, ORTE_PROC_MY_NAME,
|
||||
1, ORTE_IOF_STDOUT, orte_iof_base_write_handler, NULL);
|
||||
/* setup the stderr event */
|
||||
OBJ_CONSTRUCT(&orte_iof_base.iof_write_stderr, orte_iof_write_event_t);
|
||||
orte_iof_base.iof_write_stderr.fd = 2;
|
||||
/* create the write event, but don't add it until we need it */
|
||||
opal_event_set(&orte_iof_base.iof_write_stderr.ev,
|
||||
orte_iof_base.iof_write_stderr.fd,
|
||||
OPAL_EV_WRITE,
|
||||
orte_iof_base_write_handler,
|
||||
&orte_iof_base.iof_write_stderr);
|
||||
ORTE_IOF_SINK_DEFINE(&orte_iof_base.iof_write_stderr, ORTE_PROC_MY_NAME,
|
||||
2, ORTE_IOF_STDERR, orte_iof_base_write_handler, NULL);
|
||||
/* do NOT set these file descriptors to non-blocking. If we do so,
|
||||
* we set the file descriptor to non-blocking for everyone that has
|
||||
* that file descriptor, which includes everyone else in our shell
|
||||
|
@ -30,6 +30,9 @@
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#ifdef HAVE_TIME_H
|
||||
#include <time.h>
|
||||
#endif
|
||||
#include <errno.h>
|
||||
|
||||
#include "orte/util/name_fns.h"
|
||||
@ -47,17 +50,26 @@ int orte_iof_base_write_output(orte_process_name_t *name, orte_iof_tag_t stream,
|
||||
int i, j, k, starttaglen, endtaglen, num_buffered;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
|
||||
"%s write:output setting up to write %d bytes to %s of %s",
|
||||
"%s write:output setting up to write %d bytes to %s for %s on fd %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
|
||||
(ORTE_IOF_STDIN & stream) ? "stdin" : ((ORTE_IOF_STDOUT & stream) ? "stdout" : ((ORTE_IOF_STDERR & stream) ? "stderr" : "stddiag")),
|
||||
ORTE_NAME_PRINT(name)));
|
||||
ORTE_NAME_PRINT(name), channel->fd));
|
||||
|
||||
/* setup output object */
|
||||
output = OBJ_NEW(orte_iof_write_output_t);
|
||||
|
||||
/* write output data to the corresponding tag */
|
||||
if (ORTE_IOF_STDIN & stream) {
|
||||
suffix = NULL;
|
||||
/* copy over the data to be written */
|
||||
if (0 < numbytes) {
|
||||
/* don't copy 0 bytes - we just need to pass
|
||||
* the zero bytes so the fd can be closed
|
||||
* after it writes everything out
|
||||
*/
|
||||
memcpy(output->data, data, numbytes);
|
||||
}
|
||||
output->numbytes = numbytes;
|
||||
goto process;
|
||||
} else if (ORTE_IOF_STDOUT & stream) {
|
||||
/* write the bytes to stdout */
|
||||
suffix = "stdout";
|
||||
@ -74,59 +86,92 @@ int orte_iof_base_write_output(orte_process_name_t *name, orte_iof_tag_t stream,
|
||||
"%s stream %0x", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), stream));
|
||||
return ORTE_ERR_VALUE_OUT_OF_BOUNDS;
|
||||
}
|
||||
|
||||
/* see if data is to be tagged */
|
||||
if (orte_tag_output && NULL != suffix) {
|
||||
/* if this is to be xml tagged, create a tag with the correct syntax */
|
||||
if (orte_xml_output) {
|
||||
snprintf(starttag, ORTE_IOF_BASE_TAG_MAX, "<%s rank=\"%s\">", suffix, ORTE_VPID_PRINT(name->vpid));
|
||||
snprintf(endtag, ORTE_IOF_BASE_TAG_MAX, "</%s>", suffix);
|
||||
} else {
|
||||
snprintf(starttag, ORTE_IOF_BASE_TAG_MAX, "[%s,%s]<%s>",
|
||||
ORTE_LOCAL_JOBID_PRINT(name->jobid),
|
||||
ORTE_VPID_PRINT(name->vpid), suffix);
|
||||
memset(endtag, '\0', ORTE_IOF_BASE_TAG_MAX);
|
||||
}
|
||||
starttaglen = strlen(starttag);
|
||||
endtaglen = strlen(endtag);
|
||||
/* start with the tag */
|
||||
for (j=0, k=0; j < starttaglen && k < ORTE_IOF_BASE_TAGGED_OUT_MAX; j++) {
|
||||
output->data[k++] = starttag[j];
|
||||
}
|
||||
/* cycle through the data looking for <cr>
|
||||
* and replace those with the tag
|
||||
*/
|
||||
for (i=0; i < numbytes && k < ORTE_IOF_BASE_TAGGED_OUT_MAX; i++) {
|
||||
if ('\n' == data[i]) {
|
||||
/* we need to break the line with the end tag */
|
||||
for (j=0; j < endtaglen && k < ORTE_IOF_BASE_TAGGED_OUT_MAX; j++) {
|
||||
output->data[k++] = endtag[j];
|
||||
}
|
||||
/* move the <cr> over */
|
||||
output->data[k++] = '\n';
|
||||
/* if this isn't the end of the line, add a new start tag */
|
||||
if (i < numbytes-1) {
|
||||
for (j=0; j < starttaglen && k < ORTE_IOF_BASE_TAGGED_OUT_MAX; j++) {
|
||||
output->data[k++] = starttag[j];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
output->data[k++] = data[i];
|
||||
}
|
||||
}
|
||||
output->numbytes = k;
|
||||
} else {
|
||||
/* copy over the data to be written */
|
||||
if (0 < numbytes) {
|
||||
/* don't copy 0 bytes - we just need to pass
|
||||
* the zero bytes so the fd can be closed
|
||||
* after it writes everything out
|
||||
*/
|
||||
memcpy(output->data, data, numbytes);
|
||||
}
|
||||
output->numbytes = numbytes;
|
||||
|
||||
/* if this is to be xml tagged, create a tag with the correct syntax - we do not allow
|
||||
* timestamping of xml output
|
||||
*/
|
||||
if (orte_xml_output) {
|
||||
snprintf(starttag, ORTE_IOF_BASE_TAG_MAX, "<%s rank=\"%s\">", suffix, ORTE_VPID_PRINT(name->vpid));
|
||||
snprintf(endtag, ORTE_IOF_BASE_TAG_MAX, "</%s>", suffix);
|
||||
goto construct;
|
||||
}
|
||||
|
||||
/* if we are to timestamp output, start the tag with that */
|
||||
if (orte_timestamp_output) {
|
||||
time_t mytime;
|
||||
char *cptr;
|
||||
/* get the timestamp */
|
||||
time(&mytime);
|
||||
cptr = ctime(&mytime);
|
||||
cptr[strlen(cptr)-1] = '\0'; /* remove trailing newline */
|
||||
|
||||
if (orte_tag_output) {
|
||||
/* if we want it tagged as well, use both */
|
||||
snprintf(starttag, ORTE_IOF_BASE_TAG_MAX, "%s[%s,%s]<%s>:",
|
||||
cptr, ORTE_LOCAL_JOBID_PRINT(name->jobid),
|
||||
ORTE_VPID_PRINT(name->vpid), suffix);
|
||||
} else {
|
||||
/* only use timestamp */
|
||||
snprintf(starttag, ORTE_IOF_BASE_TAG_MAX, "%s<%s>:", cptr, suffix);
|
||||
}
|
||||
/* no endtag for this option */
|
||||
memset(endtag, '\0', ORTE_IOF_BASE_TAG_MAX);
|
||||
goto construct;
|
||||
}
|
||||
|
||||
if (orte_tag_output) {
|
||||
snprintf(starttag, ORTE_IOF_BASE_TAG_MAX, "[%s,%s]<%s>:",
|
||||
ORTE_LOCAL_JOBID_PRINT(name->jobid),
|
||||
ORTE_VPID_PRINT(name->vpid), suffix);
|
||||
/* no endtag for this option */
|
||||
memset(endtag, '\0', ORTE_IOF_BASE_TAG_MAX);
|
||||
goto construct;
|
||||
}
|
||||
|
||||
/* if we get here, then the data is not to be tagged - just copy it
|
||||
* and move on to processing
|
||||
*/
|
||||
if (0 < numbytes) {
|
||||
/* don't copy 0 bytes - we just need to pass
|
||||
* the zero bytes so the fd can be closed
|
||||
* after it writes everything out
|
||||
*/
|
||||
memcpy(output->data, data, numbytes);
|
||||
}
|
||||
output->numbytes = numbytes;
|
||||
goto process;
|
||||
|
||||
construct:
|
||||
starttaglen = strlen(starttag);
|
||||
endtaglen = strlen(endtag);
|
||||
/* start with the tag */
|
||||
for (j=0, k=0; j < starttaglen && k < ORTE_IOF_BASE_TAGGED_OUT_MAX; j++) {
|
||||
output->data[k++] = starttag[j];
|
||||
}
|
||||
/* cycle through the data looking for <cr>
|
||||
* and replace those with the tag
|
||||
*/
|
||||
for (i=0; i < numbytes && k < ORTE_IOF_BASE_TAGGED_OUT_MAX; i++) {
|
||||
if ('\n' == data[i]) {
|
||||
/* we need to break the line with the end tag */
|
||||
for (j=0; j < endtaglen && k < ORTE_IOF_BASE_TAGGED_OUT_MAX; j++) {
|
||||
output->data[k++] = endtag[j];
|
||||
}
|
||||
/* move the <cr> over */
|
||||
output->data[k++] = '\n';
|
||||
/* if this isn't the end of the line, add a new start tag */
|
||||
if (i < numbytes-1) {
|
||||
for (j=0; j < starttaglen && k < ORTE_IOF_BASE_TAGGED_OUT_MAX; j++) {
|
||||
output->data[k++] = starttag[j];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
output->data[k++] = data[i];
|
||||
}
|
||||
}
|
||||
output->numbytes = k;
|
||||
|
||||
process:
|
||||
/* lock us up to protect global operations */
|
||||
OPAL_THREAD_LOCK(&orte_iof_base.iof_write_output_lock);
|
||||
|
||||
@ -154,7 +199,8 @@ int orte_iof_base_write_output(orte_process_name_t *name, orte_iof_tag_t stream,
|
||||
|
||||
void orte_iof_base_write_handler(int fd, short event, void *cbdata)
|
||||
{
|
||||
orte_iof_write_event_t *wev = (orte_iof_write_event_t*)cbdata;
|
||||
orte_iof_sink_t *sink = (orte_iof_sink_t*)cbdata;
|
||||
orte_iof_write_event_t *wev = sink->wev;
|
||||
opal_list_item_t *item;
|
||||
orte_iof_write_output_t *output;
|
||||
int num_written;
|
||||
|
@ -41,6 +41,8 @@
|
||||
#include "orte/mca/oob/base/base.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
|
||||
#include "orte/mca/iof/base/base.h"
|
||||
#include "iof_hnp.h"
|
||||
@ -99,6 +101,10 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag,
|
||||
orte_iof_proc_t *proct;
|
||||
opal_list_item_t *item;
|
||||
int flags;
|
||||
char *outfile;
|
||||
int fdout;
|
||||
orte_odls_job_t *jobdat;
|
||||
int np, numdigs;
|
||||
int rc;
|
||||
|
||||
/* don't do this if the dst vpid is invalid or the fd is negative! */
|
||||
@ -138,18 +144,64 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag,
|
||||
proct->name.jobid = dst_name->jobid;
|
||||
proct->name.vpid = dst_name->vpid;
|
||||
opal_list_append(&mca_iof_hnp_component.procs, &proct->super);
|
||||
/* see if we are to output to a file */
|
||||
if (NULL != orte_output_filename) {
|
||||
/* get the local jobdata for this proc */
|
||||
for (item = opal_list_get_first(&orte_local_jobdata);
|
||||
item != opal_list_get_end(&orte_local_jobdata);
|
||||
item = opal_list_get_next(item)) {
|
||||
jobdat = (orte_odls_job_t*)item;
|
||||
if (jobdat->jobid == proct->name.jobid) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
np = jobdat->num_procs / 10;
|
||||
/* determine the number of digits required for max vpid */
|
||||
numdigs = 1;
|
||||
while (np > 0) {
|
||||
numdigs++;
|
||||
np = np / 10;
|
||||
}
|
||||
/* construct the filename */
|
||||
asprintf(&outfile, "%s.%*0lu", orte_output_filename, numdigs, (unsigned long)proct->name.vpid);
|
||||
/* create the file */
|
||||
fdout = open(outfile, O_CREAT|O_RDWR|O_TRUNC, 0644);
|
||||
free(outfile);
|
||||
if (fdout < 0) {
|
||||
/* couldn't be opened */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
|
||||
return ORTE_ERR_FILE_OPEN_FAILURE;
|
||||
}
|
||||
/* define a sink to that file descriptor */
|
||||
ORTE_IOF_SINK_DEFINE(&sink, dst_name, fdout, ORTE_IOF_STDOUTALL,
|
||||
orte_iof_base_write_handler,
|
||||
&mca_iof_hnp_component.sinks);
|
||||
}
|
||||
|
||||
SETUP:
|
||||
/* define a read event and activate it */
|
||||
if (src_tag & ORTE_IOF_STDOUT) {
|
||||
ORTE_IOF_READ_EVENT(&proct->revstdout, dst_name, fd, ORTE_IOF_STDOUT,
|
||||
orte_iof_hnp_read_local_handler, true);
|
||||
orte_iof_hnp_read_local_handler, false);
|
||||
} else if (src_tag & ORTE_IOF_STDERR) {
|
||||
ORTE_IOF_READ_EVENT(&proct->revstderr, dst_name, fd, ORTE_IOF_STDERR,
|
||||
orte_iof_hnp_read_local_handler, true);
|
||||
orte_iof_hnp_read_local_handler, false);
|
||||
} else if (src_tag & ORTE_IOF_STDDIAG) {
|
||||
ORTE_IOF_READ_EVENT(&proct->revstddiag, dst_name, fd, ORTE_IOF_STDDIAG,
|
||||
orte_iof_hnp_read_local_handler, true);
|
||||
orte_iof_hnp_read_local_handler, false);
|
||||
}
|
||||
/* if -all- of the readevents for this proc have been defined, then
|
||||
* activate them. Otherwise, we can think that the proc is complete
|
||||
* because one of the readevents fires -prior- to all of them having
|
||||
* been defined!
|
||||
*/
|
||||
if (NULL != proct->revstdout && NULL != proct->revstderr && NULL != proct->revstddiag) {
|
||||
proct->revstdout->active = true;
|
||||
opal_event_add(&(proct->revstdout->ev), 0);
|
||||
proct->revstderr->active = true;
|
||||
opal_event_add(&(proct->revstderr->ev), 0);
|
||||
proct->revstddiag->active = true;
|
||||
opal_event_add(&(proct->revstddiag->ev), 0);
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -109,6 +109,7 @@ static int orte_iof_hnp_close(void)
|
||||
}
|
||||
OBJ_DESTRUCT(&mca_iof_hnp_component.procs);
|
||||
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_IOF_HNP);
|
||||
/* release and cleanup the lock */
|
||||
OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock);
|
||||
OBJ_DESTRUCT(&mca_iof_hnp_component.lock);
|
||||
}
|
||||
@ -156,7 +157,7 @@ static int orte_iof_hnp_query(mca_base_module_t **module, int *priority)
|
||||
OBJ_CONSTRUCT(&mca_iof_hnp_component.sinks, opal_list_t);
|
||||
OBJ_CONSTRUCT(&mca_iof_hnp_component.procs, opal_list_t);
|
||||
mca_iof_hnp_component.stdinev = NULL;
|
||||
|
||||
|
||||
/* we must be selected */
|
||||
*priority = 100;
|
||||
*module = (mca_base_module_t *) &orte_iof_hnp_module;
|
||||
|
@ -203,7 +203,11 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
|
||||
item != opal_list_get_end(&mca_iof_hnp_component.sinks);
|
||||
item = opal_list_get_next(item)) {
|
||||
orte_iof_sink_t *sink = (orte_iof_sink_t*)item;
|
||||
if (sink->tag & rev->tag &&
|
||||
/* if the target isn't set, then this sink is for another purpose - ignore it */
|
||||
if (ORTE_JOBID_INVALID == sink->daemon.jobid) {
|
||||
continue;
|
||||
}
|
||||
if ((sink->tag & rev->tag) &&
|
||||
sink->name.jobid == rev->name.jobid &&
|
||||
(ORTE_VPID_WILDCARD == sink->name.vpid || sink->name.vpid == rev->name.vpid)) {
|
||||
/* need to send the data to the remote endpoint - if
|
||||
@ -275,17 +279,45 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
if (ORTE_IOF_STDOUT & rev->tag) {
|
||||
orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, &orte_iof_base.iof_write_stdout);
|
||||
} else {
|
||||
orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, &orte_iof_base.iof_write_stderr);
|
||||
|
||||
}
|
||||
/* re-add the event */
|
||||
opal_event_add(&rev->ev, 0);
|
||||
OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock);
|
||||
return;
|
||||
}
|
||||
|
||||
/* see if the user wanted the output directed to files */
|
||||
if (NULL != orte_output_filename) {
|
||||
/* find the sink for this rank */
|
||||
for (item = opal_list_get_first(&mca_iof_hnp_component.sinks);
|
||||
item != opal_list_get_end(&mca_iof_hnp_component.sinks);
|
||||
item = opal_list_get_next(item)) {
|
||||
orte_iof_sink_t *sink = (orte_iof_sink_t*)item;
|
||||
/* if the target is set, then this sink is for another purpose - ignore it */
|
||||
if (ORTE_JOBID_INVALID != sink->daemon.jobid) {
|
||||
continue;
|
||||
}
|
||||
/* if this sink isn't for output, ignore it */
|
||||
if (ORTE_IOF_STDIN & sink->tag) {
|
||||
continue;
|
||||
}
|
||||
/* is this the desired proc? */
|
||||
if (sink->name.jobid == rev->name.jobid &&
|
||||
sink->name.vpid == rev->name.vpid) {
|
||||
/* output to the corresponding file */
|
||||
orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, sink->wev);
|
||||
/* done */
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* output this to our local output */
|
||||
if (ORTE_IOF_STDOUT & rev->tag) {
|
||||
orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, orte_iof_base.iof_write_stdout->wev);
|
||||
} else {
|
||||
orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, orte_iof_base.iof_write_stderr->wev);
|
||||
}
|
||||
}
|
||||
|
||||
/* re-add the event */
|
||||
opal_event_add(&rev->ev, 0);
|
||||
|
||||
OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock);
|
||||
return;
|
||||
|
@ -27,6 +27,13 @@
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
#ifdef HAVE_FCNTL_H
|
||||
#include <fcntl.h>
|
||||
#else
|
||||
#ifdef HAVE_SYS_FCNTL_H
|
||||
#include <sys/fcntl.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
|
||||
@ -129,9 +136,12 @@ static void process_msg(int fd, short event, void *cbdata)
|
||||
while (item != opal_list_get_end(&mca_iof_hnp_component.sinks)) {
|
||||
next = opal_list_get_next(item);
|
||||
sink = (orte_iof_sink_t*)item;
|
||||
|
||||
/* if the target isn't set, then this sink is for another purpose - ignore it */
|
||||
if (ORTE_JOBID_INVALID == sink->daemon.jobid) {
|
||||
continue;
|
||||
}
|
||||
/* if this sink is the designated one, then remove it from list */
|
||||
if (stream & sink->tag &&
|
||||
if ((stream & sink->tag) &&
|
||||
sink->name.jobid == origin.jobid &&
|
||||
(ORTE_VPID_WILDCARD == sink->name.vpid ||
|
||||
ORTE_VPID_WILDCARD == origin.vpid ||
|
||||
@ -161,19 +171,23 @@ static void process_msg(int fd, short event, void *cbdata)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
|
||||
ORTE_NAME_PRINT(&origin)));
|
||||
|
||||
/* write the output locally */
|
||||
/* output this to our local output */
|
||||
if (ORTE_IOF_STDOUT & stream) {
|
||||
orte_iof_base_write_output(&origin, stream, data, numbytes, &orte_iof_base.iof_write_stdout);
|
||||
orte_iof_base_write_output(&origin, stream, data, numbytes, orte_iof_base.iof_write_stdout->wev);
|
||||
} else {
|
||||
orte_iof_base_write_output(&origin, stream, data, numbytes, &orte_iof_base.iof_write_stderr);
|
||||
orte_iof_base_write_output(&origin, stream, data, numbytes, orte_iof_base.iof_write_stderr->wev);
|
||||
}
|
||||
|
||||
/* cycle through the endpoints to see if someone else wants a copy */
|
||||
for (item = opal_list_get_first(&mca_iof_hnp_component.sinks);
|
||||
item != opal_list_get_end(&mca_iof_hnp_component.sinks);
|
||||
item = opal_list_get_next(item)) {
|
||||
orte_iof_sink_t* sink = (orte_iof_sink_t*)item;
|
||||
if (stream & sink->tag &&
|
||||
sink = (orte_iof_sink_t*)item;
|
||||
/* if the target isn't set, then this sink is for another purpose - ignore it */
|
||||
if (ORTE_JOBID_INVALID == sink->daemon.jobid) {
|
||||
continue;
|
||||
}
|
||||
if ((stream & sink->tag) &&
|
||||
sink->name.jobid == origin.jobid &&
|
||||
(ORTE_VPID_WILDCARD == sink->name.vpid ||
|
||||
ORTE_VPID_WILDCARD == origin.vpid ||
|
||||
|
@ -37,6 +37,8 @@ typedef uint8_t orte_iof_tag_t;
|
||||
#define ORTE_IOF_STDOUT 0x02
|
||||
#define ORTE_IOF_STDERR 0x04
|
||||
#define ORTE_IOF_STDDIAG 0x08
|
||||
#define ORTE_IOF_STDOUTALL 0x0e
|
||||
|
||||
/* flow control flags */
|
||||
#define ORTE_IOF_XON 0x10
|
||||
#define ORTE_IOF_XOFF 0x20
|
||||
|
@ -42,6 +42,7 @@
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
|
||||
#include "orte/mca/iof/iof.h"
|
||||
#include "orte/mca/iof/base/base.h"
|
||||
@ -91,6 +92,11 @@ static int orted_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_ta
|
||||
int flags;
|
||||
opal_list_item_t *item;
|
||||
orte_iof_proc_t *proct;
|
||||
orte_iof_sink_t *sink;
|
||||
char *outfile;
|
||||
int fdout;
|
||||
orte_odls_job_t *jobdat;
|
||||
int np, numdigs;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
|
||||
"%s iof:orted pushing fd %d for process %s",
|
||||
@ -124,20 +130,65 @@ static int orted_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_ta
|
||||
proct->name.jobid = dst_name->jobid;
|
||||
proct->name.vpid = dst_name->vpid;
|
||||
opal_list_append(&mca_iof_orted_component.procs, &proct->super);
|
||||
/* see if we are to output to a file */
|
||||
if (NULL != orte_output_filename) {
|
||||
/* get the local jobdata for this proc */
|
||||
for (item = opal_list_get_first(&orte_local_jobdata);
|
||||
item != opal_list_get_end(&orte_local_jobdata);
|
||||
item = opal_list_get_next(item)) {
|
||||
jobdat = (orte_odls_job_t*)item;
|
||||
if (jobdat->jobid == proct->name.jobid) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
np = jobdat->num_procs / 10;
|
||||
/* determine the number of digits required for max vpid */
|
||||
numdigs = 1;
|
||||
while (np > 0) {
|
||||
numdigs++;
|
||||
np = np / 10;
|
||||
}
|
||||
/* construct the filename */
|
||||
asprintf(&outfile, "%s.%*0lu", orte_output_filename, numdigs, (unsigned long)proct->name.vpid);
|
||||
/* create the file */
|
||||
fdout = open(outfile, O_CREAT|O_RDWR|O_TRUNC, 0644);
|
||||
free(outfile);
|
||||
if (fdout < 0) {
|
||||
/* couldn't be opened */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
|
||||
return ORTE_ERR_FILE_OPEN_FAILURE;
|
||||
}
|
||||
/* define a sink to that file descriptor */
|
||||
ORTE_IOF_SINK_DEFINE(&sink, dst_name, fdout, ORTE_IOF_STDOUTALL,
|
||||
orte_iof_base_write_handler,
|
||||
&mca_iof_orted_component.sinks);
|
||||
}
|
||||
|
||||
SETUP:
|
||||
/* define a read event and activate it */
|
||||
if (src_tag & ORTE_IOF_STDOUT) {
|
||||
ORTE_IOF_READ_EVENT(&proct->revstdout, dst_name, fd, ORTE_IOF_STDOUT,
|
||||
orte_iof_orted_read_handler, true);
|
||||
orte_iof_orted_read_handler, false);
|
||||
} else if (src_tag & ORTE_IOF_STDERR) {
|
||||
ORTE_IOF_READ_EVENT(&proct->revstderr, dst_name, fd, ORTE_IOF_STDERR,
|
||||
orte_iof_orted_read_handler, true);
|
||||
orte_iof_orted_read_handler, false);
|
||||
} else if (src_tag & ORTE_IOF_STDDIAG) {
|
||||
ORTE_IOF_READ_EVENT(&proct->revstddiag, dst_name, fd, ORTE_IOF_STDDIAG,
|
||||
orte_iof_orted_read_handler, true);
|
||||
orte_iof_orted_read_handler, false);
|
||||
}
|
||||
/* if -all- of the readevents for this proc have been defined, then
|
||||
* activate them. Otherwise, we can think that the proc is complete
|
||||
* because one of the readevents fires -prior- to all of them having
|
||||
* been defined!
|
||||
*/
|
||||
if (NULL != proct->revstdout && NULL != proct->revstderr && NULL != proct->revstddiag) {
|
||||
proct->revstdout->active = true;
|
||||
opal_event_add(&(proct->revstdout->ev), 0);
|
||||
proct->revstderr->active = true;
|
||||
opal_event_add(&(proct->revstderr->ev), 0);
|
||||
proct->revstddiag->active = true;
|
||||
opal_event_add(&(proct->revstddiag->ev), 0);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -103,6 +103,33 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata)
|
||||
goto CLEAN_RETURN;
|
||||
}
|
||||
|
||||
/* see if the user wanted the output directed to files */
|
||||
if (NULL != orte_output_filename) {
|
||||
/* find the sink for this rank */
|
||||
for (item = opal_list_get_first(&mca_iof_orted_component.sinks);
|
||||
item != opal_list_get_end(&mca_iof_orted_component.sinks);
|
||||
item = opal_list_get_next(item)) {
|
||||
orte_iof_sink_t *sink = (orte_iof_sink_t*)item;
|
||||
/* if the target is set, then this sink is for another purpose - ignore it */
|
||||
if (ORTE_JOBID_INVALID != sink->daemon.jobid) {
|
||||
continue;
|
||||
}
|
||||
/* if this sink isn't for output, ignore it */
|
||||
if (ORTE_IOF_STDIN & sink->tag) {
|
||||
continue;
|
||||
}
|
||||
/* is this the desired proc? */
|
||||
if (sink->name.jobid == rev->name.jobid &&
|
||||
sink->name.vpid == rev->name.vpid) {
|
||||
/* output to the corresponding file */
|
||||
orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, sink->wev);
|
||||
/* done */
|
||||
break;
|
||||
}
|
||||
}
|
||||
goto RESTART;
|
||||
}
|
||||
|
||||
/* prep the buffer */
|
||||
buf = OBJ_NEW(opal_buffer_t);
|
||||
|
||||
@ -134,6 +161,7 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata)
|
||||
orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf, ORTE_RML_TAG_IOF_HNP,
|
||||
0, send_cb, NULL);
|
||||
|
||||
RESTART:
|
||||
/* re-add the event */
|
||||
opal_event_add(&rev->ev, 0);
|
||||
|
||||
|
@ -96,9 +96,9 @@ static void process_msg(int fd, short event, void *cbdata)
|
||||
if (0 < numbytes) {
|
||||
/* write the output locally */
|
||||
if (ORTE_IOF_STDOUT & stream) {
|
||||
orte_iof_base_write_output(&origin, stream, data, numbytes, &orte_iof_base.iof_write_stdout);
|
||||
orte_iof_base_write_output(&origin, stream, data, numbytes, orte_iof_base.iof_write_stdout->wev);
|
||||
} else {
|
||||
orte_iof_base_write_output(&origin, stream, data, numbytes, &orte_iof_base.iof_write_stderr);
|
||||
orte_iof_base_write_output(&origin, stream, data, numbytes, orte_iof_base.iof_write_stderr->wev);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -31,3 +31,21 @@ Fileset: %s
|
||||
|
||||
Will continue attempting to launch the process.
|
||||
|
||||
#
|
||||
[orte-odls-base:xterm-neg-rank]
|
||||
The xterm option was given a negative rank to display:
|
||||
|
||||
Rank: %d
|
||||
|
||||
Note that a value of -1 represents "all", but all other values
|
||||
must range from 0 to #procs-1.
|
||||
#
|
||||
[orte-odls-base:xterm-rank-out-of-bounds]
|
||||
The xterm option was asked to display a rank that is larger
|
||||
than the number of procs in the job:
|
||||
|
||||
Rank: %d
|
||||
#procs: %d
|
||||
|
||||
Note that ranks start with 0, not 1, and must be specified
|
||||
accordingly.
|
||||
|
@ -23,6 +23,7 @@
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/class/opal_list.h"
|
||||
|
||||
#include "orte/mca/odls/odls.h"
|
||||
#include "orte/mca/odls/base/base.h"
|
||||
@ -31,9 +32,15 @@
|
||||
|
||||
int orte_odls_base_close(void)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
|
||||
/* cleanup ODLS globals */
|
||||
OBJ_DESTRUCT(&orte_odls_globals.mutex);
|
||||
OBJ_DESTRUCT(&orte_odls_globals.cond);
|
||||
while (NULL != (item = opal_list_remove_first(&orte_odls_globals.xterm_ranks))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&orte_odls_globals.xterm_ranks);
|
||||
if (NULL != orte_odls_globals.dmap && NULL != orte_odls_globals.dmap->bytes) {
|
||||
free(orte_odls_globals.dmap->bytes);
|
||||
free(orte_odls_globals.dmap);
|
||||
|
@ -904,6 +904,8 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
char dir[MAXPATHLEN];
|
||||
char **argvptr;
|
||||
char *full_search;
|
||||
char **argvsav=NULL;
|
||||
int inm;
|
||||
|
||||
/* protect operations involving the global list of children */
|
||||
OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
|
||||
@ -1141,7 +1143,57 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
continue;
|
||||
}
|
||||
|
||||
/* setup the rest of the environment with the proc-specific items - these
|
||||
/* did the user request we display output in xterms? */
|
||||
if (NULL != orte_xterm) {
|
||||
opal_list_item_t *nmitem;
|
||||
orte_namelist_t *nm;
|
||||
/* see if this rank is one of those requested */
|
||||
for (nmitem = opal_list_get_first(&orte_odls_globals.xterm_ranks);
|
||||
nmitem != opal_list_get_end(&orte_odls_globals.xterm_ranks);
|
||||
nmitem = opal_list_get_next(nmitem)) {
|
||||
nm = (orte_namelist_t*)nmitem;
|
||||
/* check for bozo case */
|
||||
if (jobdat->num_procs <= nm->name.vpid) {
|
||||
/* can't be done! */
|
||||
orte_show_help("help-odls-base.txt",
|
||||
"orte-odls-base:xterm-rank-out-of-bounds",
|
||||
true, nm->name.vpid, jobdat->num_procs);
|
||||
rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
|
||||
goto CLEANUP;
|
||||
}
|
||||
if (ORTE_VPID_WILDCARD == nm->name.vpid ||
|
||||
child->name->vpid == nm->name.vpid) {
|
||||
/* we want this one - modify the app's command to include
|
||||
* the orte xterm cmd. Need to be careful, though, that we
|
||||
* don't modify the app for ALL ranks that use it! So we
|
||||
* will create a copy of the argv so we can restore it later
|
||||
*/
|
||||
argvsav = opal_argv_copy(app->argv);
|
||||
/* free the argv */
|
||||
opal_argv_free(app->argv);
|
||||
app->argv = NULL;
|
||||
/* now create a new one that starts with the xtermcmd */
|
||||
for (inm=0; inm < opal_argv_count(orte_odls_globals.xtermcmd); inm++) {
|
||||
opal_argv_append_nosize(&app->argv, orte_odls_globals.xtermcmd[inm]);
|
||||
}
|
||||
/* insert the rank into the correct place as a window title */
|
||||
free(app->argv[2]);
|
||||
asprintf(&app->argv[2], "Rank %s", ORTE_VPID_PRINT(child->name->vpid));
|
||||
/* add back the original argv */
|
||||
for (inm=0; inm < opal_argv_count(argvsav); inm++) {
|
||||
opal_argv_append_nosize(&app->argv, argvsav[inm]);
|
||||
}
|
||||
/* the app exe name itself is in the argvsav array, so
|
||||
* we can recover it from there later
|
||||
*/
|
||||
free(app->app);
|
||||
app->app = strdup(orte_odls_globals.xtermcmd[0]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* setup the rest of the environment with the proc-specific items - these
|
||||
* will be overwritten for each child
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&job_str, child->name->jobid))) {
|
||||
@ -1168,6 +1220,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
}
|
||||
opal_setenv(param, vpid_str, true, &app->env);
|
||||
free(param);
|
||||
|
||||
/* although the vpid IS the process' rank within the job, users
|
||||
* would appreciate being given a public environmental variable
|
||||
* that also represents this value - something MPI specific - so
|
||||
@ -1179,7 +1232,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
opal_setenv("OMPI_COMM_WORLD_RANK", vpid_str, true, &app->env);
|
||||
free(vpid_str); /* done with this now */
|
||||
|
||||
/* users would appreciate being given a public environmental variable
|
||||
/* users would appreciate being given a public environmental variable
|
||||
* that also represents the local rank value - something MPI specific - so
|
||||
* do that here.
|
||||
*
|
||||
@ -1195,7 +1248,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", value, true, &app->env);
|
||||
free(value);
|
||||
|
||||
param = mca_base_param_environ_variable("opal", NULL, "paffinity_base_slot_list");
|
||||
param = mca_base_param_environ_variable("opal", NULL, "paffinity_base_slot_list");
|
||||
if ( NULL != child->slot_list ) {
|
||||
asprintf(&value, "%s", child->slot_list);
|
||||
opal_setenv(param, value, true, &app->env);
|
||||
@ -1205,7 +1258,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
}
|
||||
free(param);
|
||||
|
||||
/* if we are timing things, record when we are going to launch this proc */
|
||||
/* if we are timing things, record when we are going to launch this proc */
|
||||
if (orte_timing) {
|
||||
gettimeofday(&child->starttime, NULL);
|
||||
}
|
||||
@ -1266,6 +1319,17 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
}
|
||||
/* move to next processor */
|
||||
proc_rank++;
|
||||
/* reset the exe name, if necessary */
|
||||
if (NULL != argvsav) {
|
||||
/* release the current argv array */
|
||||
opal_argv_free(app->argv);
|
||||
/* restore the original one */
|
||||
app->argv = argvsav;
|
||||
argvsav = NULL;
|
||||
/* the app exe name itself is now in the argv[0] posn */
|
||||
free(app->app);
|
||||
app->app = strdup(app->argv[0]);
|
||||
}
|
||||
} /* complete launching all children for this app */
|
||||
/* reset our working directory back to our default location - if we
|
||||
* don't do this, then we will be looking for relative paths starting
|
||||
|
@ -25,6 +25,7 @@
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/util/trace.h"
|
||||
#include "opal/util/path.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/class/opal_value_array.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
@ -35,6 +36,7 @@
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/parse_options.h"
|
||||
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
|
||||
@ -155,6 +157,10 @@ orte_odls_globals_t orte_odls_globals;
|
||||
*/
|
||||
int orte_odls_base_open(void)
|
||||
{
|
||||
char **ranks=NULL, *tmp;
|
||||
int i, rank;
|
||||
orte_namelist_t *nm;
|
||||
|
||||
/* Debugging / verbose output. Always have stream open, with
|
||||
verbose set by the mca open system... */
|
||||
orte_odls_globals.output = opal_output_open(NULL);
|
||||
@ -166,10 +172,51 @@ int orte_odls_base_open(void)
|
||||
/* initialize ODLS globals */
|
||||
OBJ_CONSTRUCT(&orte_odls_globals.mutex, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&orte_odls_globals.cond, opal_condition_t);
|
||||
OBJ_CONSTRUCT(&orte_odls_globals.xterm_ranks, opal_list_t);
|
||||
orte_odls_globals.xtermcmd = NULL;
|
||||
orte_odls_globals.dmap = NULL;
|
||||
orte_odls_globals.debugger = NULL;
|
||||
orte_odls_globals.debugger_launched = false;
|
||||
|
||||
/* check if the user requested that we display output in xterms */
|
||||
if (NULL != orte_xterm) {
|
||||
/* construct a list of ranks to be displayed */
|
||||
orte_util_parse_range_options(orte_xterm, &ranks);
|
||||
for (i=0; i < opal_argv_count(ranks); i++) {
|
||||
nm = OBJ_NEW(orte_namelist_t);
|
||||
rank = strtol(ranks[i], NULL, 10);
|
||||
if (-1 == rank) {
|
||||
/* wildcard */
|
||||
nm->name.vpid = ORTE_VPID_WILDCARD;
|
||||
} else if (rank < 0) {
|
||||
/* error out on bozo case */
|
||||
orte_show_help("help-odls-base.txt",
|
||||
"orte-odls-base:xterm-neg-rank",
|
||||
true, rank);
|
||||
return ORTE_ERROR;
|
||||
} else {
|
||||
/* we can't check here if the rank is out of
|
||||
* range as we don't yet know how many ranks
|
||||
* will be in the job - we'll check later
|
||||
*/
|
||||
nm->name.vpid = rank;
|
||||
}
|
||||
opal_list_append(&orte_odls_globals.xterm_ranks, &nm->item);
|
||||
}
|
||||
opal_argv_free(ranks);
|
||||
/* construct the xtermcmd */
|
||||
orte_odls_globals.xtermcmd = NULL;
|
||||
tmp = opal_find_absolute_path("xterm");
|
||||
if (NULL == tmp) {
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
opal_argv_append_nosize(&orte_odls_globals.xtermcmd, tmp);
|
||||
free(tmp);
|
||||
opal_argv_append_nosize(&orte_odls_globals.xtermcmd, "-T");
|
||||
opal_argv_append_nosize(&orte_odls_globals.xtermcmd, "save");
|
||||
opal_argv_append_nosize(&orte_odls_globals.xtermcmd, "-e");
|
||||
}
|
||||
|
||||
/* Open up all available components */
|
||||
|
||||
if (ORTE_SUCCESS !=
|
||||
|
@ -63,6 +63,10 @@ typedef struct {
|
||||
orte_odls_job_t *debugger;
|
||||
/* debugger launched */
|
||||
bool debugger_launched;
|
||||
/* list of ranks to be displayed on separate xterms */
|
||||
opal_list_t xterm_ranks;
|
||||
/* the xterm cmd to be used */
|
||||
char **xtermcmd;
|
||||
} orte_odls_globals_t;
|
||||
|
||||
ORTE_DECLSPEC extern orte_odls_globals_t orte_odls_globals;
|
||||
|
@ -55,7 +55,6 @@ bool orte_do_not_launch = false;
|
||||
bool orted_spin_flag = false;
|
||||
bool orte_static_ports = false;
|
||||
bool orte_keep_fqdn_hostnames = false;
|
||||
bool orte_tag_output;
|
||||
bool orte_show_resolved_nodenames;
|
||||
int orted_debug_failure;
|
||||
int orted_debug_failure_delay;
|
||||
@ -110,6 +109,13 @@ opal_list_t orte_local_children;
|
||||
/* list of job data for local children on a daemon */
|
||||
opal_list_t orte_local_jobdata;
|
||||
|
||||
/* IOF controls */
|
||||
bool orte_tag_output;
|
||||
bool orte_timestamp_output;
|
||||
char *orte_output_filename;
|
||||
/* generate new xterm windows to display output from specified ranks */
|
||||
char *orte_xterm;
|
||||
|
||||
/* whether or not to forward SIGTSTP and SIGCONT signals */
|
||||
bool orte_forward_job_control;
|
||||
|
||||
|
@ -427,7 +427,6 @@ ORTE_DECLSPEC extern bool orted_spin_flag;
|
||||
ORTE_DECLSPEC extern bool orte_static_ports;
|
||||
ORTE_DECLSPEC extern int32_t orte_contiguous_nodes;
|
||||
ORTE_DECLSPEC extern bool orte_keep_fqdn_hostnames;
|
||||
ORTE_DECLSPEC extern bool orte_tag_output;
|
||||
ORTE_DECLSPEC extern bool orte_show_resolved_nodenames;
|
||||
ORTE_DECLSPEC extern int orted_debug_failure;
|
||||
ORTE_DECLSPEC extern int orted_debug_failure_delay;
|
||||
@ -485,6 +484,12 @@ ORTE_DECLSPEC extern opal_list_t orte_local_jobdata;
|
||||
/* whether or not to forward SIGTSTP and SIGCONT signals */
|
||||
ORTE_DECLSPEC extern bool orte_forward_job_control;
|
||||
|
||||
/* IOF controls */
|
||||
ORTE_DECLSPEC extern bool orte_tag_output;
|
||||
ORTE_DECLSPEC extern bool orte_timestamp_output;
|
||||
ORTE_DECLSPEC extern char *orte_output_filename;
|
||||
/* generate new xterm windows to display output from specified ranks */
|
||||
ORTE_DECLSPEC extern char *orte_xterm;
|
||||
|
||||
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
||||
|
||||
|
@ -211,6 +211,7 @@ int orte_register_params(void)
|
||||
"Number of nodes after which contiguous nodename encoding will automatically be used [default: INT_MAX]",
|
||||
false, false, INT32_MAX, &orte_contiguous_nodes);
|
||||
|
||||
/* whether to tag output */
|
||||
mca_base_param_reg_int_name("orte", "tag_output",
|
||||
"Tag all output with [job,rank] (default: false)",
|
||||
false, false, (int) false, &value);
|
||||
@ -224,7 +225,18 @@ int orte_register_params(void)
|
||||
if (orte_xml_output) {
|
||||
orte_tag_output = true;
|
||||
}
|
||||
|
||||
|
||||
/* whether to timestamp output */
|
||||
mca_base_param_reg_int_name("orte", "timestamp_output",
|
||||
"Timestamp all application process output (default: false)",
|
||||
false, false, (int) false, &value);
|
||||
orte_timestamp_output = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
/* redirect output into files */
|
||||
mca_base_param_reg_string_name("orte", "output_filename",
|
||||
"Redirect output from application processes into filename.rank [default: NULL]",
|
||||
false, false, NULL, &orte_output_filename);
|
||||
|
||||
mca_base_param_reg_int_name("orte", "show_resolved_nodenames",
|
||||
"Display any node names that are resolved to a different name (default: false)",
|
||||
false, false, (int) false, &value);
|
||||
@ -246,6 +258,11 @@ int orte_register_params(void)
|
||||
false, false, (int)false, &value);
|
||||
orte_allocation_required = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
/* generate new terminal windows to display output from specified ranks */
|
||||
mca_base_param_reg_string_name("orte", "xterm",
|
||||
"Create a new xterm window and display output from the specified ranks there [default: none]",
|
||||
false, false, NULL, &orte_xterm);
|
||||
|
||||
/* whether or not to forward SIGTSTP and SIGCONT signals */
|
||||
mca_base_param_reg_int_name("orte", "forward_job_control",
|
||||
"Forward SIGTSTP (after converting to SIGSTOP) and SIGCONT signals to the application procs [default: no]",
|
||||
|
@ -265,6 +265,14 @@ is 10 seconds.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -output-filename\fR,\fP --output-filename \fR<filename>\fP
|
||||
Redirect the stdout, stderr, and stddiag of all ranks to a rank-unique version of
|
||||
the specified filename. Any directories in the filename will automatically be created.
|
||||
Each output file will be named filename.rank, where the rank will be left-filled with
|
||||
zeros for correct ordering in listings.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -path\fR,\fP --path \fR<path>\fP
|
||||
<path> that will be used when attempting to locate the requested
|
||||
executables. This is used prior to using the local PATH setting.
|
||||
@ -341,11 +349,16 @@ indicating that no ranks are to receive stdin.
|
||||
.
|
||||
.TP
|
||||
.B -tag-output\fR,\fP --tag-output
|
||||
Tag each line output to stdout, stderr, and stddiag with \fB[jobid, rank]<stdxxx>\fP indicating the process jobid
|
||||
Tag each line of output to stdout, stderr, and stddiag with \fB[jobid, rank]<stdxxx>\fP indicating the process jobid
|
||||
and rank that generated the output, and the channel which generated it.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -timestamp-output\fR,\fP --timestamp-output
|
||||
Timestamp each line of output to stdout, stderr, and stddiag.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B --tmpdir \fR<dir>\fP
|
||||
Set the root for the session directory tree for mpirun only.
|
||||
.
|
||||
@ -377,7 +390,10 @@ See the "Current Working Directory" section for notes on relative paths.
|
||||
.B Note:
|
||||
If the \fI-wdir\fP option appears both on the command line and in an
|
||||
application context, the context will take precedence over the command
|
||||
line.
|
||||
line. Relative paths are converted to absolute paths on the node where
|
||||
mpirun is executed. Thus, if the path to the desired wdir is different
|
||||
on the backend nodes, then it must be specified as an absolute path that
|
||||
is correct for the backend node.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
@ -396,6 +412,20 @@ then use \fI-x\fP to export (not define) them.
|
||||
Provide all output to stdout, stderr, and stddiag in an xml format.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -xterm\fR,\fP --xterm \fR<ranks>\fP
|
||||
Display the specified ranks in separate xterm windows. The ranks are specified
|
||||
as a comma-separated list of ranges, with a -1 indicating all. A separate
|
||||
window will be created for each specified rank.
|
||||
.B Note:
|
||||
In some environments, xterm may require that the executable be in the user's
|
||||
path, or be specified in absolute or relative terms. Thus, it may be necessary
|
||||
to specify a local executable as "./foo" instead of just "foo". If xterm fails to
|
||||
find the executable, mpirun will hang, but still respond correctly to a ctrl-c.
|
||||
If this happens, please check that the executable is being specified correctly
|
||||
and try again.
|
||||
.
|
||||
.
|
||||
.P
|
||||
The following options are useful for developers; they are not generally
|
||||
useful to most ORTE and/or MPI users:
|
||||
|
@ -63,6 +63,7 @@
|
||||
|
||||
#include "opal/version.h"
|
||||
#include "opal/runtime/opal.h"
|
||||
#include "opal/util/os_dirpath.h"
|
||||
#include "opal/util/os_path.h"
|
||||
#include "opal/util/path.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
@ -73,6 +74,7 @@
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/hnp_contact.h"
|
||||
#include "orte/util/parse_options.h"
|
||||
|
||||
#include "orte/mca/odls/odls.h"
|
||||
#include "orte/mca/plm/plm.h"
|
||||
@ -162,7 +164,16 @@ static opal_cmd_line_init_t cmd_line_init[] = {
|
||||
{ "orte", "tag", "output", '\0', "tag-output", "tag-output", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Tag all output with [job,rank]" },
|
||||
|
||||
{ "orte", "timestamp", "output", '\0', "timestamp-output", "timestamp-output", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Timestamp all application process output" },
|
||||
{ "orte", "output", "filename", '\0', "output-filename", "output-filename", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Redirect output from application processes into filename.rank" },
|
||||
{ "orte", "xterm", NULL, '\0', "xterm", "xterm", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Create a new xterm window and display output from the specified ranks there" },
|
||||
|
||||
/* select stdin option */
|
||||
{ NULL, NULL, NULL, '\0', "stdin", "stdin", 1,
|
||||
&orterun_globals.stdin_target, OPAL_CMD_LINE_TYPE_STRING,
|
||||
|
@ -64,7 +64,6 @@ ORTE_DECLSPEC orte_proc_info_t orte_process_info = {
|
||||
/* .sock_stderr = */ NULL
|
||||
};
|
||||
|
||||
#define ORTE_MAX_HOSTNAME_SIZE 512
|
||||
static bool init=false;
|
||||
|
||||
int orte_proc_info(void)
|
||||
|
@ -37,6 +37,8 @@
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
#define ORTE_MAX_HOSTNAME_SIZE 512
|
||||
|
||||
/**
|
||||
* Process information structure
|
||||
*
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user