From b2f77bf08ff595d406e881be5ce8ea8d541cdea5 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Wed, 2 May 2012 21:00:22 +0000 Subject: [PATCH] Extend the iof by adding two new components to support map-reduce IO chaining. Add a mapreduce tool for running such applications. Fix the state machine to support multiple jobs being simultaneously launched as this is not only required for mapreduce, but can happen under comm-spawn applications as well. This commit was SVN r26380. --- orte/config/config_files.m4 | 3 +- orte/mca/grpcomm/base/grpcomm_base_xcast.c | 2 +- orte/mca/iof/base/base.h | 73 +- orte/mca/iof/base/iof_base_open.c | 24 + orte/mca/iof/base/iof_base_output.c | 26 +- orte/mca/iof/base/iof_base_setup.c | 3 + orte/mca/iof/hnp/iof_hnp.c | 1 + orte/mca/iof/hnp/iof_hnp_read.c | 8 +- orte/mca/iof/iof.h | 7 + orte/mca/iof/mr_hnp/Makefile.am | 40 + orte/mca/iof/mr_hnp/configure.m4 | 19 + orte/mca/iof/mr_hnp/iof_mrhnp.c | 700 ++++++ orte/mca/iof/mr_hnp/iof_mrhnp.h | 64 + orte/mca/iof/mr_hnp/iof_mrhnp_component.c | 96 + orte/mca/iof/mr_hnp/iof_mrhnp_read.c | 383 +++ orte/mca/iof/mr_hnp/iof_mrhnp_receive.c | 106 + orte/mca/iof/mr_orted/Makefile.am | 40 + orte/mca/iof/mr_orted/configure.m4 | 19 + orte/mca/iof/mr_orted/iof_mrorted.c | 464 ++++ orte/mca/iof/mr_orted/iof_mrorted.h | 45 + orte/mca/iof/mr_orted/iof_mrorted_component.c | 85 + orte/mca/iof/mr_orted/iof_mrorted_read.c | 281 +++ orte/mca/iof/mr_orted/iof_mrorted_receive.c | 162 ++ orte/mca/iof/orted/iof_orted.c | 1 + orte/mca/iof/tool/iof_tool.c | 3 +- orte/mca/odls/base/odls_base_default_fns.c | 57 +- orte/mca/plm/alps/plm_alps_module.c | 8 +- orte/mca/plm/base/base.h | 1 + orte/mca/plm/base/plm_base_launch_support.c | 95 +- orte/mca/plm/lsf/plm_lsf_module.c | 7 +- orte/mca/plm/plm_types.h | 25 +- orte/mca/plm/process/plm_process_module.c | 9 +- orte/mca/plm/rsh/plm_rsh_module.c | 7 +- orte/mca/plm/slurm/plm_slurm_module.c | 8 +- orte/mca/plm/tm/plm_tm_module.c | 8 +- orte/mca/state/base/state_base_fns.c | 
18 +- orte/mca/state/hnp/state_hnp.c | 7 + orte/mca/state/orted/state_orted.c | 20 +- orte/orted/orted_main.c | 10 + orte/runtime/orte_globals.c | 9 +- orte/runtime/orte_globals.h | 24 +- orte/tools/Makefile.am | 6 +- orte/tools/mapreduce/Makefile.am | 40 + orte/tools/mapreduce/help-mapreduce.txt | 627 +++++ orte/tools/mapreduce/mapreduce.1in | 1293 ++++++++++ orte/tools/mapreduce/mapreduce.c | 2138 +++++++++++++++++ orte/tools/orterun/help-orterun.txt | 9 + orte/util/error_strings.c | 2 + orte/util/nidmap.c | 44 +- 49 files changed, 6931 insertions(+), 196 deletions(-) create mode 100644 orte/mca/iof/mr_hnp/Makefile.am create mode 100644 orte/mca/iof/mr_hnp/configure.m4 create mode 100644 orte/mca/iof/mr_hnp/iof_mrhnp.c create mode 100644 orte/mca/iof/mr_hnp/iof_mrhnp.h create mode 100644 orte/mca/iof/mr_hnp/iof_mrhnp_component.c create mode 100644 orte/mca/iof/mr_hnp/iof_mrhnp_read.c create mode 100644 orte/mca/iof/mr_hnp/iof_mrhnp_receive.c create mode 100644 orte/mca/iof/mr_orted/Makefile.am create mode 100644 orte/mca/iof/mr_orted/configure.m4 create mode 100644 orte/mca/iof/mr_orted/iof_mrorted.c create mode 100644 orte/mca/iof/mr_orted/iof_mrorted.h create mode 100644 orte/mca/iof/mr_orted/iof_mrorted_component.c create mode 100644 orte/mca/iof/mr_orted/iof_mrorted_read.c create mode 100644 orte/mca/iof/mr_orted/iof_mrorted_receive.c create mode 100644 orte/tools/mapreduce/Makefile.am create mode 100644 orte/tools/mapreduce/help-mapreduce.txt create mode 100644 orte/tools/mapreduce/mapreduce.1in create mode 100644 orte/tools/mapreduce/mapreduce.c diff --git a/orte/config/config_files.m4 b/orte/config/config_files.m4 index e245d3bcd4..e29b951a2e 100644 --- a/orte/config/config_files.m4 +++ b/orte/config/config_files.m4 @@ -4,7 +4,7 @@ # Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana # University Research and Technology # Corporation. All rights reserved. -# Copyright (c) 2011 Los Alamos National Security, LLC. 
All rights +# Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights # reserved. # $COPYRIGHT$ # @@ -32,5 +32,6 @@ AC_DEFUN([ORTE_CONFIG_FILES],[ orte/tools/orte-top/Makefile orte/tools/orte-migrate/Makefile orte/tools/orte-info/Makefile + orte/tools/mapreduce/Makefile ]) ]) diff --git a/orte/mca/grpcomm/base/grpcomm_base_xcast.c b/orte/mca/grpcomm/base/grpcomm_base_xcast.c index 5c1f1624e7..bfd7e61409 100644 --- a/orte/mca/grpcomm/base/grpcomm_base_xcast.c +++ b/orte/mca/grpcomm/base/grpcomm_base_xcast.c @@ -86,7 +86,7 @@ void orte_grpcomm_base_xcast_recv(int status, orte_process_name_t* sender, * knows what to do - it will also free the bytes in the bo */ OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:base:xcast updating nidmap", + "%s grpcomm:base:xcast updating daemon nidmap", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); if (ORTE_SUCCESS != (ret = orte_ess.update_nidmap(bo))) { diff --git a/orte/mca/iof/base/base.h b/orte/mca/iof/base/base.h index 5b92eb3a29..92bc61fdbf 100644 --- a/orte/mca/iof/base/base.h +++ b/orte/mca/iof/base/base.h @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -42,7 +44,10 @@ #include #endif +#include "opal/class/opal_list.h" +#include "opal/class/opal_bitmap.h" #include "opal/mca/mca.h" +#include "opal/mca/event/event.h" #include "orte/mca/iof/iof.h" #include "orte/runtime/orte_globals.h" @@ -53,6 +58,14 @@ ORTE_DECLSPEC int orte_iof_base_open(void); #if !ORTE_DISABLE_FULL_SUPPORT +/* track xon/xoff of processes */ +typedef struct { + opal_object_t super; + orte_job_t *jdata; + opal_bitmap_t xoff; +} orte_iof_job_t; +ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_iof_job_t); + /* * Maximum size of single msg */ @@ -76,10 +89,7 @@ typedef struct { orte_process_name_t daemon; orte_iof_tag_t tag; orte_iof_write_event_t *wev; -#if OPAL_ENABLE_DEBUG - char *file; - int line; -#endif + bool xoff; } orte_iof_sink_t; ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_iof_sink_t); @@ -90,10 +100,6 @@ typedef struct { int fd; orte_iof_tag_t tag; bool active; -#if OPAL_ENABLE_DEBUG - char *file; - int line; -#endif } orte_iof_read_event_t; ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_iof_read_event_t); @@ -103,6 +109,7 @@ typedef struct { orte_iof_read_event_t *revstdout; orte_iof_read_event_t *revstderr; orte_iof_read_event_t *revstddiag; + orte_iof_sink_t *sink; } orte_iof_proc_t; ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_iof_proc_t); @@ -116,6 +123,7 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_iof_write_output_t); /* the iof globals struct */ struct orte_iof_base_t { int iof_output; + char *input_files; opal_list_t iof_components_opened; opal_mutex_t iof_write_output_lock; orte_iof_sink_t *iof_write_stdout; @@ -124,8 +132,6 @@ struct orte_iof_base_t { typedef struct orte_iof_base_t orte_iof_base_t; -#if OPAL_ENABLE_DEBUG - #define ORTE_IOF_SINK_DEFINE(snk, nm, fid, tg, wrthndlr, eplist) \ do { \ orte_iof_sink_t *ep; \ @@ -147,8 +153,6 @@ typedef struct orte_iof_base_t orte_iof_base_t; opal_list_append((eplist), &ep->super); \ } \ *(snk) = ep; \ - ep->file = strdup(__FILE__); \ - ep->line = 
__LINE__; \ } while(0); /* add list of structs that has name of proc + orte_iof_tag_t - when @@ -171,8 +175,6 @@ typedef struct orte_iof_base_t orte_iof_base_t; rev->tag = (tg); \ rev->fd = (fid); \ *(rv) = rev; \ - rev->file = strdup(__FILE__); \ - rev->line = __LINE__; \ opal_event_set(orte_event_base, \ rev->ev, (fid), \ OPAL_EV_READ, \ @@ -184,49 +186,6 @@ typedef struct orte_iof_base_t orte_iof_base_t; } while(0); -#else - -#define ORTE_IOF_SINK_DEFINE(snk, nm, fid, tg, wrthndlr, eplist) \ - do { \ - orte_iof_sink_t *ep; \ - ep = OBJ_NEW(orte_iof_sink_t); \ - ep->name.jobid = (nm)->jobid; \ - ep->name.vpid = (nm)->vpid; \ - ep->tag = (tg); \ - if (0 <= (fid)) { \ - ep->wev->fd = (fid); \ - opal_event_set(orte_event_base, \ - ep->wev->ev, ep->wev->fd, \ - OPAL_EV_WRITE, \ - wrthndlr, ep); \ - } \ - if (NULL != (eplist)) { \ - opal_list_append((eplist), &ep->super); \ - } \ - *(snk) = ep; \ - } while(0); - -#define ORTE_IOF_READ_EVENT(rv, nm, fid, tg, cbfunc, actv) \ - do { \ - orte_iof_read_event_t *rev; \ - rev = OBJ_NEW(orte_iof_read_event_t); \ - rev->name.jobid = (nm)->jobid; \ - rev->name.vpid = (nm)->vpid; \ - rev->tag = (tg); \ - rev->fd = (fid); \ - *(rv) = rev; \ - opal_event_set(orte_event_base, \ - rev->ev, (fid), \ - OPAL_EV_READ, \ - (cbfunc), rev); \ - if ((actv)) { \ - rev->active = true; \ - opal_event_add(rev->ev, 0); \ - } \ - } while(0); - -#endif - ORTE_DECLSPEC int orte_iof_base_close(void); ORTE_DECLSPEC int orte_iof_base_select(void); ORTE_DECLSPEC int orte_iof_base_flush(void); diff --git a/orte/mca/iof/base/iof_base_open.c b/orte/mca/iof/base/iof_base_open.c index 1e929f56b1..365213f1c5 100644 --- a/orte/mca/iof/base/iof_base_open.c +++ b/orte/mca/iof/base/iof_base_open.c @@ -63,11 +63,29 @@ int orte_iof_base_open(void) #else /* class instances */ +static void orte_iof_job_construct(orte_iof_job_t *ptr) +{ + ptr->jdata = NULL; + OBJ_CONSTRUCT(&ptr->xoff, opal_bitmap_t); +} +static void orte_iof_job_destruct(orte_iof_job_t *ptr) +{ + if 
(NULL != ptr->jdata) { + OBJ_RELEASE(ptr->jdata); + } + OBJ_DESTRUCT(&ptr->xoff); +} +OBJ_CLASS_INSTANCE(orte_iof_job_t, + opal_object_t, + orte_iof_job_construct, + orte_iof_job_destruct); + static void orte_iof_base_proc_construct(orte_iof_proc_t* ptr) { ptr->revstdout = NULL; ptr->revstderr = NULL; ptr->revstddiag = NULL; + ptr->sink = NULL; } static void orte_iof_base_proc_destruct(orte_iof_proc_t* ptr) { @@ -92,6 +110,7 @@ static void orte_iof_base_sink_construct(orte_iof_sink_t* ptr) ptr->daemon.jobid = ORTE_JOBID_INVALID; ptr->daemon.vpid = ORTE_VPID_INVALID; ptr->wev = OBJ_NEW(orte_iof_write_event_t); + ptr->xoff = false; } static void orte_iof_base_sink_destruct(orte_iof_sink_t* ptr) { @@ -205,6 +224,11 @@ int orte_iof_base_open(void) } } + /* check for files to be sent to stdin of procs */ + mca_base_param_reg_string_name("iof", "base_input_files", + "Comma-separated list of input files to be read and sent to stdin of procs (default: NULL)", + false, false, NULL, &orte_iof_base.input_files); + /* daemons do not need to do this as they do not write out stdout/err */ if (!ORTE_PROC_IS_DAEMON || (ORTE_PROC_IS_DAEMON && ORTE_PROC_IS_CM)) { diff --git a/orte/mca/iof/base/iof_base_output.c b/orte/mca/iof/base/iof_base_output.c index f18ec7d0a1..b594e87ca9 100644 --- a/orte/mca/iof/base/iof_base_output.c +++ b/orte/mca/iof/base/iof_base_output.c @@ -60,7 +60,8 @@ int orte_iof_base_write_output(orte_process_name_t *name, orte_iof_tag_t stream, "%s write:output setting up to write %d bytes to %s for %s on fd %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes, (ORTE_IOF_STDIN & stream) ? "stdin" : ((ORTE_IOF_STDOUT & stream) ? "stdout" : ((ORTE_IOF_STDERR & stream) ? "stderr" : "stddiag")), - ORTE_NAME_PRINT(name), channel->fd)); + ORTE_NAME_PRINT(name), + (NULL == channel) ? 
-1 : channel->fd)); /* setup output object */ output = OBJ_NEW(orte_iof_write_output_t); @@ -251,9 +252,6 @@ construct: output->numbytes = k; process: - /* lock us up to protect global operations */ - OPAL_THREAD_LOCK(&orte_iof_base.iof_write_output_lock); - /* add this data to the write list for this fd */ opal_list_append(&channel->outputs, &output->super); @@ -270,9 +268,6 @@ process: channel->pending = true; } - /* unlock and go */ - OPAL_THREAD_UNLOCK(&orte_iof_base.iof_write_output_lock); - return num_buffered; } @@ -289,11 +284,13 @@ void orte_iof_base_write_handler(int fd, short event, void *cbdata) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), wev->fd)); - /* lock us up to protect global operations */ - OPAL_THREAD_LOCK(&orte_iof_base.iof_write_output_lock); - while (NULL != (item = opal_list_remove_first(&wev->outputs))) { output = (orte_iof_write_output_t*)item; + if (0 == output->numbytes) { + /* indicates we are to close this stream */ + OBJ_RELEASE(sink); + return; + } num_written = write(wev->fd, output->data, output->numbytes); if (num_written < 0) { if (EAGAIN == errno || EINTR == errno) { @@ -302,7 +299,7 @@ void orte_iof_base_write_handler(int fd, short event, void *cbdata) /* leave the write event running so it will call us again * when the fd is ready. 
*/ - goto DEPART; + return; } /* otherwise, something bad happened so all we can do is abort * this attempt @@ -312,12 +309,12 @@ void orte_iof_base_write_handler(int fd, short event, void *cbdata) } else if (num_written < output->numbytes) { /* incomplete write - adjust data to avoid duplicate output */ memmove(output->data, &output->data[num_written], output->numbytes - num_written); - /* push this item back on the front of the list */ + /* push this item back on the front of the list */ opal_list_prepend(&wev->outputs, item); /* leave the write event running so it will call us again * when the fd is ready */ - goto DEPART; + return; } OBJ_RELEASE(output); } @@ -325,7 +322,4 @@ ABORT: opal_event_del(wev->ev); wev->pending = false; -DEPART: - /* unlock and go */ - OPAL_THREAD_UNLOCK(&orte_iof_base.iof_write_output_lock); } diff --git a/orte/mca/iof/base/iof_base_setup.c b/orte/mca/iof/base/iof_base_setup.c index e8290147fd..da02b5d30f 100644 --- a/orte/mca/iof/base/iof_base_setup.c +++ b/orte/mca/iof/base/iof_base_setup.c @@ -57,8 +57,11 @@ #include "opal/util/opal_pty.h" #include "opal/util/opal_environ.h" +#include "opal/util/output.h" #include "orte/mca/errmgr/errmgr.h" +#include "orte/util/name_fns.h" +#include "orte/runtime/orte_globals.h" #include "orte/mca/iof/iof.h" #include "orte/mca/iof/base/iof_base_setup.h" diff --git a/orte/mca/iof/hnp/iof_hnp.c b/orte/mca/iof/hnp/iof_hnp.c index 9e34d1959c..7c5e25be77 100644 --- a/orte/mca/iof/hnp/iof_hnp.c +++ b/orte/mca/iof/hnp/iof_hnp.c @@ -84,6 +84,7 @@ orte_iof_base_module_t orte_iof_hnp_module = { hnp_push, hnp_pull, hnp_close, + NULL, finalize, hnp_ft_event }; diff --git a/orte/mca/iof/hnp/iof_hnp_read.c b/orte/mca/iof/hnp/iof_hnp_read.c index 5f730b6285..902117ec60 100644 --- a/orte/mca/iof/hnp/iof_hnp_read.c +++ b/orte/mca/iof/hnp/iof_hnp_read.c @@ -181,6 +181,10 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata) OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock); return; } + if (0 < 
numbytes && numbytes < (int)sizeof(data)) { + /* need to write a 0-byte event to clear the stream and close it */ + orte_iof_base_write_output(&rev->name, ORTE_IOF_STDIN, data, 0, sink->wev); + } } } else { OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, @@ -207,8 +211,8 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata) } } } - /* if num_bytes was zero, then we need to terminate the event */ - if (0 == numbytes) { + /* if num_bytes was zero, or we read the last piece of the file, then we need to terminate the event */ + if (0 == numbytes || numbytes < (int)sizeof(data)) { /* this will also close our stdin file descriptor */ OBJ_RELEASE(mca_iof_hnp_component.stdinev); } else { diff --git a/orte/mca/iof/iof.h b/orte/mca/iof/iof.h index 2d61a84059..e2303be0ea 100644 --- a/orte/mca/iof/iof.h +++ b/orte/mca/iof/iof.h @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -119,6 +121,7 @@ #include "opal/mca/crs/crs.h" #include "opal/mca/crs/base/base.h" +#include "orte/runtime/orte_globals.h" #include "iof_types.h" @@ -157,6 +160,9 @@ typedef int (*orte_iof_base_pull_fn_t)(const orte_process_name_t* peer, typedef int (*orte_iof_base_close_fn_t)(const orte_process_name_t* peer, orte_iof_tag_t source_tag); +/* Flag that a job is complete */ +typedef void (*orte_iof_base_complete_fn_t)(const orte_job_t *jdata); + /* finalize the selected module */ typedef int (*orte_iof_base_finalize_fn_t)(void); @@ -173,6 +179,7 @@ struct orte_iof_base_module_2_0_0_t { orte_iof_base_push_fn_t push; orte_iof_base_pull_fn_t pull; orte_iof_base_close_fn_t close; + orte_iof_base_complete_fn_t complete; orte_iof_base_finalize_fn_t finalize; orte_iof_base_ft_event_fn_t ft_event; }; diff --git a/orte/mca/iof/mr_hnp/Makefile.am b/orte/mca/iof/mr_hnp/Makefile.am new file mode 100644 index 0000000000..aaec8fd1c9 --- /dev/null +++ b/orte/mca/iof/mr_hnp/Makefile.am @@ -0,0 +1,40 @@ +# +# Copyright (c) 2012 Los Alamos National Security, LLC. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +EXTRA_DIST = .windows + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). 
+ +if MCA_BUILD_orte_iof_mr_hnp_DSO +component_noinst = +component_install = mca_iof_mr_hnp.la +else +component_noinst = libmca_iof_mr_hnp.la +component_install = +endif + +mr_hnp_SOURCES = \ + iof_mrhnp.c \ + iof_mrhnp.h \ + iof_mrhnp_component.c \ + iof_mrhnp_read.c \ + iof_mrhnp_receive.c + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_iof_mr_hnp_la_SOURCES = $(mr_hnp_SOURCES) +mca_iof_mr_hnp_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_iof_mr_hnp_la_SOURCES = $(mr_hnp_SOURCES) +libmca_iof_mr_hnp_la_LIBADD = +libmca_iof_mr_hnp_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/iof/mr_hnp/configure.m4 b/orte/mca/iof/mr_hnp/configure.m4 new file mode 100644 index 0000000000..346c0afb59 --- /dev/null +++ b/orte/mca/iof/mr_hnp/configure.m4 @@ -0,0 +1,19 @@ +# -*- shell-script -*- +# +# Copyright (c) 2012 Los Alamos National Security, LLC. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# MCA_iof_mr_hnp_CONFIG([action-if-found], [action-if-not-found]) +# ----------------------------------------------------------- +AC_DEFUN([MCA_orte_iof_mr_hnp_CONFIG], [ + AC_CONFIG_FILES([orte/mca/iof/mr_hnp/Makefile]) + + AS_IF([test "$orte_without_full_support" = 0], + [$1], + [$2]) +]) diff --git a/orte/mca/iof/mr_hnp/iof_mrhnp.c b/orte/mca/iof/mr_hnp/iof_mrhnp.c new file mode 100644 index 0000000000..dda96010d7 --- /dev/null +++ b/orte/mca/iof/mr_hnp/iof_mrhnp.c @@ -0,0 +1,700 @@ +/* + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "orte_config.h" +#include "opal/util/output.h" +#include "orte/constants.h" + +#include +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ +#ifdef HAVE_STRING_H +#include +#endif /* HAVE_STRING_H */ + +#ifdef HAVE_FCNTL_H +#include +#else +#ifdef HAVE_SYS_FCNTL_H +#include +#endif +#endif + +#include "opal/mca/event/event.h" + +#include "orte/runtime/orte_globals.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/ess/ess.h" +#include "orte/mca/rml/rml.h" +#include "orte/util/name_fns.h" +#include "orte/mca/odls/odls_types.h" + +#include "orte/mca/iof/base/base.h" +#include "iof_mrhnp.h" + +/* LOCAL FUNCTIONS */ +static void stdin_write_handler(int fd, short event, void *cbdata); + +/* API FUNCTIONS */ +static int init(void); + +static int mrhnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag, int fd); + +static int mrhnp_pull(const orte_process_name_t* src_name, + orte_iof_tag_t src_tag, + int fd); + +static int mrhnp_close(const orte_process_name_t* peer, + orte_iof_tag_t source_tag); + +static void mrhnp_complete(const orte_job_t *jdata); + +static int finalize(void); + +static int mrhnp_ft_event(int state); + +/* The API's in this module are solely used to support LOCAL + * procs - i.e., procs that are co-located to the HNP. 
Remote + * procs interact with the HNP's IOF via the HNP's receive function, + * which operates independently and is in the iof_mrhnp_receive.c file + */ + +orte_iof_base_module_t orte_iof_mrhnp_module = { + init, + mrhnp_push, + mrhnp_pull, + mrhnp_close, + mrhnp_complete, + finalize, + mrhnp_ft_event +}; + +/* Initialize the module */ +static int init(void) +{ + int rc; + + /* post non-blocking recv to catch forwarded IO from + * the orteds + */ + if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, + ORTE_RML_TAG_IOF_HNP, + ORTE_RML_PERSISTENT, + orte_iof_mrhnp_recv, + NULL))) { + ORTE_ERROR_LOG(rc); + return rc; + + } + + OBJ_CONSTRUCT(&mca_iof_mr_hnp_component.sinks, opal_list_t); + OBJ_CONSTRUCT(&mca_iof_mr_hnp_component.procs, opal_list_t); + mca_iof_mr_hnp_component.stdinev = NULL; + OBJ_CONSTRUCT(&mca_iof_mr_hnp_component.stdin_jobs, opal_pointer_array_t); + opal_pointer_array_init(&mca_iof_mr_hnp_component.stdin_jobs, 1, INT_MAX, 1); + return ORTE_SUCCESS; +} + +/* Setup to read from stdin. + */ +static int mrhnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag, int fd) +{ + orte_job_t *jdata; + orte_iof_sink_t *sink; + orte_iof_proc_t *proct; + opal_list_item_t *item; + int flags; + char *outfile; + int fdout; + int np, numdigs; + orte_ns_cmp_bitmask_t mask; + orte_iof_job_t *jptr; + int j; + bool found; + + /* don't do this if the dst vpid is invalid or the fd is negative! 
*/ + if (ORTE_VPID_INVALID == dst_name->vpid || fd < 0) { + return ORTE_SUCCESS; + } + + OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, + "%s iof:mrhnp pushing fd %d for process %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + fd, ORTE_NAME_PRINT(dst_name))); + + /* we get a push for stdout, stderr, and stddiag on every LOCAL process, so + * setup to read those streams and forward them to the next app_context + */ + if (!(src_tag & ORTE_IOF_STDIN)) { + /* set the file descriptor to non-blocking - do this before we setup + * and activate the read event in case it fires right away + */ + if((flags = fcntl(fd, F_GETFL, 0)) < 0) { + opal_output(orte_iof_base.iof_output, "[%s:%d]: fcntl(F_GETFL) failed with errno=%d\n", + __FILE__, __LINE__, errno); + } else { + flags |= O_NONBLOCK; + fcntl(fd, F_SETFL, flags); + } + /* do we already have this process in our list? */ + for (item = opal_list_get_first(&mca_iof_mr_hnp_component.procs); + item != opal_list_get_end(&mca_iof_mr_hnp_component.procs); + item = opal_list_get_next(item)) { + proct = (orte_iof_proc_t*)item; + mask = ORTE_NS_CMP_ALL; + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proct->name, dst_name)) { + /* found it */ + goto SETUP; + } + } + /* if we get here, then we don't yet have this proc in our list */ + proct = OBJ_NEW(orte_iof_proc_t); + proct->name.jobid = dst_name->jobid; + proct->name.vpid = dst_name->vpid; + opal_list_append(&mca_iof_mr_hnp_component.procs, &proct->super); + /* see if we are to output to a file */ + if (NULL != orte_output_filename) { + /* get the jobdata for this proc */ + if (NULL == (jdata = orte_get_job_data_object(dst_name->jobid))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + np = jdata->num_procs / 10; + /* determine the number of digits required for max vpid */ + numdigs = 1; + while (np > 0) { + numdigs++; + np = np / 10; + } + /* construct the filename */ + asprintf(&outfile, "%s.%d.%0*lu", orte_output_filename, + 
(int)ORTE_LOCAL_JOBID(proct->name.jobid), + numdigs, (unsigned long)proct->name.vpid); + /* create the file */ + fdout = open(outfile, O_CREAT|O_RDWR|O_TRUNC, 0644); + free(outfile); + if (fdout < 0) { + /* couldn't be opened */ + ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE); + return ORTE_ERR_FILE_OPEN_FAILURE; + } + /* define a sink to that file descriptor */ + ORTE_IOF_SINK_DEFINE(&sink, dst_name, fdout, ORTE_IOF_STDOUTALL, + orte_iof_base_write_handler, + &mca_iof_mr_hnp_component.sinks); + } + + SETUP: + /* define a read event but don't activate it */ + if (src_tag & ORTE_IOF_STDOUT) { + ORTE_IOF_READ_EVENT(&proct->revstdout, dst_name, fd, ORTE_IOF_STDOUT, + orte_iof_mrhnp_read_local_handler, false); + } else if (src_tag & ORTE_IOF_STDERR) { + ORTE_IOF_READ_EVENT(&proct->revstderr, dst_name, fd, ORTE_IOF_STDERR, + orte_iof_mrhnp_read_local_handler, false); + } else if (src_tag & ORTE_IOF_STDDIAG) { + ORTE_IOF_READ_EVENT(&proct->revstddiag, dst_name, fd, ORTE_IOF_STDDIAG, + orte_iof_mrhnp_read_local_handler, false); + } + /* if -all- of the readevents for this proc have been defined, then + * activate them. Otherwise, we can think that the proc is complete + * because one of the readevents fires -prior- to all of them having been defined! 
+ */ + if (NULL != proct->revstdout && NULL != proct->revstderr && NULL != proct->revstddiag) { + /* now activate read events */ + proct->revstdout->active = true; + opal_event_add(proct->revstdout->ev, 0); + proct->revstderr->active = true; + opal_event_add(proct->revstderr->ev, 0); + proct->revstddiag->active = true; + opal_event_add(proct->revstddiag->ev, 0); + } + return ORTE_SUCCESS; + } + + /*** HANDLE STDIN PUSH ***/ + + /* get the job object for this proc and check to see if it + * is a mapper - if so, add it to the jobs that receive + * our stdin + */ + jdata = orte_get_job_data_object(dst_name->jobid); + if (ORTE_JOB_CONTROL_MAPPER & jdata->controls) { + /* see if we already have it */ + found = false; + for (j=0; j < mca_iof_mr_hnp_component.stdin_jobs.size; j++) { + if (NULL == (jptr = (orte_iof_job_t*)opal_pointer_array_get_item(&mca_iof_mr_hnp_component.stdin_jobs, j))) { + continue; + } + if (jptr->jdata->jobid == jdata->jobid) { + found = true; + break; + } + } + if (!found) { + jptr = OBJ_NEW(orte_iof_job_t); + OBJ_RETAIN(jdata); + jptr->jdata = jdata; + opal_bitmap_init(&jptr->xoff, jdata->num_procs); + opal_pointer_array_add(&mca_iof_mr_hnp_component.stdin_jobs, jptr); + } + } + + /* now setup the read - but check to only do this once */ + if (NULL == mca_iof_mr_hnp_component.stdinev) { + /* Since we are the HNP, we don't want to set nonblocking on our + * stdio stream. If we do so, we set the file descriptor to + * non-blocking for everyone that has that file descriptor, which + * includes everyone else in our shell pipeline chain. (See + * http://lists.freebsd.org/pipermail/freebsd-hackers/2005-January/009742.html). + * This causes things like "mpirun -np 1 big_app | cat" to lose + * output, because cat's stdout is then ALSO non-blocking and cat + * isn't built to deal with that case (same with almost all other + * unix text utils). 
+ */ + if (0 != fd) { + if((flags = fcntl(fd, F_GETFL, 0)) < 0) { + opal_output(orte_iof_base.iof_output, "[%s:%d]: fcntl(F_GETFL) failed with errno=%d\n", + __FILE__, __LINE__, errno); + } else { + flags |= O_NONBLOCK; + fcntl(fd, F_SETFL, flags); + } + } + if (isatty(fd)) { + /* We should avoid trying to read from stdin if we + * have a terminal, but are backgrounded. Catch the + * signals that are commonly used when we switch + * between being backgrounded and not. If the + * filedescriptor is not a tty, don't worry about it + * and always stay connected. + */ + opal_event_signal_set(orte_event_base, &mca_iof_mr_hnp_component.stdinsig, + SIGCONT, orte_iof_mrhnp_stdin_cb, + NULL); + + /* setup a read event to read stdin, but don't activate it yet. The + * dst_name indicates who should receive the stdin. If that recipient + * doesn't do a corresponding pull, however, then the stdin will + * be dropped upon receipt at the local daemon + */ + ORTE_IOF_READ_EVENT(&mca_iof_mr_hnp_component.stdinev, + dst_name, fd, ORTE_IOF_STDIN, + orte_iof_mrhnp_read_local_handler, false); + + /* check to see if we want the stdin read event to be + * active - we will always at least define the event, + * but may delay its activation + */ + if (!(src_tag & ORTE_IOF_STDIN) || orte_iof_mrhnp_stdin_check(fd)) { + mca_iof_mr_hnp_component.stdinev->active = true; + opal_event_add(mca_iof_mr_hnp_component.stdinev->ev, 0); + } + } else { + /* if we are not looking at a tty, just setup a read event + * and activate it + */ + ORTE_IOF_READ_EVENT(&mca_iof_mr_hnp_component.stdinev, + dst_name, fd, ORTE_IOF_STDIN, + orte_iof_mrhnp_read_local_handler, true); + } + } + return ORTE_SUCCESS; +} + + +/* + * Since we are the HNP, the only "pull" call comes from a local + * process so we can record the file descriptor for its stdin. 
+ */ + +static int mrhnp_pull(const orte_process_name_t* dst_name, + orte_iof_tag_t src_tag, + int fd) +{ + orte_iof_sink_t *sink; + int flags, j; + orte_iof_proc_t *ptr, *proct; + opal_list_item_t *item; + orte_job_t *jdata; + orte_iof_job_t *jptr; + bool found; + + /* this is a local call - only stdin is supported */ + if (ORTE_IOF_STDIN != src_tag) { + return ORTE_ERR_NOT_SUPPORTED; + } + + OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, + "%s iof:mrhnp pulling fd %d for process %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + fd, ORTE_NAME_PRINT(dst_name))); + + /* get the job object for this proc and check to see if it + * is a mapper - if so, add it to the jobs that receive + * our stdin + */ + jdata = orte_get_job_data_object(dst_name->jobid); + if (ORTE_JOB_CONTROL_MAPPER & jdata->controls) { + /* see if we already have it */ + found = false; + for (j=0; j < mca_iof_mr_hnp_component.stdin_jobs.size; j++) { + if (NULL == (jptr = (orte_iof_job_t*)opal_pointer_array_get_item(&mca_iof_mr_hnp_component.stdin_jobs, j))) { + continue; + } + if (jptr->jdata->jobid == jdata->jobid) { + found = true; + break; + } + } + if (!found) { + jptr = OBJ_NEW(orte_iof_job_t); + OBJ_RETAIN(jdata); + jptr->jdata = jdata; + opal_bitmap_init(&jptr->xoff, jdata->num_procs); + opal_pointer_array_add(&mca_iof_mr_hnp_component.stdin_jobs, jptr); + } + } + + /* set the file descriptor to non-blocking - do this before we setup + * the sink in case it fires right away + */ + if((flags = fcntl(fd, F_GETFL, 0)) < 0) { + opal_output(orte_iof_base.iof_output, "[%s:%d]: fcntl(F_GETFL) failed with errno=%d\n", + __FILE__, __LINE__, errno); + } else { + flags |= O_NONBLOCK; + fcntl(fd, F_SETFL, flags); + } + + ORTE_IOF_SINK_DEFINE(&sink, dst_name, fd, ORTE_IOF_STDIN, + stdin_write_handler, NULL); + sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid; + sink->daemon.vpid = ORTE_PROC_MY_NAME->vpid; + + /* find the proct for this proc */ + proct = NULL; + for (item = 
opal_list_get_first(&mca_iof_mr_hnp_component.procs); + item != opal_list_get_end(&mca_iof_mr_hnp_component.procs); + item = opal_list_get_next(item)) { + ptr = (orte_iof_proc_t*)item; + if (ptr->name.jobid == dst_name->jobid && + ptr->name.vpid == dst_name->vpid) { + proct = ptr; + break; + } + } + if (NULL == proct) { + /* we don't yet have this proc in our list */ + proct = OBJ_NEW(orte_iof_proc_t); + proct->name.jobid = dst_name->jobid; + proct->name.vpid = dst_name->vpid; + opal_list_append(&mca_iof_mr_hnp_component.procs, &proct->super); + } + proct->sink = sink; + + return ORTE_SUCCESS; +} + +/* + * One of our local procs wants us to close the specifed + * stream(s), thus terminating any potential io to/from it. + */ +static int mrhnp_close(const orte_process_name_t* peer, + orte_iof_tag_t source_tag) +{ + opal_list_item_t *item, *next_item; + orte_iof_sink_t* sink; + orte_ns_cmp_bitmask_t mask; + + for (item = opal_list_get_first(&mca_iof_mr_hnp_component.sinks); + item != opal_list_get_end(&mca_iof_mr_hnp_component.sinks); + item = next_item ) { + sink = (orte_iof_sink_t*)item; + next_item = opal_list_get_next(item); + + mask = ORTE_NS_CMP_ALL; + + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &sink->name, peer) && + (source_tag & sink->tag)) { + + /* No need to delete the event or close the file + * descriptor - the destructor will automatically + * do it for us. 
+ */ + opal_list_remove_item(&mca_iof_mr_hnp_component.sinks, item); + OBJ_RELEASE(item); + break; + } + } + return ORTE_SUCCESS; +} + +static void send_data(orte_process_name_t *name, orte_iof_tag_t tag, + orte_jobid_t jobid, + unsigned char *data, int32_t nbytes) +{ + opal_buffer_t *buf; + int rc; + + buf = OBJ_NEW(opal_buffer_t); + + if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &tag, 1, ORTE_IOF_TAG))) { + ORTE_ERROR_LOG(rc); + return; + } + + if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &jobid, 1, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + return; + } + + if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, data, nbytes, OPAL_BYTE))) { + ORTE_ERROR_LOG(rc); + return; + } + + if (0 > (rc = orte_rml.send_buffer_nb(name, buf, ORTE_RML_TAG_IOF_PROXY, + 0, orte_rml_send_callback, NULL))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buf); + } +} + +static void mrhnp_complete(const orte_job_t *jdata) +{ + orte_job_t *jptr; + orte_job_map_t *map; + orte_proc_t *daemon; + orte_iof_proc_t *proct; + unsigned char data[1]; + opal_list_item_t *item; + int i; + orte_node_t *node; + + if (ORTE_JOBID_INVALID == jdata->stdout_target) { + /* nothing to do */ + return; + } + + /* the job is complete - close out the stdin + * of any procs it was feeding + */ + jptr = orte_get_job_data_object(jdata->stdout_target); + map = jptr->map; + /* cycle thru the map to find any node that has at least + * one proc from this job + */ + for (i=0; i < map->nodes->size; i++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { + continue; + } + daemon = node->daemon; + if (daemon->name.vpid == ORTE_PROC_MY_NAME->vpid) { + for (item = opal_list_get_first(&mca_iof_mr_hnp_component.procs); + item != opal_list_get_end(&mca_iof_mr_hnp_component.procs); + item = opal_list_get_next(item)) { + proct = (orte_iof_proc_t*)item; + if (proct->name.jobid == jptr->jobid) { + if (NULL != proct->sink) { + /* need to write a 0-byte event to clear the stream and close it */ + 
orte_iof_base_write_output(&proct->name, ORTE_IOF_STDIN, data, 0, proct->sink->wev); + proct->sink = NULL; + } + } + } + } else { + OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, + "%s sending close stdin to daemon %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&daemon->name))); + + /* need to send a 0-byte message to clear the stream and close it */ + send_data(&daemon->name, ORTE_IOF_STDIN, jptr->jobid, data, 0); + } + } +} + +static int finalize(void) +{ + opal_list_item_t* item; + orte_iof_write_output_t *output; + orte_iof_write_event_t *wev; + int num_written; + bool dump; + int i; + orte_job_t *jdata; + + /* check if anything is still trying to be written out */ + wev = orte_iof_base.iof_write_stdout->wev; + if (!opal_list_is_empty(&wev->outputs)) { + dump = false; + /* make one last attempt to write this out */ + while (NULL != (item = opal_list_remove_first(&wev->outputs))) { + output = (orte_iof_write_output_t*)item; + if (!dump) { + num_written = write(wev->fd, output->data, output->numbytes); + if (num_written < output->numbytes) { + /* don't retry - just cleanout the list and dump it */ + dump = true; + } + } + OBJ_RELEASE(output); + } + } + if (!orte_xml_output) { + /* we only opened stderr channel if we are NOT doing xml output */ + wev = orte_iof_base.iof_write_stderr->wev; + if (!opal_list_is_empty(&wev->outputs)) { + dump = false; + /* make one last attempt to write this out */ + while (NULL != (item = opal_list_remove_first(&wev->outputs))) { + output = (orte_iof_write_output_t*)item; + if (!dump) { + num_written = write(wev->fd, output->data, output->numbytes); + if (num_written < output->numbytes) { + /* don't retry - just cleanout the list and dump it */ + dump = true; + } + } + OBJ_RELEASE(output); + } + } + } + + orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_IOF_HNP); + + /* clear our stdin job array */ + for (i=0; i < mca_iof_mr_hnp_component.stdin_jobs.size; i++) { + if (NULL == (jdata = 
(orte_job_t*)opal_pointer_array_get_item(&mca_iof_mr_hnp_component.stdin_jobs, i))) { + continue; + } + OBJ_RELEASE(jdata); + } + OBJ_DESTRUCT(&mca_iof_mr_hnp_component.stdin_jobs); + + return ORTE_SUCCESS; +} + +int mrhnp_ft_event(int state) { + /* + * Replica doesn't need to do anything for a checkpoint + */ + return ORTE_SUCCESS; +} + + +static void stdin_write_handler(int fd, short event, void *cbdata) +{ + orte_iof_sink_t *sink = (orte_iof_sink_t*)cbdata; + orte_iof_write_event_t *wev = sink->wev; + opal_list_item_t *item; + orte_iof_write_output_t *output; + int num_written; + + OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, + "%s mrhnp:stdin:write:handler writing data to %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + wev->fd)); + + wev->pending = false; + + while (NULL != (item = opal_list_remove_first(&wev->outputs))) { + output = (orte_iof_write_output_t*)item; + /* if an abnormal termination has occurred, just dump + * this data as we are aborting + */ + if (orte_abnormal_term_ordered) { + OBJ_RELEASE(output); + continue; + } + if (0 == output->numbytes) { + /* this indicates we are to close the fd - there is + * nothing to write + */ + OPAL_OUTPUT_VERBOSE((20, orte_iof_base.iof_output, + "%s iof:mrhnp closing fd %d on write event due to zero bytes output", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), wev->fd)); + OBJ_RELEASE(wev); + sink->wev = NULL; + /* just leave - we don't want to restart the + * read event! + */ + return; + } + num_written = write(wev->fd, output->data, output->numbytes); + OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, + "%s mrhnp:stdin:write:handler wrote %d bytes", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + num_written)); + if (num_written < 0) { + if (EAGAIN == errno || EINTR == errno) { + /* push this item back on the front of the list */ + opal_list_prepend(&wev->outputs, item); + /* leave the write event running so it will call us again + * when the fd is ready. 
+ */ + wev->pending = true; + opal_event_add(wev->ev, 0); + goto CHECK; + } + /* otherwise, something bad happened so all we can do is declare an + * error and abort + */ + OBJ_RELEASE(output); + OPAL_OUTPUT_VERBOSE((20, orte_iof_base.iof_output, + "%s iof:mrhnp closing fd %d on write event due to negative bytes written", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), wev->fd)); + OBJ_RELEASE(wev); + sink->wev = NULL; + return; + } else if (num_written < output->numbytes) { + OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, + "%s mrhnp:stdin:write:handler incomplete write %d - adjusting data", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), num_written)); + /* incomplete write - adjust data to avoid duplicate output */ + memmove(output->data, &output->data[num_written], output->numbytes - num_written); + /* push this item back on the front of the list */ + opal_list_prepend(&wev->outputs, item); + /* leave the write event running so it will call us again + * when the fd is ready. + */ + wev->pending = true; + opal_event_add(wev->ev, 0); + goto CHECK; + } + OBJ_RELEASE(output); + } + +CHECK: + if (NULL != mca_iof_mr_hnp_component.stdinev && + !orte_abnormal_term_ordered && + !mca_iof_mr_hnp_component.stdinev->active) { + OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, + "read event is off - checking if okay to restart")); + /* if we have turned off the read event, check to + * see if the output list has shrunk enough to + * turn it back on + * + * RHC: Note that when multiple procs want stdin, we + * can get into a fight between a proc turnin stdin + * back "on" and other procs turning it "off". There + * is no clear way to resolve this as different procs + * may take input at different rates. 
+ */ + if (opal_list_get_size(&wev->outputs) < ORTE_IOF_MAX_INPUT_BUFFERS) { + /* restart the read */ + OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, + "restarting read event")); + mca_iof_mr_hnp_component.stdinev->active = true; + opal_event_add(mca_iof_mr_hnp_component.stdinev->ev, 0); + } + } +} diff --git a/orte/mca/iof/mr_hnp/iof_mrhnp.h b/orte/mca/iof/mr_hnp/iof_mrhnp.h new file mode 100644 index 0000000000..eb65edf088 --- /dev/null +++ b/orte/mca/iof/mr_hnp/iof_mrhnp.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2012 Los Alamos National Security, LLC. + * All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef ORTE_IOF_MRHNP_H +#define ORTE_IOF_MRHNP_H + +#include "orte_config.h" + +#ifdef HAVE_SYS_TYPES_H +#include +#endif /* HAVE_SYS_TYPES_H */ +#ifdef HAVE_SYS_UIO_H +#include +#endif /* HAVE_SYS_UIO_H */ +#ifdef HAVE_NET_UIO_H +#include +#endif /* HAVE_NET_UIO_H */ + +#include "orte/mca/iof/iof.h" +#include "orte/mca/iof/base/base.h" + + +BEGIN_C_DECLS + +/** + * IOF HNP Component + */ +typedef struct { + orte_iof_base_component_t super; + opal_list_t sinks; + opal_list_t procs; + orte_iof_read_event_t *stdinev; + opal_event_t stdinsig; + char **input_files; + opal_pointer_array_t stdin_jobs; +} orte_iof_mrhnp_component_t; + +ORTE_MODULE_DECLSPEC extern orte_iof_mrhnp_component_t mca_iof_mr_hnp_component; +extern orte_iof_base_module_t orte_iof_mrhnp_module; + +void orte_iof_mrhnp_recv(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata); + +void orte_iof_mrhnp_read_local_handler(int fd, short event, void *cbdata); +void orte_iof_mrhnp_stdin_cb(int fd, short event, void *cbdata); +bool orte_iof_mrhnp_stdin_check(int fd); + +int orte_iof_hnp_send_data_to_endpoint(orte_process_name_t *host, + orte_process_name_t *target, + orte_iof_tag_t tag, + unsigned char *data, int numbytes); + +END_C_DECLS + +#endif diff --git 
a/orte/mca/iof/mr_hnp/iof_mrhnp_component.c b/orte/mca/iof/mr_hnp/iof_mrhnp_component.c new file mode 100644 index 0000000000..4d4299b8e5 --- /dev/null +++ b/orte/mca/iof/mr_hnp/iof_mrhnp_component.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2012 Los Alamos National Security, LLC. + * All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" + +#include "opal/mca/base/base.h" +#include "opal/mca/base/mca_base_param.h" + +#include "orte/util/proc_info.h" + +#include "orte/mca/iof/base/base.h" +#include "iof_mrhnp.h" + +/* + * Local functions + */ +static int mrhnp_open(void); +static int mrhnp_close(void); +static int mrhnp_query(mca_base_module_t **module, int *priority); + +/* + * Public string showing the iof hnp component version number + */ +const char *mca_iof_mr_hnp_component_version_string = + "Open MPI mr_hnp iof MCA component version " ORTE_VERSION; + +orte_iof_mrhnp_component_t mca_iof_mr_hnp_component = { + { + /* First, the mca_base_component_t struct containing meta + information about the component itself */ + + { + ORTE_IOF_BASE_VERSION_2_0_0, + + "mr_hnp", /* MCA component name */ + ORTE_MAJOR_VERSION, /* MCA component major version */ + ORTE_MINOR_VERSION, /* MCA component minor version */ + ORTE_RELEASE_VERSION, /* MCA component release version */ + + /* Component open, close, and query functions */ + mrhnp_open, + mrhnp_close, + mrhnp_query + }, + { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + + } +}; + +/** + * component open/close/init function + */ +static int mrhnp_open(void) +{ + return ORTE_SUCCESS; +} + + +static int mrhnp_close(void) +{ + return ORTE_SUCCESS; +} + +/** + * Module query + */ + +static int mrhnp_query(mca_base_module_t **module, int *priority) +{ + mca_iof_mr_hnp_component.input_files = NULL; + + /* select if we are HNP and map-reduce mode is operational */ + if (ORTE_PROC_IS_HNP && orte_map_reduce) { + *priority 
= 1000; + *module = (mca_base_module_t *) &orte_iof_mrhnp_module; + if (NULL != orte_iof_base.input_files) { + mca_iof_mr_hnp_component.input_files = opal_argv_split(orte_iof_base.input_files, ','); + } + return ORTE_SUCCESS; + } + + *priority = -1; + *module = NULL; + return ORTE_ERROR; +} diff --git a/orte/mca/iof/mr_hnp/iof_mrhnp_read.c b/orte/mca/iof/mr_hnp/iof_mrhnp_read.c new file mode 100644 index 0000000000..464721ee58 --- /dev/null +++ b/orte/mca/iof/mr_hnp/iof_mrhnp_read.c @@ -0,0 +1,383 @@ +/* + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "orte/constants.h" + +#include +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ +#ifdef HAVE_STRING_H +#include +#endif /* HAVE_STRING_H */ + +#include "opal/dss/dss.h" + +#include "orte/mca/rml/rml.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/odls/odls_types.h" +#include "orte/util/name_fns.h" +#include "orte/mca/state/state.h" +#include "orte/runtime/orte_globals.h" + +#include "orte/mca/iof/iof.h" +#include "orte/mca/iof/base/base.h" + +#include "iof_mrhnp.h" + +static void send_data(orte_process_name_t *name, orte_iof_tag_t tag, + orte_jobid_t jobid, + unsigned char *data, int32_t nbytes); + +static void restart_stdin(int fd, short event, void *cbdata) +{ + orte_timer_t *tm = (orte_timer_t*)cbdata; + + opal_output(0, "RESTART STDIN"); + if (NULL != mca_iof_mr_hnp_component.stdinev && + !orte_job_term_ordered && + !mca_iof_mr_hnp_component.stdinev->active) { + mca_iof_mr_hnp_component.stdinev->active = true; + opal_event_add(mca_iof_mr_hnp_component.stdinev->ev, 0); + } + + /* if this was a timer callback, then release the timer */ + if (NULL != tm) { + OBJ_RELEASE(tm); + } +} + +/* return true if we should read stdin from fd, false otherwise */ +bool orte_iof_mrhnp_stdin_check(int fd) +{ +#if !defined(__WINDOWS__) && 
defined(HAVE_TCGETPGRP) + if( isatty(fd) && (getpgrp() != tcgetpgrp(fd)) ) { + return false; + } +#elif defined(__WINDOWS__) + return false; +#endif /* !defined(__WINDOWS__) */ + return true; +} + +void orte_iof_mrhnp_stdin_cb(int fd, short event, void *cbdata) +{ + bool should_process = orte_iof_mrhnp_stdin_check(0); + + if (should_process) { + mca_iof_mr_hnp_component.stdinev->active = true; + opal_event_add(mca_iof_mr_hnp_component.stdinev->ev, 0); + } else { + opal_event_del(mca_iof_mr_hnp_component.stdinev->ev); + mca_iof_mr_hnp_component.stdinev->active = false; + } +} + +/* this is the read handler for my own child procs and stdin + */ +void orte_iof_mrhnp_read_local_handler(int fd, short event, void *cbdata) +{ + orte_iof_read_event_t *rev = (orte_iof_read_event_t*)cbdata; + unsigned char data[ORTE_IOF_BASE_MSG_MAX]; + int32_t numbytes; + opal_list_item_t *item; + orte_iof_proc_t *proct; + int i, j; + orte_ns_cmp_bitmask_t mask; + orte_job_t *jdata; + orte_iof_job_t *iofjob; + orte_node_t *node; + orte_proc_t *daemon; + orte_job_map_t *map; + bool write_out=false; + + /* read up to the fragment size */ +#if !defined(__WINDOWS__) + numbytes = read(fd, data, sizeof(data)); +#else + { + DWORD readed; + HANDLE handle = (HANDLE)_get_osfhandle(fd); + ReadFile(handle, data, sizeof(data), &readed, NULL); + numbytes = (int)readed; + } +#endif /* !defined(__WINDOWS__) */ + + OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, + "%s iof:mrhnp:read handler read %d bytes from %s:%d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes, + ORTE_NAME_PRINT(&rev->name), fd)); + + if (numbytes < 0) { + /* either we have a connection error or it was a non-blocking read */ + + /* non-blocking, retry */ + if (EAGAIN == errno || EINTR == errno) { + opal_event_add(rev->ev, 0); + return; + } + + OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, + "%s iof:mrhnp:read handler %s Error on connection:%d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&rev->name), fd)); + /* 
Un-recoverable error. Allow the code to flow as usual in order to + * to send the zero bytes message up the stream, and then close the + * file descriptor and delete the event. + */ + numbytes = 0; + } + + /* if job termination has been ordered, just ignore the + * data and delete the stdin read event, if that is what fired + */ + if (orte_job_term_ordered) { + if (ORTE_IOF_STDIN & rev->tag) { + OBJ_RELEASE(mca_iof_mr_hnp_component.stdinev); + } + return; + } + + if (ORTE_IOF_STDIN & rev->tag) { + /* The event has fired, so it's no longer active until we + * re-add it + */ + mca_iof_mr_hnp_component.stdinev->active = false; + /* if this was read from my stdin, I need to send this input to all + * daemons who host mapper procs + */ + for (j=0; j < mca_iof_mr_hnp_component.stdin_jobs.size; j++) { + if (NULL == (iofjob = (orte_iof_job_t*)opal_pointer_array_get_item(&mca_iof_mr_hnp_component.stdin_jobs, j))) { + continue; + } + jdata = iofjob->jdata; + OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, + "%s read %d bytes from stdin - writing to job %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes, + ORTE_JOBID_PRINT(jdata->jobid))); + map = jdata->map; + for (i=0; i < map->nodes->size; i++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { + continue; + } + daemon = node->daemon; + + if (daemon->name.vpid == ORTE_PROC_MY_NAME->vpid) { + /* if it is me, then send the bytes down the stdin pipe + * for every local proc (they are all on my proct list) - we even send 0 byte events + * down the pipe so it forces out any preceding data before + * closing the output stream. We add a 0 byte message if + * numbytes < sizeof(data) as this means the chunk we read + * was the end of the file. 
+ */ + for (item = opal_list_get_first(&mca_iof_mr_hnp_component.procs); + item != opal_list_get_end(&mca_iof_mr_hnp_component.procs); + item = opal_list_get_next(item)) { + proct = (orte_iof_proc_t*)item; + if (proct->name.jobid == jdata->jobid) { + if (NULL == proct->sink) { + opal_output(0, "NULL SINK FOR PROC %s", ORTE_NAME_PRINT(&proct->name)); + continue; + } + if (ORTE_IOF_MAX_INPUT_BUFFERS < orte_iof_base_write_output(&proct->name, ORTE_IOF_STDIN, data, numbytes, proct->sink->wev)) { + /* getting too backed up - stop the read event for now if it is still active */ + if (mca_iof_mr_hnp_component.stdinev->active) { + OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, + "buffer backed up - holding")); + mca_iof_mr_hnp_component.stdinev->active = false; + } + return; + } + if (0 < numbytes && numbytes < (int)sizeof(data)) { + /* need to write a 0-byte event to clear the stream and close it */ + orte_iof_base_write_output(&proct->name, ORTE_IOF_STDIN, data, 0, proct->sink->wev); + proct->sink = NULL; + } + } + } + } else { + OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, + "%s sending %d bytes from stdin to daemon %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes, + ORTE_NAME_PRINT(&daemon->name))); + + /* send the data to the daemon so it can + * write it to all local procs from this job. 
+ * If the connection closed, + * numbytes will be zero so zero bytes will be + * sent - this will tell the daemon to close + * the fd for stdin to that proc + */ + send_data(&daemon->name, ORTE_IOF_STDIN, jdata->jobid, data, numbytes); + if (0 < numbytes && numbytes < (int)sizeof(data)) { + /* need to send a 0-byte message to clear the stream and close it */ + send_data(&daemon->name, ORTE_IOF_STDIN, jdata->jobid, data, 0); + } + } + } + } + /* if num_bytes was zero, then we need to terminate the event */ + if (0 == numbytes || numbytes < (int)sizeof(data)) { + /* this will also close our stdin file descriptor */ + if (NULL != mca_iof_mr_hnp_component.stdinev) { + OBJ_RELEASE(mca_iof_mr_hnp_component.stdinev); + } + } else { + /* if we are looking at a tty, then we just go ahead and restart the + * read event assuming we are not backgrounded + */ + if (orte_iof_mrhnp_stdin_check(fd)) { + restart_stdin(fd, 0, NULL); + } else { + /* delay for awhile and then restart */ + ORTE_TIMER_EVENT(0, 10000, restart_stdin, ORTE_INFO_PRI); + } + } + return; + } + + if (ORTE_IOF_STDOUT & rev->tag && 0 < numbytes) { + /* see if we need to forward this output */ + jdata = orte_get_job_data_object(rev->name.jobid); + if (ORTE_JOBID_INVALID == jdata->stdout_target) { + /* end of the chain - just output the info */ + write_out = true; + goto PROCESS; + } + /* it goes to the next job in the chain */ + jdata = orte_get_job_data_object(jdata->stdout_target); + map = jdata->map; + for (i=0; i < map->nodes->size; i++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { + continue; + } + daemon = node->daemon; + + if (daemon->name.vpid == ORTE_PROC_MY_NAME->vpid) { + /* if it is me, then send the bytes down the stdin pipe + * for every local proc (they are all on my proct list) + */ + for (item = opal_list_get_first(&mca_iof_mr_hnp_component.procs); + item != opal_list_get_end(&mca_iof_mr_hnp_component.procs); + item = opal_list_get_next(item)) { + proct = 
(orte_iof_proc_t*)item; + if (proct->name.jobid == jdata->jobid) { + if (NULL == proct->sink) { + opal_output(0, "NULL SINK FOR PROC %s", ORTE_NAME_PRINT(&proct->name)); + continue; + } + orte_iof_base_write_output(&proct->name, ORTE_IOF_STDIN, data, numbytes, proct->sink->wev); + } + } + } else { + OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, + "%s sending %d bytes from stdout of %s to daemon %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes, + ORTE_NAME_PRINT(&rev->name), + ORTE_NAME_PRINT(&daemon->name))); + + /* send the data to the daemon so it can + * write it to all local procs from this job + */ + send_data(&daemon->name, ORTE_IOF_STDIN, jdata->jobid, data, numbytes); + } + } + } + + PROCESS: + OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, + "%s read %d bytes from %s of %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes, + (ORTE_IOF_STDOUT & rev->tag) ? "stdout" : ((ORTE_IOF_STDERR & rev->tag) ? "stderr" : "stddiag"), + ORTE_NAME_PRINT(&rev->name))); + + if (0 == numbytes) { + /* if we read 0 bytes from the stdout/err/diag, find this proc + * on our list and + * release the appropriate event. This will delete the + * read event and close the file descriptor + */ + for (item = opal_list_get_first(&mca_iof_mr_hnp_component.procs); + item != opal_list_get_end(&mca_iof_mr_hnp_component.procs); + item = opal_list_get_next(item)) { + proct = (orte_iof_proc_t*)item; + mask = ORTE_NS_CMP_ALL; + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proct->name, &rev->name)) { + /* found it - release corresponding event. 
This deletes + * the read event and closes the file descriptor + */ + if (rev->tag & ORTE_IOF_STDOUT) { + OBJ_RELEASE(proct->revstdout); + } else if (rev->tag & ORTE_IOF_STDERR) { + OBJ_RELEASE(proct->revstderr); + } else if (rev->tag & ORTE_IOF_STDDIAG) { + OBJ_RELEASE(proct->revstddiag); + } + /* check to see if they are all done */ + if (NULL == proct->revstdout && + NULL == proct->revstderr && + NULL == proct->revstddiag) { + /* this proc's iof is complete */ + opal_list_remove_item(&mca_iof_mr_hnp_component.procs, item); + ORTE_ACTIVATE_PROC_STATE(&proct->name, ORTE_PROC_STATE_IOF_COMPLETE); + OBJ_RELEASE(proct); + } + break; + } + } + return; + } else { + /* output this to our local output */ + if (ORTE_IOF_STDOUT & rev->tag) { + if (write_out) { + orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, orte_iof_base.iof_write_stdout->wev); + } + } else { + orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, orte_iof_base.iof_write_stderr->wev); + } + } + + /* re-add the event */ + opal_event_add(rev->ev, 0); + + return; +} + +static void send_data(orte_process_name_t *name, orte_iof_tag_t tag, + orte_jobid_t jobid, + unsigned char *data, int32_t nbytes) +{ + opal_buffer_t *buf; + int rc; + + buf = OBJ_NEW(opal_buffer_t); + + if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &tag, 1, ORTE_IOF_TAG))) { + ORTE_ERROR_LOG(rc); + return; + } + + if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &jobid, 1, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + return; + } + + if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, data, nbytes, OPAL_BYTE))) { + ORTE_ERROR_LOG(rc); + return; + } + + if (0 > (rc = orte_rml.send_buffer_nb(name, buf, ORTE_RML_TAG_IOF_PROXY, + 0, orte_rml_send_callback, NULL))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buf); + } +} diff --git a/orte/mca/iof/mr_hnp/iof_mrhnp_receive.c b/orte/mca/iof/mr_hnp/iof_mrhnp_receive.c new file mode 100644 index 0000000000..9c25bf8169 --- /dev/null +++ b/orte/mca/iof/mr_hnp/iof_mrhnp_receive.c @@ -0,0 +1,106 @@ +/* 
+ * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "orte/constants.h" + +#include +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ +#ifdef HAVE_STRING_H +#include +#endif /* HAVE_STRING_H */ +#ifdef HAVE_FCNTL_H +#include +#else +#ifdef HAVE_SYS_FCNTL_H +#include +#endif +#endif + +#include "orte/mca/rml/rml.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/util/name_fns.h" +#include "orte/runtime/orte_globals.h" + +#include "orte/mca/iof/iof.h" +#include "orte/mca/iof/base/base.h" + +#include "iof_mrhnp.h" + + +void orte_iof_mrhnp_recv(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) +{ + orte_process_name_t origin; + unsigned char data[ORTE_IOF_BASE_MSG_MAX]; + orte_iof_tag_t stream; + int32_t count, numbytes; + int rc; + + + /* unpack the stream first as this may be flow control info */ + count = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &stream, &count, ORTE_IOF_TAG))) { + ORTE_ERROR_LOG(rc); + goto CLEAN_RETURN; + } + + if (ORTE_IOF_XON & stream) { + /* re-start the stdin read event */ + if (NULL != mca_iof_mr_hnp_component.stdinev && + !orte_job_term_ordered && + !mca_iof_mr_hnp_component.stdinev->active) { + mca_iof_mr_hnp_component.stdinev->active = true; + opal_event_add(mca_iof_mr_hnp_component.stdinev->ev, 0); + } + goto CLEAN_RETURN; + } else if (ORTE_IOF_XOFF & stream) { + /* stop the stdin read event */ + if (NULL != mca_iof_mr_hnp_component.stdinev && + !mca_iof_mr_hnp_component.stdinev->active) { + opal_event_del(mca_iof_mr_hnp_component.stdinev->ev); + mca_iof_mr_hnp_component.stdinev->active = false; + } + goto CLEAN_RETURN; + } + + /* get name of the process whose io we are discussing */ + count = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &origin, &count, ORTE_NAME))) { + ORTE_ERROR_LOG(rc); + goto 
CLEAN_RETURN; + } + + /* this must have come from a daemon forwarding output - unpack the data */ + numbytes=ORTE_IOF_BASE_MSG_MAX; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, data, &numbytes, OPAL_BYTE))) { + ORTE_ERROR_LOG(rc); + goto CLEAN_RETURN; + } + /* numbytes will contain the actual #bytes that were sent */ + + OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, + "%s unpacked %d bytes from remote proc %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes, + ORTE_NAME_PRINT(&origin))); + + /* output this to our local output */ + if (ORTE_IOF_STDOUT & stream || orte_xml_output) { + orte_iof_base_write_output(&origin, stream, data, numbytes, orte_iof_base.iof_write_stdout->wev); + } else { + orte_iof_base_write_output(&origin, stream, data, numbytes, orte_iof_base.iof_write_stderr->wev); + } + +CLEAN_RETURN: + return; +} diff --git a/orte/mca/iof/mr_orted/Makefile.am b/orte/mca/iof/mr_orted/Makefile.am new file mode 100644 index 0000000000..a31f298fe9 --- /dev/null +++ b/orte/mca/iof/mr_orted/Makefile.am @@ -0,0 +1,40 @@ +# +# Copyright (c) 2012 Los Alamos National Security, LLC. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +EXTRA_DIST = .windows + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). 
+ +if MCA_BUILD_orte_iof_mr_orted_DSO +component_noinst = +component_install = mca_iof_mr_orted.la +else +component_noinst = libmca_iof_mr_orted.la +component_install = +endif + +mr_orted_SOURCES = \ + iof_mrorted.c \ + iof_mrorted.h \ + iof_mrorted_component.c \ + iof_mrorted_read.c \ + iof_mrorted_receive.c + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_iof_mr_orted_la_SOURCES = $(mr_orted_SOURCES) +mca_iof_mr_orted_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_iof_mr_orted_la_SOURCES = $(mr_orted_SOURCES) +libmca_iof_mr_orted_la_LIBADD = +libmca_iof_mr_orted_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/iof/mr_orted/configure.m4 b/orte/mca/iof/mr_orted/configure.m4 new file mode 100644 index 0000000000..afc1f318de --- /dev/null +++ b/orte/mca/iof/mr_orted/configure.m4 @@ -0,0 +1,19 @@ +# -*- shell-script -*- +# +# Copyright (c) 2012 Los Alamos National Security, LLC. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# MCA_iof_mr_orted_CONFIG([action-if-found], [action-if-not-found]) +# ----------------------------------------------------------- +AC_DEFUN([MCA_orte_iof_mr_orted_CONFIG], [ + AC_CONFIG_FILES([orte/mca/iof/mr_orted/Makefile]) + + AS_IF([test "$orte_without_full_support" = 0], + [$1], + [$2]) +]) diff --git a/orte/mca/iof/mr_orted/iof_mrorted.c b/orte/mca/iof/mr_orted/iof_mrorted.c new file mode 100644 index 0000000000..dc278085c7 --- /dev/null +++ b/orte/mca/iof/mr_orted/iof_mrorted.c @@ -0,0 +1,464 @@ +/* + * Copyright (c) 2012 Los Alamos National Security, LLC. + * All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "opal/util/output.h" +#include "orte/constants.h" + +#include +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ +#ifdef HAVE_STRING_H +#include +#endif /* HAVE_STRING_H */ + +#ifdef HAVE_FCNTL_H +#include +#else +#ifdef HAVE_SYS_FCNTL_H +#include +#endif +#endif + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/util/name_fns.h" +#include "orte/runtime/orte_globals.h" +#include "orte/mca/odls/odls_types.h" +#include "orte/mca/rml/rml.h" + +#include "orte/mca/iof/iof.h" +#include "orte/mca/iof/base/base.h" + +#include "iof_mrorted.h" + + +/* LOCAL FUNCTIONS */ +static void stdin_write_handler(int fd, short event, void *cbdata); + + +/* API FUNCTIONS */ +static int init(void); + +static int mrorted_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag, int fd); + +static int mrorted_pull(const orte_process_name_t* src_name, + orte_iof_tag_t src_tag, + int fd); + +static int mrorted_close(const orte_process_name_t* peer, + orte_iof_tag_t source_tag); + +static void mrorted_complete(const orte_job_t *jdata); + +static int finalize(void); + +static int mrorted_ft_event(int state); + +/* The API's in this module are solely used to support LOCAL + * procs - i.e., procs that are co-located to the daemon. Output + * from local procs is automatically sent to the HNP for output + * and possible forwarding to other requestors. The HNP automatically + * determines and wires up the stdin configuration, so we don't + * have to do anything here. 
+ */ + +orte_iof_base_module_t orte_iof_mrorted_module = { + init, + mrorted_push, + mrorted_pull, + mrorted_close, + mrorted_complete, + finalize, + mrorted_ft_event +}; + +static int init(void) +{ + int rc; + + /* post a non-blocking RML receive to get messages + from the HNP IOF component */ + if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, + ORTE_RML_TAG_IOF_PROXY, + ORTE_RML_PERSISTENT, + orte_iof_mrorted_recv, + NULL))) { + ORTE_ERROR_LOG(rc); + return rc; + + } + + /* setup the local global variables */ + OBJ_CONSTRUCT(&mca_iof_mr_orted_component.sinks, opal_list_t); + OBJ_CONSTRUCT(&mca_iof_mr_orted_component.procs, opal_list_t); + + return ORTE_SUCCESS; +} + +/** + * Push data from the specified file descriptor + * to the HNP + */ + +static int mrorted_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag, int fd) +{ + int flags; + opal_list_item_t *item; + orte_iof_proc_t *proct; + orte_iof_sink_t *sink; + char *outfile; + int fdout; + orte_job_t *jobdat=NULL; + int np, numdigs; + orte_ns_cmp_bitmask_t mask; + + OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, + "%s iof:mrorted pushing fd %d for process %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + fd, ORTE_NAME_PRINT(dst_name))); + + /* set the file descriptor to non-blocking - do this before we setup + * and activate the read event in case it fires right away + */ + if ((flags = fcntl(fd, F_GETFL, 0)) < 0) { + opal_output(orte_iof_base.iof_output, "[%s:%d]: fcntl(F_GETFL) failed with errno=%d\n", + __FILE__, __LINE__, errno); + } else { + flags |= O_NONBLOCK; + fcntl(fd, F_SETFL, flags); + } + + /* do we already have this process in our list? 
*/ + for (item = opal_list_get_first(&mca_iof_mr_orted_component.procs); + item != opal_list_get_end(&mca_iof_mr_orted_component.procs); + item = opal_list_get_next(item)) { + proct = (orte_iof_proc_t*)item; + mask = ORTE_NS_CMP_ALL; + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proct->name, dst_name)) { + /* found it */ + goto SETUP; + } + } + /* if we get here, then we don't yet have this proc in our list */ + proct = OBJ_NEW(orte_iof_proc_t); + proct->name.jobid = dst_name->jobid; + proct->name.vpid = dst_name->vpid; + opal_list_append(&mca_iof_mr_orted_component.procs, &proct->super); + /* see if we are to output to a file */ + if (NULL != orte_output_filename) { + /* get the local jobdata for this proc */ + if (NULL == (jobdat = orte_get_job_data_object(proct->name.jobid))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + np = jobdat->num_procs / 10; + /* determine the number of digits required for max vpid */ + numdigs = 1; + while (np > 0) { + numdigs++; + np = np / 10; + } + /* construct the filename */ + asprintf(&outfile, "%s.%d.%0*lu", orte_output_filename, + (int)ORTE_LOCAL_JOBID(proct->name.jobid), + numdigs, (unsigned long)proct->name.vpid); + /* create the file */ + fdout = open(outfile, O_CREAT|O_RDWR|O_TRUNC, 0644); + free(outfile); + if (fdout < 0) { + /* couldn't be opened */ + ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE); + return ORTE_ERR_FILE_OPEN_FAILURE; + } + /* define a sink to that file descriptor */ + ORTE_IOF_SINK_DEFINE(&sink, dst_name, fdout, ORTE_IOF_STDOUTALL, + orte_iof_base_write_handler, + &mca_iof_mr_orted_component.sinks); + } + +SETUP: + /* define a read event but don't activate it */ + if (src_tag & ORTE_IOF_STDOUT) { + ORTE_IOF_READ_EVENT(&proct->revstdout, dst_name, fd, ORTE_IOF_STDOUT, + orte_iof_mrorted_read_handler, false); + } else if (src_tag & ORTE_IOF_STDERR) { + ORTE_IOF_READ_EVENT(&proct->revstderr, dst_name, fd, ORTE_IOF_STDERR, + orte_iof_mrorted_read_handler, false); + } else 
if (src_tag & ORTE_IOF_STDDIAG) { + ORTE_IOF_READ_EVENT(&proct->revstddiag, dst_name, fd, ORTE_IOF_STDDIAG, + orte_iof_mrorted_read_handler, false); + } + /* if -all- of the readevents for this proc have been defined, then + * activate them. Otherwise, we can think that the proc is complete + * because one of the readevents fires -prior- to all of them having + * been defined! + */ + if (NULL != proct->revstdout && NULL != proct->revstderr && NULL != proct->revstddiag) { + proct->revstdout->active = true; + opal_event_add(proct->revstdout->ev, 0); + proct->revstderr->active = true; + opal_event_add(proct->revstderr->ev, 0); + proct->revstddiag->active = true; + opal_event_add(proct->revstddiag->ev, 0); + } + return ORTE_SUCCESS; +} + + +/** + * Pull for a daemon tells + * us that any info we receive from someone that is targeted + * for stdin of the specified process should be fed down the + * indicated file descriptor. Thus, all we need to do here + * is define a local endpoint so we know where to feed anything + * that comes to us + */ + +static int mrorted_pull(const orte_process_name_t* dst_name, + orte_iof_tag_t src_tag, + int fd) +{ + orte_iof_sink_t *sink; + int flags; + orte_iof_proc_t *proct, *ptr; + opal_list_item_t *item; + + /* this is a local call - only stdin is supported */ + if (ORTE_IOF_STDIN != src_tag) { + return ORTE_ERR_NOT_SUPPORTED; + } + + OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, + "%s iof:mrorted pulling fd %d for process %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + fd, ORTE_NAME_PRINT(dst_name))); + + /* set the file descriptor to non-blocking - do this before we setup + * the sink in case it fires right away + */ + if((flags = fcntl(fd, F_GETFL, 0)) < 0) { + opal_output(orte_iof_base.iof_output, "[%s:%d]: fcntl(F_GETFL) failed with errno=%d\n", + __FILE__, __LINE__, errno); + } else { + flags |= O_NONBLOCK; + fcntl(fd, F_SETFL, flags); + } + + ORTE_IOF_SINK_DEFINE(&sink, dst_name, fd, ORTE_IOF_STDIN, + stdin_write_handler, NULL); 
+ + sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid; + sink->daemon.vpid = ORTE_PROC_MY_NAME->vpid; + + /* find the proct for this proc */ + proct = NULL; + for (item = opal_list_get_first(&mca_iof_mr_orted_component.procs); + item != opal_list_get_end(&mca_iof_mr_orted_component.procs); + item = opal_list_get_next(item)) { + ptr = (orte_iof_proc_t*)item; + if (ptr->name.jobid == dst_name->jobid && + ptr->name.vpid == dst_name->vpid) { + proct = ptr; + break; + } + } + if (NULL == proct) { + /* we don't yet have this proc in our list */ + proct = OBJ_NEW(orte_iof_proc_t); + proct->name.jobid = dst_name->jobid; + proct->name.vpid = dst_name->vpid; + opal_list_append(&mca_iof_mr_orted_component.procs, &proct->super); + } + proct->sink = sink; + + return ORTE_SUCCESS; +} + + +/* + * One of our local procs wants us to close the specifed + * stream(s), thus terminating any potential io to/from it. + * For the orted, this just means closing the local fd + */ +static int mrorted_close(const orte_process_name_t* peer, + orte_iof_tag_t source_tag) +{ + opal_list_item_t *item, *next_item; + orte_iof_sink_t* sink; + orte_ns_cmp_bitmask_t mask; + + for(item = opal_list_get_first(&mca_iof_mr_orted_component.sinks); + item != opal_list_get_end(&mca_iof_mr_orted_component.sinks); + item = next_item ) { + sink = (orte_iof_sink_t*)item; + next_item = opal_list_get_next(item); + + mask = ORTE_NS_CMP_ALL; + + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &sink->name, peer) && + (source_tag & sink->tag)) { + + /* No need to delete the event or close the file + * descriptor - the destructor will automatically + * do it for us. 
+ */ + opal_list_remove_item(&mca_iof_mr_orted_component.sinks, item); + OBJ_RELEASE(item); + break; + } + } + + return ORTE_SUCCESS; +} + +static void mrorted_complete(const orte_job_t *jdata) +{ + orte_iof_proc_t *proct; + unsigned char data[1]; + opal_list_item_t *item; + + /* the job is complete - close out the stdin + * of any procs it was feeding + */ + for (item = opal_list_get_first(&mca_iof_mr_orted_component.procs); + item != opal_list_get_end(&mca_iof_mr_orted_component.procs); + item = opal_list_get_next(item)) { + proct = (orte_iof_proc_t*)item; + if (proct->name.jobid == jdata->stdout_target) { + if (NULL == proct->sink) { + opal_output(0, "NULL SINK FOR PROC %s", ORTE_NAME_PRINT(&proct->name)); + continue; + } else { + /* need to write a 0-byte event to clear the stream and close it */ + orte_iof_base_write_output(&proct->name, ORTE_IOF_STDIN, data, 0, proct->sink->wev); + proct->sink = NULL; + } + } + } +} + + +static int finalize(void) +{ + int rc; + opal_list_item_t *item; + + while ((item = opal_list_remove_first(&mca_iof_mr_orted_component.sinks)) != NULL) { + OBJ_RELEASE(item); + } + OBJ_DESTRUCT(&mca_iof_mr_orted_component.sinks); + while ((item = opal_list_remove_first(&mca_iof_mr_orted_component.procs)) != NULL) { + OBJ_RELEASE(item); + } + OBJ_DESTRUCT(&mca_iof_mr_orted_component.procs); + /* Cancel the RML receive */ + rc = orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_IOF_PROXY); + return rc; +} + +/* + * FT event + */ + +static int mrorted_ft_event(int state) +{ + return ORTE_ERR_NOT_IMPLEMENTED; +} + +static void stdin_write_handler(int fd, short event, void *cbdata) +{ + orte_iof_sink_t *sink = (orte_iof_sink_t*)cbdata; + orte_iof_write_event_t *wev = sink->wev; + opal_list_item_t *item; + orte_iof_write_output_t *output; + int num_written; + + OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, + "%s mrorted:stdin:write:handler writing data to %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + wev->fd)); + + wev->pending = false; + + 
while (NULL != (item = opal_list_remove_first(&wev->outputs))) { + output = (orte_iof_write_output_t*)item; + if (0 == output->numbytes) { + /* this indicates we are to close the fd - there is + * nothing to write + */ + OPAL_OUTPUT_VERBOSE((20, orte_iof_base.iof_output, + "%s iof:mrorted closing fd %d on write event due to zero bytes output", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), wev->fd)); + OBJ_RELEASE(wev); + sink->wev = NULL; + return; + } + num_written = write(wev->fd, output->data, output->numbytes); + OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, + "%s mrorted:stdin:write:handler wrote %d bytes", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + num_written)); + if (num_written < 0) { + if (EAGAIN == errno || EINTR == errno) { + /* push this item back on the front of the list */ + opal_list_prepend(&wev->outputs, item); + /* leave the write event running so it will call us again + * when the fd is ready. + */ + wev->pending = true; + opal_event_add(wev->ev, 0); + goto CHECK; + } + /* otherwise, something bad happened so all we can do is declare an error */ + OBJ_RELEASE(output); + OPAL_OUTPUT_VERBOSE((20, orte_iof_base.iof_output, + "%s iof:mrorted closing fd %d on write event due to negative bytes written", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), wev->fd)); + OBJ_RELEASE(wev); + sink->wev = NULL; + return; + } else if (num_written < output->numbytes) { + OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, + "%s mrorted:stdin:write:handler incomplete write %d - adjusting data", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), num_written)); + /* incomplete write - adjust data to avoid duplicate output */ + memmove(output->data, &output->data[num_written], output->numbytes - num_written); + /* push this item back on the front of the list */ + opal_list_prepend(&wev->outputs, item); + /* leave the write event running so it will call us again + * when the fd is ready. 
+ */ + wev->pending = true; + opal_event_add(wev->ev, 0); + goto CHECK; + } + OBJ_RELEASE(output); + } + +CHECK: + if (sink->xoff) { + /* if we have told the HNP to stop reading stdin, see if + * the proc has absorbed enough to justify restart + * + * RHC: Note that when multiple procs want stdin, we + * can get into a fight between a proc turnin stdin + * back "on" and other procs turning it "off". There + * is no clear way to resolve this as different procs + * may take input at different rates. + */ + if (opal_list_get_size(&wev->outputs) < ORTE_IOF_MAX_INPUT_BUFFERS) { + /* restart the read */ + sink->xoff = false; + orte_iof_mrorted_send_xonxoff(&sink->name, ORTE_IOF_XON); + } + } +} diff --git a/orte/mca/iof/mr_orted/iof_mrorted.h b/orte/mca/iof/mr_orted/iof_mrorted.h new file mode 100644 index 0000000000..f0532a1e4e --- /dev/null +++ b/orte/mca/iof/mr_orted/iof_mrorted.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2012 Los Alamos National Security, LLC. + * All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#ifndef ORTE_IOF_MR_ORTED_H +#define ORTE_IOF_MR_ORTED_H + +#include "orte_config.h" + +#include "opal/class/opal_list.h" + +#include "orte/mca/rml/rml_types.h" + +#include "orte/mca/iof/iof.h" + +BEGIN_C_DECLS + +/** + * IOF MR_ORTED Component + */ +typedef struct { + orte_iof_base_component_t super; + opal_list_t sinks; + opal_list_t procs; +} orte_iof_mrorted_component_t; + +ORTE_MODULE_DECLSPEC extern orte_iof_mrorted_component_t mca_iof_mr_orted_component; +extern orte_iof_base_module_t orte_iof_mrorted_module; + +void orte_iof_mrorted_recv(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata); + +void orte_iof_mrorted_read_handler(int fd, short event, void *data); +void orte_iof_mrorted_send_xonxoff(orte_process_name_t *name, orte_iof_tag_t tag); + +END_C_DECLS + +#endif diff --git a/orte/mca/iof/mr_orted/iof_mrorted_component.c 
b/orte/mca/iof/mr_orted/iof_mrorted_component.c new file mode 100644 index 0000000000..ca9740c3cd --- /dev/null +++ b/orte/mca/iof/mr_orted/iof_mrorted_component.c @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2012 Los Alamos National Security, LLC. + * All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" + +#include "opal/mca/base/base.h" +#include "opal/mca/base/mca_base_param.h" + +#include "orte/util/proc_info.h" + +#include "iof_mrorted.h" + +/* + * Local functions + */ +static int mr_orted_open(void); +static int mr_orted_close(void); +static int mr_orted_query(mca_base_module_t **module, int *priority); + + +/* + * Public string showing the iof mr_orted component version number + */ +const char *mca_iof_mr_orted_component_version_string = +"Open MPI mr_orted iof MCA component version " ORTE_VERSION; + + +orte_iof_mrorted_component_t mca_iof_mr_orted_component = { + { + { + ORTE_IOF_BASE_VERSION_2_0_0, + + "mr_orted", /* MCA component name */ + ORTE_MAJOR_VERSION, /* MCA component major version */ + ORTE_MINOR_VERSION, /* MCA component minor version */ + ORTE_RELEASE_VERSION, /* MCA component release version */ + + /* Component open, close, and query functions */ + mr_orted_open, + mr_orted_close, + mr_orted_query + }, + { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + } + } +}; + +/** + * component open/close/init function + */ +static int mr_orted_open(void) +{ + /* Nothing to do */ + return ORTE_SUCCESS; +} + +static int mr_orted_close(void) +{ + return ORTE_SUCCESS; +} + + +static int mr_orted_query(mca_base_module_t **module, int *priority) +{ + if (ORTE_PROC_IS_DAEMON && orte_map_reduce) { + *priority = 1000; + *module = (mca_base_module_t *) &orte_iof_mrorted_module; + return ORTE_SUCCESS; + } + + *priority = -1; + *module = NULL; + return ORTE_ERROR; +} + diff --git a/orte/mca/iof/mr_orted/iof_mrorted_read.c 
b/orte/mca/iof/mr_orted/iof_mrorted_read.c new file mode 100644 index 0000000000..d0dd4ff4ab --- /dev/null +++ b/orte/mca/iof/mr_orted/iof_mrorted_read.c @@ -0,0 +1,281 @@ +/* + * Copyright (c) 2012 Los Alamos National Security, LLC. + * All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "orte/constants.h" + +#include +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ +#ifdef HAVE_STRING_H +#include +#endif /* HAVE_STRING_H */ + +#include "opal/dss/dss.h" + +#include "orte/mca/rml/rml.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/odls/odls_types.h" +#include "orte/util/name_fns.h" +#include "orte/mca/state/state.h" +#include "orte/runtime/orte_globals.h" + +#include "orte/mca/iof/iof.h" +#include "orte/mca/iof/base/base.h" + +#include "iof_mrorted.h" + +static void send_data(orte_process_name_t *name, orte_iof_tag_t tag, + orte_jobid_t jobid, + unsigned char *data, int32_t nbytes); + +void orte_iof_mrorted_read_handler(int fd, short event, void *cbdata) +{ + orte_iof_read_event_t *rev = (orte_iof_read_event_t*)cbdata; + unsigned char data[ORTE_IOF_BASE_MSG_MAX]; + opal_buffer_t *buf=NULL; + int rc; + int32_t numbytes; + opal_list_item_t *item; + orte_iof_proc_t *proct; + orte_ns_cmp_bitmask_t mask; + orte_job_t *jdata; + orte_job_map_t *map; + int i; + bool write_out=false; + orte_node_t *node; + orte_proc_t *daemon; + + /* read up to the fragment size */ +#if !defined(__WINDOWS__) + numbytes = read(fd, data, sizeof(data)); +#else + { + DWORD readed; + HANDLE handle = (HANDLE)_get_osfhandle(fd); + ReadFile(handle, data, sizeof(data), &readed, NULL); + numbytes = (int)readed; + } +#endif /* !defined(__WINDOWS__) */ + + OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, + "%s iof:mrorted:read handler read %d bytes from %s, fd %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + numbytes, ORTE_NAME_PRINT(&rev->name), fd)); + + if (numbytes <= 0) { + if (0 > 
numbytes) { + /* either we have a connection error or it was a non-blocking read */ + if (EAGAIN == errno || EINTR == errno) { + /* non-blocking, retry */ + opal_event_add(rev->ev, 0); + return; + } + + OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, + "%s iof:mrorted:read handler %s Error on connection:%d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&rev->name), fd)); + } + /* numbytes must have been zero, so go down and close the fd etc */ + goto CLEAN_RETURN; + } + + /* see if the user wanted the output directed to files */ + if (NULL != orte_output_filename) { + /* find the sink for this rank */ + for (item = opal_list_get_first(&mca_iof_mr_orted_component.sinks); + item != opal_list_get_end(&mca_iof_mr_orted_component.sinks); + item = opal_list_get_next(item)) { + orte_iof_sink_t *sink = (orte_iof_sink_t*)item; + /* if the target is set, then this sink is for another purpose - ignore it */ + if (ORTE_JOBID_INVALID != sink->daemon.jobid) { + continue; + } + /* if this sink isn't for output, ignore it */ + if (ORTE_IOF_STDIN & sink->tag) { + continue; + } + + mask = ORTE_NS_CMP_ALL; + + /* is this the desired proc? 
*/ + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &sink->name, &rev->name)) { + /* output to the corresponding file */ + orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, sink->wev); + /* done */ + break; + } + } + } + + if (ORTE_IOF_STDOUT & rev->tag) { + /* see if we need to forward this output */ + jdata = orte_get_job_data_object(rev->name.jobid); + if (ORTE_JOBID_INVALID == jdata->stdout_target) { + /* end of the chain - just output the info */ + write_out = true; + goto PROCESS; + } + /* it goes to the next job in the chain */ + jdata = orte_get_job_data_object(jdata->stdout_target); + map = jdata->map; + for (i=0; i < map->nodes->size; i++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { + continue; + } + daemon = node->daemon; + if (daemon->name.vpid == ORTE_PROC_MY_NAME->vpid) { + /* if it is me, then send the bytes down the stdin pipe + * for every local proc (they are all on my proct list) + */ + for (item = opal_list_get_first(&mca_iof_mr_orted_component.procs); + item != opal_list_get_end(&mca_iof_mr_orted_component.procs); + item = opal_list_get_next(item)) { + proct = (orte_iof_proc_t*)item; + if (proct->name.jobid == jdata->jobid) { + if (NULL == proct->sink) { + opal_output(0, "NULL SINK FOR PROC %s", ORTE_NAME_PRINT(&proct->name)); + continue; + } + orte_iof_base_write_output(&proct->name, ORTE_IOF_STDIN, data, numbytes, proct->sink->wev); + } + } + } else { + OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, + "%s sending %d bytes from stdout of %s to daemon %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes, + ORTE_NAME_PRINT(&rev->name), + ORTE_NAME_PRINT(&daemon->name))); + + /* send the data to the daemon so it can + * write it to all local procs from this job + */ + send_data(&daemon->name, ORTE_IOF_STDIN, jdata->jobid, data, numbytes); + } + } + } + + PROCESS: + if (write_out) { + /* prep the buffer */ + buf = OBJ_NEW(opal_buffer_t); + + /* pack the stream first - we do this so 
that flow control messages can + * consist solely of the tag + */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rev->tag, 1, ORTE_IOF_TAG))) { + ORTE_ERROR_LOG(rc); + goto CLEAN_RETURN; + } + + /* pack name of process that gave us this data */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rev->name, 1, ORTE_NAME))) { + ORTE_ERROR_LOG(rc); + goto CLEAN_RETURN; + } + + /* pack the data - only pack the #bytes we read! */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &data, numbytes, OPAL_BYTE))) { + ORTE_ERROR_LOG(rc); + goto CLEAN_RETURN; + } + + /* start non-blocking RML call to forward received data */ + OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, + "%s iof:mrorted:read handler sending %d bytes to HNP", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes)); + + orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf, ORTE_RML_TAG_IOF_HNP, + 0, orte_rml_send_callback, NULL); + } + + /* re-add the event */ + opal_event_add(rev->ev, 0); + + return; + + CLEAN_RETURN: + /* must be an error, or zero bytes were read indicating that the + * proc terminated this IOF channel - either way, find this proc + * on our list and clean up + */ + for (item = opal_list_get_first(&mca_iof_mr_orted_component.procs); + item != opal_list_get_end(&mca_iof_mr_orted_component.procs); + item = opal_list_get_next(item)) { + proct = (orte_iof_proc_t*)item; + mask = ORTE_NS_CMP_ALL; + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proct->name, &rev->name)) { + /* found it - release corresponding event. 
This deletes + * the read event and closes the file descriptor + */ + if (rev->tag & ORTE_IOF_STDOUT) { + if( NULL != proct->revstdout ) { + OBJ_RELEASE(proct->revstdout); + } + } else if (rev->tag & ORTE_IOF_STDERR) { + if( NULL != proct->revstderr ) { + OBJ_RELEASE(proct->revstderr); + } + } else if (rev->tag & ORTE_IOF_STDDIAG) { + if( NULL != proct->revstddiag ) { + OBJ_RELEASE(proct->revstddiag); + } + } + /* check to see if they are all done */ + if (NULL == proct->revstdout && + NULL == proct->revstderr && + NULL == proct->revstddiag) { + /* this proc's iof is complete */ + opal_list_remove_item(&mca_iof_mr_orted_component.procs, item); + ORTE_ACTIVATE_PROC_STATE(&proct->name, ORTE_PROC_STATE_IOF_COMPLETE); + OBJ_RELEASE(proct); + } + break; + } + } + if (NULL != buf) { + OBJ_RELEASE(buf); + } + return; +} + +static void send_data(orte_process_name_t *name, orte_iof_tag_t tag, + orte_jobid_t jobid, + unsigned char *data, int32_t nbytes) +{ + opal_buffer_t *buf; + int rc; + + buf = OBJ_NEW(opal_buffer_t); + + if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &tag, 1, ORTE_IOF_TAG))) { + ORTE_ERROR_LOG(rc); + return; + } + + if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &jobid, 1, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + return; + } + + if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, data, nbytes, OPAL_BYTE))) { + ORTE_ERROR_LOG(rc); + return; + } + + if (0 > (rc = orte_rml.send_buffer_nb(name, buf, ORTE_RML_TAG_IOF_PROXY, + 0, orte_rml_send_callback, NULL))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buf); + } +} diff --git a/orte/mca/iof/mr_orted/iof_mrorted_receive.c b/orte/mca/iof/mr_orted/iof_mrorted_receive.c new file mode 100644 index 0000000000..6aaeaca7fb --- /dev/null +++ b/orte/mca/iof/mr_orted/iof_mrorted_receive.c @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2012 Los Alamos National Security, LLC. + * All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "orte/constants.h" + +#include +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ +#ifdef HAVE_STRING_H +#include +#endif /* HAVE_STRING_H */ + +#include "orte/mca/rml/rml.h" +#include "orte/mca/rml/rml_types.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/util/name_fns.h" +#include "orte/runtime/orte_globals.h" + +#include "orte/mca/iof/iof_types.h" +#include "orte/mca/iof/base/base.h" + +#include "iof_mrorted.h" + +static void send_cb(int status, orte_process_name_t *peer, + opal_buffer_t *buf, orte_rml_tag_t tag, + void *cbdata) +{ + /* nothing to do here - just release buffer and return */ + OBJ_RELEASE(buf); +} + +void orte_iof_mrorted_send_xonxoff(orte_process_name_t *name, orte_iof_tag_t tag) +{ + opal_buffer_t *buf; + int rc; + + buf = OBJ_NEW(opal_buffer_t); + + /* pack the tag - we do this first so that flow control messages can + * consist solely of the tag + */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &tag, 1, ORTE_IOF_TAG))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buf); + return; + } + /* add the name of the proc */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &tag, 1, ORTE_IOF_TAG))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buf); + return; + } + + OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, + "%s sending %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (ORTE_IOF_XON == tag) ? 
"xon" : "xoff")); + + /* send the buffer to the HNP */ + if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf, ORTE_RML_TAG_IOF_HNP, + 0, send_cb, NULL))) { + ORTE_ERROR_LOG(rc); + } +} + +/* + * The only messages coming to an orted are either: + * + * (a) stdin, which is to be copied to whichever local + * procs "pull'd" a copy + * + * (b) flow control messages + */ +void orte_iof_mrorted_recv(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) +{ + unsigned char data[ORTE_IOF_BASE_MSG_MAX]; + orte_iof_tag_t stream; + int32_t count, numbytes; + orte_jobid_t jobid; + opal_list_item_t *item; + int rc; + + /* see what stream generated this data */ + count = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &stream, &count, ORTE_IOF_TAG))) { + ORTE_ERROR_LOG(rc); + goto CLEAN_RETURN; + } + + /* if this isn't stdin, then we have an error */ + if (ORTE_IOF_STDIN != stream) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + goto CLEAN_RETURN; + } + + /* unpack the intended target */ + count = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &jobid, &count, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + goto CLEAN_RETURN; + } + + /* unpack the data */ + numbytes=ORTE_IOF_BASE_MSG_MAX; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, data, &numbytes, OPAL_BYTE))) { + ORTE_ERROR_LOG(rc); + goto CLEAN_RETURN; + } + /* numbytes will contain the actual #bytes that were sent */ + + OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, + "%s unpacked %d bytes for local job %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes, + ORTE_JOBID_PRINT(jobid))); + + /* cycle through our list of procs */ + for (item = opal_list_get_first(&mca_iof_mr_orted_component.procs); + item != opal_list_get_end(&mca_iof_mr_orted_component.procs); + item = opal_list_get_next(item)) { + orte_iof_proc_t* sink = (orte_iof_proc_t*)item; + + /* is this intended for this jobid? 
*/ + if (jobid == sink->name.jobid) { + OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, + "%s writing data to local proc %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&sink->name))); + if (NULL == sink->sink->wev || sink->sink->wev->fd < 0) { + /* this sink was already closed - ignore this data */ + goto CLEAN_RETURN; + } + /* send the bytes down the pipe - we even send 0 byte events + * down the pipe so it forces out any preceding data before + * closing the output stream + */ + if (ORTE_IOF_MAX_INPUT_BUFFERS < orte_iof_base_write_output(&sink->name, stream, data, numbytes, sink->sink->wev)) { + /* getting too backed up - tell the HNP to hold off any more input if we + * haven't already told it + */ + if (!sink->sink->xoff) { + sink->sink->xoff = true; + orte_iof_mrorted_send_xonxoff(&sink->name, ORTE_IOF_XOFF); + } + } + } + } + +CLEAN_RETURN: + return; +} diff --git a/orte/mca/iof/orted/iof_orted.c b/orte/mca/iof/orted/iof_orted.c index dae7fef958..c35a3c01dd 100644 --- a/orte/mca/iof/orted/iof_orted.c +++ b/orte/mca/iof/orted/iof_orted.c @@ -84,6 +84,7 @@ orte_iof_base_module_t orte_iof_orted_module = { orted_push, orted_pull, orted_close, + NULL, finalize, orted_ft_event }; diff --git a/orte/mca/iof/tool/iof_tool.c b/orte/mca/iof/tool/iof_tool.c index b170d13ac4..7e1a057d64 100644 --- a/orte/mca/iof/tool/iof_tool.c +++ b/orte/mca/iof/tool/iof_tool.c @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights * reserved. 
* $COPYRIGHT$ * @@ -62,6 +62,7 @@ orte_iof_base_module_t orte_iof_tool_module = { tool_push, tool_pull, tool_close, + NULL, finalize, tool_ft_event }; diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index 1b1051556f..3ba8ad5102 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -229,12 +229,6 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data, return rc; } - /* pack the number of nodes involved in this job */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &map->num_nodes, 1, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* pack the number of procs in this launch */ if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->num_procs, 1, ORTE_VPID))) { ORTE_ERROR_LOG(rc); @@ -267,6 +261,12 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data, return rc; } + /* pack the stdout target */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->stdout_target, 1, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* pack whether or not process recovery is allowed for this job */ if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->enable_recovery, 1, OPAL_BOOL))) { ORTE_ERROR_LOG(rc); @@ -538,15 +538,10 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, goto REPORT_ERROR; } - /* unpack the number of nodes involved in this job */ + /* ensure the map object is present */ if (NULL == jdata->map) { jdata->map = OBJ_NEW(orte_job_map_t); } - cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->map->num_nodes, &cnt, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(rc); - goto REPORT_ERROR; - } /* unpack the number of procs in this launch */ cnt=1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->num_procs, &cnt, ORTE_VPID))) { @@ -579,6 +574,12 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } + /* unpack the stdout target for the 
job */ + cnt=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->stdout_target, &cnt, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + goto REPORT_ERROR; + } /* unpack whether or not process recovery is allowed for this job */ cnt=1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->enable_recovery, &cnt, OPAL_BOOL))) { @@ -1114,7 +1115,6 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata) char **argvsav=NULL; int inm, j, idx; int total_num_local_procs = 0; - orte_nid_t *nid; orte_node_t *node; orte_odls_launch_local_t *caddy = (orte_odls_launch_local_t*)cbdata; orte_job_t *jobdat; @@ -1145,25 +1145,13 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata) /* see if the mapper thinks we are oversubscribed */ oversubscribed = false; - if (ORTE_PROC_IS_HNP) { - /* just fake it - we don't keep a local nidmap */ - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - ORTE_ACTIVATE_JOB_STATE(jobdat, ORTE_JOB_STATE_FAILED_TO_LAUNCH); - goto ERROR_OUT; - } - if (node->oversubscribed) { - oversubscribed = true; - } - } else { - /* RHC: the nidmap will eventually disappear, so for now just - * make this a non-fatal error - */ - if (NULL != (nid = orte_util_lookup_nid(ORTE_PROC_MY_NAME))) { - if (nid->oversubscribed) { - oversubscribed = true; - } - } + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + ORTE_ACTIVATE_JOB_STATE(jobdat, ORTE_JOB_STATE_FAILED_TO_LAUNCH); + goto ERROR_OUT; + } + if (node->oversubscribed) { + oversubscribed = true; } #if OPAL_ENABLE_FT_CR == 1 @@ -1745,7 +1733,7 @@ void orte_odls_base_setup_singleton_jobdat(orte_jobid_t jobid) opal_dss.pack(&buffer, &vpid1, 1, ORTE_VPID); /* num_procs */ #if OPAL_HAVE_HWLOC bind_level = OPAL_HWLOC_NODE_LEVEL; - opal_dss.pack(&buffer, &bind_level, 1, OPAL_HWLOC_LEVEL_T); /* num_procs */ + opal_dss.pack(&buffer, 
&bind_level, 1, OPAL_HWLOC_LEVEL_T); /* binding level */ #endif one32 = 0; opal_dss.pack(&buffer, &one32, 1, OPAL_INT32); /* node index */ @@ -2095,6 +2083,9 @@ void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata) if (NULL == (cptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { continue; } + if (cptr->name.jobid != proc->name.jobid) { + continue; + } if (cptr->registered) { /* someone has registered, and we didn't before * terminating - this is an abnormal termination diff --git a/orte/mca/plm/alps/plm_alps_module.c b/orte/mca/plm/alps/plm_alps_module.c index 07ada275d3..4b73c8b0af 100644 --- a/orte/mca/plm/alps/plm_alps_module.c +++ b/orte/mca/plm/alps/plm_alps_module.c @@ -191,6 +191,7 @@ static void launch_daemons(int fd, short args, void *cbdata) * do it - no new daemons will be launched */ if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & state->jdata->controls) { + state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); OBJ_RELEASE(state); return; @@ -213,7 +214,7 @@ static void launch_daemons(int fd, short args, void *cbdata) * job to move to the following step */ state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; - ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED); + ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); OBJ_RELEASE(state); return; } @@ -234,7 +235,9 @@ static void launch_daemons(int fd, short args, void *cbdata) "%s plm:alps: no new daemons to launch", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; - ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED); + if (ORTE_JOB_STATE_DAEMONS_REPORTED == daemons->state) { + ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); + } OBJ_RELEASE(state); return; } @@ -404,6 +407,7 @@ static void launch_daemons(int fd, short args, void *cbdata) /* indicate that the daemons for this job 
were launched */ state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; + daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; /* flag that launch was successful, so far as we currently know */ failed_launch = false; diff --git a/orte/mca/plm/base/base.h b/orte/mca/plm/base/base.h index 6dfc351055..a5bea6a9f5 100644 --- a/orte/mca/plm/base/base.h +++ b/orte/mca/plm/base/base.h @@ -83,6 +83,7 @@ ORTE_DECLSPEC void orte_plm_base_app_report_launch(int fd, short event, void *da ORTE_DECLSPEC void orte_plm_base_receive_process_msg(int fd, short event, void *data); ORTE_DECLSPEC void orte_plm_base_setup_job(int fd, short args, void *cbdata); +ORTE_DECLSPEC void orte_plm_base_setup_job_complete(int fd, short args, void *cbdata); ORTE_DECLSPEC void orte_plm_base_complete_setup(int fd, short args, void *cbdata); ORTE_DECLSPEC void orte_plm_base_daemons_reported(int fd, short args, void *cbdata); ORTE_DECLSPEC void orte_plm_base_daemons_launched(int fd, short args, void *cbdata); diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index 5028f97920..0ac248735f 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -80,8 +80,6 @@ void orte_plm_base_daemons_reported(int fd, short args, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; - int i; - orte_job_t *jdata; #if OPAL_HAVE_HWLOC { @@ -106,21 +104,17 @@ void orte_plm_base_daemons_reported(int fd, short args, void *cbdata) if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { continue; } - node->topology = t; + if (NULL == node->topology) { + node->topology = t; + } } } } #endif - /* progress all jobs whose daemons have launched */ - for (i=1; i < orte_job_data->size; i++) { - if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) { - continue; - } - if (ORTE_JOB_STATE_DAEMONS_LAUNCHED == jdata->state) { - ORTE_ACTIVATE_JOB_STATE(jdata, 
ORTE_JOB_STATE_MAP); - } - } + /* progress the job */ + caddy->jdata->state = ORTE_JOB_STATE_DAEMONS_REPORTED; + ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_MAP); /* cleanup */ OBJ_RELEASE(caddy); @@ -213,12 +207,21 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata) free(bar2_val); /* set the job state to the next position */ - ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_ALLOCATE); + ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_INIT_COMPLETE); /* cleanup */ OBJ_RELEASE(caddy); } +void orte_plm_base_setup_job_complete(int fd, short args, void *cbdata) +{ + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + + /* nothing to do here but move along */ + ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_ALLOCATE); + OBJ_RELEASE(caddy); +} + void orte_plm_base_complete_setup(int fd, short args, void *cbdata) { orte_job_t *jdata, *jdatorted; @@ -510,12 +513,12 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender, opal_buffer_t *buffer, orte_rml_tag_t tag, void *cbdata) { - orte_process_name_t peer; char *rml_uri = NULL, *ptr; int rc, idx; orte_proc_t *daemon=NULL; char *nodename; orte_node_t *node; + orte_job_t *jdata; /* get the daemon job, if necessary */ if (NULL == jdatorted) { @@ -562,7 +565,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender, OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, "%s plm:base:orted_report_launch from daemon %s on node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer), nodename)); + ORTE_NAME_PRINT(sender), nodename)); /* look this node up, if necessary */ if (!orte_plm_globals.daemon_nodes_assigned_at_launch) { @@ -593,16 +596,29 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender, OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, "%s plm:base:orted_report_launch attempting to assign daemon %s to node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer), nodename)); + 
ORTE_NAME_PRINT(sender), nodename)); for (idx=0; idx < orte_node_pool->size; idx++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, idx))) { continue; } - if (NULL != node->daemon) { + if (node->location_verified) { /* already assigned */ continue; } if (0 == strcmp(nodename, node->name)) { + /* flag that we verified the location */ + node->location_verified = true; + if (node == daemon->node) { + /* it wound up right where it should */ + break; + } + /* remove the prior association */ + if (NULL != daemon->node) { + OBJ_RELEASE(daemon->node); + } + if (NULL != node->daemon) { + OBJ_RELEASE(node->daemon); + } /* associate this daemon with the node */ node->daemon = daemon; OBJ_RETAIN(daemon); @@ -687,8 +703,18 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender, } else { jdatorted->num_reported++; if (jdatorted->num_procs == jdatorted->num_reported) { - /* activate the daemons_reported state */ - ORTE_ACTIVATE_JOB_STATE(jdatorted, ORTE_JOB_STATE_DAEMONS_REPORTED); + jdatorted->state = ORTE_JOB_STATE_DAEMONS_REPORTED; + /* activate the daemons_reported state for all jobs + * whose daemons were launched + */ + for (idx=1; idx < orte_job_data->size; idx++) { + if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, idx))) { + continue; + } + if (ORTE_JOB_STATE_DAEMONS_LAUNCHED == jdata->state) { + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); + } + } } } @@ -776,6 +802,9 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv, opal_argv_append(argc, argv, "1"); } #endif + if (orte_map_reduce) { + opal_argv_append(argc, argv, "--mapreduce"); + } /* the following two are not mca params */ if ((int)ORTE_VPID_INVALID != orted_debug_failure) { @@ -1116,7 +1145,6 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata) node = (orte_node_t*)item; /* if this node is already in the map, skip it */ if (NULL != node->daemon) { - OBJ_RELEASE(node); continue; } /* 
add the node to the map */ @@ -1146,19 +1174,22 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata) return rc; } ++daemons->num_procs; + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, + "%s plm:base:setup_vm assigning new daemon %s to node %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name), + node->name)); + /* point the node to the daemon */ + node->daemon = proc; + OBJ_RETAIN(proc); /* maintain accounting */ + /* point the proc to the node and maintain accounting */ + proc->node = node; + proc->nodename = node->name; + OBJ_RETAIN(node); if (orte_plm_globals.daemon_nodes_assigned_at_launch) { - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:setup_vm assigning new daemon %s to node %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name), - node->name)); - /* point the node to the daemon */ - node->daemon = proc; - OBJ_RETAIN(proc); /* maintain accounting */ - /* point the proc to the node and maintain accounting */ - proc->node = node; - proc->nodename = node->name; - OBJ_RETAIN(node); + node->location_verified = true; + } else { + node->location_verified = false; } /* track number of daemons to be launched */ ++map->num_new_daemons; diff --git a/orte/mca/plm/lsf/plm_lsf_module.c b/orte/mca/plm/lsf/plm_lsf_module.c index f61bf5f51a..025d0e136a 100644 --- a/orte/mca/plm/lsf/plm_lsf_module.c +++ b/orte/mca/plm/lsf/plm_lsf_module.c @@ -192,7 +192,7 @@ static void launch_daemons(int fd, short args, void *cbdata) * job to move to the following step */ state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; - ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED); + ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); OBJ_RELEASE(state); return; } @@ -219,7 +219,9 @@ static void launch_daemons(int fd, short args, void *cbdata) "%s plm:lsf: no new daemons to launch", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; - 
ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED); + if (ORTE_JOB_STATE_DAEMONS_REPORTED == daemons->state) { + ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); + } OBJ_RELEASE(state); return; } @@ -349,6 +351,7 @@ static void launch_daemons(int fd, short args, void *cbdata) /* indicate that the daemons for this job were launched */ state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; + daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; /* flag that launch was successful, so far as we currently know */ failed_launch = false; diff --git a/orte/mca/plm/plm_types.h b/orte/mca/plm/plm_types.h index 591a27e2c1..d536509f12 100644 --- a/orte/mca/plm/plm_types.h +++ b/orte/mca/plm/plm_types.h @@ -97,18 +97,19 @@ typedef int32_t orte_job_state_t; #define ORTE_JOB_STATE_UNDEF 0 #define ORTE_JOB_STATE_INIT 1 /* ready to be assigned id */ -#define ORTE_JOB_STATE_ALLOCATE 2 /* ready to be allocated */ -#define ORTE_JOB_STATE_MAP 3 /* ready to be mapped */ -#define ORTE_JOB_STATE_SYSTEM_PREP 4 /* ready for final sanity check and system values updated */ -#define ORTE_JOB_STATE_LAUNCH_DAEMONS 5 /* ready to launch daemons */ -#define ORTE_JOB_STATE_DAEMONS_LAUNCHED 6 /* daemons for this job have been launched */ -#define ORTE_JOB_STATE_DAEMONS_REPORTED 7 /* all launched daemons have reported */ -#define ORTE_JOB_STATE_LAUNCH_APPS 8 /* ready to launch apps */ -#define ORTE_JOB_STATE_RUNNING 9 /* all procs have been fork'd */ -#define ORTE_JOB_STATE_SUSPENDED 10 /* job has been suspended */ -#define ORTE_JOB_STATE_REGISTERED 11 /* all procs registered for sync */ -#define ORTE_JOB_STATE_READY_FOR_DEBUGGERS 12 /* job ready for debugger init after spawn */ -#define ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE 13 /* all local procs have attempted launch */ +#define ORTE_JOB_STATE_INIT_COMPLETE 2 /* jobid assigned and setup */ +#define ORTE_JOB_STATE_ALLOCATE 3 /* ready to be allocated */ +#define ORTE_JOB_STATE_MAP 4 /* ready to be mapped */ +#define 
ORTE_JOB_STATE_SYSTEM_PREP 5 /* ready for final sanity check and system values updated */ +#define ORTE_JOB_STATE_LAUNCH_DAEMONS 6 /* ready to launch daemons */ +#define ORTE_JOB_STATE_DAEMONS_LAUNCHED 7 /* daemons for this job have been launched */ +#define ORTE_JOB_STATE_DAEMONS_REPORTED 8 /* all launched daemons have reported */ +#define ORTE_JOB_STATE_LAUNCH_APPS 9 /* ready to launch apps */ +#define ORTE_JOB_STATE_RUNNING 10 /* all procs have been fork'd */ +#define ORTE_JOB_STATE_SUSPENDED 11 /* job has been suspended */ +#define ORTE_JOB_STATE_REGISTERED 12 /* all procs registered for sync */ +#define ORTE_JOB_STATE_READY_FOR_DEBUGGERS 13 /* job ready for debugger init after spawn */ +#define ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE 14 /* all local procs have attempted launch */ /* * Define a "boundary" so we can easily and quickly determine diff --git a/orte/mca/plm/process/plm_process_module.c b/orte/mca/plm/process/plm_process_module.c index 84c96e50fa..1bdefbc933 100644 --- a/orte/mca/plm/process/plm_process_module.c +++ b/orte/mca/plm/process/plm_process_module.c @@ -1109,7 +1109,7 @@ static void launch_daemons(int fd, short args, void *cbdata) * job to move to the following step */ state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; - ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED); + ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); OBJ_RELEASE(state); return; } @@ -1127,7 +1127,9 @@ static void launch_daemons(int fd, short args, void *cbdata) * job to move to the following step */ state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; - ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED); + if (ORTE_JOB_STATE_DAEMONS_REPORTED == daemons->state) { + ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); + } OBJ_RELEASE(state); return; } @@ -1410,7 +1412,8 @@ static void launch_daemons(int fd, short args, void *cbdata) /* set the job state to indicate the daemons are launched */ 
state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; - + daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; + /* trigger the event to start processing the launch list */ OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:process: activating launch event", diff --git a/orte/mca/plm/rsh/plm_rsh_module.c b/orte/mca/plm/rsh/plm_rsh_module.c index 7dc51dcd04..9f5f5c6f0b 100644 --- a/orte/mca/plm/rsh/plm_rsh_module.c +++ b/orte/mca/plm/rsh/plm_rsh_module.c @@ -957,6 +957,7 @@ static void launch_daemons(int fd, short args, void *cbdata) * do it - no new daemons will be launched */ if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & state->jdata->controls) { + state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); OBJ_RELEASE(state); return; @@ -979,7 +980,7 @@ static void launch_daemons(int fd, short args, void *cbdata) * job to move to the following step */ state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; - ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED); + ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); OBJ_RELEASE(state); return; } @@ -997,7 +998,9 @@ static void launch_daemons(int fd, short args, void *cbdata) * job to move to the following step */ state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; - ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED); + if (ORTE_JOB_STATE_DAEMONS_REPORTED == daemons->state) { + ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); + } OBJ_RELEASE(state); return; } diff --git a/orte/mca/plm/slurm/plm_slurm_module.c b/orte/mca/plm/slurm/plm_slurm_module.c index ae87d150ae..e482036c97 100644 --- a/orte/mca/plm/slurm/plm_slurm_module.c +++ b/orte/mca/plm/slurm/plm_slurm_module.c @@ -199,6 +199,7 @@ static void launch_daemons(int fd, short args, void *cbdata) * do it - no new daemons will be launched */ if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & state->jdata->controls) { + state->jdata->state = 
ORTE_JOB_STATE_DAEMONS_LAUNCHED; ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); OBJ_RELEASE(state); return; @@ -221,7 +222,7 @@ static void launch_daemons(int fd, short args, void *cbdata) * job to move to the following step */ state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; - ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED); + ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); OBJ_RELEASE(state); return; } @@ -242,7 +243,9 @@ static void launch_daemons(int fd, short args, void *cbdata) "%s plm:slurm: no new daemons to launch", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; - ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED); + if (ORTE_JOB_STATE_DAEMONS_REPORTED == daemons->state) { + ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); + } OBJ_RELEASE(state); return; } @@ -407,6 +410,7 @@ static void launch_daemons(int fd, short args, void *cbdata) /* indicate that the daemons for this job were launched */ state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; + daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; /* flag that launch was successful, so far as we currently know */ failed_launch = false; diff --git a/orte/mca/plm/tm/plm_tm_module.c b/orte/mca/plm/tm/plm_tm_module.c index 1755024c26..84c7e9a42b 100644 --- a/orte/mca/plm/tm/plm_tm_module.c +++ b/orte/mca/plm/tm/plm_tm_module.c @@ -195,6 +195,7 @@ static void launch_daemons(int fd, short args, void *cbdata) * do it - no new daemons will be launched */ if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & jdata->controls) { + jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); OBJ_RELEASE(state); return; @@ -217,7 +218,7 @@ static void launch_daemons(int fd, short args, void *cbdata) * job to move to the following step */ jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; - ORTE_ACTIVATE_JOB_STATE(daemons, 
ORTE_JOB_STATE_DAEMONS_REPORTED); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); OBJ_RELEASE(state); return; } @@ -235,7 +236,9 @@ static void launch_daemons(int fd, short args, void *cbdata) * job to move to the following step */ jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; - ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED); + if (ORTE_JOB_STATE_DAEMONS_REPORTED == daemons->state) { + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); + } OBJ_RELEASE(state); return; } @@ -408,6 +411,7 @@ static void launch_daemons(int fd, short args, void *cbdata) /* indicate that the daemons for this job were launched */ state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; + daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; /* flag that launch was successful, so far as we currently know */ failed_launch = false; diff --git a/orte/mca/state/base/state_base_fns.c b/orte/mca/state/base/state_base_fns.c index 5cabf09c25..bb2772d65b 100644 --- a/orte/mca/state/base/state_base_fns.c +++ b/orte/mca/state/base/state_base_fns.c @@ -43,10 +43,10 @@ void orte_state_base_activate_job_state(orte_job_t *jdata, } if (s->job_state == state) { OPAL_OUTPUT_VERBOSE((1, orte_state_base_output, - "%s ACTIVATING JOB %s STATE %s", + "%s ACTIVATING JOB %s STATE %s PRI %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid), - orte_job_state_to_str(state))); + orte_job_state_to_str(state), s->priority)); if (NULL == s->cbfunc) { OPAL_OUTPUT_VERBOSE((1, orte_state_base_output, "%s NULL CBFUNC FOR JOB %s STATE %s", @@ -90,6 +90,11 @@ void orte_state_base_activate_job_state(orte_job_t *jdata, caddy->job_state = state; OBJ_RETAIN(jdata); } + OPAL_OUTPUT_VERBOSE((1, orte_state_base_output, + "%s ACTIVATING JOB %s STATE %s PRI %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == jdata) ? 
"NULL" : ORTE_JOBID_PRINT(jdata->jobid), + orte_job_state_to_str(state), s->priority)); opal_event_set(orte_event_base, &caddy->ev, -1, OPAL_EV_WRITE, s->cbfunc, caddy); opal_event_set_priority(&caddy->ev, s->priority); opal_event_active(&caddy->ev, OPAL_EV_WRITE, 1); @@ -217,10 +222,10 @@ void orte_state_base_activate_proc_state(orte_process_name_t *proc, } if (s->proc_state == state) { OPAL_OUTPUT_VERBOSE((1, orte_state_base_output, - "%s ACTIVATING PROC %s STATE %s", + "%s ACTIVATING PROC %s STATE %s PRI %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), - orte_proc_state_to_str(state))); + orte_proc_state_to_str(state), s->priority)); if (NULL == s->cbfunc) { OPAL_OUTPUT_VERBOSE((1, orte_state_base_output, "%s NULL CBFUNC FOR PROC %s STATE %s", @@ -258,6 +263,11 @@ void orte_state_base_activate_proc_state(orte_process_name_t *proc, caddy = OBJ_NEW(orte_state_caddy_t); caddy->name = *proc; caddy->proc_state = state; + OPAL_OUTPUT_VERBOSE((1, orte_state_base_output, + "%s ACTIVATING PROC %s STATE %s PRI %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), + orte_proc_state_to_str(state), s->priority)); opal_event_set(orte_event_base, &caddy->ev, -1, OPAL_EV_WRITE, s->cbfunc, caddy); opal_event_set_priority(&caddy->ev, s->priority); opal_event_active(&caddy->ev, OPAL_EV_WRITE, 1); diff --git a/orte/mca/state/hnp/state_hnp.c b/orte/mca/state/hnp/state_hnp.c index ff26e5429b..da87006e9a 100644 --- a/orte/mca/state/hnp/state_hnp.c +++ b/orte/mca/state/hnp/state_hnp.c @@ -86,6 +86,7 @@ static void report_progress(int fd, short argc, void *cbdata); */ static orte_job_state_t launch_states[] = { ORTE_JOB_STATE_INIT, + ORTE_JOB_STATE_INIT_COMPLETE, ORTE_JOB_STATE_ALLOCATE, ORTE_JOB_STATE_DAEMONS_LAUNCHED, ORTE_JOB_STATE_DAEMONS_REPORTED, @@ -102,6 +103,7 @@ static orte_job_state_t launch_states[] = { }; static orte_state_cbfunc_t launch_callbacks[] = { orte_plm_base_setup_job, + orte_plm_base_setup_job_complete, orte_ras_base_allocate, 
orte_plm_base_daemons_launched, orte_plm_base_daemons_reported, @@ -372,6 +374,11 @@ static void check_all_complete(int fd, short args, void *cbdata) /* turn off any sensor monitors on this job */ orte_sensor.stop(jdata->jobid); + /* tell the IOF that the job is complete */ + if (NULL != orte_iof.complete) { + orte_iof.complete(jdata); + } + if (0 < jdata->num_non_zero_exit && !orte_abort_non_zero_exit) { if (!orte_report_child_jobs_separately || 1 == ORTE_LOCAL_JOBID(jdata->jobid)) { /* update the exit code */ diff --git a/orte/mca/state/orted/state_orted.c b/orte/mca/state/orted/state_orted.c index b0cd59a759..2d9cea45e1 100644 --- a/orte/mca/state/orted/state_orted.c +++ b/orte/mca/state/orted/state_orted.c @@ -160,6 +160,8 @@ static void track_jobs(int fd, short argc, void *cbdata) int rc; if (ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE == caddy->job_state) { +opal_output(0, "%s state:orted:track_jobs sending local launch complete for job %s", +ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(caddy->jdata->jobid)); /* update the HNP with all proc states for this job */ alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ @@ -281,14 +283,6 @@ static void track_procs(int fd, short argc, void *cbdata) * while we are still trying to notify the HNP of * successful launch for short-lived procs */ - /* Release only the stdin IOF file descriptor for this child, if one - * was defined. File descriptors for the other IOF channels - stdout, - * stderr, and stddiag - were released when their associated pipes - * were cleared and closed due to termination of the process - */ - if (NULL != orte_iof.close) { - orte_iof.close(proc, ORTE_IOF_STDIN); - } pdata->iof_complete = true; if (pdata->waitpid_recvd) { /* the proc has terminated */ @@ -325,6 +319,16 @@ static void track_procs(int fd, short argc, void *cbdata) } } } + /* Release the stdin IOF file descriptor for this child, if one + * was defined. 
File descriptors for the other IOF channels - stdout, + * stderr, and stddiag - were released when their associated pipes + * were cleared and closed due to termination of the process + * Do this after we handle termination in case the IOF needs + * to check to see if all procs from the job are actually terminated + */ + if (NULL != orte_iof.close) { + orte_iof.close(proc, ORTE_IOF_STDIN); + } } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) { /* do NOT update the proc state as this can hit * while we are still trying to notify the HNP of diff --git a/orte/orted/orted_main.c b/orte/orted/orted_main.c index 162405b3d7..4a2df98302 100644 --- a/orte/orted/orted_main.c +++ b/orte/orted/orted_main.c @@ -116,6 +116,7 @@ static struct { int fail; int fail_delay; bool abort; + bool mapreduce; } orted_globals; /* @@ -205,6 +206,10 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = { "Nodes in cluster may differ in topology, so send the topology back from each node [Default = false]" }, #endif + { NULL, NULL, NULL, '\0', "mapreduce", "mapreduce", 0, + &orted_globals.mapreduce, OPAL_CMD_LINE_TYPE_BOOL, + "Whether to report process bindings to stderr" }, + /* End of list */ { NULL, NULL, NULL, '\0', NULL, NULL, 0, NULL, OPAL_CMD_LINE_TYPE_NULL, NULL } @@ -327,6 +332,11 @@ int orte_daemon(int argc, char *argv[]) #endif tmp_env_var = NULL; /* Silence compiler warning */ + /* if mapreduce set, flag it */ + if (orted_globals.mapreduce) { + orte_map_reduce = true; + } + /* Set the flag telling OpenRTE that I am NOT a * singleton, but am "infrastructure" - prevents setting * up incorrect infrastructure that only a singleton would diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index 642aba9b69..ca0e4c964a 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -124,6 +124,7 @@ opal_pointer_array_t *orte_job_data; opal_pointer_array_t *orte_node_pool; opal_pointer_array_t *orte_node_topologies; opal_pointer_array_t *orte_local_children; 
+uint16_t orte_num_jobs = 0; /* Nidmap and job maps */ opal_pointer_array_t orte_nidmap; @@ -166,9 +167,6 @@ bool orte_do_not_barrier = false; bool orte_enable_recovery; int32_t orte_max_restarts; -/* comm fn for updating state */ -orte_default_comm_fn_t orte_comm; - /* exit status reporting */ bool orte_report_child_jobs_separately; struct timeval orte_child_time_to_exit; @@ -183,6 +181,9 @@ char *orte_forward_envars = NULL; /* preload binaries */ bool orte_preload_binaries = false; +/* map-reduce mode */ +bool orte_map_reduce = false; + /* map stddiag output to stderr so it isn't forwarded to mpirun */ bool orte_map_stddiag_to_stderr = false; @@ -637,6 +638,7 @@ static void orte_job_construct(orte_job_t* job) job->num_apps = 0; job->controls = ORTE_JOB_CONTROL_FORWARD_OUTPUT; job->stdin_target = ORTE_VPID_INVALID; + job->stdout_target = ORTE_JOBID_INVALID; job->total_slots_alloc = 0; job->num_procs = 0; job->procs = OBJ_NEW(opal_pointer_array_t); @@ -758,6 +760,7 @@ static void orte_node_construct(orte_node_t* node) node->index = -1; node->daemon = NULL; node->daemon_launched = false; + node->location_verified = false; node->launch_id = -1; node->num_procs = 0; diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index 51d3268c13..e5c7322f13 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -208,7 +208,10 @@ typedef uint16_t orte_job_controls_t; #define ORTE_JOB_CONTROL_SPIN_FOR_DEBUG 0x0100 #define ORTE_JOB_CONTROL_RESTART 0x0200 #define ORTE_JOB_CONTROL_PROCS_MIGRATING 0x0400 - +#define ORTE_JOB_CONTROL_MAPPER 0x0800 +#define ORTE_JOB_CONTROL_REDUCER 0x1000 +#define ORTE_JOB_CONTROL_COMBINER 0x2000 + /* global type definitions used by RTE - instanced in orte_globals.c */ /************ @@ -293,6 +296,11 @@ typedef struct { struct orte_proc_t *daemon; /* whether or not this daemon has been launched */ bool daemon_launched; + /* whether or not the location has been verified - used + * for environments where the 
daemon's final destination + * is uncertain + */ + bool location_verified; /** Launch id - needed by some systems to launch a proc on this node */ int32_t launch_id; /** number of procs on this node */ @@ -359,6 +367,8 @@ typedef struct { * (wildcard), or none (invalid) */ orte_vpid_t stdin_target; + /* job that is to receive the stdout (on its stdin) from this one */ + orte_jobid_t stdout_target; /* collective ids */ orte_grpcomm_coll_id_t peer_modex; orte_grpcomm_coll_id_t peer_init_barrier; @@ -635,6 +645,7 @@ ORTE_DECLSPEC extern opal_pointer_array_t *orte_job_data; ORTE_DECLSPEC extern opal_pointer_array_t *orte_node_pool; ORTE_DECLSPEC extern opal_pointer_array_t *orte_node_topologies; ORTE_DECLSPEC extern opal_pointer_array_t *orte_local_children; +ORTE_DECLSPEC extern uint16_t orte_num_jobs; /* Nidmap and job maps */ ORTE_DECLSPEC extern opal_pointer_array_t orte_nidmap; @@ -673,14 +684,6 @@ ORTE_DECLSPEC extern int32_t orte_max_restarts; /* barrier control */ ORTE_DECLSPEC extern bool orte_do_not_barrier; -/* comm interface */ -typedef void (*orte_default_cbfunc_t)(int fd, short event, void *data); - -typedef int (*orte_default_comm_fn_t)(orte_process_name_t *recipient, - opal_buffer_t *buf, - orte_rml_tag_t tag, - orte_default_cbfunc_t cbfunc); - /* exit status reporting */ ORTE_DECLSPEC extern bool orte_report_child_jobs_separately; ORTE_DECLSPEC extern struct timeval orte_child_time_to_exit; @@ -695,6 +698,9 @@ ORTE_DECLSPEC extern char *orte_forward_envars; /* preload binaries */ ORTE_DECLSPEC extern bool orte_preload_binaries; +/* map-reduce mode */ +ORTE_DECLSPEC extern bool orte_map_reduce; + /* map stddiag output to stderr so it isn't forwarded to mpirun */ ORTE_DECLSPEC extern bool orte_map_stddiag_to_stderr; diff --git a/orte/tools/Makefile.am b/orte/tools/Makefile.am index b2b7db55a6..84218c8033 100644 --- a/orte/tools/Makefile.am +++ b/orte/tools/Makefile.am @@ -35,7 +35,8 @@ SUBDIRS += \ tools/wrappers \ tools/orte-top \ tools/orte-info \ - 
tools/orte-migrate + tools/orte-migrate \ + tools/mapreduce DIST_SUBDIRS += \ tools/orte-checkpoint \ @@ -47,5 +48,6 @@ DIST_SUBDIRS += \ tools/wrappers \ tools/orte-top \ tools/orte-info \ - tools/orte-migrate + tools/orte-migrate \ + tools/mapreduce diff --git a/orte/tools/mapreduce/Makefile.am b/orte/tools/mapreduce/Makefile.am new file mode 100644 index 0000000000..707194d606 --- /dev/null +++ b/orte/tools/mapreduce/Makefile.am @@ -0,0 +1,40 @@ +# +# Copyright (c) 2012 Los Alamos National Security, LLC. All rights +# reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +include $(top_srcdir)/Makefile.man-page-rules + +man_pages = mapreduce.1 +EXTRA_DIST = $(man_pages:.1=.1in) + +if !ORTE_DISABLE_FULL_SUPPORT +if OMPI_INSTALL_BINARIES + +bin_PROGRAMS = mapreduce + +nodist_man_MANS = $(man_pages) + +# Ensure that the man pages are rebuilt if the opal_config.h file +# changes; a "good enough" way to know if configure was run again (and +# therefore the release date or version may have changed) +$(nodist_man_MANS): $(top_builddir)/opal/include/opal_config.h + +dist_pkgdata_DATA = help-mapreduce.txt + +endif # OMPI_INSTALL_BINARIES + +mapreduce_SOURCES = \ + mapreduce.c + +mapreduce_LDADD = $(top_builddir)/orte/libopen-rte.la + +endif # !ORTE_DISABLE_FULL_SUPPORT + +distclean-local: + rm -f $(man_pages) diff --git a/orte/tools/mapreduce/help-mapreduce.txt b/orte/tools/mapreduce/help-mapreduce.txt new file mode 100644 index 0000000000..e52dbdc624 --- /dev/null +++ b/orte/tools/mapreduce/help-mapreduce.txt @@ -0,0 +1,627 @@ +# -*- text -*- +# +# Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. 
All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2012 Oak Ridge National Labs. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is the US/English general help file for Open RTE's orterun. +# +[orterun:init-failure] +Open RTE was unable to initialize properly. The error occurred while +attempting to %s. Returned value %d instead of ORTE_SUCCESS. +[orterun:usage] +%s (%s) %s + +Usage: %s [OPTION]... [PROGRAM]... +Start the given program using Open RTE + +%s + +Report bugs to %s +[orterun:version] +%s (%s) %s + +Report bugs to %s +[orterun:allocate-resources] +%s was unable to allocate enough resources to start your application. +This might be a transient error (too many nodes in the cluster were +unavailable at the time of the request) or a permanent error (you +requested more nodes than exist in your cluster). + +While probably only useful to Open RTE developers, the error returned +was %d. +[orterun:error-spawning] +%s was unable to start the specified application. An attempt has been +made to clean up all processes that did start. The error returned was +%d. +[orterun:appfile-not-found] +Unable to open the appfile: + + %s + +Double check that this file exists and is readable. +[orterun:executable-not-specified] +No executable was specified on the %s command line. + +Aborting. +[orterun:multi-apps-and-zero-np] +%s found multiple applications specified on the command line, with +at least one that failed to specify the number of processes to execute. +When specifying multiple applications, you must specify how many processes +of each to launch via the -np argument. +[orterun:nothing-to-do] +%s could not find anything to do. + +It is possible that you forgot to specify how many processes to run +via the "-np" argument. 
+[orterun:call-failed] +%s encountered a %s call failure. This should not happen, and +usually indicates an error within the operating system itself. +Specifically, the following error occurred: + + %s + +The only other available information that may be helpful is the errno +that was returned: %d. +[orterun:environ] +%s was unable to set + %s = %s +in the environment. Returned value %d instead of ORTE_SUCCESS. +[orterun:precondition] +%s was unable to precondition transports +Returned value %d instead of ORTE_SUCCESS. +[orterun:attr-failed] +%s was unable to define an attribute +Returned value %d instead of ORTE_SUCCESS. +# +[orterun:proc-ordered-abort] +%s has exited due to process rank %lu with PID %lu on +node %s calling "abort". This may have caused other processes +in the application to be terminated by signals sent by %s +(as reported here). +# +[orterun:proc-exit-no-sync] +%s has exited due to process rank %lu with PID %lu on +node %s exiting improperly. There are three reasons this could occur: + +1. this process did not call "init" before exiting, but others in +the job did. This can cause a job to hang indefinitely while it waits +for all processes to call "init". By rule, if one process calls "init", +then ALL processes must call "init" prior to termination. + +2. this process called "init", but exited without calling "finalize". +By rule, all processes that call "init" MUST call "finalize" prior to +exiting or it will be considered an "abnormal termination" + +3. this process called "MPI_Abort" or "orte_abort" and the mca parameter +orte_create_session_dirs is set to false. In this case, the run-time cannot +detect that the abort call was an abnormal termination. Hence, the only +error message you will receive is this one. + +This may have caused other processes in the application to be +terminated by signals sent by %s (as reported here). + +You can avoid this message by specifying -quiet on the %s command line. 
+ +# +[orterun:proc-exit-no-sync-unknown] +%s has exited due to a process exiting without calling "finalize", +but has no info as to the process that caused that situation. This +may have caused other processes in the application to be +terminated by signals sent by %s (as reported here). +# +[orterun:proc-aborted] +%s noticed that process rank %lu with PID %lu on node %s exited on signal %d. +# +[orterun:proc-aborted-unknown] +%s noticed that the job aborted, but has no info as to the process +that caused that situation. +# +[orterun:proc-aborted-signal-unknown] +%s noticed that the job aborted by signal, but has no info as +to the process that caused that situation. +# +[orterun:proc-aborted-strsignal] +%s noticed that process rank %lu with PID %lu on node %s exited on signal %d (%s). +# +[orterun:abnormal-exit] +WARNING: %s has exited before it received notification that all +started processes had terminated. You should double check and ensure +that there are no runaway processes still executing. +# +[orterun:sigint-while-processing] +WARNING: %s is in the process of killing a job, but has detected an +interruption (probably control-C). + +It is dangerous to interrupt %s while it is killing a job (proper +termination may not be guaranteed). Hit control-C again within 1 +second if you really want to kill %s immediately. +# +[orterun:double-prefix] +Both a prefix was supplied to %s and the absolute path to %s was +given: + + Prefix: %s + Path: %s + +Only one should be specified to avoid potential version +confusion. Operation will continue, but the -prefix option will be +used. This is done to allow you to select a different prefix for +the backend computation nodes than used on the frontend for %s. +# +[orterun:app-prefix-conflict] +Both a prefix or absolute path was given for %s, and a different +prefix provided for the first app_context: + + Mpirun prefix: %s + App prefix: %s + +Only one should be specified to avoid potential version +confusion. 
Operation will continue, but the application's prefix +option will be ignored. +# +[orterun:empty-prefix] +A prefix was supplied to %s that only contained slashes. + +This is a fatal error; %s will now abort. No processes were launched. +# +[debugger-mca-param-not-found] +Internal error -- the orte_base_user_debugger MCA parameter was not able to +be found. Please contact the Open RTE developers; this should not +happen. +# +[debugger-orte_base_user_debugger-empty] +The MCA parameter "orte_base_user_debugger" was empty, indicating that +no user-level debuggers have been defined. Please set this MCA +parameter to a value and try again. +# +[debugger-not-found] +A suitable debugger could not be found in your PATH. Check the values +specified in the orte_base_user_debugger MCA parameter for the list of +debuggers that was searched. +# +[debugger-exec-failed] +%s was unable to launch the specified debugger. This is what was +launched: + + %s + +Things to check: + + - Ensure that the debugger is installed properly + - Ensure that the "%s" executable is in your path + - Ensure that any required licenses are available to run the debugger +# +[orterun:sys-limit-pipe] +%s was unable to launch the specified application as it encountered an error: + +Error: system limit exceeded on number of pipes that can be open +Node: %s + +when attempting to start process rank %lu. + +This can be resolved by setting the mca parameter opal_set_max_sys_limits to 1, +increasing your limit descriptor setting (using limit or ulimit commands), +asking the system administrator for that node to increase the system limit, or +by rearranging your processes to place fewer of them on that node.
+# +[orterun:sys-limit-sockets] +Error: system limit exceeded on number of network connections that can be open + +This can be resolved by setting the mca parameter opal_set_max_sys_limits to 1, +increasing your limit descriptor setting (using limit or ulimit commands), +or asking the system administrator to increase the system limit. +# +[orterun:pipe-setup-failure] +%s was unable to launch the specified application as it encountered an error: + +Error: pipe function call failed when setting up I/O forwarding subsystem +Node: %s + +while attempting to start process rank %lu. +# +[orterun:sys-limit-children] +%s was unable to launch the specified application as it encountered an error: + +Error: system limit exceeded on number of processes that can be started +Node: %s + +when attempting to start process rank %lu. + +This can be resolved by either asking the system administrator for that node to +increase the system limit, or by rearranging your processes to place fewer of them +on that node. +# +[orterun:failed-term-attrs] +%s was unable to launch the specified application as it encountered an error: + +Error: reading tty attributes function call failed while setting up I/O forwarding system +Node: %s + +while attempting to start process rank %lu. +# +[orterun:wdir-not-found] +%s was unable to launch the specified application as it could not +change to the specified working directory: + +Working directory: %s +Node: %s + +while attempting to start process rank %lu. +# +[orterun:exe-not-found] +%s was unable to find the specified executable file, and therefore +did not launch the job. This error was first reported for process +rank %lu; it may have occurred for other processes as well. + +NOTE: A common cause for this error is misspelling a %s command + line parameter option (remember that %s interprets the first + unrecognized command line token as the executable). 
+ +Node: %s +Executable: %s +# +[orterun:exe-not-accessible] +%s was unable to launch the specified application as it could not access +or execute an executable: + +Executable: %s +Node: %s + +while attempting to start process rank %lu. +# +[orterun:pipe-read-failure] +%s was unable to launch the specified application as it encountered an error: + +Error: reading from a pipe function call failed while spawning a local process +Node: %s + +while attempting to start process rank %lu. +# +[orterun:proc-failed-to-start] +%s was unable to start the specified application as it encountered an +error: + +Error name: %s +Node: %s + +when attempting to start process rank %lu. +# +[orterun:proc-socket-not-avail] +%s was unable to start the specified application as it encountered an +error: + +Error name: %s +Node: %s + +when attempting to start process rank %lu. +# +[orterun:proc-failed-to-start-no-status] +%s was unable to start the specified application as it encountered an +error on node %s. More information may be available above. +# +[orterun:proc-failed-to-start-no-status-no-node] +%s was unable to start the specified application as it encountered an +error. More information may be available above. +# +[debugger requires -np] +The number of MPI processes to launch was not specified on the command +line. + +The %s debugger requires that you specify a number of MPI processes to +launch on the command line via the "-np" command line parameter. For +example: + + %s -np 4 %s + +Skipping the %s debugger for now. +# +[debugger requires executable] +The %s debugger requires that you specify an executable on the %s +command line; you cannot specify application context files when +launching this job in the %s debugger. For example: + + %s -np 4 my_mpi_executable + +Skipping the %s debugger for now. 
+# +[debugger only accepts single app] +The %s debugger only accepts SPMD-style launching; specifying an +MPMD-style launch (with multiple applications separated via ':') is +not permitted. + +Skipping the %s debugger for now. +# +[orterun:daemon-died-during-execution] +%s has detected that a required daemon terminated during execution +of the application with a non-zero status. This is a fatal error. +A best-effort attempt has been made to cleanup. However, it is +-strongly- recommended that you execute the orte-clean utility +to ensure full cleanup is accomplished. +# +[orterun:no-orted-object-exit] +%s was unable to determine the status of the daemons used to +launch this application. Additional manual cleanup may be required. +Please refer to the "orte-clean" tool for assistance. +# +[orterun:unclean-exit] +%s was unable to cleanly terminate the daemons on the nodes shown +below. Additional manual cleanup may be required - please refer to +the "orte-clean" tool for assistance. +# +[orterun:event-def-failed] +%s was unable to define an event required for proper operation of +the system. The reason for this error was: + +Error: %s + +Please report this to the Open MPI mailing list users@open-mpi.org. +# +[orterun:ompi-server-filename-bad] +%s was unable to parse the filename where contact info for the +ompi-server was to be found. The option we were given was: + +--ompi-server %s + +This appears to be missing the required ':' following the +keyword "file". Please remember that the correct format for this +command line option is: + +--ompi-server file:path-to-file + +where path-to-file can be either relative to the cwd or absolute. +# +[orterun:ompi-server-filename-missing] +%s was unable to parse the filename where contact info for the +ompi-server was to be found. The option we were given was: + +--ompi-server %s + +This appears to be missing a filename following the ':'. 
Please +remember that the correct format for this command line option is: + +--ompi-server file:path-to-file + +where path-to-file can be either relative to the cwd or absolute. +# +[orterun:ompi-server-filename-access] +%s was unable to access the filename where contact info for the +ompi-server was to be found. The option we were given was: + +--ompi-server %s + +Please remember that the correct format for this command line option is: + +--ompi-server file:path-to-file + +where path-to-file can be either relative to the cwd or absolute, and that +you must have read access permissions to that file. +# +[orterun:ompi-server-file-bad] +%s was unable to read the ompi-server's contact info from the +given filename. The filename we were given was: + +FILE: %s + +Please remember that the correct format for this command line option is: + +--ompi-server file:path-to-file + +where path-to-file can be either relative to the cwd or absolute, and that +the file must have a single line in it that contains the Open MPI +uri for the ompi-server. Note that this is *not* a standard uri, but +a special format used internally by Open MPI for communications. It can +best be generated by simply directing the ompi-server to put its +uri in a file, and then giving %s that filename. +[orterun:multiple-hostfiles] +Error: More than one hostfile was passed for a single application +context, which is not supported at this time. +# +[orterun:conflicting-params] +%s has detected multiple instances of an MCA param being specified on +the command line, with conflicting values: + +MCA param: %s +Value 1: %s +Value 2: %s + +This MCA param does not support multiple values, and the system is unable +to identify which value was intended. If this was done in error, please +re-issue the command with only one value. You may wish to review the +output from ompi_info for guidance on accepted values for this param. 
+ +[orterun:server-not-found] +%s was instructed to wait for the requested ompi-server, but was unable to +establish contact with the server during the specified wait time: + +Server uri: %s +Timeout time: %ld + +Error received: %s + +Please check to ensure that the requested server matches the actual server +information, and that the server is in operation. +# +[orterun:ompi-server-pid-bad] +%s was unable to parse the PID of the %s to be used as the ompi-server. +The option we were given was: + +--ompi-server %s + +Please remember that the correct format for this command line option is: + +--ompi-server PID:pid-of-%s + +where PID can be either "PID" or "pid". +# +[orterun:ompi-server-could-not-get-hnp-list] +%s was unable to search the list of local %s contact files to find the +specified pid. You might check to see if your local session directory +is available and that you have read permissions on the top of that +directory tree. +# +[orterun:ompi-server-pid-not-found] +%s was unable to find an %s with the specified pid of %d that was to +be used as the ompi-server. The option we were given was: + +--ompi-server %s + +Please remember that the correct format for this command line option is: + +--ompi-server PID:pid-of-%s + +where PID can be either "PID" or "pid". +# +[orterun:write_file] +%s was unable to open a file to printout %s as requested. The file +name given was: + +File: %s +# +[orterun:multiple-paffinity-schemes] +Multiple processor affinity schemes were specified (can only specify +one): + +Slot list: %s +opal_paffinity_alone: true + +Please specify only the one desired method. +# +[orterun:slot-list-failed] +We were unable to successfully process/set the requested processor +affinity settings: + +Specified slot list: %s +Error: %s + +This could mean that a non-existent processor was specified, or +that the specification had improper syntax. 
+# +[orterun:invalid-node-rank] +An invalid node rank was obtained - this is probably something +that should be reported to the OMPI developers. +# +[orterun:invalid-local-rank] +An invalid local rank was obtained - this is probably something +that should be reported to the OMPI developers. +# +[orterun:invalid-phys-cpu] +An invalid physical processor id was returned when attempting to +set processor affinity - please check to ensure that your system +supports such functionality. If so, then this is probably something +that should be reported to the OMPI developers. +# +[orterun:failed-set-paff] +An attempt to set processor affinity has failed - please check to +ensure that your system supports such functionality. If so, then +this is probably something that should be reported to the OMPI +developers. +# +[orterun:topo-not-supported] +An attempt was made to bind a process to a specific hardware topology +mapping (e.g., binding to a socket) but the operating system does not +support such topology-aware actions. Talk to your local system +administrator to find out if your system can support topology-aware +functionality (e.g., Linux Kernels newer than v2.6.18). + +Systems that do not support processor topology-aware functionality +cannot use "bind to socket" and other related functionality. + + Local host: %s + Action attempted: %s %s + Application name: %s +# +[orterun:binding-not-avail] +A request to bind the processes if the operating system supports such +an operation was made, but the OS does not support this operation: + + Local host: %s + Action requested: %s + Application name: %s + +Because the request was made on an "if-available" basis, the job was +launched without taking the requested action. If this is not the +desired behavior, talk to your local system administrator to find out +if your system can support the requested action. 
+# +[orterun:not-enough-resources] +Not enough %s were found on the local host to meet the requested +binding action: + + Local host: %s + Action requested: %s + Application name: %s + +Please revise the request and try again. +# +[orterun:paffinity-missing-module] +A request to bind processes was made, but no paffinity module +was found: + + Local host: %s + +This is potentially a configuration problem. You can rerun your job without +requesting binding, or check the configuration. +# +[orterun:invalid-slot-list-range] +A slot list was provided that exceeds the boundaries on available +resources: + + Local host: %s + Slot list: %s + +Please check your boundaries and try again. +# +[orterun:proc-comm-failed] +A critical communication path was lost to: + + My name: %s + Process name: %s + Node: %s +# +[orterun:proc-mem-exceeded] +A process exceeded memory limits: + + Process name: %s + Node: %s +# +[orterun:proc-stalled] +One or more processes appear to have stalled - a monitored file +failed to show the required activity. +# +[orterun:proc-sensor-exceeded] +One or more processes have exceeded a specified sensor limit, but +no further info is available. +# +[orterun:proc-called-abort] +%s detected that one or more processes called %s_abort, thus causing +the job to be terminated. +# +[orterun:proc-heartbeat-failed] +%s failed to receive scheduled heartbeat communications from a remote +process: + + Process name: %s + Node: %s +# +[orterun:non-zero-exit] +%s detected that one or more processes exited with non-zero status, thus causing +the job to be terminated. The first process to do so was: + + Process name: %s + Exit code: %d +# diff --git a/orte/tools/mapreduce/mapreduce.1in b/orte/tools/mapreduce/mapreduce.1in new file mode 100644 index 0000000000..5f83c36090 --- /dev/null +++ b/orte/tools/mapreduce/mapreduce.1in @@ -0,0 +1,1293 @@ +. -*- nroff -*- +.\" Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. +.\" Copyright (c) 2008-2009 Sun Microsystems, Inc.
All rights reserved. +.\" +.\" Man page for ORTE's orterun command +.\" +.\" .TH name section center-footer left-footer center-header +.TH MPIRUN 1 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#" +.\" ************************** +.\" Name Section +.\" ************************** +.SH NAME +. +orterun, mpirun, mpiexec \- Execute serial and parallel jobs in Open MPI. + +.B Note: +\fImpirun\fP, \fImpiexec\fP, and \fIorterun\fP are all synonyms for each +other. Using any of the names will produce the same behavior. +. +.\" ************************** +.\" Synopsis Section +.\" ************************** +.SH SYNOPSIS +. +.PP +Single Process Multiple Data (SPMD) Model: + +.B mpirun +[ options ] +.B <program> +[ <args> ] +.P + +Multiple Instruction Multiple Data (MIMD) Model: + +.B mpirun +[ global_options ] + [ local_options1 ] +.B <program1> +[ <args1> ] : + [ local_options2 ] +.B <program2> +[ <args2> ] : + ... : + [ local_optionsN ] +.B <programN> +[ <argsN> ] +.P + +Note that in both models, invoking \fImpirun\fP via an absolute path +name is equivalent to specifying the \fI--prefix\fP option with a +\fI<dir>\fR value equivalent to the directory where \fImpirun\fR +resides, minus its last subdirectory. For example: + + \fB%\fP /usr/local/bin/mpirun ... + +is equivalent to + + \fB%\fP mpirun --prefix /usr/local + +. +.\" ************************** +.\" Quick Summary Section +.\" ************************** +.SH QUICK SUMMARY +. +If you are simply looking for how to run an MPI application, you +probably want to use a command line of the following form: + + \fB%\fP mpirun [ -np X ] [ --hostfile <filename> ] <program> + +This will run X copies of \fI<program>\fR in your current run-time +environment (if running under a supported resource manager, Open MPI's +\fImpirun\fR will usually automatically use the corresponding resource manager +process starter, as opposed to, for example, \fIrsh\fR or \fIssh\fR, +which require the use of a hostfile, or will default to running all X +copies on the localhost), scheduling (by default) in a round-robin fashion by +CPU slot.
See the rest of this page for more details. +. +.\" ************************** +.\" Options Section +.\" ************************** +.SH OPTIONS +. +.I mpirun +will send the name of the directory where it was invoked on the local +node to each of the remote nodes, and attempt to change to that +directory. See the "Current Working Directory" section below for further +details. +.\" +.\" Start options listing +.\" Indent 10 characters from start of first column to start of second column +.TP 10 +.B +The program executable. This is identified as the first non-recognized argument +to mpirun. +. +. +.TP +.B +Pass these run-time arguments to every new process. These must always +be the last arguments to \fImpirun\fP. If an app context file is used, +\fI\fP will be ignored. +. +. +.TP +.B -h\fR,\fP --help +Display help for this command +. +. +.TP +.B -q\fR,\fP --quiet +Suppress informative messages from orterun during application execution. +. +. +.TP +.B -v\fR,\fP --verbose +Be verbose +. +. +.TP +.B -V\fR,\fP --version +Print version number. If no other arguments are given, this will also +cause orterun to exit. +. +. +. +. +.P +To specify which hosts (nodes) of the cluster to run on: +. +. +.TP +.B -H\fR,\fP -host\fR,\fP --host \fR\fP +List of hosts on which to invoke processes. +. +. +.TP +.B +-hostfile\fR,\fP --hostfile \fR\fP +Provide a hostfile to use. +.\" JJH - Should have man page for how to format a hostfile properly. +. +. +.TP +.B -machinefile\fR,\fP --machinefile \fR\fP +Synonym for \fI-hostfile\fP. +. +. +. +. +.P +To specify the number of processes to launch: +. +. +.TP +.B -c\fR,\fP -n\fR,\fP --n\fR,\fP -np \fR<#>\fP +Run this many copies of the program on the given nodes. This option +indicates that the specified file is an executable program and not an +application context. 
If no value is provided for the number of copies to +execute (i.e., neither the "-np" nor its synonyms are provided on the command +line), Open MPI will automatically execute a copy of the program on +each process slot (see below for description of a "process slot"). This +feature, however, can only be used in the SPMD model and will return an +error (without beginning execution of the application) otherwise. +. +. +.TP +.B -npersocket\fR,\fP --npersocket <#persocket> +On each node, launch this many processes times the number of processor +sockets on the node. +The \fI-npersocket\fP option also turns on the \fI-bind-to-socket\fP option. +. +. +.TP +.B -npernode\fR,\fP --npernode <#pernode> +On each node, launch this many processes. +. +. +.TP +.B -pernode\fR,\fP --pernode +On each node, launch one process -- equivalent to \fI-npernode\fP 1. +. +. +. +. +.P +To map processes to nodes: +. +. +.TP +.B -loadbalance\fR,\fP --loadbalance +Uniform distribution of ranks across all nodes. See more detailed description below. +. +.TP +.B -nolocal\fR,\fP --nolocal +Do not run any copies of the launched application on the same node as +orterun is running. This option will override listing the localhost +with \fB--host\fR or any other host-specifying mechanism. +. +.TP +.B -nooversubscribe\fR,\fP --nooversubscribe +Do not oversubscribe any nodes; error (without starting any processes) +if the requested number of processes would cause oversubscription. +This option implicitly sets "max_slots" equal to the "slots" value for +each node. +. +.TP +.B -bynode\fR,\fP --bynode +Launch processes one per node, cycling by node in a round-robin +fashion. This spreads processes evenly among nodes and assigns +ranks in a round-robin, "by node" manner. +. +. +. +. +.P +For process binding: +. +.TP +.B -bycore\fR,\fP --bycore +Associate processes with successive cores +if used with one of the \fI-bind-to-*\fP options. +. 
+.TP +.B -bysocket\fR,\fP --bysocket +Associate processes with successive processor sockets +if used with one of the \fI-bind-to-*\fP options. +. +.TP +.B -cpus-per-proc\fR,\fP --cpus-per-proc <#perproc> +Use the number of cores per process +if used with one of the \fI-bind-to-*\fP options. +. +.TP +.B -cpus-per-rank\fR,\fP --cpus-per-rank <#perrank> +Alias for \fI-cpus-per-proc\fP. +. +.TP +.B -bind-to-core\fR,\fP --bind-to-core +Bind processes to cores. +. +.TP +.B -bind-to-socket\fR,\fP --bind-to-socket +Bind processes to processor sockets. +. +.TP +.B -bind-to-none\fR,\fP --bind-to-none +Do not bind processes. (Default.) +. +.TP +.B -report-bindings\fR,\fP --report-bindings +Report any bindings for launched processes. +. +.TP +.B -slot-list\fR,\fP --slot-list +List of processor IDs to be used for binding MPI processes. The specified bindings will +be applied to all MPI processes. See explanation below for syntax. +. +. +. +. +.P +For rankfiles: +. +. +.TP +.B -rf\fR,\fP --rankfile +Provide a rankfile file. +. +. +. +. +.P +To manage standard I/O: +. +. +.TP +.B -output-filename\fR,\fP --output-filename \fR\fP +Redirect the stdout, stderr, and stddiag of all ranks to a rank-unique version of +the specified filename. Any directories in the filename will automatically be created. +Each output file will consist of filename.rank, where the rank will be left-filled with +zero's for correct ordering in listings. +. +. +.TP +.B -stdin\fR,\fP --stdin +The MPI rank that is to receive stdin. The default is to forward stdin to rank=0, but this +option can be used to forward stdin to any rank. It is also acceptable to specify \fInone\fP, +indicating that no ranks are to receive stdin. +. +. +.TP +.B -tag-output\fR,\fP --tag-output +Tag each line of output to stdout, stderr, and stddiag with \fB[jobid, rank]\fP indicating the process jobid +and rank that generated the output, and the channel which generated it. +. +. 
+.TP +.B -timestamp-output\fR,\fP --timestamp-output +Timestamp each line of output to stdout, stderr, and stddiag. +. +. +.TP +.B -xml\fR,\fP --xml +Provide all output to stdout, stderr, and stddiag in an xml format. +. +. +.TP +.B -xterm\fR,\fP --xterm \fR\fP +Display the specified ranks in separate xterm windows. The ranks are specified +as a comma-separated list of ranges, with a -1 indicating all. A separate +window will be created for each specified rank. +.B Note: +xterm will normally terminate the window upon termination of the process running +within it. However, by adding a "!" to the end of the list of specified ranks, +the proper options will be provided to ensure that xterm keeps the window open +\fIafter\fP the process terminates, thus allowing you to see the process' output. +Each xterm window will subsequently need to be manually closed. +.B Note: +In some environments, xterm may require that the executable be in the user's +path, or be specified in absolute or relative terms. Thus, it may be necessary +to specify a local executable as "./foo" instead of just "foo". If xterm fails to +find the executable, mpirun will hang, but still respond correctly to a ctrl-c. +If this happens, please check that the executable is being specified correctly +and try again. +. +. +. +. +.P +To manage files and runtime environment: +. +. +.TP +.B -path\fR,\fP --path \fR\fP + that will be used when attempting to locate the requested +executables. This is used prior to using the local PATH setting. +. +. +.TP +.B --prefix \fR\fP +Prefix directory that will be used to set the \fIPATH\fR and +\fILD_LIBRARY_PATH\fR on the remote node before invoking Open MPI or +the target process. See the "Remote Execution" section, below. +. +. +.TP +.B --preload-binary +Copy the specified executable(s) to remote machines prior to starting remote processes. The +executables will be copied to the Open MPI session directory and will be deleted upon +completion of the job. +. +. 
+.TP +.B --preload-files +Preload the comma separated list of files to the current working directory of the remote +machines where processes will be launched prior to starting those processes. +. +. +.TP +.B --preload-files-dest-dir +The destination directory to be used for preload-files, if other than the current working +directory. By default, the absolute and relative paths provided by --preload-files are used. +. +. +.TP +.B --tmpdir \fR\fP +Set the root for the session directory tree for mpirun only. +. +. +.TP +.B -wd \fR\fP +Synonym for \fI-wdir\fP. +. +. +.TP +.B -wdir \fR\fP +Change to the directory before the user's program executes. +See the "Current Working Directory" section for notes on relative paths. +.B Note: +If the \fI-wdir\fP option appears both on the command line and in an +application context, the context will take precedence over the command +line. Thus, if the path to the desired wdir is different +on the backend nodes, then it must be specified as an absolute path that +is correct for the backend node. +. +. +.TP +.B -x \fR\fP +Export the specified environment variables to the remote nodes before +executing the program. Only one environment variable can be specified +per \fI-x\fP option. Existing environment variables can be specified +or new variable names specified with corresponding values. For +example: + \fB%\fP mpirun -x DISPLAY -x OFILE=/tmp/out ... + +The parser for the \fI-x\fP option is not very sophisticated; it does +not even understand quoted values. Users are advised to set variables +in the environment, and then use \fI-x\fP to export (not define) them. +. +. +. +. +.P +Setting MCA parameters: +. +. +.TP +.B -gmca\fR,\fP --gmca \fR \fP +Pass global MCA parameters that are applicable to all contexts. \fI\fP is +the parameter name; \fI\fP is the parameter value. +. +. +.TP +.B -mca\fR,\fP --mca +Send arguments to various MCA modules. See the "MCA" section, below. +. +. +. +. +.P +For debugging: +. +. 
+.TP +.B -debug\fR,\fP --debug +Invoke the user-level debugger indicated by the \fIorte_base_user_debugger\fP +MCA parameter. +. +. +.TP +.B -debugger\fR,\fP --debugger +Sequence of debuggers to search for when \fI--debug\fP is used (i.e. +a synonym for \fIorte_base_user_debugger\fP MCA parameter). +. +. +.TP +.B -tv\fR,\fP --tv +Launch processes under the TotalView debugger. +Deprecated backwards compatibility flag. Synonym for \fI--debug\fP. +. +. +. +. +.P +There are also other options: +. +. +.TP +.B -aborted\fR,\fP --aborted \fR<#>\fP +Set the maximum number of aborted processes to display. +. +. +.TP +.B --app \fR<appfile>\fP +Provide an appfile, ignoring all other command line options. +. +. +.TP +.B -cf\fR,\fP --cartofile \fR<cartofile>\fP +Provide a cartography file. +. +. +.TP +.B --hetero +Indicates that multiple app_contexts are being provided that are a mix of 32/64-bit binaries. +. +. +.TP +.B -leave-session-attached\fR,\fP --leave-session-attached +Do not detach OmpiRTE daemons used by this application. This allows error messages from the daemons +as well as the underlying environment (e.g., when failing to launch a daemon) to be output. +. +. +.TP +.B -ompi-server\fR,\fP --ompi-server +Specify the URI of the Open MPI server (or the mpirun to be used as the server), the name +of the file (specified as file:filename) that +contains that info, or the PID (specified as pid:#) of the mpirun to be used as +the server. +The Open MPI server is used to support multi-application data exchange via +the MPI-2 MPI_Publish_name and MPI_Lookup_name functions. +. +. +.TP +.B -report-pid\fR,\fP --report-pid <channel> +Print out mpirun's PID during startup. The channel must be either a '-' to indicate that +the pid is to be output to stdout, a '+' to indicate that the pid is to be output to stderr, +or a filename to which the pid is to be written. +. +. +.TP +.B -report-uri\fR,\fP --report-uri <channel> +Print out mpirun's URI during startup.
The channel must be either a '-' to indicate that +the URI is to be output to stdout, a '+' to indicate that the URI is to be output to stderr, +or a filename to which the URI is to be written. +. +. +.TP +.B -wait-for-server\fR,\fP --wait-for-server +Pause mpirun before launching the job until ompi-server is detected. This +is useful in scripts where ompi-server may be started in the background, followed immediately by +an \fImpirun\fP command that wishes to connect to it. Mpirun will pause until either the specified +ompi-server is contacted or the server-wait-time is exceeded. +. +. +.TP +.B -server-wait-time\fR,\fP --server-wait-time +The max amount of time (in seconds) mpirun should wait for the ompi-server to start. The default +is 10 seconds. +. +. +. +. +.P +The following options are useful for developers; they are not generally +useful to most ORTE and/or MPI users: +. +.TP +.B -d\fR,\fP --debug-devel +Enable debugging of the OmpiRTE (the run-time layer in Open MPI). +This is not generally useful for most users. +. +. +.TP +.B --debug-daemons +Enable debugging of any OmpiRTE daemons used by this application. +. +. +.TP +.B --debug-daemons-file +Enable debugging of any OmpiRTE daemons used by this application, storing +output in files. +. +. +.TP +.B -launch-agent\fR,\fP --launch-agent +Name of the executable that is to be used to start processes on the remote nodes. The default +is "orted". This option can be used to test new daemon concepts, or to pass options back to the +daemons without having mpirun itself see them. For example, specifying a launch agent of +\fRorted -mca odls_base_verbose 5\fR allows the developer to ask the orted for debugging output +without clutter from mpirun itself. +. +. +.TP +.B --noprefix +Disable the automatic --prefix behavior +. +. +.P +There may be other options listed with \fImpirun --help\fP. +. +. +.\" ************************** +.\" Description Section +.\" ************************** +.SH DESCRIPTION +.
+One invocation of \fImpirun\fP starts an MPI application running under Open +MPI. If the application is single process multiple data (SPMD), the application +can be specified on the \fImpirun\fP command line. + +If the application is multiple instruction multiple data (MIMD), comprising of +multiple programs, the set of programs and argument can be specified in one of +two ways: Extended Command Line Arguments, and Application Context. +.PP +An application context describes the MIMD program set including all arguments +in a separate file. +.\"See appcontext(5) for a description of the application context syntax. +This file essentially contains multiple \fImpirun\fP command lines, less the +command name itself. The ability to specify different options for different +instantiations of a program is another reason to use an application context. +.PP +Extended command line arguments allow for the description of the application +layout on the command line using colons (\fI:\fP) to separate the specification +of programs and arguments. Some options are globally set across all specified +programs (e.g. --hostfile), while others are specific to a single program +(e.g. -np). +. +. +. +.SS Specifying Host Nodes +. +Host nodes can be identified on the \fImpirun\fP command line with the \fI-host\fP +option or in a hostfile. +. +.PP +For example, +. +.TP 4 +mpirun -H aa,aa,bb ./a.out +launches two processes on node aa and one on bb. +. +.PP +Or, consider the hostfile +. + + \fB%\fP cat myhostfile + aa slots=2 + bb slots=2 + cc slots=2 + +. +.PP +Here, we list both the host names (aa, bb, and cc) but also how many "slots" +there are for each. Slots indicate how many processes can potentially execute +on a node. For best performance, the number of slots may be chosen to be the +number of cores on the node or the number of processor sockets. If the hostfile +does not provide slots information, a default of 1 is assumed. 
+When running under resource managers (e.g., SLURM, Torque, etc.), +Open MPI will obtain both the hostnames and the number of slots directly +from the resource manger. +. +.PP +. +.TP 4 +mpirun -hostfile myhostfile ./a.out +will launch two processes on each of the three nodes. +. +.TP 4 +mpirun -hostfile myhostfile -host aa ./a.out +will launch two processes, both on node aa. +. +.TP 4 +mpirun -hostfile myhostfile -host dd ./a.out +will find no hosts to run on and abort with an error. +That is, the specified host dd is not in the specified hostfile. +. +.SS Specifying Number of Processes +. +As we have just seen, the number of processes to run can be set using the +hostfile. Other mechanisms exist. +. +.PP +The number of processes launched can be specified as a multiple of the +number of nodes or processor sockets available. For example, +. +.TP 4 +mpirun -H aa,bb -npersocket 2 ./a.out +launches processes 0-3 on node aa and process 4-7 on node bb, +where aa and bb are both dual-socket nodes. +The \fI-npersocket\fP option also turns on the \fI-bind-to-socket\fP option, +which is discussed in a later section. +. +.TP 4 +mpirun -H aa,bb -npernode 2 ./a.out +launches processes 0-1 on node aa and processes 2-3 on node bb. +. +.TP 4 +mpirun -H aa,bb -npernode 1 ./a.out +launches one process per host node. +. +.TP 4 +mpirun -H aa,bb -pernode ./a.out +is the same as \fI-npernode\fP 1. +. +. +.PP +Another alternative is to specify the number of processes with the +\fI-np\fP option. Consider now the hostfile +. + + \fB%\fP cat myhostfile + aa slots=4 + bb slots=4 + cc slots=4 + +. +.PP +Now, +. +.TP 4 +mpirun -hostfile myhostfile -np 6 ./a.out +will launch ranks 0-3 on node aa and ranks 4-5 on node bb. The remaining +slots in the hostfile will not be used since the \fI-np\fP option indicated +that only 6 processes should be launched. +. +.SS Mapping Processes to Nodes: Using Policies +. +The examples above illustrate the default mapping of process ranks +to nodes. 
This mapping can also be controlled with various +\fImpirun\fP options that describe mapping policies. +. +. +.PP +Consider the same hostfile as above, again with \fI-np\fP 6: +. + + node aa node bb node cc + + mpirun 0 1 2 3 4 5 + + mpirun -loadbalance 0 1 2 3 4 5 + + mpirun -bynode 0 3 1 4 2 5 + + mpirun -nolocal 0 1 2 3 4 5 +. +.PP +The \fI-loadbalance\fP option tries to spread processes out fairly among the +nodes. +. +.PP +The \fI-bynode\fP option does likewise but numbers the processes in "by node" +in a round-robin fashion. +. +.PP +The \fI-nolocal\fP option prevents any processes from being mapped onto the +local host (in this case node aa). While \fImpirun\fP typically consumes +few system resources, \fI-nolocal\fP can be helpful for launching very +large jobs where \fImpirun\fP may actually need to use noticeable amounts +of memory and/or processing time. +. +.PP +Just as \fI-np\fP can specify fewer processes than there are slots, it can +also oversubscribe the slots. For example, with the same hostfile: +. +.TP 4 +mpirun -hostfile myhostfile -np 14 ./a.out +will launch processes 0-3 on node aa, 4-7 on bb, and 8-11 on cc. It will +then add the remaining two processes to whichever nodes it chooses. +. +.PP +One can also specify limits to oversubscription. For example, with the same +hostfile: +. +.TP 4 +mpirun -hostfile myhostfile -np 14 -nooversubscribe ./a.out +will produce an error since \fI-nooversubscribe\fP prevents oversubscription. +. +.PP +Limits to oversubscription can also be specified in the hostfile itself: +. + % cat myhostfile + aa slots=4 max_slots=4 + bb max_slots=4 + cc slots=4 +. +.PP +The \fImax_slots\fP field specifies such a limit. When it does, the +\fIslots\fP value defaults to the limit. Now: +. +.TP 4 +mpirun -hostfile myhostfile -np 14 ./a.out +causes the first 12 processes to be launched as before, but the remaining +two processes will be forced onto node cc. 
The other two nodes are +protected by the hostfile against oversubscription by this job. +. +.PP +Using the \fI--nooversubscribe\fR option can be helpful since Open MPI +currently does not get "max_slots" values from the resource manager. +. +.PP +Of course, \fI-np\fP can also be used with the \fI-H\fP or \fI-host\fP +option. For example, +. +.TP 4 +mpirun -H aa,bb -np 8 ./a.out +launches 8 processes. Since only two hosts are specified, after the first +two processes are mapped, one to aa and one to bb, the remaining processes +oversubscribe the specified hosts. +. +.PP +And here is a MIMD example: +. +.TP 4 +mpirun -H aa -np 1 hostname : -H bb,cc -np 2 uptime +will launch process 0 running \fIhostname\fP on node aa and processes 1 and 2 +each running \fIuptime\fP on nodes bb and cc, respectively. +. +.SS Mapping Processes to Nodes: Using Arbitrary Mappings +. +The mapping of process ranks to nodes can be prescribed not just +with general policies but also, if necessary, using arbitrary mappings +that cannot be described by a simple policy. One can use the "sequential +mapper," which reads the hostfile line by line, assigning processes +to nodes in whatever order the hostfile specifies. Use the +\fI-mca rmaps seq\fP option. For example, using the same hostfile +as before +. +.TP 4 +mpirun -hostfile myhostfile ./a.out +will launch three processes, on ranks aa, bb, and cc, respectively. +The slot counts don't matter; one process is launched per line on +whatever node is listed on the line. +. +.PP +Another way to specify arbitrary mappings is with a rank file, which +gives you detailed control over process binding as well. Rank files +are discussed below. +. +.SS Process Binding +. +Processes may be bound to specific resources on a node. This can +improve performance if the operating system is placing processes +suboptimally. 
For example, it might oversubscribe some multi-core +processor sockets, leaving other sockets idle; this can lead +processes to contend unnecessarily for common resources. Or, it +might spread processes out too widely; this can be suboptimal if +application performance is sensitive to interprocess communication +costs. Binding can also keep the operating system from migrating +processes excessively, regardless of how optimally those processes +were placed to begin with. +. +.PP +To bind processes, one must first associate them with the resources +on which they should run. For example, the \fI-bycore\fP option +associates the processes on a node with successive cores. Or, +\fI-bysocket\fP associates the processes with successive processor sockets, +cycling through the sockets in a round-robin fashion if necessary. +And \fI-cpus-per-proc\fP indicates how many cores to bind per process. +. +.PP +But, such association is meaningless unless the processes are actually +bound to those resources. The binding option specifies the granularity +of binding -- say, with \fI-bind-to-core\fP or \fI-bind-to-socket\fP. +One can also turn binding off with \fI-bind-to-none\fP, which is +typically the default. +. +.PP +Finally, \fI-report-bindings\fP can be used to report bindings. +. +.PP +As an example, consider a node with two processor sockets, each comprising +four cores. We run \fImpirun\fP with \fI-np 4 -report-bindings\fP and +the following additional options: +. + + % mpirun ... -bycore -bind-to-core + [...] ... binding child [...,0] to cpus 0001 + [...] ... binding child [...,1] to cpus 0002 + [...] ... binding child [...,2] to cpus 0004 + [...] ... binding child [...,3] to cpus 0008 + + % mpirun ... -bysocket -bind-to-socket + [...] ... binding child [...,0] to socket 0 cpus 000f + [...] ... binding child [...,1] to socket 1 cpus 00f0 + [...] ... binding child [...,2] to socket 0 cpus 000f + [...] ... binding child [...,3] to socket 1 cpus 00f0 + + % mpirun ... 
-cpus-per-proc 2 -bind-to-core + [...] ... binding child [...,0] to cpus 0003 + [...] ... binding child [...,1] to cpus 000c + [...] ... binding child [...,2] to cpus 0030 + [...] ... binding child [...,3] to cpus 00c0 + + % mpirun ... -bind-to-none +. +.PP +Here, \fI-report-bindings\fP shows the binding of each process as a mask. +In the first case, the processes bind to successive cores as indicated by +the masks 0001, 0002, 0004, and 0008. In the second case, processes bind +to all cores on successive sockets as indicated by the masks 000f and 00f0. +The processes cycle through the processor sockets in a round-robin fashion +as many times as are needed. In the third case, the masks show us that +2 cores have been bound per process. In the fourth case, binding is +turned off and no bindings are reported. +. +.PP +Open MPI's support for process binding depends on the underlying +operating system. Therefore, certain process binding options may not be available +on every system. +. +.PP +Process binding can also be set with MCA parameters. +Their usage is less convenient than that of \fImpirun\fP options. +On the other hand, MCA parameters can be set not only on the \fImpirun\fP +command line, but alternatively in a system or user mca-params.conf file +or as environment variables, as described in the MCA section below. +The correspondences are: +. + + mpirun option MCA parameter key value + + -bycore rmaps_base_schedule_policy core + -bysocket rmaps_base_schedule_policy socket + -bind-to-core orte_process_binding core + -bind-to-socket orte_process_binding socket + -bind-to-none orte_process_binding none +. +.PP +The \fIorte_process_binding\fP value can also take on the +\fI:if-avail\fP attribute. This attribute means that processes +will be bound only if this is supported on the underlying +operating system. Without the attribute, if there is no +such support, the binding request results in an error. +For example, you could have +. 
+ + % cat $HOME/.openmpi/mca-params.conf + rmaps_base_schedule_policy = socket + orte_process_binding = socket:if-avail +. +. +.SS Rankfiles +. +Rankfiles provide a means for specifying detailed information about +how process ranks should be mapped to nodes and how they should be bound. +Consider the following: +. + + cat myrankfile + rank 0=aa slot=1:0-2 + rank 1=bb slot=0:0,1 + rank 2=cc slot=1-2 + mpirun -H aa,bb,cc,dd -rf myrankfile ./a.out +. +So that + + Rank 0 runs on node aa, bound to socket 1, cores 0-2. + Rank 1 runs on node bb, bound to socket 0, cores 0 and 1. + Rank 2 runs on node cc, bound to cores 1 and 2. +. +. +.SS Application Context or Executable Program? +. +To distinguish the two different forms, \fImpirun\fP +looks on the command line for \fI--app\fP option. If +it is specified, then the file named on the command line is +assumed to be an application context. If it is not +specified, then the file is assumed to be an executable program. +. +. +. +.SS Locating Files +. +If no relative or absolute path is specified for a file, Open +MPI will first look for files by searching the directories specified +by the \fI--path\fP option. If there is no \fI--path\fP option set or +if the file is not found at the \fI--path\fP location, then Open MPI +will search the user's PATH environment variable as defined on the +source node(s). +.PP +If a relative directory is specified, it must be relative to the initial +working directory determined by the specific starter used. For example when +using the rsh or ssh starters, the initial directory is $HOME by default. Other +starters may set the initial directory to the current working directory from +the invocation of \fImpirun\fP. +. +. +. +.SS Current Working Directory +. +The \fI\-wdir\fP mpirun option (and its synonym, \fI\-wd\fP) allows +the user to change to an arbitrary directory before the program is +invoked. 
It can also be used in application context files to specify +working directories on specific nodes and/or for specific +applications. +.PP +If the \fI\-wdir\fP option appears both in a context file and on the +command line, the context file directory will override the command +line value. +.PP +If the \fI-wdir\fP option is specified, Open MPI will attempt to +change to the specified directory on all of the remote nodes. If this +fails, \fImpirun\fP will abort. +.PP +If the \fI-wdir\fP option is \fBnot\fP specified, Open MPI will send +the directory name where \fImpirun\fP was invoked to each of the +remote nodes. The remote nodes will try to change to that +directory. If they are unable (e.g., if the directory does not exist on +that node), then Open MPI will use the default directory determined by +the starter. +.PP +All directory changing occurs before the user's program is invoked; it +does not wait until \fIMPI_INIT\fP is called. +. +. +. +.SS Standard I/O +. +Open MPI directs UNIX standard input to /dev/null on all processes +except the MPI_COMM_WORLD rank 0 process. The MPI_COMM_WORLD rank 0 process +inherits standard input from \fImpirun\fP. +.B Note: +The node that invoked \fImpirun\fP need not be the same as the node where the +MPI_COMM_WORLD rank 0 process resides. Open MPI handles the redirection of +\fImpirun\fP's standard input to the rank 0 process. +.PP +Open MPI directs UNIX standard output and error from remote nodes to the node +that invoked \fImpirun\fP and prints it on the standard output/error of +\fImpirun\fP. +Local processes inherit the standard output/error of \fImpirun\fP and transfer +to it directly. +.PP +Thus it is possible to redirect standard I/O for Open MPI applications by +using the typical shell redirection procedure on \fImpirun\fP. + + \fB%\fP mpirun -np 2 my_app < my_input > my_output + +Note that in this example \fIonly\fP the MPI_COMM_WORLD rank 0 process will +receive the stream from \fImy_input\fP on stdin. 
The stdin on all the other +nodes will be tied to /dev/null. However, the stdout from all nodes will +be collected into the \fImy_output\fP file. +. +. +. +.SS Signal Propagation +. +When orterun receives a SIGTERM and SIGINT, it will attempt to kill +the entire job by sending all processes in the job a SIGTERM, waiting +a small number of seconds, then sending all processes in the job a +SIGKILL. +. +.PP +SIGUSR1 and SIGUSR2 signals received by orterun are propagated to +all processes in the job. +. +.PP +One can turn on forwarding of SIGSTOP and SIGCONT to the program executed +by mpirun by setting the MCA parameter orte_forward_job_control to 1. +A SIGTSTOP signal to mpirun will then cause a SIGSTOP signal to be sent +to all of the programs started by mpirun and likewise a SIGCONT signal +to mpirun will cause a SIGCONT sent. +. +.PP +Other signals are not currently propagated +by orterun. +. +. +.SS Process Termination / Signal Handling +. +During the run of an MPI application, if any rank dies abnormally +(either exiting before invoking \fIMPI_FINALIZE\fP, or dying as the result of a +signal), \fImpirun\fP will print out an error message and kill the rest of the +MPI application. +.PP +User signal handlers should probably avoid trying to cleanup MPI state +(Open MPI is currently not async-signal-safe; see MPI_Init_thread(3) +for details about +.I MPI_THREAD_MULTIPLE +and thread safety). For example, if a segmentation fault occurs in +\fIMPI_SEND\fP (perhaps because a bad buffer was passed in) and a user +signal handler is invoked, if this user handler attempts to invoke +\fIMPI_FINALIZE\fP, Bad Things could happen since Open MPI was already +"in" MPI when the error occurred. Since \fImpirun\fP will notice that +the process died due to a signal, it is probably not necessary (and +safest) for the user to only clean up non-MPI state. +. +. +. +.SS Process Environment +. 
+Processes in the MPI application inherit their environment from the +Open RTE daemon upon the node on which they are running. The +environment is typically inherited from the user's shell. On remote +nodes, the exact environment is determined by the boot MCA module +used. The \fIrsh\fR launch module, for example, uses either +\fIrsh\fR/\fIssh\fR to launch the Open RTE daemon on remote nodes, and +typically executes one or more of the user's shell-setup files before +launching the Open RTE daemon. When running dynamically linked +applications which require the \fILD_LIBRARY_PATH\fR environment +variable to be set, care must be taken to ensure that it is correctly +set when booting Open MPI. +.PP +See the "Remote Execution" section for more details. +. +. +.SS Remote Execution +. +Open MPI requires that the \fIPATH\fR environment variable be set to +find executables on remote nodes (this is typically only necessary in +\fIrsh\fR- or \fIssh\fR-based environments -- batch/scheduled +environments typically copy the current environment to the execution +of remote jobs, so if the current environment has \fIPATH\fR and/or +\fILD_LIBRARY_PATH\fR set properly, the remote nodes will also have it +set properly). If Open MPI was compiled with shared library support, +it may also be necessary to have the \fILD_LIBRARY_PATH\fR environment +variable set on remote nodes as well (especially to find the shared +libraries required to run user MPI applications). +.PP +However, it is not always desirable or possible to edit shell +startup files to set \fIPATH\fR and/or \fILD_LIBRARY_PATH\fR. The +\fI--prefix\fR option is provided for some simple configurations where +this is not possible. +.PP +The \fI--prefix\fR option takes a single argument: the base directory +on the remote node where Open MPI is installed. Open MPI will use +this directory to set the remote \fIPATH\fR and \fILD_LIBRARY_PATH\fR +before executing any Open MPI or user applications. 
This allows +running Open MPI jobs without having pre-configured the \fIPATH\fR and +\fILD_LIBRARY_PATH\fR on the remote nodes. +.PP +Open MPI adds the basename of the current +node's "bindir" (the directory where Open MPI's executables are +installed) to the prefix and uses that to set the \fIPATH\fR on the +remote node. Similarly, Open MPI adds the basename of the current +node's "libdir" (the directory where Open MPI's libraries are +installed) to the prefix and uses that to set the +\fILD_LIBRARY_PATH\fR on the remote node. For example: +.TP 15 +Local bindir: +/local/node/directory/bin +.TP +Local libdir: +/local/node/directory/lib64 +.PP +If the following command line is used: + + \fB%\fP mpirun --prefix /remote/node/directory + +Open MPI will add "/remote/node/directory/bin" to the \fIPATH\fR +and "/remote/node/directory/lib64" to the \fILD_LIBRARY_PATH\fR on the +remote node before attempting to execute anything. +.PP +The \fI--prefix\fR option is not sufficient if the installation paths +on the remote node are different than the local node (e.g., if "/lib" +is used on the local node, but "/lib64" is used on the remote node), +or if the installation paths are something other than a subdirectory +under a common prefix. +.PP +Note that executing \fImpirun\fR via an absolute pathname is +equivalent to specifying \fI--prefix\fR without the last subdirectory +in the absolute pathname to \fImpirun\fR. For example: + + \fB%\fP /usr/local/bin/mpirun ... + +is equivalent to + + \fB%\fP mpirun --prefix /usr/local +. +. +. +.SS Exported Environment Variables +. +All environment variables that are named in the form OMPI_* will automatically +be exported to new processes on the local and remote nodes. +The \fI\-x\fP option to \fImpirun\fP can be used to export specific environment +variables to the new processes.
While the syntax of the \fI\-x\fP +option allows the definition of new variables, note that the parser +for this option is currently not very sophisticated - it does not even +understand quoted values. Users are advised to set variables in the +environment and use \fI\-x\fP to export them; not to define them. +. +. +. +.SS Setting MCA Parameters +. +The \fI-mca\fP switch allows the passing of parameters to various MCA +(Modular Component Architecture) modules. +.\" Open MPI's MCA modules are described in detail in ompimca(7). +MCA modules have direct impact on MPI programs because they allow tunable +parameters to be set at run time (such as which BTL communication device driver +to use, what parameters to pass to that BTL, etc.). +.PP +The \fI-mca\fP switch takes two arguments: \fI<key>\fP and \fI<value>\fP. +The \fI<key>\fP argument generally specifies which MCA module will receive the value. +For example, the \fI<key>\fP "btl" is used to select which BTL to be used for +transporting MPI messages. The \fI<value>\fP argument is the value that is +passed. +For example: +. +.TP 4 +mpirun -mca btl tcp,self -np 1 foo +Tells Open MPI to use the "tcp" and "self" BTLs, and to run a single copy of +"foo" on an allocated node. +. +.TP +mpirun -mca btl self -np 1 foo +Tells Open MPI to use the "self" BTL, and to run a single copy of "foo" on an +allocated node. +.\" And so on. Open MPI's BTL MCA modules are described in ompimca_btl(7). +.PP +The \fI-mca\fP switch can be used multiple times to specify different +\fI<key>\fP and/or \fI<value>\fP arguments. If the same \fI<key>\fP is +specified more than once, the \fI<value>\fPs are concatenated with a comma +(",") separating them. +.PP +Note that the \fI-mca\fP switch is simply a shortcut for setting environment variables. +The same effect may be accomplished by setting corresponding environment +variables before running \fImpirun\fP.
+The form of the environment variables that Open MPI sets is: + + OMPI_MCA_<key>=<value> +.PP +Thus, the \fI-mca\fP switch overrides any previously set environment +variables. The \fI-mca\fP settings similarly override MCA parameters set +in the +$OPAL_PREFIX/etc/openmpi-mca-params.conf or $HOME/.openmpi/mca-params.conf +file. +. +.PP +Unknown \fI<key>\fP arguments are still set as +environment variables -- they are not checked (by \fImpirun\fP) for correctness. +Illegal or incorrect \fI<value>\fP arguments may or may not be reported -- it +depends on the specific MCA module. +.PP +To find the available component types under the MCA architecture, or to find the +available parameters for a specific component, use the \fIompi_info\fP command. +See the \fIompi_info(1)\fP man page for detailed information on the command. +. +.SS Exit status +. +There is no standard definition for what \fImpirun\fP should return as an exit +status. After considerable discussion, we settled on the following method for +assigning the \fImpirun\fP exit status (note: in the following description, +the "primary" job is the initial application started by mpirun - all jobs that +are spawned by that job are designated "secondary" jobs): +. +.IP \[bu] 2 +if all processes in the primary job normally terminate with exit status 0, we return 0 +.IP \[bu] +if one or more processes in the primary job normally terminate with non-zero exit status, +we return the exit status of the lowest rank to have a non-zero status +.IP \[bu] +if all processes in the primary job normally terminate with exit status 0, and one or more +processes in a secondary job normally terminate with non-zero exit status, we (a) return +the exit status of the lowest rank in the lowest jobid to have a non-zero status, and (b) +output a message summarizing the exit status of the primary and all secondary jobs. +.IP \[bu] +if the cmd line option --report-child-jobs-separately is set, we will return -only- the +exit status of the primary job.
Any non-zero exit status in secondary jobs will be +reported solely in a summary print statement. +. +.PP +By default, OMPI records and notes that MPI processes exited with non-zero termination status. +This is generally not considered an "abnormal termination" - i.e., OMPI will not abort an MPI +job if one or more processes return a non-zero status. Instead, the default behavior simply +reports the number of processes terminating with non-zero status upon completion of the job. +.PP +However, in some cases it can be desirable to have the job abort when any process terminates +with non-zero status. For example, a non-MPI job might detect a bad result from a calculation +and want to abort, but doesn't want to generate a core file. Or an MPI job might continue past +a call to MPI_Finalize, but indicate that all processes should abort due to some post-MPI result. +.PP +It is not anticipated that this situation will occur frequently. However, in the interest of +serving the broader community, OMPI now has a means for allowing users to direct that jobs be +aborted upon any process exiting with non-zero status. Setting the MCA parameter +"orte_abort_on_non_zero_status" to 1 will cause OMPI to abort all processes once any process + exits with non-zero status. +.PP +Terminations caused in this manner will be reported on the console as an "abnormal termination", +with the first process to so exit identified along with its exit status. +.PP +. +.\" ************************** +.\" Examples Section +.\" ************************** +.SH EXAMPLES +Be sure also to see the examples throughout the sections above. +. +.TP 4 +mpirun -np 4 -mca btl ib,tcp,self prog1 +Run 4 copies of prog1 using the "ib", "tcp", and "self" BTL's for the +transport of MPI messages. +. +. 
+.TP 4 +mpirun -np 4 -mca btl tcp,sm,self +.br +--mca btl_tcp_if_include eth0 prog1 +.br +Run 4 copies of prog1 using the "tcp", "sm" and "self" BTLs for the +transport of MPI messages, with TCP using only the eth0 interface to +communicate. Note that other BTLs have similar if_include MCA +parameters. +. +.\" ************************** +.\" Diagnostics Section +.\" ************************** +. +.\" .SH DIAGNOSTICS +.\".TP 4 +.\"Error Msg: +.\"Description +. +.\" ************************** +.\" Return Value Section +.\" ************************** +. +.SH RETURN VALUE +. +\fImpirun\fP returns 0 if all ranks started by \fImpirun\fP exit after calling +MPI_FINALIZE. A non-zero value is returned if an internal error occurred in +mpirun, or one or more ranks exited before calling MPI_FINALIZE. If an +internal error occurred in mpirun, the corresponding error code is returned. +In the event that one or more ranks exit before calling MPI_FINALIZE, the +return value of the rank of the process that \fImpirun\fP first notices died +before calling MPI_FINALIZE will be returned. Note that, in general, this will +be the first rank that died but is not guaranteed to be so. +. +.\" ************************** +.\" See Also Section +.\" ************************** +. +.SH SEE ALSO +MPI_Init_thread(3) diff --git a/orte/tools/mapreduce/mapreduce.c b/orte/tools/mapreduce/mapreduce.c new file mode 100644 index 0000000000..0ce47b7778 --- /dev/null +++ b/orte/tools/mapreduce/mapreduce.c @@ -0,0 +1,2138 @@ +/* -*- C -*- + * + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2008 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. 
+ * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006-2012 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "orte/constants.h" + +#ifdef HAVE_STRING_H +#include +#endif +#include +#ifdef HAVE_STDLIB_H +#include +#endif /* HAVE_STDLIB_H */ +#ifdef HAVE_STRINGS_H +#include +#endif /* HAVE_STRINGS_H */ +#ifdef HAVE_UNISTD_H +#include +#endif +#ifdef HAVE_SYS_PARAM_H +#include +#endif +#include +#include +#include +#ifdef HAVE_SYS_TYPES_H +#include +#endif /* HAVE_SYS_TYPES_H */ +#ifdef HAVE_SYS_WAIT_H +#include +#endif /* HAVE_SYS_WAIT_H */ +#ifdef HAVE_SYS_TIME_H +#include +#endif /* HAVE_SYS_TIME_H */ +#include +#ifdef HAVE_SYS_STAT_H +#include +#endif + +#include "opal/mca/event/event.h" +#include "opal/mca/installdirs/installdirs.h" +#include "opal/mca/paffinity/base/base.h" +#include "opal/mca/base/base.h" +#include "opal/util/argv.h" +#include "opal/util/output.h" +#include "opal/util/basename.h" +#include "opal/util/cmd_line.h" +#include "opal/util/opal_environ.h" +#include "opal/util/opal_getcwd.h" +#include "opal/util/show_help.h" +#include "opal/sys/atomic.h" +#if OPAL_ENABLE_FT_CR == 1 +#include "opal/runtime/opal_cr.h" +#endif + +#include "opal/version.h" +#include "opal/runtime/opal.h" +#include "opal/util/os_path.h" +#include "opal/util/path.h" +#include "opal/class/opal_pointer_array.h" +#include "opal/dss/dss.h" + +#include "orte/util/proc_info.h" +#include "orte/util/pre_condition_transports.h" +#include "orte/util/session_dir.h" +#include "orte/util/hnp_contact.h" +#include "orte/util/show_help.h" + +#include "orte/mca/odls/odls.h" +#include "orte/mca/plm/plm.h" +#include "orte/mca/plm/base/plm_private.h" +#include 
"orte/mca/ras/ras.h" +#include "orte/mca/rml/rml.h" +#include "orte/mca/rml/rml_types.h" +#include "orte/mca/rml/base/rml_contact.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/errmgr/base/errmgr_private.h" +#include "orte/mca/grpcomm/grpcomm.h" +#include "orte/mca/state/state.h" + +#include "orte/runtime/runtime.h" +#include "orte/runtime/orte_globals.h" +#include "orte/runtime/orte_wait.h" +#include "orte/runtime/orte_data_server.h" +#include "orte/runtime/orte_locks.h" +#include "orte/runtime/orte_quit.h" + +/* ensure I can behave like a daemon */ +#include "orte/orted/orted.h" + +/** + * Global struct for catching mapreduce command line options. + */ +struct mapreduce_globals_t { + bool help; + bool version; + bool verbose; + char *report_pid; + char *report_uri; + bool exit; + bool debugger; + int num_procs; + char *env_val; + char *appfile; + char *wdir; + char *path; + char *preload_files; + char *preload_files_dest_dir; + opal_mutex_t lock; + bool sleep; + char *ompi_server; + bool wait_for_server; + int server_wait_timeout; + char *stdin_target; + char *prefix; + char *path_to_mpirun; +#if OPAL_ENABLE_FT_CR == 1 + char *sstore_load; +#endif + bool disable_recovery; + bool mapper; + bool reducer; + bool combiner; + bool single_job; + orte_job_t *combiner_job; +}; + +/* + * Globals + */ +static char **global_mca_env = NULL; +static orte_std_cntr_t total_num_apps = 0; +static bool want_prefix_by_default = (bool) ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT; +static char *ompi_server=NULL; +/* maintain a local array of job "chains" - each chain starts with a + * mapper and can contain any number of reducer jobs, each operating + * in a sequential chain in that the output of one stage is fed to + * the input of the next. 
+ */ +static opal_pointer_array_t chains; + +/* + * Globals + */ +struct mapreduce_globals_t mapreduce_globals; +static bool globals_init = false; + +static opal_cmd_line_init_t cmd_line_init[] = { + /* Various "obvious" options */ + { NULL, NULL, NULL, 'h', NULL, "help", 0, + &mapreduce_globals.help, OPAL_CMD_LINE_TYPE_BOOL, + "This help message" }, + { NULL, NULL, NULL, 'V', NULL, "version", 0, + &mapreduce_globals.version, OPAL_CMD_LINE_TYPE_BOOL, + "Print version and exit" }, + { NULL, NULL, NULL, 'v', NULL, "verbose", 0, + &mapreduce_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL, + "Be verbose" }, + { "orte", "execute", "quiet", 'q', NULL, "quiet", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Suppress helpful messages" }, + { NULL, NULL, NULL, '\0', "report-pid", "report-pid", 1, + &mapreduce_globals.report_pid, OPAL_CMD_LINE_TYPE_STRING, + "Printout pid on stdout [-], stderr [+], or a file [anything else]" }, + { NULL, NULL, NULL, '\0', "report-uri", "report-uri", 1, + &mapreduce_globals.report_uri, OPAL_CMD_LINE_TYPE_STRING, + "Printout URI on stdout [-], stderr [+], or a file [anything else]" }, + + /* exit status reporting */ + { "orte", "report", "child_jobs_separately", '\0', "report-child-jobs-separately", "report-child-jobs-separately", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Return the exit status of the primary job only" }, + + /* hetero apps */ + { "orte", "hetero", "apps", '\0', NULL, "hetero-apps", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Indicates that multiple app_contexts are being provided that are a mix of 32/64 bit binaries" }, + + /* select XML output */ + { "orte", "xml", "output", '\0', "xml", "xml", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Provide all output in XML format" }, + { "orte", "xml", "file", '\0', "xml-file", "xml-file", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Provide all output in XML format to the specified file" }, + + /* tag output */ + { "orte", "tag", "output", '\0', "tag-output", "tag-output", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + 
"Tag all output with [job,rank]" }, + { "orte", "timestamp", "output", '\0', "timestamp-output", "timestamp-output", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Timestamp all application process output" }, + { "orte", "output", "filename", '\0', "output-filename", "output-filename", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Redirect output from application processes into filename.rank" }, + { "orte", "xterm", NULL, '\0', "xterm", "xterm", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Create a new xterm window and display output from the specified ranks there" }, + + /* =============== MAPREDUCE OPTIONS =============== */ + /* specify input files */ + { "iof", "base", "input_files", '\0', "input-files", "input-files", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Comma-separated list of input files to be read and sent to stdin of procs" }, + + /* mapper designation */ + { NULL, NULL, NULL, '\0', "mapper", "mapper", 0, + &mapreduce_globals.mapper, OPAL_CMD_LINE_TYPE_BOOL, + "Mapper application" }, + + /* reducer designation */ + { NULL, NULL, NULL, '\0', "reducer", "reducer", 0, + &mapreduce_globals.reducer, OPAL_CMD_LINE_TYPE_BOOL, + "Reducer application" }, + + /* combiner designation */ + { NULL, NULL, NULL, '\0', "combiner", "combiner", 0, + &mapreduce_globals.combiner, OPAL_CMD_LINE_TYPE_BOOL, + "Combiner application" }, + + /* ================================================== */ + + /* Specify the launch agent to be used */ + { "orte", "launch", "agent", '\0', "launch-agent", "launch-agent", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Command used to start processes on remote nodes (default: orted)" }, + + /* Preload the binary on the remote machine */ + { "orte", "preload", "binaries", 's', NULL, "preload-binary", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Preload the binary on the remote machine before starting the remote process." 
}, + + /* Preload files on the remote machine */ + { NULL, NULL, NULL, '\0', NULL, "preload-files", 1, + &mapreduce_globals.preload_files, OPAL_CMD_LINE_TYPE_STRING, + "Preload the comma separated list of files to the remote machines current working directory before starting the remote process." }, + + /* Where to Preload files on the remote machine */ + { NULL, NULL, NULL, '\0', NULL, "preload-files-dest-dir", 1, + &mapreduce_globals.preload_files_dest_dir, OPAL_CMD_LINE_TYPE_STRING, + "The destination directory to use in conjunction with --preload-files. By default the absolute and relative paths provided by --preload-files are used." }, + + /* Use an appfile */ + { NULL, NULL, NULL, '\0', NULL, "app", 1, + &mapreduce_globals.appfile, OPAL_CMD_LINE_TYPE_STRING, + "Provide an appfile; ignore all other command line options" }, + + /* Number of processes; -c, -n, --n, -np, and --np are all + synonyms */ + { NULL, NULL, NULL, 'c', "np", "np", 1, + &mapreduce_globals.num_procs, OPAL_CMD_LINE_TYPE_INT, + "Number of processes to run" }, + { NULL, NULL, NULL, '\0', "n", "n", 1, + &mapreduce_globals.num_procs, OPAL_CMD_LINE_TYPE_INT, + "Number of processes to run" }, + + /* Set a hostfile */ + { NULL, NULL, NULL, '\0', "hostfile", "hostfile", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Provide a hostfile" }, + { NULL, NULL, NULL, '\0', "machinefile", "machinefile", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Provide a hostfile" }, + { "orte", "default", "hostfile", '\0', "default-hostfile", "default-hostfile", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Provide a default hostfile" }, + { "opal", "if", "do_not_resolve", '\0', "do-not-resolve", "do-not-resolve", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Do not attempt to resolve interfaces" }, + + /* uri of Open MPI server, or at least where to get it */ + { NULL, NULL, NULL, '\0', "ompi-server", "ompi-server", 1, + &mapreduce_globals.ompi_server, OPAL_CMD_LINE_TYPE_STRING, + "Specify the URI of the Open MPI server, or the name of the 
file (specified as file:filename) that contains that info" }, + { NULL, NULL, NULL, '\0', "wait-for-server", "wait-for-server", 0, + &mapreduce_globals.wait_for_server, OPAL_CMD_LINE_TYPE_BOOL, + "If ompi-server is not already running, wait until it is detected (default: false)" }, + { NULL, NULL, NULL, '\0', "server-wait-time", "server-wait-time", 1, + &mapreduce_globals.server_wait_timeout, OPAL_CMD_LINE_TYPE_INT, + "Time in seconds to wait for ompi-server (default: 10 sec)" }, + + { "carto", "file", "path", '\0', "cf", "cartofile", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Provide a cartography file" }, + + { "orte", "rankfile", NULL, '\0', "rf", "rankfile", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Provide a rankfile file" }, + + /* Export environment variables; potentially used multiple times, + so it does not make sense to set into a variable */ + { NULL, NULL, NULL, 'x', NULL, NULL, 1, + NULL, OPAL_CMD_LINE_TYPE_NULL, + "Export an environment variable, optionally specifying a value (e.g., \"-x foo\" exports the environment variable foo and takes its value from the current environment; \"-x foo=bar\" exports the environment variable name foo and sets its value to \"bar\" in the started processes)" }, + + /* Mapping controls */ + { "rmaps", "base", "display_map", '\0', "display-map", "display-map", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Display the process map just before launch"}, + { "rmaps", "base", "display_devel_map", '\0', "display-devel-map", "display-devel-map", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Display a detailed process map (mostly intended for developers) just before launch"}, + { "rmaps", "base", "display_topo_with_map", '\0', "display-topo", "display-topo", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Display the topology as part of the process map (mostly intended for developers) just before launch"}, + { "rmaps", "base", "display_diffable_map", '\0', "display-diffable-map", "display-diffable-map", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Display a 
diffable process map (mostly intended for developers) just before launch"}, + { NULL, NULL, NULL, 'H', "host", "host", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "List of hosts to invoke processes on" }, + { "rmaps", "base", "no_schedule_local", '\0', "nolocal", "nolocal", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Do not run any applications on the local node" }, + { "rmaps", "base", "no_oversubscribe", '\0', "nooversubscribe", "nooversubscribe", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Nodes are not to be oversubscribed, even if the system supports such operation"}, + { "rmaps", "base", "oversubscribe", '\0', "oversubscribe", "oversubscribe", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Nodes are allowed to be oversubscribed, even on a managed system"}, +#if 0 + { "rmaps", "base", "cpus_per_rank", '\0', "cpus-per-proc", "cpus-per-proc", 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Number of cpus to use for each process [default=1]" }, + { "rmaps", "base", "cpus_per_rank", '\0', "cpus-per-rank", "cpus-per-rank", 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Synonym for cpus-per-proc" }, +#endif + +#if OPAL_HAVE_HWLOC + /* declare hardware threads as independent cpus */ + { "hwloc", "base", "use_hwthreads_as_cpus", '\0', "use-hwthread-cpus", "use-hwthread-cpus", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Use hardware threads as independent cpus" }, + + /* Binding options */ + { "hwloc", "base", "binding_policy", '\0', NULL, "bind-to", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Policy for binding processes [none (default) | hwthread | core | socket | numa | board] (supported qualifiers: overload-allowed,if-supported)" }, + + /* backward compatiblity */ + { "hwloc", "base", "bind_to_core", '\0', "bind-to-core", "bind-to-core", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Bind processes to cores" }, + { "hwloc", "base", "bind_to_socket", '\0', "bind-to-socket", "bind-to-socket", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Bind processes to sockets" }, + + { "hwloc", "base", "report_bindings", '\0', 
"report-bindings", "report-bindings", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Whether to report process bindings to stderr" }, + + /* slot list option */ + { "hwloc", "base", "slot_list", '\0', "slot-list", "slot-list", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "List of processor IDs to bind processes to [default=NULL]"}, + + /* generalized pattern mapping option */ + { "rmaps", "ppr", "pattern", '\0', NULL, "ppr", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Comma-separated list of number of processes on a given resource type [default: none]" }, +#endif + + /* Allocation options */ + { "ras", "base", "display_alloc", '\0', "display-allocation", "display-allocation", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Display the allocation being used by this job"}, + { "ras", "base", "display_devel_alloc", '\0', "display-devel-allocation", "display-devel-allocation", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Display a detailed list (mostly intended for developers) of the allocation being used by this job"}, +#if OPAL_HAVE_HWLOC + { "hwloc", "base", "cpu_set", '\0', "cpu-set", "cpu-set", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Comma-separated list of ranges specifying logical cpus allocated to this job [default: none]"}, +#endif + { NULL, NULL, NULL, 'H', "host", "host", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "List of hosts to invoke processes on" }, + + /* mpiexec-like arguments */ + { NULL, NULL, NULL, '\0', "wdir", "wdir", 1, + &mapreduce_globals.wdir, OPAL_CMD_LINE_TYPE_STRING, + "Set the working directory of the started processes" }, + { NULL, NULL, NULL, '\0', "wd", "wd", 1, + &mapreduce_globals.wdir, OPAL_CMD_LINE_TYPE_STRING, + "Synonym for --wdir" }, + { NULL, NULL, NULL, '\0', "path", "path", 1, + &mapreduce_globals.path, OPAL_CMD_LINE_TYPE_STRING, + "PATH to be used to look for executables to start processes" }, + + /* OpenRTE arguments */ + { "orte", "debug", NULL, 'd', "debug-devel", "debug-devel", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Enable debugging of OpenRTE" }, + + 
{ "orte", "debug", "daemons", '\0', "debug-daemons", "debug-daemons", 0, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Enable debugging of any OpenRTE daemons used by this application" }, + + { "orte", "debug", "daemons_file", '\0', "debug-daemons-file", "debug-daemons-file", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Enable debugging of any OpenRTE daemons used by this application, storing output in files" }, + + { "orte", "leave", "session_attached", '\0', "leave-session-attached", "leave-session-attached", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Enable debugging of OpenRTE" }, + + { "orte", "do_not", "launch", '\0', "do-not-launch", "do-not-launch", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Perform all necessary operations to prepare to launch the application, but do not actually launch it" }, + + { NULL, NULL, NULL, '\0', NULL, "prefix", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Prefix where Open MPI is installed on remote nodes" }, + { NULL, NULL, NULL, '\0', NULL, "noprefix", 0, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Disable automatic --prefix behavior" }, + + { "orte", "report", "launch_progress", '\0', "show-progress", "show-progress", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Output a brief periodic report on launch progress" }, + + { "orte", "use", "regexp", '\0', "use-regexp", "use-regexp", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Use regular expressions for launch" }, + + { "orte", "report", "events", '\0', "report-events", "report-events", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Report events to a tool listening at the specified URI" }, + + { "orte", "enable", "recovery", '\0', "enable-recovery", "enable-recovery", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Enable recovery from process failure [Default = disabled]" }, + + { "orte", "max", "restarts", '\0', "max-restarts", "max-restarts", 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Max number of times to restart a failed process" }, + +#if OPAL_HAVE_HWLOC + { "orte", "hetero", "nodes", '\0', NULL, "hetero-nodes", 0, + NULL, 
OPAL_CMD_LINE_TYPE_BOOL, + "Nodes in cluster may differ in topology, so send the topology back from each node [Default = false]" }, +#endif + + { NULL, NULL, NULL, '\0', "disable-recovery", "disable-recovery", 0, + &mapreduce_globals.disable_recovery, OPAL_CMD_LINE_TYPE_BOOL, + "Disable recovery (resets all recovery options to off)" }, + + /* End of list */ + { NULL, NULL, NULL, '\0', NULL, NULL, 0, + NULL, OPAL_CMD_LINE_TYPE_NULL, NULL } +}; + +/* + * Local functions + */ +static int create_app(int argc, char* argv[], + orte_app_context_t **app, + bool *made_app, char ***app_env, + orte_job_controls_t *jtype); +static int init_globals(void); +static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line); +static int parse_locals(int argc, char* argv[]); +static int parse_appfile(char *filename, char ***env); +static void do_wireup(int fd, short sd, void *cbdata); + +int main(int argc, char *argv[]) +{ + int rc, i, j; + opal_cmd_line_t cmd_line; + char *param; + orte_job_t *daemons; + orte_app_context_t *app, *dapp; + orte_job_t *jdata=NULL; + opal_list_t *chain; + opal_list_item_t *item; + + /* find our basename (the name of the executable) so that we can + use it in pretty-print error messages */ + orte_basename = opal_basename(argv[0]); + + /* Setup and parse the command line */ + init_globals(); + opal_cmd_line_create(&cmd_line, cmd_line_init); + mca_base_cmd_line_setup(&cmd_line); + if (OPAL_SUCCESS != (rc = opal_cmd_line_parse(&cmd_line, true, + argc, argv)) ) { + if (OPAL_ERR_SILENT != rc) { + fprintf(stderr, "%s: command line error (%s)\n", argv[0], + opal_strerror(rc)); + } + return rc; + } + + /* + * Since this process can now handle MCA/GMCA parameters, make sure to + * process them. 
+ */ + mca_base_cmd_line_process_args(&cmd_line, &environ, &environ); + + /* Ensure that enough of OPAL is setup for us to be able to run */ + /* + * NOTE: (JJH) + * We need to allow 'mca_base_cmd_line_process_args()' to process command + * line arguments *before* calling opal_init_util() since the command + * line could contain MCA parameters that affect the way opal_init_util() + * functions. AMCA parameters are one such option normally received on the + * command line that affect the way opal_init_util() behaves. + * It is "safe" to call mca_base_cmd_line_process_args() before + * opal_init_util() since mca_base_cmd_line_process_args() does *not* + * depend upon opal_init_util() functionality. + */ + /* Need to initialize OPAL so that install_dirs are filled in */ + if (OPAL_SUCCESS != opal_init_util(&argc, &argv)) { + exit(1); + } + + /* may look strange, but the way we handle prefix is a little weird + * and probably needs to be addressed more fully at some future point. + * For now, we have a conflict between app_files and cmd line usage. + * Since app_files are used by the C/R system, we will make an + * adjustment here to avoid perturbing that system. + * + * We cannot just have the cmd line parser place any found value + * in the global struct as the app_file parser would replace it. + * So handle this specific cmd line option manually. 
+ */ + mapreduce_globals.prefix = NULL; + mapreduce_globals.path_to_mpirun = NULL; + if (opal_cmd_line_is_taken(&cmd_line, "prefix") || + '/' == argv[0][0] || want_prefix_by_default) { + size_t param_len; + + if ('/' == argv[0][0]) { + char* tmp_basename = NULL; + /* If they specified an absolute path, strip off the + /bin/" and leave just the prefix */ + mapreduce_globals.path_to_mpirun = opal_dirname(argv[0]); + /* Quick sanity check to ensure we got + something/bin/ and that the installation + tree is at least more or less what we expect it to + be */ + tmp_basename = opal_basename(mapreduce_globals.path_to_mpirun); + if (0 == strcmp("bin", tmp_basename)) { + char* tmp = mapreduce_globals.path_to_mpirun; + mapreduce_globals.path_to_mpirun = opal_dirname(tmp); + free(tmp); + } else { + free(mapreduce_globals.path_to_mpirun); + mapreduce_globals.path_to_mpirun = NULL; + } + free(tmp_basename); + } + /* if both are given, check to see if they match */ + if (opal_cmd_line_is_taken(&cmd_line, "prefix") && NULL != mapreduce_globals.path_to_mpirun) { + /* if they don't match, then that merits a warning */ + param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0)); + if (0 != strcmp(param, mapreduce_globals.path_to_mpirun)) { + orte_show_help("help-orterun.txt", "orterun:double-prefix", + true, orte_basename, param, + mapreduce_globals.path_to_mpirun, orte_basename); + /* use the prefix over the path-to-mpirun so that + * people can specify the backend prefix as different + * from the local one + */ + free(mapreduce_globals.path_to_mpirun); + mapreduce_globals.path_to_mpirun = NULL; + } else { + /* since they match, just use param */ + free(mapreduce_globals.path_to_mpirun); + } + } else if (NULL != mapreduce_globals.path_to_mpirun) { + param = mapreduce_globals.path_to_mpirun; + } else if (opal_cmd_line_is_taken(&cmd_line, "prefix")){ + /* must be --prefix alone */ + param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0)); + } else { + /* 
--enable-mapreduce-prefix-default was given to mapreduce */ + param = strdup(opal_install_dirs.prefix); + } + + if (NULL != param) { + /* "Parse" the param, aka remove superfluous path_sep. */ + param_len = strlen(param); + while (0 == strcmp (OPAL_PATH_SEP, &(param[param_len-1]))) { + param[param_len-1] = '\0'; + param_len--; + if (0 == param_len) { + orte_show_help("help-orterun.txt", "orterun:empty-prefix", + true, orte_basename, orte_basename); + return ORTE_ERR_FATAL; + } + } + + mapreduce_globals.prefix = strdup(param); + free(param); + } + want_prefix_by_default = true; + } + + /* flag that I am the HNP - needs to be done prior to + * registering params + */ + orte_process_info.proc_type = ORTE_PROC_HNP; + + /* Setup MCA params */ + orte_register_params(); + + /*** NOTIFY IF DEPRECATED OPAL_PAFFINITY_ALONE WAS SET ***/ + if (opal_paffinity_alone) { + orte_show_help("help-opal-runtime.txt", + "opal_paffinity_alone:deprecated", + true); + } + + /* Check for some "global" command line params */ + parse_globals(argc, argv, &cmd_line); + OBJ_DESTRUCT(&cmd_line); + + /* Parse each app, adding it to a separate job in one or more MR-chains */ + OBJ_CONSTRUCT(&chains, opal_pointer_array_t); + opal_pointer_array_init(&chains, 1, INT_MAX, 1); + parse_locals(argc, argv); + + /* combine jobs as specified by user */ + if (mapreduce_globals.single_job) { + jdata = OBJ_NEW(orte_job_t); + for (i=0; i < chains.size; i++) { + if (NULL == (chain = (opal_list_t*)opal_pointer_array_get_item(&chains, i))) { + continue; + } + while (NULL != (item = opal_list_remove_first(chain))) { + orte_job_t *jptr = (orte_job_t*)item; + for (j=0; j < jptr->apps->size; j++) { + if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jptr->apps, j))) { + opal_pointer_array_add(jdata->apps, app); + jdata->num_apps++; + } + } + OBJ_RELEASE(item); + } + OBJ_RELEASE(chain); + opal_pointer_array_set_item(&chains, i, NULL); + } + chain = OBJ_NEW(opal_list_t); + opal_list_append(chain, 
&jdata->super); + opal_pointer_array_set_item(&chains, 0, chain); + orte_num_jobs = 1; + } + + /* save the environment for launch purposes. This MUST be + * done so that we can pass it to any local procs we + * spawn - otherwise, those local procs won't see any + * non-MCA envars were set in the enviro prior to calling + * mapreduce + */ + orte_launch_environ = opal_argv_copy(environ); + + /* purge any ess flag set externally */ + opal_unsetenv("OMPI_MCA_ess", &orte_launch_environ); + + /* flag mapreduce operations */ + orte_map_reduce = true; + + /* Intialize our Open RTE environment + * Set the flag telling orte_init that I am NOT a + * singleton, but am "infrastructure" - prevents setting + * up incorrect infrastructure that only a singleton would + * require + */ + if (ORTE_SUCCESS != (rc = orte_init(&argc, &argv, ORTE_PROC_HNP))) { + /* cannot call ORTE_ERROR_LOG as it could be the errmgr + * never got loaded! + */ + return rc; + } + /* finalize the OPAL utils. As they are opened again from orte_init->opal_init + * we continue to have a reference count on them. So we have to finalize them twice... + */ + opal_finalize_util(); + + /* get the daemon job object */ + daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); + + /* check for request to report uri */ + if (NULL != mapreduce_globals.report_uri) { + FILE *fp; + char *rml_uri; + rml_uri = orte_rml.get_contact_info(); + if (0 == strcmp(mapreduce_globals.report_uri, "-")) { + /* if '-', then output to stdout */ + printf("%s\n", (NULL == rml_uri) ? "NULL" : rml_uri); + } else if (0 == strcmp(mapreduce_globals.report_uri, "+")) { + /* if '+', output to stderr */ + fprintf(stderr, "%s\n", (NULL == rml_uri) ? "NULL" : rml_uri); + } else { + fp = fopen(mapreduce_globals.report_uri, "w"); + if (NULL == fp) { + orte_show_help("help-orterun.txt", "orterun:write_file", false, + orte_basename, "uri", mapreduce_globals.report_uri); + exit(0); + } + fprintf(fp, "%s\n", (NULL == rml_uri) ? 
"NULL" : rml_uri); + fclose(fp); + } + if (NULL != rml_uri) { + free(rml_uri); + } + } + + /* Change the default behavior of libevent such that we want to + continually block rather than blocking for the default timeout + and then looping around the progress engine again. There + should be nothing in the orted that cannot block in libevent + until "something" happens (i.e., there's no need to keep + cycling through progress because the only things that should + happen will happen in libevent). This is a minor optimization, + but what the heck... :-) */ + opal_progress_set_event_flag(OPAL_EVLOOP_ONCE); + + /* If we have a prefix, then modify the PATH and + LD_LIBRARY_PATH environment variables in our copy. This + will ensure that any locally-spawned children will + have our executables and libraries in their path + + For now, default to the prefix_dir provided in the first app_context. + Since there always MUST be at least one app_context, we are safe in + doing this. + */ + chain = (opal_list_t*)opal_pointer_array_get_item(&chains, 0); + jdata = (orte_job_t*)opal_list_get_first(chain); + if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0)) && + NULL != app->prefix_dir) { + char *oldenv, *newenv, *lib_base, *bin_base; + + /* copy the prefix into the daemon job so that any launcher + * can find the orteds when we launch the virtual machine + */ + if (NULL == (dapp = (orte_app_context_t*)opal_pointer_array_get_item(daemons->apps, 0))) { + /* that's an error in the ess */ + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + dapp->prefix_dir = strdup(app->prefix_dir); + + lib_base = opal_basename(opal_install_dirs.libdir); + bin_base = opal_basename(opal_install_dirs.bindir); + + /* Reset PATH */ + newenv = opal_os_path( false, app->prefix_dir, bin_base, NULL ); + oldenv = getenv("PATH"); + if (NULL != oldenv) { + char *temp; + asprintf(&temp, "%s:%s", newenv, oldenv ); + free( newenv ); + newenv = temp; + } + 
opal_setenv("PATH", newenv, true, &orte_launch_environ); + if (orte_debug_flag) { + opal_output(0, "%s: reset PATH: %s", orte_basename, newenv); + } + free(newenv); + free(bin_base); + + /* Reset LD_LIBRARY_PATH */ + newenv = opal_os_path( false, app->prefix_dir, lib_base, NULL ); + oldenv = getenv("LD_LIBRARY_PATH"); + if (NULL != oldenv) { + char* temp; + asprintf(&temp, "%s:%s", newenv, oldenv); + free(newenv); + newenv = temp; + } + opal_setenv("LD_LIBRARY_PATH", newenv, true, &orte_launch_environ); + if (orte_debug_flag) { + opal_output(0, "%s: reset LD_LIBRARY_PATH: %s", + orte_basename, newenv); + } + free(newenv); + free(lib_base); + } + + /* pre-condition any network transports that require it */ + for (i=0; i < chains.size; i++) { + if (NULL == (chain = (opal_list_t*)opal_pointer_array_get_item(&chains, i))) { + continue; + } + for (item = opal_list_get_first(chain); + item != opal_list_get_end(chain); + item = opal_list_get_next(item)) { + jdata = (orte_job_t*)item; + if (ORTE_SUCCESS != (rc = orte_pre_condition_transports(jdata))) { + ORTE_ERROR_LOG(rc); + orte_show_help("help-orterun.txt", "orterun:precondition", false, + orte_basename, NULL, NULL, rc); + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + goto DONE; + } + } + } + + /* setup to listen for commands sent specifically to me, even though I would probably + * be the one sending them! 
Unfortunately, since I am a participating daemon, + * there are times I need to send a command to "all daemons", and that means *I* have + * to receive it too + */ + rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON, + ORTE_RML_PERSISTENT, orte_daemon_recv, NULL); + if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) { + ORTE_ERROR_LOG(rc); + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + goto DONE; + } + + /* setup the data server */ + if (ORTE_SUCCESS != (rc = orte_data_server_init())) { + ORTE_ERROR_LOG(rc); + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + goto DONE; + } + + /* if an uri for the ompi-server was provided, set the route */ + if (NULL != ompi_server) { + opal_buffer_t buf; + /* setup our route to the server */ + OBJ_CONSTRUCT(&buf, opal_buffer_t); + opal_dss.pack(&buf, &ompi_server, 1, OPAL_STRING); + if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(&buf))) { + ORTE_ERROR_LOG(rc); + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + goto DONE; + } + OBJ_DESTRUCT(&buf); + /* check if we are to wait for the server to start - resolves + * a race condition that can occur when the server is run + * as a background job - e.g., in scripts + */ + if (mapreduce_globals.wait_for_server) { + /* ping the server */ + struct timeval timeout; + timeout.tv_sec = mapreduce_globals.server_wait_timeout; + timeout.tv_usec = 0; + if (ORTE_SUCCESS != (rc = orte_rml.ping(ompi_server, &timeout))) { + /* try it one more time */ + if (ORTE_SUCCESS != (rc = orte_rml.ping(ompi_server, &timeout))) { + /* okay give up */ + orte_show_help("help-orterun.txt", "orterun:server-not-found", true, + orte_basename, ompi_server, + (long)mapreduce_globals.server_wait_timeout, + ORTE_ERROR_NAME(rc)); + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + goto DONE; + } + } + } + } + + /* define a callback point for after jobs are assigned their jobids */ + if (ORTE_SUCCESS != (rc = 
orte_state.set_job_state_callback(ORTE_JOB_STATE_INIT_COMPLETE, do_wireup))) { + ORTE_ERROR_LOG(rc); + } + + /* spawn the jobs and their daemons */ + for (i=0; i < chains.size; i++) { + if (NULL == (chain = (opal_list_t*)opal_pointer_array_get_item(&chains, i))) { + continue; + } + for (item = opal_list_get_first(chain); + item != opal_list_get_end(chain); + item = opal_list_get_next(item)) { + jdata = (orte_job_t*)item; + if (ORTE_SUCCESS != orte_plm.spawn(jdata)) { + goto DONE; + } + } + } + + /* loop the event lib until an exit event is detected */ + while (orte_event_base_active) { + opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE); + } + + DONE: + /* update the exit status, in case it wasn't done */ + ORTE_UPDATE_EXIT_STATUS(orte_exit_status); + + /* cleanup and leave */ + orte_finalize(); + + if (orte_debug_flag) { + fprintf(stderr, "exiting with status %d\n", orte_exit_status); + } + exit(orte_exit_status); +} + +static int init_globals(void) +{ + /* Only CONSTRUCT things once */ + if (!globals_init) { + OBJ_CONSTRUCT(&mapreduce_globals.lock, opal_mutex_t); + mapreduce_globals.env_val = NULL; + mapreduce_globals.appfile = NULL; + mapreduce_globals.wdir = NULL; + mapreduce_globals.path = NULL; + mapreduce_globals.ompi_server = NULL; + mapreduce_globals.wait_for_server = false; + mapreduce_globals.server_wait_timeout = 10; + mapreduce_globals.stdin_target = "0"; + mapreduce_globals.report_pid = NULL; + mapreduce_globals.report_uri = NULL; + mapreduce_globals.disable_recovery = false; + mapreduce_globals.single_job = false; + mapreduce_globals.combiner_job = NULL; + } + + /* Reset the other fields every time */ + + mapreduce_globals.help = false; + mapreduce_globals.version = false; + mapreduce_globals.verbose = false; + mapreduce_globals.debugger = false; + mapreduce_globals.num_procs = 0; + if( NULL != mapreduce_globals.env_val ) + free( mapreduce_globals.env_val ); + mapreduce_globals.env_val = NULL; + if( NULL != mapreduce_globals.appfile ) + free( 
mapreduce_globals.appfile ); + mapreduce_globals.appfile = NULL; + if( NULL != mapreduce_globals.wdir ) + free( mapreduce_globals.wdir ); + mapreduce_globals.wdir = NULL; + if( NULL != mapreduce_globals.path ) + free( mapreduce_globals.path ); + mapreduce_globals.path = NULL; + + mapreduce_globals.preload_files = NULL; + mapreduce_globals.preload_files_dest_dir = NULL; + +#if OPAL_ENABLE_FT_CR == 1 + mapreduce_globals.sstore_load = NULL; +#endif + + mapreduce_globals.mapper = false; + mapreduce_globals.reducer = false; + mapreduce_globals.combiner = false; + + /* All done */ + globals_init = true; + return ORTE_SUCCESS; +} + + +static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line) +{ + /* print version if requested. Do this before check for help so + that --version --help works as one might expect. */ + if (mapreduce_globals.version) { + char *str, *project_name = NULL; + if (0 == strcmp(orte_basename, "mpirun")) { + project_name = "Open MPI"; + } else { + project_name = "OpenRTE"; + } + str = opal_show_help_string("help-orterun.txt", "orterun:version", + false, + orte_basename, project_name, OPAL_VERSION, + PACKAGE_BUGREPORT); + if (NULL != str) { + printf("%s", str); + free(str); + } + exit(0); + } + + /* Check for help request */ + if (mapreduce_globals.help) { + char *str, *args = NULL; + char *project_name = NULL; + if (0 == strcmp(orte_basename, "mpirun")) { + project_name = "Open MPI"; + } else { + project_name = "OpenRTE"; + } + args = opal_cmd_line_get_usage_msg(cmd_line); + str = opal_show_help_string("help-orterun.txt", "orterun:usage", false, + orte_basename, project_name, OPAL_VERSION, + orte_basename, args, + PACKAGE_BUGREPORT); + if (NULL != str) { + printf("%s", str); + free(str); + } + free(args); + + /* If someone asks for help, that should be all we do */ + exit(0); + } + + /* check for request to report pid */ + if (NULL != mapreduce_globals.report_pid) { + FILE *fp; + if (0 == strcmp(mapreduce_globals.report_pid, "-")) { + 
/* if '-', then output to stdout */ + printf("%d\n", (int)getpid()); + } else if (0 == strcmp(mapreduce_globals.report_pid, "+")) { + /* if '+', output to stderr */ + fprintf(stderr, "%d\n", (int)getpid()); + } else { + fp = fopen(mapreduce_globals.report_pid, "w"); + if (NULL == fp) { + orte_show_help("help-orterun.txt", "orterun:write_file", false, + orte_basename, "pid", mapreduce_globals.report_pid); + exit(0); + } + fprintf(fp, "%d\n", (int)getpid()); + fclose(fp); + } + } + + /* if recovery was disabled on the cmd line, do so */ + if (mapreduce_globals.disable_recovery) { + orte_enable_recovery = false; + orte_max_restarts = 0; + } + + return ORTE_SUCCESS; +} + + +static int parse_locals(int argc, char* argv[]) +{ + int i, rc; + int temp_argc; + char **temp_argv, **env; + orte_app_context_t *app; + bool made_app; + orte_std_cntr_t j, size1; + orte_job_t *jdata; + orte_job_controls_t jtype; + opal_list_t *chain; + + /* if the ompi-server was given, then set it up here */ + if (NULL != mapreduce_globals.ompi_server) { + /* someone could have passed us a file instead of a uri, so + * we need to first check to see what we have - if it starts + * with "file", then we know it is a file. Otherwise, we assume + * it is a uri as provided by the ompi-server's output + * of an ORTE-standard string. Note that this is NOT a standard + * uri as it starts with the process name! 
+ */ + if (0 == strncmp(mapreduce_globals.ompi_server, "file", strlen("file")) || + 0 == strncmp(mapreduce_globals.ompi_server, "FILE", strlen("FILE"))) { + char input[1024], *filename; + FILE *fp; + + /* it is a file - get the filename */ + filename = strchr(mapreduce_globals.ompi_server, ':'); + if (NULL == filename) { + /* filename is not correctly formatted */ + orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-bad", true, + orte_basename, mapreduce_globals.ompi_server); + exit(1); + } + ++filename; /* space past the : */ + + if (0 >= strlen(filename)) { + /* they forgot to give us the name! */ + orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-missing", true, + orte_basename, mapreduce_globals.ompi_server); + exit(1); + } + + /* open the file and extract the uri */ + fp = fopen(filename, "r"); + if (NULL == fp) { /* can't find or read file! */ + orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-access", true, + orte_basename, mapreduce_globals.ompi_server); + exit(1); + } + if (NULL == fgets(input, 1024, fp)) { + /* something malformed about file */ + fclose(fp); + orte_show_help("help-orterun.txt", "orterun:ompi-server-file-bad", true, + orte_basename, mapreduce_globals.ompi_server, + orte_basename); + exit(1); + } + fclose(fp); + input[strlen(input)-1] = '\0'; /* remove newline */ + ompi_server = strdup(input); + } else if (0 == strncmp(mapreduce_globals.ompi_server, "pid", strlen("pid")) || + 0 == strncmp(mapreduce_globals.ompi_server, "PID", strlen("PID"))) { + opal_list_t hnp_list; + opal_list_item_t *item; + orte_hnp_contact_t *hnp; + char *ptr; + pid_t pid; + + ptr = strchr(mapreduce_globals.ompi_server, ':'); + if (NULL == ptr) { + /* pid is not correctly formatted */ + orte_show_help("help-orterun.txt", "orterun:ompi-server-pid-bad", true, + orte_basename, orte_basename, + mapreduce_globals.ompi_server, orte_basename); + exit(1); + } + ++ptr; /* space past the : */ + + if (0 >= strlen(ptr)) { + /* they 
forgot to give us the pid! */ + orte_show_help("help-orterun.txt", "orterun:ompi-server-pid-bad", true, + orte_basename, orte_basename, + mapreduce_globals.ompi_server, orte_basename); + exit(1); + } + + pid = strtoul(ptr, NULL, 10); + + /* to search the local mpirun's, we have to partially initialize the + * orte_process_info structure. This won't fully be setup until orte_init, + * but we finagle a little bit of it here + */ + if (ORTE_SUCCESS != (rc = orte_session_dir_get_name(NULL, &orte_process_info.tmpdir_base, + &orte_process_info.top_session_dir, + NULL, NULL, NULL))) { + orte_show_help("help-orterun.txt", "orterun:ompi-server-could-not-get-hnp-list", true, + orte_basename, orte_basename); + exit(1); + } + + OBJ_CONSTRUCT(&hnp_list, opal_list_t); + + /* get the list of HNPs, but do -not- setup contact info to them in the RML */ + if (ORTE_SUCCESS != (rc = orte_list_local_hnps(&hnp_list, false))) { + orte_show_help("help-orterun.txt", "orterun:ompi-server-could-not-get-hnp-list", true, + orte_basename, orte_basename); + exit(1); + } + + /* search the list for the desired pid */ + while (NULL != (item = opal_list_remove_first(&hnp_list))) { + hnp = (orte_hnp_contact_t*)item; + if (pid == hnp->pid) { + ompi_server = strdup(hnp->rml_uri); + goto hnp_found; + } + OBJ_RELEASE(item); + } + /* if we got here, it wasn't found */ + orte_show_help("help-orterun.txt", "orterun:ompi-server-pid-not-found", true, + orte_basename, orte_basename, pid, mapreduce_globals.ompi_server, + orte_basename); + OBJ_DESTRUCT(&hnp_list); + exit(1); + hnp_found: + /* cleanup rest of list */ + while (NULL != (item = opal_list_remove_first(&hnp_list))) { + OBJ_RELEASE(item); + } + OBJ_DESTRUCT(&hnp_list); + } else { + ompi_server = strdup(mapreduce_globals.ompi_server); + } + } + + /* Make the apps */ + + temp_argc = 0; + temp_argv = NULL; + opal_argv_append(&temp_argc, &temp_argv, argv[0]); + + /* NOTE: This bogus env variable is necessary in the calls to + create_app(), below. 
See comment immediately before the + create_app() function for an explanation. */ + + env = NULL; + for (i = 1; i < argc; ++i) { + if (0 == strcmp(argv[i], ":")) { + /* Make an app with this argv */ + if (opal_argv_count(temp_argv) > 1) { + if (NULL != env) { + opal_argv_free(env); + env = NULL; + } + app = NULL; + rc = create_app(temp_argc, temp_argv, &app, &made_app, &env, &jtype); + /** keep track of the number of apps - point this app_context to that index */ + if (ORTE_SUCCESS != rc) { + /* Assume that the error message has already been + printed; no need to cleanup -- we can just + exit */ + exit(1); + } + if (made_app) { + app->idx = 0; + jdata = OBJ_NEW(orte_job_t); + jdata->controls |= jtype; + opal_pointer_array_add(jdata->apps, app); + ++jdata->num_apps; + if (ORTE_JOB_CONTROL_MAPPER == jtype) { + chain = OBJ_NEW(opal_list_t); + opal_pointer_array_add(&chains, chain); + } else if (ORTE_JOB_CONTROL_COMBINER == jtype) { + chain = OBJ_NEW(opal_list_t); + opal_pointer_array_add(&chains, chain); + /* flag the combiner job */ + if (NULL != mapreduce_globals.combiner_job) { + /* cannot have more than one combiner job */ + orte_show_help("help-orterun.txt", "multiple-combiners", true); + exit(1); + } + mapreduce_globals.combiner_job = jdata; + } + opal_list_append(chain, &jdata->super); + /* track number of jobs */ + orte_num_jobs++; + } + + /* Reset the temps */ + + temp_argc = 0; + temp_argv = NULL; + opal_argv_append(&temp_argc, &temp_argv, argv[0]); + } + } else { + opal_argv_append(&temp_argc, &temp_argv, argv[i]); + } + } + + if (opal_argv_count(temp_argv) > 1) { + app = NULL; + rc = create_app(temp_argc, temp_argv, &app, &made_app, &env, &jtype); + if (ORTE_SUCCESS != rc) { + /* Assume that the error message has already been printed; + no need to cleanup -- we can just exit */ + exit(1); + } + if (made_app) { + app->idx = 0; + jdata = OBJ_NEW(orte_job_t); + jdata->controls |= jtype; + opal_pointer_array_add(jdata->apps, app); + ++jdata->num_apps; + if 
(ORTE_JOB_CONTROL_MAPPER == jtype) { + chain = OBJ_NEW(opal_list_t); + opal_pointer_array_add(&chains, chain); + } else if (ORTE_JOB_CONTROL_COMBINER == jtype) { + chain = OBJ_NEW(opal_list_t); + opal_pointer_array_add(&chains, chain); + /* flag the combiner job */ + if (NULL != mapreduce_globals.combiner_job) { + /* cannot have more than one combiner job */ + orte_show_help("help-orterun.txt", "multiple-combiners", true); + exit(1); + } + mapreduce_globals.combiner_job = jdata; + } + opal_list_append(chain, &jdata->super); + /* track number of jobs */ + orte_num_jobs++; + } + } + if (NULL != env) { + opal_argv_free(env); + } + opal_argv_free(temp_argv); + + /* Once we've created all the apps, add the global MCA params to + each app's environment (checking for duplicates, of + course -- yay opal_environ_merge()). */ + + if (NULL != global_mca_env) { + size1 = (size_t)opal_pointer_array_get_size(jdata->apps); + /* Iterate through all the apps */ + for (j = 0; j < size1; ++j) { + app = (orte_app_context_t *) + opal_pointer_array_get_item(jdata->apps, j); + if (NULL != app) { + /* Use handy utility function */ + env = opal_environ_merge(global_mca_env, app->env); + opal_argv_free(app->env); + app->env = env; + } + } + } + + /* Now take a subset of the MCA params and set them as MCA + overrides here in mapreduce (so that when we orte_init() later, + all the components see these MCA params). Here's how we decide + which subset of the MCA params we set here in mapreduce: + + 1. If any global MCA params were set, use those + 2. If no global MCA params were set and there was only one app, + then use its app MCA params + 3. 
Otherwise, don't set any + */ + + env = NULL; + if (NULL != global_mca_env) { + env = global_mca_env; + } else { + if (opal_pointer_array_get_size(jdata->apps) >= 1) { + /* Remember that pointer_array's can be padded with NULL + entries; so only use the app's env if there is exactly + 1 non-NULL entry */ + app = (orte_app_context_t *) + opal_pointer_array_get_item(jdata->apps, 0); + if (NULL != app) { + env = app->env; + for (j = 1; j < opal_pointer_array_get_size(jdata->apps); ++j) { + if (NULL != opal_pointer_array_get_item(jdata->apps, j)) { + env = NULL; + break; + } + } + } + } + } + + if (NULL != env) { + size1 = opal_argv_count(env); + for (j = 0; j < size1; ++j) { + /* Use-after-Free error possible here. putenv does not copy + * the string passed to it, and instead stores only the pointer. + * env[j] may be freed later, in which case the pointer + * in environ will now be left dangling into a deallocated + * region. + * So we make a copy of the variable. + */ + char *s = strdup(env[j]); + + if (NULL == s) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + putenv(s); + } + } + + /* All done */ + + return ORTE_SUCCESS; +} + + +static int capture_cmd_line_params(int argc, int start, char **argv) +{ + int i, j, k; + bool ignore; + char *no_dups[] = { + "grpcomm", + "odls", + "rml", + "routed", + NULL + }; + + for (i = 0; i < (argc-start); ++i) { + if (0 == strcmp("-mca", argv[i]) || + 0 == strcmp("--mca", argv[i]) ) { + /* It would be nice to avoid increasing the length + * of the orted cmd line by removing any non-ORTE + * params. However, this raises a problem since + * there could be OPAL directives that we really + * -do- want the orted to see - it's only the OMPI + * related directives we could ignore. This becomes + * a very complicated procedure, however, since + * the OMPI mca params are not cleanly separated - so + * filtering them out is nearly impossible. 
+ * + * see if this is already present so we at least can + * avoid growing the cmd line with duplicates + */ + ignore = false; + if (NULL != orted_cmd_line) { + for (j=0; NULL != orted_cmd_line[j]; j++) { + if (0 == strcmp(argv[i+1], orted_cmd_line[j])) { + /* already here - if the value is the same, + * we can quitely ignore the fact that they + * provide it more than once. However, some + * frameworks are known to have problems if the + * value is different. We don't have a good way + * to know this, but we at least make a crude + * attempt here to protect ourselves. + */ + if (0 == strcmp(argv[i+2], orted_cmd_line[j+1])) { + /* values are the same */ + ignore = true; + break; + } else { + /* values are different - see if this is a problem */ + for (k=0; NULL != no_dups[k]; k++) { + if (0 == strcmp(no_dups[k], argv[i+1])) { + /* print help message + * and abort as we cannot know which one is correct + */ + orte_show_help("help-orterun.txt", "orterun:conflicting-params", + true, orte_basename, argv[i+1], + argv[i+2], orted_cmd_line[j+1]); + return ORTE_ERR_BAD_PARAM; + } + } + /* this passed muster - just ignore it */ + ignore = true; + break; + } + } + } + } + if (!ignore) { + opal_argv_append_nosize(&orted_cmd_line, argv[i]); + opal_argv_append_nosize(&orted_cmd_line, argv[i+1]); + opal_argv_append_nosize(&orted_cmd_line, argv[i+2]); + } + i += 2; + } + } + + return ORTE_SUCCESS; +} + + +/* + * This function takes a "char ***app_env" parameter to handle the + * specific case: + * + * mapreduce --mca foo bar -app appfile + * + * That is, we'll need to keep foo=bar, but the presence of the app + * file will cause an invocation of parse_appfile(), which will cause + * one or more recursive calls back to create_app(). Since the + * foo=bar value applies globally to all apps in the appfile, we need + * to pass in the "base" environment (that contains the foo=bar value) + * when we parse each line in the appfile. 
+ * + * This is really just a special case -- when we have a simple case like: + * + * mapreduce --mca foo bar -np 4 hostname + * + * Then the upper-level function (parse_locals()) calls create_app() + * with a NULL value for app_env, meaning that there is no "base" + * environment that the app needs to be created from. + */ +static int create_app(int argc, char* argv[], + orte_app_context_t **app_ptr, + bool *made_app, char ***app_env, + orte_job_controls_t *jtype) +{ + opal_cmd_line_t cmd_line; + char cwd[OPAL_PATH_MAX]; + int i, j, count, rc; + char *param, *value, *value2; + orte_app_context_t *app = NULL; + bool cmd_line_made = false; + bool found = false; + char *appname; + + *made_app = false; + + /* Pre-process the command line if we are going to parse an appfile later. + * save any mca command line args so they can be passed + * separately to the daemons. + * Use Case: + * $ cat launch.appfile + * -np 1 -mca aaa bbb ./my-app -mca ccc ddd + * -np 1 -mca aaa bbb ./my-app -mca eee fff + * $ mpirun -np 2 -mca foo bar --app launch.appfile + * Only pick up '-mca foo bar' on this pass. + */ + if (NULL != mapreduce_globals.appfile) { + if (ORTE_SUCCESS != (rc = capture_cmd_line_params(argc, 0, argv))) { + goto cleanup; + } + } + + /* Parse application command line options. */ + + init_globals(); + opal_cmd_line_create(&cmd_line, cmd_line_init); + mca_base_cmd_line_setup(&cmd_line); + cmd_line_made = true; + rc = opal_cmd_line_parse(&cmd_line, true, argc, argv); + if (ORTE_SUCCESS != rc) { + goto cleanup; + } + mca_base_cmd_line_process_args(&cmd_line, app_env, &global_mca_env); + + /* Is there an appfile in here? 
*/ + + if (NULL != mapreduce_globals.appfile) { + OBJ_DESTRUCT(&cmd_line); + return parse_appfile(strdup(mapreduce_globals.appfile), app_env); + } + + /* Setup application context */ + + app = OBJ_NEW(orte_app_context_t); + opal_cmd_line_get_tail(&cmd_line, &count, &app->argv); + + /* See if we have anything left */ + + if (0 == count) { + orte_show_help("help-orterun.txt", "orterun:executable-not-specified", + true, orte_basename, orte_basename); + rc = ORTE_ERR_NOT_FOUND; + goto cleanup; + } + + /* + * Get mca parameters so we can pass them to the daemons. + * Use the count determined above to make sure we do not go past + * the executable name. Example: + * mpirun -np 2 -mca foo bar ./my-app -mca bip bop + * We want to pick up '-mca foo bar' but not '-mca bip bop' + */ + if (ORTE_SUCCESS != (rc = capture_cmd_line_params(argc, count, argv))) { + goto cleanup; + } + + /* Grab all OMPI_* environment variables */ + + app->env = opal_argv_copy(*app_env); + for (i = 0; NULL != environ[i]; ++i) { + if (0 == strncmp("OMPI_", environ[i], 5)) { + /* check for duplicate in app->env - this + * would have been placed there by the + * cmd line processor. By convention, we + * always let the cmd line override the + * environment + */ + param = strdup(environ[i]); + value = strchr(param, '='); + *value = '\0'; + value++; + opal_setenv(param, value, false, &app->env); + free(param); + } + } + + /* add the ompi-server, if provided */ + if (NULL != ompi_server) { + opal_setenv("OMPI_MCA_pubsub_orte_server", ompi_server, true, &app->env); + } + + /* Did the user request to export any environment variables on the cmd line? 
*/ + if (opal_cmd_line_is_taken(&cmd_line, "x")) { + j = opal_cmd_line_get_ninsts(&cmd_line, "x"); + for (i = 0; i < j; ++i) { + param = opal_cmd_line_get_param(&cmd_line, "x", i, 0); + + if (NULL != strchr(param, '=')) { + opal_argv_append_nosize(&app->env, param); + } else { + value = getenv(param); + if (NULL != value) { + if (NULL != strchr(value, '=')) { + opal_argv_append_nosize(&app->env, value); + } else { + asprintf(&value2, "%s=%s", param, value); + opal_argv_append_nosize(&app->env, value2); + free(value2); + } + } else { + opal_output(0, "Warning: could not find environment variable \"%s\"\n", param); + } + } + } + } + + /* Did the user request to export any environment variables via MCA param? */ + if (NULL != orte_forward_envars) { + char **vars; + vars = opal_argv_split(orte_forward_envars, ','); + for (i=0; NULL != vars[i]; i++) { + if (NULL != strchr(vars[i], '=')) { + /* user supplied a value */ + opal_argv_append_nosize(&app->env, vars[i]); + } else { + /* get the value from the environ */ + value = getenv(vars[i]); + if (NULL != value) { + if (NULL != strchr(value, '=')) { + opal_argv_append_nosize(&app->env, value); + } else { + asprintf(&value2, "%s=%s", vars[i], value); + opal_argv_append_nosize(&app->env, value2); + free(value2); + } + } else { + opal_output(0, "Warning: could not find environment variable \"%s\"\n", vars[i]); + } + } + } + opal_argv_free(vars); + } + + /* If the user specified --path, store it in the user's app + environment via the OMPI_exec_path variable. */ + if (NULL != mapreduce_globals.path) { + asprintf(&value, "OMPI_exec_path=%s", mapreduce_globals.path); + opal_argv_append_nosize(&app->env, value); + free(value); + } + + /* Did the user request a specific wdir? 
*/ + if (NULL != mapreduce_globals.wdir) { + /* if this is a relative path, convert it to an absolute path */ + if (opal_path_is_absolute(mapreduce_globals.wdir)) { + app->cwd = strdup(mapreduce_globals.wdir); + } else { + /* get the cwd */ + if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) { + orte_show_help("help-orterun.txt", "orterun:init-failure", + true, "get the cwd", rc); + goto cleanup; + } + /* construct the absolute path */ + app->cwd = opal_os_path(false, cwd, mapreduce_globals.wdir, NULL); + } + app->user_specified_cwd = true; + } else { + if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) { + orte_show_help("help-orterun.txt", "orterun:init-failure", + true, "get the cwd", rc); + goto cleanup; + } + app->cwd = strdup(cwd); + app->user_specified_cwd = false; + } + + /* if this is the first app_context, check for prefix directions. + * We only do this for the first app_context because the launchers + * only look at the first one when setting the prefix - we do NOT + * support per-app_context prefix settings! + */ + if (0 == total_num_apps) { + /* Check to see if the user explicitly wanted to disable automatic + --prefix behavior */ + + if (opal_cmd_line_is_taken(&cmd_line, "noprefix")) { + want_prefix_by_default = false; + } + + /* Did the user specify a prefix, or want prefix by default? 
*/ + if (opal_cmd_line_is_taken(&cmd_line, "prefix") || want_prefix_by_default) { + size_t param_len; + /* if both the prefix was given and we have a prefix + * given above, check to see if they match + */ + if (opal_cmd_line_is_taken(&cmd_line, "prefix") && + NULL != mapreduce_globals.prefix) { + /* if they don't match, then that merits a warning */ + param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0)); + if (0 != strcmp(param, mapreduce_globals.prefix)) { + orte_show_help("help-orterun.txt", "orterun:app-prefix-conflict", + true, orte_basename, mapreduce_globals.prefix, param); + /* let the global-level prefix take precedence since we + * know that one is being used + */ + free(param); + param = mapreduce_globals.prefix; + } else { + /* since they match, just use param */ + free(mapreduce_globals.prefix); + mapreduce_globals.prefix = NULL; + } + } else if (NULL != mapreduce_globals.prefix) { + param = mapreduce_globals.prefix; + } else if (opal_cmd_line_is_taken(&cmd_line, "prefix")){ + /* must be --prefix alone */ + param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0)); + } else { + /* --enable-orterun-prefix-default was given to mapreduce */ + param = strdup(opal_install_dirs.prefix); + } + + if (NULL != param) { + /* "Parse" the param, aka remove superfluous path_sep. */ + param_len = strlen(param); + while (0 == strcmp (OPAL_PATH_SEP, &(param[param_len-1]))) { + param[param_len-1] = '\0'; + param_len--; + if (0 == param_len) { + orte_show_help("help-orterun.txt", "orterun:empty-prefix", + true, orte_basename, orte_basename); + return ORTE_ERR_FATAL; + } + } + + app->prefix_dir = strdup(param); + free(param); + } + } + } + + /* Did the user specify a hostfile. Need to check for both + * hostfile and machine file. + * We can only deal with one hostfile per app context, otherwise give an error. 
+ */ + if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "hostfile"))) { + if(1 < j) { + orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles", + true, orte_basename, NULL); + return ORTE_ERR_FATAL; + } else { + value = opal_cmd_line_get_param(&cmd_line, "hostfile", 0, 0); + app->hostfile = strdup(value); + } + } + if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "machinefile"))) { + if(1 < j || NULL != app->hostfile) { + orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles", + true, orte_basename, NULL); + return ORTE_ERR_FATAL; + } else { + value = opal_cmd_line_get_param(&cmd_line, "machinefile", 0, 0); + app->hostfile = strdup(value); + } + } + + /* Did the user specify any hosts? */ + if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "host"))) { + for (i = 0; i < j; ++i) { + value = opal_cmd_line_get_param(&cmd_line, "host", i, 0); + opal_argv_append_nosize(&app->dash_host, value); + } + } + + /* Get the numprocs */ + app->num_procs = (orte_std_cntr_t)mapreduce_globals.num_procs; + total_num_apps++; + + /* Preserve if we are to preload the binary */ + app->preload_binary = orte_preload_binaries; + if( NULL != mapreduce_globals.preload_files) + app->preload_files = strdup(mapreduce_globals.preload_files); + else + app->preload_files = NULL; + if( NULL != mapreduce_globals.preload_files_dest_dir) + app->preload_files_dest_dir = strdup(mapreduce_globals.preload_files_dest_dir); + else + app->preload_files_dest_dir = NULL; + + /* flag type of app */ + if (mapreduce_globals.mapper) { + *jtype = ORTE_JOB_CONTROL_MAPPER; + } else if (mapreduce_globals.reducer) { + *jtype = ORTE_JOB_CONTROL_REDUCER; + } else if (mapreduce_globals.combiner) { + *jtype = ORTE_JOB_CONTROL_COMBINER; + } else { + /* should specify type - however, we will assume that + * the first app is the mapper and all others are reducers + */ + if (1 < total_num_apps) { + *jtype = ORTE_JOB_CONTROL_REDUCER; + } else { + *jtype = ORTE_JOB_CONTROL_MAPPER; + } + } + + /* Do not try to 
find argv[0] here -- the starter is responsible + for that because it may not be relevant to try to find it on + the node where mapreduce is executing. So just strdup() argv[0] + into app. */ + + app->app = strdup(app->argv[0]); + if (NULL == app->app) { + orte_show_help("help-orterun.txt", "orterun:call-failed", + true, orte_basename, "library", "strdup returned NULL", errno); + rc = ORTE_ERR_NOT_FOUND; + goto cleanup; + } + + /* if this is a Java application, we have a bit more work to do. Such + * applications actually need to be run under the Java virtual machine + * and the "java" command will start the "executable". So we need to ensure + * that all the proper java-specific paths are provided + */ + appname = opal_basename(app->app); + if (0 == strcmp(appname, "java")) { + /* see if we were given a library path */ + found = false; + for (i=0; NULL != app->argv[i]; i++) { + if (NULL != strstr(app->argv[i], "java.library.path")) { + /* yep - but does it include the path to the mpi libs? */ + found = true; + if (NULL == strstr(app->argv[i], opal_install_dirs.libdir)) { + /* doesn't appear to - add it to be safe */ + if (':' == app->argv[i][strlen(app->argv[i])-1]) { + asprintf(&value, "-Djava.library.path=%s%s", app->argv[i], opal_install_dirs.libdir); + } else { + asprintf(&value, "-Djava.library.path=%s:%s", app->argv[i], opal_install_dirs.libdir); + } + free(app->argv[i]); + app->argv[i] = value; + } + } + } + if (!found) { + /* need to add it right after the java command */ + asprintf(&value, "-Djava.library.path=%s", opal_install_dirs.libdir); + opal_argv_insert_element(&app->argv, 1, value); + free(value); + } + + /* see if we were given a class path */ + found = false; + for (i=0; NULL != app->argv[i]; i++) { + if (NULL != strstr(app->argv[i], "cp") || + NULL != strstr(app->argv[i], "classpath")) { + /* yep - but does it include the path to the mpi libs? 
*/ + found = true; + if (NULL == strstr(app->argv[i+1], "mpi.jar")) { + /* nope - need to add it */ + if (':' == app->argv[i+1][strlen(app->argv[i+1])-1]) { + asprintf(&value, "%s%s/mpi.jar", app->argv[i+1], opal_install_dirs.libdir); + } else { + asprintf(&value, "%s:%s/mpi.jar", app->argv[i+1], opal_install_dirs.libdir); + } + free(app->argv[i+1]); + app->argv[i+1] = value; + } + break; + } + } + if (!found) { + /* check to see if CLASSPATH is in the environment */ + for (i=0; NULL != environ[i]; i++) { + if (0 == strncmp(environ[i], "CLASSPATH", strlen("CLASSPATH"))) { + /* check if mpi.jar is present */ + if (NULL != strstr(environ[i], "mpi.jar")) { + /* yes - just add the envar to the argv in the + * right format + */ + value = strchr(environ[i], '='); + ++value; /* step over the = */ + opal_argv_insert_element(&app->argv, 1, value); + opal_argv_insert_element(&app->argv, 1, "-cp"); + } else { + /* need to add it */ + value = strchr(environ[i], '='); + ++value; /* step over the = */ + if (':' == value[strlen(value)-1]) { + asprintf(&param, "%s%s/mpi.jar", value, opal_install_dirs.libdir); + } else { + asprintf(&param, "%s:%s/mpi.jar", value, opal_install_dirs.libdir); + } + opal_argv_insert_element(&app->argv, 1, param); + opal_argv_insert_element(&app->argv, 1, "-cp"); + free(param); + } + found = true; + break; + } + } + if (!found) { + /* need to add it right after the java command - have + * to include the current directory and trust that + * the user set cwd if necessary + */ + asprintf(&value, ".:%s/mpi.jar", opal_install_dirs.libdir); + opal_argv_insert_element(&app->argv, 1, value); + free(value); + opal_argv_insert_element(&app->argv, 1, "-cp"); + } + } + } + free(appname); + if (mapreduce_globals.verbose) { + value = opal_argv_join(app->argv, ' '); + free(value); + } + + *app_ptr = app; + app = NULL; + *made_app = true; + + /* All done */ + + cleanup: + if (NULL != app) { + OBJ_RELEASE(app); + } + if (cmd_line_made) { + OBJ_DESTRUCT(&cmd_line); + } + return 
rc; +} + + +static int parse_appfile(char *filename, char ***env) +{ + size_t i, len; + FILE *fp; + char line[BUFSIZ]; + int rc, argc; + char **argv; + orte_app_context_t *app; + bool blank, made_app; + char bogus[] = "bogus "; + char **tmp_env; + orte_job_t *jdata; + orte_job_controls_t jtype; + opal_list_t *chain; + + /* + * Make sure to clear out this variable so we don't do anything odd in + * app_create() + */ + if( NULL != mapreduce_globals.appfile ) { + free( mapreduce_globals.appfile ); + mapreduce_globals.appfile = NULL; + } + + /* Try to open the file */ + + fp = fopen(filename, "r"); + if (NULL == fp) { + orte_show_help("help-orterun.txt", "orterun:appfile-not-found", true, + filename); + return ORTE_ERR_NOT_FOUND; + } + + /* Read in line by line */ + + line[sizeof(line) - 1] = '\0'; + do { + + /* We need a bogus argv[0] (because when argv comes in from + the command line, argv[0] is "mapreduce", so the parsing + logic ignores it). So create one here rather than making + an argv and then pre-pending a new argv[0] (which would be + rather inefficient). */ + + line[0] = '\0'; + strcat(line, bogus); + + if (NULL == fgets(line + sizeof(bogus) - 1, + sizeof(line) - sizeof(bogus) - 1, fp)) { + break; + } + + /* Remove a trailing newline */ + + len = strlen(line); + if (len > 0 && '\n' == line[len - 1]) { + line[len - 1] = '\0'; + if (len > 0) { + --len; + } + } + + /* Remove comments */ + + for (i = 0; i < len; ++i) { + if ('#' == line[i]) { + line[i] = '\0'; + break; + } else if (i + 1 < len && '/' == line[i] && '/' == line[i + 1]) { + line[i] = '\0'; + break; + } + } + + /* Is this a blank line? */ + + len = strlen(line); + for (blank = true, i = sizeof(bogus); i < len; ++i) { + if (!isspace(line[i])) { + blank = false; + break; + } + } + if (blank) { + continue; + } + + /* We got a line with *something* on it. 
So process it */ + + argv = opal_argv_split(line, ' '); + argc = opal_argv_count(argv); + if (argc > 0) { + + /* Create a temporary env to use in the recursive call -- + that is: don't disturb the original env so that we can + have a consistent global env. This allows for the + case: + + mapreduce --mca foo bar --appfile file + + where the "file" contains multiple apps. In this case, + each app in "file" will get *only* foo=bar as the base + environment from which its specific environment is + constructed. */ + + if (NULL != *env) { + tmp_env = opal_argv_copy(*env); + if (NULL == tmp_env) { + return ORTE_ERR_OUT_OF_RESOURCE; + } + } else { + tmp_env = NULL; + } + + rc = create_app(argc, argv, &app, &made_app, &tmp_env, &jtype); + if (ORTE_SUCCESS != rc) { + /* Assume that the error message has already been + printed; no need to cleanup -- we can just exit */ + exit(1); + } + if (NULL != tmp_env) { + opal_argv_free(tmp_env); + } + if (made_app) { + app->idx = 0; + jdata = OBJ_NEW(orte_job_t); + jdata->controls |= jtype; + opal_pointer_array_add(jdata->apps, app); + ++jdata->num_apps; + if (ORTE_JOB_CONTROL_MAPPER == jtype) { + chain = OBJ_NEW(opal_list_t); + opal_pointer_array_add(&chains, chain); + } else if (ORTE_JOB_CONTROL_COMBINER == jtype) { + chain = OBJ_NEW(opal_list_t); + opal_pointer_array_add(&chains, chain); + /* flag the combiner job */ + if (NULL != mapreduce_globals.combiner_job) { + /* cannot have more than one combiner job */ + orte_show_help("help-orterun.txt", "multiple-combiners", true); + exit(1); + } + mapreduce_globals.combiner_job = jdata; + } + opal_list_append(chain, &jdata->super); + /* track number of jobs */ + orte_num_jobs++; + } + } + } while (!feof(fp)); + fclose(fp); + + /* All done */ + + free(filename); + return ORTE_SUCCESS; +} + +static int num_cbacks=0; + +static void do_wireup(int fd, short sd, void *cbdata) +{ + opal_list_item_t *item, *itm; + opal_list_t *chain; + int i; + orte_job_t *jdata, *jptr; + + /* track the number of 
callbacks */ + num_cbacks++; + + /* if all jobs have completed this phase, then assign + * wireup targets + */ + if (num_cbacks == orte_num_jobs) { + for (i=0; i < chains.size; i++) { + if (NULL == (chain = (opal_list_t*)opal_pointer_array_get_item(&chains, i))) { + continue; + } + for (item = opal_list_get_first(chain); + item != opal_list_get_end(chain); + item = opal_list_get_next(item)) { + jdata = (orte_job_t*)item; + /* ensure stdin is pulled for each job */ + /* see where this job's stdout should go */ + if (ORTE_JOB_CONTROL_MAPPER & jdata->controls) { + jdata->stdin_target = ORTE_VPID_WILDCARD; + /* mappers send their output to the next job in the chain */ + itm = opal_list_get_next(item); + if (itm != opal_list_get_end(chain)) { + jptr = (orte_job_t*)itm; + jdata->stdout_target = jptr->jobid; + } + } else if (ORTE_JOB_CONTROL_REDUCER & jdata->controls) { + jdata->stdin_target = ORTE_VPID_WILDCARD; + /* reducer feeds its output to the next job in the chain, if it exists */ + itm = opal_list_get_next(item); + if (itm != opal_list_get_end(chain)) { + jptr = (orte_job_t*)itm; + jdata->stdout_target = jptr->jobid; + } else { + /* if a combiner exists, then feed the output there */ + if (NULL != mapreduce_globals.combiner_job) { + jdata->stdout_target = mapreduce_globals.combiner_job->jobid; + } + } + } else if (!(ORTE_JOB_CONTROL_COMBINER & jdata->controls)) { + /* should have been something */ + orte_show_help("help-orterun.txt", "orterun:unrecognized-mr-type", + true, orte_basename); + exit(ORTE_ERROR_DEFAULT_EXIT_CODE); + } + } + } + /* now send all the jobs to allocate, in reverse order + * to ensure that all recipients of input are in place + * BEFORE a source begins to generate output + */ + for (i=chains.size-1; 0 <= i; i--) { + if (NULL == (chain = (opal_list_t*)opal_pointer_array_get_item(&chains, i))) { + continue; + } + while (NULL != (item = opal_list_remove_last(chain))) { + jdata = (orte_job_t*)item; + ORTE_ACTIVATE_JOB_STATE(jdata, 
ORTE_JOB_STATE_ALLOCATE); + } + } + } +} diff --git a/orte/tools/orterun/help-orterun.txt b/orte/tools/orterun/help-orterun.txt index 4e89c3dd20..8cfec11fb5 100644 --- a/orte/tools/orterun/help-orterun.txt +++ b/orte/tools/orterun/help-orterun.txt @@ -621,3 +621,12 @@ the job to be terminated. The first process to do so was: Process name: %s Exit code: %d # +[orterun:unrecognized-mr-type] +%s does not recognize the type of job. This should not happen and +indicates an ORTE internal problem. +# +[multiple-combiners] +More than one combiner was specified. The combiner takes the output +from the final reducer in each chain to produce a single, combined +result. Thus, there can only be one combiner for a job. Please +review your command line and try again. diff --git a/orte/util/error_strings.c b/orte/util/error_strings.c index 2c55788018..26c75db9a2 100644 --- a/orte/util/error_strings.c +++ b/orte/util/error_strings.c @@ -203,6 +203,8 @@ const char *orte_job_state_to_str(orte_job_state_t state) return "UNDEFINED"; case ORTE_JOB_STATE_INIT: return "PENDING INIT"; + case ORTE_JOB_STATE_INIT_COMPLETE: + return "INIT_COMPLETE"; case ORTE_JOB_STATE_ALLOCATE: return "PENDING ALLOCATION"; case ORTE_JOB_STATE_MAP: diff --git a/orte/util/nidmap.c b/orte/util/nidmap.c index 5eab0eb0ad..f39789147f 100644 --- a/orte/util/nidmap.c +++ b/orte/util/nidmap.c @@ -9,6 +9,9 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. + * All rights reserved. 
+ * * $COPYRIGHT$ * * Additional copyrights may follow @@ -1103,13 +1106,15 @@ int orte_util_decode_daemon_pidmap(opal_byte_object_t *bo) #endif orte_std_cntr_t n; opal_buffer_t buf; - int rc, j; + int rc, j, k; orte_job_t *jdata; orte_proc_t *proc, *pptr; - orte_node_t *node; + orte_node_t *node, *nptr; orte_proc_state_t *states=NULL; orte_app_idx_t *app_idx=NULL; int32_t *restarts=NULL; + orte_job_map_t *map; + bool found; /* xfer the byte object to a buffer for unpacking */ OBJ_CONSTRUCT(&buf, opal_buffer_t); @@ -1212,6 +1217,11 @@ int orte_util_decode_daemon_pidmap(opal_byte_object_t *bo) } /* xfer the data */ + map = jdata->map; + if (NULL == map) { + jdata->map = OBJ_NEW(orte_job_map_t); + map = jdata->map; + } for (i=0; i < num_procs; i++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { proc = OBJ_NEW(orte_proc_t); @@ -1231,6 +1241,21 @@ int orte_util_decode_daemon_pidmap(opal_byte_object_t *bo) OBJ_RELEASE(pptr); opal_pointer_array_set_item(proc->node->procs, j, NULL); proc->node->num_procs--; + if (0 == proc->node->num_procs) { + /* remove node from the map */ + for (k=0; k < map->nodes->size; k++) { + if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(map->nodes, k))) { + continue; + } + if (nptr == proc->node) { + /* maintain accounting */ + OBJ_RELEASE(nptr); + opal_pointer_array_set_item(map->nodes, k, NULL); + map->num_nodes--; + break; + } + } + } break; } } @@ -1242,6 +1267,21 @@ int orte_util_decode_daemon_pidmap(opal_byte_object_t *bo) node = OBJ_NEW(orte_node_t); opal_pointer_array_set_item(orte_node_pool, nodes[i], node); } + /* see if this node is already in the map */ + found = false; + for (j=0; j < map->nodes->size; j++) { + if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(map->nodes, j))) { + continue; + } + if (nptr == node) { + found = true; + break; + } + } + if (!found) { + opal_pointer_array_add(map->nodes, node); + map->num_nodes++; + } /* add the node to the proc */ 
OBJ_RETAIN(node); proc->node = node;