
Extend the IOF framework by adding two new components to support map-reduce IO chaining. Add a mapreduce tool for running such applications.

Fix the state machine to support multiple jobs being launched simultaneously, as this is required not only for mapreduce but can also happen with comm_spawn applications.

This commit was SVN r26380.
This commit is contained in:
Ralph Castain 2012-05-02 21:00:22 +00:00
parent 40c2fc5f55
commit b2f77bf08f
49 changed files with 6931 additions and 196 deletions
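For reference, the chaining this commit introduces hinges on each job naming the job that should consume its stdout. A minimal sketch in C of that linkage, assuming a simplified setup (the stdout_target field and the ORTE_JOBID_INVALID end-of-chain sentinel appear in the new code below; the helper function itself is illustrative only):

#include "orte/runtime/orte_globals.h"

/* illustrative helper - not part of the commit */
static void chain_jobs(orte_job_t *mapper, orte_job_t *reducer)
{
    /* the mapper's stdout is forwarded to the stdin of the reducer's procs */
    mapper->stdout_target = reducer->jobid;
    /* the reducer ends the chain, so its stdout is written out normally */
    reducer->stdout_target = ORTE_JOBID_INVALID;
}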


@@ -4,7 +4,7 @@
# Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2011 Los Alamos National Security, LLC. All rights
# Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
#
@@ -32,5 +32,6 @@ AC_DEFUN([ORTE_CONFIG_FILES],[
orte/tools/orte-top/Makefile
orte/tools/orte-migrate/Makefile
orte/tools/orte-info/Makefile
orte/tools/mapreduce/Makefile
])
])


@@ -86,7 +86,7 @@ void orte_grpcomm_base_xcast_recv(int status, orte_process_name_t* sender,
* knows what to do - it will also free the bytes in the bo
*/
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:xcast updating nidmap",
"%s grpcomm:base:xcast updating daemon nidmap",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
if (ORTE_SUCCESS != (ret = orte_ess.update_nidmap(bo))) {


@@ -10,6 +10,8 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -42,7 +44,10 @@
#include <signal.h>
#endif
#include "opal/class/opal_list.h"
#include "opal/class/opal_bitmap.h"
#include "opal/mca/mca.h"
#include "opal/mca/event/event.h"
#include "orte/mca/iof/iof.h"
#include "orte/runtime/orte_globals.h"
@@ -53,6 +58,14 @@ ORTE_DECLSPEC int orte_iof_base_open(void);
#if !ORTE_DISABLE_FULL_SUPPORT
/* track xon/xoff of processes */
typedef struct {
opal_object_t super;
orte_job_t *jdata;
opal_bitmap_t xoff;
} orte_iof_job_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_iof_job_t);
/*
* Maximum size of single msg
*/
@@ -76,10 +89,7 @@ typedef struct {
orte_process_name_t daemon;
orte_iof_tag_t tag;
orte_iof_write_event_t *wev;
#if OPAL_ENABLE_DEBUG
char *file;
int line;
#endif
bool xoff;
} orte_iof_sink_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_iof_sink_t);
@@ -90,10 +100,6 @@ typedef struct {
int fd;
orte_iof_tag_t tag;
bool active;
#if OPAL_ENABLE_DEBUG
char *file;
int line;
#endif
} orte_iof_read_event_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_iof_read_event_t);
@@ -103,6 +109,7 @@ typedef struct {
orte_iof_read_event_t *revstdout;
orte_iof_read_event_t *revstderr;
orte_iof_read_event_t *revstddiag;
orte_iof_sink_t *sink;
} orte_iof_proc_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_iof_proc_t);
@@ -116,6 +123,7 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_iof_write_output_t);
/* the iof globals struct */
struct orte_iof_base_t {
int iof_output;
char *input_files;
opal_list_t iof_components_opened;
opal_mutex_t iof_write_output_lock;
orte_iof_sink_t *iof_write_stdout;
@@ -124,8 +132,6 @@ struct orte_iof_base_t {
typedef struct orte_iof_base_t orte_iof_base_t;
#if OPAL_ENABLE_DEBUG
#define ORTE_IOF_SINK_DEFINE(snk, nm, fid, tg, wrthndlr, eplist) \
do { \
orte_iof_sink_t *ep; \
@@ -147,8 +153,6 @@ typedef struct orte_iof_base_t orte_iof_base_t;
opal_list_append((eplist), &ep->super); \
} \
*(snk) = ep; \
ep->file = strdup(__FILE__); \
ep->line = __LINE__; \
} while(0);
/* add list of structs that has name of proc + orte_iof_tag_t - when
@@ -171,8 +175,6 @@ typedef struct orte_iof_base_t orte_iof_base_t;
rev->tag = (tg); \
rev->fd = (fid); \
*(rv) = rev; \
rev->file = strdup(__FILE__); \
rev->line = __LINE__; \
opal_event_set(orte_event_base, \
rev->ev, (fid), \
OPAL_EV_READ, \
@@ -184,49 +186,6 @@ typedef struct orte_iof_base_t orte_iof_base_t;
} while(0);
#else
#define ORTE_IOF_SINK_DEFINE(snk, nm, fid, tg, wrthndlr, eplist) \
do { \
orte_iof_sink_t *ep; \
ep = OBJ_NEW(orte_iof_sink_t); \
ep->name.jobid = (nm)->jobid; \
ep->name.vpid = (nm)->vpid; \
ep->tag = (tg); \
if (0 <= (fid)) { \
ep->wev->fd = (fid); \
opal_event_set(orte_event_base, \
ep->wev->ev, ep->wev->fd, \
OPAL_EV_WRITE, \
wrthndlr, ep); \
} \
if (NULL != (eplist)) { \
opal_list_append((eplist), &ep->super); \
} \
*(snk) = ep; \
} while(0);
#define ORTE_IOF_READ_EVENT(rv, nm, fid, tg, cbfunc, actv) \
do { \
orte_iof_read_event_t *rev; \
rev = OBJ_NEW(orte_iof_read_event_t); \
rev->name.jobid = (nm)->jobid; \
rev->name.vpid = (nm)->vpid; \
rev->tag = (tg); \
rev->fd = (fid); \
*(rv) = rev; \
opal_event_set(orte_event_base, \
rev->ev, (fid), \
OPAL_EV_READ, \
(cbfunc), rev); \
if ((actv)) { \
rev->active = true; \
opal_event_add(rev->ev, 0); \
} \
} while(0);
#endif
ORTE_DECLSPEC int orte_iof_base_close(void);
ORTE_DECLSPEC int orte_iof_base_select(void);
ORTE_DECLSPEC int orte_iof_base_flush(void);


@@ -63,11 +63,29 @@ int orte_iof_base_open(void)
#else
/* class instances */
static void orte_iof_job_construct(orte_iof_job_t *ptr)
{
ptr->jdata = NULL;
OBJ_CONSTRUCT(&ptr->xoff, opal_bitmap_t);
}
static void orte_iof_job_destruct(orte_iof_job_t *ptr)
{
if (NULL != ptr->jdata) {
OBJ_RELEASE(ptr->jdata);
}
OBJ_DESTRUCT(&ptr->xoff);
}
OBJ_CLASS_INSTANCE(orte_iof_job_t,
opal_object_t,
orte_iof_job_construct,
orte_iof_job_destruct);
static void orte_iof_base_proc_construct(orte_iof_proc_t* ptr)
{
ptr->revstdout = NULL;
ptr->revstderr = NULL;
ptr->revstddiag = NULL;
ptr->sink = NULL;
}
static void orte_iof_base_proc_destruct(orte_iof_proc_t* ptr)
{
@@ -92,6 +110,7 @@ static void orte_iof_base_sink_construct(orte_iof_sink_t* ptr)
ptr->daemon.jobid = ORTE_JOBID_INVALID;
ptr->daemon.vpid = ORTE_VPID_INVALID;
ptr->wev = OBJ_NEW(orte_iof_write_event_t);
ptr->xoff = false;
}
static void orte_iof_base_sink_destruct(orte_iof_sink_t* ptr)
{
@@ -205,6 +224,11 @@ int orte_iof_base_open(void)
}
}
/* check for files to be sent to stdin of procs */
mca_base_param_reg_string_name("iof", "base_input_files",
"Comma-separated list of input files to be read and sent to stdin of procs (default: NULL)",
false, false, NULL, &orte_iof_base.input_files);
/* daemons do not need to do this as they do not write out stdout/err */
if (!ORTE_PROC_IS_DAEMON ||
(ORTE_PROC_IS_DAEMON && ORTE_PROC_IS_CM)) {
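The iof_base_input_files parameter registered above is an ordinary MCA string parameter, so it would presumably be supplied on the command line in the usual way, e.g. "-mca iof_base_input_files map1.txt,map2.txt" (file names illustrative only); the selected IOF component can then split the comma-separated list (see iof_mrhnp_component.c below) and feed the file contents to the stdin of the mapper procs.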


@@ -60,7 +60,8 @@ int orte_iof_base_write_output(orte_process_name_t *name, orte_iof_tag_t stream,
"%s write:output setting up to write %d bytes to %s for %s on fd %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
(ORTE_IOF_STDIN & stream) ? "stdin" : ((ORTE_IOF_STDOUT & stream) ? "stdout" : ((ORTE_IOF_STDERR & stream) ? "stderr" : "stddiag")),
ORTE_NAME_PRINT(name), channel->fd));
ORTE_NAME_PRINT(name),
(NULL == channel) ? -1 : channel->fd));
/* setup output object */
output = OBJ_NEW(orte_iof_write_output_t);
@@ -251,9 +252,6 @@ construct:
output->numbytes = k;
process:
/* lock us up to protect global operations */
OPAL_THREAD_LOCK(&orte_iof_base.iof_write_output_lock);
/* add this data to the write list for this fd */
opal_list_append(&channel->outputs, &output->super);
@@ -270,9 +268,6 @@ process:
channel->pending = true;
}
/* unlock and go */
OPAL_THREAD_UNLOCK(&orte_iof_base.iof_write_output_lock);
return num_buffered;
}
@@ -289,11 +284,13 @@ void orte_iof_base_write_handler(int fd, short event, void *cbdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
wev->fd));
/* lock us up to protect global operations */
OPAL_THREAD_LOCK(&orte_iof_base.iof_write_output_lock);
while (NULL != (item = opal_list_remove_first(&wev->outputs))) {
output = (orte_iof_write_output_t*)item;
if (0 == output->numbytes) {
/* indicates we are to close this stream */
OBJ_RELEASE(sink);
return;
}
num_written = write(wev->fd, output->data, output->numbytes);
if (num_written < 0) {
if (EAGAIN == errno || EINTR == errno) {
@@ -302,7 +299,7 @@ void orte_iof_base_write_handler(int fd, short event, void *cbdata)
/* leave the write event running so it will call us again
* when the fd is ready.
*/
goto DEPART;
return;
}
/* otherwise, something bad happened so all we can do is abort
* this attempt
@@ -312,12 +309,12 @@ void orte_iof_base_write_handler(int fd, short event, void *cbdata)
} else if (num_written < output->numbytes) {
/* incomplete write - adjust data to avoid duplicate output */
memmove(output->data, &output->data[num_written], output->numbytes - num_written);
/* push this item back on the front of the list */
/* push this item back on the front of the list */
opal_list_prepend(&wev->outputs, item);
/* leave the write event running so it will call us again
* when the fd is ready
*/
goto DEPART;
return;
}
OBJ_RELEASE(output);
}
@@ -325,7 +322,4 @@ ABORT:
opal_event_del(wev->ev);
wev->pending = false;
DEPART:
/* unlock and go */
OPAL_THREAD_UNLOCK(&orte_iof_base.iof_write_output_lock);
}


@@ -57,8 +57,11 @@
#include "opal/util/opal_pty.h"
#include "opal/util/opal_environ.h"
#include "opal/util/output.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/iof/iof.h"
#include "orte/mca/iof/base/iof_base_setup.h"


@@ -84,6 +84,7 @@ orte_iof_base_module_t orte_iof_hnp_module = {
hnp_push,
hnp_pull,
hnp_close,
NULL,
finalize,
hnp_ft_event
};


@@ -181,6 +181,10 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock);
return;
}
if (0 < numbytes && numbytes < (int)sizeof(data)) {
/* need to write a 0-byte event to clear the stream and close it */
orte_iof_base_write_output(&rev->name, ORTE_IOF_STDIN, data, 0, sink->wev);
}
}
} else {
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
@@ -207,8 +211,8 @@
}
}
}
/* if num_bytes was zero, then we need to terminate the event */
if (0 == numbytes) {
/* if num_bytes was zero, or we read the last piece of the file, then we need to terminate the event */
if (0 == numbytes || numbytes < (int)sizeof(data)) {
/* this will also close our stdin file descriptor */
OBJ_RELEASE(mca_iof_hnp_component.stdinev);
} else {


@@ -10,6 +10,8 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -119,6 +121,7 @@
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#include "orte/runtime/orte_globals.h"
#include "iof_types.h"
@@ -157,6 +160,9 @@ typedef int (*orte_iof_base_pull_fn_t)(const orte_process_name_t* peer,
typedef int (*orte_iof_base_close_fn_t)(const orte_process_name_t* peer,
orte_iof_tag_t source_tag);
/* Flag that a job is complete */
typedef void (*orte_iof_base_complete_fn_t)(const orte_job_t *jdata);
/* finalize the selected module */
typedef int (*orte_iof_base_finalize_fn_t)(void);
@@ -173,6 +179,7 @@ struct orte_iof_base_module_2_0_0_t {
orte_iof_base_push_fn_t push;
orte_iof_base_pull_fn_t pull;
orte_iof_base_close_fn_t close;
orte_iof_base_complete_fn_t complete;
orte_iof_base_finalize_fn_t finalize;
orte_iof_base_ft_event_fn_t ft_event;
};
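The hunk above adds a "complete" entry to the IOF module API so a component can react when a job in the chain finishes. A minimal sketch of how the framework might drive it, assuming the standard orte_iof module global; only the typedef and the struct slot come from this diff, the call site is illustrative:

#include "orte/mca/iof/iof.h"
#include "orte/runtime/orte_globals.h"

/* illustrative call site - not part of the commit */
static void notify_iof_job_complete(orte_job_t *jdata)
{
    /* let the selected IOF component flush and close the stdin of any
     * procs in the downstream job that this job was feeding
     */
    if (NULL != orte_iof.complete) {
        orte_iof.complete(jdata);
    }
}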

orte/mca/iof/mr_hnp/Makefile.am (new file, 40 lines)

@@ -0,0 +1,40 @@
#
# Copyright (c) 2012 Los Alamos National Security, LLC.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
EXTRA_DIST = .windows
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_iof_mr_hnp_DSO
component_noinst =
component_install = mca_iof_mr_hnp.la
else
component_noinst = libmca_iof_mr_hnp.la
component_install =
endif
mr_hnp_SOURCES = \
iof_mrhnp.c \
iof_mrhnp.h \
iof_mrhnp_component.c \
iof_mrhnp_read.c \
iof_mrhnp_receive.c
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_iof_mr_hnp_la_SOURCES = $(mr_hnp_SOURCES)
mca_iof_mr_hnp_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_iof_mr_hnp_la_SOURCES = $(mr_hnp_SOURCES)
libmca_iof_mr_hnp_la_LIBADD =
libmca_iof_mr_hnp_la_LDFLAGS = -module -avoid-version

orte/mca/iof/mr_hnp/configure.m4 (new file, 19 lines)

@@ -0,0 +1,19 @@
# -*- shell-script -*-
#
# Copyright (c) 2012 Los Alamos National Security, LLC.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_iof_mr_hnp_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_orte_iof_mr_hnp_CONFIG], [
AC_CONFIG_FILES([orte/mca/iof/mr_hnp/Makefile])
AS_IF([test "$orte_without_full_support" = 0],
[$1],
[$2])
])

orte/mca/iof/mr_hnp/iof_mrhnp.c (new file, 700 lines)

@@ -0,0 +1,700 @@
/*
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/util/output.h"
#include "orte/constants.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#else
#ifdef HAVE_SYS_FCNTL_H
#include <sys/fcntl.h>
#endif
#endif
#include "opal/mca/event/event.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/rml/rml.h"
#include "orte/util/name_fns.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/iof/base/base.h"
#include "iof_mrhnp.h"
/* LOCAL FUNCTIONS */
static void stdin_write_handler(int fd, short event, void *cbdata);
/* API FUNCTIONS */
static int init(void);
static int mrhnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag, int fd);
static int mrhnp_pull(const orte_process_name_t* src_name,
orte_iof_tag_t src_tag,
int fd);
static int mrhnp_close(const orte_process_name_t* peer,
orte_iof_tag_t source_tag);
static void mrhnp_complete(const orte_job_t *jdata);
static int finalize(void);
static int mrhnp_ft_event(int state);
/* The API's in this module are solely used to support LOCAL
* procs - i.e., procs that are co-located to the HNP. Remote
* procs interact with the HNP's IOF via the HNP's receive function,
* which operates independently and is in the iof_mrhnp_receive.c file
*/
orte_iof_base_module_t orte_iof_mrhnp_module = {
init,
mrhnp_push,
mrhnp_pull,
mrhnp_close,
mrhnp_complete,
finalize,
mrhnp_ft_event
};
/* Initialize the module */
static int init(void)
{
int rc;
/* post non-blocking recv to catch forwarded IO from
* the orteds
*/
if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
ORTE_RML_TAG_IOF_HNP,
ORTE_RML_PERSISTENT,
orte_iof_mrhnp_recv,
NULL))) {
ORTE_ERROR_LOG(rc);
return rc;
}
OBJ_CONSTRUCT(&mca_iof_mr_hnp_component.sinks, opal_list_t);
OBJ_CONSTRUCT(&mca_iof_mr_hnp_component.procs, opal_list_t);
mca_iof_mr_hnp_component.stdinev = NULL;
OBJ_CONSTRUCT(&mca_iof_mr_hnp_component.stdin_jobs, opal_pointer_array_t);
opal_pointer_array_init(&mca_iof_mr_hnp_component.stdin_jobs, 1, INT_MAX, 1);
return ORTE_SUCCESS;
}
/* Setup to read from stdin.
*/
static int mrhnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag, int fd)
{
orte_job_t *jdata;
orte_iof_sink_t *sink;
orte_iof_proc_t *proct;
opal_list_item_t *item;
int flags;
char *outfile;
int fdout;
int np, numdigs;
orte_ns_cmp_bitmask_t mask;
orte_iof_job_t *jptr;
int j;
bool found;
/* don't do this if the dst vpid is invalid or the fd is negative! */
if (ORTE_VPID_INVALID == dst_name->vpid || fd < 0) {
return ORTE_SUCCESS;
}
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"%s iof:mrhnp pushing fd %d for process %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
fd, ORTE_NAME_PRINT(dst_name)));
/* we get a push for stdout, stderr, and stddiag on every LOCAL process, so
* setup to read those streams and forward them to the next app_context
*/
if (!(src_tag & ORTE_IOF_STDIN)) {
/* set the file descriptor to non-blocking - do this before we setup
* and activate the read event in case it fires right away
*/
if((flags = fcntl(fd, F_GETFL, 0)) < 0) {
opal_output(orte_iof_base.iof_output, "[%s:%d]: fcntl(F_GETFL) failed with errno=%d\n",
__FILE__, __LINE__, errno);
} else {
flags |= O_NONBLOCK;
fcntl(fd, F_SETFL, flags);
}
/* do we already have this process in our list? */
for (item = opal_list_get_first(&mca_iof_mr_hnp_component.procs);
item != opal_list_get_end(&mca_iof_mr_hnp_component.procs);
item = opal_list_get_next(item)) {
proct = (orte_iof_proc_t*)item;
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proct->name, dst_name)) {
/* found it */
goto SETUP;
}
}
/* if we get here, then we don't yet have this proc in our list */
proct = OBJ_NEW(orte_iof_proc_t);
proct->name.jobid = dst_name->jobid;
proct->name.vpid = dst_name->vpid;
opal_list_append(&mca_iof_mr_hnp_component.procs, &proct->super);
/* see if we are to output to a file */
if (NULL != orte_output_filename) {
/* get the jobdata for this proc */
if (NULL == (jdata = orte_get_job_data_object(dst_name->jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
np = jdata->num_procs / 10;
/* determine the number of digits required for max vpid */
numdigs = 1;
while (np > 0) {
numdigs++;
np = np / 10;
}
/* construct the filename */
asprintf(&outfile, "%s.%d.%0*lu", orte_output_filename,
(int)ORTE_LOCAL_JOBID(proct->name.jobid),
numdigs, (unsigned long)proct->name.vpid);
/* create the file */
fdout = open(outfile, O_CREAT|O_RDWR|O_TRUNC, 0644);
free(outfile);
if (fdout < 0) {
/* couldn't be opened */
ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
return ORTE_ERR_FILE_OPEN_FAILURE;
}
/* define a sink to that file descriptor */
ORTE_IOF_SINK_DEFINE(&sink, dst_name, fdout, ORTE_IOF_STDOUTALL,
orte_iof_base_write_handler,
&mca_iof_mr_hnp_component.sinks);
}
SETUP:
/* define a read event but don't activate it */
if (src_tag & ORTE_IOF_STDOUT) {
ORTE_IOF_READ_EVENT(&proct->revstdout, dst_name, fd, ORTE_IOF_STDOUT,
orte_iof_mrhnp_read_local_handler, false);
} else if (src_tag & ORTE_IOF_STDERR) {
ORTE_IOF_READ_EVENT(&proct->revstderr, dst_name, fd, ORTE_IOF_STDERR,
orte_iof_mrhnp_read_local_handler, false);
} else if (src_tag & ORTE_IOF_STDDIAG) {
ORTE_IOF_READ_EVENT(&proct->revstddiag, dst_name, fd, ORTE_IOF_STDDIAG,
orte_iof_mrhnp_read_local_handler, false);
}
/* if -all- of the readevents for this proc have been defined, then
* activate them. Otherwise, we can think that the proc is complete
* because one of the readevents fires -prior- to all of them having been defined!
*/
if (NULL != proct->revstdout && NULL != proct->revstderr && NULL != proct->revstddiag) {
/* now activate read events */
proct->revstdout->active = true;
opal_event_add(proct->revstdout->ev, 0);
proct->revstderr->active = true;
opal_event_add(proct->revstderr->ev, 0);
proct->revstddiag->active = true;
opal_event_add(proct->revstddiag->ev, 0);
}
return ORTE_SUCCESS;
}
/*** HANDLE STDIN PUSH ***/
/* get the job object for this proc and check to see if it
* is a mapper - if so, add it to the jobs that receive
* our stdin
*/
jdata = orte_get_job_data_object(dst_name->jobid);
if (ORTE_JOB_CONTROL_MAPPER & jdata->controls) {
/* see if we already have it */
found = false;
for (j=0; j < mca_iof_mr_hnp_component.stdin_jobs.size; j++) {
if (NULL == (jptr = (orte_iof_job_t*)opal_pointer_array_get_item(&mca_iof_mr_hnp_component.stdin_jobs, j))) {
continue;
}
if (jptr->jdata->jobid == jdata->jobid) {
found = true;
break;
}
}
if (!found) {
jptr = OBJ_NEW(orte_iof_job_t);
OBJ_RETAIN(jdata);
jptr->jdata = jdata;
opal_bitmap_init(&jptr->xoff, jdata->num_procs);
opal_pointer_array_add(&mca_iof_mr_hnp_component.stdin_jobs, jptr);
}
}
/* now setup the read - but check to only do this once */
if (NULL == mca_iof_mr_hnp_component.stdinev) {
/* Since we are the HNP, we don't want to set nonblocking on our
* stdio stream. If we do so, we set the file descriptor to
* non-blocking for everyone that has that file descriptor, which
* includes everyone else in our shell pipeline chain. (See
* http://lists.freebsd.org/pipermail/freebsd-hackers/2005-January/009742.html).
* This causes things like "mpirun -np 1 big_app | cat" to lose
* output, because cat's stdout is then ALSO non-blocking and cat
* isn't built to deal with that case (same with almost all other
* unix text utils).
*/
if (0 != fd) {
if((flags = fcntl(fd, F_GETFL, 0)) < 0) {
opal_output(orte_iof_base.iof_output, "[%s:%d]: fcntl(F_GETFL) failed with errno=%d\n",
__FILE__, __LINE__, errno);
} else {
flags |= O_NONBLOCK;
fcntl(fd, F_SETFL, flags);
}
}
if (isatty(fd)) {
/* We should avoid trying to read from stdin if we
* have a terminal, but are backgrounded. Catch the
* signals that are commonly used when we switch
* between being backgrounded and not. If the
* filedescriptor is not a tty, don't worry about it
* and always stay connected.
*/
opal_event_signal_set(orte_event_base, &mca_iof_mr_hnp_component.stdinsig,
SIGCONT, orte_iof_mrhnp_stdin_cb,
NULL);
/* setup a read event to read stdin, but don't activate it yet. The
* dst_name indicates who should receive the stdin. If that recipient
* doesn't do a corresponding pull, however, then the stdin will
* be dropped upon receipt at the local daemon
*/
ORTE_IOF_READ_EVENT(&mca_iof_mr_hnp_component.stdinev,
dst_name, fd, ORTE_IOF_STDIN,
orte_iof_mrhnp_read_local_handler, false);
/* check to see if we want the stdin read event to be
* active - we will always at least define the event,
* but may delay its activation
*/
if (!(src_tag & ORTE_IOF_STDIN) || orte_iof_mrhnp_stdin_check(fd)) {
mca_iof_mr_hnp_component.stdinev->active = true;
opal_event_add(mca_iof_mr_hnp_component.stdinev->ev, 0);
}
} else {
/* if we are not looking at a tty, just setup a read event
* and activate it
*/
ORTE_IOF_READ_EVENT(&mca_iof_mr_hnp_component.stdinev,
dst_name, fd, ORTE_IOF_STDIN,
orte_iof_mrhnp_read_local_handler, true);
}
}
return ORTE_SUCCESS;
}
/*
* Since we are the HNP, the only "pull" call comes from a local
* process so we can record the file descriptor for its stdin.
*/
static int mrhnp_pull(const orte_process_name_t* dst_name,
orte_iof_tag_t src_tag,
int fd)
{
orte_iof_sink_t *sink;
int flags, j;
orte_iof_proc_t *ptr, *proct;
opal_list_item_t *item;
orte_job_t *jdata;
orte_iof_job_t *jptr;
bool found;
/* this is a local call - only stdin is supported */
if (ORTE_IOF_STDIN != src_tag) {
return ORTE_ERR_NOT_SUPPORTED;
}
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"%s iof:mrhnp pulling fd %d for process %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
fd, ORTE_NAME_PRINT(dst_name)));
/* get the job object for this proc and check to see if it
* is a mapper - if so, add it to the jobs that receive
* our stdin
*/
jdata = orte_get_job_data_object(dst_name->jobid);
if (ORTE_JOB_CONTROL_MAPPER & jdata->controls) {
/* see if we already have it */
found = false;
for (j=0; j < mca_iof_mr_hnp_component.stdin_jobs.size; j++) {
if (NULL == (jptr = (orte_iof_job_t*)opal_pointer_array_get_item(&mca_iof_mr_hnp_component.stdin_jobs, j))) {
continue;
}
if (jptr->jdata->jobid == jdata->jobid) {
found = true;
break;
}
}
if (!found) {
jptr = OBJ_NEW(orte_iof_job_t);
OBJ_RETAIN(jdata);
jptr->jdata = jdata;
opal_bitmap_init(&jptr->xoff, jdata->num_procs);
opal_pointer_array_add(&mca_iof_mr_hnp_component.stdin_jobs, jptr);
}
}
/* set the file descriptor to non-blocking - do this before we setup
* the sink in case it fires right away
*/
if((flags = fcntl(fd, F_GETFL, 0)) < 0) {
opal_output(orte_iof_base.iof_output, "[%s:%d]: fcntl(F_GETFL) failed with errno=%d\n",
__FILE__, __LINE__, errno);
} else {
flags |= O_NONBLOCK;
fcntl(fd, F_SETFL, flags);
}
ORTE_IOF_SINK_DEFINE(&sink, dst_name, fd, ORTE_IOF_STDIN,
stdin_write_handler, NULL);
sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid;
sink->daemon.vpid = ORTE_PROC_MY_NAME->vpid;
/* find the proct for this proc */
proct = NULL;
for (item = opal_list_get_first(&mca_iof_mr_hnp_component.procs);
item != opal_list_get_end(&mca_iof_mr_hnp_component.procs);
item = opal_list_get_next(item)) {
ptr = (orte_iof_proc_t*)item;
if (ptr->name.jobid == dst_name->jobid &&
ptr->name.vpid == dst_name->vpid) {
proct = ptr;
break;
}
}
if (NULL == proct) {
/* we don't yet have this proc in our list */
proct = OBJ_NEW(orte_iof_proc_t);
proct->name.jobid = dst_name->jobid;
proct->name.vpid = dst_name->vpid;
opal_list_append(&mca_iof_mr_hnp_component.procs, &proct->super);
}
proct->sink = sink;
return ORTE_SUCCESS;
}
/*
* One of our local procs wants us to close the specified
* stream(s), thus terminating any potential io to/from it.
*/
static int mrhnp_close(const orte_process_name_t* peer,
orte_iof_tag_t source_tag)
{
opal_list_item_t *item, *next_item;
orte_iof_sink_t* sink;
orte_ns_cmp_bitmask_t mask;
for (item = opal_list_get_first(&mca_iof_mr_hnp_component.sinks);
item != opal_list_get_end(&mca_iof_mr_hnp_component.sinks);
item = next_item ) {
sink = (orte_iof_sink_t*)item;
next_item = opal_list_get_next(item);
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &sink->name, peer) &&
(source_tag & sink->tag)) {
/* No need to delete the event or close the file
* descriptor - the destructor will automatically
* do it for us.
*/
opal_list_remove_item(&mca_iof_mr_hnp_component.sinks, item);
OBJ_RELEASE(item);
break;
}
}
return ORTE_SUCCESS;
}
static void send_data(orte_process_name_t *name, orte_iof_tag_t tag,
orte_jobid_t jobid,
unsigned char *data, int32_t nbytes)
{
opal_buffer_t *buf;
int rc;
buf = OBJ_NEW(opal_buffer_t);
if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &tag, 1, ORTE_IOF_TAG))) {
ORTE_ERROR_LOG(rc);
return;
}
if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &jobid, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
return;
}
if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, data, nbytes, OPAL_BYTE))) {
ORTE_ERROR_LOG(rc);
return;
}
if (0 > (rc = orte_rml.send_buffer_nb(name, buf, ORTE_RML_TAG_IOF_PROXY,
0, orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
}
}
static void mrhnp_complete(const orte_job_t *jdata)
{
orte_job_t *jptr;
orte_job_map_t *map;
orte_proc_t *daemon;
orte_iof_proc_t *proct;
unsigned char data[1];
opal_list_item_t *item;
int i;
orte_node_t *node;
if (ORTE_JOBID_INVALID == jdata->stdout_target) {
/* nothing to do */
return;
}
/* the job is complete - close out the stdin
* of any procs it was feeding
*/
jptr = orte_get_job_data_object(jdata->stdout_target);
map = jptr->map;
/* cycle thru the map to find any node that has at least
* one proc from this job
*/
for (i=0; i < map->nodes->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
continue;
}
daemon = node->daemon;
if (daemon->name.vpid == ORTE_PROC_MY_NAME->vpid) {
for (item = opal_list_get_first(&mca_iof_mr_hnp_component.procs);
item != opal_list_get_end(&mca_iof_mr_hnp_component.procs);
item = opal_list_get_next(item)) {
proct = (orte_iof_proc_t*)item;
if (proct->name.jobid == jptr->jobid) {
if (NULL != proct->sink) {
/* need to write a 0-byte event to clear the stream and close it */
orte_iof_base_write_output(&proct->name, ORTE_IOF_STDIN, data, 0, proct->sink->wev);
proct->sink = NULL;
}
}
}
} else {
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"%s sending close stdin to daemon %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&daemon->name)));
/* need to send a 0-byte message to clear the stream and close it */
send_data(&daemon->name, ORTE_IOF_STDIN, jptr->jobid, data, 0);
}
}
}
static int finalize(void)
{
opal_list_item_t* item;
orte_iof_write_output_t *output;
orte_iof_write_event_t *wev;
int num_written;
bool dump;
int i;
orte_job_t *jdata;
/* check if anything is still trying to be written out */
wev = orte_iof_base.iof_write_stdout->wev;
if (!opal_list_is_empty(&wev->outputs)) {
dump = false;
/* make one last attempt to write this out */
while (NULL != (item = opal_list_remove_first(&wev->outputs))) {
output = (orte_iof_write_output_t*)item;
if (!dump) {
num_written = write(wev->fd, output->data, output->numbytes);
if (num_written < output->numbytes) {
/* don't retry - just cleanout the list and dump it */
dump = true;
}
}
OBJ_RELEASE(output);
}
}
if (!orte_xml_output) {
/* we only opened stderr channel if we are NOT doing xml output */
wev = orte_iof_base.iof_write_stderr->wev;
if (!opal_list_is_empty(&wev->outputs)) {
dump = false;
/* make one last attempt to write this out */
while (NULL != (item = opal_list_remove_first(&wev->outputs))) {
output = (orte_iof_write_output_t*)item;
if (!dump) {
num_written = write(wev->fd, output->data, output->numbytes);
if (num_written < output->numbytes) {
/* don't retry - just cleanout the list and dump it */
dump = true;
}
}
OBJ_RELEASE(output);
}
}
}
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_IOF_HNP);
/* clear our stdin job array */
for (i=0; i < mca_iof_mr_hnp_component.stdin_jobs.size; i++) {
if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(&mca_iof_mr_hnp_component.stdin_jobs, i))) {
continue;
}
OBJ_RELEASE(jdata);
}
OBJ_DESTRUCT(&mca_iof_mr_hnp_component.stdin_jobs);
return ORTE_SUCCESS;
}
int mrhnp_ft_event(int state) {
/*
* Replica doesn't need to do anything for a checkpoint
*/
return ORTE_SUCCESS;
}
static void stdin_write_handler(int fd, short event, void *cbdata)
{
orte_iof_sink_t *sink = (orte_iof_sink_t*)cbdata;
orte_iof_write_event_t *wev = sink->wev;
opal_list_item_t *item;
orte_iof_write_output_t *output;
int num_written;
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"%s mrhnp:stdin:write:handler writing data to %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
wev->fd));
wev->pending = false;
while (NULL != (item = opal_list_remove_first(&wev->outputs))) {
output = (orte_iof_write_output_t*)item;
/* if an abnormal termination has occurred, just dump
* this data as we are aborting
*/
if (orte_abnormal_term_ordered) {
OBJ_RELEASE(output);
continue;
}
if (0 == output->numbytes) {
/* this indicates we are to close the fd - there is
* nothing to write
*/
OPAL_OUTPUT_VERBOSE((20, orte_iof_base.iof_output,
"%s iof:mrhnp closing fd %d on write event due to zero bytes output",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), wev->fd));
OBJ_RELEASE(wev);
sink->wev = NULL;
/* just leave - we don't want to restart the
* read event!
*/
return;
}
num_written = write(wev->fd, output->data, output->numbytes);
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"%s mrhnp:stdin:write:handler wrote %d bytes",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
num_written));
if (num_written < 0) {
if (EAGAIN == errno || EINTR == errno) {
/* push this item back on the front of the list */
opal_list_prepend(&wev->outputs, item);
/* leave the write event running so it will call us again
* when the fd is ready.
*/
wev->pending = true;
opal_event_add(wev->ev, 0);
goto CHECK;
}
/* otherwise, something bad happened so all we can do is declare an
* error and abort
*/
OBJ_RELEASE(output);
OPAL_OUTPUT_VERBOSE((20, orte_iof_base.iof_output,
"%s iof:mrhnp closing fd %d on write event due to negative bytes written",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), wev->fd));
OBJ_RELEASE(wev);
sink->wev = NULL;
return;
} else if (num_written < output->numbytes) {
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"%s mrhnp:stdin:write:handler incomplete write %d - adjusting data",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), num_written));
/* incomplete write - adjust data to avoid duplicate output */
memmove(output->data, &output->data[num_written], output->numbytes - num_written);
/* push this item back on the front of the list */
opal_list_prepend(&wev->outputs, item);
/* leave the write event running so it will call us again
* when the fd is ready.
*/
wev->pending = true;
opal_event_add(wev->ev, 0);
goto CHECK;
}
OBJ_RELEASE(output);
}
CHECK:
if (NULL != mca_iof_mr_hnp_component.stdinev &&
!orte_abnormal_term_ordered &&
!mca_iof_mr_hnp_component.stdinev->active) {
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"read event is off - checking if okay to restart"));
/* if we have turned off the read event, check to
* see if the output list has shrunk enough to
* turn it back on
*
* RHC: Note that when multiple procs want stdin, we
* can get into a fight between a proc turning stdin
* back "on" and other procs turning it "off". There
* is no clear way to resolve this as different procs
* may take input at different rates.
*/
if (opal_list_get_size(&wev->outputs) < ORTE_IOF_MAX_INPUT_BUFFERS) {
/* restart the read */
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"restarting read event"));
mca_iof_mr_hnp_component.stdinev->active = true;
opal_event_add(mca_iof_mr_hnp_component.stdinev->ev, 0);
}
}
}

orte/mca/iof/mr_hnp/iof_mrhnp.h (new file, 64 lines)

@@ -0,0 +1,64 @@
/*
* Copyright (c) 2012 Los Alamos National Security, LLC.
* All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef ORTE_IOF_MRHNP_H
#define ORTE_IOF_MRHNP_H
#include "orte_config.h"
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif /* HAVE_SYS_TYPES_H */
#ifdef HAVE_SYS_UIO_H
#include <sys/uio.h>
#endif /* HAVE_SYS_UIO_H */
#ifdef HAVE_NET_UIO_H
#include <net/uio.h>
#endif /* HAVE_NET_UIO_H */
#include "orte/mca/iof/iof.h"
#include "orte/mca/iof/base/base.h"
BEGIN_C_DECLS
/**
* IOF HNP Component
*/
typedef struct {
orte_iof_base_component_t super;
opal_list_t sinks;
opal_list_t procs;
orte_iof_read_event_t *stdinev;
opal_event_t stdinsig;
char **input_files;
opal_pointer_array_t stdin_jobs;
} orte_iof_mrhnp_component_t;
ORTE_MODULE_DECLSPEC extern orte_iof_mrhnp_component_t mca_iof_mr_hnp_component;
extern orte_iof_base_module_t orte_iof_mrhnp_module;
void orte_iof_mrhnp_recv(int status, orte_process_name_t* sender,
opal_buffer_t* buffer, orte_rml_tag_t tag,
void* cbdata);
void orte_iof_mrhnp_read_local_handler(int fd, short event, void *cbdata);
void orte_iof_mrhnp_stdin_cb(int fd, short event, void *cbdata);
bool orte_iof_mrhnp_stdin_check(int fd);
int orte_iof_hnp_send_data_to_endpoint(orte_process_name_t *host,
orte_process_name_t *target,
orte_iof_tag_t tag,
unsigned char *data, int numbytes);
END_C_DECLS
#endif

orte/mca/iof/mr_hnp/iof_mrhnp_component.c (new file, 96 lines)

@@ -0,0 +1,96 @@
/*
* Copyright (c) 2012 Los Alamos National Security, LLC.
* All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/util/proc_info.h"
#include "orte/mca/iof/base/base.h"
#include "iof_mrhnp.h"
/*
* Local functions
*/
static int mrhnp_open(void);
static int mrhnp_close(void);
static int mrhnp_query(mca_base_module_t **module, int *priority);
/*
* Public string showing the iof hnp component version number
*/
const char *mca_iof_mr_hnp_component_version_string =
"Open MPI mr_hnp iof MCA component version " ORTE_VERSION;
orte_iof_mrhnp_component_t mca_iof_mr_hnp_component = {
{
/* First, the mca_base_component_t struct containing meta
information about the component itself */
{
ORTE_IOF_BASE_VERSION_2_0_0,
"mr_hnp", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
/* Component open, close, and query functions */
mrhnp_open,
mrhnp_close,
mrhnp_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
}
};
/**
* component open/close/init function
*/
static int mrhnp_open(void)
{
return ORTE_SUCCESS;
}
static int mrhnp_close(void)
{
return ORTE_SUCCESS;
}
/**
* Module query
*/
static int mrhnp_query(mca_base_module_t **module, int *priority)
{
mca_iof_mr_hnp_component.input_files = NULL;
/* select if we are HNP and map-reduce mode is operational */
if (ORTE_PROC_IS_HNP && orte_map_reduce) {
*priority = 1000;
*module = (mca_base_module_t *) &orte_iof_mrhnp_module;
if (NULL != orte_iof_base.input_files) {
mca_iof_mr_hnp_component.input_files = opal_argv_split(orte_iof_base.input_files, ',');
}
return ORTE_SUCCESS;
}
*priority = -1;
*module = NULL;
return ORTE_ERROR;
}

orte/mca/iof/mr_hnp/iof_mrhnp_read.c (new file, 383 lines)

@@ -0,0 +1,383 @@
/*
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include "opal/dss/dss.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/util/name_fns.h"
#include "orte/mca/state/state.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/iof/iof.h"
#include "orte/mca/iof/base/base.h"
#include "iof_mrhnp.h"
static void send_data(orte_process_name_t *name, orte_iof_tag_t tag,
orte_jobid_t jobid,
unsigned char *data, int32_t nbytes);
static void restart_stdin(int fd, short event, void *cbdata)
{
orte_timer_t *tm = (orte_timer_t*)cbdata;
opal_output(0, "RESTART STDIN");
if (NULL != mca_iof_mr_hnp_component.stdinev &&
!orte_job_term_ordered &&
!mca_iof_mr_hnp_component.stdinev->active) {
mca_iof_mr_hnp_component.stdinev->active = true;
opal_event_add(mca_iof_mr_hnp_component.stdinev->ev, 0);
}
/* if this was a timer callback, then release the timer */
if (NULL != tm) {
OBJ_RELEASE(tm);
}
}
/* return true if we should read stdin from fd, false otherwise */
bool orte_iof_mrhnp_stdin_check(int fd)
{
#if !defined(__WINDOWS__) && defined(HAVE_TCGETPGRP)
if( isatty(fd) && (getpgrp() != tcgetpgrp(fd)) ) {
return false;
}
#elif defined(__WINDOWS__)
return false;
#endif /* !defined(__WINDOWS__) */
return true;
}
void orte_iof_mrhnp_stdin_cb(int fd, short event, void *cbdata)
{
bool should_process = orte_iof_mrhnp_stdin_check(0);
if (should_process) {
mca_iof_mr_hnp_component.stdinev->active = true;
opal_event_add(mca_iof_mr_hnp_component.stdinev->ev, 0);
} else {
opal_event_del(mca_iof_mr_hnp_component.stdinev->ev);
mca_iof_mr_hnp_component.stdinev->active = false;
}
}
/* this is the read handler for my own child procs and stdin
*/
void orte_iof_mrhnp_read_local_handler(int fd, short event, void *cbdata)
{
orte_iof_read_event_t *rev = (orte_iof_read_event_t*)cbdata;
unsigned char data[ORTE_IOF_BASE_MSG_MAX];
int32_t numbytes;
opal_list_item_t *item;
orte_iof_proc_t *proct;
int i, j;
orte_ns_cmp_bitmask_t mask;
orte_job_t *jdata;
orte_iof_job_t *iofjob;
orte_node_t *node;
orte_proc_t *daemon;
orte_job_map_t *map;
bool write_out=false;
/* read up to the fragment size */
#if !defined(__WINDOWS__)
numbytes = read(fd, data, sizeof(data));
#else
{
DWORD readed;
HANDLE handle = (HANDLE)_get_osfhandle(fd);
ReadFile(handle, data, sizeof(data), &readed, NULL);
numbytes = (int)readed;
}
#endif /* !defined(__WINDOWS__) */
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"%s iof:mrhnp:read handler read %d bytes from %s:%d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
ORTE_NAME_PRINT(&rev->name), fd));
if (numbytes < 0) {
/* either we have a connection error or it was a non-blocking read */
/* non-blocking, retry */
if (EAGAIN == errno || EINTR == errno) {
opal_event_add(rev->ev, 0);
return;
}
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"%s iof:mrhnp:read handler %s Error on connection:%d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&rev->name), fd));
/* Un-recoverable error. Allow the code to flow as usual in order to
* to send the zero bytes message up the stream, and then close the
* file descriptor and delete the event.
*/
numbytes = 0;
}
/* if job termination has been ordered, just ignore the
* data and delete the stdin read event, if that is what fired
*/
if (orte_job_term_ordered) {
if (ORTE_IOF_STDIN & rev->tag) {
OBJ_RELEASE(mca_iof_mr_hnp_component.stdinev);
}
return;
}
if (ORTE_IOF_STDIN & rev->tag) {
/* The event has fired, so it's no longer active until we
* re-add it
*/
mca_iof_mr_hnp_component.stdinev->active = false;
/* if this was read from my stdin, I need to send this input to all
* daemons who host mapper procs
*/
for (j=0; j < mca_iof_mr_hnp_component.stdin_jobs.size; j++) {
if (NULL == (iofjob = (orte_iof_job_t*)opal_pointer_array_get_item(&mca_iof_mr_hnp_component.stdin_jobs, j))) {
continue;
}
jdata = iofjob->jdata;
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"%s read %d bytes from stdin - writing to job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
ORTE_JOBID_PRINT(jdata->jobid)));
map = jdata->map;
for (i=0; i < map->nodes->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
continue;
}
daemon = node->daemon;
if (daemon->name.vpid == ORTE_PROC_MY_NAME->vpid) {
/* if it is me, then send the bytes down the stdin pipe
* for every local proc (they are all on my proct list) - we even send 0 byte events
* down the pipe so it forces out any preceding data before
* closing the output stream. We add a 0 byte message if
* numbytes < sizeof(data) as this means the chunk we read
* was the end of the file.
*/
for (item = opal_list_get_first(&mca_iof_mr_hnp_component.procs);
item != opal_list_get_end(&mca_iof_mr_hnp_component.procs);
item = opal_list_get_next(item)) {
proct = (orte_iof_proc_t*)item;
if (proct->name.jobid == jdata->jobid) {
if (NULL == proct->sink) {
opal_output(0, "NULL SINK FOR PROC %s", ORTE_NAME_PRINT(&proct->name));
continue;
}
if (ORTE_IOF_MAX_INPUT_BUFFERS < orte_iof_base_write_output(&proct->name, ORTE_IOF_STDIN, data, numbytes, proct->sink->wev)) {
/* getting too backed up - stop the read event for now if it is still active */
if (mca_iof_mr_hnp_component.stdinev->active) {
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"buffer backed up - holding"));
mca_iof_mr_hnp_component.stdinev->active = false;
}
return;
}
if (0 < numbytes && numbytes < (int)sizeof(data)) {
/* need to write a 0-byte event to clear the stream and close it */
orte_iof_base_write_output(&proct->name, ORTE_IOF_STDIN, data, 0, proct->sink->wev);
proct->sink = NULL;
}
}
}
} else {
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"%s sending %d bytes from stdin to daemon %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
ORTE_NAME_PRINT(&daemon->name)));
/* send the data to the daemon so it can
* write it to all local procs from this job.
* If the connection closed,
* numbytes will be zero so zero bytes will be
* sent - this will tell the daemon to close
* the fd for stdin to that proc
*/
send_data(&daemon->name, ORTE_IOF_STDIN, jdata->jobid, data, numbytes);
if (0 < numbytes && numbytes < (int)sizeof(data)) {
/* need to send a 0-byte message to clear the stream and close it */
send_data(&daemon->name, ORTE_IOF_STDIN, jdata->jobid, data, 0);
}
}
}
}
/* if num_bytes was zero, then we need to terminate the event */
if (0 == numbytes || numbytes < (int)sizeof(data)) {
/* this will also close our stdin file descriptor */
if (NULL != mca_iof_mr_hnp_component.stdinev) {
OBJ_RELEASE(mca_iof_mr_hnp_component.stdinev);
}
} else {
/* if we are looking at a tty, then we just go ahead and restart the
* read event assuming we are not backgrounded
*/
if (orte_iof_mrhnp_stdin_check(fd)) {
restart_stdin(fd, 0, NULL);
} else {
/* delay for awhile and then restart */
ORTE_TIMER_EVENT(0, 10000, restart_stdin, ORTE_INFO_PRI);
}
}
return;
}
if (ORTE_IOF_STDOUT & rev->tag && 0 < numbytes) {
/* see if we need to forward this output */
jdata = orte_get_job_data_object(rev->name.jobid);
if (ORTE_JOBID_INVALID == jdata->stdout_target) {
/* end of the chain - just output the info */
write_out = true;
goto PROCESS;
}
/* it goes to the next job in the chain */
jdata = orte_get_job_data_object(jdata->stdout_target);
map = jdata->map;
for (i=0; i < map->nodes->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
continue;
}
daemon = node->daemon;
if (daemon->name.vpid == ORTE_PROC_MY_NAME->vpid) {
/* if it is me, then send the bytes down the stdin pipe
* for every local proc (they are all on my proct list)
*/
for (item = opal_list_get_first(&mca_iof_mr_hnp_component.procs);
item != opal_list_get_end(&mca_iof_mr_hnp_component.procs);
item = opal_list_get_next(item)) {
proct = (orte_iof_proc_t*)item;
if (proct->name.jobid == jdata->jobid) {
if (NULL == proct->sink) {
opal_output(0, "NULL SINK FOR PROC %s", ORTE_NAME_PRINT(&proct->name));
continue;
}
orte_iof_base_write_output(&proct->name, ORTE_IOF_STDIN, data, numbytes, proct->sink->wev);
}
}
} else {
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"%s sending %d bytes from stdout of %s to daemon %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
ORTE_NAME_PRINT(&rev->name),
ORTE_NAME_PRINT(&daemon->name)));
/* send the data to the daemon so it can
* write it to all local procs from this job
*/
send_data(&daemon->name, ORTE_IOF_STDIN, jdata->jobid, data, numbytes);
}
}
}
PROCESS:
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"%s read %d bytes from %s of %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
(ORTE_IOF_STDOUT & rev->tag) ? "stdout" : ((ORTE_IOF_STDERR & rev->tag) ? "stderr" : "stddiag"),
ORTE_NAME_PRINT(&rev->name)));
if (0 == numbytes) {
/* if we read 0 bytes from the stdout/err/diag, find this proc
* on our list and
* release the appropriate event. This will delete the
* read event and close the file descriptor
*/
for (item = opal_list_get_first(&mca_iof_mr_hnp_component.procs);
item != opal_list_get_end(&mca_iof_mr_hnp_component.procs);
item = opal_list_get_next(item)) {
proct = (orte_iof_proc_t*)item;
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proct->name, &rev->name)) {
/* found it - release corresponding event. This deletes
* the read event and closes the file descriptor
*/
if (rev->tag & ORTE_IOF_STDOUT) {
OBJ_RELEASE(proct->revstdout);
} else if (rev->tag & ORTE_IOF_STDERR) {
OBJ_RELEASE(proct->revstderr);
} else if (rev->tag & ORTE_IOF_STDDIAG) {
OBJ_RELEASE(proct->revstddiag);
}
/* check to see if they are all done */
if (NULL == proct->revstdout &&
NULL == proct->revstderr &&
NULL == proct->revstddiag) {
/* this proc's iof is complete */
opal_list_remove_item(&mca_iof_mr_hnp_component.procs, item);
ORTE_ACTIVATE_PROC_STATE(&proct->name, ORTE_PROC_STATE_IOF_COMPLETE);
OBJ_RELEASE(proct);
}
break;
}
}
return;
} else {
/* output this to our local output */
if (ORTE_IOF_STDOUT & rev->tag) {
if (write_out) {
orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, orte_iof_base.iof_write_stdout->wev);
}
} else {
orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, orte_iof_base.iof_write_stderr->wev);
}
}
/* re-add the event */
opal_event_add(rev->ev, 0);
return;
}
static void send_data(orte_process_name_t *name, orte_iof_tag_t tag,
orte_jobid_t jobid,
unsigned char *data, int32_t nbytes)
{
opal_buffer_t *buf;
int rc;
buf = OBJ_NEW(opal_buffer_t);
if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &tag, 1, ORTE_IOF_TAG))) {
ORTE_ERROR_LOG(rc);
return;
}
if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &jobid, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
return;
}
if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, data, nbytes, OPAL_BYTE))) {
ORTE_ERROR_LOG(rc);
return;
}
if (0 > (rc = orte_rml.send_buffer_nb(name, buf, ORTE_RML_TAG_IOF_PROXY,
0, orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
}
}

orte/mca/iof/mr_hnp/iof_mrhnp_receive.c (new file, 106 lines)

@@ -0,0 +1,106 @@
/*
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#else
#ifdef HAVE_SYS_FCNTL_H
#include <sys/fcntl.h>
#endif
#endif
#include "orte/mca/rml/rml.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/iof/iof.h"
#include "orte/mca/iof/base/base.h"
#include "iof_mrhnp.h"
void orte_iof_mrhnp_recv(int status, orte_process_name_t* sender,
opal_buffer_t* buffer, orte_rml_tag_t tag,
void* cbdata)
{
orte_process_name_t origin;
unsigned char data[ORTE_IOF_BASE_MSG_MAX];
orte_iof_tag_t stream;
int32_t count, numbytes;
int rc;
/* unpack the stream first as this may be flow control info */
count = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &stream, &count, ORTE_IOF_TAG))) {
ORTE_ERROR_LOG(rc);
goto CLEAN_RETURN;
}
if (ORTE_IOF_XON & stream) {
/* re-start the stdin read event */
if (NULL != mca_iof_mr_hnp_component.stdinev &&
!orte_job_term_ordered &&
!mca_iof_mr_hnp_component.stdinev->active) {
mca_iof_mr_hnp_component.stdinev->active = true;
opal_event_add(mca_iof_mr_hnp_component.stdinev->ev, 0);
}
goto CLEAN_RETURN;
} else if (ORTE_IOF_XOFF & stream) {
/* stop the stdin read event */
if (NULL != mca_iof_mr_hnp_component.stdinev &&
!mca_iof_mr_hnp_component.stdinev->active) {
opal_event_del(mca_iof_mr_hnp_component.stdinev->ev);
mca_iof_mr_hnp_component.stdinev->active = false;
}
goto CLEAN_RETURN;
}
/* get name of the process whose io we are discussing */
count = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &origin, &count, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
goto CLEAN_RETURN;
}
/* this must have come from a daemon forwarding output - unpack the data */
numbytes=ORTE_IOF_BASE_MSG_MAX;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, data, &numbytes, OPAL_BYTE))) {
ORTE_ERROR_LOG(rc);
goto CLEAN_RETURN;
}
/* numbytes will contain the actual #bytes that were sent */
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"%s unpacked %d bytes from remote proc %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
ORTE_NAME_PRINT(&origin)));
/* output this to our local output */
if (ORTE_IOF_STDOUT & stream || orte_xml_output) {
orte_iof_base_write_output(&origin, stream, data, numbytes, orte_iof_base.iof_write_stdout->wev);
} else {
orte_iof_base_write_output(&origin, stream, data, numbytes, orte_iof_base.iof_write_stderr->wev);
}
CLEAN_RETURN:
return;
}

orte/mca/iof/mr_orted/Makefile.am (new file, 40 lines)

@@ -0,0 +1,40 @@
#
# Copyright (c) 2012 Los Alamos National Security, LLC.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
EXTRA_DIST = .windows
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_iof_mr_orted_DSO
component_noinst =
component_install = mca_iof_mr_orted.la
else
component_noinst = libmca_iof_mr_orted.la
component_install =
endif
mr_orted_SOURCES = \
iof_mrorted.c \
iof_mrorted.h \
iof_mrorted_component.c \
iof_mrorted_read.c \
iof_mrorted_receive.c
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_iof_mr_orted_la_SOURCES = $(mr_orted_SOURCES)
mca_iof_mr_orted_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_iof_mr_orted_la_SOURCES = $(mr_orted_SOURCES)
libmca_iof_mr_orted_la_LIBADD =
libmca_iof_mr_orted_la_LDFLAGS = -module -avoid-version

orte/mca/iof/mr_orted/configure.m4 (new file, 19 lines)

@@ -0,0 +1,19 @@
# -*- shell-script -*-
#
# Copyright (c) 2012 Los Alamos National Security, LLC.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_iof_mr_orted_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_orte_iof_mr_orted_CONFIG], [
AC_CONFIG_FILES([orte/mca/iof/mr_orted/Makefile])
AS_IF([test "$orte_without_full_support" = 0],
[$1],
[$2])
])

orte/mca/iof/mr_orted/iof_mrorted.c (new file, 464 lines)

@@ -0,0 +1,464 @@
/*
* Copyright (c) 2012 Los Alamos National Security, LLC.
* All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/util/output.h"
#include "orte/constants.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#else
#ifdef HAVE_SYS_FCNTL_H
#include <sys/fcntl.h>
#endif
#endif
#include "orte/mca/errmgr/errmgr.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/iof/iof.h"
#include "orte/mca/iof/base/base.h"
#include "iof_mrorted.h"
/* LOCAL FUNCTIONS */
static void stdin_write_handler(int fd, short event, void *cbdata);
/* API FUNCTIONS */
static int init(void);
static int mrorted_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag, int fd);
static int mrorted_pull(const orte_process_name_t* src_name,
orte_iof_tag_t src_tag,
int fd);
static int mrorted_close(const orte_process_name_t* peer,
orte_iof_tag_t source_tag);
static void mrorted_complete(const orte_job_t *jdata);
static int finalize(void);
static int mrorted_ft_event(int state);
/* The API's in this module are solely used to support LOCAL
* procs - i.e., procs that are co-located to the daemon. Output
* from local procs is automatically sent to the HNP for output
* and possible forwarding to other requestors. The HNP automatically
* determines and wires up the stdin configuration, so we don't
* have to do anything here.
*/
orte_iof_base_module_t orte_iof_mrorted_module = {
init,
mrorted_push,
mrorted_pull,
mrorted_close,
mrorted_complete,
finalize,
mrorted_ft_event
};
static int init(void)
{
int rc;
/* post a non-blocking RML receive to get messages
from the HNP IOF component */
if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
ORTE_RML_TAG_IOF_PROXY,
ORTE_RML_PERSISTENT,
orte_iof_mrorted_recv,
NULL))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* setup the local global variables */
OBJ_CONSTRUCT(&mca_iof_mr_orted_component.sinks, opal_list_t);
OBJ_CONSTRUCT(&mca_iof_mr_orted_component.procs, opal_list_t);
return ORTE_SUCCESS;
}
/**
* Push data from the specified file descriptor
* to the HNP
*/
static int mrorted_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag, int fd)
{
int flags;
opal_list_item_t *item;
orte_iof_proc_t *proct;
orte_iof_sink_t *sink;
char *outfile;
int fdout;
orte_job_t *jobdat=NULL;
int np, numdigs;
orte_ns_cmp_bitmask_t mask;
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"%s iof:mrorted pushing fd %d for process %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
fd, ORTE_NAME_PRINT(dst_name)));
/* set the file descriptor to non-blocking - do this before we setup
* and activate the read event in case it fires right away
*/
if ((flags = fcntl(fd, F_GETFL, 0)) < 0) {
opal_output(orte_iof_base.iof_output, "[%s:%d]: fcntl(F_GETFL) failed with errno=%d\n",
__FILE__, __LINE__, errno);
} else {
flags |= O_NONBLOCK;
fcntl(fd, F_SETFL, flags);
}
/* do we already have this process in our list? */
for (item = opal_list_get_first(&mca_iof_mr_orted_component.procs);
item != opal_list_get_end(&mca_iof_mr_orted_component.procs);
item = opal_list_get_next(item)) {
proct = (orte_iof_proc_t*)item;
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proct->name, dst_name)) {
/* found it */
goto SETUP;
}
}
/* if we get here, then we don't yet have this proc in our list */
proct = OBJ_NEW(orte_iof_proc_t);
proct->name.jobid = dst_name->jobid;
proct->name.vpid = dst_name->vpid;
opal_list_append(&mca_iof_mr_orted_component.procs, &proct->super);
/* see if we are to output to a file */
if (NULL != orte_output_filename) {
/* get the local jobdata for this proc */
if (NULL == (jobdat = orte_get_job_data_object(proct->name.jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
np = jobdat->num_procs / 10;
/* determine the number of digits required for max vpid */
numdigs = 1;
while (np > 0) {
numdigs++;
np = np / 10;
}
/* construct the filename */
asprintf(&outfile, "%s.%d.%0*lu", orte_output_filename,
(int)ORTE_LOCAL_JOBID(proct->name.jobid),
numdigs, (unsigned long)proct->name.vpid);
/* create the file */
fdout = open(outfile, O_CREAT|O_RDWR|O_TRUNC, 0644);
free(outfile);
if (fdout < 0) {
/* couldn't be opened */
ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
return ORTE_ERR_FILE_OPEN_FAILURE;
}
/* define a sink to that file descriptor */
ORTE_IOF_SINK_DEFINE(&sink, dst_name, fdout, ORTE_IOF_STDOUTALL,
orte_iof_base_write_handler,
&mca_iof_mr_orted_component.sinks);
}
SETUP:
/* define a read event but don't activate it */
if (src_tag & ORTE_IOF_STDOUT) {
ORTE_IOF_READ_EVENT(&proct->revstdout, dst_name, fd, ORTE_IOF_STDOUT,
orte_iof_mrorted_read_handler, false);
} else if (src_tag & ORTE_IOF_STDERR) {
ORTE_IOF_READ_EVENT(&proct->revstderr, dst_name, fd, ORTE_IOF_STDERR,
orte_iof_mrorted_read_handler, false);
} else if (src_tag & ORTE_IOF_STDDIAG) {
ORTE_IOF_READ_EVENT(&proct->revstddiag, dst_name, fd, ORTE_IOF_STDDIAG,
orte_iof_mrorted_read_handler, false);
}
/* if -all- of the readevents for this proc have been defined, then
* activate them. Otherwise, we could mistakenly think the proc is complete
* because one of the readevents fires -prior- to all of them having
* been defined!
*/
if (NULL != proct->revstdout && NULL != proct->revstderr && NULL != proct->revstddiag) {
proct->revstdout->active = true;
opal_event_add(proct->revstdout->ev, 0);
proct->revstderr->active = true;
opal_event_add(proct->revstderr->ev, 0);
proct->revstddiag->active = true;
opal_event_add(proct->revstddiag->ev, 0);
}
return ORTE_SUCCESS;
}
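/* A minimal, self-contained sketch of the output-filename logic above when
 * orte_output_filename is set. The job size, rank, and base name here are
 * hypothetical; the digit-count loop and format string mirror mrorted_push().
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    unsigned long num_procs = 128;   /* hypothetical job size */
    unsigned long vpid = 7;          /* hypothetical rank */
    int local_jobid = 1;             /* hypothetical local jobid */
    unsigned long np = num_procs / 10;
    int numdigs = 1;
    char *outfile = NULL;

    while (np > 0) {                 /* digits required for the max vpid */
        numdigs++;
        np = np / 10;
    }
    /* same pattern as the asprintf() above: <base>.<jobid>.<zero-padded vpid> */
    if (0 > asprintf(&outfile, "%s.%d.%0*lu", "output", local_jobid, numdigs, vpid)) {
        return 1;
    }
    printf("%s\n", outfile);         /* prints: output.1.007 */
    free(outfile);
    return 0;
}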
/**
* A pull on a daemon tells
* us that any info we receive that is targeted
* for stdin of the specified process should be fed down the
* indicated file descriptor. Thus, all we need to do here
* is define a local endpoint so we know where to feed anything
* that comes to us
*/
static int mrorted_pull(const orte_process_name_t* dst_name,
orte_iof_tag_t src_tag,
int fd)
{
orte_iof_sink_t *sink;
int flags;
orte_iof_proc_t *proct, *ptr;
opal_list_item_t *item;
/* this is a local call - only stdin is supported */
if (ORTE_IOF_STDIN != src_tag) {
return ORTE_ERR_NOT_SUPPORTED;
}
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"%s iof:mrorted pulling fd %d for process %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
fd, ORTE_NAME_PRINT(dst_name)));
/* set the file descriptor to non-blocking - do this before we setup
* the sink in case it fires right away
*/
if((flags = fcntl(fd, F_GETFL, 0)) < 0) {
opal_output(orte_iof_base.iof_output, "[%s:%d]: fcntl(F_GETFL) failed with errno=%d\n",
__FILE__, __LINE__, errno);
} else {
flags |= O_NONBLOCK;
fcntl(fd, F_SETFL, flags);
}
ORTE_IOF_SINK_DEFINE(&sink, dst_name, fd, ORTE_IOF_STDIN,
stdin_write_handler, NULL);
sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid;
sink->daemon.vpid = ORTE_PROC_MY_NAME->vpid;
/* find the proct for this proc */
proct = NULL;
for (item = opal_list_get_first(&mca_iof_mr_orted_component.procs);
item != opal_list_get_end(&mca_iof_mr_orted_component.procs);
item = opal_list_get_next(item)) {
ptr = (orte_iof_proc_t*)item;
if (ptr->name.jobid == dst_name->jobid &&
ptr->name.vpid == dst_name->vpid) {
proct = ptr;
break;
}
}
if (NULL == proct) {
/* we don't yet have this proc in our list */
proct = OBJ_NEW(orte_iof_proc_t);
proct->name.jobid = dst_name->jobid;
proct->name.vpid = dst_name->vpid;
opal_list_append(&mca_iof_mr_orted_component.procs, &proct->super);
}
proct->sink = sink;
return ORTE_SUCCESS;
}
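/* Both mrorted_push() and mrorted_pull() set their fd non-blocking with the
 * same fcntl() pattern. A standalone sketch of that pattern, assuming any
 * valid fd (stdin is used here only as an example):
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int set_nonblocking(int fd)
{
    int flags = fcntl(fd, F_GETFL, 0);
    if (flags < 0) {
        perror("fcntl(F_GETFL)");
        return -1;
    }
    if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0) {
        perror("fcntl(F_SETFL)");
        return -1;
    }
    return 0;
}

int main(void)
{
    return set_nonblocking(STDIN_FILENO);  /* e.g., make stdin non-blocking */
}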
/*
* One of our local procs wants us to close the specified
* stream(s), thus terminating any potential I/O to/from it.
* For the orted, this just means closing the local fd
*/
static int mrorted_close(const orte_process_name_t* peer,
orte_iof_tag_t source_tag)
{
opal_list_item_t *item, *next_item;
orte_iof_sink_t* sink;
orte_ns_cmp_bitmask_t mask;
for(item = opal_list_get_first(&mca_iof_mr_orted_component.sinks);
item != opal_list_get_end(&mca_iof_mr_orted_component.sinks);
item = next_item ) {
sink = (orte_iof_sink_t*)item;
next_item = opal_list_get_next(item);
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &sink->name, peer) &&
(source_tag & sink->tag)) {
/* No need to delete the event or close the file
* descriptor - the destructor will automatically
* do it for us.
*/
opal_list_remove_item(&mca_iof_mr_orted_component.sinks, item);
OBJ_RELEASE(item);
break;
}
}
return ORTE_SUCCESS;
}
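/* mrorted_close() saves next_item before removing the current element so the
 * iteration survives the removal. A toy singly-linked list (not opal_list)
 * showing the same idiom in isolation:
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct node { int tag; struct node *next; } node_t;

static node_t *remove_matching(node_t *head, int tag)
{
    node_t **link = &head, *item, *next_item;
    for (item = head; NULL != item; item = next_item) {
        next_item = item->next;          /* grab next before any removal */
        if (item->tag == tag) {
            *link = next_item;           /* unlink and free the match */
            free(item);
        } else {
            link = &item->next;
        }
    }
    return head;
}

int main(void)
{
    node_t *head = NULL;
    for (int i = 3; i >= 1; i--) {       /* build 1 -> 2 -> 3 */
        node_t *n = malloc(sizeof(*n));
        n->tag = i; n->next = head; head = n;
    }
    head = remove_matching(head, 2);
    for (node_t *n = head; NULL != n; n = n->next) printf("%d ", n->tag);
    printf("\n");                        /* prints: 1 3 */
    while (NULL != head) { node_t *n = head; head = head->next; free(n); }
    return 0;
}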
static void mrorted_complete(const orte_job_t *jdata)
{
orte_iof_proc_t *proct;
unsigned char data[1];
opal_list_item_t *item;
/* the job is complete - close out the stdin
* of any procs it was feeding
*/
for (item = opal_list_get_first(&mca_iof_mr_orted_component.procs);
item != opal_list_get_end(&mca_iof_mr_orted_component.procs);
item = opal_list_get_next(item)) {
proct = (orte_iof_proc_t*)item;
if (proct->name.jobid == jdata->stdout_target) {
if (NULL == proct->sink) {
opal_output(0, "NULL SINK FOR PROC %s", ORTE_NAME_PRINT(&proct->name));
continue;
} else {
/* need to write a 0-byte event to clear the stream and close it */
orte_iof_base_write_output(&proct->name, ORTE_IOF_STDIN, data, 0, proct->sink->wev);
proct->sink = NULL;
}
}
}
}
static int finalize(void)
{
int rc;
opal_list_item_t *item;
while ((item = opal_list_remove_first(&mca_iof_mr_orted_component.sinks)) != NULL) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&mca_iof_mr_orted_component.sinks);
while ((item = opal_list_remove_first(&mca_iof_mr_orted_component.procs)) != NULL) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&mca_iof_mr_orted_component.procs);
/* Cancel the RML receive */
rc = orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_IOF_PROXY);
return rc;
}
/*
* FT event
*/
static int mrorted_ft_event(int state)
{
return ORTE_ERR_NOT_IMPLEMENTED;
}
static void stdin_write_handler(int fd, short event, void *cbdata)
{
orte_iof_sink_t *sink = (orte_iof_sink_t*)cbdata;
orte_iof_write_event_t *wev = sink->wev;
opal_list_item_t *item;
orte_iof_write_output_t *output;
int num_written;
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"%s mrorted:stdin:write:handler writing data to %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
wev->fd));
wev->pending = false;
while (NULL != (item = opal_list_remove_first(&wev->outputs))) {
output = (orte_iof_write_output_t*)item;
if (0 == output->numbytes) {
/* this indicates we are to close the fd - there is
* nothing to write
*/
OPAL_OUTPUT_VERBOSE((20, orte_iof_base.iof_output,
"%s iof:mrorted closing fd %d on write event due to zero bytes output",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), wev->fd));
OBJ_RELEASE(wev);
sink->wev = NULL;
return;
}
num_written = write(wev->fd, output->data, output->numbytes);
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"%s mrorted:stdin:write:handler wrote %d bytes",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
num_written));
if (num_written < 0) {
if (EAGAIN == errno || EINTR == errno) {
/* push this item back on the front of the list */
opal_list_prepend(&wev->outputs, item);
/* leave the write event running so it will call us again
* when the fd is ready.
*/
wev->pending = true;
opal_event_add(wev->ev, 0);
goto CHECK;
}
/* otherwise, something bad happened so all we can do is declare an error */
OBJ_RELEASE(output);
OPAL_OUTPUT_VERBOSE((20, orte_iof_base.iof_output,
"%s iof:mrorted closing fd %d on write event due to negative bytes written",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), wev->fd));
OBJ_RELEASE(wev);
sink->wev = NULL;
return;
} else if (num_written < output->numbytes) {
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"%s mrorted:stdin:write:handler incomplete write %d - adjusting data",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), num_written));
/* incomplete write - adjust data to avoid duplicate output */
memmove(output->data, &output->data[num_written], output->numbytes - num_written);
/* push this item back on the front of the list */
opal_list_prepend(&wev->outputs, item);
/* leave the write event running so it will call us again
* when the fd is ready.
*/
wev->pending = true;
opal_event_add(wev->ev, 0);
goto CHECK;
}
OBJ_RELEASE(output);
}
CHECK:
if (sink->xoff) {
/* if we have told the HNP to stop reading stdin, see if
* the proc has absorbed enough to justify restart
*
* RHC: Note that when multiple procs want stdin, we
* can get into a fight between a proc turning stdin
* back "on" and other procs turning it "off". There
* is no clear way to resolve this as different procs
* may take input at different rates.
*/
if (opal_list_get_size(&wev->outputs) < ORTE_IOF_MAX_INPUT_BUFFERS) {
/* restart the read */
sink->xoff = false;
orte_iof_mrorted_send_xonxoff(&sink->name, ORTE_IOF_XON);
}
}
}
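/* The heart of stdin_write_handler() is the partial-write pattern: write what
 * the fd accepts, memmove() the unwritten tail to the front, and retry when
 * the fd is writable again. A standalone sketch with the event machinery
 * elided:
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* returns bytes still pending, or -1 on a hard error */
static ssize_t write_some(int fd, unsigned char *data, size_t numbytes)
{
    ssize_t num_written = write(fd, data, numbytes);
    if (num_written < 0) {
        if (EAGAIN == errno || EINTR == errno) {
            return (ssize_t)numbytes;            /* nothing consumed - retry later */
        }
        return -1;                               /* hard error - caller closes the sink */
    }
    if ((size_t)num_written < numbytes) {
        /* incomplete write - keep the unwritten tail for the next event */
        memmove(data, data + num_written, numbytes - (size_t)num_written);
        return (ssize_t)(numbytes - (size_t)num_written);
    }
    return 0;                                    /* everything flushed */
}

int main(void)
{
    unsigned char msg[] = "hello\n";
    return (0 > write_some(STDOUT_FILENO, msg, sizeof(msg) - 1)) ? 1 : 0;
}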

orte/mca/iof/mr_orted/iof_mrorted.h (new file, 45 lines added)

@ -0,0 +1,45 @@
/*
* Copyright (c) 2012 Los Alamos National Security, LLC.
* All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef ORTE_IOF_MR_ORTED_H
#define ORTE_IOF_MR_ORTED_H
#include "orte_config.h"
#include "opal/class/opal_list.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/iof/iof.h"
BEGIN_C_DECLS
/**
* IOF MR_ORTED Component
*/
typedef struct {
orte_iof_base_component_t super;
opal_list_t sinks;
opal_list_t procs;
} orte_iof_mrorted_component_t;
ORTE_MODULE_DECLSPEC extern orte_iof_mrorted_component_t mca_iof_mr_orted_component;
extern orte_iof_base_module_t orte_iof_mrorted_module;
void orte_iof_mrorted_recv(int status, orte_process_name_t* sender,
opal_buffer_t* buffer, orte_rml_tag_t tag,
void* cbdata);
void orte_iof_mrorted_read_handler(int fd, short event, void *data);
void orte_iof_mrorted_send_xonxoff(orte_process_name_t *name, orte_iof_tag_t tag);
END_C_DECLS
#endif


@ -0,0 +1,85 @@
/*
* Copyright (c) 2012 Los Alamos National Security, LLC.
* All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/util/proc_info.h"
#include "iof_mrorted.h"
/*
* Local functions
*/
static int mr_orted_open(void);
static int mr_orted_close(void);
static int mr_orted_query(mca_base_module_t **module, int *priority);
/*
* Public string showing the iof mr_orted component version number
*/
const char *mca_iof_mr_orted_component_version_string =
"Open MPI mr_orted iof MCA component version " ORTE_VERSION;
orte_iof_mrorted_component_t mca_iof_mr_orted_component = {
{
{
ORTE_IOF_BASE_VERSION_2_0_0,
"mr_orted", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
/* Component open, close, and query functions */
mr_orted_open,
mr_orted_close,
mr_orted_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
}
};
/**
* component open/close/init function
*/
static int mr_orted_open(void)
{
/* Nothing to do */
return ORTE_SUCCESS;
}
static int mr_orted_close(void)
{
return ORTE_SUCCESS;
}
static int mr_orted_query(mca_base_module_t **module, int *priority)
{
if (ORTE_PROC_IS_DAEMON && orte_map_reduce) {
*priority = 1000;
*module = (mca_base_module_t *) &orte_iof_mrorted_module;
return ORTE_SUCCESS;
}
*priority = -1;
*module = NULL;
return ORTE_ERROR;
}
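/* A hypothetical sketch of priority-based component selection: each
 * candidate's query() reports a priority and the highest non-negative bid
 * wins, which is why mr_orted_query() returns 1000 only for a daemon running
 * in map-reduce mode. Function names and priorities below are illustrative,
 * not the real MCA selection code.
 */
#include <stdio.h>

typedef int (*query_fn_t)(int *priority);

static int plain_orted_query_sketch(int *priority) { *priority = 10;   return 0; }
static int mr_orted_query_sketch(int *priority)    { *priority = 1000; return 0; }

int main(void)
{
    query_fn_t candidates[] = { plain_orted_query_sketch, mr_orted_query_sketch };
    int best = -1, winner = -1;

    for (int i = 0; i < 2; i++) {
        int prio = -1;
        if (0 == candidates[i](&prio) && prio > best) {
            best = prio;
            winner = i;
        }
    }
    printf("selected candidate %d with priority %d\n", winner, best);
    return 0;
}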

orte/mca/iof/mr_orted/iof_mrorted_read.c (new file, 281 lines added)

@ -0,0 +1,281 @@
/*
* Copyright (c) 2012 Los Alamos National Security, LLC.
* All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include "opal/dss/dss.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/util/name_fns.h"
#include "orte/mca/state/state.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/iof/iof.h"
#include "orte/mca/iof/base/base.h"
#include "iof_mrorted.h"
static void send_data(orte_process_name_t *name, orte_iof_tag_t tag,
orte_jobid_t jobid,
unsigned char *data, int32_t nbytes);
void orte_iof_mrorted_read_handler(int fd, short event, void *cbdata)
{
orte_iof_read_event_t *rev = (orte_iof_read_event_t*)cbdata;
unsigned char data[ORTE_IOF_BASE_MSG_MAX];
opal_buffer_t *buf=NULL;
int rc;
int32_t numbytes;
opal_list_item_t *item;
orte_iof_proc_t *proct;
orte_ns_cmp_bitmask_t mask;
orte_job_t *jdata;
orte_job_map_t *map;
int i;
bool write_out=false;
orte_node_t *node;
orte_proc_t *daemon;
/* read up to the fragment size */
#if !defined(__WINDOWS__)
numbytes = read(fd, data, sizeof(data));
#else
{
DWORD readed;
HANDLE handle = (HANDLE)_get_osfhandle(fd);
ReadFile(handle, data, sizeof(data), &readed, NULL);
numbytes = (int)readed;
}
#endif /* !defined(__WINDOWS__) */
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"%s iof:mrorted:read handler read %d bytes from %s, fd %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
numbytes, ORTE_NAME_PRINT(&rev->name), fd));
if (numbytes <= 0) {
if (0 > numbytes) {
/* either we have a connection error or it was a non-blocking read */
if (EAGAIN == errno || EINTR == errno) {
/* non-blocking, retry */
opal_event_add(rev->ev, 0);
return;
}
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"%s iof:mrorted:read handler %s Error on connection:%d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&rev->name), fd));
}
/* numbytes must have been zero, so go down and close the fd etc */
goto CLEAN_RETURN;
}
/* see if the user wanted the output directed to files */
if (NULL != orte_output_filename) {
/* find the sink for this rank */
for (item = opal_list_get_first(&mca_iof_mr_orted_component.sinks);
item != opal_list_get_end(&mca_iof_mr_orted_component.sinks);
item = opal_list_get_next(item)) {
orte_iof_sink_t *sink = (orte_iof_sink_t*)item;
/* if the target is set, then this sink is for another purpose - ignore it */
if (ORTE_JOBID_INVALID != sink->daemon.jobid) {
continue;
}
/* if this sink isn't for output, ignore it */
if (ORTE_IOF_STDIN & sink->tag) {
continue;
}
mask = ORTE_NS_CMP_ALL;
/* is this the desired proc? */
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &sink->name, &rev->name)) {
/* output to the corresponding file */
orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, sink->wev);
/* done */
break;
}
}
}
if (ORTE_IOF_STDOUT & rev->tag) {
/* see if we need to forward this output */
jdata = orte_get_job_data_object(rev->name.jobid);
if (ORTE_JOBID_INVALID == jdata->stdout_target) {
/* end of the chain - just output the info */
write_out = true;
goto PROCESS;
}
/* it goes to the next job in the chain */
jdata = orte_get_job_data_object(jdata->stdout_target);
map = jdata->map;
for (i=0; i < map->nodes->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
continue;
}
daemon = node->daemon;
if (daemon->name.vpid == ORTE_PROC_MY_NAME->vpid) {
/* if it is me, then send the bytes down the stdin pipe
* for every local proc (they are all on my proct list)
*/
for (item = opal_list_get_first(&mca_iof_mr_orted_component.procs);
item != opal_list_get_end(&mca_iof_mr_orted_component.procs);
item = opal_list_get_next(item)) {
proct = (orte_iof_proc_t*)item;
if (proct->name.jobid == jdata->jobid) {
if (NULL == proct->sink) {
opal_output(0, "NULL SINK FOR PROC %s", ORTE_NAME_PRINT(&proct->name));
continue;
}
orte_iof_base_write_output(&proct->name, ORTE_IOF_STDIN, data, numbytes, proct->sink->wev);
}
}
} else {
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"%s sending %d bytes from stdout of %s to daemon %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
ORTE_NAME_PRINT(&rev->name),
ORTE_NAME_PRINT(&daemon->name)));
/* send the data to the daemon so it can
* write it to all local procs from this job
*/
send_data(&daemon->name, ORTE_IOF_STDIN, jdata->jobid, data, numbytes);
}
}
}
PROCESS:
if (write_out) {
/* prep the buffer */
buf = OBJ_NEW(opal_buffer_t);
/* pack the stream first - we do this so that flow control messages can
* consist solely of the tag
*/
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rev->tag, 1, ORTE_IOF_TAG))) {
ORTE_ERROR_LOG(rc);
goto CLEAN_RETURN;
}
/* pack name of process that gave us this data */
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rev->name, 1, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
goto CLEAN_RETURN;
}
/* pack the data - only pack the #bytes we read! */
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &data, numbytes, OPAL_BYTE))) {
ORTE_ERROR_LOG(rc);
goto CLEAN_RETURN;
}
/* start non-blocking RML call to forward received data */
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"%s iof:mrorted:read handler sending %d bytes to HNP",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes));
orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf, ORTE_RML_TAG_IOF_HNP,
0, orte_rml_send_callback, NULL);
}
/* re-add the event */
opal_event_add(rev->ev, 0);
return;
CLEAN_RETURN:
/* must be an error, or zero bytes were read indicating that the
* proc terminated this IOF channel - either way, find this proc
* on our list and clean up
*/
for (item = opal_list_get_first(&mca_iof_mr_orted_component.procs);
item != opal_list_get_end(&mca_iof_mr_orted_component.procs);
item = opal_list_get_next(item)) {
proct = (orte_iof_proc_t*)item;
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proct->name, &rev->name)) {
/* found it - release corresponding event. This deletes
* the read event and closes the file descriptor
*/
if (rev->tag & ORTE_IOF_STDOUT) {
if( NULL != proct->revstdout ) {
OBJ_RELEASE(proct->revstdout);
}
} else if (rev->tag & ORTE_IOF_STDERR) {
if( NULL != proct->revstderr ) {
OBJ_RELEASE(proct->revstderr);
}
} else if (rev->tag & ORTE_IOF_STDDIAG) {
if( NULL != proct->revstddiag ) {
OBJ_RELEASE(proct->revstddiag);
}
}
/* check to see if they are all done */
if (NULL == proct->revstdout &&
NULL == proct->revstderr &&
NULL == proct->revstddiag) {
/* this proc's iof is complete */
opal_list_remove_item(&mca_iof_mr_orted_component.procs, item);
ORTE_ACTIVATE_PROC_STATE(&proct->name, ORTE_PROC_STATE_IOF_COMPLETE);
OBJ_RELEASE(proct);
}
break;
}
}
if (NULL != buf) {
OBJ_RELEASE(buf);
}
return;
}
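/* The read handler implements the map-reduce chaining rule: a job's stdout
 * either terminates the chain or feeds the stdin of the job named by
 * stdout_target. A simplified sketch using hypothetical stand-in types (not
 * the real orte_job_t) to show only that decision:
 */
#include <stdint.h>
#include <stdio.h>

#define JOBID_INVALID_SKETCH UINT32_MAX   /* stand-in for ORTE_JOBID_INVALID */

typedef struct {
    uint32_t jobid;
    uint32_t stdout_target;               /* next job in the chain, or invalid */
} job_sketch_t;

static void route_stdout(const job_sketch_t *job, const char *data)
{
    if (JOBID_INVALID_SKETCH == job->stdout_target) {
        /* end of the chain - just emit the output */
        printf("[job %u stdout] %s", job->jobid, data);
    } else {
        /* forward to the stdin of the downstream job (delivery elided) */
        printf("[job %u -> stdin of job %u] %s",
               job->jobid, job->stdout_target, data);
    }
}

int main(void)
{
    job_sketch_t mapper  = { 1, 2 };                      /* mapper feeds job 2 */
    job_sketch_t reducer = { 2, JOBID_INVALID_SKETCH };   /* reducer ends the chain */
    route_stdout(&mapper,  "key value\n");
    route_stdout(&reducer, "key value\n");
    return 0;
}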
static void send_data(orte_process_name_t *name, orte_iof_tag_t tag,
orte_jobid_t jobid,
unsigned char *data, int32_t nbytes)
{
opal_buffer_t *buf;
int rc;
buf = OBJ_NEW(opal_buffer_t);
if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &tag, 1, ORTE_IOF_TAG))) {
ORTE_ERROR_LOG(rc);
return;
}
if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &jobid, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
return;
}
if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, data, nbytes, OPAL_BYTE))) {
ORTE_ERROR_LOG(rc);
return;
}
if (0 > (rc = orte_rml.send_buffer_nb(name, buf, ORTE_RML_TAG_IOF_PROXY,
0, orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
}
}

orte/mca/iof/mr_orted/iof_mrorted_receive.c (new file, 162 lines added)

@ -0,0 +1,162 @@
/*
* Copyright (c) 2012 Los Alamos National Security, LLC.
* All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/iof/iof_types.h"
#include "orte/mca/iof/base/base.h"
#include "iof_mrorted.h"
static void send_cb(int status, orte_process_name_t *peer,
opal_buffer_t *buf, orte_rml_tag_t tag,
void *cbdata)
{
/* nothing to do here - just release buffer and return */
OBJ_RELEASE(buf);
}
void orte_iof_mrorted_send_xonxoff(orte_process_name_t *name, orte_iof_tag_t tag)
{
opal_buffer_t *buf;
int rc;
buf = OBJ_NEW(opal_buffer_t);
/* pack the tag - we do this first so that flow control messages can
* consist solely of the tag
*/
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &tag, 1, ORTE_IOF_TAG))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
return;
}
/* add the name of the proc */
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, name, 1, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
return;
}
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"%s sending %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(ORTE_IOF_XON == tag) ? "xon" : "xoff"));
/* send the buffer to the HNP */
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf, ORTE_RML_TAG_IOF_HNP,
0, send_cb, NULL))) {
ORTE_ERROR_LOG(rc);
}
}
/*
* The only messages coming to an orted are either:
*
* (a) stdin, which is to be copied to whichever local
* procs "pull'd" a copy
*
* (b) flow control messages
*/
void orte_iof_mrorted_recv(int status, orte_process_name_t* sender,
opal_buffer_t* buffer, orte_rml_tag_t tag,
void* cbdata)
{
unsigned char data[ORTE_IOF_BASE_MSG_MAX];
orte_iof_tag_t stream;
int32_t count, numbytes;
orte_jobid_t jobid;
opal_list_item_t *item;
int rc;
/* see what stream generated this data */
count = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &stream, &count, ORTE_IOF_TAG))) {
ORTE_ERROR_LOG(rc);
goto CLEAN_RETURN;
}
/* if this isn't stdin, then we have an error */
if (ORTE_IOF_STDIN != stream) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto CLEAN_RETURN;
}
/* unpack the intended target */
count = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &jobid, &count, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
goto CLEAN_RETURN;
}
/* unpack the data */
numbytes=ORTE_IOF_BASE_MSG_MAX;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, data, &numbytes, OPAL_BYTE))) {
ORTE_ERROR_LOG(rc);
goto CLEAN_RETURN;
}
/* numbytes will contain the actual #bytes that were sent */
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"%s unpacked %d bytes for local job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
ORTE_JOBID_PRINT(jobid)));
/* cycle through our list of procs */
for (item = opal_list_get_first(&mca_iof_mr_orted_component.procs);
item != opal_list_get_end(&mca_iof_mr_orted_component.procs);
item = opal_list_get_next(item)) {
orte_iof_proc_t* sink = (orte_iof_proc_t*)item;
/* is this intended for this jobid? */
if (jobid == sink->name.jobid) {
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"%s writing data to local proc %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&sink->name)));
if (NULL == sink->sink->wev || sink->sink->wev->fd < 0) {
/* this sink was already closed - ignore this data */
goto CLEAN_RETURN;
}
/* send the bytes down the pipe - we even send 0 byte events
* down the pipe so it forces out any preceding data before
* closing the output stream
*/
if (ORTE_IOF_MAX_INPUT_BUFFERS < orte_iof_base_write_output(&sink->name, stream, data, numbytes, sink->sink->wev)) {
/* getting too backed up - tell the HNP to hold off any more input if we
* haven't already told it
*/
if (!sink->sink->xoff) {
sink->sink->xoff = true;
orte_iof_mrorted_send_xonxoff(&sink->name, ORTE_IOF_XOFF);
}
}
}
}
CLEAN_RETURN:
return;
}
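/* The receive path depends on a fixed packing order - IOF tag first (so
 * flow-control messages can consist solely of the tag), then the target
 * jobid, then the raw bytes. The real code uses opal_dss; this is a plain-C
 * analogue of the same framing with hypothetical tag/jobid values.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    unsigned char buf[64];
    const char payload[] = "some stdin data";
    uint32_t tag = 1, jobid = 2;                  /* hypothetical values */
    size_t off = 0;

    /* pack: tag, then jobid, then payload */
    memcpy(buf + off, &tag, sizeof(tag));         off += sizeof(tag);
    memcpy(buf + off, &jobid, sizeof(jobid));     off += sizeof(jobid);
    memcpy(buf + off, payload, sizeof(payload));  off += sizeof(payload);

    /* unpack on the receiving side in exactly the same order */
    uint32_t rtag, rjob;
    size_t roff = 0;
    memcpy(&rtag, buf + roff, sizeof(rtag));      roff += sizeof(rtag);
    memcpy(&rjob, buf + roff, sizeof(rjob));      roff += sizeof(rjob);
    printf("tag=%u jobid=%u data=%s\n", rtag, rjob, (const char *)(buf + roff));
    return 0;
}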


@ -84,6 +84,7 @@ orte_iof_base_module_t orte_iof_orted_module = {
orted_push,
orted_pull,
orted_close,
NULL,
finalize,
orted_ft_event
};


@ -10,7 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Los Alamos National Security, LLC. All rights
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
@ -62,6 +62,7 @@ orte_iof_base_module_t orte_iof_tool_module = {
tool_push,
tool_pull,
tool_close,
NULL,
finalize,
tool_ft_event
};


@ -229,12 +229,6 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
return rc;
}
/* pack the number of nodes involved in this job */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &map->num_nodes, 1, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the number of procs in this launch */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->num_procs, 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
@ -267,6 +261,12 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
return rc;
}
/* pack the stdout target */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->stdout_target, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack whether or not process recovery is allowed for this job */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->enable_recovery, 1, OPAL_BOOL))) {
ORTE_ERROR_LOG(rc);
@ -538,15 +538,10 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
goto REPORT_ERROR;
}
/* unpack the number of nodes involved in this job */
/* ensure the map object is present */
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
}
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->map->num_nodes, &cnt, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
/* unpack the number of procs in this launch */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->num_procs, &cnt, ORTE_VPID))) {
@ -579,6 +574,12 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
/* unpack the stdout target for the job */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->stdout_target, &cnt, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
/* unpack whether or not process recovery is allowed for this job */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->enable_recovery, &cnt, OPAL_BOOL))) {
@ -1114,7 +1115,6 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
char **argvsav=NULL;
int inm, j, idx;
int total_num_local_procs = 0;
orte_nid_t *nid;
orte_node_t *node;
orte_odls_launch_local_t *caddy = (orte_odls_launch_local_t*)cbdata;
orte_job_t *jobdat;
@ -1145,25 +1145,13 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
/* see if the mapper thinks we are oversubscribed */
oversubscribed = false;
if (ORTE_PROC_IS_HNP) {
/* just fake it - we don't keep a local nidmap */
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
ORTE_ACTIVATE_JOB_STATE(jobdat, ORTE_JOB_STATE_FAILED_TO_LAUNCH);
goto ERROR_OUT;
}
if (node->oversubscribed) {
oversubscribed = true;
}
} else {
/* RHC: the nidmap will eventually disappear, so for now just
* make this a non-fatal error
*/
if (NULL != (nid = orte_util_lookup_nid(ORTE_PROC_MY_NAME))) {
if (nid->oversubscribed) {
oversubscribed = true;
}
}
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
ORTE_ACTIVATE_JOB_STATE(jobdat, ORTE_JOB_STATE_FAILED_TO_LAUNCH);
goto ERROR_OUT;
}
if (node->oversubscribed) {
oversubscribed = true;
}
#if OPAL_ENABLE_FT_CR == 1
@ -1745,7 +1733,7 @@ void orte_odls_base_setup_singleton_jobdat(orte_jobid_t jobid)
opal_dss.pack(&buffer, &vpid1, 1, ORTE_VPID); /* num_procs */
#if OPAL_HAVE_HWLOC
bind_level = OPAL_HWLOC_NODE_LEVEL;
opal_dss.pack(&buffer, &bind_level, 1, OPAL_HWLOC_LEVEL_T); /* num_procs */
opal_dss.pack(&buffer, &bind_level, 1, OPAL_HWLOC_LEVEL_T); /* binding level */
#endif
one32 = 0;
opal_dss.pack(&buffer, &one32, 1, OPAL_INT32); /* node index */
@ -2095,6 +2083,9 @@ void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata)
if (NULL == (cptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
continue;
}
if (cptr->name.jobid != proc->name.jobid) {
continue;
}
if (cptr->registered) {
/* someone has registered, and we didn't before
* terminating - this is an abnormal termination


@ -191,6 +191,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
* do it - no new daemons will be launched
*/
if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & state->jdata->controls) {
state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
OBJ_RELEASE(state);
return;
@ -213,7 +214,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
* job to move to the following step
*/
state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED);
ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
OBJ_RELEASE(state);
return;
}
@ -234,7 +235,9 @@ static void launch_daemons(int fd, short args, void *cbdata)
"%s plm:alps: no new daemons to launch",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED);
if (ORTE_JOB_STATE_DAEMONS_REPORTED == daemons->state) {
ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
}
OBJ_RELEASE(state);
return;
}
@ -404,6 +407,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
/* indicate that the daemons for this job were launched */
state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
/* flag that launch was successful, so far as we currently know */
failed_launch = false;


@ -83,6 +83,7 @@ ORTE_DECLSPEC void orte_plm_base_app_report_launch(int fd, short event, void *da
ORTE_DECLSPEC void orte_plm_base_receive_process_msg(int fd, short event, void *data);
ORTE_DECLSPEC void orte_plm_base_setup_job(int fd, short args, void *cbdata);
ORTE_DECLSPEC void orte_plm_base_setup_job_complete(int fd, short args, void *cbdata);
ORTE_DECLSPEC void orte_plm_base_complete_setup(int fd, short args, void *cbdata);
ORTE_DECLSPEC void orte_plm_base_daemons_reported(int fd, short args, void *cbdata);
ORTE_DECLSPEC void orte_plm_base_daemons_launched(int fd, short args, void *cbdata);


@ -80,8 +80,6 @@
void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
int i;
orte_job_t *jdata;
#if OPAL_HAVE_HWLOC
{
@ -106,21 +104,17 @@ void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
continue;
}
node->topology = t;
if (NULL == node->topology) {
node->topology = t;
}
}
}
}
#endif
/* progress all jobs whose daemons have launched */
for (i=1; i < orte_job_data->size; i++) {
if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) {
continue;
}
if (ORTE_JOB_STATE_DAEMONS_LAUNCHED == jdata->state) {
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);
}
}
/* progress the job */
caddy->jdata->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_MAP);
/* cleanup */
OBJ_RELEASE(caddy);
@ -213,12 +207,21 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata)
free(bar2_val);
/* set the job state to the next position */
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_ALLOCATE);
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_INIT_COMPLETE);
/* cleanup */
OBJ_RELEASE(caddy);
}
void orte_plm_base_setup_job_complete(int fd, short args, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
/* nothing to do here but move along */
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_ALLOCATE);
OBJ_RELEASE(caddy);
}
void orte_plm_base_complete_setup(int fd, short args, void *cbdata)
{
orte_job_t *jdata, *jdatorted;
@ -510,12 +513,12 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
opal_buffer_t *buffer,
orte_rml_tag_t tag, void *cbdata)
{
orte_process_name_t peer;
char *rml_uri = NULL, *ptr;
int rc, idx;
orte_proc_t *daemon=NULL;
char *nodename;
orte_node_t *node;
orte_job_t *jdata;
/* get the daemon job, if necessary */
if (NULL == jdatorted) {
@ -562,7 +565,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:orted_report_launch from daemon %s on node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&peer), nodename));
ORTE_NAME_PRINT(sender), nodename));
/* look this node up, if necessary */
if (!orte_plm_globals.daemon_nodes_assigned_at_launch) {
@ -593,16 +596,29 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:orted_report_launch attempting to assign daemon %s to node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&peer), nodename));
ORTE_NAME_PRINT(sender), nodename));
for (idx=0; idx < orte_node_pool->size; idx++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, idx))) {
continue;
}
if (NULL != node->daemon) {
if (node->location_verified) {
/* already assigned */
continue;
}
if (0 == strcmp(nodename, node->name)) {
/* flag that we verified the location */
node->location_verified = true;
if (node == daemon->node) {
/* it wound up right where it should */
break;
}
/* remove the prior association */
if (NULL != daemon->node) {
OBJ_RELEASE(daemon->node);
}
if (NULL != node->daemon) {
OBJ_RELEASE(node->daemon);
}
/* associate this daemon with the node */
node->daemon = daemon;
OBJ_RETAIN(daemon);
@ -687,8 +703,18 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
} else {
jdatorted->num_reported++;
if (jdatorted->num_procs == jdatorted->num_reported) {
/* activate the daemons_reported state */
ORTE_ACTIVATE_JOB_STATE(jdatorted, ORTE_JOB_STATE_DAEMONS_REPORTED);
jdatorted->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
/* activate the daemons_reported state for all jobs
* whose daemons were launched
*/
for (idx=1; idx < orte_job_data->size; idx++) {
if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, idx))) {
continue;
}
if (ORTE_JOB_STATE_DAEMONS_LAUNCHED == jdata->state) {
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
}
}
}
}
@ -776,6 +802,9 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
opal_argv_append(argc, argv, "1");
}
#endif
if (orte_map_reduce) {
opal_argv_append(argc, argv, "--mapreduce");
}
/* the following two are not mca params */
if ((int)ORTE_VPID_INVALID != orted_debug_failure) {
@ -1116,7 +1145,6 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
node = (orte_node_t*)item;
/* if this node is already in the map, skip it */
if (NULL != node->daemon) {
OBJ_RELEASE(node);
continue;
}
/* add the node to the map */
@ -1146,19 +1174,22 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
return rc;
}
++daemons->num_procs;
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:setup_vm assigning new daemon %s to node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name),
node->name));
/* point the node to the daemon */
node->daemon = proc;
OBJ_RETAIN(proc); /* maintain accounting */
/* point the proc to the node and maintain accounting */
proc->node = node;
proc->nodename = node->name;
OBJ_RETAIN(node);
if (orte_plm_globals.daemon_nodes_assigned_at_launch) {
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:setup_vm assigning new daemon %s to node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name),
node->name));
/* point the node to the daemon */
node->daemon = proc;
OBJ_RETAIN(proc); /* maintain accounting */
/* point the proc to the node and maintain accounting */
proc->node = node;
proc->nodename = node->name;
OBJ_RETAIN(node);
node->location_verified = true;
} else {
node->location_verified = false;
}
/* track number of daemons to be launched */
++map->num_new_daemons;


@ -192,7 +192,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
* job to move to the following step
*/
state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED);
ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
OBJ_RELEASE(state);
return;
}
@ -219,7 +219,9 @@ static void launch_daemons(int fd, short args, void *cbdata)
"%s plm:lsf: no new daemons to launch",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED);
if (ORTE_JOB_STATE_DAEMONS_REPORTED == daemons->state) {
ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
}
OBJ_RELEASE(state);
return;
}
@ -349,6 +351,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
/* indicate that the daemons for this job were launched */
state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
/* flag that launch was successful, so far as we currently know */
failed_launch = false;


@ -97,18 +97,19 @@ typedef int32_t orte_job_state_t;
#define ORTE_JOB_STATE_UNDEF 0
#define ORTE_JOB_STATE_INIT 1 /* ready to be assigned id */
#define ORTE_JOB_STATE_ALLOCATE 2 /* ready to be allocated */
#define ORTE_JOB_STATE_MAP 3 /* ready to be mapped */
#define ORTE_JOB_STATE_SYSTEM_PREP 4 /* ready for final sanity check and system values updated */
#define ORTE_JOB_STATE_LAUNCH_DAEMONS 5 /* ready to launch daemons */
#define ORTE_JOB_STATE_DAEMONS_LAUNCHED 6 /* daemons for this job have been launched */
#define ORTE_JOB_STATE_DAEMONS_REPORTED 7 /* all launched daemons have reported */
#define ORTE_JOB_STATE_LAUNCH_APPS 8 /* ready to launch apps */
#define ORTE_JOB_STATE_RUNNING 9 /* all procs have been fork'd */
#define ORTE_JOB_STATE_SUSPENDED 10 /* job has been suspended */
#define ORTE_JOB_STATE_REGISTERED 11 /* all procs registered for sync */
#define ORTE_JOB_STATE_READY_FOR_DEBUGGERS 12 /* job ready for debugger init after spawn */
#define ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE 13 /* all local procs have attempted launch */
#define ORTE_JOB_STATE_INIT_COMPLETE 2 /* jobid assigned and setup */
#define ORTE_JOB_STATE_ALLOCATE 3 /* ready to be allocated */
#define ORTE_JOB_STATE_MAP 4 /* ready to be mapped */
#define ORTE_JOB_STATE_SYSTEM_PREP 5 /* ready for final sanity check and system values updated */
#define ORTE_JOB_STATE_LAUNCH_DAEMONS 6 /* ready to launch daemons */
#define ORTE_JOB_STATE_DAEMONS_LAUNCHED 7 /* daemons for this job have been launched */
#define ORTE_JOB_STATE_DAEMONS_REPORTED 8 /* all launched daemons have reported */
#define ORTE_JOB_STATE_LAUNCH_APPS 9 /* ready to launch apps */
#define ORTE_JOB_STATE_RUNNING 10 /* all procs have been fork'd */
#define ORTE_JOB_STATE_SUSPENDED 11 /* job has been suspended */
#define ORTE_JOB_STATE_REGISTERED 12 /* all procs registered for sync */
#define ORTE_JOB_STATE_READY_FOR_DEBUGGERS 13 /* job ready for debugger init after spawn */
#define ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE 14 /* all local procs have attempted launch */
/*
* Define a "boundary" so we can easily and quickly determine


@ -1109,7 +1109,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
* job to move to the following step
*/
state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED);
ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
OBJ_RELEASE(state);
return;
}
@ -1127,7 +1127,9 @@ static void launch_daemons(int fd, short args, void *cbdata)
* job to move to the following step
*/
state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED);
if (ORTE_JOB_STATE_DAEMONS_REPORTED == daemons->state) {
ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
}
OBJ_RELEASE(state);
return;
}
@ -1410,7 +1412,8 @@ static void launch_daemons(int fd, short args, void *cbdata)
/* set the job state to indicate the daemons are launched */
state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
/* trigger the event to start processing the launch list */
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:process: activating launch event",


@ -957,6 +957,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
* do it - no new daemons will be launched
*/
if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & state->jdata->controls) {
state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
OBJ_RELEASE(state);
return;
@ -979,7 +980,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
* job to move to the following step
*/
state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED);
ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
OBJ_RELEASE(state);
return;
}
@ -997,7 +998,9 @@ static void launch_daemons(int fd, short args, void *cbdata)
* job to move to the following step
*/
state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED);
if (ORTE_JOB_STATE_DAEMONS_REPORTED == daemons->state) {
ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
}
OBJ_RELEASE(state);
return;
}


@ -199,6 +199,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
* do it - no new daemons will be launched
*/
if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & state->jdata->controls) {
state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
OBJ_RELEASE(state);
return;
@ -221,7 +222,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
* job to move to the following step
*/
state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED);
ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
OBJ_RELEASE(state);
return;
}
@ -242,7 +243,9 @@ static void launch_daemons(int fd, short args, void *cbdata)
"%s plm:slurm: no new daemons to launch",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED);
if (ORTE_JOB_STATE_DAEMONS_REPORTED == daemons->state) {
ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
}
OBJ_RELEASE(state);
return;
}
@ -407,6 +410,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
/* indicate that the daemons for this job were launched */
state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
/* flag that launch was successful, so far as we currently know */
failed_launch = false;


@ -195,6 +195,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
* do it - no new daemons will be launched
*/
if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & jdata->controls) {
jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
OBJ_RELEASE(state);
return;
@ -217,7 +218,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
* job to move to the following step
*/
jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED);
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
OBJ_RELEASE(state);
return;
}
@ -235,7 +236,9 @@ static void launch_daemons(int fd, short args, void *cbdata)
* job to move to the following step
*/
jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED);
if (ORTE_JOB_STATE_DAEMONS_REPORTED == daemons->state) {
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
}
OBJ_RELEASE(state);
return;
}
@ -408,6 +411,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
/* indicate that the daemons for this job were launched */
state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
/* flag that launch was successful, so far as we currently know */
failed_launch = false;


@ -43,10 +43,10 @@ void orte_state_base_activate_job_state(orte_job_t *jdata,
}
if (s->job_state == state) {
OPAL_OUTPUT_VERBOSE((1, orte_state_base_output,
"%s ACTIVATING JOB %s STATE %s",
"%s ACTIVATING JOB %s STATE %s PRI %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid),
orte_job_state_to_str(state)));
orte_job_state_to_str(state), s->priority));
if (NULL == s->cbfunc) {
OPAL_OUTPUT_VERBOSE((1, orte_state_base_output,
"%s NULL CBFUNC FOR JOB %s STATE %s",
@ -90,6 +90,11 @@ void orte_state_base_activate_job_state(orte_job_t *jdata,
caddy->job_state = state;
OBJ_RETAIN(jdata);
}
OPAL_OUTPUT_VERBOSE((1, orte_state_base_output,
"%s ACTIVATING JOB %s STATE %s PRI %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid),
orte_job_state_to_str(state), s->priority));
opal_event_set(orte_event_base, &caddy->ev, -1, OPAL_EV_WRITE, s->cbfunc, caddy);
opal_event_set_priority(&caddy->ev, s->priority);
opal_event_active(&caddy->ev, OPAL_EV_WRITE, 1);
@ -217,10 +222,10 @@ void orte_state_base_activate_proc_state(orte_process_name_t *proc,
}
if (s->proc_state == state) {
OPAL_OUTPUT_VERBOSE((1, orte_state_base_output,
"%s ACTIVATING PROC %s STATE %s",
"%s ACTIVATING PROC %s STATE %s PRI %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc),
orte_proc_state_to_str(state)));
orte_proc_state_to_str(state), s->priority));
if (NULL == s->cbfunc) {
OPAL_OUTPUT_VERBOSE((1, orte_state_base_output,
"%s NULL CBFUNC FOR PROC %s STATE %s",
@ -258,6 +263,11 @@ void orte_state_base_activate_proc_state(orte_process_name_t *proc,
caddy = OBJ_NEW(orte_state_caddy_t);
caddy->name = *proc;
caddy->proc_state = state;
OPAL_OUTPUT_VERBOSE((1, orte_state_base_output,
"%s ACTIVATING PROC %s STATE %s PRI %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc),
orte_proc_state_to_str(state), s->priority));
opal_event_set(orte_event_base, &caddy->ev, -1, OPAL_EV_WRITE, s->cbfunc, caddy);
opal_event_set_priority(&caddy->ev, s->priority);
opal_event_active(&caddy->ev, OPAL_EV_WRITE, 1);
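/* ORTE_ACTIVATE_*_STATE hands a caddy to the event base at a given priority
 * via the opal/event wrappers. As a rough analogue, the same idea in plain
 * libevent2 (the state-name string is only an example payload):
 */
#include <event2/event.h>
#include <stdio.h>

static void state_cb(evutil_socket_t fd, short what, void *arg)
{
    (void)fd; (void)what;
    printf("activated state: %s\n", (const char *)arg);
}

int main(void)
{
    struct event_base *base = event_base_new();
    event_base_priority_init(base, 4);              /* e.g., four priority levels */

    struct event *ev = event_new(base, -1, EV_WRITE, state_cb,
                                 (void *)"DAEMONS_REPORTED");
    event_priority_set(ev, 0);                      /* 0 = most urgent */
    event_active(ev, EV_WRITE, 1);                  /* queue the callback */

    event_base_dispatch(base);                      /* runs state_cb once, then returns */
    event_free(ev);
    event_base_free(base);
    return 0;
}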


@ -86,6 +86,7 @@ static void report_progress(int fd, short argc, void *cbdata);
*/
static orte_job_state_t launch_states[] = {
ORTE_JOB_STATE_INIT,
ORTE_JOB_STATE_INIT_COMPLETE,
ORTE_JOB_STATE_ALLOCATE,
ORTE_JOB_STATE_DAEMONS_LAUNCHED,
ORTE_JOB_STATE_DAEMONS_REPORTED,
@ -102,6 +103,7 @@ static orte_job_state_t launch_states[] = {
};
static orte_state_cbfunc_t launch_callbacks[] = {
orte_plm_base_setup_job,
orte_plm_base_setup_job_complete,
orte_ras_base_allocate,
orte_plm_base_daemons_launched,
orte_plm_base_daemons_reported,
@ -372,6 +374,11 @@ static void check_all_complete(int fd, short args, void *cbdata)
/* turn off any sensor monitors on this job */
orte_sensor.stop(jdata->jobid);
/* tell the IOF that the job is complete */
if (NULL != orte_iof.complete) {
orte_iof.complete(jdata);
}
if (0 < jdata->num_non_zero_exit && !orte_abort_non_zero_exit) {
if (!orte_report_child_jobs_separately || 1 == ORTE_LOCAL_JOBID(jdata->jobid)) {
/* update the exit code */


@ -160,6 +160,8 @@ static void track_jobs(int fd, short argc, void *cbdata)
int rc;
if (ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE == caddy->job_state) {
opal_output(0, "%s state:orted:track_jobs sending local launch complete for job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(caddy->jdata->jobid));
/* update the HNP with all proc states for this job */
alert = OBJ_NEW(opal_buffer_t);
/* pack update state command */
@ -281,14 +283,6 @@ static void track_procs(int fd, short argc, void *cbdata)
* while we are still trying to notify the HNP of
* successful launch for short-lived procs
*/
/* Release only the stdin IOF file descriptor for this child, if one
* was defined. File descriptors for the other IOF channels - stdout,
* stderr, and stddiag - were released when their associated pipes
* were cleared and closed due to termination of the process
*/
if (NULL != orte_iof.close) {
orte_iof.close(proc, ORTE_IOF_STDIN);
}
pdata->iof_complete = true;
if (pdata->waitpid_recvd) {
/* the proc has terminated */
@ -325,6 +319,16 @@ static void track_procs(int fd, short argc, void *cbdata)
}
}
}
/* Release the stdin IOF file descriptor for this child, if one
* was defined. File descriptors for the other IOF channels - stdout,
* stderr, and stddiag - were released when their associated pipes
* were cleared and closed due to termination of the process
* Do this after we handle termination in case the IOF needs
* to check to see if all procs from the job are actually terminated
*/
if (NULL != orte_iof.close) {
orte_iof.close(proc, ORTE_IOF_STDIN);
}
} else if (ORTE_PROC_STATE_WAITPID_FIRED == state) {
/* do NOT update the proc state as this can hit
* while we are still trying to notify the HNP of


@ -116,6 +116,7 @@ static struct {
int fail;
int fail_delay;
bool abort;
bool mapreduce;
} orted_globals;
/*
@ -205,6 +206,10 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
"Nodes in cluster may differ in topology, so send the topology back from each node [Default = false]" },
#endif
{ NULL, NULL, NULL, '\0', "mapreduce", "mapreduce", 0,
&orted_globals.mapreduce, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to report process bindings to stderr" },
/* End of list */
{ NULL, NULL, NULL, '\0', NULL, NULL, 0,
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
@ -327,6 +332,11 @@ int orte_daemon(int argc, char *argv[])
#endif
tmp_env_var = NULL; /* Silence compiler warning */
/* if mapreduce set, flag it */
if (orted_globals.mapreduce) {
orte_map_reduce = true;
}
/* Set the flag telling OpenRTE that I am NOT a
* singleton, but am "infrastructure" - prevents setting
* up incorrect infrastructure that only a singleton would


@ -124,6 +124,7 @@ opal_pointer_array_t *orte_job_data;
opal_pointer_array_t *orte_node_pool;
opal_pointer_array_t *orte_node_topologies;
opal_pointer_array_t *orte_local_children;
uint16_t orte_num_jobs = 0;
/* Nidmap and job maps */
opal_pointer_array_t orte_nidmap;
@ -166,9 +167,6 @@ bool orte_do_not_barrier = false;
bool orte_enable_recovery;
int32_t orte_max_restarts;
/* comm fn for updating state */
orte_default_comm_fn_t orte_comm;
/* exit status reporting */
bool orte_report_child_jobs_separately;
struct timeval orte_child_time_to_exit;
@ -183,6 +181,9 @@ char *orte_forward_envars = NULL;
/* preload binaries */
bool orte_preload_binaries = false;
/* map-reduce mode */
bool orte_map_reduce = false;
/* map stddiag output to stderr so it isn't forwarded to mpirun */
bool orte_map_stddiag_to_stderr = false;
@ -637,6 +638,7 @@ static void orte_job_construct(orte_job_t* job)
job->num_apps = 0;
job->controls = ORTE_JOB_CONTROL_FORWARD_OUTPUT;
job->stdin_target = ORTE_VPID_INVALID;
job->stdout_target = ORTE_JOBID_INVALID;
job->total_slots_alloc = 0;
job->num_procs = 0;
job->procs = OBJ_NEW(opal_pointer_array_t);
@ -758,6 +760,7 @@ static void orte_node_construct(orte_node_t* node)
node->index = -1;
node->daemon = NULL;
node->daemon_launched = false;
node->location_verified = false;
node->launch_id = -1;
node->num_procs = 0;


@ -208,7 +208,10 @@ typedef uint16_t orte_job_controls_t;
#define ORTE_JOB_CONTROL_SPIN_FOR_DEBUG 0x0100
#define ORTE_JOB_CONTROL_RESTART 0x0200
#define ORTE_JOB_CONTROL_PROCS_MIGRATING 0x0400
#define ORTE_JOB_CONTROL_MAPPER 0x0800
#define ORTE_JOB_CONTROL_REDUCER 0x1000
#define ORTE_JOB_CONTROL_COMBINER 0x2000
/* global type definitions used by RTE - instanced in orte_globals.c */
/************
@ -293,6 +296,11 @@ typedef struct {
struct orte_proc_t *daemon;
/* whether or not this daemon has been launched */
bool daemon_launched;
/* whether or not the location has been verified - used
* for environments where the daemon's final destination
* is uncertain
*/
bool location_verified;
/** Launch id - needed by some systems to launch a proc on this node */
int32_t launch_id;
/** number of procs on this node */
@ -359,6 +367,8 @@ typedef struct {
* (wildcard), or none (invalid)
*/
orte_vpid_t stdin_target;
/* job that is to receive the stdout (on its stdin) from this one */
orte_jobid_t stdout_target;
/* collective ids */
orte_grpcomm_coll_id_t peer_modex;
orte_grpcomm_coll_id_t peer_init_barrier;
@ -635,6 +645,7 @@ ORTE_DECLSPEC extern opal_pointer_array_t *orte_job_data;
ORTE_DECLSPEC extern opal_pointer_array_t *orte_node_pool;
ORTE_DECLSPEC extern opal_pointer_array_t *orte_node_topologies;
ORTE_DECLSPEC extern opal_pointer_array_t *orte_local_children;
ORTE_DECLSPEC extern uint16_t orte_num_jobs;
/* Nidmap and job maps */
ORTE_DECLSPEC extern opal_pointer_array_t orte_nidmap;
@ -673,14 +684,6 @@ ORTE_DECLSPEC extern int32_t orte_max_restarts;
/* barrier control */
ORTE_DECLSPEC extern bool orte_do_not_barrier;
/* comm interface */
typedef void (*orte_default_cbfunc_t)(int fd, short event, void *data);
typedef int (*orte_default_comm_fn_t)(orte_process_name_t *recipient,
opal_buffer_t *buf,
orte_rml_tag_t tag,
orte_default_cbfunc_t cbfunc);
/* exit status reporting */
ORTE_DECLSPEC extern bool orte_report_child_jobs_separately;
ORTE_DECLSPEC extern struct timeval orte_child_time_to_exit;
@ -695,6 +698,9 @@ ORTE_DECLSPEC extern char *orte_forward_envars;
/* preload binaries */
ORTE_DECLSPEC extern bool orte_preload_binaries;
/* map-reduce mode */
ORTE_DECLSPEC extern bool orte_map_reduce;
/* map stddiag output to stderr so it isn't forwarded to mpirun */
ORTE_DECLSPEC extern bool orte_map_stddiag_to_stderr;


@ -35,7 +35,8 @@ SUBDIRS += \
tools/wrappers \
tools/orte-top \
tools/orte-info \
tools/orte-migrate
tools/orte-migrate \
tools/mapreduce
DIST_SUBDIRS += \
tools/orte-checkpoint \
@ -47,5 +48,6 @@ DIST_SUBDIRS += \
tools/wrappers \
tools/orte-top \
tools/orte-info \
tools/orte-migrate
tools/orte-migrate \
tools/mapreduce

orte/tools/mapreduce/Makefile.am (new file, 40 lines added)

@ -0,0 +1,40 @@
#
# Copyright (c) 2012 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
include $(top_srcdir)/Makefile.man-page-rules
man_pages = mapreduce.1
EXTRA_DIST = $(man_pages:.1=.1in)
if !ORTE_DISABLE_FULL_SUPPORT
if OMPI_INSTALL_BINARIES
bin_PROGRAMS = mapreduce
nodist_man_MANS = $(man_pages)
# Ensure that the man pages are rebuilt if the opal_config.h file
# changes; a "good enough" way to know if configure was run again (and
# therefore the release date or version may have changed)
$(nodist_man_MANS): $(top_builddir)/opal/include/opal_config.h
dist_pkgdata_DATA = help-mapreduce.txt
endif # OMPI_INSTALL_BINARIES
mapreduce_SOURCES = \
mapreduce.c
mapreduce_LDADD = $(top_builddir)/orte/libopen-rte.la
endif # !ORTE_DISABLE_FULL_SUPPORT
distclean-local:
rm -f $(man_pages)

orte/tools/mapreduce/help-mapreduce.txt (new file, 627 lines added)

@ -0,0 +1,627 @@
# -*- text -*-
#
# Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2012 Oak Ridge National Labs. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for Open RTE's orterun.
#
[orterun:init-failure]
Open RTE was unable to initialize properly. The error occurred while
attempting to %s. Returned value %d instead of ORTE_SUCCESS.
[orterun:usage]
%s (%s) %s
Usage: %s [OPTION]... [PROGRAM]...
Start the given program using Open RTE
%s
Report bugs to %s
[orterun:version]
%s (%s) %s
Report bugs to %s
[orterun:allocate-resources]
%s was unable to allocate enough resources to start your application.
This might be a transient error (too many nodes in the cluster were
unavailable at the time of the request) or a permanent error (you
requested more nodes than exist in your cluster).
While probably only useful to Open RTE developers, the error returned
was %d.
[orterun:error-spawning]
%s was unable to start the specified application. An attempt has been
made to clean up all processes that did start. The error returned was
%d.
[orterun:appfile-not-found]
Unable to open the appfile:
%s
Double check that this file exists and is readable.
[orterun:executable-not-specified]
No executable was specified on the %s command line.
Aborting.
[orterun:multi-apps-and-zero-np]
%s found multiple applications specified on the command line, with
at least one that failed to specify the number of processes to execute.
When specifying multiple applications, you must specify how many processes
of each to launch via the -np argument.
[orterun:nothing-to-do]
%s could not find anything to do.
It is possible that you forgot to specify how many processes to run
via the "-np" argument.
[orterun:call-failed]
%s encountered a %s call failure. This should not happen, and
usually indicates an error within the operating system itself.
Specifically, the following error occurred:
%s
The only other available information that may be helpful is the errno
that was returned: %d.
[orterun:environ]
%s was unable to set
%s = %s
in the environment. Returned value %d instead of ORTE_SUCCESS.
[orterun:precondition]
%s was unable to precondition transports
Returned value %d instead of ORTE_SUCCESS.
[orterun:attr-failed]
%s was unable to define an attribute
Returned value %d instead of ORTE_SUCCESS.
#
[orterun:proc-ordered-abort]
%s has exited due to process rank %lu with PID %lu on
node %s calling "abort". This may have caused other processes
in the application to be terminated by signals sent by %s
(as reported here).
#
[orterun:proc-exit-no-sync]
%s has exited due to process rank %lu with PID %lu on
node %s exiting improperly. There are three reasons this could occur:
1. this process did not call "init" before exiting, but others in
the job did. This can cause a job to hang indefinitely while it waits
for all processes to call "init". By rule, if one process calls "init",
then ALL processes must call "init" prior to termination.
2. this process called "init", but exited without calling "finalize".
By rule, all processes that call "init" MUST call "finalize" prior to
exiting or it will be considered an "abnormal termination"
3. this process called "MPI_Abort" or "orte_abort" and the mca parameter
orte_create_session_dirs is set to false. In this case, the run-time cannot
detect that the abort call was an abnormal termination. Hence, the only
error message you will receive is this one.
This may have caused other processes in the application to be
terminated by signals sent by %s (as reported here).
You can avoid this message by specifying -quiet on the %s command line.
#
[orterun:proc-exit-no-sync-unknown]
%s has exited due to a process exiting without calling "finalize",
but has no info as to the process that caused that situation. This
may have caused other processes in the application to be
terminated by signals sent by %s (as reported here).
#
[orterun:proc-aborted]
%s noticed that process rank %lu with PID %lu on node %s exited on signal %d.
#
[orterun:proc-aborted-unknown]
%s noticed that the job aborted, but has no info as to the process
that caused that situation.
#
[orterun:proc-aborted-signal-unknown]
%s noticed that the job aborted by signal, but has no info as
to the process that caused that situation.
#
[orterun:proc-aborted-strsignal]
%s noticed that process rank %lu with PID %lu on node %s exited on signal %d (%s).
#
[orterun:abnormal-exit]
WARNING: %s has exited before it received notification that all
started processes had terminated. You should double check and ensure
that there are no runaway processes still executing.
#
[orterun:sigint-while-processing]
WARNING: %s is in the process of killing a job, but has detected an
interruption (probably control-C).
It is dangerous to interrupt %s while it is killing a job (proper
termination may not be guaranteed). Hit control-C again within 1
second if you really want to kill %s immediately.
#
[orterun:double-prefix]
A prefix was supplied to %s, and the absolute path to %s was also
given:
Prefix: %s
Path: %s
Only one should be specified to avoid potential version
confusion. Operation will continue, but the -prefix option will be
used. This is done to allow you to select a different prefix for
the backend computation nodes than used on the frontend for %s.
#
[orterun:app-prefix-conflict]
A prefix or absolute path was given for %s, and a different
prefix was provided for the first app_context:
Mpirun prefix: %s
App prefix: %s
Only one should be specified to avoid potential version
confusion. Operation will continue, but the application's prefix
option will be ignored.
#
[orterun:empty-prefix]
A prefix was supplied to %s that only contained slashes.
This is a fatal error; %s will now abort. No processes were launched.
#
[debugger-mca-param-not-found]
Internal error -- the orte_base_user_debugger MCA parameter could not be
found. Please contact the Open RTE developers; this should not
happen.
#
[debugger-orte_base_user_debugger-empty]
The MCA parameter "orte_base_user_debugger" was empty, indicating that
no user-level debuggers have been defined. Please set this MCA
parameter to a value and try again.
#
[debugger-not-found]
A suitable debugger could not be found in your PATH. Check the values
specified in the orte_base_user_debugger MCA parameter for the list of
debuggers that was searched.
#
[debugger-exec-failed]
%s was unable to launch the specified debugger. This is what was
launched:
%s
Things to check:
- Ensure that the debugger is installed properly
- Ensure that the "%s" executable is in your path
- Ensure that any required licenses are available to run the debugger
#
[orterun:sys-limit-pipe]
%s was unable to launch the specified application as it encountered an error:
Error: system limit exceeded on number of pipes that can be open
Node: %s
when attempting to start process rank %lu.
This can be resolved by setting the mca parameter opal_set_max_sys_limits to 1,
increasing your descriptor limit setting (using the limit or ulimit commands),
asking the system administrator for that node to increase the system limit, or
by rearranging your processes to place fewer of them on that node.
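For example (illustrative only - the launcher name and limit value are
placeholders), the MCA parameter can be set on the command line:
  mpirun -mca opal_set_max_sys_limits 1 ...
or the per-shell descriptor limit can be raised before launching:
  ulimit -n 4096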
#
[orterun:sys-limit-sockets]
Error: system limit exceeded on number of network connections that can be open
This can be resolved by setting the mca parameter opal_set_max_sys_limits to 1,
increasing your descriptor limit setting (using the limit or ulimit commands),
or asking the system administrator to increase the system limit.
#
[orterun:pipe-setup-failure]
%s was unable to launch the specified application as it encountered an error:
Error: pipe function call failed when setting up I/O forwarding subsystem
Node: %s
while attempting to start process rank %lu.
#
[orterun:sys-limit-children]
%s was unable to launch the specified application as it encountered an error:
Error: system limit exceeded on number of processes that can be started
Node: %s
when attempting to start process rank %lu.
This can be resolved by either asking the system administrator for that node to
increase the system limit, or by rearranging your processes to place fewer of them
on that node.
#
[orterun:failed-term-attrs]
%s was unable to launch the specified application as it encountered an error:
Error: reading tty attributes function call failed while setting up I/O forwarding system
Node: %s
while attempting to start process rank %lu.
#
[orterun:wdir-not-found]
%s was unable to launch the specified application as it could not
change to the specified working directory:
Working directory: %s
Node: %s
while attempting to start process rank %lu.
#
[orterun:exe-not-found]
%s was unable to find the specified executable file, and therefore
did not launch the job. This error was first reported for process
rank %lu; it may have occurred for other processes as well.
NOTE: A common cause for this error is misspelling a %s command
line parameter option (remember that %s interprets the first
unrecognized command line token as the executable).
Node: %s
Executable: %s
#
[orterun:exe-not-accessible]
%s was unable to launch the specified application as it could not access
or execute an executable:
Executable: %s
Node: %s
while attempting to start process rank %lu.
#
[orterun:pipe-read-failure]
%s was unable to launch the specified application as it encountered an error:
Error: reading from a pipe function call failed while spawning a local process
Node: %s
while attempting to start process rank %lu.
#
[orterun:proc-failed-to-start]
%s was unable to start the specified application as it encountered an
error:
Error name: %s
Node: %s
when attempting to start process rank %lu.
#
[orterun:proc-socket-not-avail]
%s was unable to start the specified application as it encountered an
error:
Error name: %s
Node: %s
when attempting to start process rank %lu.
#
[orterun:proc-failed-to-start-no-status]
%s was unable to start the specified application as it encountered an
error on node %s. More information may be available above.
#
[orterun:proc-failed-to-start-no-status-no-node]
%s was unable to start the specified application as it encountered an
error. More information may be available above.
#
[debugger requires -np]
The number of MPI processes to launch was not specified on the command
line.
The %s debugger requires that you specify a number of MPI processes to
launch on the command line via the "-np" command line parameter. For
example:
%s -np 4 %s
Skipping the %s debugger for now.
#
[debugger requires executable]
The %s debugger requires that you specify an executable on the %s
command line; you cannot specify application context files when
launching this job in the %s debugger. For example:
%s -np 4 my_mpi_executable
Skipping the %s debugger for now.
#
[debugger only accepts single app]
The %s debugger only accepts SPMD-style launching; specifying an
MPMD-style launch (with multiple applications separated via ':') is
not permitted.
Skipping the %s debugger for now.
#
[orterun:daemon-died-during-execution]
%s has detected that a required daemon terminated during execution
of the application with a non-zero status. This is a fatal error.
A best-effort attempt has been made to cleanup. However, it is
-strongly- recommended that you execute the orte-clean utility
to ensure full cleanup is accomplished.
#
[orterun:no-orted-object-exit]
%s was unable to determine the status of the daemons used to
launch this application. Additional manual cleanup may be required.
Please refer to the "orte-clean" tool for assistance.
#
[orterun:unclean-exit]
%s was unable to cleanly terminate the daemons on the nodes shown
below. Additional manual cleanup may be required - please refer to
the "orte-clean" tool for assistance.
#
[orterun:event-def-failed]
%s was unable to define an event required for proper operation of
the system. The reason for this error was:
Error: %s
Please report this to the Open MPI mailing list users@open-mpi.org.
#
[orterun:ompi-server-filename-bad]
%s was unable to parse the filename where contact info for the
ompi-server was to be found. The option we were given was:
--ompi-server %s
This appears to be missing the required ':' following the
keyword "file". Please remember that the correct format for this
command line option is:
--ompi-server file:path-to-file
where path-to-file can be either relative to the cwd or absolute.
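For example (the file path shown is only a placeholder):
  --ompi-server file:/tmp/ompi-server.uri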
#
[orterun:ompi-server-filename-missing]
%s was unable to parse the filename where contact info for the
ompi-server was to be found. The option we were given was:
--ompi-server %s
This appears to be missing a filename following the ':'. Please
remember that the correct format for this command line option is:
--ompi-server file:path-to-file
where path-to-file can be either relative to the cwd or absolute.
#
[orterun:ompi-server-filename-access]
%s was unable to access the filename where contact info for the
ompi-server was to be found. The option we were given was:
--ompi-server %s
Please remember that the correct format for this command line option is:
--ompi-server file:path-to-file
where path-to-file can be either relative to the cwd or absolute, and that
you must have read access permissions to that file.
#
[orterun:ompi-server-file-bad]
%s was unable to read the ompi-server's contact info from the
given filename. The filename we were given was:
FILE: %s
Please remember that the correct format for this command line option is:
--ompi-server file:path-to-file
where path-to-file can be either relative to the cwd or absolute, and that
the file must have a single line in it that contains the Open MPI
uri for the ompi-server. Note that this is *not* a standard uri, but
a special format used internally by Open MPI for communications. It can
best be generated by simply directing the ompi-server to put its
uri in a file, and then giving %s that filename.
[orterun:multiple-hostfiles]
Error: More than one hostfile was passed for a single application
context, which is not supported at this time.
#
[orterun:conflicting-params]
%s has detected multiple instances of an MCA param being specified on
the command line, with conflicting values:
MCA param: %s
Value 1: %s
Value 2: %s
This MCA param does not support multiple values, and the system is unable
to identify which value was intended. If this was done in error, please
re-issue the command with only one value. You may wish to review the
output from ompi_info for guidance on accepted values for this param.
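For example (the parameter name and values below are hypothetical), a
command line such as the following would trigger this error:
  mpirun -mca some_param value1 -mca some_param value2 ./my_app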
[orterun:server-not-found]
%s was instructed to wait for the requested ompi-server, but was unable to
establish contact with the server during the specified wait time:
Server uri: %s
Timeout time: %ld
Error received: %s
Please check to ensure that the requested server matches the actual server
information, and that the server is in operation.
#
[orterun:ompi-server-pid-bad]
%s was unable to parse the PID of the %s to be used as the ompi-server.
The option we were given was:
--ompi-server %s
Please remember that the correct format for this command line option is:
--ompi-server PID:pid-of-%s
where PID can be either "PID" or "pid".
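For example (the pid shown is only a placeholder):
  --ompi-server pid:12345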
#
[orterun:ompi-server-could-not-get-hnp-list]
%s was unable to search the list of local %s contact files to find the
specified pid. You might check to see if your local session directory
is available and that you have read permissions on the top of that
directory tree.
#
[orterun:ompi-server-pid-not-found]
%s was unable to find an %s with the specified pid of %d that was to
be used as the ompi-server. The option we were given was:
--ompi-server %s
Please remember that the correct format for this command line option is:
--ompi-server PID:pid-of-%s
where PID can be either "PID" or "pid".
#
[orterun:write_file]
%s was unable to open a file to print out %s as requested. The file
name given was:
File: %s
#
[orterun:multiple-paffinity-schemes]
Multiple processor affinity schemes were specified (can only specify
one):
Slot list: %s
opal_paffinity_alone: true
Please specify only the one desired method.
#
[orterun:slot-list-failed]
We were unable to successfully process/set the requested processor
affinity settings:
Specified slot list: %s
Error: %s
This could mean that a non-existent processor was specified, or
that the specification had improper syntax.
#
[orterun:invalid-node-rank]
An invalid node rank was obtained - this is probably something
that should be reported to the OMPI developers.
#
[orterun:invalid-local-rank]
An invalid local rank was obtained - this is probably something
that should be reported to the OMPI developers.
#
[orterun:invalid-phys-cpu]
An invalid physical processor id was returned when attempting to
set processor affinity - please check to ensure that your system
supports such functionality. If so, then this is probably something
that should be reported to the OMPI developers.
#
[orterun:failed-set-paff]
An attempt to set processor affinity has failed - please check to
ensure that your system supports such functionality. If so, then
this is probably something that should be reported to the OMPI
developers.
#
[orterun:topo-not-supported]
An attempt was made to bind a process to a specific hardware topology
mapping (e.g., binding to a socket) but the operating system does not
support such topology-aware actions. Talk to your local system
administrator to find out if your system can support topology-aware
functionality (e.g., Linux Kernels newer than v2.6.18).
Systems that do not support processor topology-aware functionality
cannot use "bind to socket" and other related functionality.
Local host: %s
Action attempted: %s %s
Application name: %s
#
[orterun:binding-not-avail]
A request was made to bind the processes if the operating system supports
such an operation, but the OS does not support this operation:
Local host: %s
Action requested: %s
Application name: %s
Because the request was made on an "if-available" basis, the job was
launched without taking the requested action. If this is not the
desired behavior, talk to your local system administrator to find out
if your system can support the requested action.
#
[orterun:not-enough-resources]
Not enough %s were found on the local host to meet the requested
binding action:
Local host: %s
Action requested: %s
Application name: %s
Please revise the request and try again.
#
[orterun:paffinity-missing-module]
A request to bind processes was made, but no paffinity module
was found:
Local host: %s
This is potentially a configuration problem. You can rerun your job without
requesting binding, or check the configuration.
#
[orterun:invalid-slot-list-range]
A slot list was provided that exceeds the boundaries on available
resources:
Local host: %s
Slot list: %s
Please check your boundaries and try again.
#
[orterun:proc-comm-failed]
A critical communication path was lost to:
My name: %s
Process name: %s
Node: %s
#
[orterun:proc-mem-exceeded]
A process exceeded memory limits:
Process name: %s
Node: %s
#
[orterun:proc-stalled]
One or more processes appear to have stalled - a monitored file
failed to show the required activity.
#
[orterun:proc-sensor-exceeded]
One or more processes have exceeded a specified sensor limit, but
no further info is available.
#
[orterun:proc-called-abort]
%s detected that one or more processes called %s_abort, thus causing
the job to be terminated.
#
[orterun:proc-heartbeat-failed]
%s failed to receive scheduled heartbeat communications from a remote
process:
Process name: %s
Node: %s
#
[orterun:non-zero-exit]
%s detected that one or more processes exited with non-zero status, thus causing
the job to be terminated. The first process to do so was:
Process name: %s
Exit code: %d
#

1293
orte/tools/mapreduce/mapreduce.1in Normal file

File diff suppressed because it is too large

2138
orte/tools/mapreduce/mapreduce.c Normal file

File diff suppressed because it is too large

View file

@ -621,3 +621,12 @@ the job to be terminated. The first process to do so was:
Process name: %s
Exit code: %d
#
[orterun:unrecognized-mr-type]
%s does not recognize the type of job. This should not happen and
indicates an ORTE internal problem.
#
[multiple-combiners]
More than one combiner was specified. The combiner takes the output
from the final reducer in each chain to produce a single, combined
result. Thus, there can only be one combiner for a job. Please
review your command line and try again.
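As an illustration only (the program names below are placeholders, not
actual options), two chains feeding a single combiner can be pictured as:
  chain 1: mapper_a -> reducer_a \
                                   -> combiner -> combined result
  chain 2: mapper_b -> reducer_b /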

View file

@ -203,6 +203,8 @@ const char *orte_job_state_to_str(orte_job_state_t state)
return "UNDEFINED";
case ORTE_JOB_STATE_INIT:
return "PENDING INIT";
case ORTE_JOB_STATE_INIT_COMPLETE:
return "INIT_COMPLETE";
case ORTE_JOB_STATE_ALLOCATE:
return "PENDING ALLOCATION";
case ORTE_JOB_STATE_MAP:

View file

@ -9,6 +9,9 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC.
* All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -1103,13 +1106,15 @@ int orte_util_decode_daemon_pidmap(opal_byte_object_t *bo)
#endif
orte_std_cntr_t n;
opal_buffer_t buf;
int rc, j;
int rc, j, k;
orte_job_t *jdata;
orte_proc_t *proc, *pptr;
orte_node_t *node;
orte_node_t *node, *nptr;
orte_proc_state_t *states=NULL;
orte_app_idx_t *app_idx=NULL;
int32_t *restarts=NULL;
orte_job_map_t *map;
bool found;
/* xfer the byte object to a buffer for unpacking */
OBJ_CONSTRUCT(&buf, opal_buffer_t);
@ -1212,6 +1217,11 @@ int orte_util_decode_daemon_pidmap(opal_byte_object_t *bo)
}
/* xfer the data */
map = jdata->map;
if (NULL == map) {
jdata->map = OBJ_NEW(orte_job_map_t);
map = jdata->map;
}
for (i=0; i < num_procs; i++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
proc = OBJ_NEW(orte_proc_t);
@ -1231,6 +1241,21 @@ int orte_util_decode_daemon_pidmap(opal_byte_object_t *bo)
OBJ_RELEASE(pptr);
opal_pointer_array_set_item(proc->node->procs, j, NULL);
proc->node->num_procs--;
if (0 == proc->node->num_procs) {
/* remove node from the map */
for (k=0; k < map->nodes->size; k++) {
if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(map->nodes, k))) {
continue;
}
if (nptr == proc->node) {
/* maintain accounting */
OBJ_RELEASE(nptr);
opal_pointer_array_set_item(map->nodes, k, NULL);
map->num_nodes--;
break;
}
}
}
break;
}
}
@ -1242,6 +1267,21 @@ int orte_util_decode_daemon_pidmap(opal_byte_object_t *bo)
node = OBJ_NEW(orte_node_t);
opal_pointer_array_set_item(orte_node_pool, nodes[i], node);
}
/* see if this node is already in the map */
found = false;
for (j=0; j < map->nodes->size; j++) {
if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(map->nodes, j))) {
continue;
}
if (nptr == node) {
found = true;
break;
}
}
if (!found) {
opal_pointer_array_add(map->nodes, node);
map->num_nodes++;
}
/* add the node to the proc */
OBJ_RETAIN(node);
proc->node = node;