1
1

orte/iof: Address the case when output is a regular file

Regular files are always write-ready, so non-blocking I/O provides no
benefit for them.
Moreover, if libevent uses "epoll" to track fd events, epoll_ctl
refuses to add an fd that refers to a regular file, failing with
EPERM.
This fix checks the type of object referenced by the fd and, for such
fds, activates the write event directly with event_active instead of
registering it with event_add.

In the original configuration that uncovered this issue, libevent was
using "epoll", which triggered the following warning message:
"[warn] Epoll ADD(1) on fd 0 failed.  Old events were 0; read
change was 1 (add); write change was 0 (none): Operation not
permitted"
The side effect was that all output accumulated in mpirun's memory and
was actually written only when mpirun exited.

Signed-off-by: Artem Polyakov <artpol84@gmail.com>
Этот коммит содержится в:
Artem Polyakov 2017-06-29 06:06:33 +07:00
родитель d1c5955b73
Коммит d9ad918a14
5 изменённых файлов: 112 добавлений и 3 удалений

Просмотреть файл

@ -1,6 +1,7 @@
/* /*
* Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009 Sandia National Laboratories. All rights reserved. * Copyright (c) 2009 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2017 Mellanox Technologies. All rights reserved.
* *
* $COPYRIGHT$ * $COPYRIGHT$
* *
@ -11,6 +12,14 @@
#include "opal_config.h" #include "opal_config.h"
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#ifdef HAVE_UNISTD_H #ifdef HAVE_UNISTD_H
#include <unistd.h> #include <unistd.h>
#endif #endif
@ -89,3 +98,31 @@ int opal_fd_set_cloexec(int fd)
return OPAL_SUCCESS; return OPAL_SUCCESS;
} }
/*
 * Report whether "fd" refers to a regular file.
 *
 * The descriptor is inspected with fstat(); if that call fails the
 * descriptor is reported as not being a regular file.
 */
bool opal_fd_is_regular(int fd)
{
    struct stat info;

    return (0 == fstat(fd, &info)) && S_ISREG(info.st_mode);
}
/*
 * Report whether "fd" refers to a character device.
 *
 * The descriptor is inspected with fstat(); if that call fails the
 * descriptor is reported as not being a character device.
 */
bool opal_fd_is_chardev(int fd)
{
    struct stat info;

    return (0 == fstat(fd, &info)) && S_ISCHR(info.st_mode);
}
/*
 * Report whether "fd" refers to a block device.
 *
 * The descriptor is inspected with fstat(); if that call fails the
 * descriptor is reported as not being a block device.
 */
bool opal_fd_is_blkdev(int fd)
{
    struct stat info;

    return (0 == fstat(fd, &info)) && S_ISBLK(info.st_mode);
}

Просмотреть файл

@ -1,6 +1,7 @@
/* /*
* Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009 Sandia National Laboratories. All rights reserved. * Copyright (c) 2009 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2017 Mellanox Technologies. All rights reserved.
* *
* $COPYRIGHT$ * $COPYRIGHT$
* *
@ -63,6 +64,37 @@ OPAL_DECLSPEC int opal_fd_write(int fd, int len, const void *buffer);
*/ */
OPAL_DECLSPEC int opal_fd_set_cloexec(int fd); OPAL_DECLSPEC int opal_fd_set_cloexec(int fd);
/**
* Convenience function to check whether a file descriptor refers to a
* regular file.
*
* @param fd File descriptor
*
* @returns true if "fd" refers to a regular file.
* @returns false otherwise, including when "fd" cannot be queried
*          with fstat().
*/
OPAL_DECLSPEC bool opal_fd_is_regular(int fd);
/**
* Convenience function to check whether a file descriptor refers to a
* character device.
*
* @param fd File descriptor
*
* @returns true if "fd" refers to a character device.
* @returns false otherwise, including when "fd" cannot be queried
*          with fstat().
*/
OPAL_DECLSPEC bool opal_fd_is_chardev(int fd);
/**
* Convenience function to check whether a file descriptor refers to a
* block device.
*
* @param fd File descriptor
*
* @returns true if "fd" refers to a block device.
* @returns false otherwise, including when "fd" cannot be queried
*          with fstat().
*/
OPAL_DECLSPEC bool opal_fd_is_blkdev(int fd);
END_C_DECLS END_C_DECLS
#endif #endif

Просмотреть файл

@ -14,6 +14,7 @@
* All rights reserved. * All rights reserved.
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved. * Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2017 IBM Corporation. All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved.
* Copyright (c) 2017 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -48,6 +49,7 @@
#include "opal/class/opal_bitmap.h" #include "opal/class/opal_bitmap.h"
#include "orte/mca/mca.h" #include "orte/mca/mca.h"
#include "opal/mca/event/event.h" #include "opal/mca/event/event.h"
#include "opal/util/fd.h"
#include "orte/mca/iof/iof.h" #include "orte/mca/iof/iof.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
@ -84,6 +86,7 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_iof_job_t);
typedef struct { typedef struct {
opal_list_item_t super; opal_list_item_t super;
bool pending; bool pending;
bool always_writable;
opal_event_t *ev; opal_event_t *ev;
int fd; int fd;
opal_list_t outputs; opal_list_t outputs;
@ -157,6 +160,9 @@ typedef struct orte_iof_base_t orte_iof_base_t;
ep->tag = (tg); \ ep->tag = (tg); \
if (0 <= (fid)) { \ if (0 <= (fid)) { \
ep->wev->fd = (fid); \ ep->wev->fd = (fid); \
ep->wev->always_writable = opal_fd_is_regular(fid) || \
opal_fd_is_chardev(fid) || \
opal_fd_is_blkdev(fid); \
opal_event_set(orte_event_base, \ opal_event_set(orte_event_base, \
ep->wev->ev, ep->wev->fd, \ ep->wev->ev, ep->wev->fd, \
OPAL_EV_WRITE, \ OPAL_EV_WRITE, \

Просмотреть файл

@ -15,6 +15,7 @@
* Copyright (c) 2015-2017 Research Organization for Information Science * Copyright (c) 2015-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved. * and Technology (RIST). All rights reserved.
* Copyright (c) 2017 IBM Corporation. All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved.
* Copyright (c) 2017 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -298,6 +299,7 @@ OBJ_CLASS_INSTANCE(orte_iof_read_event_t,
static void orte_iof_base_write_event_construct(orte_iof_write_event_t* wev) static void orte_iof_base_write_event_construct(orte_iof_write_event_t* wev)
{ {
wev->pending = false; wev->pending = false;
wev->always_writable = false;
wev->fd = -1; wev->fd = -1;
OBJ_CONSTRUCT(&wev->outputs, opal_list_t); OBJ_CONSTRUCT(&wev->outputs, opal_list_t);
wev->ev = opal_event_alloc(); wev->ev = opal_event_alloc();

Просмотреть файл

@ -11,6 +11,7 @@
* All rights reserved. * All rights reserved.
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved. * Copyright (c) 2017 Intel, Inc. All rights reserved.
* Copyright (c) 2017 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -259,13 +260,22 @@ int orte_iof_base_write_output(const orte_process_name_t *name, orte_iof_tag_t s
/* is the write event issued? */ /* is the write event issued? */
if (!channel->pending) { if (!channel->pending) {
int rc = -1;
/* issue it */ /* issue it */
OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
"%s write:output adding write event", "%s write:output adding write event",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
channel->pending = true; channel->pending = true;
ORTE_POST_OBJECT(channel); ORTE_POST_OBJECT(channel);
opal_event_add(channel->ev, 0); if (channel->always_writable) {
/* Regular is always write ready. Activate the handler. */
opal_event_active (channel->ev, OPAL_EV_WRITE, 1);
} else {
rc = opal_event_add(channel->ev, 0);
if (rc) {
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
}
}
} }
return num_buffered; return num_buffered;
@ -297,13 +307,14 @@ void orte_iof_base_static_dump_output(orte_iof_read_event_t *rev)
} }
} }
#define ORTE_IOF_REGULARF_BLOCK (1024)
void orte_iof_base_write_handler(int fd, short event, void *cbdata) void orte_iof_base_write_handler(int fd, short event, void *cbdata)
{ {
orte_iof_sink_t *sink = (orte_iof_sink_t*)cbdata; orte_iof_sink_t *sink = (orte_iof_sink_t*)cbdata;
orte_iof_write_event_t *wev = sink->wev; orte_iof_write_event_t *wev = sink->wev;
opal_list_item_t *item; opal_list_item_t *item;
orte_iof_write_output_t *output; orte_iof_write_output_t *output;
int num_written; int num_written, total_written = 0;
ORTE_ACQUIRE_OBJECT(sink); ORTE_ACQUIRE_OBJECT(sink);
@ -333,6 +344,10 @@ void orte_iof_base_write_handler(int fd, short event, void *cbdata)
/* leave the write event running so it will call us again /* leave the write event running so it will call us again
* when the fd is ready. * when the fd is ready.
*/ */
if(wev->always_writable){
/* Schedule another event */
opal_event_active (wev->ev, OPAL_EV_WRITE, 1);
}
return; return;
} }
/* otherwise, something bad happened so all we can do is abort /* otherwise, something bad happened so all we can do is abort
@ -356,12 +371,29 @@ void orte_iof_base_write_handler(int fd, short event, void *cbdata)
/* leave the write event running so it will call us again /* leave the write event running so it will call us again
* when the fd is ready * when the fd is ready
*/ */
if(wev->always_writable){
/* Schedule another event */
opal_event_active (wev->ev, OPAL_EV_WRITE, 1);
}
return; return;
} }
OBJ_RELEASE(output); OBJ_RELEASE(output);
total_written += num_written;
if(wev->always_writable && (ORTE_IOF_REGULARF_BLOCK <= total_written)){
/* If this is a regular file it will never tell us it will block
* Write no more than ORTE_IOF_REGULARF_BLOCK at a time allowing
* other fds to progress
*/
opal_event_active (wev->ev, OPAL_EV_WRITE, 1);
return;
}
} }
ABORT: ABORT:
opal_event_del(wev->ev); if (!wev->always_writable){
opal_event_del(wev->ev);
}
wev->pending = false; wev->pending = false;
ORTE_POST_OBJECT(wev); ORTE_POST_OBJECT(wev);
} }