1
1

orte/iof: Address the case when output is a regular file

Regular files are always write-ready, so non-blocking I/O does not
give any benefits for them.
More than that - if libevent is using "epoll" to track fd events,
epoll_ctl will refuse attempt to add an fd pointing to a regular
file descriptor with EPERM.
This fix checks the object referenced by fd and avoids event_add
using event_active instead.

In the original configuration that uncovered this issue "epoll"
was used in libevent, it was triggering the following warning
message:
"[warn] Epoll ADD(1) on fd 0 failed.  Old events were 0; read
change was 1 (add); write change was 0 (none): Operation not
permitted"
And the side effect was accumulation of all output in mpirun
memory and actually writing it only at mpirun exit.

Signed-off-by: Artem Polyakov <artpol84@gmail.com>
Этот коммит содержится в:
Artem Polyakov 2017-06-29 06:06:33 +07:00
родитель d1c5955b73
Коммит d9ad918a14
5 изменённых файлов: 112 добавлений и 3 удалений

Просмотреть файл

@ -1,6 +1,7 @@
/*
* Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2017 Mellanox Technologies. All rights reserved.
*
* $COPYRIGHT$
*
@ -11,6 +12,14 @@
#include "opal_config.h"
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
@ -89,3 +98,31 @@ int opal_fd_set_cloexec(int fd)
return OPAL_SUCCESS;
}
bool opal_fd_is_regular(int fd)
{
struct stat buf;
if (fstat(fd, &buf)) {
return false;
}
return S_ISREG(buf.st_mode);
}
bool opal_fd_is_chardev(int fd)
{
struct stat buf;
if (fstat(fd, &buf)) {
return false;
}
return S_ISCHR(buf.st_mode);
}
bool opal_fd_is_blkdev(int fd)
{
struct stat buf;
if (fstat(fd, &buf)) {
return false;
}
return S_ISBLK(buf.st_mode);
}

Просмотреть файл

@ -1,6 +1,7 @@
/*
* Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2017 Mellanox Technologies. All rights reserved.
*
* $COPYRIGHT$
*
@ -63,6 +64,37 @@ OPAL_DECLSPEC int opal_fd_write(int fd, int len, const void *buffer);
*/
OPAL_DECLSPEC int opal_fd_set_cloexec(int fd);
/**
* Convenience function to check if fd point to an accessible regular file.
*
* @param fd File descriptor
*
* @returns true if "fd" points to a regular file.
* @returns false otherwise.
*/
OPAL_DECLSPEC bool opal_fd_is_regular(int fd);
/**
* Convenience function to check if fd point to an accessible character device.
*
* @param fd File descriptor
*
* @returns true if "fd" points to a regular file.
* @returns false otherwise.
*/
OPAL_DECLSPEC bool opal_fd_is_chardev(int fd);
/**
* Convenience function to check if fd point to an accessible block device.
*
* @param fd File descriptor
*
* @returns true if "fd" points to a regular file.
* @returns false otherwise.
*/
OPAL_DECLSPEC bool opal_fd_is_blkdev(int fd);
END_C_DECLS
#endif

Просмотреть файл

@ -14,6 +14,7 @@
* All rights reserved.
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2017 IBM Corporation. All rights reserved.
* Copyright (c) 2017 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -48,6 +49,7 @@
#include "opal/class/opal_bitmap.h"
#include "orte/mca/mca.h"
#include "opal/mca/event/event.h"
#include "opal/util/fd.h"
#include "orte/mca/iof/iof.h"
#include "orte/runtime/orte_globals.h"
@ -84,6 +86,7 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_iof_job_t);
typedef struct {
opal_list_item_t super;
bool pending;
bool always_writable;
opal_event_t *ev;
int fd;
opal_list_t outputs;
@ -157,6 +160,9 @@ typedef struct orte_iof_base_t orte_iof_base_t;
ep->tag = (tg); \
if (0 <= (fid)) { \
ep->wev->fd = (fid); \
ep->wev->always_writable = opal_fd_is_regular(fid) || \
opal_fd_is_chardev(fid) || \
opal_fd_is_blkdev(fid); \
opal_event_set(orte_event_base, \
ep->wev->ev, ep->wev->fd, \
OPAL_EV_WRITE, \

Просмотреть файл

@ -15,6 +15,7 @@
* Copyright (c) 2015-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2017 IBM Corporation. All rights reserved.
* Copyright (c) 2017 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -298,6 +299,7 @@ OBJ_CLASS_INSTANCE(orte_iof_read_event_t,
static void orte_iof_base_write_event_construct(orte_iof_write_event_t* wev)
{
wev->pending = false;
wev->always_writable = false;
wev->fd = -1;
OBJ_CONSTRUCT(&wev->outputs, opal_list_t);
wev->ev = opal_event_alloc();

Просмотреть файл

@ -11,6 +11,7 @@
* All rights reserved.
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* Copyright (c) 2017 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -259,13 +260,22 @@ int orte_iof_base_write_output(const orte_process_name_t *name, orte_iof_tag_t s
/* is the write event issued? */
if (!channel->pending) {
int rc = -1;
/* issue it */
OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
"%s write:output adding write event",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
channel->pending = true;
ORTE_POST_OBJECT(channel);
opal_event_add(channel->ev, 0);
if (channel->always_writable) {
/* Regular is always write ready. Activate the handler. */
opal_event_active (channel->ev, OPAL_EV_WRITE, 1);
} else {
rc = opal_event_add(channel->ev, 0);
if (rc) {
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
}
}
}
return num_buffered;
@ -297,13 +307,14 @@ void orte_iof_base_static_dump_output(orte_iof_read_event_t *rev)
}
}
#define ORTE_IOF_REGULARF_BLOCK (1024)
void orte_iof_base_write_handler(int fd, short event, void *cbdata)
{
orte_iof_sink_t *sink = (orte_iof_sink_t*)cbdata;
orte_iof_write_event_t *wev = sink->wev;
opal_list_item_t *item;
orte_iof_write_output_t *output;
int num_written;
int num_written, total_written = 0;
ORTE_ACQUIRE_OBJECT(sink);
@ -333,6 +344,10 @@ void orte_iof_base_write_handler(int fd, short event, void *cbdata)
/* leave the write event running so it will call us again
* when the fd is ready.
*/
if(wev->always_writable){
/* Schedule another event */
opal_event_active (wev->ev, OPAL_EV_WRITE, 1);
}
return;
}
/* otherwise, something bad happened so all we can do is abort
@ -356,12 +371,29 @@ void orte_iof_base_write_handler(int fd, short event, void *cbdata)
/* leave the write event running so it will call us again
* when the fd is ready
*/
if(wev->always_writable){
/* Schedule another event */
opal_event_active (wev->ev, OPAL_EV_WRITE, 1);
}
return;
}
OBJ_RELEASE(output);
total_written += num_written;
if(wev->always_writable && (ORTE_IOF_REGULARF_BLOCK <= total_written)){
/* If this is a regular file it will never tell us it will block
* Write no more than ORTE_IOF_REGULARF_BLOCK at a time allowing
* other fds to progress
*/
opal_event_active (wev->ev, OPAL_EV_WRITE, 1);
return;
}
}
ABORT:
if (!wev->always_writable){
opal_event_del(wev->ev);
}
wev->pending = false;
ORTE_POST_OBJECT(wev);
}