From d9ad918a14f64434936fdfeb4dd266f792ce7170 Mon Sep 17 00:00:00 2001 From: Artem Polyakov Date: Thu, 29 Jun 2017 06:06:33 +0700 Subject: [PATCH] orte/iof: Address the case when output is a regular file Regular files are always write-ready, so non-blocking I/O does not give any benefits for them. More than that - if libevent is using "epoll" to track fd events, epoll_ctl will refuse attempt to add an fd pointing to a regular file descriptor with EPERM. This fix checks the object referenced by fd and avoids event_add using event_active instead. In the original configuration that uncovered this issue "epoll" was used in libevent, it was triggering the following warning message: "[warn] Epoll ADD(1) on fd 0 failed. Old events were 0; read change was 1 (add); write change was 0 (none): Operation not permitted" And the side effect was accumulation of all output in mpirun memory and actually writing it only at mpirun exit. Signed-off-by: Artem Polyakov --- opal/util/fd.c | 37 ++++++++++++++++++++++++++++ opal/util/fd.h | 32 ++++++++++++++++++++++++ orte/mca/iof/base/base.h | 6 +++++ orte/mca/iof/base/iof_base_frame.c | 2 ++ orte/mca/iof/base/iof_base_output.c | 38 ++++++++++++++++++++++++++--- 5 files changed, 112 insertions(+), 3 deletions(-) diff --git a/opal/util/fd.c b/opal/util/fd.c index 63558107a2..a74b92bc2d 100644 --- a/opal/util/fd.c +++ b/opal/util/fd.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2009 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. * * $COPYRIGHT$ * @@ -11,6 +12,14 @@ #include "opal_config.h" +#ifdef HAVE_SYS_TYPES_H +#include +#endif +#ifdef HAVE_SYS_STAT_H +#include +#endif + + #ifdef HAVE_UNISTD_H #include #endif @@ -89,3 +98,31 @@ int opal_fd_set_cloexec(int fd) return OPAL_SUCCESS; } + +bool opal_fd_is_regular(int fd) +{ + struct stat buf; + if (fstat(fd, &buf)) { + return false; + } + return S_ISREG(buf.st_mode); +} + +bool opal_fd_is_chardev(int fd) +{ + struct stat buf; + if (fstat(fd, &buf)) { + return false; + } + return S_ISCHR(buf.st_mode); +} + +bool opal_fd_is_blkdev(int fd) +{ + struct stat buf; + if (fstat(fd, &buf)) { + return false; + } + return S_ISBLK(buf.st_mode); +} + diff --git a/opal/util/fd.h b/opal/util/fd.h index d32c3a9810..ea8a7a1a0e 100644 --- a/opal/util/fd.h +++ b/opal/util/fd.h @@ -1,6 +1,7 @@ /* * Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2009 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. * * $COPYRIGHT$ * @@ -63,6 +64,37 @@ OPAL_DECLSPEC int opal_fd_write(int fd, int len, const void *buffer); */ OPAL_DECLSPEC int opal_fd_set_cloexec(int fd); +/** + * Convenience function to check if fd point to an accessible regular file. + * + * @param fd File descriptor + * + * @returns true if "fd" points to a regular file. + * @returns false otherwise. + */ +OPAL_DECLSPEC bool opal_fd_is_regular(int fd); + +/** + * Convenience function to check if fd point to an accessible character device. + * + * @param fd File descriptor + * + * @returns true if "fd" points to a regular file. + * @returns false otherwise. + */ +OPAL_DECLSPEC bool opal_fd_is_chardev(int fd); + +/** + * Convenience function to check if fd point to an accessible block device. + * + * @param fd File descriptor + * + * @returns true if "fd" points to a regular file. + * @returns false otherwise. + */ +OPAL_DECLSPEC bool opal_fd_is_blkdev(int fd); + + END_C_DECLS #endif diff --git a/orte/mca/iof/base/base.h b/orte/mca/iof/base/base.h index a67043ff53..a053c7c06a 100644 --- a/orte/mca/iof/base/base.h +++ b/orte/mca/iof/base/base.h @@ -14,6 +14,7 @@ * All rights reserved. * Copyright (c) 2015-2017 Intel, Inc. All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -48,6 +49,7 @@ #include "opal/class/opal_bitmap.h" #include "orte/mca/mca.h" #include "opal/mca/event/event.h" +#include "opal/util/fd.h" #include "orte/mca/iof/iof.h" #include "orte/runtime/orte_globals.h" @@ -84,6 +86,7 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_iof_job_t); typedef struct { opal_list_item_t super; bool pending; + bool always_writable; opal_event_t *ev; int fd; opal_list_t outputs; @@ -157,6 +160,9 @@ typedef struct orte_iof_base_t orte_iof_base_t; ep->tag = (tg); \ if (0 <= (fid)) { \ ep->wev->fd = (fid); \ + ep->wev->always_writable = opal_fd_is_regular(fid) || \ + opal_fd_is_chardev(fid) || \ + opal_fd_is_blkdev(fid); \ opal_event_set(orte_event_base, \ ep->wev->ev, ep->wev->fd, \ OPAL_EV_WRITE, \ diff --git a/orte/mca/iof/base/iof_base_frame.c b/orte/mca/iof/base/iof_base_frame.c index 0f8af204a1..249bb86951 100644 --- a/orte/mca/iof/base/iof_base_frame.c +++ b/orte/mca/iof/base/iof_base_frame.c @@ -15,6 +15,7 @@ * Copyright (c) 2015-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -298,6 +299,7 @@ OBJ_CLASS_INSTANCE(orte_iof_read_event_t, static void orte_iof_base_write_event_construct(orte_iof_write_event_t* wev) { wev->pending = false; + wev->always_writable = false; wev->fd = -1; OBJ_CONSTRUCT(&wev->outputs, opal_list_t); wev->ev = opal_event_alloc(); diff --git a/orte/mca/iof/base/iof_base_output.c b/orte/mca/iof/base/iof_base_output.c index 844a3fc6fc..61acda91f2 100644 --- a/orte/mca/iof/base/iof_base_output.c +++ b/orte/mca/iof/base/iof_base_output.c @@ -11,6 +11,7 @@ * All rights reserved. * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2017 Intel, Inc. All rights reserved. + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -259,13 +260,22 @@ int orte_iof_base_write_output(const orte_process_name_t *name, orte_iof_tag_t s /* is the write event issued? */ if (!channel->pending) { + int rc = -1; /* issue it */ OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output, "%s write:output adding write event", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); channel->pending = true; ORTE_POST_OBJECT(channel); - opal_event_add(channel->ev, 0); + if (channel->always_writable) { + /* Regular is always write ready. Activate the handler. */ + opal_event_active (channel->ev, OPAL_EV_WRITE, 1); + } else { + rc = opal_event_add(channel->ev, 0); + if (rc) { + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + } + } } return num_buffered; @@ -297,13 +307,14 @@ void orte_iof_base_static_dump_output(orte_iof_read_event_t *rev) } } +#define ORTE_IOF_REGULARF_BLOCK (1024) void orte_iof_base_write_handler(int fd, short event, void *cbdata) { orte_iof_sink_t *sink = (orte_iof_sink_t*)cbdata; orte_iof_write_event_t *wev = sink->wev; opal_list_item_t *item; orte_iof_write_output_t *output; - int num_written; + int num_written, total_written = 0; ORTE_ACQUIRE_OBJECT(sink); @@ -333,6 +344,10 @@ void orte_iof_base_write_handler(int fd, short event, void *cbdata) /* leave the write event running so it will call us again * when the fd is ready. */ + if(wev->always_writable){ + /* Schedule another event */ + opal_event_active (wev->ev, OPAL_EV_WRITE, 1); + } return; } /* otherwise, something bad happened so all we can do is abort @@ -356,12 +371,29 @@ void orte_iof_base_write_handler(int fd, short event, void *cbdata) /* leave the write event running so it will call us again * when the fd is ready */ + if(wev->always_writable){ + /* Schedule another event */ + opal_event_active (wev->ev, OPAL_EV_WRITE, 1); + + } return; } OBJ_RELEASE(output); + + total_written += num_written; + if(wev->always_writable && (ORTE_IOF_REGULARF_BLOCK <= total_written)){ + /* If this is a regular file it will never tell us it will block + * Write no more than ORTE_IOF_REGULARF_BLOCK at a time allowing + * other fds to progress + */ + opal_event_active (wev->ev, OPAL_EV_WRITE, 1); + return; + } } ABORT: - opal_event_del(wev->ev); + if (!wev->always_writable){ + opal_event_del(wev->ev); + } wev->pending = false; ORTE_POST_OBJECT(wev); }