/*
 * Copyright 2000-2003 Niels Provos
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "opal_config.h"

/*
 * NOTE(review): the targets of the system #include lines below were missing
 * from the text this file was recovered from. They have been reconstructed
 * from the HAVE_* guards and from the symbols this file uses; confirm against
 * the project's pristine copy (in particular the #else fallback header).
 */
#ifdef HAVE_STDINT_H
#include <stdint.h>
#endif
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_SYS_RESOURCE_H
#include <sys/resource.h>
#endif
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#else
#include <sys/_time.h>
#endif
#include <sys/queue.h>
#include <sys/epoll.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <errno.h>
#include <limits.h>
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif

#include "event.h"
#include "event-internal.h"
#include "evsignal.h"
#include "log.h"

#include "opal/threads/mutex.h"

extern volatile sig_atomic_t opal_evsignal_caught;
extern opal_mutex_t opal_event_lock;

/* Due to limitations in the epoll interface, we need to keep track of
 * all file descriptors ourselves.
 */
struct evepoll {
	struct event *evread;	/* event registered for reading on this fd, or NULL */
	struct event *evwrite;	/* event registered for writing on this fd, or NULL */
};

/* Per-event_base state of the epoll backend. */
struct epollop {
	struct evepoll *fds;		/* table indexed by file descriptor */
	int nfds;			/* number of entries in fds */
	struct epoll_event *events;	/* result buffer handed to epoll_wait() */
	int nevents;			/* capacity of events */
	int epfd;			/* descriptor returned by epoll_create() */
};

static void *epoll_init	(struct event_base *);
static int epoll_add	(void *, struct event *);
static int epoll_del	(void *, struct event *);
static int epoll_dispatch	(struct event_base *, void *, struct timeval *);
static void epoll_dealloc	(struct event_base *, void *);

/* Backend descriptor: name followed by the entry points defined below. */
const struct eventop epollops = {
	"epoll",
	epoll_init,
	epoll_add,
	epoll_del,
	epoll_dispatch,
	epoll_dealloc,
	1 /* need reinit */
};

/* Mark a descriptor close-on-exec; only warns if fcntl() fails. */
#ifdef HAVE_SETFD
#define FD_CLOSEONEXEC(x) do { \
        if (fcntl((x), F_SETFD, FD_CLOEXEC) == -1) \
                event_warn("fcntl(%d, F_SETFD)", (x)); \
} while (0)
#else
#define FD_CLOSEONEXEC(x)
#endif

/* On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout
 * values bigger than (LONG_MAX - 999ULL)/HZ.  HZ in the wild can be
 * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the
 * largest number of msec we can support here is 2147482.  Let's
 * round that down by 47 seconds.
 */
#define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000)

#define INITIAL_NFILES 32
#define INITIAL_NEVENTS 32
#define MAX_NEVENTS 4096

/*
 * Create the epoll backend state for an event base.
 *
 * Returns a newly allocated struct epollop, or NULL when the EVENT_NOEPOLL
 * environment variable is set, when epoll_create() fails, or when memory
 * cannot be allocated.  The epoll descriptor is closed again on every
 * failure path after it has been created.
 */
static void *
epoll_init(struct event_base *base)
{
	int epfd;
	struct epollop *epollop;

	/* Disable epoll when this environment variable is set */
	if (evutil_getenv("EVENT_NOEPOLL"))
		return (NULL);

	/* Initialize the kernel queue */
	if ((epfd = epoll_create(32000)) == -1) {
		/* ENOSYS is not reported via event_warn */
		if (errno != ENOSYS)
			event_warn("epoll_create");
		return (NULL);
	}

	FD_CLOSEONEXEC(epfd);

	epollop = calloc(1, sizeof(*epollop));
	if (epollop == NULL)
		goto fail_close;

	epollop->epfd = epfd;

	/* Initialize fields */
	epollop->events = malloc(INITIAL_NEVENTS * sizeof(*epollop->events));
	if (epollop->events == NULL)
		goto fail_free_op;
	epollop->nevents = INITIAL_NEVENTS;

	epollop->fds = calloc(INITIAL_NFILES, sizeof(*epollop->fds));
	if (epollop->fds == NULL)
		goto fail_free_events;
	epollop->nfds = INITIAL_NFILES;

#if OPAL_EVENT_USE_SIGNALS
	evsignal_init(base);
#endif

	return (epollop);

fail_free_events:
	free(epollop->events);
fail_free_op:
	free(epollop);
fail_close:
	/* Do not leak the epoll descriptor when setup fails. */
	close(epfd);
	return (NULL);
}

/*
 * Make sure the fd table can be indexed with `max`.
 *
 * The table is doubled until it holds more than `max` entries; newly added
 * entries are zeroed.  Returns 0 on success and -1 if the table cannot be
 * grown (the existing table is left untouched in that case).
 */
static int
epoll_recalc(struct event_base *base, void *arg, int max)
{
	struct epollop *epollop = arg;

	(void)base;

	/* `max` is used as an index, so the table needs at least max + 1 slots. */
	if (max >= epollop->nfds) {
		struct evepoll *fds;
		int nfds;

		nfds = epollop->nfds;
		while (nfds <= max) {
			/* Doubling again would overflow int. */
			if (nfds > INT_MAX / 2)
				return (-1);
			nfds <<= 1;
		}

		fds = realloc(epollop->fds, nfds * sizeof(struct evepoll));
		if (fds == NULL) {
			event_warn("realloc");
			return (-1);
		}
		epollop->fds = fds;
		memset(fds + epollop->nfds, 0,
		    (nfds - epollop->nfds) * sizeof(struct evepoll));
		epollop->nfds = nfds;
	}

	return (0);
}

/*
 * Wait for events and activate the matching registered events.
 *
 * `tv` is the maximum time to wait; NULL means wait without a timeout.  The
 * timeout is rounded up to whole milliseconds and capped at
 * MAX_EPOLL_TIMEOUT_MSEC.  opal_event_lock is released around epoll_wait().
 *
 * Returns 0 on success or when epoll_wait() was interrupted (EINTR), and -1
 * on any other epoll_wait() failure.
 */
static int
epoll_dispatch(struct event_base *base, void *arg, struct timeval *tv)
{
	struct epollop *epollop = arg;
	struct epoll_event *events = epollop->events;
	struct evepoll *evep;
	int i, res, timeout = -1;

	if (tv != NULL) {
		/*
		 * Clamp on the seconds first so that the multiplication
		 * below can neither overflow nor wrap to a negative value
		 * when narrowed to int.
		 */
		if (tv->tv_sec > MAX_EPOLL_TIMEOUT_MSEC / 1000)
			timeout = MAX_EPOLL_TIMEOUT_MSEC;
		else
			timeout = (int)(tv->tv_sec * 1000 +
			    (tv->tv_usec + 999) / 1000);
	}

	if (timeout > MAX_EPOLL_TIMEOUT_MSEC) {
		/* Linux kernels can wait forever if the timeout is too big;
		 * see comment on MAX_EPOLL_TIMEOUT_MSEC.
		 */
		timeout = MAX_EPOLL_TIMEOUT_MSEC;
	}

	/* we should release the lock if we're going to enter the
	   kernel in a multi-threaded application.  However, if we're
	   single threaded, there's really no advantage to releasing
	   the lock and it just takes up time we could spend doing
	   something else. */
	OPAL_THREAD_UNLOCK(&opal_event_lock);
	res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout);
	OPAL_THREAD_LOCK(&opal_event_lock);

	if (res == -1) {
		if (errno != EINTR) {
			event_warn("epoll_wait");
			return (-1);
		}

#if OPAL_EVENT_USE_SIGNALS
		evsignal_process(base);
#endif
		return (0);
	} else if (base->sig.evsignal_caught) {
#if OPAL_EVENT_USE_SIGNALS
		evsignal_process(base);
#endif
	}

	event_debug(("%s: epoll_wait reports %d", __func__, res));

	for (i = 0; i < res; i++) {
		int what = events[i].events;
		struct event *evread = NULL, *evwrite = NULL;
		int fd = events[i].data.fd;

		/* Ignore descriptors that are outside the tracking table. */
		if (fd < 0 || fd >= epollop->nfds)
			continue;
		evep = &epollop->fds[fd];

		if (what & (EPOLLHUP|EPOLLERR)) {
			/* Hang-up or error: wake both directions. */
			evread = evep->evread;
			evwrite = evep->evwrite;
		} else {
			if (what & EPOLLIN) {
				evread = evep->evread;
			}

			if (what & EPOLLOUT) {
				evwrite = evep->evwrite;
			}
		}

		if (!(evread||evwrite))
			continue;

		if (evread != NULL)
			event_active(evread, EV_READ, 1);
		if (evwrite != NULL)
			event_active(evwrite, EV_WRITE, 1);
	}

	if (res == epollop->nevents && epollop->nevents < MAX_NEVENTS) {
		/* We used all of the event space this time.  We should
		   be ready for more events next time. */
		int new_nevents = epollop->nevents * 2;
		struct epoll_event *new_events;

		new_events = realloc(epollop->events,
		    new_nevents * sizeof(struct epoll_event));
		/* On failure keep using the old, still valid buffer. */
		if (new_events) {
			epollop->events = new_events;
			epollop->nevents = new_nevents;
		}
	}

	return (0);
}

/*
 * Register `ev` with the epoll set.
 *
 * Signal events are handed to evsignal_add() when signal support is compiled
 * in.  For descriptor events the fd table is grown as needed and the
 * descriptor is added to, or modified in, the epoll set so that it covers
 * both the already registered and the newly requested directions.
 *
 * Returns 0 on success, -1 on failure (negative descriptor, table growth
 * failure or epoll_ctl() failure).
 */
static int
epoll_add(void *arg, struct event *ev)
{
	struct epollop *epollop = arg;
	struct epoll_event epev = {0, {0}};
	struct evepoll *evep;
	int fd, op, events;

#if OPAL_EVENT_USE_SIGNALS
	if (ev->ev_events & EV_SIGNAL)
		return (evsignal_add(ev));
#endif

	fd = ev->ev_fd;
	if (fd < 0) {
		/* A negative descriptor must never be used as a table index. */
		errno = EBADF;
		return (-1);
	}
	if (fd >= epollop->nfds) {
		/* Extend the file descriptor array as necessary */
		if (epoll_recalc(ev->ev_base, epollop, fd) == -1)
			return (-1);
	}
	evep = &epollop->fds[fd];
	op = EPOLL_CTL_ADD;
	events = 0;
	/* Something is already registered on this fd: modify instead of add. */
	if (evep->evread != NULL) {
		events |= EPOLLIN;
		op = EPOLL_CTL_MOD;
	}
	if (evep->evwrite != NULL) {
		events |= EPOLLOUT;
		op = EPOLL_CTL_MOD;
	}

	if (ev->ev_events & EV_READ)
		events |= EPOLLIN;
	if (ev->ev_events & EV_WRITE)
		events |= EPOLLOUT;

	epev.data.fd = fd;
	epev.events = events;
	if (epoll_ctl(epollop->epfd, op, ev->ev_fd, &epev) == -1)
		return (-1);

	/* Update events responsible */
	if (ev->ev_events & EV_READ)
		evep->evread = ev;
	if (ev->ev_events & EV_WRITE)
		evep->evwrite = ev;

	return (0);
}

/*
 * Remove `ev` from the epoll set.
 *
 * Signal events are handed to evsignal_del() when signal support is compiled
 * in.  If only one direction is being removed and the other direction still
 * has an event registered, the descriptor is modified to keep the remaining
 * direction; otherwise it is deleted from the epoll set.  The table entries
 * are cleared before epoll_ctl() is called.
 *
 * Returns 0 on success or when the descriptor is outside the tracking table,
 * and -1 if epoll_ctl() fails.
 */
static int
epoll_del(void *arg, struct event *ev)
{
	struct epollop *epollop = arg;
	struct epoll_event epev = {0, {0}};
	struct evepoll *evep;
	int fd, events, op;
	int needwritedelete = 1, needreaddelete = 1;

#if OPAL_EVENT_USE_SIGNALS
	if (ev->ev_events & EV_SIGNAL)
		return (evsignal_del(ev));
#endif

	fd = ev->ev_fd;
	/* Nothing can be tracked for a descriptor outside the table. */
	if (fd < 0 || fd >= epollop->nfds)
		return (0);
	evep = &epollop->fds[fd];

	op = EPOLL_CTL_DEL;
	events = 0;

	if (ev->ev_events & EV_READ)
		events |= EPOLLIN;
	if (ev->ev_events & EV_WRITE)
		events |= EPOLLOUT;

	if ((events & (EPOLLIN|EPOLLOUT)) != (EPOLLIN|EPOLLOUT)) {
		if ((events & EPOLLIN) && evep->evwrite != NULL) {
			/* Dropping read only; keep watching for write. */
			needwritedelete = 0;
			events = EPOLLOUT;
			op = EPOLL_CTL_MOD;
		} else if ((events & EPOLLOUT) && evep->evread != NULL) {
			/* Dropping write only; keep watching for read. */
			needreaddelete = 0;
			events = EPOLLIN;
			op = EPOLL_CTL_MOD;
		}
	}

	epev.events = events;
	epev.data.fd = fd;

	if (needreaddelete)
		evep->evread = NULL;
	if (needwritedelete)
		evep->evwrite = NULL;

	if (epoll_ctl(epollop->epfd, op, fd, &epev) == -1)
		return (-1);

	return (0);
}

/*
 * Release everything owned by the backend state: the fd table, the event
 * buffer, the epoll descriptor and the state structure itself.  Signal
 * handling state is torn down only when signal support is compiled in,
 * mirroring the conditional evsignal_init() call in epoll_init().
 */
static void
epoll_dealloc(struct event_base *base, void *arg)
{
	struct epollop *epollop = arg;

#if OPAL_EVENT_USE_SIGNALS
	evsignal_dealloc(base);
#else
	(void)base;
#endif
	/* free(NULL) is a no-op, so no guards are needed. */
	free(epollop->fds);
	free(epollop->events);
	if (epollop->epfd >= 0)
		close(epollop->epfd);

	memset(epollop, 0, sizeof(struct epollop));
	free(epollop);
}