2 * Copyright 2000-2007 Niels Provos <provos@citi.umich.edu>
3 * Copyright 2007-2010 Niels Provos, Nick Mathewson
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. The name of the author may not be used to endorse or promote products
14 * derived from this software without specific prior written permission.
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 #include "event2/event-config.h"
30 #include <sys/types.h>
31 #include <sys/resource.h>
32 #ifdef _EVENT_HAVE_SYS_TIME_H
35 #include <sys/queue.h>
36 #include <sys/epoll.h>
44 #ifdef _EVENT_HAVE_FCNTL_H
48 #include "event-internal.h"
49 #include "evsignal-internal.h"
50 #include "event2/thread.h"
51 #include "evthread-internal.h"
52 #include "log-internal.h"
53 #include "evmap-internal.h"
54 #include "changelist-internal.h"
57 struct epoll_event *events;
/* Forward declarations for the backend entry points referenced by the
 * eventop tables below. */
62 static void *epoll_init(struct event_base *);
63 static int epoll_dispatch(struct event_base *, struct timeval *);
64 static void epoll_dealloc(struct event_base *);
/* Backend descriptor used when fd changes are queued in the base's
 * changelist and applied in a batch (see epoll_apply_changes) rather
 * than issuing one epoll_ctl() per add/del.
 * NOTE(review): several initializer fields (init/add/del/dispatch/
 * dealloc function pointers) are not visible in this view — confirm
 * against the full file. */
66 static const struct eventop epollops_changelist = {
67 "epoll (with changelist)",
74 EV_FEATURE_ET|EV_FEATURE_O1,
75 EVENT_CHANGELIST_FDINFO_SIZE
/* Non-changelist add/del: each call builds a single synthetic
 * event_change and hands it straight to epoll_apply_one_change(). */
79 static int epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
80 short old, short events, void *p);
81 static int epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
82 short old, short events, void *p);
/* Default (non-changelist) epoll backend descriptor: every add/del is
 * applied immediately with epoll_ctl().
 * NOTE(review): some initializer fields are not visible in this view. */
84 const struct eventop epollops = {
87 epoll_nochangelist_add,
88 epoll_nochangelist_del,
92 EV_FEATURE_ET|EV_FEATURE_O1,
/* Initial capacity of the epoll_event result array; epoll_dispatch
 * doubles it (capped at MAX_NEVENT) whenever a wait fills it. */
96 #define INITIAL_NEVENT 32
97 #define MAX_NEVENT 4096
99 /* On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout
100 * values bigger than (LONG_MAX - 999ULL)/HZ. HZ in the wild can be
101 * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the
102 * largest number of msec we can support here is 2147482. Let's
103 * round that down by 47 seconds.
105 #define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000)
/* Backend constructor: create the epoll instance and allocate the
 * per-base epollop state (fd + result array).  On success the returned
 * epollop becomes base->evbase; NULL signals failure.
 * NOTE(review): the error-path lines (returns / cleanup after the
 * failed checks) are not visible in this view. */
108 epoll_init(struct event_base *base)
111 struct epollop *epollop;
113 /* Initialize the kernel queue. (The size field is ignored since
	/* 32000 is only a hint; modern kernels ignore epoll_create's size. */
115 if ((epfd = epoll_create(32000)) == -1) {
117 event_warn("epoll_create");
	/* Don't leak the epoll fd into exec'd children. */
121 evutil_make_socket_closeonexec(epfd);
123 if (!(epollop = mm_calloc(1, sizeof(struct epollop))))
126 epollop->epfd = epfd;
128 /* Initialize fields */
129 epollop->events = mm_calloc(INITIAL_NEVENT, sizeof(struct epoll_event));
130 if (epollop->events == NULL) {
134 epollop->nevents = INITIAL_NEVENT;
	/* Opt into the changelist variant either via the base flag or the
	 * EVENT_EPOLL_USE_CHANGELIST environment variable (unless the base
	 * was told to ignore the environment). */
136 if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 ||
137 ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 &&
138 evutil_getenv("EVENT_EPOLL_USE_CHANGELIST") != NULL))
139 base->evsel = &epollops_changelist;
/* Map an event_change mask to a human-readable label for log/debug
 * messages.  Only the ADD/DEL bits are considered; other bits (e.g.
 * EV_ET) are masked off first.
 * NOTE(review): the returned string literals for each branch are not
 * visible in this view. */
147 change_to_string(int change)
149 change &= (EV_CHANGE_ADD|EV_CHANGE_DEL);
150 if (change == EV_CHANGE_ADD) {
152 } else if (change == EV_CHANGE_DEL) {
154 } else if (change == 0) {
/* Map an EPOLL_CTL_* opcode to its name for diagnostics.
 * NOTE(review): the fallback arm of the conditional chain (for an
 * unknown opcode) is not visible in this view. */
162 epoll_op_to_string(int op)
164 return op == EPOLL_CTL_ADD?"ADD":
165 op == EPOLL_CTL_DEL?"DEL":
166 op == EPOLL_CTL_MOD?"MOD":
/* Translate one queued event_change (fd + read/write add/del bits +
 * the events previously registered on the fd) into a single
 * epoll_ctl() call, choosing between ADD, MOD, and DEL and retrying
 * across the known kernel quirks described inline.
 * NOTE(review): many body lines (op/events assignments, returns,
 * closing braces) are missing from this view — treat the visible code
 * as an outline, not the full function. */
171 epoll_apply_one_change(struct event_base *base,
172 struct epollop *epollop,
173 const struct event_change *ch)
175 struct epoll_event epev;
179 /* The logic here is a little tricky. If we had no events set
180 on the fd before, we need to set op="ADD" and set
181 events=the events we want to add. If we had any events set
182 on the fd before, and we want any events to remain on the
183 fd, we need to say op="MOD" and set events=the events we
184 want to remain. But if we want to delete the last event,
185 we say op="DEL" and set events=the remaining events. What
189 /* TODO: Turn this into a switch or a table lookup. */
	/* Case 1: at least one ADD requested -> op will be ADD or MOD. */
191 if ((ch->read_change & EV_CHANGE_ADD) ||
192 (ch->write_change & EV_CHANGE_ADD)) {
193 /* If we are adding anything at all, we'll want to do
194 * either an ADD or a MOD. */
	/* Fold the read side into the epoll event mask: an explicit ADD
	 * or DEL wins; otherwise keep EPOLLIN if it was set before.
	 * NOTE(review): the `events |= ...` lines themselves are not
	 * visible here. */
197 if (ch->read_change & EV_CHANGE_ADD) {
199 } else if (ch->read_change & EV_CHANGE_DEL) {
201 } else if (ch->old_events & EV_READ) {
	/* Same folding for the write side / EPOLLOUT. */
204 if (ch->write_change & EV_CHANGE_ADD) {
206 } else if (ch->write_change & EV_CHANGE_DEL) {
208 } else if (ch->old_events & EV_WRITE) {
	/* Edge-triggered was requested on either side. */
211 if ((ch->read_change|ch->write_change) & EV_ET)
214 if (ch->old_events) {
215 /* If MOD fails, we retry as an ADD, and if
216 * ADD fails we will retry as a MOD. So the
217 * only hard part here is to guess which one
218 * will work. As a heuristic, we'll try
219 * MOD first if we think there were old
220 * events and ADD if we think there were none.
222 * We can be wrong about the MOD if the file
223 * has in fact been closed and re-opened.
225 * We can be wrong about the ADD if the
226 * the fd has been re-created with a dup()
227 * of the same file that it was before.
	/* Case 2: only deletions requested -> op will be MOD (if some
	 * events remain) or DEL (if none do). */
231 } else if ((ch->read_change & EV_CHANGE_DEL) ||
232 (ch->write_change & EV_CHANGE_DEL)) {
233 /* If we're deleting anything, we'll want to do a MOD
237 if (ch->read_change & EV_CHANGE_DEL) {
238 if (ch->write_change & EV_CHANGE_DEL) {
239 events = EPOLLIN|EPOLLOUT;
240 } else if (ch->old_events & EV_WRITE) {
246 } else if (ch->write_change & EV_CHANGE_DEL) {
247 if (ch->old_events & EV_READ) {
	/* Issue the chosen operation and handle the kernel quirks. */
259 memset(&epev, 0, sizeof(epev));
260 epev.data.fd = ch->fd;
261 epev.events = events;
262 if (epoll_ctl(epollop->epfd, op, ch->fd, &epev) == -1) {
263 if (op == EPOLL_CTL_MOD && errno == ENOENT) {
264 /* If a MOD operation fails with ENOENT, the
265 * fd was probably closed and re-opened. We
266 * should retry the operation as an ADD.
268 if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, ch->fd, &epev) == -1) {
269 event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too",
270 (int)epev.events, ch->fd);
273 event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.",
277 } else if (op == EPOLL_CTL_ADD && errno == EEXIST) {
278 /* If an ADD operation fails with EEXIST,
279 * either the operation was redundant (as with a
280 * precautionary add), or we ran into a fun
281 * kernel bug where using dup*() to duplicate the
282 * same file into the same fd gives you the same epitem
283 * rather than a fresh one. For the second case,
284 * we must retry with MOD. */
285 if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) {
286 event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too",
287 (int)epev.events, ch->fd);
290 event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.",
294 } else if (op == EPOLL_CTL_DEL &&
295 (errno == ENOENT || errno == EBADF ||
297 /* If a delete fails with one of these errors,
298 * that's fine too: we closed the fd before we
299 * got around to calling epoll_dispatch. */
300 event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.",
	/* Any other epoll_ctl failure is unexpected: log full context. */
305 event_warn("Epoll %s(%d) on fd %d failed. Old events were %d; read change was %d (%s); write change was %d (%s)",
306 epoll_op_to_string(op),
311 change_to_string(ch->read_change),
313 change_to_string(ch->write_change));
317 event_debug(("Epoll %s(%d) on fd %d okay. [old events were %d; read change was %d; write change was %d]",
318 epoll_op_to_string(op),
/* Apply every queued change in the base's changelist, one
 * epoll_apply_one_change() per entry.
 * NOTE(review): the return-value handling (what happens when a single
 * change fails) and the function's return statement are not visible in
 * this view. */
330 epoll_apply_changes(struct event_base *base)
332 struct event_changelist *changelist = &base->changelist;
333 struct epollop *epollop = base->evbase;
334 struct event_change *ch;
339 for (i = 0; i < changelist->n_changes; ++i) {
340 ch = &changelist->changes[i];
341 if (epoll_apply_one_change(base, epollop, ch) < 0)
/* Non-changelist add: build a one-off event_change describing the
 * requested EV_READ/EV_WRITE additions and apply it immediately.
 * NOTE(review): the `EV_CHANGE_ADD |` assignments continue onto lines
 * not visible here (presumably OR-ing in the EV_ET bit from `events`
 * — confirm against the full file). */
349 epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
350 short old, short events, void *p)
352 struct event_change ch;
355 ch.read_change = ch.write_change = 0;
356 if (events & EV_WRITE)
357 ch.write_change = EV_CHANGE_ADD |
359 if (events & EV_READ)
360 ch.read_change = EV_CHANGE_ADD |
363 return epoll_apply_one_change(base, base->evbase, &ch);
/* Non-changelist delete: build a one-off event_change marking the
 * requested EV_READ/EV_WRITE sides for deletion and apply it
 * immediately via epoll_apply_one_change(). */
367 epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
368 short old, short events, void *p)
370 struct event_change ch;
373 ch.read_change = ch.write_change = 0;
374 if (events & EV_WRITE)
375 ch.write_change = EV_CHANGE_DEL;
376 if (events & EV_READ)
377 ch.read_change = EV_CHANGE_DEL;
379 return epoll_apply_one_change(base, base->evbase, &ch);
/* One iteration of the event loop for the epoll backend: flush pending
 * changes, wait (with the base lock released) for kernel events, then
 * activate the corresponding libevent events.
 * NOTE(review): several lines (declarations of res/i/timeout, error
 * returns, the EPOLLIN/EPOLLOUT -> EV_READ/EV_WRITE branches, and the
 * realloc-failure handling) are not visible in this view. */
383 epoll_dispatch(struct event_base *base, struct timeval *tv)
385 struct epollop *epollop = base->evbase;
386 struct epoll_event *events = epollop->events;
	/* NULL tv maps to a negative msec value (wait forever), which is
	 * then clamped like an oversized timeout. */
391 timeout = evutil_tv_to_msec(tv);
392 if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) {
393 /* Linux kernels can wait forever if the timeout is
394 * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. */
395 timeout = MAX_EPOLL_TIMEOUT_MSEC;
399 epoll_apply_changes(base);
400 event_changelist_remove_all(&base->changelist, base);
	/* Drop the base lock while blocked in the kernel so other threads
	 * can manipulate the base. */
402 EVBASE_RELEASE_LOCK(base, th_base_lock);
404 res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout);
406 EVBASE_ACQUIRE_LOCK(base, th_base_lock);
	/* EINTR is a benign wakeup; anything else is a real error. */
409 if (errno != EINTR) {
410 event_warn("epoll_wait");
417 event_debug(("%s: epoll_wait reports %d", __func__, res));
418 EVUTIL_ASSERT(res <= epollop->nevents);
420 for (i = 0; i < res; i++) {
421 int what = events[i].events;
	/* Report hangup/error as both readable and writable so the
	 * callbacks get a chance to observe EOF/errors. */
424 if (what & (EPOLLHUP|EPOLLERR)) {
425 ev = EV_READ | EV_WRITE;
436 evmap_io_active(base, events[i].data.fd, ev | EV_ET);
	/* A completely full result array suggests more events were
	 * pending: grow the array (up to MAX_NEVENT) for next time. */
439 if (res == epollop->nevents && epollop->nevents < MAX_NEVENT) {
440 /* We used all of the event space this time. We should
441 be ready for more events next time. */
442 int new_nevents = epollop->nevents * 2;
443 struct epoll_event *new_events;
445 new_events = mm_realloc(epollop->events,
446 new_nevents * sizeof(struct epoll_event));
448 epollop->events = new_events;
449 epollop->nevents = new_nevents;
/* Backend destructor: release the event array, close the epoll fd, and
 * scrub the epollop state.
 * NOTE(review): the function continues past this view (presumably
 * freeing epollop itself) — confirm against the full file. */
458 epoll_dealloc(struct event_base *base)
460 struct epollop *epollop = base->evbase;
464 mm_free(epollop->events);
465 if (epollop->epfd >= 0)
466 close(epollop->epfd);
	/* Poison the struct so stale pointers fail fast. */
468 memset(epollop, 0, sizeof(struct epollop));