arthur.barton.de Git - netatalk.git/blob - libevent/epoll.c
Merge master
[netatalk.git] / libevent / epoll.c
1 /*
2  * Copyright 2000-2007 Niels Provos <provos@citi.umich.edu>
3  * Copyright 2007-2010 Niels Provos, Nick Mathewson
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. The name of the author may not be used to endorse or promote products
14  *    derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 #include "event2/event-config.h"
28
29 #include <stdint.h>
30 #include <sys/types.h>
31 #include <sys/resource.h>
32 #ifdef _EVENT_HAVE_SYS_TIME_H
33 #include <sys/time.h>
34 #endif
35 #include <sys/queue.h>
36 #include <sys/epoll.h>
37 #include <signal.h>
38 #include <limits.h>
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <string.h>
42 #include <unistd.h>
43 #include <errno.h>
44 #ifdef _EVENT_HAVE_FCNTL_H
45 #include <fcntl.h>
46 #endif
47
48 #include "event-internal.h"
49 #include "evsignal-internal.h"
50 #include "event2/thread.h"
51 #include "evthread-internal.h"
52 #include "log-internal.h"
53 #include "evmap-internal.h"
54 #include "changelist-internal.h"
55
/* State kept by the epoll backend for one event_base (stored in
 * base->evbase; see epoll_init()). */
struct epollop {
        /* Buffer handed to epoll_wait() to receive ready events. */
        struct epoll_event *events;
        /* Number of entries that `events` can hold. */
        int nevents;
        /* Descriptor obtained from epoll_create(). */
        int epfd;
};
61
62 static void *epoll_init(struct event_base *);
63 static int epoll_dispatch(struct event_base *, struct timeval *);
64 static void epoll_dealloc(struct event_base *);
65
66 static const struct eventop epollops_changelist = {
67         "epoll (with changelist)",
68         epoll_init,
69         event_changelist_add,
70         event_changelist_del,
71         epoll_dispatch,
72         epoll_dealloc,
73         1, /* need reinit */
74         EV_FEATURE_ET|EV_FEATURE_O1,
75         EVENT_CHANGELIST_FDINFO_SIZE
76 };
77
78
79 static int epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
80     short old, short events, void *p);
81 static int epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
82     short old, short events, void *p);
83
84 const struct eventop epollops = {
85         "epoll",
86         epoll_init,
87         epoll_nochangelist_add,
88         epoll_nochangelist_del,
89         epoll_dispatch,
90         epoll_dealloc,
91         1, /* need reinit */
92         EV_FEATURE_ET|EV_FEATURE_O1,
93         0
94 };
95
/* Number of epoll_event slots allocated by epoll_init(). */
#define INITIAL_NEVENT 32
/* epoll_dispatch() stops doubling the event buffer once it has reached
 * this many slots. */
#define MAX_NEVENT 4096

/* On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout
 * values bigger than (LONG_MAX - 999ULL)/HZ.  HZ in the wild can be
 * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the
 * largest number of msec we can support here is 2147482.  Rounding
 * that down by 47 seconds gives 2100 seconds, i.e. 35 minutes.
 */
#define MAX_EPOLL_TIMEOUT_MSEC (35 * 60 * 1000)
106
107 static void *
108 epoll_init(struct event_base *base)
109 {
110         int epfd;
111         struct epollop *epollop;
112
113         /* Initialize the kernel queue.  (The size field is ignored since
114          * 2.6.8.) */
115         if ((epfd = epoll_create(32000)) == -1) {
116                 if (errno != ENOSYS)
117                         event_warn("epoll_create");
118                 return (NULL);
119         }
120
121         evutil_make_socket_closeonexec(epfd);
122
123         if (!(epollop = mm_calloc(1, sizeof(struct epollop))))
124                 return (NULL);
125
126         epollop->epfd = epfd;
127
128         /* Initialize fields */
129         epollop->events = mm_calloc(INITIAL_NEVENT, sizeof(struct epoll_event));
130         if (epollop->events == NULL) {
131                 mm_free(epollop);
132                 return (NULL);
133         }
134         epollop->nevents = INITIAL_NEVENT;
135
136         if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 ||
137             ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 &&
138                 evutil_getenv("EVENT_EPOLL_USE_CHANGELIST") != NULL))
139                 base->evsel = &epollops_changelist;
140
141         evsig_init(base);
142
143         return (epollop);
144 }
145
146 static const char *
147 change_to_string(int change)
148 {
149         change &= (EV_CHANGE_ADD|EV_CHANGE_DEL);
150         if (change == EV_CHANGE_ADD) {
151                 return "add";
152         } else if (change == EV_CHANGE_DEL) {
153                 return "del";
154         } else if (change == 0) {
155                 return "none";
156         } else {
157                 return "???";
158         }
159 }
160
/* Return a short name for an epoll_ctl() operation code, for log messages.
 * Unrecognized codes map to "???". */
static const char *
epoll_op_to_string(int op)
{
        switch (op) {
        case EPOLL_CTL_ADD:
                return "ADD";
        case EPOLL_CTL_DEL:
                return "DEL";
        case EPOLL_CTL_MOD:
                return "MOD";
        default:
                return "???";
        }
}
169
170 static int
171 epoll_apply_one_change(struct event_base *base,
172     struct epollop *epollop,
173     const struct event_change *ch)
174 {
175         struct epoll_event epev;
176         int op, events = 0;
177
178         if (1) {
179                 /* The logic here is a little tricky.  If we had no events set
180                    on the fd before, we need to set op="ADD" and set
181                    events=the events we want to add.  If we had any events set
182                    on the fd before, and we want any events to remain on the
183                    fd, we need to say op="MOD" and set events=the events we
184                    want to remain.  But if we want to delete the last event,
185                    we say op="DEL" and set events=the remaining events.  What
186                    fun!
187                 */
188
189                 /* TODO: Turn this into a switch or a table lookup. */
190
191                 if ((ch->read_change & EV_CHANGE_ADD) ||
192                     (ch->write_change & EV_CHANGE_ADD)) {
193                         /* If we are adding anything at all, we'll want to do
194                          * either an ADD or a MOD. */
195                         events = 0;
196                         op = EPOLL_CTL_ADD;
197                         if (ch->read_change & EV_CHANGE_ADD) {
198                                 events |= EPOLLIN;
199                         } else if (ch->read_change & EV_CHANGE_DEL) {
200                                 ;
201                         } else if (ch->old_events & EV_READ) {
202                                 events |= EPOLLIN;
203                         }
204                         if (ch->write_change & EV_CHANGE_ADD) {
205                                 events |= EPOLLOUT;
206                         } else if (ch->write_change & EV_CHANGE_DEL) {
207                                 ;
208                         } else if (ch->old_events & EV_WRITE) {
209                                 events |= EPOLLOUT;
210                         }
211                         if ((ch->read_change|ch->write_change) & EV_ET)
212                                 events |= EPOLLET;
213
214                         if (ch->old_events) {
215                                 /* If MOD fails, we retry as an ADD, and if
216                                  * ADD fails we will retry as a MOD.  So the
217                                  * only hard part here is to guess which one
218                                  * will work.  As a heuristic, we'll try
219                                  * MOD first if we think there were old
220                                  * events and ADD if we think there were none.
221                                  *
222                                  * We can be wrong about the MOD if the file
223                                  * has in fact been closed and re-opened.
224                                  *
225                                  * We can be wrong about the ADD if the
226                                  * the fd has been re-created with a dup()
227                                  * of the same file that it was before.
228                                  */
229                                 op = EPOLL_CTL_MOD;
230                         }
231                 } else if ((ch->read_change & EV_CHANGE_DEL) ||
232                     (ch->write_change & EV_CHANGE_DEL)) {
233                         /* If we're deleting anything, we'll want to do a MOD
234                          * or a DEL. */
235                         op = EPOLL_CTL_DEL;
236
237                         if (ch->read_change & EV_CHANGE_DEL) {
238                                 if (ch->write_change & EV_CHANGE_DEL) {
239                                         events = EPOLLIN|EPOLLOUT;
240                                 } else if (ch->old_events & EV_WRITE) {
241                                         events = EPOLLOUT;
242                                         op = EPOLL_CTL_MOD;
243                                 } else {
244                                         events = EPOLLIN;
245                                 }
246                         } else if (ch->write_change & EV_CHANGE_DEL) {
247                                 if (ch->old_events & EV_READ) {
248                                         events = EPOLLIN;
249                                         op = EPOLL_CTL_MOD;
250                                 } else {
251                                         events = EPOLLOUT;
252                                 }
253                         }
254                 }
255
256                 if (!events)
257                         return 0;
258
259                 memset(&epev, 0, sizeof(epev));
260                 epev.data.fd = ch->fd;
261                 epev.events = events;
262                 if (epoll_ctl(epollop->epfd, op, ch->fd, &epev) == -1) {
263                         if (op == EPOLL_CTL_MOD && errno == ENOENT) {
264                                 /* If a MOD operation fails with ENOENT, the
265                                  * fd was probably closed and re-opened.  We
266                                  * should retry the operation as an ADD.
267                                  */
268                                 if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, ch->fd, &epev) == -1) {
269                                         event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too",
270                                             (int)epev.events, ch->fd);
271                                         return -1;
272                                 } else {
273                                         event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.",
274                                                 (int)epev.events,
275                                                 ch->fd));
276                                 }
277                         } else if (op == EPOLL_CTL_ADD && errno == EEXIST) {
278                                 /* If an ADD operation fails with EEXIST,
279                                  * either the operation was redundant (as with a
280                                  * precautionary add), or we ran into a fun
281                                  * kernel bug where using dup*() to duplicate the
282                                  * same file into the same fd gives you the same epitem
283                                  * rather than a fresh one.  For the second case,
284                                  * we must retry with MOD. */
285                                 if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) {
286                                         event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too",
287                                             (int)epev.events, ch->fd);
288                                         return -1;
289                                 } else {
290                                         event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.",
291                                                 (int)epev.events,
292                                                 ch->fd));
293                                 }
294                         } else if (op == EPOLL_CTL_DEL &&
295                             (errno == ENOENT || errno == EBADF ||
296                                 errno == EPERM)) {
297                                 /* If a delete fails with one of these errors,
298                                  * that's fine too: we closed the fd before we
299                                  * got around to calling epoll_dispatch. */
300                                 event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.",
301                                         (int)epev.events,
302                                         ch->fd,
303                                         strerror(errno)));
304                         } else {
305                                 event_warn("Epoll %s(%d) on fd %d failed.  Old events were %d; read change was %d (%s); write change was %d (%s)",
306                                     epoll_op_to_string(op),
307                                     (int)epev.events,
308                                     ch->fd,
309                                     ch->old_events,
310                                     ch->read_change,
311                                     change_to_string(ch->read_change),
312                                     ch->write_change,
313                                     change_to_string(ch->write_change));
314                                 return -1;
315                         }
316                 } else {
317                         event_debug(("Epoll %s(%d) on fd %d okay. [old events were %d; read change was %d; write change was %d]",
318                                 epoll_op_to_string(op),
319                                 (int)epev.events,
320                                 (int)ch->fd,
321                                 ch->old_events,
322                                 ch->read_change,
323                                 ch->write_change));
324                 }
325         }
326         return 0;
327 }
328
329 static int
330 epoll_apply_changes(struct event_base *base)
331 {
332         struct event_changelist *changelist = &base->changelist;
333         struct epollop *epollop = base->evbase;
334         struct event_change *ch;
335
336         int r = 0;
337         int i;
338
339         for (i = 0; i < changelist->n_changes; ++i) {
340                 ch = &changelist->changes[i];
341                 if (epoll_apply_one_change(base, epollop, ch) < 0)
342                         r = -1;
343         }
344
345         return (r);
346 }
347
348 static int
349 epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
350     short old, short events, void *p)
351 {
352         struct event_change ch;
353         ch.fd = fd;
354         ch.old_events = old;
355         ch.read_change = ch.write_change = 0;
356         if (events & EV_WRITE)
357                 ch.write_change = EV_CHANGE_ADD |
358                     (events & EV_ET);
359         if (events & EV_READ)
360                 ch.read_change = EV_CHANGE_ADD |
361                     (events & EV_ET);
362
363         return epoll_apply_one_change(base, base->evbase, &ch);
364 }
365
366 static int
367 epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
368     short old, short events, void *p)
369 {
370         struct event_change ch;
371         ch.fd = fd;
372         ch.old_events = old;
373         ch.read_change = ch.write_change = 0;
374         if (events & EV_WRITE)
375                 ch.write_change = EV_CHANGE_DEL;
376         if (events & EV_READ)
377                 ch.read_change = EV_CHANGE_DEL;
378
379         return epoll_apply_one_change(base, base->evbase, &ch);
380 }
381
382 static int
383 epoll_dispatch(struct event_base *base, struct timeval *tv)
384 {
385         struct epollop *epollop = base->evbase;
386         struct epoll_event *events = epollop->events;
387         int i, res;
388         long timeout = -1;
389
390         if (tv != NULL) {
391                 timeout = evutil_tv_to_msec(tv);
392                 if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) {
393                         /* Linux kernels can wait forever if the timeout is
394                          * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. */
395                         timeout = MAX_EPOLL_TIMEOUT_MSEC;
396                 }
397         }
398
399         epoll_apply_changes(base);
400         event_changelist_remove_all(&base->changelist, base);
401
402         EVBASE_RELEASE_LOCK(base, th_base_lock);
403
404         res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout);
405
406         EVBASE_ACQUIRE_LOCK(base, th_base_lock);
407
408         if (res == -1) {
409                 if (errno != EINTR) {
410                         event_warn("epoll_wait");
411                         return (-1);
412                 }
413
414                 return (0);
415         }
416
417         event_debug(("%s: epoll_wait reports %d", __func__, res));
418         EVUTIL_ASSERT(res <= epollop->nevents);
419
420         for (i = 0; i < res; i++) {
421                 int what = events[i].events;
422                 short ev = 0;
423
424                 if (what & (EPOLLHUP|EPOLLERR)) {
425                         ev = EV_READ | EV_WRITE;
426                 } else {
427                         if (what & EPOLLIN)
428                                 ev |= EV_READ;
429                         if (what & EPOLLOUT)
430                                 ev |= EV_WRITE;
431                 }
432
433                 if (!ev)
434                         continue;
435
436                 evmap_io_active(base, events[i].data.fd, ev | EV_ET);
437         }
438
439         if (res == epollop->nevents && epollop->nevents < MAX_NEVENT) {
440                 /* We used all of the event space this time.  We should
441                    be ready for more events next time. */
442                 int new_nevents = epollop->nevents * 2;
443                 struct epoll_event *new_events;
444
445                 new_events = mm_realloc(epollop->events,
446                     new_nevents * sizeof(struct epoll_event));
447                 if (new_events) {
448                         epollop->events = new_events;
449                         epollop->nevents = new_nevents;
450                 }
451         }
452
453         return (0);
454 }
455
456
457 static void
458 epoll_dealloc(struct event_base *base)
459 {
460         struct epollop *epollop = base->evbase;
461
462         evsig_dealloc(base);
463         if (epollop->events)
464                 mm_free(epollop->events);
465         if (epollop->epfd >= 0)
466                 close(epollop->epfd);
467
468         memset(epollop, 0, sizeof(struct epollop));
469         mm_free(epollop);
470 }