]> arthur.barton.de Git - netatalk.git/blob - libevent/epoll.c
Writing metadata xattr on directories with sticky bit set, FR#94
[netatalk.git] / libevent / epoll.c
1 /*
2  * Copyright 2000-2007 Niels Provos <provos@citi.umich.edu>
3  * Copyright 2007-2012 Niels Provos, Nick Mathewson
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. The name of the author may not be used to endorse or promote products
14  *    derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 #include "event2/event-config.h"
28
29 #include <stdint.h>
30 #include <sys/types.h>
31 #include <sys/resource.h>
32 #ifdef _EVENT_HAVE_SYS_TIME_H
33 #include <sys/time.h>
34 #endif
35 #include <sys/queue.h>
36 #include <sys/epoll.h>
37 #include <signal.h>
38 #include <limits.h>
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <string.h>
42 #include <unistd.h>
43 #include <errno.h>
44 #ifdef _EVENT_HAVE_FCNTL_H
45 #include <fcntl.h>
46 #endif
47
48 #include "event-internal.h"
49 #include "evsignal-internal.h"
50 #include "event2/thread.h"
51 #include "evthread-internal.h"
52 #include "log-internal.h"
53 #include "evmap-internal.h"
54 #include "changelist-internal.h"
55
/* Per-event_base private state for the epoll backend.  An instance is
 * allocated by epoll_init() and released by epoll_dealloc(). */
struct epollop {
        /* Buffer passed to epoll_wait() to receive ready events. */
        struct epoll_event *events;
        /* Number of entries currently allocated in 'events'. */
        int nevents;
        /* Descriptor returned by epoll_create(). */
        int epfd;
};
61
/* Backend entry points shared by both eventop tables below. */
static void *epoll_init(struct event_base *);
static int epoll_dispatch(struct event_base *, struct timeval *);
static void epoll_dealloc(struct event_base *);

/* Variant of the epoll backend that epoll_init() installs into base->evsel
 * when EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST is set or the
 * EVENT_EPOLL_USE_CHANGELIST environment variable is present.  Its add/del
 * slots are the generic event_changelist_* functions; the queued changes
 * are pushed to the kernel from epoll_dispatch() via epoll_apply_changes().
 * NOTE(review): the initializer is positional; confirm slot order against
 * the definition of struct eventop. */
static const struct eventop epollops_changelist = {
        "epoll (with changelist)",
        epoll_init,
        event_changelist_add,
        event_changelist_del,
        epoll_dispatch,
        epoll_dealloc,
        1, /* need reinit */
        EV_FEATURE_ET|EV_FEATURE_O1,
        EVENT_CHANGELIST_FDINFO_SIZE
};
77
78
/* add/del entry points used when no changelist is in effect: each call is
 * translated into a single event_change and applied immediately. */
static int epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
    short old, short events, void *p);
static int epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
    short old, short events, void *p);

/* Default epoll backend table (non-static, so visible to other translation
 * units).  Unlike epollops_changelist, its add/del slots call epoll_ctl()
 * right away through epoll_apply_one_change().
 * NOTE(review): the initializer is positional; confirm slot order against
 * the definition of struct eventop. */
const struct eventop epollops = {
        "epoll",
        epoll_init,
        epoll_nochangelist_add,
        epoll_nochangelist_del,
        epoll_dispatch,
        epoll_dealloc,
        1, /* need reinit */
        EV_FEATURE_ET|EV_FEATURE_O1,
        0
};
95
/* Number of epoll_event slots allocated by epoll_init(). */
#define INITIAL_NEVENT 32
/* epoll_dispatch() doubles the event buffer whenever a wait fills it
 * completely, but only while the current size is below this bound. */
#define MAX_NEVENT 4096

/* On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout
 * values bigger than (LONG_MAX - 999ULL)/HZ.  HZ in the wild can be
 * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the
 * largest number of msec we can support here is 2147482.  Let's
 * round that down by 47 seconds.
 */
#define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000)
106
107 static void *
108 epoll_init(struct event_base *base)
109 {
110         int epfd;
111         struct epollop *epollop;
112
113         /* Initialize the kernel queue.  (The size field is ignored since
114          * 2.6.8.) */
115         if ((epfd = epoll_create(32000)) == -1) {
116                 if (errno != ENOSYS)
117                         event_warn("epoll_create");
118                 return (NULL);
119         }
120
121         evutil_make_socket_closeonexec(epfd);
122
123         if (!(epollop = mm_calloc(1, sizeof(struct epollop)))) {
124                 close(epfd);
125                 return (NULL);
126         }
127
128         epollop->epfd = epfd;
129
130         /* Initialize fields */
131         epollop->events = mm_calloc(INITIAL_NEVENT, sizeof(struct epoll_event));
132         if (epollop->events == NULL) {
133                 mm_free(epollop);
134                 close(epfd);
135                 return (NULL);
136         }
137         epollop->nevents = INITIAL_NEVENT;
138
139         if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 ||
140             ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 &&
141                 evutil_getenv("EVENT_EPOLL_USE_CHANGELIST") != NULL))
142                 base->evsel = &epollops_changelist;
143
144         evsig_init(base);
145
146         return (epollop);
147 }
148
149 static const char *
150 change_to_string(int change)
151 {
152         change &= (EV_CHANGE_ADD|EV_CHANGE_DEL);
153         if (change == EV_CHANGE_ADD) {
154                 return "add";
155         } else if (change == EV_CHANGE_DEL) {
156                 return "del";
157         } else if (change == 0) {
158                 return "none";
159         } else {
160                 return "???";
161         }
162 }
163
/*
 * Map an epoll_ctl() operation code to a short name for log messages.
 * Unrecognized codes yield "???".
 */
static const char *
epoll_op_to_string(int op)
{
        switch (op) {
        case EPOLL_CTL_ADD:
                return "ADD";
        case EPOLL_CTL_DEL:
                return "DEL";
        case EPOLL_CTL_MOD:
                return "MOD";
        default:
                return "???";
        }
}
172
173 static int
174 epoll_apply_one_change(struct event_base *base,
175     struct epollop *epollop,
176     const struct event_change *ch)
177 {
178         struct epoll_event epev;
179         int op, events = 0;
180
181         if (1) {
182                 /* The logic here is a little tricky.  If we had no events set
183                    on the fd before, we need to set op="ADD" and set
184                    events=the events we want to add.  If we had any events set
185                    on the fd before, and we want any events to remain on the
186                    fd, we need to say op="MOD" and set events=the events we
187                    want to remain.  But if we want to delete the last event,
188                    we say op="DEL" and set events=the remaining events.  What
189                    fun!
190                 */
191
192                 /* TODO: Turn this into a switch or a table lookup. */
193
194                 if ((ch->read_change & EV_CHANGE_ADD) ||
195                     (ch->write_change & EV_CHANGE_ADD)) {
196                         /* If we are adding anything at all, we'll want to do
197                          * either an ADD or a MOD. */
198                         events = 0;
199                         op = EPOLL_CTL_ADD;
200                         if (ch->read_change & EV_CHANGE_ADD) {
201                                 events |= EPOLLIN;
202                         } else if (ch->read_change & EV_CHANGE_DEL) {
203                                 ;
204                         } else if (ch->old_events & EV_READ) {
205                                 events |= EPOLLIN;
206                         }
207                         if (ch->write_change & EV_CHANGE_ADD) {
208                                 events |= EPOLLOUT;
209                         } else if (ch->write_change & EV_CHANGE_DEL) {
210                                 ;
211                         } else if (ch->old_events & EV_WRITE) {
212                                 events |= EPOLLOUT;
213                         }
214                         if ((ch->read_change|ch->write_change) & EV_ET)
215                                 events |= EPOLLET;
216
217                         if (ch->old_events) {
218                                 /* If MOD fails, we retry as an ADD, and if
219                                  * ADD fails we will retry as a MOD.  So the
220                                  * only hard part here is to guess which one
221                                  * will work.  As a heuristic, we'll try
222                                  * MOD first if we think there were old
223                                  * events and ADD if we think there were none.
224                                  *
225                                  * We can be wrong about the MOD if the file
226                                  * has in fact been closed and re-opened.
227                                  *
228                                  * We can be wrong about the ADD if the
229                                  * the fd has been re-created with a dup()
230                                  * of the same file that it was before.
231                                  */
232                                 op = EPOLL_CTL_MOD;
233                         }
234                 } else if ((ch->read_change & EV_CHANGE_DEL) ||
235                     (ch->write_change & EV_CHANGE_DEL)) {
236                         /* If we're deleting anything, we'll want to do a MOD
237                          * or a DEL. */
238                         op = EPOLL_CTL_DEL;
239
240                         if (ch->read_change & EV_CHANGE_DEL) {
241                                 if (ch->write_change & EV_CHANGE_DEL) {
242                                         events = EPOLLIN|EPOLLOUT;
243                                 } else if (ch->old_events & EV_WRITE) {
244                                         events = EPOLLOUT;
245                                         op = EPOLL_CTL_MOD;
246                                 } else {
247                                         events = EPOLLIN;
248                                 }
249                         } else if (ch->write_change & EV_CHANGE_DEL) {
250                                 if (ch->old_events & EV_READ) {
251                                         events = EPOLLIN;
252                                         op = EPOLL_CTL_MOD;
253                                 } else {
254                                         events = EPOLLOUT;
255                                 }
256                         }
257                 }
258
259                 if (!events)
260                         return 0;
261
262                 memset(&epev, 0, sizeof(epev));
263                 epev.data.fd = ch->fd;
264                 epev.events = events;
265                 if (epoll_ctl(epollop->epfd, op, ch->fd, &epev) == -1) {
266                         if (op == EPOLL_CTL_MOD && errno == ENOENT) {
267                                 /* If a MOD operation fails with ENOENT, the
268                                  * fd was probably closed and re-opened.  We
269                                  * should retry the operation as an ADD.
270                                  */
271                                 if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, ch->fd, &epev) == -1) {
272                                         event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too",
273                                             (int)epev.events, ch->fd);
274                                         return -1;
275                                 } else {
276                                         event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.",
277                                                 (int)epev.events,
278                                                 ch->fd));
279                                 }
280                         } else if (op == EPOLL_CTL_ADD && errno == EEXIST) {
281                                 /* If an ADD operation fails with EEXIST,
282                                  * either the operation was redundant (as with a
283                                  * precautionary add), or we ran into a fun
284                                  * kernel bug where using dup*() to duplicate the
285                                  * same file into the same fd gives you the same epitem
286                                  * rather than a fresh one.  For the second case,
287                                  * we must retry with MOD. */
288                                 if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) {
289                                         event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too",
290                                             (int)epev.events, ch->fd);
291                                         return -1;
292                                 } else {
293                                         event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.",
294                                                 (int)epev.events,
295                                                 ch->fd));
296                                 }
297                         } else if (op == EPOLL_CTL_DEL &&
298                             (errno == ENOENT || errno == EBADF ||
299                                 errno == EPERM)) {
300                                 /* If a delete fails with one of these errors,
301                                  * that's fine too: we closed the fd before we
302                                  * got around to calling epoll_dispatch. */
303                                 event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.",
304                                         (int)epev.events,
305                                         ch->fd,
306                                         strerror(errno)));
307                         } else {
308                                 event_warn("Epoll %s(%d) on fd %d failed.  Old events were %d; read change was %d (%s); write change was %d (%s)",
309                                     epoll_op_to_string(op),
310                                     (int)epev.events,
311                                     ch->fd,
312                                     ch->old_events,
313                                     ch->read_change,
314                                     change_to_string(ch->read_change),
315                                     ch->write_change,
316                                     change_to_string(ch->write_change));
317                                 return -1;
318                         }
319                 } else {
320                         event_debug(("Epoll %s(%d) on fd %d okay. [old events were %d; read change was %d; write change was %d]",
321                                 epoll_op_to_string(op),
322                                 (int)epev.events,
323                                 (int)ch->fd,
324                                 ch->old_events,
325                                 ch->read_change,
326                                 ch->write_change));
327                 }
328         }
329         return 0;
330 }
331
332 static int
333 epoll_apply_changes(struct event_base *base)
334 {
335         struct event_changelist *changelist = &base->changelist;
336         struct epollop *epollop = base->evbase;
337         struct event_change *ch;
338
339         int r = 0;
340         int i;
341
342         for (i = 0; i < changelist->n_changes; ++i) {
343                 ch = &changelist->changes[i];
344                 if (epoll_apply_one_change(base, epollop, ch) < 0)
345                         r = -1;
346         }
347
348         return (r);
349 }
350
351 static int
352 epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
353     short old, short events, void *p)
354 {
355         struct event_change ch;
356         ch.fd = fd;
357         ch.old_events = old;
358         ch.read_change = ch.write_change = 0;
359         if (events & EV_WRITE)
360                 ch.write_change = EV_CHANGE_ADD |
361                     (events & EV_ET);
362         if (events & EV_READ)
363                 ch.read_change = EV_CHANGE_ADD |
364                     (events & EV_ET);
365
366         return epoll_apply_one_change(base, base->evbase, &ch);
367 }
368
369 static int
370 epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
371     short old, short events, void *p)
372 {
373         struct event_change ch;
374         ch.fd = fd;
375         ch.old_events = old;
376         ch.read_change = ch.write_change = 0;
377         if (events & EV_WRITE)
378                 ch.write_change = EV_CHANGE_DEL;
379         if (events & EV_READ)
380                 ch.read_change = EV_CHANGE_DEL;
381
382         return epoll_apply_one_change(base, base->evbase, &ch);
383 }
384
385 static int
386 epoll_dispatch(struct event_base *base, struct timeval *tv)
387 {
388         struct epollop *epollop = base->evbase;
389         struct epoll_event *events = epollop->events;
390         int i, res;
391         long timeout = -1;
392
393         if (tv != NULL) {
394                 timeout = evutil_tv_to_msec(tv);
395                 if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) {
396                         /* Linux kernels can wait forever if the timeout is
397                          * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. */
398                         timeout = MAX_EPOLL_TIMEOUT_MSEC;
399                 }
400         }
401
402         epoll_apply_changes(base);
403         event_changelist_remove_all(&base->changelist, base);
404
405         EVBASE_RELEASE_LOCK(base, th_base_lock);
406
407         res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout);
408
409         EVBASE_ACQUIRE_LOCK(base, th_base_lock);
410
411         if (res == -1) {
412                 if (errno != EINTR) {
413                         event_warn("epoll_wait");
414                         return (-1);
415                 }
416
417                 return (0);
418         }
419
420         event_debug(("%s: epoll_wait reports %d", __func__, res));
421         EVUTIL_ASSERT(res <= epollop->nevents);
422
423         for (i = 0; i < res; i++) {
424                 int what = events[i].events;
425                 short ev = 0;
426
427                 if (what & (EPOLLHUP|EPOLLERR)) {
428                         ev = EV_READ | EV_WRITE;
429                 } else {
430                         if (what & EPOLLIN)
431                                 ev |= EV_READ;
432                         if (what & EPOLLOUT)
433                                 ev |= EV_WRITE;
434                 }
435
436                 if (!ev)
437                         continue;
438
439                 evmap_io_active(base, events[i].data.fd, ev | EV_ET);
440         }
441
442         if (res == epollop->nevents && epollop->nevents < MAX_NEVENT) {
443                 /* We used all of the event space this time.  We should
444                    be ready for more events next time. */
445                 int new_nevents = epollop->nevents * 2;
446                 struct epoll_event *new_events;
447
448                 new_events = mm_realloc(epollop->events,
449                     new_nevents * sizeof(struct epoll_event));
450                 if (new_events) {
451                         epollop->events = new_events;
452                         epollop->nevents = new_nevents;
453                 }
454         }
455
456         return (0);
457 }
458
459
460 static void
461 epoll_dealloc(struct event_base *base)
462 {
463         struct epollop *epollop = base->evbase;
464
465         evsig_dealloc(base);
466         if (epollop->events)
467                 mm_free(epollop->events);
468         if (epollop->epfd >= 0)
469                 close(epollop->epfd);
470
471         memset(epollop, 0, sizeof(struct epollop));
472         mm_free(epollop);
473 }