/*
    event.c -- I/O, timeout and signal event handling
    Copyright (C) 2012-2022 Guus Sliepen <guus@tinc-vpn.org>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
# if defined(HAVE_SYS_EPOLL_H)
#  include <sys/epoll.h>
#  define HAVE_EPOLL 1
# elif defined(HAVE_SYS_EVENT_H)
#  include <sys/event.h>
#  define HAVE_KQUEUE 1
# else
#  define HAVE_SELECT 1
# endif
#if defined(HAVE_EPOLL) || defined(HAVE_KQUEUE)
static int event_fd = 0;
#elif defined(HAVE_SELECT)
static fd_set readfds;
static fd_set writefds;
#elif defined(HAVE_WINDOWS)
static const long READ_EVENTS = FD_READ | FD_ACCEPT | FD_CLOSE;
static const long WRITE_EVENTS = FD_WRITE | FD_CONNECT;
static DWORD event_count = 0;
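/*
   Backend selection above: epoll on Linux, kqueue on the BSDs and macOS, plain select() as the
   generic POSIX fallback, and WSA event objects on Windows. The statics hold whatever
   per-backend state has to persist between calls.
*/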
static inline void event_init(void) {
#if defined(HAVE_EPOLL) || defined(HAVE_KQUEUE)

#if defined(HAVE_EPOLL)
    /* NOTE: the 1024 limit is only used on ancient (pre-2.6.27) kernels.
       Decent kernels will ignore this value, making it unlimited.
       epoll_create1 might be better, but these kernels would not be supported
       in that case. */
    event_fd = epoll_create(1024);

    logger(DEBUG_ALWAYS, LOG_EMERG, "Could not initialize events: %s", strerror(errno));
static void event_deinit(void) {
#if defined(HAVE_EPOLL) || defined(HAVE_KQUEUE)
static int io_compare(const io_t *a, const io_t *b) {

    if(a->event < b->event) {

    if(a->event > b->event) {

static int timeout_compare(const timeout_t *a, const timeout_t *b) {
    timersub(&a->tv, &b->tv, &diff);

    if(diff.tv_sec < 0) {

    if(diff.tv_sec > 0) {

    if(diff.tv_usec < 0) {

    if(diff.tv_usec > 0) {

static splay_tree_t io_tree = {.compare = (splay_compare_t)io_compare};
static splay_tree_t timeout_tree = {.compare = (splay_compare_t)timeout_compare};
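/*
   io_add() registers a file descriptor with the event loop: cb is invoked with IO_READ and/or
   IO_WRITE whenever fd becomes readable or writable. A minimal usage sketch (handle_meta,
   conn and receive_data are illustrative names, not part of this file):

       static void handle_meta(void *data, int flags) {
           connection_t *conn = data;

           if(flags & IO_READ) {
               receive_data(conn);   // hypothetical helper
           }
       }

       io_add(&conn->io, handle_meta, conn, conn->socket, IO_READ);
*/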
void io_add(io_t *io, io_cb_t cb, void *data, int fd, int flags) {

    io->event = WSACreateEvent();

    if(io->event == WSA_INVALID_EVENT) {

    if(!splay_insert_node(&io_tree, &io->node)) {

void io_add_event(io_t *io, io_cb_t cb, void *data, WSAEVENT event) {
    io_add(io, cb, data, -1, 0);
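/*
   io_set() changes which events an already-registered fd is watched for. Each backend is updated
   in its own way: epoll drops the fd and re-adds it with the new event mask, kqueue adds or
   deletes the EVFILT_READ/EVFILT_WRITE filters, select() toggles the fd in the static fd_sets,
   and Windows re-issues WSAEventSelect() with the new event mask.
*/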
void io_set(io_t *io, int flags) {

    if(flags == io->flags) {

    epoll_ctl(event_fd, EPOLL_CTL_DEL, io->fd, NULL);

    struct epoll_event ev = {

    if(flags & IO_READ) {
        ev.events |= EPOLLIN;

    if(flags & IO_WRITE) {
        ev.events |= EPOLLOUT;
    } else if(ev.events == 0) {
        io_tree.generation++;

    if(epoll_ctl(event_fd, EPOLL_CTL_ADD, io->fd, &ev) < 0) {
        perror("epoll_ctl_add");
    const struct kevent change[] = {
            .filter = EVFILT_READ,
            .flags = EV_RECEIPT | (flags & IO_READ ? EV_ADD : EV_DELETE),

            .filter = EVFILT_WRITE,
            .flags = EV_RECEIPT | (flags & IO_WRITE ? EV_ADD : EV_DELETE),

    struct kevent result[2];

    if(kevent(event_fd, change, 2, result, 2, NULL) < 0) {
        logger(DEBUG_ALWAYS, LOG_EMERG, "kevent failed: %s", strerror(errno));

    int rerr = (int)result[0].data;
    int werr = (int)result[1].data;

    if((rerr && rerr != ENOENT) || (werr && werr != ENOENT)) {
        logger(DEBUG_ALWAYS, LOG_EMERG, "kevent errors: %s, %s", strerror(rerr), strerror(werr));

    io_tree.generation++;
    if(flags & IO_READ) {
        FD_SET(io->fd, &readfds);
    } else {
        FD_CLR(io->fd, &readfds);

    if(flags & IO_WRITE) {
        FD_SET(io->fd, &writefds);
    } else {
        FD_CLR(io->fd, &writefds);

    if(flags & IO_WRITE) {
        events |= WRITE_EVENTS;

    if(flags & IO_READ) {
        events |= READ_EVENTS;

    if(WSAEventSelect(io->fd, io->event, events) != 0) {
void io_del(io_t *io) {

    if(io->fd != -1 && WSACloseEvent(io->event) == FALSE) {

    splay_unlink_node(&io_tree, &io->node);
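/*
   Timeouts live in timeout_tree, ordered by absolute expiry time. The tv passed to timeout_add()
   and timeout_set() is relative: it is added to the current time of day to compute the expiry,
   and the callback fires once unless it re-arms itself with timeout_set(). A hedged sketch
   (retry_timeout, retry_cb and the 5-second interval are made up for illustration):

       static timeout_t retry_timeout;

       static void retry_cb(void *data) {
           // runs roughly 5 seconds after the timeout_add() below
       }

       timeout_add(&retry_timeout, retry_cb, NULL, &(struct timeval){5, 0});
*/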
void timeout_add(timeout_t *timeout, timeout_cb_t cb, void *data, const struct timeval *tv) {
    timeout->data = data;
    timeout->node.data = timeout;

    timeout_set(timeout, tv);

void timeout_set(timeout_t *timeout, const struct timeval *tv) {
    if(timerisset(&timeout->tv)) {
        splay_unlink_node(&timeout_tree, &timeout->node);

    gettimeofday(&now, NULL);

    timeradd(&now, tv, &timeout->tv);

    if(!splay_insert_node(&timeout_tree, &timeout->node)) {

void timeout_del(timeout_t *timeout) {

    splay_unlink_node(&timeout_tree, &timeout->node);

    timeout->tv = (struct timeval) {
# define NSIG (_SIGMAX + 1) /* For QNX */

static io_t signalio;
static int pipefd[2] = {-1, -1};
static signal_t *signal_handle[NSIG + 1] = {NULL};
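/*
   Signal handling uses the self-pipe trick: the async-signal-safe handler below only write()s
   the signal number into pipefd[1]; signalio (a normal IO_READ watcher on pipefd[0]) then reads
   it back inside the event loop and dispatches the matching entry in signal_handle[], so user
   callbacks never run in signal context.
*/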
static void signal_handler(int signum) {
    unsigned char num = signum;

    if(write(pipefd[1], &num, 1) != 1) {
        // Pipe full or broken, nothing we can do about it.

static void signalio_handler(void *data, int flags) {

    unsigned char signum;

    if(read(pipefd[0], &signum, 1) != 1) {

    signal_t *sig = signal_handle[signum];

static void pipe_init(void) {
    io_add(&signalio, signalio_handler, NULL, pipefd[0], IO_READ);
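/*
   signal_add() hooks signum into the pipe mechanism above: it sets up the pipe the first time it
   is needed, installs signal_handler() via signal(), and records the callback in signal_handle[]
   so that signalio_handler() can find it.
*/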
void signal_add(signal_t *sig, signal_cb_t cb, void *data, int signum) {

    sig->signum = signum;

    if(pipefd[0] == -1) {

    signal(signum, signal_handler);

    signal_handle[signum] = sig;

void signal_del(signal_t *sig) {

    signal(sig->signum, SIG_DFL);

    signal_handle[sig->signum] = NULL;
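/*
   timeout_execute() fires every timeout whose expiry time has passed and returns the interval
   until the earliest remaining one (or NULL when the tree is empty); event_loop() uses that
   interval as the timeout for the underlying wait call.
*/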
static struct timeval *timeout_execute(struct timeval *diff) {
    gettimeofday(&now, NULL);
    struct timeval *tv = NULL;

    while(timeout_tree.head) {
        timeout_t *timeout = timeout_tree.head->data;
        timersub(&timeout->tv, &now, diff);

        if(diff->tv_sec < 0) {
            timeout->cb(timeout->data);

            if(timercmp(&timeout->tv, &now, <)) {
                timeout_del(timeout);
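/*
   event_loop() is the dispatcher proper: each pass runs expired timeouts, waits on the backend
   (epoll_wait, kevent, select or WSAWaitForMultipleEvents) with the interval returned by
   timeout_execute(), and then fires the read/write callbacks. io_tree.generation is sampled
   before dispatching; when a callback adds or deletes an io_t, the generation changes and the
   current iteration is abandoned, to be picked up again on the next pass.
*/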
bool event_loop(void) {

    struct timeval *tv = timeout_execute(&diff);

    memcpy(&readable, &readfds, sizeof(readable));
    memcpy(&writable, &writefds, sizeof(writable));

    struct epoll_event events[MAX_EVENTS_PER_LOOP];
    long timeout = (tv->tv_sec * 1000) + (tv->tv_usec / 1000);

    if(timeout > INT_MAX) {

    int n = epoll_wait(event_fd, events, MAX_EVENTS_PER_LOOP, (int)timeout);

    struct kevent events[MAX_EVENTS_PER_LOOP];

    const struct timespec ts = {
        .tv_sec = tv->tv_sec,
        .tv_nsec = tv->tv_usec * 1000,

    int n = kevent(event_fd, NULL, 0, events, MAX_EVENTS_PER_LOOP, &ts);

    io_t *last = io_tree.tail->data;
    maxfds = last->fd + 1;

    int n = select(maxfds, &readable, &writable, NULL, tv);

    if(sockwouldblock(sockerrno)) {
    unsigned int curgen = io_tree.generation;

    for(int i = 0; i < n; i++) {
        io_t *io = events[i].data.ptr;

        if(events[i].events & EPOLLOUT && io->flags & IO_WRITE) {
            io->cb(io->data, IO_WRITE);

        if(curgen != io_tree.generation) {

        if(events[i].events & EPOLLIN && io->flags & IO_READ) {
            io->cb(io->data, IO_READ);

        if(curgen != io_tree.generation) {

    for(int i = 0; i < n; i++) {
        const struct kevent *evt = &events[i];
        const io_t *io = evt->udata;

        if(evt->filter == EVFILT_WRITE) {
            io->cb(io->data, IO_WRITE);
        } else if(evt->filter == EVFILT_READ) {
            io->cb(io->data, IO_READ);

        if(curgen != io_tree.generation) {
    for splay_each(io_t, io, &io_tree) {
        if(FD_ISSET(io->fd, &writable)) {
            io->cb(io->data, IO_WRITE);
        } else if(FD_ISSET(io->fd, &readable)) {
            io->cb(io->data, IO_READ);

        /*
           There are scenarios in which the callback will remove another io_t from the tree
           (e.g. closing a double connection). Since splay_each does not support that, we
           need to exit the loop if that happens. That's okay, since any remaining events will
           get picked up by the next select() call.
        */
        if(curgen != io_tree.generation) {
    assert(WSA_WAIT_EVENT_0 == 0);

    struct timeval *tv = timeout_execute(&diff);
    DWORD timeout_ms = tv ? (DWORD)(tv->tv_sec * 1000 + tv->tv_usec / 1000 + 1) : WSA_INFINITE;

    /*
       For some reason, Microsoft decided to make the FD_WRITE event edge-triggered instead of
       level-triggered, which is the opposite of what select() does. In practice, that means that
       if an FD_WRITE event fires, it will never fire again until a send() returns EWOULDBLOCK.
       Since the semantics of this event loop are that write events are level-triggered (i.e. they
       keep firing as long as the socket stays writable), we need to emulate those semantics by
       making sure we fire each IO_WRITE that is still writable.

       Note that technically FD_CLOSE has the same problem, but that is okay because user code
       does not rely on this event firing again if it is ignored.
    */
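    /*
       Note: the zero-byte send() in the loop below serves as a cheap writability probe; it
       returns 0 when the socket will currently accept data, so IO_WRITE callbacks keep firing
       in a level-triggered fashion without actually transmitting anything.
    */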
    unsigned int curgen = io_tree.generation;

    for splay_each(io_t, io, &io_tree) {
        if(io->flags & IO_WRITE && send(io->fd, NULL, 0, 0) == 0) {
            io->cb(io->data, IO_WRITE);

            if(curgen != io_tree.generation) {
    if(event_count > WSA_MAXIMUM_WAIT_EVENTS) {
        WSASetLastError(WSA_INVALID_PARAMETER);

    WSAEVENT events[WSA_MAXIMUM_WAIT_EVENTS];
    io_t *io_map[WSA_MAXIMUM_WAIT_EVENTS];
    DWORD event_index = 0;

    for splay_each(io_t, io, &io_tree) {
        events[event_index] = io->event;
        io_map[event_index] = io;
    /*
     * If the generation number changes due to event addition
     * or removal by a callback, we restart the loop.
     */
    curgen = io_tree.generation;
    for(DWORD event_offset = 0; event_offset < event_count;) {
        DWORD result = WSAWaitForMultipleEvents(event_count - event_offset, &events[event_offset], FALSE, timeout_ms, FALSE);

        if(result == WSA_WAIT_TIMEOUT) {

        if(result >= event_count - event_offset) {

        /* Look up io in the map by index. */
        event_index = result + event_offset;
        io_t *io = io_map[event_index];

        if(curgen != io_tree.generation) {

        WSANETWORKEVENTS network_events;

        if(WSAEnumNetworkEvents(io->fd, io->event, &network_events) != 0) {

        if(network_events.lNetworkEvents & READ_EVENTS) {
            io->cb(io->data, IO_READ);

            if(curgen != io_tree.generation) {
        /*
           The fd might be available for write too. However, if we already fired the read callback,
           that callback might have deleted the io (e.g. through terminate_connection()), so we
           can't fire the write callback here. Instead, we loop back and let the writable io loop
           above handle it.
        */
        /* Continue checking the rest of the events. */
        event_offset = event_index + 1;

        /* Just poll the next time through. */

void event_exit(void) {