root/opal/mca/event/libevent2022/libevent/epoll.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. epoll_init
  2. change_to_string
  3. epoll_op_to_string
  4. epoll_apply_one_change
  5. epoll_apply_changes
  6. epoll_nochangelist_add
  7. epoll_nochangelist_del
  8. epoll_dispatch
  9. epoll_dealloc

   1 /*
   2  * Copyright 2000-2007 Niels Provos <provos@citi.umich.edu>
   3  * Copyright 2007-2012 Niels Provos, Nick Mathewson
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  * 3. The name of the author may not be used to endorse or promote products
  14  *    derived from this software without specific prior written permission.
  15  *
  16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26  */
  27 #include "event2/event-config.h"
  28 
  29 #include <stdint.h>
  30 #include <sys/types.h>
  31 #include <sys/resource.h>
  32 #ifdef _EVENT_HAVE_SYS_TIME_H
  33 #include <sys/time.h>
  34 #endif
  35 #include <sys/queue.h>
  36 #include <sys/epoll.h>
  37 #include <signal.h>
  38 #include <limits.h>
  39 #include <stdio.h>
  40 #include <stdlib.h>
  41 #include <string.h>
  42 #include <unistd.h>
  43 #include <errno.h>
  44 #ifdef _EVENT_HAVE_FCNTL_H
  45 #include <fcntl.h>
  46 #endif
  47 
  48 #include "event-internal.h"
  49 #include "evsignal-internal.h"
  50 #include "event2/thread.h"
  51 #include "evthread-internal.h"
  52 #include "log-internal.h"
  53 #include "evmap-internal.h"
  54 #include "changelist-internal.h"
  55 
  56 struct epollop {
  57         struct epoll_event *events;
  58         int nevents;
  59         int epfd;
  60 };
  61 
  62 static void *epoll_init(struct event_base *);
  63 static int epoll_dispatch(struct event_base *, struct timeval *);
  64 static void epoll_dealloc(struct event_base *);
  65 
  66 static const struct eventop epollops_changelist = {
  67         "epoll (with changelist)",
  68         epoll_init,
  69         event_changelist_add,
  70         event_changelist_del,
  71         epoll_dispatch,
  72         epoll_dealloc,
  73         1, /* need reinit */
  74         EV_FEATURE_ET|EV_FEATURE_O1,
  75         EVENT_CHANGELIST_FDINFO_SIZE
  76 };
  77 
  78 
  79 static int epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
  80     short old, short events, void *p);
  81 static int epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
  82     short old, short events, void *p);
  83 
  84 const struct eventop epollops = {
  85         "epoll",
  86         epoll_init,
  87         epoll_nochangelist_add,
  88         epoll_nochangelist_del,
  89         epoll_dispatch,
  90         epoll_dealloc,
  91         1, /* need reinit */
  92         EV_FEATURE_ET|EV_FEATURE_O1,
  93         0
  94 };
  95 
  96 #define INITIAL_NEVENT 32
  97 #define MAX_NEVENT 4096
  98 
  99 /* On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout
 100  * values bigger than (LONG_MAX - 999ULL)/HZ.  HZ in the wild can be
 101  * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the
 102  * largest number of msec we can support here is 2147482.  Let's
 103  * round that down by 47 seconds.
 104  */
 105 #define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000)
 106 
 107 static void *
 108 epoll_init(struct event_base *base)
 109 {
 110         int epfd;
 111         struct epollop *epollop;
 112 
 113         /* Initialize the kernel queue.  (The size field is ignored since
 114          * 2.6.8.) */
 115         if ((epfd = epoll_create(32000)) == -1) {
 116                 if (errno != ENOSYS)
 117                         event_warn("epoll_create");
 118                 return (NULL);
 119         }
 120 
 121         evutil_make_socket_closeonexec(epfd);
 122 
 123         if (!(epollop = mm_calloc(1, sizeof(struct epollop)))) {
 124                 close(epfd);
 125                 return (NULL);
 126         }
 127 
 128         epollop->epfd = epfd;
 129 
 130         /* Initialize fields */
 131         epollop->events = mm_calloc(INITIAL_NEVENT, sizeof(struct epoll_event));
 132         if (epollop->events == NULL) {
 133                 mm_free(epollop);
 134                 close(epfd);
 135                 return (NULL);
 136         }
 137         epollop->nevents = INITIAL_NEVENT;
 138 
 139         if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 ||
 140             ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 &&
 141                 evutil_getenv("EVENT_EPOLL_USE_CHANGELIST") != NULL))
 142                 base->evsel = &epollops_changelist;
 143 
 144         evsig_init(base);
 145 
 146         return (epollop);
 147 }
 148 
 149 static const char *
 150 change_to_string(int change)
 151 {
 152         change &= (EV_CHANGE_ADD|EV_CHANGE_DEL);
 153         if (change == EV_CHANGE_ADD) {
 154                 return "add";
 155         } else if (change == EV_CHANGE_DEL) {
 156                 return "del";
 157         } else if (change == 0) {
 158                 return "none";
 159         } else {
 160                 return "???";
 161         }
 162 }
 163 
 164 static const char *
 165 epoll_op_to_string(int op)
 166 {
 167         return op == EPOLL_CTL_ADD?"ADD":
 168             op == EPOLL_CTL_DEL?"DEL":
 169             op == EPOLL_CTL_MOD?"MOD":
 170             "???";
 171 }
 172 
 173 static int
 174 epoll_apply_one_change(struct event_base *base,
 175     struct epollop *epollop,
 176     const struct event_change *ch)
 177 {
 178         struct epoll_event epev;
 179         int op, events = 0;
 180 
 181         if (1) {
 182                 /* The logic here is a little tricky.  If we had no events set
 183                    on the fd before, we need to set op="ADD" and set
 184                    events=the events we want to add.  If we had any events set
 185                    on the fd before, and we want any events to remain on the
 186                    fd, we need to say op="MOD" and set events=the events we
 187                    want to remain.  But if we want to delete the last event,
 188                    we say op="DEL" and set events=the remaining events.  What
 189                    fun!
 190                 */
 191 
 192                 /* TODO: Turn this into a switch or a table lookup. */
 193 
 194                 if ((ch->read_change & EV_CHANGE_ADD) ||
 195                     (ch->write_change & EV_CHANGE_ADD)) {
 196                         /* If we are adding anything at all, we'll want to do
 197                          * either an ADD or a MOD. */
 198                         events = 0;
 199                         op = EPOLL_CTL_ADD;
 200                         if (ch->read_change & EV_CHANGE_ADD) {
 201                                 events |= EPOLLIN;
 202                         } else if (ch->read_change & EV_CHANGE_DEL) {
 203                                 ;
 204                         } else if (ch->old_events & EV_READ) {
 205                                 events |= EPOLLIN;
 206                         }
 207                         if (ch->write_change & EV_CHANGE_ADD) {
 208                                 events |= EPOLLOUT;
 209                         } else if (ch->write_change & EV_CHANGE_DEL) {
 210                                 ;
 211                         } else if (ch->old_events & EV_WRITE) {
 212                                 events |= EPOLLOUT;
 213                         }
 214                         if ((ch->read_change|ch->write_change) & EV_ET)
 215                                 events |= EPOLLET;
 216 
 217                         if (ch->old_events) {
 218                                 /* If MOD fails, we retry as an ADD, and if
 219                                  * ADD fails we will retry as a MOD.  So the
 220                                  * only hard part here is to guess which one
 221                                  * will work.  As a heuristic, we'll try
 222                                  * MOD first if we think there were old
 223                                  * events and ADD if we think there were none.
 224                                  *
 225                                  * We can be wrong about the MOD if the file
 226                                  * has in fact been closed and re-opened.
 227                                  *
 228                                  * We can be wrong about the ADD if the
 229                                  * the fd has been re-created with a dup()
 230                                  * of the same file that it was before.
 231                                  */
 232                                 op = EPOLL_CTL_MOD;
 233                         }
 234                 } else if ((ch->read_change & EV_CHANGE_DEL) ||
 235                     (ch->write_change & EV_CHANGE_DEL)) {
 236                         /* If we're deleting anything, we'll want to do a MOD
 237                          * or a DEL. */
 238                         op = EPOLL_CTL_DEL;
 239 
 240                         if (ch->read_change & EV_CHANGE_DEL) {
 241                                 if (ch->write_change & EV_CHANGE_DEL) {
 242                                         events = EPOLLIN|EPOLLOUT;
 243                                 } else if (ch->old_events & EV_WRITE) {
 244                                         events = EPOLLOUT;
 245                                         op = EPOLL_CTL_MOD;
 246                                 } else {
 247                                         events = EPOLLIN;
 248                                 }
 249                         } else if (ch->write_change & EV_CHANGE_DEL) {
 250                                 if (ch->old_events & EV_READ) {
 251                                         events = EPOLLIN;
 252                                         op = EPOLL_CTL_MOD;
 253                                 } else {
 254                                         events = EPOLLOUT;
 255                                 }
 256                         }
 257                 }
 258 
 259                 if (!events)
 260                         return 0;
 261 
 262                 memset(&epev, 0, sizeof(epev));
 263                 epev.data.fd = ch->fd;
 264                 epev.events = events;
 265                 if (epoll_ctl(epollop->epfd, op, ch->fd, &epev) == -1) {
 266                         if (op == EPOLL_CTL_MOD && errno == ENOENT) {
 267                                 /* If a MOD operation fails with ENOENT, the
 268                                  * fd was probably closed and re-opened.  We
 269                                  * should retry the operation as an ADD.
 270                                  */
 271                                 if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, ch->fd, &epev) == -1) {
 272                                         event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too",
 273                                             (int)epev.events, ch->fd);
 274                                         return -1;
 275                                 } else {
 276                                         event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.",
 277                                                 (int)epev.events,
 278                                                 ch->fd));
 279                                 }
 280                         } else if (op == EPOLL_CTL_ADD && errno == EEXIST) {
 281                                 /* If an ADD operation fails with EEXIST,
 282                                  * either the operation was redundant (as with a
 283                                  * precautionary add), or we ran into a fun
 284                                  * kernel bug where using dup*() to duplicate the
 285                                  * same file into the same fd gives you the same epitem
 286                                  * rather than a fresh one.  For the second case,
 287                                  * we must retry with MOD. */
 288                                 if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) {
 289                                         event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too",
 290                                             (int)epev.events, ch->fd);
 291                                         return -1;
 292                                 } else {
 293                                         event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.",
 294                                                 (int)epev.events,
 295                                                 ch->fd));
 296                                 }
 297                         } else if (op == EPOLL_CTL_DEL &&
 298                             (errno == ENOENT || errno == EBADF ||
 299                                 errno == EPERM)) {
 300                                 /* If a delete fails with one of these errors,
 301                                  * that's fine too: we closed the fd before we
 302                                  * got around to calling epoll_dispatch. */
 303                                 event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.",
 304                                         (int)epev.events,
 305                                         ch->fd,
 306                                         strerror(errno)));
 307                         } else {
 308                                 event_warn("Epoll %s(%d) on fd %d failed.  Old events were %d; read change was %d (%s); write change was %d (%s)",
 309                                     epoll_op_to_string(op),
 310                                     (int)epev.events,
 311                                     ch->fd,
 312                                     ch->old_events,
 313                                     ch->read_change,
 314                                     change_to_string(ch->read_change),
 315                                     ch->write_change,
 316                                     change_to_string(ch->write_change));
 317                                 return -1;
 318                         }
 319                 } else {
 320                         event_debug(("Epoll %s(%d) on fd %d okay. [old events were %d; read change was %d; write change was %d]",
 321                                 epoll_op_to_string(op),
 322                                 (int)epev.events,
 323                                 (int)ch->fd,
 324                                 ch->old_events,
 325                                 ch->read_change,
 326                                 ch->write_change));
 327                 }
 328         }
 329         return 0;
 330 }
 331 
 332 static int
 333 epoll_apply_changes(struct event_base *base)
 334 {
 335         struct event_changelist *changelist = &base->changelist;
 336         struct epollop *epollop = base->evbase;
 337         struct event_change *ch;
 338 
 339         int r = 0;
 340         int i;
 341 
 342         for (i = 0; i < changelist->n_changes; ++i) {
 343                 ch = &changelist->changes[i];
 344                 if (epoll_apply_one_change(base, epollop, ch) < 0)
 345                         r = -1;
 346         }
 347 
 348         return (r);
 349 }
 350 
 351 static int
 352 epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
 353     short old, short events, void *p)
 354 {
 355         struct event_change ch;
 356         ch.fd = fd;
 357         ch.old_events = old;
 358         ch.read_change = ch.write_change = 0;
 359         if (events & EV_WRITE)
 360                 ch.write_change = EV_CHANGE_ADD |
 361                     (events & EV_ET);
 362         if (events & EV_READ)
 363                 ch.read_change = EV_CHANGE_ADD |
 364                     (events & EV_ET);
 365 
 366         return epoll_apply_one_change(base, base->evbase, &ch);
 367 }
 368 
 369 static int
 370 epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
 371     short old, short events, void *p)
 372 {
 373         struct event_change ch;
 374         ch.fd = fd;
 375         ch.old_events = old;
 376         ch.read_change = ch.write_change = 0;
 377         if (events & EV_WRITE)
 378                 ch.write_change = EV_CHANGE_DEL;
 379         if (events & EV_READ)
 380                 ch.read_change = EV_CHANGE_DEL;
 381 
 382         return epoll_apply_one_change(base, base->evbase, &ch);
 383 }
 384 
 385 static int
 386 epoll_dispatch(struct event_base *base, struct timeval *tv)
 387 {
 388         struct epollop *epollop = base->evbase;
 389         struct epoll_event *events = epollop->events;
 390         int i, res;
 391         long timeout = -1;
 392 
 393         if (tv != NULL) {
 394                 timeout = evutil_tv_to_msec(tv);
 395                 if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) {
 396                         /* Linux kernels can wait forever if the timeout is
 397                          * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. */
 398                         timeout = MAX_EPOLL_TIMEOUT_MSEC;
 399                 }
 400         }
 401 
 402         epoll_apply_changes(base);
 403         event_changelist_remove_all(&base->changelist, base);
 404 
 405         EVBASE_RELEASE_LOCK(base, th_base_lock);
 406 
 407         res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout);
 408 
 409         EVBASE_ACQUIRE_LOCK(base, th_base_lock);
 410 
 411         if (res == -1) {
 412                 if (errno != EINTR) {
 413                         event_warn("epoll_wait");
 414                         return (-1);
 415                 }
 416 
 417                 return (0);
 418         }
 419 
 420         event_debug(("%s: epoll_wait reports %d", __func__, res));
 421         EVUTIL_ASSERT(res <= epollop->nevents);
 422 
 423         for (i = 0; i < res; i++) {
 424                 int what = events[i].events;
 425                 short ev = 0;
 426 
 427                 if (what & (EPOLLHUP|EPOLLERR)) {
 428                         ev = EV_READ | EV_WRITE;
 429                 } else {
 430                         if (what & EPOLLIN)
 431                                 ev |= EV_READ;
 432                         if (what & EPOLLOUT)
 433                                 ev |= EV_WRITE;
 434                 }
 435 
 436                 if (!ev)
 437                         continue;
 438 
 439                 evmap_io_active(base, events[i].data.fd, ev | EV_ET);
 440         }
 441 
 442         if (res == epollop->nevents && epollop->nevents < MAX_NEVENT) {
 443                 /* We used all of the event space this time.  We should
 444                    be ready for more events next time. */
 445                 int new_nevents = epollop->nevents * 2;
 446                 struct epoll_event *new_events;
 447 
 448                 new_events = mm_realloc(epollop->events,
 449                     new_nevents * sizeof(struct epoll_event));
 450                 if (new_events) {
 451                         epollop->events = new_events;
 452                         epollop->nevents = new_nevents;
 453                 }
 454         }
 455 
 456         return (0);
 457 }
 458 
 459 
 460 static void
 461 epoll_dealloc(struct event_base *base)
 462 {
 463         struct epollop *epollop = base->evbase;
 464 
 465         evsig_dealloc(base);
 466         if (epollop->events)
 467                 mm_free(epollop->events);
 468         if (epollop->epfd >= 0)
 469                 close(epollop->epfd);
 470 
 471         memset(epollop, 0, sizeof(struct epollop));
 472         mm_free(epollop);
 473 }

/* [<][>][^][v][top][bottom][index][help] */