日期:2014-05-16 浏览次数:20833 次
?
?? ?Linux中异步IO等待无非就三个系统调用:select, poll和epoll。很多人无法理解三种调用的区别,或不够了解,今天就结合Linux kernel code详细描述三个的区别!
?
?? ?select:
?? ?select 的限制就是最大1024个fd,可以查看kernel中的posix_types.h,里面定义了fdset数据结构,显然select不适合poll大量fd的场景(如webserver)。?
?
?? ?include/linux/posix_types.h :
?
?
#undef __NFDBITS #define __NFDBITS (8 * sizeof(unsigned long)) #undef __FD_SETSIZE #define __FD_SETSIZE 1024 #undef __FDSET_LONGS #define __FDSET_LONGS (__FD_SETSIZE/__NFDBITS) #undef __FDELT #define __FDELT(d) ((d) / __NFDBITS) #undef __FDMASK #define __FDMASK(d) (1UL << ((d) % __NFDBITS)) typedef struct { unsigned long fds_bits [__FDSET_LONGS]; } __kernel_fd_set;?
?
??poll:
?? poll相对于select改进了fdset size的限制,poll没有再使用fdset数组结构,反而使用了pollfd,这样用户可以自定义非常大的pollfd数组,这个pollfd数组在kernel中的表现形式是poll_list链表,这样就不存在了1024的限制了,除此之外poll相比select无太大区别。
?
?
int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, struct timespec *end_time) { struct poll_wqueues table; int err = -EFAULT, fdcount, len, size; /* Allocate small arguments on the stack to save memory and be faster - use long to make sure the buffer is aligned properly on 64 bit archs to avoid unaligned access */ long stack_pps[POLL_STACK_ALLOC/sizeof(long)]; struct poll_list *const head = (struct poll_list *)stack_pps; struct poll_list *walk = head; unsigned long todo = nfds; if (nfds > rlimit(RLIMIT_NOFILE)) return -EINVAL; len = min_t(unsigned int, nfds, N_STACK_PPS); for (;;) { walk->next = NULL; walk->len = len; if (!len) break; if (copy_from_user(walk->entries, ufds + nfds-todo, sizeof(struct pollfd) * walk->len)) goto out_fds; todo -= walk->len; if (!todo) break; len = min(todo, POLLFD_PER_PAGE); size = sizeof(struct poll_list) + sizeof(struct pollfd) * len; walk = walk->next = kmalloc(size, GFP_KERNEL); if (!walk) { err = -ENOMEM; goto out_fds; } }?
?
?? epoll:
?? ?select与poll的共同点是fd有数据后kernel会遍历所有fd,找到有效fd后初始化相应的revents,用户空间程序须再次遍历整个fdset,以找到有效的fd,这样实际上就遍历了两次fd数组表,对于极大量fd的情况,这样的性能非常不好,请看一下do_poll代码:
?
static int do_poll(unsigned int nfds, struct poll_list *list, struct poll_wqueues *wait, struct timespec *end_time) { poll_table* pt = &wait->pt; ktime_t expire, *to = NULL; int timed_out = 0, count = 0; unsigned long slack = 0; /* Optimise the no-wait case */ if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { pt = NULL; timed_out = 1; } if (end_time && !timed_out) slack = select_estimate_accuracy(end_time); for (;;) { struct poll_list *walk; for (walk = list; walk != NULL; walk = walk->next) { struct pollfd * pfd, * pfd_end; pfd = walk->entries; pfd_end = pfd + walk->len; for (; pfd != pfd_end; pfd++) { /* * Fish for events. If we found one, record it * and kill