
The epoll source code

References

从linux源码看epoll

epoll源码解析翻译——说使用了mmap的都是骗子

– Whether epoll actually uses mmap is still in question; the source walk below does show an mmap function pointer in file_operations, TODO

Linux kernel source


eventpoll结构.png


Source code

epoll_create


SYSCALL_DEFINE1(epoll_create, int, size)
{
if (size <= 0)
return -EINVAL;

return do_epoll_create(0);
}

static int do_epoll_create(int flags)
{
int error, fd;
struct eventpoll *ep = NULL;
struct file *file;

/* Check the EPOLL_* constant for consistency. */
BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

if (flags & ~EPOLL_CLOEXEC)
return -EINVAL;
/*
* Create the internal data structure ("struct eventpoll").
*/
error = ep_alloc(&ep); /* allocate the eventpoll structure in kernel space */
if (error < 0)
return error;
/*
* Creates all the items needed to setup an eventpoll file. That is,
* a file structure and a free file descriptor.
*/
fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC)); /* grab an unused file descriptor */
if (fd < 0) {
error = fd;
goto out_free_ep;
}

/* allocate an inode on the anonymous-inode filesystem and get its file struct */
/* file->private_data = ep; */
/* file->f_op = &eventpoll_fops */
file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
O_RDWR | (flags & O_CLOEXEC));
if (IS_ERR(file)) {
error = PTR_ERR(file);
goto out_free_fd;
}
ep->file = file;
/* install the file into the process's fd table */
fd_install(fd, file);
return fd;

out_free_fd:
put_unused_fd(fd);
out_free_ep:
ep_free(ep);
return error;
}
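
For completeness, the sibling syscall epoll_create1 just forwards its flags to the same helper (quoted from the same fs/eventpoll.c):

SYSCALL_DEFINE1(epoll_create1, int, flags)
{
    return do_epoll_create(flags);
}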


eventpoll_fops


static const struct file_operations eventpoll_fops;

struct file_operations {
struct module *owner;
loff_t (*llseek) (struct file *, loff_t, int);
ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
int (*iopoll)(struct kiocb *kiocb, bool spin);
int (*iterate) (struct file *, struct dir_context *);
int (*iterate_shared) (struct file *, struct dir_context *);
__poll_t (*poll) (struct file *, struct poll_table_struct *);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
unsigned long mmap_supported_flags;
int (*open) (struct inode *, struct file *);
int (*flush) (struct file *, fl_owner_t id);
int (*release) (struct inode *, struct file *);
int (*fsync) (struct file *, loff_t, loff_t, int datasync);
int (*fasync) (int, struct file *, int);
int (*lock) (struct file *, int, struct file_lock *);
ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
int (*check_flags)(int);
int (*flock) (struct file *, int, struct file_lock *);
ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
int (*setlease)(struct file *, long, struct file_lock **, void **);
long (*fallocate)(struct file *file, int mode, loff_t offset,
loff_t len);
void (*show_fdinfo)(struct seq_file *m, struct file *f);
#ifndef CONFIG_MMU
unsigned (*mmap_capabilities)(struct file *);
#endif
ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
loff_t, size_t, unsigned int);
loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t len, unsigned int remap_flags);
int (*fadvise)(struct file *, loff_t, loff_t, int);
} __randomize_layout;
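
Which of these hooks eventpoll actually fills in matters for the mmap question raised in the references. For reference, the definition in fs/eventpoll.c of this era looks roughly like the following; note there is no .mmap hook, which backs the skepticism about epoll using mmap:

/* File callbacks that implement the eventpoll file behaviour */
static const struct file_operations eventpoll_fops = {
#ifdef CONFIG_PROC_FS
    .show_fdinfo = ep_show_fdinfo,
#endif
    .release     = ep_eventpoll_release,
    .poll        = ep_eventpoll_poll,
    .llseek      = noop_llseek,
};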


Summary

epoll_create
|->do_epoll_create
    |->file->private_data = ep
    |->fd_install
  • epoll_create mainly creates an eventpoll structure
  • epfd->file->private_data = eventpoll
  • epfd->file->f_op = eventpoll_fops
  • The size argument of epoll_create has been unused since some kernel version; any value greater than 0 works.
  • The nodes of eventpoll's ready list rdllist are the rdllink members of epitems: whichever epitem becomes ready links its own rdllink into the doubly linked list
  • The nodes of eventpoll's rbr red-black tree are the rbn members of epitems: every monitored epitem is inserted into the tree

The main structures after epoll_create has run:

Image source:

从linux源码看epoll

epoll_creat.jpg
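
A minimal user-space sketch of what epoll_create gives you (the /proc detail is just to make the anonymous inode visible; error handling trimmed):

#include <stdio.h>
#include <unistd.h>
#include <sys/epoll.h>

int main(void)
{
    /* the size argument is ignored by modern kernels; it only has to be > 0 */
    int epfd = epoll_create(1);

    if (epfd < 0) {
        perror("epoll_create");
        return 1;
    }
    /* epfd is an ordinary fd backed by the shared anonymous inode:
     * `ls -l /proc/self/fd` shows it as "anon_inode:[eventpoll]" */
    printf("epfd = %d\n", epfd);
    close(epfd);
    return 0;
}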


/**
* anon_inode_getfile - creates a new file instance by hooking it up to an
* anonymous inode, and a dentry that describe the "class"
* of the file
*
* @name: [in] name of the "class" of the new file
* @fops: [in] file operations for the new file
* @priv: [in] private data for the new file (will be file's private_data)
* @flags: [in] flags
*
* Creates a new file by hooking it on a single inode. This is useful for files
* that do not need to have a full-fledged inode in order to operate correctly.
* All the files created with anon_inode_getfile() will share a single inode,
* hence saving memory and avoiding code duplication for the file/inode/dentry
* setup. Returns the newly created file* or an error pointer.
*/


struct file *anon_inode_getfile(const char *name,
const struct file_operations *fops,
void *priv, int flags)
{
struct file *file;

if (IS_ERR(anon_inode_inode))
return ERR_PTR(-ENODEV);

if (fops->owner && !try_module_get(fops->owner))
return ERR_PTR(-ENOENT);

/*
* We know the anon_inode inode count is always greater than zero,
* so ihold() is safe.
*/
ihold(anon_inode_inode);
file = alloc_file_pseudo(anon_inode_inode, anon_inode_mnt, name,
flags & (O_ACCMODE | O_NONBLOCK), fops);
if (IS_ERR(file))
goto err;

file->f_mapping = anon_inode_inode->i_mapping;

file->private_data = priv;

return file;

err:
iput(anon_inode_inode);
module_put(fops->owner);
return file;
}

struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
const char *name, int flags,
const struct file_operations *fops)
{
static const struct dentry_operations anon_ops = {
.d_dname = simple_dname
};
struct qstr this = QSTR_INIT(name, strlen(name));
struct path path;
struct file *file;

path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this);
if (!path.dentry)
return ERR_PTR(-ENOMEM);
if (!mnt->mnt_sb->s_d_op)
d_set_d_op(path.dentry, &anon_ops);
path.mnt = mntget(mnt);
d_instantiate(path.dentry, inode);
file = alloc_file(&path, flags, fops);
if (IS_ERR(file)) {
ihold(inode);
path_put(&path);
}
return file;
}


/**
* alloc_file - allocate and initialize a 'struct file'
*
* @path: the (dentry, vfsmount) pair for the new file
* @flags: O_... flags with which the new file will be opened
* @fop: the 'struct file_operations' for the new file
*/
static struct file *alloc_file(const struct path *path, int flags,
const struct file_operations *fop)
{
struct file *file;

file = alloc_empty_file(flags, current_cred());
if (IS_ERR(file))
return file;

file->f_path = *path;
file->f_inode = path->dentry->d_inode;
file->f_mapping = path->dentry->d_inode->i_mapping;
file->f_wb_err = filemap_sample_wb_err(file->f_mapping);
if ((file->f_mode & FMODE_READ) &&
likely(fop->read || fop->read_iter))
file->f_mode |= FMODE_CAN_READ;
if ((file->f_mode & FMODE_WRITE) &&
likely(fop->write || fop->write_iter))
file->f_mode |= FMODE_CAN_WRITE;
file->f_mode |= FMODE_OPENED;
file->f_op = fop;
if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
i_readcount_inc(path->dentry->d_inode);
return file;
}


eventpoll.png


epoll_event


struct epoll_event {
__poll_t events; //requested/returned event mask
__u64 data; //opaque user data (commonly the fd)
} EPOLL_PACKED;
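
User space sees the same packed layout through a union (glibc's epoll_data_t), which is why data can carry an fd, a pointer, or an integer; whatever you store comes back verbatim from epoll_wait. A small sketch of the usual pattern (sockfd is assumed to exist):

#include <sys/epoll.h>

/* register sockfd for readability; the kernel keeps ev verbatim in
 * epi->event and echoes data back with every ready event */
static int watch(int epfd, int sockfd)
{
    struct epoll_event ev = { 0 };

    ev.events = EPOLLIN;
    ev.data.fd = sockfd;    /* one arm of the epoll_data_t union */
    return epoll_ctl(epfd, EPOLL_CTL_ADD, sockfd, &ev);
}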


struct epitem

/*
* Each file descriptor added to the eventpoll interface will
* have an entry of this type linked to the "rbr" RB tree.
* Avoid increasing the size of this struct, there can be many thousands
* of these on a server and we do not want this to take another cache line.
*/
struct epitem {
union {
/* RB tree node links this structure to the eventpoll RB tree */
struct rb_node rbn;
/* Used to free the struct epitem */
struct rcu_head rcu;
};

/* List header used to link this structure to the eventpoll ready list */
struct list_head rdllink;

/*
* Works together "struct eventpoll"->ovflist in keeping the
* single linked chain of items.
*/
struct epitem *next;

/* The file descriptor information this item refers to */
//epoll_ctl ADD uses this value to look the entry up in the RB tree
struct epoll_filefd ffd;

/* Number of active wait queue attached to poll operations */
int nwait;

/* List containing poll wait queues */
struct list_head pwqlist;

/* The "container" of this item */
//points back to the owning eventpoll
struct eventpoll *ep;

/* List header used to link this item to the "struct file" items list */
struct list_head fllink;

/* wakeup_source used when EPOLLWAKEUP is set */
struct wakeup_source __rcu *ws;

/* The structure that describe the interested events and the source fd */
//the interest mask and the user data registered for this fd; event bits include:
//EPOLLIN: the fd is readable
//EPOLLOUT: the fd is writable
//EPOLLPRI: the fd has urgent data to read
//EPOLLERR: an error occurred on the fd
//EPOLLHUP: the fd was hung up
//EPOLLET: edge-triggered mode
struct epoll_event event;
};


struct eventpoll

/*
* This structure is stored inside the "private_data" member of the file
* structure and represents the main data structure for the eventpoll
* interface.
*/
struct eventpoll {
/*
* This mutex is used to ensure that files are not removed
* while epoll is using them. This is held during the event
* collection loop, the file cleanup path, the epoll file exit
* code and the ctl operations.
*/
struct mutex mtx;

/* Wait queue used by sys_epoll_wait() */
// wait queue used by epoll_wait(); this is where the waiting process sleeps and gets woken
wait_queue_head_t wq;

/* Wait queue used by file->poll() */
// wait queue used by file->poll(); also involved in process wakeup
wait_queue_head_t poll_wait;

/* List of ready file descriptors */
/* ready queue: a doubly linked list */
//each node in the list is an epitem's rdllink member
struct list_head rdllist;

/* Lock which protects rdllist and ovflist */
rwlock_t lock;

/* RB tree root used to store monitored fd structs */
/* root of the red-black tree that stores the monitored fds */
//each tree node is an epitem's rbn member
struct rb_root_cached rbr;

/*
* This is a single linked list that chains all the "struct epitem" that
* happened while transferring ready events to userspace w/out
* holding ->lock.
*/
struct epitem *ovflist;

/* wakeup_source used when ep_scan_ready_list is running */
struct wakeup_source *ws;

/* The user that created the eventpoll descriptor */
struct user_struct *user;

//the file created by epoll_create, i.e. the file behind epfd
struct file *file;

/* used to optimize loop detection check */
int visited;
struct list_head visited_list_link;

#ifdef CONFIG_NET_RX_BUSY_POLL
/* used to track busy poll napi_id */
unsigned int napi_id;
#endif
};


epoll_ctl


/*
* The following function implements the controller interface for
* the eventpoll file that enables the insertion/removal/change of
* file descriptors inside the interest set.
*/
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
struct epoll_event __user *, event)
{
struct epoll_event epds;

if (ep_op_has_event(op) &&
copy_from_user(&epds, event, sizeof(struct epoll_event)))
return -EFAULT;

return do_epoll_ctl(epfd, op, fd, &epds, false);
}

int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
bool nonblock)
{
int error;
int full_check = 0;
struct fd f, tf;
struct eventpoll *ep;
struct epitem *epi;
struct eventpoll *tep = NULL;

error = -EBADF;
f = fdget(epfd);
if (!f.file)
goto error_return;

/* Get the "struct file *" for the target file */
tf = fdget(fd);
if (!tf.file)
goto error_fput;

/* The target file descriptor must support poll */
error = -EPERM;
if (!file_can_poll(tf.file))
goto error_tgt_fput;

/* Check if EPOLLWAKEUP is allowed */
if (ep_op_has_event(op))
ep_take_care_of_epollwakeup(epds);

/*
* We have to check that the file structure underneath the file descriptor
* the user passed to us _is_ an eventpoll file. And also we do not permit
* adding an epoll file descriptor inside itself.
*/
error = -EINVAL;
if (f.file == tf.file || !is_file_epoll(f.file))
goto error_tgt_fput;

/*
* epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
* so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
* Also, we do not currently supported nested exclusive wakeups.
*/
if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
if (op == EPOLL_CTL_MOD)
goto error_tgt_fput;
if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
goto error_tgt_fput;
}

/*
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
ep = f.file->private_data;

/*
* When we insert an epoll file descriptor, inside another epoll file
* descriptor, there is the change of creating closed loops, which are
* better be handled here, than in more critical paths. While we are
* checking for loops we also determine the list of files reachable
* and hang them on the tfile_check_list, so we can check that we
* haven't created too many possible wakeup paths.
*
* We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
* the epoll file descriptor is attaching directly to a wakeup source,
* unless the epoll file descriptor is nested. The purpose of taking the
* 'epmutex' on add is to prevent complex toplogies such as loops and
* deep wakeup paths from forming in parallel through multiple
* EPOLL_CTL_ADD operations.
*/
error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
if (error)
goto error_tgt_fput;
if (op == EPOLL_CTL_ADD) {
if (!list_empty(&f.file->f_ep_links) ||
is_file_epoll(tf.file)) {
mutex_unlock(&ep->mtx);
error = epoll_mutex_lock(&epmutex, 0, nonblock);
if (error)
goto error_tgt_fput;
full_check = 1;
if (is_file_epoll(tf.file)) {
error = -ELOOP;
if (ep_loop_check(ep, tf.file) != 0) {
clear_tfile_check_list();
goto error_tgt_fput;
}
} else
list_add(&tf.file->f_tfile_llink,
&tfile_check_list);
error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
if (error) {
out_del:
list_del(&tf.file->f_tfile_llink);
goto error_tgt_fput;
}
if (is_file_epoll(tf.file)) {
tep = tf.file->private_data;
error = epoll_mutex_lock(&tep->mtx, 1, nonblock);
if (error) {
mutex_unlock(&ep->mtx);
goto out_del;
}
}
}
}

/*
* Try to lookup the file inside our RB tree, Since we grabbed "mtx"
* above, we can be sure to be able to use the item looked up by
* ep_find() till we release the mutex.
*/
epi = ep_find(ep, tf.file, fd);

error = -EINVAL;
switch (op) {
case EPOLL_CTL_ADD:
if (!epi) {
epds->events |= EPOLLERR | EPOLLHUP;
error = ep_insert(ep, epds, tf.file, fd, full_check);
} else
error = -EEXIST;
if (full_check)
clear_tfile_check_list();
break;
case EPOLL_CTL_DEL:
if (epi)
error = ep_remove(ep, epi);
else
error = -ENOENT;
break;
case EPOLL_CTL_MOD:
if (epi) {
if (!(epi->event.events & EPOLLEXCLUSIVE)) {
epds->events |= EPOLLERR | EPOLLHUP;
error = ep_modify(ep, epi, epds);
}
} else
error = -ENOENT;
break;
}
if (tep != NULL)
mutex_unlock(&tep->mtx);
mutex_unlock(&ep->mtx);

error_tgt_fput:
if (full_check)
mutex_unlock(&epmutex);

fdput(tf);
error_fput:
fdput(f);
error_return:

return error;
}


ep_find


/*
* Search the file inside the eventpoll tree. The RB tree operations
* are protected by the "mtx" mutex, and ep_find() must be called with
* "mtx" held.
*/
static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
{
int kcmp;
struct rb_node *rbp;
struct epitem *epi, *epir = NULL;
struct epoll_filefd ffd;

ep_set_ffd(&ffd, file, fd);
//walk the red-black tree to check whether the node already exists
for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
epi = rb_entry(rbp, struct epitem, rbn);
kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
if (kcmp > 0)
rbp = rbp->rb_right;
else if (kcmp < 0)
rbp = rbp->rb_left;
else {
epir = epi;
break;
}
}

return epir;
}
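
The two helpers ep_find relies on are tiny; for reference (same fs/eventpoll.c), the tree is ordered by file pointer first and by fd second:

/* Setup the structure that is used as key for the RB tree */
static inline void ep_set_ffd(struct epoll_filefd *ffd,
                              struct file *file, int fd)
{
    ffd->file = file;
    ffd->fd = fd;
}

/* Compare RB tree keys */
static inline int ep_cmp_ffd(struct epoll_filefd *p1,
                             struct epoll_filefd *p2)
{
    return (p1->file > p2->file ? +1 :
            (p1->file < p2->file ? -1 : p1->fd - p2->fd));
}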


ep_pqueue


/* Wrapper struct used by poll queueing */
struct ep_pqueue {
poll_table pt;
struct epitem *epi;
};

/*
* Do not touch the structure directly, use the access functions
* poll_does_not_wait() and poll_requested_events() instead.
*/
typedef struct poll_table_struct {
poll_queue_proc _qproc;
__poll_t _key;
} poll_table;

ep_insert


/*
* Must be called with "mtx" held.
*/
static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
struct file *tfile, int fd, int full_check)
{
int error, pwake = 0;
__poll_t revents;
long user_watches;
//the epitem that will be initialized below
struct epitem *epi;
struct ep_pqueue epq;

lockdep_assert_irqs_enabled();

user_watches = atomic_long_read(&ep->user->epoll_watches);
if (unlikely(user_watches >= max_user_watches))
return -ENOSPC;
if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
return -ENOMEM;

/* Item initialization follow here ... */
INIT_LIST_HEAD(&epi->rdllink);
INIT_LIST_HEAD(&epi->fllink);
INIT_LIST_HEAD(&epi->pwqlist);
epi->ep = ep;
ep_set_ffd(&epi->ffd, tfile, fd);
epi->event = *event;
epi->nwait = 0;
epi->next = EP_UNACTIVE_PTR;
if (epi->event.events & EPOLLWAKEUP) {
error = ep_create_wakeup_source(epi);
if (error)
goto error_create_wakeup_source;
} else {
RCU_INIT_POINTER(epi->ws, NULL);
}

/* Initialize the poll table using the queue callback */
epq.epi = epi;
//*********************************************************************
//initialize the queueing callback:
//(&epq.pt)->_qproc = ep_ptable_queue_proc
//(&epq.pt)->_key = ~(__poll_t)0 /* all events enabled */
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

/*
* Attach the item to the poll hooks and get current event bits.
* We can safely use the file* here because its usage count has
* been increased by the caller of this function. Note that after
* this operation completes, the poll callback can start hitting
* the new item.
*/
//*********************************************************************
revents = ep_item_poll(epi, &epq.pt, 1);

/*
* We have to check if something went wrong during the poll wait queue
* install process. Namely an allocation for a wait queue failed due
* high memory pressure.
*/
error = -ENOMEM;
if (epi->nwait < 0)
goto error_unregister;

/* Add the current item to the list of active epoll hook for this file */
spin_lock(&tfile->f_lock);
list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
spin_unlock(&tfile->f_lock);

/*
* Add the current item to the RB tree. All RB tree operations are
* protected by "mtx", and ep_insert() is called with "mtx" held.
*/
//insert into the red-black tree
ep_rbtree_insert(ep, epi);

/* now check if we've created too many backpaths */
error = -EINVAL;
if (full_check && reverse_path_check())
goto error_remove_epi;

/* We have to drop the new item inside our item list to keep track of it */
write_lock_irq(&ep->lock);

/* record NAPI ID of new item if present */
ep_set_busy_poll_napi_id(epi);

/* If the file is already "ready" we drop it inside the ready list */
//if events are already pending, the item is put on the ready list right away
if (revents && !ep_is_linked(epi)) {
//link the epitem into the ready list
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);

/* Notify waiting tasks that events are available */
//wake up tasks blocked in epoll_wait
if (waitqueue_active(&ep->wq))
wake_up(&ep->wq);
//wake up tasks waiting in file->poll
if (waitqueue_active(&ep->poll_wait))
pwake++;
}

write_unlock_irq(&ep->lock);

atomic_long_inc(&ep->user->epoll_watches);

/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(&ep->poll_wait);

return 0;

error_remove_epi:
spin_lock(&tfile->f_lock);
list_del_rcu(&epi->fllink);
spin_unlock(&tfile->f_lock);

rb_erase_cached(&epi->rbn, &ep->rbr);

error_unregister:
ep_unregister_pollwait(ep, epi);

/*
* We need to do this because an event could have been arrived on some
* allocated wait queue. Note that we don't care about the ep->ovflist
* list, since that is used/cleaned only inside a section bound by "mtx".
* And ep_insert() is called with "mtx" held.
*/
write_lock_irq(&ep->lock);
if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
write_unlock_irq(&ep->lock);

wakeup_source_unregister(ep_wakeup_source(epi));

error_create_wakeup_source:
kmem_cache_free(epi_cache, epi);

return error;
}


ep_item_poll

/*
* Differs from ep_eventpoll_poll() in that internal callers already have
* the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
* is correctly annotated.
*/
static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
int depth)
{
struct eventpoll *ep;
bool locked;

//set pt->_key to the interest mask
pt->_key = epi->event.events;
if (!is_file_epoll(epi->ffd.file))
return vfs_poll(epi->ffd.file, pt) & epi->event.events;

ep = epi->ffd.file->private_data;
//poll_wait adds your device (represented by the "struct file") to
//the list of those that can wake the process up.
//poll_wait below invokes pt->_qproc
//***********************************************
poll_wait(epi->ffd.file, &ep->poll_wait, pt);
locked = pt && (pt->_qproc == ep_ptable_queue_proc);

return ep_scan_ready_list(epi->ffd.file->private_data,
ep_read_events_proc, &depth, depth,
locked) & epi->event.events;
}

poll_wait


static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
if (p && p->_qproc && wait_address)
p->_qproc(filp, wait_address, p);
}
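
poll_wait itself only forwards to _qproc; it is every file's poll method that calls it. A hypothetical driver poll method (struct mydev and its fields are made up) shows the canonical pattern that both select/poll and epoll rely on:

struct mydev {                      /* hypothetical device state */
    wait_queue_head_t read_wq;
    bool have_data;
};

static __poll_t mydev_poll(struct file *file, poll_table *wait)
{
    struct mydev *dev = file->private_data;
    __poll_t mask = 0;

    /* register the caller on our wait queue; under epoll, _qproc is
     * ep_ptable_queue_proc (below) and this hooks up ep_poll_callback */
    poll_wait(file, &dev->read_wq, wait);

    if (dev->have_data)
        mask |= EPOLLIN | EPOLLRDNORM;
    return mask;
}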

ep_ptable_queue_proc



/*
* This is the callback that is used to add our wait queue to the
* target file wakeup lists.
*/
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
poll_table *pt)
{
struct epitem *epi = ep_item_from_epqueue(pt);
struct eppoll_entry *pwq;

//nwait: number of active wait queues attached to poll operations
if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
//pwq->wait.func = ep_poll_callback, the wakeup callback:
//it is what will later wake up epoll_wait
init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
pwq->whead = whead;
pwq->base = epi;
if (epi->event.events & EPOLLEXCLUSIVE)
add_wait_queue_exclusive(whead, &pwq->wait);
else
//for a socket (e.g. accept) whead is the socket's sleep queue (sk_sleep); link our wait entry into it
add_wait_queue(whead, &pwq->wait);
list_add_tail(&pwq->llink, &epi->pwqlist);
epi->nwait++;
} else {
/* We have to signal that an error occurred */
epi->nwait = -1;
}
}
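
For reference, the eppoll_entry being filled in here (defined in the same fs/eventpoll.c) is the glue between one epitem and one wait queue of the target file:

/* Wait structure used by the poll hooks */
struct eppoll_entry {
    /* List header used to link this structure to the "struct epitem" */
    struct list_head llink;

    /* The "base" pointer is set to the container "struct epitem" */
    struct epitem *base;

    /*
     * Wait queue item that will be linked to the target file wait
     * queue head.
     */
    wait_queue_entry_t wait;

    /* The wait queue head that linked the "wait" wait queue item */
    wait_queue_head_t *whead;
};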

Summary

epoll_ctl
|->do_epoll_ctl
    |->ep_find
        |->ep_insert
            |->ep_item_poll
                |->poll_wait(epi->ffd.file, &ep->poll_wait, pt)
                    |->ep_ptable_queue_proc
                        |->init_waitqueue_func_entry    //registers the wakeup callback ep_poll_callback on the poll_table entry
                        |->add_wait_queue(links the wait entry into the target file's wait queue)
                |->ep_scan_ready_list
        |->ep_remove
        |->ep_modify
  • ep_find takes three arguments: the eventpoll behind epfd, the file struct behind the fd passed to epoll_ctl, and that fd itself
  • ep_find's logic: convert the fd into epoll_filefd form and compare it against the ffd member of each epitem in the rbr red-black tree
  • ep_insert allocates a new epi (epitem), fills it in, and inserts it into the red-black tree
  • ep_insert builds an epq (ep_pqueue) with (&epq.pt)->_qproc = ep_ptable_queue_proc, the poll queueing callback
  • ep_insert builds an epq (ep_pqueue) with (&epq.pt)->_key = epi->event.events (set in ep_item_poll after initialization)
  • poll_wait runs the callback ep_ptable_queue_proc
  • ep_ptable_queue_proc allocates a pwq (eppoll_entry) with pwq->wait.func = ep_poll_callback (the function that wakes epoll_wait)
  • in ep_ptable_queue_proc, pwq->base = epi
  • in ep_ptable_queue_proc, pwq->whead = whead, the target file's wait queue head (for a nested epoll fd this is &ep->poll_wait, the queue used by file->poll)

Image source: 从linux源码看epoll

epoll_callback.jpg

epoll_ctrl.jpg
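
The summary above leans on ep_poll_callback, which is never listed in this post. A trimmed sketch of what it does (simplified from fs/eventpoll.c; the ovflist handling, locking and EPOLLEXCLUSIVE details are omitted):

/* trimmed sketch: runs when the watched file (e.g. a socket) gets an event */
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode,
                            int sync, void *key)
{
    struct epitem *epi = ep_item_from_wait(wait);  /* pwq->base */
    struct eventpoll *ep = epi->ep;

    /* queue the item on the ready list if it is not there yet */
    if (!ep_is_linked(epi))
        list_add_tail(&epi->rdllink, &ep->rdllist);

    /* wake a task sleeping in epoll_wait() ... */
    if (waitqueue_active(&ep->wq))
        wake_up(&ep->wq);
    /* ... and anyone polling the epoll fd itself (nested epoll) */
    if (waitqueue_active(&ep->poll_wait))
        ep_poll_safewake(&ep->poll_wait);

    return 1;
}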


epoll_wait

epoll_wait


//the epoll_wait system call
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
int, maxevents, int, timeout)
{
return do_epoll_wait(epfd, events, maxevents, timeout);
}


do_epoll_wait

/*
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_wait(2).
*/
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
int maxevents, int timeout)
{
int error;
struct fd f;
struct eventpoll *ep;

/* The maximum number of event must be greater than zero */
if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
return -EINVAL;

/* Verify that the area passed by the user is writeable */
if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
return -EFAULT;

/* Get the "struct file *" for the eventpoll file */
f = fdget(epfd);
if (!f.file)
return -EBADF;

/*
* We have to check that the file structure underneath the fd
* the user passed to us _is_ an eventpoll file.
*/
error = -EINVAL;
if (!is_file_epoll(f.file))
goto error_fput;

/*
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
//fetch the eventpoll structure behind epfd
ep = f.file->private_data;

/* Time to fish for events ... */
error = ep_poll(ep, events, maxevents, timeout);

error_fput:
fdput(f);
return error;
}


ep_poll


/**
* ep_poll - Retrieves ready events, and delivers them to the caller supplied
* event buffer.
*
* @ep: Pointer to the eventpoll context.
* @events: Pointer to the userspace buffer where the ready events should be
* stored.
* @maxevents: Size (in terms of number of events) of the caller event buffer.
* @timeout: Maximum timeout for the ready events fetch operation, in
* milliseconds. If the @timeout is zero, the function will not block,
* while if the @timeout is less than zero, the function will block
* until at least one event has been retrieved (or an error
* occurred).
*
* Returns: Returns the number of ready events which have been fetched, or an
* error code, in case of error.
*/
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int maxevents, long timeout)
{
int res = 0, eavail, timed_out = 0;
u64 slack = 0;
bool waiter = false;
wait_queue_entry_t wait;
ktime_t expires, *to = NULL;

lockdep_assert_irqs_enabled();

//timeout > 0: set up the timeout
if (timeout > 0) {
struct timespec64 end_time = ep_set_mstimeout(timeout);

slack = select_estimate_accuracy(&end_time);
to = &expires;
*to = timespec64_to_ktime(end_time);
//timeout == 0: return the current state immediately
} else if (timeout == 0) {
/*
* Avoid the unnecessary trip to the wait queue loop, if the
* caller specified a non blocking operation. We still need
* lock because we could race and not see an epi being added
* to the ready list while in irq callback. Thus incorrectly
* returning 0 back to userspace.
*/
timed_out = 1;

write_lock_irq(&ep->lock);
//check whether events are available on the rdllist ready list:
//return !list_empty_careful(&ep->rdllist) || READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
eavail = ep_events_available(ep);
write_unlock_irq(&ep->lock);

goto send_events;
}
//timeout < 0: block indefinitely

fetch_events:

if (!ep_events_available(ep))
ep_busy_loop(ep, timed_out);

//check whether events are available (same test as above)
eavail = ep_events_available(ep);
if (eavail)
goto send_events;

/*
* Busy poll timed out. Drop NAPI ID for now, we can add
* it back in when we have moved a socket with a valid NAPI
* ID onto the ready list.
*/
ep_reset_busy_poll_napi_id(ep);

/*
* We don't have any available event to return to the caller. We need
* to sleep here, and we will be woken by ep_poll_callback() when events
* become available.
*/
if (!waiter) {
waiter = true;
//initialize the wait queue entry for the current task
init_waitqueue_entry(&wait, current);

//take the wait queue's spinlock
spin_lock_irq(&ep->wq.lock);
//add ourselves to ep->wq, exclusively
__add_wait_queue_exclusive(&ep->wq, &wait);
spin_unlock_irq(&ep->wq.lock);
}

for (;;) {
/*
* We don't want to sleep if the ep_poll_callback() sends us
* a wakeup in between. That's why we set the task state
* to TASK_INTERRUPTIBLE before doing the checks.
*/
// set the task state to interruptible
set_current_state(TASK_INTERRUPTIBLE);
/*
* Always short-circuit for fatal signals to allow
* threads to make a timely exit without the chance of
* finding more events available and fetching
* repeatedly.
*/
// bail out with -EINTR if a fatal signal is pending
if (fatal_signal_pending(current)) {
res = -EINTR;
break;
}

//check once more whether events are available before sleeping
eavail = ep_events_available(ep);
if (eavail)
break;
if (signal_pending(current)) {
res = -EINTR;
break;
}

// sleep until an event or the timeout, yielding the CPU
if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
timed_out = 1;
break;
}
}

// set the task state back to running
__set_current_state(TASK_RUNNING);

send_events:
/*
* Try to transfer events to user space. In case we get 0 events and
* there's still timeout left over, we go trying again in search of
* more luck.
*/
if (!res && eavail &&
// copy the ready events to user space
!(res = ep_send_events(ep, events, maxevents)) && !timed_out)
goto fetch_events;

if (waiter) {
spin_lock_irq(&ep->wq.lock);

//__remove_wait_queue does list_del(&wait.entry):
//take ourselves off the wait queue
__remove_wait_queue(&ep->wq, &wait);
spin_unlock_irq(&ep->wq.lock);
}

return res;
}

Summary

epoll_wait
    |->do_epoll_wait
        |->ep_poll
            |->send_events
            |->fetch_events
  • ep_poll follows a different strategy depending on the timeout value
  • if the ready queue rdllist is non-empty, ep_poll jumps to send_events and delivers the ready events to user space
  • with nothing pending, ep_poll yields the CPU via schedule_hrtimeout_range and waits to be woken
  • role of ep->ovflist: it collects events that arrive while ready events are being transferred to user space (the comments in ep_scan_ready_list below confirm this)
  • the interrupt handler ends up running the registered callback, which wakes epoll_wait (my guess)

Image source: 从linux源码看epoll

epoll_wait.jpg
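
Tying the three syscalls together, a minimal user-space event loop (error handling trimmed; handle_readable is a hypothetical handler):

#include <sys/epoll.h>

#define MAX_EVENTS 64

void handle_readable(int fd);   /* hypothetical, defined elsewhere */

/* minimal sketch: watch one socket and loop on epoll_wait */
void event_loop(int epfd, int sockfd)
{
    struct epoll_event ev, events[MAX_EVENTS];

    ev.events = EPOLLIN;
    ev.data.fd = sockfd;
    epoll_ctl(epfd, EPOLL_CTL_ADD, sockfd, &ev);

    for (;;) {
        /* blocks in ep_poll until ep_poll_callback wakes us */
        int n = epoll_wait(epfd, events, MAX_EVENTS, -1);

        for (int i = 0; i < n; i++)
            if (events[i].events & EPOLLIN)
                handle_readable(events[i].data.fd);
    }
}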


ep_send_events

ep_send_events


static int ep_send_events(struct eventpoll *ep,
struct epoll_event __user *events, int maxevents)
{
struct ep_send_events_data esed;

esed.maxevents = maxevents;
esed.events = events;

ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
return esed.res;
}
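
The esed cookie it fills is a small carrier struct; for reference (same source file):

struct ep_send_events_data {
    int maxevents;
    struct epoll_event __user *events;
    int res;
};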


ep_scan_ready_list

/**
* ep_scan_ready_list - Scans the ready list in a way that makes possible for
* the scan code, to call f_op->poll(). Also allows for
* O(NumReady) performance.
*
* @ep: Pointer to the epoll private data structure.
* @sproc: Pointer to the scan callback.
* @priv: Private opaque data passed to the @sproc callback.
* @depth: The current depth of recursive f_op->poll calls.
* @ep_locked: whether the caller already holds ep->mtx.
*
* Returns: The same integer error code returned by the @sproc callback.
*/
static __poll_t ep_scan_ready_list(struct eventpoll *ep,
__poll_t (*sproc)(struct eventpoll *,
struct list_head *, void *),
void *priv,
int depth, bool ep_locked)
{
__poll_t res;
int pwake = 0;
struct epitem *epi, *nepi;
LIST_HEAD(txlist);

/*
* We need to lock this because we could be hit by
* eventpoll_release_file() and epoll_ctl().
*/
if (!ep_locked)
mutex_lock_nested(&ep->mtx, depth);

/*
* Steal the ready list, and re-init the original one to the
* empty list. Also, set ep->ovflist to NULL so that events
* happening while looping w/out locks, are not lost. We cannot
* have the poll callback to queue directly on ep->rdllist,
* because we want the "sproc" callback to be able to do it
* in a lockless way.
*/
write_lock_irq(&ep->lock);
//splice rdllist onto the local txlist, leaving rdllist empty
list_splice_init(&ep->rdllist, &txlist);
ep->ovflist = NULL;
write_unlock_irq(&ep->lock);

/*
* Now call the callback function.
*/
//***********************************************************************
//run the scan callback, here ep_send_events_proc
res = (*sproc)(ep, &txlist, priv);

write_lock_irq(&ep->lock);
/*
* During the time we spent inside the "sproc" callback, some
* other events might have been queued by the poll callback.
* We re-insert them inside the main ready-list here.
*/
for (nepi = ep->ovflist; (epi = nepi) != NULL;
nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
/*
* We need to check if the item is already in the list.
* During the "sproc" callback execution time, items are
* queued into ->ovflist but the "txlist" might already
* contain them, and the list_splice() below takes care of them.
*/
if (!ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
}
}
/*
* We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
* releasing the lock, events will be queued in the normal way inside
* ep->rdllist.
*/
ep->ovflist = EP_UNACTIVE_PTR;

/*
* Quickly re-inject items left on "txlist".
*/
list_splice(&txlist, &ep->rdllist);
__pm_relax(ep->ws);

if (!list_empty(&ep->rdllist)) {
/*
* Wake up (if active) both the eventpoll wait list and
* the ->poll() wait list (delayed after we release the lock).
*/
if (waitqueue_active(&ep->wq))
wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
write_unlock_irq(&ep->lock);

if (!ep_locked)
mutex_unlock(&ep->mtx);

/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(&ep->poll_wait);

return res;
}


init_poll_funcptr

static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
pt->_qproc = qproc;
pt->_key = ~(__poll_t)0; /* all events enabled */
}

list_for_each_entry_safe

/**
 * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
 * @pos:    the type * to use as a loop cursor.
 * @n:        another type * to use as temporary storage
 * @head:    the head for your list.
 * @member:    the name of the list_head within the struct.
 */
#define list_for_each_entry_safe(pos, n, head, member)            \
    for (pos = list_first_entry(head, typeof(*pos), member),    \
        n = list_next_entry(pos, member);            \
         &pos->member != (head);                     \
         pos = n, n = list_next_entry(n, member))
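
A tiny illustration of why the _safe variant matters here (hypothetical item type): the body may unlink and even free the cursor, because n already holds the next node before the body runs. Plain list_for_each_entry would be unsafe:

#include <linux/list.h>
#include <linux/slab.h>

struct item {                       /* hypothetical list element */
    int value;
    struct list_head link;
};

/* drain a list, freeing every node */
static void drain(struct list_head *head)
{
    struct item *pos, *n;

    list_for_each_entry_safe(pos, n, head, link) {
        list_del(&pos->link);
        kfree(pos);
    }
}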

ep_send_events_proc


static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
void *priv)
{
struct ep_send_events_data *esed = priv;
__poll_t revents;
struct epitem *epi, *tmp;
struct epoll_event __user *uevent = esed->events;
struct wakeup_source *ws;
poll_table pt;

//pt->_qproc = NULL
//pt->_key = ~(__poll_t)0 /* all events enabled */
init_poll_funcptr(&pt, NULL);
esed->res = 0;

/*
* We can loop without lock because we are passed a task private list.
* Items cannot vanish during the loop because ep_scan_ready_list() is
* holding "mtx" during this call.
*/
lockdep_assert_held(&ep->mtx);

//head = the txlist just spliced from ep->rdllist
//iterate over the epitems linked into this list through their rdllink
//members; epi points at the containing epitem of each node in turn
//*******************************************************************
list_for_each_entry_safe(epi, tmp, head, rdllink) {
if (esed->res >= esed->maxevents)
break;

/*
* Activate ep->ws before deactivating epi->ws to prevent
* triggering auto-suspend here (in case we reactive epi->ws
* below).
*
* This could be rearranged to delay the deactivation of epi->ws
* instead, but then epi->ws would temporarily be out of sync
* with ep_is_linked().
*/
ws = ep_wakeup_source(epi);
if (ws) {
if (ws->active)
__pm_stay_awake(ep->ws);
__pm_relax(ws);
}

list_del_init(&epi->rdllink);

/*
* If the event mask intersect the caller-requested one,
* deliver the event to userspace. Again, ep_scan_ready_list()
* is holding ep->mtx, so no operations coming from userspace
* can change the item.
*/
//ep_scan_ready_list is still holding ep->mtx,
//so operations coming from user space cannot change this epitem
//the ready list only says this epi has something pending; the concrete
//event bits still come from calling the file's poll (tcp_poll for a
//socket) to confirm they are events we are interested in
//*********************************************************************
revents = ep_item_poll(epi, &pt, 1);
if (!revents)
continue;

//copy the event to user space
if (__put_user(revents, &uevent->events) ||
__put_user(epi->event.data, &uevent->data)) {
list_add(&epi->rdllink, head);
ep_pm_stay_awake(epi);
if (!esed->res)
esed->res = -EFAULT;
return 0;
}
esed->res++;
uevent++;
if (epi->event.events & EPOLLONESHOT)
epi->event.events &= EP_PRIVATE_BITS;
else if (!(epi->event.events & EPOLLET)) {
/*
* If this file has been added with Level
* Trigger mode, we need to insert back inside
* the ready list, so that the next call to
* epoll_wait() will check again the events
* availability. At this point, no one can insert
* into ep->rdllist besides us. The epoll_ctl()
* callers are locked out by
* ep_scan_ready_list() holding "mtx" and the
* poll callback will queue them in ep->ovflist.
*/
//level-triggered: put the epi back on the ready list so the next
//epoll_wait polls it again; if that poll's revents is non-zero,
//user space keeps being notified
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
}
//edge-triggered: not re-added, so the epi only returns to the ready list when the next event fires and ep_poll_callback queues it again
}

return 0;
}
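
That LT/ET branch is the whole behavioural difference between the two modes. A quick user-space sketch of registering one fd in each mode (both fds are assumed to exist and to have unread data):

#include <sys/epoll.h>

/* Sketch: with EPOLLIN alone (level-triggered), every epoll_wait call
 * reports the fd again while data remains unread, because
 * ep_send_events_proc re-adds the epitem to rdllist.
 * With EPOLLIN | EPOLLET (edge-triggered), only the first call reports
 * it; the epitem returns to rdllist only on the next ep_poll_callback. */
void register_modes(int epfd, int lt_fd, int et_fd)
{
    struct epoll_event ev;

    ev.events = EPOLLIN;            /* level-triggered (default) */
    ev.data.fd = lt_fd;
    epoll_ctl(epfd, EPOLL_CTL_ADD, lt_fd, &ev);

    ev.events = EPOLLIN | EPOLLET;  /* edge-triggered */
    ev.data.fd = et_fd;
    epoll_ctl(epfd, EPOLL_CTL_ADD, et_fd, &ev);
}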


struct poll_wqueues

/*
* Structures and helpers for select/poll syscall
*/
struct poll_wqueues {
poll_table pt;
struct poll_table_page *table;
struct task_struct *polling_task;
int triggered;
int error;
int inline_index;
struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
};



__pollwait (the select/poll counterpart of ep_ptable_queue_proc)

/*
* Ok, Peter made a complicated, but straightforward multiple_wait() function.
* I have rewritten this, taking some shortcuts: This code may not be easy to
* follow, but it should be free of race-conditions, and it's practical. If you
* understand what I'm doing here, then you understand how the linux
* sleep/wakeup mechanism works.
*
* Two very simple procedures, poll_wait() and poll_freewait() make all the
* work. poll_wait() is an inline-function defined in <linux/poll.h>,
* as all select/poll functions have to call it to add an entry to the
* poll table.
*/
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
poll_table *p);

/* Add a new entry */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
poll_table *p)
{
//recover the enclosing poll_wqueues from its pt member, pointed to by p
struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
//take a free poll_table_entry from pwq
struct poll_table_entry *entry = poll_get_entry(pwq);
if (!entry)
return;
//entry->filp = get_file(epi->ffd.file)
entry->filp = get_file(filp);
//wait_address = &ep->poll_wait
entry->wait_address = wait_address;
//entry->key = epi->event.events
entry->key = p->_key;

//(&entry->wait)->func = pollwake
init_waitqueue_func_entry(&entry->wait, pollwake);
entry->wait.private = pwq;
//hook the entry into the target's wait queue
add_wait_queue(wait_address, &entry->wait);
}
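
For symmetry with epoll's ep_poll_callback, the wakeup side registered above looks roughly like this (trimmed from fs/select.c):

static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
    struct poll_wqueues *pwq = wait->private;
    DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);

    /* users expect write-barrier semantics on wakeup functions */
    smp_wmb();
    pwq->triggered = 1;

    return default_wake_function(&dummy_wait, mode, sync, key);
}

static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
    struct poll_table_entry *entry;

    entry = container_of(wait, struct poll_table_entry, wait);
    /* filter out events the caller did not ask for */
    if (key && !(key_to_poll(key) & entry->key))
        return 0;
    return __pollwake(wait, mode, sync, key);
}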

struct poll_table_entry

struct poll_table_entry {
struct file *filp;
__poll_t key;
wait_queue_entry_t wait;
wait_queue_head_t *wait_address;
};

init_waitqueue_func_entry

static inline void
init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t func)
{
wq_entry->flags = 0;
wq_entry->private = NULL;
wq_entry->func = func;
}

vfs_poll


static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
{
if (unlikely(!file->f_op->poll))
return DEFAULT_POLLMASK;
// for a socket this poll is tcp_poll: it builds a mask from the socket's own state; ANDed with the interest mask, it tells whether the event is one epoll_wait cares about
return file->f_op->poll(file, pt);
}

Summary

ep_send_events
    |->ep_scan_ready_list
        |->ep_send_events_proc(poll_table pt)
            //pt->_key = epi->event.events;
            |->ep_item_poll(epi, &pt, 1)
                |->vfs_poll(epi->ffd.file, pt)
                    |->file->f_op->poll()
  • ep_send_events only knows that epoll_wait was woken; it still has to collect the concrete event bits
  • ep_send_events_proc walks the ready list through each epitem's rdllink member to recover the epi nodes
  • ep_item_poll calls epi->ffd.file->f_op->poll() (tcp_poll for a network socket), which builds a mask from the file's own state and ANDs it with the interest mask
  • ep_send_events_proc uses that result to decide whether to copy the event to user space
  • ep_item_poll has two branches and runs at two different stages: once in ep_insert to register the callback, and once here to harvest the events

Image source: 从linux源码看epoll

epoll_send.jpg


----------- ending -----------